def addDoc(self, writer, text):

        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)

        d.add(f)
        writer.addDocument(d)
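A minimal, self-contained sketch of the same add-one-document pattern outside the test class. It assumes a PyLucene environment with Lucene 5+/6+ package layout; the index path, field name, and text below are placeholders.

import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
directory = SimpleFSDirectory(Paths.get("/tmp/example-index"))  # placeholder path
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))

doc = Document()
doc.add(Field("contents", "hello lucene", TextField.TYPE_STORED))  # placeholder field name/text
writer.addDocument(doc)
writer.close()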
Example #2
def index(indexdir):
  lucene.initVM()
  indexDir = SimpleFSDirectory(File(indexdir))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)

  f = open('data/docid.documento-xml.txt')
  st = PorterStemmer()
  for i, line in enumerate(f.readlines()):
    id, xmltext = line.split('\t')
    xmltext = xmltext.rstrip('\n')
    xmldoc = minidom.parseString(xmltext)
    title = xmldoc.getElementsByTagName("TITLE")
    title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
    authors = xmldoc.getElementsByTagName("AUTHORS")
    authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
    abstract = xmldoc.getElementsByTagName("ABSTRACT")
    abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
    doc = Document()
    doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
    writer.addDocument(doc)
    print "indexed %s docs" % (i+1)

  writer.close()
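A hedged companion sketch for querying the index that index() builds. It assumes the same PyLucene 4.10 environment as above, plus imports of IndexSearcher, DirectoryReader and QueryParser from the corresponding org.apache.lucene packages; the queried field and result limit are illustrative choices.

def search(indexdir, querystring, limit=10):
    # open a searcher over the directory written by index() above
    searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(File(indexdir))))
    query = QueryParser(Version.LUCENE_4_10_1, "abstract", EnglishAnalyzer()).parse(querystring)
    for hit in searcher.search(query, limit).scoreDocs:
        doc = searcher.doc(hit.doc)
        print("%s\t%s" % (doc.get("id"), doc.get("title")))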
Example #3
    def get_doc(self, filename, path, title, url, contents):
        '''
        Generate a `Document` according to the parameters.

        Input: `filename`: filename of the webpage
               `path`: path of the webpage
               `title`: title of the webpage
               `url`: original url of the webpage
               `contents`: contents of the webpage
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        # doc.add(Field("name", filename, self.property_type))
        # doc.add(Field("path", path, self.property_type))
        # doc.add(Field("title", title, self.property_type))
        # doc.add(Field("url", url, self.property_type))
        doc.add(StringField("name", filename, Field.Store.YES))
        doc.add(StringField("path", path, Field.Store.YES))
        doc.add(TextField("title", title, Field.Store.YES))
        doc.add(TextField("url", url, Field.Store.YES))
        if len(contents) > 0:
            # doc.add(Field("contents", contents, self.content_type))
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print("Warning: No content in {}".format(filename))
        return doc
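A hypothetical usage sketch for get_doc; the indexer instance, the writer, and all argument values below are placeholders.

doc = indexer.get_doc(
    filename="page_001.html",
    path="/data/pages/page_001.html",
    title="Example page",
    url="http://example.com/page_001.html",
    contents="full text of the page ...")
writer.addDocument(doc)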
Example #4
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(
                    Field("text", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
    def indexDocs(self, root, writer):

        f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
        picDict = {}
        for line in f.xreadlines():
            ls = line.split('seg^*')
            url = ls[0]
            title = ls[1]
            src = ls[2]
            alt = ls[3]
            picDict[src] = [url, title, alt]
        f.close()
        for src in picDict:
            doc = Document()
            doc.add(
                Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(
                Field("url", picDict[src][0], Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("title", picDict[src][1], Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            doc.add(
                Field("alt", picDict[src][2], Field.Store.YES,
                      Field.Index.ANALYZED))
            writer.addDocument(doc)
Example #6
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        index_file = open("index.txt", 'r')
        for line in index_file.readlines():

            try:
                src = line.strip().split('\t')[0]
                filename = line.strip().split('\t')[1]
                tag = line.strip().split('\t')[2]
                path = os.path.join(root, filename)

                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                doc.add(Field("src", src, t1))

                if len(tag) > 0:
                    doc.add(Field("tag", tag, t2))
                else:
                    print "warning: no tag in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example #7
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #8
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(False)
        t3.setTokenized(True)  # tokenize with the preconfigured analyzer (here it splits on whitespace)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        total=0
        file = open(root,"r")
        for line in file.readlines():
            try:
                imgurl, itemurl, content = line.split('\t')
                total+=1
                print total
                print "adding", content
                contents = ' '.join(jieba.cut(content))
                doc = Document()
                doc.add(Field("imgurl", imgurl, t1))
                doc.add(Field("itemurl", itemurl, t1))
                doc.add(Field("title", content, t1))
                doc.add(Field("contents",contents,t3))
                writer.addDocument(doc)
            except Exception, e:
                    print "Failed in indexDocs:", e
    def testCompressionTools(self):

        bytes = JArray('byte')(self.binaryValCompressed)
        binaryFldCompressed = StoredField("binaryCompressed", CompressionTools.compress(bytes))
        stringFldCompressed = StoredField("stringCompressed", CompressionTools.compressString(self.binaryValCompressed))
    
        doc = Document()
        doc.add(binaryFldCompressed)
        doc.add(stringFldCompressed)
    
        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
        writer.addDocument(doc)
        writer.close()
    
        # open a reader and fetch the document
        reader = self.getReader()
        docFromReader = reader.document(0)
        self.assert_(docFromReader is not None)
    
        # fetch the binary compressed field and compare its content with
        # the original one
        bytes = CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed"))
        binaryFldCompressedTest = bytes.string_
        self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
        self.assertEqual(CompressionTools.decompressString(docFromReader.getBinaryValue("stringCompressed")), self.binaryValCompressed)

        reader.close()
Example #10
    def addDocumentToIndex(self, title, text):
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)
Example #11
    def testCompressionTools(self):

        bytes = JArray('byte')(self.binaryValCompressed)
        binaryFldCompressed = StoredField("binaryCompressed",
                                          CompressionTools.compress(bytes))
        stringFldCompressed = StoredField(
            "stringCompressed",
            CompressionTools.compressString(self.binaryValCompressed))

        doc = Document()
        doc.add(binaryFldCompressed)
        doc.add(stringFldCompressed)

        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer())
        writer.addDocument(doc)
        writer.close()

        # open a reader and fetch the document
        reader = self.getReader()
        docFromReader = reader.document(0)
        self.assertTrue(docFromReader is not None)

        # fetch the binary compressed field and compare its content with
        # the original one
        bytes = CompressionTools.decompress(
            docFromReader.getBinaryValue("binaryCompressed"))
        binaryFldCompressedTest = bytes.string_
        self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
        self.assertEqual(
            CompressionTools.decompressString(
                docFromReader.getBinaryValue("stringCompressed")),
            self.binaryValCompressed)

        reader.close()
Example #12
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
Example #13
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
def indexMovie(movie):
    doc = Document()
    doc.add(Field('id', str(movie), StringField.TYPE_STORED))
    at_least_one_field = False

    maybe_tags = movies_tags.query('item == @movie')
    if not maybe_tags.empty:
        tags = maybe_tags[['tags']].values.flatten()[0]
        doc.add(Field('tags', tags, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_description = movies_descriptions.query('item == @movie')
    if not maybe_description.empty:
        description = maybe_description[['description']].values.flatten()[0]
        doc.add(Field('description', description, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_genres = movies_genres.query('item == @movie')
    if not maybe_genres.empty:
        genres = maybe_genres[['genres']].values.flatten()[0]
        doc.add(Field('genres', genres, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    if at_least_one_field:
        writer.addDocument(doc)
Example #15
    def index_docs(self, tweets, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        t1.setStoreTermVectors(True)
        t1.setStoreTermVectorOffsets(True)

        # add each tweet to the index
        for tweet in tweets:
            try:
                # strip out URLs because they provide false index matches
                contents = []
                for word in tweet[1].text.split():
                    if word.startswith("http://") or word.startswith("https://"):
                        continue
                    contents.append(word)
                contents = " ".join(contents)

                if len(contents) == 0: continue

                doc = Document()
                doc.add(Field("contents", contents, t1))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in index_docs:", e
Example #16
    def indexDocs(self, root, writer):
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                print("adding", filename)
                path = os.path.join(root, filename)
                file = open(path, encoding='utf8')
                url = file.readline()
                title = file.readline()
                contents = file.read()
                file.close()
                img_url = self.getTxtAttribute(contents, 'img_url')
                img_info = self.getTxtAttribute(contents, 'img_info')
                for i in range(len(img_url)):
                    if len(img_info[i]) > 0:
                        title = title
                        doc = Document()

                        doc.add(StringField('title', title, Field.Store.YES))
                        doc.add(StringField('url', url, Field.Store.YES))
                        doc.add(
                            StringField('img_url', img_url[i],
                                        Field.Store.YES))
                        seg_contents = jieba.lcut_for_search(img_info[i])
                        contents = ' '.join(seg_contents)
                        doc.add(
                            TextField('contents', contents, Field.Store.YES))
                        writer.addDocument(doc)
                    else:
                        continue
Example #17
    def addDoc(self, writer, text):

        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)

        d.add(f)
        writer.addDocument(d)
Example #18
def generate_indices_from_benchmark(writer, counter):
	javafiles = java_files_from_dir(source_path)
	i = 0
	j = 0
	for javafile in javafiles:
		# print javafile
		i += 1
		if i % 1000 == 0:	# print every 1000 files
			print("Counter: %s" % i)
			print "typed_method_call" + str(counter.typed_method_call_count)
		document = Document()
		document.add(Field("file", javafile, Field.Store.YES, Field.Index.NO))
		try:
			with codecs.open(javafile, "r", encoding='utf-8', errors='ignore') as f:
				file_content = f.read().encode("utf-8", errors='ignore')
			f.close()

			ast = parse(file_content, resolve=False)
			if add_code_keyword_into_document(document, file_content, ast, counter):
				writer.addDocument(document)
				j += 1
				if j % 1000 == 0:
					print "Wrote:: %s files" % j

		except Exception as e:
			print("Error: %s" % e)
			continue
	print "Number of files: %s" % i
	print "Number of duplicates: %s" % len(hashes)
	print "%s files has been indexed" % j
	def rewrite(data_string):
		data=json.loads(data_string)
		toupdate=json.loads(update)
		#primary_key_modified=False

		#delete the appropriate document
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))
		

		#modify the values
		for key,value in toupdate.items():
			# if such a key is not present, we either add/update it in data or just ignore it (controlled by add_field_if_not_exists, which defaults to True)
			if add_field_if_not_exists==False:
				if key in data.keys():
					data[key]=value
			else:		
				data[key]=value

		# this deletion is intentionally placed here:
		# the update only proceeds if the modified data's primary keys do not already exist in the index
		primary_key_update=False
		for key in toupdate.keys():
			if key in primary_keys_map:
				primary_key_update=True
				break
		if primary_key_update == True:
			query_search=BooleanQuery()
			for key in primary_keys_map:
				temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
				query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
			hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
			if len(hits) > 0:
				return 106			
		writer.deleteDocuments(query)

		#add the newly modified document
		doc=Document()
		#index files wrt primary key
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# primary_keys_map.pop(collection_name)
				return 101
		#compress data using snappy if compression is on		
		if to_be_compressed_input==True:
			temp=json.dumps(data)
			data_string=base64.b64encode(snappy.compress(temp))
		else:
			temp=json.dumps(data)
			data_string=base64.b64encode(temp)

		field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
 def _addDocument(self, identifier, isformatof, sort=None):
     doc = Document()
     if isformatof:
         doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
     if sort:
         doc.add(NumericDocValuesField("__sort__", long(sort)))
     consume(self.lucene.addDocument(identifier, doc))
     self.lucene.commit()  # Explicitly, not required: since commitCount=1.
Example #21
def luceneindex(text):
    for n, l in enumerate(text):
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        #print( "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs()))
    #print( "Closing index of %d docs..." % writer.numDocs())
    writer.close()  # close once, after all lines have been added
Example #22
 def index(self, eid, name, cname, type, info):
     doc = Document()
     doc.add(TextField('id', eid, Field.Store.YES))
     doc.add(TextField('name', name, Field.Store.YES))
     doc.add(TextField('CannonicalName', cname, Field.Store.YES))
     doc.add(TextField('type', type, Field.Store.YES))
     doc.add(TextField('info', info, Field.Store.YES))
     self.writer.addDocument(doc)
Example #23
 def __init__(self, path):
     lazyImport()
     self._writer, self._reader, self._searcher = self._getLucene(path)
     self._latestModifications = {}
     self._doc = Document()
     self._keyField = StringField("key", "", Field.Store.NO)
     self._valueField = Field("value", "", UNINDEXED_TYPE)
     self._doc.add(self._keyField)
     self._doc.add(self._valueField)
Example #24
 def create_document(self, contents):
     """Create a Lucene document from the specified contents.
     Contents is a list of fields to be indexed, represented as a dictionary
     with keys 'field_name', 'field_type', and 'field_value'."""
     doc = Document()
     for f in contents:
         doc.add(Field(f['field_name'], f['field_value'],
                       self.ldf.get_field(f['field_type'])))
     return doc
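A hypothetical sketch of the `contents` argument this method expects; the field names, values, and field_type keys below are placeholders, and the accepted field_type values depend on self.ldf.

contents = [
    {'field_name': 'id',    'field_type': 'id',   'field_value': 'doc-001'},
    {'field_name': 'title', 'field_type': 'text', 'field_value': 'A sample title'},
    {'field_name': 'body',  'field_type': 'text', 'field_value': 'Full body text ...'},
]
doc = indexer.create_document(contents)  # indexer is assumed to be an instance of this class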
Example #26
def createDocument(item_id, label, viewSimilar, viewProspective):
    doc = Document()
    doc.add(StringField('itemID', item_id, Field.Store.YES))
    doc.add(StringField('label', label, Field.Store.YES))
    for item in viewSimilar:
        doc.add(StoredField("viewSimilar", item))
    for item in viewProspective:
        doc.add(StoredField("viewProspective", item))
    return doc
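A hypothetical call sketch; the item ids, label, and recommendation lists are placeholders, and writer is assumed to be an open IndexWriter.

doc = createDocument(
    item_id='item-123',
    label='Blue running shoes',
    viewSimilar=['item-124', 'item-257'],
    viewProspective=['item-980'])
writer.addDocument(doc)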
Example #27
 def index_text(self, sha1, full_text):
     """Index the full text and map it to the source sha1."""
     document = Document()
     document.add(Field("sha1", sha1, ImageIndexer.hash_field))
     if full_text:
         document.add(Field("full_text", full_text, ImageIndexer.text_field))
         self.writer.updateDocument(Term("sha1", sha1), document)
     else:
         logging.info("No text for sha1 %s", sha1)
Example #28
def index_code_snippet(writer):
    HOME = "/extdsk/FaCoY/Git_data/Git_20161108"  #29.06.2015, 03.07.2015, 15.07.2015
    jfiles = java_files_from_dir(HOME)

    N_cores = 4

    # print("Number of Java files to process: %s" % (len(jfiles)))
    source_queue = []

    i = 0
    j = 0

    for jfile in jfiles:
        i += 1
        if i % 1000 == 0:
            print("Counter: %s" % i)
            #break

        document = Document()
        document.add(Field("file", jfile, Field.Store.YES, Field.Index.NO))

        try:
            with codecs.open(jfile, "r", encoding='utf-8') as f:
                file_content = f.read().encode("utf-8")

            #document.add( Field("file_content", compress(file_content), Field.Store.YES, Field.Index.NO) )
            # Check for duplicates files and accumulate source code
            # hash_v =  str(md5(file_content))
            # if hash_v not in hashes:
            # 	source_queue.append((document, file_content))
            # 	hashes.add(hash_v)

            # Wait until source files
            # if len(source_queue) >= N_cores:
            # 	ast_docs = parallize(source_queue)
            # 	source_queue = []

            # 	for ast, file_content, doc in ast_docs:
            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast):
                writer.addDocument(document)
                j += 1
                if j % 1000 == 0:
                    print "Wrote:: %s files" % j

        except Exception as e:
            #traceback.print_exc()
            #print jfile
            print("Error: %s" % e)
            continue

    print "Number of files: %s" % i

    print "Number of duplicates: %s" % len(hashes)

    print("%s files has been indexed" % j)
def create_document_by_document_sentence(org_title, preprocessed_title, doc_id,
                                         sentence):
    doc = Document()  # create a new document
    doc.add(StringField("org_title", org_title, Field.Store.YES))
    doc.add(
        TextField("preprocessed_title", preprocessed_title, Field.Store.YES))
    doc.add(StringField("doc_id", str(doc_id), Field.Store.YES))
    # doc.add(StringField("content", content, Field.Store.YES))
    doc.add(TextField("sentence", sentence, Field.Store.YES))
    return doc
Example #30
    def indexDocs(self, root, writer):

        f = codecs.open('picIndex.txt','r',encoding='utf-8')
        picDict = {}
        for line in f.xreadlines():
            ls = line.split('seg^*')
            url = ls[0]
            title = ls[1] 
            src = ls[2]
            alt = ls[3]
            picDict[src] = [url,title,alt]
        f.close()
        for src in picDict:
            doc = Document()
            doc.add(Field("src", src,
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("url", picDict[src][0],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("title", picDict[src][1],
                                 Field.Store.YES,
                                 Field.Index.NOT_ANALYZED))
            doc.add(Field("alt", picDict[src][2],
                                 Field.Store.YES,
                                 Field.Index.ANALYZED))
            writer.addDocument(doc)
Example #31
def create_minidoc(termstring, field='text'):
    # To store term vectors (used for query expansion) we have to use a custom fieldtype
    customfield = FieldType()
    customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    customfield.setStored(True)
    customfield.setTokenized(True)
    customfield.setStoreTermVectors(True)

    doc = Document()
    doc.add(Field(field, termstring, customfield))
    return doc
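A hedged usage sketch: adding a mini-document built this way and reading its stored term vector back for query expansion. It assumes `writer` is an open IndexWriter on `directory`, that DirectoryReader is imported from org.apache.lucene.index, and that doc id 0 is valid because the index was just created.

writer.addDocument(create_minidoc("expansion candidate terms"))
writer.commit()

reader = DirectoryReader.open(directory)
term_vector = reader.getTermVector(0, 'text')  # Terms object backing query expansion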
Example #32
    def setUp(self):
        super(PhraseQueryTestCase, self).setUp()

        doc = Document()
        doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
        writer = self.getWriter()
        writer.addDocument(doc)
        writer.close()

        self.searcher = self.getSearcher()
        self.builder = PhraseQuery.Builder()
Example #33
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example #34
    def indexDocs(self, root, writer):

        t1 = FieldType() #t1 is used in URL fields
        t1.setIndexed(False)
        t1.setStored(True)
        t1.setTokenized(False)

        t2 = FieldType() #t2 is used to index contents
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        t3 = FieldType() #t3 is used to index titles
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)



        indextxt = open(self.filedir, 'r')


        while True:
            t = indextxt.readline()
            if (len(t) == 0):
                indextxt.close()
                return
            filename = t.strip()
#        for root, dirnames, filenames in os.walk(root):
#            for filename in filenames:
            print "updating", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                title = file.readline()
                print title
                page_URL = file.readline()
                while True:
                    imgsrc = file.readline()
                    if (imgsrc == 'EOF'):
                        file.close()
                        break
                    contents = file.readline()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("imgurl", imgsrc, t1))
                    doc.add(Field("url", page_URL, t1))
                    doc.add(Field("title",title, t3))
                    doc.add(Field("contents", contents, t2))
                    writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example #35
    def setUp(self):
        super(PhraseQueryTestCase, self).setUp()

        doc = Document()
        doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
        writer = self.getWriter()
        writer.addDocument(doc)
        writer.close()

        self.searcher = self.getSearcher()
        self.builder = PhraseQuery.Builder()
def create_index():
    for country in cleaned_dictionary:
        doc = Document()
        doc.add(Field("country", country[0], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("country_html", country[1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital", country[2], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital_html", country[3], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def create_document_by_document_content(org_title, preprocessed_title,
                                        preprocessed_title_lower, content):
    doc = Document()  # create a new document
    doc.add(StringField("org_title", org_title, Field.Store.YES))
    doc.add(
        TextField("preprocessed_title", preprocessed_title, Field.Store.YES))
    doc.add(
        StringField("preprocessed_title_lower", preprocessed_title_lower,
                    Field.Store.YES))
    # doc.add(StringField("content", content, Field.Store.YES))
    doc.add(TextField("content", content, Field.Store.YES))
    return doc
Example #38
    def parseBook(self, filename, t1, t2, iw):
        with open(filename, 'r', errors="ignore") as book:
            lines = book.readlines()
            doc = Document()
            beginindex = endindex = 0
            author = None
            title = None

            for index, text in enumerate(lines):
                if text.startswith("Author:"):
                    author = text[8:]
                    doc.add(Field("author", text[8:], t1))

                if text.startswith("Title: "):
                    title = text[7:]
                    doc.add(Field("title", text[7:], t1))
                elif text.startswith(" Title: "):
                    title = text[8:]
                    doc.add(Field("title", text[8:], t1))

                if text.startswith("*** START OF THIS PROJECT GUTENBERG"):
                    #   extract rest of the text
                    beginindex = index

                if text.startswith("*** END OF THIS PROJECT GUTENBERG"):
                    endindex = index
                    break

            if author == None:
                print("Didnt find author")
                self.authorcount += 1

            if title == None:
                print("Didnt find title")
                self.titlecount += 1

            if title == None and author == None:
                # Skip this book
                return

            text = None
            # Check if indices are correct
            if beginindex == 0 or endindex == 0:
                print(
                    "Skipping book {}\nSomething went wrong when extracting text"
                    .format(filename))
                text = "".join(lines)
                self.errorcount += 1
            else:
                text = "".join(lines[beginindex:endindex])
            doc.add(Field("content", text, t2))

            iw.addDocument(doc)
Example #39
def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()

    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))

    print "the final accuracy is:", final_accuracy
Example #40
    def setUp(self):
        super(TestRegexQuery, self).setUp()

        writer = self.getWriter(analyzer=SimpleAnalyzer(self.TEST_VERSION))
        doc = Document()
        doc.add(
            Field(self.FN, "the quick brown fox jumps over the lazy dog",
                  TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)
        writer.commit()
        writer.close()
        self.searcher = self.getSearcher()
Example #41
 def dummyIndex(self):
     """
     Create a dummy index - to avoid problems updating it
     """
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(self.indexDir, config)
     doc = Document()
     doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
     writer.addDocument(doc)
     writer.commit()
     writer.close()
     return
Example #43
    def get_doc(self, img):
        '''
        Generate a `Document` according to the parameters.

        Input: `img`: dict containing a single image info
        Output: `Document` with the fields initialized
        '''
        doc = Document()
        doc.add(StringField("img_url", img['img_url'], Field.Store.YES))
        doc.add(TextField("description", img['description'], Field.Store.YES))
        doc.add(StringField("url", img['url'], Field.Store.YES))
        doc.add(StringField("url_title", img['url_title'], Field.Store.YES))
        return doc
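A hypothetical sketch of the `img` dict this method expects; the keys follow the fields used above and all values are placeholders.

img = {
    'img_url': 'http://example.com/images/cat.jpg',
    'description': 'a cat sitting on a windowsill',
    'url': 'http://example.com/posts/42',
    'url_title': 'Post 42',
}
writer.addDocument(indexer.get_doc(img))  # indexer and writer assumed to exist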
Example #44
def addDoc(w, text):
    """
    add single doc to the index
    :param w: writer
    :param doc_name:
    :param text:
    :param file_name:
    :return:
    """
    doc = Document()
    # TextField: sequence of terms: tokenized
    doc.add(TextField("text", text, Field.Store.YES))
    w.addDocument(doc)
Example #45
 def _getNewDocument(self, identifier, oldDoc):
     doc = Document()
     doc.add(StringField(IDENTIFIER_FIELD, identifier, Field.Store.YES))
     doc.add(IntField(HASH_FIELD, Partition.hashId(identifier), Field.Store.NO))
     if oldDoc is not None:
         for oldPrefix in oldDoc.getValues(PREFIX_FIELD):
             doc.add(StringField(PREFIX_FIELD, oldPrefix, Field.Store.YES))
         for oldSet in oldDoc.getValues(SETS_FIELD):
             doc.add(StringField(SETS_FIELD, oldSet, Field.Store.YES))
     return doc
Example #46
File: idx.py Project: mkind/crawler
    def add_document(self, url, field, text):
        """
        add a new document to index writer

        input:
            url     the url of the target to be indexed
            field   fieldname of the value that will be indexed
            text    text to be indexed

        """
        doc = Document()
        doc.add(Field('url', url, TextField.TYPE_STORED))
        doc.add(Field(field, text, TextField.TYPE_STORED))
        self.idx_writer.addDocument(doc)
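A hypothetical usage sketch; the instance name, URL, field name, and text are placeholders.

idx.add_document(
    url="http://example.com/",
    field="contents",
    text="text extracted from the crawled page ...")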
Example #47
    def indexDocs(self, root, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            # traverse through the doc directory
            for filename in filenames:
                #	if not filename.endswith('.cdc'):
                #		continue
                try:
                    # only add the filename and path for indexing
                    path = os.path.join(root, filename)
                    print "adding file : ", path
                    file = open(path)
                    contents = unicode(file.read(), 'utf-8')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in ", filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "failed in indexDocs:", e
Example #48
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.txt'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'gbk')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
 def add_document(self, fields, header, id_):
     doc = Document()
     if len(fields) > len(header):
         sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
         for field in fields:
             sys.stderr.write('%s\n' % field)
         return
     for idx, field in enumerate(fields):
         fname, fieldtype = header[idx]
         if fieldtype is IntField:
             field = int(field)
         doc.add(fieldtype(fname, field, Field.Store.YES))
     self.writer.addDocument(doc)
     self.num_docs += 1
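A hedged sketch of the fields/header structure this method expects: header is a list of (field_name, field_class) pairs whose classes are PyLucene field constructors such as IntField, StringField, or TextField (IntField exists in the Lucene 4.x line this snippet appears to target); all names and values are placeholders.

header = [
    ('id',    IntField),
    ('title', TextField),
    ('body',  TextField),
]
fields = ['42', 'A sample title', 'Full body text ...']
indexer.add_document(fields, header, id_='42')  # indexer assumed to be an instance of this class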
    def testScore(self):
        reactor = CallTrace('reactor')
        settings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
        lucene = Lucene(join(self.tempdir, 'lucene'), reactor=reactor, settings=settings)
        document = Document()
        document.add(TextField('field', 'x '*100, Field.Store.NO))
        returnValueFromGenerator(lucene.addDocument(identifier="identifier", document=document))

        q = TermQuery(Term("field", 'x'))
        result = returnValueFromGenerator(lucene.executeQuery(q))
        self.assertAlmostEqual(0.1, result.hits[0].score)

        q.setBoost(10.0)
        result = returnValueFromGenerator(lucene.executeQuery(q))
        self.assertAlmostEqual(1, result.hits[0].score)
Example #51
    def buildDocument(self, fields, record):
        doc = Document()
        doc.add(
            Field("id",
                  record["_id"],
                  self.fieldType2))
        for field in fields:
            if isinstance(record[field], dict):
                self.dictToFields(doc, record[field])
            else:
                doc.add(
                    Field(field,
                          record[field],
                          self.fieldType1))

        return doc
    def testPhraseQueryWithStopAnalyzer(self):

        writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        query = PhraseQuery()
        query.add(Term("field", "stop"))
        query.add(Term("field", "words"))
        scoreDocs = searcher.search(query, None, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
Example #53
    def testNot(self):

        writer = self.getWriter(analyzer=SimpleAnalyzer())

        d1 = Document()
        d1.add(Field("field", "a b", TextField.TYPE_STORED))

        writer.addDocument(d1)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        query = QueryParser("field", SimpleAnalyzer()).parse("a NOT b")

        topDocs = searcher.search(query, 50)
        self.assertEqual(0, topDocs.totalHits)
    def setUp(self):
        super(BooleanOrTestCase, self).setUp()

        # add the doc to a ram index
        writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
        d = Document()
        d.add(Field(self.FIELD_T, "Optimize not deleting all files",
                    TextField.TYPE_STORED))
        d.add(Field(self.FIELD_C,
                    "Deleted When I run an optimize in our production environment.",
                    TextField.TYPE_STORED))

        writer.addDocument(d)
        writer.close()

        self.searcher = self.getSearcher()
Example #55
    def testPhraseQueryWithStopAnalyzer(self):

        writer = self.getWriter(analyzer=StopAnalyzer())
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        searcher = self.getSearcher()

        # valid exact phrase query
        builder = PhraseQuery.Builder()
        builder.add(Term("field", "stop"))
        builder.add(Term("field", "words"))
        scoreDocs = searcher.search(builder.build(), 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
Example #56
def generate_document(path, indexer):
    """ Given a file, convert it into a lucene document that
    is ready to be indexed.

    :param path: The file to add to the search index
    :param indexer: The indexer to operate with
    :returns: The index document for the specified file
    """
    name = FILE_TYPES.get(path.rsplit(".", 1)[-1], "")
    data = open(path, "r").read()

    document = Document()
    document.add(Field("path", path, indexer.field_clean))
    document.add(Field("type", name, indexer.field_clean))
    document.add(Field("data", data, indexer.field_dirty))
    return document
Example #57
    def setUp(self):
        super(Test_Bug1564, self).setUp()

        self.analyzer = StandardAnalyzer()
        writer = self.getWriter(analyzer=self.analyzer)

        doc = Document()
        doc.add(Field('all', u'windowpane beplaster rapacious \
        catatonia gauntlet wynn depressible swede pick dressmake supreme \
        jeremy plumb theoretic bureaucracy causation chartres equipoise \
        dispersible careen heard', TextField.TYPE_NOT_STORED))
        doc.add(Field('id', '1', StoredField.TYPE))

        writer.addDocument(doc)
        writer.commit()
        writer.close()
Example #58
    def prp_index(self):
        '''
        Prepare the index given our "corpus" file(s)
        '''
        print '=> Preparing Lucene index %s' % self._index_dir
        writer = self._get_writer(create=True)
        print '   Currently %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
        num_pages, num_sections = 0, 0
        page_name, section_name = None, None
        num_lines = 0
        for ifname,fname in enumerate(self._filenames):
            print '   Adding lines to index from file #%d: %s' % (ifname, fname)
            with open(fname,'rt') as infile:
                for text in infile:
                    if len(text)==0:
                        print 'Reached EOF'
                        break # EOF
                    # CorpusReader.PAGE_NAME_PREFIX is <Page>
                    # our corpus files were preprocessed so that this tag marks the start of a page
                    if text.startswith(CorpusReader.PAGE_NAME_PREFIX):
                        page_name = text[len(CorpusReader.PAGE_NAME_PREFIX):].strip()
                        section_name = None
                        num_pages += 1
                    elif text.startswith(CorpusReader.SECTION_NAME_PREFIX):
                        section_name = text[len(CorpusReader.SECTION_NAME_PREFIX):].strip()
                        num_sections += 1
                    else:
                        assert (page_name is not None) and (section_name is not None)
                        if self._parser is None:
                            luc_text = text
                        else:
                            # note that in our case we always have a SimpleWordParser
                            section_words = self._parser.parse(text, calc_weights=False) #True)
                            luc_text = ' '.join(section_words)
                        # each whole section is added to the Lucene index; the text is stored and made searchable
                        # the page is not needed here, since documents are added section by section rather than page by page
                        doc = Document()
                        # there is only one field for each document, which is the text field
                        # section_name is not used as a field
                        doc.add(Field("text", luc_text, Field.Store.YES, Field.Index.ANALYZED))
                        writer.addDocument(doc)
                    num_lines += 1
                    if num_lines % 100000 == 0:
                        print '    read %d lines so far: %d pages, %d sections' % (num_lines, num_pages, num_sections)

        print '   Finished - %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
        writer.close()