def testCompressionTools(self):
    """Round-trip binary and string data through CompressionTools via stored fields."""
    # wrap the raw value in a Java byte[] for the binary compression API
    bytes = JArray('byte')(self.binaryValCompressed)
    binaryFldCompressed = StoredField("binaryCompressed", CompressionTools.compress(bytes))
    stringFldCompressed = StoredField("stringCompressed", CompressionTools.compressString(self.binaryValCompressed))

    doc = Document()
    doc.add(binaryFldCompressed)
    doc.add(stringFldCompressed)

    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assert_(docFromReader is not None)

    # fetch the binary compressed field and compare it's content with
    # the original one
    bytes = CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed"))
    binaryFldCompressedTest = bytes.string_
    self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
    self.assertEqual(CompressionTools.decompressString(docFromReader.getBinaryValue("stringCompressed")), self.binaryValCompressed)

    reader.close()
def indexDictionary(d, writer):
    """Index every (filename, content) pair in *d* and return the doc count.

    Filenames are stored untokenized; contents are stored and analyzed.
    (Python 2: iterates with dict.iteritems().)
    """
    for name, body in d.iteritems():
        document = Document()
        document.add(Field('filename', name, Field.Store.YES, Field.Index.NOT_ANALYZED))
        document.add(Field('content', body, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(document)
    return writer.numDocs()
def index(personDB, familyDB, relationDB):
    """Rebuild the Lucene match-text index for all persons and families.

    One document per person and one per family: stored ``uid``/``sex``
    fields plus an unstored full-text ``text`` field built by matchtext().
    Relies on the module-level ``analyzer`` and ``indexDir``.
    """
    #config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
    #?#indexWriter.setRAMBufferSizeMB(50); KOLLA 256

    mt = matchtext()

    # no_cursor_timeout: the full scan may outlive mongo's default cursor timeout
    for p in personDB.find({}, no_cursor_timeout=True):
        matchtxt = mt.matchtextPerson(p, personDB, familyDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(p['_id']), StringField.TYPE_STORED))
        doc.add(Field('sex',str(p['sex']), StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    #Family matchtext
    for f in familyDB.find():
        matchtxt = mt.matchtextFamily(f, familyDB, personDB, relationDB)
        doc = Document()
        doc.add(Field('uid',str(f['_id']), StringField.TYPE_STORED))
        # families are tagged 'FAM' in place of a person sex code
        doc.add(Field('sex','FAM', StringField.TYPE_STORED))
        doc.add(Field("text", matchtxt, TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
    return
def add_doc(self, doc_id, title, txt, add_terms):
    """Add one (id, title, text) document to the index.

    When *add_terms* is true, also store the document's terms and their
    vocabulary indices (-1 for out-of-vocabulary words) in extra fields:
    either the top-TFIDF terms (prm.top_tfidf > 0) or the full text mapped
    through utils.text2idx2.
    """
    doc = Document()
    txt = utils.clean(txt)

    if add_terms:
        if prm.top_tfidf > 0:
            words_idx = []
            words, _ = utils.top_tfidf(txt.lower(), self.idf, prm.top_tfidf, prm.min_term_freq)

            # guarantee at least one term so the joined fields are non-empty
            if len(words) == 0:
                words.append('unk')

            for w in words:
                if w in self.vocab:
                    words_idx.append(self.vocab[w])
                else:
                    words_idx.append(-1) # unknown words.
        else:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

    doc.add(Field("id", str(doc_id), self.t1))
    doc.add(Field("title", title, self.t1))
    doc.add(Field("text", txt, self.t2))

    if add_terms:
        doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
        # '<&>' separator: assumed never to occur inside a term -- TODO confirm
        doc.add(Field("word", '<&>'.join(words), self.t3))

    self.writer.addDocument(doc)
def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
    """
    Add a document to the index. Does this using direct Lucene access.

    :param new_doc: dict of fields with values
    :type new_doc:dict
    :param metadata: ditto
    :type metadata:dict
    :param fields_to_process: only add these fields from the doc dict
    :type fields_to_process:list
    """
    doc = Document()
    total_numTerms=bow_info["total_numterms"]
    # each BOW now comes with its field
    for field in fields_to_process:
        field_object=Field(field, new_doc[field], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)
##            boost=math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        # length-normalizing boost; zero when the document has no terms
        boost=1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        field_object.setBoost(float(boost))
        doc.add(field_object)

    json_metadata=json.dumps(metadata)
    # NOTE(review): `guid` is not defined in this function -- presumably a
    # module-level global, or it should come from metadata; confirm.
    doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
    doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
    doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
def publish_services(self, service_list):
    """Index every WSDL in *service_list* as a bag-of-words document.

    Each service yields one document: an analyzed "content" field with the
    (optionally semantically expanded) token bag, and a stored-only "path"
    field holding the WSDL reference. BM25 similarity is set on the writer.
    """
    transformer = WSDLTransformer()
    current_document = 1
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(
        Version.LUCENE_CURRENT, EnglishAnalyzer(Version.LUCENE_CURRENT))
    writerConfig.setSimilarity(BM25Similarity())
    index_writer = IndexWriter(indexDir, writerConfig)
    for wsdl in service_list:
        if self._document_expansion:
            #bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
            bag_of_words = ' '.join(
                self._semantic_transformer.transform(
                    transformer.transform(wsdl)))
        else:
            #bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
            bag_of_words = ' '.join(transformer.transform(wsdl))
        doc = Document()
        doc.add(
            Field("content", bag_of_words, Field.Store.YES,
                  Field.Index.ANALYZED))
        doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
        index_writer.addDocument(doc)
        current_document += 1
    index_writer.close()
def generate_indices_from_benchmark(writer, counter):
    """Index every Java file under source_path with keywords mined from its AST.

    Files whose AST yields keywords (add_code_keyword_into_document) are
    written to the index; parse/IO errors are reported and the file skipped.
    Python 2 (mixed print statement/function forms).
    """
    javafiles = java_files_from_dir(source_path)
    i = 0  # files seen
    j = 0  # files actually indexed
    for javafile in javafiles:
        # print javafile
        i += 1
        if i % 1000 == 0:  # progress report every 1000 files
            print("Counter: %s" % i)
            print "typed_method_call" + str(counter.typed_method_call_count)
        document = Document()
        document.add(Field("file", javafile, Field.Store.YES, Field.Index.NO))
        try:
            with codecs.open(javafile, "r", encoding='utf-8', errors='ignore') as f:
                file_content = f.read().encode("utf-8", errors='ignore')
                f.close()  # redundant: the with-block closes f anyway
                ast = parse(file_content, resolve=False)
                if add_code_keyword_into_document(document, file_content, ast, counter):
                    writer.addDocument(document)
                    j += 1
                    if j % 1000 == 0:
                        print "Wrote:: %s files" % j
        except Exception as e:
            print("Error: %s" % e)
            continue
    print "Number of files: %s" % i
    print "Number of duplicates: %s" % len(hashes)
    print "%s files has been indexed" % j
def indexDocs(self, root, writer): path = root + "/data/*/*.xml" # print(path) xml_files = glob.glob(path) # xml_files = ["HAM2-031201.xml"] numDocs = 0 for xml in xml_files: try: parser = etree.XMLParser(recover=False, strip_cdata=False) tree = etree.parse(xml, parser=parser) except etree.XMLSyntaxError as e: parser = etree.XMLParser(recover=True, strip_cdata=False) tree = etree.parse(xml, parser=parser) root = tree.getroot() for text in root.iter("TEXT"): contents = "".join(text.xpath("text()")).strip() doc_no = text.getparent().find("DOCNO").text # print("adding", doc_no) try: doc = Document() doc.add(StringField("id", doc_no, Field.Store.YES)) if len(contents) > 0: doc.add( TextField("contents", contents, Field.Store.YES)) else: pass # print("warning: no content in %s" % doc_no) writer.addDocument(doc) numDocs += 1 except Exception as e: print("Failed in indexDocs:", e) return numDocs
def Indexing(self, writer):
    """Index the segmented corpus file line by line, then close the writer.

    Each line is split by process_line() into a searchable "context" field
    (indexed, stored, tokenized) and a stored-only "phrase" field holding
    the part-of-speech annotations. Indexing stops at self.index_limit
    lines unless self.training is set.

    Improvement over the original: the two FieldType objects were rebuilt
    on every input line even though they never change; they are now built
    once before the loop.
    """
    print("Indexing Segmented File [", SEGMENTATION_FILE, "]")

    # context field type: indexed, stored, and tokenized
    fieldtype_context = FieldType()
    fieldtype_context.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    fieldtype_context.setStored(True)
    fieldtype_context.setTokenized(True)
    # phrase field type: stored only
    fieldtype_phrase = FieldType()
    fieldtype_phrase.setStored(True)

    with open(SEGMENTATION_FILE, 'r') as f:
        line_count = 0
        for line in f:
            # split the segmented line into word text and word tags
            processed_context, processed_phrase = self.process_line(line)

            doc = Document()
            # "context" records the article content
            doc.add(Field('context', processed_context, fieldtype_context))
            # "phrase" records the part-of-speech of each word
            doc.add(Field('phrase', processed_phrase, fieldtype_phrase))
            writer.addDocument(doc)

            # progress tracking
            print("\r", str(line_count), " lines", end="", flush=True)
            line_count = line_count + 1
            if line_count > self.index_limit and not self.training:
                break
    writer.close()
    print()
def create_index(storage, paths) :
    """Build a Lucene 4.10 index of sentence windows from every file in *paths*.

    Each file is sentence-tokenized; every third sentence anchors a window
    of up to 5 sentences of context on either side, indexed as one stored,
    analyzed "text" document. Python 2 (print statements).
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    # custom stopword set handed to the analyzer
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords :
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths :
        for filen in os.listdir(path) :
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3) :
                doc = Document()
                # clamp window start at the beginning of the file
                a = i-5 if i-5 > 0 else 0
                sentence = ' '.join(text[a:i+5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path+filen))
    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def build_index(self, dict_data):
    """Index every (id, content) pair in *dict_data*, then commit and close.

    The id field is stored as a single untokenized term; the content field
    is stored and tokenized with positions. A Ticker thread prints progress
    while the commit runs.
    """
    print("loading data...")

    # id field: stored, kept as one untokenized term
    id_type = FieldType()
    id_type.setStored(True)
    id_type.setTokenized(False)
    id_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # content field: stored, tokenized, positions kept for phrase queries
    content_type = FieldType()
    content_type.setStored(True)
    content_type.setTokenized(True)
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for doc_id, content in dict_data.items():
        document = Document()
        document.add(Field("id", doc_id, id_type))
        document.add(Field("content", content, content_type))
        self.writer.addDocument(document)

    ticker = Ticker()
    print("commit index")
    threading.Thread(target=ticker.run).start()
    self.writer.commit()
    self.writer.close()
    ticker.tick = False
    print("done")
def addDocumentToIndex(self, title, text):
    """Add one document with stored, analyzed "Title" and "Text" fields."""
    document = Document()
    for field_name, value in (("Title", title), ("Text", text)):
        document.add(Field(field_name, value, Field.Store.YES, Field.Index.ANALYZED))
    self.writer.addDocument(document)
def create_index():
    """(Re)build the wikipedia index at prm.index_folder, one doc per article text.

    Wipes any existing index directory first. Each article gets a stored,
    analyzed "text" field and a sequential "id". Python 2 (print statements).
    """
    lucene.initVM()
    # start from scratch: remove any previous index
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)
    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)
    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        # id is simply the iteration order
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (
        n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def addDoc(self, writer, text):
    """Add a single stored, tokenized document under self.FIELD_NAME."""
    document = Document()
    document.add(Field(self.FIELD_NAME, text, TextField.TYPE_STORED))
    writer.addDocument(document)
def indexDocs(self, sourceDir, writer):
    """Recursively index every .txt file under *sourceDir*.

    "name" holds the stored, untokenized filename; the file content goes
    into the field named by `queryField` (tokenized with positions, not
    stored). NOTE(review): `queryField` is not defined in this method --
    presumably a module-level global; confirm.
    """
    # t1: stored, untokenized -- used for filenames
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # t2: tokenized with positions, not stored -- searchable content
    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for sourceDir, dirnames, filenames in os.walk(sourceDir):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print(filename)
            try:
                path = os.path.join(sourceDir, filename)
                file = open(path, 'r', encoding="utf-8")
                contents = file.read()
                #contents = str(filecontent, 'utf-8')
                #contents = filecontent.encode('utf-8')
                #print('path', path, len(contents))
                doc = Document()
                doc.add(Field("name", filename, t1)) # filename (title)
                #doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field(queryField, contents, t2)) # content
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
                file.close()
            except NameError:
                print("Failed in indexDocs:")
def create_index(storage, paths):
    """Build a Lucene 4.10 index of sentence windows from every file in *paths*.

    Same scheme as its sibling: every third sentence anchors a window of up
    to 5 sentences of context each side, indexed as one stored, analyzed
    "text" document. Python 2 (print statements).
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    # custom stopword set handed to the analyzer
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                # clamp window start at the beginning of the file
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(
                    Field("text", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def index_docs(self, tweets, writer):
    """Index tweet texts with term vectors, dropping URL tokens.

    Tweets whose text is empty after URL removal are skipped. Python 2 only
    (uses the `except Exception, e` syntax).
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # keep term vectors and offsets (e.g. for highlighting / similarity)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            contents = []
            for word in tweet[1].text.split():
                if word.startswith("http://") or word.startswith("https://"):
                    continue
                contents.append(word)
            contents = " ".join(contents)
            if len(contents) == 0:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in index_docs:", e
def create_index():
    """(Re)build the wikipedia index at prm.index_folder, one doc per article text.

    Wipes any existing index directory first. Each article gets a stored,
    analyzed "text" field and a sequential "id". Python 2 (print statements).
    """
    lucene.initVM()
    # start from scratch: remove any previous index
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)
    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)
    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        # id is simply the iteration order
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def build_index(document_path, dir_path):
    """Index a music-tag listing: each input line is "<path> <tag1,tag2,...>".

    Tags become a stored, tokenized "content" field; the path becomes a
    stored, untokenized "url" field. The index at *dir_path* is recreated.
    """
    lucene.initVM()
    directory = SimpleFSDirectory(Paths.get(dir_path))
    config = IndexWriterConfig(StandardAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(directory, config)

    # tag text: stored and tokenized
    content_type = FieldType()
    content_type.setStored(True)
    content_type.setTokenized(True)
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # music path: stored verbatim, never tokenized
    url_type = FieldType()
    url_type.setStored(True)
    url_type.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            parts = line.strip().split(" ")
            music_path = parts[0]
            music_tags = parts[1].split(",")
            doc = Document()
            doc.add(Field("content", " ".join(music_tags), content_type))
            doc.add(Field("url", music_path, url_type))
            writer.addDocument(doc)
    writer.close()
def indexDocs(self, root, writer):
    """Index alternating name/contents line pairs read from args["inputFile"].

    The input format alternates one name line with one contents line; the
    loop stops at the first empty name line. Python 2 (print statements).
    """
    # t1: stored, untokenized -- names
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # t2: stored, tokenized with positions -- contents
    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    inFile = open(str(args["inputFile"]))
    indexName = inFile.readline()
    while (indexName != ''):
        print "adding", indexName
        doc = Document()
        doc.add(Field("name", indexName, t1))
        #doc.add(Field("path", root, t1))
        text = inFile.readline()
        if (len(text) > 0):
            print("contents: %s\n" % text)
            doc.add(Field("contents", text, t2))
        else:
            print "warning: no content in %s" % indexName
        # advance to the next name before adding, so the loop test sees it
        indexName = inFile.readline()
        writer.addDocument(doc)
    inFile.close()
def testCompressionTools(self):
    """Round-trip binary and string data through CompressionTools via stored fields."""
    # wrap the raw value in a Java byte[] for the binary compression API
    bytes = JArray('byte')(self.binaryValCompressed)
    binaryFldCompressed = StoredField("binaryCompressed", CompressionTools.compress(bytes))
    stringFldCompressed = StoredField(
        "stringCompressed",
        CompressionTools.compressString(self.binaryValCompressed))

    doc = Document()
    doc.add(binaryFldCompressed)
    doc.add(stringFldCompressed)

    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer())
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assertTrue(docFromReader is not None)

    # fetch the binary compressed field and compare it's content with
    # the original one
    bytes = CompressionTools.decompress(
        docFromReader.getBinaryValue("binaryCompressed"))
    binaryFldCompressedTest = bytes.string_
    self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
    self.assertEqual(
        CompressionTools.decompressString(
            docFromReader.getBinaryValue("stringCompressed")),
        self.binaryValCompressed)

    reader.close()
def addDocument(self, writer, new_doc, metadata, fields_to_process,
                bow_info):
    """
    Add a document to the index. Does this using direct Lucene access.

    :param new_doc: dict of fields with values
    :type new_doc:dict
    :param metadata: ditto
    :type metadata:dict
    :param fields_to_process: only add these fields from the doc dict
    :type fields_to_process:list
    """
    doc = Document()
    total_numTerms = bow_info["total_numterms"]
    # each BOW now comes with its field
    for field in fields_to_process:
        field_object = Field(field, new_doc[field], Field.Store.NO,
                             Field.Index.ANALYZED, Field.TermVector.YES)
##            boost=math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        # length-normalizing boost; zero when the document has no terms
        boost = 1 / float(
            math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        field_object.setBoost(float(boost))
        doc.add(field_object)

    json_metadata = json.dumps(metadata)
    # NOTE(review): `guid` is not defined in this function -- presumably a
    # module-level global, or it should come from metadata; confirm.
    doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(
        Field("bow_info", json.dumps(bow_info), Field.Store.YES,
              Field.Index.NO))
    doc.add(
        Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
    doc.add(
        Field("year_from", metadata["year"], Field.Store.YES,
              Field.Index.ANALYZED))
    writer.addDocument(doc)
def addDoc(w, data):
    """Build a Document from *data* and add it to writer *w*.

    *data* maps field name -> (value, type_name). Supported type names:
    StringField, TextField, CUSTOM_FIELD_TEXT, CUSTOM_FIELD_TEXT_NOT_STORED,
    INTEGER_STORED; anything else prints 'UNKNOWN FIELD' and is skipped
    (the document is still added). On addDocument failure, dumps every
    field/value to stdout for diagnosis.

    Fixes over the original: the bare `except:` (which also swallowed
    SystemExit/KeyboardInterrupt) is narrowed to `except Exception`, and
    the local no longer shadows the builtin `type`.
    """
    doc = Document()
    for field in data:
        value, ftype = data[field][0], data[field][1]
        if ftype == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif ftype == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif ftype == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif ftype == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif ftype == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except Exception:  # was a bare except: -- keep interrupts raisable
        #print ('error cat=%s'%(data['category'][0]))
        print('-----------------------------------')
        for field in data:
            value, ftype = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
def index (cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer from org.apache.lucene.util import Version config = IndexWriterConfig(Version.LUCENE_42, WhitespaceAnalyzer(Version.LUCENE_42)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # FacetFields is a utility class for adding facet fields to a document: facet_fields = FacetFields(taxo) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [CategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List: facetList = Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # use the FacetFields utility class for adding facet fields (i.e. the categories) # to the document (and, as required, to the taxonomy index) facet_fields.addFields(doc, facetList) # finally add the document to the index iw.addDocument(doc) nDocsAdded +=1 nFacetsAdded += facetList.size() # end for # commit changes. 
# we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
def rewrite(data_string):
    """Replace the document whose primary keys match *data_string* with an updated copy.

    Applies the field changes from the module-level `update` JSON, deletes the
    old document, optionally snappy-compresses the payload, and re-adds it
    under the "$DATA$" field. Returns 101 when a primary key is missing from
    the data, 106 when the updated primary keys would collide with an
    existing document. Relies on module globals: update, primary_keys_map,
    analyzer, searcher, writer, add_field_if_not_exists,
    to_be_compressed_input, MAX_RESULTS.
    """
    data=json.loads(data_string)
    toupdate=json.loads(update)
    #primary_key_modified=False

    #delete the appropriate document
    query=BooleanQuery()
    for key in primary_keys_map:
        temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
        query.add(BooleanClause(temp,BooleanClause.Occur.MUST))

    #modify the values
    for key,value in toupdate.items():
        #if such a key is not present the we either add and update that key into data,or just ignore it!(By default it is set to True!)
        if add_field_if_not_exists==False:
            if key in data.keys():
                data[key]=value
        else:
            data[key]=value

    #this deletion statement has been intenstionally added here
    #only if the modified data,has primary keys already not existing,will the updating process continue
    primary_key_update=False
    for key in toupdate.keys():
        if key in primary_keys_map:
            primary_key_update=True
            break
    if primary_key_update == True:
        # the update touches a primary key: make sure the new key combination
        # does not already exist before destroying the old document
        query_search=BooleanQuery()
        for key in primary_keys_map:
            temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
            query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
        hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    writer.deleteDocuments(query)

    #add the newly modified document
    doc=Document()
    #index files wrt primary key
    for primary_key in primary_keys_map:
        try:
            field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
            doc.add(field)
        except:
            # primary_keys_map.pop(collection_name)
            return 101
    #compress data using snappy if compression is on
    if to_be_compressed_input==True:
        temp=json.dumps(data)
        data_string=base64.b64encode(snappy.compress(temp))
    else:
        temp=json.dumps(data)
        data_string=base64.b64encode(temp)
    field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
def create_document(file_name):
    """Return a Document for ./alldocs/<file_name>: stored title plus full text.

    Bug fix: the title field previously referenced the undefined name
    `input_file`; it now uses the `file_name` parameter. The file handle is
    closed via a context manager even if read() raises.
    """
    path = './alldocs/' + file_name
    doc = Document()
    doc.add(StringField("title", file_name, Field.Store.YES))
    with open(path) as handle:
        doc.add(TextField("text", handle.read(), Field.Store.YES))
    return doc
def _addDocument(self, identifier, isformatof, sort=None):
    """Index the full text and map it to the source sha1. -- add one doc with optional numeric doc-values, then commit.

    __isformatof__ and __sort__ are NumericDocValuesFields (py2 `long`
    coercion). `consume(...)` presumably drains the generator returned by
    the addDocument call -- confirm against its definition.
    """
    doc = Document()
    if isformatof:
        doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
    if sort:
        doc.add(NumericDocValuesField("__sort__", long(sort)))
    consume(self.lucene.addDocument(identifier, doc))
    self.lucene.commit()  # Explicitly, not required: since commitCount=1.
def addDocument(self, id):
    """Index the pre-stored training answer for *id* and commit immediately.

    Stores the answer text under "pa" and the id (as an exact term) under
    "id". Reads from the module-level answers_train list.
    """
    global answers_train
    answer_text = answers_train[id]
    document = Document()
    document.add(TextField("pa", answer_text, Field.Store.YES))
    document.add(StringField("id", str(id), Field.Store.YES))
    self.w.addDocument(document)
    self.w.commit()
def luceneindex(text):
    """Index each line of *text* as its own stored, analyzed document.

    NOTE(review): uses the module-level `writer` and closes it at the end,
    so this can only be called once per writer instance -- confirm.
    """
    for n, l in enumerate(text):
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    #print( "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs()))
    #print( "Closing index of %d docs..." % writer.numDocs())
    writer.close()
def index_code_snippet(writer): HOME = "/Users/Raphael/Downloads/GitArchive" #29.06.2015, 03.07.2015, 15.07.2015 jfiles = java_files_from_dir(HOME) N_cores = 4 # print("Number of Java files to process: %s" % (len(jfiles))) source_queue = [] i = 0 j = 0 for jfile in jfiles: i += 1 if i % 1000 == 0: print("Counter: %s" % i) break document = Document() document.add(Field("file", jfile, Field.Store.YES, Field.Index.NO)) try: with codecs.open(jfile, "r", encoding='utf-8') as f: file_content = f.read().encode("utf-8") document.add( Field("file_content", compress(file_content), Field.Store.YES, Field.Index.NO)) # Check for duplicates files and accumulate source code # hash_v = str(md5(file_content)) # if hash_v not in hashes: # source_queue.append((document, file_content)) # hashes.add(hash_v) # Wait until source files # if len(source_queue) >= N_cores: # ast_docs = parallize(source_queue) # source_queue = [] # for ast, file_content, doc in ast_docs: ast = parse(file_content, resolve=False) if add_code_keyword_into_document(document, file_content, ast): writer.addDocument(document) j += 1 if j % 1000 == 0: print "Wrote:: %s files" % j except Exception as e: #traceback.print_exc() #print jfile print("Error: %s" % e) continue print "Number of files: %s" % i print "Number of duplicates: %s" % len(hashes) print("%s files has been indexed" % j)
def create_document(self, contents):
    """Create a Lucene document from the specified contents.

    Contents is a list of fields to be indexed, represented as a dictionary
    with keys 'field_name', 'field_type', and 'field_value'.
    """
    document = Document()
    for spec in contents:
        lucene_field = Field(spec['field_name'], spec['field_value'],
                             self.ldf.get_field(spec['field_type']))
        document.add(lucene_field)
    return document
def index_text(self, sha1, full_text):
    """Index the full text and map it to the source sha1."""
    if not full_text:
        # nothing to index for this image
        logging.info("No text for sha1 %s", sha1)
        return
    document = Document()
    document.add(Field("sha1", sha1, ImageIndexer.hash_field))
    document.add(Field("full_text", full_text, ImageIndexer.text_field))
    # updateDocument replaces any previous doc carrying the same sha1 term
    self.writer.updateDocument(Term("sha1", sha1), document)
def getDocument(fname):
    """Return a Document holding the file's basename (untokenized) and its full content.

    Fix: the original's `open(fname).read()` leaked the file handle; a
    context manager now guarantees it is closed.
    """
    with open(fname) as handle:
        content = handle.read()
    doc = Document()
    doc.add(
        Field('filename', os.path.split(fname)[-1],
              Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(
        Field('content', content,
              Field.Store.YES, Field.Index.ANALYZED))
    return doc
def _insertDoc(self, writer, content):
    """Insert one document with a sequential stored id and unstored content,
    then advance the counter."""
    document = Document()
    doc_id = "id" + str(self.docCount)
    document.add(Field("id", doc_id, StringField.TYPE_STORED))
    document.add(Field("content", content, TextField.TYPE_NOT_STORED))
    writer.addDocument(document)
    self.docCount += 1
def create_document(file_name):
    """Return a Document for INPUT_DIR/<file_name>: stored title plus full text.

    Bug fix: the title field previously referenced the undefined name
    `input_file`; it now uses the `file_name` parameter. The file handle is
    closed via a context manager even if read() raises.
    """
    path = INPUT_DIR + file_name  # assemble the file descriptor
    doc = Document()  # create a new document
    # add the title field
    doc.add(StringField("title", file_name, Field.Store.YES))
    # add the whole book
    with open(path) as book:
        doc.add(TextField("text", book.read(), Field.Store.YES))
    return doc
def index_files(): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() BASE_DIR = path.dirname(path.abspath(sys.argv[0])) INPUT_DIR = BASE_DIR + "/input/" INDEX_DIR = BASE_DIR + "/lucene_index/" NoT = 100000 # Number of Tokens print "------------------------------------------------------" print "PyLucene Demo started (lucene_demo.py)" print "Python version: %d.%d.%d" % ( sys.version_info.major, sys.version_info.minor, sys.version_info.micro) print 'Lucene version:', lucene.VERSION print "------------------------------------------------------\n" # lucene.initVM() # directory = RAMDirectory() index_path = Paths.get(INDEX_DIR) directory = SimpleFSDirectory(index_path) analyzer = StandardAnalyzer() analyzer = LimitTokenCountAnalyzer(analyzer, NoT) config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, config) print "Number of indexed documents: %d\n" % writer.numDocs() for input_file in listdir(INPUT_DIR): # iterate over all input files print "Current file:", input_file if input_file.endswith(".json"): with open(INPUT_DIR + input_file) as f: for line in f: # doc = create_document(line, input_file) # call the create_document function o = json.loads(line) doc = Document() # create a new document doc.add(TextField("filename", input_file, Field.Store.YES)) # print file doc.add( TextField("username", o['user']['screen_name'], Field.Store.YES)) # print "username: "******"text", o['text'], Field.Store.YES)) # print "text: " + o['text'] if o['user']['location']: doc.add( TextField("location", o['user']['location'], Field.Store.YES)) # print "location: " + o['user']['location'] doc.add(TextField("time", o['created_at'], Field.Store.YES)) writer.addDocument( doc) # add the document to the IndexWriter print "\nNumber of indexed documents: %d" % writer.numDocs() writer.close() print "Finished\n" print "-----------------------------------------------------"
def indexer(docNumber, docText):
    """Add a single (docNumber, docText) document to the index at index/.

    NOTE(review): calls lucene.initVM() and closes the writer on every call,
    so the index is re-opened per document -- confirm this is intended.
    Python 2 (print statement).
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def setUp(self):
    """Index one five-term document, then prepare the searcher and query builder."""
    super(PhraseQueryTestCase, self).setUp()

    writer = self.getWriter()
    document = Document()
    document.add(Field("field", "one two three four five", TextField.TYPE_STORED))
    writer.addDocument(document)
    writer.close()

    self.searcher = self.getSearcher()
    self.builder = PhraseQuery.Builder()
def create_minidoc(termstring, field='text'):
    """Wrap *termstring* in a one-field document whose field keeps term
    vectors (needed later for query expansion), stored and tokenized."""
    vector_type = FieldType()
    vector_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    vector_type.setStored(True)
    vector_type.setTokenized(True)
    vector_type.setStoreTermVectors(True)

    minidoc = Document()
    minidoc.add(Field(field, termstring, vector_type))
    return minidoc
def dummyIndex(self): """ Create a dummy index - to avoid problems updating it """ config = IndexWriterConfig(self.analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) writer = IndexWriter(self.indexDir, config) doc = Document() doc.add(Field('uid', 'dummy', StringField.TYPE_STORED)) writer.addDocument(doc) writer.commit() writer.close() return
def addDoc(w, text): """ add single doc to the index :param w: writer :param doc_name: :param text: :param file_name: :return: """ doc = Document() # TextField: sequence of terms: tokenized doc.add(TextField("text", text, Field.Store.YES)) w.addDocument(doc)
def add_document(self, fields, header, id_):
    """Add one row as a document, typing each value per the (name, type) header.

    Rows with MORE values than the header are skipped with a diagnostic on
    stderr. NOTE(review): rows with FEWER values pass the guard and simply
    omit the trailing fields -- confirm that asymmetry is intended.
    """
    doc = Document()
    if len(fields) > len(header):
        sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
        for field in fields:
            sys.stderr.write('%s\n' % field)
        return
    for idx, field in enumerate(fields):
        fname, fieldtype = header[idx]
        # IntField values must be actual ints, not strings
        if fieldtype is IntField:
            field = int(field)
        doc.add(fieldtype(fname, field, Field.Store.YES))
    self.writer.addDocument(doc)
    self.num_docs += 1
def index(indexdir):
    """Index title/authors/abstract/id from each record of docid.documento-xml.txt.

    Each input line is "<id>\\t<xml>"; missing TITLE/AUTHORS/ABSTRACT
    elements default to the empty string. Python 2 (print statement).
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    f = open('data/docid.documento-xml.txt')
    st = PorterStemmer()  # NOTE(review): created but never used here -- confirm
    for i, line in enumerate(f.readlines()):
        id, xmltext = line.split('\t')
        xmltext = xmltext.rstrip('\n')
        xmldoc = minidom.parseString(xmltext)
        title = xmldoc.getElementsByTagName("TITLE")
        title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
        authors = xmldoc.getElementsByTagName("AUTHORS")
        authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
        abstract = xmldoc.getElementsByTagName("ABSTRACT")
        abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
        # id is kept as an exact, unanalyzed term
        doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
        print "indexed %s docs" % (i+1)

    writer.close()
def add_document(self, url, field, text):
    """Add a new document to the index writer.

    :param url: the url of the target to be indexed (stored in 'url')
    :param field: fieldname under which *text* will be indexed
    :param text: text to be indexed
    """
    doc = Document()
    # Both fields are stored and tokenized.
    for fname, value in (('url', url), (field, text)):
        doc.add(Field(fname, value, TextField.TYPE_STORED))
    self.idx_writer.addDocument(doc)
def setUp(self):
    """Index four single-term docs and prepare shared query-thread state."""
    super(PyLuceneThreadTestCase, self).setUp()

    self.classLoader = Thread.currentThread().getContextClassLoader()

    writer = self.getWriter(analyzer=StandardAnalyzer())
    # One document per term; 'one' appears twice so it matches two docs.
    for term in ('one', 'two', 'three', 'one'):
        doc = Document()
        doc.add(Field("field", term, TextField.TYPE_STORED))
        writer.addDocument(doc)
    writer.commit()
    writer.close()

    # (term, expected hit count) pairs, repeated to drive many queries.
    self.testData = [('one', 2), ('two', 1), ('three', 1), ('five', 0)] * 500
    self.lock = threading.Lock()
    self.totalQueries = 0
def indexDocs(self, root, writer):
    """Read picIndex.txt ('seg^*'-delimited records) and index one doc per image.

    Record layout per line: url, title, src, alt. 'src' is used as the key;
    url/title/src are stored unanalyzed, alt is analyzed for searching.

    Fixes: replaced the deprecated Python-2-only xreadlines() with plain
    iteration and made the close() robust to mid-read exceptions.
    """
    picDict = {}
    f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
    try:
        for line in f:
            parts = line.split('seg^*')
            # parts: [url, title, src, alt]; keyed by src as in the original.
            picDict[parts[2]] = [parts[0], parts[1], parts[3]]
    finally:
        f.close()

    for src in picDict:
        url, title, alt = picDict[src]
        doc = Document()
        doc.add(Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("alt", alt, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def indexing(self, root = settings.ADMINS_ENGINE.mDocumentDirectory, parent = None, docID = 1, parentID = 0, id = 0):
    """Recursively index the command files under *root*.

    For each file, a document with name/parent/id/parentID (and content,
    when non-empty) fields is added. A sibling directory named
    '<file>.sub' is recursed into as that entry's sub-commands. The
    outermost call (id == 0) commits and closes the writer and flushes
    the debug log.

    Fixes: `parent` previously defaulted to a mutable list `[]` shared
    across calls (classic Python pitfall); it now defaults to None. The
    content file is also read via a context manager so it always closes.

    :param root: directory to scan (defaults to the configured doc dir)
    :param parent: ancestor command names; None means top level
    :param docID: next document id to assign
    :param parentID: id of the enclosing command's document
    :param id: recursion depth; 0 marks the outermost call
    :return: the next unused docID
    """
    if parent is None:
        parent = []

    realPath = os.path.abspath(root)
    for i in os.listdir(realPath):
        path = os.path.join(realPath, i)
        if os.path.isfile(path):
            # Index this file.
            doc = Document()
            doc.add(Field('name', ("%s %s" % (' '.join(parent), i)).strip(), self.mFieldTypes['name']))
            doc.add(Field('parent', ' '.join(parent), self.mFieldTypes['parent']))
            doc.add(Field('id', str(docID), self.mFieldTypes['id']))
            doc.add(Field('parentID', str(parentID), self.mFieldTypes['id']))

            with open(path, 'r') as fd:
                content = fd.read()
            if len(content) > 0:
                doc.add(Field('content', content, self.mFieldTypes['content']))

            self.mIndexWriter.addDocument(doc)

            ##################### Logging ##############################
            if IS_DEBUG:
                nameDebug = AnalyzerDebug.debug(self.mAnalyzers['name'], ("%s %s" % (' '.join(parent), i)).strip())
                parentDebug = AnalyzerDebug.debug(self.mAnalyzers['parent'], ' '.join(parent))
                contentDebug = AnalyzerDebug.debug(self.mAnalyzers['default'], content)
                self.mLog = self.mLog + ( "File %s\n {name - %s}: %s\n {parent - %s}: %s\n {content}: %s\n\n" % (path, docID, nameDebug, parentID, parentDebug, contentDebug) )

            docID = docID + 1

        # Index sub commands found in a sibling '<entry>.sub' directory.
        # NOTE(review): collapsed source leaves the exact nesting of this
        # check ambiguous; it is assumed to run for every directory entry.
        if os.path.isdir(path + ".sub"):
            parent.append(i)
            docID = self.indexing(path + ".sub", parent, docID, docID - 1, id + 1)
            parent.pop()

    if id == 0:
        # Outermost call: finalize the index and flush the debug log.
        self.mIndexWriter.commit()
        self.mIndexWriter.close()
        if IS_DEBUG:
            loggingBot = LoggingBot(self.mLog, settings.ADMINS_ENGINE.getIndexingLogQueue())
            loggingBot.start()
            self.mLog = ""
    return docID
def testScore(self):
    """With TermFrequencySimilarity, a term hit scores ~0.1 and a 10x
    query boost scales that linearly to ~1."""
    reactor = CallTrace('reactor')
    settings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
    lucene = Lucene(join(self.tempdir, 'lucene'), reactor=reactor, settings=settings)

    doc = Document()
    doc.add(TextField('field', 'x ' * 100, Field.Store.NO))
    returnValueFromGenerator(lucene.addDocument(identifier="identifier", document=doc))

    query = TermQuery(Term("field", 'x'))
    result = returnValueFromGenerator(lucene.executeQuery(query))
    self.assertAlmostEqual(0.1, result.hits[0].score)

    # Boosting the query by 10 should multiply the score by 10.
    query.setBoost(10.0)
    result = returnValueFromGenerator(lucene.executeQuery(query))
    self.assertAlmostEqual(1, result.hits[0].score)
def create_index():
    """Index every record of cleaned_dictionary.

    Each record holds (country, country_html, capital, capital_html) in
    that order; all four are stored and analyzed.
    """
    field_names = ("country", "country_html", "capital", "capital_html")
    for record in cleaned_dictionary:
        doc = Document()
        for fname, value in zip(field_names, record):
            doc.add(Field(fname, value, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def setUp(self):
    """Index a single two-field document and open a searcher over it."""
    super(BooleanOrTestCase, self).setUp()

    # Add the doc to a ram index.
    writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
    doc = Document()
    doc.add(Field(self.FIELD_T, "Optimize not deleting all files", TextField.TYPE_STORED))
    doc.add(Field(self.FIELD_C, "Deleted When I run an optimize in our production environment.", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = self.getSearcher()
def testNot(self):
    """'a NOT b' must exclude a document containing both terms."""
    writer = self.getWriter(analyzer=SimpleAnalyzer())
    doc = Document()
    doc.add(Field("field", "a b", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    parsed = QueryParser("field", SimpleAnalyzer()).parse("a NOT b")
    self.assertEqual(0, searcher.search(parsed, 50).totalHits)
def generate_document(path, indexer):
    """ Given a file, convert it into a lucene document that
    is ready to be indexed.

    :param path: The file to add to the search index
    :param indexer: The indexer to operate with
    :returns: The index document for the specified camera

    Fix: the file handle was previously leaked (open(...).read());
    it is now closed deterministically via a context manager.
    """
    name = FILE_TYPES.get(path.rsplit(".", 1)[-1], "")
    with open(path, "r") as handle:
        data = handle.read()

    document = Document()
    document.add(Field("path", path, indexer.field_clean))
    document.add(Field("type", name, indexer.field_clean))
    document.add(Field("data", data, indexer.field_dirty))
    return document
def testPhraseQueryWithStopAnalyzer(self):
    """An exact two-term phrase must match after stop-word analysis."""
    writer = self.getWriter(analyzer=StopAnalyzer())
    doc = Document()
    doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()

    searcher = self.getSearcher()

    # Valid exact phrase query: "stop words".
    builder = PhraseQuery.Builder()
    for word in ("stop", "words"):
        builder.add(Term("field", word))
    hits = searcher.search(builder.build(), 50).scoreDocs
    self.assertEqual(1, len(hits))
def setUp(self):
    """Index one document with a long unstored text field and a stored id."""
    super(Test_Bug1564, self).setUp()

    self.analyzer = StandardAnalyzer()
    writer = self.getWriter(analyzer=self.analyzer)

    doc = Document()
    doc.add(Field('all', u'windowpane beplaster rapacious \
catatonia gauntlet wynn depressible swede pick dressmake supreme \
jeremy plumb theoretic bureaucracy causation chartres equipoise \
dispersible careen heard', TextField.TYPE_NOT_STORED))
    doc.add(Field('id', '1', StoredField.TYPE))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
def testPhraseQueryWithStopAnalyzer(self):
    """An exact two-term phrase must match after stop-word analysis
    (legacy PhraseQuery/search API variant)."""
    writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
    doc = Document()
    doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()

    searcher = self.getSearcher()

    # Valid exact phrase query: "stop words".
    query = PhraseQuery()
    for word in ("stop", "words"):
        query.add(Term("field", word))
    matches = searcher.search(query, None, 50).scoreDocs
    self.assertEqual(1, len(matches))