def addDoc(self, writer, text):
    d = Document()
    f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)
    d.add(f)
    writer.addDocument(d)
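# A minimal wiring sketch for the addDoc helper above -- an assumption, not part of the
# original snippet: the class name TinyIndexer, the FIELD_NAME value, the index path and
# the PyLucene 7+ style imports are all made up for illustration.
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

class TinyIndexer(object):
    FIELD_NAME = "contents"  # hypothetical field name

    def addDoc(self, writer, text):
        # same pattern as above: one stored, analyzed text field per document
        d = Document()
        d.add(Field(self.FIELD_NAME, text, TextField.TYPE_STORED))
        writer.addDocument(d)

if __name__ == '__main__':
    lucene.initVM()
    directory = SimpleFSDirectory(Paths.get("/tmp/tiny-index"))  # hypothetical location
    writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
    TinyIndexer().addDoc(writer, "hello lucene")
    writer.close()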
def index(indexdir):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    f = open('data/docid.documento-xml.txt')
    st = PorterStemmer()
    for i, line in enumerate(f.readlines()):
        id, xmltext = line.split('\t')
        xmltext = xmltext.rstrip('\n')
        xmldoc = minidom.parseString(xmltext)
        title = xmldoc.getElementsByTagName("TITLE")
        title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
        authors = xmldoc.getElementsByTagName("AUTHORS")
        authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
        abstract = xmldoc.getElementsByTagName("ABSTRACT")
        abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    print "indexed %s docs" % (i + 1)
    writer.close()
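# For reference, a hedged guess at what one line of data/docid.documento-xml.txt looks like,
# inferred from the parsing above (tab-separated doc id plus a single-line XML record); the
# actual tag layout and id format are assumptions.
line = 'doc-001\t<DOC><TITLE>A study of indexing</TITLE><AUTHORS>J. Doe</AUTHORS><ABSTRACT>Short abstract.</ABSTRACT></DOC>\n'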
def get_doc(self, filename, path, title, url, contents):
    '''
    Generate a `Document` according to the parameters.
    Input:
        `filename`: filename of the webpage
        `path`: path of the webpage
        `title`: title of the webpage
        `url`: original url of the webpage
        `contents`: contents of the webpage
    Output:
        `Document` with the fields initialized
    '''
    doc = Document()
    # doc.add(Field("name", filename, self.property_type))
    # doc.add(Field("path", path, self.property_type))
    # doc.add(Field("title", title, self.property_type))
    # doc.add(Field("url", url, self.property_type))
    doc.add(StringField("name", filename, Field.Store.YES))
    doc.add(StringField("path", path, Field.Store.YES))
    doc.add(TextField("title", title, Field.Store.YES))
    doc.add(TextField("url", url, Field.Store.YES))
    if len(contents) > 0:
        # doc.add(Field("contents", contents, self.content_type))
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(filename))
    return doc
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def indexDocs(self, root, writer):
    f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
    picDict = {}
    for line in f.xreadlines():
        ls = line.split('seg^*')
        url = ls[0]
        title = ls[1]
        src = ls[2]
        alt = ls[3]
        picDict[src] = [url, title, alt]
    f.close()
    for src in picDict:
        doc = Document()
        doc.add(Field("src", src, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("url", picDict[src][0], Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("title", picDict[src][1], Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("alt", picDict[src][2], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    index_file = open("index.txt", 'r')
    for line in index_file.readlines():
        try:
            src = line.strip().split('\t')[0]
            filename = line.strip().split('\t')[1]
            tag = line.strip().split('\t')[2]
            path = os.path.join(root, filename)

            doc = Document()
            doc.add(Field("name", filename, t1))
            doc.add(Field("path", root, t1))
            doc.add(Field("src", src, t1))
            if len(tag) > 0:
                doc.add(Field("tag", tag, t2))
            else:
                print "warning: no tag in %s" % filename
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def create_index():
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(False)
    t3.setTokenized(True)  # tokenized by the analyzer configured on the writer; here the terms are whitespace-separated
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    total = 0
    file = open(root, "r")
    for line in file.readlines():
        try:
            imgurl, itemurl, content = line.split('\t')
            total += 1
            print total
            print "adding", content
            contents = ' '.join(jieba.cut(content))
            doc = Document()
            doc.add(Field("imgurl", imgurl, t1))
            doc.add(Field("itemurl", itemurl, t1))
            doc.add(Field("title", content, t1))
            doc.add(Field("contents", contents, t3))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def testCompressionTools(self): bytes = JArray('byte')(self.binaryValCompressed) binaryFldCompressed = StoredField("binaryCompressed", CompressionTools.compress(bytes)) stringFldCompressed = StoredField("stringCompressed", CompressionTools.compressString(self.binaryValCompressed)) doc = Document() doc.add(binaryFldCompressed) doc.add(stringFldCompressed) # add the doc to a ram index writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)) writer.addDocument(doc) writer.close() # open a reader and fetch the document reader = self.getReader() docFromReader = reader.document(0) self.assert_(docFromReader is not None) # fetch the binary compressed field and compare it's content with # the original one bytes = CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed")) binaryFldCompressedTest = bytes.string_ self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed) self.assertEqual(CompressionTools.decompressString(docFromReader.getBinaryValue("stringCompressed")), self.binaryValCompressed) reader.close()
def addDocumentToIndex(self, title, text):
    doc = Document()
    doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))
    self.writer.addDocument(doc)
def testCompressionTools(self): bytes = JArray('byte')(self.binaryValCompressed) binaryFldCompressed = StoredField("binaryCompressed", CompressionTools.compress(bytes)) stringFldCompressed = StoredField( "stringCompressed", CompressionTools.compressString(self.binaryValCompressed)) doc = Document() doc.add(binaryFldCompressed) doc.add(stringFldCompressed) # add the doc to a ram index writer = self.getWriter(analyzer=StandardAnalyzer()) writer.addDocument(doc) writer.close() # open a reader and fetch the document reader = self.getReader() docFromReader = reader.document(0) self.assertTrue(docFromReader is not None) # fetch the binary compressed field and compare it's content with # the original one bytes = CompressionTools.decompress( docFromReader.getBinaryValue("binaryCompressed")) binaryFldCompressedTest = bytes.string_ self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed) self.assertEqual( CompressionTools.decompressString( docFromReader.getBinaryValue("stringCompressed")), self.binaryValCompressed) reader.close()
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
def indexMovie(movie):
    doc = Document()
    doc.add(Field('id', str(movie), StringField.TYPE_STORED))

    at_least_one_field = False

    maybe_tags = movies_tags.query('item == @movie')
    if not maybe_tags.empty:
        tags = maybe_tags[['tags']].values.flatten()[0]
        doc.add(Field('tags', tags, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_description = movies_descriptions.query('item == @movie')
    if not maybe_description.empty:
        description = maybe_description[['description']].values.flatten()[0]
        doc.add(Field('description', description, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_genres = movies_genres.query('item == @movie')
    if not maybe_genres.empty:
        genres = maybe_genres[['genres']].values.flatten()[0]
        doc.add(Field('genres', genres, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    if at_least_one_field:
        writer.addDocument(doc)
def index_docs(self, tweets, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            contents = []
            for word in tweet[1].text.split():
                if word.startswith("http://") or word.startswith("https://"):
                    continue
                contents.append(word)
            contents = " ".join(contents)

            if len(contents) == 0:
                continue

            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in index_docs:", e
def indexDocs(self, root, writer): for root, dirnames, filenames in os.walk(root): for filename in filenames: print("adding", filename) path = os.path.join(root, filename) file = open(path, encoding='utf8') url = file.readline() title = file.readline() contents = file.read() file.close() img_url = self.getTxtAttribute(contents, 'img_url') img_info = self.getTxtAttribute(contents, 'img_info') for i in range(len(img_url)): if len(img_info[i]) > 0: title = title doc = Document() doc.add(StringField('title', title, Field.Store.YES)) doc.add(StringField('url', url, Field.Store.YES)) doc.add( StringField('img_url', img_url[i], Field.Store.YES)) seg_contents = jieba.lcut_for_search(img_info[i]) contents = ' '.join(seg_contents) doc.add( TextField('contents', contents, Field.Store.YES)) writer.addDocument(doc) else: continue
def generate_indices_from_benchmark(writer, counter):
    javafiles = java_files_from_dir(source_path)
    i = 0
    j = 0
    for javafile in javafiles:
        # print javafile
        i += 1
        if i % 1000 == 0:  # print progress every 1000 files
            print("Counter: %s" % i)
            print "typed_method_call" + str(counter.typed_method_call_count)

        document = Document()
        document.add(Field("file", javafile, Field.Store.YES, Field.Index.NO))
        try:
            with codecs.open(javafile, "r", encoding='utf-8', errors='ignore') as f:
                file_content = f.read().encode("utf-8", errors='ignore')
                f.close()

            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast, counter):
                writer.addDocument(document)
                j += 1
                if j % 1000 == 0:
                    print "Wrote:: %s files" % j
        except Exception as e:
            print("Error: %s" % e)
            continue

    print "Number of files: %s" % i
    print "Number of duplicates: %s" % len(hashes)
    print "%s files has been indexed" % j
def rewrite(data_string):
    data = json.loads(data_string)
    toupdate = json.loads(update)
    # primary_key_modified = False

    # delete the appropriate document
    query = BooleanQuery()
    for key in primary_keys_map:
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    # modify the values
    for key, value in toupdate.items():
        # if such a key is not present, we either add that key to data and update it,
        # or just ignore it (by default add_field_if_not_exists is set to True)
        if add_field_if_not_exists == False:
            if key in data.keys():
                data[key] = value
        else:
            data[key] = value

    # this deletion statement has been intentionally added here:
    # the update only continues if the modified data's primary keys do not already exist
    primary_key_update = False
    for key in toupdate.keys():
        if key in primary_keys_map:
            primary_key_update = True
            break
    if primary_key_update == True:
        query_search = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    writer.deleteDocuments(query)

    # add the newly modified document
    doc = Document()
    # index fields wrt the primary keys
    for primary_key in primary_keys_map:
        try:
            field = Field(primary_key, data[primary_key], Field.Store.NO, Field.Index.ANALYZED)
            doc.add(field)
        except:
            # primary_keys_map.pop(collection_name)
            return 101

    # compress data using snappy if compression is on
    if to_be_compressed_input == True:
        temp = json.dumps(data)
        data_string = base64.b64encode(snappy.compress(temp))
    else:
        temp = json.dumps(data)
        data_string = base64.b64encode(temp)

    field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
def _addDocument(self, identifier, isformatof, sort=None):
    doc = Document()
    if isformatof:
        doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
    if sort:
        doc.add(NumericDocValuesField("__sort__", long(sort)))
    consume(self.lucene.addDocument(identifier, doc))
    self.lucene.commit()  # Explicitly, not required: since commitCount=1.
def luceneindex(text):
    for n, l in enumerate(text):
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    # print("Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs()))
    # print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
def index(self, eid, name, cname, type, info):
    doc = Document()
    doc.add(TextField('id', eid, Field.Store.YES))
    doc.add(TextField('name', name, Field.Store.YES))
    doc.add(TextField('CannonicalName', cname, Field.Store.YES))
    doc.add(TextField('type', type, Field.Store.YES))
    doc.add(TextField('info', info, Field.Store.YES))
    self.writer.addDocument(doc)
def __init__(self, path):
    lazyImport()
    self._writer, self._reader, self._searcher = self._getLucene(path)
    self._latestModifications = {}

    self._doc = Document()
    self._keyField = StringField("key", "", Field.Store.NO)
    self._valueField = Field("value", "", UNINDEXED_TYPE)
    self._doc.add(self._keyField)
    self._doc.add(self._valueField)
def create_document(self, contents):
    """Create a Lucene document from the specified contents.

    Contents is a list of fields to be indexed, represented as a dictionary
    with keys 'field_name', 'field_type', and 'field_value'."""
    doc = Document()
    for f in contents:
        doc.add(Field(f['field_name'], f['field_value'],
                      self.ldf.get_field(f['field_type'])))
    return doc
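# A usage sketch for create_document above. The shape of `contents` follows the docstring;
# the field-type keys ('keyword', 'text') and the `indexer` object are hypothetical -- they
# stand in for whatever self.ldf.get_field() maps to concrete Lucene FieldType objects.
contents = [
    {'field_name': 'id',    'field_type': 'keyword', 'field_value': 'doc-42'},
    {'field_name': 'title', 'field_type': 'text',    'field_value': 'PyLucene indexing notes'},
    {'field_name': 'body',  'field_type': 'text',    'field_value': 'Documents are built field by field.'},
]
doc = indexer.create_document(contents)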
def createDocument(item_id, label, viewSimilar, viewProspective):
    doc = Document()
    doc.add(StringField('itemID', item_id, Field.Store.YES))
    doc.add(StringField('label', label, Field.Store.YES))
    for item in viewSimilar:
        doc.add(StoredField("viewSimilar", item))
    for item in viewProspective:
        doc.add(StoredField("viewProspective", item))
    return doc
def index_text(self, sha1, full_text):
    """Index the full text and map it to the source sha1."""
    document = Document()
    document.add(Field("sha1", sha1, ImageIndexer.hash_field))
    if full_text:
        document.add(Field("full_text", full_text, ImageIndexer.text_field))
        self.writer.updateDocument(Term("sha1", sha1), document)
    else:
        logging.info("No text for sha1 %s", sha1)
def index_code_snippet(writer):
    HOME = "/extdsk/FaCoY/Git_data/Git_20161108"  # 29.06.2015, 03.07.2015, 15.07.2015
    jfiles = java_files_from_dir(HOME)
    N_cores = 4

    # print("Number of Java files to process: %s" % (len(jfiles)))
    source_queue = []
    i = 0
    j = 0
    for jfile in jfiles:
        i += 1
        if i % 1000 == 0:
            print("Counter: %s" % i)
            # break

        document = Document()
        document.add(Field("file", jfile, Field.Store.YES, Field.Index.NO))
        try:
            with codecs.open(jfile, "r", encoding='utf-8') as f:
                file_content = f.read().encode("utf-8")
            # document.add(Field("file_content", compress(file_content), Field.Store.YES, Field.Index.NO))

            # Check for duplicate files and accumulate source code
            # hash_v = str(md5(file_content))
            # if hash_v not in hashes:
            #     source_queue.append((document, file_content))
            #     hashes.add(hash_v)

            # Wait until source files
            # if len(source_queue) >= N_cores:
            #     ast_docs = parallize(source_queue)
            #     source_queue = []
            #     for ast, file_content, doc in ast_docs:

            ast = parse(file_content, resolve=False)
            if add_code_keyword_into_document(document, file_content, ast):
                writer.addDocument(document)
                j += 1
                if j % 1000 == 0:
                    print "Wrote:: %s files" % j
        except Exception as e:
            # traceback.print_exc()
            # print jfile
            print("Error: %s" % e)
            continue

    print "Number of files: %s" % i
    print "Number of duplicates: %s" % len(hashes)
    print("%s files has been indexed" % j)
def create_document_by_document_sentence(org_title, preprocessed_title, doc_id, sentence):
    doc = Document()  # create a new document
    doc.add(StringField("org_title", org_title, Field.Store.YES))
    doc.add(TextField("preprocessed_title", preprocessed_title, Field.Store.YES))
    doc.add(StringField("doc_id", str(doc_id), Field.Store.YES))
    # doc.add(StringField("content", content, Field.Store.YES))
    doc.add(TextField("sentence", sentence, Field.Store.YES))
    return doc
def create_minidoc(termstring, field='text'):
    # To store term vectors (used for query expansion) we have to use a custom fieldtype
    customfield = FieldType()
    customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    customfield.setStored(True)
    customfield.setTokenized(True)
    customfield.setStoreTermVectors(True)

    doc = Document()
    doc.add(Field(field, termstring, customfield))
    return doc
def setUp(self):
    super(PhraseQueryTestCase, self).setUp()

    doc = Document()
    doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
    writer = self.getWriter()
    writer.addDocument(doc)
    writer.close()

    self.searcher = self.getSearcher()
    self.builder = PhraseQuery.Builder()
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def indexDocs(self, root, writer):
    t1 = FieldType()  # t1 is used in URL fields
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)

    t2 = FieldType()  # t2 is used to index contents
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t3 = FieldType()  # t3 is used to index titles
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    indextxt = open(self.filedir, 'r')
    while True:
        t = indextxt.readline()
        if (len(t) == 0):
            indextxt.close()
            return
        filename = t.strip()
        # for root, dirnames, filenames in os.walk(root):
        #     for filename in filenames:
        print "updating", filename
        try:
            path = os.path.join(root, filename)
            file = open(path)
            title = file.readline()
            print title
            page_URL = file.readline()
            while True:
                imgsrc = file.readline()
                if (imgsrc == 'EOF'):
                    file.close()
                    break
                contents = file.readline()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("imgurl", imgsrc, t1))
                doc.add(Field("url", page_URL, t1))
                doc.add(Field("title", title, t3))
                doc.add(Field("contents", contents, t2))
                writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def create_index():
    for country in cleaned_dictionary:
        doc = Document()
        doc.add(Field("country", country[0], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("country_html", country[1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital", country[2], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital_html", country[3], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
def create_document_by_document_content(org_title, preprocessed_title, preprocessed_title_lower, content):
    doc = Document()  # create a new document
    doc.add(StringField("org_title", org_title, Field.Store.YES))
    doc.add(TextField("preprocessed_title", preprocessed_title, Field.Store.YES))
    doc.add(StringField("preprocessed_title_lower", preprocessed_title_lower, Field.Store.YES))
    # doc.add(StringField("content", content, Field.Store.YES))
    doc.add(TextField("content", content, Field.Store.YES))
    return doc
def parseBook(self, filename, t1, t2, iw):
    with open(filename, 'r', errors="ignore") as book:
        lines = book.readlines()

    doc = Document()
    beginindex = endindex = 0
    author = None
    title = None
    for index, text in enumerate(lines):
        if text.startswith("Author:"):
            author = text[8:]
            doc.add(Field("author", text[8:], t1))
        if text.startswith("Title: "):
            title = text[7:]
            doc.add(Field("title", text[7:], t1))
        elif text.startswith(" Title: "):
            title = text[8:]
            doc.add(Field("title", text[8:], t1))
        if text.startswith("*** START OF THIS PROJECT GUTENBERG"):
            # extract rest of the text
            beginindex = index
        if text.startswith("*** END OF THIS PROJECT GUTENBERG"):
            endindex = index
            break

    if author == None:
        print("Didnt find author")
        self.authorcount += 1
    if title == None:
        print("Didnt find title")
        self.titlecount += 1
    if title == None and author == None:
        # Skip this book
        return

    text = None
    # Check if indices are correct
    if beginindex == 0 or endindex == 0:
        print("Skipping book {}\nSomething went wrong when extracting text"
              .format(filename))
        text = "".join(lines)
        self.errorcount += 1
    else:
        text = "".join(lines[beginindex:endindex])

    doc.add(Field("content", text, t2))
    iw.addDocument(doc)
def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))

    print "the final accuracy is:", final_accuracy
def setUp(self):
    super(TestRegexQuery, self).setUp()

    writer = self.getWriter(analyzer=SimpleAnalyzer(self.TEST_VERSION))
    doc = Document()
    doc.add(Field(self.FN, "the quick brown fox jumps over the lazy dog",
                  TextField.TYPE_NOT_STORED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
    self.searcher = self.getSearcher()
def dummyIndex(self):
    """
    Create a dummy index - to avoid problems updating it
    """
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(self.indexDir, config)
    doc = Document()
    doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
    return
def get_doc(self, img):
    '''
    Generate a `Document` according to the parameters.
    Input:
        `img`: dict containing a single image info
    Output:
        `Document` with the fields initialized
    '''
    doc = Document()
    doc.add(StringField("img_url", img['img_url'], Field.Store.YES))
    doc.add(TextField("description", img['description'], Field.Store.YES))
    doc.add(StringField("url", img['url'], Field.Store.YES))
    doc.add(StringField("url_title", img['url_title'], Field.Store.YES))
    return doc
def addDoc(w, text):
    """
    Add a single doc to the index.
    :param w: the index writer
    :param text: the text to index
    """
    doc = Document()
    # TextField: sequence of terms: tokenized
    doc.add(TextField("text", text, Field.Store.YES))
    w.addDocument(doc)
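# A round-trip sketch using the addDoc helper above -- assumptions only: PyLucene with a
# StandardAnalyzer, an on-disk index at a made-up path, and a made-up query string.
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
directory = SimpleFSDirectory(Paths.get("/tmp/adddoc-demo"))  # hypothetical path
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
addDoc(writer, "the quick brown fox")
addDoc(writer, "a lazy dog")
writer.close()

# The "text" field is both analyzed (searchable) and stored (retrievable).
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser("text", StandardAnalyzer()).parse("fox")
for hit in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(hit.doc).get("text"))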
def _getNewDocument(self, identifier, oldDoc):
    doc = Document()
    doc.add(StringField(IDENTIFIER_FIELD, identifier, Field.Store.YES))
    doc.add(IntField(HASH_FIELD, Partition.hashId(identifier), Field.Store.NO))
    if oldDoc is not None:
        for oldPrefix in oldDoc.getValues(PREFIX_FIELD):
            doc.add(StringField(PREFIX_FIELD, oldPrefix, Field.Store.YES))
        for oldSet in oldDoc.getValues(SETS_FIELD):
            doc.add(StringField(SETS_FIELD, oldSet, Field.Store.YES))
    return doc
def add_document(self, url, field, text):
    """
    add a new document to index writer

    input:
        url     the url of the target to be indexed
        field   fieldname of the value that will be indexed
        text    text to be indexed
    """
    doc = Document()
    doc.add(Field('url', url, TextField.TYPE_STORED))
    doc.add(Field(field, text, TextField.TYPE_STORED))
    self.idx_writer.addDocument(doc)
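# A hypothetical call pattern for add_document above; the `indexer` object, URLs and field
# names are made up, and the explicit commit is an assumption about how the wrapped
# idx_writer is flushed.
indexer.add_document("https://example.org/page", "title", "Example page title")
indexer.add_document("https://example.org/page", "body", "Full page text goes here.")
indexer.idx_writer.commit()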
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):  # traverse through the doc directory
        for filename in filenames:
            # if not filename.endswith('.cdc'):
            #     continue
            try:
                # only add the filename and path for indexing
                path = os.path.join(root, filename)
                print "adding file : ", path
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in ", filename
                writer.addDocument(doc)
            except Exception, e:
                print "failed in indexDocs:", e
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'gbk')
                file.close()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
def add_document(self, fields, header, id_):
    doc = Document()
    if len(fields) > len(header):
        sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
        for field in fields:
            sys.stderr.write('%s\n' % field)
        return
    for idx, field in enumerate(fields):
        fname, fieldtype = header[idx]
        if fieldtype is IntField:
            field = int(field)
        doc.add(fieldtype(fname, field, Field.Store.YES))
    self.writer.addDocument(doc)
    self.num_docs += 1
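# A sketch of the shapes add_document above expects -- the column names and row values are
# hypothetical, and IntField assumes the older Lucene 4.x/5.x API this snippet targets.
from org.apache.lucene.document import IntField, StringField, TextField

header = [
    ("doc_id", IntField),    # parsed to int before being added
    ("title", TextField),    # analyzed full text
    ("lang", StringField),   # indexed as a single token
]
fields = ["42", "Indexing tab-separated rows", "en"]
indexer.add_document(fields, header, id_=42)  # `indexer` owns the IndexWriter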
def testScore(self):
    reactor = CallTrace('reactor')
    settings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
    lucene = Lucene(join(self.tempdir, 'lucene'), reactor=reactor, settings=settings)
    document = Document()
    document.add(TextField('field', 'x ' * 100, Field.Store.NO))
    returnValueFromGenerator(lucene.addDocument(identifier="identifier", document=document))

    q = TermQuery(Term("field", 'x'))
    result = returnValueFromGenerator(lucene.executeQuery(q))
    self.assertAlmostEqual(0.1, result.hits[0].score)

    q.setBoost(10.0)
    result = returnValueFromGenerator(lucene.executeQuery(q))
    self.assertAlmostEqual(1, result.hits[0].score)
def buildDocument(self, fields, record):
    doc = Document()
    doc.add(Field("id", record["_id"], self.fieldType2))
    for field in fields:
        if isinstance(record[field], dict):
            self.dictToFields(doc, record[field])
        else:
            doc.add(Field(field, record[field], self.fieldType1))
    return doc
def testPhraseQueryWithStopAnalyzer(self):
    writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
    doc = Document()
    doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()

    searcher = self.getSearcher()

    # valid exact phrase query
    query = PhraseQuery()
    query.add(Term("field", "stop"))
    query.add(Term("field", "words"))
    scoreDocs = searcher.search(query, None, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
def testNot(self):
    writer = self.getWriter(analyzer=SimpleAnalyzer())

    d1 = Document()
    d1.add(Field("field", "a b", TextField.TYPE_STORED))

    writer.addDocument(d1)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    query = QueryParser("field", SimpleAnalyzer()).parse("a NOT b")

    topDocs = searcher.search(query, 50)
    self.assertEqual(0, topDocs.totalHits)
def setUp(self):
    super(BooleanOrTestCase, self).setUp()

    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))

    d = Document()
    d.add(Field(self.FIELD_T, "Optimize not deleting all files", TextField.TYPE_STORED))
    d.add(Field(self.FIELD_C, "Deleted When I run an optimize in our production environment.", TextField.TYPE_STORED))

    writer.addDocument(d)
    writer.close()

    self.searcher = self.getSearcher()
def testPhraseQueryWithStopAnalyzer(self):
    writer = self.getWriter(analyzer=StopAnalyzer())
    doc = Document()
    doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
    writer.addDocument(doc)
    writer.close()

    searcher = self.getSearcher()

    # valid exact phrase query
    builder = PhraseQuery.Builder()
    builder.add(Term("field", "stop"))
    builder.add(Term("field", "words"))
    scoreDocs = searcher.search(builder.build(), 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
def generate_document(path, indexer):
    """ Given a file, convert it into a lucene document
    that is ready to be indexed.

    :param path: The file to add to the search index
    :param indexer: The indexer to operate with
    :returns: The index document for the specified file
    """
    name = FILE_TYPES.get(path.rsplit(".", 1)[-1], "")
    data = open(path, "r").read()
    document = Document()
    document.add(Field("path", path, indexer.field_clean))
    document.add(Field("type", name, indexer.field_clean))
    document.add(Field("data", data, indexer.field_dirty))
    return document
def setUp(self):
    super(Test_Bug1564, self).setUp()

    self.analyzer = StandardAnalyzer()
    writer = self.getWriter(analyzer=self.analyzer)

    doc = Document()
    doc.add(Field('all', u'windowpane beplaster rapacious \
catatonia gauntlet wynn depressible swede pick dressmake supreme \
jeremy plumb theoretic bureaucracy causation chartres equipoise \
dispersible careen heard', TextField.TYPE_NOT_STORED))
    doc.add(Field('id', '1', StoredField.TYPE))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
def prp_index(self):
    '''
    Prepare the index given our "corpus" file(s)
    '''
    print '=> Preparing Lucene index %s' % self._index_dir
    writer = self._get_writer(create=True)
    print '   Currently %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
    num_pages, num_sections = 0, 0
    page_name, section_name = None, None
    num_lines = 0
    for ifname, fname in enumerate(self._filenames):
        print '   Adding lines to index from file #%d: %s' % (ifname, fname)
        with open(fname, 'rt') as infile:
            for text in infile:
                if len(text) == 0:
                    print 'Reached EOF'
                    break  # EOF
                # CorpusReader.PAGE_NAME_PREFIX is <Page>;
                # all our corpus files were preprocessed to carry this tag at the start of each page
                if text.startswith(CorpusReader.PAGE_NAME_PREFIX):
                    page_name = text[len(CorpusReader.PAGE_NAME_PREFIX):].strip()
                    section_name = None
                    num_pages += 1
                elif text.startswith(CorpusReader.SECTION_NAME_PREFIX):
                    section_name = text[len(CorpusReader.SECTION_NAME_PREFIX):].strip()
                    num_sections += 1
                else:
                    assert (page_name is not None) and (section_name is not None)
                    if self._parser is None:
                        luc_text = text
                    else:
                        # note that in our case the parser is always a SimpleWordParser
                        section_words = self._parser.parse(text, calc_weights=False)  # True
                        luc_text = ' '.join(section_words)
                    # each section is added to the Lucene index as a whole; the text is stored and made searchable.
                    # The page is not needed here since we add documents section by section, not page by page.
                    doc = Document()
                    # there is only one field per document, the text field; section_name is not used as a field
                    doc.add(Field("text", luc_text, Field.Store.YES, Field.Index.ANALYZED))
                    writer.addDocument(doc)
                num_lines += 1
                if num_lines % 100000 == 0:
                    print '   read %d lines so far: %d pages, %d sections' % (num_lines, num_pages, num_sections)
    print '   Finished - %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
    writer.close()