def index_one(self, article):
    """Create index for one url object in the database."""
    try:
        date_published_str = article['date_published'].strftime(
            self.date_format)
    except Exception as e:
        logger.warning('Error when formatting date_published %r: %s',
                       article['canonical_url'], e)
        return
    doc = Document()
    doc.add(StoredField('group_id', article['group_id']))
    doc.add(StoredField('article_id', article['article_id']))
    doc.add(
        StringField('date_published', date_published_str, Field.Store.YES))
    doc.add(
        SortedDocValuesField('date_published', BytesRef(date_published_str)))
    # Also stored redundantly with the StringField above.
    doc.add(StoredField('date_published', date_published_str))
    doc.add(StringField('domain', article['domain'], Field.Store.YES))
    doc.add(StringField('site_type', article['site_type'], Field.Store.YES))
    doc.add(
        TextField('canonical_url', article['canonical_url'], Field.Store.YES))
    doc.add(TextField('title', article['title'], Field.Store.YES))
    doc.add(TextField('meta', article['meta'], Field.Store.NO))
    doc.add(TextField('content', article['content'], Field.Store.NO))
    doc.add(StoredField('uq_id_str', article['uq_id_str']))
    self.writer.addDocument(doc)
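# The snippets in this file assume an initialized PyLucene environment and a
# ready IndexWriter (self.writer above). A minimal, hedged sketch of that
# setup; the index path and the StandardAnalyzer choice are assumptions:
import lucene

def make_writer(index_path):
    # Start the JVM once per process before touching any Lucene classes.
    if not lucene.getVMEnv():
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    from java.nio.file import Paths
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.index import IndexWriter, IndexWriterConfig
    from org.apache.lucene.store import FSDirectory
    directory = FSDirectory.open(Paths.get(index_path))
    return IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))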
def get_doc(self, filename, path, title, url, contents):
    '''
    Generate a `Document` according to the parameters.
    Input: `filename`: filename of the webpage
           `path`: path of the webpage
           `title`: title of the webpage
           `url`: original url of the webpage
           `contents`: contents of the webpage
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    # doc.add(Field("name", filename, self.property_type))
    # doc.add(Field("path", path, self.property_type))
    # doc.add(Field("title", title, self.property_type))
    # doc.add(Field("url", url, self.property_type))
    doc.add(StringField("name", filename, Field.Store.YES))
    doc.add(StringField("path", path, Field.Store.YES))
    doc.add(TextField("title", title, Field.Store.YES))
    doc.add(TextField("url", url, Field.Store.YES))
    if len(contents) > 0:
        # doc.add(Field("contents", contents, self.content_type))
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(filename))
    return doc
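# Field-choice note for get_doc above (and similar snippets below): StringField
# indexes its value as a single untokenized term, suitable for exact lookups
# like names and paths, while TextField runs the analyzer for full-text search.
# A hedged mini-example with placeholder values:
exact = StringField("path", "/data/page1.html", Field.Store.YES)    # one term
analyzed = TextField("title", "An Example Page", Field.Store.YES)   # tokenized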
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding", filename)
            path = os.path.join(root, filename)
            file = open(path, encoding='utf8')
            url = file.readline()
            title = file.readline()
            contents = file.read()
            file.close()
            img_url = self.getTxtAttribute(contents, 'img_url')
            img_info = self.getTxtAttribute(contents, 'img_info')
            for i in range(len(img_url)):
                if len(img_info[i]) > 0:
                    doc = Document()
                    doc.add(StringField('title', title, Field.Store.YES))
                    doc.add(StringField('url', url, Field.Store.YES))
                    doc.add(
                        StringField('img_url', img_url[i], Field.Store.YES))
                    seg_contents = jieba.lcut_for_search(img_info[i])
                    contents = ' '.join(seg_contents)
                    doc.add(TextField('contents', contents, Field.Store.YES))
                    writer.addDocument(doc)
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for dirname in dirnames:  # walk each sub-directory
            path1 = os.path.join(root, dirname)
            # walk the files under the sub-directory
            for trivial1, trivial2, filenames in os.walk(path1):
                for filename in filenames:
                    print("adding", filename)
                    path = os.path.join(path1, filename)
                    file = open(path, encoding='utf8')
                    page = file.readline()
                    title = file.readline()
                    contents = file.read()
                    file.close()
                    # segment the contents with jieba
                    seg_contents = jieba.lcut_for_search(contents)
                    contents = ' '.join(seg_contents)
                    url = page
                    seg_url = jieba.lcut_for_search(page)
                    page = ' '.join(list(
                        set(seg_url) -
                        set(['.', 'http', 'https', '/', ':', '?', '=',
                             'html', 'shtml', 'www'])))
                    doc = Document()
                    doc.add(StringField("name", filename, Field.Store.YES))
                    doc.add(StringField("path", path, Field.Store.YES))
                    if len(contents) > 0:
                        doc.add(TextField('title', title, Field.Store.YES))
                        doc.add(TextField('site', page, Field.Store.YES))
                        doc.add(TextField('url', url, Field.Store.YES))
                        doc.add(TextField('contents', contents,
                                          Field.Store.YES))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
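# Note on the jieba pre-tokenization above: lcut_for_search over-segments
# Chinese text into overlapping words, and joining them with spaces lets a
# whitespace-oriented analyzer index the pieces. A hedged mini-example
# (the exact segmentation depends on jieba's dictionary):
import jieba
print(' '.join(jieba.lcut_for_search('清华大学信息检索')))
# e.g. -> "清华 华大 大学 清华大学 信息 检索"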
def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    doc = Document()
    doc.add(TextField("name", name, Field.Store.YES))
    doc.add(StringField("birth_date", birth_date, Field.Store.YES))
    doc.add(StringField("death_date", death_date, Field.Store.YES))
    doc.add(StringField("birth_note", birth_note, Field.Store.YES))
    doc.add(StringField("death_note", death_note, Field.Store.YES))
    w.addDocument(doc)
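# Hedged usage sketch for addDoc above; the writer and the sample person
# record are assumptions, not taken from the original code:
addDoc(writer, name="Ada Lovelace",
       birth_date="1815-12-10", death_date="1852-11-27",
       birth_note="born in London", death_note="died in Marylebone")
writer.commit()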
def createDocument(item_id, label, viewSimilar, viewProspective):
    doc = Document()
    doc.add(StringField('itemID', item_id, Field.Store.YES))
    doc.add(StringField('label', label, Field.Store.YES))
    for item in viewSimilar:
        doc.add(StoredField("viewSimilar", item))
    for item in viewProspective:
        doc.add(StoredField("viewProspective", item))
    return doc
def __init__(self, path):
    lazyImport()
    self._writer, self._reader, self._searcher = self._getLucene(path)
    self._latestModifications = {}
    self._doc = Document()
    self._keyField = StringField("key", "", Field.Store.NO)
    self._valueField = Field("value", "", UNINDEXED_TYPE)
    self._doc.add(self._keyField)
    self._doc.add(self._valueField)
def create_document_by_document_sentence(org_title, preprocessed_title,
                                         doc_id, sentence):
    doc = Document()  # create a new document
    doc.add(StringField("org_title", org_title, Field.Store.YES))
    doc.add(
        TextField("preprocessed_title", preprocessed_title, Field.Store.YES))
    doc.add(StringField("doc_id", str(doc_id), Field.Store.YES))
    # doc.add(StringField("content", content, Field.Store.YES))
    doc.add(TextField("sentence", sentence, Field.Store.YES))
    return doc
def create_document_by_document_content(org_title, preprocessed_title,
                                        preprocessed_title_lower, content):
    doc = Document()  # create a new document
    doc.add(StringField("org_title", org_title, Field.Store.YES))
    doc.add(
        TextField("preprocessed_title", preprocessed_title, Field.Store.YES))
    doc.add(
        StringField("preprocessed_title_lower", preprocessed_title_lower,
                    Field.Store.YES))
    # doc.add(StringField("content", content, Field.Store.YES))
    doc.add(TextField("content", content, Field.Store.YES))
    return doc
def create_doc(item_id, label, viewSimilar, viewProspective, model="default"):
    doc = Document()
    now_time = int(time.time())
    _id = hashlib.md5(f"{label}_{item_id}".encode('utf-8')).hexdigest()
    doc.add(StringField("id", _id, Field.Store.NO))
    doc.add(StringField("itemID", item_id, Field.Store.YES))
    doc.add(StringField("label", label, Field.Store.YES))
    doc.add(StoredField("viewSimilar", viewSimilar))
    doc.add(StoredField("viewProspective", viewProspective))
    doc.add(StringField("model", model, Field.Store.YES))
    doc.add(StringField("ttl", str(now_time), Field.Store.NO))
    return _id, doc
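# Hedged companion sketch for create_doc: the unstored "id" field looks
# designed as a dedup key, so an upsert would pair the returned _id with
# updateDocument (as add_doc further below does); the writer is an assumption:
from org.apache.lucene.index import Term

def upsert_doc(writer, item_id, label, viewSimilar, viewProspective):
    _id, doc = create_doc(item_id, label, viewSimilar, viewProspective)
    writer.updateDocument(Term("id", _id), doc)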
def get_doc(self, img):
    '''
    Generate a `Document` according to the parameters.
    Input: `img`: dict containing a single image info
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    doc.add(StringField("img_url", img['img_url'], Field.Store.YES))
    doc.add(TextField("description", img['description'], Field.Store.YES))
    doc.add(StringField("url", img['url'], Field.Store.YES))
    doc.add(StringField("url_title", img['url_title'], Field.Store.YES))
    return doc
def create_document(line):
    doc = Document()
    line = line.split()
    keyterm = line[0]
    doc.add(StringField("keyterm", keyterm, Field.Store.YES))
    index = line[1]
    doc.add(StringField("Sno", index, Field.Store.YES))
    del line[0:2]
    line = ' '.join(line)
    qterm = keyterm.replace("_", " ")
    if qterm not in line:
        line = qterm + ' ' + line
    doc.add(TextField("text", line, Field.Store.YES))
    return doc
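# Hedged sample input for create_document above, inferred from the parsing:
# token 0 is the key term, token 1 its serial number, the rest is free text.
doc = create_document("machine_learning 17 statistical methods that learn from data")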
def obj_to_document(obj):
    # Python 2 code: relies on the built-in `unicode` and `long` types and the
    # pre-4.0 Lucene Field API (Field.Index.*).
    def conv_to_str(x):
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                res.add(
                    TextField(k, ' '.join(str(x) for x in set(v)),
                              Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, (str, unicode)):
            res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v)), Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(
                TextField(k + LTPF_FOR_QUERY,
                          ' '.join(jieba.lcut_for_search(v.text)),
                          Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):  # must precede the int check
            vs = '1' if v else '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, (int, long)):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        else:
            raise Exception('unrecognized data type')
        res.add(
            Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res
def IndexDocs(self, documents):
    """
    Index documents under the directory

    :Parameters:
    - `documents`: Documents to be indexed (List)
    """
    # Get the writer configuration
    writerConfig = IndexWriterConfig(self.__analyzer)
    # Get index writer
    writer = IndexWriter(self.__indexDir, writerConfig)

    for document in documents:
        # Create a document that would be added to the index
        doc = Document()
        # Add the fields to this document
        doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
        doc.add(
            Field(Indexer.CONTENT, document['content'], self.__contentType))
        doc.add(StringField(Indexer.DATE, document['date'], Field.Store.YES))
        doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
        doc.add(
            TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                      Field.Store.YES))
        doc.add(
            LongPoint(Indexer.TIMESTAMP,
                      self.__getTimestamp(document['date'])))
        # Add or update the document in the index
        if not self.__boAppend:
            # New index, so we just add the document (no old document can be there):
            if self.__verbose:
                print("Adding " + document['name'])
            writer.addDocument(doc)
        else:
            # Existing index (an old copy of this document may have been
            # indexed), so we use updateDocument instead to replace the old
            # one matching the exact name, if present:
            if self.__verbose:
                print("Updating " + document['name'])
            writer.updateDocument(Term(Indexer.NAME, document['name']), doc)

    # Print index information and close writer
    print("Indexed %d documents (%d docs in index)" %
          (len(documents), writer.numDocs()))
    writer.close()
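# Hedged follow-up to IndexDocs: TIMESTAMP is indexed as a LongPoint, which is
# searchable but not stored, so date filtering would use a point range query.
# The searcher and the epoch bounds are assumptions:
from org.apache.lucene.document import LongPoint

def search_by_date(searcher, start_ts, end_ts, n=50):
    query = LongPoint.newRangeQuery(Indexer.TIMESTAMP, start_ts, end_ts)
    return searcher.search(query, n)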
def addDoc(w, data):
    doc = Document()
    for field in data:
        value, type = data[field][0], data[field][1]
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    try:
        w.addDocument(doc)
    except Exception:
        # Dump the offending record for debugging.
        print('-----------------------------------')
        for field in data:
            value, type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
def createDoc(self, url, html, duplicate):
    title, contents = self.parseHtml(url, html)
    doc = Document()
    doc.add(StringField("title", title, Field.Store.YES))
    doc.add(StringField("url", url, Field.Store.YES))
    doc.add(
        StringField("duplicate", str(duplicate).lower(), Field.Store.YES))
    if len(contents) > 0:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in %s" % url)
    return doc
def add_doc(self, item_data):
    item_id = item_data['item_id']
    ttl = item_data['ttl']
    version = item_data.get('version', 'default')
    view_similar = json.dumps(item_data.get('view_similar', {}))
    view_prospective = json.dumps(item_data.get('view_prospective', {}))
    doc = Document()
    _id = hashlib.md5(f"{item_id}_{version}".encode('utf-8')).hexdigest()
    doc.add(StringField("id", _id, Field.Store.NO))
    doc.add(LongPoint("ttl", ttl))
    doc.add(StringField("version", version, Field.Store.YES))
    doc.add(StringField("item_id", item_id, Field.Store.YES))
    doc.add(StoredField("view_similar", view_similar))
    doc.add(StoredField("view_prospective", view_prospective))
    self.writer.updateDocument(Term("id", _id), doc)
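# Hedged companion sketch for add_doc: since "ttl" is a LongPoint, expired
# documents can be dropped with a range delete; the cutoff rule is an assumption:
import time
from org.apache.lucene.document import LongPoint

def purge_expired(writer, max_age_seconds):
    cutoff = int(time.time()) - max_age_seconds
    # Delete every document whose ttl point value is older than the cutoff.
    writer.deleteDocuments(LongPoint.newRangeQuery("ttl", 0, cutoff))
    writer.commit()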
def indexDocs(self, root, writer):
    path = root + "/data/*/*.xml"
    xml_files = glob.glob(path)
    numDocs = 0
    for xml in xml_files:
        try:
            parser = etree.XMLParser(recover=False, strip_cdata=False)
            tree = etree.parse(xml, parser=parser)
        except etree.XMLSyntaxError:
            # Fall back to a recovering parser for malformed XML.
            parser = etree.XMLParser(recover=True, strip_cdata=False)
            tree = etree.parse(xml, parser=parser)
        root = tree.getroot()
        for text in root.iter("TEXT"):
            contents = "".join(text.xpath("text()")).strip()
            doc_no = text.getparent().find("DOCNO").text
            try:
                doc = Document()
                doc.add(StringField("id", doc_no, Field.Store.YES))
                if len(contents) > 0:
                    doc.add(TextField("contents", contents, Field.Store.YES))
                writer.addDocument(doc)
                numDocs += 1
            except Exception as e:
                print("Failed in indexDocs:", e)
    return numDocs
def index_single_file(self, doc_file):
    logger.info("adding {}".format(doc_file))
    lucene_doc_num = 0
    try:
        with open(doc_file) as df:
            for line in df:
                wiki_doc = json.loads(line)
                doc_title = wiki_doc['title']
                doc_text = wiki_doc['plaintext']
                doc_id = wiki_doc['_id']
                paragraphs = doc_text.split('\n\n')
                if len(paragraphs) < 3:
                    continue
                doc_text = rm_special_chars(doc_text)
                doc = Document()
                doc.add(StringField("id", str(doc_id), Field.Store.YES))
                doc.add(TextField("title", doc_title, Field.Store.YES))
                doc.add(TextField("text", doc_text, Field.Store.YES))
                self.writer.addDocument(doc)
                lucene_doc_num += 1
                if lucene_doc_num % 10000 == 0:
                    logger.info('added {} lucene docs'.format(lucene_doc_num))
    except Exception as e:
        import traceback
        traceback.print_tb(e.__traceback__)
        logger.error("Failed in: {}".format(doc_file))
    return lucene_doc_num
def addDoc(w, data):
    doc = Document()
    for field in data:
        value, type = data[field][0], data[field][1]
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif type == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    w.addDocument(doc)
def add_code_keyword_into_document(document, file_content, node, counter):
    # The flag is set when at least one code characteristic has been stored.
    flag = False
    # document.add(Field("line_numbers", str(dict(node["line_numbers"])), Field.Store.YES, Field.Index.NO))
    # document.add(Field("hash", str(md5(file_content)), Field.Store.YES, Field.Index.NO))
    # document.add(Field("code", so_tokenizer(file_content, False), Field.Store.YES, Field.Index.ANALYZED))
    for m in node["typed_method_call"]:
        if m:
            document.add(Field("word", m, Field.Store.YES, Field.Index.ANALYZED))
            counter.typed_method_call_count += 1
            flag = True
    for e in node["extends"]:
        if e:
            document.add(Field("word", e, Field.Store.NO, Field.Index.ANALYZED))
            counter.extends_count += 1
    for c in node["used_classes"]:
        if c:
            document.add(Field("word", str(c), Field.Store.YES, Field.Index.ANALYZED))
            counter.used_classes_count += 1
    for i in node["class_instance_creation"]:
        if i:
            document.add(Field("word", i, Field.Store.YES, Field.Index.ANALYZED))
            counter.class_instance_creation_count += 1
            flag = True
    for m in node["methods"]:
        if m:
            document.add(Field("word", m, Field.Store.YES, Field.Index.ANALYZED))
            counter.methods_count += 1
    for m in node["methods_called"]:
        if m:
            document.add(Field("word", m, Field.Store.YES, Field.Index.ANALYZED))
            counter.methods_called_count += 1
            flag = True
    for m in node["unresolved_method_calls"]:
        if m:
            document.add(Field("word", m, Field.Store.YES, Field.Index.ANALYZED))
            counter.unresolved_method_calls_count += 1
    for l in node["literals"]:
        if l:
            document.add(StringField("word", l, Field.Store.YES))
            counter.literals_count += 1
            flag = True
    return flag
def create_document(file_name):
    path = './alldocs/' + file_name
    file = open(path)
    doc = Document()
    doc.add(StringField("title", file_name, Field.Store.YES))
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()
    return doc
def index_document(self, wiki_doc):
    """
    :param wiki_doc: the document to be indexed.
    :return:
    """
    # Index each section of the article as its own document.
    i = 0
    for section in wiki_doc.sections:
        doc = Document()
        doc.add(StringField("id_article", wiki_doc.id, Field.Store.YES))
        doc.add(TextField("title_article", wiki_doc.title, Field.Store.YES))
        doc.add(StringField("id_section",
                            str(wiki_doc.id) + "_" + str(i), Field.Store.YES))
        doc.add(TextField("title_section", section.title, Field.Store.YES))
        doc.add(TextField("content_section", section.text, Field.Store.YES))
        self.writer.addDocument(doc)
        i += 1
def addDocument(self, id):
    global answers_train
    preA = answers_train[id]
    doc = Document()
    doc.add(TextField("pa", preA, Field.Store.YES))
    doc.add(StringField("id", str(id), Field.Store.YES))
    self.w.addDocument(doc)
    self.w.commit()
def create_document(file_name):
    path = INPUT_DIR + file_name  # assemble the file path
    file = open(path)  # open in read mode
    doc = Document()  # create a new document
    # add the title field
    doc.add(StringField("title", file_name, Field.Store.YES))
    # add the whole book
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()  # close the file pointer
    return doc
def _setMetadataPrefixes(self, doc, metadataPrefixes, delete,
                         oldDeletedPrefixes, deleteInPrefixes):
    allMetadataPrefixes = set(doc.getValues(PREFIX_FIELD))
    allDeletedPrefixes = set(oldDeletedPrefixes)
    for prefix in metadataPrefixes:
        allDeletedPrefixes.discard(prefix)
    for prefix in metadataPrefixes.union(deleteInPrefixes):
        if prefix not in allMetadataPrefixes:
            doc.add(StringField(PREFIX_FIELD, prefix, Field.Store.YES))
            self._prefixes.setdefault(prefix, ('', ''))
            allMetadataPrefixes.add(prefix)
    allDeletedPrefixes.update(deleteInPrefixes)
    if delete:
        allDeletedPrefixes = allMetadataPrefixes
    for prefix in allDeletedPrefixes:
        doc.add(StringField(PREFIX_DELETED_FIELD, prefix, Field.Store.YES))
    return allMetadataPrefixes, allDeletedPrefixes
def get_doc(self, doc_info, contents):
    '''
    Generate a `Document` according to the given info.
    Input: `doc_info`: info of the doc (`name`, `path`, `title`, `url`, `site`)
           `contents`: contents of the webpage
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    doc.add(StringField("name", doc_info['name'], Field.Store.YES))
    doc.add(StringField("path", doc_info['path'], Field.Store.YES))
    doc.add(StringField("title", doc_info['title'], Field.Store.YES))
    doc.add(StringField("url", doc_info['url'], Field.Store.YES))
    doc.add(TextField("site", doc_info['site'], Field.Store.YES))
    if len(contents) > 0:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(doc_info['name']))
    return doc
def _getNewDocument(self, identifier, oldDoc, purgeSets=None):
    doc = Document()
    doc.add(StringField(IDENTIFIER_FIELD, identifier, Field.Store.YES))
    doc.add(IntPoint(HASH_FIELD, Partition.hashId(identifier)))
    oldDeletedSets = set()
    oldDeletedPrefixes = set()
    if oldDoc is not None:
        filterPurgedSets = lambda x: x
        if purgeSets:
            filterPurgedSets = lambda sets: [
                s for s in sets if s not in purgeSets
            ]
        for oldPrefix in oldDoc.getValues(PREFIX_FIELD):
            doc.add(StringField(PREFIX_FIELD, oldPrefix, Field.Store.YES))
        for oldSet in filterPurgedSets(oldDoc.getValues(SETS_FIELD)):
            doc.add(StringField(SETS_FIELD, oldSet, Field.Store.YES))
        oldDeletedSets.update(oldDoc.getValues(SETS_DELETED_FIELD))
        oldDeletedPrefixes.update(
            filterPurgedSets(oldDoc.getValues(PREFIX_DELETED_FIELD)))
    return doc, oldDeletedSets, oldDeletedPrefixes
def index_docs(self, input_documents):
    for document in tqdm(input_documents, total=len(input_documents)):
        doc = Document()
        doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES))
        doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES))
        field_type = FieldType()
        field_type.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
        field_type.setStored(True)
        field_type.setStoreTermVectors(True)
        field_type.setTokenized(True)
        # Concatenate the available sections in the original order:
        # ".M" + " " + ".T" + ".W".
        text = document[".T"].lower()
        if ".M" in document:
            text = document[".M"].lower() + " " + text
        if ".W" in document:
            text += document[".W"].lower()
        doc.add(Field("text", " ".join(tokenizer.tokenize(text)), field_type))
        if (self.writer.getConfig().getOpenMode() ==
                IndexWriterConfig.OpenMode.CREATE):
            self.writer.addDocument(doc)
        else:
            self.writer.updateDocument(Term(".U", document[".U"]), doc)
    self.writer.close()
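# Hedged follow-up to index_docs: the "text" field stores term vectors, so they
# can be read back per document; reader handling assumes the Lucene 8.x API:
from org.apache.lucene.index import DirectoryReader

def print_term_vector(directory, doc_id):
    reader = DirectoryReader.open(directory)
    terms_enum = reader.getTermVector(doc_id, "text").iterator()
    term = terms_enum.next()
    while term is not None:
        print(term.utf8ToString(), terms_enum.totalTermFreq())
        term = terms_enum.next()
    reader.close()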
def _setSets(self, doc, setSpecs, delete, deleteInSets, oldDeletedSets):
    currentSets = set(doc.getValues(SETS_FIELD))
    allSets = set(currentSets)
    for setSpec in _validSetSpecs(setSpecs):
        allSets.update(_setSpecAndSubsets(setSpec))
    if delete:
        allDeletedSets = set(allSets)
    else:
        allDeletedSets = set(oldDeletedSets)
        for setSpec in _validSetSpecs(setSpecs):
            allDeletedSets.difference_update(_setSpecAndSubsets(setSpec))
    if self._deleteInSetsSupport and deleteInSets:
        allSets.update(deleteInSets)
        allDeletedSets.update(deleteInSets)
    for aSet in allSets:
        if aSet not in currentSets:
            self._sets.setdefault(aSet, '')
            doc.add(StringField(SETS_FIELD, aSet, Field.Store.YES))
    for aSet in allDeletedSets:
        doc.add(StringField(SETS_DELETED_FIELD, aSet, Field.Store.YES))
    return allSets, allDeletedSets
class LuceneKeyValueStore(object):
    def __init__(self, path):
        lazyImport()
        self._writer, self._reader, self._searcher = self._getLucene(path)
        self._latestModifications = {}
        self._doc = Document()
        self._keyField = StringField("key", "", Field.Store.NO)
        self._valueField = Field("value", "", UNINDEXED_TYPE)
        self._doc.add(self._keyField)
        self._doc.add(self._valueField)

    def get(self, key, default=None):
        try:
            return self[key]
        except KeyError:
            return default

    def __setitem__(self, key, value):
        key = str(key)
        value = str(value)
        self._maybeReopen()
        self._keyField.setStringValue(key)
        self._valueField.setStringValue(value)
        self._writer.updateDocument(Term("key", key), self._doc)
        self._latestModifications[key] = value

    def __getitem__(self, key):
        key = str(key)
        value = self._latestModifications.get(key)
        if value is DELETED_RECORD:
            raise KeyError(key)
        if value is not None:
            return value
        self._maybeReopen()
        topDocs = self._searcher.search(TermQuery(Term("key", key)), 1)
        if topDocs.totalHits == 0:
            raise KeyError(key)
        return self._searcher.doc(topDocs.scoreDocs[0].doc).get("value")

    def __delitem__(self, key):
        key = str(key)
        self._writer.deleteDocuments(Term("key", key))
        self._latestModifications[key] = DELETED_RECORD

    def __len__(self):
        raise NotImplementedError

    def __iter__(self):
        raise NotImplementedError

    def items(self):
        raise NotImplementedError

    def keys(self):
        raise NotImplementedError

    def values(self):
        raise NotImplementedError

    def _getLucene(self, path):
        directory = FSDirectory.open(Paths.get(path))
        config = IndexWriterConfig(None)
        config.setRAMBufferSizeMB(256.0)  # faster
        config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
        writer = IndexWriter(directory, config)
        reader = writer.getReader()
        searcher = IndexSearcher(reader)
        return writer, reader, searcher

    def _maybeReopen(self):
        if len(self._latestModifications) > 10000:
            newReader = DirectoryReader.openIfChanged(
                self._reader, self._writer, True)
            if newReader is not None:
                self._reader.close()
                self._reader = newReader
                self._searcher = IndexSearcher(self._reader)
                self._latestModifications.clear()

    def commit(self):
        self._writer.commit()

    def close(self):
        self._writer.close()
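# Hedged usage sketch for LuceneKeyValueStore; the index path is an assumption:
store = LuceneKeyValueStore('/tmp/kv-index')
store['answer'] = 42
assert store.get('answer') == '42'  # keys and values round-trip as strings
del store['answer']
store.commit()
store.close()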