class LuceneKeyValueStore(object):
    """Persistent str -> str key/value store backed by a Lucene index.

    Keys are indexed as an (unstored) StringField "key"; values live in an
    unindexed, stored "value" field.  Writes are buffered in
    ``_latestModifications`` so reads stay consistent before the Lucene
    reader is reopened; the reader is reopened lazily once the buffer
    grows past a threshold (see ``_maybeReopen``).
    """

    def __init__(self, path):
        lazyImport()
        self._writer, self._reader, self._searcher = self._getLucene(path)
        # Buffered, not-yet-visible modifications: key -> value, or the
        # DELETED_RECORD sentinel for pending deletes.
        self._latestModifications = {}
        # One reusable Document with two reusable fields; only the field
        # values change per write, avoiding per-call allocations.
        self._doc = Document()
        self._keyField = StringField("key", "", Field.Store.NO)
        self._valueField = Field("value", "", UNINDEXED_TYPE)
        self._doc.add(self._keyField)
        self._doc.add(self._valueField)

    def get(self, key, default=None):
        """Return self[key], or ``default`` when the key is absent."""
        try:
            return self[key]
        except KeyError:
            return default

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``; both are coerced to str."""
        key = str(key)
        value = str(value)
        self._maybeReopen()
        self._keyField.setStringValue(key)
        self._valueField.setStringValue(value)
        # updateDocument atomically replaces any existing document that
        # matches the key term, so a key maps to at most one document.
        self._writer.updateDocument(Term("key", key), self._doc)
        self._latestModifications[key] = value

    def __getitem__(self, key):
        """Return the value for ``key``; raise KeyError when absent."""
        key = str(key)
        value = self._latestModifications.get(key)
        if value is DELETED_RECORD:
            raise KeyError(key)
        if value is not None:  # idiom fix: was `if not value is None`
            return value
        self._maybeReopen()
        topDocs = self._searcher.search(TermQuery(Term("key", key)), 1)
        if topDocs.totalHits.value == 0:
            raise KeyError(key)
        return self._searcher.doc(topDocs.scoreDocs[0].doc).get("value")

    def __delitem__(self, key):
        """Delete ``key``; deleting a missing key is a silent no-op."""
        key = str(key)
        self._writer.deleteDocuments(Term("key", key))
        self._latestModifications[key] = DELETED_RECORD

    # Deliberately unsupported dict-protocol operations: Lucene offers no
    # cheap way to enumerate all live key/value pairs here.
    def __len__(self):
        raise NotImplementedError

    def __iter__(self):
        raise NotImplementedError

    def items(self):
        raise NotImplementedError

    def keys(self):
        raise NotImplementedError

    def values(self):
        raise NotImplementedError

    def _getLucene(self, path):
        """Open (or create) the index at ``path``; return (writer, reader, searcher)."""
        directory = FSDirectory.open(Paths.get(path))
        config = IndexWriterConfig(None)
        config.setRAMBufferSizeMB(256.0)  # faster
        config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
        writer = IndexWriter(directory, config)
        # Near-real-time reader straight from the writer.
        reader = writer.getReader()
        searcher = IndexSearcher(reader)
        return writer, reader, searcher

    def _maybeReopen(self):
        """Reopen the reader once enough modifications are buffered.

        The buffer may only be cleared after a successful reopen;
        otherwise reads through the stale reader would miss the
        buffered writes/deletes.
        """
        if len(self._latestModifications) > 10000:
            newReader = DirectoryReader.openIfChanged(self._reader, self._writer, True)
            if newReader is not None:  # idiom fix: was `if not newReader is None`
                self._reader.close()
                self._reader = newReader
                self._searcher = IndexSearcher(self._reader)
                self._latestModifications.clear()

    def commit(self):
        self._writer.commit()

    def close(self):
        self._writer.close()
def testSimple(self):
    """Per-field similarity: doc-values boost must scale scores on "foo" only."""
    writer = self.getWriter(analyzer=SimpleAnalyzer())
    doc = Document()
    field = Field("foo", "", TextField.TYPE_NOT_STORED)
    doc.add(field)
    dvField = FloatDocValuesField("foo_boost", 0.0)
    doc.add(dvField)
    field2 = Field("bar", "", TextField.TYPE_NOT_STORED)
    doc.add(field2)

    field.setStringValue("quick brown fox")
    field2.setStringValue("quick brown fox")
    dvField.setFloatValue(2.0)  # boost x2
    writer.addDocument(doc)

    field.setStringValue("jumps over lazy brown dog")
    field2.setStringValue("jumps over lazy brown dog")
    dvField.setFloatValue(4.0)  # boost x4
    writer.addDocument(doc)
    reader = writer.getReader()
    writer.close()

    # no boosting
    searcher1 = self.getSearcher(reader=reader)
    base = searcher1.getSimilarity(True)
    # boosting
    searcher2 = self.getSearcher(reader=reader)

    class _similarity(PythonPerFieldSimilarityWrapper):
        def __init__(_self, base):
            super(_similarity, _self).__init__()
            _self.base = base
            _self.fooSim = BoostingSimilarity(base, "foo_boost")

        def get(_self, field):
            # Boosted similarity only for field "foo"; everything else
            # falls through to the base similarity.
            return _self.fooSim if "foo" == field else _self.base

    searcher2.setSimilarity(_similarity(base))

    # in this case, we searched on field "foo". first document should have
    # 2x the score.
    tq = TermQuery(Term("foo", "quick"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    # BUGFIX: assertEqual's third positional argument is `msg`, not a
    # tolerance, so SCORE_EPSILON was silently ignored and the float
    # scores were compared exactly.  Use assertAlmostEqual with delta.
    self.assertAlmostEqual(
        boost.scoreDocs[0].score,
        noboost.scoreDocs[0].score * 2.0,
        delta=SCORE_EPSILON)

    # this query matches only the second document, which should have 4x
    # the score.
    tq = TermQuery(Term("foo", "jumps"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertAlmostEqual(
        boost.scoreDocs[0].score,
        noboost.scoreDocs[0].score * 4.0,
        delta=SCORE_EPSILON)

    # search on field bar just for kicks: nothing should happen, since
    # we set up our sim provider to only use foo_boost for field foo.
    tq = TermQuery(Term("bar", "quick"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertAlmostEqual(
        boost.scoreDocs[0].score,
        noboost.scoreDocs[0].score,
        delta=SCORE_EPSILON)

    reader.close()
class Indexer(Retriever):
    """Builds a Lucene index over an MLQA or Wikipedia dataset.

    NOTE(review): relies on ``self.k1``/``self.b`` and ``self.get_index``
    being provided by the ``Retriever`` base class — not visible here,
    confirm against the base class.
    """

    def __init__(self, lang, dataset, analyzer, index_path=None, data_path=None, ram_size=2048):
        """
        Returns scored documents in multiple languages.

        Parameters:
            dataset (str): one of ['mlqa_dev', 'mlqa_test', 'wiki']
            lang (str): one of ['en', 'es', 'de']
            analyzer (str): one of ['en', 'es', 'de', 'standard']
            index_path (str): optional directory for the index files
            data_path (str): optional directory for the source data
            ram_size (int): size of memory (MB) used while indexing

        Raises:
            RuntimeError: if no dataloader exists for ``dataset``.
        """
        super().__init__()
        idxdir = self.get_index(lang, dataset, index_path)
        # Pick a dataloader; self.mlqa distinguishes MLQA (has question
        # ids per document) from plain Wikipedia dumps.
        self.mlqa = True
        if dataset == 'mlqa_dev':
            self.dataset = MLQADataset('dev', lang, lang, data_path)
        elif dataset == 'mlqa_test':
            self.dataset = MLQADataset('test', lang, lang, data_path)
        elif dataset == 'wiki':
            self.mlqa = False
            self.dataset = Wiki(lang, data_path)
        else:
            raise RuntimeError("No dataloader for {}".format(dataset))
        # Stores index files; poor concurrency — try NIOFSDirectory instead.
        store = SimpleFSDirectory(Paths.get(idxdir))
        # Limit max. number of tokens per document:
        # the analyzer will not consume more tokens than that.
        #analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        # Configuration for the index writer.
        config = IndexWriterConfig(analyzers[analyzer]())
        # Creates a new index or overwrites an existing one.
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # Scoring similarity: BM25Similarity(k1=1.2, b=0.75).
        similarity = BM25Similarity(self.k1, self.b)
        config.setSimilarity(similarity)
        config.setRAMBufferSizeMB(float(ram_size))
        # Create the index writer.
        self.writer = IndexWriter(store, config)
        self.ftdata = FieldType()
        self.ftmeta = FieldType()
        # IndexSearcher will return the value of stored fields.
        self.ftdata.setStored(True)
        self.ftmeta.setStored(True)
        # Data fields are analyzed; metadata fields are indexed verbatim.
        self.ftdata.setTokenized(True)
        self.ftmeta.setTokenized(False)
        # What information is stored (probably DOCS would be sufficient):
        # DOCS: Only documents are indexed: term frequencies and positions are
        #   omitted.  Phrase and other positional queries on the field will
        #   throw an exception, and scoring will behave as if any term in the
        #   document appears only once.
        # DOCS_AND_FREQS: Only documents and term frequencies are indexed:
        #   positions are omitted.  This enables normal scoring, except Phrase
        #   and other positional queries will throw an exception.
        # DOCS_AND_FREQS_AND_POSITIONS: Indexes documents, frequencies and
        #   positions.  This is a typical default for full-text search: full
        #   scoring is enabled and positional queries are supported.
        self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.ftmeta.setIndexOptions(IndexOptions.DOCS)
        # Instantiate some reusable objects.
        # TODO: create document, add fields, then change only field values and
        # re-add the document.
        self.doc = Document()
        # Id cannot be reused because there are multiple values per document;
        # a list of field objects is kept and grown on demand (see addDoc).
        #self.fieldId = Field("id", "dummy", self.ftmeta)
        self.fieldTitle = Field("title", "dummy", self.ftdata)
        self.doc.add(self.fieldTitle)
        self.fieldContext = Field("context", "dummy", self.ftdata)
        self.doc.add(self.fieldContext)
        self.fieldIds = [Field("id", "dummy", self.ftmeta)]

    def addDoc(self, ids, title, context):
        """Add one document with (possibly multiple) ids, a title and a context.

        To save resources field objects are not created each time a new
        document is added; ``fieldIds`` keeps the already-created objects.
        """
        for n, i in enumerate(ids):
            if n < len(self.fieldIds):
                # Reuse an existing Field object, just update its value.
                self.fieldIds[n].setStringValue(i)
            else:
                self.fieldIds.append(Field("id", i, self.ftmeta))
            # Re-add every id field: they are removed again below.
            self.doc.add(self.fieldIds[n])
        self.fieldTitle.setStringValue(title)
        self.fieldContext.setStringValue(context)
        self.writer.addDocument(self.doc)
        # Because the number of ids is not known in advance, they have to be
        # deleted; otherwise the doc could contain values from the previous
        # iteration.
        self.doc.removeFields("id")

    def createIndex(self):
        """Index every document yielded by the dataset, then commit."""
        ids = []
        for i, doc in enumerate(self.dataset.get()):
            if self.mlqa:
                ids = doc['qid']
            self.addDoc(ids, doc['title'], doc['context'])
        self.commit()

    def commit(self):
        # NOTE(review): despite the name this also closes the writer (and,
        # for non-MLQA runs, the dataset) — the Indexer is single-use.
        self.writer.commit()
        self.writer.close()
        if not self.mlqa:
            self.dataset.close()
class LuceneKeyValueStore(object):
    """Persistent str -> str key/value store backed by a Lucene index.

    Keys are indexed as an (unstored) StringField "key"; values live in an
    unindexed, stored "value" field.  Writes are buffered in
    ``_latestModifications`` until the reader is lazily reopened.
    """

    def __init__(self, path):
        lazyImport()
        self._writer, self._reader, self._searcher = self._getLucene(path)
        # Buffered, not-yet-visible modifications: key -> value, or the
        # DELETED_RECORD sentinel for pending deletes.
        self._latestModifications = {}
        # One reusable Document with two reusable fields; only the field
        # values change per write.
        self._doc = Document()
        self._keyField = StringField("key", "", Field.Store.NO)
        self._valueField = Field("value", "", UNINDEXED_TYPE)
        self._doc.add(self._keyField)
        self._doc.add(self._valueField)

    def get(self, key, default=None):
        """Return self[key], or ``default`` when the key is absent."""
        try:
            return self[key]
        except KeyError:
            return default

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``; both are coerced to str."""
        key = str(key)
        value = str(value)
        self._maybeReopen()
        self._keyField.setStringValue(key)
        self._valueField.setStringValue(value)
        # updateDocument replaces any existing document matching the key
        # term, keeping at most one document per key.
        self._writer.updateDocument(Term("key", key), self._doc)
        self._latestModifications[key] = value

    def __getitem__(self, key):
        """Return the value for ``key``; raise KeyError when absent."""
        key = str(key)
        value = self._latestModifications.get(key)
        if value is DELETED_RECORD:
            raise KeyError(key)
        if value is not None:  # idiom fix: was `if not value is None`
            return value
        self._maybeReopen()
        topDocs = self._searcher.search(TermQuery(Term("key", key)), 1)
        # BUGFIX/consistency: in Lucene 8+ totalHits is a TotalHits object,
        # so `totalHits == 0` never matched and KeyError was never raised;
        # compare the numeric `.value` instead (as the other copy of this
        # class in this file already does).  Confirm against the Lucene
        # version actually in use.
        if topDocs.totalHits.value == 0:
            raise KeyError(key)
        return self._searcher.doc(topDocs.scoreDocs[0].doc).get("value")

    def __delitem__(self, key):
        """Delete ``key``; deleting a missing key is a silent no-op."""
        key = str(key)
        self._writer.deleteDocuments(Term("key", key))
        self._latestModifications[key] = DELETED_RECORD

    # Deliberately unsupported dict-protocol operations: Lucene offers no
    # cheap way to enumerate all live key/value pairs here.
    def __len__(self):
        raise NotImplementedError

    def __iter__(self):
        raise NotImplementedError

    def items(self):
        raise NotImplementedError

    def keys(self):
        raise NotImplementedError

    def values(self):
        raise NotImplementedError

    def _getLucene(self, path):
        """Open (or create) the index at ``path``; return (writer, reader, searcher)."""
        directory = FSDirectory.open(Paths.get(path))
        config = IndexWriterConfig(None)
        config.setRAMBufferSizeMB(256.0)  # faster
        config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
        writer = IndexWriter(directory, config)
        # Near-real-time reader straight from the writer.
        reader = writer.getReader()
        searcher = IndexSearcher(reader)
        return writer, reader, searcher

    def _maybeReopen(self):
        """Reopen the reader once enough modifications are buffered.

        The buffer may only be cleared after a successful reopen;
        otherwise reads through the stale reader would miss the
        buffered writes/deletes.
        """
        if len(self._latestModifications) > 10000:
            newReader = DirectoryReader.openIfChanged(self._reader, self._writer, True)
            if newReader is not None:  # idiom fix: was `if not newReader is None`
                self._reader.close()
                self._reader = newReader
                self._searcher = IndexSearcher(self._reader)
                self._latestModifications.clear()

    def commit(self):
        self._writer.commit()

    def close(self):
        self._writer.close()
def testSimple(self):
    """Per-field similarity: doc-values boost must scale scores on "foo" only."""
    writer = self.getWriter(analyzer=SimpleAnalyzer())
    doc = Document()
    field = Field("foo", "", TextField.TYPE_NOT_STORED)
    doc.add(field)
    dvField = FloatDocValuesField("foo_boost", 0.0)
    doc.add(dvField)
    field2 = Field("bar", "", TextField.TYPE_NOT_STORED)
    doc.add(field2)

    field.setStringValue("quick brown fox")
    field2.setStringValue("quick brown fox")
    dvField.setFloatValue(2.0)  # boost x2
    writer.addDocument(doc)

    field.setStringValue("jumps over lazy brown dog")
    field2.setStringValue("jumps over lazy brown dog")
    dvField.setFloatValue(4.0)  # boost x4
    writer.addDocument(doc)
    reader = writer.getReader()
    writer.close()

    # no boosting
    searcher1 = self.getSearcher(reader=reader)
    base = searcher1.getSimilarity(True)
    # boosting
    searcher2 = self.getSearcher(reader=reader)

    class _similarity(PythonPerFieldSimilarityWrapper):
        def __init__(_self, base):
            super(_similarity, _self).__init__()
            _self.base = base
            _self.fooSim = BoostingSimilarity(base, "foo_boost")

        def get(_self, field):
            # Boosted similarity only for field "foo"; everything else
            # falls through to the base similarity.
            return _self.fooSim if "foo" == field else _self.base

    searcher2.setSimilarity(_similarity(base))

    # in this case, we searched on field "foo". first document should have
    # 2x the score.
    tq = TermQuery(Term("foo", "quick"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    # BUGFIX: assertEqual's third positional argument is `msg`, not a
    # tolerance, so SCORE_EPSILON was silently ignored and the float
    # scores were compared exactly.  Use assertAlmostEqual with delta.
    self.assertAlmostEqual(boost.scoreDocs[0].score,
                           noboost.scoreDocs[0].score * 2.0,
                           delta=SCORE_EPSILON)

    # this query matches only the second document, which should have 4x
    # the score.
    tq = TermQuery(Term("foo", "jumps"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertAlmostEqual(boost.scoreDocs[0].score,
                           noboost.scoreDocs[0].score * 4.0,
                           delta=SCORE_EPSILON)

    # search on field bar just for kicks: nothing should happen, since
    # we set up our sim provider to only use foo_boost for field foo.
    tq = TermQuery(Term("bar", "quick"))
    noboost = searcher1.search(tq, 10)
    boost = searcher2.search(tq, 10)
    self.assertEqual(1, noboost.totalHits)
    self.assertEqual(1, boost.totalHits)
    self.assertAlmostEqual(boost.scoreDocs[0].score,
                           noboost.scoreDocs[0].score,
                           delta=SCORE_EPSILON)

    reader.close()