Example #1
class Indexer(Retriever):
    def __init__(self,
                 lang,
                 dataset,
                 analyzer,
                 index_path=None,
                 data_path=None,
                 ram_size=2048):
        """ Returns scored documents in multiple languages.

        Parameters:
        dataset  (str): ['mlqa_dev', 'mlqa_test', 'wiki']
        lang     (str): ['en', 'es', 'de']
        anlyzer  (str): ['en', 'es', 'de', 'standard']
        ram_size (int): Size of memory used while indexing

        Returns:
        """
        super().__init__()
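        # get_index() and the BM25 parameters self.k1 / self.b used below
        # are assumed to be inherited from the Retriever base class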

        idxdir = self.get_index(lang, dataset, index_path)
        self.mlqa = True
        if dataset == 'mlqa_dev':
            self.dataset = MLQADataset('dev', lang, lang, data_path)
        elif dataset == 'mlqa_test':
            self.dataset = MLQADataset('test', lang, lang, data_path)
        elif dataset == 'wiki':
            self.mlqa = False
            self.dataset = Wiki(lang, data_path)
        else:
            raise RuntimeError("No dataloader for {}".format(dataset))

        # stores index files on disk; SimpleFSDirectory has poor concurrency,
        # try NIOFSDirectory instead
        store = SimpleFSDirectory(Paths.get(idxdir))
        # to limit the max. number of tokens per document, wrap the analyzer
        # so it will not consume more tokens than that:
        #analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        # configuration for index writer
        config = IndexWriterConfig(analyzers[analyzer]())
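        # `analyzers` is assumed to be a module-level dict mapping the
        # language codes above to Lucene Analyzer classes, e.g.
        # 'en' -> EnglishAnalyzer, 'standard' -> StandardAnalyzer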
        # creates or overwrites index
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # set BM25 scoring; Lucene's defaults are k1=1.2, b=0.75
        similarity = BM25Similarity(self.k1, self.b)
        config.setSimilarity(similarity)
        config.setRAMBufferSizeMB(float(ram_size))
        # create index writer
        self.writer = IndexWriter(store, config)

        self.ftdata = FieldType()
        self.ftmeta = FieldType()
        # IndexSearcher will return the value of the field
        self.ftdata.setStored(True)
        self.ftmeta.setStored(True)
        # will be analyzed by Analyzer
        self.ftdata.setTokenized(True)
        self.ftmeta.setTokenized(False)
        # what information is stored (probably DOCS would be sufficient)
        # DOCS: Only documents are indexed: term frequencies and positions are omitted.
        #       Phrase and other positional queries on the field will throw an exception,
        #       and scoring will behave as if any term in the document appears only once.
        # DOCS_AND_FREQS: Only documents and term frequencies are indexed: positions are
        #       omitted. This enables normal scoring, except Phrase and other positional
        #       queries will throw an exception.
        # DOCS_AND_FREQS_AND_POSITIONS: Indexes documents, frequencies and positions.
        #       This is a typical default for full-text search: full scoring is enabled
        #       and positional queries are supported.
        self.ftdata.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.ftmeta.setIndexOptions(IndexOptions.DOCS)
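        # ids are stored verbatim and never tokenized, so exact-match lookups
        # are all that is needed: DOCS (no frequencies or positions) suffices
        # for the metadata field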
        # instantiate some reusable objects
        # TODO: create the document and add the fields once, then only update
        # the field values and re-add the document
        self.doc = Document()
        # the id field cannot simply be reused because a document may carry
        # multiple id values; a list of id fields is kept and grown on demand
        #self.fieldId = Field("id", "dummy", self.ftmeta)
        self.fieldTitle = Field("title", "dummy", self.ftdata)
        self.doc.add(self.fieldTitle)
        self.fieldContext = Field("context", "dummy", self.ftdata)
        self.doc.add(self.fieldContext)
        self.fieldIds = [Field("id", "dummy", self.ftmeta)]

    def addDoc(self, ids, title, context):
        # to save resources, Field objects are not created each time a new
        # document is added; fieldIds caches the already created objects
        for n, i in enumerate(ids):
            if n < len(self.fieldIds):
                self.fieldIds[n].setStringValue(i)
            else:
                self.fieldIds.append(Field("id", i, self.ftmeta))
            self.doc.add(self.fieldIds[n])

        self.fieldTitle.setStringValue(title)
        self.fieldContext.setStringValue(context)
        self.writer.addDocument(self.doc)
        # because the number of ids is not known in advance, the id fields
        # have to be removed; otherwise the document could keep values from
        # the previous iteration
        self.doc.removeFields("id")

    def createIndex(self):
        ids = []
        for doc in self.dataset.get():
            if self.mlqa:
                ids = doc['qid']
            self.addDoc(ids, doc['title'], doc['context'])
        self.commit()

    def commit(self):
        self.writer.commit()
        self.writer.close()
        if not self.mlqa:
            self.dataset.close()
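
A minimal usage sketch, not part of the original example: it assumes the surrounding module defines Retriever, MLQADataset, Wiki and the `analyzers` mapping, imports the Lucene classes used above (SimpleFSDirectory, Paths, BM25Similarity, etc.), and that the paths and query string here are placeholders.

import lucene
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser

lucene.initVM()  # the JVM must be running before any Lucene class is used

# build a BM25-scored index over English Wikipedia (paths are hypothetical)
indexer = Indexer(lang='en',
                  dataset='wiki',
                  analyzer='en',
                  index_path='/tmp/wiki_index',
                  data_path='/tmp/wiki_data')
indexer.createIndex()  # iterates the dataset, then commits and closes

# read the stored fields back; setStored(True) above is what makes
# doc.get('title') / doc.get('context') available at search time
reader = DirectoryReader.open(SimpleFSDirectory(Paths.get('/tmp/wiki_index')))
searcher = IndexSearcher(reader)
searcher.setSimilarity(BM25Similarity(1.2, 0.75))
query = QueryParser('context', analyzers['en']()).parse('example question')
for hit in searcher.search(query, 5).scoreDocs:
    doc = searcher.doc(hit.doc)
    print(hit.score, doc.get('title'))
reader.close()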