Example #1
 def build_jsoup(self, path, replace=False, optimize=True):
     with logger.duration(f'building {self._path}'):
         if self._settings['built']:
             if replace:
                 logger.warn(f'removing index: {self._path}')
                 shutil.rmtree(self._path)
             else:
                 logger.warn(f'adding to existing index: {self._path}')
         thread_count = onir.util.safe_thread_count()
         index_args = J.A_IndexArgs()
         index_args.collectionClass = 'TrecCollection'
         index_args.generatorClass = 'JsoupGenerator'
         index_args.threads = thread_count
         index_args.input = path
         index_args.index = self._path
         index_args.storePositions = True
         index_args.storeDocvectors = True
         index_args.storeRawDocs = True
         index_args.storeTransformedDocs = True
         index_args.keepStopwords = self._settings['keep_stops']
         index_args.stemmer = self._settings['stemmer']
         index_args.optimize = optimize
         indexer = J.A_IndexCollection(index_args)
         thread = threading.Thread(target=indexer.run)
         thread.start()
         thread.join()
         self._settings['built'] = True
         self._dump_settings()
Example #3
 def _model(self, model):
     if model == 'randomqrels':
         return self._model('bm25_k1-0.6_b-0.5')
     if model.startswith('bm25'):
         k1, b = 0.9, 0.4
         Model = J.L_BM25Similarity
         for arg in model.split('_')[1:]:
             if '-' in arg:
                 k, v = arg.split('-')
             else:
                 k, v = arg, None
             if k == 'k1':
                 k1 = float(v)
             elif k == 'b':
                 b = float(v)
             elif k == 'noidf':
                 Model = J.A_BM25SimilarityNoIdf
             else:
                 raise ValueError(f'unknown bm25 parameter {k}={v}')
         return Model(k1, b)
     elif model == 'vsm':
         return J.L_ClassicSimilarity()
      elif model.startswith('ql'):  # allow parameters, e.g. 'ql_mu-1500'
         mu = 1000.
         for k, v in [arg.split('-') for arg in model.split('_')[1:]]:
             if k == 'mu':
                 mu = float(v)
             else:
                 raise ValueError(f'unknown ql parameter {k}={v}')
         return J.L_LMDirichletSimilarity(mu)
     raise ValueError(f'unknown model {model}')
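
The model strings handled above follow a simple `name_param-value` convention separated by underscores (e.g. `bm25_k1-0.6_b-0.5`, `bm25_noidf`). A minimal, Java-free sketch of that parsing, shown only to illustrate the format; the helper name `parse_model_string` is ours, not part of the codebase:

def parse_model_string(model):
    """Split a model string like 'bm25_k1-0.6_b-0.5' into its name and parameters."""
    name, *raw_args = model.split('_')
    params = {}
    for arg in raw_args:
        if '-' in arg:
            k, v = arg.split('-', 1)
        else:
            k, v = arg, None  # flag-style parameter, e.g. 'noidf'
        params[k] = v
    return name, params

# parse_model_string('bm25_k1-0.6_b-0.5')  -> ('bm25', {'k1': '0.6', 'b': '0.5'})
# parse_model_string('bm25_noidf')         -> ('bm25', {'noidf': None})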
Example #4
 def build(self,
           doc_iter,
           replace=False,
           optimize=True,
           store_term_weights=False):
     with logger.duration(f'building {self._base_path}'):
         thread_count = onir.util.safe_thread_count()
         with tempfile.TemporaryDirectory() as d:
             if self._settings['built']:
                 if replace:
                     logger.warn(f'removing index: {self._base_path}')
                     shutil.rmtree(self._base_path)
                 else:
                     logger.warn(
                         f'adding to existing index: {self._base_path}')
             fifos = []
             for t in range(thread_count):
                 fifo = os.path.join(d, f'{t}.json')
                 os.mkfifo(fifo)
                 fifos.append(fifo)
             index_args = J.A_IndexArgs()
             index_args.collectionClass = 'JsonCollection'
             index_args.generatorClass = 'LuceneDocumentGenerator'
             index_args.threads = thread_count
             index_args.input = d
             index_args.index = self._base_path
             index_args.storePositions = True
             index_args.storeDocvectors = True
             index_args.storeTermWeights = store_term_weights
             index_args.keepStopwords = self._settings['keep_stops']
             index_args.stemmer = self._settings['stemmer']
             index_args.optimize = optimize
             indexer = J.A_IndexCollection(index_args)
             thread = threading.Thread(target=indexer.run)
             thread.start()
              time.sleep(1)  # give it some time to start up, otherwise fails due to race condition
             for i, doc in enumerate(doc_iter):
                 f = fifos[hash(i) % thread_count]
                 if isinstance(f, str):
                     f = open(f, 'wt')
                     fifos[hash(i) % thread_count] = f
                 data = {'id': doc.did, 'contents': 'a'}
                 data.update(doc.data)
                 json.dump(data, f)
                 f.write('\n')
             for f in fifos:
                 if not isinstance(f, str):
                     f.close()
                 else:
                     with open(f, 'wt'):
                         pass  # open and close to indicate file is done
             logger.debug('waiting to join')
             thread.join()
             self._settings['built'] = True
             self._dump_settings()
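
The FIFOs above feed Anserini's `JsonCollection` reader with newline-delimited JSON, one document per line. A small sketch of the payload each loop iteration writes; the `'contents': 'a'` placeholder comes from the code, while the `'text'` field stands in for whatever `doc.data` actually contains:

import json

# One line per document, as written into the FIFO above.
payload = {'id': 'D123', 'contents': 'a'}          # placeholder contents, as in the code
payload.update({'text': 'example document body'})  # doc.data fields; 'text' is illustrative
line = json.dumps(payload) + '\n'
# -> '{"id": "D123", "contents": "a", "text": "example document body"}\n'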
Example #5
 def term2idf_unstemmed(self, term):
     term = J.A_AnalyzerUtils.analyze(self._get_analyzer(), term).toArray()
     if len(term) == 1:
         df = self._reader().docFreq(
             J.L_Term(J.A_IndexArgs.CONTENTS, term[0]))
         return math.log((self._reader().numDocs() + 1) / (df + 1))
     return 0.  # stop word; very common
 def term2idf_unstemmed(self, term):
     term = J.A_AnalyzerUtils.tokenize(self._get_analyzer(), term).toArray()
     if len(term) == 1:
         df = self._reader().docFreq(J.L_Term(self._primary_field, term[0]))
         doc_count = self.collection_stats().docCount()
         return math.log(1 + (doc_count - df + 0.5) / (df + 0.5))
     return 0.  # stop word; very common
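
The two variants above differ only in the analyzer call and in the IDF formula: a smoothed log ratio in the first, a BM25-style IDF over the collection's document count in the second. The arithmetic alone, with toy numbers for comparison:

import math

def idf_smoothed(num_docs, df):
    # first variant: log((N + 1) / (df + 1))
    return math.log((num_docs + 1) / (df + 1))

def idf_bm25(doc_count, df):
    # second variant: log(1 + (N - df + 0.5) / (df + 0.5))
    return math.log(1 + (doc_count - df + 0.5) / (df + 0.5))

# With N = 1_000_000 documents and df = 1_000, both give roughly 6.91.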
Example #7
 def get_query_doc_scores(self, query, did, model, skip_invividual=False):
     sim = self._model(model)
     self._searcher().setSimilarity(sim)
     ldid = self._get_index_utils().convertDocidToLuceneDocid(did)
     if ldid == -1:
         return -999. * len(query), [-999.] * len(query)
     analyzer = self._get_stemmed_analyzer()
     query = list(itertools.chain(*[J.A_AnalyzerUtils.analyze(analyzer, t).toArray() for t in query]))
     if not skip_invividual:
         result = []
         for q in query:
             q = _anserini_escape(q, J)
             lquery = J.L_QueryParser().parse(q, J.A_IndexArgs.CONTENTS)
             explain = self._searcher().explain(lquery, ldid)
             result.append(explain.getValue().doubleValue())
         return sum(result), result
     lquery = J.L_QueryParser().parse(_anserini_escape(' '.join(query), J), J.A_IndexArgs.CONTENTS)
     explain = self._searcher().explain(lquery, ldid)
     return explain.getValue()
Example #8
 def get_query_doc_scores_batch(self, query, dids, model):
     sim = self._model(model)
     self._searcher().setSimilarity(sim)
     ldids = {self._get_index_utils().convertDocidToLuceneDocid(did): did for did in dids}
     analyzer = self._get_stemmed_analyzer()
     query = J.A_AnalyzerUtils.analyze(analyzer, query).toArray()
     query = ' '.join(_anserini_escape(q, J) for q in query)
     docs = ' '.join(f'{J.A_IndexArgs.ID}:{did}' for did in dids)
     lquery = J.L_QueryParser().parse(f'({query}) AND ({docs})', J.A_IndexArgs.CONTENTS)
     result = {}
     search_results = self._searcher().search(lquery, len(dids))
     for top_doc in search_results.scoreDocs:
         result[ldids[top_doc.doc]] = top_doc.score
     del search_results
     return result
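
The batch variant scores several documents in a single search by ANDing the analyzed query terms with a disjunction over the requested doc ids. A string-only sketch of the query it assembles, assuming `J.A_IndexArgs.ID` resolves to the field name `id` (the helper below is illustrative, not part of the codebase):

def batch_query_string(analyzed_terms, dids, id_field='id'):
    # Mirrors the string built above: '(<terms>) AND (id:d1 id:d2 ...)'
    query = ' '.join(analyzed_terms)
    docs = ' '.join(f'{id_field}:{did}' for did in dids)
    return f'({query}) AND ({docs})'

# batch_query_string(['neural', 'rank'], ['D1', 'D7'])
#   -> '(neural rank) AND (id:D1 id:D7)'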
Example #9
 def simple_searcher(self, model):
     result = J.A_SimpleSearcher(self._path)
     if model.startswith('bm25'):
         k1, b = 0.9, 0.4
         model_args = [arg.split('-', 1) for arg in model.split('_')[1:]]
         for arg in model_args:
             if len(arg) == 1:
                 k, v = arg[0], None
             elif len(arg) == 2:
                 k, v = arg
              if k == 'k1':
                  k1 = float(v)  # parameter values arrive as strings; convert for the Java call
              elif k == 'b':
                  b = float(v)
             else:
                 raise ValueError(f'unknown bm25 parameter {arg}')
         result.setBM25Similarity(k1, b)
     else:
         raise ValueError(f'unsupported model {model}')
     return result
Example #10
 def _get_stemmed_analyzer(self):
     #return J.A_DefaultEnglishAnalyzer(self._settings['stemmer'], J.L_CharArraySet(0, False))
     return J.A_DefaultEnglishAnalyzer.newStemmingInstance(
         self._settings['stemmer'], J.L_CharArraySet(0, False))
Example #11
 def _get_analyzer(self):
     return J.L_StandardAnalyzer(J.L_CharArraySet(0, False))
Example #12
 def _get_index_utils(self):
     return J.A_IndexUtils(self._path)
Example #13
 def doc_freq(self, term):
     return self._reader().docFreq(J.L_Term(self._primary_field, term))
Example #14
 def _searcher(self):
     return J.L_IndexSearcher(self._reader().getContext())
Example #15
 def _reader(self):
     return J.L_DirectoryReader.open(
         J.L_FSDirectory.open(J.File(self._path).toPath()))
 def _get_stemmed_analyzer(self):
     return J.A_EnglishStemmingAnalyzer(self._settings['stemmer'],
                                        J.L_CharArraySet(0, False))
Example #17
 def batch_query(self, queries, model, topk, destf=None, quiet=False):
     THREADS = onir.util.safe_thread_count()
     query_file_splits = 1000
     if hasattr(queries, '__len__'):
         if len(queries) < THREADS:
             THREADS = len(queries)
             query_file_splits = 1
         elif len(queries) < THREADS * 10:
             query_file_splits = ((len(queries)+1) // THREADS)
         elif len(queries) < THREADS * 100:
             query_file_splits = ((len(queries)+1) // (THREADS * 10))
         else:
             query_file_splits = ((len(queries)+1) // (THREADS * 100))
     with tempfile.TemporaryDirectory() as topic_d, tempfile.TemporaryDirectory() as run_d:
         run_f = os.path.join(run_d, 'run')
         topic_files = []
         current_file = None
         total_topics = 0
         for i, (qid, text) in enumerate(queries):
             topic_file = '{}/{}.queries'.format(topic_d, i // query_file_splits)
             if current_file is None or current_file.name != topic_file:
                 if current_file is not None:
                     topic_files.append(current_file.name)
                     current_file.close()
                 current_file = open(topic_file, 'wt')
             current_file.write(f'{qid}\t{text}\n')
             total_topics += 1
          if current_file is not None:
              topic_files.append(current_file.name)
              current_file.close()
         args = J.A_SearchArgs()
         parser = J.M_CmdLineParser(args)
         arg_args = [
             '-index', self._path,
             '-topics', *topic_files,
             '-output', run_f,
             '-topicreader', 'TsvString',
             '-threads', str(THREADS),
             '-hits', str(topk),
             '-language', self._settings['lang'],
         ]
         if model.startswith('bm25'):
             arg_args.append('-bm25')
             model_args = [arg.split('-', 1) for arg in model.split('_')[1:]]
             for arg in model_args:
                 if len(arg) == 1:
                     k, v = arg[0], None
                 elif len(arg) == 2:
                     k, v = arg
                 if k == 'k1':
                     arg_args.append('-bm25.k1')
                     arg_args.append(v)
                 elif k == 'b':
                     arg_args.append('-bm25.b')
                     arg_args.append(v)
                 elif k == 'rm3':
                     arg_args.append('-rm3')
                 elif k == 'rm3.fbTerms':
                     arg_args.append('-rm3.fbTerms')
                     arg_args.append(v)
                 elif k == 'rm3.fbDocs':
                     arg_args.append('-rm3.fbDocs')
                     arg_args.append(v)
                 else:
                     raise ValueError(f'unknown bm25 parameter {arg}')
         elif model.startswith('ql'):
             arg_args.append('-qld')
             model_args = [arg.split('-', 1) for arg in model.split('_')[1:]]
             for arg in model_args:
                 if len(arg) == 1:
                     k, v = arg[0], None
                 elif len(arg) == 2:
                     k, v = arg
                 if k == 'mu':
                     arg_args.append('-qld.mu')
                     arg_args.append(v)
                 else:
                     raise ValueError(f'unknown ql parameter {arg}')
         elif model.startswith('sdm'):
             arg_args.append('-sdm')
             arg_args.append('-qld')
             model_args = [arg.split('-', 1) for arg in model.split('_')[1:]]
             for arg in model_args:
                 if len(arg) == 1:
                     k, v = arg[0], None
                 elif len(arg) == 2:
                     k, v = arg
                 if k == 'mu':
                     arg_args.append('-qld.mu')
                     arg_args.append(v)
                 elif k == 'tw':
                     arg_args.append('-sdm.tw')
                     arg_args.append(v)
                 elif k == 'ow':
                     arg_args.append('-sdm.ow')
                     arg_args.append(v)
                 elif k == 'uw':
                     arg_args.append('-sdm.uw')
                     arg_args.append(v)
                 else:
                     raise ValueError(f'unknown sdm parameter {arg}')
         else:
             raise ValueError(f'unknown model {model}')
         parser.parseArgument(*arg_args)
         with contextlib.ExitStack() as stack:
             stack.enter_context(J.listen_java_log(_surpress_log('io.anserini.search.SearchCollection')))
             if not quiet:
                 pbar = stack.enter_context(logger.pbar_raw(desc=f'batch_query ({model})', total=total_topics))
                 stack.enter_context(J.listen_java_log(pbar_bq_listener(pbar)))
             searcher = J.A_SearchCollection(args)
             searcher.runTopics()
             searcher.close()
         if destf:
             shutil.copy(run_f, destf)
         else:
             return trec.read_run_dict(run_f)
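
For reference, a hedged usage sketch of this method. Only the argument shapes come from the code above: `queries` is an iterable of `(qid, text)` pairs (written out as TSV topics), `model` is a string such as `'bm25_k1-0.9_b-0.4'` or `'ql_mu-1000'`, and when `destf` is omitted the return value is whatever `trec.read_run_dict` produces. The `index` object itself is hypothetical:

# Hypothetical usage -- 'index' stands in for whatever object these methods are bound to.
queries = [
    ('q1', 'neural ranking models'),
    ('q2', 'bm25 parameter tuning'),
]
run = index.batch_query(queries, model='bm25_k1-0.9_b-0.4', topk=100)
# run: a run dictionary keyed by query id, read back from the temporary run file.
# Pass destf='path/to/run' to copy the TREC run file there instead of returning a dict.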
Example #18
    def batch_query(self, queries, model, topk, destf=None, quiet=False):
        THREADS = onir.util.safe_thread_count()
        query_file_splits = 1000
        if hasattr(queries, '__len__'):
            if len(queries) < THREADS:
                THREADS = len(queries)
                query_file_splits = 1
            elif len(queries) < THREADS * 10:
                query_file_splits = ((len(queries) + 1) // THREADS)
            elif len(queries) < THREADS * 100:
                query_file_splits = ((len(queries) + 1) // (THREADS * 10))
            else:
                query_file_splits = ((len(queries) + 1) // (THREADS * 100))
         with tempfile.TemporaryDirectory() as topic_d, \
              tempfile.TemporaryDirectory() as run_d:
            run_f = os.path.join(run_d, 'run')
            topic_files = []
            file_topic_counts = []
            current_file = None
            total_topics = 0
            for i, (qid, text) in enumerate(queries):
                topic_file = '{}/{}.queries'.format(topic_d,
                                                    i // query_file_splits)
                if current_file is None or current_file.name != topic_file:
                    if current_file is not None:
                        topic_files.append(current_file.name)
                        current_file.close()
                    current_file = open(topic_file, 'wt')
                    file_topic_counts.append(0)
                current_file.write(f'{qid}\t{text}\n')
                file_topic_counts[-1] += 1
                total_topics += 1
            if current_file is not None:
                topic_files.append(current_file.name)
                current_file.close()
            J.initialize()
            with ThreadPool(THREADS) as pool, \
                 logger.pbar_raw(desc=f'batch_query ({model})', total=total_topics) as pbar:

                def fn(inputs):
                    file, count = inputs
                    args = J.A_SearchArgs()
                    parser = J.M_CmdLineParser(args)
                    arg_args = [
                        '-index',
                        self._path,
                        '-topics',
                        file,
                        '-output',
                        file + '.run',
                        '-topicreader',
                        'TsvString',
                        '-hits',
                        str(topk),
                        '-stemmer',
                        self._settings['stemmer'],
                        '-indexfield',
                        self._primary_field,
                    ]
                    arg_args += self._model2args(model)
                    parser.parseArgument(*arg_args)
                    searcher = J.A_SearchCollection(args)
                    searcher.runTopics()
                    searcher.close()
                    return file + '.run', count

                if destf:
                    result = open(destf + '.tmp', 'wb')
                else:
                    result = {}
                for resultf, count in pool.imap_unordered(
                        fn, zip(topic_files, file_topic_counts)):
                    if destf:
                        with open(resultf, 'rb') as f:
                            for line in f:
                                result.write(line)
                    else:
                        run = trec.read_run_dict(resultf)
                        result.update(run)
                    pbar.update(count)
                if destf:
                    result.close()
                    shutil.move(destf + '.tmp', destf)
                else:
                    return result
Example #19
J.register(
    jars=[
        "bin/lucene-backward-codecs-8.0.0.jar", "bin/anserini-0.8.0-fatjar.jar"
    ],
    defs=dict(
        # [L]ucene
        L_FSDirectory='org.apache.lucene.store.FSDirectory',
        L_DirectoryReader='org.apache.lucene.index.DirectoryReader',
        L_Term='org.apache.lucene.index.Term',
        L_IndexSearcher='org.apache.lucene.search.IndexSearcher',
        L_BM25Similarity='org.apache.lucene.search.similarities.BM25Similarity',
         L_ClassicSimilarity='org.apache.lucene.search.similarities.ClassicSimilarity',
         L_LMDirichletSimilarity='org.apache.lucene.search.similarities.LMDirichletSimilarity',
         L_QueryParser='org.apache.lucene.queryparser.flexible.standard.StandardQueryParser',
         L_QueryParserUtil='org.apache.lucene.queryparser.flexible.standard.QueryParserUtil',
         L_StandardAnalyzer='org.apache.lucene.analysis.standard.StandardAnalyzer',
        L_EnglishAnalyzer='org.apache.lucene.analysis.en.EnglishAnalyzer',
        L_CharArraySet='org.apache.lucene.analysis.CharArraySet',
        L_MultiFields='org.apache.lucene.index.MultiFields',

        # [A]nserini
        A_IndexCollection='io.anserini.index.IndexCollection',
        A_IndexArgs='io.anserini.index.IndexArgs',
        A_IndexUtils='io.anserini.index.IndexUtils',
         A_LuceneDocumentGenerator='io.anserini.index.generator.LuceneDocumentGenerator',
        A_SearchCollection='io.anserini.search.SearchCollection',
        A_SearchArgs='io.anserini.search.SearchArgs',
        A_DefaultEnglishAnalyzer='io.anserini.analysis.DefaultEnglishAnalyzer',
        A_AnalyzerUtils='io.anserini.analysis.AnalyzerUtils',

        # [M]isc
        M_CmdLineParser='org.kohsuke.args4j.CmdLineParser',
    ))
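
`J.register` maps short aliases (`L_*` for Lucene, `A_*` for Anserini, `M_*` for miscellaneous) onto fully qualified Java class names loaded from the listed jars. The bridge's implementation is not shown here; as a rough illustration only, a registry like this could be backed by pyjnius along the following lines (an assumption about how `J` might work, not its actual code):

# Sketch only: assumes pyjnius; the real bridge may be implemented differently.
import jnius_config

class JavaRegistry:
    def __init__(self):
        self._defs = {}

    def register(self, jars=(), defs=None):
        jnius_config.add_classpath(*jars)  # must run before the JVM is started
        self._defs.update(defs or {})

    def __getattr__(self, alias):
        from jnius import autoclass  # importing jnius starts the JVM on first use
        return autoclass(self._defs[alias])

# J = JavaRegistry()
# J.register(jars=['bin/anserini-0.8.0-fatjar.jar'],
#            defs=dict(L_Term='org.apache.lucene.index.Term'))
# J.L_Term('contents', 'neural')  -> an org.apache.lucene.index.Term instance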