Example #1
    def test_lucene_analyzer_en_book_examples(self):
        analyzer = Analyzer(get_lucene_analyzer())

        tokens = analyzer.analyze(
            'walking talking balking biking hiking rolling scrolling')
        self.assertEqual(
            ['walk', 'talk', 'balk', 'bike', 'hike', 'roll', 'scroll'], tokens)

        tokens = analyzer.analyze('rolling scrolling')
        self.assertEqual(['roll', 'scroll'], tokens)

        tokens = analyzer.analyze('biostatistics')
        self.assertEqual(['biostatist'], tokens)

        tokens = analyzer.analyze('adversarial')
        self.assertEqual(['adversari'], tokens)
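The tests above assume Pyserini's analyzer wrappers are already imported. A minimal sketch of that setup (the sample text is illustrative; the expected output matches the assertions above):

from pyserini.analysis import Analyzer, get_lucene_analyzer

# Default analyzer: English with Porter stemming and stopword removal.
analyzer = Analyzer(get_lucene_analyzer())
analyzer.analyze('walking talking')  # -> ['walk', 'talk']

# Language-specific analyzers are selected with the name parameter,
# e.g. get_lucene_analyzer(name='french') or get_lucene_analyzer(name='cjk'),
# as Examples #2 and #3 show.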
Example #2
    def test_lucene_analyzer_fr_book_examples(self):
        analyzer = Analyzer(get_lucene_analyzer(name='french'))

        tokens = analyzer.analyze(
            'marche parler vélo randonnée rouler défilement')
        self.assertEqual(['march', 'parl', 'vélo', 'randon', 'roul', 'defil'],
                         tokens)

        tokens = analyzer.analyze('défilement roulant')
        self.assertEqual(['defil', 'roulant'], tokens)

        tokens = analyzer.analyze('biostatistique')
        self.assertEqual(['biostatist'], tokens)

        tokens = analyzer.analyze('antagoniste')
        self.assertEqual(['antagonist'], tokens)
Example #3
    def test_lucene_analyzer_zh_book_examples(self):
        analyzer = Analyzer(get_lucene_analyzer(name='cjk'))

        tokens = analyzer.analyze('走路说话骑自行车远足滚动滚动')
        self.assertEqual([
            '走路', '路说', '说话', '话骑', '骑自', '自行', '行车', '车远', '远足', '足滚', '滚动',
            '动滚', '滚动'
        ], tokens)

        tokens = analyzer.analyze('滚动滚动')
        self.assertEqual(['滚动', '动滚', '滚动'], tokens)

        tokens = analyzer.analyze('生物统计学')
        self.assertEqual(['生物', '物统', '统计', '计学'], tokens)

        tokens = analyzer.analyze('对抗的')
        self.assertEqual(['对抗', '抗的'], tokens)
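Note that the 'cjk' analyzer segments CJK text into overlapping character bigrams rather than words, which is why adjacent tokens in the expected lists above share a character.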
Example #4
from scipy.sparse import csr_matrix

from pyserini import index, search
from pyserini.analysis import Analyzer, get_lucene_analyzer


class Vectorizer:
    """Base class for vectorizer implemented on top of Pyserini.

    Parameters
    ----------
    lucene_index_path : str
        Path to the Lucene index directory
    min_df : int
        Minimum acceptable document frequency
    verbose : bool
        Whether to print out debugging information
    """
    def __init__(self,
                 lucene_index_path: str,
                 min_df: int = 1,
                 verbose: bool = False):
        self.min_df: int = min_df
        self.verbose: bool = verbose
        self.index_reader = index.IndexReader(lucene_index_path)
        self.searcher = search.LuceneSearcher(lucene_index_path)
        self.num_docs: int = self.searcher.num_docs
        self.stats = self.index_reader.stats()
        self.analyzer = Analyzer(get_lucene_analyzer())

        # build vocabulary
        self.vocabulary_ = set()
        for term in self.index_reader.terms():
            if term.df > self.min_df:
                self.vocabulary_.add(term.term)
        self.vocabulary_ = sorted(self.vocabulary_)

        # build term to index mapping
        self.term_to_index = {}
        for i, term in enumerate(self.vocabulary_):
            self.term_to_index[term] = i
        self.vocabulary_size = len(self.vocabulary_)

        if self.verbose:
            print(
                f'Found {self.vocabulary_size} terms with min_df={self.min_df}'
            )

    def get_query_vector(self, query: str):
        matrix_row, matrix_col, matrix_data = [], [], []
        tokens = self.analyzer.analyze(query)
        for term in tokens:
            if term in self.vocabulary_:
                matrix_row.append(0)
                matrix_col.append(self.term_to_index[term])
                matrix_data.append(1)
        vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)),
                             shape=(1, self.vocabulary_size))
        return vectors
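A minimal usage sketch for the class above, assuming a Lucene index already exists at the (hypothetical) path below; get_query_vector returns a 1 x vocabulary_size sparse row counting each query term found in the vocabulary:

vectorizer = Vectorizer('indexes/lucene-index-sample', min_df=5, verbose=True)
query_vector = vectorizer.get_query_vector('rolling scrolling')
print(query_vector.shape, query_vector.nnz)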
Example #5
import math
from collections import Counter
from copy import deepcopy
from typing import List

import numpy as np

from pygaggle.rerank.base import Query, Reranker, Text
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.index import IndexReader


class Bm25Reranker(Reranker):
    def __init__(self,
                 k1: float = 1.6,
                 b: float = 0.75,
                 index_path: str = None):
        self.k1 = k1
        self.b = b
        self.use_corpus_estimator = False
        self.analyzer = Analyzer(get_lucene_analyzer())
        if index_path:
            self.use_corpus_estimator = True
            self.index_utils = IndexReader(index_path)

    def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
        query_words = self.analyzer.analyze(query.text)
        sentences = list(map(self.analyzer.analyze, (t.text for t in texts)))

        query_words_set = set(query_words)
        sentence_sets = list(map(set, sentences))
        if not self.use_corpus_estimator:
            idfs = {
                w: math.log(
                    len(sentence_sets) /
                    (1 + sum(int(w in sent) for sent in sentence_sets)))
                for w in query_words_set
            }
        # Average passage length, used for BM25 length normalization.
        mean_len = np.mean(list(map(len, sentences)))

        texts = deepcopy(texts)
        for sent_words, text in zip(sentences, texts):
            # Length of the current passage (not the number of passages).
            d_len = len(sent_words)
            tf = Counter(filter(query_words.__contains__, sent_words))
            if self.use_corpus_estimator:
                idfs = {
                    w: self.index_utils.compute_bm25_term_weight(
                        text.metadata['docid'], w)
                    for w in tf
                }
            score = sum(idfs[w] * tf[w] * (self.k1 + 1) /
                        (tf[w] + self.k1 * (1 - self.b + self.b *
                                            (d_len / mean_len))) for w in tf)
            if np.isnan(score):
                score = 0
            text.score = score
        return texts
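A minimal usage sketch, assuming pygaggle's Query and Text wrappers accept a raw string and a metadata dict; without an index_path, IDF is estimated from the candidate texts themselves, so the 'docid' metadata is only needed when corpus statistics are used:

reranker = Bm25Reranker(k1=0.9, b=0.4)
query = Query('rolling scrolling')
candidates = [Text('walking talking biking', metadata={'docid': 'd1'}),
              Text('rolling scrolling', metadata={'docid': 'd2'})]
for text in reranker.rerank(query, candidates):
    print(text.metadata['docid'], text.score)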
Example #6
def get_term_query(term, field="contents", analyzer=get_lucene_analyzer()):
    """Searches the collection.

    Parameters
    ----------
    term : str
        The query term string.
    field : str
        Field to search.
    analyzer : Analyzer
        Analyzer to use for tokenizing the query term.

    Returns
    -------
    JTermQuery
        Lucene term query over the analyzed form of the input term.
    """
    analyzer = Analyzer(analyzer)
    # Analyze the input and build the query from its first token.
    return JTermQuery(JTerm(field, analyzer.analyze(term)[0]))
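A minimal usage sketch, assuming a prebuilt index name ('msmarco-v1-passage' is only an example) and that LuceneSearcher.search accepts a constructed Lucene query as well as a query string:

from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage')
query = get_term_query('running')  # analyzed to its stemmed form before the TermQuery is built
hits = searcher.search(query, k=10)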
Example #7
    def test_analysis(self):
        # Default is Porter stemmer
        analyzer = Analyzer(get_lucene_analyzer())
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Porter stemmer explicitly
        analyzer = Analyzer(get_lucene_analyzer(stemmer='porter'))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Krovetz stemmer explicitly
        analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])

        # No stemming
        analyzer = Analyzer(get_lucene_analyzer(stemming=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])

        # No stopword filter, no stemming
        analyzer = Analyzer(
            get_lucene_analyzer(stemming=False, stopwords=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens,
                         ['city', 'buses', 'are', 'running', 'on', 'time'])

        # No stopword filter, with stemming
        analyzer = Analyzer(get_lucene_analyzer(stemming=True,
                                                stopwords=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'ar', 'run', 'on', 'time'])
Example #8
    total_words = 0

    with open(args.collection_path) as f_corpus, open(
            args.predictions) as f_pred:
        for i, (line_doc, line_pred) in enumerate(zip(f_corpus, f_pred)):
            # Write to a new file when the current one reaches maximum capacity.
            if i % args.max_docs_per_file == 0:
                if i > 0:
                    output_jsonl_file.close()
                output_path = os.path.join(args.output_folder,
                                           f'docs{file_index:02d}.json')
                output_jsonl_file = open(output_path, 'w')
                file_index += 1

            doc_json = json.loads(line_doc)
            pred_text = line_pred.rstrip()

            predict_text = pred_text + ' '
            analyzed = analyzer.analyze(predict_text)
            # Analyzed tokens are whitespace-joined below, so none may contain a space.
            for token in analyzed:
                assert ' ' not in token
            predict = ' '.join(analyzed)

            doc_json['predict'] = predict
            output_jsonl_file.write(json.dumps(doc_json) + '\n')

            if i % 100000 == 0:
                print('Converted {} docs in {} files'.format(i, file_index))

    output_jsonl_file.close()
    print('Done!')
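The snippet above is an excerpt and relies on setup defined earlier in the same script. A rough sketch of what that setup could look like (assumed; flag spellings and defaults are guesses based on the attributes referenced above):

import argparse
import json
import os

from pyserini.analysis import Analyzer, get_lucene_analyzer

parser = argparse.ArgumentParser()
parser.add_argument('--collection_path', required=True)
parser.add_argument('--predictions', required=True)
parser.add_argument('--output_folder', required=True)
parser.add_argument('--max_docs_per_file', type=int, default=1_000_000)  # illustrative default
args = parser.parse_args()

os.makedirs(args.output_folder, exist_ok=True)
analyzer = Analyzer(get_lucene_analyzer())
file_index = 0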
Example #9
ln = 0
for line in tqdm(inpFile):
    ln += 1
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
    if len(fields) != 2:
        print('Malformed line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query = fields

    # proc_text returns a lemmatized and an unlemmatized version of the query.
    query_lemmas, query_unlemm = nlp.proc_text(query)
    analyzed = analyzer.analyze(query)
    # Sanity check: analyzed tokens should not contain spaces.
    for token in analyzed:
        if ' ' in token:
            print(analyzed)

    query_toks = query_lemmas.split()

    # Collect named entities (surface form -> label) from the spaCy pipeline.
    doc = nlp_ent(query)
    entity = {}
    for i in range(len(doc.ents)):
        entity[doc.ents[i].text] = doc.ents[i].label_
    entity = json.dumps(entity)

    if len(query_toks) >= minQueryTokQty:
        doc = {
            "id": did,
Example #10
from pyserini.analysis import Analyzer, get_lucene_analyzer


def analyze_query(query):
    analyzer = Analyzer(get_lucene_analyzer())
    query = analyzer.analyze(query)
    return query
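For reference, with the default analyzer this mirrors the first assertion in Example #7:

analyze_query('City buses are running on time.')
# -> ['citi', 'buse', 'run', 'time']

Constructing the Analyzer once and reusing it, as the other examples do, avoids rebuilding the underlying Lucene analyzer on every call.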