Example #1
    def get_term_counts(
        self, term: str, analyzer: Optional[JAnalyzer] = get_lucene_analyzer()
    ) -> Tuple[int, int]:
        """Return the document frequency and collection frequency of a term. Applies Anserini's default Lucene
        ``Analyzer`` if analyzer is not specified.

        Parameters
        ----------
        term : str
            Unanalyzed term.
        analyzer : JAnalyzer, optional
            Analyzer to apply; pass ``None`` to skip stemming and stopword removal (e.g., for an already analyzed term).

        Returns
        -------
        Tuple[int, int]
            Document frequency and collection frequency.
        """
        if analyzer is None:
            analyzer = get_lucene_analyzer(stemming=False, stopwords=False)

        term_map = self.object.getTermCountsWithAnalyzer(
            self.reader, term, analyzer)

        return term_map.get('docFreq'), term_map.get('collectionFreq')
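A minimal usage sketch for get_term_counts, assuming an IndexReader opened over a local Lucene index (the index path is hypothetical; import style follows Example #5 and may differ across Pyserini versions):

from pyserini import index

index_reader = index.IndexReader('indexes/msmarco-passage')  # hypothetical index path

# Raw term: the default analyzer stems 'cities' to 'citi' before the lookup.
df, cf = index_reader.get_term_counts('cities')
print(f'document frequency: {df}, collection frequency: {cf}')

# Already analyzed term: analyzer=None skips stemming and stopword removal.
df, cf = index_reader.get_term_counts('citi', analyzer=None)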
Example #2
    def test_different_analyzers_are_different(self):
        self.searcher.set_analyzer(
            analysis.get_lucene_analyzer(stemming=False))
        hits_first = self.searcher.search('information retrieval')
        self.searcher.set_analyzer(analysis.get_lucene_analyzer())
        hits_second = self.searcher.search('information retrieval')
        self.assertNotEqual(hits_first, hits_second)
Example #3
    def get_postings_list(
        self, term: str, analyzer=get_lucene_analyzer()) -> List[Posting]:
        """Return the postings list for a term.

        Parameters
        ----------
        term : str
            Raw term.
        analyzer : JAnalyzer, optional
            Analyzer to apply. Defaults to Anserini's default; pass ``None`` to look up an already analyzed term.

        Returns
        -------
        List[Posting]
            List of :class:`Posting` objects corresponding to the postings list for the term, or ``None`` if the
            term does not exist in the index.
        """
        if analyzer is None:
            postings_list = self.object.getPostingsListForAnalyzedTerm(
                self.reader, term)
        else:
            postings_list = self.object.getPostingsListWithAnalyzer(
                self.reader, term, analyzer)

        if postings_list is None:
            return None

        result = []
        for posting in postings_list.toArray():
            result.append(
                Posting(posting.getDocid(), posting.getTF(),
                        posting.getPositions()))
        return result
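A usage sketch for get_postings_list, again over a hypothetical local index; the attribute names on Posting (docid, tf, positions) are assumed from the constructor call above:

from pyserini import index

index_reader = index.IndexReader('indexes/msmarco-passage')  # hypothetical index path

postings = index_reader.get_postings_list('retrieval')  # analyzed with the default analyzer
if postings is not None:
    for posting in postings:
        print(f'docid={posting.docid}, tf={posting.tf}, positions={posting.positions}')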
Example #4
    def compute_bm25_term_weight(self,
                                 docid: str,
                                 term: str,
                                 analyzer=get_lucene_analyzer(),
                                 k1=0.9,
                                 b=0.4) -> float:
        """Compute the BM25 weight of a term in a document. Specify ``analyzer=None`` for an already analyzed term,
        e.g., from the output of :func:`get_document_vector`.

        Parameters
        ----------
        docid : str
            Collection ``docid``.
        term : str
            Term.
        analyzer : JAnalyzer, optional
            Lucene analyzer to use, ``None`` if term is already analyzed.
        k1 : float
            BM25 k1 parameter.
        b : float
            BM25 b parameter.

        Returns
        -------
        float
            BM25 weight of the term in the document, or 0 if the term does not exist in the document.
        """
        if analyzer is None:
            return self.object.getBM25AnalyzedTermWeightWithParameters(
                self.reader, docid, term, float(k1), float(b))
        else:
            return self.object.getBM25UnanalyzedTermWeightWithParameters(
                self.reader, docid, term, analyzer, float(k1), float(b))
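A usage sketch for compute_bm25_term_weight, with a hypothetical index path and docid; the second call scores a term that is already analyzed (e.g., taken from get_document_vector), so the analyzer is disabled:

from pyserini import index

index_reader = index.IndexReader('indexes/msmarco-passage')  # hypothetical index path
docid = '7187158'  # hypothetical docid

# Raw term: the default analyzer is applied before scoring.
weight = index_reader.compute_bm25_term_weight(docid, 'cities', k1=0.9, b=0.4)

# Already analyzed term: pass analyzer=None.
weight = index_reader.compute_bm25_term_weight(docid, 'citi', analyzer=None)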
Example #5
    def __init__(self,
                 lucene_index_path: str,
                 min_df: int = 1,
                 verbose: bool = False):
        self.min_df: int = min_df
        self.verbose: bool = verbose
        self.index_reader = index.IndexReader(lucene_index_path)
        self.searcher = search.LuceneSearcher(lucene_index_path)
        self.num_docs: int = self.searcher.num_docs
        self.stats = self.index_reader.stats()
        self.analyzer = Analyzer(get_lucene_analyzer())

        # build vocabulary
        self.vocabulary_ = set()
        for term in self.index_reader.terms():
            if term.df > self.min_df:
                self.vocabulary_.add(term.term)
        self.vocabulary_ = sorted(self.vocabulary_)

        # build term to index mapping
        self.term_to_index = {}
        for i, term in enumerate(self.vocabulary_):
            self.term_to_index[term] = i
        self.vocabulary_size = len(self.vocabulary_)

        if self.verbose:
            print(
                f'Found {self.vocabulary_size} terms with min_df={self.min_df}'
            )
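The vocabulary and term_to_index mapping built above would typically back a vectorization step. A sketch of a method such a class might expose (the method name is hypothetical; get_document_vector is the IndexReader call referenced in Example #4):

    def term_frequency_vector(self, docid: str):
        """Hypothetical helper: map a document's analyzed term frequencies onto vocabulary indices."""
        vector = [0] * self.vocabulary_size
        # get_document_vector returns {analyzed term: term frequency} for the document.
        for term, freq in self.index_reader.get_document_vector(docid).items():
            if term in self.term_to_index:
                vector[self.term_to_index[term]] = freq
        return vector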
Example #6
def get_tfidf_terms_text(text, index_utils, n, total_N=595031):
    """Extract tf idfterms from a text with wapo as background corpus."""
    # retrieve already analyzed terms in dict: tf
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
    analyzed_text = analyzer.analyze(text)
    unique, counts = np.unique(analyzed_text, return_counts=True)
    tf = dict(zip(unique, counts))

    # Filter terms: should not contain numbers and len >= 2.
    w_pattern = re.compile("[a-z]+")
    filtered_tf = {term: count for term, count in tf.items()
                   if len(w_pattern.findall(term)) == 1 and
                   len(term.replace('.', '')) >= 2 and
                   re.search("[a-z]+", term)[0] == term}

    # Document frequency from the index (terms are already analyzed, so analyzer=None).
    df = {term: index_utils.get_term_counts(term, analyzer=None)[0]
          for term in filtered_tf.keys()}

    # Calculate tf-idf for each term and keep only those above the threshold.
    terms_tfidf = {term: tfidf(tf[term], df[term], total_N) for term in filtered_tf.keys()
                   if tfidf(tf[term], df[term], total_N) >= 3.5}

    # Sort terms by tf-idf score and keep the top n, mapping each term to its tf.
    tfidf_terms_sorted = {term: tf[term] for term, score in sorted(
        terms_tfidf.items(), key=itemgetter(1), reverse=True)[:n]}

    return tfidf_terms_sorted
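The tfidf helper called above is not shown in this excerpt. A plausible stand-in consistent with its call signature (term frequency, document frequency, total number of documents) is a standard log-idf weighting; this is an assumption, not the original implementation:

import math

def tfidf(tf, df, total_n):
    """Assumed implementation: raw term frequency times inverse document frequency."""
    if df == 0:
        return 0.0
    return tf * math.log(total_n / df)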
Example #7
    def testTermQuery2(self):
        term_query1 = querybuilder.get_term_query('inform', analyzer=get_lucene_analyzer(stemming=False))
        term_query2 = querybuilder.get_term_query('retriev', analyzer=get_lucene_analyzer(stemming=False))

        should = querybuilder.JBooleanClauseOccur['should'].value

        boolean_query1 = querybuilder.get_boolean_query_builder()
        boolean_query1.add(term_query1, should)
        boolean_query1.add(term_query2, should)

        bq1 = boolean_query1.build()
        hits1 = self.searcher.search(bq1)
        hits2 = self.searcher.search('information retrieval')

        for h1, h2 in zip(hits1, hits2):
            self.assertEqual(h1.docid, h2.docid)
            self.assertEqual(h1.score, h2.score)
Example #8
def batch_process(batch):
    if os.getcwd().endswith('ltr_msmarco'):
        stopwords = read_stopwords('stopwords.txt', lower_case=True)
    else:
        stopwords = read_stopwords('./scripts/ltr_msmarco/stopwords.txt',
                                   lower_case=True)
    nlp = SpacyTextParser('en_core_web_sm',
                          stopwords,
                          keep_only_alpha_num=True,
                          lower_case=True)
    analyzer = Analyzer(get_lucene_analyzer())
    #nlp_ent = spacy.load("en_core_web_sm")
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def process(line):
        if not line:
            return None

        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = nlp.proc_text(body)

        #doc = nlp_ent(body)
        #entity = {}
        #for i in range(len(doc.ents)):
        #entity[doc.ents[i].text] = doc.ents[i].label_
        #entity = json.dumps(entity)

        analyzed = analyzer.analyze(body)
        for token in analyzed:
            assert ' ' not in token
        contents = ' '.join(analyzed)

        doc = {
            "id": pid,
            "text": text,
            "text_unlemm": text_unlemm,
            'contents': contents,
            "raw": body
        }
        doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower())
        return doc

    res = []
    start = time.time()
    for line in batch:
        res.append(process(line))
        if len(res) % 1000 == 0:
            end = time.time()
            print(f'finish {len(res)} using {end-start}')
            start = end
    return res
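batch_process operates on a chunk of input lines. A hypothetical driver (the file name, batch size, and pool size are illustrative, not part of the original script, and the module-level globals used by batch_process, e.g. maxDocSize, are assumed to be defined as in the original) could split the collection into batches and fan them out with multiprocessing:

import multiprocessing

def chunked(lines, size):
    """Yield successive fixed-size batches of lines, with trailing newlines stripped."""
    batch = []
    for line in lines:
        batch.append(line.rstrip('\n'))
        if len(batch) == size:
            yield batch
            batch = []
    if batch:
        yield batch

if __name__ == '__main__':
    with open('collection.tsv') as f, multiprocessing.Pool(4) as pool:
        for processed in pool.imap(batch_process, chunked(f, 10000)):
            for doc in processed:
                if doc is not None:
                    print(doc['id'], len(doc['contents']))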
Example #9
    def test_analyze_with_analyzer(self):
        analyzer = analysis.get_lucene_analyzer(stemming=False)
        self.assertTrue(isinstance(analyzer, JAnalyzer))
        query = JString('information retrieval')
        only_tokenization = JAnalyzerUtils.analyze(analyzer, query)
        token_list = []
        for token in only_tokenization.toArray():
            token_list.append(token)
        self.assertEqual(token_list, ['information', 'retrieval'])
Example #10
    def test_analyze(self):
        self.assertEqual(' '.join(self.index_reader.analyze('retrieval')), 'retriev')
        self.assertEqual(' '.join(self.index_reader.analyze('rapid retrieval, space economy')),
                         'rapid retriev space economi')
        tokenizer = analysis.get_lucene_analyzer(stemming=False)
        self.assertEqual(' '.join(self.index_reader.analyze('retrieval', analyzer=tokenizer)), 'retrieval')
        self.assertEqual(' '.join(self.index_reader.analyze('rapid retrieval, space economy', analyzer=tokenizer)),
                         'rapid retrieval space economy')
        # Test utf encoding:
        self.assertEqual(self.index_reader.analyze('zoölogy')[0], 'zoölog')
        self.assertEqual(self.index_reader.analyze('zoölogy', analyzer=tokenizer)[0], 'zoölogy')
Example #11
    def __init__(self,
                 k1: float = 1.6,
                 b: float = 0.75,
                 index_path: str = None):
        self.k1 = k1
        self.b = b
        self.use_corpus_estimator = False
        self.analyzer = Analyzer(get_lucene_analyzer())
        if index_path:
            self.use_corpus_estimator = True
            self.index_utils = IndexReader(index_path)
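This __init__ appears to configure a BM25-style scorer parameterized by k1 and b. As a reference for what those parameters control, a toy standalone BM25 term weight (not the class's actual scoring code) looks like:

import math

def bm25_term_weight(tf, df, num_docs, doc_len, avg_doc_len, k1=1.6, b=0.75):
    """Toy BM25 weight of a single term in a single document."""
    idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))
    return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))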
Example #12
def batch_process(batch):
    # Assumes the script is invoked from the repository root.
    stopwords = read_stopwords('./scripts/ltr_msmarco/stopwords.txt', lower_case=True)
    nlp = SpacyTextParser('en_core_web_sm', stopwords, keep_only_alpha_num=True, lower_case=True)
    analyzer = Analyzer(get_lucene_analyzer())
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def process(line):
        if not line:
            return None
        json_line = json.loads(line)
        pid = json_line['id']
        body = json_line['contents']
        #url = json_line['url']
        #title = json_line['title']

        text, text_unlemm = nlp.proc_text(body)

        #_,title_unlemm = nlp.proc_text(title)

        analyzed = analyzer.analyze(body)
        for token in analyzed:
            assert ' ' not in token
        contents = ' '.join(analyzed)

        doc = {"id": pid,
               "text": text,
               "text_unlemm": text_unlemm,
               'contents': contents,
               #"title_unlemm": title_unlemm,
               #"url": url,
               "raw": body}
        
        if len(body) > 512:
            doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower()[:512])
        else:
            doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower())
        return doc
    
    res = []
    start = time.time()
    for line in batch:
        res.append(process(line))
        if len(res) % 10000 == 0:
            end = time.time()
            print(f'finish {len(res)} using {end-start}')
            start = end
    return res
Example #13
    def test_lucene_analyzer_en_book_examples(self):
        analyzer = Analyzer(get_lucene_analyzer())

        tokens = analyzer.analyze(
            'walking talking balking biking hiking rolling scrolling')
        self.assertEqual(
            ['walk', 'talk', 'balk', 'bike', 'hike', 'roll', 'scroll'], tokens)

        tokens = analyzer.analyze('rolling scrolling')
        self.assertEqual(['roll', 'scroll'], tokens)

        tokens = analyzer.analyze('biostatistics')
        self.assertEqual(['biostatist'], tokens)

        tokens = analyzer.analyze('adversarial')
        self.assertEqual(['adversari'], tokens)
Example #14
    def test_lucene_analyzer_fr_book_examples(self):
        analyzer = Analyzer(get_lucene_analyzer(name='french'))

        tokens = analyzer.analyze(
            'marche parler vélo randonnée rouler défilement')
        self.assertEqual(['march', 'parl', 'vélo', 'randon', 'roul', 'defil'],
                         tokens)

        tokens = analyzer.analyze('défilement roulant')
        self.assertEqual(['defil', 'roulant'], tokens)

        tokens = analyzer.analyze('biostatistique')
        self.assertEqual(['biostatist'], tokens)

        tokens = analyzer.analyze('antagoniste')
        self.assertEqual(['antagonist'], tokens)
Example #15
    def test_lucene_analyzer_zh_book_examples(self):
        analyzer = Analyzer(get_lucene_analyzer(name='cjk'))

        tokens = analyzer.analyze('走路说话骑自行车远足滚动滚动')
        self.assertEqual([
            '走路', '路说', '说话', '话骑', '骑自', '自行', '行车', '车远', '远足', '足滚', '滚动',
            '动滚', '滚动'
        ], tokens)

        tokens = analyzer.analyze('滚动滚动')
        self.assertEqual(['滚动', '动滚', '滚动'], tokens)

        tokens = analyzer.analyze('生物统计学')
        self.assertEqual(['生物', '物统', '统计', '计学'], tokens)

        tokens = analyzer.analyze('对抗的')
        self.assertEqual(['对抗', '抗的'], tokens)
Example #16
def get_term_query(term, field="contents", analyzer=get_lucene_analyzer()):
    """Searches the collection.

    Parameters
    ----------
    term : str
        The query term string.
    field : str
        Field to search.
    analyzer : JAnalyzer
        Lucene analyzer used to tokenize the query term before it is wrapped in a ``JTermQuery``.

    Returns
    -------
    JTermQuery
    """
    analyzer = Analyzer(analyzer)
    return JTermQuery(JTerm(field, analyzer.analyze(term)[0]))
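A usage sketch combining get_term_query with a searcher; the index path is hypothetical, and the import locations follow the patterns in Examples #5 and #7 (they may vary across Pyserini versions):

from pyserini import search
from pyserini.search import querybuilder

searcher = search.LuceneSearcher('indexes/msmarco-passage')  # hypothetical index path
query = querybuilder.get_term_query('retrieval')  # analyzed to 'retriev' by the default analyzer
hits = searcher.search(query)  # prebuilt query objects are searchable, as in Example #7
for hit in hits[:3]:
    print(hit.docid, hit.score)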
Example #17
    def test_analysis(self):
        # Default is Porter stemmer
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Porter stemmer explicitly
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemmer='porter'))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Krovetz stemmer explicitly
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemmer='krovetz'))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])

        # No stemming
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemming=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])

        # No stopword filter, no stemming
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemming=False, stopwords=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens,
                         ['city', 'buses', 'are', 'running', 'on', 'time'])

        # No stopword filter, with stemming
        analyzer = analysis.Analyzer(
            analysis.get_lucene_analyzer(stemming=True, stopwords=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'ar', 'run', 'on', 'time'])
Example #18
    def test_invalid_analysis(self):
        # Invalid configuration, make sure we get an exception.
        with self.assertRaises(ValueError):
            analysis.Analyzer(analysis.get_lucene_analyzer('blah'))
Example #19
def analyze_query(query):
    analyzer = Analyzer(get_lucene_analyzer())
    query = analyzer.analyze(query)
    return query


if __name__ == '__main__':
    # Module-level driver from the same script. The argument-parser construction
    # below is assumed; only the add_argument calls appear in this excerpt.
    parser = argparse.ArgumentParser()
    parser.add_argument('--collection_path',
                        required=True,
                        help='MS MARCO .tsv collection file')
    parser.add_argument('--predictions',
                        required=True,
                        help='File containing predicted queries.')
    parser.add_argument('--output_folder', required=True, help='output folder')
    parser.add_argument('--max_docs_per_file',
                        default=1000000,
                        type=int,
                        help='maximum number of documents in each jsonl file.')

    args = parser.parse_args()
    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)
    analyzer = Analyzer(get_lucene_analyzer())
    print('Converting collection...')

    file_index = 0
    new_words = 0
    total_words = 0

    with open(args.collection_path) as f_corpus, open(
            args.predictions) as f_pred:
        for i, (line_doc, line_pred) in enumerate(zip(f_corpus, f_pred)):
            # Write to a new file when the current one reaches maximum capacity.
            if i % args.max_docs_per_file == 0:
                if i > 0:
                    output_jsonl_file.close()
                output_path = os.path.join(args.output_folder,
                                           f'docs{file_index:02d}.json')