def get_term_counts(self,
                    term: str,
                    analyzer: Optional[JAnalyzer] = get_lucene_analyzer()) -> Tuple[int, int]:
    """Return the document frequency and collection frequency of a term. Applies Anserini's default Lucene
    ``Analyzer`` if the analyzer is not specified. Pass ``analyzer=None`` for a term that has already been
    analyzed; in that case a no-stemming, no-stopwords analyzer is applied.

    Parameters
    ----------
    term : str
        Unanalyzed term.
    analyzer : analyzer
        Analyzer to apply.

    Returns
    -------
    Tuple[int, int]
        Document frequency and collection frequency.
    """
    if analyzer is None:
        analyzer = get_lucene_analyzer(stemming=False, stopwords=False)

    term_map = self.object.getTermCountsWithAnalyzer(self.reader, term, analyzer)

    return term_map.get('docFreq'), term_map.get('collectionFreq')
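# Usage sketch for get_term_counts() (not from the original source). The index
# path and terms are illustrative, and the IndexReader import path may differ
# across Pyserini versions.
from pyserini.index import IndexReader

reader = IndexReader('indexes/lucene-index.msmarco-passage')  # hypothetical index path
df, cf = reader.get_term_counts('retrieval')                  # analyzed with the default analyzer
df2, cf2 = reader.get_term_counts('retriev', analyzer=None)   # term is already analyzed
print(f'df={df}, cf={cf}')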
def test_different_analyzers_are_different(self):
    self.searcher.set_analyzer(analysis.get_lucene_analyzer(stemming=False))
    hits_first = self.searcher.search('information retrieval')
    self.searcher.set_analyzer(analysis.get_lucene_analyzer())
    hits_second = self.searcher.search('information retrieval')
    self.assertNotEqual(hits_first, hits_second)
def get_postings_list(self, term: str, analyzer=get_lucene_analyzer()) -> List[Posting]:
    """Return the postings list for a term.

    Parameters
    ----------
    term : str
        Raw term.
    analyzer : analyzer
        Analyzer to apply. Defaults to Anserini's default analyzer; pass ``None`` if the term has
        already been analyzed.

    Returns
    -------
    List[Posting]
        List of :class:`Posting` objects corresponding to the postings list for the term, or ``None``
        if the term does not appear in the index.
    """
    if analyzer is None:
        postings_list = self.object.getPostingsListForAnalyzedTerm(self.reader, term)
    else:
        postings_list = self.object.getPostingsListWithAnalyzer(self.reader, term, analyzer)

    if postings_list is None:
        return None

    result = []
    for posting in postings_list.toArray():
        result.append(Posting(posting.getDocid(), posting.getTF(), posting.getPositions()))

    return result
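# Usage sketch for get_postings_list() (illustrative; assumes the `reader` object
# from the sketch above). Posting exposes docid, tf, and positions.
postings = reader.get_postings_list('retrieval')
if postings is not None:
    for posting in postings:
        print(posting.docid, posting.tf, posting.positions)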
def compute_bm25_term_weight(self, docid: str, term: str, analyzer=get_lucene_analyzer(),
                             k1=0.9, b=0.4) -> float:
    """Compute the BM25 weight of a term in a document. Specify ``analyzer=None`` for an already
    analyzed term, e.g., from the output of :func:`get_document_vector`.

    Parameters
    ----------
    docid : str
        Collection ``docid``.
    term : str
        Term.
    analyzer : analyzer
        Lucene analyzer to use, ``None`` if term is already analyzed.
    k1 : float
        BM25 k1 parameter.
    b : float
        BM25 b parameter.

    Returns
    -------
    float
        BM25 weight of the term in the document, or 0 if the term does not exist in the document.
    """
    if analyzer is None:
        return self.object.getBM25AnalyzedTermWeightWithParameters(self.reader, docid, term,
                                                                   float(k1), float(b))
    else:
        return self.object.getBM25UnanalyzedTermWeightWithParameters(self.reader, docid, term, analyzer,
                                                                     float(k1), float(b))
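# Usage sketch for compute_bm25_term_weight() (illustrative docid; assumes the
# `reader` object from the sketches above).
weight = reader.compute_bm25_term_weight('7187158', 'retrieval', k1=0.9, b=0.4)
# For a term taken from get_document_vector(), which is already analyzed:
weight_analyzed = reader.compute_bm25_term_weight('7187158', 'retriev', analyzer=None)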
def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = False):
    self.min_df: int = min_df
    self.verbose: bool = verbose
    self.index_reader = index.IndexReader(lucene_index_path)
    self.searcher = search.LuceneSearcher(lucene_index_path)
    self.num_docs: int = self.searcher.num_docs
    self.stats = self.index_reader.stats()
    self.analyzer = Analyzer(get_lucene_analyzer())

    # build vocabulary
    self.vocabulary_ = set()
    for term in self.index_reader.terms():
        if term.df > self.min_df:
            self.vocabulary_.add(term.term)
    self.vocabulary_ = sorted(self.vocabulary_)

    # build term to index mapping
    self.term_to_index = {}
    for i, term in enumerate(self.vocabulary_):
        self.term_to_index[term] = i
    self.vocabulary_size = len(self.vocabulary_)

    if self.verbose:
        print(f'Found {self.vocabulary_size} terms with min_df={self.min_df}')
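# Sketch (not from the original source): one way the term_to_index mapping built
# above might be used to turn a single document into a sparse count vector.
# get_document_vector() is part of the IndexReader API; the docid is illustrative.
from scipy.sparse import csr_matrix

def doc_to_sparse_vector(vectorizer, docid: str) -> csr_matrix:
    term_freqs = vectorizer.index_reader.get_document_vector(docid)  # term -> frequency
    cols, vals = [], []
    for term, freq in term_freqs.items():
        idx = vectorizer.term_to_index.get(term)
        if idx is not None:  # terms below min_df are not in the vocabulary
            cols.append(idx)
            vals.append(freq)
    rows = [0] * len(cols)
    return csr_matrix((vals, (rows, cols)), shape=(1, vectorizer.vocabulary_size))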
def get_tfidf_terms_text(text, index_utils, n, total_N=595031):
    """Extract tf-idf terms from a text, with WaPo as the background corpus."""
    # Retrieve already analyzed terms in dict: tf.
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
    analyzed_text = analyzer.analyze(text)
    unique, counts = np.unique(analyzed_text, return_counts=True)
    tf = dict(zip(unique, counts))

    # Filter terms: should not contain numbers and must have len >= 2.
    w_pattern = re.compile("[a-z]+")
    filtered_tf = {term: freq for term, freq in tf.items()
                   if len(w_pattern.findall(term)) == 1
                   and len(term.replace('.', '')) >= 2
                   and re.search("[a-z]+", term)[0] == term}

    # Document frequencies from the index (terms are already analyzed).
    df = {term: index_utils.get_term_counts(term, analyzer=None)[0]
          for term in filtered_tf.keys()}

    # Calculate tf-idf for each term and store in a dict.
    terms_tfidf = {term: tfidf(tf[term], df[term], total_N)
                   for term in filtered_tf.keys()
                   if tfidf(tf[term], df[term], total_N) >= 3.5}

    # Sort terms by tf-idf score and keep the term frequencies of the top n.
    tfidf_terms_sorted = {term: tf[term]
                          for term, score in sorted(terms_tfidf.items(),
                                                    key=itemgetter(1),
                                                    reverse=True)[:n]}
    return tfidf_terms_sorted
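# The tfidf() helper used above is not shown; this is a minimal sketch of a
# plausible implementation (raw tf times log idf). The exact weighting used by
# the original script is an assumption.
import math

def tfidf(tf, df, total_N):
    if df == 0:
        return 0.0
    return tf * math.log(total_N / df)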
def testTermQuery2(self):
    term_query1 = querybuilder.get_term_query('inform', analyzer=get_lucene_analyzer(stemming=False))
    term_query2 = querybuilder.get_term_query('retriev', analyzer=get_lucene_analyzer(stemming=False))

    should = querybuilder.JBooleanClauseOccur['should'].value

    boolean_query1 = querybuilder.get_boolean_query_builder()
    boolean_query1.add(term_query1, should)
    boolean_query1.add(term_query2, should)

    bq1 = boolean_query1.build()
    hits1 = self.searcher.search(bq1)
    hits2 = self.searcher.search('information retrieval')

    for h1, h2 in zip(hits1, hits2):
        self.assertEqual(h1.docid, h2.docid)
        self.assertEqual(h1.score, h2.score)
def batch_process(batch):
    if os.getcwd().endswith('ltr_msmarco'):
        stopwords = read_stopwords('stopwords.txt', lower_case=True)
    else:
        stopwords = read_stopwords('./scripts/ltr_msmarco/stopwords.txt', lower_case=True)
    nlp = SpacyTextParser('en_core_web_sm', stopwords, keep_only_alpha_num=True, lower_case=True)
    analyzer = Analyzer(get_lucene_analyzer())
    #nlp_ent = spacy.load("en_core_web_sm")
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def process(line):
        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = nlp.proc_text(body)

        #doc = nlp_ent(body)
        #entity = {}
        #for i in range(len(doc.ents)):
        #    entity[doc.ents[i].text] = doc.ents[i].label_
        #entity = json.dumps(entity)

        analyzed = analyzer.analyze(body)
        for token in analyzed:
            assert ' ' not in token
        contents = ' '.join(analyzed)

        doc = {
            "id": pid,
            "text": text,
            "text_unlemm": text_unlemm,
            'contents': contents,
            "raw": body
        }
        doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower())
        return doc

    res = []
    start = time.time()
    for line in batch:
        res.append(process(line))
        if len(res) % 1000 == 0:
            end = time.time()
            print(f'finish {len(res)} using {end-start}')
            start = end
    return res
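# Sketch (not from the original source): one way batch_process() might be driven
# with a process pool. Batch size, worker count, and the surrounding I/O are
# illustrative assumptions.
from multiprocessing import Pool

def run_batches(lines, workers=8, batch_size=10000):
    batches = [lines[i:i + batch_size] for i in range(0, len(lines), batch_size)]
    docs = []
    with Pool(workers) as pool:
        for processed in pool.imap(batch_process, batches):
            docs.extend(doc for doc in processed if doc is not None)
    return docs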
def test_analyze_with_analyzer(self):
    analyzer = analysis.get_lucene_analyzer(stemming=False)
    self.assertTrue(isinstance(analyzer, JAnalyzer))
    query = JString('information retrieval')
    only_tokenization = JAnalyzerUtils.analyze(analyzer, query)
    token_list = []
    for token in only_tokenization.toArray():
        token_list.append(token)
    self.assertEqual(token_list, ['information', 'retrieval'])
def test_analyze(self):
    self.assertEqual(' '.join(self.index_reader.analyze('retrieval')), 'retriev')
    self.assertEqual(' '.join(self.index_reader.analyze('rapid retrieval, space economy')),
                     'rapid retriev space economi')
    tokenizer = analysis.get_lucene_analyzer(stemming=False)
    self.assertEqual(' '.join(self.index_reader.analyze('retrieval', analyzer=tokenizer)), 'retrieval')
    self.assertEqual(' '.join(self.index_reader.analyze('rapid retrieval, space economy', analyzer=tokenizer)),
                     'rapid retrieval space economy')
    # Test utf encoding:
    self.assertEqual(self.index_reader.analyze('zoölogy')[0], 'zoölog')
    self.assertEqual(self.index_reader.analyze('zoölogy', analyzer=tokenizer)[0], 'zoölogy')
def __init__(self, k1: float = 1.6, b: float = 0.75, index_path: str = None):
    self.k1 = k1
    self.b = b
    self.use_corpus_estimator = False
    self.analyzer = Analyzer(get_lucene_analyzer())
    if index_path:
        self.use_corpus_estimator = True
        self.index_utils = IndexReader(index_path)
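# Sketch (not from the original source): how this reranker might score a query
# against a candidate document when the corpus estimator is enabled, using the
# compute_bm25_term_weight() API shown earlier. The docid argument and the
# summation over query tokens are assumptions, not the library's actual method.
def score_with_index(self, query: str, docid: str) -> float:
    query_tokens = self.analyzer.analyze(query)
    return sum(
        self.index_utils.compute_bm25_term_weight(docid, tok, analyzer=None,
                                                  k1=self.k1, b=self.b)
        for tok in query_tokens)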
def batch_process(batch):
    # Assume the script is called from the root dir.
    stopwords = read_stopwords('./scripts/ltr_msmarco/stopwords.txt', lower_case=True)
    nlp = SpacyTextParser('en_core_web_sm', stopwords, keep_only_alpha_num=True, lower_case=True)
    analyzer = Analyzer(get_lucene_analyzer())
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def process(line):
        if not line:
            return None
        json_line = json.loads(line)
        pid = json_line['id']
        body = json_line['contents']
        #url = json_line['url']
        #title = json_line['title']

        text, text_unlemm = nlp.proc_text(body)
        #_, title_unlemm = nlp.proc_text(title)

        analyzed = analyzer.analyze(body)
        for token in analyzed:
            assert ' ' not in token
        contents = ' '.join(analyzed)

        doc = {"id": pid,
               "text": text,
               "text_unlemm": text_unlemm,
               'contents': contents,
               #"title_unlemm": title_unlemm,
               #"url": url,
               "raw": body}
        if len(body) > 512:
            doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower()[:512])
        else:
            doc["text_bert_tok"] = get_retokenized(bert_tokenizer, body.lower())
        return doc

    res = []
    start = time.time()
    for line in batch:
        res.append(process(line))
        if len(res) % 10000 == 0:
            end = time.time()
            print(f'finish {len(res)} using {end-start}')
            start = end
    return res
def test_lucene_analyzer_en_book_examples(self):
    analyzer = Analyzer(get_lucene_analyzer())

    tokens = analyzer.analyze('walking talking balking biking hiking rolling scrolling')
    self.assertEqual(['walk', 'talk', 'balk', 'bike', 'hike', 'roll', 'scroll'], tokens)

    tokens = analyzer.analyze('rolling scrolling')
    self.assertEqual(['roll', 'scroll'], tokens)

    tokens = analyzer.analyze('biostatistics')
    self.assertEqual(['biostatist'], tokens)

    tokens = analyzer.analyze('adversarial')
    self.assertEqual(['adversari'], tokens)
def test_lucene_analyzer_fr_book_examples(self):
    analyzer = Analyzer(get_lucene_analyzer(name='french'))

    tokens = analyzer.analyze('marche parler vélo randonnée rouler défilement')
    self.assertEqual(['march', 'parl', 'vélo', 'randon', 'roul', 'defil'], tokens)

    tokens = analyzer.analyze('défilement roulant')
    self.assertEqual(['defil', 'roulant'], tokens)

    tokens = analyzer.analyze('biostatistique')
    self.assertEqual(['biostatist'], tokens)

    tokens = analyzer.analyze('antagoniste')
    self.assertEqual(['antagonist'], tokens)
def test_lucene_analyzer_zh_book_examples(self):
    analyzer = Analyzer(get_lucene_analyzer(name='cjk'))

    tokens = analyzer.analyze('走路说话骑自行车远足滚动滚动')
    self.assertEqual(['走路', '路说', '说话', '话骑', '骑自', '自行', '行车', '车远',
                      '远足', '足滚', '滚动', '动滚', '滚动'], tokens)

    tokens = analyzer.analyze('滚动滚动')
    self.assertEqual(['滚动', '动滚', '滚动'], tokens)

    tokens = analyzer.analyze('生物统计学')
    self.assertEqual(['生物', '物统', '统计', '计学'], tokens)

    tokens = analyzer.analyze('对抗的')
    self.assertEqual(['对抗', '抗的'], tokens)
def get_term_query(term, field="contents", analyzer=get_lucene_analyzer()):
    """Build a Lucene term query for a single query term.

    Parameters
    ----------
    term : str
        The query term string.
    field : str
        Field to search.
    analyzer : Analyzer
        Analyzer to use for tokenizing the query term.

    Returns
    -------
    JTermQuery
    """
    analyzer = Analyzer(analyzer)
    return JTermQuery(JTerm(field, analyzer.analyze(term)[0]))
def test_analysis(self):
    # Default is Porter stemmer
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

    # Specify Porter stemmer explicitly
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemmer='porter'))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

    # Specify Krovetz stemmer explicitly
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemmer='krovetz'))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])

    # No stemming
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=False))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])

    # No stopword filter, no stemming
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=False, stopwords=False))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'buses', 'are', 'running', 'on', 'time'])

    # No stopword filter, with stemming
    analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=True, stopwords=False))
    self.assertTrue(isinstance(analyzer, Analyzer))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'ar', 'run', 'on', 'time'])
def test_invalid_analysis(self):
    # Invalid configuration, make sure we get an exception.
    with self.assertRaises(ValueError):
        analysis.Analyzer(analysis.get_lucene_analyzer('blah'))
def analyze_query(query):
    analyzer = Analyzer(get_lucene_analyzer())
    query = analyzer.analyze(query)
    return query
parser.add_argument('--collection_path', required=True,
                    help='MS MARCO .tsv collection file')
parser.add_argument('--predictions', required=True,
                    help='File containing predicted queries.')
parser.add_argument('--output_folder', required=True, help='output folder')
parser.add_argument('--max_docs_per_file', default=1000000, type=int,
                    help='maximum number of documents in each jsonl file.')

args = parser.parse_args()

if not os.path.exists(args.output_folder):
    os.makedirs(args.output_folder)

analyzer = Analyzer(get_lucene_analyzer())

print('Converting collection...')

file_index = 0
new_words = 0
total_words = 0
with open(args.collection_path) as f_corpus, open(args.predictions) as f_pred:
    for i, (line_doc, line_pred) in enumerate(zip(f_corpus, f_pred)):
        # Write to a new file when the current one reaches maximum capacity.
        if i % args.max_docs_per_file == 0:
            if i > 0:
                output_jsonl_file.close()
            output_path = os.path.join(args.output_folder, f'docs{file_index:02d}.json')