def test_lucene_analyzer_en_book_examples(self):
    """Check the default Lucene analyzer against a fixed set of examples.

    Each input string is analyzed with the same analyzer instance and the
    resulting token list is compared with the expected one.
    """
    analyzer = Analyzer(get_lucene_analyzer())

    # (input text, expected tokens) pairs, checked in this order.
    examples = [
        ('walking talking balking biking hiking rolling scrolling',
         ['walk', 'talk', 'balk', 'bike', 'hike', 'roll', 'scroll']),
        ('rolling scrolling', ['roll', 'scroll']),
        ('biostatistics', ['biostatist']),
        ('adversarial', ['adversari']),
    ]
    for text, expected in examples:
        self.assertEqual(expected, analyzer.analyze(text))
def test_lucene_analyzer_fr_book_examples(self):
    """Check the Lucene analyzer built with ``name='french'``.

    Each input string is analyzed with the same analyzer instance and the
    resulting token list is compared with the expected one.
    """
    analyzer = Analyzer(get_lucene_analyzer(name='french'))

    # (input text, expected tokens) pairs, checked in this order.
    examples = [
        ('marche parler vélo randonnée rouler défilement',
         ['march', 'parl', 'vélo', 'randon', 'roul', 'defil']),
        ('défilement roulant', ['defil', 'roulant']),
        ('biostatistique', ['biostatist']),
        ('antagoniste', ['antagonist']),
    ]
    for text, expected in examples:
        self.assertEqual(expected, analyzer.analyze(text))
def test_lucene_analyzer_zh_book_examples(self):
    """Check the Lucene analyzer built with ``name='cjk'``.

    The expected outputs below are overlapping two-character tokens of the
    input text; each input is analyzed with the same analyzer instance.
    """
    analyzer = Analyzer(get_lucene_analyzer(name='cjk'))

    # (input text, expected tokens) pairs, checked in this order.
    examples = [
        ('走路说话骑自行车远足滚动滚动',
         ['走路', '路说', '说话', '话骑', '骑自', '自行', '行车', '车远',
          '远足', '足滚', '滚动', '动滚', '滚动']),
        ('滚动滚动', ['滚动', '动滚', '滚动']),
        ('生物统计学', ['生物', '物统', '统计', '计学']),
        ('对抗的', ['对抗', '抗的']),
    ]
    for text, expected in examples:
        self.assertEqual(expected, analyzer.analyze(text))
class Vectorizer:
    """Base class for vectorizer implemented on top of Pyserini.

    Parameters
    ----------
    lucene_index_path : str
        Path to lucene index folder
    min_df : int
        Document frequency threshold. Only terms whose document frequency
        is strictly greater than this value enter the vocabulary.
    verbose : bool
        Whether to print out debugging information
    """

    def __init__(self,
                 lucene_index_path: str,
                 min_df: int = 1,
                 verbose: bool = False):
        self.min_df: int = min_df
        self.verbose: bool = verbose
        self.index_reader = index.IndexReader(lucene_index_path)
        self.searcher = search.LuceneSearcher(lucene_index_path)
        self.num_docs: int = self.searcher.num_docs
        self.stats = self.index_reader.stats()
        self.analyzer = Analyzer(get_lucene_analyzer())

        # build vocabulary
        # NOTE(review): the comparison is strict, so with the default
        # min_df=1 a term that occurs in exactly one document is dropped.
        # Confirm whether `>=` was intended before changing it, since that
        # would shift every feature index.
        self.vocabulary_ = sorted({
            term.term
            for term in self.index_reader.terms()
            if term.df > self.min_df
        })

        # build term to index mapping
        self.term_to_index = {
            term: i for i, term in enumerate(self.vocabulary_)
        }
        self.vocabulary_size = len(self.vocabulary_)

        if self.verbose:
            print(
                f'Found {self.vocabulary_size} terms with min_df={self.min_df}'
            )

    def get_query_vector(self, query: str):
        """Turn a query string into a sparse bag-of-terms row vector.

        Parameters
        ----------
        query : str
            Raw query text; it is tokenized with ``self.analyzer``.

        Returns
        -------
        csr_matrix
            Matrix of shape ``(1, vocabulary_size)``. Every in-vocabulary
            token contributes an entry of 1 at its term index; tokens
            outside the vocabulary are ignored.
        """
        matrix_row, matrix_col, matrix_data = [], [], []
        for term in self.analyzer.analyze(query):
            # Look the term up in the dict rather than scanning the sorted
            # vocabulary list, which would cost O(vocabulary) per token.
            col = self.term_to_index.get(term)
            if col is None:
                continue
            matrix_row.append(0)
            matrix_col.append(col)
            matrix_data.append(1)

        vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)),
                             shape=(1, self.vocabulary_size))
        return vectors
class Bm25Reranker(Reranker):
    """Rerank candidate texts against a query with a BM25-style score.

    Parameters
    ----------
    k1 : float
        Term-frequency saturation parameter of the scoring formula.
    b : float
        Length-normalisation parameter of the scoring formula.
    index_path : str
        Optional path to an index. When given, per-term weights are taken
        from ``IndexReader.compute_bm25_term_weight`` instead of being
        estimated from the candidate texts themselves.
    """

    def __init__(self,
                 k1: float = 1.6,
                 b: float = 0.75,
                 index_path: str = None):
        self.k1 = k1
        self.b = b
        self.use_corpus_estimator = False
        self.analyzer = Analyzer(get_lucene_analyzer())
        if index_path:
            self.use_corpus_estimator = True
            self.index_utils = IndexReader(index_path)

    def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
        """Score every text against ``query`` and return scored copies.

        Parameters
        ----------
        query : Query
            Query whose ``text`` attribute is analyzed into query terms.
        texts : List[Text]
            Candidates to score. With the corpus estimator enabled each
            text must provide ``metadata['docid']``.

        Returns
        -------
        List[Text]
            Deep copies of ``texts``, in the same order, with ``score``
            set. The inputs are not modified.
        """
        # Nothing to score; this also avoids log(0) in the idf estimate
        # and the mean of an empty list.
        if not texts:
            return []

        query_words = self.analyzer.analyze(query.text)
        sentences = [self.analyzer.analyze(t.text) for t in texts]
        query_words_set = set(query_words)

        if not self.use_corpus_estimator:
            # Estimate idf from the candidates: log(N / (1 + n_w)), where
            # N is the number of texts and n_w how many of them contain w.
            sentence_sets = [set(sent) for sent in sentences]
            num_texts = len(sentence_sets)
            idfs = {
                w: math.log(
                    num_texts /
                    (1 + sum(int(w in sent) for sent in sentence_sets)))
                for w in query_words_set
            }
        mean_len = np.mean([len(sent) for sent in sentences])

        texts = deepcopy(texts)
        for sent_words, text in zip(sentences, texts):
            # Frequencies of the query terms inside this text.
            tf = Counter(w for w in sent_words if w in query_words_set)
            if self.use_corpus_estimator:
                idfs = {
                    w: self.index_utils.compute_bm25_term_weight(
                        text.metadata['docid'], w)
                    for w in tf
                }
            # Length normalisation uses the token count of the text being
            # scored, not the number of candidate texts.
            d_len = len(sent_words)
            score = sum(idfs[w] * tf[w] * (self.k1 + 1) /
                        (tf[w] + self.k1 * (1 - self.b + self.b *
                                            (d_len / mean_len)))
                        for w in tf)
            if np.isnan(score):
                score = 0
            text.score = score
        return texts
def get_term_query(term, field="contents", analyzer=get_lucene_analyzer()):
    """Build a term query on ``field`` from the first analyzed token of ``term``.

    The term is run through the analyzer and only the first resulting token
    is used; any further tokens are ignored.

    Parameters
    ----------
    term : str
        The query term string.
    field : str
        Field to search.
    analyzer
        Lucene analyzer used for tokenizing the query term; it is wrapped
        in ``Analyzer`` here. The default is created once, when this
        function is defined, and reused by every call that omits it.

    Returns
    -------
    JTermQuery

    Raises
    ------
    IndexError
        If analyzing ``term`` produces no tokens.
    """
    tokens = Analyzer(analyzer).analyze(term)
    if not tokens:
        # Same exception type as indexing an empty result, but with a
        # message that names the offending term.
        raise IndexError(
            f'term {term!r} produced no tokens after analysis')
    return JTermQuery(JTerm(field, tokens[0]))
def test_analysis(self):
    """Check tokenization for several ``get_lucene_analyzer`` settings.

    The same sentence is analyzed under each configuration and compared
    with the expected token list.
    """
    sentence = 'City buses are running on time.'

    # (keyword arguments for get_lucene_analyzer, expected tokens)
    scenarios = [
        # Default is Porter stemmer
        ({}, ['citi', 'buse', 'run', 'time']),
        # Specify Porter stemmer explicitly
        ({'stemmer': 'porter'}, ['citi', 'buse', 'run', 'time']),
        # Specify Krovetz stemmer explicitly
        ({'stemmer': 'krovetz'}, ['city', 'bus', 'running', 'time']),
        # No stemming
        ({'stemming': False}, ['city', 'buses', 'running', 'time']),
        # No stopword filter, no stemming
        ({'stemming': False, 'stopwords': False},
         ['city', 'buses', 'are', 'running', 'on', 'time']),
        # No stopword filter, with stemming
        ({'stemming': True, 'stopwords': False},
         ['citi', 'buse', 'ar', 'run', 'on', 'time']),
    ]

    for options, expected in scenarios:
        analyzer = Analyzer(get_lucene_analyzer(**options))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze(sentence)
        self.assertEqual(tokens, expected)
total_words = 0
# Shard currently being written; stays None until the first document so the
# cleanup below is safe even when the inputs are empty.
output_jsonl_file = None
try:
    with open(args.collection_path) as f_corpus, open(
            args.predictions) as f_pred:
        # zip() stops at the shorter input, so surplus lines in the longer
        # file are ignored.
        for i, (line_doc, line_pred) in enumerate(zip(f_corpus, f_pred)):
            # Write to a new file when the current one reaches maximum
            # capacity.
            if i % args.max_docs_per_file == 0:
                if output_jsonl_file is not None:
                    output_jsonl_file.close()
                output_path = os.path.join(args.output_folder,
                                           f'docs{file_index:02d}.json')
                output_jsonl_file = open(output_path, 'w')
                file_index += 1

            doc_json = json.loads(line_doc)
            pred_text = line_pred.rstrip()
            predict_text = pred_text + ' '
            analyzed = analyzer.analyze(predict_text)
            # The tokens are joined with single spaces below, so a token
            # containing a space would be ambiguous in the output.
            for token in analyzed:
                assert ' ' not in token
            predict = ' '.join(analyzed)
            doc_json['predict'] = predict
            output_jsonl_file.write(json.dumps(doc_json) + '\n')

            if i % 100000 == 0:
                print('Converted {} docs in {} files'.format(i, file_index))
finally:
    # Close the last shard even if reading, parsing or analysis failed.
    if output_jsonl_file is not None:
        output_jsonl_file.close()
print('Done!')
ln = 0 for line in tqdm(inpFile): ln += 1 line = line.strip() if not line: continue fields = line.split('\t') if len(fields) != 2: print('Misformated line %d ignoring:' % ln) print(line.replace('\t', '<field delimiter>')) continue did, query = fields query_lemmas, query_unlemm = nlp.proc_text(query) analyzed = analyzer.analyze(query) for token in analyzed: if ' ' in token: print(analyzed) query_toks = query_lemmas.split() doc = nlp_ent(query) entity = {} for i in range(len(doc.ents)): entity[doc.ents[i].text] = doc.ents[i].label_ entity = json.dumps(entity) if len(query_toks) >= minQueryTokQty: doc = { "id": did,
def analyze_query(query):
    """Analyze ``query`` with a newly created default Lucene analyzer.

    Returns whatever ``Analyzer.analyze`` produces for the query.
    """
    lucene_analyzer = get_lucene_analyzer()
    return Analyzer(lucene_analyzer).analyze(query)