def main():
    base_path = os.path.join(os.path.dirname(__file__), 'data')
    paths = {
        'qrels_path': os.path.join(base_path, 'msmarco-doctrain-qrels.tsv'),
        'top100_path': os.path.join(base_path, 'msmarco-doctrain-top100'),
        'queries_path': os.path.join(base_path, 'msmarco-doctrain-queries.tsv'),
    }

    index_reader = IndexReader(sys.argv[1])
    with open(sys.argv[2], 'w') as output_file:
        for (query_id, query, doc_id, is_positive) in generate_examples(**paths):
            query_terms = index_reader.analyze(query)
            feature_vector = [
                np.sum(compute_tf(query_terms, index_reader, doc_id)),
                np.sum(compute_idf(query_terms, index_reader)),
                np.sum(compute_tf_idf(query_terms, index_reader, doc_id)),
                compute_document_length(index_reader, doc_id),
                np.sum(compute_bm25(query_terms, index_reader, doc_id)),
            ]
            line = [
                '1' if is_positive else '0',
                f'qid:{query_id}',
            ]
            for i, feature in enumerate(feature_vector):
                line.append(f'{i}:{feature}')
            output_file.write(' '.join(line) + '\n')
def main(queries_file, qrels_file, output_file, write_negative):
    queries = read_topics(queries_file)
    index_reader = IndexReader('indexes/msmarco-passage')
    document_count = int(index_reader.stats()['documents'])
    qrels = open(qrels_file, 'r')

    with open(output_file, 'w') as output_file_handle:
        for line in qrels:
            line = line.strip().split('\t')
            qid = int(line[0])
            docid = line[2]
            target = line[3]
            query = queries[qid]['title']

            features = compute_features(index_reader, query, docid)
            output_file_handle.write(format_qrel_line(target, qid, features, docid))

            # The evaluation set doesn't need negative examples.
            if write_negative:
                negative_docid = str(get_negative_docid(document_count, docid))
                features = compute_features(index_reader, query, negative_docid)
                output_file_handle.write(format_qrel_line(0, qid, features, negative_docid))
def __init__(self, model: str, ibm_model: str, index: str, data: str):
    self.model = model
    self.ibm_model = ibm_model
    self.fe = FeatureExtractor(index, max(multiprocessing.cpu_count() // 2, 1))
    self.index_reader = IndexReader(index)
    self.data = data
def _compute_idf(index_path):
    from pyserini.index import IndexReader

    index_reader = IndexReader(index_path)
    tokens = []
    dfs = []
    for term in index_reader.terms():
        dfs.append(term.df)
        tokens.append(term.term)
    idfs = np.log(index_reader.stats()['documents'] / np.array(dfs))
    return dict(zip(tokens, idfs))
def compute_idf(query_terms: List[str], index_reader: IndexReader) -> np.ndarray:
    """log((|C| - df(term) + 0.5) / (df(term) + 0.5))"""
    C = index_reader.stats()['documents']
    query_idf = np.zeros(len(query_terms))
    for i, term in enumerate(query_terms):
        term_df = index_reader.get_term_counts(term, analyzer=None)[0]
        query_idf[i] = np.log(np.divide(C - term_df + 0.5, term_df + 0.5))
    return query_idf
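A minimal usage sketch (not part of the original snippet) showing how compute_idf might be called; the index path and query string below are placeholders.

# Placeholder index path; any Lucene index built with Pyserini works here.
index_reader = IndexReader('indexes/msmarco-passage')
# analyze() returns already-analyzed terms, matching analyzer=None in compute_idf.
query_terms = index_reader.analyze('diabetes treatment options')
idf_per_term = compute_idf(query_terms, index_reader)
print(dict(zip(query_terms, idf_per_term)))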
def __init__(self, k1: float = 1.6, b: float = 0.75, index_path: str = None):
    self.k1 = k1
    self.b = b
    self.use_corpus_estimator = False
    self.analyzer = Analyzer(get_lucene_analyzer())
    if index_path:
        self.use_corpus_estimator = True
        self.index_utils = IndexReader(index_path)
def __init__(self, strategy="GREEDY", seed=2020, max_iter=20):
    """
    This class produces a baseline BM25 ranking and uses LDA topic modelling
    in combination with the general re-ranking procedure of Huang and Hu (2009).
    """
    self.seed = seed
    self.max_iter = max_iter
    self.utils = Utils()

    # Number of documents to rank and rerank
    self.N = 100
    # Select a strategy for weighing final topics
    self.strategy = strategy
    # K to use in TOP-K-AVG strategy
    self.top_k = 10

    # TODO ideally we don't want to first rank every time for the reranking
    self.baseline = BaselineBM25(k=self.N)
    self.baseline.rank()

    # For each topic, the system outputs N retrieved articles.
    self.batch_hits = self.baseline.get_batch_hits()

    # Read index to retrieve document contents
    # N.B. the `contents` field is currently empty; we stored "raw" instead.
    self.index_loc = self.baseline.get_index_loc()
    reader = IndexReader(self.index_loc)

    # Vocabulary in index
    # vocabulary = [term.term for term in reader.terms()]
    # print(f"{len(vocabulary)} terms in vocabulary")

    # Topics and the retrieved articles are represented as keyword sequences.
    self.topics = self.baseline.get_topics()
    self.topic_keywords = {
        id: topic['title'].lower().split()
        for (id, topic) in self.topics.items()
    }
    self.query_ids = self.baseline.get_query_ids()

    # Preprocessed documents per query
    docs_per_query = {
        query_id: [reader.analyze(reader.doc(hit.docid).raw()) for hit in hits]
        for query_id, hits in self.batch_hits.items()
    }

    # Prepare bag-of-words dataset for gensim
    self.X = defaultdict(list)
    for id in self.query_ids:
        # Dictionary expects a list of lists, elements being lists of tokens
        dictionary = Dictionary(docs_per_query[id])
        self.X[id] = [dictionary.doc2bow(doc) for doc in docs_per_query[id]]
def main():
    try:
        # Location of the generated index
        index_loc = "indexes/msmarco-passage/lucene-index-msmarco"

        # Create a searcher object
        searcher = SimpleSearcher(index_loc)
        # Set the active scorer to BM25
        searcher.set_bm25(k1=0.9, b=0.4)
        # Fetch 3 results for the given test query
        results = searcher.search('this is a test query', k=3)
        # Check that the returned docids match the expected ranking
        expected = ['5578280', '2016011', '7004677']
        docids = [x.docid for x in results]
        if expected != docids:
            raise Exception('Test query results do not match expected:',
                            expected, '(expected)', docids, '(actual)')

        # IndexReader can give information about the index
        indexer = IndexReader(index_loc)
        if indexer.stats()['total_terms'] != 352316036:
            raise Exception(
                'There are an unexpected number of terms in your index set, '
                'perhaps something went wrong while downloading and indexing the dataset?')

        topics = get_topics("msmarco-passage-dev-subset")
        if topics == {}:
            raise Exception(
                'Could not find msmarco-passage-dev-subset... Best approach is to retry indexing the dataset.')

        first_query = topics[list(topics.keys())[0]]['title']
        if first_query != "why do people grind teeth in sleep":
            raise Exception(
                'Found a different first query than expected in the dataset. Did you download the right dataset?')

        # Using the pyserini tokenizer/stemmer/etc. to create queries from scratch
        query = "This is a test query in which things are tested. Found using www.google.com of course!"
        # Tokenizing in pyserini is called analyzing
        output = indexer.analyze(query)
        if len(output) != 9:
            raise Exception(
                'Tokenizer is not working correctly, something is probably wrong in Anserini. '
                'Perhaps try to install Anserini again.')
    except Exception as inst:
        print('ERROR: something went wrong in the installation')
        print(inst)
    else:
        print("INSTALLATION OK")
def compute_tf(query_terms: List[str], index_reader: IndexReader, doc_id: str) -> np.ndarray:
    query_tf = np.zeros(len(query_terms))
    doc_vector = index_reader.get_document_vector(doc_id)
    for i, term in enumerate(query_terms):
        query_tf[i] = doc_vector.get(term, 0)
    return query_tf
def main(): index_reader = IndexReader("../anserini/indexes/msmarco-doc/lucene-index-msmarco") generate_libsvm_representation(index_reader, "data/msmarco-doctrain-queries.tsv.gz", "data/msmarco-doctrain-qrels.tsv.gz", "data/msmarco-doc-libsvm/msmarco-doctrain-libsvm.txt", num_queries=100) # generate_libsvm_representation(index_reader, "data/msmarco-test2019-queries.tsv.gz", # "data/2019qrels-docs.txt.gz", # "data/msmarco-doc-libsvm/msmarco-doctest-libsvm.txt", num_queries=10) generate_libsvm_representation(index_reader, "data/msmarco-docdev-queries.tsv.gz", "data/msmarco-docdev-qrels.tsv.gz", "data/msmarco-doc-libsvm/msmarco-docdev-libsvm.txt", num_queries=100)
def compute_bm25(query_terms: List[str], index_reader: IndexReader, doc_id: str,
                 k1=0.9, b=0.4) -> np.ndarray:
    # Returns the per-term BM25 weights for this document (one entry per query term).
    scores = np.zeros(len(query_terms))
    for i, term in enumerate(query_terms):
        bm25 = index_reader.compute_bm25_term_weight(doc_id, term, analyzer=None, k1=k1, b=b)
        scores[i] = bm25
    return scores
class Bm25Reranker(Reranker):
    def __init__(self, k1: float = 1.6, b: float = 0.75, index_path: str = None):
        self.k1 = k1
        self.b = b
        self.use_corpus_estimator = False
        self.analyzer = Analyzer(get_lucene_analyzer())
        if index_path:
            self.use_corpus_estimator = True
            self.index_utils = IndexReader(index_path)

    def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
        query_words = self.analyzer.analyze(query.text)
        sentences = list(map(self.analyzer.analyze, (t.text for t in texts)))

        query_words_set = set(query_words)
        sentence_sets = list(map(set, sentences))
        if not self.use_corpus_estimator:
            idfs = {
                w: math.log(len(sentence_sets) /
                            (1 + sum(int(w in sent) for sent in sentence_sets)))
                for w in query_words_set
            }
        mean_len = np.mean(list(map(len, sentences)))
        d_len = len(sentences)

        texts = deepcopy(texts)
        for sent_words, text in zip(sentences, texts):
            tf = Counter(filter(query_words.__contains__, sent_words))
            if self.use_corpus_estimator:
                idfs = {
                    w: self.index_utils.compute_bm25_term_weight(text.metadata['docid'], w)
                    for w in tf
                }
            score = sum(idfs[w] * tf[w] * (self.k1 + 1) /
                        (tf[w] + self.k1 * (1 - self.b + self.b * (d_len / mean_len)))
                        for w in tf)
            if np.isnan(score):
                score = 0
            text.score = score
        return texts
from pyserini.index import IndexReader
import math
import numpy

index_reader = IndexReader('marcoindex')

number_of_docs = 8841823
number_of_all_terms = 491404850


def IDF(term):
    df, cf = index_reader.get_term_counts(term)
    return math.log10(number_of_docs / df)


def ictf(term):
    df, cf = index_reader.get_term_counts(term)
    return math.log10(number_of_all_terms / cf)


def SCS(query):
    # Simplified Clarity Score: log10(1/|q|) + mean inverse collection term frequency
    q_terms = query.split()
    avgictf = []
    for t in q_terms:
        avgictf.append(ictf(t))
    part_A = math.log10(1 / len(q_terms))
    part_B = numpy.mean(avgictf)
    return part_A + part_B


def SCQ(term):
    # Collection-query similarity: (1 + log10(cf)) * IDF(term)
    df, cf = index_reader.get_term_counts(term)
    part_A = 1 + math.log10(cf)
    part_B = IDF(term)
    return part_A * part_B
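A small usage sketch (not from the original) for the query-difficulty features above; the query string is a placeholder and its terms are assumed to occur in the collection, since a term with zero document or collection frequency would make the logarithms fail.

# Hypothetical query; all terms are content words expected to appear in MS MARCO.
query = 'diabetes treatment options'
print('SCS:', SCS(query))
for term in query.split():
    print(term, 'SCQ:', SCQ(term))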
def load_samples(self):
    indexer = IndexReader(self.args.index_dir)
    custom_bm25 = search.LuceneSimilarities.bm25(self.args.bm25_k1, self.args.bm25_b)
    qrels_path = os.path.join(self.args.msmarco_dir, f"qrels.{self.mode}.tsv")
    candidates_path = os.path.join(self.args.msmarco_dir, f"top_candidates.{self.mode}.tsv")

    # all queries text (for calculating the BM25 scores)
    queries_text = dict()
    for line in open(os.path.join(self.args.msmarco_dir, f"queries.{self.mode}.tsv"), 'r'):
        qid, text = line.split('\t')
        text = text.rstrip()
        queries_text[qid] = text

    if self.mode == 'train':
        # qrels (labels)
        qrel_lst = defaultdict(list)
        for line in open(qrels_path, 'r'):
            qid, _, pid, _ = line.split('\t')
            qrel_lst[qid].append(int(pid))
        qrel_lst = dict(qrel_lst)

        # top docs by BM25 (negative samples)
        top_lst = defaultdict(list)
        for line in tqdm(open(candidates_path, 'r'), desc=f"{self.mode} top candidates"):
            qid, pid, score = line.split('\t')
            if int(pid) not in qrel_lst[qid]:
                top_lst[qid].append({'pid': int(pid), 'score': float(score)})
        top_lst = dict(top_lst)

        qids, pos_pids, neg_pids, pos_scores, neg_scores = [], [], [], [], []
        for qid in tqdm(qrel_lst, desc=f"{self.mode} samples"):
            if qid in top_lst:
                for pos_pid in qrel_lst[qid]:
                    pos_score = indexer.compute_query_document_score(
                        str(pos_pid), queries_text[qid], similarity=custom_bm25)
                    # Take the top args.p candidates as negatives
                    # (the meaning of "p" is not documented by the original author).
                    neg_docs = top_lst[qid][:self.args.p]
                    for neg_doc in neg_docs:
                        qids.append(qid)
                        pos_pids.append(pos_pid)
                        neg_pids.append(neg_doc['pid'])
                        pos_scores.append(pos_score)
                        neg_scores.append(neg_doc['score'])
        self.qids, self.pos_pids, self.neg_pids = qids, pos_pids, neg_pids
        self.pos_scores, self.neg_scores = pos_scores, neg_scores
    else:
        # top docs by BM25
        top_lst = defaultdict(list)
        for line in tqdm(open(candidates_path, 'r'), desc=f"{self.mode} top candidates"):
            qid, pid, score = line.split('\t')
            top_lst[qid].append({'pid': int(pid), 'score': float(score)})
        top_lst = dict(top_lst)

        qids, pids, scores = [], [], []
        for i, qid in enumerate(top_lst):
            for doc in top_lst[qid]:
                qids.append(qid)
                pids.append(doc['pid'])
                scores.append(doc['score'])
            if (i + 1) == self.args.num_eval_queries:
                break
        self.qids, self.pids, self.scores = qids, pids, scores
def check_sparse(index):
    for entry in index:
        print(f'# Validating "{entry}"...')
        IndexReader.validate_prebuilt_index(entry)
        print('\n')
def main(is_training):
    embeddings_file = 'glove.840B.300d'
    print(f'Processing {embeddings_file}')

    if is_training:
        qrels_file = 'qrels.train.tsv'
        queries_file = 'queries.train.tsv'
        query_embeddings_file = f'embeddings/{embeddings_file}/queries-embeddings.train.tsv'
        doc_embeddings_file = f'embeddings/{embeddings_file}/documents-embeddings.train.tsv'
        output_file = f'ranklib-features/{embeddings_file}/data_ranklib-embeddings-train.txt'
    else:
        qrels_file = 'runs/run.msmarco-test2019-queries-bm25.trec'
        queries_file = 'msmarco-test2019-queries.tsv'
        query_embeddings_file = f'embeddings/{embeddings_file}/queries-embeddings.test.tsv'
        doc_embeddings_file = f'embeddings/{embeddings_file}/documents-embeddings.test.tsv'
        output_file = f'ranklib-features/{embeddings_file}/data_ranklib-embeddings-test.txt'

    queries = read_topics(queries_file)
    index_reader = IndexReader('indexes/msmarco-passage')
    qrels = open(qrels_file, 'r')

    print('Reading query vectors')
    query_embeddings_handle = open(query_embeddings_file, 'r')
    query_vector_id, query_vector_values = load_fasttext_line(query_embeddings_handle.readline())

    print('Reading document vectors')
    doc_vectors = load_fasttext_vectors(doc_embeddings_file, False)
    doc_ids = list(doc_vectors.keys())

    count = 0
    print('Calculating features')
    os.system(f'mkdir -p ranklib-features/{embeddings_file}')
    with open(output_file, 'w') as output_file_handle:
        for line in qrels:
            line = line.strip().split('\t')
            qid = int(line[0])
            docid = line[2]
            target = line[3]
            query = queries[qid]['title']

            if int(query_vector_id) != qid:
                old_id = query_vector_id
                while int(old_id) == int(query_vector_id):
                    query_vector_id, query_vector_values = load_fasttext_line(
                        query_embeddings_handle.readline())

            doc_vector = doc_vectors[docid]
            if math.isnan(query_vector_values[0]) or math.isnan(doc_vector[0]):
                count += 1
                continue

            features = {
                **compute_similarity(query_vector_values, doc_vector),
                **compute_features(index_reader, query, docid)
            }
            output_file_handle.write(format_qrel_line(target, qid, features, docid))

            # The evaluation set doesn't need negative examples.
            if is_training:
                negative_docid = str(get_negative_docid(doc_ids, docid))
                features = {
                    **compute_similarity(query_vector_values, doc_vectors[negative_docid]),
                    **compute_features(index_reader, query, negative_docid)
                }
                output_file_handle.write(format_qrel_line(0, qid, features, negative_docid))

            if count % 10000 == 0:
                print(count)
            count += 1
line = line.strip().split("\t") # Try and parse the keys into integers try: topic_key = int(line[0]) except ValueError: topic_key = line[0] topics[topic_key] = { 'title': line[1], } return topics C_size = 50 index_reader = IndexReader('indexes/msmarco-passage') top_25 = [{ 'term': 'you', 'cf': 3704969 }, { 'term': 'your', 'cf': 2871978 }, { 'term': 'from', 'cf': 2433977 }, { 'term': 'us', 'cf': 2215803 }, { 'term': 'can',
# hits contains: docid, retrieval score, and document content
# N.B. "black bear attacks" is the title of topic 336
query = 'black bear attacks'
hits = searcher.search(query)

# Print first 10 hits
utils.print_top_n_results(hits, 10)

# ----------------
# IndexReaderUtils
# ----------------
from pyserini.index import IndexReader

# Now we do not search the index, but retrieve a document directly from the index
reader = IndexReader(index_loc)

# Retrieve a document using its docid
# id = 'd6ed7028c686e5756ceb0aa0c9b62e0d'
id = hits[0].docid

# See class Document in https://github.com/castorini/pyserini/blob/master/pyserini/search/_base.py
# properties: docid; id (alias); lucene_document; contents; raw
doc = reader.doc(id).raw()
# print(doc)

# Get analyzed form (tokenized, stemmed, stopwords removed)
analyzed = reader.analyze(doc)
# print(analyzed)

# Raw document VECTOR is also stored
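Following up on that last comment, a short continuation sketch (not part of the original excerpt), assuming the same `reader`, `id`, and `analyzed` variables as above, showing the stored document vector and per-term collection counts:

# Document vector: analyzed term -> term frequency within this document
doc_vector = reader.get_document_vector(id)
# print(doc_vector)

# Collection statistics (document frequency, collection frequency) for one analyzed term
term = analyzed[0]
df, cf = reader.get_term_counts(term, analyzer=None)
# print(f'{term}: df={df}, cf={cf}')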
def compute_document_length(index_reader: IndexReader, doc_id: str) -> int:
    return len(index_reader.doc_raw(doc_id))
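Note that `doc_raw` returns the raw document text, so the length above is a character count. A hedged alternative sketch (not part of the original) for a term-level length, using the same IndexReader API:

def compute_document_length_in_terms(index_reader: IndexReader, doc_id: str) -> int:
    # Sum of term frequencies in the stored document vector (analyzed terms).
    doc_vector = index_reader.get_document_vector(doc_id)
    return sum(doc_vector.values())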
class MsmarcoLtrSearcher:
    def __init__(self, model: str, ibm_model: str, index: str, data: str):
        self.model = model
        self.ibm_model = ibm_model
        self.fe = FeatureExtractor(index, max(multiprocessing.cpu_count() // 2, 1))
        self.index_reader = IndexReader(index)
        self.data = data

    def add_fe(self):
        # self.fe.add(RunList('collections/msmarco-ltr-passage/run.monot5.run_list.whole.trec', 't5'))

        # The two pooler orderings below preserve the order in which the original
        # code registered each feature, so feature indices are unchanged.
        weight_poolers = [SumPooler, AvgPooler, MedianPooler, MaxPooler, MinPooler, MaxMinRatioPooler]
        stat_poolers = [AvgPooler, MedianPooler, SumPooler, MinPooler, MaxPooler, MaxMinRatioPooler]

        for qfield, ifield in [('analyzed', 'contents'),
                               ('text_unlemm', 'text_unlemm'),
                               ('text_bert_tok', 'text_bert_tok')]:
            print(qfield, ifield)
            for pooler in weight_poolers:
                self.fe.add(BM25Stat(pooler(), k1=2.0, b=0.75, field=ifield, qfield=qfield))
            for pooler in weight_poolers:
                self.fe.add(LmDirStat(pooler(), mu=1000, field=ifield, qfield=qfield))
            self.fe.add(NormalizedTfIdf(field=ifield, qfield=qfield))
            self.fe.add(ProbalitySum(field=ifield, qfield=qfield))
            for pooler in weight_poolers:
                self.fe.add(DfrGl2Stat(pooler(), field=ifield, qfield=qfield))
            for pooler in weight_poolers:
                self.fe.add(DfrInExpB2Stat(pooler(), field=ifield, qfield=qfield))
            for pooler in weight_poolers:
                self.fe.add(DphStat(pooler(), field=ifield, qfield=qfield))
            self.fe.add(Proximity(field=ifield, qfield=qfield))
            self.fe.add(TpScore(field=ifield, qfield=qfield))
            self.fe.add(TpDist(field=ifield, qfield=qfield))
            self.fe.add(DocSize(field=ifield))
            self.fe.add(QueryLength(qfield=qfield))
            self.fe.add(QueryCoverageRatio(qfield=qfield))
            self.fe.add(UniqueTermCount(qfield=qfield))
            self.fe.add(MatchingTermCount(field=ifield, qfield=qfield))
            self.fe.add(SCS(field=ifield, qfield=qfield))
            for pooler in stat_poolers:
                self.fe.add(TfStat(pooler(), field=ifield, qfield=qfield))
            for pooler in stat_poolers:
                self.fe.add(TfIdfStat(True, pooler(), field=ifield, qfield=qfield))
            for pooler in stat_poolers:
                self.fe.add(NormalizedTfStat(pooler(), field=ifield, qfield=qfield))
            for pooler in stat_poolers:
                self.fe.add(IdfStat(pooler(), field=ifield, qfield=qfield))
            for pooler in stat_poolers:
                self.fe.add(IcTfStat(pooler(), field=ifield, qfield=qfield))
            for gap in (3, 8, 15):
                self.fe.add(UnorderedSequentialPairs(gap, field=ifield, qfield=qfield))
            for gap in (3, 8, 15):
                self.fe.add(OrderedSequentialPairs(gap, field=ifield, qfield=qfield))
            for gap in (3, 8, 15):
                self.fe.add(UnorderedQueryPairs(gap, field=ifield, qfield=qfield))
            for gap in (3, 8, 15):
                self.fe.add(OrderedQueryPairs(gap, field=ifield, qfield=qfield))

        start = time.time()
        self.fe.add(IbmModel1(f"{self.ibm_model}/title_unlemm", "text_unlemm",
                              "title_unlemm", "text_unlemm"))
        end = time.time()
        print('IBM model Load takes %.2f seconds' % (end - start))
        start = end

        self.fe.add(IbmModel1(f"{self.ibm_model}url_unlemm", "text_unlemm",
                              "url_unlemm", "text_unlemm"))
        end = time.time()
        print('IBM model Load takes %.2f seconds' % (end - start))
        start = end

        self.fe.add(IbmModel1(f"{self.ibm_model}body", "text_unlemm",
                              "body", "text_unlemm"))
        end = time.time()
        print('IBM model Load takes %.2f seconds' % (end - start))
        start = end

        self.fe.add(IbmModel1(f"{self.ibm_model}text_bert_tok", "text_bert_tok",
                              "text_bert_tok", "text_bert_tok"))
        end = time.time()
        print('IBM model Load takes %.2f seconds' % (end - start))
        start = end

    def batch_extract(self, df, queries, fe):
        tasks = []
        task_infos = []
        group_lst = []

        for qid, group in tqdm(df.groupby('qid')):
            task = {
                "qid": qid,
                "docIds": [],
                "rels": [],
                "query_dict": queries[qid]
            }
            for t in group.reset_index().itertuples():
                if self.data == 'document':
                    if self.index_reader.doc(t.pid) is not None:
                        task["docIds"].append(t.pid)
                        task_infos.append((qid, t.pid, t.rel))
                else:
                    task["docIds"].append(t.pid)
                    task_infos.append((qid, t.pid, t.rel))
            tasks.append(task)
            group_lst.append((qid, len(task['docIds'])))
            if len(tasks) == 1000:
                features = fe.batch_extract(tasks)
                task_infos = pd.DataFrame(task_infos, columns=['qid', 'pid', 'rel'])
                group = pd.DataFrame(group_lst, columns=['qid', 'count'])
                print(features.shape)
                print(task_infos.qid.drop_duplicates().shape)
                print(group.mean())
                print(features.head(10))
                print(features.info())
                yield task_infos, features, group
                tasks = []
                task_infos = []
                group_lst = []

        # deal with the rest
        if len(tasks) > 0:
            features = fe.batch_extract(tasks)
            task_infos = pd.DataFrame(task_infos, columns=['qid', 'pid', 'rel'])
            group = pd.DataFrame(group_lst, columns=['qid', 'count'])
            print(features.shape)
            print(task_infos.qid.drop_duplicates().shape)
            print(group.mean())
            print(features.head(10))
            print(features.info())
            yield task_infos, features, group

        return

    def batch_predict(self, models, dev_extracted, feature_name):
        task_infos, features, group = dev_extracted
        dev_X = features.loc[:, feature_name]

        task_infos['score'] = 0.
        for gbm in models:
            task_infos['score'] += gbm.predict(dev_X)

    def search(self, dev, queries):
        batch_info = []
        start_extract = time.time()
        models = pickle.load(open(self.model + '/model.pkl', 'rb'))
        metadata = json.load(open(self.model + '/metadata.json', 'r'))
        feature_used = metadata['feature_names']
        for dev_extracted in self.batch_extract(dev, queries, self.fe):
            end_extract = time.time()
            print(f'extract 1000 queries take {end_extract - start_extract}s')
            task_infos, features, group = dev_extracted
            start_predict = time.time()
            self.batch_predict(models, dev_extracted, feature_used)
            end_predict = time.time()
            print(f'predict 1000 queries take {end_predict - start_predict}s')
            batch_info.append(task_infos)
            start_extract = time.time()
        batch_info = pd.concat(batch_info, axis=0, ignore_index=True)
        return batch_info
def index_reader(self):
    from pyserini.index import IndexReader
    return IndexReader(str(self.path))
def run():
    parser = argparse.ArgumentParser(description="TREC-COVID document ranker CLI")
    parser.add_argument("-v", "--verbose", help="Increase output verbosity",
                        action="store_true", default=False)
    parser.add_argument("-cp", "--compute_pickle",
                        help="Compute mapping from internal Lucene ids to external docids",
                        action="store_true", default=False)
    parser.add_argument("-n", "--n_queries", help="Maximum number of queries to run",
                        type=int, default=999)
    parser.add_argument("-m", "--model", help="Which model to use for ranking: {bm25, tf_idf}",
                        default="bm25")
    parser.add_argument("-d", "--doc_at_a_time", help="Use document_at_a_time algorithm",
                        action="store_true", default=False)
    parser.add_argument("-k", "--k_docs", help="Number of documents to retrieve",
                        type=int, default=100)
    parser.add_argument("-r", "--rerank", help="Which rerank model to use: 'rocchio' or 'ide'",
                        default="none")
    args = parser.parse_args()

    global verbose
    verbose = args.verbose
    model = args.model
    doc_at_a_time = args.doc_at_a_time
    k = args.k_docs
    rerank = args.rerank

    index_reader = IndexReader(LUCENE_INDEX)
    searcher = SimpleSearcher(LUCENE_INDEX)
    models = Models(index_reader, QRELFILE)
    trec_index = Index(index_reader, searcher)

    if not os.path.exists('output'):
        os.makedirs('output')

    if args.compute_pickle:
        print("Computing id index dict")
        docidx_docid = {
            docidx: (trec_index.get_docid_from_index(docidx),
                     trec_index.get_n_of_words_in_inverted_list_doc(docidx))
            for docidx in range(trec_index.get_max_docindex())
        }
        with open('blob/mapping.pickle', 'wb') as handle:
            pickle.dump(docidx_docid, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open('blob/mapping.pickle', 'rb') as handle:
        print("Loading id index dict")
        docidx_docid = pickle.load(handle)
    print("Finished initializing id index dict")

    topics = parse_topics(TOPICSFILE)
    rocchio = False

    if model == "bm25":
        rankfun = score_bm25
    elif model == "tf_idf":
        rankfun = score_tf_idf
    else:
        print("Model should be 'tf_idf' or 'bm25' (default)!")
        sys.exit(1)

    t = time.localtime()
    current_time = time.strftime("%H:%M", t)
    rankfile = "output/ranking-{0}-{1}.txt".format(model, current_time)
    resultfile = "output/results-{0}-{1}.json".format(model, current_time)

    if doc_at_a_time:
        with open(rankfile, 'w') as outfile:
            for idx in range(1, min(args.n_queries + 1, len(topics) + 1)):
                for i, (score, docid) in enumerate(
                        document_at_a_time(topics[str(idx)]["query"], trec_index,
                                           models, k, docidx_docid), 1):
                    outfile.write(write_output(idx, docid, i, score, "document_at_a_time"))
    else:
        with open(rankfile, 'w') as outfile:
            for idx in range(1, min(args.n_queries + 1, len(topics) + 1)):
                for i, (score, docid) in enumerate(
                        get_docs_and_score_query(topics[str(idx)]["query"], rankfun,
                                                 trec_index, models, idx, k,
                                                 docidx_docid, rerank=rerank), 1):
                    outfile.write(write_output(idx, docid, i, score, "score_query"))

    results = pytrec_evaluation(rankfile, QRELFILE)
    with open(resultfile, 'w') as outjson:
        json.dump(results, outjson)
from pyserini.index import IndexReader
from pyserini.search import SimpleSearcher

parser = argparse.ArgumentParser()
parser.add_argument('--msmarco_dir', type=str, default="./data")
parser.add_argument('--index_dir', type=str, default="./data/index")
parser.add_argument('--output_dir', type=str, default="./data/bm25_result")
parser.add_argument('--bm25_k1', type=float, default=0.6)
parser.add_argument('--bm25_b', type=float, default=0.8)
parser.add_argument('--threads', type=int, default=4)
parser.add_argument('--sample', type=int, default=0)
args = parser.parse_args()

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

indexer = IndexReader(args.index_dir)
searcher = SimpleSearcher(args.index_dir)
searcher.set_bm25(k1=args.bm25_k1, b=args.bm25_b)
num_candidates = indexer.stats()['documents']


def calculate_bm25(query):
    qid, text = query
    with open(os.path.join(args.output_dir, f"{qid}.tsv"), 'w') as outfile:
        candidates = searcher.search(text, k=num_candidates)
        for i in range(len(candidates)):
            outfile.write(f"{candidates[i].docid}\t{candidates[i].score}\n")


if __name__ == "__main__":
    # load the queries