def test_plus_more_cols(self):
    """Adding two uniform transformers sums their scores per (qid, docno),
    and the internal merge must not leak suffixed columns into the output.

    Fix: removed the unused ``import pyterrier.transformer as ptt``.
    """
    from pyterrier.model import add_ranks
    mock1 = pt.Transformer.from_df(add_ranks(
        pd.DataFrame([["q1", "a query", "doc1", 5]],
                     columns=["qid", "query", "docno", "score"])), uniform=True)
    mock2 = pt.Transformer.from_df(add_ranks(
        pd.DataFrame([["q1", "a query", "doc1", 10]],
                     columns=["qid", "query", "docno", "score"])), uniform=True)
    combined = mock1 + mock2
    # we dont need an input, as both Identity transformers will return anyway
    rtr = combined.transform(None)
    self.assertEqual(1, len(rtr))
    self.assertEqual("q1", rtr.iloc[0]["qid"])
    self.assertEqual("doc1", rtr.iloc[0]["docno"])
    # 5 + 10
    self.assertEqual(15, rtr.iloc[0]["score"])
    # pandas merge suffixes (_x/_y/...) must have been cleaned up
    bad_columns = [
        "rank_x", "rank_y", "rank_r",
        "query_x", "query_y", "query_R",
        "score_x", "score_y", "score_r"
    ]
    for bad in bad_columns:
        self.assertFalse(bad in rtr.columns, "column %s in returned dataframe" % bad)
def test_rank_two_queries(self):
    """Ranks restart from FIRST_RANK for each distinct qid."""
    rows = [["q1", "doc1", 5], ["q1", "doc2", 4], ["q2", "doc1", 4]]
    ranked = add_ranks(pd.DataFrame(rows, columns=["qid", "docno", "score"]))
    self.assertTrue("rank" in ranked.columns)
    # q1: two docs, ranked FIRST_RANK and FIRST_RANK+1
    self.assertEqual(ranked.iloc[0]["rank"], FIRST_RANK)
    self.assertEqual(ranked.iloc[1]["rank"], FIRST_RANK + 1)
    # q2: ranking restarts
    self.assertEqual(ranked.iloc[2]["rank"], FIRST_RANK)
def test_mul(self):
    """Scalar multiplication scales scores; a negative scalar reverses the ranking.

    Fix: ``import pyterrier.transformer as ptt`` appeared twice and was never
    used; both occurrences removed.
    """
    from pyterrier.model import add_ranks
    mock = pt.Transformer.from_df(pd.DataFrame(
        [["q1", "doc1", 5]], columns=["qid", "docno", "score"]), uniform=True)
    # left- and right-multiplication must behave identically
    for comb in [mock * 10, 10 * mock]:
        rtr = comb.transform(None)
        self.assertEqual(1, len(rtr))
        self.assertEqual("q1", rtr.iloc[0]["qid"])
        self.assertEqual("doc1", rtr.iloc[0]["docno"])
        self.assertEqual(50, rtr.iloc[0]["score"])
    mock = pt.Transformer.from_df(add_ranks(
        pd.DataFrame([["q1", "doc1", 5], ["q1", "doc2", 10]],
                     columns=["qid", "docno", "score"])), uniform=True)
    rtr = mock.search("bla", qid="q1")
    self.assertEqual(2, len(rtr))
    self.assertEqual("q1", rtr.iloc[0]["qid"])
    # doc2 has the higher score, so it is ranked first
    self.assertEqual("doc2", rtr.iloc[0]["docno"])
    self.assertEqual(pt.model.FIRST_RANK, rtr.iloc[0]["rank"])
    # multiplying by -1 inverts the ordering: doc1 becomes top-ranked
    rtr = (-1 * mock).search("bla", qid="q1")
    self.assertEqual(2, len(rtr))
    self.assertEqual("q1", rtr.iloc[0]["qid"])
    self.assertEqual("doc1", rtr.iloc[0]["docno"])
    self.assertEqual(pt.model.FIRST_RANK, rtr.iloc[0]["rank"])
def transform(self, topics_and_res):
    """Aggregate passage-level scores into document-level scores.

    Input docnos are formatted "<docno>%p<passage_id>". Per (qid, docno) the
    passage scores are combined according to ``self.agg``: 'first' (score of
    the lowest-numbered passage), 'max', 'mean', or 'kmaxavg' (mean of the
    top ``self.K`` passage scores).

    Fix: in the 'kmaxavg' branch, ``np.argpartition(values, -K)[-K:].mean()``
    averaged the *indices* returned by argpartition rather than the top-K
    score values; the indices are now used to select values first.
    """
    scoredict = defaultdict(lambda: defaultdict(dict))
    lastqid = None
    qids = []  # preserve first-seen query order for the output
    for i, row in topics_and_res.iterrows():
        qid = row["qid"]
        if qid != lastqid:
            qids.append(qid)
            lastqid = qid
        docno, passage = row["docno"].split("%p")
        scoredict[qid][docno][int(passage)] = row["score"]
    rows = []
    for qid in qids:
        for docno in scoredict[qid]:
            if self.agg == 'first':
                first_passage_id = min(scoredict[qid][docno].keys())
                score = scoredict[qid][docno][first_passage_id]
            if self.agg == 'max':
                score = max(scoredict[qid][docno].values())
            if self.agg == 'mean':
                score = sum(scoredict[qid][docno].values()) / len(scoredict[qid][docno])
            if self.agg == "kmaxavg":
                values = np.fromiter(scoredict[qid][docno].values(), dtype=float)
                K = self.K
                if len(values) > K:
                    # argpartition yields indices of the K largest entries;
                    # index into values before averaging (previous bug:
                    # averaged the indices themselves)
                    score = values[np.argpartition(values, -K)[-K:]].mean()
                else:
                    score = values.mean()
            rows.append([qid, docno, score])
    rtr = pd.DataFrame(rows, columns=["qid", "docno", "score"])
    # add the queries back
    queries = topics_and_res[["qid", "query"]].dropna(axis=0, subset=["query"]).drop_duplicates()
    rtr = rtr.merge(queries, on=["qid"])
    rtr = add_ranks(rtr)
    return rtr
def transform(self, topics_and_res):
    """Vectorised passage-score aggregation: docnos "<docno>%p<pid>" are
    collapsed to one row per (qid, document) according to ``self.agg``.

    Fix: in the 'first' branch, the ``pid == 0`` filter assignment was dead
    code — its result was unconditionally overwritten by the groupby/idxmin
    selection two statements later; the dead assignment is removed.
    """
    topics_and_res = topics_and_res.copy()
    topics_and_res[["olddocno", "pid"]] = topics_and_res.docno.str.split("%p", expand=True)
    if self.agg == 'max':
        groups = topics_and_res.groupby(['qid', 'olddocno'])
        group_max_idx = groups['score'].idxmax()
        rtr = topics_and_res.loc[group_max_idx, :]
        rtr = rtr.drop(columns=['docno', 'pid']).rename(columns={"olddocno" : "docno"})
    if self.agg == 'first':
        # cast so pid ordering is numeric (0,1,...,10) rather than lexicographic
        topics_and_res.pid = topics_and_res.pid.astype(int)
        groups = topics_and_res.groupby(['qid', 'olddocno'])
        group_first_idx = groups['pid'].idxmin()
        rtr = topics_and_res.loc[group_first_idx, ]
        rtr = rtr.drop(columns=['docno', 'pid']).rename(columns={"olddocno" : "docno"})
    if self.agg == 'mean':
        rtr = topics_and_res.groupby(['qid', 'olddocno']).mean()['score'].reset_index().rename(columns={'olddocno' : 'docno'})
        from .model import query_columns
        # add query columns back
        rtr = rtr.merge(topics_and_res[query_columns(topics_and_res)].drop_duplicates(), on='qid')
    if self.agg == 'kmaxavg':
        # NOTE(review): k is hard-coded to 2 here, while the sibling
        # implementation uses self.K — confirm whether this should honour self.K
        rtr = topics_and_res.groupby(['qid', 'olddocno'])['score'].apply(lambda ser: ser.nlargest(2).mean()).reset_index().rename(columns={'olddocno' : 'docno'})
        from .model import query_columns
        # add query columns back
        rtr = rtr.merge(topics_and_res[query_columns(topics_and_res)].drop_duplicates(), on='qid')
    rtr = add_ranks(rtr)
    return rtr
def test_rank_one_query(self):
    """Tied scores keep their input order (trec_eval would instead break
    ties on ascending docno)."""
    ranked = add_ranks(pd.DataFrame(
        [["q1", "doc1", 5], ["q1", "doc2", 5]],
        columns=["qid", "docno", "score"]))
    self.assertTrue("rank" in ranked.columns)
    # first row receives the first rank
    self.assertEqual(ranked.iloc[0]["rank"], FIRST_RANK)
    # the tie is resolved by preserving the original row order
    self.assertEqual(ranked.iloc[1]["rank"], FIRST_RANK + 1)
def test_rank_one_query_neg(self):
    """Negative scores rank correctly regardless of input row order."""
    for rows in ([["q1", "doc1", -4], ["q1", "doc2", -5]],
                 [["q1", "doc2", -5], ["q1", "doc1", -4]]):
        ranked = add_ranks(pd.DataFrame(rows, columns=["qid", "docno", "score"]))
        ranked = ranked.sort_values("rank", ascending=True)
        self.assertTrue("rank" in ranked.columns)
        # the least-negative score (-4, doc1) must take the first rank
        self.assertEqual(ranked.iloc[0]["rank"], FIRST_RANK)
        self.assertEqual(ranked.iloc[0]["docno"], "doc1")
def test_rank_one_query_sort(self):
    """With STRICT_SORT enabled, add_ranks also sorts rows by rank.

    Fix: the global ``pt.model.STRICT_SORT`` flag was not restored when an
    assertion failed, leaking state into subsequent tests; restoration now
    happens in a ``finally`` block.
    """
    import pyterrier as pt
    sort_status = pt.model.STRICT_SORT
    pt.model.STRICT_SORT = True
    try:
        df = pd.DataFrame([["q1", "doc1", 4], ["q1", "doc2", 5]],
                          columns=["qid", "docno", "score"])
        df = add_ranks(df)
        print(df)
        self.assertTrue("rank" in df.columns)
        # the higher-scoring doc2 must now be the first row
        self.assertEqual(df.iloc[0]["rank"], FIRST_RANK)
        self.assertEqual(df.iloc[0]["docno"], "doc2")
    finally:
        pt.model.STRICT_SORT = sort_status
def transform(self, te):
    """Score the (qid, docno) rows in ``te`` with the BERT model and return
    the frame with a new "score" column and ranks added.

    Fix: the assertion message referenced the undefined name ``tr``, so a
    length mismatch raised NameError instead of the intended AssertionError.
    """
    te_dataset = DFDataset(te, self.tokenizer, split="test", get_doc_fn=self.get_doc_fn)
    # we permit to adjust the batch size to allow better testing
    scores = bert4ir_score(self.model, te_dataset, batch_size=self.test_batch_size)
    assert len(scores) == len(te), "Expected %d scores, but got %d" % (len(te), len(scores))
    # NOTE(review): mutates the caller's dataframe in place — confirm intended
    te["score"] = scores
    return add_ranks(te)
def transform(self, topics_res):
    """Score each row by negated Word Mover's Distance between its query
    and its document text, then add ranks."""
    def _row_distance(row):
        query_toks = list(tokenize(row["query"]))
        doc_toks = list(tokenize(row[self.doc_attr]))
        return self.wv.wmdistance(query_toks, doc_toks)
    # could take a while, add a progress bar if asked to
    if self.verbose:
        tqdm.pandas()
        apply_fn = topics_res.progress_apply
    else:
        apply_fn = topics_res.apply
    # wmdistance is a _distance_, so negate it to obtain a similarity score
    topics_res["score"] = -1 * apply_fn(_row_distance, axis=1)
    return add_ranks(topics_res)
def transform(self, topics):
    """Dense retrieval: encode each query, search every FAISS index shard,
    and return a ranked dataframe limited to self.num_results per query."""
    from pyterrier import tqdm
    queries = []
    qid2q = {}
    # tokenise each query; keep both the (possibly truncated) length and the
    # padded id sequence expected by the encoder
    for q, qid in zip(topics["query"].to_list(), topics["qid"].to_list()):
        passage = self.tokenizer.encode(
            q,
            add_special_tokens=True,
            max_length=self.args.max_seq_length,
        )
        passage_len = min(len(passage), self.args.max_query_length)
        input_id_b = pad_input_ids(passage, self.args.max_query_length)
        queries.append([passage_len, input_id_b])
        qid2q[qid] = q
    print("***** inference of %d queries *****" % len(queries))
    # embed all queries in one streaming inference pass
    dev_query_embedding, dev_query_embedding2id = StreamInferenceDoc(
        self.args, self.model, GetProcessingFn(self.args, query=True),
        "transform", queries, is_query_inference=True)
    print("***** faiss search for %d queries on %d shards *****" % (len(queries), self.segments))
    rtr = []
    # search each shard independently; offsets map shard-local ids back to
    # the global passage id space inside _calc_scores
    for i, offset in enumerate(tqdm(self.shard_offsets, unit="shard")):
        scores, neighbours = self.cpu_index[i].search(dev_query_embedding, self.num_results)
        res = self._calc_scores(
            topics["qid"].values, self.passage_embedding2id[i],
            neighbours, scores,
            num_results=self.num_results, offset=offset, qid2q=qid2q)
        rtr.append(res)
    rtr = pd.concat(rtr)
    rtr = add_ranks(rtr)
    # after merging shards, keep only the global top num_results per query
    rtr = rtr[rtr["rank"] < self.num_results]
    rtr = rtr.sort_values(by=["qid", "score", "docno"], ascending=[True, False, True])
    return rtr
def transform(self, queries_and_docs):
    """Rerank each query's candidate documents with ColBERT and return a
    ranked results dataframe."""
    collected = []
    grouped = queries_and_docs.groupby("qid")
    iterator = tqdm(grouped, total=len(grouped), desc='colbert', unit="q") if self.verbose else grouped
    # inference only — no gradients needed
    with torch.no_grad():
        for qid, group in iterator:
            query = group["query"].values[0]
            ranking = rerank(self.args, query, group["docno"].values,
                             group[self.doc_attr].values, index=None)
            for rank, (score, pid, passage) in enumerate(ranking):
                collected.append([qid, query, pid, score, rank])
    frame = pd.DataFrame(collected, columns=["qid", "query", "docno", "score", "rank"])
    return add_ranks(frame)
def transform(self, topics_and_res):
    """Apply BERT-QE scoring query by query and return a ranked dataframe."""
    import pandas as pd
    from pyterrier import tqdm, started
    assert started()
    grouper = topics_and_res.groupby("qid")
    iterator = tqdm(grouper, desc="BERTQE", unit="q") if self.verbose else grouper
    collected = []
    # for each query, score its documents and re-attach scores row by row
    for qid, group in iterator:
        query = group["query"].iloc[0]
        scores = self._for_each_query(qid, query, group[["docno", self.body_attr]])
        for i, s in enumerate(scores.tolist()):
            collected.append([qid, query, group.iloc[i]["docno"], s])
    # build the final dataframe and add ranks
    frame = pd.DataFrame(collected, columns=["qid", "query", "docno", "score"])
    return add_ranks(frame)
def transform(self, topics_res):
    """Score each row by the cosine similarity between the averaged word
    embeddings of its query and its document text, then add ranks.

    Fix: the numerator previously computed ``qs_avg @ qs_avg.T`` — the query
    vector with itself — while the denominator used both norms, so the result
    was not the query-document cosine similarity. The numerator now uses
    ``qs_avg @ ds_avg.T``.
    """
    def lambda_row(row):
        q = list(tokenize(row["query"]))
        d = list(tokenize(row[self.doc_attr]))
        # out-of-vocabulary tokens fall back to the self.oov vector
        qs = np.array(
            [self.wv[t] if t in self.wv else self.oov for t in q])
        ds = np.array(
            [self.wv[t] if t in self.wv else self.oov for t in d])
        qs_avg = np.average(qs, axis=0)
        ds_avg = np.average(ds, axis=0)
        # cosine similarity: query . document over the product of norms
        return qs_avg @ ds_avg.T / (norm(qs_avg) * norm(ds_avg))
    # could take a while, add a progress bar if asked to
    if self.verbose:
        tqdm.pandas()
        topics_res["score"] = topics_res.progress_apply(lambda_row, axis=1)
    else:
        topics_res["score"] = topics_res.apply(lambda_row, axis=1)
    return add_ranks(topics_res)
def transform(self, queries_and_docs):
    """Rerank the input rows with a CEDR model; returns the input columns
    with fresh scores and ranks."""
    from cedr import train
    import pyterrier as pt
    train.tqdm = pt.tqdm
    import pandas as pd
    test_run = self._make_cedr_run(queries_and_docs, None)
    dataset = self._make_cedr_dataset(queries_and_docs)
    run_values = train.run_model(self.model, dataset, test_run, desc="CEDR")
    # flatten the nested {qid: {docno: score}} run into rows
    rows = [[q, d, docs[d]] for q, docs in run_values.items() for d in docs]
    run_df = pd.DataFrame(rows, columns=["qid", "docno", "score"])
    # drop any pre-existing score column so the merge cannot produce
    # score_x/score_y suffixed columns
    if "score" in queries_and_docs.columns:
        queries_and_docs = queries_and_docs.drop(columns="score")
    merged = run_df.merge(queries_and_docs, on=["qid", "docno"])
    return add_ranks(merged)
def test_rank_zero_query(self):
    """add_ranks on an empty frame still produces a rank column."""
    empty = pd.DataFrame([], columns=["qid", "docno", "score"])
    ranked = add_ranks(empty)
    self.assertTrue("rank" in ranked.columns)