def test_intersect(self):
    """Check that ``&`` keeps only the documents returned by both sides.

    The left transformer returns ``doc1``; the right one returns ``doc1``
    and ``doc2`` with a different query text. The result must contain the
    single shared row, carry the left-hand query text, keep the ``body``
    column and contain neither ``rank`` nor ``score``.
    """
    import pyterrier.transformer as ptt

    columns = ["qid", "query", "docno", "score", "body"]
    left = ptt.UniformTransformer(
        pd.DataFrame(
            [["q1", "q1texta", "doc1", 5, "body text"]],
            columns=columns))
    right = ptt.UniformTransformer(
        pd.DataFrame(
            [["q1", "q1textb", "doc2", 10, "body text"],
             ["q1", "q1textb", "doc1", 10, "body text"]],
            columns=columns))

    # No input is needed: both mock transformers return their frame anyway.
    result = (left & right).transform(None)

    self.assertEqual(1, len(result))
    self.assertIn("q1", result["qid"].values)
    docnos = result["docno"].values
    self.assertIn("doc1", docnos)
    self.assertNotIn("doc2", docnos)

    # When both sides disagree on the query text for the same
    # (qid, docno), the left-hand value is the one that is kept.
    queries = result["query"].values
    self.assertIn("q1texta", queries)
    self.assertNotIn("q1textb", queries)

    for col in ("qid", "query", "docno", "body"):
        self.assertIn(col, result.columns, "%s not found in cols" % col)
    for col in ("rank", "score"):
        self.assertNotIn(col, result.columns, "%s found in cols" % col)
def test_plus_more_cols(self):
    """Check ``+`` on results that carry extra columns (``query``, ranks).

    Both inputs score ``doc1`` for ``q1`` (5 and 10); the sum must be a
    single row scoring 15, and no suffixed duplicate column such as
    ``rank_x`` or ``query_y`` may appear in the output.
    """
    import pyterrier.transformer as ptt
    from pyterrier.model import add_ranks

    columns = ["qid", "query", "docno", "score"]
    first = ptt.UniformTransformer(
        add_ranks(
            pd.DataFrame([["q1", "a query", "doc1", 5]], columns=columns)))
    second = ptt.UniformTransformer(
        add_ranks(
            pd.DataFrame([["q1", "a query", "doc1", 10]], columns=columns)))

    # No input is needed: both mock transformers return their frame anyway.
    total = (first + second).transform(None)

    self.assertEqual(1, len(total))
    top = total.iloc[0]
    self.assertEqual("q1", top["qid"])
    self.assertEqual("doc1", top["docno"])
    self.assertEqual(15, top["score"])

    bad_columns = [
        "rank_x", "rank_y", "rank_r",
        "query_x", "query_y", "query_R",
        "score_x", "score_y", "score_r",
    ]
    for bad in bad_columns:
        self.assertNotIn(
            bad, total.columns, "column %s in returned dataframe" % bad)
def test_concatenate(self):
    """Check the order and scores produced by ``^`` (concatenate).

    The first input returns ``d2`` and ``d3``, the second ``d1`` and
    ``d3``. The result must hold three rows: ``d3`` (5.1) and ``d2``
    (4.9) from the first input, then ``d1`` with a score of
    ``4.9 - 0.0001``. Each row also carries a numpy array in the ``bla``
    column.
    """
    import numpy as np
    import pyterrier.transformer as ptt

    columns = ["qid", "docno", "rank", "score", "bla"]
    first = ptt.UniformTransformer(
        pd.DataFrame(
            [["q1", "d2", 2, 4.9, np.array([1, 2])],
             ["q1", "d3", 1, 5.1, np.array([1, 2])]],
            columns=columns))
    second = ptt.UniformTransformer(
        pd.DataFrame(
            [["q1", "d1", 1, 4.9, np.array([1, 1])],
             ["q1", "d3", 2, 5.1, np.array([1, 2])]],
            columns=columns))

    rtr = (first ^ second).transform(None)
    self.assertEqual(3, len(rtr))

    # Expected (docno, score) for each output position, in order.
    expected = [("d3", 5.1), ("d2", 4.9), ("d1", 4.9 - 0.0001)]
    for position, (docno, score) in enumerate(expected):
        row = rtr.iloc[position]
        self.assertEqual(docno, row["docno"])
        self.assertEqual(score, row["score"])
def test_mul(self):
    """Check scalar multiplication of a transformer's scores.

    First part: ``t * 10`` and ``10 * t`` must both turn a score of 5
    into 50. Second part: with ranked results, ``search`` returns
    ``doc2`` (score 10) at ``FIRST_RANK``; multiplying by ``-1`` must put
    ``doc1`` at ``FIRST_RANK`` instead.
    """
    import pyterrier.transformer as ptt
    from pyterrier.model import add_ranks

    columns = ["qid", "docno", "score"]

    single = ptt.UniformTransformer(
        pd.DataFrame([["q1", "doc1", 5]], columns=columns))
    for scaled in (single * 10, 10 * single):
        rtr = scaled.transform(None)
        self.assertEqual(1, len(rtr))
        top = rtr.iloc[0]
        self.assertEqual("q1", top["qid"])
        self.assertEqual("doc1", top["docno"])
        self.assertEqual(50, top["score"])

    ranked = ptt.UniformTransformer(
        add_ranks(
            pd.DataFrame(
                [["q1", "doc1", 5], ["q1", "doc2", 10]],
                columns=columns)))

    def check_top(pipeline, expected_docno):
        # Run one search and verify which document comes first.
        found = pipeline.search("bla", qid="q1")
        self.assertEqual(2, len(found))
        head = found.iloc[0]
        self.assertEqual("q1", head["qid"])
        self.assertEqual(expected_docno, head["docno"])
        self.assertEqual(pt.model.FIRST_RANK, head["rank"])

    check_top(ranked, "doc2")
    check_top(-1 * ranked, "doc1")
def test_feature_union(self):
    """Check that a feature union collects scores into ``features``.

    The union of two scorers (10 and 50 for ``doc1``) is built both by
    instantiating ``FeatureUnionPipeline`` directly and with the ``**``
    operator. Either form must yield one row whose ``features`` entry
    equals ``[10, 50]``.
    """
    import numpy as np

    columns = ["qid", "docno", "score"]

    def uniform(score):
        # One-row result for (q1, doc1) with the given score.
        return ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", score]], columns=columns))

    candidates = uniform(5)
    feat_a = uniform(10)
    feat_b = uniform(50)

    pipelines = [
        candidates >> ptt.FeatureUnionPipeline(feat_a, feat_b),
        candidates >> feat_a**feat_b,
    ]
    for pipeline in pipelines:
        # No input is needed: the mock transformers return their frame anyway.
        rtr = pipeline.transform(None)
        self.assertEqual(1, len(rtr))
        for col in ("qid", "docno", "score", "features"):
            self.assertIn(col, rtr.columns)
        self.assertIn("q1", rtr["qid"].values)
        self.assertIn("doc1", rtr["docno"].values)
        self.assertTrue(
            np.array_equal(np.array([10, 50]), rtr.iloc[0]["features"]))
def test_concatenate(self):
    """Check that ``^`` yields three rows for inputs sharing one document.

    The first input returns ``d2`` and ``d3``, the second ``d1`` and
    ``d3``; the concatenated result is expected to hold three rows.
    """
    columns = ["qid", "docno", "rank", "score"]
    first = ptt.UniformTransformer(
        pd.DataFrame(
            [["q1", "d2", 1, 4.9], ["q1", "d3", 2, 5.1]],
            columns=columns))
    second = ptt.UniformTransformer(
        pd.DataFrame(
            [["q1", "d1", 1, 4.9], ["q1", "d3", 2, 5.1]],
            columns=columns))

    merged = (first ^ second).transform(None)
    self.assertEqual(3, len(merged))
def test_plus(self):
    """Check that ``+`` sums the scores of a document found by both sides.

    Scores 5 and 10 for (``q1``, ``doc1``) must combine into a single row
    with score 15.
    """
    columns = ["qid", "docno", "score"]
    first = ptt.UniformTransformer(
        pd.DataFrame([["q1", "doc1", 5]], columns=columns))
    second = ptt.UniformTransformer(
        pd.DataFrame([["q1", "doc1", 10]], columns=columns))

    # No input is needed: both mock transformers return their frame anyway.
    total = (first + second).transform(None)

    self.assertEqual(1, len(total))
    top = total.iloc[0]
    self.assertEqual("q1", top["qid"])
    self.assertEqual("doc1", top["docno"])
    self.assertEqual(15, top["score"])
def test_intersect(self):
    """Check that ``&`` keeps only the document returned by both sides.

    The left input returns ``doc1``; the right input returns ``doc1`` and
    ``doc2``. Only ``doc1`` may remain in the result.
    """
    columns = ["qid", "docno", "score"]
    left = ptt.UniformTransformer(
        pd.DataFrame([["q1", "doc1", 5]], columns=columns))
    right = ptt.UniformTransformer(
        pd.DataFrame(
            [["q1", "doc2", 10], ["q1", "doc1", 10]],
            columns=columns))

    # No input is needed: both mock transformers return their frame anyway.
    shared = (left & right).transform(None)

    self.assertEqual(1, len(shared))
    self.assertIn("q1", shared["qid"].values)
    docnos = shared["docno"].values
    self.assertIn("doc1", docnos)
    self.assertNotIn("doc2", docnos)
def test_rank_cutoff(self):
    """Check that ``% 1`` reduces a two-row result to a single row."""
    two_docs = pd.DataFrame(
        [["q1", "d2", 1, 5.1], ["q1", "d3", 2, 5.1]],
        columns=["qid", "docno", "rank", "score"])
    source = ptt.UniformTransformer(two_docs)

    truncated = (source % 1).transform(None)
    self.assertEqual(1, len(truncated))
def test_differing_queries(self):
    """Check that ``Experiment`` warns when a system lacks some queries.

    Two topics (``q1``, ``q2``) are evaluated with ``baseline=0``. The
    first system only returns results for ``q1`` while the second covers
    both; a warning whose text contains ``"missing"`` is expected.
    """
    topics = pd.DataFrame(
        [["q1", "q1"], ["q2", "q1"]], columns=["qid", "query"])
    res1 = pd.DataFrame(
        [["q1", "d1", 1.0]], columns=["qid", "docno", "score"])
    res2 = pd.DataFrame(
        [["q1", "d1", 1.0], ["q2", "d1", 2.0]],
        columns=["qid", "docno", "score"])
    qrels = pd.DataFrame(
        [["q1", "d1", 1], ["q2", "d1", 1]],
        columns=["qid", "docno", "label"])

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning, even one an earlier test already triggered
        # from the same code location.
        warnings.simplefilter("always")
        pt.pipelines.Experiment(
            [ptt.UniformTransformer(res1), ptt.UniformTransformer(res2)],
            topics,
            qrels,
            ["map"],
            baseline=0)

    # unittest assertions are used instead of a bare ``assert`` so the
    # check still runs under ``python -O``. Scanning every recorded
    # warning avoids depending on the expected one being emitted last.
    messages = [str(item.message) for item in caught]
    self.assertTrue(messages, "Experiment did not emit any warning")
    self.assertTrue(
        any("missing" in text for text in messages),
        "no warning mentioning 'missing' in %r" % messages)
def test_feature_union_multi(self):
    """Check that a nested feature union compiles into one flat union.

    ``mock1 ** mock2 ** mock3`` is compiled after ``setup_rewrites()``;
    the result must be a single ``FeatureUnionPipeline`` holding three
    models, and applying it after ``mock0`` must give the feature vector
    ``[5, 10, 15]`` for the single (``q1``, ``doc1``) row.
    """
    import numpy as np

    columns = ["qid", "docno", "score"]

    def uniform(score):
        # One-row result for (q1, doc1) with the given score.
        return ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", score]], columns=columns))

    mock0 = uniform(0)
    mock1 = uniform(5)
    mock2 = uniform(10)
    mock3 = uniform(15)

    mock12a = mock1**mock2
    mock123a = mock1**mock2**mock3
    # Built to exercise ``**`` on an existing union; as in the original,
    # only the compiled form is evaluated below.
    mock123b = mock12a**mock3

    # The original asserted this twice; once is sufficient.
    self.assertEqual(2, len(mock12a.models))

    ptt.setup_rewrites()
    mock123_simple = mock123a.compile()
    self.assertIsNotNone(mock123_simple)
    self.assertEqual(
        "FeatureUnionPipeline(UniformTransformer(), UniformTransformer(), UniformTransformer())",
        mock123_simple.__repr__())
    self.assertEqual(3, len(mock123_simple.models))

    # No input is needed: the mock transformers return their frame anyway.
    rtr = (mock0 >> mock123_simple).transform(None)
    self.assertIsNotNone(rtr)
    self.assertEqual(1, len(rtr))
    for col in ("qid", "docno", "score", "features"):
        self.assertIn(col, rtr.columns)
    self.assertIn("q1", rtr["qid"].values)
    self.assertIn("doc1", rtr["docno"].values)
    self.assertTrue(
        np.array_equal(np.array([5, 10, 15]), rtr.iloc[0]["features"]))
def test_feature_union(self):
    """Check feature union output when inputs also carry a ``query`` column.

    The union of two scorers (10 and 50) is applied after an input
    transformer, once via ``FeatureUnionPipeline`` and once via ``**``.
    Each pipeline must have length 2 with a second stage of length 2,
    return one row with ``features == [10, 50]``, and must not leak
    suffixed duplicate columns such as ``query_x`` or ``features_y``.
    """
    import numpy as np
    import pyterrier.transformer as ptt

    columns = ["qid", "query", "docno", "score"]

    def uniform(score):
        # One-row result for (q1, doc1) with the given score.
        return ptt.UniformTransformer(
            pd.DataFrame(
                [["q1", "a query", "doc1", score]], columns=columns))

    mock_input = uniform(5)
    mock_f1 = uniform(10)
    mock_f2 = uniform(50)

    bad_columns = [
        "rank_x", "rank_y", "rank_r",
        "query_x", "query_y", "query_R",
        "score_x", "score_y", "score_r",
        "features_x", "features_y",
    ]

    def _test_expression(pipeline):
        # Check access to the composed objects via len() and indexing.
        self.assertEqual(2, len(pipeline))
        self.assertEqual(2, len(pipeline[1]))

        # No input is needed: the mock transformers return their frame anyway.
        rtr = pipeline.transform(None)
        self.assertEqual(1, len(rtr))
        self.assertIn("qid", rtr.columns)
        self.assertIn("docno", rtr.columns)
        # NOTE: the presence of a "score" column is deliberately not
        # asserted; that check was already disabled in the original test.
        self.assertIn("features", rtr.columns)

        for bad in bad_columns:
            self.assertNotIn(
                bad, rtr.columns, "column %s in returned dataframe" % bad)

        self.assertIn("q1", rtr["qid"].values)
        self.assertIn("doc1", rtr["docno"].values)
        self.assertTrue(
            np.array_equal(np.array([10, 50]), rtr.iloc[0]["features"]))

    # Test direct instantiation as well as the ** operator.
    _test_expression(
        mock_input >> ptt.FeatureUnionPipeline(mock_f1, mock_f2))
    _test_expression(mock_input >> mock_f1**mock_f2)
def test_mul(self):
    """Check that multiplying a transformer by 10 scales its scores.

    Both operand orders (``t * 10`` and ``10 * t``) must turn the single
    score of 5 into 50.
    """
    base = ptt.UniformTransformer(
        pd.DataFrame([["q1", "doc1", 5]], columns=["qid", "docno", "score"]))

    for scaled in (base * 10, 10 * base):
        rtr = scaled.transform(None)
        self.assertEqual(1, len(rtr))
        top = rtr.iloc[0]
        self.assertEqual("q1", top["qid"])
        self.assertEqual("doc1", top["docno"])
        self.assertEqual(50, top["score"])
def test_then_multi(self):
    """Check that chained ``>>`` pipelines compile into one flat pipeline.

    Three differently nested three-stage compositions are compiled; each
    must become a ``ComposedPipeline`` with three models. The two-stage
    pipelines they were built from must still report two models
    afterwards. Finally a four-stage chain is compiled and checked the
    same way.
    """
    import pyterrier.transformer as ptt

    columns = ["qid", "docno", "score"]

    def uniform(score):
        # One-row result for (q1, doc1) with the given score.
        return ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", score]], columns=columns))

    mock1 = uniform(5)
    mock2 = uniform(10)
    mock3 = uniform(10)
    mock4 = uniform(10)

    combined12 = mock1 >> mock2
    combined23 = mock2 >> mock3
    combined123_a = combined12 >> mock3
    combined123_b = mock1 >> mock2 >> mock3
    # NOTE(review): this starts with mock2, not mock1; the repr check
    # below cannot tell the difference. Kept as in the original.
    combined123_c = mock2 >> combined23

    compiled = [
        pipeline.compile()
        for pipeline in (combined123_a, combined123_b, combined123_c)
    ]

    # Checked after the compile() calls above. The original repeated this
    # pair of assertions verbatim; once is sufficient.
    self.assertEqual(2, len(combined12.models))
    self.assertEqual(2, len(combined23.models))

    expected_repr = (
        "ComposedPipeline(UniformTransformer(), UniformTransformer(), "
        "UniformTransformer())")
    for C in compiled:
        self.assertEqual(3, len(C.models))
        self.assertEqual(expected_repr, repr(C))

    # Finally check recursive application on a four-stage chain.
    C4 = (mock1 >> mock2 >> mock3 >> mock4).compile()
    self.assertEqual(
        "ComposedPipeline(UniformTransformer(), UniformTransformer(), "
        "UniformTransformer(), UniformTransformer())",
        repr(C4))
    self.assertEqual(4, len(C4.models))
def test_plus_multi_rewrite(self):
    """Check a three-way ``+`` both as written and after ``compile()``.

    Scores 5, 10 and 15 for (``q1``, ``doc1``) must sum to 30 in a single
    row, for the plain and the compiled pipeline alike.
    """
    columns = ["qid", "docno", "score"]

    def uniform(score):
        # One-row result for (q1, doc1) with the given score.
        return ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", score]], columns=columns))

    summed = uniform(5) + uniform(10) + uniform(15)

    for pipe in [summed, summed.compile()]:
        # No input is needed: the mock transformers return their frame anyway.
        rtr = pipe.transform(None)
        self.assertEqual(1, len(rtr))
        top = rtr.iloc[0]
        self.assertEqual("q1", top["qid"])
        self.assertEqual("doc1", top["docno"])
        self.assertEqual(30, top["score"])
def test_maxmin_normalisation(self):
    """Check the scores produced by ``PerQueryMaxMinScoreTransformer``.

    Expected results, per query:
    - ``q1`` (scores 10 and 2): 1 for ``doc1`` and 0 for ``doc2``;
    - ``q2`` (a single document): 0;
    - ``q3`` (two documents, both scoring 0): 0 for each.
    """
    df = pd.DataFrame(
        [
            ["q1", "doc1", 10],
            ["q1", "doc2", 2],
            ["q2", "doc1", 1],
            ["q3", "doc1", 0],
            ["q3", "doc2", 0],
        ],
        columns=["qid", "docno", "score"])
    pipe = ptt.UniformTransformer(df) >> ptp.PerQueryMaxMinScoreTransformer()

    rtr = pipe.transform(None)
    for col in ("qid", "docno", "score"):
        self.assertIn(col, rtr.columns)

    # Map (qid, docno) -> normalised score for direct lookup.
    # (The debugging print of this mapping has been removed.)
    scores = rtr.set_index(["qid", "docno"]).to_dict()["score"]

    expected = {
        ("q1", "doc1"): 1,
        ("q1", "doc2"): 0,
        ("q2", "doc1"): 0,
        ("q3", "doc1"): 0,
        ("q3", "doc2"): 0,
    }
    for key, value in expected.items():
        self.assertEqual(value, scores[key])
def test_feature_union_multi(self):
    """Check nested feature unions, including empty and partial inputs.

    Covered cases:
    - three-way unions built with ``**`` (both nestings), built manually
      from ``FeatureUnionPipeline``, and the compiled flat form: each
      must give ``features == [5, 10, 15]`` for the first of two rows;
    - a union containing a feature transformer with no results: a
      ``ValueError`` is expected;
    - a union containing a feature transformer that returns only one of
      the two documents: the result is still checked, and a warning
      containing ``"Got number of results"`` is expected.
    """
    import numpy as np
    import pyterrier.transformer as ptt

    columns = ["qid", "docno", "score"]

    def uniform(rows):
        # Transformer that always returns the given rows.
        return ptt.UniformTransformer(pd.DataFrame(rows, columns=columns))

    mock0 = uniform([["q1", "doc1", 0], ["q1", "doc2", 0]])
    mock1 = uniform([["q1", "doc1", 5], ["q1", "doc2", 0]])
    mock2 = uniform([["q1", "doc1", 10], ["q1", "doc2", 0]])
    mock3 = uniform([["q1", "doc1", 15], ["q1", "doc2", 0]])
    mock3_empty = uniform([])
    mock2_partial = uniform([["q1", "doc1", 10]])
    mock3_partial = uniform([["q1", "doc1", 15]])

    mock12a = mock1**mock2
    mock123a = mock1**mock2**mock3
    mock123b = mock12a**mock3

    mock123a_manual = ptt.FeatureUnionPipeline(
        ptt.FeatureUnionPipeline(mock1, mock2), mock3)
    mock123b_manual = ptt.FeatureUnionPipeline(
        mock1, ptt.FeatureUnionPipeline(mock2, mock3))
    mock123e = ptt.FeatureUnionPipeline(
        mock1, ptt.FeatureUnionPipeline(mock2, mock3_empty))
    mock12e3 = ptt.FeatureUnionPipeline(
        mock1, ptt.FeatureUnionPipeline(mock3_empty, mock3))
    mock123p = ptt.FeatureUnionPipeline(
        mock1, ptt.FeatureUnionPipeline(mock2, mock3_partial))
    mock12p3 = ptt.FeatureUnionPipeline(
        mock1, ptt.FeatureUnionPipeline(mock2_partial, mock3))

    # The original asserted this twice; once is sufficient.
    self.assertEqual(2, len(mock12a.models))

    ptt.setup_rewrites()
    mock123_simple = mock123a.compile()
    self.assertIsNotNone(mock123_simple)
    self.assertEqual(
        "FeatureUnionPipeline(UniformTransformer(), UniformTransformer(), UniformTransformer())",
        mock123_simple.__repr__())
    self.assertEqual(3, len(mock123_simple.models))

    def _test_expression(expression):
        # No input is needed: the mock transformers return their frame anyway.
        rtr = (mock0 >> expression).transform(None)
        self.assertIsNotNone(rtr)
        self.assertEqual(2, len(rtr))
        self.assertIn("qid", rtr.columns)
        self.assertIn("docno", rtr.columns)
        self.assertNotIn("features_x", rtr.columns)
        self.assertNotIn("features_y", rtr.columns)
        self.assertIn("features", rtr.columns)
        self.assertIn("q1", rtr["qid"].values)
        self.assertIn("doc1", rtr["docno"].values)
        self.assertTrue(
            np.allclose(np.array([5, 10, 15]), rtr.iloc[0]["features"]))

    def _assert_warns_result_count(expression):
        # Run the usual checks and require the result-count warning.
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning, even one already raised from the
            # same code location by the previous call.
            warnings.simplefilter("always")
            _test_expression(expression)
        messages = [str(item.message) for item in caught]
        self.assertTrue(
            any("Got number of results" in text for text in messages),
            "expected a 'Got number of results' warning, got %r" % messages)

    # The original checked mock123b twice and never evaluated the two
    # manually built unions; they are now checked instead of the repeat.
    for expression in (mock123_simple, mock123a, mock123b,
                       mock123a_manual, mock123b_manual):
        _test_expression(expression)

    for expression in (mock123e, mock12e3):
        with self.assertRaises(ValueError):
            _test_expression(expression)

    for expression in (mock123p, mock12p3):
        _assert_warns_result_count(expression)