def test_feature_union(self):
    """Check that a feature union stacks the scores of its two operands.

    Two single-row mock transformers (scores 10 and 50) are combined both by
    instantiating ``ptt.FeatureUnionPipeline`` directly and with the ``**``
    operator. In either case the single returned row must keep the
    ``qid``/``docno``/``score`` columns and carry a ``features`` array equal
    to ``[10, 50]``.
    """
    import numpy as np

    columns = ["qid", "docno", "score"]
    mock_input = ptt.UniformTransformer(
        pd.DataFrame([["q1", "doc1", 5]], columns=columns))
    mock_f1 = ptt.UniformTransformer(
        pd.DataFrame([["q1", "doc1", 10]], columns=columns))
    mock_f2 = ptt.UniformTransformer(
        pd.DataFrame([["q1", "doc1", 50]], columns=columns))

    # Test direct instantiation as well as the ** operator.
    pipelines = [
        mock_input >> ptt.FeatureUnionPipeline(mock_f1, mock_f2),
        mock_input >> mock_f1**mock_f2,
    ]
    for pipeline in pipelines:
        # No input is needed: the uniform mock transformers return their
        # dataframe anyway.
        rtr = pipeline.transform(None)
        self.assertEqual(1, len(rtr))

        # assertIn reports the actual columns/values when the check fails.
        for column in ("qid", "docno", "score", "features"):
            self.assertIn(column, rtr.columns)
        self.assertIn("q1", rtr["qid"].values)
        self.assertIn("doc1", rtr["docno"].values)

        # One feature per operand, in operand order.
        self.assertTrue(
            np.array_equal(np.array([10, 50]), rtr.iloc[0]["features"]))
def test_feature_union(self):
    """Check a two-operand feature union built from ``from_df`` mocks.

    The mocks carry an extra ``query`` column. Besides verifying the
    ``features`` array (``[10, 50]``), the test makes sure that no suffixed
    duplicate columns (``*_x``, ``*_y``, ``*_r``) appear in the result. Both
    direct instantiation of ``ptt.FeatureUnionPipeline`` and the ``**``
    operator are exercised.
    """
    import numpy as np
    import pyterrier.transformer as ptt

    def _uniform(score):
        # Single-row mock transformer whose only varying field is the score.
        frame = pd.DataFrame(
            [["q1", "a query", "doc1", score]],
            columns=["qid", "query", "docno", "score"])
        return pt.Transformer.from_df(frame, uniform=True)

    mock_input = _uniform(5)
    mock_f1 = _uniform(10)
    mock_f2 = _uniform(50)

    # NOTE(review): "query_R" looks like a typo for "query_r" (the sibling
    # entries use a lower-case "_r"); kept unchanged until confirmed.
    bad_columns = (
        "rank_x", "rank_y", "rank_r",
        "query_x", "query_y", "query_R",
        "score_x", "score_y", "score_r",
        "features_x", "features_y",
    )

    def _test_expression(pipeline):
        # Check access to the objects: the composed pipeline has two stages
        # and its second stage (the feature union) has two operands.
        self.assertEqual(2, len(pipeline))
        self.assertEqual(2, len(pipeline[1]))

        # No input is needed: both uniform transformers return anyway.
        rtr = pipeline.transform(None)
        self.assertEqual(1, len(rtr))

        # NOTE(review): the original has the "score" column check disabled;
        # it is left out here as well.
        for column in ("qid", "docno", "features"):
            self.assertIn(column, rtr.columns)
        for bad in bad_columns:
            self.assertNotIn(
                bad, rtr.columns, "column %s in returned dataframe" % bad)

        self.assertIn("q1", rtr["qid"].values)
        self.assertIn("doc1", rtr["docno"].values)
        self.assertTrue(
            np.array_equal(np.array([10, 50]), rtr.iloc[0]["features"]))

    # Test direct instantiation as well as the ** operator.
    _test_expression(
        mock_input >> ptt.FeatureUnionPipeline(mock_f1, mock_f2))
    _test_expression(mock_input >> mock_f1**mock_f2)
def test_feature_union(self):
    """Check a two-operand feature union and the shape of its pipeline.

    Two single-row ``ptt.UniformTransformer`` mocks (scores 10 and 50) are
    unioned behind a mock input stage. The composed pipeline must expose two
    stages, the union two operands, and the result a single ``features``
    column equal to ``[10, 50]`` with no ``features_x``/``features_y``
    leftovers.
    """
    import numpy as np

    def _uniform(score):
        # Single-row mock transformer whose only varying field is the score.
        return ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", score]],
                         columns=["qid", "docno", "score"]))

    mock_input = _uniform(5)
    mock_f1 = _uniform(10)
    mock_f2 = _uniform(50)

    def _test_expression(pipeline):
        # Check access to the objects.
        self.assertEqual(2, len(pipeline))
        self.assertEqual(2, len(pipeline[1]))

        # No input is needed: both uniform transformers return anyway.
        rtr = pipeline.transform(None)
        self.assertEqual(1, len(rtr))

        # NOTE(review): the original has the "score" column check disabled;
        # it is left out here as well.
        for column in ("qid", "docno", "features"):
            self.assertIn(column, rtr.columns)
        # Presumably leftovers of an internal merge; they must not leak out.
        for suffixed in ("features_x", "features_y"):
            self.assertNotIn(suffixed, rtr.columns)

        self.assertIn("q1", rtr["qid"].values)
        self.assertIn("doc1", rtr["docno"].values)
        self.assertTrue(
            np.array_equal(np.array([10, 50]), rtr.iloc[0]["features"]))

    # Test direct instantiation as well as the ** operator.
    _test_expression(
        mock_input >> ptt.FeatureUnionPipeline(mock_f1, mock_f2))
    _test_expression(mock_input >> mock_f1**mock_f2)
def test_feature_union_multi(self):
    """Check three-operand feature unions, however they are nested.

    Three two-row mock transformers (doc1 scores 5, 10 and 15) are unioned
    via the ``**`` operator, via explicit nested ``FeatureUnionPipeline``
    instances and via the compiled (flattened) form; each must yield
    ``[5, 10, 15]`` as the features of the first row.

    Unions containing an operand that returns no rows must raise
    ``ValueError``; unions containing an operand that returns only some of
    the rows must still produce the expected features but emit a warning
    containing "Got number of results".
    """
    import numpy as np
    import pyterrier.transformer as ptt

    def _uniform(rows):
        # Mock transformer built from a fixed qid/docno/score dataframe.
        frame = pd.DataFrame(rows, columns=["qid", "docno", "score"])
        return pt.Transformer.from_df(frame, uniform=True)

    mock0 = _uniform([["q1", "doc1", 0], ["q1", "doc2", 0]])
    mock1 = _uniform([["q1", "doc1", 5], ["q1", "doc2", 0]])
    mock2 = _uniform([["q1", "doc1", 10], ["q1", "doc2", 0]])
    mock3 = _uniform([["q1", "doc1", 15], ["q1", "doc2", 0]])
    mock3_empty = _uniform([])
    mock2_partial = _uniform([["q1", "doc1", 10]])
    mock3_partial = _uniform([["q1", "doc1", 15]])

    # Unions built with the ** operator.
    mock12a = mock1**mock2
    mock123a = mock1**mock2**mock3
    mock123b = mock12a**mock3

    # The same unions spelled out by hand, nested on either side.
    mock123a_manual = ptt.FeatureUnionPipeline(
        ptt.FeatureUnionPipeline(mock1, mock2),
        mock3,
    )
    mock123b_manual = ptt.FeatureUnionPipeline(
        mock1,
        ptt.FeatureUnionPipeline(mock2, mock3),
    )

    # Unions with an operand that returns no rows at all.
    mock123e = ptt.FeatureUnionPipeline(
        mock1,
        ptt.FeatureUnionPipeline(mock2, mock3_empty),
    )
    mock12e3 = ptt.FeatureUnionPipeline(
        mock1,
        ptt.FeatureUnionPipeline(mock3_empty, mock3),
    )

    # Unions with an operand that returns only one of the two rows.
    mock123p = ptt.FeatureUnionPipeline(
        mock1,
        ptt.FeatureUnionPipeline(mock2, mock3_partial),
    )
    mock12p3 = ptt.FeatureUnionPipeline(
        mock1,
        ptt.FeatureUnionPipeline(mock2_partial, mock3),
    )

    self.assertEqual(2, len(mock12a.models))

    # Compiling should flatten the nested union into one three-way union.
    ptt.setup_rewrites()
    mock123_simple = mock123a.compile()
    self.assertIsNotNone(mock123_simple)
    self.assertEqual(
        "FeatureUnionPipeline(UniformTransformer(), UniformTransformer(), UniformTransformer())",
        repr(mock123_simple))
    self.assertEqual(3, len(mock123_simple.models))

    def _test_expression(expression):
        # No input is needed: the uniform transformers return anyway.
        rtr = (mock0 >> expression).transform(None)
        self.assertIsNotNone(rtr)
        self.assertEqual(2, len(rtr))
        for column in ("qid", "docno", "features"):
            self.assertIn(column, rtr.columns)
        for suffixed in ("features_x", "features_y"):
            self.assertNotIn(suffixed, rtr.columns)
        self.assertIn("q1", rtr["qid"].values)
        self.assertIn("doc1", rtr["docno"].values)
        self.assertTrue(
            np.allclose(np.array([5, 10, 15]), rtr.iloc[0]["features"]))

    # Every spelling of the three-way union must give the same features;
    # this includes the hand-built nestings, not just the operator forms.
    for expression in (mock123_simple, mock123a, mock123b,
                       mock123a_manual, mock123b_manual):
        _test_expression(expression)

    # An operand with no results at all is an error.
    for expression in (mock123e, mock12e3):
        with self.assertRaises(ValueError):
            _test_expression(expression)

    # An operand with partial results still works but must warn.
    # assertWarnsRegex records every warning and fails cleanly when none
    # matches, regardless of which warning happens to be emitted last.
    for expression in (mock123p, mock12p3):
        with self.assertWarnsRegex(Warning, "Got number of results"):
            _test_expression(expression)