Beispiel #1
0
    def test_feature_union(self):
        """Check that a feature union exposes both scorers' scores as features.

        Three uniform transformers are built, each wrapping a fixed one-row
        frame for (q1, doc1) with scores 5, 10 and 50.  The first one feeds
        the union of the other two, and the single row that comes back is
        expected to carry ``[10, 50]`` in its ``features`` column.  The union
        is built both with ``FeatureUnionPipeline`` directly and with the
        ``**`` operator.
        """
        # Imported once here instead of on every pass through the loop below.
        import numpy as np

        columns = ["qid", "docno", "score"]

        def _uniform(score):
            # Uniform transformer over a one-row frame for (q1, doc1).
            return ptt.UniformTransformer(
                pd.DataFrame([["q1", "doc1", score]], columns=columns))

        mock_input = _uniform(5)
        mock_f1 = _uniform(10)
        mock_f2 = _uniform(50)

        # test using direct instantiation, as well as using the ** operator
        for pipeline in [
                mock_input >> ptt.FeatureUnionPipeline(mock_f1, mock_f2),
                mock_input >> mock_f1**mock_f2
        ]:
            # No input is needed: the uniform transformers return their
            # fixed frames anyway.
            rtr = pipeline.transform(None)
            self.assertEqual(1, len(rtr))

            # assertIn names the missing member and the container on failure,
            # which a bare assertTrue(x in y) does not.
            self.assertIn("qid", rtr.columns)
            self.assertIn("docno", rtr.columns)
            self.assertIn("score", rtr.columns)
            self.assertIn("features", rtr.columns)
            self.assertIn("q1", rtr["qid"].values)
            self.assertIn("doc1", rtr["docno"].values)

            # Feature order follows the order of the union's members.
            self.assertTrue(
                np.array_equal(np.array([10, 50]), rtr.iloc[0]["features"]))
Beispiel #2
0
    def test_feature_union(self):
        """Check a two-scorer feature union on frames carrying a ``query`` column.

        Three uniform transformers are created with ``pt.Transformer.from_df``
        from fixed one-row frames for (q1, "a query", doc1) with scores 5, 10
        and 50.  The first feeds the union of the other two.  The result must
        be a single row with ``[10, 50]`` in ``features`` and must not contain
        any suffixed leftover column from the names listed in ``bad_columns``.
        Both the explicit ``FeatureUnionPipeline`` constructor and the ``**``
        operator are exercised.
        """
        # Hoisted out of the helper, which is called once per pipeline.
        import numpy as np
        import pyterrier.transformer as ptt

        columns = ["qid", "query", "docno", "score"]

        def _uniform(score):
            # Uniform transformer over a one-row frame with the given score.
            return pt.Transformer.from_df(
                pd.DataFrame([["q1", "a query", "doc1", score]],
                             columns=columns),
                uniform=True)

        mock_input = _uniform(5)
        mock_f1 = _uniform(10)
        mock_f2 = _uniform(50)

        # Suffixed column names that must never show up in the result.
        # NOTE(review): "query_R" is capitalised unlike the other "_r"
        # entries; possibly a typo for "query_r" - confirm before changing.
        bad_columns = [
            "rank_x", "rank_y", "rank_r", "query_x", "query_y", "query_R",
            "score_x", "score_y", "score_r", "features_x", "features_y"
        ]

        def _test_expression(pipeline):
            # check access to the objects
            self.assertEqual(2, len(pipeline))
            self.assertEqual(2, len(pipeline[1]))

            # No input is needed: the uniform transformers return their
            # fixed frames anyway.
            rtr = pipeline.transform(None)
            self.assertEqual(1, len(rtr))
            self.assertIn("qid", rtr.columns)
            self.assertIn("docno", rtr.columns)
            # "score" is deliberately not asserted: the check for it was
            # already disabled in this test.
            self.assertIn("features", rtr.columns)

            for bad in bad_columns:
                self.assertNotIn(bad, rtr.columns,
                                 "column %s in returned dataframe" % bad)

            self.assertIn("q1", rtr["qid"].values)
            self.assertIn("doc1", rtr["docno"].values)
            # Feature order follows the order of the union's members.
            self.assertTrue(
                np.array_equal(np.array([10, 50]), rtr.iloc[0]["features"]))

        # test using direct instantiation, as well as using the ** operator
        _test_expression(
            mock_input >> ptt.FeatureUnionPipeline(mock_f1, mock_f2))
        _test_expression(mock_input >> mock_f1**mock_f2)
Beispiel #3
0
    def test_feature_union(self):
        """Verify that a union of two uniform scorers yields a ``features`` column.

        A uniform transformer with score 5 feeds a feature union of two
        further uniform transformers scoring 10 and 50 for the same
        (q1, doc1) pair.  The union is built once through
        ``FeatureUnionPipeline`` and once through the ``**`` operator, and
        both forms must give the same single-row result.
        """
        frame_columns = ["qid", "docno", "score"]

        def _fixed(score):
            # Uniform transformer over a single (q1, doc1) row with this score.
            row = ["q1", "doc1", score]
            return ptt.UniformTransformer(
                pd.DataFrame([row], columns=frame_columns))

        source = _fixed(5)
        first_feature = _fixed(10)
        second_feature = _fixed(50)

        def _check(candidate):
            # check access to the objects: the pipeline and its second stage
            # are each expected to hold two members
            self.assertEqual(2, len(candidate))
            self.assertEqual(2, len(candidate[1]))

            # no input is required, the uniform transformers return anyway
            result = candidate.transform(None)
            self.assertEqual(1, len(result))

            # "score" is not checked; the assertion for it was disabled
            for wanted in ("qid", "docno", "features"):
                self.assertTrue(wanted in result.columns)
            for leftover in ("features_x", "features_y"):
                self.assertFalse(leftover in result.columns)

            self.assertTrue("q1" in result["qid"].values)
            self.assertTrue("doc1" in result["docno"].values)

            import numpy as np
            expected = np.array([10, 50])
            observed = result.iloc[0]["features"]
            self.assertTrue(np.array_equal(expected, observed))

        # direct instantiation first, then the ** operator
        union_direct = ptt.FeatureUnionPipeline(first_feature, second_feature)
        _check(source >> union_direct)
        _check(source >> first_feature**second_feature)
Beispiel #4
0
    def test_feature_union_multi(self):
        """Check three-way feature unions across different nestings and inputs.

        Uniform transformers with fixed two-row frames (q1/doc1, q1/doc2)
        serve as candidate source (``mock0``) and scorers (``mock1`` to
        ``mock3``, scoring doc1 with 5, 10 and 15).  The test covers:

        * unions built with the ``**`` operator and with nested
          ``FeatureUnionPipeline`` constructors, plus the pipeline returned
          by ``compile()`` after ``ptt.setup_rewrites()``, which must have
          three members;
        * every such union giving ``[5, 10, 15]`` as features of the first
          returned row;
        * a member that returns an empty frame raising ``ValueError``;
        * a member that returns only one of the two rows emitting a warning
          containing "Got number of results".
        """
        # Hoisted out of the helper, which runs once per expression.
        import numpy as np
        import pyterrier.transformer as ptt

        columns = ["qid", "docno", "score"]

        def _uniform(rows):
            # Uniform transformer over a fixed frame built from `rows`.
            return pt.Transformer.from_df(
                pd.DataFrame(rows, columns=columns), uniform=True)

        mock0 = _uniform([["q1", "doc1", 0], ["q1", "doc2", 0]])
        mock1 = _uniform([["q1", "doc1", 5], ["q1", "doc2", 0]])
        mock2 = _uniform([["q1", "doc1", 10], ["q1", "doc2", 0]])
        mock3 = _uniform([["q1", "doc1", 15], ["q1", "doc2", 0]])

        # Scorers that return no rows, or only the doc1 row.
        mock3_empty = _uniform([])
        mock2_partial = _uniform([["q1", "doc1", 10]])
        mock3_partial = _uniform([["q1", "doc1", 15]])

        mock12a = mock1**mock2
        # ** is right-associative in Python: this is mock1 ** (mock2 ** mock3).
        mock123a = mock1**mock2**mock3
        mock123b = mock12a**mock3
        mock123a_manual = ptt.FeatureUnionPipeline(
            ptt.FeatureUnionPipeline(mock1, mock2), mock3)
        mock123b_manual = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock2, mock3),
        )
        mock123e = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock2, mock3_empty),
        )
        mock12e3 = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock3_empty, mock3),
        )
        mock123p = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock2, mock3_partial),
        )
        mock12p3 = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock2_partial, mock3),
        )

        self.assertEqual(2, len(mock12a.models))
        ptt.setup_rewrites()

        # After compilation the nested union is expected to be flat.
        mock123_simple = mock123a.compile()
        self.assertIsNotNone(mock123_simple)
        self.assertEqual(
            "FeatureUnionPipeline(UniformTransformer(), UniformTransformer(), UniformTransformer())",
            repr(mock123_simple))
        self.assertEqual(3, len(mock123_simple.models))

        def _test_expression(expression):
            # No input is needed: the uniform transformers return their
            # fixed frames anyway.
            rtr = (mock0 >> expression).transform(None)
            self.assertIsNotNone(rtr)
            self.assertEqual(2, len(rtr))
            self.assertIn("qid", rtr.columns)
            self.assertIn("docno", rtr.columns)
            self.assertNotIn("features_x", rtr.columns)
            self.assertNotIn("features_y", rtr.columns)
            self.assertIn("features", rtr.columns)
            self.assertIn("q1", rtr["qid"].values)
            self.assertIn("doc1", rtr["docno"].values)
            self.assertTrue(
                np.allclose(np.array([5, 10, 15]), rtr.iloc[0]["features"]))

        # The manually nested unions were previously built but never run
        # (mock123b was checked twice instead); every variant is checked once.
        for expression in (mock123_simple, mock123a, mock123b,
                           mock123a_manual, mock123b_manual):
            _test_expression(expression)

        # A member with no results at all is an error.
        with self.assertRaises(ValueError):
            _test_expression(mock123e)
        with self.assertRaises(ValueError):
            _test_expression(mock12e3)

        # A member with only part of the results still works but warns.
        for expression in (mock123p, mock12p3):
            with warnings.catch_warnings(record=True) as caught:
                # Record every warning regardless of the active filters.
                warnings.simplefilter("always")
                _test_expression(expression)
            messages = [str(item.message) for item in caught]
            # unittest assertions are not stripped under -O, and an empty
            # record gives a test failure rather than an IndexError.
            self.assertTrue(
                any("Got number of results" in text for text in messages),
                "expected a 'Got number of results' warning, got %r"
                % messages)