def test_intersect(self):
        # The & operator is expected to keep only documents returned by both
        # operands, to take non-key columns such as "query" from the left
        # operand, and to leave "rank"/"score" out of the result.
        import pyterrier.transformer as ptt
        column_names = ["qid", "query", "docno", "score", "body"]
        left = ptt.UniformTransformer(
            pd.DataFrame([["q1", "q1texta", "doc1", 5, "body text"]],
                         columns=column_names))
        right = ptt.UniformTransformer(
            pd.DataFrame(
                [["q1", "q1textb", "doc2", 10, "body text"],
                 ["q1", "q1textb", "doc1", 10, "body text"]],
                columns=column_names))

        # Both operands wrap fixed frames, so no input frame is supplied.
        result = (left & right).transform(None)

        self.assertEqual(1, len(result))
        self.assertIn("q1", result["qid"].values)
        returned_docnos = result["docno"].values
        self.assertIn("doc1", returned_docnos)
        self.assertNotIn("doc2", returned_docnos)

        # When the two sides disagree on the query text for the same
        # (qid, docno), the left-hand value is the one expected to survive.
        returned_queries = result["query"].values
        self.assertIn("q1texta", returned_queries)
        self.assertNotIn("q1textb", returned_queries)

        for expected in ("qid", "query", "docno", "body"):
            self.assertIn(expected, result.columns,
                          "%s not found in cols" % expected)

        for unexpected in ("rank", "score"):
            self.assertNotIn(unexpected, result.columns,
                             "%s found in cols" % unexpected)
Example #2
0
    def test_plus_more_cols(self):
        # Adding two ranked single-row results is expected to sum their
        # scores without leaving merge-suffixed helper columns behind.
        import pyterrier.transformer as ptt
        from pyterrier.model import add_ranks

        def ranked_uniform(score):
            # One q1/doc1 row with the given score, passed through add_ranks.
            frame = pd.DataFrame(
                [["q1", "a query", "doc1", score]],
                columns=["qid", "query", "docno", "score"])
            return ptt.UniformTransformer(add_ranks(frame))

        summed = ranked_uniform(5) + ranked_uniform(10)
        # Both operands wrap fixed frames, so no input frame is supplied.
        result = summed.transform(None)

        self.assertEqual(1, len(result))
        top_row = result.iloc[0]
        self.assertEqual("q1", top_row["qid"])
        self.assertEqual("doc1", top_row["docno"])
        self.assertEqual(15, top_row["score"])

        leftover_columns = (
            "rank_x", "rank_y", "rank_r",
            "query_x", "query_y", "query_R",
            "score_x", "score_y", "score_r",
        )
        for name in leftover_columns:
            self.assertNotIn(name, result.columns,
                             "column %s in returned dataframe" % name)
    def test_concatenate(self):
        # Checks the ^ operator on frames that also carry an array-valued
        # column ("bla"). The expected outcome is three rows: the two left
        # rows ordered by score, followed by the right-only document "d1"
        # with a score just below the lowest left score.
        import numpy as np
        import pyterrier.transformer as ptt
        columns = ["qid", "docno", "rank", "score", "bla"]
        mock1 = ptt.UniformTransformer(
            pd.DataFrame(
                [["q1", "d2", 2, 4.9, np.array([1, 2])],
                 ["q1", "d3", 1, 5.1, np.array([1, 2])]],
                columns=columns))
        mock2 = ptt.UniformTransformer(
            pd.DataFrame(
                [["q1", "d1", 1, 4.9, np.array([1, 1])],
                 ["q1", "d3", 2, 5.1, np.array([1, 2])]],
                columns=columns))

        cutpipe = mock1 ^ mock2
        rtr = cutpipe.transform(None)
        self.assertEqual(3, len(rtr))

        # Scores copied unchanged from the left operand are compared exactly.
        row0 = rtr.iloc[0]
        self.assertEqual("d3", row0["docno"])
        self.assertEqual(5.1, row0["score"])
        row1 = rtr.iloc[1]
        self.assertEqual("d2", row1["docno"])
        self.assertEqual(4.9, row1["score"])

        # The appended row's score is the result of floating-point arithmetic,
        # so compare with a tolerance rather than bit-for-bit: an equivalent
        # but differently ordered computation could differ in the last ulp.
        row2 = rtr.iloc[2]
        self.assertEqual("d1", row2["docno"])
        self.assertAlmostEqual(4.9 - 0.0001, row2["score"])
Example #4
0
    def test_mul(self):
        # Scalar multiplication is expected to scale scores from either side;
        # with a negative factor, the other document is expected to come first.
        import pyterrier.transformer as ptt
        from pyterrier.model import add_ranks

        single_doc = ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", 5]],
                         columns=["qid", "docno", "score"]))
        for scaled in (single_doc * 10, 10 * single_doc):
            result = scaled.transform(None)
            self.assertEqual(1, len(result))
            only_row = result.iloc[0]
            self.assertEqual("q1", only_row["qid"])
            self.assertEqual("doc1", only_row["docno"])
            self.assertEqual(50, only_row["score"])

        two_docs = ptt.UniformTransformer(
            add_ranks(
                pd.DataFrame([["q1", "doc1", 5], ["q1", "doc2", 10]],
                             columns=["qid", "docno", "score"])))

        def check_top(pipeline, expected_docno):
            # Run a search and verify which document sits in the first row.
            found = pipeline.search("bla", qid="q1")
            self.assertEqual(2, len(found))
            top = found.iloc[0]
            self.assertEqual("q1", top["qid"])
            self.assertEqual(expected_docno, top["docno"])
            self.assertEqual(pt.model.FIRST_RANK, top["rank"])

        check_top(two_docs, "doc2")
        check_top(-1 * two_docs, "doc1")
Example #5
0
    def test_feature_union(self):
        # A feature union of two scorers is expected to attach a "features"
        # array holding each scorer's score for the candidate document.
        import numpy as np

        def uniform(score):
            # Fixed single-row q1/doc1 result with the given score.
            return ptt.UniformTransformer(
                pd.DataFrame([["q1", "doc1", score]],
                             columns=["qid", "docno", "score"]))

        mock_input = uniform(5)
        mock_f1 = uniform(10)
        mock_f2 = uniform(50)

        # Build the union both by direct instantiation and via the ** operator.
        pipelines = [
            mock_input >> ptt.FeatureUnionPipeline(mock_f1, mock_f2),
            mock_input >> (mock_f1 ** mock_f2),
        ]
        for pipeline in pipelines:
            # Every stage wraps a fixed frame, so no input frame is supplied.
            result = pipeline.transform(None)
            self.assertEqual(1, len(result))
            for column in ("qid", "docno", "score", "features"):
                self.assertIn(column, result.columns)
            self.assertIn("q1", result["qid"].values)
            self.assertIn("doc1", result["docno"].values)
            features = result.iloc[0]["features"]
            self.assertTrue(np.array_equal(np.array([10, 50]), features))
Example #6
0
    def test_concatenate(self):
        # Concatenating two rankings that share one document ("d3") is
        # expected to yield three rows in total.
        columns = ["qid", "docno", "rank", "score"]
        first = ptt.UniformTransformer(
            pd.DataFrame([["q1", "d2", 1, 4.9], ["q1", "d3", 2, 5.1]],
                         columns=columns))
        second = ptt.UniformTransformer(
            pd.DataFrame([["q1", "d1", 1, 4.9], ["q1", "d3", 2, 5.1]],
                         columns=columns))

        result = (first ^ second).transform(None)
        self.assertEqual(3, len(result))
Example #7
0
    def test_plus(self):
        # Adding two single-row results for the same (qid, docno) is expected
        # to produce one row whose score is the sum (5 + 10).
        columns = ["qid", "docno", "score"]
        left = ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", 5]], columns=columns))
        right = ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", 10]], columns=columns))

        # Both operands wrap fixed frames, so no input frame is supplied.
        result = (left + right).transform(None)

        self.assertEqual(1, len(result))
        only_row = result.iloc[0]
        self.assertEqual("q1", only_row["qid"])
        self.assertEqual("doc1", only_row["docno"])
        self.assertEqual(15, only_row["score"])
Example #8
0
    def test_intersect(self):
        # The & operator is expected to keep only the document ("doc1")
        # that both operands return.
        columns = ["qid", "docno", "score"]
        left = ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc1", 5]], columns=columns))
        right = ptt.UniformTransformer(
            pd.DataFrame([["q1", "doc2", 10], ["q1", "doc1", 10]],
                         columns=columns))

        # Both operands wrap fixed frames, so no input frame is supplied.
        result = (left & right).transform(None)

        self.assertEqual(1, len(result))
        self.assertIn("q1", result["qid"].values)
        returned_docnos = result["docno"].values
        self.assertIn("doc1", returned_docnos)
        self.assertNotIn("doc2", returned_docnos)
Example #9
0
 def test_rank_cutoff(self):
     # Applying "% 1" to a two-document ranking is expected to leave one row.
     ranking = pd.DataFrame(
         [["q1", "d2", 1, 5.1], ["q1", "d3", 2, 5.1]],
         columns=["qid", "docno", "rank", "score"])
     top_one = ptt.UniformTransformer(ranking) % 1
     result = top_one.transform(None)
     self.assertEqual(1, len(result))
Example #10
0
 def test_differing_queries(self):
     # Runs an Experiment in which the first system returns results for q1
     # only while the second covers q1 and q2, and expects a warning whose
     # message mentions "missing".
     topics = pd.DataFrame([["q1", "q1"], ["q2", "q1"]],
                           columns=["qid", "query"])
     res1 = pd.DataFrame([["q1", "d1", 1.0]],
                         columns=["qid", "docno", "score"])
     res2 = pd.DataFrame([["q1", "d1", 1.0], ["q2", "d1", 2.0]],
                         columns=["qid", "docno", "score"])
     qrels = pd.DataFrame([["q1", "d1", 1], ["q2", "d1", 1]],
                          columns=["qid", "docno", "label"])
     with warnings.catch_warnings(record=True) as caught:
         # Without this, the active warning filters (or an earlier emission
         # from the same location) may suppress the warning, leaving the
         # recorded list empty.
         warnings.simplefilter("always")
         pt.pipelines.Experiment(
             [ptt.UniformTransformer(res1),
              ptt.UniformTransformer(res2)],
             topics,
             qrels, ["map"],
             baseline=0)
     # With every warning recorded, unrelated ones may follow the expected
     # one, so search the whole list instead of looking only at the last.
     messages = [str(item.message) for item in caught]
     self.assertTrue(
         any("missing" in message for message in messages),
         "expected a warning mentioning 'missing', got %r" % (messages,))
Example #11
0
    def test_feature_union_multi(self):
        # A three-way feature union is expected to compile into one flat
        # FeatureUnionPipeline and to emit one feature per member.
        import numpy as np

        def uniform(score):
            # Fixed single-row q1/doc1 result with the given score.
            return ptt.UniformTransformer(
                pd.DataFrame([["q1", "doc1", score]],
                             columns=["qid", "docno", "score"]))

        mock0 = uniform(0)
        mock1 = uniform(5)
        mock2 = uniform(10)
        mock3 = uniform(15)

        mock12a = mock1 ** mock2
        mock123a = mock1 ** mock2 ** mock3
        # Built only to exercise ** with a union on the left-hand side.
        mock123b = mock12a ** mock3

        self.assertEqual(2, len(mock12a.models))
        ptt.setup_rewrites()

        mock123_simple = mock123a.compile()
        self.assertIsNotNone(mock123_simple)
        self.assertEqual(
            "FeatureUnionPipeline(UniformTransformer(), UniformTransformer(), UniformTransformer())",
            repr(mock123_simple))
        self.assertEqual(3, len(mock123_simple.models))

        # Every stage wraps a fixed frame, so no input frame is supplied.
        result = (mock0 >> mock123_simple).transform(None)
        self.assertIsNotNone(result)
        self.assertEqual(1, len(result))
        for column in ("qid", "docno", "score", "features"):
            self.assertIn(column, result.columns)
        self.assertIn("q1", result["qid"].values)
        self.assertIn("doc1", result["docno"].values)
        features = result.iloc[0]["features"]
        self.assertTrue(np.array_equal(np.array([5, 10, 15]), features))
Example #12
0
    def test_feature_union(self):
        # A feature union applied after an input stage is expected to give a
        # "features" array of the member scores, with no merge-suffixed
        # columns left behind.
        import numpy as np
        import pyterrier.transformer as ptt

        def uniform(score):
            # Fixed single-row q1/doc1 result, including the query text.
            return ptt.UniformTransformer(
                pd.DataFrame([["q1", "a query", "doc1", score]],
                             columns=["qid", "query", "docno", "score"]))

        mock_input = uniform(5)
        mock_f1 = uniform(10)
        mock_f2 = uniform(50)

        leftover_columns = (
            "rank_x", "rank_y", "rank_r",
            "query_x", "query_y", "query_R",
            "score_x", "score_y", "score_r",
            "features_x", "features_y",
        )

        def _test_expression(pipeline):
            # The pipeline and its second stage should each expose two members.
            self.assertEqual(2, len(pipeline))
            self.assertEqual(2, len(pipeline[1]))

            # Every stage wraps a fixed frame, so no input frame is supplied.
            result = pipeline.transform(None)
            self.assertEqual(1, len(result))
            for column in ("qid", "docno", "features"):
                self.assertIn(column, result.columns)

            print(result.columns)
            for name in leftover_columns:
                self.assertNotIn(name, result.columns,
                                 "column %s in returned dataframe" % name)

            self.assertIn("q1", result["qid"].values)
            self.assertIn("doc1", result["docno"].values)
            features = result.iloc[0]["features"]
            self.assertTrue(np.array_equal(np.array([10, 50]), features))

        # Build the union both by direct instantiation and via the ** operator.
        _test_expression(
            mock_input >> ptt.FeatureUnionPipeline(mock_f1, mock_f2))
        _test_expression(mock_input >> (mock_f1 ** mock_f2))
Example #13
0
 def test_mul(self):
     # Multiplying by a scalar on either side is expected to scale the
     # score (5 * 10 == 50) and leave qid/docno untouched.
     single_doc = ptt.UniformTransformer(
         pd.DataFrame([["q1", "doc1", 5]],
                      columns=["qid", "docno", "score"]))
     for scaled in (single_doc * 10, 10 * single_doc):
         result = scaled.transform(None)
         self.assertEqual(1, len(result))
         only_row = result.iloc[0]
         self.assertEqual("q1", only_row["qid"])
         self.assertEqual("doc1", only_row["docno"])
         self.assertEqual(50, only_row["score"])
Example #14
0
    def test_then_multi(self):
        # Chains built with >> in different groupings are expected to compile
        # into a single flat ComposedPipeline.
        import pyterrier.transformer as ptt

        def uniform(score):
            # Fixed single-row q1/doc1 result with the given score.
            return ptt.UniformTransformer(
                pd.DataFrame([["q1", "doc1", score]],
                             columns=["qid", "docno", "score"]))

        mock1 = uniform(5)
        mock2 = uniform(10)
        mock3 = uniform(10)
        mock4 = uniform(10)

        combined12 = mock1 >> mock2
        combined23 = mock2 >> mock3
        # Three-stage chains grouped left-first, ungrouped and right-first.
        three_stage = [
            combined12 >> mock3,
            mock1 >> mock2 >> mock3,
            mock2 >> combined23,
        ]
        compiled_three_stage = [chain.compile() for chain in three_stage]

        # The two-stage building blocks each still report two members.
        for pair in (combined12, combined23):
            self.assertEqual(2, len(pair.models))

        expected_repr = (
            "ComposedPipeline(UniformTransformer(), UniformTransformer(), UniformTransformer())")
        for compiled in compiled_three_stage:
            self.assertEqual(3, len(compiled.models))
            self.assertEqual(expected_repr, repr(compiled))

        # Finally check that flattening also applies to a four-stage chain.
        compiled_four = (mock1 >> mock2 >> mock3 >> mock4).compile()
        self.assertEqual(
            "ComposedPipeline(UniformTransformer(), UniformTransformer(), UniformTransformer(), UniformTransformer())",
            repr(compiled_four))
        self.assertEqual(4, len(compiled_four.models))
Example #15
0
    def test_plus_multi_rewrite(self):
        # A three-way sum is expected to give the same total (5 + 10 + 15)
        # whether or not the expression has been compiled.
        def uniform(score):
            # Fixed single-row q1/doc1 result with the given score.
            return ptt.UniformTransformer(
                pd.DataFrame([["q1", "doc1", score]],
                             columns=["qid", "docno", "score"]))

        combined = uniform(5) + uniform(10) + uniform(15)
        variants = [combined, combined.compile()]
        for variant in variants:
            # Every operand wraps a fixed frame, so no input frame is supplied.
            result = variant.transform(None)

            self.assertEqual(1, len(result))
            only_row = result.iloc[0]
            self.assertEqual("q1", only_row["qid"])
            self.assertEqual("doc1", only_row["docno"])
            self.assertEqual(30, only_row["score"])
Example #16
0
 def test_maxmin_normalisation(self):
     # Per-query max-min normalisation is expected to map each query's top
     # score to 1 and its bottom score to 0; the single-document query (q2)
     # and the all-equal query (q3) are expected to come out as 0.
     rows = [
         ["q1", "doc1", 10],
         ["q1", "doc2", 2],
         ["q2", "doc1", 1],
         ["q3", "doc1", 0],
         ["q3", "doc2", 0],
     ]
     df = pd.DataFrame(rows, columns=["qid", "docno", "score"])
     pipe = ptt.UniformTransformer(df) >> ptp.PerQueryMaxMinScoreTransformer()
     result = pipe.transform(None)
     for column in ("qid", "docno", "score"):
         self.assertIn(column, result.columns)

     # Map (qid, docno) -> normalised score for direct lookups below.
     by_pair = result.set_index(['qid', 'docno']).to_dict()
     thedict = by_pair['score']
     print(thedict)
     expected_scores = [
         (("q1", "doc1"), 1),
         (("q1", "doc2"), 0),
         (("q2", "doc1"), 0),
         (("q3", "doc1"), 0),
         (("q3", "doc2"), 0),
     ]
     for pair, expected in expected_scores:
         self.assertEqual(expected, thedict[pair])
Example #17
0
    def test_feature_union_multi(self):
        # Exercises nested feature unions over a two-document candidate set:
        #  * fully populated members must yield features [5, 10, 15] for the
        #    first row, however the union was nested or built;
        #  * a member that returns no rows is expected to raise ValueError;
        #  * a member that returns only some rows is expected to emit a
        #    "Got number of results" warning.
        import numpy as np
        import pyterrier.transformer as ptt

        columns = ["qid", "docno", "score"]

        def uniform(rows):
            # Transformer that wraps a fixed frame built from the given rows.
            return ptt.UniformTransformer(pd.DataFrame(rows, columns=columns))

        mock0 = uniform([["q1", "doc1", 0], ["q1", "doc2", 0]])
        mock1 = uniform([["q1", "doc1", 5], ["q1", "doc2", 0]])
        mock2 = uniform([["q1", "doc1", 10], ["q1", "doc2", 0]])
        mock3 = uniform([["q1", "doc1", 15], ["q1", "doc2", 0]])

        mock3_empty = uniform([])
        mock2_partial = uniform([["q1", "doc1", 10]])
        mock3_partial = uniform([["q1", "doc1", 15]])

        mock12a = mock1 ** mock2
        mock123a = mock1 ** mock2 ** mock3
        mock123b = mock12a ** mock3
        mock123a_manual = ptt.FeatureUnionPipeline(
            ptt.FeatureUnionPipeline(mock1, mock2), mock3)
        mock123b_manual = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock2, mock3),
        )
        mock123e = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock2, mock3_empty),
        )
        mock12e3 = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock3_empty, mock3),
        )
        mock123p = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock2, mock3_partial),
        )
        mock12p3 = ptt.FeatureUnionPipeline(
            mock1,
            ptt.FeatureUnionPipeline(mock2_partial, mock3),
        )

        self.assertEqual(2, len(mock12a.models))
        ptt.setup_rewrites()

        mock123_simple = mock123a.compile()
        self.assertIsNotNone(mock123_simple)
        self.assertEqual(
            "FeatureUnionPipeline(UniformTransformer(), UniformTransformer(), UniformTransformer())",
            repr(mock123_simple))
        self.assertEqual(3, len(mock123_simple.models))

        def _test_expression(expression):
            # Every stage wraps a fixed frame, so no input frame is supplied.
            rtr = (mock0 >> expression).transform(None)
            self.assertIsNotNone(rtr)
            self.assertEqual(2, len(rtr))
            self.assertIn("qid", rtr.columns)
            self.assertIn("docno", rtr.columns)
            self.assertNotIn("features_x", rtr.columns)
            self.assertNotIn("features_y", rtr.columns)
            self.assertIn("features", rtr.columns)
            self.assertIn("q1", rtr["qid"].values)
            self.assertIn("doc1", rtr["docno"].values)
            self.assertTrue(
                np.allclose(np.array([5, 10, 15]), rtr.iloc[0]["features"]))

        def _assert_partial_warns(expression):
            # Record every warning: without simplefilter("always") the active
            # filters may suppress it and leave the recorded list empty.
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always")
                _test_expression(expression)
            # Unrelated warnings may follow the expected one, so search the
            # whole list instead of looking only at the last entry.
            messages = [str(item.message) for item in caught]
            self.assertTrue(
                any("Got number of results" in message for message in messages),
                "expected a 'Got number of results' warning, got %r"
                % (messages,))

        # The manually nested unions were previously built but never run
        # (mock123b was checked twice instead); exercise each variant once.
        for expression in (mock123_simple, mock123a, mock123b,
                           mock123a_manual, mock123b_manual):
            _test_expression(expression)

        for expression in (mock123e, mock12e3):
            with self.assertRaises(ValueError):
                _test_expression(expression)

        for expression in (mock123p, mock12p3):
            _assert_partial_warns(expression)