Esempio n. 1
0
    def process(self, b: Evaluation) -> Evaluation:
        """
        Compute the relative difference of ``b`` with respect to ``self.a``.

        Only the numeric columns of both evaluations take part. The result
        is ``(b - a) / a`` and is wrapped in a new Evaluation that is built
        from the difference frame alone (no eval id is passed on).
        """
        numeric_kinds = [np.number]
        baseline = self.a.get_df().select_dtypes(include=numeric_kinds)
        candidate = b.get_df().select_dtypes(include=numeric_kinds)
        return Evaluation((candidate - baseline) / baseline)
Esempio n. 2
0
    def test_filterbyindex_multindex(self):
        """ FilterByIndex should filter on either level of a multi-index """
        # Frame under test, written as (group, key) -> value:
        #   (a, a) -> 10, (a, b) -> 5, (a, c) -> 3, (b, d) -> 100, (b, e) -> 31
        groups = ["a", "a", "a", "b", "b"]
        keys = ["a", "b", "c", "d", "e"]
        multi_index = pd.MultiIndex.from_arrays([groups, keys],
                                                names=("group", "key"))
        source = Evaluation(
            pd.DataFrame({"value": [10, 5, 3, 100, 31]}, index=multi_index),
            eval_id=10,
        )

        # Filtering on the first level: the expected frame is indexed by "key".
        by_group = source.process([FilterByIndex("group", "a")])
        want_by_group = pd.DataFrame(
            {"value": [10, 5, 3]},
            index=pd.Index(["a", "b", "c"], name="key"),
        )
        assert_frame_equal(by_group.get_df(), want_by_group)
        assert by_group.get_eval_id() == 10

        # Filtering on the second level: the expected frame is indexed by
        # "group".
        by_key = source.process([FilterByIndex("key", "a")])
        want_by_key = pd.DataFrame(
            {"value": [10]},
            index=pd.Index(["a"], name="group"),
        )
        assert_frame_equal(by_key.get_df(), want_by_key)
        assert by_key.get_eval_id() == 10
Esempio n. 3
0
    def test_evaluation_add_different_columns(self):
        """
        Adding two Evaluations with ``+`` should concatenate them into a new
        Evaluation even when the second one lacks a column of the first.
        """
        left = Evaluation(pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}]))
        right = Evaluation(pd.DataFrame([{"a": 5}, {"a": 7}]))

        combined = left + right

        # Rows coming from the right operand have no "b" entry.
        expected_rows = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5}, {"a": 7}]
        assert_frame_equal(combined.get_df(), pd.DataFrame(expected_rows))
        assert combined.get_eval_id() is None
Esempio n. 4
0
    def test_evaluation_add(self):
        """
        Adding two Evaluations with ``+`` should yield a new Evaluation whose
        dataframe is the concatenation of both operands.
        """
        first_rows = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
        second_rows = [{"a": 5, "b": 6}, {"a": 7, "b": 8}]
        left = Evaluation(pd.DataFrame(first_rows))
        right = Evaluation(pd.DataFrame(second_rows))

        combined = left + right

        expected = pd.DataFrame(first_rows + second_rows)
        assert_frame_equal(combined.get_df(), expected)
        assert combined.get_eval_id() is None
Esempio n. 5
0
    def test_evaluation_add_multiple(self):
        """
        The built-in sum() over a list of Evaluations should produce one
        Evaluation holding all of their rows in order.
        """
        rows = [
            {"a": 1, "b": 2},
            {"a": 3, "b": 4},
            {"a": 5, "b": 6},
            {"a": 7, "b": 8},
        ]
        # One single-row Evaluation per record.
        evaluations = [Evaluation(pd.DataFrame([row])) for row in rows]

        total = sum(evaluations)

        assert_frame_equal(total.get_df(), pd.DataFrame(rows))
        assert total.get_eval_id() is None
Esempio n. 6
0
    def test_singletablevisualizer(self):
        """
        SingleTableVisualizer should render with known columns and raise
        KeyError when column_order names a column that does not exist
        """
        frame = pd.DataFrame({
            "a": [0.1, 0.4, 0.7],
            "b": [0.2, 0.5, 0.8],
            "c": [0.3, 0.6, 0.9],
        })
        eval1 = Evaluation(frame)
        palette = sns.diverging_palette(180,
                                        0,
                                        s=75,
                                        l=75,
                                        sep=100,
                                        as_cmap=True)
        styled_eval = eval1.process([ColorMapStyle(palette)])

        # should not fail
        SingleTableVisualizer(eval1, styled_eval,
                              column_order=["a", "b", "c"]).get_visualization()

        # Construction and rendering both stay inside the raises-block.
        with pytest.raises(KeyError):
            broken = SingleTableVisualizer(eval1,
                                           styled_eval,
                                           column_order=["d"])
            broken.get_visualization()
Esempio n. 7
0
    def test_addnormalizedcolumn(self):
        """ AddNormalizedColumn should append the normalized column """
        groups = ["a", "a", "a", "b", "b"]
        values = [10, 5, 3, 100, 31]
        source = Evaluation(pd.DataFrame({"group": groups, "value": values}),
                            eval_id=10)

        processed = source.process(
            [AddNormalizedColumn("group", "value", "normalized")])

        # Expected ratios: 10, 5, 3 -> 1.0, 0.5, 0.3 and 100, 31 -> 1.0, 0.31
        expected = pd.DataFrame({
            "group": groups,
            "value": values,
            "normalized": [1.0, 0.5, 0.3, 1.0, 0.31],
        })
        assert_frame_equal(processed.get_df(), expected)

        assert processed.get_eval_id() == 10
Esempio n. 8
0
    def test_evaluation_process_empty(self):
        """
        An empty processor list passed to process() should leave the
        dataframe exactly as it was given.
        """
        original = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
        untouched = Evaluation(original).process([])

        self.assertTrue(untouched.get_df().equals(original))
Esempio n. 9
0
    def test_normalize(self):
        """
        Normalize with Direction.MINIMIZE should rewrite the "value" column
        to the expected values and keep the eval id
        """
        groups = ["b", "b", "b", "a", "a"]
        source = Evaluation(
            pd.DataFrame(data={
                "group": groups,
                "value": [-50, 50, 100, 0, 10],
            }),
            eval_id=10,
        )

        processed = source.process([Normalize({"value": Direction.MINIMIZE})])

        expected = pd.DataFrame(data={
            "group": groups,
            "value": [0.25, 0.75, 1.0, 0.5, 0.55],
        })
        assert_frame_equal(processed.get_df(), expected)
        assert processed.get_eval_id() == 10
Esempio n. 10
0
    def test_normalize_negated(self):
        """
        Normalize with a direction of -1 should rewrite the "value" column
        to the expected (mirrored) values
        """
        groups = ["b", "b", "b", "a", "a"]
        source = Evaluation(
            pd.DataFrame(data={
                "group": groups,
                "value": [-50, 50, 100, 0, 10],
            }))

        processed_df = source.process([Normalize({"value": -1})]).get_df()

        expected = pd.DataFrame(data={
            "group": groups,
            "value": [0.75, 0.25, 0, 0.5, 0.45],
        })
        assert_frame_equal(processed_df, expected)
Esempio n. 11
0
 def process(self, input_eval: Evaluation):
     """
     Keep only the rows whose index matches ``self.index_value``.

     For a MultiIndex the cross-section on level ``self.index_name`` is
     taken; for any other pandas Index a label slice is used.  The eval id
     of the input is carried over to the returned Evaluation.
     """
     source_df = input_eval.get_df()
     current_index = source_df.index
     if isinstance(current_index, pd.MultiIndex):
         filtered = source_df.xs(self.index_value, level=self.index_name)
     elif isinstance(current_index, pd.Index):
         # a label slice (not a scalar lookup) keeps the result a DataFrame
         wanted = slice(self.index_value, self.index_value)
         filtered = source_df.loc[wanted]
     else:
         raise ValueError("Incompatible dataframe index.")
     return Evaluation(filtered, input_eval.get_eval_id())
Esempio n. 12
0
    def test_evaluation_get_df_equality(self):
        """
        The dataframe handed out by get_df() should be a pandas DataFrame
        holding the same values as the one given to the constructor
        """
        original = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
        fetched = Evaluation(original).get_df()

        self.assertIsInstance(fetched, pd.DataFrame)
        self.assertTrue(fetched.equals(original))
Esempio n. 13
0
    def test_comparetofirst_dir_subset(self):
        """ CompareToFirst given a direction for only column "a" """
        a_values = [1, 2, 3, 4, 5]
        source = Evaluation(
            pd.DataFrame({"a": a_values, "b": [5, 4, 3, 2, 1]}),
            eval_id=20,
        )

        compared = source.process([CompareToFirst({"a": Direction.MINIMIZE})])

        # Only "a" and its relative column are expected in the output.
        expected_df = pd.DataFrame({
            "a": a_values,
            "a.relative": [1.0 / value for value in a_values],
        })
        assert_frame_equal(compared.get_df(), expected_df)
        assert compared.get_eval_id() == 20
Esempio n. 14
0
    def test_evaluation_get_df_defensive_copy(self):
        """
        get_df() should return a copy of the constructor argument to prevent
        caller from mutating the dataframe
        """
        df = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
        result = Evaluation(df).get_df()

        # Mutate through one .loc call.  The chained form result["a"][0] = 5
        # writes to a temporary under pandas copy-on-write, which would leave
        # result untouched and make the assertion below fail for the wrong
        # reason.
        result.loc[0, "a"] = 5

        # Had get_df() handed back the very same object as df, both names
        # would still compare equal after the mutation.
        self.assertFalse(result.equals(df))
Esempio n. 15
0
    def test_standardizetypes(self):
        """ Test whether types are standardized """
        types = {"a": float}
        df = pd.DataFrame({"a": [1, 2, 3, 4, 5]})
        eval1 = Evaluation(df, eval_id=10)

        # Use the pandas dtype predicates instead of `dtype == int`: what the
        # builtin int maps to as a numpy dtype depends on the platform, so a
        # direct comparison can fail even though the column is integral.
        assert pd.api.types.is_integer_dtype(eval1.get_df().dtypes["a"])

        pipeline = [StandardizeTypes(types)]

        result = eval1.process(pipeline)

        assert pd.api.types.is_float_dtype(result.get_df().dtypes["a"])

        assert result.get_eval_id() == 10
Esempio n. 16
0
 def get_evaluation(self) -> Evaluation:
     """
     Download the data, preprocess it and wrap the resulting dataframe in
     an Evaluation tagged with this object's ``_abs_eval_id``.
     """
     raw = self._download()
     frame = self._preprocess(raw)
     evaluation = Evaluation(frame, eval_id=self._abs_eval_id)
     return evaluation
Esempio n. 17
0
    def test_debugvisualizer(self):
        """
        DebugVisualizer should render with known columns and raise KeyError
        when column_order names a column that does not exist
        """
        frame = pd.DataFrame({
            "a": [0.1, 0.4, 0.7],
            "b": [0.2, 0.5, 0.8],
            "c": [0.3, 0.6, 0.9],
        })
        eval1 = Evaluation(frame)

        # should not fail
        DebugVisualizer(eval1, column_order=["a", "b", "c"]).get_visualization()

        # Construction and rendering both stay inside the raises-block.
        with pytest.raises(KeyError):
            broken = DebugVisualizer(eval1, column_order=["d"])
            broken.get_visualization()
Esempio n. 18
0
    def test_evaluation_get_copy(self):
        """
        get_copy() should return a deep copy of Evaluation
        """
        df = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
        eval1 = Evaluation(df, eval_id=1000)

        result = eval1.get_copy()
        assert result.get_eval_id() == 1000
        assert_frame_equal(result.get_df(), df)

        # change original with a single indexer; the chained form
        # df.iloc[0]["a"] = 0 may only write to a temporary row, in which
        # case the check below would pass without testing anything
        df.loc[0, "a"] = 0
        assert df.loc[0, "a"] == 0

        # assert that copy has not changed
        assert result.get_df().iloc[0]["a"] == 1
Esempio n. 19
0
    def test_evaluation_process_single(self):
        """
        Running process() with a single MinusOne should give a new dataframe
        with one subtracted from every value while the input dataframe keeps
        its original values.
        """
        original = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
        expected = pd.DataFrame({"a": [0, 2], "b": [1, 3]})

        processed = Evaluation(original).process([MinusOne()]).get_df()

        # the output differs from the input, and the input was not turned
        # into the expected output
        self.assertFalse(processed.equals(original))
        self.assertFalse(original.equals(expected))

        # the output carries the decremented values
        self.assertTrue(processed.equals(expected))
Esempio n. 20
0
    def process(self, input_eval: Evaluation) -> Evaluation:
        """
        Cast the columns of the evaluation to the types in ``self.types``.

        Returns a new Evaluation with the converted dataframe and the eval id
        of ``input_eval``.

        Raises:
            KeyError: if the cast fails with a KeyError; it is re-raised with
                a descriptive message and the original error attached as the
                cause.
        """
        input_df = input_eval.get_df()

        try:
            # if int, we might need to convert to float first
            # (e.g. int(float("6.0")))
            if int in self.types.values():
                # create dictionary replacing int with float
                pre_df_types = {
                    k: (v if v != int else float) for k, v in self.types.items()
                }
                input_df = input_df.astype(pre_df_types)

            new_df = input_df.astype(self.types)
        except KeyError as err:
            # chain explicitly so the original error stays visible as cause
            raise KeyError(
                "A key in the types parameter does not exist in the evaluation."
            ) from err
        return Evaluation(new_df, input_eval.get_eval_id())
Esempio n. 21
0
    def test_reindex(self):
        """ Reindex should turn the named column(s) into the index """
        groups = ["a", "a", "a", "b", "b"]
        keys = ["a", "b", "c", "d", "e"]
        values = [10, 5, 3, 100, 31]
        source = Evaluation(
            pd.DataFrame({"group": groups, "key": keys, "value": values}),
            eval_id=10,
        )

        # single column -> plain Index named after that column
        single = source.process([Reindex(["value"])])
        assert_index_equal(single.get_df().index,
                           pd.Index(values, name="value"))
        assert single.get_eval_id() == 10

        # two columns -> MultiIndex with one level per column
        double = source.process([Reindex(["group", "key"])])
        want_multi = pd.MultiIndex.from_arrays([groups, keys],
                                               names=("group", "key"))
        assert_index_equal(double.get_df().index, want_multi)
        assert double.get_eval_id() == 10
Esempio n. 22
0
    def test_aggregate(self):
        """ Test aggregate processor with custom aggregator functions """
        df = pd.DataFrame({
            "a": [1, 1, 3, 4, 5],
            "b": [1, 2, 3, 4, 5],
            "c": [5, 4, 3, 2, 1],
        })
        eval1 = Evaluation(df, eval_id=20)

        # column-wise sum
        pipeline = [Aggregate(lambda x: x.sum())]
        result = eval1.process(pipeline)

        expected_df = pd.DataFrame([{"a": 14, "b": 15, "c": 15}])
        assert_frame_equal(result.get_df(), expected_df)
        # check the id on the processed result (not on the input eval1),
        # in line with the product case below
        assert result.get_eval_id() == 20

        # column-wise product
        pipeline2 = [Aggregate(lambda x: x.product())]
        result2 = eval1.process(pipeline2)

        expected_df2 = pd.DataFrame([{"a": 60, "b": 120, "c": 120}])
        assert_frame_equal(result2.get_df(), expected_df2)
        assert result2.get_eval_id() == 20
Esempio n. 23
0
    def test_geomean_aggregate(self):
        """ GeomeanAggregate should reduce every column to its geomean """
        source = Evaluation(
            pd.DataFrame({
                "a": [1, 1, 3, 4, 5],
                "b": [1, 2, 3, 4, 5],
                "c": [5, 4, 3, 2, 1],
            }),
            eval_id=20,
        )

        aggregated = source.process([GeomeanAggregate()])

        # "b" and "c" hold the same five numbers, so they share one geomean.
        geomean_a = (1 * 1 * 3 * 4 * 5)**(1 / 5)
        geomean_bc = (1 * 2 * 3 * 4 * 5)**(1 / 5)
        expected_df = pd.DataFrame([{
            "a": geomean_a,
            "b": geomean_bc,
            "c": geomean_bc
        }])
        assert_frame_equal(aggregated.get_df(), expected_df)
        assert aggregated.get_eval_id() == 20
Esempio n. 24
0
    def test_minusone(self):
        """ MinusOne should lower every value of the column by one """
        source = Evaluation(pd.DataFrame({"a": [1, 2, 3, 4, 5]}), eval_id=10)

        # the column as stored, before any processing
        assert_series_equal(source.get_df()["a"],
                            pd.Series([1, 2, 3, 4, 5], name="a"))

        processed = source.process([MinusOne()])

        # the same column after MinusOne
        assert_series_equal(processed.get_df()["a"],
                            pd.Series([0, 1, 2, 3, 4], name="a"))

        assert processed.get_eval_id() == 10
Esempio n. 25
0
    def test_relativediff(self):
        """
        RelativeDiff should yield (b - a) / a for every cell
        """
        baseline = Evaluation(pd.DataFrame(data={"a": [2, 5], "b": [2, 10]}))
        candidate = Evaluation(pd.DataFrame(data={"a": [4, 20], "b": [1, 1]}))

        relative = candidate.process([RelativeDiff(baseline)]).get_df()

        # (4-2)/2, (20-5)/5 for "a" and (1-2)/2, (1-10)/10 for "b"
        expected = pd.DataFrame(data={"a": [1.0, 3.0], "b": [-0.5, -0.9]})
        assert_frame_equal(expected, relative)
Esempio n. 26
0
    def test_colormapstyle_process(self):
        """ ColorMapStyle should map every cell to a background-color rule """
        palette = sns.diverging_palette(180,
                                        0,
                                        s=75,
                                        l=75,
                                        sep=100,
                                        as_cmap=True)
        source = Evaluation(
            pd.DataFrame({
                "a": [0.1, 0.4, 0.7],
                "b": [0.2, 0.5, 0.8],
                "c": [0.3, 0.6, 0.9],
            }))

        styled = source.process([ColorMapStyle(palette)]).get_df()

        expected = pd.DataFrame({
            "a": [
                "background-color: #8ed8d0",
                "background-color: #f2f2f2",
                "background-color: #fbe8eb",
            ],
            "b": [
                "background-color: #bde8e3",
                "background-color: #f2f2f2",
                "background-color: #f7d1d9",
            ],
            "c": [
                "background-color: #eaf7f6",
                "background-color: #f2f2f2",
                "background-color: #f3bac5",
            ],
        })
        assert_frame_equal(styled, expected)
Esempio n. 27
0
    def test_cleanduplicates_sorting(self):
        """
        CleanDuplicates given a sort column, in both sort directions.
        """
        frame = pd.DataFrame({
            "a": [1, 1, 3, 4, 5],
            "b": [1, 2, 3, 4, 5],
            "c": [5, 4, 3, 2, 1],
        })
        source = Evaluation(frame)

        # default sort: row 1 is the duplicate that gets dropped
        ascending = source.process([CleanDuplicates(["a"], ["c"])]).get_df()
        assert_frame_equal(ascending, frame.drop(1), check_like=True)

        # reversed sort: row 0 is the duplicate that gets dropped
        reverse_cleaner = CleanDuplicates(["a"], ["c"], reverse_sort=True)
        descending = source.process([reverse_cleaner]).get_df()
        want_descending = frame.drop(0).sort_index(level=0, ascending=False)
        assert_frame_equal(descending, want_descending, check_like=True)
Esempio n. 28
0
    def test_cleanduplicates_no_duplicates(self):
        """ CleanDuplicates on a column whose values are all distinct """
        frame = pd.DataFrame({
            "a": [1, 1, 3, 4, 5],
            "b": [1, 2, 3, 4, 5],
            "c": [5, 4, 3, 2, 1],
        })
        source = Evaluation(frame, eval_id=10)

        # "b" has no repeated values, so the frame is expected back as is
        cleaned = source.process([CleanDuplicates(["b"])])
        assert_frame_equal(cleaned.get_df(), frame, check_like=True)

        assert cleaned.get_eval_id() == 10
Esempio n. 29
0
    def test_aggregate_exclude_nonnumeric(self):
        """ Check if aggregate processor excludes fields that are non-numeric """
        df = pd.DataFrame({
            "a": [1, 1, 3, 4, 5],
            "b": [1, 2, 3, 4, 5],
            "c": ["a", "b", "c", "d", "e"],
        })
        eval1 = Evaluation(df, eval_id=20)

        pipeline = [Aggregate(lambda x: x.sum())]
        result = eval1.process(pipeline)

        # the string column "c" is expected to be absent from the output
        expected_df = pd.DataFrame([{"a": 14, "b": 15}])
        assert_frame_equal(result.get_df(), expected_df)
        # check the id on the processed result, not on the input eval1
        assert result.get_eval_id() == 20
Esempio n. 30
0
 def test_cleanduplicates_multi_col(self):
     """
     CleanDuplicates over two columns at once, on data where no pair of
     ("a", "b") values repeats
     """
     frame = pd.DataFrame({
         "a": [1, 1, 3, 4, 5],
         "b": [1, 2, 3, 4, 5],
         "c": [5, 4, 3, 2, 1],
     })
     source = Evaluation(frame)

     cleaned = source.process([CleanDuplicates(["a", "b"])]).get_df()

     # nothing to drop, so the frame is expected back as is
     assert_frame_equal(cleaned, frame, check_like=True)