def process(self, b: Evaluation) -> Evaluation:
    """Return a new Evaluation holding the relative difference
    ``(b - a) / a``, computed over the numeric columns only.

    ``self.a`` is the baseline evaluation; ``b`` is the one being compared.
    """
    baseline = self.a.get_df().select_dtypes(include=[np.number])
    current = b.get_df().select_dtypes(include=[np.number])
    relative = (current - baseline) / baseline
    return Evaluation(relative)
def test_filterbyindex_multindex(self):
    """FilterByIndex should filter on either level of a MultiIndex frame."""
    levels = [["a", "a", "a", "b", "b"], ["a", "b", "c", "d", "e"]]
    multi_index = pd.MultiIndex.from_arrays(levels, names=("group", "key"))
    frame = pd.DataFrame({"value": [10, 5, 3, 100, 31]}, index=multi_index)
    evaluation = Evaluation(frame, eval_id=10)

    # filtering on the outer level drops it, leaving the "key" index
    filtered = evaluation.process([FilterByIndex("group", "a")])
    want = pd.DataFrame({"value": [10, 5, 3]},
                        index=pd.Index(["a", "b", "c"], name="key"))
    assert_frame_equal(filtered.get_df(), want)
    assert filtered.get_eval_id() == 10

    # filtering on the inner level drops it, leaving the "group" index
    filtered = evaluation.process([FilterByIndex("key", "a")])
    want = pd.DataFrame({"value": [10]},
                        index=pd.Index(["a"], name="group"))
    assert_frame_equal(filtered.get_df(), want)
    assert filtered.get_eval_id() == 10
def test_evaluation_add_different_columns(self):
    """`+` concatenates two Evaluations even when their columns differ."""
    left = Evaluation(pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}]))
    right = Evaluation(pd.DataFrame([{"a": 5}, {"a": 7}]))
    combined = left + right
    # rows missing "b" are padded with NaN by the concatenation
    want = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4},
                         {"a": 5}, {"a": 7}])
    assert_frame_equal(combined.get_df(), want)
    assert combined.get_eval_id() is None
def test_evaluation_add(self):
    """`+` returns a new Evaluation wrapping the concatenated frames."""
    left = Evaluation(pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}]))
    right = Evaluation(pd.DataFrame([{"a": 5, "b": 6}, {"a": 7, "b": 8}]))
    combined = left + right
    want = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4},
                         {"a": 5, "b": 6}, {"a": 7, "b": 8}])
    assert_frame_equal(combined.get_df(), want)
    assert combined.get_eval_id() is None
def test_evaluation_add_multiple(self):
    """The sum() built-in concatenates a sequence of Evaluations."""
    parts = [Evaluation(pd.DataFrame([{"a": a, "b": b}]))
             for a, b in ((1, 2), (3, 4), (5, 6), (7, 8))]
    total = sum(parts)
    want = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4},
                         {"a": 5, "b": 6}, {"a": 7, "b": 8}])
    assert_frame_equal(total.get_df(), want)
    assert total.get_eval_id() is None
def test_singletablevisualizer(self):
    """SingleTableVisualizer renders known columns; unknown ones raise."""
    frame = pd.DataFrame({
        "a": [0.1, 0.4, 0.7],
        "b": [0.2, 0.5, 0.8],
        "c": [0.3, 0.6, 0.9],
    })
    evaluation = Evaluation(frame)
    palette = sns.diverging_palette(180, 0, s=75, l=75, sep=100, as_cmap=True)
    styled = evaluation.process([ColorMapStyle(palette)])

    visualizer = SingleTableVisualizer(evaluation, styled,
                                       column_order=["a", "b", "c"])
    visualizer.get_visualization()  # should not fail

    # a column_order entry absent from the frame must raise
    with pytest.raises(KeyError):
        visualizer = SingleTableVisualizer(evaluation, styled,
                                           column_order=["d"])
        visualizer.get_visualization()
def test_addnormalizedcolumn(self):
    """AddNormalizedColumn appends a per-group max-normalized column."""
    frame = pd.DataFrame({
        "group": ["a", "a", "a", "b", "b"],
        "value": [10, 5, 3, 100, 31],
    })
    evaluation = Evaluation(frame, eval_id=10)
    processed = evaluation.process(
        [AddNormalizedColumn("group", "value", "normalized")])
    # each value divided by its group's maximum (10 for "a", 100 for "b")
    want = pd.DataFrame({
        "group": ["a", "a", "a", "b", "b"],
        "value": [10, 5, 3, 100, 31],
        "normalized": [1.0, 0.5, 0.3, 1.0, 0.31],
    })
    assert_frame_equal(processed.get_df(), want)
    assert processed.get_eval_id() == 10
def test_evaluation_process_empty(self):
    """process([]) is a no-op on the wrapped dataframe."""
    frame = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
    unchanged = Evaluation(frame).process([]).get_df()
    self.assertTrue(unchanged.equals(frame))
def test_normalize(self):
    """Normalize rescales the column per the configured direction."""
    frame = pd.DataFrame({
        "group": ["b", "b", "b", "a", "a"],
        "value": [-50, 50, 100, 0, 10],
    })
    evaluation = Evaluation(frame, eval_id=10)
    processed = evaluation.process(
        [Normalize({"value": Direction.MINIMIZE})])
    want = pd.DataFrame({
        "group": ["b", "b", "b", "a", "a"],
        "value": [0.25, 0.75, 1.0, 0.5, 0.55],
    })
    assert_frame_equal(processed.get_df(), want)
    assert processed.get_eval_id() == 10
def test_normalize_negated(self):
    """A direction of -1 flips the normalization of the column."""
    frame = pd.DataFrame({
        "group": ["b", "b", "b", "a", "a"],
        "value": [-50, 50, 100, 0, 10],
    })
    evaluation = Evaluation(frame)
    processed = evaluation.process([Normalize({"value": -1})]).get_df()
    want = pd.DataFrame({
        "group": ["b", "b", "b", "a", "a"],
        "value": [0.75, 0.25, 0, 0.5, 0.45],
    })
    assert_frame_equal(processed, want)
def process(self, input_eval: Evaluation):
    """Filter the evaluation's frame down to rows matching the configured
    index value.

    MultiIndex frames are filtered with a cross-section on the configured
    level; flat indexes use a label slice so the result keeps its 2-D shape.

    :raises ValueError: if the frame's index is not a pandas Index.
    """
    frame = input_eval.get_df()
    if isinstance(frame.index, pd.MultiIndex):
        filtered = frame.xs(self.index_value, level=self.index_name)
    elif isinstance(frame.index, pd.Index):
        # slice (rather than scalar-index) to keep a DataFrame result
        filtered = frame.loc[self.index_value:self.index_value]
    else:
        raise ValueError("Incompatible dataframe index.")
    return Evaluation(filtered, input_eval.get_eval_id())
def test_evaluation_get_df_equality(self):
    """get_df() returns a frame equal to the constructor argument."""
    frame = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
    returned = Evaluation(frame).get_df()
    self.assertIsInstance(returned, pd.DataFrame)
    self.assertTrue(returned.equals(frame))
def test_comparetofirst_dir_subset(self):
    """CompareToFirst honors the direction map and keeps only its subset."""
    frame = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1]})
    evaluation = Evaluation(frame, eval_id=20)
    evaluation = evaluation.process(
        [CompareToFirst({"a": Direction.MINIMIZE})])
    # column "b" is dropped; "a" is compared against its first row
    want = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "a.relative": [1.0 / n for n in (1, 2, 3, 4, 5)],
    })
    assert_frame_equal(evaluation.get_df(), want)
    assert evaluation.get_eval_id() == 20
def test_evaluation_get_df_defensive_copy(self):
    """get_df() should return a copy of the constructor argument to
    prevent the caller from mutating the dataframe.
    """
    df = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
    result = Evaluation(df).get_df()
    # Use .loc instead of chained indexing (result["a"][0] = 5):
    # chained assignment is deprecated and silently fails to write
    # through under pandas copy-on-write, which would void this test.
    result.loc[0, "a"] = 5  # mutate result
    self.assertFalse(result.equals(df))
def test_standardizetypes(self):
    """StandardizeTypes converts columns to the requested dtypes."""
    target_types = {"a": float}
    evaluation = Evaluation(pd.DataFrame({"a": [1, 2, 3, 4, 5]}),
                            eval_id=10)
    # sanity check: the column starts out as an integer dtype
    assert evaluation.get_df().dtypes["a"] == int
    converted = evaluation.process([StandardizeTypes(target_types)])
    assert converted.get_df().dtypes["a"] == float
    assert converted.get_eval_id() == 10
def get_evaluation(self) -> Evaluation:
    """ Returns an Evaluation that represents the fetched data. """
    raw = self._download()
    return Evaluation(self._preprocess(raw), eval_id=self._abs_eval_id)
def test_debugvisualizer(self):
    """DebugVisualizer renders known columns; unknown ones raise."""
    frame = pd.DataFrame({
        "a": [0.1, 0.4, 0.7],
        "b": [0.2, 0.5, 0.8],
        "c": [0.3, 0.6, 0.9],
    })
    evaluation = Evaluation(frame)

    visualizer = DebugVisualizer(evaluation, column_order=["a", "b", "c"])
    visualizer.get_visualization()  # should not fail

    # a column_order entry absent from the frame must raise
    with pytest.raises(KeyError):
        visualizer = DebugVisualizer(evaluation, column_order=["d"])
        visualizer.get_visualization()
def test_evaluation_get_copy(self):
    """get_copy() should return a deep copy of Evaluation."""
    df = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
    eval1 = Evaluation(df, eval_id=1000)
    result = eval1.get_copy()
    assert result.get_eval_id() == 1000
    assert_frame_equal(result.get_df(), df)
    # Change the original. Use .loc instead of chained indexing
    # (df.iloc[0]["a"] = 0), which is deprecated and may not write
    # through under pandas copy-on-write.
    df.loc[0, "a"] = 0
    # assert that the copy has not changed
    assert result.get_df().iloc[0]["a"] == 1
def test_evaluation_process_single(self):
    """A [MinusOne()] pipeline subtracts one from every value while
    leaving the input dataframe untouched.
    """
    frame = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
    processed = Evaluation(frame).process([MinusOne()]).get_df()
    want = pd.DataFrame([{"a": 0, "b": 1}, {"a": 2, "b": 3}])
    # the input evaluation must not have been altered in place
    self.assertFalse(processed.equals(frame))
    self.assertFalse(frame.equals(want))
    # the output evaluation holds the decremented values
    self.assertTrue(processed.equals(want))
def process(self, input_eval: Evaluation) -> Evaluation:
    """Cast the evaluation's columns to the configured target types.

    Integer targets are first cast through ``float`` so that string
    representations like ``"6.0"`` convert cleanly — ``int("6.0")``
    raises, but ``int(float("6.0"))`` does not.

    :param input_eval: evaluation whose dataframe should be converted
    :return: a new Evaluation with converted columns and the same eval id
    :raises KeyError: if a configured column is missing from the frame
    """
    input_df = input_eval.get_df()
    try:
        # if int, we might need to convert to float first
        # (e.g. int(float("6.0")))
        if int in self.types.values():
            # create dictionary replacing int with float
            pre_df_types = {
                k: (v if v != int else float)
                for k, v in self.types.items()
            }
            input_df = input_df.astype(pre_df_types)
        new_df = input_df.astype(self.types)
    except KeyError as err:
        # chain the original error so the offending key stays visible
        raise KeyError(
            "A key in the types parameter does not exist in the evaluation."
        ) from err
    return Evaluation(new_df, input_eval.get_eval_id())
def test_reindex(self):
    """Reindex moves the named columns into the frame's index."""
    frame = pd.DataFrame({
        "group": ["a", "a", "a", "b", "b"],
        "key": ["a", "b", "c", "d", "e"],
        "value": [10, 5, 3, 100, 31],
    })
    evaluation = Evaluation(frame, eval_id=10)

    # a single column becomes a plain Index
    reindexed = evaluation.process([Reindex(["value"])])
    assert_index_equal(reindexed.get_df().index,
                       pd.Index([10, 5, 3, 100, 31], name="value"))
    assert reindexed.get_eval_id() == 10

    # two columns become a MultiIndex
    reindexed = evaluation.process([Reindex(["group", "key"])])
    levels = [["a", "a", "a", "b", "b"], ["a", "b", "c", "d", "e"]]
    assert_index_equal(
        reindexed.get_df().index,
        pd.MultiIndex.from_arrays(levels, names=("group", "key")))
    assert reindexed.get_eval_id() == 10
def test_aggregate(self):
    """ Test aggregate processor with custom aggregator functions """
    df = pd.DataFrame([
        {"a": 1, "b": 1, "c": 5},
        {"a": 1, "b": 2, "c": 4},
        {"a": 3, "b": 3, "c": 3},
        {"a": 4, "b": 4, "c": 2},
        {"a": 5, "b": 5, "c": 1},
    ])
    eval1 = Evaluation(df, eval_id=20)

    pipeline = [Aggregate(lambda x: x.sum())]
    result = eval1.process(pipeline)
    expected_df = pd.DataFrame([{"a": 14, "b": 15, "c": 15}])
    assert_frame_equal(result.get_df(), expected_df)
    # assert on the processed result (previously checked eval1, which
    # could not detect a processor dropping the eval_id)
    assert result.get_eval_id() == 20

    pipeline2 = [Aggregate(lambda x: x.product())]
    result2 = eval1.process(pipeline2)
    expected_df2 = pd.DataFrame([{"a": 60, "b": 120, "c": 120}])
    assert_frame_equal(result2.get_df(), expected_df2)
    assert result2.get_eval_id() == 20
def test_geomean_aggregate(self):
    """GeomeanAggregate collapses each column to its geometric mean."""
    frame = pd.DataFrame({
        "a": [1, 1, 3, 4, 5],
        "b": [1, 2, 3, 4, 5],
        "c": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(frame, eval_id=20)
    evaluation = evaluation.process([GeomeanAggregate()])
    gm_a = (1 * 1 * 3 * 4 * 5)**(1 / 5)
    gm_b = gm_c = (1 * 2 * 3 * 4 * 5)**(1 / 5)
    want = pd.DataFrame([{"a": gm_a, "b": gm_b, "c": gm_c}])
    assert_frame_equal(evaluation.get_df(), want)
    assert evaluation.get_eval_id() == 20
def test_minusone(self):
    """MinusOne decrements every value; the eval id is preserved."""
    evaluation = Evaluation(pd.DataFrame({"a": [1, 2, 3, 4, 5]}),
                            eval_id=10)
    # sanity check on the unprocessed column
    assert_series_equal(evaluation.get_df()["a"],
                        pd.Series([1, 2, 3, 4, 5], name="a"))
    processed = evaluation.process([MinusOne()])
    assert_series_equal(processed.get_df()["a"],
                        pd.Series([0, 1, 2, 3, 4], name="a"))
    assert processed.get_eval_id() == 10
def test_relativediff(self):
    """RelativeDiff computes (b - a) / a against the baseline evaluation."""
    baseline = pd.DataFrame({"a": [2, 5], "b": [2, 10]})
    measured = pd.DataFrame({"a": [4, 20], "b": [1, 1]})
    diff = Evaluation(measured).process(
        [RelativeDiff(Evaluation(baseline))])
    want = pd.DataFrame({"a": [1.0, 3.0], "b": [-0.5, -0.9]})
    assert_frame_equal(want, diff.get_df())
def test_colormapstyle_process(self):
    """ColorMapStyle maps each cell to a background-color CSS string."""
    palette = sns.diverging_palette(180, 0, s=75, l=75, sep=100,
                                    as_cmap=True)
    frame = pd.DataFrame({
        "a": [0.1, 0.4, 0.7],
        "b": [0.2, 0.5, 0.8],
        "c": [0.3, 0.6, 0.9],
    })
    styled = Evaluation(frame).process([ColorMapStyle(palette)]).get_df()
    want = pd.DataFrame({
        "a": ["background-color: #8ed8d0",
              "background-color: #f2f2f2",
              "background-color: #fbe8eb"],
        "b": ["background-color: #bde8e3",
              "background-color: #f2f2f2",
              "background-color: #f7d1d9"],
        "c": ["background-color: #eaf7f6",
              "background-color: #f2f2f2",
              "background-color: #f3bac5"],
    })
    assert_frame_equal(styled, want)
def test_cleanduplicates_sorting(self):
    """CleanDuplicates keeps the row selected by the sort direction."""
    frame = pd.DataFrame({
        "a": [1, 1, 3, 4, 5],
        "b": [1, 2, 3, 4, 5],
        "c": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(frame)

    # default sort on "c" removes index 1
    deduped = evaluation.process([CleanDuplicates(["a"], ["c"])]).get_df()
    assert_frame_equal(deduped, frame.drop(1), check_like=True)

    # reverse sort removes index 0 instead
    deduped = evaluation.process(
        [CleanDuplicates(["a"], ["c"], reverse_sort=True)]).get_df()
    assert_frame_equal(deduped,
                       frame.drop(0).sort_index(level=0, ascending=False),
                       check_like=True)
def test_cleanduplicates_no_duplicates(self):
    """CleanDuplicates leaves a frame without duplicates untouched."""
    frame = pd.DataFrame({
        "a": [1, 1, 3, 4, 5],
        "b": [1, 2, 3, 4, 5],
        "c": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(frame, eval_id=10)
    # column "b" holds no duplicates, so nothing should be dropped
    deduped = evaluation.process([CleanDuplicates(["b"])])
    assert_frame_equal(deduped.get_df(), frame, check_like=True)
    assert deduped.get_eval_id() == 10
def test_aggregate_exclude_nonnumeric(self):
    """ Check if aggregate processor excludes fields that are non-numeric """
    df = pd.DataFrame([
        {"a": 1, "b": 1, "c": "a"},
        {"a": 1, "b": 2, "c": "b"},
        {"a": 3, "b": 3, "c": "c"},
        {"a": 4, "b": 4, "c": "d"},
        {"a": 5, "b": 5, "c": "e"},
    ])
    eval1 = Evaluation(df, eval_id=20)
    pipeline = [Aggregate(lambda x: x.sum())]
    result = eval1.process(pipeline)
    # the non-numeric column "c" must be absent from the output
    expected_df = pd.DataFrame([{"a": 14, "b": 15}])
    assert_frame_equal(result.get_df(), expected_df)
    # assert on the processed result (previously checked eval1, which
    # could not detect a processor dropping the eval_id)
    assert result.get_eval_id() == 20
def test_cleanduplicates_multi_col(self):
    """No rows are dropped when the column combination is unique."""
    frame = pd.DataFrame({
        "a": [1, 1, 3, 4, 5],
        "b": [1, 2, 3, 4, 5],
        "c": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(frame)
    # ("a", "b") pairs are all distinct, so the frame is unchanged
    deduped = evaluation.process([CleanDuplicates(["a", "b"])]).get_df()
    assert_frame_equal(deduped, frame, check_like=True)