def process(self, input_eval: Evaluation) -> Evaluation:
    """Drop duplicate rows, optionally sorting first so the kept row is deterministic."""
    df = input_eval.get_df()
    if self._sort_col_names is not None:
        # Order the rows before deduplication so drop_duplicates keeps
        # the intended (first-after-sort) occurrence of each duplicate.
        df = df.sort_values(by=self._sort_col_names, ascending=self._reverse_sort)
    deduped = df.drop_duplicates(subset=self._duplicate_col_names)
    return Evaluation(deduped, input_eval.get_eval_id())
def process(self, b: Evaluation) -> Evaluation:
    """Return the relative difference of b's numeric columns against baseline a."""
    baseline = self.a.get_df().select_dtypes(include=[np.number])
    candidate = b.get_df().select_dtypes(include=[np.number])
    # Element-wise relative change: (new - old) / old.
    relative_change = (candidate - baseline) / baseline
    return Evaluation(relative_change)
def process(self, input_eval: Evaluation):
    """Select the rows matching self.index_value, keeping the DataFrame shape."""
    source_df = input_eval.get_df()
    idx = source_df.index
    if isinstance(idx, pd.MultiIndex):
        selected = source_df.xs(self.index_value, level=self.index_name)
    elif isinstance(idx, pd.Index):
        # Slice (rather than scalar-index) so the result stays a DataFrame.
        selected = source_df.loc[self.index_value:self.index_value]
    else:
        raise ValueError("Incompatible dataframe index.")
    return Evaluation(selected, input_eval.get_eval_id())
def test_comparetofirst_dir_subset(self):
    """ Test if CompareToFirst works with different direction and subset"""
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(df, eval_id=20)
    # Only column "a" is compared, and in the MINIMIZE direction.
    direction = {"a": Direction.MINIMIZE}
    evaluation = evaluation.process([CompareToFirst(direction)])
    expected_df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "a.relative": [1.0 / 1, 1.0 / 2, 1.0 / 3, 1.0 / 4, 1.0 / 5],
    })
    assert_frame_equal(evaluation.get_df(), expected_df)
    assert evaluation.get_eval_id() == 20
def test_standardizetypes(self):
    """ Test whether types are standardized """
    target_types = {"a": float}
    frame = pd.DataFrame({"a": [1, 2, 3, 4, 5]})
    evaluation = Evaluation(frame, eval_id=10)
    # Sanity check: the column starts out as an integer dtype.
    assert evaluation.get_df().dtypes["a"] == int
    processed = evaluation.process([StandardizeTypes(target_types)])
    assert processed.get_df().dtypes["a"] == float
    assert processed.get_eval_id() == 10
def process(self, input_eval: Evaluation) -> Evaluation:
    """Cast the evaluation's columns to the types configured in self.types.

    When a target type is int, the column is first cast to float so that
    string values such as "6.0" convert cleanly (int("6.0") would raise).

    Raises:
        KeyError: if a key in the types parameter is not a column of the
            evaluation's dataframe. The original pandas error is chained
            so the offending key remains visible in the traceback.
    """
    input_df = input_eval.get_df()
    try:
        # if int, we might need to convert to float first
        # (e.g. int(float("6.0")))
        if int in self.types.values():
            # create dictionary replacing int with float
            pre_df_types = {
                k: (v if v != int else float)
                for k, v in self.types.items()
            }
            input_df = input_df.astype(pre_df_types)
        new_df = input_df.astype(self.types)
    except KeyError as err:
        # Chain the cause instead of discarding it, so callers can see
        # which key pandas rejected.
        raise KeyError(
            "A key in the types parameter does not exist in the evaluation."
        ) from err
    return Evaluation(new_df, input_eval.get_eval_id())
def test_geomean_aggregate(self):
    """ Test built-in geomean aggregator """
    df = pd.DataFrame({
        "a": [1, 1, 3, 4, 5],
        "b": [1, 2, 3, 4, 5],
        "c": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(df, eval_id=20)
    evaluation = evaluation.process([GeomeanAggregate()])
    # Geometric mean = n-th root of the product of n values.
    expected_a = (1 * 1 * 3 * 4 * 5) ** (1 / 5)
    expected_b = expected_c = (1 * 2 * 3 * 4 * 5) ** (1 / 5)
    expected_df = pd.DataFrame({
        "a": [expected_a],
        "b": [expected_b],
        "c": [expected_c],
    })
    assert_frame_equal(evaluation.get_df(), expected_df)
    assert evaluation.get_eval_id() == 20
def test_minusone(self):
    """ Test whether all values are correctly changed """
    frame = pd.DataFrame({"a": [1, 2, 3, 4, 5]})
    evaluation = Evaluation(frame, eval_id=10)
    # Sanity check: values are untouched before the pipeline runs.
    assert_series_equal(
        evaluation.get_df()["a"], pd.Series([1, 2, 3, 4, 5], name="a")
    )
    processed = evaluation.process([MinusOne()])
    assert_series_equal(
        processed.get_df()["a"], pd.Series([0, 1, 2, 3, 4], name="a")
    )
    assert processed.get_eval_id() == 10
def process(self, input_eval: Evaluation):
    """Aggregate the fully-populated numeric columns into a single-row frame."""
    source_df = input_eval.get_df()
    # Restrict to numeric columns, then drop any column containing NaNs.
    usable_columns = (
        source_df.select_dtypes(include=['number']).dropna(axis=1).columns
    )
    aggregated_row = source_df[usable_columns].agg(self.func)
    return Evaluation(pd.DataFrame([aggregated_row]), input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Return a new evaluation whose frame has been normalized."""
    normalized = self._normalize(input_eval.get_df())
    return Evaluation(normalized, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Subtract one from every value in the evaluation's frame."""
    shifted_df = input_eval.get_df() - 1
    return Evaluation(shifted_df, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Normalize each group of rows (grouped by self._groupby) independently."""
    grouped = input_eval.get_df().groupby(self._groupby)
    normalized_df = grouped.apply(self._normalize_around)
    return Evaluation(normalized_df, input_eval.get_eval_id())
def test_comparetofirst_suffix(self):
    """ Test if CompareToFirst works with different suffix """
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(df, eval_id=20)
    direction = {"a": Direction.MAXIMIZE, "b": Direction.MAXIMIZE}
    # A custom ".diff" suffix replaces the default relative-column suffix.
    evaluation = evaluation.process([CompareToFirst(direction, suffix=".diff")])
    expected_df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "a.diff": [1.0 / 1, 2.0 / 1, 3.0 / 1, 4.0 / 1, 5.0 / 1],
        "b": [5, 4, 3, 2, 1],
        "b.diff": [5.0 / 5, 4.0 / 5, 3.0 / 5, 2.0 / 5, 1.0 / 5],
    })
    assert_frame_equal(evaluation.get_df(), expected_df)
    assert evaluation.get_eval_id() == 20
def process(self, input_eval: Evaluation) -> Evaluation:
    """Re-index the evaluation's frame using the configured column names."""
    reindexed_df = input_eval.get_df().set_index(self._reindex_names)
    return Evaluation(reindexed_df, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Apply the expansion function to each group keyed by the input column."""
    grouped = input_eval.get_df().groupby(self._input_col_name)
    expanded_df = grouped.apply(self._expansion)
    return Evaluation(expanded_df, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Add relative-comparison columns computed against the first row."""
    compared_df = self._compare_to_first(input_eval.get_df())
    return Evaluation(compared_df, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Sort the evaluation's frame by the configured index level(s)."""
    sorted_df = input_eval.get_df().sort_index(level=self._sort_names)
    return Evaluation(sorted_df, input_eval.get_eval_id())