def test_evaluation_get_eval_id(self):
    """get_eval_id() should return the eval_id specified when initialized"""
    frame = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
    # Cover both a nonzero id and the falsy id 0.
    for expected_id in (1000, 0):
        evaluation = Evaluation(frame, eval_id=expected_id)
        assert evaluation.get_eval_id() == expected_id
def process(self, input_eval: Evaluation) -> Evaluation:
    """Drop duplicate rows from the evaluation's dataframe.

    When sort columns are configured, rows are sorted first so which
    duplicate survives is deterministic (``drop_duplicates`` keeps the
    first occurrence). The eval id is carried over unchanged.
    """
    df = input_eval.get_df()
    if self._sort_col_names is not None:
        # Order rows before deduplicating so the kept row is well-defined.
        df = df.sort_values(by=self._sort_col_names, ascending=self._reverse_sort)
    deduped = df.drop_duplicates(subset=self._duplicate_col_names)
    return Evaluation(deduped, input_eval.get_eval_id())
def process(self, input_eval: Evaluation):
    """Select the rows matching the configured index value.

    Raises:
        ValueError: If the dataframe's index is neither a MultiIndex
            nor a plain Index.
    """
    source_df = input_eval.get_df()
    index = source_df.index
    if isinstance(index, pd.MultiIndex):
        selected = source_df.xs(self.index_value, level=self.index_name)
    elif isinstance(index, pd.Index):
        # slicing instead of indexing to maintain shape
        selected = source_df.loc[self.index_value:self.index_value]
    else:
        raise ValueError("Incompatible dataframe index.")
    return Evaluation(selected, input_eval.get_eval_id())
def test_comparetofirst_dir_subset(self):
    """Test if CompareToFirst works with different direction and subset"""
    source = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1]})
    evaluation = Evaluation(source, eval_id=20)
    # Only column "a" is configured, so "b" is dropped from the output.
    processed = evaluation.process([CompareToFirst({"a": Direction.MINIMIZE})])
    expected = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "a.relative": [1.0 / 1, 1.0 / 2, 1.0 / 3, 1.0 / 4, 1.0 / 5],
    })
    assert_frame_equal(processed.get_df(), expected)
    assert processed.get_eval_id() == 20
def process(self, input_eval: Evaluation) -> Evaluation:
    """Cast the evaluation's columns to the configured dtypes.

    Columns targeted at ``int`` are first cast through ``float`` so that
    string values such as ``"6.0"`` convert cleanly — ``int("6.0")``
    raises, while ``int(float("6.0"))`` succeeds.

    Raises:
        KeyError: If a key in the types parameter is not a column of the
            evaluation's dataframe.
    """
    input_df = input_eval.get_df()
    try:
        # if int, we might need to convert to float first
        # (e.g. int(float("6.0")))
        if int in self.types.values():
            # create dictionary replacing int with float
            pre_df_types = {
                k: (float if v is int else v) for k, v in self.types.items()
            }
            input_df = input_df.astype(pre_df_types)
        new_df = input_df.astype(self.types)
    except KeyError as err:
        # Chain the original pandas error so the offending key stays visible.
        raise KeyError(
            "A key in the types parameter does not exist in the evaluation."
        ) from err
    return Evaluation(new_df, input_eval.get_eval_id())
def test_aggregate(self):
    """Test aggregate processor with custom aggregator functions"""
    source = pd.DataFrame({
        "a": [1, 1, 3, 4, 5],
        "b": [1, 2, 3, 4, 5],
        "c": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(source, eval_id=20)

    summed = evaluation.process([Aggregate(lambda col: col.sum())])
    assert_frame_equal(summed.get_df(), pd.DataFrame([{"a": 14, "b": 15, "c": 15}]))
    assert evaluation.get_eval_id() == 20

    multiplied = evaluation.process([Aggregate(lambda col: col.product())])
    assert_frame_equal(multiplied.get_df(), pd.DataFrame([{"a": 60, "b": 120, "c": 120}]))
    assert multiplied.get_eval_id() == 20
def test_geomean_aggregate(self):
    """Test built-in geomean aggregator"""
    source = pd.DataFrame({
        "a": [1, 1, 3, 4, 5],
        "b": [1, 2, 3, 4, 5],
        "c": [5, 4, 3, 2, 1],
    })
    evaluation = Evaluation(source, eval_id=20)
    result = evaluation.process([GeomeanAggregate()])
    # Geometric mean: nth root of the product of the n column values.
    expected_a = (1 * 1 * 3 * 4 * 5) ** (1 / 5)
    expected_bc = (1 * 2 * 3 * 4 * 5) ** (1 / 5)
    expected = pd.DataFrame([{"a": expected_a, "b": expected_bc, "c": expected_bc}])
    assert_frame_equal(result.get_df(), expected)
    assert result.get_eval_id() == 20
def test_aggregate_exclude_nonnumeric(self):
    """Check if aggregate processor excludes fields that are non-numeric"""
    source = pd.DataFrame({
        "a": [1, 1, 3, 4, 5],
        "b": [1, 2, 3, 4, 5],
        "c": ["a", "b", "c", "d", "e"],
    })
    evaluation = Evaluation(source, eval_id=20)
    result = evaluation.process([Aggregate(lambda col: col.sum())])
    # The string column "c" must not appear in the aggregated output.
    assert_frame_equal(result.get_df(), pd.DataFrame([{"a": 14, "b": 15}]))
    assert evaluation.get_eval_id() == 20
def process(self, input_eval: Evaluation) -> Evaluation:
    """Sort the dataframe by the configured index level(s), keeping the eval id."""
    sorted_df = input_eval.get_df().sort_index(level=self._sort_names)
    return Evaluation(sorted_df, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Apply the compare-to-first transformation, keeping the eval id."""
    compared = self._compare_to_first(input_eval.get_df())
    return Evaluation(compared, input_eval.get_eval_id())
def process(self, input_eval: Evaluation):
    """Aggregate the numeric columns into a single-row dataframe.

    Non-numeric columns, and numeric columns containing NaN, are
    excluded before ``self.func`` is applied.
    """
    source_df = input_eval.get_df()
    numeric_cols = source_df.select_dtypes(include=['number']).dropna(axis=1).columns
    aggregated = pd.DataFrame([source_df[numeric_cols].agg(self.func)])
    return Evaluation(aggregated, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Normalize the evaluation's dataframe, keeping the eval id."""
    normalized = self._normalize(input_eval.get_df())
    return Evaluation(normalized, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Subtract 1 from every value of the dataframe, keeping the eval id."""
    decremented = input_eval.get_df() - 1
    return Evaluation(decremented, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Apply ``_normalize_around`` to each group of rows.

    Rows are grouped by ``self._groupby``; the result keeps the
    original eval id.
    """
    grouped = input_eval.get_df().groupby(self._groupby)
    normalized = grouped.apply(self._normalize_around)
    return Evaluation(normalized, input_eval.get_eval_id())
def test_comparetofirst_suffix(self):
    """Test if CompareToFirst works with different suffix"""
    a_values = [1, 2, 3, 4, 5]
    b_values = [5, 4, 3, 2, 1]
    source = pd.DataFrame({"a": a_values, "b": b_values})
    evaluation = Evaluation(source, eval_id=20)
    directions = {"a": Direction.MAXIMIZE, "b": Direction.MAXIMIZE}
    result = evaluation.process([CompareToFirst(directions, suffix=".diff")])
    # Each value is compared against the first row (a: 1, b: 5).
    expected = pd.DataFrame({
        "a": a_values,
        "a.diff": [value / 1.0 for value in a_values],
        "b": b_values,
        "b.diff": [value / 5.0 for value in b_values],
    })
    assert_frame_equal(result.get_df(), expected)
    assert result.get_eval_id() == 20
def process(self, input_eval: Evaluation) -> Evaluation:
    """Set the configured column(s) as the dataframe index, keeping the eval id."""
    reindexed = input_eval.get_df().set_index(self._reindex_names)
    return Evaluation(reindexed, input_eval.get_eval_id())
def process(self, input_eval: Evaluation) -> Evaluation:
    """Apply the expansion function to each group keyed by the input column."""
    grouped = input_eval.get_df().groupby(self._input_col_name)
    expanded = grouped.apply(self._expansion)
    return Evaluation(expanded, input_eval.get_eval_id())