class TestMutate(TestCase):
    """Tests for the Mutate operation (computed columns, lag and lead terms)."""

    def setUp(self):
        """Create a ten-row fixture with three categories and mixed-type columns."""
        columns = {
            'Cat': ["A", "A", "A", "B", "B", "B", "B", "C", "C", "C"],
            'Date': [
                "2021-03-01", "2021-03-02", "2021-03-03",
                "2021-03-01", "2021-03-02", "2021-03-03", "2021-03-04",
                "2021-03-01", "2021-03-02", "2021-03-03",
            ],
            'Var1': [10, 14, 13, 22, 28, 23, 30, 6, 8, 9],
            'Var2': [250, 245, 209, 360, 328, 359, 372, 74, 78, 80],
            # Integers interleaved with missing values.
            'VarNone': [87, 99, None, 120, 128, None, 99, None, None, None],
            # Integers, None and the literal string "Null".
            'VarMixed': [-3, 2, "Null", 0, None, "Null", 5, None, "Null", "Null"],
            # Numbers stored as text, except the last two which are floats.
            'VarTextNum': ["5", "8", "-1", "0", "7.4", "11.9", "-8.44", "5", -4.8, 9.2],
        }
        self.df = DataFrame(columns)

    def test_compute_var(self):
        """Var3 is the row-wise sum of Var1 and Var2."""
        result = Mutate(Var3="Var1+Var2").apply(self.df)
        expected = [260, 259, 222, 382, 356, 382, 402, 80, 86, 89]
        self.assertEqual(expected, result['Var3'])

    def test_vars_with_lag(self):
        """Without groups, the lag term has no value on the first row only."""
        result = Mutate(Var4="Var1-lag_Var1").apply(self.df)
        expected = [None, 4, -1, 9, 6, -5, 7, -24, 2, 1]
        self.assertEqual(expected, result['Var4'])

    def test_leads_with_groups(self):
        """Grouped by 'Cat', the lead term has no value on each group's last row."""
        self.df.add_group('Cat')
        result = Mutate(Var5='Var2+lead_Var2').apply(self.df)
        expected = [495, 454, None, 688, 687, 731, None, 152, 158, None]
        self.assertEqual(expected, result['Var5'])
def apply(self, df):
    """
    Build a new DataFrame holding only the variables listed in ``self.vars``.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to read the columns from

    Returns
    -------
    DataFrame
        A DataFrame with the selected columns, grouped by those grouping
        variables of `df` that are part of the selection
    """
    selection = DataFrame()
    for name in self.vars:
        selection.add_column(name, df[name])

    # Only grouping variables that survive the selection can still be used.
    surviving_groups = []
    for group_var in df.groups_vars:
        if group_var in self.vars:
            surviving_groups.append(group_var)

    return GroupBy(*surviving_groups).apply(selection)
def setUp(self):
    """Create a ten-row fixture: two 'Dep' values with five dated rows each."""
    columns = {
        'Dep': ["1", "1", "1", "1", "1", "2", "2", "2", "2", "2"],
        'Jour': [
            "2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04", "2021-01-05",
            "2020-11-16", "2020-11-17", "2020-11-18", "2020-11-19", "2020-11-20",
        ],
        'Var': [248, 245, 209, 359, 326, 86, 92, 74, 80, 77],
        # Integers interleaved with missing values.
        'VarNone': [87, 99, None, 120, 128, None, 99, None, None, None],
        # Integers, None and the literal string "Null".
        'VarMixed': [-3, 2, "Null", 0, None, "Null", 5, None, "Null", "Null"],
        # Numbers stored as text, except the last two which are floats.
        'VarTextNum': ["5", "8", "-1", "0", "7.4", "11.9", "-8.44", "5", -4.8, 9.2],
    }
    self.df = DataFrame(columns)
def setUp(self):
    """Create a ten-row fixture with three categories and mixed-type columns."""
    columns = {
        'Cat': ["A", "A", "A", "B", "B", "B", "B", "C", "C", "C"],
        'Date': [
            "2021-03-01", "2021-03-02", "2021-03-03",
            "2021-03-01", "2021-03-02", "2021-03-03", "2021-03-04",
            "2021-03-01", "2021-03-02", "2021-03-03",
        ],
        'Var1': [10, 14, 13, 22, 28, 23, 30, 6, 8, 9],
        'Var2': [250, 245, 209, 360, 328, 359, 372, 74, 78, 80],
        # Integers interleaved with missing values.
        'VarNone': [87, 99, None, 120, 128, None, 99, None, None, None],
        # Integers, None and the literal string "Null".
        'VarMixed': [-3, 2, "Null", 0, None, "Null", 5, None, "Null", "Null"],
        # Numbers stored as text, except the last two which are floats.
        'VarTextNum': ["5", "8", "-1", "0", "7.4", "11.9", "-8.44", "5", -4.8, 9.2],
    }
    self.df = DataFrame(columns)
def setUp(self):
    """
    Create two ten-row tables with different key column names.

    `df1` is keyed by 'Reg' (the value "9" appears twice) and `df2` by
    'Region' (the value "5" appears twice, "6" is absent and "10" has no
    counterpart in `df1`). Both tables carry a 'Var1' column.
    """
    left_columns = {
        'Reg': ["1", "2", "3", "4", "5", "6", "7", "8", "9", "9"],
        'Var1': [10, 14, 13, 22, 28, 23, 30, 6, 8, 9],
        'Var2': [250, 245, 209, 360, 328, 359, 372, 74, 78, 80],
        # Integers interleaved with missing values.
        'VarNone': [87, 99, None, 120, 128, None, 99, None, None, None],
        # Integers, None and the literal string "Null".
        'VarMixed': [-3, 2, "Null", 0, None, "Null", 5, None, "Null", "Null"],
        # Numbers stored as text, except the last two which are floats.
        'VarTextNum': ["5", "8", "-1", "0", "7.4", "11.9", "-8.44", "5", -4.8, 9.2],
    }
    right_columns = {
        'Region': ["1", "2", "3", "4", "5", "5", "7", "8", "9", "10"],
        'Nom': ["A", "B", "C", "D", "E", "Ebis", "G", "H", "I", "J"],
        'Var1': [22, 44, 84, 16, 7, 99, 11, 14, 29, 22],
    }
    self.df1 = DataFrame(left_columns)
    self.df2 = DataFrame(right_columns)
def setUp(self):
    """Create a ten-row fixture and declare 'Cat' as its grouping variable."""
    columns = {
        'Cat': ["A", "A", "A", "B", "B", "B", "B", "C", "C", "C"],
        'Date': [
            "2021-03-01", "2021-03-02", "2021-03-03",
            "2021-03-01", "2021-03-02", "2021-03-03", "2021-03-04",
            "2021-03-01", "2021-03-02", "2021-03-03",
        ],
        'Var1': [10, 14, 13, 22, 28, 23, 30, 6, 8, 9],
        'Var2': [250, 245, 209, 360, 328, 359, 372, 74, 78, 80],
    }
    self.df = DataFrame(columns)
    self.df.add_group('Cat')
def setUp(self):
    """Create an eight-row fixture; 'Var3' copies 'Var2' with two irregular cells."""
    columns = {
        "Cat": ["A", "A", "A", "B", "B", "B", "C", "C"],
        "Var1": [348, 402, 397, 380, 589, 520, 620, 289],
        "Var2": [74, 81, 85, 71, 102, 99, 101, 76],
        # Contains one None and one "NaN" string among the integers.
        "Var3": [74, None, 85, 71, "NaN", 99, 101, 76],
    }
    self.df = DataFrame(columns)
class TestUngroup(TestCase):
    """Tests for the Ungroup operation."""

    def setUp(self):
        """Create a ten-row fixture and declare 'Cat' as its grouping variable."""
        columns = {
            'Cat': ["A", "A", "A", "B", "B", "B", "B", "C", "C", "C"],
            'Date': [
                "2021-03-01", "2021-03-02", "2021-03-03",
                "2021-03-01", "2021-03-02", "2021-03-03", "2021-03-04",
                "2021-03-01", "2021-03-02", "2021-03-03",
            ],
            'Var1': [10, 14, 13, 22, 28, 23, 30, 6, 8, 9],
            'Var2': [250, 245, 209, 360, 328, 359, 372, 74, 78, 80],
        }
        self.df = DataFrame(columns)
        self.df.add_group('Cat')

    def test_ungroup(self):
        """After Ungroup, each of the ten rows has group value 0."""
        expected_groups = [0] * 10
        ungrouped = Ungroup().apply(self.df)
        self.assertEqual(expected_groups, ungrouped.groups)
def setUp(self):
    """Create a ten-row fixture whose numeric columns carry many decimals."""
    columns = {
        'Cat': ["A", "A", "A", "B", "B", "B", "B", "C", "C", "C"],
        'Date': [
            "2021-03-01", "2021-03-02", "2021-03-03",
            "2021-03-01", "2021-03-02", "2021-03-03", "2021-03-04",
            "2021-03-01", "2021-03-02", "2021-03-03",
        ],
        'Var1': [10.1273, 14.26834, 13.9122, 22.74, 28.09, 23.1, 30, 6.06, 8.57, 9.532],
        'Var2': [
            250.1823, 245.682109, 209.0111111111, 360.873421, 328.09534163,
            359.07426, 372.07227173, 74, 78, 80,
        ],
        # Numbers interleaved with missing values.
        'VarNone': [87.09912, 99.102, None, 120, 128.88, None, 99.64, None, None, None],
        # Integers, None and the literal string "Null".
        'VarMixed': [-3, 2, "Null", 0, None, "Null", 5, None, "Null", "Null"],
        # Numbers stored as text, except the last two which are floats.
        'VarTextNum': ["5", "8", "-1.54", "0.1", "7.4", "11.9", "-8.644", "5", -4.8, 9.2],
    }
    self.df = DataFrame(columns)
def setUp(self):
    """Create a twenty-row fixture with two numeric columns, 'X' and 'Y'."""
    x_values = [
        2, 4, 3.5, -3, -1.5, 1, 0.75, -2, -0.5, -3.5,
        -5, 2.5, 5, -1, 3.5, -1.75, 3, 4, -3.5, -4,
    ]
    y_values = [
        1, 3, -2.5, 2.5, -3, 5, -0.5, 4, 1, -2.5,
        -4, -4.5, 0.75, 1.5, -5, -2.5, 2.5, -2, 2, -1.25,
    ]
    self.df = DataFrame({'X': x_values, 'Y': y_values})
class TestMovingAverage(TestCase):
    """Tests for the MovingAverage operation, with and without groups."""

    def setUp(self):
        """Create a ten-row fixture: two 'Dep' values with five dated rows each."""
        columns = {
            'Dep': ["1", "1", "1", "1", "1", "2", "2", "2", "2", "2"],
            'Jour': [
                "2021-01-01", "2021-01-02", "2021-01-03", "2021-01-04", "2021-01-05",
                "2020-11-16", "2020-11-17", "2020-11-18", "2020-11-19", "2020-11-20",
            ],
            'Var': [248, 245, 209, 359, 326, 86, 92, 74, 80, 77],
            # Integers interleaved with missing values.
            'VarNone': [87, 99, None, 120, 128, None, 99, None, None, None],
            # Integers, None and the literal string "Null".
            'VarMixed': [-3, 2, "Null", 0, None, "Null", 5, None, "Null", "Null"],
            # Numbers stored as text, except the last two which are floats.
            'VarTextNum': ["5", "8", "-1", "0", "7.4", "11.9", "-8.44", "5", -4.8, 9.2],
        }
        self.df = DataFrame(columns)

    def test_ungroup_ma(self):
        """Without groups, only the first two rows of the result lack an average."""
        result = MovingAverage(3, 'Jour', 'Var').apply(self.df)
        expected = [None, None, 84, 82, 77, 135, 190, 234, 271, 298]
        self.assertEqual(expected, result['Var_MA3'])

    def test_group_ma(self):
        """Grouped by 'Dep', the first two rows of each group lack an average."""
        self.df.add_group('Dep')
        result = MovingAverage(3, 'Jour', 'Var').apply(self.df)
        expected = [None, None, 234, 271, 298, None, None, 84, 82, 77]
        self.assertEqual(expected, result['Var_MA3'])
def _operation(self, df):
    """
    Keep the rows of `df` that satisfy every stored criterion.

    Each criterion is a string appended to its variable name
    (``"<var> <criterion>"``) and evaluated against the row's values.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to filter

    Returns
    -------
    DataFrame
        A DataFrame with the same variables as `df` and only the rows for
        which all criteria evaluated to True

    Raises
    ------
    KeyError
        If a criterion targets a variable that is not in `df`
    TypeError
        If a criterion evaluates to something other than a bool
    """
    vars_with_criterion = list(self.__criteria.keys())
    missing = [var for var in vars_with_criterion if var not in df.vars]
    if missing:
        raise KeyError(
            "Unknown variable(s) in filter criteria: "
            + ", ".join(str(var) for var in missing))

    result = DataFrame()
    for var in df.vars:
        result.add_column(var)

    for i in range(len(df)):
        row_dict = df.row_as_dict(i)
        add_row = True
        # Every criterion is evaluated, even once the row is already
        # rejected, so that an ill-formed criterion is always reported.
        for var in vars_with_criterion:
            expression = str(var) + " " + str(self.__criteria[var])
            try:
                # SECURITY NOTE: `eval` runs the criterion string as Python
                # code. Removing the builtins limits what is reachable but
                # is not a sandbox; criteria must come from trusted code.
                test_result = eval(
                    expression, {"__builtins__": {}}, row_dict)
            except TypeError:
                # Values that cannot be compared (e.g. None against a
                # number) simply fail the criterion.
                test_result = False
            if not isinstance(test_result, bool):
                raise TypeError(
                    "Criterion on '" + str(var)
                    + "' must evaluate to a bool, got "
                    + type(test_result).__name__)
            add_row = add_row and test_result
        if add_row:
            result.add_row(df[None, i])
    return result
def _operation(self, group_df):
    """
    Sort the rows of one group according to ``self.vars``.

    Each sort variable is translated to its 1-based column position in
    `group_df`; a variable written with the ``desc_`` prefix is passed as
    the negated position. The ordering itself is delegated to
    ``Sort.__merge_sort`` (the negative sign presumably requests
    descending order -- confirm against that helper).

    Parameters
    ----------
    group_df : DataFrame
        The rows of a single group

    Returns
    -------
    DataFrame
        A DataFrame with the same variables as `group_df` and its rows in
        sorted order

    Raises
    ------
    ValueError
        If a sort variable is not a variable of `group_df`
    """
    nested_group_list = list(group_df)

    index_criteria = []
    for var in self.vars:
        is_desc = var.startswith("desc_")
        if is_desc:
            var = var[5:]  # strip the "desc_" prefix
        # Positions are 1-based so that the sign can carry the direction
        # (a 0 index could not be negated).
        index_criterion = group_df.vars.index(var) + 1
        index_criteria.append(-index_criterion if is_desc else index_criterion)

    default_recursion_limit = getrecursionlimit()
    # Only ever raise the limit: lowering it for a small group could make
    # setrecursionlimit itself fail at the current call depth.
    setrecursionlimit(max(default_recursion_limit, len(group_df) + 10))
    try:
        sorted_group_list = Sort.__merge_sort(nested_group_list, index_criteria)
    finally:
        # Restore the interpreter-wide setting even if the sort raises.
        setrecursionlimit(default_recursion_limit)

    result = DataFrame()
    for var in group_df.vars:
        result.add_column(var)
    for row in sorted_group_list:
        result.add_row(row)
    return result
def import_json(path, root=None, encoding=None):
    """
    Imports a JSON file as DataFrame.

    The file must hold an object whose root nodes each map to a list of
    records (objects). Columns are the union of the records' keys, in order
    of first appearance; a record lacking a key gets None for that column.

    Parameters
    ----------
    path : str
        Absolute or relative path to the JSON file to import
    root : str = None
        Name of the root's node to import ; if None, imports the first root
        node of the file. Ignored when the file has a single root node.
    encoding : str = None
        Specify the file's encoding ; if None, the platform default is used

    Returns
    -------
    DataFrame
        A DataFrame with the contents of the JSON file (empty if the
        selected root node holds no record)

    Raises
    ------
    KeyError
        If the file has several root nodes and `root` is not one of them
    """
    with open(path, encoding=encoding) as jsonfile:
        data = json.load(jsonfile)

    roots = list(data.keys())
    if len(roots) == 1 or root is None:
        root = roots[0]
    elif root not in roots:
        raise KeyError(root)
    table = data[root]

    # Collect the columns over all records rather than the first one only,
    # so that no key is lost and an empty table yields an empty DataFrame.
    table_vars = list(dict.fromkeys(key for row in table for key in row))

    df = DataFrame()
    for var in table_vars:
        df.add_column(var)
    for row in table:
        # Align values by key: records are not guaranteed to list their
        # keys in the same order.
        df.add_row([row.get(var) for var in table_vars])
    return df
def apply(self, df):
    """
    Run ``self._operation`` on every group of `df` and stack the results.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to transform, group by group

    Returns
    -------
    DataFrame
        The concatenation of the non-empty transformed groups, grouped by
        the same variables as `df`
    """
    combined = DataFrame()
    for group_df in df.groups_df:
        transformed = self._operation(group_df)
        if len(transformed) == 0:
            # Empty groups contribute nothing.
            continue
        if len(combined.vars) == 0:
            # The first non-empty group defines the output columns.
            for name in transformed.vars:
                combined.add_column(name)
        for row in transformed:
            combined.add_row(row)
    # NOTE(review): if every group is empty, `combined` has no column at
    # this point -- confirm GroupBy copes with that.
    return GroupBy(*df.groups_vars).apply(combined)
def import_csv(path, headers=True, delimiter=";", encoding='ISO-8859-1'):
    """
    Imports a CSV file as DataFrame

    Parameters
    ----------
    path : str
        Absolute or relative path to the CSV file to import
    headers : bool = True
        Specify if the file have headers ; if False, columns are named
        "Var0", "Var1", ... and the first line is imported as data
    delimiter : str = ";"
        Specify the file's delimiter
    encoding : str = 'ISO-8859-1'
        Specify the file's encoding

    Returns
    -------
    DataFrame
        A DataFrame with the contents of the CSV file (empty if the file
        has no line)
    """
    df = DataFrame()
    with open(path, newline='', encoding=encoding) as csv_file:
        reader = csv.reader(csv_file, delimiter=delimiter)
        first_row = next(reader, None)
        if first_row is None:
            return df
        if headers:
            for var in first_row:
                df.add_column(var)
        else:
            for i in range(len(first_row)):
                df.add_column("Var" + str(i))
            # Without headers the first line is data and must be kept.
            df.add_row(first_row)
        for row in reader:
            df.add_row(row)
    return df
def apply(self, df):
    """
    Join `df` with the other DataFrame held by this operation.

    ``self.__matches`` maps a variable of the other DataFrame to the
    variable of `df` it must equal. Every row of `df` is kept: it is
    repeated once per matching row of the other DataFrame, or completed
    with None when nothing matches.

    Key values are compared as text (the value from `df` is converted with
    ``str``). NOTE(review): non-text values in the other DataFrame's key
    columns will presumably never match -- confirm against Filter.

    Parameters
    ----------
    df : DataFrame
        The DataFrame whose rows are all kept

    Returns
    -------
    DataFrame
        The variables of `df` followed by the non-key variables of the
        other DataFrame; the latter are prefixed with "Y_" when their name
        already exists in `df`
    """
    match_keys = list(self.__matches.keys())
    other_vars = [var for var in self.__other.vars if var not in match_keys]

    result = DataFrame()
    for var in df.vars:
        result.add_column(var)
    for var in other_vars:
        if var in df.vars:
            result.add_column("Y_" + str(var))
        else:
            result.add_column(var)

    # Lookups already done, keyed by the tuple of key values. A tuple
    # cannot collide the way a concatenated string can.
    known_matches = {}
    for i in range(len(df)):
        base_row = df[None, i]
        target_values = tuple(
            str(df[self.__matches[key], i]) for key in match_keys)

        if target_values not in known_matches:
            # repr() yields a properly escaped string literal, so quotes,
            # backslashes or newlines in a value cannot alter the criterion.
            filter_kw = {
                key: "==" + repr(value)
                for key, value in zip(match_keys, target_values)
            }
            matches = Filter(**filter_kw).apply(self.__other)
            if len(matches) == 0:
                other_content = [None] * len(other_vars)
            else:
                other_content = Select(*other_vars).apply(matches)
            known_matches[target_values] = other_content
        else:
            other_content = known_matches[target_values]

        if isinstance(other_content, DataFrame):
            # One output row per matching row of the other DataFrame.
            for row in other_content:
                new_row = deepcopy(base_row)
                new_row.extend(row)
                result.add_row(new_row)
        else:
            # No match: pad the row with None.
            new_row = deepcopy(base_row)
            new_row.extend(other_content)
            result.add_row(new_row)
    return result
def apply(self, df):
    """
    Reduce every group of `df` to a single row with ``self._operation``.

    For each group the row starts with the group's values of the grouping
    variables, followed by the outcome of ``self._operation`` on each
    variable of ``self.vars``. When the outcome is a dict, the variable's
    column is replaced by one column per key, named ``<var>_<key>``.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to aggregate

    Returns
    -------
    DataFrame
        One row per group, grouped by all but the last grouping variable
        of `df`
    """
    list_vars = [*df.groups_vars, *self.vars]
    df = Select(*list_vars).apply(df)

    result = DataFrame()
    for name in list_vars:
        result.add_column(name)

    for group_df in df.groups_df:
        row = [group_df[group_var, 0] for group_var in df.groups_vars]
        for var in self.vars:
            values = group_df[var]
            if self.__del_na:
                values = [val for val in values if val is not None]
            if self.__del_nan:
                values = [val for val in values if isinstance(val, Number)]
            summary = self._operation(values)

            if not isinstance(summary, dict):
                row.append(summary)
                continue

            keys = list(summary.keys())
            if (var + "_" + keys[0]) not in result.vars:
                # First dict outcome for this variable: insert one column
                # per key right after the variable's column, then drop it.
                previous = var
                for key in keys:
                    expanded = var + "_" + key
                    result.add_column(expanded, after=previous)
                    previous = expanded
                result.del_column(var)
            row.extend(summary.values())
        result.add_row(row)

    return GroupBy(*df.groups_vars[:-1]).apply(result)
def setUp(self):
    """Create an eight-row fixture: constant 'Var1', 'Var2' with few distinct values."""
    columns = {
        "Cat": ["A", "A", "A", "A", "B", "B", "B", "B"],
        "Var1": [10, 10, 10, 10, 10, 10, 10, 10],
        "Var2": [10, 10, 11, 11, 13, 13, 13, 13],
    }
    self.df = DataFrame(columns)
def test_init_emptyDataFrame(self):
    """A DataFrame built without arguments has no row and no variable."""
    empty = DataFrame()
    self.assertIsInstance(empty, DataFrame)
    row_count = len(empty)
    self.assertEqual(row_count, 0)
    var_count = len(empty.vars)
    self.assertEqual(var_count, 0)
def setUp(self):
    """Create an eight-row fixture with three categories and two numeric columns."""
    columns = {
        "Cat": ["A", "A", "A", "B", "B", "B", "C", "C"],
        "Var1": [98, 100, 102, 100, 200, 150, 620, 40],
        "Var2": [74, 81, 85, 71, 103, 99, 101, 76],
    }
    self.df = DataFrame(columns)