def test_max_var1_group_by(self):
    """Check the ``Var1_Max`` column produced by ``Max("Var1")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    maxima = Pipeline(GroupBy("Cat"), Max("Var1")).apply(self.df)["Var1_Max"]
    self.assertEqual(maxima[0], 402)
    self.assertEqual(maxima[1], 589)
    self.assertEqual(maxima[2], 620)
def test_min_var2_group_by(self):
    """Check the ``Var2_Min`` column produced by ``Min("Var2")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    minima = Pipeline(GroupBy("Cat"), Min("Var2")).apply(self.df)["Var2_Min"]
    self.assertEqual(minima[0], 74)
    self.assertEqual(minima[1], 71)
    self.assertEqual(minima[2], 76)
def test_count_var2_group_by(self):
    """Check the ``Var2_Count`` column produced by ``Count("Var2")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    counts = Pipeline(GroupBy("Cat"), Count("Var2")).apply(self.df)["Var2_Count"]
    self.assertEqual(counts[0], 3)
    self.assertEqual(counts[1], 3)
    self.assertEqual(counts[2], 2)
def test_min_var1_group_by(self):
    """Check the ``Var1_Min`` column produced by ``Min("Var1")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    minima = Pipeline(GroupBy("Cat"), Min("Var1")).apply(self.df)["Var1_Min"]
    self.assertEqual(minima[0], 348)
    self.assertEqual(minima[1], 380)
    self.assertEqual(minima[2], 289)
def test_average_var2_group_by(self):
    """Check the ``Var2_Average`` column produced by ``Average("Var2")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    averages = Pipeline(GroupBy("Cat"), Average("Var2")).apply(self.df)["Var2_Average"]
    # Averages are floating-point results; compare with a tolerance rather
    # than exact equality so rounding in the implementation cannot fail the test.
    self.assertAlmostEqual(averages[0], 80.0)
    self.assertAlmostEqual(averages[1], 91.0)
    self.assertAlmostEqual(averages[2], 88.5)
def test_average_var1_group_by(self):
    """Check the ``Var1_Average`` column produced by ``Average("Var1")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    averages = Pipeline(GroupBy("Cat"), Average("Var1")).apply(self.df)["Var1_Average"]
    # Averages are floating-point results; compare with a tolerance rather
    # than exact equality so rounding in the implementation cannot fail the test.
    self.assertAlmostEqual(averages[0], 100.0)
    self.assertAlmostEqual(averages[1], 150.0)
    self.assertAlmostEqual(averages[2], 330.0)
def test_max_var2_group_by(self):
    """Check the ``Var2_Max`` column produced by ``Max("Var2")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    maxima = Pipeline(GroupBy("Cat"), Max("Var2")).apply(self.df)["Var2_Max"]
    self.assertEqual(maxima[0], 85)
    self.assertEqual(maxima[1], 102)
    self.assertEqual(maxima[2], 101)
def apply(self, df):
    """Summarise ``self.vars`` of ``df``, producing one output row per group.

    The frame is first reduced (via ``Select``) to its grouping variables
    followed by ``self.vars``. For every group in ``df.groups_df`` a row is
    built from the group's value of each grouping variable (read at index 0)
    followed by the result of ``self._operation`` on each summarised column.
    When ``_operation`` returns a dict, the variable's column is replaced by
    one column per key, named ``<var>_<key>``, and the dict values are
    appended to the row in order.

    :param df: data frame exposing ``groups_vars`` and ``groups_df``
    :return: the summary frame after ``GroupBy`` is applied with all but the
        last grouping variable
    """
    kept_vars = [*df.groups_vars, *self.vars]
    df = Select(*kept_vars).apply(df)

    summary = DataFrame()
    for name in kept_vars:
        summary.add_column(name)

    for group_df in df.groups_df:
        # The grouping values open the row, in the order of the grouping variables.
        row = [group_df[key, 0] for key in df.groups_vars]

        for var in self.vars:
            values = group_df[var]
            if self.__del_na:
                values = [item for item in values if item is not None]
            if self.__del_nan:
                values = [item for item in values if isinstance(item, Number)]

            outcome = self._operation(values)
            if not isinstance(outcome, dict):
                row.append(outcome)
                continue

            keys = list(outcome.keys())
            # Expand the placeholder column only once: on later groups the
            # "<var>_<key>" columns already exist.
            if (var + "_" + keys[0]) not in summary.vars:
                previous = var
                for key in keys:
                    expanded = var + "_" + key
                    summary.add_column(expanded, after=previous)
                    previous = expanded
                summary.del_column(var)
            row.extend(outcome.values())

        summary.add_row(row)

    return GroupBy(*df.groups_vars[:-1]).apply(summary)
def apply(self, df):
    """Return a new DataFrame holding only the columns named in ``self.vars``.

    Columns are copied in the order of ``self.vars``. The result is then
    passed through ``GroupBy`` using those grouping variables of ``df`` that
    are also listed in ``self.vars``.

    :param df: source data frame, indexable by variable name
    :return: the frame returned by ``GroupBy(...).apply`` on the selection
    """
    selected = DataFrame()
    for name in self.vars:
        selected.add_column(name, df[name])

    # Only grouping variables that survive the selection can still be grouped on.
    surviving = (group_var for group_var in df.groups_vars
                 if group_var in self.vars)
    return GroupBy(*surviving).apply(selected)
def _operation(self, group_df):
    """Assign every row of ``group_df`` to a partition with a k-means style loop.

    Steps, as implemented below:

    1. Work on a deep copy of ``group_df`` and select ``self.vars`` from it.
    2. If normalisation is enabled, run ``Normalize`` and keep the ``*_Std``
       columns under the original variable names.
    3. Draw ``self.__clusters`` initial centers with ``uniform`` between the
       per-variable minimum and maximum.
    4. Repeatedly assign each row to the center with the smallest
       ``self.__euclidean_distance`` (ties go to the lowest center index) and
       replace each center with the per-partition averages, until the
       assignment stops changing or the iteration count exceeds
       ``self.__max_iter``.

    :param group_df: data frame for a single group
    :return: the deep copy of ``group_df`` with an added ``"Partition"`` column
    """
    # Local import, kept as in the original (presumably to avoid a circular
    # import between modules -- TODO confirm).
    from Summarize import Min, Max, Average

    result = deepcopy(group_df)
    base = Select(*self.vars).apply(result)

    if self.__normalize:
        std_names = [str(var) + "_Std" for var in self.vars]
        std_by_var = {var: str(var) + "_Std" for var in self.vars}
        standardise = Pipeline(Normalize(*self.vars),
                               Select(*std_names),
                               Rename(**std_by_var))
        base = standardise.apply(base)

    mins = Min(*self.vars).apply(base)
    maxs = Max(*self.vars).apply(base)

    # NOTE(review): a falsy seed (e.g. None or 0) skips seeding entirely --
    # confirm that 0 is not meant to be a usable seed.
    if self.__seed:
        seed(self.__seed)

    # One random coordinate per variable for each center; the call order of
    # ``uniform`` is kept so seeded runs stay reproducible.
    centers = [
        [uniform(mins[str(var) + "_Min", 0], maxs[str(var) + "_Max", 0])
         for var in base.vars]
        for _ in range(self.__clusters)
    ]

    cluster = [None] * len(base)
    iteration = 0
    while True:
        iteration += 1

        # Assignment step: nearest center for every row.
        assignment = []
        for point in base:
            nearest = None
            nearest_distance = None
            for index, center in enumerate(centers):
                distance = self.__euclidean_distance(point, center)
                if nearest_distance is None or distance < nearest_distance:
                    nearest_distance = distance
                    nearest = index
            assignment.append(nearest)

        # Converged: nothing moved since the previous pass.
        if assignment == cluster:
            break

        # Update step: each center becomes the average of its partition.
        partitioned = deepcopy(base)
        partitioned.add_column("Partition", assignment)
        recompute = Pipeline(GroupBy("Partition"), Average(*base.vars))
        for center_row in recompute.apply(partitioned):
            # First cell is the partition id, the remaining cells are the averages.
            centers[center_row[0]] = center_row[1:]

        cluster = assignment
        if iteration > self.__max_iter:
            break

    result.add_column("Partition", cluster)
    return result
def apply(self, df):
    """Run ``self._operation`` on every group of ``df`` and stack the results.

    The columns of the output are taken from the first non-empty transformed
    group; the rows of all non-empty transformed groups are appended in group
    order. The stacked frame is finally passed through ``GroupBy`` with the
    grouping variables of ``df``.

    :param df: data frame exposing ``groups_df`` and ``groups_vars``
    :return: the frame returned by ``GroupBy(...).apply`` on the stacked rows
    """
    stacked = DataFrame()

    for group_df in df.groups_df:
        transformed = self._operation(group_df)
        if not len(transformed) > 0:
            # An empty group contributes neither columns nor rows.
            continue

        if len(stacked.vars) == 0:
            # The header is created lazily from the first non-empty group.
            for name in transformed.vars:
                stacked.add_column(name)

        for record in transformed:
            stacked.add_row(record)

    return GroupBy(*df.groups_vars).apply(stacked)
def test_sd_var2_without_variance_group_by(self):
    """Check the ``Var2_SD`` column of ``Variance("Var2", get_var=False)`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    deviations = Pipeline(GroupBy("Cat"),
                          Variance("Var2", get_var=False)).apply(self.df)["Var2_SD"]
    # The values are floating-point results; compare with a tolerance rather
    # than exact equality so rounding in the implementation cannot fail the test.
    self.assertAlmostEqual(deviations[0], 0.50)
    self.assertAlmostEqual(deviations[1], 0)
def test_variance_var1_without_sd_group_by(self):
    """Check the ``Var1_Var`` column of ``Variance("Var1", get_sd=False)`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    variances = Pipeline(GroupBy("Cat"),
                         Variance("Var1", get_sd=False)).apply(self.df)["Var1_Var"]
    # The values are floating-point results; compare with a tolerance rather
    # than exact equality so rounding in the implementation cannot fail the test.
    self.assertAlmostEqual(variances[0], 0)
    self.assertAlmostEqual(variances[1], 0)
def test_sum_var2_group_by(self):
    """Check the ``Var2_Sum`` column produced by ``Sum("Var2")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    totals = Pipeline(GroupBy("Cat"), Sum("Var2")).apply(self.df)["Var2_Sum"]
    self.assertEqual(totals[0], 240)
    self.assertEqual(totals[1], 272)
    self.assertEqual(totals[2], 177)
def test_setOneGroup(self):
    """``GroupBy('Cat')`` yields the expected ``groups`` list on the fixture frame."""
    expected_groups = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3]
    grouped = GroupBy('Cat').apply(self.df)
    self.assertEqual(grouped.groups, expected_groups)
from IO import Import
from Summarize import Sum
from Transform import AsNumeric, Rename, GroupBy, Sort, Mutate, Filter
from Pipeline import Pipeline

if __name__ == '__main__':
    # Example script: two profitability analyses over the movies data set.
    movies = Import.import_csv("data/movies.csv", delimiter=",")

    # Steps shared by both pipelines. The same instances are reused in each,
    # exactly as in the original script.
    drop_missing = Filter(intgross_2013="!='N/A'", budget_2013="!='N/A'")
    to_numeric = AsNumeric('year', 'intgross_2013', "budget_2013")
    add_profitability = Mutate(profitability_2013="intgross_2013/budget_2013")
    order_by_profitability = Sort("desc_profitability_2013")

    # Analysis 1: profitability of movies passing the Bechdel test vs movies
    # failing it. Gross and budget are summed per value of "binary", the
    # "*_Sum" columns are mapped back to the plain names so the ratio
    # expression can be reused (presumably Rename takes new_name=old_name --
    # TODO confirm), then the ratio is computed and sorted.
    bechdel_analysis = Pipeline(
        drop_missing,
        to_numeric,
        GroupBy("binary"),
        Sum('intgross_2013', 'budget_2013'),
        Rename(intgross_2013="intgross_2013_Sum",
               budget_2013="budget_2013_Sum"),
        add_profitability,
        order_by_profitability,
    )
    result_bechdel = bechdel_analysis.apply(movies)
    print(result_bechdel)

    # Analysis 2: profitability of each individual movie, sorted the same way.
    profitability_analysis = Pipeline(
        drop_missing,
        to_numeric,
        add_profitability,
        order_by_profitability,
    )
    result_profitability = profitability_analysis.apply(movies)
    print(result_profitability)
def test_setTwoGroups(self):
    """``GroupBy('Cat', 'Reg')`` yields the expected ``groups`` list on the fixture frame."""
    expected_groups = [1, 1, 2, 3, 3, 4, 4, 5, 6, 6]
    grouped = GroupBy('Cat', 'Reg').apply(self.df)
    self.assertEqual(grouped.groups, expected_groups)
def test_sum_var1_group_by(self):
    """Check the ``Var1_Sum`` column produced by ``Sum("Var1")`` after grouping by ``Cat``."""
    # Apply the pipeline once and reuse the column: the original rebuilt and
    # re-ran the identical pipeline for every assertion.
    totals = Pipeline(GroupBy("Cat"), Sum("Var1")).apply(self.df)["Var1_Sum"]
    self.assertEqual(totals[0], 1147)
    self.assertEqual(totals[1], 1489)
    self.assertEqual(totals[2], 909)