def test_max_var1_group_by(self):
     """Check the "Var1_Max" column produced by GroupBy("Cat") then Max("Var1")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Max("Var1"))
     maxima = pipeline.apply(self.df)["Var1_Max"]
     for group, expected in enumerate((402, 589, 620)):
         # subTest reports every mismatching group instead of stopping
         # at the first one.
         with self.subTest(group=group):
             self.assertEqual(maxima[group], expected)
 def test_min_var2_group_by(self):
     """Check the "Var2_Min" column produced by GroupBy("Cat") then Min("Var2")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Min("Var2"))
     minima = pipeline.apply(self.df)["Var2_Min"]
     for group, expected in enumerate((74, 71, 76)):
         # subTest reports every mismatching group instead of stopping
         # at the first one.
         with self.subTest(group=group):
             self.assertEqual(minima[group], expected)
 def test_count_var2_group_by(self):
     """Check the "Var2_Count" column produced by GroupBy("Cat") then Count("Var2")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Count("Var2"))
     counts = pipeline.apply(self.df)["Var2_Count"]
     for group, expected in enumerate((3, 3, 2)):
         # subTest reports every mismatching group instead of stopping
         # at the first one.
         with self.subTest(group=group):
             self.assertEqual(counts[group], expected)
 def test_min_var1_group_by(self):
     """Check the "Var1_Min" column produced by GroupBy("Cat") then Min("Var1")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Min("Var1"))
     minima = pipeline.apply(self.df)["Var1_Min"]
     for group, expected in enumerate((348, 380, 289)):
         # subTest reports every mismatching group instead of stopping
         # at the first one.
         with self.subTest(group=group):
             self.assertEqual(minima[group], expected)
 def test_average_var2_group_by(self):
     """Check the "Var2_Average" column produced by GroupBy("Cat") then Average("Var2")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Average("Var2"))
     averages = pipeline.apply(self.df)["Var2_Average"]
     for group, expected in enumerate((80.0, 91.0, 88.5)):
         with self.subTest(group=group):
             # The expected values are floats: compare with a tolerance
             # rather than requiring bit-for-bit equality.
             self.assertAlmostEqual(averages[group], expected)
 def test_average_var1_group_by(self):
     """Check the "Var1_Average" column produced by GroupBy("Cat") then Average("Var1")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Average("Var1"))
     averages = pipeline.apply(self.df)["Var1_Average"]
     for group, expected in enumerate((100.0, 150.0, 330.0)):
         with self.subTest(group=group):
             # The expected values are floats: compare with a tolerance
             # rather than requiring bit-for-bit equality.
             self.assertAlmostEqual(averages[group], expected)
 def test_max_var2_group_by(self):
     """Check the "Var2_Max" column produced by GroupBy("Cat") then Max("Var2")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Max("Var2"))
     maxima = pipeline.apply(self.df)["Var2_Max"]
     for group, expected in enumerate((85, 102, 101)):
         # subTest reports every mismatching group instead of stopping
         # at the first one.
         with self.subTest(group=group):
             self.assertEqual(maxima[group], expected)
Exemple #8
0
 def apply(self, df):
     """Build a frame holding one summary row per group of ``df``.

     ``df`` is first reduced to its grouping variables plus ``self.vars``.
     Each row of the output starts with the group's value of every grouping
     variable, followed by ``self._operation`` applied to each summarised
     column. When ``_operation`` returns a dict, the variable's column is
     replaced by one ``<var>_<key>`` column per dict key.

     Args:
         df: frame exposing ``groups_vars`` and ``groups_df``.

     Returns:
         The summary frame after ``GroupBy`` has been applied with all
         grouping variables except the last one.
     """
     columns = [*df.groups_vars, *self.vars]
     df = Select(*columns).apply(df)
     summary = DataFrame()
     for name in columns:
         summary.add_column(name)
     for group_df in df.groups_df:
         # The row opens with the group's key values (index 0 of each
         # grouping column).
         row = [group_df[group_var, 0] for group_var in df.groups_vars]
         for var in self.vars:
             values = group_df[var]
             if self.__del_na:
                 values = [val for val in values if val is not None]
             if self.__del_nan:
                 values = [val for val in values if isinstance(val, Number)]
             outcome = self._operation(values)
             if not isinstance(outcome, dict):
                 row.append(outcome)
                 continue
             labels = list(outcome.keys())
             if (var + "_" + labels[0]) not in summary.vars:
                 # First dict result for this variable: insert one column
                 # per key right after the placeholder, then drop the
                 # placeholder itself.
                 anchor = var
                 for label in labels:
                     expanded = var + "_" + label
                     summary.add_column(expanded, after=anchor)
                     anchor = expanded
                 summary.del_column(var)
             row.extend(list(outcome.values()))
         summary.add_row(row)
     return GroupBy(*df.groups_vars[:-1]).apply(summary)
Exemple #9
0
 def apply(self, df):
     """Return a new frame containing only the columns named in ``self.vars``.

     Grouping variables of ``df`` that are also in ``self.vars`` are passed
     to ``GroupBy`` and applied to the new frame; the others are dropped
     from the grouping.

     Args:
         df: frame indexable by column name and exposing ``groups_vars``.

     Returns:
         The result of applying ``GroupBy`` to the selected columns.
     """
     selected = DataFrame()
     for name in self.vars:
         selected.add_column(name, df[name])
     surviving_groups = [
         group_var for group_var in df.groups_vars if group_var in self.vars
     ]
     return GroupBy(*surviving_groups).apply(selected)
Exemple #10
0
 def _operation(self, group_df):
     """Label every row of ``group_df`` with a cluster index.

     Works on a deep copy of ``group_df`` restricted to ``self.vars``
     (optionally passed through ``Normalize`` first). Initial centers are
     drawn uniformly between each column's minimum and maximum. Rows are
     then repeatedly assigned to their nearest center, as measured by
     ``self.__euclidean_distance``, and each center is replaced by the
     average of the rows assigned to it. The loop ends when an assignment
     pass changes nothing or the pass counter exceeds ``self.__max_iter``.

     Args:
         group_df: frame holding the rows of a single group.

     Returns:
         A deep copy of ``group_df`` with an added "Partition" column
         holding the index of the center chosen for each row.
     """
     # Imported inside the method, presumably to avoid a circular import
     # with Summarize -- TODO confirm.
     from Summarize import Min, Max, Average
     result = deepcopy(group_df)
     base = Select(*self.vars).apply(result)
     if self.__normalize:
         normalize = Normalize(*self.vars)
         std_names = [str(var) + "_Std" for var in self.vars]
         rename_map = {var: str(var) + "_Std" for var in self.vars}
         # Keep only the "<var>_Std" columns and (presumably) rename them
         # to the original variable names so the lookups below still work.
         base = Pipeline(normalize, Select(*std_names),
                         Rename(**rename_map)).apply(base)
     mins = Min(*self.vars).apply(base)
     maxs = Max(*self.vars).apply(base)
     # NOTE(review): a seed of 0 is falsy and is therefore not applied --
     # confirm whether that is intended.
     if self.__seed:
         seed(self.__seed)
     # One random center per cluster, one coordinate per column of base.
     centers = [
         [
             uniform(mins[str(var) + "_Min", 0], maxs[str(var) + "_Max", 0])
             for var in base.vars
         ]
         for _ in range(self.__clusters)
     ]
     cluster = [None] * len(base)
     iteration = 0
     while True:
         iteration += 1
         new_cluster = []
         for row in base:
             best_distance = None
             best_cluster = None
             for index, center in enumerate(centers):
                 distance = self.__euclidean_distance(row, center)
                 # Strict comparison: on a tie the earliest center wins.
                 if best_distance is None or distance < best_distance:
                     best_distance, best_cluster = distance, index
             new_cluster.append(best_cluster)
         converged = new_cluster == cluster
         if not converged:
             # Recompute each center as the average of its assigned rows.
             labelled = deepcopy(base)
             labelled.add_column("Partition", new_cluster)
             recompute = Pipeline(GroupBy("Partition"),
                                  Average(*base.vars))
             for center_row in recompute.apply(labelled):
                 # Each row is [partition index, averages...]; centers
                 # with no assigned rows keep their previous coordinates.
                 centers[center_row[0]] = center_row[1:]
         cluster = new_cluster
         # NOTE(review): ">" lets the loop run max_iter + 1 passes before
         # giving up -- confirm whether ">=" was intended.
         if iteration > self.__max_iter or converged:
             break
     result.add_column("Partition", cluster)
     return result
 def apply(self, df):
     """Run ``self._operation`` on every group of ``df`` and stack the results.

     Groups whose transformed frame is empty are skipped. The output
     columns are taken from the first non-empty transformed group.

     Args:
         df: frame exposing ``groups_df`` and ``groups_vars``.

     Returns:
         The stacked rows with ``GroupBy`` applied using the grouping
         variables of ``df``.
     """
     combined = DataFrame()
     for group_df in df.groups_df:
         transformed = self._operation(group_df)
         if len(transformed) == 0:
             continue
         if len(combined.vars) == 0:
             # The first non-empty group defines the output columns.
             for name in transformed.vars:
                 combined.add_column(name)
         for row in transformed:
             combined.add_row(row)
     return GroupBy(*df.groups_vars).apply(combined)
Exemple #12
0
 def test_sd_var2_without_variance_group_by(self):
     """Check the "Var2_SD" column of GroupBy("Cat") then Variance("Var2", get_var=False)."""
     # Build and apply the pipeline once; both assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Variance("Var2", get_var=False))
     deviations = pipeline.apply(self.df)["Var2_SD"]
     for group, expected in enumerate((0.50, 0)):
         with self.subTest(group=group):
             # Compare with a tolerance: 0.50 is a float expectation and
             # exact float equality is brittle.
             self.assertAlmostEqual(deviations[group], expected)
Exemple #13
0
 def test_variance_var1_without_sd_group_by(self):
     """Check the "Var1_Var" column of GroupBy("Cat") then Variance("Var1", get_sd=False)."""
     # Build and apply the pipeline once; both assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Variance("Var1", get_sd=False))
     variances = pipeline.apply(self.df)["Var1_Var"]
     for group, expected in enumerate((0, 0)):
         with self.subTest(group=group):
             # Compare with a tolerance so that a tiny rounding residue in
             # the computed value does not fail the check.
             self.assertAlmostEqual(variances[group], expected)
 def test_sum_var2_group_by(self):
     """Check the "Var2_Sum" column produced by GroupBy("Cat") then Sum("Var2")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Sum("Var2"))
     sums = pipeline.apply(self.df)["Var2_Sum"]
     for group, expected in enumerate((240, 272, 177)):
         # subTest reports every mismatching group instead of stopping
         # at the first one.
         with self.subTest(group=group):
             self.assertEqual(sums[group], expected)
Exemple #15
0
 def test_setOneGroup(self):
     """GroupBy('Cat') applied to the fixture yields the expected ``groups`` list."""
     expected_groups = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3]
     grouped = GroupBy('Cat').apply(self.df)
     self.assertEqual(grouped.groups, expected_groups)
from IO import Import
from Summarize import Sum
from Transform import AsNumeric, Rename, GroupBy, Sort, Mutate, Filter
from Pipeline import Pipeline

if __name__ == '__main__':
    movies = Import.import_csv("data/movies.csv", delimiter=",")

    # Steps shared by both analyses below.
    to_numeric = AsNumeric('year', 'intgross_2013', "budget_2013")
    drop_missing = Filter(intgross_2013="!='N/A'", budget_2013="!='N/A'")
    profitability = Mutate(profitability_2013="intgross_2013/budget_2013")
    by_profitability = Sort("desc_profitability_2013")

    # Profitability of movies grouped by the "binary" column
    # (passing vs failing the Bechdel test).
    bechdel_analysis = Pipeline(
        drop_missing,
        to_numeric,
        GroupBy("binary"),
        Sum('intgross_2013', 'budget_2013'),
        # Map the summed columns onto the plain names used by the ratio step.
        Rename(intgross_2013="intgross_2013_Sum",
               budget_2013="budget_2013_Sum"),
        profitability,
        by_profitability,
    )
    result_bechdel = bechdel_analysis.apply(movies)
    print(result_bechdel)

    # Profitability of each individual movie (no grouping), same sort step.
    profitability_analysis = Pipeline(
        drop_missing,
        to_numeric,
        profitability,
        by_profitability,
    )
    result_profitability = profitability_analysis.apply(movies)
    print(result_profitability)
Exemple #17
0
 def test_setTwoGroups(self):
     """GroupBy('Cat', 'Reg') applied to the fixture yields the expected ``groups`` list."""
     expected_groups = [1, 1, 2, 3, 3, 4, 4, 5, 6, 6]
     grouped = GroupBy('Cat', 'Reg').apply(self.df)
     self.assertEqual(grouped.groups, expected_groups)
 def test_sum_var1_group_by(self):
     """Check the "Var1_Sum" column produced by GroupBy("Cat") then Sum("Var1")."""
     # Build and apply the pipeline once; all assertions read the same column.
     pipeline = Pipeline(GroupBy("Cat"), Sum("Var1"))
     sums = pipeline.apply(self.df)["Var1_Sum"]
     for group, expected in enumerate((1147, 1489, 909)):
         # subTest reports every mismatching group instead of stopping
         # at the first one.
         with self.subTest(group=group):
             self.assertEqual(sums[group], expected)