コード例 #1
0
    def set_params(self):
        """Apply the context's column subset, measure suggestions and filters.

        Reads four settings from ``self._df_context`` and applies them to
        ``self._data_frame`` in this order:

        1. column subset -> ``self.subset_data_frame`` (when not ``None``);
        2. measure suggestions -> restricted to the column subset when one
           is given, then ``self.clean_data_frame()`` is called if any
           suggestion remains;
        3. dimension filters -> ``DataFrameFilterer.values_in`` per column;
        4. measure filters -> ``DataFrameFilterer.values_between`` per
           column, using the first two items of each filter value.

        The filtered result replaces ``self._data_frame``.  The attributes
        ``subset_columns``, ``measure_suggestions``, ``df_filterer``,
        ``dimension_filter`` and ``measure_filter`` are set as a side effect.
        """
        self.subset_columns = self._df_context.get_column_subset()
        if self.subset_columns is not None:
            self._data_frame = self.subset_data_frame(self.subset_columns)

        self.measure_suggestions = self._df_context.get_measure_suggestions()
        if self.measure_suggestions is not None:
            suggestions = list(self.measure_suggestions)
            # Without a column subset there is nothing to restrict against;
            # testing membership in None would raise TypeError.
            if self.subset_columns is not None:
                suggestions = [
                    m for m in suggestions if m in self.subset_columns
                ]
            self.measure_suggestions = suggestions
            if suggestions:
                self.clean_data_frame()

        self.df_filterer = DataFrameFilterer(self._data_frame)

        self.dimension_filter = self._df_context.get_dimension_filters()
        if self.dimension_filter is not None:
            for colmn, allowed_values in self.dimension_filter.items():
                self.df_filterer.values_in(colmn, allowed_values)

        self.measure_filter = self._df_context.get_measure_filters()
        if self.measure_filter is not None:
            for colmn, bounds in self.measure_filter.items():
                # NOTE(review): the trailing 1, 1 are presumably flags making
                # both bounds inclusive -- confirm against
                # DataFrameFilterer.values_between.
                self.df_filterer.values_between(colmn, bounds[0], bounds[1],
                                                1, 1)

        self._data_frame = self.df_filterer.get_filtered_data_frame()
コード例 #2
0
class DataFilterHelper(object):
    """Subset, clean and filter a data frame according to a context object.

    The helper keeps the working data frame in ``self._data_frame`` and
    reads all of its settings from ``self._df_context``.
    """

    def __init__(self, data_frame, df_context):
        """Store the data frame and context; copy the context's pandas flag.

        Args:
            data_frame: Data frame to operate on.
            df_context: Context object providing ``_pandas_flag`` and the
                ``get_*`` accessors used by ``set_params``.
        """
        self._data_frame = data_frame
        self._df_context = df_context
        self._pandas_flag = self._df_context._pandas_flag

    def clean_data_frame(self):
        """Convert the suggested measure columns with ``utils.tryconvert``.

        Columns named in ``self.measure_suggestions`` are passed through a
        ``FloatType`` UDF wrapping ``utils.tryconvert``; every other column
        is selected unchanged.  The conversion is best-effort: any failure
        is logged and ``self._data_frame`` is left as it was.
        """
        # Local import: this class cannot rely on the module importing it.
        import logging

        try:
            # The class never defines ``self.columns`` itself, so fall back
            # to the data frame's own column list when nothing else set it.
            columns = getattr(self, 'columns', None)
            if columns is None:
                columns = self._data_frame.columns
            func = udf(lambda x: utils.tryconvert(x), FloatType())
            self._data_frame = self._data_frame.select(*[
                func(c).alias(c) if c in self.measure_suggestions else c
                for c in columns
            ])
            # NOTE(review): presumably touched so that schema problems
            # surface inside this try block -- confirm it is still needed.
            self._data_frame.schema.fields
        except Exception:
            logging.getLogger(__name__).warning(
                "clean_data_frame: measure conversion skipped",
                exc_info=True)

    def set_params(self):
        """Apply the context's column subset, measure suggestions and filters.

        Order of operations: column subset, measure-suggestion clean-up
        (suggestions are restricted to the subset when one is given),
        dimension filters (``values_in``), measure filters
        (``values_between``).  The filtered result replaces
        ``self._data_frame``.
        """
        self.subset_columns = self._df_context.get_column_subset()
        if self.subset_columns is not None:
            self._data_frame = self.subset_data_frame(self.subset_columns)

        self.measure_suggestions = self._df_context.get_measure_suggestions()
        if self.measure_suggestions is not None:
            suggestions = list(self.measure_suggestions)
            # Without a column subset there is nothing to restrict against;
            # testing membership in None would raise TypeError.
            if self.subset_columns is not None:
                suggestions = [
                    m for m in suggestions if m in self.subset_columns
                ]
            self.measure_suggestions = suggestions
            if suggestions:
                self.clean_data_frame()

        self.df_filterer = DataFrameFilterer(self._data_frame,
                                             self._pandas_flag)

        self.dimension_filter = self._df_context.get_dimension_filters()
        if self.dimension_filter is not None:
            for colmn, allowed_values in self.dimension_filter.items():
                self.df_filterer.values_in(colmn, allowed_values)

        self.measure_filter = self._df_context.get_measure_filters()
        if self.measure_filter is not None:
            for colmn, bounds in self.measure_filter.items():
                # NOTE(review): the trailing 1, 1 are presumably flags making
                # both bounds inclusive -- confirm against
                # DataFrameFilterer.values_between.
                self.df_filterer.values_between(colmn, bounds[0], bounds[1],
                                                1, 1)

        self._data_frame = self.df_filterer.get_filtered_data_frame()

    def get_data_frame(self):
        """Return the current (possibly subset and filtered) data frame."""
        return self._data_frame

    def subset_data_frame(self, columns):
        """Return ``self._data_frame`` restricted to ``columns`` via ``select``."""
        return self._data_frame.select(*columns)
コード例 #3
0
    def extract_rules(self, rules, colname, rule_list=None):
        """Walk a decision-tree dict and record statistics for every leaf.

        A node with a ``'children'`` key is copied and each child is visited
        with this node's name appended to the rule path.  For a leaf, every
        rule on the path is applied to a fresh ``DataFrameFilterer`` built
        on ``self._data_frame1``; the aggregated result is then used to
        append the joined rule path, the total, the success count and the
        success percentage to ``self._new_rules``, ``self._total``,
        ``self._success`` and ``self._probability`` under the leaf's target.

        Args:
            rules: Tree node, a dict with ``'name'`` and optionally
                ``'children'``.
            colname: Column passed to ``get_aggregated_result``.
            rule_list: Names of the ancestor nodes, used as filter rules.
                ``None`` is treated as an empty list.

        Returns:
            A dict mirroring the visited node, or ``None`` for a leaf whose
            aggregated total is not positive.
        """
        path = [] if rule_list is None else rule_list
        node_name = rules['name']
        node_copy = {'name': node_name}

        if 'children' in rules:
            node_copy['children'] = [
                self.extract_rules(rules=child,
                                   colname=colname,
                                   rule_list=path + [rules['name']])
                for child in rules['children']
            ]
            return node_copy

        # (separator, DataFrameFilterer method name) pairs, tried in order;
        # ' not in ' must come before ' in ' because it contains it.
        rule_handlers = (
            (' <= ', 'values_below'),
            (' > ', 'values_above'),
            (' not in ', 'values_not_in'),
            (' in ', 'values_in'),
        )

        filterer = DataFrameFilterer(self._data_frame1)
        # The target is the leaf name without its first nine characters.
        target = node_name[9:]
        for rule in path:
            for separator, method_name in rule_handlers:
                if separator in rule:
                    column, operand = re.split(separator, rule)
                    getattr(filterer, method_name)(column, operand)
                    break

        hits = 0
        row_total = 0
        for row in filterer.get_aggregated_result(colname, target):
            if row[0] == target:
                hits = row[1]
            row_total = row_total + row[1]

        if not row_total > 0:
            return None

        if target not in self._new_rules:
            self._new_rules[target] = []
            self._total[target] = []
            self._success[target] = []
            self._probability[target] = []
        self._new_rules[target].append(','.join(path))
        self._total[target].append(row_total)
        self._success[target].append(hits)
        self._probability[target].append(old_div(hits * 100.0, row_total))
        return node_copy
コード例 #4
0
 def extract_rules(self, rule_list, target):
     """Apply a rule path to the data and record its statistics for a target.

     Each rule in ``rule_list`` is applied cumulatively to a
     ``DataFrameFilterer`` built on ``self._data_frame1``.  A snapshot of
     ``get_count_result`` for the target dimension is taken before the
     first rule and after every recognised rule.  The aggregated result is
     then used to append the joined rule path, total, success count and
     success percentage to ``self._new_rules``, ``self._total``,
     ``self._success`` and ``self._probability``, and the variables named
     by the rules are merged into ``self._important_vars``.

     Args:
         rule_list: Rule strings of the form ``"<var> <= <limit>"``,
             ``"<var> > <limit>"``, ``"<var> not in <levels>"`` or
             ``"<var> in <levels>"``.  Rules matching none of these forms
             are skipped.
         target: Target label; it is translated through
             ``self._reverse_map`` for filtering and back through
             ``self._mapping_dict`` for storing the results.

     Returns:
         ``(success, total, dict_tree)`` when the aggregated total is
         positive, where ``dict_tree`` is the list of count snapshots;
         otherwise ``None``.
     """
     if target not in self._important_vars:
         self._important_vars[target] = []
     target = self._reverse_map[target]
     DFF = DataFrameFilterer(self._data_frame1, self._pandas_flag)
     colname = self._target_dimension

     def count_snapshot():
         # Current counts as {row[0]: row[1]}, ignoring None rows.
         return {
             row[0]: row[1]
             for row in DFF.get_count_result(colname) if row is not None
         }

     def resolve_levels(raw_levels):
         # Drop the first and last character, then split on commas.
         levels = raw_levels[1:-1].split(",")
         # NOTE(review): kept exactly as written.  For every level this
         # emits one entry per alias key (the level itself where it equals
         # the key, its alias otherwise), so levels and aliases both end up
         # in the list with duplicates, and a level missing from
         # self._alias_dict raises KeyError.  Confirm this is intended.
         return [
             key if x == key else self._alias_dict[x] for x in levels
             for key in list(self._alias_dict.keys())
         ]

     success = 0
     total = 0
     important_vars = []
     dict_tree = [count_snapshot()]
     for rule in rule_list:
         if ' <= ' in rule:
             var, limit = re.split(' <= ', rule)
             DFF.values_below(var, float(limit))
         elif ' > ' in rule:
             var, limit = re.split(' > ', rule)
             DFF.values_above(var, float(limit))
         elif ' not in ' in rule:
             var, levels = re.split(' not in ', rule)
             DFF.values_not_in(var, resolve_levels(levels),
                               self._measure_columns)
         elif ' in ' in rule:
             var, levels = re.split(' in ', rule)
             DFF.values_in(var, resolve_levels(levels),
                           self._measure_columns)
         else:
             # Unrecognised rule: nothing was filtered, so do not record a
             # stale (or still unbound) variable name for it.
             continue
         dict_tree.append(count_snapshot())
         important_vars.append(var)

     for rows in DFF.get_aggregated_result(colname, target):
         if rows[0] == target:
             success = rows[1]
         total = total + rows[1]

     target = self._mapping_dict[self._target_dimension][target]
     # .get(): the key initialised at the top is the incoming label, which
     # is only the same as this remapped one if the two maps round-trip.
     self._important_vars[target] = list(
         set(self._important_vars.get(target, []) + important_vars))
     if total > 0:
         if target not in self._new_rules:
             self._new_rules[target] = []
             self._total[target] = []
             self._success[target] = []
             self._probability[target] = []
         self._new_rules[target].append(','.join(rule_list))
         self._total[target].append(total)
         self._success[target].append(success)
         self._probability[target].append(old_div(success * 100.0, total))
         return success, total, dict_tree
     return None
コード例 #5
0
    def extract_rules(self, rules, colname, rule_list=None):
        """Walk a decision-tree dict and record statistics for every leaf.

        A node with a ``'children'`` key is copied and each child is visited
        with this node's name appended to the rule path.  For a leaf, every
        rule on the path is applied to a fresh ``DataFrameFilterer`` built
        on ``self._data_frame1``, ``colname`` is bucketized with
        ``self._splits``, and the aggregated result is used to append the
        joined rule path, total, success count and success percentage to
        ``self._new_rules``, ``self._total``, ``self._success`` and
        ``self._probability`` under the leaf's group.

        Args:
            rules: Tree node, a dict with ``'name'`` and optionally
                ``'children'``.
            colname: Column that is bucketized and aggregated.
            rule_list: Names of the ancestor nodes, used as filter rules.
                ``None`` is treated as an empty list.

        Returns:
            A dict mirroring the visited node (a leaf's name is rewritten
            to ``'Predict: ' + group``), or ``None`` for a leaf whose
            aggregated total is not positive.
        """
        if rule_list is None:
            rule_list = []
        new_tree = {'name': rules['name']}

        # ``in`` instead of dict.has_key(), which does not exist in Python 3.
        if 'children' in rules:
            new_tree['children'] = [
                self.extract_rules(rules=child,
                                   colname=colname,
                                   rule_list=rule_list + [rules['name']])
                for child in rules['children']
            ]
            return new_tree

        DFF = DataFrameFilterer(self._data_frame1)
        success = 0
        total = 0
        # The leaf name minus its first nine characters; it is parsed as a
        # float below, so it is presumably 'Predict: <code>' -- TODO confirm.
        target = rules['name'][9:]
        for rule in rule_list:
            if ' <= ' in rule:
                var, limit = re.split(' <= ', rule)
                DFF.values_below(var, limit)
            elif ' > ' in rule:
                var, limit = re.split(' > ', rule)
                DFF.values_above(var, limit)
            elif ' not in ' in rule:
                # Must be tested before ' in ', which it contains.
                var, levels = re.split(' not in ', rule)
                DFF.values_not_in(var, levels)
            elif ' in ' in rule:
                var, levels = re.split(' in ', rule)
                DFF.values_in(var, levels)

        # De-duplicate first and sort afterwards: going through a set
        # discards any ordering established before it.
        self._splits = sorted(set(self._splits))
        binned_colname = DFF.bucketize(self._splits, colname)
        target = self._map[float(target.strip())]['group']
        for rows in DFF.get_aggregated_result(binned_colname, target):
            if self._label_code[rows[0]] == target:
                success = rows[1]
            total = total + rows[1]

        if total > 0:
            if target not in self._new_rules:
                self._new_rules[target] = []
                self._total[target] = []
                self._success[target] = []
                self._probability[target] = []
            self._new_rules[target].append(','.join(rule_list))
            self._total[target].append(total)
            self._success[target].append(success)
            self._probability[target].append(success * 100.0 / total)
            key = float(new_tree['name'][9:])
            new_tree['name'] = 'Predict: ' + self._map[key]['group']
            return new_tree
        return None
コード例 #6
0
 def extract_rules(self, rule_list, target):
     """Apply a rule path to the data and record its statistics for a target.

     Each rule in ``rule_list`` is applied cumulatively to a
     ``DataFrameFilterer`` built on ``self._data_frame1``.  The aggregated
     result for the target dimension is then used to append the joined rule
     path, total, success count and success percentage to
     ``self._new_rules``, ``self._total``, ``self._success`` and
     ``self._probability``, and the variables named by the rules are merged
     into ``self._important_vars``.

     Args:
         rule_list: Rule strings of the form ``"<var> <= <limit>"``,
             ``"<var> > <limit>"``, ``"<var> not in <levels>"`` or
             ``"<var> in <levels>"``.  Rules matching none of these forms
             are skipped.
         target: Target label; it is translated through
             ``self._reverse_map`` for filtering and back through
             ``self._mapping_dict`` for storing the results.

     Returns:
         The success count when the aggregated total is positive,
         otherwise ``None``.
     """
     # ``in`` instead of dict.has_key(), which does not exist in Python 3.
     if target not in self._important_vars:
         self._important_vars[target] = []
     target = self._reverse_map[target]
     DFF = DataFrameFilterer(self._data_frame1)
     colname = self._target_dimension
     success = 0
     total = 0
     important_vars = []
     for rule in rule_list:
         if ' <= ' in rule:
             var, limit = re.split(' <= ', rule)
             DFF.values_below(var, limit)
         elif ' > ' in rule:
             var, limit = re.split(' > ', rule)
             DFF.values_above(var, limit)
         elif ' not in ' in rule:
             # Must be tested before ' in ', which it contains.
             var, levels = re.split(' not in ', rule)
             # Drop the first and last character, split on commas, then
             # translate every level through the alias dictionary.
             levels = [self._alias_dict[x] for x in levels[1:-1].split(",")]
             DFF.values_not_in(var, levels)
         elif ' in ' in rule:
             var, levels = re.split(' in ', rule)
             levels = [self._alias_dict[x] for x in levels[1:-1].split(",")]
             DFF.values_in(var, levels)
         else:
             # Unrecognised rule: nothing was filtered, so do not record a
             # stale (or still unbound) variable name for it.
             continue
         important_vars.append(var)

     for rows in DFF.get_aggregated_result(colname, target):
         if rows[0] == target:
             success = rows[1]
         total = total + rows[1]

     target = self._mapping_dict[self._target_dimension][target]
     # .get(): the key initialised at the top is the incoming label, which
     # is only the same as this remapped one if the two maps round-trip.
     self._important_vars[target] = list(
         set(self._important_vars.get(target, []) + important_vars))
     if total > 0:
         if target not in self._new_rules:
             self._new_rules[target] = []
             self._total[target] = []
             self._success[target] = []
             self._probability[target] = []
         self._new_rules[target].append(','.join(rule_list))
         self._total[target].append(total)
         self._success[target].append(success)
         self._probability[target].append(success * 100.0 / total)
         return success
     return None