Example no. 1
    def run(self, input_data):
        stats = defaultdict(dict)
        stats_v2 = defaultdict(dict)

        # Really bad that these parameters are implicitly passed through lmd
        # Perhaps sampling can be moved somewhere upwards,
        # so that it can be reused by all downstream phases?
        sample_df = sample_data(
            input_data.data_frame,
            self.transaction.lmd['sample_margin_of_error'],
            self.transaction.lmd['sample_confidence_level'], self.log)

        for col_name in sample_df.columns.values:
            col_data = sample_df[col_name].dropna()

            (data_type, data_subtype, data_type_dist, data_subtype_dist,
             additional_info) = self.get_column_data_type(
                 col_data, input_data.data_frame[col_name], col_name)

            type_data = {
                'data_type': data_type,
                'data_subtype': data_subtype,
                'data_type_dist': data_type_dist,
                'data_subtype_dist': data_subtype_dist,
            }
            # Copy type_data so that updating it with additional_info below
            # doesn't also mutate the dict stored as stats_v2's 'typing'.
            stats[col_name] = dict(type_data)
            stats[col_name].update(additional_info)
            stats_v2[col_name]['typing'] = type_data
            stats_v2[col_name]['additional_info'] = additional_info

            stats_v2[col_name]['is_foreign_key'] = is_foreign_key(
                col_data, col_name, data_subtype,
                additional_info['other_potential_subtypes'])
            stats[col_name]['is_foreign_key'] = stats_v2[col_name][
                'is_foreign_key']
            if stats_v2[col_name]['is_foreign_key'] and self.transaction.lmd[
                    'handle_foreign_keys']:
                self.transaction.lmd['columns_to_ignore'].append(col_name)

            if data_subtype_dist:
                self.log.info(f'Data distribution for column "{col_name}" '
                              f'of type "{data_type}" '
                              f'and subtype "{data_subtype}"')
                try:
                    self.log.infoChart(
                        data_subtype_dist,
                        type='list',
                        uid=f'Data Type Distribution for column "{col_name}"')
                except Exception:
                    # Functionality is specific to mindsdb logger
                    pass

        if not self.transaction.lmd.get('column_stats'):
            self.transaction.lmd['column_stats'] = {}
        if not self.transaction.lmd.get('stats_v2'):
            self.transaction.lmd['stats_v2'] = {}

        self.transaction.lmd['column_stats'].update(stats)
        self.transaction.lmd['stats_v2'].update(stats_v2)
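
The sampling parameters above (`sample_margin_of_error`, `sample_confidence_level`) suggest classical survey-style sample sizing. As a sketch only — this is not mindsdb's actual `sample_data` implementation — the sample size could be derived from those two parameters with Cochran's formula plus a finite-population correction:

    import math
    from scipy.stats import norm

    def estimate_sample_size(population_size, margin_of_error, confidence_level):
        # Cochran's formula with worst-case p = 0.5, followed by a
        # finite-population correction. Illustrative only; the real
        # sample_data helper may size the sample differently.
        z = norm.ppf(1 - (1 - confidence_level) / 2)  # two-tailed z-score
        n0 = (z ** 2) * 0.25 / (margin_of_error ** 2)
        n = n0 / (1 + (n0 - 1) / population_size)
        return min(population_size, math.ceil(n))

    # estimate_sample_size(100_000, 0.01, 0.99) -> 14228 rows, roughly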
Example no. 2
    def run(self, input_data):
        stats = self.transaction.lmd['column_stats']
        stats_v2 = self.transaction.lmd['stats_v2']
        col_data_dict = {}

        sample_df = sample_data(input_data.data_frame,
                                self.transaction.lmd['sample_margin_of_error'],
                                self.transaction.lmd['sample_confidence_level'],
                                self.log)

        for col_name in self.transaction.lmd['empty_columns']:
            stats_v2[col_name] = {}
            stats_v2[col_name]['empty'] = {'is_empty': True}
            self.log.warning(f'Column {col_name} is empty.')

        for col_name in sample_df.columns.values:
            self.log.info(f'Analyzing column: {col_name}!')
            data_type = stats_v2[col_name]['typing']['data_type']
            data_subtype = stats_v2[col_name]['typing']['data_subtype']

            col_data = sample_df[col_name].dropna()
            if data_type == DATA_TYPES.NUMERIC or data_subtype == DATA_SUBTYPES.TIMESTAMP:
                col_data = clean_int_and_date_data(col_data, self.log)
            col_data_dict[col_name] = col_data

            stats_v2[col_name]['empty'] = get_column_empty_values_report(input_data.data_frame[col_name])

            stats[col_name]['empty_cells'] = stats_v2[col_name]['empty']['empty_cells']
            stats[col_name]['empty_percentage'] = stats_v2[col_name]['empty']['empty_percentage']

            if data_type == DATA_TYPES.CATEGORICAL:
                hist_data = input_data.data_frame[col_name]
                stats_v2[col_name]['unique'] = get_uniq_values_report(input_data.data_frame[col_name])
            else:
                hist_data = col_data

            histogram, percentage_buckets = get_histogram(hist_data,
                                                          data_type=data_type,
                                                          data_subtype=data_subtype)
            stats_v2[col_name]['histogram'] = histogram
            stats_v2[col_name]['percentage_buckets'] = percentage_buckets
            stats[col_name]['histogram'] = histogram
            stats[col_name]['percentage_buckets'] = percentage_buckets
            if histogram:
                S, biased_buckets = compute_entropy_biased_buckets(histogram['y'], histogram['x'])
                stats_v2[col_name]['bias'] = {
                    'entropy': S,
                    'description': 'TBD'
                }
                if biased_buckets:
                    stats_v2[col_name]['bias']['biased_buckets'] = biased_buckets
                if S < 0.8:
                    if data_type == DATA_TYPES.CATEGORICAL:
                        warning_str = "You may want to check if some categories occur too often or too seldom in this column."
                    else:
                        warning_str = "You may want to check if you see something suspicious on the right-hand-side graph."
                    stats_v2[col_name]['bias']['warning'] = warning_str + " This doesn't necessarily mean there's an issue with your data; it just indicates a higher-than-usual probability that there might be one."

            self.compute_scores(col_name, sample_df, col_data_dict, stats)

            if 'lof_outliers' in stats[col_name]:
                stats_v2[col_name]['outliers'] = {
                    'outlier_values': stats[col_name]['lof_outliers'],
                    'outlier_score': stats[col_name]['lof_based_outlier_score'],
                    'outlier_buckets': compute_outlier_buckets(outlier_values=stats[col_name]['lof_outliers'],
                                                               hist_x=histogram['x'],
                                                               hist_y=histogram['y'],
                                                               percentage_buckets=percentage_buckets,
                                                               col_stats=stats[col_name]),
                    'description': 'TBD'
                }

            stats_v2[col_name]['nr_warnings'] = 0
            for x in stats_v2[col_name].values():
                if isinstance(x, dict) and 'warning' in x:
                    self.log.warning(x['warning'])
                    # Only count entries that actually carry a warning.
                    stats_v2[col_name]['nr_warnings'] += 1
            self.log.info(f'Finished analyzing column: {col_name}!\n')

        log_interesting_stats(self.log, stats)

        self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_margin_of_error']

        self.transaction.lmd['data_preparation']['total_row_count'] = len(input_data.data_frame)
        self.transaction.lmd['data_preparation']['used_row_count'] = len(sample_df)
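
The `S < 0.8` check above reads naturally as a threshold on normalized Shannon entropy, which is 1.0 for a perfectly uniform histogram and approaches 0 as the mass concentrates in a few buckets. A minimal sketch under that assumption — the real `compute_entropy_biased_buckets` lives in mindsdb and may differ:

    import numpy as np

    def compute_entropy_biased_buckets(hist_y, hist_x, share_threshold=0.1):
        # Normalized Shannon entropy of the bucket counts: 1.0 for
        # perfectly uniform buckets, near 0 for heavily skewed ones.
        counts = np.asarray(hist_y, dtype=float)
        if len(counts) < 2 or counts.sum() == 0:
            return 1.0, []
        p = counts / counts.sum()
        nonzero = p[p > 0]
        entropy = float(-(nonzero * np.log(nonzero)).sum() / np.log(len(counts)))
        # Flag buckets holding a disproportionate share of the data
        # (more than `share_threshold` above the uniform share).
        uniform_share = 1 / len(counts)
        biased_buckets = [x for x, share in zip(hist_x, p)
                          if share > uniform_share + share_threshold]
        return entropy, biased_buckets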
Example no. 3
    def run(self, input_data):
        stats = defaultdict(dict)
        stats_v2 = defaultdict(dict)

        # Really bad that these parameters are implicitly passed through lmd
        # Perhaps sampling can be moved somewhere upwards,
        # so that it can be reused by all downstream phases?
        sample_df = sample_data(input_data.data_frame,
                                self.transaction.lmd['sample_margin_of_error'],
                                self.transaction.lmd['sample_confidence_level'],
                                self.log)

        for col_name in sample_df.columns.values:
            col_data = sample_df[col_name].dropna()

            (data_type, data_subtype, data_type_dist,
             data_subtype_dist, additional_info) = self.get_column_data_type(col_data,
                                                                             input_data.data_frame[col_name],
                                                                             col_name)

            type_data = {
                'data_type': data_type,
                'data_subtype': data_subtype,
                'data_type_dist': data_type_dist,
                'data_subtype_dist': data_subtype_dist,
                'description': """A data type, in programming, is a classification that specifies which type of value a variable has and what type of mathematical, relational or logical operations can be applied to it without causing an error. A string, for example, is a data type that is used to classify text and an integer is a data type used to classify whole numbers."""
            }

            # stats keeps the legacy flat layout; copy type_data so the
            # 'description' removal below doesn't touch stats_v2's copy.
            stats[col_name] = deepcopy(type_data)
            stats[col_name].update(additional_info)
            del stats[col_name]['description']
            
            stats_v2[col_name]['typing'] = type_data
            stats_v2[col_name]['additional_info'] = additional_info

            stats_v2[col_name]['is_foreign_key'] = is_foreign_key(col_data,
                                                                  col_name,
                                                                  data_subtype,
                                                                  additional_info['other_potential_subtypes'])
            stats[col_name]['is_foreign_key'] = stats_v2[col_name]['is_foreign_key']
            if stats_v2[col_name]['is_foreign_key'] and self.transaction.lmd['handle_foreign_keys']:
                self.transaction.lmd['columns_to_ignore'].append(col_name)

            if data_subtype_dist:
                self.log.info(f'Data distribution for column "{col_name}" '
                              f'of type "{data_type}" '
                              f'and subtype "{data_subtype}"')
                try:
                    self.log.infoChart(data_subtype_dist,
                                       type='list',
                                       uid=f'Data Type Distribution for column "{col_name}"')
                except Exception:
                    # Functionality is specific to mindsdb logger
                    pass

        if not self.transaction.lmd.get('column_stats'):
            self.transaction.lmd['column_stats'] = {}
        if not self.transaction.lmd.get('stats_v2'):
            self.transaction.lmd['stats_v2'] = {}

        self.transaction.lmd['column_stats'].update(stats)
        self.transaction.lmd['stats_v2'].update(stats_v2)
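
`is_foreign_key` is only called above, not defined. As a hypothetical sketch of the kind of heuristic it could apply — the actual mindsdb helper also weighs the data subtype and the other potential subtypes passed in:

    def is_foreign_key(col_data, col_name, data_subtype, other_potential_subtypes):
        # Illustrative heuristic only: a column whose name looks like an
        # identifier and whose values are almost all distinct carries
        # little predictive signal and is likely a (foreign) key.
        name = col_name.lower()
        looks_like_id = name == 'id' or name.endswith('_id') or name.endswith('id')
        values = list(col_data)
        mostly_unique = len(values) > 0 and len(set(values)) > 0.98 * len(values)
        return looks_like_id and mostly_unique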
Example no. 4
    def run(self, input_data):
        stats = self.transaction.lmd['column_stats']
        stats_v2 = self.transaction.lmd['stats_v2']
        col_data_dict = {}

        sample_df = sample_data(
            input_data.data_frame,
            self.transaction.lmd['sample_margin_of_error'],
            self.transaction.lmd['sample_confidence_level'], self.log)

        for col_name in self.transaction.lmd['empty_columns']:
            stats_v2[col_name] = {}
            stats_v2[col_name]['empty'] = {'is_empty': True}
            self.log.warning(f'Column {col_name} is empty.')

        for col_name in sample_df.columns.values:
            self.log.info(f'Analyzing column: {col_name}!')
            data_type = stats_v2[col_name]['typing']['data_type']
            data_subtype = stats_v2[col_name]['typing']['data_subtype']

            col_data = sample_df[col_name].dropna()
            if data_type == DATA_TYPES.NUMERIC or data_subtype == DATA_SUBTYPES.TIMESTAMP:
                col_data = clean_int_and_date_data(col_data, self.log)
            col_data_dict[col_name] = col_data

            stats_v2[col_name]['empty'] = get_column_empty_values_report(
                input_data.data_frame[col_name])

            stats[col_name]['empty_cells'] = stats_v2[col_name]['empty'][
                'empty_cells']
            stats[col_name]['empty_percentage'] = stats_v2[col_name]['empty'][
                'empty_percentage']

            if data_type == DATA_TYPES.CATEGORICAL:
                hist_data = input_data.data_frame[col_name]
                stats_v2[col_name]['unique'] = get_uniq_values_report(
                    input_data.data_frame[col_name])
            else:
                hist_data = col_data

            histogram, percentage_buckets = get_histogram(
                hist_data, data_type=data_type, data_subtype=data_subtype)
            stats_v2[col_name]['histogram'] = histogram
            stats_v2[col_name]['percentage_buckets'] = percentage_buckets
            stats[col_name]['histogram'] = histogram
            stats[col_name]['percentage_buckets'] = percentage_buckets
            if histogram:
                S, biased_buckets = compute_entropy_biased_buckets(
                    histogram['y'], histogram['x'])
                stats_v2[col_name]['bias'] = {
                    'entropy':
                    S,
                    'description':
                    """Under the assumption of uniformly distributed data (i.e., same probability for Head or Tails on a coin flip) mindsdb tries to detect potential divergences from such case, and it calls this "potential bias". Thus by our data having any potential bias mindsdb means any divergence from all categories having the same probability of being selected."""
                }
                if biased_buckets:
                    stats_v2[col_name]['bias'][
                        'biased_buckets'] = biased_buckets
                if S < 0.8:
                    if data_type == DATA_TYPES.CATEGORICAL:
                        warning_str = "You may to check if some categories occur too often to too little in this columns."
                    else:
                        warning_str = "You may want to check if you see something suspicious on the right-hand-side graph."
                    stats_v2[col_name]['bias'][
                        'warning'] = warning_str + " This doesn't necessarily mean there's an issue with your data; it just indicates a higher-than-usual probability that there might be one."

            self.compute_scores(col_name, sample_df, col_data_dict, stats)

            if 'lof_outliers' in stats[col_name]:
                stats_v2[col_name]['outliers'] = {
                    'outlier_values':
                    stats[col_name]['lof_outliers'],
                    'outlier_score':
                    stats[col_name]['lof_based_outlier_score'],
                    'outlier_buckets':
                    compute_outlier_buckets(
                        outlier_values=stats[col_name]['lof_outliers'],
                        hist_x=histogram['x'],
                        hist_y=histogram['y'],
                        percentage_buckets=percentage_buckets,
                        col_stats=stats[col_name]),
                    'description':
                    """Potential outliers can be thought as the "extremes", i.e., data points that are far from the center of mass (mean/median/interquartile range) of the data."""
                }

            stats_v2[col_name]['nr_warnings'] = 0
            for x in stats_v2[col_name].values():
                if isinstance(x, dict) and 'warning' in x:
                    self.log.warning(x['warning'])
                    # Only count entries that actually carry a warning.
                    stats_v2[col_name]['nr_warnings'] += 1
            self.log.info(f'Finished analyzing column: {col_name}!\n')

        log_interesting_stats(self.log, stats)

        self.transaction.lmd['data_preparation'][
            'accepted_margin_of_error'] = self.transaction.lmd[
                'sample_margin_of_error']

        self.transaction.lmd['data_preparation']['total_row_count'] = len(
            input_data.data_frame)
        self.transaction.lmd['data_preparation']['used_row_count'] = len(
            sample_df)
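
The `lof_outliers` / `lof_based_outlier_score` keys produced by `compute_scores` point at Local Outlier Factor. A self-contained sketch of how such values could be computed with scikit-learn's `LocalOutlierFactor`; the key names mirror the stats dict above, but the actual scoring inside `compute_scores` may differ:

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    def lof_outlier_report(col_data):
        # Assumes col_data holds at least two numeric values.
        values = np.asarray(col_data, dtype=float).reshape(-1, 1)
        lof = LocalOutlierFactor(n_neighbors=min(20, len(values) - 1))
        # fit_predict labels each point: -1 for outliers, 1 for inliers.
        labels = lof.fit_predict(values)
        outliers = values[labels == -1].ravel().tolist()
        # Score the column by the fraction of points flagged as outliers.
        score = float((labels == -1).mean())
        return {'lof_outliers': outliers, 'lof_based_outlier_score': score}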