def sample_data(df, sample_margin_of_error, sample_confidence_level, log):
    population_size = len(df)

    sample_size = int(
        calculate_sample_size(population_size, sample_margin_of_error,
                              sample_confidence_level)
    ) if population_size > 50 else population_size
    sample_size_pct = sample_size * 100 / population_size

    # get the indexes of randomly selected rows given the population size
    input_data_sample_indexes = random.sample(range(population_size),
                                              sample_size)

    log.info(
        f'Analyzing a sample of {sample_size} rows from a total population of '
        f'{population_size} rows, which is equivalent to {sample_size_pct:.2f}% of your data.'
    )

    return df.iloc[input_data_sample_indexes]
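The calculate_sample_size helper used above is not shown in these examples. The log messages further down link to a Cochran's sample size calculator, so a minimal sketch under that assumption (the exact formula and the default arguments are guesses, not the real implementation) could look like this:

from scipy import stats as st

def calculate_sample_size(population_size, margin_error=0.02, confidence_level=0.95):
    # Cochran's formula with worst-case proportion p = 0.5 (assumed defaults)
    z = st.norm.ppf(1 - (1 - confidence_level) / 2)
    n0 = (z ** 2) * 0.25 / (margin_error ** 2)
    # finite population correction, so the sample size never exceeds the population
    return n0 / (1 + (n0 - 1) / population_size)

With the assumed defaults above, sample_data(df, 0.02, 0.95, log) would analyze at most roughly 2,400 rows however large df gets.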
    def run(self, input_data, modify_light_metadata, hmd=None, print_logs=True):
        """
        # Runs the stats generation phase
        # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column
        # A lot of information about the data distribution and quality will  also be logged to the server in this phase
        """

        if not print_logs:
            self.log = logging.getLogger('null-logger')
            self.log.propagate = False

        # we don't need to generate statistics over all of the data, so we subsample based on our accepted margin of error
        population_size = len(input_data.data_frame)

        if population_size < 50:
            sample_size = population_size
        else:
            sample_size = int(calculate_sample_size(population_size=population_size, margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR, confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))
            #if sample_size > 3000 and sample_size > population_size/8:
            #    sample_size = min(round(population_size/8),3000)

        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size), sample_size)
        self.log.info('population_size={population_size},  sample_size={sample_size}  {percent:.2f}%'.format(population_size=population_size, sample_size=sample_size, percent=(sample_size/population_size)*100))

        all_sampled_data = input_data.data_frame.iloc[input_data_sample_indexes]

        stats = {}
        col_data_dict = {}

        for col_name in all_sampled_data.columns.values:
            col_data = all_sampled_data[col_name].dropna()
            full_col_data = all_sampled_data[col_name]

            data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info, column_status = self._get_column_data_type(col_data, input_data.data_frame, col_name)


            if column_status == 'Column empty':
                if modify_light_metadata:
                    self.transaction.lmd['malformed_columns']['names'].append(col_name)
                    self.transaction.lmd['malformed_columns']['indices'].append(
                        list(all_sampled_data.columns).index(col_name))
                continue

            new_col_data = []
            if curr_data_subtype == DATA_SUBTYPES.TIMESTAMP: #data_type == DATA_TYPES.DATE:
                for element in col_data:
                    if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null']:
                        new_col_data.append(None)
                    else:
                        try:
                            new_col_data.append(int(parse_datetime(element).timestamp()))
                        except Exception:
                            self.log.warning(f'Could not convert value from column "{col_name}" to a date as expected, instead got: {element}')
                            new_col_data.append(None)
                col_data = new_col_data
            if data_type == DATA_TYPES.NUMERIC or curr_data_subtype == DATA_SUBTYPES.TIMESTAMP:
                histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype)
                x = histogram['x']
                y = histogram['y']

                col_data = StatsGenerator.clean_int_and_date_data(col_data)
                # This means the column is all nulls, which we don't handle at the moment
                if len(col_data) < 1:
                    return None

                xp = []

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)


                    # build "percentage buckets": exponentially widening steps from min to max
                    inc_rate = 0.1
                    initial_step_size = abs(max_value-min_value)/100

                    xp += [min_value]
                    i = min_value + initial_step_size

                    while i < max_value:
                        xp += [i]
                        i_inc = abs(i-min_value)*inc_rate
                        i = i + i_inc
                else:
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0
                    xp = []

                is_float = any(int(i) != i for i in col_data)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "mean": mean,
                    "median": median,
                    "variance": var,
                    "skewness": skew,
                    "kurtosis": kurtosis,
                    "max": max_value,
                    "min": min_value,
                    "is_float": is_float,
                    "histogram": {
                        "x": x,
                        "y": y
                    },
                    "percentage_buckets": xp
                }
            elif data_type == DATA_TYPES.CATEGORICAL or curr_data_subtype == DATA_SUBTYPES.DATE:
                histogram, _ = StatsGenerator.get_histogram(input_data.data_frame[col_name], data_type=data_type, data_subtype=curr_data_subtype)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "histogram": histogram,
                    "percentage_buckets": histogram['x']
                }

            elif curr_data_subtype == DATA_SUBTYPES.IMAGE:
                histogram, percentage_buckets = StatsGenerator.get_histogram(col_data, data_subtype=curr_data_subtype)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    'percentage_buckets': percentage_buckets,
                    'histogram': histogram
                }

            # @TODO This is probably wrong, look into it a bit later
            else:
                # see if it's a sentence or a word
                histogram, _ = StatsGenerator.get_histogram(col_data, data_type=data_type, data_subtype=curr_data_subtype)
                dictionary = list(histogram.keys())

                # if no words, then no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_lenght_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_lenght_percentage = len(
                        dictionary) / len(col_data) * 100
                    # if the number of unique values is too large then treat it as text
                    is_full_text = curr_data_subtype == DATA_SUBTYPES.TEXT
                    if dictionary_lenght_percentage > 10 and len(col_data) > 50 and not is_full_text:
                        dictionary = []
                        dictionary_available = False

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "dictionary": dictionary,
                    "dictionaryAvailable": dictionary_available,
                    "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                    "histogram": histogram
                }
            stats[col_name] = col_stats
            stats[col_name]['data_type_dist'] = data_type_dist
            stats[col_name]['data_subtype_dist'] = data_subtype_dist
            stats[col_name]['column'] = col_name

            empty_count = len(full_col_data) - len(col_data)

            stats[col_name]['empty_cells'] = empty_count
            stats[col_name]['empty_percentage'] = empty_count * 100 / len(full_col_data)
            if 'separator' in additional_info:
                stats[col_name]['separator'] = additional_info['separator']
            col_data_dict[col_name] = col_data

        for col_name in all_sampled_data.columns:
            if col_name in self.transaction.lmd['malformed_columns']['names']:
                continue

            stats[col_name].update(self._compute_duplicates_score(stats, all_sampled_data, col_name))
            stats[col_name].update(self._compute_empty_cells_score(stats, all_sampled_data, col_name))
            #stats[col_name].update(self._compute_clf_based_correlation_score(stats, all_sampled_data, col_name))
            stats[col_name].update(self._compute_data_type_dist_score(stats, all_sampled_data, col_name))
            stats[col_name].update(self._compute_z_score(stats, col_data_dict, col_name))
            stats[col_name].update(self._compute_lof_score(stats, col_data_dict, col_name))
            stats[col_name].update(self._compute_similariy_score(stats, all_sampled_data, col_name))
            stats[col_name].update(self._compute_value_distribution_score(stats, all_sampled_data, col_name))

            stats[col_name].update(self._compute_consistency_score(stats, col_name))
            stats[col_name].update(self._compute_redundancy_score(stats, col_name))
            stats[col_name].update(self._compute_variability_score(stats, col_name))

            stats[col_name].update(self._compute_data_quality_score(stats, col_name))


        total_rows = len(input_data.data_frame)

        if modify_light_metadata:
            self.transaction.lmd['column_stats'] = stats

            if 'sample_margin_of_error' in self.transaction.lmd and self.transaction.lmd['sample_margin_of_error'] is not None:
                self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_margin_of_error']
            else:
                self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = CONFIG.DEFAULT_MARGIN_OF_ERROR

            self.transaction.lmd['data_preparation']['total_row_count'] = total_rows
            self.transaction.lmd['data_preparation']['used_row_count'] = sample_size
            self.transaction.lmd['data_preparation']['test_row_count'] = len(input_data.test_indexes[KEY_NO_GROUP_BY])
            self.transaction.lmd['data_preparation']['train_row_count'] = len(input_data.train_indexes[KEY_NO_GROUP_BY])
            self.transaction.lmd['data_preparation']['validation_row_count'] = len(input_data.validation_indexes[KEY_NO_GROUP_BY])

        self._log_interesting_stats(stats)

        return stats
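StatsGenerator.get_histogram is referenced throughout but not included in these snippets. For numeric columns, a plausible sketch that is consistent with the raw np.histogram usage in the last example below (50 bins, bin centers as the x axis) would be:

import numpy as np

def numeric_histogram(col_data, bins=50):
    # hypothetical helper: per-bin counts plus bin centers, shaped like histogram['x'] / histogram['y']
    y, x_edges = np.histogram(col_data, bins, density=False)
    x_centers = (x_edges + np.roll(x_edges, -1))[:-1] / 2.0
    return {'x': x_centers.tolist(), 'y': y.tolist()}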
Example #3
    def run(self):
        result = self._get_prepared_input_df()

        self.transaction.input_data.columns = result.columns.values.tolist()
        self.transaction.lmd['columns'] = self.transaction.input_data.columns
        self.transaction.input_data.data_frame = result

        self._validate_input_data_integrity()

        is_time_series = self.transaction.lmd['model_is_time_series']
        group_by = self.transaction.lmd['model_group_by']

        # create a list of the column indexes that make up the group by, so that we can create group-by hashes for each row
        if len(group_by) > 0:
            group_by_col_indexes = [
                self.transaction.input_data.columns.index(group_by_column)
                for group_by_column in group_by
            ]

        # collect all row indexes per group-by value, i.e. all the rows that belong to each group
        self.transaction.input_data.all_indexes[KEY_NO_GROUP_BY] = []
        self.transaction.input_data.train_indexes[KEY_NO_GROUP_BY] = []
        self.transaction.input_data.test_indexes[KEY_NO_GROUP_BY] = []
        self.transaction.input_data.validation_indexes[KEY_NO_GROUP_BY] = []
        for i, row in self.transaction.input_data.data_frame.iterrows():

            if len(group_by) > 0:
                group_by_value = '_'.join([
                    str(row.iloc[group_by_index])
                    for group_by_index in group_by_col_indexes
                ])

                if group_by_value not in self.transaction.input_data.all_indexes:
                    self.transaction.input_data.all_indexes[
                        group_by_value] = []

                self.transaction.input_data.all_indexes[group_by_value] += [i]

            self.transaction.input_data.all_indexes[KEY_NO_GROUP_BY] += [i]

        # move indexes to corresponding train, test, validation, etc and trim input data accordingly
        for key in self.transaction.input_data.all_indexes:
            #If this is a group by, skip the `KEY_NO_GROUP_BY` key
            if len(self.transaction.input_data.all_indexes
                   ) > 1 and key == KEY_NO_GROUP_BY:
                continue

            length = len(self.transaction.input_data.all_indexes[key])
            if self.transaction.lmd['type'] == TRANSACTION_LEARN:
                sample_size = int(
                    calculate_sample_size(population_size=length,
                                          margin_error=self.transaction.
                                          lmd['sample_margin_of_error'],
                                          confidence_level=self.transaction.
                                          lmd['sample_confidence_level']))

                # True when the data is grouped; each group's rows are then split across train, test and validation
                should_split_by_group = isinstance(group_by, list) and len(group_by) > 0

                if should_split_by_group:
                    self.transaction.input_data.train_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            0:round(length - length * CONFIG.TEST_TRAIN_RATIO)]
                    self.transaction.input_data.train_indexes[
                        KEY_NO_GROUP_BY].extend(
                            self.transaction.input_data.train_indexes[key])

                    self.transaction.input_data.test_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            round(length -
                                  length * CONFIG.TEST_TRAIN_RATIO):int(
                                      round(length -
                                            length * CONFIG.TEST_TRAIN_RATIO) +
                                      round(length * CONFIG.TEST_TRAIN_RATIO /
                                            2))]
                    self.transaction.input_data.test_indexes[
                        KEY_NO_GROUP_BY].extend(
                            self.transaction.input_data.test_indexes[key])

                    self.transaction.input_data.validation_indexes[
                        key] = self.transaction.input_data.all_indexes[key][(
                            round(length - length * CONFIG.TEST_TRAIN_RATIO) +
                            round(length * CONFIG.TEST_TRAIN_RATIO / 2)):]
                    self.transaction.input_data.validation_indexes[
                        KEY_NO_GROUP_BY].extend(self.transaction.input_data.
                                                validation_indexes[key])

                else:
                    # make sure the last rows in the time series end up in the test subset
                    train_window = (0,
                                    int(length *
                                        (1 - 2 * CONFIG.TEST_TRAIN_RATIO)))
                    self.transaction.input_data.train_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            train_window[0]:train_window[1]]
                    validation_window = (train_window[1], train_window[1] +
                                         int(length * CONFIG.TEST_TRAIN_RATIO))
                    test_window = (validation_window[1], length)
                    self.transaction.input_data.test_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            test_window[0]:test_window[1]]
                    self.transaction.input_data.validation_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            validation_window[0]:validation_window[1]]

        self.transaction.input_data.train_df = self.transaction.input_data.data_frame.iloc[
            self.transaction.input_data.train_indexes[KEY_NO_GROUP_BY]].copy()
        self.transaction.input_data.test_df = self.transaction.input_data.data_frame.iloc[
            self.transaction.input_data.test_indexes[KEY_NO_GROUP_BY]].copy()
        self.transaction.input_data.validation_df = self.transaction.input_data.data_frame.iloc[
            self.transaction.input_data.
            validation_indexes[KEY_NO_GROUP_BY]].copy()
        # @TODO: Consider deleting self.transaction.input_data.data_frame here

        # log some stats
        if self.transaction.lmd['type'] == TRANSACTION_LEARN:
            # @TODO I don't think the above works, fix at some point or just remove `sample_margin_of_error` option from the interface
            total_rows_in_input = len(self.transaction.input_data.data_frame)
            total_rows_used = sum([
                len(self.transaction.input_data.train_df),
                len(self.transaction.input_data.test_df),
                len(self.transaction.input_data.validation_df)
            ])
            if total_rows_used != total_rows_in_input:
                self.log.info(
                    'You requested to sample with a *margin of error* of {sample_margin_of_error} and a *confidence level* of {sample_confidence_level}. Therefore:'
                    .format(sample_confidence_level=self.transaction.
                            lmd['sample_confidence_level'],
                            sample_margin_of_error=self.transaction.
                            lmd['sample_margin_of_error']))
                self.log.info(
                    'Using a [Cochran\'s sample size calculator](https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/) we got the following sample sizes:'
                )
                data = {
                    'total':
                    [total_rows_in_input, 'Total number of rows in input'],
                    'subsets': [[total_rows_used,
                                 'Total number of rows used']],
                    'label':
                    'Sample size for margin of error of ({sample_margin_of_error}) and a confidence level of ({sample_confidence_level})'
                    .format(sample_confidence_level=self.transaction.
                            lmd['sample_confidence_level'],
                            sample_margin_of_error=self.transaction.
                            lmd['sample_margin_of_error'])
                }
                self.log.infoChart(data, type='pie')
            # @TODO Bad code ends here (see @TODO above)

            data = {
                'subsets':
                [[len(self.transaction.input_data.train_df), 'Train'],
                 [len(self.transaction.input_data.test_df), 'Test'],
                 [
                     len(self.transaction.input_data.validation_df),
                     'Validation'
                 ]],
                'label':
                'Number of rows per subset'
            }

            self.log.info('We have split the input data into:')
            self.log.infoChart(data, type='pie')
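For the non-grouped (time-ordered) branch above, the window arithmetic is easier to see as a standalone sketch; CONFIG.TEST_TRAIN_RATIO is assumed to be a fraction such as 0.1:

def time_ordered_windows(length, test_train_ratio=0.1):
    # train gets the oldest rows, validation the middle slice, test the most recent rows
    train_window = (0, int(length * (1 - 2 * test_train_ratio)))
    validation_window = (train_window[1], train_window[1] + int(length * test_train_ratio))
    test_window = (validation_window[1], length)
    return train_window, validation_window, test_window

# e.g. time_ordered_windows(100) returns ((0, 80), (80, 90), (90, 100))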
Example #4
    def run(self,
            input_data,
            modify_light_metadata,
            hmd=None,
            print_logs=True):
        """
        # Runs the stats generation phase
        # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column
        # A lot of information about the data distribution and quality will  also be logged to the server in this phase
        """
        ''' @TODO Uncomment when we need multiprocessing, possibly disable on OSX
        no_processes = multiprocessing.cpu_count() - 2
        if no_processes < 1:
            no_processes = 1
        pool = multiprocessing.Pool(processes=no_processes)
        '''
        if not print_logs:
            self.log = logging.getLogger('null-logger')
            self.log.propagate = False

        # we don't need to generate statistics over all of the data, so we subsample based on our accepted margin of error
        population_size = len(input_data.data_frame)

        if population_size < 50:
            sample_size = population_size
        else:
            sample_size = int(
                calculate_sample_size(population_size=population_size,
                                      margin_error=self.transaction.
                                      lmd['sample_margin_of_error'],
                                      confidence_level=self.transaction.
                                      lmd['sample_confidence_level']))
            #if sample_size > 3000 and sample_size > population_size/8:
            #    sample_size = min(round(population_size/8),3000)

        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size),
                                                  sample_size)
        self.log.info(
            'population_size={population_size},  sample_size={sample_size}  {percent:.2f}%'
            .format(population_size=population_size,
                    sample_size=sample_size,
                    percent=(sample_size / population_size) * 100))

        all_sampled_data = input_data.data_frame.iloc[
            input_data_sample_indexes]

        stats = {}
        col_data_dict = {}

        for col_name in all_sampled_data.columns.values:
            if col_name in self.transaction.lmd['columns_to_ignore']:
                continue

            col_data = all_sampled_data[col_name].dropna()
            full_col_data = all_sampled_data[col_name]

            data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info, column_status = self._get_column_data_type(
                col_data, input_data.data_frame, col_name)

            if column_status == 'Column empty':
                if modify_light_metadata:
                    self.transaction.lmd['columns_to_ignore'].append(col_name)

                continue

            new_col_data = []

            if curr_data_subtype == DATA_SUBTYPES.TIMESTAMP:  #data_type == DATA_TYPES.DATE:
                for element in col_data:
                    if str(element) in [
                            str(''),
                            str(None),
                            str(False),
                            str(np.nan), 'NaN', 'nan', 'NA', 'null'
                    ]:
                        new_col_data.append(None)
                    else:
                        try:
                            new_col_data.append(
                                int(parse_datetime(element).timestamp()))
                        except Exception:
                            self.log.warning(
                                f'Could not convert value from column "{col_name}" to a date as expected, instead got: {element}'
                            )
                            new_col_data.append(None)
                col_data = new_col_data
            if data_type == DATA_TYPES.NUMERIC or curr_data_subtype == DATA_SUBTYPES.TIMESTAMP:
                histogram, _ = StatsGenerator.get_histogram(
                    col_data,
                    data_type=data_type,
                    data_subtype=curr_data_subtype)
                x = histogram['x']
                y = histogram['y']

                col_data = StatsGenerator.clean_int_and_date_data(col_data)
                # This means the column is all nulls, which we don't handle at the moment
                if len(col_data) < 1:
                    return None

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)
                else:
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0

                is_float = any(int(i) != i for i in col_data)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "mean": mean,
                    "median": median,
                    "variance": var,
                    "skewness": skew,
                    "kurtosis": kurtosis,
                    "max": max_value,
                    "min": min_value,
                    "is_float": is_float,
                    "histogram": {
                        "x": x,
                        "y": y
                    },
                    "percentage_buckets": histogram['x']  #xp
                }

            elif data_type == DATA_TYPES.CATEGORICAL or curr_data_subtype == DATA_SUBTYPES.DATE:
                histogram, _ = StatsGenerator.get_histogram(
                    input_data.data_frame[col_name],
                    data_type=data_type,
                    data_subtype=curr_data_subtype)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "histogram": histogram,
                    "percentage_buckets": histogram['x']
                }

            elif curr_data_subtype == DATA_SUBTYPES.IMAGE:
                histogram, percentage_buckets = StatsGenerator.get_histogram(
                    col_data, data_subtype=curr_data_subtype)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    'percentage_buckets': percentage_buckets,
                    'histogram': histogram
                }

            # @TODO This is probably wrong, look into it a bit later
            else:
                # see if it's a sentence or a word
                histogram, _ = StatsGenerator.get_histogram(
                    col_data,
                    data_type=data_type,
                    data_subtype=curr_data_subtype)
                dictionary = list(histogram.keys())

                # if no words, then no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_lenght_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_lenght_percentage = len(dictionary) / len(
                        col_data) * 100
                    # if the number of unique values is too large then treat it as text
                    is_full_text = curr_data_subtype == DATA_SUBTYPES.TEXT
                    if dictionary_lenght_percentage > 10 and len(col_data) > 50 and not is_full_text:
                        dictionary = []
                        dictionary_available = False

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "dictionary": dictionary,
                    "dictionaryAvailable": dictionary_available,
                    "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                    "histogram": histogram
                }
            stats[col_name] = col_stats
            stats[col_name]['data_type_dist'] = data_type_dist
            stats[col_name]['data_subtype_dist'] = data_subtype_dist
            stats[col_name]['column'] = col_name

            empty_count = len(full_col_data) - len(col_data)

            stats[col_name]['empty_cells'] = empty_count
            stats[col_name]['empty_percentage'] = empty_count * 100 / len(
                full_col_data)
            for k in additional_info:
                stats[col_name][k] = additional_info[k]

            col_data_dict[col_name] = col_data

        for col_name in all_sampled_data.columns:
            if col_name in self.transaction.lmd['columns_to_ignore']:
                continue

            # Use the multiprocessing pool for computing scores which take a very long time to compute
            # For now there's only one and computing it takes way too long, so this is not enabled
            scores = []
            '''
            scores.append(pool.apply_async(compute_clf_based_correlation_score, args=(stats, all_sampled_data, col_name)))
            '''
            for score_promise in scores:
                # Wait for function on process to finish running
                score = score_promise.get()
                stats[col_name].update(score)

            for score_func in [
                    compute_duplicates_score, compute_empty_cells_score,
                    compute_data_type_dist_score, compute_z_score,
                    compute_lof_score, compute_similariy_score,
                    compute_value_distribution_score
            ]:
                start_time = time.time()
                if score_func in (compute_z_score, compute_lof_score):
                    stats[col_name].update(
                        score_func(stats, col_data_dict, col_name))
                else:
                    stats[col_name].update(
                        score_func(stats, all_sampled_data, col_name))

                fun_name = str(score_func)
                run_duration = round(time.time() - start_time, 2)

            stats[col_name].update(compute_consistency_score(stats, col_name))
            stats[col_name].update(compute_redundancy_score(stats, col_name))
            stats[col_name].update(compute_variability_score(stats, col_name))
            stats[col_name].update(compute_data_quality_score(stats, col_name))

            stats[col_name]['is_foreign_key'] = self.is_foreign_key(
                col_name, stats[col_name], col_data_dict[col_name])
            if stats[col_name]['is_foreign_key'] and self.transaction.lmd[
                    'handle_foreign_keys']:
                self.transaction.lmd['columns_to_ignore'].append(col_name)

        total_rows = len(input_data.data_frame)

        if modify_light_metadata:
            self.transaction.lmd['column_stats'] = stats

            self.transaction.lmd['data_preparation'][
                'accepted_margin_of_error'] = self.transaction.lmd[
                    'sample_margin_of_error']

            self.transaction.lmd['data_preparation'][
                'total_row_count'] = total_rows
            self.transaction.lmd['data_preparation'][
                'used_row_count'] = sample_size
        ''' @TODO Uncomment when we need multiprocessing, possibly disable on OSX
        pool.close()
        pool.join()
        '''

        self._log_interesting_stats(stats)
        return stats
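The per-column scoring helpers (compute_z_score, compute_lof_score, compute_duplicates_score and so on) are not part of these snippets. Purely as an illustration of the kind of per-column statistic such a helper might return, and not the actual implementation, a z-score based outlier check could look like:

import numpy as np

def z_score_outlier_sketch(col_data, threshold=3.0):
    # hypothetical sketch: flag values more than `threshold` standard deviations from the mean
    values = np.asarray(col_data, dtype=float)
    if values.size == 0 or values.std() == 0:
        return {'z_score_outlier_indexes': [], 'mean_abs_z_score': 0.0}
    z = np.abs((values - values.mean()) / values.std())
    return {
        'z_score_outlier_indexes': np.where(z > threshold)[0].tolist(),
        'mean_abs_z_score': float(z.mean()),
    }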
Example #5
    def run(self):
        result = self._get_prepared_input_df()

        columns = list(result.columns.values)
        data_array = list(result.values.tolist())

        self.transaction.input_data.columns = columns
        self.transaction.input_data.data_array = data_array

        self._validate_input_data_integrity()

        is_time_series = self.transaction.lmd['model_is_time_series']
        group_by = self.transaction.lmd['model_group_by']

        # create a list of the column indexes that make up the group by, so that we can create group-by hashes for each row
        if len(group_by) > 0:
            group_by_col_indexes = [
                columns.index(group_by_column) for group_by_column in group_by
            ]

        # collect all row indexes per group-by value, i.e. all the rows that belong to each group
        self.transaction.input_data.all_indexes[KEY_NO_GROUP_BY] = []
        self.transaction.input_data.train_indexes[KEY_NO_GROUP_BY] = []
        self.transaction.input_data.test_indexes[KEY_NO_GROUP_BY] = []
        self.transaction.input_data.validation_indexes[KEY_NO_GROUP_BY] = []
        for i, row in enumerate(self.transaction.input_data.data_array):

            if len(group_by) > 0:
                group_by_value = '_'.join([
                    str(row[group_by_index])
                    for group_by_index in group_by_col_indexes
                ])

                if group_by_value not in self.transaction.input_data.all_indexes:
                    self.transaction.input_data.all_indexes[
                        group_by_value] = []

                self.transaction.input_data.all_indexes[group_by_value] += [i]

            self.transaction.input_data.all_indexes[KEY_NO_GROUP_BY] += [i]

        # move indexes to corresponding train, test, validation, etc and trim input data accordingly
        for key in self.transaction.input_data.all_indexes:
            if len(self.transaction.input_data.all_indexes
                   ) > 1 and key == KEY_NO_GROUP_BY:
                continue

            length = len(self.transaction.input_data.all_indexes[key])
            if self.transaction.lmd['type'] == TRANSACTION_LEARN:
                sample_size = int(
                    calculate_sample_size(population_size=length,
                                          margin_error=self.transaction.
                                          lmd['sample_margin_of_error'],
                                          confidence_level=self.transaction.
                                          lmd['sample_confidence_level']))

                # True when the data is grouped; each group's rows are then split across train, test and validation
                should_split_by_group = isinstance(group_by, list) and len(group_by) > 0

                if should_split_by_group:
                    self.transaction.input_data.train_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            0:round(length - length * CONFIG.TEST_TRAIN_RATIO)]
                    self.transaction.input_data.train_indexes[
                        KEY_NO_GROUP_BY].extend(
                            self.transaction.input_data.train_indexes[key])

                    self.transaction.input_data.test_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            round(length -
                                  length * CONFIG.TEST_TRAIN_RATIO):int(
                                      round(length -
                                            length * CONFIG.TEST_TRAIN_RATIO) +
                                      round(length * CONFIG.TEST_TRAIN_RATIO /
                                            2))]
                    self.transaction.input_data.test_indexes[
                        KEY_NO_GROUP_BY].extend(
                            self.transaction.input_data.test_indexes[key])

                    self.transaction.input_data.validation_indexes[
                        key] = self.transaction.input_data.all_indexes[key][(
                            round(length - length * CONFIG.TEST_TRAIN_RATIO) +
                            round(length * CONFIG.TEST_TRAIN_RATIO / 2)):]
                    self.transaction.input_data.validation_indexes[
                        KEY_NO_GROUP_BY].extend(self.transaction.input_data.
                                                validation_indexes[key])

                else:
                    # make sure the last rows in the time series end up in the test subset
                    train_window = (0,
                                    int(length *
                                        (1 - 2 * CONFIG.TEST_TRAIN_RATIO)))
                    self.transaction.input_data.train_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            train_window[0]:train_window[1]]
                    validation_window = (train_window[1], train_window[1] +
                                         int(length * CONFIG.TEST_TRAIN_RATIO))
                    test_window = (validation_window[1], length)
                    self.transaction.input_data.test_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            test_window[0]:test_window[1]]
                    self.transaction.input_data.validation_indexes[
                        key] = self.transaction.input_data.all_indexes[key][
                            validation_window[0]:validation_window[1]]

        # log some stats
        if self.transaction.lmd['type'] == TRANSACTION_LEARN:

            total_rows_used_by_subset = {
                'train': 0,
                'test': 0,
                'validation': 0
            }
            average_number_of_rows_used_per_groupby = {
                'train': 0,
                'test': 0,
                'validation': 0
            }
            number_of_groups_per_subset = {
                'train': 0,
                'test': 0,
                'validation': 0
            }

            for group_key in total_rows_used_by_subset:
                pointer = getattr(self.transaction.input_data,
                                  group_key + '_indexes')
                total_rows_used_by_subset[group_key] = sum(
                    [len(pointer[key_i]) for key_i in pointer])
                number_of_groups_per_subset[group_key] = len(pointer)
                #average_number_of_rows_used_per_groupby[group_key] = total_rows_used_by_subset[group_key] / number_of_groups_per_subset[group_key]

            total_rows_used = sum(total_rows_used_by_subset.values())
            total_rows_in_input = len(self.transaction.input_data.data_array)
            total_number_of_groupby_groups = len(
                self.transaction.input_data.all_indexes)

            if total_rows_used != total_rows_in_input:
                self.log.info(
                    'You requested to sample with a *margin of error* of {sample_margin_of_error} and a *confidence level* of {sample_confidence_level}. Therefore:'
                    .format(sample_confidence_level=self.transaction.
                            lmd['sample_confidence_level'],
                            sample_margin_of_error=self.transaction.
                            lmd['sample_margin_of_error']))
                self.log.info(
                    'Using a [Cochran\'s sample size calculator](https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/find-sample-size/) we got the following sample sizes:'
                )
                data = {
                    'total':
                    [total_rows_in_input, 'Total number of rows in input'],
                    'subsets': [[total_rows_used,
                                 'Total number of rows used']],
                    'label':
                    'Sample size for margin of error of ({sample_margin_of_error}) and a confidence level of ({sample_confidence_level})'
                    .format(sample_confidence_level=self.transaction.
                            lmd['sample_confidence_level'],
                            sample_margin_of_error=self.transaction.
                            lmd['sample_margin_of_error'])
                }
                self.log.infoChart(data, type='pie')
            '''
            if total_number_of_groupby_groups > 1:
                self.log.info('You are grouping your data by [{group_by}], we found:'.format(group_by=', '.join(group_by)))
                data = {
                    'Total number of groupby groups': total_number_of_groupby_groups,
                    'Average number of rows per groupby group': int(sum(average_number_of_rows_used_per_groupby.values())/len(average_number_of_rows_used_per_groupby))
                }
                self.log.infoChart(data, type='list')
            '''

            self.log.info('We have split the input data into:')

            data = {
                'subsets':
                [[total_rows_used_by_subset['train'], 'Train'],
                 [total_rows_used_by_subset['test'], 'Test'],
                 [total_rows_used_by_subset['validation'], 'Validation']],
                'label':
                'Number of rows per subset'
            }

            self.log.infoChart(data, type='pie')
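The group-by bucketing used in the splitters above (join the group-by columns of each row into a key, then collect row indexes per key) can be summarized in isolation. A minimal sketch over a plain data_array, assuming KEY_NO_GROUP_BY is simply a sentinel string:

KEY_NO_GROUP_BY = 'ALL_ROWS_NO_GROUP_BY'  # assumed sentinel; the real constant lives elsewhere

def build_group_indexes(data_array, group_by_col_indexes):
    # map each group-by value (columns joined with '_') to the list of row indexes in that group
    all_indexes = {KEY_NO_GROUP_BY: []}
    for i, row in enumerate(data_array):
        if group_by_col_indexes:
            key = '_'.join(str(row[idx]) for idx in group_by_col_indexes)
            all_indexes.setdefault(key, []).append(i)
        all_indexes[KEY_NO_GROUP_BY].append(i)
    return all_indexes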
Example #6
    def run(self):
        """
        # Runs the stats generation phase
        # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column
        # A lot of information about the data distribution and quality will  also be logged to the server in this phase
        """

        header = self.transaction.input_data.columns
        non_null_data = {}
        all_sampled_data = {}

        for column in header:
            non_null_data[column] = []
            all_sampled_data[column] = []

        empty_count = {}
        column_count = {}

        # we don't need to generate statistics over all of the data, so we subsample based on our accepted margin of error
        population_size = len(self.transaction.input_data.data_array)
        sample_size = int(
            calculate_sample_size(
                population_size=population_size,
                margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
                confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))
        if sample_size > 3000 and sample_size > population_size / 8:
            sample_size = min(round(population_size / 8), 3000)
        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size),
                                                  sample_size)
        self.log.info(
            'population_size={population_size},  sample_size={sample_size}  {percent:.2f}%'
            .format(population_size=population_size,
                    sample_size=sample_size,
                    percent=(sample_size / population_size) * 100))

        for sample_i in input_data_sample_indexes:
            row = self.transaction.input_data.data_array[sample_i]

            for i, val in enumerate(row):
                column = header[i]
                value = cast_string_to_python_type(val)
                if column not in empty_count:
                    empty_count[column] = 0
                    column_count[column] = 0
                if value is None:
                    empty_count[column] += 1
                else:
                    non_null_data[column].append(value)
                all_sampled_data[column].append(value)
                column_count[column] += 1
        stats = {}

        col_data_dict = {}
        for i, col_name in enumerate(non_null_data):
            col_data = non_null_data[col_name]  # all rows in just one column
            full_col_data = all_sampled_data[col_name]
            data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info = self._get_column_data_type(
                col_data, i)

            if data_type == DATA_TYPES.DATE:
                for i, element in enumerate(col_data):
                    if str(element) in [
                            str(''),
                            str(None),
                            str(False),
                            str(np.nan), 'NaN', 'nan', 'NA', 'null'
                    ]:
                        col_data[i] = None
                    else:
                        try:
                            col_data[i] = int(
                                parse_datetime(element).timestamp())
                        except Exception:
                            self.log.warning(
                                'Could not convert string to date as expected, current value: {value}'
                                .format(value=element))
                            col_data[i] = None

            if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
                newData = []

                for value in col_data:
                    if value != '' and value != '\r' and value != '\n':
                        newData.append(value)

                col_data = [
                    clean_float(i) for i in newData if str(i) not in [
                        '',
                        str(None),
                        str(False),
                        str(np.nan), 'NaN', 'nan', 'NA', 'null'
                    ]
                ]

                y, x = np.histogram(col_data, 50, density=False)
                x = (x + np.roll(x, -1))[:-1] / 2.0
                x = x.tolist()
                y = y.tolist()

                xp = []

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)

                    inc_rate = 0.1
                    initial_step_size = abs(max_value - min_value) / 100

                    xp += [min_value]
                    i = min_value + initial_step_size

                    while i < max_value:

                        xp += [i]
                        i_inc = abs(i - min_value) * inc_rate
                        i = i + i_inc
                else:
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0
                    xp = []

                is_float = any(int(i) != i for i in col_data)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "mean": mean,
                    "median": median,
                    "variance": var,
                    "skewness": skew,
                    "kurtosis": kurtosis,
                    "max": max_value,
                    "min": min_value,
                    "is_float": is_float,
                    "histogram": {
                        "x": x,
                        "y": y
                    },
                    "percentage_buckets": xp
                }
            elif data_type == DATA_TYPES.CATEGORICAL:
                all_values = []
                for row in self.transaction.input_data.data_array:
                    all_values.append(row[i])

                histogram = Counter(all_values)
                all_possible_values = histogram.keys()

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "histogram": {
                        "x": list(histogram.keys()),
                        "y": list(histogram.values())
                    }
                    #"percentage_buckets": list(histogram.keys())
                }

            # @TODO This is probably wrong, look into it a bit later
            else:
                # see if it's a sentence or a word
                is_full_text = curr_data_subtype == DATA_SUBTYPES.TEXT
                dictionary, histogram = self._get_words_dictionary(
                    col_data, is_full_text)

                # if no words, then no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_lenght_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_lenght_percentage = len(dictionary) / len(
                        col_data) * 100
                    # if the number of unique values is too large then treat it as text
                    if dictionary_lenght_percentage > 10 and len(col_data) > 50 and not is_full_text:
                        dictionary = []
                        dictionary_available = False
                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "dictionary": dictionary,
                    "dictionaryAvailable": dictionary_available,
                    "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                    "histogram": histogram
                }
            stats[col_name] = col_stats
            stats[col_name]['data_type_dist'] = data_type_dist
            stats[col_name]['data_subtype_dist'] = data_subtype_dist
            stats[col_name]['column'] = col_name
            stats[col_name]['empty_cells'] = empty_count[col_name]
            stats[col_name]['empty_percentage'] = empty_count[
                col_name] * 100 / column_count[col_name]
            if 'separator' in additional_info:
                stats[col_name]['separator'] = additional_info['separator']
            col_data_dict[col_name] = col_data

        for i, col_name in enumerate(all_sampled_data):
            stats[col_name].update(
                self._compute_duplicates_score(stats, all_sampled_data,
                                               col_name))
            stats[col_name].update(
                self._compute_empty_cells_score(stats, all_sampled_data,
                                                col_name))
            #stats[col_name].update(self._compute_clf_based_correlation_score(stats, all_sampled_data, col_name))
            stats[col_name].update(
                self._compute_data_type_dist_score(stats, all_sampled_data,
                                                   col_name))
            stats[col_name].update(
                self._compute_z_score(stats, col_data_dict, col_name))
            stats[col_name].update(
                self._compute_lof_score(stats, col_data_dict, col_name))
            stats[col_name].update(
                self._compute_similariy_score(stats, all_sampled_data,
                                              col_name))
            stats[col_name].update(
                self._compute_value_distribution_score(stats, all_sampled_data,
                                                       col_name))

            stats[col_name].update(
                self._compute_consistency_score(stats, col_name))
            stats[col_name].update(
                self._compute_redundancy_score(stats, col_name))
            stats[col_name].update(
                self._compute_variability_score(stats, col_name))

            stats[col_name].update(
                self._compute_data_quality_score(stats, col_name))

        total_rows = len(self.transaction.input_data.data_array)
        test_rows = len(self.transaction.input_data.test_indexes)
        validation_rows = len(self.transaction.input_data.validation_indexes)
        train_rows = len(self.transaction.input_data.train_indexes)

        self.transaction.lmd['column_stats'] = stats
        self.transaction.lmd['data_preparation'][
            'total_row_count'] = total_rows
        self.transaction.lmd['data_preparation']['test_row_count'] = test_rows
        self.transaction.lmd['data_preparation'][
            'train_row_count'] = train_rows
        self.transaction.lmd['data_preparation'][
            'validation_row_count'] = validation_rows

        self._log_interesting_stats(stats)
        return stats
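cast_string_to_python_type is referenced in the sampling loop above but not defined in these snippets. A plausible sketch (the real helper may well behave differently) that maps empty/null markers to None, tries int, then float, and otherwise keeps the raw string:

def cast_string_to_python_type(value):
    # hypothetical sketch of the casting helper used when sampling rows
    if value is None or str(value).strip() in ('', 'None', 'NaN', 'nan', 'NA', 'null'):
        return None
    for cast in (int, float):
        try:
            return cast(value)
        except (TypeError, ValueError):
            continue
    return value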