Example no. 1
 def _is_number(self, string):
     """ Returns True if string is a number. """
     try:
         # Should crash if not number
         clean_float(str(string))
         if '.' in str(string) or ',' in str(string):
             return DATA_SUBTYPES.FLOAT
         else:
             return DATA_SUBTYPES.INT
     except ValueError:
         return False
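
For context, here is a minimal sketch of the two helpers these examples lean on. `DATA_SUBTYPES` and `clean_float` as written below are assumptions reconstructed from how the snippets use them, not the project's actual definitions:

from enum import Enum

class DATA_SUBTYPES(Enum):
    INT = 'Int'
    FLOAT = 'Float'

def clean_float(val):
    # Strip whitespace and treat a comma as a decimal separator, then parse.
    # Raises ValueError on anything that is not number-like.
    if isinstance(val, str):
        val = val.strip().replace(',', '.')
    return float(val)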
Example no. 2
    def clean_int_and_date_data(col_data):
        cleaned_data = []

        for value in col_data:
            if value != '' and value != '\r' and value != '\n':
                cleaned_data.append(value)

        # Drop empty/placeholder values, then coerce the rest to floats
        cleaned_data = [
            clean_float(i) for i in cleaned_data
            if str(i) not in ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null']
        ]
        return cleaned_data
Example no. 3
def clean_int_and_date_data(col_data, log):
    cleaned_data = []

    for ele in col_data:
        if str(ele) not in ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA', 'null'] and (not ele or not str(ele).isspace()):
            try:
                cleaned_data.append(clean_float(ele))
            except Exception as e1:
                try:
                    cleaned_data.append(parse_datetime(str(ele)).timestamp())
                except Exception as e2:
                    log.warning(f'Failed to parse numerical value with error chain:\n {e1} -> {e2}\n')
                    cleaned_data.append(0)

    return cleaned_data
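
A quick usage sketch, assuming the imports the snippet needs: `np` is numpy, `parse_datetime` is taken to be `dateutil.parser.parse`, and `log` is a standard library logger — all assumptions about this codebase:

import logging
import numpy as np
from dateutil.parser import parse as parse_datetime

log = logging.getLogger(__name__)

raw = ['3.14', '2019-01-01', '', 'null', 42, 'not-a-number']
print(clean_int_and_date_data(raw, log))
# -> [3.14, 1546300800.0, 42.0, 0]  (the timestamp depends on the local timezone)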
Example no. 4
    def clean_int_and_date_data(col_data):
        cleaned_data = []

        for value in col_data:
            if value != '' and value != '\r' and value != '\n':
                cleaned_data.append(value)

        cleaned_data_new = []

        for ele in cleaned_data:
            if str(ele) not in [
                    '',
                    str(None),
                    str(False),
                    str(np.nan), 'NaN', 'nan', 'NA', 'null'
            ]:
                try:
                    cleaned_data_new.append(clean_float(ele))
                except Exception:
                    cleaned_data_new.append(
                        parse_datetime(str(ele)).timestamp())

        return cleaned_data_new
Example no. 5
    def run(self):
        """
        # Runs the stats generation phase
        # This shouldn't alter the columns themselves, but rather provide the `stats` metadata object and update the types for each column
        # A lot of information about the data distribution and quality will  also be logged to the server in this phase
        """

        header = self.transaction.input_data.columns
        non_null_data = {}
        all_sampled_data = {}

        for column in header:
            non_null_data[column] = []
            all_sampled_data[column] = []

        empty_count = {}
        column_count = {}

        # we don't need to generate statistics over all of the data, so we subsample based on our accepted margin of error
        population_size = len(self.transaction.input_data.data_array)
        sample_size = int(
            calculate_sample_size(
                population_size=population_size,
                margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
                confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))
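        # cap the sample at 1/8th of the population or 3000 rows, whichever is smaller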
        if sample_size > 3000 and sample_size > population_size / 8:
            sample_size = min(round(population_size / 8), 3000)
        # get the indexes of randomly selected rows given the population size
        input_data_sample_indexes = random.sample(range(population_size),
                                                  sample_size)
        self.log.info(
            'population_size={population_size}, sample_size={sample_size} ({percent:.2f}%)'
            .format(population_size=population_size,
                    sample_size=sample_size,
                    percent=(sample_size / population_size) * 100))

        for sample_i in input_data_sample_indexes:
            row = self.transaction.input_data.data_array[sample_i]

            for i, val in enumerate(row):
                column = header[i]
                value = cast_string_to_python_type(val)
                if column not in empty_count:
                    empty_count[column] = 0
                    column_count[column] = 0
                if value is None:
                    empty_count[column] += 1
                else:
                    non_null_data[column].append(value)
                all_sampled_data[column].append(value)
                column_count[column] += 1
        stats = {}

        col_data_dict = {}
        for i, col_name in enumerate(non_null_data):
            col_data = non_null_data[col_name]  # all rows in just one column
            full_col_data = all_sampled_data[col_name]
            data_type, curr_data_subtype, data_type_dist, data_subtype_dist, additional_info = self._get_column_data_type(
                col_data, i)

            if data_type == DATA_TYPES.DATE:
                # use `j` here so the outer loop's column index `i` isn't shadowed
                for j, element in enumerate(col_data):
                    if str(element) in [
                            str(''),
                            str(None),
                            str(False),
                            str(np.nan), 'NaN', 'nan', 'NA', 'null'
                    ]:
                        col_data[j] = None
                    else:
                        try:
                            col_data[j] = int(
                                parse_datetime(element).timestamp())
                        except Exception:
                            self.log.warning(
                                'Expected a date but could not convert the string, current value {value}'
                                .format(value=element))
                            col_data[j] = None

            if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
                new_data = []

                for value in col_data:
                    if value != '' and value != '\r' and value != '\n':
                        new_data.append(value)

                col_data = [
                    clean_float(i) for i in new_data if str(i) not in [
                        '',
                        str(None),
                        str(False),
                        str(np.nan), 'NaN', 'nan', 'NA', 'null'
                    ]
                ]

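                # Bin the data into 50 buckets; np.histogram returns bin edges in x,
                # so average adjacent edges to get bin centers.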
                y, x = np.histogram(col_data, 50, density=False)
                x = (x + np.roll(x, -1))[:-1] / 2.0
                x = x.tolist()
                y = y.tolist()

                xp = []

                if len(col_data) > 0:
                    max_value = max(col_data)
                    min_value = min(col_data)
                    mean = np.mean(col_data)
                    median = np.median(col_data)
                    var = np.var(col_data)
                    skew = st.skew(col_data)
                    kurtosis = st.kurtosis(col_data)

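                    # Build `percentage_buckets`: edges that start at min_value and
                    # grow geometrically (each step adds 10% of the distance already
                    # covered), giving finer buckets near the minimum.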
                    inc_rate = 0.1
                    initial_step_size = abs(max_value - min_value) / 100

                    xp += [min_value]
                    i = min_value + initial_step_size

                    while i < max_value:

                        xp += [i]
                        i_inc = abs(i - min_value) * inc_rate
                        i = i + i_inc
                else:
                    max_value = 0
                    min_value = 0
                    mean = 0
                    median = 0
                    var = 0
                    skew = 0
                    kurtosis = 0
                    xp = []

                # True if any value has a fractional part; also avoids the crash
                # max() would raise on an empty column
                is_float = any(int(i) != i for i in col_data)

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "mean": mean,
                    "median": median,
                    "variance": var,
                    "skewness": skew,
                    "kurtosis": kurtosis,
                    "max": max_value,
                    "min": min_value,
                    "is_float": is_float,
                    "histogram": {
                        "x": x,
                        "y": y
                    },
                    "percentage_buckets": xp
                }
            elif data_type == DATA_TYPES.CATEGORICAL:
                all_values = []
                for row in self.transaction.input_data.data_array:
                    all_values.append(row[i])

                histogram = Counter(all_values)
                all_possible_values = histogram.keys()

                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "histogram": {
                        "x": list(histogram.keys()),
                        "y": list(histogram.values())
                    }
                    #"percentage_buckets": list(histogram.keys())
                }

            # @TODO This is probably wrong, look into it a bit later
            else:
                # see if it's a sentence or a word
                is_full_text = curr_data_subtype == DATA_SUBTYPES.TEXT
                dictionary, histogram = self._get_words_dictionary(
                    col_data, is_full_text)

                # if no words, then no dictionary
                if len(col_data) == 0:
                    dictionary_available = False
                    dictionary_lenght_percentage = 0
                    dictionary = []
                else:
                    dictionary_available = True
                    dictionary_lenght_percentage = len(dictionary) / len(
                        col_data) * 100
                    # if the number of unique values is too large then treat it as text
                    if dictionary_lenght_percentage > 10 and len(
                            col_data) > 50 and not is_full_text:
                        dictionary = []
                        dictionary_available = False
                col_stats = {
                    'data_type': data_type,
                    'data_subtype': curr_data_subtype,
                    "dictionary": dictionary,
                    "dictionaryAvailable": dictionary_available,
                    "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                    "histogram": histogram
                }
            stats[col_name] = col_stats
            stats[col_name]['data_type_dist'] = data_type_dist
            stats[col_name]['data_subtype_dist'] = data_subtype_dist
            stats[col_name]['column'] = col_name
            stats[col_name]['empty_cells'] = empty_count[col_name]
            stats[col_name]['empty_percentage'] = empty_count[
                col_name] * 100 / column_count[col_name]
            if 'separator' in additional_info:
                stats[col_name]['separator'] = additional_info['separator']
            col_data_dict[col_name] = col_data

        for col_name in all_sampled_data:
            stats[col_name].update(
                self._compute_duplicates_score(stats, all_sampled_data,
                                               col_name))
            stats[col_name].update(
                self._compute_empty_cells_score(stats, all_sampled_data,
                                                col_name))
            #stats[col_name].update(self._compute_clf_based_correlation_score(stats, all_sampled_data, col_name))
            stats[col_name].update(
                self._compute_data_type_dist_score(stats, all_sampled_data,
                                                   col_name))
            stats[col_name].update(
                self._compute_z_score(stats, col_data_dict, col_name))
            stats[col_name].update(
                self._compute_lof_score(stats, col_data_dict, col_name))
            stats[col_name].update(
                self._compute_similariy_score(stats, all_sampled_data,
                                              col_name))
            stats[col_name].update(
                self._compute_value_distribution_score(stats, all_sampled_data,
                                                       col_name))

            stats[col_name].update(
                self._compute_consistency_score(stats, col_name))
            stats[col_name].update(
                self._compute_redundancy_score(stats, col_name))
            stats[col_name].update(
                self._compute_variability_score(stats, col_name))

            stats[col_name].update(
                self._compute_data_quality_score(stats, col_name))

        total_rows = len(self.transaction.input_data.data_array)
        test_rows = len(self.transaction.input_data.test_indexes)
        validation_rows = len(self.transaction.input_data.validation_indexes)
        train_rows = len(self.transaction.input_data.train_indexes)

        self.transaction.lmd['column_stats'] = stats
        self.transaction.lmd['data_preparation'][
            'total_row_count'] = total_rows
        self.transaction.lmd['data_preparation']['test_row_count'] = test_rows
        self.transaction.lmd['data_preparation'][
            'train_row_count'] = train_rows
        self.transaction.lmd['data_preparation'][
            'validation_row_count'] = validation_rows

        self._log_interesting_stats(stats)
        return stats
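
The `calculate_sample_size` helper isn't shown in these examples. As a rough guide, here is a minimal sketch of what such a function typically computes — Cochran's formula with a finite-population correction. This is an assumption about the codebase, not its actual implementation:

from scipy.stats import norm

def calculate_sample_size(population_size, margin_error=0.05, confidence_level=0.95):
    # z-score for a two-tailed interval at the requested confidence level
    z = norm.ppf(1 - (1 - confidence_level) / 2)
    # Cochran's formula with p = 0.5 (worst-case variance)
    n0 = (z ** 2) * 0.25 / (margin_error ** 2)
    # finite-population correction
    return n0 / (1 + (n0 - 1) / population_size)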
Example no. 6
 def clean_float_or_none(val):
     """ Like clean_float, but returns None instead of raising on bad input. """
     try:
         return clean_float(val)
     except Exception:
         return None
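
A short usage note, assuming `clean_float` behaves as sketched under Example no. 1:

print(clean_float_or_none('3.14'))   # 3.14
print(clean_float_or_none('1,5'))    # 1.5 (comma treated as a decimal separator)
print(clean_float_or_none('hello'))  # None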