Example 1
def test_language_analysis():
    import random

    from langdetect import DetectorFactory
    # Seed langdetect so language detection is deterministic across runs.
    DetectorFactory.seed = 0

    WORDS = {
        'en':
        ['because', 'tree', 'merge', 'work', 'interpret', 'call', 'think'],
        'ru':
        ['только', 'говорить', 'когда', 'человек', 'быть', 'первый', 'осень'],
        'de':
        ['führen', 'stelle', 'heißen', 'konnten', 'schlimm', 'mögen', 'nähe'],
    }

    sent_size = 7
    num_sents = 10

    for lang, words in WORDS.items():
        sentences = [random.sample(words, sent_size) for _ in range(num_sents)]

        nr_words, word_dist, nr_words_dist = analyze_sentences(
            ' '.join(sent) for sent in sentences)

        assert nr_words == len(sentences) * sent_size

        lang_dist = get_language_dist(' '.join(sent) for sent in sentences)
        assert lang_dist[lang] == len(sentences)
        assert 'Unknown' in lang_dist and lang_dist['Unknown'] == 0
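
For reference, a minimal sketch of what the two helpers under test might do (hypothetical implementations; the real get_language_dist and analyze_sentences may differ):

from collections import Counter
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def get_language_dist_sketch(cells):
    # Tally the detected language of each cell; cells that langdetect
    # cannot classify are counted under 'Unknown'.
    dist = Counter({'Unknown': 0})
    for cell in cells:
        try:
            dist[detect(str(cell))] += 1
        except LangDetectException:
            dist['Unknown'] += 1
    return dict(dist)

def analyze_sentences_sketch(sentences):
    # nr_words: total token count; word_dist: per-token frequencies;
    # nr_words_dist: how many sentences have a given token count.
    word_dist, nr_words_dist = Counter(), Counter()
    nr_words = 0
    for sent in sentences:
        tokens = sent.split()
        nr_words += len(tokens)
        word_dist.update(tokens)
        nr_words_dist[len(tokens)] += 1
    return nr_words, dict(word_dist), dict(nr_words_dist)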
Example 2
    def get_column_data_type(self, data, full_data, col_name):
        """
        Provided the column data, define its data type and data subtype.

        :param data: an iterable containing a sample of the data frame
        :param full_data: an iterable containing the whole column of a data frame

        :return: type and type distribution, we can later use type_distribution to determine data quality
        NOTE: type distribution is the count that this column has for belonging cells to each DATA_TYPE
        """
        additional_info = {
            'other_potential_subtypes': [],
            'other_potential_types': []
        }

        if len(data) == 0:
            self.log.warning(
                f'Column {col_name} has no data in it. '
                f'Please remove {col_name} from the training file or fill in some of the values!'
            )
            return None, None, None, None, additional_info

        type_dist, subtype_dist = {}, {}

        # User-provided dtype
        if col_name in self.transaction.lmd['data_subtypes']:
            curr_data_type = self.transaction.lmd['data_types'][col_name]
            curr_data_subtype = self.transaction.lmd['data_subtypes'][col_name]
            type_dist[curr_data_type] = len(data)
            subtype_dist[curr_data_subtype] = len(data)
            self.log.info(
                f'Manually setting the types for column {col_name} to {curr_data_type}->{curr_data_subtype}'
            )
            return curr_data_type, curr_data_subtype, type_dist, subtype_dist, additional_info

        # Forced categorical dtype
        if col_name in self.transaction.lmd['force_categorical_encoding']:
            curr_data_type = DATA_TYPES.CATEGORICAL
            curr_data_subtype = DATA_SUBTYPES.MULTIPLE
            type_dist[DATA_TYPES.CATEGORICAL] = len(data)
            subtype_dist[DATA_SUBTYPES.MULTIPLE] = len(data)
            return curr_data_type, curr_data_subtype, type_dist, subtype_dist, additional_info

        type_dist, subtype_dist, new_additional_info = self.count_data_types_in_column(data)

        if new_additional_info:
            additional_info.update(new_additional_info)

        # @TODO In the future, consider removing or flagging rows whose data type is unknown; they might just be corrupt data
        known_type_dist = {
            k: v
            for k, v in type_dist.items() if k != 'Unknown'
        }

        if known_type_dist:
            # Pick the known dtype with the highest cell count.
            max_known_dtype, max_known_dtype_count = max(
                known_type_dist.items(), key=lambda kv: kv[1])
        else:
            max_known_dtype, max_known_dtype_count = None, None

        nr_vals = len(full_data)
        nr_distinct_vals = len(set(full_data))

        # Data is mostly not unknown, go with type counting results
        if max_known_dtype and max_known_dtype_count > type_dist['Unknown']:
            curr_data_type = max_known_dtype

            # Among the subtypes valid for this type, keep the most frequent.
            possible_subtype_counts = [
                (k, v) for k, v in subtype_dist.items()
                if k in DATA_TYPES_SUBTYPES[curr_data_type]
            ]
            curr_data_subtype, _ = max(possible_subtype_counts,
                                       key=lambda pair: pair[1])
        else:
            curr_data_type, curr_data_subtype = None, None

        # Check for Tags subtype
        if curr_data_subtype != DATA_SUBTYPES.ARRAY:
            lengths = []
            unique_tokens = set()

            can_be_tags = False
            if all(isinstance(x, str) for x in data):
                can_be_tags = True
                delimiter = self.transaction.lmd.get('tags_delimiter', ',')
                for item in data:
                    item_tags = [t.strip() for t in item.split(delimiter)]
                    lengths.append(len(item_tags))
                    unique_tokens = unique_tokens.union(set(item_tags))

            # Tags heuristic: cells average more than 1.3 tokens (roughly, over
            # 30% of samples contain more than one category), there are at least
            # 6 distinct tokens, and the tokens are shared between cells.
            if (can_be_tags and np.mean(lengths) > 1.3
                    and len(unique_tokens) >= 6
                    and len(unique_tokens) / np.mean(lengths) < (len(data) / 4)):
                curr_data_type = DATA_TYPES.CATEGORICAL
                curr_data_subtype = DATA_SUBTYPES.TAGS

        # Categorical based on unique values
        if curr_data_type != DATA_TYPES.DATE and curr_data_subtype != DATA_SUBTYPES.TAGS:
            if nr_distinct_vals < (nr_vals / 20) or nr_distinct_vals < 6:
                if (curr_data_type != DATA_TYPES.NUMERIC) or (nr_distinct_vals < 20):
                    if curr_data_type is not None:
                        additional_info['other_potential_types'].append(
                            curr_data_type)
                        additional_info['other_potential_subtypes'].append(
                            curr_data_subtype)
                    curr_data_type = DATA_TYPES.CATEGORICAL

        # If curr_data_type is still None, then it's text or category
        if curr_data_type is None:
            lang_dist = get_language_dist(data)

            # Normalize language counts into probabilities
            for lang in lang_dist:
                lang_dist[lang] /= len(data)

            # If most cells are unknown language then it's categorical
            if lang_dist['Unknown'] > 0.5:
                curr_data_type = DATA_TYPES.CATEGORICAL
            else:
                nr_words, word_dist, nr_words_dist = analyze_sentences(data)

                if 1 in nr_words_dist and nr_words_dist[1] == nr_words:
                    curr_data_type = DATA_TYPES.CATEGORICAL
                else:
                    curr_data_type = DATA_TYPES.TEXT

                    if len(word_dist) > 500 and nr_words / len(data) > 5:
                        curr_data_subtype = DATA_SUBTYPES.RICH
                    else:
                        curr_data_subtype = DATA_SUBTYPES.SHORT

                    type_dist = {curr_data_type: len(data)}
                    subtype_dist = {curr_data_subtype: len(data)}

                    return curr_data_type, curr_data_subtype, type_dist, subtype_dist, additional_info

        if curr_data_type == DATA_TYPES.CATEGORICAL and curr_data_subtype != DATA_SUBTYPES.TAGS:
            if nr_distinct_vals > 2:
                curr_data_subtype = DATA_SUBTYPES.MULTIPLE
            else:
                curr_data_subtype = DATA_SUBTYPES.SINGLE

        if curr_data_type in [DATA_TYPES.CATEGORICAL, DATA_TYPES.TEXT]:
            type_dist = {curr_data_type: len(data)}
            subtype_dist = {curr_data_subtype: len(data)}

        return curr_data_type, curr_data_subtype, type_dist, subtype_dist, additional_info
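
To make the tags heuristic in get_column_data_type concrete, here is a standalone sketch (the name looks_like_tags and the default delimiter are illustrative, not part of the original code):

import numpy as np

def looks_like_tags(data, delimiter=','):
    # Mirrors the heuristic above: cells average more than 1.3 tokens,
    # at least 6 distinct tokens exist, and tokens recur across cells.
    if len(data) == 0 or not all(isinstance(x, str) for x in data):
        return False
    lengths, unique_tokens = [], set()
    for item in data:
        tokens = [t.strip() for t in item.split(delimiter)]
        lengths.append(len(tokens))
        unique_tokens.update(tokens)
    return (np.mean(lengths) > 1.3
            and len(unique_tokens) >= 6
            and len(unique_tokens) / np.mean(lengths) < len(data) / 4)

The last ratio check requires the distinct-token vocabulary to be small relative to the number of cells, so the same tags must recur across many rows before the column is treated as tags.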
Example 3
    def run(self, input_data):
        stats_v2 = self.transaction.lmd['stats_v2']

        sample_settings = self.transaction.lmd['sample_settings']

        population_size = len(input_data.data_frame)
        if sample_settings['sample_for_analysis']:
            sample_margin_of_error = sample_settings['sample_margin_of_error']
            sample_confidence_level = sample_settings['sample_confidence_level']
            sample_percentage = sample_settings['sample_percentage']
            sample_function = self.transaction.hmd['sample_function']

            sample_df = input_data.sample_df(sample_function,
                                             sample_margin_of_error,
                                             sample_confidence_level,
                                             sample_percentage)

            sample_size = len(sample_df)
        else:
            sample_size = population_size
            sample_df = input_data.data_frame

        self.transaction.log.info(f'Analyzing a sample of {sample_size} '
                                  f'from a total population of {population_size}; '
                                  f'this is equivalent to {round(sample_size * 100 / population_size, 1)}% of your data.')

        for col_name in self.transaction.lmd['empty_columns']:
            stats_v2[col_name] = {}
            stats_v2[col_name]['empty'] = {'is_empty': True}
            self.log.warning(f'Column {col_name} is empty.')

        for col_name in sample_df.columns.values:
            self.log.info(f'Analyzing column: {col_name}!')
            data_type = stats_v2[col_name]['typing']['data_type']
            data_subtype = stats_v2[col_name]['typing']['data_subtype']

            col_data = sample_df[col_name].dropna()
            if data_type == DATA_TYPES.NUMERIC or data_subtype == DATA_SUBTYPES.TIMESTAMP:
                col_data = clean_int_and_date_data(col_data, self.log)

            stats_v2[col_name]['empty'] = get_column_empty_values_report(input_data.data_frame[col_name])

            if data_type == DATA_TYPES.CATEGORICAL:
                hist_data = input_data.data_frame[col_name]
                stats_v2[col_name]['unique'] = get_uniq_values_report(input_data.data_frame[col_name])
            else:
                hist_data = col_data

            histogram, percentage_buckets = get_histogram(hist_data,
                                                          data_type=data_type,
                                                          data_subtype=data_subtype)
            stats_v2[col_name]['histogram'] = histogram
            stats_v2[col_name]['percentage_buckets'] = percentage_buckets
            if histogram:
                S, biased_buckets = compute_entropy_biased_buckets(histogram['y'], histogram['x'])
                stats_v2[col_name]['bias'] = {
                    'entropy': S,
                    'description': """Under the assumption of uniformly distributed data (i.e., same probability for Head or Tails on a coin flip) mindsdb tries to detect potential divergences from such case, and it calls this "potential bias". Thus by our data having any potential bias mindsdb means any divergence from all categories having the same probability of being selected."""
                }
                if biased_buckets:
                    stats_v2[col_name]['bias']['biased_buckets'] = biased_buckets
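                # NOTE: assuming the entropy S is normalized to [0, 1];
                # values well below 1 indicate a skewed distribution.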
                if S < 0.8:
                    if data_type == DATA_TYPES.CATEGORICAL:
                        warning_str = "You may want to check if some categories occur too often or too rarely in this column."
                    else:
                        warning_str = "You may want to check if you see something suspicious on the right-hand-side graph."
                    stats_v2[col_name]['bias']['warning'] = warning_str + " This doesn't necessarily mean there's an issue with your data, it just indicates a higher than usual probability that there might be some issue."

                if data_type == DATA_TYPES.NUMERIC:
                    outliers = lof_outliers(data_subtype, col_data)
                    stats_v2[col_name]['outliers'] = {
                        'outlier_values': outliers,
                        'outlier_buckets': compute_outlier_buckets(
                            outlier_values=outliers,
                            hist_x=histogram['x'],
                            hist_y=histogram['y'],
                            percentage_buckets=percentage_buckets,
                            col_stats=stats_v2[col_name]
                        ),
                        'description': """Potential outliers can be thought of as the "extremes", i.e., data points that are far from the center of mass (mean/median/interquartile range) of the data."""
                    }

            if data_type == DATA_TYPES.TEXT:
                lang_dist = get_language_dist(col_data)
                nr_words, word_dist, nr_words_dist = analyze_sentences(col_data)

                stats_v2[col_name]['avg_words_per_sentence'] = nr_words / len(col_data)
                stats_v2[col_name]['word_dist'] = shrink_word_dist(word_dist)
                stats_v2[col_name]['nr_words_dist'] = nr_words_dist
                stats_v2[col_name]['lang_dist'] = lang_dist

            stats_v2[col_name]['nr_warnings'] = 0
            for x in stats_v2[col_name].values():
                if isinstance(x, dict) and 'warning' in x:
                    self.log.warning(x['warning'])
                    # Count only entries that actually carry a warning.
                    stats_v2[col_name]['nr_warnings'] += 1
            self.log.info(f'Finished analyzing column: {col_name}!\n')

            if data_type == DATA_TYPES.CATEGORICAL:
                if data_subtype == DATA_SUBTYPES.TAGS:
                    delimiter = self.transaction.lmd.get('tags_delimiter', ',')
                    stats_v2[col_name]['tag_hist'] = Counter()
                    for item in col_data:
                        arr = [x.strip() for x in item.split(delimiter)]
                        stats_v2[col_name]['tag_hist'].update(arr)
                    stats_v2[col_name]['guess_probability'] = np.mean([(v / len(col_data))**2 for v in stats_v2[col_name]['tag_hist'].values()])
                    stats_v2[col_name]['balanced_guess_probability'] = 0.5
                else:
                    stats_v2[col_name]['guess_probability'] = sum((k / len(col_data))**2 for k in histogram['y'])
                    stats_v2[col_name]['balanced_guess_probability'] = 1 / len(histogram['y'])

        self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_settings']['sample_margin_of_error']

        self.transaction.lmd['data_preparation']['total_row_count'] = len(input_data.data_frame)
        self.transaction.lmd['data_preparation']['used_row_count'] = len(sample_df)
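
As a worked illustration of the guess probabilities computed above (hypothetical numbers; assuming histogram['y'] holds per-category counts): with 100 cells split 70/20/10 across three categories:

counts = [70, 20, 10]
n = sum(counts)

# Probability that a label drawn from the empirical distribution matches
# an independently drawn label: the sum of squared class frequencies.
guess_probability = sum((c / n) ** 2 for c in counts)  # 0.49 + 0.04 + 0.01 = 0.54
balanced_guess_probability = 1 / len(counts)           # ~0.333

A guess_probability well above the balanced value indicates an imbalanced column dominated by a few categories.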