Example 1
    def test_deduce_foreign_key(self, transaction, lmd):
        """Tests that basic cases of type deduction work correctly"""
        hmd = transaction.hmd

        lmd['handle_foreign_keys'] = True

        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)
        n_points = 100

        input_dataframe = pd.DataFrame(
            {
                'numeric_id': list(range(n_points)),
                'uuid': [str(uuid4()) for i in range(n_points)]
            },
            index=list(range(n_points)))

        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        type_deductor.run(input_data)

        stats_v2 = lmd['stats_v2']

        assert stats_v2['numeric_id']['is_foreign_key']
        assert stats_v2['uuid']['is_foreign_key']

        assert 'numeric_id' in lmd['columns_to_ignore']
        assert 'uuid' in lmd['columns_to_ignore']
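For context, here is a minimal sketch of the kind of heuristic the test above exercises; the helper name and thresholds are illustrative assumptions, not MindsDB's actual implementation:

import pandas as pd

def looks_like_foreign_key(series: pd.Series, name: str) -> bool:
    # Toy heuristic (assumed): near-100% unique values plus an id-like name.
    unique_ratio = series.nunique() / len(series)
    return unique_ratio > 0.99 and any(hint in name.lower() for hint in ('id', 'uuid', 'key'))

df = pd.DataFrame({'numeric_id': list(range(100))})
assert looks_like_foreign_key(df['numeric_id'], 'numeric_id')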
Example 2
    def __init__(self,
                 session,
                 light_transaction_metadata,
                 heavy_transaction_metadata,
                 logger=log):
        """
        A transaction is the interface to start some MindsDB operation within a session

        :param session:
        :type session: utils.controllers.session_controller.SessionController
        :param light_transaction_metadata:
        :type light_transaction_metadata: dict
        :param heavy_transaction_metadata:
        :type heavy_transaction_metadata: dict
        """

        self.session = session
        self.lmd = light_transaction_metadata
        self.lmd['created_at'] = str(datetime.datetime.now())
        self.hmd = heavy_transaction_metadata

        # variables to be defined by setup
        self.error = None
        self.errorMsg = None

        self.input_data = TransactionData()
        self.output_data = TrainTransactionOutputData()

        # variables that can be persisted

        self.log = logger

        self.run()
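The structural detail worth calling out is the light/heavy metadata split: lmd holds small, JSON-serializable state (Example 4 below asserts json.dumps(transaction.lmd) succeeds), while hmd holds bulky in-memory objects. A self-contained stand-in, with names of my own choosing and without the run() call:

import datetime

class ToyTransaction:
    # Minimal stand-in for the constructor above, not the real class.
    def __init__(self, session, light_transaction_metadata, heavy_transaction_metadata):
        self.session = session
        self.lmd = light_transaction_metadata   # small, JSON-serializable state
        self.lmd['created_at'] = str(datetime.datetime.now())
        self.hmd = heavy_transaction_metadata   # bulky, in-memory-only state

tx = ToyTransaction(session=object(),  # placeholder for a SessionController
                    light_transaction_metadata={},
                    heavy_transaction_metadata={})
assert 'created_at' in tx.lmd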
Example 3
    def __init__(self, session, transaction_metadata, breakpoint=PHASE_END):
        """
        A transaction is the interface to start some MindsDB operation within a session

        :param session:
        :type session: utils.controllers.session_controller.SessionController
        :param transaction_metadata:
        :type transaction_metadata: TransactionMetadata
        :param breakpoint:
        """

        self.session = session
        self.breakpoint = breakpoint
        self.session.current_transaction = self
        self.metadata = transaction_metadata  # type: TransactionMetadata

        # variables to be defined by setup
        self.error = None
        self.errorMsg = None

        self.input_data = TransactionData()
        self.output_data = TransactionOutputData(
            predicted_columns=self.metadata.model_predict_columns)

        self.model_data = ModelData()

        # variables that can be persisted
        self.persistent_model_metadata = PersistentModelMetadata()
        self.persistent_model_metadata.model_name = self.metadata.model_name
        self.persistent_ml_model_info = PersistentMlModelInfo()
        self.persistent_ml_model_info.model_name = self.metadata.model_name

        self.run()
Example 4
    def test_data_analysis(self, transaction, lmd, stats_v2, stats):
        """Tests that data analyzer doesn't crash on common types"""
        lmd['stats_v2'] = stats_v2
        lmd['column_stats'] = stats
        hmd = transaction.hmd

        data_analyzer = DataAnalyzer(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        n_category_values = 4
        input_dataframe = pd.DataFrame({
            'numeric_int': list(range(n_points)),
            'numeric_float': np.linspace(0, n_points, n_points),
            'date_timestamp': [(datetime.now() - timedelta(minutes=int(i))).isoformat() for i in range(n_points)],
            'date_date': [(datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d') for i in range(n_points)],
            'categorical_str': [f'a{x}' for x in (list(range(n_category_values)) * (n_points//n_category_values))],
            'categorical_binary': [0, 1] * (n_points//2),
            'sequential_array': [f"1,2,3,4,5,{i}" for i in range(n_points)],
            'sequential_text': [f'lorem ipsum long text {i}' for i in range(n_points)],
            # @NOTE: It makes little sense for the tests to crash if this column is missing.
            'categorical_int': [x for x in (list(range(n_category_values)) * (n_points//n_category_values))],
        }, index=list(range(n_points)))

        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        data_analyzer.run(input_data)

        stats_v2 = lmd['stats_v2']

        for col_name in input_dataframe.columns:
            assert stats_v2[col_name]['empty']['empty_percentage'] == 0
            assert not stats_v2[col_name]['empty']['is_empty']
            assert stats_v2[col_name]['histogram']
            assert 'percentage_buckets' in stats_v2[col_name]
            assert stats_v2[col_name]['bias']['entropy']

        assert stats_v2['categorical_str']['unique']['unique_values']
        assert stats_v2['categorical_str']['unique']['unique_percentage'] == 4.0

        # Assert that the histogram on text field is made using words
        assert isinstance(stats_v2['sequential_text']['histogram']['x'][0], str)

        assert hmd == {}

        assert isinstance(json.dumps(transaction.lmd), str)
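The unique_percentage == 4.0 assertion is just the fixture arithmetic: 4 distinct category values over 100 rows. A sketch of the presumed formula (an assumption about the analyzer's internals):

n_points = 100
n_category_values = 4
categorical_str = [f'a{x}' for x in list(range(n_category_values)) * (n_points // n_category_values)]

# Presumed definition: distinct values as a percentage of total rows.
unique_percentage = 100 * len(set(categorical_str)) / len(categorical_str)
assert unique_percentage == 4.0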
Example 5
    def test_empty_column(self, transaction, lmd):
        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        input_dataframe = pd.DataFrame(
            {
                'numeric_int': list(range(n_points)),
                'empty_column': [None for i in range(n_points)],
            },
            index=list(range(n_points)))

        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        type_deductor.run(input_data)

        stats_v2 = lmd['stats_v2']
        assert stats_v2['empty_column']['typing']['data_type'] is None
Example 6
    def test_empty_values(self, transaction, lmd):
        data_analyzer = DataAnalyzer(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        input_dataframe = pd.DataFrame(
            {'numeric_int': list(range(n_points))},
            index=list(range(n_points)))

        stats_v2 = self.get_stats_v2(input_dataframe.columns)
        stats = self.get_stats(stats_v2)
        lmd['stats_v2'] = stats_v2
        lmd['column_stats'] = stats

        input_dataframe.loc[input_dataframe.index[::2], 'numeric_int'] = None
        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        data_analyzer.run(input_data)

        stats_v2 = lmd['stats_v2']

        assert stats_v2['numeric_int']['empty']['empty_percentage'] == 50
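The expected 50 follows directly from nulling every second row; a presumed sketch of the computation behind empty_percentage:

import numpy as np
import pandas as pd

s = pd.Series(range(100), dtype=float)
s.iloc[::2] = np.nan  # null out every second value, as the test does

# Presumed definition: null cells as a percentage of total rows.
empty_percentage = 100 * s.isna().sum() / len(s)
assert empty_percentage == 50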
Example 7
    def test_type_mix(self, transaction, lmd):
        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)

        n_points = 100
        input_dataframe = pd.DataFrame(
            {
                'numeric_float': np.linspace(0, n_points, n_points),
            },
            index=list(range(n_points)))
        input_dataframe.loc[input_dataframe.index[:2], 'numeric_float'] = 'random string'
        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        type_deductor.run(input_data)

        stats_v2 = lmd['stats_v2']
        assert stats_v2['numeric_float']['typing'][
            'data_type'] == DATA_TYPES.NUMERIC
        assert stats_v2['numeric_float']['typing'][
            'data_subtype'] == DATA_SUBTYPES.FLOAT
        assert stats_v2['numeric_float']['typing']['data_type_dist'][
            DATA_TYPES.NUMERIC] == 97
        assert stats_v2['numeric_float']['typing']['data_subtype_dist'][
            DATA_SUBTYPES.FLOAT] == 97
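The data_type_dist assertions suggest per-cell classification followed by a majority vote (and, given 97 out of 100 for 98 clean cells, presumably over a sample rather than every row). A toy version of that idea, not the TypeDeductor's real logic:

from collections import Counter

import numpy as np

values = list(np.linspace(0, 100, 100))
values[:2] = ['random string'] * 2  # corrupt two cells, as the test does

def classify(cell):
    # Toy per-cell classifier: anything float-parseable counts as numeric.
    try:
        float(cell)
        return 'Numeric'
    except (TypeError, ValueError):
        return 'Text'

data_type_dist = Counter(classify(v) for v in values)
assert data_type_dist['Numeric'] == 98
assert max(data_type_dist, key=data_type_dist.get) == 'Numeric'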
Example 8
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        self.normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(self.normal_predictions,
                                            full_dataset, stats,
                                            output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_frame = self.normal_predictions[[output_column]]
            input_data.columns = [output_column]
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            validation_set_output_stats = stats_generator.run(
                input_data=input_data, modify_light_metadata=False)

            if (validation_set_output_stats is not None
                    and 'histogram' in validation_set_output_stats[output_column]):
                all_columns_prediction_distribution[output_column] = \
                    validation_set_output_stats[output_column]['histogram']

        ignorable_input_columns = []
        for input_column in input_columns:
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
                ignorable_input_columns.append(input_column)

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)

            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            col_missing_predictions = model.predict('validate', ignore_columns)

            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)

            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy
            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}
                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                input_data = TransactionData()
                input_data.data_frame = col_missing_predictions[[
                    output_column
                ]]
                input_data.columns = [output_column]

                # @TODO: Running stats generator just to get the histogram is very inefficient, change this
                col_missing_output_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                if (col_missing_output_stats is not None
                        and 'histogram' in col_missing_output_stats[output_column]):
                    columnless_prediction_distribution[output_column][input_column] = \
                        col_missing_output_stats[output_column]['histogram']

        # @TODO should we go back to generating this information based on the buckets of the input columns? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column?
        #for column in ignorable_input_columns:
        #    if column_importance_dict[column] > 0.8 or column_importance_dict[column] < 0.2:

        for output_column in output_columns:
            buckets_stats[output_column] = {}

            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value = row[output_column]
                if 'percentage_buckets' in stats[output_column]:
                    percentage_buckets = stats[output_column][
                        'percentage_buckets']
                else:
                    percentage_buckets = None

                value_bucket = get_value_bucket(value, percentage_buckets,
                                                stats[output_column])
                if value_bucket not in bucket_indexes:
                    bucket_indexes[value_bucket] = []
                bucket_indexes[value_bucket].append(index)

            for bucket in bucket_indexes:
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[
                    bucket_indexes[bucket]]
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                try:
                    col_buckets_stats = stats_generator.run(
                        input_data=input_data, modify_light_metadata=False)
                    buckets_stats[output_column][bucket].update(
                        col_buckets_stats)
                except Exception:
                    print('Could not generate bucket stats for sub-bucket: {}'.
                          format(bucket))

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
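To make the importance formula above concrete, a worked example with made-up accuracies:

normal_accuracy = 0.80

# Accuracy when only this column is present:
col_only_accuracy = 0.60
col_only_normalized_accuracy = col_only_accuracy / normal_accuracy  # 0.75

# Accuracy when only this column is missing:
col_missing_accuracy = 0.70
col_missing_reverse_accuracy = (normal_accuracy - col_missing_accuracy) / normal_accuracy  # 0.125

# Importance is the mean of the two signals.
column_importance = (col_only_normalized_accuracy + col_missing_reverse_accuracy) / 2
assert abs(column_importance - 0.4375) < 1e-9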
Example 9
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        self.normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(self.normal_predictions,
                                            full_dataset, stats,
                                            output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            input_data = TransactionData()
            input_data.data_array = list(
                map(lambda x: [x],
                    list(self.normal_predictions[output_column])))
            input_data.columns = [output_column]
            validation_set_output_stats = stats_generator.run(
                input_data=input_data, modify_light_metadata=False)

            if (validation_set_output_stats is not None
                    and 'histogram' in validation_set_output_stats[output_column]):
                all_columns_prediction_distribution[output_column] = \
                    validation_set_output_stats[output_column]['histogram']

        ignorable_input_columns = []
        for input_column in input_columns:
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
                ignorable_input_columns.append(input_column)

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)

            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            col_missing_predictions = model.predict('validate', ignore_columns)

            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)

            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy
            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}
                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                input_data = TransactionData()
                input_data.data_array = list(
                    map(lambda x: [x],
                        list(col_missing_predictions[output_column])))
                input_data.columns = [output_column]
                col_missing_output_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                if (col_missing_output_stats is not None
                        and 'histogram' in col_missing_output_stats[output_column]):
                    columnless_prediction_distribution[output_column][input_column] = \
                        col_missing_output_stats[output_column]['histogram']

            # If this column is either very important or not important at all, compute stats for each of the buckets (in the validation data)
            if column_importance > 0.8 or column_importance < 0.2:
                split_data = {}
                for value in full_dataset[input_column]:

                    if 'percentage_buckets' in stats[input_column]:
                        bucket = stats[input_column]['percentage_buckets']
                    else:
                        bucket = None

                    vb = get_value_bucket(value, bucket, stats[input_column])
                    if f'{input_column}_bucket_{vb}' not in split_data:
                        split_data[f'{input_column}_bucket_{vb}'] = []

                    split_data[f'{input_column}_bucket_{vb}'].append(value)

                max_length = max(map(len, split_data.values()))

                # Sub-buckets with 6 or fewer values are not relevant, drop them
                columns = [k for k in split_data if len(split_data[k]) > 6]

                row_wise_data = []
                for i in range(max_length):
                    row_wise_data.append([])
                    for k in columns:
                        if len(split_data[k]) > i:
                            row_wise_data[-1].append(split_data[k][i])
                        else:
                            row_wise_data[-1].append(None)

                input_data = TransactionData()
                input_data.data_array = row_wise_data
                input_data.columns = columns

                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                col_buckets_stats = stats_generator.run(
                    input_data=input_data, modify_light_metadata=False)

                buckets_stats.update(col_buckets_stats)

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
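The row-wise padding step above is easy to misread, so here is the same transformation in isolation (bucket names and values are made up):

split_data = {
    'price_bucket_0': [1, 2, 3, 4, 5, 6, 7],
    'price_bucket_1': [10, 20, 30, 40, 50, 60, 70, 80, 90],
    'price_bucket_2': [100],  # 6 or fewer values: dropped as not relevant
}

columns = [k for k in split_data if len(split_data[k]) > 6]
max_length = max(len(v) for v in split_data.values())

# Transpose the ragged per-bucket lists into equal-length rows, padding with None.
row_wise_data = [[split_data[k][i] if i < len(split_data[k]) else None for k in columns]
                 for i in range(max_length)]

assert columns == ['price_bucket_0', 'price_bucket_1']
assert row_wise_data[0] == [1, 10]
assert row_wise_data[8] == [None, 90]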
Example 10
    def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        with disable_console_output(True):
            normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset, stats, output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(
                normal_predictions[output_column],
                data_type=stats[output_column]['data_type'],
                data_subtype=stats[output_column]['data_subtype'])

            if validation_set_output_column_histogram is not None:
                all_columns_prediction_distribution[output_column] = validation_set_output_column_histogram

        ignorable_input_columns = []
        for input_column in input_columns:
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH and input_column not in [x[0] for x in self.transaction.lmd['model_order_by']]:
                ignorable_input_columns.append(input_column)

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [col for col in ignorable_input_columns if col != input_column]
            with disable_console_output(True):
                col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            with disable_console_output(True):
                col_missing_predictions = model.predict('validate', ignore_columns)
            col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)

            combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy)/2
            if combined_column_accuracy < 0:
                combined_column_accuracy = 0
            column_importance = 10*(1 - (normal_accuracy - combined_column_accuracy)/normal_accuracy)
            if column_importance < 1:
                column_importance = 1
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}

                col_missing_output_histogram, _ = StatsGenerator.get_histogram(
                    col_missing_predictions[output_column],
                    data_type=stats[output_column]['data_type'],
                    data_subtype=stats[output_column]['data_subtype'])

                if col_missing_output_histogram is not None:
                    columnless_prediction_distribution[output_column][input_column] = col_missing_output_histogram

        # @TODO should we go back to generating this information based on the buckets of the input columns? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column?
        for output_column in output_columns:
            buckets_stats[output_column] = {}

            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value = row[output_column]
                if 'percentage_buckets' in stats[output_column]:
                    percentage_buckets = stats[output_column]['percentage_buckets']
                else:
                    percentage_buckets = None

                value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column], self.transaction.hmd)
                if value_bucket not in bucket_indexes:
                    bucket_indexes[value_bucket] = []
                bucket_indexes[value_bucket].append(index)

            for bucket in bucket_indexes:
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None, transaction=self.transaction)
                try:
                    with disable_console_output():
                        col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
                    buckets_stats[output_column][bucket].update(col_buckets_stats)
                except Exception:
                    pass

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
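This variant rescales importance onto a 1-10 range; a worked example with the same made-up accuracies as before:

normal_accuracy = 0.80
col_only_accuracy = 0.60
col_missing_accuracy = 0.70

combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy) / 2  # 0.35
combined_column_accuracy = max(combined_column_accuracy, 0)  # floor at 0

# Map relative accuracy loss onto a 1-10 score, flooring at 1.
column_importance = 10 * (1 - (normal_accuracy - combined_column_accuracy) / normal_accuracy)
column_importance = max(column_importance, 1)

assert abs(column_importance - 4.375) < 1e-9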
Example 11
    def test_type_deduction(self, transaction, lmd):
        """Tests that basic cases of type deduction work correctly"""
        hmd = transaction.hmd

        type_deductor = TypeDeductor(session=transaction.session,
                                     transaction=transaction)

        n_points = 100

        # Apparently for n_category_values = 10 it doesn't work
        n_category_values = 4
        input_dataframe = pd.DataFrame({
            'numeric_int': list(range(n_points)),
            'numeric_float': np.linspace(0, n_points, n_points),
            'date_timestamp': [(datetime.now() - timedelta(minutes=int(i))).isoformat() for i in range(n_points)],
            'date_date': [(datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d') for i in range(n_points)],
            'categorical_str': [f'a{x}' for x in (list(range(n_category_values)) * (n_points // n_category_values))],
            'categorical_int': [x for x in (list(range(n_category_values)) * (n_points // n_category_values))],
            'categorical_binary': [0, 1] * (n_points // 2),
            'sequential_array': [f"1,2,3,4,5,{i}" for i in range(n_points)],
            'sequential_text': [f'lorem ipsum long text {i}' for i in range(n_points)],
        }, index=list(range(n_points)))

        input_data = TransactionData()
        input_data.data_frame = input_dataframe
        type_deductor.run(input_data)

        stats_v2 = lmd['stats_v2']

        for col_name in input_dataframe.columns:
            expected_type = test_column_types[col_name][0]
            expected_subtype = test_column_types[col_name][1]
            assert stats_v2[col_name]['typing']['data_type'] == expected_type
            assert stats_v2[col_name]['typing'][
                'data_subtype'] == expected_subtype
            assert stats_v2[col_name]['typing']['data_type_dist'][
                expected_type] == 99
            assert stats_v2[col_name]['typing']['data_subtype_dist'][
                expected_subtype] == 99

        for col_name in stats_v2:
            assert not stats_v2[col_name]['is_foreign_key']

        assert DATA_SUBTYPES.INT in stats_v2['categorical_int'][
            'additional_info']['other_potential_subtypes']
        assert hmd == {}

        assert isinstance(json.dumps(transaction.lmd), str)