Example No. 1
    def predict(self, mode='predict', ignore_columns=None):
        # Avoid the mutable-default-argument pitfall
        if ignore_columns is None:
            ignore_columns = []

        predict_dataframe, _, timeseries_cols = self._create_ludwig_dataframe(mode)
        # Use the model definition stored at train time rather than the freshly built one
        model_definition = self.transaction.hmd['ludwig_data']['model_definition']

        if len(timeseries_cols) > 0:
            predict_dataframe, model_definition = self._translate_df_to_timeseries_format(
                predict_dataframe, model_definition, timeseries_cols)

        for ignore_col in ignore_columns:
            try:
                predict_dataframe[ignore_col] = [None] * len(
                    predict_dataframe[ignore_col])
            except KeyError:
                # Date columns are split into component columns when the
                # dataframe is built, so blank each component instead
                for date_appendage in ['_year', '_month', '_day']:
                    predict_dataframe[ignore_col + date_appendage] = [None] * len(
                        predict_dataframe[ignore_col + date_appendage])

        with disable_console_output(True):
            model_dir = self.get_model_dir()
            model = LudwigModel.load(model_dir=model_dir)
            predictions = model.predict(data_df=predict_dataframe,
                                        gpus=self.get_useable_gpus())

        # Strip the '_predictions' suffix that Ludwig appends to output column names
        predictions = predictions.rename(
            columns={col: col.replace('_predictions', '')
                     for col in predictions.columns})

        return predictions
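Every snippet here wraps Ludwig calls in `disable_console_output`, a context manager that silences library logging. Its implementation isn't shown in these examples; below is a minimal sketch of what such a helper could look like, assuming it only needs to redirect stdout/stderr (the real MindsDB helper may differ).

import os
import sys
from contextlib import contextmanager

@contextmanager
def disable_console_output(activate=True):
    # Temporarily point stdout/stderr at os.devnull, restoring them on exit
    if not activate:
        yield
        return
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    with open(os.devnull, 'w') as devnull:
        sys.stdout, sys.stderr = devnull, devnull
        try:
            yield
        finally:
            sys.stdout, sys.stderr = saved_stdout, saved_stderr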
Example No. 2
    def train(self):
        training_dataframe, model_definition, timeseries_cols = self._create_ludwig_dataframe(
            'train')

        if len(timeseries_cols) > 0:
            training_dataframe, model_definition = self._translate_df_to_timeseries_format(
                training_dataframe, model_definition, timeseries_cols, 'train')

        with disable_console_output(True):
            # <---- Ludwig is currently broken here: the model can't be initialized
            # without train_set_metadata, and train_set_metadata can't be obtained
            # without running train. See https://github.com/uber/ludwig/issues/295
            # for updates on the matter.
            #model.initialize_model(train_set_metadata={})
            #train_stats = model.train_online(data_df=training_dataframe) # Where to add model_name? ----> model_name=self.transaction.lmd['name']

            ludwig_save_is_working = False

            if not ludwig_save_is_working:
                shutil.rmtree('results', ignore_errors=True)

            if self.transaction.lmd['rebuild_model'] is True:
                model = LudwigModel(model_definition)
                merged_model_definition = model.model_definition
                train_set_metadata = build_metadata(
                    training_dataframe,
                    (merged_model_definition['input_features'] +
                     merged_model_definition['output_features']),
                    merged_model_definition['preprocessing'])
                model.initialize_model(train_set_metadata=train_set_metadata,
                                       gpus=self.get_useable_gpus())

                train_stats = model.train(
                    data_df=training_dataframe,
                    model_name=self.transaction.lmd['name'],
                    skip_save_model=ludwig_save_is_working,
                    skip_save_progress=True,
                    gpus=self.get_useable_gpus())
            else:
                model = LudwigModel.load(model_dir=self.get_model_dir())
                train_stats = model.train(
                    data_df=training_dataframe,
                    model_name=self.transaction.lmd['name'],
                    skip_save_model=ludwig_save_is_working,
                    skip_save_progress=True,
                    gpus=self.get_useable_gpus())

            for k in train_stats['train']:
                if k not in self.transaction.lmd['model_accuracy']['train']:
                    self.transaction.lmd['model_accuracy']['train'][k] = []
                    self.transaction.lmd['model_accuracy']['test'][k] = []
                elif k != 'combined':
                    # We should be adding the per-output accuracy here, but we only
                    # have it for 'combined', so use that for now; this only
                    # affects multi-output scenarios anyway
                    pass
                else:
                    self.transaction.lmd['model_accuracy']['train'][k].extend(
                        train_stats['train'][k]['accuracy'])
                    self.transaction.lmd['model_accuracy']['test'][k].extend(
                        train_stats['test'][k]['accuracy'])
            '''
            Online-training attempt that isn't working:
            model = LudwigModel.load(self.transaction.lmd['ludwig_data']['ludwig_save_path'])
            for i in range(0,100):
                train_stats = model.train_online(data_df=training_dataframe)
                # The resulting train_stats are "None"... wonderful -_-
            '''

        ludwig_model_savepath = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH,
            self.transaction.lmd['name'] + '_ludwig_data')
        if ludwig_save_is_working:
            model.save(ludwig_model_savepath)
            model.close()
        else:
            shutil.rmtree(ludwig_model_savepath, ignore_errors=True)
            shutil.move(os.path.join('results',
                                     os.listdir('results')[0]),
                        ludwig_model_savepath)
        self.transaction.lmd['ludwig_data'] = {
            'ludwig_save_path': ludwig_model_savepath
        }
        self.transaction.hmd['ludwig_data'] = {
            'model_definition': model_definition
        }
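A note on the `k != 'combined'` comparison in the accuracy loop: `is` tests object identity rather than value, and string interning is a CPython implementation detail, so an identity check against a string literal can silently take the wrong branch. A quick illustration:

a = 'combined'
b = ''.join(['com', 'bined'])  # builds an equal but distinct string object
print(a == b)  # True: equality compares values, which is what the loop needs
print(a is b)  # typically False in CPython: identity compares objects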
Example No. 3
    def get_column_importance(self, model, output_columns, input_columns,
                              full_dataset, stats):
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset,
                                            stats, output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(
                normal_predictions[output_column],
                data_type=stats[output_column]['data_type'],
                data_subtype=stats[output_column]['data_subtype'])

            if validation_set_output_column_histogram is not None:
                all_columns_prediction_distribution[
                    output_column] = validation_set_output_column_histogram

        ignorable_input_columns = []
        for input_column in input_columns:
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
                ignorable_input_columns.append(input_column)

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [
                col for col in ignorable_input_columns if col != input_column
            ]
            col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                                  full_dataset, stats,
                                                  output_columns)

            col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            col_missing_predictions = model.predict('validate', ignore_columns)

            col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                     full_dataset, stats,
                                                     output_columns)

            col_missing_reverse_accuracy = (
                normal_accuracy - col_missing_accuracy) / normal_accuracy
            column_importance = (col_only_normalized_accuracy +
                                 col_missing_reverse_accuracy) / 2
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}

                # @TODO: Running stats generator just to get the histogram is very inefficient, change this
                col_missing_output_histogram, _ = StatsGenerator.get_histogram(
                    col_missing_predictions[output_column],
                    data_type=stats[output_column]['data_type'],
                    data_subtype=stats[output_column]['data_subtype'])

                if col_missing_output_histogram is not None:
                    columnless_prediction_distribution[output_column][
                        input_column] = col_missing_output_histogram

        # @TODO Should we go back to generating this information based on the buckets of the input columns? Or keep doing the stats generation for the input columns based on the indexes of the buckets for the output column?
        #for column in ignorable_input_columns:
        #    if column_importance_dict[column] > 0.8 or column_importance_dict[column] < 0.2:

        for output_column in output_columns:
            buckets_stats[output_column] = {}

            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value = row[output_column]
                if 'percentage_buckets' in stats[output_column]:
                    percentage_buckets = stats[output_column][
                        'percentage_buckets']
                else:
                    percentage_buckets = None

                value_bucket = get_value_bucket(value, percentage_buckets,
                                                stats[output_column])
                if value_bucket not in bucket_indexes:
                    bucket_indexes[value_bucket] = []
                bucket_indexes[value_bucket].append(index)

            for bucket in bucket_indexes:
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[
                    bucket_indexes[bucket]]
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None,
                                                 transaction=self.transaction)
                try:
                    with disable_console_output():
                        col_buckets_stats = stats_generator.run(
                            input_data=input_data,
                            modify_light_metadata=False,
                            print_logs=False)
                    buckets_stats[output_column][bucket].update(
                        col_buckets_stats)
                except Exception:
                    # @TODO Is this worth informing the user about?
                    #print('Could not generate bucket stats for sub-bucket: {}'.format(bucket))
                    pass

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
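To make the importance formula concrete, here is a small worked example with hypothetical accuracy values (illustrative numbers, not from a real run):

normal_accuracy = 0.80       # hypothetical: all columns present
col_only_accuracy = 0.48     # hypothetical: only this column present
col_missing_accuracy = 0.60  # hypothetical: this column blanked out

col_only_normalized_accuracy = col_only_accuracy / normal_accuracy  # 0.60
col_missing_reverse_accuracy = (normal_accuracy - col_missing_accuracy) / normal_accuracy  # ~0.25
column_importance = (col_only_normalized_accuracy + col_missing_reverse_accuracy) / 2
print(column_importance)  # ~0.425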
Example No. 4
    def run(self):
        """
        Runs the model on the validation set in order to fit a probabilistic
        model that will evaluate the accuracy of future predictions
        """
        np.seterr(divide='warn', invalid='warn')

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [col for col in self.transaction.lmd['columns'] if col not in output_columns and col not in self.transaction.lmd['columns_to_ignore']]
        # Test some hypotheses about our columns

        if self.transaction.lmd['disable_optional_analysis'] is False:
            column_evaluator = ColumnEvaluator(self.transaction)
            column_importances, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution = column_evaluator.get_column_importance(model=self.transaction.model_backend, output_columns=output_columns, input_columns=input_columns, full_dataset=self.transaction.input_data.validation_df, stats=self.transaction.lmd['column_stats'])

            self.transaction.lmd['column_importances'] = column_importances
            self.transaction.lmd['columns_buckets_importances'] = buckets_stats
            self.transaction.lmd['columnless_prediction_distribution'] = columnless_prediction_distribution
            self.transaction.lmd['all_columns_prediction_distribution'] = all_columns_prediction_distribution

        # Create a probabilistic validator for each of the predicted columns
        probabilistic_validators = {}
        for col in output_columns:
            probabilistic_validators[col] = ProbabilisticValidator(
                col_stats=self.transaction.lmd['column_stats'][col])

        ignorable_input_columns = []
        for input_column in input_columns:
            if self.transaction.lmd['column_stats'][input_column]['data_type'] != DATA_TYPES.FILE_PATH and input_column not in [x[0] for x in self.transaction.lmd['model_order_by']]:
                ignorable_input_columns.append(input_column)

        with disable_console_output():
            normal_predictions = self.transaction.model_backend.predict('validate')

        # A single pass of observations over the validation dataset when there are no ignorable columns
        if len(ignorable_input_columns) == 0:
            for pcol in output_columns:
                for i in range(len(self.transaction.input_data.validation_df[pcol])):
                    probabilistic_validators[pcol].register_observation(features_existence=[True for col in input_columns], real_value=self.transaction.input_data.validation_df[pcol].iloc[i], predicted_value=normal_predictions[pcol][i], is_original_data=True, hmd=self.transaction.hmd)

        # Run on the validation set multiple times, each time with one of the column blanked out
        is_original_data = True
        for column_name in ignorable_input_columns:
            ignore_columns = []
            ignore_columns.append(column_name)

            # Silence logging since otherwise lightwood and ludwig will complain too much about None values
            with disable_console_output():
                ignore_col_predictions = self.transaction.model_backend.predict('validate', ignore_columns)

            # Create a vector with True for each feature passed to the model and False for each blanked feature
            features_existence = [np_col not in ignore_columns for np_col in input_columns]

            for pcol in output_columns:
                for i in range(len(self.transaction.input_data.validation_df[pcol])):

                    probabilistic_validators[pcol].register_observation(features_existence=features_existence, real_value=self.transaction.input_data.validation_df[pcol].iloc[i], predicted_value=ignore_col_predictions[pcol][i], is_original_data=False, hmd=self.transaction.hmd)

                    probabilistic_validators[pcol].register_observation(features_existence=[True for col in input_columns], real_value=self.transaction.input_data.validation_df[pcol].iloc[i], predicted_value=normal_predictions[pcol][i], is_original_data=is_original_data, hmd=self.transaction.hmd)
                    # Only register the original data once!
            is_original_data = False

        self.transaction.lmd['accuracy_histogram'] = {}

        total_accuracy = 0
        for pcol in output_columns:
            probabilistic_validators[pcol].partial_fit()
            accuracy_histogram, validation_set_accuracy = probabilistic_validators[pcol].get_accuracy_histogram()
            self.transaction.lmd['accuracy_histogram'][pcol] = accuracy_histogram
            total_accuracy += validation_set_accuracy
        self.transaction.lmd['validation_set_accuracy'] = total_accuracy/len(output_columns)

        # Pickle for later use
        self.transaction.hmd['probabilistic_validators'] = {}
        for col in probabilistic_validators:
            confusion_matrix = probabilistic_validators[col].get_confusion_matrix()
            self.transaction.lmd['confusion_matrices'][col] = confusion_matrix
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(probabilistic_validators[col])
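The `features_existence` list passed to `register_observation` is just a per-column boolean mask marking which inputs were visible to the model. A minimal illustration with made-up column names:

input_columns = ['age', 'income', 'city']  # hypothetical column names
ignore_columns = ['income']                # the column blanked out for this pass

features_existence = [col not in ignore_columns for col in input_columns]
print(features_existence)  # [True, False, True]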
Example No. 5
    def get_column_importance(self, model, output_columns, input_columns, full_dataset, stats):
        columnless_prediction_distribution = {}
        all_columns_prediction_distribution = {}

        with disable_console_output(True):
            normal_predictions = model.predict('validate')
        normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset, stats, output_columns)
        column_importance_dict = {}
        buckets_stats = {}

        # Histogram for when all columns are present, in order to plot the force vectors
        for output_column in output_columns:
            # @TODO: Running stats generator just to get the histogram is very inefficient, change this
            validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(normal_predictions[output_column], data_type=stats[output_column]['data_type'], data_subtype=stats[output_column]['data_subtype'])

            if validation_set_output_column_histogram is not None:
                all_columns_prediction_distribution[output_column] = validation_set_output_column_histogram

        ignorable_input_columns = []
        for input_column in input_columns:
            if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH and input_column not in [x[0] for x in self.transaction.lmd['model_order_by']]:
                ignorable_input_columns.append(input_column)

        for input_column in ignorable_input_columns:
            # See what happens with the accuracy of the outputs if only this column is present
            ignore_columns = [col for col in ignorable_input_columns if col != input_column]
            with disable_console_output(True):
                col_only_predictions = model.predict('validate', ignore_columns)
            col_only_accuracy = evaluate_accuracy(col_only_predictions, full_dataset, stats, output_columns)

            # See what happens with the accuracy if all columns but this one are present
            ignore_columns = [input_column]
            with disable_console_output(True):
                col_missing_predictions = model.predict('validate', ignore_columns)
            col_missing_accuracy = evaluate_accuracy(col_missing_predictions, full_dataset, stats, output_columns)

            combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy)/2
            if combined_column_accuracy < 0:
                combined_column_accuracy = 0
            column_importance = 10*(1 - (normal_accuracy - combined_column_accuracy)/normal_accuracy)
            if column_importance < 1:
                column_importance = 1
            column_importance_dict[input_column] = column_importance

            # Histogram for when the column is missing, in order to plot the force vectors
            for output_column in output_columns:
                if output_column not in columnless_prediction_distribution:
                    columnless_prediction_distribution[output_column] = {}

                col_missing_output_histogram, _ = StatsGenerator.get_histogram(col_missing_predictions[output_column], data_type=stats[output_column]['data_type'], data_subtype=stats[output_column]['data_subtype'])

                if col_missing_output_histogram is not None:
                    columnless_prediction_distribution[output_column][input_column] = col_missing_output_histogram

        # @TODO Should we go back to generating this information based on the buckets of the input columns? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column?
        for output_column in output_columns:
            buckets_stats[output_column] = {}

            bucket_indexes = {}
            for index, row in full_dataset.iterrows():
                value = row[output_column]
                if 'percentage_buckets' in stats[output_column]:
                    percentage_buckets = stats[output_column]['percentage_buckets']
                else:
                    percentage_buckets = None

                value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column], self.transaction.hmd)
                if value_bucket not in bucket_indexes:
                    bucket_indexes[value_bucket] = []
                bucket_indexes[value_bucket].append(index)

            for bucket in bucket_indexes:
                buckets_stats[output_column][bucket] = {}
                input_data = TransactionData()
                input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
                input_data.columns = input_data.data_frame.columns

                stats_generator = StatsGenerator(session=None, transaction=self.transaction)
                try:
                    with disable_console_output():
                        col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
                    buckets_stats[output_column][bucket].update(col_buckets_stats)
                except Exception:
                    # @TODO Is this worth informing the user about?
                    pass

        return column_importance_dict, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution
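Compared with Example No. 3, this version maps importance onto a clamped 1-10 scale. With the same hypothetical accuracies as the earlier worked example:

normal_accuracy = 0.80       # hypothetical values, as before
col_only_accuracy = 0.48
col_missing_accuracy = 0.60

combined_column_accuracy = ((normal_accuracy - col_missing_accuracy) + col_only_accuracy) / 2  # ~0.34
combined_column_accuracy = max(combined_column_accuracy, 0)  # clamp at 0
column_importance = 10 * (1 - (normal_accuracy - combined_column_accuracy) / normal_accuracy)
column_importance = max(column_importance, 1)  # floor of 1
print(column_importance)  # ~4.25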