def predict(self, mode='predict', ignore_columns=None):
    """Run the persisted Ludwig model on the dataframe built for `mode`.

    :param mode: which dataframe to build via _create_ludwig_dataframe
                 (e.g. 'predict' or 'validate').
    :param ignore_columns: optional list of input columns to blank out
                           (set to all-None) before predicting, used for
                           column-importance analysis. Defaults to no columns.
    :return: pandas DataFrame of predictions with the '_predictions'
             suffix stripped from column names.
    """
    # Avoid the mutable-default-argument pitfall: a shared [] would leak
    # state between calls.
    if ignore_columns is None:
        ignore_columns = []

    predict_dataframe, model_definition, timeseries_cols = self._create_ludwig_dataframe(mode)
    model_definition = self.transaction.hmd['ludwig_data']['model_definition']

    if len(timeseries_cols) > 0:
        predict_dataframe, model_definition = self._translate_df_to_timeseries_format(
            predict_dataframe, model_definition, timeseries_cols)

    for ignore_col in ignore_columns:
        try:
            predict_dataframe[ignore_col] = [None] * len(predict_dataframe[ignore_col])
        except KeyError:
            # Date columns were exploded into per-component columns when the
            # dataframe was built, so blank those instead.
            for date_appendage in ['_year', '_month', '_day']:
                predict_dataframe[ignore_col + date_appendage] = \
                    [None] * len(predict_dataframe[ignore_col + date_appendage])

    # Ludwig is very chatty; silence it while loading and predicting.
    with disable_console_output(True):
        model_dir = self.get_model_dir()
        model = LudwigModel.load(model_dir=model_dir)
        predictions = model.predict(data_df=predict_dataframe,
                                    gpus=self.get_useable_gpus())

    # Ludwig names output columns '<col>_predictions'; normalize back to the
    # original column names so downstream code can index by them.
    for col_name in predictions:
        col_name_normalized = col_name.replace('_predictions', '')
        predictions = predictions.rename(columns={col_name: col_name_normalized})

    return predictions
def train(self):
    """Train (or retrain) the Ludwig model on the training dataframe and
    persist it plus its accuracy stats into the transaction metadata.

    Side effects: writes the trained model directory under
    CONFIG.MINDSDB_STORAGE_PATH, and updates
    self.transaction.lmd['model_accuracy' / 'ludwig_data'] and
    self.transaction.hmd['ludwig_data'].
    """
    training_dataframe, model_definition, timeseries_cols = \
        self._create_ludwig_dataframe('train')

    if len(timeseries_cols) > 0:
        training_dataframe, model_definition = self._translate_df_to_timeseries_format(
            training_dataframe, model_definition, timeseries_cols, 'train')

    with disable_console_output(True):
        # Ludwig currently broken, since mode can't be initialized without
        # train_set_metadata and train_set_metadata can't be obtained without
        # running train... see this issue for any updates on the matter:
        # https://github.com/uber/ludwig/issues/295
        # model.initialize_model(train_set_metadata={})
        # train_stats = model.train_online(data_df=training_dataframe)
        #   ??Where to add model_name??  ----> model_name=self.transaction.lmd['name']
        ludwig_save_is_working = False

        if not ludwig_save_is_working:
            # We'll harvest the model from Ludwig's default 'results' output
            # directory instead, so start from a clean slate.
            shutil.rmtree('results', ignore_errors=True)

        if self.transaction.lmd['rebuild_model'] is True:
            model = LudwigModel(model_definition)
            merged_model_definition = model.model_definition
            train_set_metadata = build_metadata(
                training_dataframe,
                (merged_model_definition['input_features'] +
                 merged_model_definition['output_features']),
                merged_model_definition['preprocessing'])
            model.initialize_model(train_set_metadata=train_set_metadata,
                                   gpus=self.get_useable_gpus())

            train_stats = model.train(
                data_df=training_dataframe,
                model_name=self.transaction.lmd['name'],
                skip_save_model=ludwig_save_is_working,
                skip_save_progress=True,
                gpus=self.get_useable_gpus())
        else:
            # Incremental training on top of the previously saved model.
            model = LudwigModel.load(model_dir=self.get_model_dir())
            train_stats = model.train(
                data_df=training_dataframe,
                model_name=self.transaction.lmd['name'],
                skip_save_model=ludwig_save_is_working,
                skip_save_progress=True,
                gpus=self.get_useable_gpus())

        for k in train_stats['train']:
            if k not in self.transaction.lmd['model_accuracy']['train']:
                self.transaction.lmd['model_accuracy']['train'][k] = []
                self.transaction.lmd['model_accuracy']['test'][k] = []
                # NOTE(review): on the first pass the lists are created but
                # nothing is appended until a subsequent train() call hits the
                # 'combined' branch below — confirm this is intended.
            elif k != 'combined':
                # Was `k is not 'combined'`: identity comparison on a string
                # literal only works by CPython interning accident and is a
                # SyntaxWarning on modern Pythons; use equality instead.
                # We should be adding the accuracy here but we only have it
                # for combined, so, for now use that, will only affect
                # multi-output scenarios anyway.
                pass
            else:
                self.transaction.lmd['model_accuracy']['train'][k].extend(
                    train_stats['train'][k]['accuracy'])
                self.transaction.lmd['model_accuracy']['test'][k].extend(
                    train_stats['test'][k]['accuracy'])

        '''
        @ TRAIN ONLINE BIT That's not working
        model = LudwigModel.load(self.transaction.lmd['ludwig_data']['ludwig_save_path'])
        for i in range(0,100):
            train_stats = model.train_online(data_df=training_dataframe)
            # The resulting train_stats are "None"... wonderful -_-
        '''

    ludwig_model_savepath = os.path.join(
        CONFIG.MINDSDB_STORAGE_PATH,
        self.transaction.lmd['name'] + '_ludwig_data')

    if ludwig_save_is_working:
        model.save(ludwig_model_savepath)
        model.close()
    else:
        # Ludwig's own save is broken (see issue above), so relocate the
        # experiment directory it wrote under 'results' to our storage path.
        shutil.rmtree(ludwig_model_savepath, ignore_errors=True)
        shutil.move(os.path.join('results', os.listdir('results')[0]),
                    ludwig_model_savepath)

    self.transaction.lmd['ludwig_data'] = {
        'ludwig_save_path': ludwig_model_savepath
    }
    self.transaction.hmd['ludwig_data'] = {
        'model_definition': model_definition
    }
def get_column_importance(self, model, output_columns, input_columns,
                          full_dataset, stats):
    """Estimate per-column importance by re-predicting the validation set
    with columns selectively blanked out, and collect per-bucket stats.

    :param model: backend exposing predict(mode, ignore_columns).
    :param output_columns: columns being predicted.
    :param input_columns: model input columns.
    :param full_dataset: validation dataframe.
    :param stats: per-column stats dict (data_type, data_subtype,
                  percentage_buckets, ...).
    :return: (column_importance_dict, buckets_stats,
              columnless_prediction_distribution,
              all_columns_prediction_distribution)
    """
    columnless_prediction_distribution = {}
    all_columns_prediction_distribution = {}

    normal_predictions = model.predict('validate')
    normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset,
                                        stats, output_columns)
    column_importance_dict = {}
    buckets_stats = {}

    # Histogram for when all columns are present, in order to plot the
    # force vectors.
    for output_column in output_columns:
        # @TODO: Running stats generator just to get the histogram is very
        # inefficient, change this
        validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(
            normal_predictions[output_column],
            data_type=stats[output_column]['data_type'],
            data_subtype=stats[output_column]['data_subtype'])

        if validation_set_output_column_histogram is not None:
            all_columns_prediction_distribution[output_column] = \
                validation_set_output_column_histogram

    # File-path columns can't be meaningfully blanked, so skip them.
    ignorable_input_columns = []
    for input_column in input_columns:
        if stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH:
            ignorable_input_columns.append(input_column)

    for input_column in ignorable_input_columns:
        # See what happens with the accuracy of the outputs if only this
        # column is present.
        ignore_columns = [col for col in ignorable_input_columns
                          if col != input_column]
        col_only_predictions = model.predict('validate', ignore_columns)
        col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                              full_dataset, stats,
                                              output_columns)
        col_only_normalized_accuracy = col_only_accuracy / normal_accuracy

        # See what happens with the accuracy if all columns but this one
        # are present.
        ignore_columns = [input_column]
        col_missing_predictions = model.predict('validate', ignore_columns)
        col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                 full_dataset, stats,
                                                 output_columns)
        col_missing_reverse_accuracy = \
            (normal_accuracy - col_missing_accuracy) / normal_accuracy

        column_importance = (col_only_normalized_accuracy +
                             col_missing_reverse_accuracy) / 2
        column_importance_dict[input_column] = column_importance

        # Histogram for when the column is missing, in order to plot the
        # force vectors.
        for output_column in output_columns:
            if output_column not in columnless_prediction_distribution:
                columnless_prediction_distribution[output_column] = {}
            # @TODO: Running stats generator just to get the histogram is
            # very inefficient, change this
            col_missing_output_histogram, _ = StatsGenerator.get_histogram(
                col_missing_predictions[output_column],
                data_type=stats[output_column]['data_type'],
                data_subtype=stats[output_column]['data_subtype'])

            # FIX: the original stored an undefined name
            # (`missing_output_histogram`) and only when the histogram was
            # None — inverted condition + NameError. Store the real
            # histogram when it exists, matching the sibling implementation.
            if col_missing_output_histogram is not None:
                columnless_prediction_distribution[output_column][input_column] = \
                    col_missing_output_histogram

    # @TODO should be go back to generating this information based on the
    # buckets of the input columns ? Or just keep doing the stats generation
    # for the input columns based on the indexes of the buckets for the
    # output column
    # for column in ignorable_input_columns:
    #     if c(column_importance_dict[column] > 0.8 or column_importance_dict[column] < 0.2):
    for output_column in output_columns:
        buckets_stats[output_column] = {}

        # Group validation rows by the bucket their output value falls in.
        bucket_indexes = {}
        for index, row in full_dataset.iterrows():
            value = row[output_column]
            # .get() returns None when no percentage buckets were computed.
            percentage_buckets = stats[output_column].get('percentage_buckets')

            value_bucket = get_value_bucket(value, percentage_buckets,
                                            stats[output_column])
            if value_bucket not in bucket_indexes:
                bucket_indexes[value_bucket] = []
            bucket_indexes[value_bucket].append(index)

        for bucket in bucket_indexes:
            buckets_stats[output_column][bucket] = {}

            input_data = TransactionData()
            input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
            input_data.columns = input_data.data_frame.columns

            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            try:
                with disable_console_output():
                    col_buckets_stats = stats_generator.run(
                        input_data=input_data,
                        modify_light_metadata=False,
                        print_logs=False)
                buckets_stats[output_column][bucket].update(col_buckets_stats)
            except Exception:
                # Best-effort: small buckets may not yield stats.
                # @TODO Is this worth informing the user about ?
                # print('Cloud not generate bucket stats for sub-bucket: {}'.format(bucket))
                pass

    return column_importance_dict, buckets_stats, \
        columnless_prediction_distribution, all_columns_prediction_distribution
def run(self):
    """
    # Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions
    """
    # Promote divide/invalid float errors to warnings so analysis keeps going.
    np.seterr(divide='warn', invalid='warn')

    output_columns = self.transaction.lmd['predict_columns']
    # Inputs are every column that is neither predicted nor explicitly ignored.
    input_columns = [col for col in self.transaction.lmd['columns']
                     if col not in output_columns
                     and col not in self.transaction.lmd['columns_to_ignore']]

    # Test some hypotheses about our columns
    if self.transaction.lmd['disable_optional_analysis'] is False:
        column_evaluator = ColumnEvaluator(self.transaction)
        column_importances, buckets_stats, columnless_prediction_distribution, all_columns_prediction_distribution = column_evaluator.get_column_importance(
            model=self.transaction.model_backend,
            output_columns=output_columns,
            input_columns=input_columns,
            full_dataset=self.transaction.input_data.validation_df,
            stats=self.transaction.lmd['column_stats'])

        # Persist analysis results into the light metadata for later reporting.
        self.transaction.lmd['column_importances'] = column_importances
        self.transaction.lmd['columns_buckets_importances'] = buckets_stats
        self.transaction.lmd['columnless_prediction_distribution'] = columnless_prediction_distribution
        self.transaction.lmd['all_columns_prediction_distribution'] = all_columns_prediction_distribution

    # Create the probabilistic validators for each of the predict column
    probabilistic_validators = {}
    for col in output_columns:
        # NOTE(review): both branches construct the validator identically —
        # presumably the bucketed case was meant to pass extra arguments;
        # confirm against ProbabilisticValidator's signature.
        if 'percentage_buckets' in self.transaction.lmd['column_stats'][col]:
            probabilistic_validators[col] = ProbabilisticValidator(
                col_stats=self.transaction.lmd['column_stats'][col])
        else:
            probabilistic_validators[col] = ProbabilisticValidator(
                col_stats=self.transaction.lmd['column_stats'][col])

    # Columns we can blank out one at a time: not file paths and not part of
    # the timeseries ordering (blanking an order-by column would break it).
    ignorable_input_columns = []
    for input_column in input_columns:
        if self.transaction.lmd['column_stats'][input_column]['data_type'] != DATA_TYPES.FILE_PATH and input_column not in [x[0] for x in self.transaction.lmd['model_order_by']]:
            ignorable_input_columns.append(input_column)

    with disable_console_output():
        normal_predictions = self.transaction.model_backend.predict('validate')

    # Single observation on the validation dataset when we have no ignorable column
    if len(ignorable_input_columns) == 0:
        for pcol in output_columns:
            for i in range(len(self.transaction.input_data.validation_df[pcol])):
                probabilistic_validators[pcol].register_observation(
                    features_existence=[True for col in input_columns],
                    real_value=self.transaction.input_data.validation_df[pcol].iloc[i],
                    predicted_value=normal_predictions[pcol][i],
                    is_original_data=True,
                    hmd=self.transaction.hmd)

    # Run on the validation set multiple times, each time with one of the column blanked out
    is_original_data = True
    for column_name in ignorable_input_columns:
        ignore_columns = []
        ignore_columns.append(column_name)
        # Silence logging since otherwise lightwood and ludwig will complain too much about None values
        with disable_console_output():
            ignore_col_predictions = self.transaction.model_backend.predict('validate', ignore_columns)

        # create a vector that has True for each feature that was passed to the model tester and False if it was blanked
        features_existence = [True if np_col not in ignore_columns else False for np_col in input_columns]

        # NOTE(review): `pv` is assigned but never used below — dead local?
        pv = {}
        for pcol in output_columns:
            for i in range(len(self.transaction.input_data.validation_df[pcol])):
                # Observation with the current column blanked out.
                probabilistic_validators[pcol].register_observation(
                    features_existence=features_existence,
                    real_value=self.transaction.input_data.validation_df[pcol].iloc[i],
                    predicted_value=ignore_col_predictions[pcol][i],
                    is_original_data=False,
                    hmd=self.transaction.hmd)
                # Observation with all columns present (the "normal" prediction).
                probabilistic_validators[pcol].register_observation(
                    features_existence=[True for col in input_columns],
                    real_value=self.transaction.input_data.validation_df[pcol].iloc[i],
                    predicted_value=normal_predictions[pcol][i],
                    is_original_data=is_original_data,
                    hmd=self.transaction.hmd)
        # Only register the original data once !
        is_original_data = False

    # Fit each validator and aggregate a mean validation-set accuracy.
    self.transaction.lmd['accuracy_histogram'] = {}

    total_accuracy = 0
    for pcol in output_columns:
        probabilistic_validators[pcol].partial_fit()
        accuracy_histogram, validation_set_accuracy = probabilistic_validators[pcol].get_accuracy_histogram()
        self.transaction.lmd['accuracy_histogram'][pcol] = accuracy_histogram
        total_accuracy += validation_set_accuracy
    self.transaction.lmd['validation_set_accuracy'] = total_accuracy/len(output_columns)

    # Pickle for later use
    self.transaction.hmd['probabilistic_validators'] = {}
    for col in probabilistic_validators:
        confusion_matrix = probabilistic_validators[col].get_confusion_matrix()
        self.transaction.lmd['confusion_matrices'][col] = confusion_matrix
        self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(probabilistic_validators[col])
def get_column_importance(self, model, output_columns, input_columns,
                          full_dataset, stats):
    """Estimate per-column importance (1..10 scale) by re-predicting the
    validation set with columns selectively blanked out, and collect
    per-output-bucket stats.

    :param model: backend exposing predict(mode, ignore_columns).
    :param output_columns: columns being predicted.
    :param input_columns: model input columns.
    :param full_dataset: validation dataframe.
    :param stats: per-column stats dict (data_type, data_subtype,
                  percentage_buckets, ...).
    :return: (column_importance_dict, buckets_stats,
              columnless_prediction_distribution,
              all_columns_prediction_distribution)
    """
    columnless_prediction_distribution = {}
    all_columns_prediction_distribution = {}

    with disable_console_output(True):
        normal_predictions = model.predict('validate')
    normal_accuracy = evaluate_accuracy(normal_predictions, full_dataset,
                                        stats, output_columns)
    column_importance_dict = {}
    buckets_stats = {}

    # Histogram for when all columns are present, in order to plot the
    # force vectors.
    for output_column in output_columns:
        # @TODO: Running stats generator just to get the histogram is very
        # inefficient, change this
        validation_set_output_column_histogram, _ = StatsGenerator.get_histogram(
            normal_predictions[output_column],
            data_type=stats[output_column]['data_type'],
            data_subtype=stats[output_column]['data_subtype'])

        if validation_set_output_column_histogram is not None:
            all_columns_prediction_distribution[output_column] = \
                validation_set_output_column_histogram

    # Skip file-path columns and any column used for timeseries ordering.
    ignorable_input_columns = []
    for input_column in input_columns:
        if (stats[input_column]['data_type'] != DATA_TYPES.FILE_PATH
                and input_column not in
                [x[0] for x in self.transaction.lmd['model_order_by']]):
            ignorable_input_columns.append(input_column)

    for input_column in ignorable_input_columns:
        # See what happens with the accuracy of the outputs if only this
        # column is present.
        ignore_columns = [col for col in ignorable_input_columns
                          if col != input_column]
        with disable_console_output(True):
            col_only_predictions = model.predict('validate', ignore_columns)
        col_only_accuracy = evaluate_accuracy(col_only_predictions,
                                              full_dataset, stats,
                                              output_columns)

        # See what happens with the accuracy if all columns but this one
        # are present.
        ignore_columns = [input_column]
        with disable_console_output(True):
            col_missing_predictions = model.predict('validate', ignore_columns)
        col_missing_accuracy = evaluate_accuracy(col_missing_predictions,
                                                 full_dataset, stats,
                                                 output_columns)

        combined_column_accuracy = ((normal_accuracy - col_missing_accuracy)
                                    + col_only_accuracy) / 2
        if combined_column_accuracy < 0:
            combined_column_accuracy = 0

        # Scale importance to a 1..10 range.
        column_importance = 10 * (1 - (normal_accuracy - combined_column_accuracy)
                                  / normal_accuracy)
        if column_importance < 1:
            column_importance = 1
        column_importance_dict[input_column] = column_importance

        # Histogram for when the column is missing, in order to plot the
        # force vectors.
        for output_column in output_columns:
            if output_column not in columnless_prediction_distribution:
                columnless_prediction_distribution[output_column] = {}
            col_missing_output_histogram, _ = StatsGenerator.get_histogram(
                col_missing_predictions[output_column],
                data_type=stats[output_column]['data_type'],
                data_subtype=stats[output_column]['data_subtype'])

            if col_missing_output_histogram is not None:
                columnless_prediction_distribution[output_column][input_column] = \
                    col_missing_output_histogram

    # @TODO should be go back to generating this information based on the
    # buckets of the input columns ? Or just keep doing the stats generation
    # for the input columns based on the indexes of the buckets for the
    # output column
    for output_column in output_columns:
        buckets_stats[output_column] = {}

        # Group validation rows by the bucket their output value falls in.
        bucket_indexes = {}
        for index, row in full_dataset.iterrows():
            value = row[output_column]
            # .get() returns None when no percentage buckets were computed.
            percentage_buckets = stats[output_column].get('percentage_buckets')

            value_bucket = get_value_bucket(value, percentage_buckets,
                                            stats[output_column],
                                            self.transaction.hmd)
            if value_bucket not in bucket_indexes:
                bucket_indexes[value_bucket] = []
            bucket_indexes[value_bucket].append(index)

        for bucket in bucket_indexes:
            buckets_stats[output_column][bucket] = {}

            input_data = TransactionData()
            input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
            # FIX: the original did `input_data.columns = input_data.columns`,
            # a self-assignment no-op; mirror the sibling implementation and
            # take the columns from the sliced dataframe.
            input_data.columns = input_data.data_frame.columns

            stats_generator = StatsGenerator(session=None,
                                             transaction=self.transaction)
            try:
                with disable_console_output():
                    col_buckets_stats = stats_generator.run(
                        input_data=input_data,
                        modify_light_metadata=False,
                        print_logs=False)
                buckets_stats[output_column][bucket].update(col_buckets_stats)
            except Exception:
                # Best-effort: small buckets may not yield stats.
                pass

    return column_importance_dict, buckets_stats, \
        columnless_prediction_distribution, all_columns_prediction_distribution