def run_tabular_benchmark_toy(fit_args):
    # 2-D toy noisy, imbalanced 4-class classification task with: feature missingness,
    # out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data,
    # training column missing from test data, extra distraction columns in test data
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
               'name': 'toyClassification',
               'problem_type': MULTICLASS,
               'label': 'y',
               'performance_val': 0.436}
    # The toyClassification dataset should produce 1 warning and 1 error during inference:
    # Warning: Ignoring 181 (out of 1000) training examples for which the label value in column 'y' is missing
    # ValueError: Required columns are missing from the provided dataset. Missing columns: ['lostcolumn']
    # Additional warning that would have occurred if the ValueError was not triggered:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them): ['distractioncolumn1', 'distractioncolumn2']
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file,
                                      test_file=test_file, name=dataset['name'], url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = TabularPredictor(label=dataset['label'], path=savedir).fit(train_data, **fit_args)
    print(predictor.feature_metadata)
    print(predictor.feature_metadata.type_map_raw)
    print(predictor.feature_metadata.type_group_map_special)
    try:
        predictor.predict(test_data)
    except KeyError:  # KeyError should be raised because test_data has missing column 'lostcolumn'
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
def train(args):
    set_seed(args.seed)
    if args.task is not None:
        feature_columns, label_column, eval_metric, all_metrics = TASKS[args.task]
    else:
        raise NotImplementedError
    if args.exp_dir is None:
        args.exp_dir = 'autogluon_text_{}'.format(args.task)
    train_df = load_pd.load(args.train_file)
    dev_df = load_pd.load(args.dev_file)
    test_df = load_pd.load(args.test_file)
    train_df = train_df[feature_columns + [label_column]]
    dev_df = dev_df[feature_columns + [label_column]]
    test_df = test_df[feature_columns]
    if args.task == 'mrpc' or args.task == 'sts':
        # Augment the unordered sentence pairs manually by swapping the two sides.
        train_df_other_part = pd.DataFrame({feature_columns[0]: train_df[feature_columns[1]],
                                            feature_columns[1]: train_df[feature_columns[0]],
                                            label_column: train_df[label_column]})
        real_train_df = pd.concat([train_df, train_df_other_part])
        real_dev_df = dev_df
    else:
        real_train_df = train_df
        real_dev_df = dev_df
    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(train_data=real_train_df, tuning_data=real_dev_df,
                      hyperparameters='multimodal', num_bag_folds=5, num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(train_data=real_train_df, tuning_data=real_dev_df, hyperparameters='multimodal')
    elif args.mode == 'single':
        # In single-model mode we just use TextPredictor, which trains a single model internally.
        predictor = TextPredictor(label=label_column, eval_metric=eval_metric, path=args.exp_dir)
        predictor.fit(train_data=real_train_df, tuning_data=real_dev_df, seed=args.seed)
    else:
        raise NotImplementedError
    dev_metric_score = predictor.evaluate(dev_df)
    dev_predictions = predictor.predict(dev_df, as_pandas=True)
    test_predictions = predictor.predict(test_df, as_pandas=True)
    dev_predictions.to_csv(os.path.join(args.exp_dir, 'dev_prediction.csv'))
    test_predictions.to_csv(os.path.join(args.exp_dir, 'test_prediction.csv'))
    with open(os.path.join(args.exp_dir, 'final_model_scores.json'), 'w') as of:
        json.dump({f'valid_{eval_metric}': dev_metric_score}, of)
class AGLearner(object):
    def __init__(self, path=None):
        self.path = path

    def fit(self, x, y, problem_type=None, eval_metric=None, verbosity=2,
            sample_weight=None, weight_evaluation=False, groups=None,
            fit_kwargs=None, **kwargs):
        '''Fit a TabularPredictor on array-like features x and labels y.

        The predictor options were previously read from undefined globals;
        they are now explicit parameters with TabularPredictor's defaults.
        '''
        fit_kwargs = fit_kwargs or {}
        x = x if len(x.shape) > 1 else x[:, None]
        y = y if len(y.shape) > 1 else y[:, None]
        x_columns = ['x_%d' % i for i in range(x.shape[1])]
        self.x_columns = x_columns
        y_column = 'target'
        columns = x_columns + [y_column]
        train_data = pd.DataFrame(np.concatenate([x, y], axis=1), columns=columns)
        self._model = TabularPredictor(y_column, problem_type=problem_type,
                                       eval_metric=eval_metric, path=self.path,
                                       verbosity=verbosity, sample_weight=sample_weight,
                                       weight_evaluation=weight_evaluation,
                                       groups=groups, **kwargs).fit(train_data, **fit_kwargs)

    def predict(self, x):
        '''Predict labels for array-like features x.'''
        assert hasattr(self, '_model'), 'The model has not been fitted yet'
        x = x if len(x.shape) > 1 else x[:, None]
        if not hasattr(self, 'x_columns'):
            self.x_columns = ['x_%d' % i for i in range(x.shape[1])]
        assert x.shape[1] == len(self.x_columns), 'x has a shape incompatible with training data'
        data = pd.DataFrame(x, columns=self.x_columns)
        y_pred = self._model.predict(data, as_pandas=False)
        return y_pred

    @property
    def feature_importances_(self):
        try:
            importance_df = self._model.feature_importance()
            importances = [importance_df.at[col, 'importance'] for col in self.x_columns]
            return importances
        except Exception:
            return []

    def save(self, path):
        # TabularPredictor saves to the path it was constructed with; `path` is unused here.
        self._model.save()

    @classmethod
    def load(cls, path):
        learner = AGLearner(path=path)
        learner._model = TabularPredictor.load(path)
        return learner
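# A minimal usage sketch of the AGLearner wrapper above. The synthetic data and the
# 'ag_learner_model' path are illustrative assumptions, not part of the original code.
import numpy as np

X = np.random.rand(200, 3)
y = (X[:, 0] + X[:, 1] > 1.0).astype(float)

learner = AGLearner(path='ag_learner_model')
learner.fit(X, y, problem_type='binary')
preds = learner.predict(X)
print(learner.feature_importances_)  # may be [] if importance cannot be computed

learner.save('ag_learner_model')  # saves to the predictor's own path
restored = AGLearner.load('ag_learner_model')
assert len(restored.predict(X)) == len(X)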
def train(args):
    model_output_dir = f'{args.output_dir}/data'

    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.init_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    args.init_args['path'] = args.model_dir
    # args.fit_args.pop('label', None)
    predictor = TabularPredictor(**args.init_args).fit(train_data, **args.fit_args)

    # Results summary
    predictor.fit_summary(verbosity=3)
    # model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_src = os.path.join(args.model_dir, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')
    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # Ensemble visualization: draw the trained model graph, dropping isolated nodes.
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if target in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance_df = predictor.feature_importance(test_data)
                print(feature_importance_df)
                feature_importance_df.to_csv(f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification models
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix
                X_test = test_data.drop(target, axis=1)
                y_test_true = test_data[target]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)

                report_dict = classification_report(y_test_true, y_test_pred,
                                                    output_dict=True, labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(f'{model_output_dir}/classification_report.csv', index=True)

                cm = confusion_matrix(y_test_true, y_test_pred, labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                # Save before plt.show(): show() releases the current figure,
                # so calling savefig afterwards would write an empty image.
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')
                plt.show()

                get_roc_auc(y_test_true, y_test_pred_prob, predictor.class_labels,
                            predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn('Skipping eval on test data since label column is not included.')

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
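# The training script above relies on helpers defined elsewhere in the repo
# (__load_input_data, format_for_print, get_roc_auc). The `du` helper it calls for
# the directory-size summary is also not shown; a minimal sketch, assuming it wraps
# the shell's `du -sh`:
import subprocess

def du(path):
    """Return the human-readable disk usage of `path`, like `du -sh`."""
    return subprocess.check_output(['du', '-sh', path]).split()[0].decode('utf-8')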
def run(args):
    if args.task == 'product_sentiment':
        train_df, test_df, label_column = load_machine_hack_product_sentiment(args.train_file, args.test_file)
    elif args.task == 'mercari_price':
        train_df, test_df, label_column = load_mercari_price_prediction(args.train_file, args.test_file)
    elif args.task == 'price_of_books':
        train_df, test_df, label_column = load_price_of_books(args.train_file, args.test_file)
    elif args.task == 'data_scientist_salary':
        train_df, test_df, label_column = load_data_scientist_salary(args.train_file, args.test_file)
    else:
        raise NotImplementedError
    hyperparameters = get_hyperparameter_config('multimodal')
    if args.preset is not None and args.mode in ['stacking', 'weighted']:
        hyperparameters['AG_TEXT_NN']['presets'] = args.preset

    if args.mode == 'stacking':
        predictor = TabularPredictor(label=label_column, eval_metric=args.eval_metric, path=args.exp_dir)
        predictor.fit(train_data=train_df, hyperparameters=hyperparameters,
                      num_bag_folds=5, num_stack_levels=1)
    elif args.mode == 'weighted':
        predictor = TabularPredictor(label=label_column, eval_metric=args.eval_metric, path=args.exp_dir)
        predictor.fit(train_data=train_df, hyperparameters=hyperparameters)
    elif args.mode == 'single':
        # In single-model mode we just use TextPredictor, which trains a single model internally.
        predictor = TextPredictor(label=label_column, eval_metric=args.eval_metric, path=args.exp_dir)
        predictor.fit(train_data=train_df, presets=args.preset, seed=args.seed)
    else:
        raise NotImplementedError

    if args.task == 'product_sentiment':
        test_probabilities = predictor.predict_proba(test_df, as_pandas=True, as_multiclass=True)
        test_probabilities.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    elif args.task == 'data_scientist_salary':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        submission.loc[:, label_column] = predictions
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'price_of_books':
        predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_excel(args.sample_submission, engine='openpyxl')
        # The loader apparently trains on log10(1 + price) targets; invert that transform here.
        submission.loc[:, label_column] = np.power(10, predictions) - 1
        submission.to_excel(os.path.join(args.exp_dir, 'submission.xlsx'))
    elif args.task == 'mercari_price':
        test_predictions = predictor.predict(test_df, as_pandas=False)
        submission = pd.read_csv(args.sample_submission)
        # The loader apparently trains on log(1 + price) targets; invert that transform here.
        submission.loc[:, label_column] = np.exp(test_predictions) - 1
        submission.to_csv(os.path.join(args.exp_dir, 'submission.csv'), index=False)
    else:
        raise NotImplementedError
""" Example script for predicting columns of tables, demonstrating simple use-case """ from autogluon.tabular import TabularDataset, TabularPredictor # Training time: train_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(500) # subsample for faster demo print(train_data.head()) label = 'class' # specifies which column do we want to predict save_path = 'ag_models/' # where to save trained models predictor = TabularPredictor(label=label, path=save_path).fit(train_data) # NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead: # predictor = TabularPredictor(label=label_column, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality') results = predictor.fit_summary() # Inference time: test_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame y_test = test_data[label] test_data = test_data.drop(labels=[label], axis=1) # delete labels from test data since we wouldn't have them in practice print(test_data.head()) predictor = TabularPredictor.load(save_path) # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file y_pred = predictor.predict(test_data) perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
def autogluon(df, task, timelife):
    pd.options.mode.chained_assignment = None
    df_new = copy.copy(df)
    X, y, _ = return_X_y(df_new)

    if isinstance(y, pd.Series):
        y = y.to_frame()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    if isinstance(y_train, pd.Series):
        y_train = y_train.to_frame()
    target = y_train.columns[0]
    if isinstance(y_test, pd.Series):
        y_test = y_test.to_frame()

    X_train[target] = y_train
    train = X_train
    test = X_test

    if task == 'classification':
        if len(y[y.columns[0]].unique()) > 2:
            pt = 'multiclass'
            f1 = lambda y_test, y_pred: f1_score(y_test, y_pred, average='weighted')
        else:
            pt = 'binary'
            f1 = lambda y_test, y_pred: f1_score(y_test, y_pred)
    else:
        pt = 'regression'

    # TEMPORARY -> note: this used to save into the trash directory
    # (e.g. path='/home/riccardo/.local/share/Trash')
    predictor = TabularPredictor(label=target, problem_type=pt).fit(
        train_data=train,
        time_limit=timelife * 60,
        presets=['optimize_for_deployment'])
    results = predictor.fit_summary()
    y_pred = predictor.predict(test)
    pipelines = predictor.leaderboard(df, silent=True)  # these are the candidate pipelines
    res = predictor.evaluate_predictions(y_true=y_test.squeeze(), y_pred=y_pred, auxiliary_metrics=True)
    shutil.rmtree('./AutogluonModels')

    if task == 'classification':
        # Previous version, kept for reference:
        # y_test = le.fit_transform(y_test)
        # y_pred = le.fit_transform(y_pred)
        # if len(np.unique(y_pred)) > 2:
        #     f1 = f1_score(y_test, y_pred, average='weighted')
        # else:
        #     f1 = f1_score(y_test, y_pred)
        # return (res['accuracy'], f1)
        return (res['accuracy'], f1(y_test, y_pred), pipelines)
    else:
        return (res['root_mean_squared_error'], res['r2'], pipelines)
# (continues from an enclosing `if` that selects the tabular-ensemble branch)
    predictor = TabularPredictor(path=os.path.join(args.save_dir, args.model_type, time_str),
                                 problem_type=train_dataset.problem_type,
                                 eval_metric=train_dataset.metric,
                                 label=label_columns[0])
    if args.ensemble_type == 'weighted':
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      hyperparameters=tabular_hparams)
    else:
        predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                      feature_generator=feature_generator,
                      num_bag_folds=5, num_stack_levels=1,
                      hyperparameters=tabular_hparams)
    predictor.save()
else:
    predictor = TextPredictor(path=os.path.join(args.save_dir, args.model_type, time_str),
                              problem_type=train_dataset.problem_type,
                              eval_metric=train_dataset.metric,
                              label=label_columns[0])
    predictor.fit(concat_df[feature_columns + [label_columns[0]]],
                  presets='electra_base_late_fusion_concate_e10_avg3')
    predictor.save(os.path.join(args.save_dir, args.model_type, time_str, 'text_prediction'))
predictions = predictor.predict(competition_df, as_pandas=True)
predictions.to_csv(os.path.join(args.save_dir, args.model_type, time_str, 'pred.csv'))
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")
    save_metadata(config, version=__version__)

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        rmse=metrics.root_mean_squared_error,
    )

    label = dataset.target.name
    problem_type = dataset.problem_type

    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}

    load_raw = config.framework_params.get('_load_raw', False)
    if load_raw:
        train, test = load_data_raw(dataset=dataset)
    else:
        column_names, _ = zip(*dataset.columns)
        column_types = dict(dataset.columns)
        train = pd.DataFrame(dataset.train.data, columns=column_names).astype(column_types, copy=False)
        print(f"Columns dtypes:\n{train.dtypes}")
        test = pd.DataFrame(dataset.test.data, columns=column_names).astype(column_types, copy=False)

    del dataset
    gc.collect()

    output_dir = output_subdir("models", config)
    with utils.Timer() as training:
        predictor = TabularPredictor(
            label=label,
            eval_metric=perf_metric.name,
            path=output_dir,
            problem_type=problem_type,
        ).fit(
            train_data=train,
            time_limit=config.max_runtime_seconds,
            **training_params
        )
    del train

    y_test = test[label]
    test = test.drop(columns=label)

    if is_classification:
        with utils.Timer() as predict:
            probabilities = predictor.predict_proba(test, as_multiclass=True)
        predictions = probabilities.idxmax(axis=1).to_numpy()
    else:
        with utils.Timer() as predict:
            predictions = predictor.predict(test, as_pandas=False)
        probabilities = None

    prob_labels = probabilities.columns.values.tolist() if probabilities is not None else None

    # Removed test data input to avoid long-running computation; remove the 7200s timeout limitation to re-enable.
    leaderboard = predictor.leaderboard(silent=True)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
        print(leaderboard)

    save_artifacts(predictor, leaderboard, config)

    num_models_trained = len(leaderboard)
    if predictor._trainer.model_best is not None:
        num_models_ensemble = len(predictor._trainer.get_minimum_model_set(predictor._trainer.model_best))
    else:
        num_models_ensemble = 1

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
ag_predictor_args["path"] = args.model_dir ag_fit_args = config["ag_fit_args"] predictor = TabularPredictor(**ag_predictor_args).fit( train_data, **ag_fit_args) logger.info("Best model: %s", predictor.get_model_best()) # Leaderboard lb = predictor.leaderboard() lb.to_csv(f'{args.output_data_dir}/leaderboard.csv', index=False) logger.info("Saved leaderboard to output.") # Feature importance feature_importance = predictor.feature_importance(test_data) feature_importance.to_csv(f'{args.output_data_dir}/feature_importance.csv') logger.info("Saved feature importance to output.") # Evaluation evaluation = predictor.evaluate(test_data) with open(f'{args.output_data_dir}/evaluation.json', 'w') as f: json.dump(evaluation, f) logger.info("Saved evaluation to output.") predictor.save_space() # ---------------------------- Inference ----------------------------------- test_data_nolabel = test_data.drop(labels=ag_predictor_args['label'], axis=1) y_pred = predictor.predict(test_data_nolabel) y_pred.to_csv(f'{args.output_data_dir}/predictions.csv', index=False)
data_adjust = data_adjust.reset_index(drop=True)
'''

# train
train_data = TabularDataset(
    datasettemp.drop(columns=['Trading_money', 'open', 'max', 'min', 'PER', 'PBR']).iloc[:-11])

# predictor
predictor = TabularPredictor(label='close').fit(
    train_data.drop(columns=['date', 'stock_id']))  # , num_stack_levels=1, num_bag_folds=2)

# test
test_data = datasettemp.iloc[-11:len(datasettemp)]
preds = predictor.predict(test_data.drop(columns=['date', 'stock_id', 'close']))
test_hat = pd.DataFrame({
    'date': test_data['date'],
    'stock_id': test_data['stock_id'],
    'close': preds
})
test_hat  # display (notebook)

prediction_data = []
for k in range(0, 10):  # collect the data for the last 10 days
    g = []
    g.append(test_hat.iloc[k, 0])
    g.append(test_hat.iloc[k, 2])
    g.append(datasettemp.iloc[k - 10, 2])
    g.append(Quote_change(test_hat.iloc[k + 1, 2], test_hat.iloc[k, 2]))
    prediction_data.append(g)
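# `Quote_change` is called above but not defined in this snippet. A minimal sketch,
# under the assumption that it returns the percent change between two prices:
def Quote_change(new_price, old_price):
    # Percent change from old_price to new_price, e.g. Quote_change(101, 100) -> 1.0
    return (new_price - old_price) / old_price * 100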
# %%
# Train AutoGluon
train = False
if train:
    df_train = df[df["vad"] & (df[COL_LABEL] != LABEL_NONE_ID)]
    predictor = TabularPredictor(label=COL_LABEL).fit(train_data=df_train)
    print("[green]Finished training[/green]")
else:
    predictor = TabularPredictor.load("AutogluonModels/ag-20211002_203405/")
    print("[green]Loaded pre-trained model[/green]")
# predictions = predictor.predict(TEST_DATA.csv)

# %%
predictions = predictor.predict(df)

# %%
# TODO
# Set label = none where VAD=0
# Create training set, removing "none"
# Predict the whole audio and check results

# %%
# Plot waveform, features and labels
fig, axes = get_figure(n_axes=6)
(wv_points, ) = axes[0].plot(t, y)
axes[0].set_ylabel("Waveform")
axes[0].set_xlabel("time (s)")
axes[1].plot(t, is_voice, "r")
axes[1].set_ylabel("VAD")
def train_model(dataset_name, text_presets, save_dir, model, tabular_presets,
                num_gpus=None, get_competition_results=False, seed=123):
    set_seed(seed)
    if get_competition_results:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'competition')
    else:
        train_dataset = dataset_registry.create(dataset_name, 'train')
        test_dataset = dataset_registry.create(dataset_name, 'test')
    feature_columns = train_dataset.feature_columns
    label_columns = train_dataset.label_columns
    metric = train_dataset.metric
    problem_type = train_dataset.problem_type
    train_data1, tuning_data1 = sklearn.model_selection.train_test_split(
        train_dataset.data, test_size=0.05,
        random_state=np.random.RandomState(seed))
    train_data = train_dataset.data
    test_data = test_dataset.data
    column_types, inferred_problem_type = infer_column_problem_types(
        train_data1, tuning_data1, label_columns=label_columns,
        problem_type=problem_type)
    train_data = train_data[feature_columns + label_columns]
    # tuning_data = tuning_data[feature_columns + label_columns]
    if not get_competition_results:
        test_data = test_data[feature_columns + label_columns]
    train_tic = time.time()
    if model == 'ag_tabular_quick':
        MAX_NGRAM = 300
        feature_generator = AutoMLPipelineFeatureGenerator(
            vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                       max_features=MAX_NGRAM, dtype=np.uint8))
        predictor = TabularPredictor(label=label_columns[0], path=save_dir,
                                     problem_type=problem_type)
        predictor.fit(train_data, time_limit=30,
                      feature_generator=feature_generator)
    elif model == 'ag_tabular_without_text':
        no_text_feature_columns = []
        for col_name in feature_columns:
            if column_types[col_name] != _TEXT:
                no_text_feature_columns.append(col_name)
        train_data = train_data[no_text_feature_columns + label_columns]
        # tuning_data = tuning_data[no_text_feature_columns + label_columns]
        test_data = test_data[no_text_feature_columns + label_columns]
        predictor = TabularPredictor(path=save_dir, label=label_columns[0],
                                     problem_type=problem_type, eval_metric=metric)
        if tabular_presets in ['best_quality']:
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          presets=tabular_presets)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS,
                          num_bag_folds=5, num_stack_levels=1)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_tabular_old':
        predictor = TabularPredictor(path=save_dir, label=label_columns[0],
                                     problem_type=problem_type, eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data, presets=tabular_presets,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data, num_bag_folds=5, num_stack_levels=1,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'ag_text_only':
        text_feature_columns = [col_name for col_name in feature_columns
                                if column_types[col_name] == _TEXT]
        train_data = train_data[text_feature_columns + label_columns]
        test_data = test_data[text_feature_columns + label_columns]
        predictor = TextPredictor(path=save_dir, label=label_columns[0],
                                  problem_type=problem_type, eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data, hyperparameters=hparams,
                      num_gpus=num_gpus, seed=seed)
    elif model == 'ag_text_multimodal':
        predictor = TextPredictor(path=save_dir, label=label_columns[0],
                                  problem_type=problem_type, eval_metric=metric)
        hparams = ag_text_presets.create(text_presets)
        if len(train_data) > 500000:
            hparams = set_epoch3(hparams)
        predictor.fit(train_data=train_data, hyperparameters=hparams,
                      num_gpus=num_gpus, seed=seed)
    elif model == 'pre_embedding' or model == 'tune_embedding_multimodal' or model == 'tune_embedding_text':
        feature_generator = AutoMLPipelineFeatureGenerator(
            enable_text_special_features=False,
            enable_text_ngram_features=False)
        pre_embedding_folder = os.path.join(_CURR_DIR, 'pre_computed_embeddings')
        if model == 'pre_embedding':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name, 'pretrain_text_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name, 'pretrain_text_embedding', 'test.npy'))
        elif model == 'tune_embedding_multimodal':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name, 'multimodal_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name, 'multimodal_embedding', 'test.npy'))
        elif model == 'tune_embedding_text':
            train_features = np.load(os.path.join(pre_embedding_folder, dataset_name, 'tuned_text_embedding', 'train.npy'))
            test_features = np.load(os.path.join(pre_embedding_folder, dataset_name, 'tuned_text_embedding', 'test.npy'))
        else:
            raise NotImplementedError
        train_data = train_data.join(
            pd.DataFrame(train_features,
                         columns=[f'pre_feat{i}' for i in range(train_features.shape[1])]))
        train_data.reset_index(drop=True, inplace=True)
        test_data = test_data.join(
            pd.DataFrame(test_features,
                         columns=[f'pre_feat{i}' for i in range(test_features.shape[1])]))
        test_data.reset_index(drop=True, inplace=True)
        predictor = TabularPredictor(path=save_dir, label=label_columns[0],
                                     problem_type=problem_type, eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data, presets=tabular_presets,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data, num_bag_folds=5, num_stack_levels=1,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    elif model == 'tabular_multimodal' or model == 'tabular_multimodal_just_table':
        if model == 'tabular_multimodal':
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                           max_features=MAX_NGRAM, dtype=np.uint8),
                enable_raw_text_features=True)
            hyperparameters = get_multimodal_tabular_hparam_just_gbm(text_presets=text_presets)
        else:
            MAX_NGRAM = 300
            feature_generator = AutoMLPipelineFeatureGenerator(
                vectorizer=CountVectorizer(min_df=30, ngram_range=(1, 3),
                                           max_features=MAX_NGRAM, dtype=np.uint8),
                enable_raw_text_features=True,
                enable_text_special_features=False,
                enable_text_ngram_features=False)
            hyperparameters = multimodal_tabular_just_table_hparam(text_presets=text_presets)
        predictor = TabularPredictor(path=save_dir, label=label_columns[0],
                                     problem_type=problem_type, eval_metric=metric)
        if tabular_presets == 'best_quality':
            predictor.fit(train_data=train_data, presets=tabular_presets,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '5fold_1stack':
            predictor.fit(train_data=train_data, num_bag_folds=5, num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == '3fold_1stack':
            predictor.fit(train_data=train_data, num_bag_folds=3, num_stack_levels=1,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        elif tabular_presets == 'no':
            predictor.fit(train_data=train_data,
                          hyperparameters=hyperparameters,
                          feature_generator=feature_generator,
                          excluded_model_types=TABULAR_EXCLUDE_MODELS)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    train_toc = time.time()
    inference_tic = time.time()
    predictions = predictor.predict(test_data, as_pandas=True)
    predictor.save()
    inference_toc = time.time()
    if problem_type == MULTICLASS or problem_type == BINARY:
        prediction_prob = predictor.predict_proba(test_data, as_pandas=True)
        prediction_prob.to_csv(os.path.join(save_dir, 'test_prediction_prob.csv'))
    predictions.to_csv(os.path.join(save_dir, 'test_prediction.csv'))
    gt = test_data[label_columns[0]]
    gt.to_csv(os.path.join(save_dir, 'ground_truth.csv'))
    if not get_competition_results:
        score = predictor.evaluate(test_data)
        with open(os.path.join(save_dir, 'test_score.json'), 'w') as of:
            json.dump({metric: score}, of)
    with open(os.path.join(save_dir, 'speed_stats.json'), 'w') as of:
        json.dump({'train_time': train_toc - train_tic,
                   'inference_time': inference_toc - inference_tic,
                   'cpuinfo': cpuinfo.get_cpu_info()}, of)
def run(dataset, config):
    log.info(f"\n**** AutoGluon [v{__version__}] ****\n")

    metrics_mapping = dict(
        acc=metrics.accuracy,
        auc=metrics.roc_auc,
        f1=metrics.f1,
        logloss=metrics.log_loss,
        mae=metrics.mean_absolute_error,
        mse=metrics.mean_squared_error,
        r2=metrics.r2,
        rmse=metrics.root_mean_squared_error,
    )

    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    is_classification = config.type == 'classification'
    training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}

    train, test = dataset.train.path, dataset.test.path
    label = dataset.target.name
    problem_type = dataset.problem_type

    models_dir = tempfile.mkdtemp() + os.sep  # passed to AG

    with Timer() as training:
        predictor = TabularPredictor(
            label=label,
            eval_metric=perf_metric.name,
            path=models_dir,
            problem_type=problem_type,
        ).fit(train_data=train,
              time_limit=config.max_runtime_seconds,
              **training_params)
    del train

    if is_classification:
        with Timer() as predict:
            probabilities = predictor.predict_proba(test, as_multiclass=True)
        predictions = probabilities.idxmax(axis=1).to_numpy()
    else:
        with Timer() as predict:
            predictions = predictor.predict(test, as_pandas=False)
        probabilities = None

    prob_labels = probabilities.columns.values.astype(str).tolist() if probabilities is not None else None

    _leaderboard_extra_info = config.framework_params.get('_leaderboard_extra_info', False)  # whether to get extra model info (very verbose)
    _leaderboard_test = config.framework_params.get('_leaderboard_test', False)  # whether to compute test scores in leaderboard (expensive)
    leaderboard_kwargs = dict(silent=True, extra_info=_leaderboard_extra_info)
    # Leaderboard test data input is disabled by default to avoid long-running computation;
    # remove the 7200s timeout limitation to re-enable.
    if _leaderboard_test:
        leaderboard_kwargs['data'] = test
    leaderboard = predictor.leaderboard(**leaderboard_kwargs)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
        log.info(leaderboard)

    num_models_trained = len(leaderboard)
    if predictor._trainer.model_best is not None:
        num_models_ensemble = len(predictor._trainer.get_minimum_model_set(predictor._trainer.model_best))
    else:
        num_models_ensemble = 1

    save_artifacts(predictor, leaderboard, config)
    shutil.rmtree(predictor.path, ignore_errors=True)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  probabilities=probabilities,
                  probabilities_labels=prob_labels,
                  target_is_encoded=False,
                  models_count=num_models_trained,
                  models_ensemble_count=num_models_ensemble,
                  training_duration=training.duration,
                  predict_duration=predict.duration)
train_data = TabularDataset("../../data/processed/oversampled/train_valid_feat_eng_oversample.csv")
# train_data = train_data.drop(["Age", "Room_Rate", "Discount_Rate"], axis="columns")
save_path = "models_oversample_valid"

predictor = TabularPredictor(label="Reservation_Status", path=save_path,
                             eval_metric="f1_macro").fit(train_data,
                                                         time_limit=7200,
                                                         presets="best_quality")

valid_data = TabularDataset("../../data/processed/valid_preproc.csv")
y_test = valid_data.loc[:, "Reservation_Status"]
valid_data = valid_data.drop(["Reservation_Status"], axis="columns")

y_pred = predictor.predict(valid_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
print(perf)

test_data = TabularDataset("../../data/processed/test_preproc.csv")
test_preds = predictor.predict(test_data)
test_df = pd.read_csv("../../data/processed/test_preproc.csv")
test_df["Reservation_Status"] = test_preds
test_df = test_df.replace(
    {"Reservation_Status": {
        "Check-In": 1,
        "Canceled": 2,
        "No-Show": 3
    }})
def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args,
                           dataset_indices=None, run_distill=False, crash_in_oof=False):
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in a single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = get_benchmark_sets()
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets)  # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            dataset = datasets[idx]
            train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file,
                                              test_file=test_file, name=dataset['name'], url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx + 1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label = dataset['label']
            y_test = test_data[label]
            test_data = test_data.drop(labels=[label], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                if subsample_size < len(train_data):
                    # .sample instead of .head to increase diversity and test cases where the data index is not monotonically increasing.
                    train_data = train_data.sample(n=subsample_size, random_state=seed_val)  # subsample for fast_benchmark
            predictor = TabularPredictor(label=label, path=savedir).fit(train_data, **fit_args)
            results = predictor.fit_summary(verbosity=4)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: AutoGluon inferred problem_type = %s, but should = %s"
                              % (dataset['name'], predictor.problem_type, dataset['problem_type']))
            predictor = TabularPredictor.load(savedir)  # Test loading previously-trained predictor from file
            y_pred_empty = predictor.predict(test_data[0:0])
            assert len(y_pred_empty) == 0
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy']  # convert accuracy to error rate
            else:
                perf = 1.0 - perf_dict['r2']  # unexplained variance score
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s (previous perf=%s)" % (dataset['name'], performance_vals[idx], dataset['performance_val']))
            if (not fast_benchmark) and (performance_vals[idx] > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance."
                              % (dataset['name'], performance_vals[idx] / (EPS + dataset['performance_val'])))
            if predictor._trainer.bagged_mode and not crash_in_oof:
                # TODO: Test index alignment with original training data (first handle duplicated rows / dropped rows edge cases)
                y_pred_oof = predictor.get_oof_pred()
                y_pred_proba_oof = predictor.get_oof_pred_proba(as_multiclass=False)
                y_pred_oof_transformed = predictor.get_oof_pred(transformed=True)
                y_pred_proba_oof_transformed = predictor.get_oof_pred_proba(as_multiclass=False, transformed=True)

                # Assert expected output types
                assert isinstance(y_pred_oof, pd.Series)
                assert isinstance(y_pred_oof_transformed, pd.Series)
                if predictor.problem_type == MULTICLASS:
                    assert isinstance(y_pred_proba_oof, pd.DataFrame)
                    assert isinstance(y_pred_proba_oof_transformed, pd.DataFrame)
                else:
                    if predictor.problem_type == BINARY:
                        assert isinstance(predictor.get_oof_pred_proba(), pd.DataFrame)
                    assert isinstance(y_pred_proba_oof, pd.Series)
                    assert isinstance(y_pred_proba_oof_transformed, pd.Series)

                assert y_pred_oof_transformed.equals(predictor.transform_labels(y_pred_oof, proba=False))

                # Test that transform_labels reproduces the same output when converting back and forth,
                # and that the oof 'transformed' parameter works properly.
                y_pred_proba_oof_inverse = predictor.transform_labels(y_pred_proba_oof, proba=True)
                y_pred_proba_oof_inverse_inverse = predictor.transform_labels(y_pred_proba_oof_inverse, proba=True, inverse=True)
                y_pred_oof_inverse = predictor.transform_labels(y_pred_oof)
                y_pred_oof_inverse_inverse = predictor.transform_labels(y_pred_oof_inverse, inverse=True)

                if isinstance(y_pred_proba_oof_transformed, pd.DataFrame):
                    pd.testing.assert_frame_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
                    pd.testing.assert_frame_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
                else:
                    pd.testing.assert_series_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
                    pd.testing.assert_series_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
                pd.testing.assert_series_equal(y_pred_oof_transformed, y_pred_oof_inverse)
                pd.testing.assert_series_equal(y_pred_oof, y_pred_oof_inverse_inverse)

                # Test that the indices of both the internal training data and the oof outputs are consistent in their index values.
                X_internal, y_internal = predictor.load_data_internal()
                y_internal_index = list(y_internal.index)
                assert list(X_internal.index) == y_internal_index
                assert list(y_pred_oof.index) == y_internal_index
                assert list(y_pred_proba_oof.index) == y_internal_index
                assert list(y_pred_oof_transformed.index) == y_internal_index
                assert list(y_pred_proba_oof_transformed.index) == y_internal_index
            else:
                # Raise exception
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred()
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred_proba()
            if run_distill:
                predictor.distill(time_limit=60, augment_args={'size_factor': 0.5})

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s (previous perf=%s)" % (datasets[idx]['name'], performance_vals[idx], datasets[idx]['performance_val']))
    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn("Average Performance is %s times worse than previously."
                          % (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn("Median Performance is %s times worse than previously."
                          % (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn("Worst Performance is %s times worse than previously."
                          % (worst_perf / (EPS + previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
               'name': 'AdultIncomeBinaryClassification',
               'problem_type': BINARY}
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file,
                                      test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = TabularPredictor(label=label, path=savedir).fit(train_data)
    leaderboard = predictor.leaderboard(data=test_data)
    extra_metrics = ['accuracy', 'roc_auc', 'log_loss']
    leaderboard_extra = predictor.leaderboard(data=test_data, extra_info=True, extra_metrics=extra_metrics)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    assert set(leaderboard_extra.columns).issuperset(set(extra_metrics))  # Assert that extra_metrics are present in output
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(data=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert set(feature_importances.index) == original_features
    assert set(feature_importances.columns) == {'importance', 'stddev', 'p_value', 'n', 'p99_high', 'p99_low'}
    predictor.transform_features()
    predictor.transform_features(data=test_data)
    predictor.info()

    assert predictor.get_model_names_persisted() == []  # Assert that no models were persisted during training
    assert predictor.unpersist_models() == []  # Assert that no models were unpersisted
    persisted_models = predictor.persist_models(models='all', max_memory=None)
    assert set(predictor.get_model_names_persisted()) == set(persisted_models)  # Ensure all models are persisted
    assert predictor.persist_models(models='all', max_memory=None) == []  # Ensure that no additional models are persisted on repeated calls
    unpersisted_models = predictor.unpersist_models()
    assert set(unpersisted_models) == set(persisted_models)
    assert predictor.get_model_names_persisted() == []  # Assert that all models were unpersisted
    # Raise exception
    with pytest.raises(NetworkXError):
        predictor.persist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2'])
    assert predictor.get_model_names_persisted() == []
    assert predictor.unpersist_models(models=['UNKNOWN_MODEL_1', 'UNKNOWN_MODEL_2']) == []
    predictor.persist_models(models='all', max_memory=None)
    predictor.save()  # Save predictor while models are persisted: intended functionality is that they won't be persisted when loaded.
    predictor_loaded = TabularPredictor.load(predictor.path)  # Assert that predictor loading works
    leaderboard_loaded = predictor_loaded.leaderboard(data=test_data)
    assert len(leaderboard) == len(leaderboard_loaded)
    assert predictor_loaded.get_model_names_persisted() == []  # Assert that models were not still persisted after loading predictor

    assert predictor.get_model_full_dict() == dict()
    predictor.refit_full()
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2
    for model in predictor.get_model_names():
        predictor.predict(data=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_full models aren't further refit.
    assert len(predictor.get_model_full_dict()) == num_models
    assert len(predictor.get_model_names()) == num_models * 2

    predictor.delete_models(models_to_keep=[])  # Test that dry-run doesn't delete models
    assert len(predictor.get_model_names()) == num_models * 2
    predictor.predict(data=test_data)

    predictor.delete_models(models_to_keep=[], dry_run=False)  # Test that dry_run=False deletes models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0

    try:
        predictor.predict(data=test_data)
    except Exception:
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
from autogluon.tabular import TabularDataset, TabularPredictor

# Train
train_data = TabularDataset('train.csv')
id, label = 'PassengerId', 'Survived'
save_path = 'model'
time_limit = 300

predictor = TabularPredictor(label=label, path=save_path).fit(
    train_data.drop(columns=[id]), time_limit=time_limit, presets='best_quality')

# Test
import pandas as pd

test_data = TabularDataset('test.csv')
# predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load a previously-trained predictor from file
preds = predictor.predict(test_data.drop(columns=[id]))
submission = pd.DataFrame({id: test_data[id], label: preds})
submission.to_csv('submission.csv', index=False)
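# Since test.csv carries no 'Survived' column, scores for this script come from
# AutoGluon's internal validation. A quick, optional sanity check before submitting
# (leaderboard() needs no labeled test data for this):
print(predictor.leaderboard(silent=True))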