def train(args):
    """Train a ``TabularPredictor`` and export summary/evaluation artifacts.

    Args:
        args: Job arguments object. This function reads ``output_dir``,
            ``hosts``, ``current_host``, ``train``, ``test``, ``model_dir``,
            ``init_args`` (must contain ``'label'``), ``fit_args`` and
            ``feature_importance``. ``args.init_args['path']`` is overwritten
            with ``args.model_dir``.

    Side effects:
        * Writes ``columns.pkl`` into the current working directory.
        * Writes artifacts into ``{args.output_dir}/data``: the model summary
          HTML (when it exists), ``ensemble-model.png`` and, when labeled test
          data is supplied, ``leaderboard.csv``, ``feature_importance.csv``,
          ``classification_report.csv`` and ``confusion_matrix.png``.
        * Prints a listing of ``/opt/ml/model/``.

    Raises:
        ValueError: If ``args.current_host`` is not listed in ``args.hosts``.
    """
    model_output_dir = f'{args.output_dir}/data'
    # Every artifact below is written into this directory, so make sure it exists.
    os.makedirs(model_output_dir, exist_ok=True)

    # NOTE(review): these cluster values are computed but not used further in
    # this function. The host list is copied before popping so that the
    # caller's ``args.hosts`` is not mutated through an alias.
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = list(args.hosts)
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.init_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    args.init_args['path'] = args.model_dir
    predictor = TabularPredictor(**args.init_args).fit(train_data, **args.fit_args)

    # Results summary
    predictor.fit_summary(verbosity=3)
    model_summary_fname_src = os.path.join(args.model_dir, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')
    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # Ensemble visualization: drop isolated nodes, then render bottom-to-top.
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        # 'rectangle' is a valid Graphviz shape name (the previous value was misspelled).
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if target in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance_df = predictor.feature_importance(test_data)
                print(feature_importance_df)
                feature_importance_df.to_csv(
                    f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix

                X_test = test_data.drop(target, axis=1)
                y_test_true = test_data[target]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)

                report_dict = classification_report(
                    y_test_true, y_test_pred, output_dict=True,
                    labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(
                    f'{model_output_dir}/classification_report.csv', index=True)

                cm = confusion_matrix(y_test_true, y_test_pred,
                                      labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                # Save before showing: on interactive backends show() may leave
                # an empty current figure, which would be written out blank.
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')
                plt.show()

                get_roc_auc(y_test_true, y_test_pred_prob, predictor.class_labels,
                            predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.'
            )

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
def _assert_oof_outputs_consistent(predictor):
    """Check the out-of-fold prediction API of a bagged ``predictor``.

    Verifies output types, that ``transform_labels`` round-trips the
    out-of-fold outputs, and that their indices match the predictor's
    internal training data.
    """
    # TODO: Test index alignment with original training data (first handle duplicated rows / dropped rows edge cases)
    y_pred_oof = predictor.get_oof_pred()
    y_pred_proba_oof = predictor.get_oof_pred_proba(as_multiclass=False)
    y_pred_oof_transformed = predictor.get_oof_pred(transformed=True)
    y_pred_proba_oof_transformed = predictor.get_oof_pred_proba(as_multiclass=False, transformed=True)

    # Assert expected type output
    assert isinstance(y_pred_oof, pd.Series)
    assert isinstance(y_pred_oof_transformed, pd.Series)
    if predictor.problem_type == MULTICLASS:
        assert isinstance(y_pred_proba_oof, pd.DataFrame)
        assert isinstance(y_pred_proba_oof_transformed, pd.DataFrame)
    else:
        if predictor.problem_type == BINARY:
            assert isinstance(predictor.get_oof_pred_proba(), pd.DataFrame)
        assert isinstance(y_pred_proba_oof, pd.Series)
        assert isinstance(y_pred_proba_oof_transformed, pd.Series)

    assert y_pred_oof_transformed.equals(predictor.transform_labels(y_pred_oof, proba=False))

    # Test that the transform_labels method is capable of reproducing the same output when
    # converting back and forth, and test that oof 'transform' parameter works properly.
    y_pred_proba_oof_inverse = predictor.transform_labels(y_pred_proba_oof, proba=True)
    y_pred_proba_oof_inverse_inverse = predictor.transform_labels(
        y_pred_proba_oof_inverse, proba=True, inverse=True)
    y_pred_oof_inverse = predictor.transform_labels(y_pred_oof)
    y_pred_oof_inverse_inverse = predictor.transform_labels(y_pred_oof_inverse, inverse=True)

    if isinstance(y_pred_proba_oof_transformed, pd.DataFrame):
        pd.testing.assert_frame_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
        pd.testing.assert_frame_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
    else:
        pd.testing.assert_series_equal(y_pred_proba_oof_transformed, y_pred_proba_oof_inverse)
        pd.testing.assert_series_equal(y_pred_proba_oof, y_pred_proba_oof_inverse_inverse)
    pd.testing.assert_series_equal(y_pred_oof_transformed, y_pred_oof_inverse)
    pd.testing.assert_series_equal(y_pred_oof, y_pred_oof_inverse_inverse)

    # Test that index of both the internal training data and the oof outputs are
    # consistent in their index values.
    X_internal, y_internal = predictor.load_data_internal()
    y_internal_index = list(y_internal.index)
    assert list(X_internal.index) == y_internal_index
    assert list(y_pred_oof.index) == y_internal_index
    assert list(y_pred_proba_oof.index) == y_internal_index
    assert list(y_pred_oof_transformed.index) == y_internal_index
    assert list(y_pred_proba_oof_transformed.index) == y_internal_index


def run_tabular_benchmarks(fast_benchmark, subsample_size, perf_threshold, seed_val, fit_args, dataset_indices=None, run_distill=False, crash_in_oof=False):
    """Fit and evaluate a ``TabularPredictor`` on each benchmark dataset.

    For every dataset the predictor is trained, reloaded from disk, scored on
    the test split (error rate for non-regression problems, ``1 - r2`` for
    regression) and compared against the stored ``performance_val``.
    Regressions are reported as warnings, never as failures.

    Args:
        fast_benchmark: If true, subsample the training data to
            ``subsample_size`` rows and skip the performance comparisons.
        subsample_size: Row count used when ``fast_benchmark`` is true.
        perf_threshold: Multiplier on the previous performance above which a
            warning is issued.
        seed_val: Seed applied before each dataset; ``None`` leaves the
            random state untouched.
        fit_args: Keyword arguments forwarded to ``TabularPredictor.fit``.
        dataset_indices: Optional indices selecting a subset of the datasets.
        run_distill: If true, also call ``predictor.distill`` per dataset.
        crash_in_oof: If true, expect the out-of-fold getters to raise
            ``AssertionError`` even in bagged mode.

    Raises:
        ValueError: If ``fast_benchmark`` is set without ``subsample_size``.
    """
    print("Running fit with args:")
    print(fit_args)
    # Each train/test dataset must be located in single directory with the given names.
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    EPS = 1e-10  # guards the ratio computations against a zero previous performance

    # List containing dicts for each dataset to include in benchmark (try to order based on runtimes)
    datasets = get_benchmark_sets()
    if dataset_indices is not None:  # only run some datasets
        datasets = [datasets[i] for i in dataset_indices]

    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(datasets)  # performance obtained in this run
    directory_prefix = './datasets/'
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx, dataset in enumerate(datasets):
            train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file,
                                              test_file=test_file, name=dataset['name'], url=dataset['url'])
            if seed_val is not None:
                seed(seed_val)
                np.random.seed(seed_val)
            print("Evaluating Benchmark Dataset %s (%d of %d)" % (dataset['name'], idx+1, len(datasets)))
            directory = directory_prefix + dataset['name'] + "/"
            savedir = directory + 'AutogluonOutput/'
            # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            shutil.rmtree(savedir, ignore_errors=True)
            label = dataset['label']
            y_test = test_data[label]
            test_data = test_data.drop(labels=[label], axis=1)
            if fast_benchmark:
                if subsample_size is None:
                    raise ValueError("fast_benchmark specified without subsample_size")
                if subsample_size < len(train_data):
                    # .sample instead of .head to increase diversity and test cases where data index is not monotonically increasing.
                    train_data = train_data.sample(n=subsample_size, random_state=seed_val)  # subsample for fast_benchmark
            predictor = TabularPredictor(label=label, path=savedir).fit(train_data, **fit_args)
            predictor.fit_summary(verbosity=4)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn("For dataset %s: Autogluon inferred problem_type = %s, but should = %s" % (dataset['name'], predictor.problem_type, dataset['problem_type']))
            predictor = TabularPredictor.load(savedir)  # Test loading previously-trained predictor from file
            y_pred_empty = predictor.predict(test_data[0:0])
            assert len(y_pred_empty) == 0
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict['accuracy']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict['r2']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s (previous perf=%s)" % (dataset['name'], performance_vals[idx], dataset['performance_val']))
            if (not fast_benchmark) and (performance_vals[idx] > dataset['performance_val'] * perf_threshold):
                warnings.warn("Performance on dataset %s is %s times worse than previous performance." % (dataset['name'], performance_vals[idx]/(EPS+dataset['performance_val'])))
            if predictor._trainer.bagged_mode and not crash_in_oof:
                _assert_oof_outputs_consistent(predictor)
            else:
                # Raise exception
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred()
                with pytest.raises(AssertionError):
                    predictor.get_oof_pred_proba()
            if run_distill:
                predictor.distill(time_limit=60, augment_args={'size_factor':0.5})

        # Summarize (still inside the recording context so these warnings are
        # collected and replayed together with the per-dataset ones):
        avg_perf = np.mean(performance_vals)
        median_perf = np.median(performance_vals)
        worst_perf = np.max(performance_vals)
        for dataset, perf in zip(datasets, performance_vals):
            print("Performance on dataset %s: %s (previous perf=%s)" % (dataset['name'], perf, dataset['performance_val']))
        print("Average performance: %s" % avg_perf)
        print("Median performance: %s" % median_perf)
        print("Worst performance: %s" % worst_perf)
        if not fast_benchmark:
            if avg_perf > previous_avg_performance * perf_threshold:
                warnings.warn("Average Performance is %s times worse than previously." % (avg_perf/(EPS+previous_avg_performance)))
            if median_perf > previous_median_performance * perf_threshold:
                warnings.warn("Median Performance is %s times worse than previously." % (median_perf/(EPS+previous_median_performance)))
            if worst_perf > previous_worst_performance * perf_threshold:
                warnings.warn("Worst Performance is %s times worse than previously." % (worst_perf/(EPS+previous_worst_performance)))

    print("Ran fit with args:")
    print(fit_args)
    # List all warnings again to make sure they are seen. This must happen
    # after the recording context has exited: inside it, the replayed warnings
    # would be appended to the list being iterated instead of being shown.
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
""" Example script for predicting columns of tables, demonstrating simple use-case """ from autogluon.tabular import TabularDataset, TabularPredictor # Training time: train_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(500) # subsample for faster demo print(train_data.head()) label = 'class' # specifies which column do we want to predict save_path = 'ag_models/' # where to save trained models predictor = TabularPredictor(label=label, path=save_path).fit(train_data) # NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead: # predictor = TabularPredictor(label=label_column, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality') results = predictor.fit_summary() # Inference time: test_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame y_test = test_data[label] test_data = test_data.drop(labels=[label], axis=1) # delete labels from test data since we wouldn't have them in practice print(test_data.head()) predictor = TabularPredictor.load(save_path) # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file y_pred = predictor.predict(test_data) perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
def autogluon(df, task, timelife):
    """Fit AutoGluon on a 70/30 split of ``df`` and report hold-out metrics.

    Args:
        df: Input table; it is copied and split into features and target by
            ``return_X_y``.
        task: ``'classification'`` selects a binary/multiclass problem type
            (based on the number of distinct target values); any other value
            is treated as regression.
        timelife: Training time budget in minutes (multiplied by 60 for
            ``time_limit``).

    Returns:
        ``(accuracy, f1, leaderboard)`` for classification, otherwise
        ``(root_mean_squared_error, r2, leaderboard)``.

    Side effects:
        Sets ``pd.options.mode.chained_assignment`` to ``None`` for the whole
        process, and deletes ``./AutogluonModels`` when done (also when
        prediction or evaluation fails).
    """
    pd.options.mode.chained_assignment = None
    df_new = copy.copy(df)
    X, y, _ = return_X_y(df_new)
    if isinstance(y, pd.Series):
        y = y.to_frame()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    if isinstance(y_train, pd.Series):
        y_train = y_train.to_frame()
    if isinstance(y_test, pd.Series):
        y_test = y_test.to_frame()
    target = y_train.columns[0]

    # 1-D view of the true labels. ``.iloc[:, 0]`` stays a Series even for a
    # single-row split, where ``.squeeze()`` would collapse to a scalar.
    y_test_true = y_test.iloc[:, 0]

    X_train[target] = y_train
    train = X_train
    test = X_test

    f1_kwargs = {}
    if task == 'classification':
        if len(y[y.columns[0]].unique()) > 2:
            pt = 'multiclass'
            f1_kwargs = {'average': 'weighted'}
        else:
            pt = 'binary'
    else:
        pt = 'regression'

    try:
        # No explicit path is given, so models are saved under AutoGluon's
        # default output directory, which is removed in the ``finally`` below.
        predictor = TabularPredictor(label=target, problem_type=pt).fit(
            train_data=train,
            time_limit=timelife * 60,
            presets=['optimize_for_deployment'])
        predictor.fit_summary()
        y_pred = predictor.predict(test)
        # NOTE(review): the leaderboard is scored on the full ``df``, which
        # appears to include the rows used for training - confirm intended.
        pipelines = predictor.leaderboard(df, silent=True)
        res = predictor.evaluate_predictions(y_true=y_test_true,
                                             y_pred=y_pred,
                                             auxiliary_metrics=True)
    finally:
        # Always clean up the saved models, including on failure; tolerate a
        # missing directory so cleanup never masks the original exception.
        shutil.rmtree('./AutogluonModels', ignore_errors=True)

    if task == 'classification':
        return (res['accuracy'], f1_score(y_test_true, y_pred, **f1_kwargs), pipelines)
    return (res['root_mean_squared_error'], res['r2'], pipelines)
'num_boost_round': 1000, 'learning_rate': ag.Real(0.01, 0.1, log=True) }, 'XGB': { 'n_estimators': 1000, 'learning_rate': ag.Real(0.01, 0.1, log=True) } } predictor = TabularPredictor(label=label, path=save_path).fit( train_data, hyperparameters=hyperparameters, hyperparameter_tune_kwargs='auto', time_limit=60) results = predictor.fit_summary() # display detailed summary of fit() process print(results) # Inference time: test_data = TabularDataset( 'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv' ) # another Pandas DataFrame print(test_data.head()) perf = predictor.evaluate( test_data ) # shorthand way to evaluate our predictor if test-labels are available # Otherwise we make predictions and can evaluate them later: y_pred = predictor.predict_proba(test_data) perf = predictor.evaluate_predictions(y_true=test_data[label],
# Enter text for testing s = 'pd.DataFrame' sample_dtypes = { 'list': [1, 'a', [2, 'c'], { 'b': 2 }], 'str': 'Hello Streamlit!', 'int': 17, 'float': 17.0, 'dict': { 1: 'a', 'x': [2, 'c'], 2: { 'b': 2 } }, 'bool': True, 'pd.DataFrame': y_predproba } sample_dtypes = sample_dtypes # Download predictor st.write("予測・分類結果をダウンロードします。※分類問題においては各ラベルに対する確率で表示されています。") download_button_str = download_button( sample_dtypes[s], "prediction.csv", 'Click here to download prediction.csv') st.markdown(download_button_str, unsafe_allow_html=True) st.write(predictor.fit_summary()) else: st.write("チェックを入れると教師データによる学習モデルの作成とテストデータに対する計算が行われます")
from autogluon.tabular import TabularDataset, TabularPredictor

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# The source may equally be a local CSV file; the loaded object is a Pandas
# DataFrame. Only the first 500 rows are kept so the demo runs quickly.
train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv').head(500)
print(train_data.head())

label = 'class'  # column to predict
save_path = 'ag_models/'  # directory where trained models are stored

predictor = TabularPredictor(path=save_path, label=label)
predictor = predictor.fit(train_data)
# NOTE: The defaults above favour a reasonable runtime over accuracy. To
# maximize predictive accuracy, use this instead:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')
results = predictor.fit_summary(show_plot=True)

# ---------------------------------------------------------------------------
# Inference
# ---------------------------------------------------------------------------
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # also a Pandas DataFrame
y_test = test_data[label]
# Remove the labels from the test data, since in practice they would not be available.
test_data = test_data.drop(columns=[label])
print(test_data.head())

# Reloading is not required here; it only demonstrates how a previously
# trained predictor is restored from disk.
predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data)