def frc_AutoGluon(df_train, df_test, categoricalVars, experiment_label='grocery', responseVar='wk1_sales_all_stores'):
    """Fit an AutoGluon tabular model and forecast with the best model and every individual model.

    Returns a dict with the best-model forecast, the fitted predictor, and a
    per-model forecast dict keyed 'AG_<model_name>'.
    """
    import autogluon as ag
    from autogluon import TabularPrediction as task

    # Cast categorical columns to strings (in place) so AutoGluon treats them as categories.
    for col in categoricalVars:
        df_train[col] = df_train[col].astype(str)
        df_test[col] = df_test[col].astype(str)

    # Wrap frames in the AutoGluon Dataset format.
    ag_train = task.Dataset(df=df_train)
    ag_test = task.Dataset(df=df_test)

    model = task.fit(train_data=ag_train,
                     output_directory="auto_gluon/" + experiment_label,
                     label=responseVar,
                     hyperparameter_tune=False)

    # Forecast with the best model found by AutoGluon.
    best_frc = model.predict(ag_test)

    # Forecast with each trained model individually.
    per_model_frc = {}
    for model_name in model.model_names:
        per_model_frc['AG_' + model_name] = model.predict(ag_test, model=model_name)

    return {'autoGluon_frc': best_frc,
            'autoGluon_model': model,
            'individual_frc': per_model_frc}
def run_example(self):
    """End-to-end AutoGluon demo: train on churn data, predict on held-out test
    data, and return the evaluation metrics dict.
    """
    train_data = task.Dataset(file_path='./data/churn-train.csv')
    train_data = train_data.head(500)  # subsample 500 data points for faster demo
    print(train_data.head())
    label_column = 'churn_probability'
    print("Summary of class variable: \n", train_data[label_column].describe())
    dir = 'agModels-predictClass'  # specifies folder where to store trained models
    predictor = task.fit(train_data=train_data, label=label_column,
                         eval_metric="mean_absolute_error")
    test_data = task.Dataset(file_path='./data/churn-test.csv')
    y_test = test_data[label_column]  # values to predict
    # Delete label column to prove we're not cheating.
    test_data_nolab = test_data.drop(labels=[label_column], axis=1)
    print(test_data_nolab.head())
    #predictor = task.load(dir)  # unnecessary, just demonstrates how to load previously-trained predictor from file
    y_pred = predictor.predict(test_data_nolab)
    print("Predictions: ", y_pred)
    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred,
                                          auxiliary_metrics=True)
    # BUG FIX: `perf` is a dict of metrics, so `"MAE: " + perf` raised
    # TypeError (cannot concatenate str and dict). Format it instead.
    print(f"MAE: {perf}")
    return perf
def train(self, train_data, val_data, params):
    """Fit an AutoGluon TabularPrediction model, optionally with hyperparameter tuning.

    `params` supplies the label column, an "hp_tune" flag, and an "autogluon"
    section with tuning settings. The fitted model and its params are persisted
    under a timestamped output folder.
    """
    ds_train = TabularPrediction.Dataset(train_data)
    ds_val = TabularPrediction.Dataset(val_data)
    # Timestamped folder keeps successive runs from overwriting each other.
    out_dir = os.path.join(self.get_output_folders()[0], dt.now().strftime('%Y%m%d%H%M%S'))

    hp_tune = params["hp_tune"]
    ag_params = params["autogluon"]
    self._label_column = params["label"]

    fit_kwargs = {
        'train_data': ds_train,
        'tuning_data': ds_val,
        'label': self._label_column,
        'output_directory': out_dir,
    }
    if hp_tune is True:
        # Hyperparameter search: read tuning budget and search config from params.
        hp_params = ag_params["hyperparameters"]
        fit_kwargs['time_limits'] = hp_params["time_limits"]
        fit_kwargs['num_trials'] = hp_params["num_trials"]
        fit_kwargs['hyperparameter_tune'] = hp_tune
        fit_kwargs['hyperparameters'] = self.__create_hp_params(hp_params)
        fit_kwargs['search_strategy'] = hp_params["search_strategy"]

    self._model = TabularPrediction.fit(**fit_kwargs)

    self.__dump_params(out_dir, params)
    self._model.fit_summary()
def train(args):
    """SageMaker training entry point: load a CSV dataset and fit an AutoGluon predictor.

    Returns the fitted predictor.
    """
    # SageMaker passes num_cpus, num_gpus and other args we could use to tailor
    # training to the current container environment; only a simple CPU context is used.
    model_dir = args.model_dir
    train_dir = args.train_dir
    filename = args.filename
    target = args.target
    debug = args.debug
    eval_metric = args.eval_metric
    presets = args.presets
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    logging.info(train_dir)

    dataset = task.Dataset(file_path=os.path.join(train_dir, filename))
    if debug:
        # Subsample a small subset for a faster demo run; raise for real training.
        dataset = dataset.sample(n=500, random_state=0)

    return task.fit(train_data=dataset,
                    label=target,
                    output_directory=model_dir,
                    eval_metric=eval_metric,
                    presets=presets)
def train(self, data, params):
    """Train an AutoGluon predictor on the unscaled dataframe and mark the model TRAINED."""
    self.data = data
    self.train_data = task.Dataset(data.unscaled_df)
    # A fresh UUID per run keeps repeated trainings from colliding on disk.
    out_dir = f'agModels-predictClass/{uuid.uuid4()}'
    label = self.metadata.get("output")[0]
    self.predictor = task.fit(train_data=self.train_data, label=label, output_directory=out_dir)
    self.state = "TRAINED"
def run(self, train_path, test_path, target, task):
    """Train an AutoGluon predictor with 5-fold bagging and evaluate it on a test set.

    Args:
        train_path: Path to the training CSV file.
        test_path: Path to the test CSV file.
        target: Name of the label column.
        task: AutoGluon TabularPrediction task module/object.

    Returns:
        Metrics dict from ``predictor.evaluate_predictions``.
    """
    train_data = task.Dataset(file_path=train_path)
    # BUG FIX: the original passed `label=label_column`, an undefined name
    # (NameError at runtime); the label is the `target` parameter.
    predictor = task.fit(train_data=train_data, label=target,
                         eval_metric="f1_macro", num_bagging_folds=5)
    test_data = task.Dataset(file_path=test_path)
    y_test = test_data[target]
    y_pred = predictor.predict(test_data)
    return predictor.evaluate_predictions(y_true=y_test.to_numpy(), y_pred=y_pred,
                                          auxiliary_metrics=True)
def run_tabular_benchmark_toy(fit_args):
    """Run the toyClassification benchmark and assert predict() fails on a missing column.

    The dataset is a 2-D toy noisy, imbalanced 4-class classification task with:
    feature missingness, out-of-vocabulary feature categories in test data,
    out-of-vocabulary labels in test data, a training column missing from test
    data, and extra distraction columns in test data.

    Expected inference behaviour (1 warning, 1 error):
      Warning: Ignoring 181 (out of 1000) training examples for which the label
               value in column 'y' is missing
      ValueError: Required columns are missing from the provided dataset.
                  Missing columns: ['lostcolumn']
    If the ValueError were not triggered, a UserWarning about the ignored extra
    columns ['distractioncolumn1', 'distractioncolumn2'] would follow.
    """
    dataset = {
        'url': 'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
        'name': 'toyClassification',
        'problem_type': MULTICLASS,
        'label_column': 'y',
        'performance_val': 0.436,
    }
    directory_prefix = './datasets/'
    train_data, test_data = load_data(directory_prefix=directory_prefix,
                                      train_file='train_data.csv',
                                      test_file='test_data.csv',
                                      name=dataset['name'],
                                      url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    savedir = directory_prefix + dataset['name'] + "/" + 'AutogluonOutput/'
    # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    shutil.rmtree(savedir, ignore_errors=True)
    predictor = task.fit(train_data=train_data, label=dataset['label_column'],
                         output_directory=savedir, **fit_args)
    try:
        predictor.predict(test_data)
    except ValueError:
        # Expected: test_data has the missing column 'lostcolumn'.
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
def load_data(directory_prefix, train_file, test_file, name, url=None):
    """Return (train, test) AutoGluon Datasets, downloading and unzipping from `url` if absent locally."""
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    directory = directory_prefix + name + "/"
    train_path = directory + train_file
    test_path = directory + test_file
    files_present = os.path.exists(train_path) and os.path.exists(test_path)
    if not files_present:
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        archive = ag.download(url, directory_prefix)
        ag.unzip(archive, directory_prefix)
        os.remove(archive)
    return task.Dataset(file_path=train_path), task.Dataset(file_path=test_path)
def convert_gluon(X_train, y_train):
    """Convert a 2-D feature array and label vector into an AutoGluon Dataset.

    Columns are named 'feature_0' .. 'feature_{d-1}' plus a final 'class'
    column holding y_train. Assumes X_train is non-empty and rectangular.
    """
    n_features = len(X_train[0])
    # Build each column in one pass. The original appended row-by-row with a
    # bare `except: pass` (silently swallowing errors), rebuilt each list via
    # quadratic `list + [x]` concatenation, and left a debug `print(data)`
    # inside the row loop — all removed here.
    data = {f'feature_{j}': [row[j] for row in X_train] for j in range(n_features)}
    data['class'] = y_train
    frame = pd.DataFrame(data, columns=list(data))
    return task.Dataset(frame)
def define_and_evaluate_autogluon_pipeline(X, y, random_state=0):
    """Nested-CV evaluation of AutoGluon; returns the per-fold weighted AUROC scores."""
    # Assemble one dataframe with the target in column "y".
    data_df = pd.DataFrame(X)
    data_df["y"] = y
    # Binary targets use AUROC directly; multiclass falls back to weighted F1
    # for model selection (AutoGluon has no multiclass AUROC) and is scored
    # with one-vs-rest weighted AUROC below.
    if len(set(y)) == 2:
        eval_metric, problem_type = "roc_auc", "binary"
    else:
        eval_metric, problem_type = "f1_weighted", "multiclass"
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    nested_scores = []
    for train_inds, test_inds in outer_cv.split(X, y):
        fold_train = data_df.iloc[train_inds, :]
        fold_test = data_df.iloc[test_inds, :]
        predictor = task.fit(
            fold_train,
            "y",
            time_limits=SEC,
            presets="best_quality",
            output_directory=".autogluon_temp",
            eval_metric=eval_metric,
            problem_type=problem_type,
            verbosity=0,
        )
        probs = predictor.predict_proba(data_df.iloc[test_inds, :])
        # Equivalent to roc_auc_ovr_weighted.
        fold_score = roc_auc_score(fold_test["y"], probs, average="weighted", multi_class="ovr")
        nested_scores.append(fold_score)
    return nested_scores
def train(args):
    """SageMaker training entry point: fit an AutoGluon predictor from parsed CLI args.

    Trains with the full set of fit options from `args`, prints a fit summary,
    optionally prints a leaderboard on labeled test data, and lists the
    exported model files under /opt/ml/model.
    """
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    # NOTE(review): this mutates args.hosts in place (removes the current host).
    dist_ip_addrs.pop(host_rank)
    ngpus_per_trial = 1 if args.num_gpus > 0 else 0

    # load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)
    print(f'Label counts: {dict(Counter(train_data[args.label]))}')
    print(f'hp: {args.hyperparameters}')

    predictor = task.fit(
        train_data=train_data,
        label=args.label,
        output_directory=args.model_dir,
        problem_type=args.problem_type,
        eval_metric=args.eval_metric,
        stopping_metric=args.stopping_metric,
        auto_stack=args.auto_stack,  # default: False
        hyperparameter_tune=args.hyperparameter_tune,  # default: False
        feature_prune=args.feature_prune,  # default: False
        holdout_frac=args.holdout_frac,  # default: None
        num_bagging_folds=args.num_bagging_folds,  # default: 0
        num_bagging_sets=args.num_bagging_sets,  # default: None
        stack_ensemble_levels=args.stack_ensemble_levels,  # default: 0
        hyperparameters=args.hyperparameters,
        cache_data=args.cache_data,
        time_limits=args.time_limits,
        num_trials=args.num_trials,  # default: None
        search_strategy=args.search_strategy,  # default: 'random'
        search_options=args.search_options,
        visualizer=args.visualizer,
        verbosity=args.verbosity)

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Leaderboard on optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        print('Running model on test data and getting Leaderboard...')
        leaderboard = predictor.leaderboard(dataset=test_data, silent=True)

        def format_for_print(df):
            # Render the leaderboard dataframe as an ASCII table for the log.
            table = PrettyTable(list(df.columns))
            for row in df.itertuples():
                table.add_row(row[1:])
            return str(table)
        print(format_for_print(leaderboard), end='\n\n')

    # Files summary
    print(f'Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
def train_autogluon(X_train, X_test, y_train, y_test, mtype, common_name_model, problemtype, classes, default_featurenames, transform_model, settings, model_session):
    """Train an AutoGluon tabular classifier on the 'class' label and pickle the predictor.

    Returns (model_name, model_dir, files, test_data): the pickle filename, the
    current working directory, the list of produced artifacts/folders, and the
    converted test Dataset.
    """
    # get train and test data
    train_data = convert_gluon(X_train, y_train)
    test_data = convert_gluon(X_test, y_test)
    predictor = task.fit(train_data=train_data, label='class')

    # get summary
    results = predictor.fit_summary(verbosity=3)

    # pickle store classifier — context manager guarantees the handle is
    # closed even if pickling fails (the original open/close left it open on error)
    model_name = common_name_model + '.pickle'
    with open(model_name, 'wb') as f:
        pickle.dump(predictor, f)

    # Track the pickle plus the side-effect folders created during training
    # (keep this info together so callers can relocate/clean the artifacts).
    files = [model_name, 'AutogluonModels', 'catboost_info', 'dask-worker-space']

    model_dir = os.getcwd()
    return model_name, model_dir, files, test_data
def load(self, path):
    """Restore a trained TabularPrediction model and its label column name from `path`."""
    self._model = TabularPrediction.load(path)
    # The label column was persisted next to the model in params.json.
    params_file = os.path.join(path, "params.json")
    with open(params_file, "r") as f:
        self._label_column = json.load(f)["label"]
def train(args):
    """SageMaker entry point: fit an AutoGluon predictor on a subsampled CSV dataset.

    Returns the fitted predictor.
    """
    # SageMaker passes num_cpus, num_gpus and other args we could use to tailor
    # training to the container environment; only a simple CPU context is used.
    model_dir = args.model_dir
    target = args.label_column
    train_file_path = get_file_path(args.train, args.train_filename)
    dataset = task.Dataset(file_path=train_file_path)
    # Subsample for a faster demo; raise train_rows for real runs.
    n_rows = int(args.train_rows)
    dataset = dataset.sample(n=n_rows, random_state=0)
    return task.fit(train_data=dataset, label=target, output_directory=model_dir)
def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.

    :param: model_dir The directory where model files are stored.
    :return: a model (in this case an AutoGluon network)
    """
    return task.load(model_dir)
def train(args):
    """SageMaker training entry point: fit an AutoGluon predictor with presets,
    persist the training column names, and optionally report test-set results.

    Returns the fitted predictor.
    """
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    model_dir = args.model_dir
    target = args.label
    presets = args.presets

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)
    # train_file_path = get_file_path(args.train, args.train_filename)
    # train_data = task.Dataset(file_path= train_file_path )

    # Persist the training column names so inference can reconstruct the schema.
    columns = train_data.columns.tolist()
    column_dict = {"columns":columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    subsample_size = int(args.train_rows)  # subsample subset of data for faster demo, try setting this to much larger values
    train_data = train_data.sample(n=subsample_size, random_state=0)

    predictor = task.fit(train_data = train_data, label=target, output_directory=model_dir, presets = presets)

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        # Leaderboard on test data
        print('Running model on test data and getting Leaderboard...')
        leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
        print(format_for_print(leaderboard), end='\n\n')

        # Feature importance on test data
        # Note: Feature importance must be calculated on held-out (test) data.
        # If calculated on training data it will be biased due to overfitting.
        if args.feature_importance:
            print('Feature importance:')
            # Increase rows to print feature importance
            pd.set_option('display.max_rows', 500)
            print(predictor.feature_importance(test_data))

    # Files summary
    print(f'Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
    return predictor
def train(args):
    """SageMaker training entry point: fit AutoGluon using the dict in args.fit_args,
    persist the training column names, and optionally evaluate on labeled test data.
    """
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    # NOTE(review): this mutates args.hosts in place (removes the current host).
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.fit_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    # Persist the training schema so inference code can reconstruct it.
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        **args.fit_args,
    )

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if args.fit_args['label'] in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                print(predictor.feature_importance(test_data))
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.'
            )

    # Files summary
    print(f'Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.

    :param: model_dir The directory where model files are stored.
    :return: a model (in this case a Gluon network)
    """
    print(f'Loading model from {model_dir} with contents {os.listdir(model_dir)}')
    return task.load(model_dir, verbosity=True)
def test_advanced_functionality():
    """Exercise leaderboard, feature importance, refit_full, and model-deletion
    APIs end to end on the AdultIncome binary-classification benchmark.
    """
    fast_benchmark = True
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
               'name': 'AdultIncomeBinaryClassification',
               'problem_type': BINARY}
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=label, output_directory=savedir)
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data, extra_info=True)
    # Leaderboards must list exactly the trained models; extra_info only adds columns.
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    # Importance scores must cover exactly the non-label input features.
    assert(set(feature_importances.keys()) == original_features)
    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()
    # No refit copies exist before refit_full is called.
    assert(predictor.get_model_full_dict() == dict())
    predictor.refit_full()
    # refit_full adds one refit copy per original model.
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_models aren't further refit.
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.delete_models(models_to_keep=[])  # Test that dry-run doesn't delete models
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.predict(dataset=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # dry_run=False actually deletes the models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(dataset=test_data)
    except:
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
def predict(cls, prediction_input: DataFrame):
    """For the input, do the predictions and return them.

    Args:
        prediction_input (a pandas dataframe): The data on which to do the
            predictions. There will be one prediction per row in the dataframe.
    """
    dataset = task.Dataset(df=prediction_input)
    print("Prediction Data: ")
    print(dataset.head())
    return cls.model.predict(dataset)
def model_fn(model_dir):
    """
    Load the gluon model and its column info. Called once when hosting service starts.

    :param: model_dir The directory where model files are stored.
    :return: a model (in this case a Gluon network) and the column info.
    """
    print(f'Loading model from {model_dir} with contents {os.listdir(model_dir)}')
    net = task.load(model_dir, verbosity=True)
    # The training-time column schema is shipped alongside the inference code.
    columns_path = f'{model_dir}/code/columns.pkl'
    with open(columns_path, 'rb') as f:
        column_dict = pickle.load(f)
    return net, column_dict
def train(args):
    """SageMaker entry point: fit an AutoGluon predictor on a single CSV file.

    Returns the fitted predictor.
    """
    # SageMaker passes num_cpus, num_gpus and other args we could use to tailor
    # training to the container environment; only a simple CPU context is used.
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data
    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)

    dataset = task.Dataset(file_path=training_dir + '/' + filename)
    return task.fit(train_data=dataset, label=target, output_directory=model_dir)
def Load_GLUON(dataDownstream, dataFeaturized):
    """Run an AutoGluon fit over dataDownstream (with a constant dummy label) and
    return the feature_type values read back from AutoGluon_predictions.csv.

    The CSV is reset before fitting; presumably a patched AutoGluon populates it
    during fit() — TODO confirm against the AutoGluon build in use.
    """
    # Start with an empty predictions file.
    df = pd.DataFrame(columns=['column', 'feature_type'])
    df.to_csv('AutoGluon_predictions.csv', index=False)

    # Fit on a deep copy with a constant dummy label; only the side effects on
    # the predictions CSV are used, not the model itself.
    train = copy.deepcopy(dataDownstream)
    train['label_target'] = 1
    train_data = task.Dataset(df=train)
    label_column = 'label_target'
    try:
        features = task.fit(train_data=train_data, label=label_column)
    except Exception:
        # Deliberate best-effort: fitting may fail (e.g. constant label), and the
        # predictions CSV is still read below. Narrowed from a bare `except:` so
        # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
        pass

    agl_predictions = pd.read_csv('AutoGluon_predictions.csv')
    predictions = agl_predictions['feature_type'].values.tolist()
    return predictions
def train_regression_autogluon(args, train_df, test_df):
    """Fit AutoGluon on throughput ('thrpt') regression, save a gt/pred CSV and a figure."""
    mx.npx.reset_np()
    from autogluon import TabularPrediction as task
    predictor = task.fit(train_data=task.Dataset(df=train_df),
                         output_directory=args.out_dir,
                         label='thrpt',
                         eval_metric='mean_absolute_error')
    #performance = predictor.evaluate(test_df)
    preds = predictor.predict(test_df)
    # Column 0: ground truth, column 1: prediction.
    paired = np.zeros((len(preds), 2), dtype=np.float32)
    for idx, (truth, pred) in enumerate(zip(test_df['thrpt'].to_numpy(), preds)):
        paired[idx][0] = truth
        paired[idx][1] = pred
    result_df = pd.DataFrame(paired, columns=['gt', 'pred'])
    result_df.to_csv(os.path.join(args.out_dir, 'pred_result.csv'))
    plot_save_figure(gt_thrpt=test_df['thrpt'].to_numpy(), pred_thrpt=preds, save_dir=args.out_dir)
    mx.npx.set_np()
def __load_input_data(path: str) -> TabularDataset:
    """
    Load every CSV file under `path`, concatenated into one AutoGluon Dataset.

    :param path: directory containing one or more CSV files
    :return: Dataset, or None when the directory holds no readable CSV data
    """
    input_data_files = os.listdir(path)
    try:
        input_dfs = [pd.read_csv(f'{path}/{data_file}') for data_file in input_data_files]
        return task.Dataset(df=pd.concat(input_dfs))
    except Exception:
        # Deliberate best-effort: an empty directory or non-CSV content yields
        # None. Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print(f'No csv data in {path}!')
        return None
def frc_AutoGluon(df_train, df_test, categoricalVars, responseVar = 'wk1_sales_all_stores'):
    """Fit an AutoGluon tabular model on df_train and forecast responseVar for df_test.

    Returns a dict with the best-model forecast and the fitted predictor.
    """
    import autogluon as ag
    from autogluon import TabularPrediction as task

    # Cast categorical columns to strings (in place) so AutoGluon treats them as categories.
    for col in categoricalVars:
        df_train[col] = df_train[col].astype(str)
        df_test[col] = df_test[col].astype(str)

    # Wrap frames in the AutoGluon Dataset format.
    ag_train = task.Dataset(df=df_train)
    ag_test = task.Dataset(df=df_test)

    model = task.fit(train_data=ag_train, output_directory="auto_gluon",
                     label=responseVar, hyperparameter_tune=False)

    # Forecast with the best model found.
    return {'autoGluon_frc': model.predict(ag_test), 'autoGluon_model': model}
def train(args):
    """SageMaker entry point: fit AutoGluon with a fixed model portfolio and preset.

    Returns the fitted predictor.
    """
    # SageMaker passes num_cpus, num_gpus and other args we could use to tailor
    # training to the container environment; only a simple CPU context is used.
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data
    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)

    # Model portfolio: LightGBM (default + an extra-trees variant), random
    # forest, extremely-randomized trees, k-NN, and the custom GBM config.
    hyperparameters = {
        'GBM': [
            {},
            {'extra_trees': True, 'AG_args': {'name_suffix': 'XT'}},
        ],
        'RF': {},
        'XT': {},
        'KNN': {},
        'custom': ['GBM'],
    }
    presets = 'medium_quality_faster_train'
    dataset = task.Dataset(file_path=training_dir + '/' + filename)
    return task.fit(train_data=dataset, label=target, output_directory=model_dir,
                    presets=presets, hyperparameters=hyperparameters)
def evaluate(predictor, args):
    """Score the trained predictor on the companion test file and upload the
    predictions, leaderboard, performance metrics, and fit summary to S3.
    """
    train_dir = args.train_dir
    train_file = args.filename
    # Convention: the test file shares the train file's name with 'train' -> 'test'.
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output
    # Dataset name is the leading token of the file name (before the first '_').
    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)
    test_data = task.Dataset(file_path=os.path.join(train_dir, test_file))

    # Parse the S3 output URI into bucket and key prefix.
    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')

    # Predict on the unlabeled test rows and persist true-vs-predicted values.
    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)
    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)

    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
    # Dropped before the JSON dump — presumably not JSON-serializable; confirm.
    del perf['confusion_matrix']
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)

    # Upload all artifacts under the job-specific prefix
    # ('mxnet-training' is rewritten to 'autogluon' in the key).
    files_to_upload = [pred_file, lead_file, perf_file, summ_file]
    for file in files_to_upload:
        s3.upload_file(file, bucket, os.path.join(prefix, training_job_name.replace('mxnet-training', 'autogluon', 1), file))
def transform_fn(net, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.

    :param net: The Gluon model.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    """
    start = timer()

    # text/csv
    if input_content_type == 'text/csv':
        # Load dataset
        df = pd.read_csv(StringIO(data))
        ds = task.Dataset(df=df)

        # Predict
        predictions = net.predict(ds)
        print(f'Prediction counts: {Counter(predictions.tolist())}')

        # Form response
        output = StringIO()
        pd.DataFrame(predictions).to_csv(output, header=False, index=False)
        response_body = output.getvalue()

        # If target column passed, evaluate predictions performance
        target = net.label_column
        if target in ds:
            print(f'Label column ({target}) found in input data. '
                  'Therefore, evaluating prediction performance...')
            performance = net.evaluate_predictions(y_true=ds[target],
                                                   y_pred=predictions.tolist(),
                                                   auxiliary_metrics=True)
            print(json.dumps(performance, indent=4))
    else:
        raise NotImplementedError("content_type must be 'text/csv'")

    # BUG FIX: the original assigned `elapsed_time` but then re-measured the
    # timer inside the print, logging a slightly different (unused-var) value.
    elapsed_time = round(timer() - start, 3)
    print(f'Elapsed time: {elapsed_time} seconds')
    return response_body, output_content_type
def train(args):
    """SageMaker training entry point: fit AutoGluon via args.fit_args, save summary
    artifacts and an ensemble-graph image, and (optionally) evaluate on test data
    with leaderboard, feature importance, classification report, and confusion matrix.
    """
    model_output_dir = f'{args.output_dir}/data'

    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    # NOTE(review): this mutates args.hosts in place (removes the current host).
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.fit_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns":columns}
    # Persist the training schema so inference code can reconstruct it.
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        **args.fit_args,
    )

    # Results summary
    predictor.fit_summary(verbosity=3)
    model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')

    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # ensemble visualization: drop isolated nodes, then render the model graph
    # bottom-to-top with Graphviz via pygraphviz.
    G = predictor._trainer.model_graph
    remove = [node for node,degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        node.attr['shape'] = 'rectagle'  # NOTE(review): looks like a typo for 'rectangle' — confirm intended Graphviz shape name
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if args.fit_args['label'] in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance = predictor.feature_importance(test_data)
                feature_importance_df = pd.DataFrame(feature_importance, columns=['Importance score']).rename_axis(index='Feature')
                print(feature_importance_df)
                feature_importance_df.to_csv(f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix
                X_test = test_data.drop(args.fit_args['label'], axis=1)
                y_test_true = test_data[args.fit_args['label']]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)

                report_dict = classification_report(y_test_true, y_test_pred, output_dict=True, labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(f'{model_output_dir}/classification_report.csv', index=True)

                cm = confusion_matrix(y_test_true, y_test_pred, labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                plt.show()
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')

                get_roc_auc(y_test_true, y_test_pred_prob, predictor.class_labels, predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn('Skipping eval on test data since label column is not included.')

    # Files summary
    print(f'Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")