def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    model_dir = args.model_dir
    train_dir = args.train_dir
    filename = args.filename
    target = args.target
    debug = args.debug
    eval_metric = args.eval_metric
    presets = args.presets
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    time_limit = int(args.training_minutes) * 60

    logging.info(train_dir)

    train_data = TabularDataset(os.path.join(train_dir, filename))
    if debug:
        subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
        train_data = train_data.sample(n=subsample_size, random_state=0)

    predictor = TabularPredictor(label=target, path=model_dir, eval_metric=eval_metric).fit(
        train_data=train_data,
        excluded_model_types=['KNN', 'RF', 'NN'],
        time_limit=time_limit,
        presets=[presets, 'optimize_for_deployment'])
    return predictor
def load_data(directory_prefix, train_file, test_file, name, url=None):
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    directory = directory_prefix + name + "/"
    train_file_path = directory + train_file
    test_file_path = directory + test_file
    if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = download(url, directory_prefix)
        unzip(zip_name, directory_prefix)
        os.remove(zip_name)
    train_data = TabularDataset(train_file_path)
    test_data = TabularDataset(test_file_path)
    return train_data, test_data
def __load_input_data(path: str) -> TabularDataset:
    """
    Load training data as a dataframe.

    :param path: directory containing the input CSV files
    :return: TabularDataset built from the concatenated CSVs, or None if no CSV data was found
    """
    input_data_files = os.listdir(path)
    try:
        input_dfs = [
            pd.read_csv(f'{path}/{data_file}') for data_file in input_data_files
        ]
        return TabularDataset(data=pd.concat(input_dfs))
    except Exception:
        print(f'No csv data in {path}!')
        return None
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data
    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)

    # train_data = task.Dataset(file_path=training_dir + '/' + filename)
    train_data = TabularDataset(data=training_dir + '/' + filename)

    # predictor = task.fit(train_data=train_data, label=target, output_directory=model_dir)
    predictor = TabularPredictor(label=target, path=model_dir).fit(train_data)
    return predictor
""" Example script for predicting columns of tables, demonstrating simple use-case """ from autogluon.tabular import TabularDataset, TabularPredictor # Training time: train_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(500) # subsample for faster demo print(train_data.head()) label = 'class' # specifies which column do we want to predict save_path = 'ag_models/' # where to save trained models predictor = TabularPredictor(label=label, path=save_path).fit(train_data) # NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead: # predictor = TabularPredictor(label=label_column, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality') results = predictor.fit_summary() # Inference time: test_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame y_test = test_data[label] test_data = test_data.drop(labels=[label], axis=1) # delete labels from test data since we wouldn't have them in practice print(test_data.head()) predictor = TabularPredictor.load(save_path) # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file y_pred = predictor.predict(test_data) perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
""" Example: distilling AutoGluon's ensemble-predictor into a single model for binary classification. """ # NOTE: Distillation can be done in a similar manner for multiclass classification and regression problems. # NOTE: To distill CatBoost models in multiclass classification, you need to first run: pip install catboost-dev from autogluon.tabular import TabularDataset, TabularPredictor subsample_size = 500 time_limit = 60 label = 'class' # specifies which column do we want to predict train_file_path = 'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv' test_file_path = 'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv' train_data = TabularDataset(train_file_path) train_data = train_data.head(subsample_size) # subsample for faster demo test_data = TabularDataset(test_file_path) test_data = test_data.head(subsample_size) # subsample for faster run # Fit model ensemble: predictor = TabularPredictor(label).fit(train_data, auto_stack=True, time_limit=time_limit) # Distill ensemble-predictor into single model: time_limit = 60 # set = None to fully train distilled models # aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here: aug_data = TabularDataset(train_file_path)
os.makedirs(args.output_data_dir, mode=0o777, exist_ok=True)

config_file = get_input_path(args.ag_config)
with open(config_file) as f:
    config = yaml.safe_load(f)  # AutoGluon-specific config

if args.n_gpus:
    config["num_gpus"] = int(args.n_gpus)

print("Running training job with the config:")
pprint(config)

# ---------------------------------------------------------------- Training

train_file = get_input_path(args.training_dir)
train_data = TabularDataset(train_file)

ag_predictor_args = config["ag_predictor_args"]
ag_predictor_args["path"] = args.model_dir
ag_fit_args = config["ag_fit_args"]

predictor = TabularPredictor(**ag_predictor_args).fit(train_data, **ag_fit_args)

# --------------------------------------------------------------- Inference

if args.test_dir:
    test_file = get_input_path(args.test_dir)
    test_data = TabularDataset(test_file)

    # Predictions
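# --- Illustrative config shape (assumption, not from the source) ---
# The keys "ag_predictor_args" and "ag_fit_args" are the ones read by the code above;
# the values below are hypothetical examples of what the YAML config might contain
# once yaml.safe_load() has turned it into a dict.
example_config = {
    "ag_predictor_args": {"label": "class", "eval_metric": "roc_auc"},
    "ag_fit_args": {"presets": "best_quality", "time_limit": 600},
}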
""" Example script for predicting columns of tables, demonstrating more advanced usage of fit(). Note that all settings demonstrated here are just chosen for demonstration purposes (to minimize runtime), and do not represent wise choices to use in practice. To maximize predictive accuracy, we recommend you do NOT specify `hyperparameters` or `hyperparameter_tune_kwargs`, and instead only specify the following fit() arguments: eval_metric=YOUR_METRIC, presets='best_quality' """ import autogluon.core as ag from autogluon.tabular import TabularDataset, TabularPredictor # Training time: train_data = TabularDataset( 'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv' ) # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(100) # subsample for faster demo print(train_data.head()) label = 'class' # specifies which column do we want to predict save_path = 'ag_hpo_models/' # where to save trained models hyperparameters = { 'NN': { 'num_epochs': 10, 'activation': 'relu', 'dropout_prob': ag.Real(0.0, 0.5) }, 'GBM': { 'num_boost_round': 1000, 'learning_rate': ag.Real(0.01, 0.1, log=True) }, 'XGB': { 'n_estimators': 1000, 'learning_rate': ag.Real(0.01, 0.1, log=True) }
Most users can get strong performance without specifying custom feature generators due to the generic and powerful default feature generator used by AutoGluon.
An advanced user may wish to create a custom feature generator to:
1. Experiment with different preprocessing pipelines to improve model quality.
2. Have full control over what data is being sent to downstream models.
3. Migrate existing pipelines into AutoGluon for ease of use and deployment.
4. Contribute new feature generators to AutoGluon.
"""

################
# Loading Data #
################

from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv')  # can be local CSV file as well, returns Pandas DataFrame
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv')  # another Pandas DataFrame
label = 'class'  # specifies which column we want to predict
sample_train_data = train_data.head(100)  # subsample for faster demo

# Separate features and labels
# Make sure to not include your label/target column when sending input to the feature generators,
# or else the label will be transformed as well.
X = sample_train_data.drop(columns=[label])
y = sample_train_data[label]

X_test = test_data.drop(columns=[label])
y_test = test_data[label]
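# --- Sketch: running AutoGluon's default feature-generation pipeline manually ---
# (assumed continuation, not from the source). AutoMLPipelineFeatureGenerator is the
# generator TabularPredictor uses by default; calling it directly shows what data
# downstream models would receive.
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

feature_generator = AutoMLPipelineFeatureGenerator()
X_transformed = feature_generator.fit_transform(X=X, y=y)  # fit on the training features
X_test_transformed = feature_generator.transform(X_test)   # reuse the fitted pipeline on test features
print(X_transformed.head())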
os.makedirs(args.output_data_dir, mode=0o777, exist_ok=True)

config_file = get_input_path(args.ag_config)
with open(config_file) as f:
    config = yaml.safe_load(f)  # AutoGluon-specific config

if args.n_gpus:
    config['num_gpus'] = int(args.n_gpus)

logger.info("Running training job with the config:")
pprint(config)

# ----------------------------- Training -----------------------------------

train_file = get_input_path(args.training_dir)
train_data = TabularDataset(train_file)

test_file = get_input_path(args.test_dir)
test_data = TabularDataset(test_file)

ag_predictor_args = config["ag_predictor_args"]
ag_predictor_args["path"] = args.model_dir
ag_fit_args = config["ag_fit_args"]

predictor = TabularPredictor(**ag_predictor_args).fit(train_data, **ag_fit_args)
logger.info("Best model: %s", predictor.get_model_best())

# Leaderboard
lb = predictor.leaderboard()
lb.to_csv(f'{args.output_data_dir}/leaderboard.csv', index=False)
logger.info("Saved leaderboard to output.")
    # The `_get_default_auxiliary_params` method defines various model-agnostic parameters
    # such as maximum memory usage and valid input column dtypes.
    # For most users who build custom models, they will only need to specify the valid/invalid dtypes to the model here.
    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            # Drop category and object column dtypes, since NaiveBayes can't handle these dtypes.
            ignored_type_group_raw=['category', 'object'],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params


################
# Loading Data #
################

train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
label = 'class'  # specifies which column we want to predict
train_data = train_data.head(1000)  # subsample for faster demo

#####################################################
# Training custom model outside of TabularPredictor #
#####################################################

# Separate features and labels
X = train_data.drop(columns=[label])
y = train_data[label]

problem_type = infer_problem_type(y=y)  # Infer problem type (or else specify directly)
naive_bayes_model = NaiveBayesModel(path='AutogluonModels/', name='CustomNaiveBayes', problem_type=problem_type)
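# --- Sketch: preparing data and fitting the custom model directly (assumed continuation) ---
# LabelCleaner and AutoMLPipelineFeatureGenerator are the standard AutoGluon helpers for this step;
# the imports below are assumptions about the surrounding script, not taken from the source.
from autogluon.core.data import LabelCleaner
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=y)
y_clean = label_cleaner.transform(y)            # map labels to the internal representation

feature_generator = AutoMLPipelineFeatureGenerator()
X_clean = feature_generator.fit_transform(X)    # preprocess features for the model

naive_bayes_model.fit(X=X_clean, y=y_clean)     # train the custom model outside TabularPredictor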
temp = temp.reset_index(drop=True)
temp

data_final = temp.loc[:, ['date', 'stock_id', 'close']].join(df_zscore)
# data_final = data_final.drop(columns=[''])

'''
# adjust data: shift by 10 days
temp_adjust = data_final.iloc[:-10, 3:]
temp_adjust = temp_adjust.reset_index(drop=True)
date_adjust = data_final.iloc[10:, :3]
date_adjust = date_adjust.reset_index(drop=True)
data_adjust = date_adjust.join(temp_adjust)
data_adjust = data_adjust.reset_index(drop=True)
'''

# train
train_data = TabularDataset(
    datasettemp.drop(
        columns=['Trading_money', 'open', 'max', 'min', 'PER', 'PBR']).iloc[:-11])

# predictor
predictor = TabularPredictor(label='close').fit(
    train_data.drop(columns=['date', 'stock_id']))  # , num_stack_levels=1, num_bag_folds=2)

# test
test_data = datasettemp.iloc[-11:len(datasettemp)]
preds = predictor.predict(
    test_data.drop(columns=['date', 'stock_id', 'close']))
test_hat = pd.DataFrame({
    'date': test_data['date'],
    'stock_id': test_data['stock_id'],
    'close': preds
})
def transform_fn(models, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.

    :param models: The Gluon model and the column info.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    """
    start = timer()
    net = models[0]
    column_dict = models[1]

    # text/csv
    if input_content_type == 'text/csv':
        # Load dataset
        columns = column_dict['columns']
        df = pd.read_csv(StringIO(data), header=None)
        df_preprocessed = preprocess(df, columns, net.label)
        ds = TabularDataset(data=df_preprocessed)

        try:
            predictions = net.predict(ds)
        except Exception:
            try:
                predictions = net.predict(ds.fillna(0.0))
                warnings.warn('Filled NaN\'s with 0.0 in order to predict.')
            except Exception as e:
                response_body = e
                return response_body, output_content_type

        # Print prediction counts, limit in case of regression problem
        pred_counts = Counter(predictions.tolist())
        n_display_items = 30
        if len(pred_counts) > n_display_items:
            print(f'Top {n_display_items} prediction counts: '
                  f'{dict(take(n_display_items, pred_counts.items()))}')
        else:
            print(f'Prediction counts: {pred_counts}')

        # Form response
        output = StringIO()
        pd.DataFrame(predictions).to_csv(output, header=False, index=False)
        response_body = output.getvalue()

        # If target column passed, evaluate predictions performance
        target = net.label
        if target in ds:
            print(f'Label column ({target}) found in input data. '
                  'Therefore, evaluating prediction performance...')
            try:
                performance = net.evaluate_predictions(y_true=ds[target],
                                                       y_pred=predictions,
                                                       auxiliary_metrics=True)
                print(json.dumps(performance, indent=4, default=pd.DataFrame.to_json))
                time.sleep(0.1)
            except Exception as e:
                # Print exceptions on evaluate, continue to return predictions
                print(f'Exception: {e}')
    else:
        raise NotImplementedError("content_type must be 'text/csv'")

    elapsed_time = round(timer() - start, 3)
    print(f'Elapsed time: {elapsed_time} seconds')
    return response_body, output_content_type
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
from datetime import datetime

# train_df = pd.read_csv("../../data/processed/train_preproc.csv")
train_data = TabularDataset(
    "../../data/processed/oversampled/train_valid_feat_eng_oversample.csv")
# train_data = train_data.drop(["Age", "Room_Rate", "Discount_Rate"], axis="columns")

save_path = "models_oversample_valid"
predictor = TabularPredictor(label="Reservation_Status",
                             path=save_path,
                             eval_metric="f1_macro").fit(train_data,
                                                         time_limit=7200,
                                                         presets="best_quality")

valid_data = TabularDataset("../../data/processed/valid_preproc.csv")
y_test = valid_data.loc[:, "Reservation_Status"]
valid_data = valid_data.drop(["Reservation_Status"], axis="columns")

y_pred = predictor.predict(valid_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
print(perf)

test_data = TabularDataset("../../data/processed/test_preproc.csv")
test_preds = predictor.predict(test_data)

test_df = pd.read_csv("../../data/processed/test_preproc.csv")
test_df["Reservation_Status"] = test_preds
def evaluate(predictor, args):
    train_dir = args.train_dir
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output
    presets = args.presets

    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)

    test_data = TabularDataset(os.path.join(train_dir, test_file))

    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')

    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)

    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)

    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
    # del perf['confusion_matrix']
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4, default=pd.DataFrame.to_json), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)

    y_prob = predictor.predict_proba(test_data_nolab)
    y_prob = y_prob.iloc[:, -1]
    y_test_enc, uniques = pd.factorize(y_test)  # Label encoding

    fig = plt.figure(figsize=(14, 4))
    plt.subplot(1, 3, 1)
    plot_roc_curve(y_test_enc, y_prob)
    plt.subplot(1, 3, 2)
    plot_pr_curve(y_test_enc, y_prob)
    plt.subplot(1, 3, 3)
    plot_conf_mtx(y_test_enc, y_prob, 0.5)
    eval_file = f'{dataset_name}_eval.png'
    plt.savefig(eval_file)
    plt.close(fig)

    # # Feature importance
    # featimp = predictor.feature_importance(test_data)
    # fig, ax = plt.subplots(figsize=(12, 5))
    # plot = sns.barplot(x=featimp.index, y=featimp.values)
    # ax.set_title('Feature Importance')
    # plot.set_xticklabels(plot.get_xticklabels(), rotation='vertical')
    # featimp_imgfile = f'{dataset_name}_featimp.png'
    # featimp_csvfile = f'{dataset_name}_featimp.csv'
    # fig.savefig(featimp_imgfile)
    # featimp.to_csv(featimp_csvfile)
    # plt.close(fig)

    # Clean up data in order to avoid disk space issues
    predictor.save_space()
    predictor.delete_models(models_to_keep='best', dry_run=False)

    files_to_upload = [pred_file, lead_file, perf_file, summ_file, eval_file]
    for file in files_to_upload:
        s3.upload_file(file, bucket,
                       os.path.join(prefix,
                                    training_job_name.replace('mxnet-training', 'autogluon', 1),
                                    file))
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    predictor = train(args)

    training_dir = args.train
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    dataset_name = train_file.split('_')[0]
    print(dataset_name)

    # test_data = task.Dataset(file_path=os.path.join(training_dir, test_file))
    test_data = TabularDataset(data=os.path.join(training_dir, test_file))

    u = urlparse(args.s3_output, allow_fragments=False)
    bucket = u.netloc
    print(bucket)
    prefix = u.path.strip('/')
    print(prefix)
    s3 = boto3.client('s3')

    try:
        y_test = test_data[args.target]  # values to predict
        test_data_nolab = test_data.drop(
            labels=[args.target], axis=1)  # delete label column to prove we're not cheating
        y_pred = predictor.predict(test_data_nolab)
def transform_fn(models, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.

    :param models: The Gluon model and the column info.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    """
    start = timer()
    net = models[0]
    column_dict = models[1]
    label_map = net.class_labels_internal_map

    # text/csv
    if "text/csv" in input_content_type:
        # Load dataset
        columns = column_dict["columns"]
        if type(data) == str:
            df = pd.read_csv(StringIO(data), header=None)
        else:
            df = pd.read_csv(StringIO(data.decode()), header=None)
        df_preprocessed = preprocess(df, columns, net.label)
        ds = TabularDataset(data=df_preprocessed)

        try:
            predictions = net.predict_proba(ds)
            predictions_ = net.predict(ds)
        except Exception:
            try:
                predictions = net.predict_proba(ds.fillna(0.0))
                predictions_ = net.predict(ds.fillna(0.0))
                warnings.warn("Filled NaN's with 0.0 in order to predict.")
            except Exception as e:
                response_body = e
                return response_body, output_content_type

        # threshold = 0.5
        # predictions_label = [[k for k, v in label_map.items() if v == 1][0] if i > threshold
        #                      else [k for k, v in label_map.items() if v == 0][0]
        #                      for i in predictions]
        predictions_label = predictions_.tolist()

        # Print prediction counts, limit in case of regression problem
        pred_counts = Counter(predictions_label)
        n_display_items = 30
        if len(pred_counts) > n_display_items:
            print(f"Top {n_display_items} prediction counts: "
                  f"{dict(take(n_display_items, pred_counts.items()))}")
        else:
            print(f"Prediction counts: {pred_counts}")

        # Form response
        output = StringIO()
        pd.DataFrame(predictions).to_csv(output, header=False, index=False)
        response_body = output.getvalue()

        # If target column passed, evaluate predictions performance
        target = net.label
        if target in ds:
            print(f"Label column ({target}) found in input data. "
                  "Therefore, evaluating prediction performance...")
            try:
                performance = net.evaluate_predictions(y_true=ds[target],
                                                       y_pred=np.array(predictions_label),
                                                       auxiliary_metrics=True)
                print(json.dumps(performance, indent=4, default=pd.DataFrame.to_json))
                time.sleep(0.1)
            except Exception as e:
                # Print exceptions on evaluate, continue to return predictions
                print(f"Exception: {e}")
    else:
        raise NotImplementedError("content_type must be 'text/csv'")

    elapsed_time = round(timer() - start, 3)
    print(f"Elapsed time: {elapsed_time} seconds")
    return response_body, output_content_type
from autogluon.tabular import TabularDataset, TabularPredictor

# Train
train_data = TabularDataset('train.csv')
id, label = 'PassengerId', 'Survived'
save_path = 'model'
time_limit = 300

predictor = TabularPredictor(label=label, path=save_path).fit(
    train_data.drop(columns=[id]), time_limit=time_limit, presets='best_quality')

# Test
import pandas as pd

test_data = TabularDataset('test.csv')
# predictor = TabularPredictor.load(save_path)  # unnecessary, just demonstrates how to load a previously-trained predictor from file
preds = predictor.predict(test_data.drop(columns=[id]))
submission = pd.DataFrame({id: test_data[id], label: preds})
submission.to_csv('submission.csv', index=False)