import logging
import os
from urllib.parse import urlparse

import boto3
import pandas as pd
from autogluon.tabular import TabularDataset

# (fragment: dataset_name, training_dir, test_file, args, and predictor are defined earlier)
print(dataset_name)
# test_data = task.Dataset(file_path=os.path.join(training_dir, test_file))  # legacy 0.0.x API
test_data = TabularDataset(data=os.path.join(training_dir, test_file))

u = urlparse(args.s3_output, allow_fragments=False)
bucket = u.netloc
print(bucket)
prefix = u.path.strip('/')
print(prefix)
s3 = boto3.client('s3')

try:
    y_test = test_data[args.target]  # values to predict
    test_data_nolab = test_data.drop(
        labels=[args.target], axis=1)  # delete label column to prove we're not cheating
    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)

    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred,
                                          auxiliary_metrics=True)
except Exception:
    # assumed handler: the snippet is truncated before the original except clause
    logging.exception('Evaluation failed')
    raise
""" Example script for predicting columns of tables, demonstrating simple use-case """ from autogluon.tabular import TabularDataset, TabularPredictor # Training time: train_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv') # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(500) # subsample for faster demo print(train_data.head()) label = 'class' # specifies which column do we want to predict save_path = 'ag_models/' # where to save trained models predictor = TabularPredictor(label=label, path=save_path).fit(train_data) # NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead: # predictor = TabularPredictor(label=label_column, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality') results = predictor.fit_summary() # Inference time: test_data = TabularDataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame y_test = test_data[label] test_data = test_data.drop(labels=[label], axis=1) # delete labels from test data since we wouldn't have them in practice print(test_data.head()) predictor = TabularPredictor.load(save_path) # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file y_pred = predictor.predict(test_data) perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
ag_predictor_args["path"] = args.model_dir ag_fit_args = config["ag_fit_args"] predictor = TabularPredictor(**ag_predictor_args).fit( train_data, **ag_fit_args) logger.info("Best model: %s", predictor.get_model_best()) # Leaderboard lb = predictor.leaderboard() lb.to_csv(f'{args.output_data_dir}/leaderboard.csv', index=False) logger.info("Saved leaderboard to output.") # Feature importance feature_importance = predictor.feature_importance(test_data) feature_importance.to_csv(f'{args.output_data_dir}/feature_importance.csv') logger.info("Saved feature importance to output.") # Evaluation evaluation = predictor.evaluate(test_data) with open(f'{args.output_data_dir}/evaluation.json', 'w') as f: json.dump(evaluation, f) logger.info("Saved evaluation to output.") predictor.save_space() # ---------------------------- Inference ----------------------------------- test_data_nolabel = test_data.drop(labels=ag_predictor_args['label'], axis=1) y_pred = predictor.predict(test_data_nolabel) y_pred.to_csv(f'{args.output_data_dir}/predictions.csv', index=False)
from autogluon.tabular import TabularDataset

train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv'
)  # can be a local CSV file as well; returns a Pandas DataFrame
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv'
)  # another Pandas DataFrame
label = 'class'  # specifies which column we want to predict
sample_train_data = train_data.head(100)  # subsample for faster demo

# Separate features and labels
# Make sure not to include your label/target column when sending input to the
# feature generators, or else the label will be transformed as well.
X = sample_train_data.drop(columns=[label])
y = sample_train_data[label]

X_test = test_data.drop(columns=[label])
y_test = test_data[label]

print(X)

##############################
# Fitting feature generators #
##############################

from autogluon.features.generators import CategoryFeatureGenerator, IdentityFeatureGenerator

# IdentityFeatureGenerator is a 'do-nothing' feature generator if given default arguments. It simply passes the data along.
identity_feature_generator = IdentityFeatureGenerator()

# fit_transform the generator using the input data. This must be done prior to calling transform.
X_transform = identity_feature_generator.fit_transform(X=X)
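# CategoryFeatureGenerator is imported above but never reached in the snippet;
# a minimal sketch of the analogous calls (same fit_transform/transform API):
category_feature_generator = CategoryFeatureGenerator()
X_category = category_feature_generator.fit_transform(X=X)  # object/string columns become category dtype
print(X_category.dtypes)

# Once fit, the generator transforms new data with the same learned mapping:
X_test_category = category_feature_generator.transform(X_test)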
from autogluon.core.data import LabelCleaner
from autogluon.core.utils import infer_problem_type
from autogluon.tabular import TabularDataset
# NaiveBayesModel is assumed to be a custom AbstractModel subclass defined earlier in this example.

################
# Loading Data #
################

train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be a local CSV file as well; returns a Pandas DataFrame
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
label = 'class'  # specifies which column we want to predict
train_data = train_data.head(1000)  # subsample for faster demo

#####################################################
# Training custom model outside of TabularPredictor #
#####################################################

# Separate features and labels
X = train_data.drop(columns=[label])
y = train_data[label]

problem_type = infer_problem_type(y=y)  # infer problem type (or else specify directly)
naive_bayes_model = NaiveBayesModel(path='AutogluonModels/', name='CustomNaiveBayes', problem_type=problem_type)

# Construct a LabelCleaner to neatly convert labels to floats/integers during model
# training/inference; it can also inverse_transform predictions back to the original labels.
label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=y)
y_clean = label_cleaner.transform(y)

naive_bayes_model.fit(X=X, y=y_clean)  # fit custom model

# To save to disk and load the model, do the following:
# load_path = naive_bayes_model.path
# naive_bayes_model.save()
# del naive_bayes_model
# naive_bayes_model = NaiveBayesModel.load(path=load_path)
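# A minimal sketch of inference with the custom model, using the test_data loaded
# above; predictions come back in the cleaned label space, so
# label_cleaner.inverse_transform maps them back to the original classes:
import pandas as pd

X_test = test_data.drop(columns=[label])
y_pred_clean = naive_bayes_model.predict(X_test)
y_pred = label_cleaner.inverse_transform(pd.Series(y_pred_clean))
print(y_pred.head())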
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

# (fragment: data_final and datasettemp are DataFrames prepared earlier in the script)

# Disabled block: realign features with dates offset by 10 rows
'''
temp_adjust = data_final.iloc[:-10, 3:]
temp_adjust = temp_adjust.reset_index(drop=True)
date_adjust = data_final.iloc[10:, :3]
date_adjust = date_adjust.reset_index(drop=True)
data_adjust = date_adjust.join(temp_adjust)
data_adjust = data_adjust.reset_index(drop=True)
'''

# train
train_data = TabularDataset(
    datasettemp.drop(
        columns=['Trading_money', 'open', 'max', 'min', 'PER', 'PBR']
    ).iloc[:-11])

# predictor
predictor = TabularPredictor(label='close').fit(
    train_data.drop(columns=['date', 'stock_id']))
# alternative: .fit(..., num_stack_levels=1, num_bag_folds=2)

# test
test_data = datasettemp.iloc[-11:len(datasettemp)]
preds = predictor.predict(
    test_data.drop(columns=['date', 'stock_id', 'close']))
test_hat = pd.DataFrame({
    'date': test_data['date'],
    'stock_id': test_data['stock_id'],
    'close': preds
})
print(test_hat)

prediction_data = []
for k in range(0, 10):  # grab the last 10 days of data
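    # (hypothetical body -- the original loop is truncated here; this sketch
    # assumes each iteration predicts the close for one of the last 10 days
    # and collects it, mirroring the test_hat construction above)
    row = datasettemp.iloc[len(datasettemp) - 10 + k : len(datasettemp) - 9 + k]
    pred = predictor.predict(row.drop(columns=['date', 'stock_id', 'close']))
    prediction_data.append({
        'date': row['date'].iloc[0],
        'stock_id': row['stock_id'].iloc[0],
        'close': pred.iloc[0],
    })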
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

# train_df = pd.read_csv("../../data/processed/train_preproc.csv")
train_data = TabularDataset(
    "../../data/processed/oversampled/train_valid_feat_eng_oversample.csv")
# train_data = train_data.drop(["Age", "Room_Rate", "Discount_Rate"], axis="columns")

save_path = "models_oversample_valid"
predictor = TabularPredictor(label="Reservation_Status",
                             path=save_path,
                             eval_metric="f1_macro").fit(train_data,
                                                         time_limit=7200,
                                                         presets="best_quality")

valid_data = TabularDataset("../../data/processed/valid_preproc.csv")
y_test = valid_data.loc[:, "Reservation_Status"]
valid_data = valid_data.drop(["Reservation_Status"], axis="columns")

y_pred = predictor.predict(valid_data)
perf = predictor.evaluate_predictions(y_true=y_test,
                                      y_pred=y_pred,
                                      auxiliary_metrics=True)
print(perf)

test_data = TabularDataset("../../data/processed/test_preproc.csv")
test_preds = predictor.predict(test_data)

test_df = pd.read_csv("../../data/processed/test_preproc.csv")
test_df["Reservation_Status"] = test_preds
test_df = test_df.replace(
    {"Reservation_Status": {
        "Check-In": 1,
        # ... (remaining label mappings truncated in the original)
    }})
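# (hypothetical cross-check) The predictor was fit with eval_metric="f1_macro";
# the same validation score can be computed directly with scikit-learn:
from sklearn.metrics import f1_score
print("macro F1:", f1_score(y_test, y_pred, average="macro"))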
import json
import logging
import os
from urllib.parse import urlparse

import boto3
import matplotlib.pyplot as plt
import pandas as pd
from autogluon.tabular import TabularDataset


def evaluate(predictor, args):
    train_dir = args.train_dir
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output
    presets = args.presets

    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)

    test_data = TabularDataset(os.path.join(train_dir, test_file))

    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')

    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)
    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)

    perf = predictor.evaluate_predictions(y_true=y_test,
                                          y_pred=y_pred,
                                          auxiliary_metrics=True)
    # del perf['confusion_matrix']
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4, default=pd.DataFrame.to_json), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)

    y_prob = predictor.predict_proba(test_data_nolab)
    y_prob = y_prob.iloc[:, -1]
    y_test_enc, uniques = pd.factorize(y_test)  # label encoding

    fig = plt.figure(figsize=(14, 4))
    plt.subplot(1, 3, 1)
    plot_roc_curve(y_test_enc, y_prob)
    plt.subplot(1, 3, 2)
    plot_pr_curve(y_test_enc, y_prob)
    plt.subplot(1, 3, 3)
    plot_conf_mtx(y_test_enc, y_prob, 0.5)
    eval_file = f'{dataset_name}_eval.png'
    plt.savefig(eval_file)
    plt.close(fig)

    # # Feature importance
    # featimp = predictor.feature_importance(test_data)
    # fig, ax = plt.subplots(figsize=(12, 5))
    # plot = sns.barplot(x=featimp.index, y=featimp.values)
    # ax.set_title('Feature Importance')
    # plot.set_xticklabels(plot.get_xticklabels(), rotation='vertical')
    # featimp_imgfile = f'{dataset_name}_featimp.png'
    # featimp_csvfile = f'{dataset_name}_featimp.csv'
    # fig.savefig(featimp_imgfile)
    # featimp.to_csv(featimp_csvfile)
    # plt.close(fig)

    # Clean up data in order to avoid disk space issues
    predictor.save_space()
    predictor.delete_models(models_to_keep='best', dry_run=False)

    files_to_upload = [pred_file, lead_file, perf_file, summ_file, eval_file]
    for file in files_to_upload:
        s3.upload_file(
            file, bucket,
            os.path.join(prefix,
                         training_job_name.replace('mxnet-training', 'autogluon', 1),
                         file))
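# plot_roc_curve, plot_pr_curve, and plot_conf_mtx are not defined in this file;
# minimal sketches of what they might look like, assuming binary labels and
# scikit-learn (hypothetical helpers, not part of AutoGluon):
import numpy as np
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, roc_curve


def plot_roc_curve(y_true, y_prob):
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    plt.plot(fpr, tpr, label=f'AUC = {auc(fpr, tpr):.3f}')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend()


def plot_pr_curve(y_true, y_prob):
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-recall curve')


def plot_conf_mtx(y_true, y_prob, threshold):
    # binarize the positive-class probabilities at the given threshold
    cm = confusion_matrix(y_true, (y_prob >= threshold).astype(int))
    plt.imshow(cm, cmap='Blues')
    for (i, j), v in np.ndenumerate(cm):
        plt.text(j, i, str(v), ha='center', va='center')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion matrix @ {threshold}')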
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

# Train
train_data = TabularDataset('train.csv')
id_col, label = 'PassengerId', 'Survived'
save_path = 'model'
time_limit = 300
predictor = TabularPredictor(label=label, path=save_path).fit(
    train_data.drop(columns=[id_col]),
    time_limit=time_limit,
    presets='best_quality')

# Test
test_data = TabularDataset('test.csv')
# predictor = TabularPredictor.load(save_path)  # unnecessary; just demonstrates how to load a previously trained predictor from file
preds = predictor.predict(test_data.drop(columns=[id_col]))
submission = pd.DataFrame({id_col: test_data[id_col], label: preds})
submission.to_csv('submission.csv', index=False)
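# (optional, hypothetical addition) Inspect how AutoGluon ranked the trained
# models; leaderboard() is standard TabularPredictor API and returns a DataFrame:
lb = predictor.leaderboard()
print(lb[['model', 'score_val', 'fit_time']])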