import os

import autogluon.core as ag
from autogluon.tabular import TabularPrediction as task


def load_data(directory_prefix, train_file, test_file, name, url=None):
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    directory = directory_prefix + name + "/"
    train_file_path = directory + train_file
    test_file_path = directory + test_file
    if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = ag.download(url, directory_prefix)
        ag.unzip(zip_name, directory_prefix)
        os.remove(zip_name)
    train_data = task.Dataset(file_path=train_file_path)
    test_data = task.Dataset(file_path=test_file_path)
    return train_data, test_data
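# A minimal usage sketch for load_data() (not part of the original script). The directory, file
# names, dataset name and URL below are illustrative assumptions rather than values guaranteed to
# exist; point them at any zip archive that unpacks into <name>/<train_file> and <name>/<test_file>.
example_train_data, example_test_data = load_data(
    directory_prefix='./datasets/',
    train_file='train_data.csv',
    test_file='test_data.csv',
    name='AdultIncomeBinaryClassification',
    url='https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',  # assumed URL
)
print(example_train_data.head())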
""" Example script for predicting columns of tables, demonstrating more advanced usage of fit(). Note that all settings demonstrated here are just chosen for demonstration purposes (to minimize runtime), and do not represent wise choices to use in practice. To maximize predictive accuracy, we recommend you do NOT specify `hyperparameters` or `hyperparameter_tune`, and instead only specify the following fit() arguments: eval_metric=YOUR_METRIC, presets='best_quality' """ import autogluon.core as ag from autogluon.tabular import TabularPrediction as task # Training time: train_data = task.Dataset( file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv' ) # can be local CSV file as well, returns Pandas DataFrame train_data = train_data.head(100) # subsample for faster demo print(train_data.head()) label_column = 'class' # specifies which column do we want to predict savedir = 'ag_hpo_models/' # where to save trained models hyperparams = { 'NN': { 'num_epochs': 10, 'activation': 'relu', 'dropout_prob': ag.Real(0.0, 0.5) }, 'GBM': { 'num_boost_round': 1000, 'learning_rate': ag.Real(0.01, 0.1, log=True) } } predictor = task.fit( train_data=train_data,
Most users can get strong performance without specifying custom feature generators due to the generic and powerful default feature generator used by AutoGluon.
An advanced user may wish to create a custom feature generator to:
    1. Experiment with different preprocessing pipelines to improve model quality.
    2. Have full control over what data is being sent to downstream models.
    3. Migrate existing pipelines into AutoGluon for ease of use and deployment.
    4. Contribute new feature generators to AutoGluon.
"""

################
# Loading Data #
################

from autogluon.tabular import TabularPrediction as task

train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv')  # can be a local CSV file as well; returns a Pandas DataFrame
test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv')  # another Pandas DataFrame
label_column = 'class'  # specifies which column we want to predict
sample_train_data = train_data.head(100)  # subsample for faster demo

# Separate features and labels.
# Make sure not to include your label/target column when sending input to the feature generators,
# or else the label will be transformed as well.
X = sample_train_data.drop(columns=[label_column])
y = sample_train_data[label_column]
X_test = test_data.drop(columns=[label_column])
y_test = test_data[label_column]
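# A minimal sketch (not part of the original script) of running a feature generator on the features
# prepared above. It assumes the `AutoMLPipelineFeatureGenerator` class is importable from
# `autogluon.features.generators` in the installed AutoGluon version; adjust the import path if yours differs.
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

feature_generator = AutoMLPipelineFeatureGenerator()          # assumed name of the default pipeline generator
X_transformed = feature_generator.fit_transform(X)            # fit on training features only (label excluded above)
X_test_transformed = feature_generator.transform(X_test)      # apply the same transformation to test features
print(X_transformed.head())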
train_file = 'train_data.csv'
test_file = 'test_data.csv'
train_file_path = directory + train_file
test_file_path = directory + test_file

if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
    # fetch files from s3:
    print("%s data not found locally, so fetching from %s" % (dataset['name'], dataset['url']))
    os.system("wget " + dataset['url'] + " -O temp.zip && unzip -o temp.zip && rm temp.zip")

savedir = directory + 'agModels/'
label_column = dataset['label_column']

train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = test_data.head(subsample_size)  # subsample for faster run
print(train_data.head())

# Fit model ensemble:
predictor = task.fit(train_data=train_data, label=label_column, problem_type='multiclass',
                     output_directory=savedir, cache_data=True, auto_stack=True,
                     time_limits=time_limits)

# Distill ensemble-predictor into single model:
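# A minimal sketch of the distillation step announced by the comment above (not the original script's
# code); it assumes the predictor's distill() method accepts a time_limits argument and returns the
# names of the distilled models in this API generation.
distilled_model_names = predictor.distill(time_limits=time_limits)  # assumption: distill() supports time_limits
print(distilled_model_names)
predictor.leaderboard(test_data)  # compare distilled models against the full ensemble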
from autogluon.tabular import TabularPrediction as task
from autogluon.tabular.task.tabular_prediction.predictor_v2 import TabularPredictorV2

################
# Loading data #
################

train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')
test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
label = 'class'
eval_metric = 'roc_auc'
hyperparameters = {'RF': {}}
train_data = train_data.head(1000)  # subsample for faster demo

##################################
# Fitting with the old Predictor #
##################################

predictor1 = task.fit(train_data, label=label, eval_metric=eval_metric,
                      hyperparameters=hyperparameters, num_bagging_folds=2)
predictor1.leaderboard(test_data)

##################################
# Fitting with the new Predictor #
##################################
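# A minimal sketch of what the new-Predictor section might contain (not the original code). It assumes
# TabularPredictorV2 takes label/eval_metric in its constructor and exposes fit()/leaderboard() with
# arguments analogous to the later TabularPredictor API (e.g. num_bag_folds); adjust to your version.
predictor2 = TabularPredictorV2(label=label, eval_metric=eval_metric)
predictor2.fit(train_data, hyperparameters=hyperparameters, num_bag_folds=2)  # assumption: mirrors num_bagging_folds above
predictor2.leaderboard(test_data)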
from datetime import datetime

import pandas as pd
from autogluon.tabular import TabularPrediction as task  # needed for task.Dataset below

# %% define data
root_folder = "/home/lstm/Google Drive/MATLAB data files/Project__autoML/datasets for autoML/data_weekly_archive/"
data_folder = "20200213/"
data_file = "GCP_trainvalid_KOSPIb1f0bNsCFCCOFOC20200213.csv"
data_ref = 'KOSPIb1f0bNsCFCCOFOC20200213'
target_col = "target"
most_recent_folder = "20112032/"
most_recent_file = "GCP_trainvalid_KOSPIb1f0bNsCFCCOFOC2020112032.csv"
cols_2_drop_4_training = ["timestamp", "split_tag", "weight_vector"]

data_trainvalid = task.Dataset(file_path=root_folder + data_folder + data_file)
data_trainvalid["DoW"] = data_trainvalid["DoW"].astype('category')

train_data = data_trainvalid.loc[data_trainvalid.split_tag == 'TRAIN', :]
print(train_data.head())
print(train_data.tail())

valid_data = data_trainvalid.loc[data_trainvalid.split_tag == 'VALIDATE', :]  # do not provide if bagging/stacking
print(valid_data.head())
print(valid_data.tail())

latest_valid_date = valid_data["timestamp"][valid_data["timestamp"] == valid_data["timestamp"].max()]

## REDO TEST DATA (to be pre-processed in matlab first)
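# A minimal sketch (not in the original script) of how the pieces defined above could feed into fit().
# The argument choices are assumptions inferred from the variables themselves: drop the bookkeeping
# columns in cols_2_drop_4_training, predict target_col, and pass valid_data as tuning data.
predictor = task.fit(
    train_data=train_data.drop(columns=cols_2_drop_4_training),
    tuning_data=valid_data.drop(columns=cols_2_drop_4_training),  # omit when bagging/stacking, per the note above
    label=target_col,
)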