Example 1
import os

import autogluon.core as ag
from autogluon.tabular import TabularPrediction as task


def load_data(directory_prefix, train_file, test_file, name, url=None):
    """Download the `name` dataset from `url` if it is not cached locally, then load the train/test Datasets."""
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    directory = directory_prefix + name + "/"
    train_file_path = directory + train_file
    test_file_path = directory + test_file
    if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = ag.download(url, directory_prefix)
        ag.unzip(zip_name, directory_prefix)
        os.remove(zip_name)

    train_data = task.Dataset(file_path=train_file_path)
    test_data = task.Dataset(file_path=test_file_path)
    return train_data, test_data
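A minimal usage sketch of the helper above; the directory, file names, and zip URL are illustrative (they mirror the AdultIncomeBinaryClassification CSVs used elsewhere on this page) and are not part of the original snippet:

# Hypothetical call; adjust the URL and file names to your dataset.
train_data, test_data = load_data(
    directory_prefix='./datasets/',
    train_file='train_data.csv',
    test_file='test_data.csv',
    name='AdultIncomeBinaryClassification',
    url='https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
)
print(train_data.head())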
Example 2
""" Example script for predicting columns of tables, demonstrating more advanced usage of fit().
    Note that the settings demonstrated here are chosen only to keep the demo's runtime short and are not wise choices for real use.
    To maximize predictive accuracy, we recommend you do NOT specify `hyperparameters` or `hyperparameter_tune`, and instead only pass the following fit() arguments: eval_metric=YOUR_METRIC, presets='best_quality'.
"""

import autogluon.core as ag
from autogluon.tabular import TabularPrediction as task

# Training time:
train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(100)  # subsample for faster demo
print(train_data.head())
label_column = 'class'  # specifies which column we want to predict
savedir = 'ag_hpo_models/'  # where to save trained models

hyperparams = {
    'NN': {
        'num_epochs': 10,
        'activation': 'relu',
        'dropout_prob': ag.Real(0.0, 0.5)
    },
    'GBM': {
        'num_boost_round': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    }
}

predictor = task.fit(
    train_data=train_data,
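The snippet above is cut off inside the task.fit(...) call. A rough sketch of how such a hyperparameter-tuning call is typically completed with this legacy API follows; the specific arguments and values are assumptions, not the original example's settings:

# Sketch only: argument values are illustrative, not tuned recommendations.
predictor = task.fit(
    train_data=train_data,
    label=label_column,            # target column defined above
    output_directory=savedir,      # where trained models are saved
    hyperparameters=hyperparams,   # search spaces defined above (ag.Real ranges)
    hyperparameter_tune=True,      # enable HPO over those search spaces
    time_limits=2 * 60,            # assumed overall time budget in seconds
)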
Example 3
Most users can get strong performance without specifying custom feature generators due to the generic and powerful default feature generator used by AutoGluon.
An advanced user may wish to create a custom feature generator to:
    1. Experiment with different preprocessing pipelines to improve model quality.
    2. Have full control over what data is being sent to downstream models.
    3. Migrate existing pipelines into AutoGluon for ease of use and deployment.
    4. Contribute new feature generators to AutoGluon.
"""

################
# Loading Data #
################

from autogluon.tabular import TabularPrediction as task

train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv'
)  # another Pandas DataFrame
label_column = 'class'  # specifies which column we want to predict
sample_train_data = train_data.head(100)  # subsample for faster demo

# Separate features and labels
# Make sure not to include your label/target column when sending input to the feature generators, or the label will be transformed as well.
X = sample_train_data.drop(columns=[label_column])
y = sample_train_data[label_column]

X_test = test_data.drop(columns=[label_column])
y_test = test_data[label_column]
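The docstring above motivates custom feature generators, and the snippet stops right after preparing X and y for them. As a minimal next-step sketch, the default pipeline generator can be fit on the training features; the import path below is an assumption for this AutoGluon version and may differ:

# Assumed import path; the generators module location varies across AutoGluon versions.
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

feature_generator = AutoMLPipelineFeatureGenerator()
X_transformed = feature_generator.fit_transform(X=X)       # fit on training features only
X_test_transformed = feature_generator.transform(X_test)   # reuse the fitted generator on the test features
print(X_transformed.head())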
Example 4
train_file = 'train_data.csv'
test_file = 'test_data.csv'
train_file_path = directory + train_file
test_file_path = directory + test_file

if (not os.path.exists(train_file_path)) or (
        not os.path.exists(test_file_path)):  # fetch files from s3:
    print("%s data not found locally, so fetching from %s" %
          (dataset['name'], dataset['url']))
    os.system("wget " + dataset['url'] +
              " -O temp.zip && unzip -o temp.zip && rm temp.zip")

savedir = directory + 'agModels/'

label_column = dataset['label_column']
train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = test_data.head(subsample_size)  # subsample for faster run
print(train_data.head())

# Fit model ensemble:
predictor = task.fit(train_data=train_data,
                     label=label_column,
                     problem_type='multiclass',
                     output_directory=savedir,
                     cache_data=True,
                     auto_stack=True,
                     time_limits=time_limits)

# Distill ensemble-predictor into single model:
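The example ends at this comment. A hedged sketch of the distillation step, assuming the predictor.distill() interface of this AutoGluon version (the argument name is an assumption):

# Train simpler student models that mimic the stacked ensemble's predictions,
# reusing the same time budget as the fit call above.
distilled_model_names = predictor.distill(time_limits=time_limits)
print('Distilled models:', distilled_model_names)
leaderboard = predictor.leaderboard(test_data)  # compare ensemble members and distilled students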
Example 5
from autogluon.tabular import TabularPrediction as task
from autogluon.tabular.task.tabular_prediction.predictor_v2 import TabularPredictorV2

################
# Loading data #
################

train_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')
test_data = task.Dataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
label = 'class'
eval_metric = 'roc_auc'
hyperparameters = {'RF': {}}
train_data = train_data.head(1000)  # subsample for faster demo

##################################
# Fitting with the old Predictor #
##################################

predictor1 = task.fit(train_data,
                      label=label,
                      eval_metric=eval_metric,
                      hyperparameters=hyperparameters,
                      num_bagging_folds=2)
predictor1.leaderboard(test_data)

##################################
# Fitting with the new Predictor #
##################################
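The snippet ends before the new-style call. A rough sketch of the equivalent fit with TabularPredictorV2, assuming this transitional API takes label/eval_metric in the constructor and the renamed bagging argument in fit():

predictor2 = TabularPredictorV2(label=label, eval_metric=eval_metric)
predictor2.fit(train_data,
               hyperparameters=hyperparameters,
               num_bag_folds=2)  # assumed new-style name for num_bagging_folds
predictor2.leaderboard(test_data)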
Example 6
from datetime import datetime
import pandas as pd

from autogluon.tabular import TabularPrediction as task  # needed for task.Dataset below

# %% define data

root_folder = "/home/lstm/Google Drive/MATLAB data files/Project__autoML/datasets for autoML/data_weekly_archive/"
data_folder = "20200213/"
data_file = "GCP_trainvalid_KOSPIb1f0bNsCFCCOFOC20200213.csv"
data_ref = 'KOSPIb1f0bNsCFCCOFOC20200213'
target_col = "target"
most_recent_folder = "20112032/"
most_recent_file = "GCP_trainvalid_KOSPIb1f0bNsCFCCOFOC2020112032.csv"

cols_2_drop_4_training = ["timestamp", "split_tag", "weight_vector"]

data_trainvalid = task.Dataset(file_path=root_folder + data_folder + data_file)
data_trainvalid["DoW"] = data_trainvalid["DoW"].astype('category')

train_data = data_trainvalid.loc[data_trainvalid.split_tag == 'TRAIN', :]
print(train_data.head())
print(train_data.tail())

# do not provide validation data if bagging/stacking is used
valid_data = data_trainvalid.loc[data_trainvalid.split_tag == 'VALIDATE', :]
print(valid_data.head())
print(valid_data.tail())
# timestamp(s) of the most recent validation row(s)
latest_valid_date = valid_data["timestamp"][valid_data["timestamp"] == valid_data["timestamp"].max()]

## REDO TEST DATA (to be pre-processed in MATLAB first)