def train(args):
    """Fit an AutoGluon ``TabularPredictor`` on the table named by ``args``.

    The following attributes are read from ``args``:

    * ``model_dir`` -- directory the predictor is saved to (passed as ``path``)
    * ``train_dir`` / ``filename`` -- joined to locate the training file
    * ``target`` -- name of the label column
    * ``debug`` -- when truthy, train on a small random subsample only
    * ``eval_metric`` -- metric handed to ``TabularPredictor``
    * ``presets`` -- preset combined with ``'optimize_for_deployment'``
    * ``training_minutes`` -- fit time budget, converted to seconds

    :param args: parsed arguments exposing the attributes listed above.
    :return: the object returned by ``TabularPredictor(...).fit(...)``.
    """
    # SageMaker also exposes num_gpus, the current host and the host list, but
    # this function does not use them, so they are deliberately not read here.
    model_dir = args.model_dir
    train_dir = args.train_dir
    filename = args.filename
    target = args.target
    debug = args.debug
    eval_metric = args.eval_metric
    presets = args.presets
    time_limit = int(args.training_minutes) * 60  # minutes -> seconds

    logging.info(train_dir)

    train_data = TabularDataset(os.path.join(train_dir, filename))
    if debug:
        # Subsample for a faster demo; raise this value for more realistic runs.
        subsample_size = 500
        # Sampling without replacement raises ValueError when n exceeds the
        # number of rows, so cap the request at the size of the dataset.
        train_data = train_data.sample(
            n=min(subsample_size, len(train_data)), random_state=0)

    predictor = TabularPredictor(
        label=target, path=model_dir, eval_metric=eval_metric,
    ).fit(
        train_data=train_data,
        excluded_model_types=['KNN', 'RF', 'NN'],
        time_limit=time_limit,
        presets=[presets, 'optimize_for_deployment'],
    )

    return predictor
Exemple #2
0
def load_data(directory_prefix, train_file, test_file, name, url=None):
    """Load a train/test dataset pair, fetching and unpacking it first if needed.

    The files are looked up at ``<directory_prefix>/<name>/<train_file>`` and
    ``<directory_prefix>/<name>/<test_file>``. If either one is missing, the
    archive at ``url`` is downloaded into ``directory_prefix``, unpacked there
    and the downloaded archive is deleted again.

    NOTE(review): this presumably relies on the archive unpacking into a
    ``<name>/`` sub-directory of ``directory_prefix`` -- confirm against the
    hosted archives.

    :param directory_prefix: local cache directory; created when absent.
    :param train_file: file name of the training table inside the dataset dir.
    :param test_file: file name of the test table inside the dataset dir.
    :param name: dataset name, used as the sub-directory name.
    :param url: location of the archive to fetch when files are missing.
    :return: tuple ``(train_data, test_data)`` of ``TabularDataset`` objects.
    :raises ValueError: if the files are missing locally and ``url`` is None.
    """
    # makedirs(exist_ok=True) copes with nested prefixes and avoids the
    # check-then-create race of `if not exists: mkdir`.
    os.makedirs(directory_prefix, exist_ok=True)

    # os.path.join yields the same paths as plain concatenation when the prefix
    # ends with a separator, and a correct path when it does not.
    directory = os.path.join(directory_prefix, name)
    train_file_path = os.path.join(directory, train_file)
    test_file_path = os.path.join(directory, test_file)

    if not (os.path.exists(train_file_path) and os.path.exists(test_file_path)):
        if url is None:
            raise ValueError(
                "%s data not found locally at %r and no url was given to fetch it from"
                % (name, directory))
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = download(url, directory_prefix)
        unzip(zip_name, directory_prefix)
        os.remove(zip_name)  # the archive is not needed once unpacked

    train_data = TabularDataset(train_file_path)
    test_data = TabularDataset(test_file_path)
    return train_data, test_data
Exemple #3
0
def __load_input_data(path: str) -> "TabularDataset | None":
    """
    Load every file found in a directory as CSV and stack them into one dataset.

    Files are read in sorted name order so the row order of the result does not
    depend on the arbitrary order returned by ``os.listdir``.

    :param path: directory containing the CSV files.
    :return: ``TabularDataset`` built from the concatenated frames, or ``None``
        when the directory yields nothing loadable (for example it is empty or
        a file cannot be parsed).
    :raises OSError: if ``path`` itself cannot be listed.
    """
    input_data_files = sorted(os.listdir(path))

    try:
        input_dfs = [
            pd.read_csv(os.path.join(path, data_file))
            for data_file in input_data_files
        ]

        # pd.concat raises ValueError on an empty list, i.e. an empty directory.
        return TabularDataset(data=pd.concat(input_dfs))
    except Exception as err:
        # Best-effort loader: report the failure and let the caller handle None.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        print(f'No csv data in {path}!')
        print(f'Reason: {err!r}')
        return None
Exemple #4
0
def train(args):
    """Fit a ``TabularPredictor`` with default settings and return the fit result.

    The training table is located at ``args.train + '/' + args.filename``, the
    label column is ``args.target`` and models are saved under
    ``args.model_dir``.
    """
    # SageMaker details are read here but not used afterwards; training runs
    # with AutoGluon's defaults.
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host, hosts = args.current_host, args.hosts
    output_dir, label_column = args.model_dir, args.target

    # Locate and load the training table.
    data_dir, data_file = args.train, args.filename
    logging.info(data_dir)
    dataset = TabularDataset(data='/'.join((data_dir, data_file)))

    model = TabularPredictor(label=label_column, path=output_dir)
    return model.fit(dataset)
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# ---- Settings ----
label = 'class'  # column we want to predict
save_path = 'ag_models/'  # directory the trained models are written to

# ---- Training time ----
# The source may equally be a local CSV file; the result is a Pandas DataFrame.
# Only the first 500 rows are kept so the demo runs quickly.
train_data = TabularDataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv',
).head(500)
print(train_data.head())

# NOTE: the default settings favour a reasonable runtime over accuracy. To
# maximize predictive accuracy, also pass an eval_metric to TabularPredictor
# and presets='best_quality' to fit().
predictor = TabularPredictor(label=label, path=save_path)
predictor = predictor.fit(train_data)
results = predictor.fit_summary()

# ---- Inference time ----
test_data = TabularDataset(
    file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv',
)  # another Pandas DataFrame
y_test = test_data[label]
# Remove the labels from the test data, since they would not exist in practice.
test_data = test_data.drop(columns=[label])
print(test_data.head())

# Reloading is not required here; it only demonstrates how a previously
# trained predictor is restored from disk.
predictor = TabularPredictor.load(save_path)
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)
""" Example: distilling AutoGluon's ensemble-predictor into a single model for binary classification. """

# NOTE: Distillation can be done in a similar manner for multiclass classification and regression problems.
# NOTE: To distill CatBoost models in multiclass classification, you need to first run:  pip install catboost-dev

from autogluon.tabular import TabularDataset, TabularPredictor

subsample_size = 500
time_limit = 60

label = 'class'  # specifies which column do we want to predict
train_file_path = 'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
test_file_path = 'https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv'

train_data = TabularDataset(train_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo

test_data = TabularDataset(test_file_path)
test_data = test_data.head(subsample_size)  # subsample for faster run

# Fit model ensemble:
predictor = TabularPredictor(label).fit(train_data,
                                        auto_stack=True,
                                        time_limit=time_limit)

# Distill ensemble-predictor into single model:

time_limit = 60  # set = None to fully train distilled models

# aug_data below is optional, but this could be additional unlabeled data you may have. Here we use the training data for demonstration, but you should only use new data here:
aug_data = TabularDataset(train_file_path)
    os.makedirs(args.output_data_dir, mode=0o777, exist_ok=True)

    config_file = get_input_path(args.ag_config)
    with open(config_file) as f:
        config = yaml.safe_load(f)  # AutoGluon-specific config

    if args.n_gpus:
        config["num_gpus"] = int(args.n_gpus)

    print("Running training job with the config:")
    pprint(config)

    # ---------------------------------------------------------------- Training

    train_file = get_input_path(args.training_dir)
    train_data = TabularDataset(train_file)

    ag_predictor_args = config["ag_predictor_args"]
    ag_predictor_args["path"] = args.model_dir
    ag_fit_args = config["ag_fit_args"]

    predictor = TabularPredictor(**ag_predictor_args).fit(
        train_data, **ag_fit_args)

    # --------------------------------------------------------------- Inference

    if args.test_dir:
        test_file = get_input_path(args.test_dir)
        test_data = TabularDataset(test_file)

        # Predictions
Exemple #8
0
""" Example script for predicting columns of tables, demonstrating more advanced usage of fit().
    Note that all settings demonstrated here are just chosen for demonstration purposes (to minimize runtime), and do not represent wise choices to use in practice.
    To maximize predictive accuracy, we recommend you do NOT specify `hyperparameters` or `hyperparameter_tune_kwargs`, and instead only specify the following fit() arguments: eval_metric=YOUR_METRIC, presets='best_quality'
"""

import autogluon.core as ag
from autogluon.tabular import TabularDataset, TabularPredictor

# Training time:
train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(100)  # subsample for faster demo
print(train_data.head())
label = 'class'  # specifies which column do we want to predict
save_path = 'ag_hpo_models/'  # where to save trained models

hyperparameters = {
    'NN': {
        'num_epochs': 10,
        'activation': 'relu',
        'dropout_prob': ag.Real(0.0, 0.5)
    },
    'GBM': {
        'num_boost_round': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    },
    'XGB': {
        'n_estimators': 1000,
        'learning_rate': ag.Real(0.01, 0.1, log=True)
    }
Most users can get strong performance without specifying custom feature generators due to the generic and powerful default feature generator used by AutoGluon.
An advanced user may wish to create a custom feature generator to:
    1. Experiment with different preprocessing pipelines to improve model quality.
    2. Have full control over what data is being sent to downstream models.
    3. Migrate existing pipelines into AutoGluon for ease of use and deployment.
    4. Contribute new feature generators to AutoGluon.
"""

################
# Loading Data #
################

from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv'
)  # another Pandas DataFrame
label = 'class'  # specifies which column do we want to predict
sample_train_data = train_data.head(100)  # subsample for faster demo

# Separate features and labels
# Make sure to not include your label/target column when sending input to the feature generators, or else the label will be transformed as well.
X = sample_train_data.drop(columns=[label])
y = sample_train_data[label]

X_test = test_data.drop(columns=[label])
y_test = test_data[label]
Exemple #10
0
    os.makedirs(args.output_data_dir, mode=0o777, exist_ok=True)

    config_file = get_input_path(args.ag_config)
    with open(config_file) as f:
        config = yaml.safe_load(f)  # AutoGluon-specific config

    if args.n_gpus:
        config['num_gpus'] = int(args.n_gpus)

    logger.info("Running training job with the config:")
    pprint(config)

    # ----------------------------- Training -----------------------------------

    train_file = get_input_path(args.training_dir)
    train_data = TabularDataset(train_file)
    test_file = get_input_path(args.test_dir)
    test_data = TabularDataset(test_file)

    ag_predictor_args = config["ag_predictor_args"]
    ag_predictor_args["path"] = args.model_dir
    ag_fit_args = config["ag_fit_args"]

    predictor = TabularPredictor(**ag_predictor_args).fit(
        train_data, **ag_fit_args)
    logger.info("Best model: %s", predictor.get_model_best())

    # Leaderboard
    lb = predictor.leaderboard()
    lb.to_csv(f'{args.output_data_dir}/leaderboard.csv', index=False)
    logger.info("Saved leaderboard to output.")
Exemple #11
0
    # The `_get_default_auxiliary_params` method defines various model-agnostic parameters such as maximum memory usage and valid input column dtypes.
    # For most users who build custom models, they will only need to specify the valid/invalid dtypes to the model here.
    def _get_default_auxiliary_params(self) -> dict:
        """Return the parent's auxiliary params with unsupported dtypes ignored.

        Starts from the parent class defaults and sets
        ``ignored_type_group_raw`` so that ``category`` and ``object`` columns
        are dropped, because NaiveBayes cannot handle those dtypes.
        """
        params = super()._get_default_auxiliary_params()
        params.update({'ignored_type_group_raw': ['category', 'object']})
        return params

################
# Loading Data #
################

train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
label = 'class'  # specifies which column do we want to predict
train_data = train_data.head(1000)  # subsample for faster demo

#####################################################
# Training custom model outside of TabularPredictor #
#####################################################

# Separate features and labels
X = train_data.drop(columns=[label])
y = train_data[label]

problem_type = infer_problem_type(y=y)  # Infer problem type (or else specify directly)
naive_bayes_model = NaiveBayesModel(path='AutogluonModels/', name='CustomNaiveBayes', problem_type=problem_type)
Exemple #12
0
    temp = temp.reset_index(drop=True)
    temp
    data_final = temp.loc[:, ['date', 'stock_id', 'close']].join(df_zscore)
    #data_final = data_final.drop(columns=[''])
    '''
    #adjust data move 1odays
    temp_adjust=data_final.iloc[:-10,3:]
    temp_adjust=temp_adjust.reset_index(drop = True)
    date_adjust=data_final.iloc[10:,:3]
    date_adjust=date_adjust.reset_index(drop = True)
    data_adjust=date_adjust.join(temp_adjust)
    data_adjust=data_adjust.reset_index(drop = True)
    '''
    #train
    train_data = TabularDataset(
        datasettemp.drop(
            columns=['Trading_money', 'open', 'max', 'min', 'PER', 'PBR'
                     ]).iloc[:-11])

    #predictor
    predictor = TabularPredictor(label='close').fit(
        train_data.drop(columns=['date', 'stock_id']))
    # , num_stack_levels=1,num_bag_folds=2)

    #test
    test_data = datasettemp.iloc[-11:len(datasettemp)]
    preds = predictor.predict(
        test_data.drop(columns=['date', 'stock_id', 'close']))
    test_hat = pd.DataFrame({
        'date': test_data['date'],
        'stock_id': test_data['stock_id'],
        'close': preds
def transform_fn(models, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.

    The CSV payload (no header row) is parsed, passed through ``preprocess``
    and scored with ``net.predict``. If prediction fails, it is retried once
    with NaNs replaced by 0.0. When the label column is present in the input,
    prediction performance is additionally evaluated and printed; a failure
    there never prevents the predictions from being returned.

    :param models: The Gluon model and the column info.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    :raises NotImplementedError: if ``input_content_type`` is not 'text/csv'.
    """
    start = timer()
    net = models[0]
    column_dict = models[1]

    # Only text/csv is supported; reject everything else up front.
    if input_content_type != 'text/csv':
        raise NotImplementedError("content_type must be 'text/csv'")

    # Load dataset
    columns = column_dict['columns']
    df = pd.read_csv(StringIO(data), header=None)

    df_preprosessed = preprocess(df, columns, net.label)

    ds = TabularDataset(data=df_preprosessed)

    # `except Exception` (not a bare except) so that KeyboardInterrupt and
    # SystemExit are not swallowed by the retry logic.
    try:
        predictions = net.predict(ds)
    except Exception:
        try:
            predictions = net.predict(ds.fillna(0.0))
            warnings.warn('Filled NaN\'s with 0.0 in order to predict.')
        except Exception as e:
            # NOTE(review): the exception object itself is returned as the
            # response body (kept for backward compatibility) -- confirm the
            # serving layer can serialize it.
            response_body = e
            return response_body, output_content_type

    # Print prediction counts, limit in case of regression problem
    pred_counts = Counter(predictions.tolist())
    n_display_items = 30
    if len(pred_counts) > n_display_items:
        # most_common() makes the "Top N" label accurate: the N most frequent
        # values are shown rather than the first N encountered.
        print(f'Top {n_display_items} prediction counts: '
              f'{dict(pred_counts.most_common(n_display_items))}')
    else:
        print(f'Prediction counts: {pred_counts}')

    # Form response
    output = StringIO()
    pd.DataFrame(predictions).to_csv(output, header=False, index=False)
    response_body = output.getvalue()

    # If target column passed, evaluate predictions performance
    target = net.label
    if target in ds:
        print(f'Label column ({target}) found in input data. '
              'Therefore, evaluating prediction performance...')
        try:
            performance = net.evaluate_predictions(y_true=ds[target],
                                                   y_pred=predictions,
                                                   auxiliary_metrics=True)
            print(json.dumps(performance, indent=4, default=pd.DataFrame.to_json))
            # NOTE(review): short pause kept from the original; presumably it
            # gives the log output time to flush -- confirm it is still needed.
            time.sleep(0.1)
        except Exception as e:
            # Print exceptions on evaluate, continue to return predictions
            print(f'Exception: {e}')

    elapsed_time = round(timer() - start, 3)
    print(f'Elapsed time: {elapsed_time} seconds')

    return response_body, output_content_type
Exemple #14
0
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
from datetime import datetime

# ---- Training ----
# Fit on the oversampled, feature-engineered train+valid table.
train_data = TabularDataset(
    "../../data/processed/oversampled/train_valid_feat_eng_oversample.csv")
save_path = "models_oversample_valid"
predictor = TabularPredictor(
    label="Reservation_Status",
    path=save_path,
    eval_metric="f1_macro",
)
predictor = predictor.fit(train_data, time_limit=7200, presets="best_quality")

# ---- Validation ----
# Split the validation table into labels and features, then score it.
valid_data = TabularDataset("../../data/processed/valid_preproc.csv")
y_test = valid_data["Reservation_Status"]
valid_data = valid_data.drop(columns=["Reservation_Status"])

y_pred = predictor.predict(valid_data)
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)
print(perf)

# ---- Test predictions ----
test_data = TabularDataset("../../data/processed/test_preproc.csv")
test_preds = predictor.predict(test_data)

# Attach the predictions to a freshly read copy of the test table.
test_df = pd.read_csv("../../data/processed/test_preproc.csv")
test_df["Reservation_Status"] = test_preds
def evaluate(predictor, args):
    """Score a fitted predictor on the held-out test file and upload the results to S3.

    The test file name is derived from ``args.filename`` by replacing the first
    occurrence of ``'train'`` with ``'test'`` and is read from
    ``args.train_dir``. Test predictions, the leaderboard, performance
    metrics, the fit summary and an evaluation figure (ROC curve, PR curve,
    confusion matrix) are written to the current working directory and then
    uploaded below the path of ``args.s3_output``, in a folder named after
    ``args.training_job_name`` with ``'mxnet-training'`` replaced by
    ``'autogluon'``.

    NOTE(review): the plotting section uses only the last column of
    ``predict_proba`` and a 0.5 threshold, so it presumably expects a binary
    classification problem -- confirm before using it for other problem types.

    :param predictor: fitted predictor to evaluate; its models are pruned to
        the best one at the end to save disk space.
    :param args: parsed arguments providing ``train_dir``, ``filename``,
        ``target``, ``training_job_name`` and ``s3_output``.
    :return: None
    """
    import posixpath  # S3 keys always use '/' separators, whatever the host OS

    train_dir = args.train_dir
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output

    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)

    test_data = TabularDataset(os.path.join(train_dir, test_file))

    # Split the S3 output URI into bucket and key prefix.
    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')

    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)

    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)

    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4, default=pd.DataFrame.to_json), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)

    y_prob_all = predictor.predict_proba(test_data_nolab)
    # The last probability column is treated as the positive class.
    # NOTE(review): assumes predict_proba columns are named by class label --
    # confirm against the AutoGluon version in use.
    positive_class = y_prob_all.columns[-1]
    y_prob = y_prob_all.iloc[:, -1]

    # Encode the true labels against that same positive class. pd.factorize
    # numbers labels by order of first appearance, which can come out inverted
    # relative to y_prob and flip the ROC/PR curves and the confusion matrix.
    y_test_enc = (y_test == positive_class).to_numpy().astype(int)

    fig = plt.figure(figsize=(14, 4))
    plt.subplot(1, 3, 1)
    plot_roc_curve(y_test_enc, y_prob)
    plt.subplot(1, 3, 2)
    plot_pr_curve(y_test_enc, y_prob)
    plt.subplot(1, 3, 3)
    plot_conf_mtx(y_test_enc, y_prob, 0.5)
    eval_file = f'{dataset_name}_eval.png'
    plt.savefig(eval_file)
    plt.close(fig)

    # Cleanup data in order to avoid disk space issues
    predictor.save_space()
    predictor.delete_models(models_to_keep='best', dry_run=False)

    job_folder = training_job_name.replace('mxnet-training', 'autogluon', 1)
    files_to_upload = [pred_file, lead_file, perf_file, summ_file, eval_file]
    for file in files_to_upload:
        # posixpath.join (not os.path.join) so keys never contain backslashes.
        s3.upload_file(file, bucket, posixpath.join(prefix, job_folder, file))
Exemple #16
0
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    predictor = train(args)

    training_dir = args.train
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    dataset_name = train_file.split('_')[0]
    print(dataset_name)

    #   test_data = task.Dataset(file_path=os.path.join(training_dir, test_file))
    test_data = TabularDataset(data=os.path.join(training_dir, test_file))
    u = urlparse(args.s3_output, allow_fragments=False)
    bucket = u.netloc
    print(bucket)
    prefix = u.path.strip('/')
    print(prefix)

    s3 = boto3.client('s3')

    try:
        y_test = test_data[args.target]  # values to predict
        test_data_nolab = test_data.drop(
            labels=[args.target],
            axis=1)  # delete label column to prove we're not cheating

        y_pred = predictor.predict(test_data_nolab)
def transform_fn(models, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.

    The CSV payload (no header row, given as text or as encoded bytes) is
    parsed, passed through ``preprocess`` and scored. The response body holds
    the output of ``net.predict_proba``; the output of ``net.predict`` is used
    only for the logged prediction counts and, when the label column is
    present in the input, for the printed performance evaluation. If scoring
    fails, it is retried once with NaNs replaced by 0.0.

    :param models: The Gluon model and the column info.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    :raises NotImplementedError: if 'text/csv' is not part of ``input_content_type``.
    """
    start = timer()
    net = models[0]
    column_dict = models[1]

    # Only text/csv is supported; reject everything else up front.
    if "text/csv" not in input_content_type:
        raise NotImplementedError("content_type must be 'text/csv'")

    # Load dataset
    columns = column_dict["columns"]
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses; anything else is expected to be bytes-like and is decoded.
    csv_text = data if isinstance(data, str) else data.decode()
    df = pd.read_csv(StringIO(csv_text), header=None)

    df_preprosessed = preprocess(df, columns, net.label)

    ds = TabularDataset(data=df_preprosessed)

    # `except Exception` (not a bare except) so that KeyboardInterrupt and
    # SystemExit are not swallowed by the retry logic.
    try:
        predictions = net.predict_proba(ds)
        predictions_ = net.predict(ds)
    except Exception:
        try:
            ds_filled = ds.fillna(0.0)
            predictions = net.predict_proba(ds_filled)
            predictions_ = net.predict(ds_filled)
            warnings.warn("Filled NaN's with 0.0 in order to predict.")
        except Exception as e:
            # NOTE(review): the exception object itself is returned as the
            # response body (kept for backward compatibility) -- confirm the
            # serving layer can serialize it.
            response_body = e
            return response_body, output_content_type

    predictions_label = predictions_.tolist()

    # Print prediction counts, limit in case of regression problem
    pred_counts = Counter(predictions_label)
    n_display_items = 30
    if len(pred_counts) > n_display_items:
        # most_common() makes the "Top N" label accurate: the N most frequent
        # values are shown rather than the first N encountered.
        print(f"Top {n_display_items} prediction counts: "
              f"{dict(pred_counts.most_common(n_display_items))}")
    else:
        print(f"Prediction counts: {pred_counts}")

    # Form response
    output = StringIO()
    pd.DataFrame(predictions).to_csv(output, header=False, index=False)
    response_body = output.getvalue()

    # If target column passed, evaluate predictions performance
    target = net.label
    if target in ds:
        print(f"Label column ({target}) found in input data. "
              "Therefore, evaluating prediction performance...")
        try:
            performance = net.evaluate_predictions(
                y_true=ds[target],
                y_pred=np.array(predictions_label),
                auxiliary_metrics=True)
            print(
                json.dumps(performance,
                           indent=4,
                           default=pd.DataFrame.to_json))
            # NOTE(review): short pause kept from the original; presumably it
            # gives the log output time to flush -- confirm it is still needed.
            time.sleep(0.1)
        except Exception as e:
            # Print exceptions on evaluate, continue to return predictions
            print(f"Exception: {e}")

    elapsed_time = round(timer() - start, 3)
    print(f"Elapsed time: {elapsed_time} seconds")

    return response_body, output_content_type
Exemple #18
0
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd

# ---- Train ----
train_data = TabularDataset('train.csv')
# `id_column` (named so it does not shadow the builtin `id`) is left out of the
# features and copied into the submission; `label` is the column to predict.
id_column, label = 'PassengerId', 'Survived'
save_path = 'model'

time_limit = 300  # passed to fit() as its time_limit

predictor = TabularPredictor(label=label, path=save_path).fit(
    train_data.drop(columns=[id_column]),
    time_limit=time_limit,
    presets='best_quality',
)

# ---- Test ----
test_data = TabularDataset('test.csv')

# A previously trained predictor could be restored here instead with
# TabularPredictor.load(save_path); that is unnecessary in this script.

preds = predictor.predict(test_data.drop(columns=[id_column]))
submission = pd.DataFrame({id_column: test_data[id_column], label: preds})
submission.to_csv('submission.csv', index=False)