Example #1
    print(dataset_name)

    #   test_data = task.Dataset(file_path=os.path.join(training_dir, test_file))
    test_data = TabularDataset(data=os.path.join(training_dir, test_file))
    u = urlparse(args.s3_output, allow_fragments=False)
    bucket = u.netloc
    print(bucket)
    prefix = u.path.strip('/')
    print(prefix)

    s3 = boto3.client('s3')

    try:
        y_test = test_data[args.target]  # values to predict
        test_data_nolab = test_data.drop(
            labels=[args.target],
            axis=1)  # delete label column to prove we're not cheating

        y_pred = predictor.predict(test_data_nolab)
        y_pred_df = pd.DataFrame.from_dict({
            'True': y_test,
            'Predicted': y_pred
        })
        pred_file = f'{dataset_name}_test_predictions.csv'
        y_pred_df.to_csv(pred_file, index=False, header=True)

        leaderboard = predictor.leaderboard()
        lead_file = f'{dataset_name}_leaderboard.csv'
        leaderboard.to_csv(lead_file)

        perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon.tabular import TabularDataset, TabularPredictor


# Training time:
train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())
label = 'class'  # specifies which column we want to predict
save_path = 'ag_models/'  # where to save trained models

predictor = TabularPredictor(label=label, path=save_path).fit(train_data)
# NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy. To maximize predictive accuracy, do this instead:
# predictor = TabularPredictor(label=label, eval_metric=YOUR_METRIC_NAME, path=save_path).fit(train_data, presets='best_quality')
results = predictor.fit_summary()

# Inference time:
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
y_test = test_data[label]
test_data = test_data.drop(labels=[label], axis=1)  # delete labels from test data since we wouldn't have them in practice
print(test_data.head())

predictor = TabularPredictor.load(save_path)  # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
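
A short optional follow-up (not in the original snippet), assuming the same predictor and test_data defined above: predict_proba() returns per-class probabilities and leaderboard() compares all trained models on the validation data.

y_prob = predictor.predict_proba(test_data)  # class probabilities for each test row
print(y_prob.head())
print(predictor.leaderboard())  # models ranked by their validation score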
Example #3
    ag_predictor_args["path"] = args.model_dir
    ag_fit_args = config["ag_fit_args"]

    predictor = TabularPredictor(**ag_predictor_args).fit(
        train_data, **ag_fit_args)
    logger.info("Best model: %s", predictor.get_model_best())

    # Leaderboard
    lb = predictor.leaderboard()
    lb.to_csv(f'{args.output_data_dir}/leaderboard.csv', index=False)
    logger.info("Saved leaderboard to output.")

    # Feature importance
    feature_importance = predictor.feature_importance(test_data)
    feature_importance.to_csv(f'{args.output_data_dir}/feature_importance.csv')
    logger.info("Saved feature importance to output.")

    # Evaluation
    evaluation = predictor.evaluate(test_data)
    with open(f'{args.output_data_dir}/evaluation.json', 'w') as f:
        json.dump(evaluation, f)
    logger.info("Saved evaluation to output.")

    predictor.save_space()

    # ---------------------------- Inference -----------------------------------

    test_data_nolabel = test_data.drop(labels=ag_predictor_args['label'],
                                       axis=1)
    y_pred = predictor.predict(test_data_nolabel)
    y_pred.to_csv(f'{args.output_data_dir}/predictions.csv', index=False)
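
For context (not part of the original snippet), the config read above bundles the two argument groups this script consumes. Only the keys actually referenced in the code (ag_predictor_args with its 'label' and 'path', and ag_fit_args) come from the snippet; the sample values below are assumptions.

config = {
    "ag_predictor_args": {"label": "class"},  # "path" is overwritten from args.model_dir above
    "ag_fit_args": {"presets": "best_quality", "time_limit": 3600},
}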
Example #4
train_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/train_data.csv'
)  # can be local CSV file as well, returns Pandas DataFrame
test_data = TabularDataset(
    'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification/test_data.csv'
)  # another Pandas DataFrame
label = 'class'  # specifies which column we want to predict
sample_train_data = train_data.head(100)  # subsample for faster demo

# Separate features and labels
# Make sure to not include your label/target column when sending input to the feature generators, or else the label will be transformed as well.
X = sample_train_data.drop(columns=[label])
y = sample_train_data[label]

X_test = test_data.drop(columns=[label])
y_test = test_data[label]

print(X)

##############################
# Fitting feature generators #
##############################

from autogluon.features.generators import CategoryFeatureGenerator, IdentityFeatureGenerator

# IdentityFeatureGenerator is a 'do-nothing' feature generator if given default arguments. It will simply pass the data along.
identity_feature_generator = IdentityFeatureGenerator()

# fit_transform the generator using the input data. This must be done prior to calling transform.
X_transform = identity_feature_generator.fit_transform(X=X)
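
The original snippet is cut off here. As a small hedged continuation, CategoryFeatureGenerator (imported above but not used in the visible part) can be fit the same way; it converts object/string columns into pandas 'category' dtype features.

category_feature_generator = CategoryFeatureGenerator()
X_category_transform = category_feature_generator.fit_transform(X=X)
print(X_category_transform.dtypes)  # object columns are now category dtype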
Example #5
################
# Loading Data #
################

train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
label = 'class'  # specifies which column we want to predict
train_data = train_data.head(1000)  # subsample for faster demo

#####################################################
# Training custom model outside of TabularPredictor #
#####################################################

# Separate features and labels
X = train_data.drop(columns=[label])
y = train_data[label]

problem_type = infer_problem_type(y=y)  # Infer problem type (or else specify directly)
naive_bayes_model = NaiveBayesModel(path='AutogluonModels/', name='CustomNaiveBayes', problem_type=problem_type)

# Construct a LabelCleaner to neatly convert labels to floats/integers during model training/inference; it can also inverse_transform predictions back to the original labels.
label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=y)
y_clean = label_cleaner.transform(y)

naive_bayes_model.fit(X=X, y=y_clean)  # Fit custom model

# To save to disk and load the model, do the following:
# load_path = naive_bayes_model.path
# naive_bayes_model.save()
# del naive_bayes_model
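
Not shown in the truncated snippet: after fitting, the custom model can predict on the test features, and the LabelCleaner can map the encoded predictions back to the original label values. A minimal sketch, assuming the test_data loaded above and the standard AbstractModel predict API:

import pandas as pd

X_test = test_data.drop(columns=[label])
y_pred_clean = naive_bayes_model.predict(X_test)  # predictions in the cleaned/encoded label space
y_pred = label_cleaner.inverse_transform(pd.Series(y_pred_clean))  # back to the original class labels
print(y_pred.head())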
Example #6
    temp_adjust=data_final.iloc[:-10,3:]
    temp_adjust=temp_adjust.reset_index(drop = True)
    date_adjust=data_final.iloc[10:,:3]
    date_adjust=date_adjust.reset_index(drop = True)
    data_adjust=date_adjust.join(temp_adjust)
    data_adjust=data_adjust.reset_index(drop = True)
    '''
    #train
    train_data = TabularDataset(
        datasettemp.drop(
            columns=['Trading_money', 'open', 'max', 'min', 'PER', 'PBR'
                     ]).iloc[:-11])

    #predictor
    predictor = TabularPredictor(label='close').fit(
        train_data.drop(columns=['date', 'stock_id']))
    # , num_stack_levels=1,num_bag_folds=2)

    #test
    test_data = datasettemp.iloc[-11:len(datasettemp)]
    preds = predictor.predict(
        test_data.drop(columns=['date', 'stock_id', 'close']))
    test_hat = pd.DataFrame({
        'date': test_data['date'],
        'stock_id': test_data['stock_id'],
        'close': preds
    })
    test_hat

    predition_data = []
    for k in range(0, 10):  # grab the data for the last 10 days
Example #7
# train_df = pd.read_csv("../../data/processed/train_preproc.csv")
train_data = TabularDataset(
    "../../data/processed/oversampled/train_valid_feat_eng_oversample.csv")
# train_data = train_data.drop(["Age","Room_Rate","Discount_Rate"],axis="columns")
save_path = "models_oversample_valid"
predictor = TabularPredictor(label="Reservation_Status",
                             path=save_path,
                             eval_metric="f1_macro").fit(
                                 train_data,
                                 time_limit=7200,
                                 presets="best_quality")

valid_data = TabularDataset("../../data/processed/valid_preproc.csv")
y_test = valid_data.loc[:, "Reservation_Status"]
valid_data = valid_data.drop(["Reservation_Status"], axis="columns")

y_pred = predictor.predict(valid_data)
perf = predictor.evaluate_predictions(y_true=y_test,
                                      y_pred=y_pred,
                                      auxiliary_metrics=True)
print(perf)

test_data = TabularDataset("../../data/processed/test_preproc.csv")
test_preds = predictor.predict(test_data)

test_df = pd.read_csv("../../data/processed/test_preproc.csv")
test_df["Reservation_Status"] = test_preds
test_df = test_df.replace(
    {"Reservation_Status": {
        "Check-In": 1,
Example #8
def evaluate(predictor, args):
    
    train_dir = args.train_dir
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output
    presets = args.presets 

    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)
    
    test_data = TabularDataset(os.path.join(train_dir, test_file))   
    
    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')
    
    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)
    
    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)
    
    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
    #del perf['confusion_matrix']
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4, default=pd.DataFrame.to_json), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)
    
    y_prob = predictor.predict_proba(test_data_nolab)
    y_prob = y_prob.iloc[:,-1]    
    
    y_test_enc, uniques = pd.factorize(y_test)  # Label Encoding  
            
    fig = plt.figure(figsize=(14,4))
    plt.subplot(1,3,1)
    plot_roc_curve(y_test_enc, y_prob)
    plt.subplot(1,3,2)    
    plot_pr_curve(y_test_enc, y_prob)
    plt.subplot(1,3,3)    
    plot_conf_mtx(y_test_enc, y_prob, 0.5) 
    eval_file = f'{dataset_name}_eval.png'
    plt.savefig(eval_file)
    plt.close(fig)

#     # Feature importance
#     featimp = predictor.feature_importance(test_data)
#     fig, ax = plt.subplots(figsize=(12,5))
#     plot = sns.barplot(x=featimp.index, y=featimp.values)
#     ax.set_title('Feature Importance')
#     plot.set_xticklabels(plot.get_xticklabels(), rotation='vertical')
#     featimp_imgfile = f'{dataset_name}_featimp.png'
#     featimp_csvfile = f'{dataset_name}_featimp.csv'
#     fig.savefig(featimp_imgfile)
#     featimp.to_csv(featimp_csvfile)
#     plt.close(fig)        
        
    # Cleanup data in order to avoid disk space issues
    predictor.save_space()
    predictor.delete_models(models_to_keep='best', dry_run=False)

    files_to_upload = [pred_file, lead_file, perf_file, summ_file, eval_file]
    for file in files_to_upload:
        s3.upload_file(file, bucket, os.path.join(prefix, training_job_name.replace('mxnet-training', 'autogluon', 1), file))   
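
Not part of the snippet itself: in the surrounding training script, this helper would typically be called right after fitting. A rough usage sketch (everything except the argument fields read inside evaluate() is an assumption):

train_data = TabularDataset(os.path.join(args.train_dir, args.filename))
predictor = TabularPredictor(label=args.target).fit(train_data, presets=args.presets)
evaluate(predictor, args)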
Example #9
from autogluon.tabular import TabularDataset, TabularPredictor
# Train
train_data = TabularDataset('train.csv')
id, label = 'PassengerId', 'Survived'
save_path = 'model'

time_limit = 300

predictor = TabularPredictor(label=label,
                             path=save_path).fit(train_data.drop(columns=[id]),
                                                 time_limit=time_limit,
                                                 presets='best_quality')
# Test
import pandas as pd

test_data = TabularDataset('test.csv')

# predictor = TabularPredictor.load(
#     save_path
# )  # unnecessary, just demonstrates how to load previously-trained predictor from file

preds = predictor.predict(test_data.drop(columns=[id]))
submission = pd.DataFrame({id: test_data[id], label: preds})
submission.to_csv('submission.csv', index=False)