Example #1
def frc_AutoGluon(df_train,
                  df_test,
                  categoricalVars,
                  experiment_label='grocery',
                  responseVar='wk1_sales_all_stores'):

    # Legacy AutoGluon (pre-0.1) tabular API
    from autogluon import TabularPrediction as task

    for varName in categoricalVars:
        df_train[varName] = df_train[varName].astype(str)
        df_test[varName] = df_test[varName].astype(str)

    # AutoGluon format
    train_data = task.Dataset(df=df_train)
    test_data = task.Dataset(df=df_test)

    model = task.fit(train_data=train_data,
                     output_directory="auto_gluon/" + experiment_label,
                     label=responseVar,
                     hyperparameter_tune=False)

    # Forecast with the best model
    autogluon_frc = model.predict(test_data)

    # Forecast with all the models
    individual_frc = {'AG_' + model_to_use: model.predict(test_data, model=model_to_use)
                      for model_to_use in model.model_names}

    return {
        'autoGluon_frc': autogluon_frc,
        'autoGluon_model': model,
        'individual_frc': individual_frc
    }
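A minimal usage sketch for the helper above. The toy dataframe, column names, and sizes below are hypothetical placeholders; a real run needs actual sales history.

import pandas as pd

# Hypothetical toy data, repeated so AutoGluon has enough rows to fit.
df_train = pd.DataFrame({'store_type': ['A', 'B'] * 50,
                         'price': [1.99, 2.49] * 50,
                         'wk1_sales_all_stores': [120.0, 80.0] * 50})
df_test = df_train.drop(columns=['wk1_sales_all_stores']).head(10)

result = frc_AutoGluon(df_train, df_test, categoricalVars=['store_type'])
print(result['autoGluon_frc'])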
Example #2
    def run_example(self):

        train_data = task.Dataset(file_path='./data/churn-train.csv')
        train_data = train_data.head(500)  # subsample 500 data points for faster demo
        print(train_data.head())
        label_column = 'churn_probability'
        print("Summary of class variable: \n",
              train_data[label_column].describe())
        output_dir = 'agModels-predictClass'  # folder where trained models are stored
        predictor = task.fit(train_data=train_data,
                             label=label_column,
                             output_directory=output_dir,
                             eval_metric="mean_absolute_error")
        test_data = task.Dataset(file_path='./data/churn-test.csv')
        y_test = test_data[label_column]  # values to predict
        test_data_nolab = test_data.drop(
            labels=[label_column],
            axis=1)  # delete label column to prove we're not cheating
        print(test_data_nolab.head())
        #predictor = task.load(output_dir) # unnecessary, just demonstrates how to load a previously-trained predictor from file

        y_pred = predictor.predict(test_data_nolab)
        print("Predictions:  ", y_pred)
        perf = predictor.evaluate_predictions(y_true=y_test,
                                              y_pred=y_pred,
                                              auxiliary_metrics=True)

        print("Performance:", perf)  # perf is a dict of metrics, so don't concatenate it to a string

        return perf
Example #3
    def train(self, train_data, val_data, params):
        train_dataset = TabularPrediction.Dataset(train_data)
        val_dataset = TabularPrediction.Dataset(val_data)
        output_dir = os.path.join(self.get_output_folders()[0], dt.now().strftime('%Y%m%d%H%M%S'))
        hp_tune = params["hp_tune"]
        ag_params = params["autogluon"]
        self._label_column = params["label"]

        if hp_tune is True:
            hp_params = ag_params["hyperparameters"]
            time_limits = hp_params["time_limits"]
            num_trials = hp_params["num_trials"]
            hyperparameters = self.__create_hp_params(hp_params)
            search_strategy = hp_params["search_strategy"]
            self._model = TabularPrediction.fit(
                train_data=train_dataset, tuning_data=val_dataset, label=self._label_column,
                output_directory=output_dir, time_limits=time_limits,
                num_trials=num_trials, hyperparameter_tune=hp_tune,
                hyperparameters=hyperparameters, search_strategy=search_strategy
            )
        else:
            self._model = TabularPrediction.fit(
                train_data=train_dataset, tuning_data=val_dataset, label=self._label_column,
                output_directory=output_dir
            )

        self.__dump_params(output_dir, params)

        self._model.fit_summary()
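The shape of the params dict this method expects can be read off the key accesses above; a hypothetical example with placeholder values:

params = {
    "label": "target",   # response column name
    "hp_tune": True,     # toggles the HPO branch above
    "autogluon": {
        "hyperparameters": {
            "time_limits": 600,           # seconds allotted to the HPO run
            "num_trials": 10,             # trials per model type
            "search_strategy": "random",  # legacy-API search strategy
            # plus whatever model search spaces __create_hp_params consumes
        }
    }
}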
Example #4
def train(args):
    
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    model_dir = args.model_dir    
    train_dir = args.train_dir
    filename = args.filename
    target = args.target    
    debug = args.debug
    eval_metric = args.eval_metric   
    presets = args.presets    
    
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
     
    logging.info(train_dir)
    
    train_data = task.Dataset(file_path=os.path.join(train_dir, filename))
    if debug:
        subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
        train_data = train_data.sample(n=subsample_size, random_state=0)
    predictor = task.fit(train_data=train_data, label=target,
                         output_directory=model_dir, eval_metric=eval_metric, presets=presets)

    return predictor
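A hypothetical argparse wiring for this SageMaker-style entry point. The flag names are assumptions; the defaults mirror the standard SageMaker container environment variables (SM_MODEL_DIR, SM_CHANNEL_TRAIN, SM_CURRENT_HOST), and train() additionally reads SM_NUM_GPUS from the environment.

import argparse
import os

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-dir', dest='model_dir',
                        default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))
    parser.add_argument('--train-dir', dest='train_dir',
                        default=os.environ.get('SM_CHANNEL_TRAIN', '/opt/ml/input/data/train'))
    parser.add_argument('--filename', default='train.csv')
    parser.add_argument('--target', default='label')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--eval-metric', dest='eval_metric', default='accuracy')
    parser.add_argument('--presets', default='medium_quality_faster_train')
    parser.add_argument('--current-host', dest='current_host',
                        default=os.environ.get('SM_CURRENT_HOST', ''))
    parser.add_argument('--hosts', nargs='+', default=[])
    train(parser.parse_args())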
Example #5
    def train(self, data, params):
        self.data = data

        self.train_data = task.Dataset(data.unscaled_df)

        autogluon_dir = f'agModels-predictClass/{uuid.uuid4()}'  # specifies folder where to store trained models
        self.predictor = task.fit(train_data=self.train_data,
                                  label=self.metadata.get("output")[0],
                                  output_directory=autogluon_dir)

        self.state = "TRAINED"
Example #6
    def run(self, train_path, test_path, target, task):
        # `task` here is the AutoGluon TabularPrediction task module
        train_data = task.Dataset(file_path=train_path)

        predictor = task.fit(train_data=train_data,
                             label=target,
                             eval_metric="f1_macro",
                             num_bagging_folds=5)

        test_data = task.Dataset(file_path=test_path)
        y_test = test_data[target]

        y_pred = predictor.predict(test_data)
        return predictor.evaluate_predictions(y_true=y_test.to_numpy(),
                                              y_pred=y_pred,
                                              auxiliary_metrics=True)
Example #7
def run_tabular_benchmark_toy(fit_args):
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip',
               'name': 'toyClassification',
               'problem_type': MULTICLASS,
               'label_column': 'y',
               'performance_val': 0.436}
    # 2-D toy noisy, imbalanced 4-class classification task with: feature missingness, out-of-vocabulary feature categories in test data, out-of-vocabulary labels in test data, training column missing from test data, extra distraction columns in test data
    # toyclassif_dataset should produce 1 warning and 1 error during inference:
    # Warning: Ignoring 181 (out of 1000) training examples for which the label value in column 'y' is missing
    # ValueError: Required columns are missing from the provided dataset. Missing columns: ['lostcolumn']

    # Additional warning that would have occurred if ValueError was not triggered:
    # UserWarning: These columns from this dataset were not present in the training dataset (AutoGluon will ignore them):  ['distractioncolumn1', 'distractioncolumn2']

    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    print(f"Evaluating Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=dataset['label_column'], output_directory=savedir, **fit_args)
    try:
        predictor.predict(test_data)
    except ValueError:  # ValueError should be raised because test_data has missing column 'lostcolumn'
        pass
    else:
        raise AssertionError(f'{dataset["name"]} should raise an exception.')
Example #8
def load_data(directory_prefix, train_file, test_file, name, url=None):
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    directory = directory_prefix + name + "/"
    train_file_path = directory + train_file
    test_file_path = directory + test_file
    if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = ag.download(url, directory_prefix)
        ag.unzip(zip_name, directory_prefix)
        os.remove(zip_name)

    train_data = task.Dataset(file_path=train_file_path)
    test_data = task.Dataset(file_path=test_file_path)
    return train_data, test_data
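A hypothetical call to the loader above, reusing the toyClassification values from Example #7:

train_data, test_data = load_data(
    directory_prefix='./datasets/',
    train_file='train_data.csv',
    test_file='test_data.csv',
    name='toyClassification',
    url='https://autogluon.s3.amazonaws.com/datasets/toyClassification.zip')
print(train_data.shape, test_data.shape)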
Example #9
def convert_gluon(X_train, y_train):

    # Name each feature column 'feature_<i>' and attach the labels as 'class'
    feature_list = ['feature_' + str(i) for i in range(len(X_train[0]))]

    data = pd.DataFrame(X_train, columns=feature_list)
    data['class'] = y_train

    return task.Dataset(data)
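A small usage sketch for convert_gluon with synthetic numpy arrays (the data is made up):

import numpy as np

X_train = np.random.rand(100, 4)        # 100 rows, 4 features
y_train = np.random.randint(0, 2, 100)  # binary labels
train_data = convert_gluon(X_train, y_train)
print(train_data.head())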
Example #10
def define_and_evaluate_autogluon_pipeline(X, y, random_state=0):
    # autogluon dataframes
    data_df = pd.DataFrame(X)
    data_df["y"] = y
    outer_cv = StratifiedKFold(n_splits=4,
                               shuffle=True,
                               random_state=random_state)
    nested_scores = []
    for train_inds, test_inds in outer_cv.split(X, y):
        data_df_train = data_df.iloc[train_inds, :]
        data_df_test = data_df.iloc[test_inds, :]
        if len(set(y)) == 2:
            eval_metric = "roc_auc"
            problem_type = "binary"
        else:
            eval_metric = "f1_weighted"  # no multiclass auroc in autogluon
            problem_type = "multiclass"
        predictor = task.fit(
            data_df_train,
            "y",
            time_limits=SEC,
            presets="best_quality",
            output_directory=".autogluon_temp",
            eval_metric=eval_metric,
            problem_type=problem_type,
            verbosity=0,
        )
        y_pred = predictor.predict_proba(data_df.iloc[test_inds, :])
        # same as roc_auc_ovr_weighted
        score = roc_auc_score(data_df_test["y"],
                              y_pred,
                              average="weighted",
                              multi_class="ovr")
        nested_scores.append(score)
    return nested_scores
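A hypothetical driver for the nested-CV evaluator above. SEC (the per-fold fit budget, in seconds) is a module-level constant in the original, so the sketch defines its own placeholder value:

from sklearn.datasets import load_breast_cancer

SEC = 60  # assumed fit-time budget per outer fold, in seconds
X, y = load_breast_cancer(return_X_y=True)
scores = define_and_evaluate_autogluon_pipeline(X, y, random_state=0)
print('per-fold AUROC:', scores)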
Example #11
def train(args):

    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)
    ngpus_per_trial = 1 if args.num_gpus > 0 else 0

    # load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)
    print(f'Label counts: {dict(Counter(train_data[args.label]))}')
    print(f'hp: {args.hyperparameters}')
    predictor = task.fit(
        train_data=train_data,
        label=args.label,
        output_directory=args.model_dir,
        problem_type=args.problem_type,
        eval_metric=args.eval_metric,
        stopping_metric=args.stopping_metric,
        auto_stack=args.auto_stack,  # default: False
        hyperparameter_tune=args.hyperparameter_tune,  # default: False
        feature_prune=args.feature_prune,  # default: False
        holdout_frac=args.holdout_frac,  # default: None
        num_bagging_folds=args.num_bagging_folds,  # default: 0
        num_bagging_sets=args.num_bagging_sets,  # default: None
        stack_ensemble_levels=args.stack_ensemble_levels,  # default: 0
        hyperparameters=args.hyperparameters,
        cache_data=args.cache_data,
        time_limits=args.time_limits,
        num_trials=args.num_trials,  # default: None
        search_strategy=args.search_strategy,  # default: 'random'
        search_options=args.search_options,
        visualizer=args.visualizer,
        verbosity=args.verbosity)

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Leaderboard on optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        print('Running model on test data and getting Leaderboard...')
        leaderboard = predictor.leaderboard(dataset=test_data, silent=True)

        def format_for_print(df):
            table = PrettyTable(list(df.columns))
            for row in df.itertuples():
                table.add_row(row[1:])
            return str(table)

        print(format_for_print(leaderboard), end='\n\n')

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
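The du helper used in the file summaries here (and in later examples) is never shown; a plausible sketch, offered as an assumption rather than the original implementation:

import subprocess

def du(path):
    # Hypothetical helper: human-readable disk usage of a directory, like `du -sh`.
    return subprocess.check_output(['du', '-sh', path]).split()[0].decode('utf-8')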
Example #12
def train_autogluon(X_train, X_test, y_train, y_test, mtype, common_name_model,
                    problemtype, classes, default_featurenames,
                    transform_model, settings, model_session):
    # get train and test data
    train_data = convert_gluon(X_train, y_train)
    test_data = convert_gluon(X_test, y_test)
    predictor = task.fit(train_data=train_data, label='class')

    # get summary
    results = predictor.fit_summary(verbosity=3)

    # get model name
    files = list()
    model_name = common_name_model + '.pickle'
    # pickle the trained predictor
    with open(model_name, 'wb') as f:
        pickle.dump(predictor, f)

    # now rename current directory with models (keep this info in a folder)
    files.append(model_name)
    files.append('AutogluonModels')
    files.append('catboost_info')
    files.append('dask-worker-space')

    # get model_name
    model_dir = os.getcwd()

    return model_name, model_dir, files, test_data
Example #13
    def load(self, path):
        # load model
        self._model = TabularPrediction.load(path)

        # get the column name of the label
        with open(os.path.join(path, "params.json"), "r") as f:
            params = json.load(f)
            self._label_column = params["label"]
Example #14
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    model_dir = args.model_dir
    target = args.label_column

    train_file_path = get_file_path(args.train, args.train_filename)

    train_data = task.Dataset(file_path=train_file_path)
    subsample_size = int(args.train_rows)  # subsample for a faster demo; try much larger values
    train_data = train_data.sample(n=subsample_size, random_state=0)

    predictor = task.fit(train_data=train_data, label=target, output_directory=model_dir)

    return predictor
Example #15
def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.
    :param model_dir: The directory where model files are stored.
    :return: a model (in this case an AutoGluon network)
    """
    net = task.load(model_dir)
    return net
Example #16
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    model_dir = args.model_dir
    target = args.label
    presets = args.presets
    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # train_file_path = get_file_path(args.train, args.train_filename)
    # train_data = task.Dataset(file_path= train_file_path )
    columns = train_data.columns.tolist()
    column_dict = {"columns":columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)
    
    subsample_size = int(args.train_rows)  # subsample subset of data for faster demo, try setting this to much larger values
    train_data = train_data.sample(n=subsample_size, random_state=0)


    predictor = task.fit(train_data=train_data, label=target,
                         output_directory=model_dir,
                         presets=presets)

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring

        # Leaderboard on test data
        print('Running model on test data and getting Leaderboard...')
        leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
        print(format_for_print(leaderboard), end='\n\n')

        # Feature importance on test data
        # Note: Feature importance must be calculated on held-out (test) data.
        # If calculated on training data it will be biased due to overfitting.
        if args.feature_importance:      
            print('Feature importance:')
            # Increase rows to print feature importance                
            pd.set_option('display.max_rows', 500)
            print(predictor.feature_importance(test_data))

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")

    return predictor
Example #17
def train(args):
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Extract column info
    target = args.fit_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        **args.fit_args,
    )

    # Results summary
    predictor.fit_summary(verbosity=1)

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if args.fit_args['label'] in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                print(predictor.feature_importance(test_data))
        else:
            warnings.warn(
                'Skipping eval on test data since label column is not included.'
            )

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
Example #18
def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.
    :param model_dir: The directory where model files are stored.
    :return: a model (in this case a Gluon network)
    """
    print(f'Loading model from {model_dir} with contents {os.listdir(model_dir)}')
    net = task.load(model_dir, verbosity=True)    
    return net
Example #19
def test_advanced_functionality():
    fast_benchmark = True
    dataset = {'url': 'https://autogluon.s3.amazonaws.com/datasets/AdultIncomeBinaryClassification.zip',
               'name': 'AdultIncomeBinaryClassification',
               'problem_type': BINARY}
    label = 'class'
    directory_prefix = './datasets/'
    train_file = 'train_data.csv'
    test_file = 'test_data.csv'
    train_data, test_data = load_data(directory_prefix=directory_prefix, train_file=train_file, test_file=test_file, name=dataset['name'], url=dataset['url'])
    if fast_benchmark:  # subsample for fast_benchmark
        subsample_size = 100
        train_data = train_data.head(subsample_size)
        test_data = test_data.head(subsample_size)
    print(f"Evaluating Advanced Functionality on Benchmark Dataset {dataset['name']}")
    directory = directory_prefix + 'advanced/' + dataset['name'] + "/"
    savedir = directory + 'AutogluonOutput/'
    shutil.rmtree(savedir, ignore_errors=True)  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
    predictor = task.fit(train_data=train_data, label=label, output_directory=savedir)
    leaderboard = predictor.leaderboard(dataset=test_data)
    leaderboard_extra = predictor.leaderboard(dataset=test_data, extra_info=True)
    assert set(predictor.get_model_names()) == set(leaderboard['model'])
    assert set(predictor.get_model_names()) == set(leaderboard_extra['model'])
    assert set(leaderboard_extra.columns).issuperset(set(leaderboard.columns))
    assert len(leaderboard) == len(leaderboard_extra)
    num_models = len(predictor.get_model_names())
    feature_importances = predictor.feature_importance(dataset=test_data)
    original_features = set(train_data.columns)
    original_features.remove(label)
    assert(set(feature_importances.keys()) == original_features)
    predictor.transform_features()
    predictor.transform_features(dataset=test_data)
    predictor.info()
    assert(predictor.get_model_full_dict() == dict())
    predictor.refit_full()
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    for model in predictor.get_model_names():
        predictor.predict(dataset=test_data, model=model)
    predictor.refit_full()  # Confirm that refit_models aren't further refit.
    assert(len(predictor.get_model_full_dict()) == num_models)
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.delete_models(models_to_keep=[])  # dry run: should not delete any models
    assert(len(predictor.get_model_names()) == num_models * 2)
    predictor.predict(dataset=test_data)
    predictor.delete_models(models_to_keep=[], dry_run=False)  # dry_run=False: actually deletes the models
    assert len(predictor.get_model_names()) == 0
    assert len(predictor.leaderboard()) == 0
    assert len(predictor.leaderboard(extra_info=True)) == 0
    try:
        predictor.predict(dataset=test_data)
    except Exception:  # predict should fail once every model has been deleted
        pass
    else:
        raise AssertionError('predictor.predict should raise exception after all models are deleted')
    print('Tabular Advanced Functionality Test Succeeded.')
Example #20
    @classmethod
    def predict(cls, prediction_input: DataFrame):
        """For the input, do the predictions and return them.

        Args:
            prediction_input (a pandas dataframe): The data on which to do the predictions.
                There will be one prediction per row in the dataframe.
        """
        prediction_data = task.Dataset(df=prediction_input)
        print("Prediction Data: ")
        print(prediction_data.head())
        return cls.model.predict(prediction_data)
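A hypothetical call site. It assumes this classmethod lives on a wrapper class, here called ScoringService (a common SageMaker serving pattern), whose model attribute already holds a trained predictor:

import pandas as pd

df = pd.DataFrame({'age': [25, 52], 'income': [38000, 91000]})  # hypothetical feature columns
print(ScoringService.predict(df))  # ScoringService is an assumed class name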
Example #21
def model_fn(model_dir):
    """
    Load the gluon model. Called once when hosting service starts.
    :param model_dir: The directory where model files are stored.
    :return: a model (in this case a Gluon network) and the column info.
    """
    print(f'Loading model from {model_dir} with contents {os.listdir(model_dir)}')
    net = task.load(model_dir, verbosity=True)
    with open(f'{model_dir}/code/columns.pkl', 'rb') as f:
        column_dict = pickle.load(f)
    return net, column_dict
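The column list saved at training time is typically used to re-align incoming inference payloads; a hypothetical sketch of that step (align_columns is not part of these examples):

def align_columns(df, column_dict, label):
    # Keep only the training features, in training order, with the label excluded.
    expected = [c for c in column_dict['columns'] if c != label]
    return df[expected]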
Example #22
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data

    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)
    train_data = task.Dataset(file_path=os.path.join(training_dir, filename))
    predictor = task.fit(train_data=train_data,
                         label=target,
                         output_directory=model_dir)

    return predictor
Example #23
def Load_GLUON(dataDownstream, dataFeaturized):

    df = pd.DataFrame(columns=['column', 'feature_type'])
    df.to_csv('AutoGluon_predictions.csv', index=False)

    # dataDownstream
    train = copy.deepcopy(dataDownstream)

    train['label_target'] = 1
    train_data = task.Dataset(df=train)
    label_column = 'label_target'

    try:
        features = task.fit(train_data=train_data, label=label_column)
    except Exception:
        pass  # fit may raise here (the label is a constant); predictions are read from the CSV below

    agl_predictions = pd.read_csv('AutoGluon_predictions.csv')
    predictions = agl_predictions['feature_type'].values.tolist()

    return predictions
Example #24
def train_regression_autogluon(args, train_df, test_df):
    mx.npx.reset_np()
    from autogluon import TabularPrediction as task
    predictor = task.fit(train_data=task.Dataset(df=train_df),
                         output_directory=args.out_dir,
                         label='thrpt',
                         eval_metric='mean_absolute_error')
    #performance = predictor.evaluate(test_df)
    test_prediction = predictor.predict(test_df)
    ret = np.zeros((len(test_prediction), 2), dtype=np.float32)
    for i, (lhs, rhs) in enumerate(zip(test_df['thrpt'].to_numpy(), test_prediction)):
        ret[i][0] = lhs
        ret[i][1] = rhs
    df_result = pd.DataFrame(ret, columns=['gt', 'pred'])
    df_result.to_csv(os.path.join(args.out_dir, 'pred_result.csv'))
    plot_save_figure(gt_thrpt=test_df['thrpt'].to_numpy(),
                     pred_thrpt=test_prediction,
                     save_dir=args.out_dir)
    mx.npx.set_np()
Example #25
def __load_input_data(path: str) -> TabularDataset:
    """
    Load training data as dataframe
    :param path:
    :return: DataFrame
    """
    input_data_files = os.listdir(path)
    try:
        input_dfs = [pd.read_csv(f'{path}/{data_file}') for data_file in input_data_files]
        return task.Dataset(df=pd.concat(input_dfs))
    except Exception:
        print(f'No csv data in {path}!')
        return None
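Hypothetical usage of the loader against a standard SageMaker training channel path:

train_data = __load_input_data('/opt/ml/input/data/train')
if train_data is not None:
    print(train_data.shape)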
Example #26
def frc_AutoGluon(df_train, df_test,
                  categoricalVars, responseVar='wk1_sales_all_stores'):

    # Legacy AutoGluon (pre-0.1) tabular API
    from autogluon import TabularPrediction as task

    for varName in categoricalVars:
        df_train[varName] = df_train[varName].astype(str)
        df_test[varName] = df_test[varName].astype(str)

    # AutoGluon format
    train_data = task.Dataset(df=df_train)
    test_data = task.Dataset(df=df_test)

    model = task.fit(train_data=train_data,
                     output_directory="auto_gluon",
                     label=responseVar,
                     hyperparameter_tune=False)

    # Forecast with the best model
    autogluon_frc = model.predict(test_data)
    return {'autoGluon_frc': autogluon_frc, 'autoGluon_model':model}
Example #27
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data

    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)
    hyperparameters = {
        'GBM': [
            {},
            {
                'extra_trees': True,
                'AG_args': {
                    'name_suffix': 'XT'
                }
            },
        ],
        'RF': {},
        'XT': {},
        'KNN': {},
        'custom': ['GBM']
    }
    presets = 'medium_quality_faster_train'
    train_data = task.Dataset(file_path=os.path.join(training_dir, filename))
    predictor = task.fit(train_data=train_data,
                         label=target,
                         output_directory=model_dir,
                         presets=presets,
                         hyperparameters=hyperparameters)

    return predictor
Example #28
def evaluate(predictor, args):
    
    train_dir = args.train_dir
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output

    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)
    
    test_data = task.Dataset(file_path=os.path.join(train_dir, test_file))   
    
    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')
    
    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)

    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)
    
    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
    del perf['confusion_matrix']
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)

    files_to_upload = [pred_file, lead_file, perf_file, summ_file]  
    for file in files_to_upload:
        s3.upload_file(file, bucket, os.path.join(prefix, training_job_name.replace('mxnet-training', 'autogluon', 1), file))
Example #29
def transform_fn(net, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.
    :param net: The Gluon model.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    """
    start = timer()

    # text/csv
    if input_content_type == 'text/csv':

        # Load dataset
        df = pd.read_csv(StringIO(data))
        ds = task.Dataset(df=df)

        # Predict
        predictions = net.predict(ds)
        print(f'Prediction counts: {Counter(predictions.tolist())}')

        # Form response
        output = StringIO()
        pd.DataFrame(predictions).to_csv(output, header=False, index=False)
        response_body = output.getvalue()

        # If target column passed, evaluate predictions performance
        target = net.label_column
        if target in ds:
            print(f'Label column ({target}) found in input data. '
                  'Therefore, evaluating prediction performance...')

            performance = net.evaluate_predictions(y_true=ds[target],
                                                   y_pred=predictions.tolist(),
                                                   auxiliary_metrics=True)
            print(json.dumps(performance, indent=4))

    else:
        raise NotImplementedError("content_type must be 'text/csv'")

    elapsed_time = round(timer() - start, 3)
    print(f'Elapsed time: {elapsed_time} seconds')

    return response_body, output_content_type
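A hypothetical local smoke test for this handler, pairing it with the single-return model_fn from Example #18; the payload columns are made up:

net = model_fn('/opt/ml/model')  # assumes a trained predictor was exported there
csv_payload = 'feature_0,feature_1\n0.1,0.7\n'  # hypothetical unlabeled CSV payload
body, content_type = transform_fn(net, csv_payload, 'text/csv', 'text/csv')
print(content_type)
print(body)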
Example #30
def train(args):
    model_output_dir = f'{args.output_dir}/data'
    
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)
    
    # Extract column info
    target = args.fit_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns":columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)
    
    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        **args.fit_args,
    )
    
    # Results summary
    predictor.fit_summary(verbosity=3)
    model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')
    
    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)
    
    # ensemble visualization
    G = predictor._trainer.model_graph
    remove = [node for node,degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if args.fit_args['label'] in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:      
                print('Feature importance:')
                # Increase rows to print feature importance                
                pd.set_option('display.max_rows', 500)
                feature_importance = predictor.feature_importance(test_data)
                feature_importance_df = pd.DataFrame(feature_importance, columns=['Importance score']).rename_axis(index='Feature')
                print(feature_importance_df)
                feature_importance_df.to_csv(f'{model_output_dir}/feature_importance.csv', index=True)
            
            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix
                
                X_test = test_data.drop(args.fit_args['label'], axis=1)
                y_test_true = test_data[args.fit_args['label']]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)
                
                report_dict = classification_report(y_test_true, y_test_pred, output_dict=True, labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(f'{model_output_dir}/classification_report.csv', index=True)
                
                cm = confusion_matrix(y_test_true, y_test_pred, labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')  # save before show(), which clears the figure
                plt.show()
                
                get_roc_auc(y_test_true, y_test_pred_prob, predictor.class_labels, predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn('Skipping eval on test data since label column is not included.')

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")
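get_roc_auc is called above but not defined in these examples; a plausible sketch of such a helper, with the signature taken from the call site and the body an assumption (class_labels_internal goes unused here):

import os
import pandas as pd
from sklearn.metrics import roc_auc_score

def get_roc_auc(y_true, y_pred_prob, class_labels, class_labels_internal, output_dir):
    # Hypothetical helper: one-vs-rest AUROC per class, written to CSV.
    probs = pd.DataFrame(y_pred_prob)
    probs.columns = class_labels  # assume probability columns follow class_labels order
    y_true = pd.Series(list(y_true))
    scores = {label: roc_auc_score((y_true == label).astype(int), probs[label].to_numpy())
              for label in class_labels}
    pd.Series(scores, name='roc_auc').to_csv(os.path.join(output_dir, 'roc_auc.csv'))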