def run_example(self):

        train_data = task.Dataset(file_path='./data/churn-train.csv')
        train_data = train_data.head(
            500)  # subsample 500 data points for faster demo
        print(train_data.head())
        label_column = 'churn_probability'
        print("Summary of class variable: \n",
              train_data[label_column].describe())
        dir = 'agModels-predictClass'  # specifies folder where to store trained models
        predictor = task.fit(train_data=train_data,
                             label=label_column,
                             output_directory=dir,
                             eval_metric="mean_absolute_error")
        test_data = task.Dataset(file_path='./data/churn-test.csv')
        y_test = test_data[label_column]  # values to predict
        test_data_nolab = test_data.drop(
            labels=[label_column],
            axis=1)  # delete label column to prove we're not cheating
        print(test_data_nolab.head())
        #predictor = task.load(dir) # unnecessary, just demonstrates how to load previously-trained predictor from file

        y_pred = predictor.predict(test_data_nolab)
        print("Predictions:  ", y_pred)
        perf = predictor.evaluate_predictions(y_true=y_test,
                                              y_pred=y_pred,
                                              auxiliary_metrics=True)

        print("MAE: " + perf)

        return perf
Example #2
def frc_AutoGluon(df_train,
                  df_test,
                  categoricalVars,
                  experiment_label='grocery',
                  responseVar='wk1_sales_all_stores'):

    import autogluon as ag
    from autogluon import TabularPrediction as task
    # autogluon.task.tabular_prediction.TabularPredictor

    for varName in categoricalVars:
        df_train[varName] = df_train[varName].astype(str)
        df_test[varName] = df_test[varName].astype(str)

    # AutoGluon format
    train_data = task.Dataset(df=df_train)
    test_data = task.Dataset(df=df_test)

    model = task.fit(train_data=train_data,
                     output_directory="auto_gluon/" + experiment_label,
                     label=responseVar,
                     hyperparameter_tune=False)

    # Forecast with the best model
    autogluon_frc = model.predict(test_data)

    # Forecast with all the models
    individual_frc = {'AG_'+model_to_use: model.predict(test_data, model=model_to_use) \
        for model_to_use in model.model_names}

    return {
        'autoGluon_frc': autogluon_frc,
        'autoGluon_model': model,
        'individual_frc': individual_frc
    }
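A minimal usage sketch for the helper above, on synthetic data (the column names, values, and experiment label are purely illustrative, not from the original project):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 200
frame = pd.DataFrame({
    'store_id': rng.randint(1, 6, size=n),          # treated as categorical below
    'price': rng.uniform(3.0, 10.0, size=n),
})
frame['wk1_sales_all_stores'] = 100 - 5 * frame['price'] + rng.normal(0, 3, size=n)

result = frc_AutoGluon(frame.iloc[:150].copy(), frame.iloc[150:].copy(),
                       categoricalVars=['store_id'],
                       experiment_label='toy_run',
                       responseVar='wk1_sales_all_stores')
print(result['autoGluon_frc'][:5])        # best-model forecasts for the first test rows
print(list(result['individual_frc']))     # one forecast series per trained model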
 def train(self, train_data, val_data, params):
     train_dataset = TabularPrediction.Dataset(train_data)
     val_dataset   = TabularPrediction.Dataset(val_data)
     output_dir    = os.path.join(self.get_output_folders()[0], dt.now().strftime('%Y%m%d%H%M%S'))
     hp_tune       = params["hp_tune"]
     ag_params     = params["autogluon"]
     self._label_column = params["label"]
     
     if hp_tune is True:
         hp_params       = ag_params["hyperparameters"]
         time_limits     = hp_params["time_limits"]
         num_trials      = hp_params["num_trials"]
         hyperparameters = self.__create_hp_params(hp_params)
         search_strategy = hp_params["search_strategy"]
         self._model = TabularPrediction.fit(
             train_data=train_dataset, tuning_data=val_dataset, label=self._label_column,
             output_directory=output_dir, time_limits=time_limits, 
             num_trials=num_trials, hyperparameter_tune=hp_tune, 
             hyperparameters=hyperparameters, search_strategy=search_strategy
         )
     else:
         self._model = TabularPrediction.fit(
             train_data=train_dataset, tuning_data=val_dataset, label=self._label_column,
             output_directory=output_dir
         )
     
     self.__dump_params(output_dir, params)
     
     self._model.fit_summary()
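For reference, a hypothetical params dict shaped the way this train method reads it; the keys are taken from the code above, while the concrete values and any per-model search spaces consumed by __create_hp_params are illustrative:

example_params = {
    "label": "target",                  # name of the label column in train_data / val_data
    "hp_tune": True,                    # True -> HPO branch, False -> plain fit
    "autogluon": {
        "hyperparameters": {
            "time_limits": 600,         # seconds allowed for the tuning run
            "num_trials": 5,            # number of HPO trials
            "search_strategy": "random",
            # ... plus whatever per-model search spaces __create_hp_params expects
        }
    },
}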
    def run(self, train_path, test_path, target, task):
        train_data = task.Dataset(file_path=train_path)

        predictor = task.fit(train_data=train_data,
                             label=target,
                             eval_metric="f1_macro",
                             num_bagging_folds=5)

        test_data = task.Dataset(file_path=test_path)
        y_test = test_data[target]

        y_pred = predictor.predict(test_data)
        return predictor.evaluate_predictions(y_true=y_test.to_numpy(),
                                              y_pred=y_pred,
                                              auxiliary_metrics=True)
Example #5
def convert_gluon(X_train, y_train):

    feature_list = list()
    for i in range(len(X_train[0])):
        feature_list.append('feature_' + str(i))

    feature_list.append('class')
    data = dict()

    for i in range(len(X_train)):
        for j in range(len(feature_list) - 1):
            if i > 0:
                try:
                    data[feature_list[j]] = data[feature_list[j]] + [
                        X_train[i][j]
                    ]
                except KeyError:
                    pass  # key should already exist once the first row initializes it

            else:
                data[feature_list[j]] = [X_train[i][j]]
                print(data)

    data['class'] = y_train
    data = pd.DataFrame(data, columns=list(data))
    data = task.Dataset(data)

    return data
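A short sketch of calling convert_gluon (it relies on the module-level pd and task imports used throughout these examples; the random arrays are illustrative):

import numpy as np
import pandas as pd
from autogluon import TabularPrediction as task

X_train = np.random.rand(20, 3).tolist()          # 20 rows, 3 features
y_train = np.random.randint(0, 2, 20).tolist()    # binary labels

dataset = convert_gluon(X_train, y_train)
print(dataset.columns)    # feature_0, feature_1, feature_2, class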
Example #6
def train(args):
    
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    model_dir = args.model_dir    
    train_dir = args.train_dir
    filename = args.filename
    target = args.target    
    debug = args.debug
    eval_metric = args.eval_metric   
    presets = args.presets    
    
    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
     
    logging.info(train_dir)
    
    train_data = task.Dataset(file_path=os.path.join(train_dir, filename))
    if debug:
        subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
        train_data = train_data.sample(n=subsample_size, random_state=0)
    predictor = task.fit(train_data=train_data, label=target,
                         output_directory=model_dir, eval_metric=eval_metric, presets=presets)

    return predictor
Example #7
def load_data(directory_prefix, train_file, test_file, name, url=None):
    if not os.path.exists(directory_prefix):
        os.mkdir(directory_prefix)
    directory = directory_prefix + name + "/"
    train_file_path = directory + train_file
    test_file_path = directory + test_file
    if (not os.path.exists(train_file_path)) or (not os.path.exists(test_file_path)):
        # fetch files from s3:
        print("%s data not found locally, so fetching from %s" % (name, url))
        zip_name = ag.download(url, directory_prefix)
        ag.unzip(zip_name, directory_prefix)
        os.remove(zip_name)

    train_data = task.Dataset(file_path=train_file_path)
    test_data = task.Dataset(file_path=test_file_path)
    return train_data, test_data
Example #8
    def predict(cls, prediction_input: DataFrame):
        """For the input, do the predictions and return them.

        Args:
            prediction_input (a pandas dataframe): The data on which to do the predictions. There will be
                one prediction per row in the dataframe"""
        prediction_data = task.Dataset(df=prediction_input)
        print("Prediction Data: ")
        print(prediction_data.head())
        return cls.model.predict(prediction_data)
Example #9
    def train(self, data, params):
        self.data = data

        self.train_data = task.Dataset(data.unscaled_df)

        autogluon_dir = f'agModels-predictClass/{uuid.uuid4()}'  # specifies folder where to store trained models
        self.predictor = task.fit(train_data=self.train_data,
                                  label=self.metadata.get("output")[0],
                                  output_directory=autogluon_dir)

        self.state = "TRAINED"
Example #10
def frc_AutoGluon(df_train, df_test,
                  categoricalVars, responseVar='wk1_sales_all_stores'):

    import autogluon as ag
    from autogluon import TabularPrediction as task

    for varName in categoricalVars:
        df_train[varName] = df_train[varName].astype(str)
        df_test[varName] = df_test[varName].astype(str)

    # AutoGluon format
    train_data = task.Dataset(df=df_train)
    test_data = task.Dataset(df=df_test)

    model = task.fit(train_data=train_data,
                     output_directory="auto_gluon",
                     label=responseVar,
                     hyperparameter_tune=False)


    # Forecast with the best model
    autogluon_frc = model.predict(test_data)
    return {'autoGluon_frc': autogluon_frc, 'autoGluon_model':model}
def __load_input_data(path: str) -> TabularDataset:
    """
    Load training data as dataframe
    :param path:
    :return: DataFrame
    """
    input_data_files = os.listdir(path)
    try:
        input_dfs = [pd.read_csv(f'{path}/{data_file}') for data_file in input_data_files]
        return task.Dataset(df=pd.concat(input_dfs))
    except Exception:
        print(f'No csv data in {path}!')
        return None
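For instance, inside a SageMaker training script this helper could read every CSV shard placed under the training channel (the path below is the conventional SageMaker location, shown only as an illustration):

train_dataset = __load_input_data('/opt/ml/input/data/training')
if train_dataset is not None:
    print(train_dataset.shape)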
Example #12
def evaluate(predictor, args):
    
    train_dir = args.train_dir
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output

    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)
    
    test_data = task.Dataset(file_path=os.path.join(train_dir, test_file))   
    
    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')
    
    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)

    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)
    
    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
    del perf['confusion_matrix']
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)

    files_to_upload = [pred_file, lead_file, perf_file, summ_file]  
    for file in files_to_upload:
        s3.upload_file(file, bucket, os.path.join(prefix, training_job_name.replace('mxnet-training', 'autogluon', 1), file))
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    model_dir = args.model_dir
    target = args.label_column

    train_file_path = get_file_path(args.train, args.train_filename)

    train_data = task.Dataset(file_path=train_file_path)
    subsample_size = int(args.train_rows)  # subsample subset of data for faster demo, try setting this to much larger values
    train_data = train_data.sample(n=subsample_size, random_state=0)


    predictor = task.fit(train_data=train_data, label=target, output_directory=model_dir)

    return predictor
def transform_fn(net, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.
    :param net: The Gluon model.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    """
    start = timer()

    # text/csv
    if input_content_type == 'text/csv':

        # Load dataset
        df = pd.read_csv(StringIO(data))
        ds = task.Dataset(df=df)

        # Predict
        predictions = net.predict(ds)
        print(f'Prediction counts: {Counter(predictions.tolist())}')

        # Form response
        output = StringIO()
        pd.DataFrame(predictions).to_csv(output, header=False, index=False)
        response_body = output.getvalue()

        # If target column passed, evaluate predictions performance
        target = net.label_column
        if target in ds:
            print(f'Label column ({target}) found in input data. '
                  'Therefore, evaluating prediction performance...')

            performance = net.evaluate_predictions(y_true=ds[target],
                                                   y_pred=predictions.tolist(),
                                                   auxiliary_metrics=True)
            print(json.dumps(performance, indent=4))

    else:
        raise NotImplementedError("content_type must be 'text/csv'")

    elapsed_time = round(timer() - start, 3)
    print(f'Elapsed time: {elapsed_time} seconds')

    return response_body, output_content_type
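A hedged sketch of exercising this handler locally, outside SageMaker; it assumes a predictor previously trained and saved with task.fit and a small CSV file with the training schema (both names are hypothetical):

from autogluon import TabularPrediction as task

net = task.load('ag_models/')             # previously trained TabularPredictor
with open('sample_rows.csv') as f:        # rows with the same columns as the training data
    payload = f.read()

body, content_type = transform_fn(net, payload, 'text/csv', 'text/csv')
print(content_type)
print(body.splitlines()[:5])              # first few predictions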
def Load_GLUON(dataDownstream, dataFeaturized):

    df = pd.DataFrame(columns=['column', 'feature_type'])
    df.to_csv('AutoGluon_predictions.csv', index=False)

    # dataDownstream
    train = copy.deepcopy(dataDownstream)

    train['label_target'] = 1
    train_data = task.Dataset(df=train)
    label_column = 'label_target'

    try:
        features = task.fit(train_data=train_data, label=label_column)
    except Exception:
        pass  # ignore fit errors; this snippet only needs the AutoGluon_predictions.csv read below

    agl_predictions = pd.read_csv('AutoGluon_predictions.csv')
    predictions = agl_predictions['feature_type'].values.tolist()

    return predictions
Example #16
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data

    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)
    train_data = task.Dataset(file_path=training_dir + '/' + filename)
    predictor = task.fit(train_data=train_data,
                         label=target,
                         output_directory=model_dir)

    return predictor
Example #17
def train_regression_autogluon(args, train_df, test_df):
    mx.npx.reset_np()
    from autogluon import TabularPrediction as task
    predictor = task.fit(train_data=task.Dataset(df=train_df),
                         output_directory=args.out_dir,
                         label='thrpt',
                         eval_metric='mean_absolute_error')
    #performance = predictor.evaluate(test_df)
    test_prediction = predictor.predict(test_df)
    ret = np.zeros((len(test_prediction), 2), dtype=np.float32)
    for i, (lhs,
            rhs) in enumerate(zip(test_df['thrpt'].to_numpy(),
                                  test_prediction)):
        ret[i][0] = lhs
        ret[i][1] = rhs
    df_result = pd.DataFrame(ret, columns=['gt', 'pred'])
    df_result.to_csv(os.path.join(args.out_dir, 'pred_result.csv'))
    plot_save_figure(gt_thrpt=test_df['thrpt'].to_numpy(),
                     pred_thrpt=test_prediction,
                     save_dir=args.out_dir)
    mx.npx.set_np()
def train(args):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    target = args.target

    # load training and validation data

    training_dir = args.train
    filename = args.filename
    logging.info(training_dir)
    hyperparameters = {
        'GBM': [
            {},
            {
                'extra_trees': True,
                'AG_args': {
                    'name_suffix': 'XT'
                }
            },
        ],
        'RF': {},
        'XT': {},
        'KNN': {},
        'custom': ['GBM']
    }
    presets = 'medium_quality_faster_train'
    train_data = task.Dataset(file_path=training_dir + '/' + filename)
    predictor = task.fit(train_data=train_data,
                         label=target,
                         output_directory=model_dir,
                         presets=presets,
                         hyperparameters=hyperparameters)

    return predictor
                            os.environ['SM_TRAINING_ENV'])['job_name'])

    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    predictor = train(args)

    training_dir = args.train
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    dataset_name = train_file.split('_')[0]
    print(dataset_name)

    test_data = task.Dataset(file_path=os.path.join(training_dir, test_file))
    u = urlparse(args.s3_output, allow_fragments=False)
    bucket = u.netloc
    print(bucket)
    prefix = u.path.strip('/')
    print(prefix)

    s3 = boto3.client('s3')

    try:
        y_test = test_data[args.target]  # values to predict
        # delete label column to prove we're not cheating
        test_data_nolab = test_data.drop(labels=[args.target], axis=1)

        y_pred = predictor.predict(test_data_nolab)
        y_pred_df = pd.DataFrame.from_dict({
def processData(data,
                label_column=None,
                output_directory=None,
                ag_predictor=None,
                problem_type=None,
                eval_metric=None):
    """ Converts pandas Dataframe to matrix of entirely numerical values (stored in DataFrame).
        Performs same data preprocessing as used for AutoGluon's tabular neural network model, 
        to deal with issues such as: missing value imputation, one-hot encoding of categoricals, 
        handling of high-cardinality categoricals, handling unknown categorical feature-levels at test-time, etc.
        
        If ag_predictor is not None, uses existing autogluon predictor object to process data (must have tabularNN as first model).
        To process training data, ag_predictor should = None. For test data, should != None.
        Returns:
            Tuple (X, y, ag_predictor)
            where y may be None if labels are not present in test data.
    """

    # fit dummy neural network model just to preprocess data. Here we ensure no embedding layers are used.
    if ag_predictor is None:
        if label_column is None:
            raise ValueError(
                "when processing training data, label_column cannot be None")
        elif not label_column in data.columns:
            raise ValueError(
                "label_column cannot be missing from training data")
        ag_predictor = task.fit(train_data=task.Dataset(data),
                                tuning_data=task.Dataset(data),
                                label=label_column,
                                hyperparameter_tune=False,
                                problem_type=problem_type,
                                eval_metric=eval_metric,
                                hyperparameters={
                                    'NN': {
                                        'num_epochs': 0,
                                        'proc.embed_min_categories': np.inf
                                    }
                                },
                                num_bagging_folds=0,
                                stack_ensemble_levels=0,
                                label_count_threshold=1,
                                verbosity=2,
                                feature_generator_kwargs={
                                    'enable_nlp_vectorizer_features': False,
                                    'enable_nlp_ratio_features': False
                                })

    model = ag_predictor._trainer.load_model(
        ag_predictor._trainer.get_model_names_all()
        [0])  # This must be the neural net model which contains data processor
    if 'NeuralNet' not in model.name:
        raise ValueError(
            "Data preprocessing error. This model should be the NeuralNet, not the: %s"
            % model.name)
    bad_inds = []  # row-indices to remove from dataset
    if label_column is not None and label_column in data.columns:
        label_cleaner = ag_predictor._learner.label_cleaner
        y = data[label_column].values
        data = data.drop([label_column], axis=1, inplace=False)
        y = label_cleaner.transform(y)
        if np.sum(y.isna()) > 0:
            bad_inds = y.index[y.apply(np.isnan)].tolist(
            )  # remove these inds as label is NaN (due to very rare classes)
            warnings.warn(
                "Dropped these rows from data in preprocessing, due to missing labels: "
                + str(bad_inds))
    else:
        y = None
    data_initial_processed = ag_predictor._learner.transform_features(
        data)  # general autogluon data processing.
    # data_fg = ag_predictor._learner.general_data_processing(X=data, X_test=data, holdout_frac=0.0, num_bagging_folds=0)
    tabNN_data = model.process_data(
        data_initial_processed, is_test=True
    )  # neural net-specific autogluon data processing required to turn tabular data into numerical matrix.
    numeric_data = tabNN_data.dataset._data  # list of mxnet.NDArrays
    if len(numeric_data) != 1:
        raise ValueError("Data Preprocessing failed.")
    numpy_data = numeric_data[0].asnumpy()  # 2D Numpy array
    X = pd.DataFrame(numpy_data)
    X.columns = ['feature' + str(i) for i in range(X.shape[1])]
    if len(bad_inds) > 0:
        y.drop(index=bad_inds, inplace=True)
        X.drop(index=bad_inds, axis=0, inplace=True)
    return (X, y, ag_predictor)
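A minimal usage sketch for processData, following the docstring's convention of fitting the preprocessing predictor on the training data and reusing it on the test data (the file names and 'label' column are hypothetical):

import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Training data: no predictor yet, so a throwaway NN is fitted internally for preprocessing.
X_train, y_train, ag_predictor = processData(train_df, label_column='label')

# Test data: reuse the same predictor so identical transformations are applied.
# y_test comes back as None if the label column is absent from test_df.
X_test, y_test, _ = processData(test_df, label_column='label', ag_predictor=ag_predictor)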
dataset = regression_dataset
directory = dataset['name'] + "/"

train_file = 'train_data.csv'
test_file = 'test_data.csv'
train_file_path = directory + train_file
test_file_path = directory + test_file

if (not os.path.exists(train_file_path)) or (
        not os.path.exists(test_file_path)):  # fetch files from s3:
    print("%s data not found locally, so fetching from %s" %
          (dataset['name'], dataset['url']))
    os.system("wget " + dataset['url'] +
              " -O temp.zip && unzip -o temp.zip && rm temp.zip")

train_data = task.Dataset(file_path=train_file_path)
test_data = task.Dataset(file_path=test_file_path)
train_data = train_data.head(subsample_size)  # subsample for faster demo
test_data = test_data.head(subsample_size)  # subsample for faster run
label_column = dataset['label_column']

# Fit model ensemble:
predictor = task.fit(train_data=train_data,
                     label=label_column,
                     output_directory=savedir,
                     cache_data=True,
                     auto_stack=True,
                     time_limits=time_limits,
                     eval_metric='mean_absolute_error')

# Distill ensemble-predictor into single model:
Example #22
def transform_fn(models, data, input_content_type, output_content_type):
    """
    Transform a request using the Gluon model. Called once per request.
    :param models: The Gluon model and the column info.
    :param data: The request payload.
    :param input_content_type: The request content type. ('text/csv')
    :param output_content_type: The (desired) response content type. ('text/csv')
    :return: response payload and content type.
    """
    start = timer()
    net = models[0]
    column_dict = models[1]

    # text/csv
    if input_content_type == 'text/csv':
        
        # Load dataset
        columns = column_dict['columns']
        df = pd.read_csv(StringIO(data), header=None)
        df_preprocessed = preprocess(df, columns, net.label_column)
        ds = task.Dataset(df=df_preprocessed)
        
        try:
            predictions = net.predict(ds)
        except Exception:
            try:
                predictions = net.predict(ds.fillna(0.0))
                warnings.warn('Filled NaNs with 0.0 in order to predict.')
            except Exception as e:
                response_body = str(e)
                return response_body, output_content_type
        
        # Print prediction counts, limit in case of regression problem
        pred_counts = Counter(predictions.tolist())
        n_display_items = 30
        if len(pred_counts) > n_display_items:
            print(f'Top {n_display_items} prediction counts: '
                  f'{dict(take(n_display_items, pred_counts.items()))}')
        else:
            print(f'Prediction counts: {pred_counts}')

        # Form response
        output = StringIO()
        pd.DataFrame(predictions).to_csv(output, header=False, index=False)
        response_body = output.getvalue() 

        # If target column passed, evaluate predictions performance
        target = net.label_column
        if target in ds:
            print(f'Label column ({target}) found in input data. '
                  'Therefore, evaluating prediction performance...')    
            try:
                performance = net.evaluate_predictions(y_true=ds[target], 
                                                       y_pred=predictions, 
                                                       auxiliary_metrics=True)                
                print(json.dumps(performance, indent=4))
                time.sleep(0.1)
            except Exception as e:
                # Print exceptions on evaluate, continue to return predictions
                print(f'Exception: {e}')
    else:
        raise NotImplementedError("content_type must be 'text/csv'")

    elapsed_time = round(timer() - start, 3)
    print(f'Elapsed time: {elapsed_time} seconds')
    
    return response_body, output_content_type
Example #23
import pandas as pd
import autogluon as ag
from autogluon import TabularPrediction as task
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

#autogluon
label_column = 'test'
dir = 'agModels-predictClass_jiagnwei'
train_data = task.Dataset(file_path="/dataset/jiangweitrai.csv")
test_data = task.Dataset(file_path="/dataset/jiangweitrai.csv")
# TODO
predictor = task.fit(train_data=train_data,
                     label='test',
                     output_directory=dir,
                     auto_stack=True,
                     time_limits=1800)
results = predictor.fit_summary()
print(predictor.feature_importance(dataset=test_data, subsample_size=None))

# predictor = task.load(dir)
# print(predictor.info())
# print(predictor.feature_importance(dataset=train_data))
Example #24
# Run Auto-WEKA:
(num_models_trained, num_models_ensemble, fit_time, y_pred, y_prob,
 predict_time,
 class_order) = autoweka_fit_predict(train_data=train_data,
                                     test_data=test_data,
                                     label_column=label_column,
                                     problem_type=problem_type,
                                     output_directory=output_directory,
                                     autoweka_path=autoweka_path,
                                     eval_metric=eval_metric,
                                     runtime_sec=runtime_sec,
                                     random_state=random_state,
                                     num_cores=num_cores)

# Can use autogluon.tabular.Predictor to evaluate predictions (assuming metric correctly specified):
ag_predictor = task.fit(task.Dataset(df=train_data),
                        label=label_column,
                        problem_type=problem_type,
                        eval_metric=eval_metric,
                        hyperparameters={'GBM': {
                            'num_boost_round': 2
                        }})
if eval_metric == 'roc_auc':
    preds_toevaluate = y_prob[:, 1]
elif eval_metric == 'log_loss':
    preds_toevaluate = y_prob
else:
    preds_toevaluate = y_pred

perf = ag_predictor.evaluate_predictions(
    test_data[label_column], preds_toevaluate)
Example #25
if not args.evaluate:
    if args.walltime <= 120:
        excluded_model_types = ["KNN"]
    else:
        excluded_model_types = []

    # Create output directory
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    (X_train, y_train), (X_valid, y_valid) = load_data(use_test=False)

    df_train = convert_to_dataframe(X_train, y_train)
    df_valid = convert_to_dataframe(X_valid, y_valid)

    predictor = task.fit(
        train_data=task.Dataset(df=df_train),
        tuning_data=task.Dataset(df=df_valid),
        label="label",
        output_directory=output_dir,
        time_limits=args.walltime,
        hyperparameter_tune=True,
        auto_stack=True,
        excluded_model_types=excluded_model_types,
    )
else:
    _, (X_test, y_test) = load_data(use_test=True)

    print("Convert arrays to DataFrame...")
    df_test = convert_to_dataframe(X_test, y_test)

    print("Loading models...")
Example #26
def test_tabularHPO():
    # Aggregate performance summaries obtained in previous benchmark run:
    prev_perf_vals = [dataset['performance_val'] for dataset in datasets]
    previous_avg_performance = np.mean(prev_perf_vals)
    previous_median_performance = np.median(prev_perf_vals)
    previous_worst_performance = np.max(prev_perf_vals)

    # Run benchmark:
    performance_vals = [0.0] * len(
        datasets)  # performance obtained in this run
    with warnings.catch_warnings(record=True) as caught_warnings:
        for idx in range(len(datasets)):
            seed(seed_val)
            np.random.seed(seed_val)
            mx.random.seed(seed_val)
            dataset = datasets[idx]
            print("Evaluating Benchmark Dataset %s (%d of %d)" %
                  (dataset['name'], idx + 1, len(datasets)))
            directory = dataset['name'] + "/"
            train_file_path = directory + train_file
            test_file_path = directory + test_file
            if (not os.path.exists(train_file_path)) or (
                    not os.path.exists(test_file_path)):
                # fetch files from s3:
                print("%s data not found locally, so fetching from %s" %
                      (dataset['name'], dataset['url']))
                os.system("wget " + dataset['url'] +
                          " -O temp.zip && unzip -o temp.zip && rm temp.zip")

            savedir = directory + 'AutogluonOutput/'
            shutil.rmtree(
                savedir, ignore_errors=True
            )  # Delete AutoGluon output directory to ensure previous runs' information has been removed.
            label_column = dataset['label_column']
            train_data = task.Dataset(file_path=train_file_path)
            test_data = task.Dataset(file_path=test_file_path)
            y_test = test_data[label_column]
            test_data = test_data.drop(labels=[label_column], axis=1)
            if fast_benchmark:
                train_data = train_data.head(
                    subsample_size)  # subsample for fast_benchmark
            predictor = None  # reset from last Dataset
            if fast_benchmark:
                predictor = task.fit(train_data=train_data,
                                     label=label_column,
                                     output_directory=savedir,
                                     hyperparameter_tune=hyperparameter_tune,
                                     hyperparameters=hyperparameters,
                                     time_limits=time_limits,
                                     num_trials=num_trials,
                                     verbosity=verbosity)
            else:
                predictor = task.fit(train_data=train_data,
                                     label=label_column,
                                     output_directory=savedir,
                                     hyperparameter_tune=hyperparameter_tune,
                                     verbosity=verbosity)
            results = predictor.fit_summary(verbosity=0)
            if predictor.problem_type != dataset['problem_type']:
                warnings.warn(
                    "For dataset %s: Autogluon inferred problem_type = %s, but should = %s"
                    % (dataset['name'], predictor.problem_type,
                       dataset['problem_type']))
            predictor = None  # We delete predictor here to test loading previously-trained predictor from file
            predictor = task.load(savedir)
            y_pred = predictor.predict(test_data)
            perf_dict = predictor.evaluate_predictions(y_true=y_test,
                                                       y_pred=y_pred,
                                                       auxiliary_metrics=True)
            if dataset['problem_type'] != REGRESSION:
                perf = 1.0 - perf_dict[
                    'accuracy_score']  # convert accuracy to error-rate
            else:
                perf = 1.0 - perf_dict[
                    'r2_score']  # unexplained variance score.
            performance_vals[idx] = perf
            print("Performance on dataset %s: %s   (previous perf=%s)" %
                  (dataset['name'], performance_vals[idx],
                   dataset['performance_val']))
            if (not fast_benchmark) and (
                    performance_vals[idx] >
                    dataset['performance_val'] * perf_threshold):
                warnings.warn(
                    "Performance on dataset %s is %s times worse than previous performance."
                    % (dataset['name'], performance_vals[idx] /
                       (EPS + dataset['performance_val'])))

    # Summarize:
    avg_perf = np.mean(performance_vals)
    median_perf = np.median(performance_vals)
    worst_perf = np.max(performance_vals)
    for idx in range(len(datasets)):
        print("Performance on dataset %s: %s   (previous perf=%s)" %
              (datasets[idx]['name'], performance_vals[idx],
               datasets[idx]['performance_val']))

    print("Average performance: %s" % avg_perf)
    print("Median performance: %s" % median_perf)
    print("Worst performance: %s" % worst_perf)

    if not fast_benchmark:
        if avg_perf > previous_avg_performance * perf_threshold:
            warnings.warn(
                "Average Performance is %s times worse than previously." %
                (avg_perf / (EPS + previous_avg_performance)))
        if median_perf > previous_median_performance * perf_threshold:
            warnings.warn(
                "Median Performance is %s times worse than previously." %
                (median_perf / (EPS + previous_median_performance)))
        if worst_perf > previous_worst_performance * perf_threshold:
            warnings.warn(
                "Worst Performance is %s times worse than previously." %
                (worst_perf / (EPS + previous_worst_performance)))

    # List all warnings again to make sure they are seen:
    print("\n\n WARNINGS:")
    for w in caught_warnings:
        warnings.warn(w.message)
"""
wide and deep test, follow code from autogluon
autogluon's NN architecture is based on wide and deep network
"""
from autogluon import TabularPrediction as task
from data_config.data_config import load_data, data_config

if __name__ == '__main__':
    res = {}
    for data_name in data_config.keys():
        ylabel = data_config[data_name]['ylabel']

        X_train, X_valid = load_data(data_name, combine_y=True)
        train_data = task.Dataset(df=X_train)
        test_data = task.Dataset(df=X_valid)
        savedir = f'{data_name}/'  # where to save trained models
        predictor = task.fit(
            train_data=train_data,
            label=ylabel,
            output_directory=savedir,
            eval_metric='roc_auc',
            verbosity=2,
            visualizer='tensorboard',
            random_seed=0,
            save_space=True,
            keep_only_best=True,
        )
        auc = predictor.evaluate(X_valid)
        res[data_name] = auc

    print(res)
Example #28
    # nn_options = {  # specifies non-default hyperparameter values for neural network models
    #     "num_epochs": 100,  # number of training epochs (controls training time of NN models)
    #     "learning_rate": ag.space.Real(
    #         0.001, 0.1, default=0.01, log=True
    #     ),  # learning rate used in training (real-valued hyperparameter searched on log-scale)
    #     "activation": ag.space.Categorical(
    #         None, swish, "relu", "tanh", "sigmoid"
    #     ),  # activation function used in NN (categorical hyperparameter, default = first entry)
    #     "layers": ag.space.Categorical(*(nunits for _ in range(10))),
    #     # Each choice for categorical hyperparameter 'layers' corresponds to list of sizes for each NN layer to use
    #     "dropout_prob": 0.0,
    # }
    # hyperparameters = {"NN": nn_options}

    predictor = task.fit(
        train_data=task.Dataset(df=df_train),
        # tuning_data=task.Dataset(df=df_valid),
        label="label",
        output_directory=output_dir,
        time_limits=args.walltime,
        hyperparameter_tune=False,
        auto_stack=True,
        excluded_model_types=excluded_model_types,
        dist_ip_addrs=ips,
    )
else:
    _, (X_test, y_test) = load_data(use_test=True)

    print("Convert arrays to DataFrame...")
    df_test = convert_to_dataframe(X_test, y_test)
Example #29
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/10/23 8:32
# @Author  : iszhang
# @Email   :
# @File    : ag_main.py
# @software: PyCharm

from autogluon import TabularPrediction as task
import sys

sys.path.append('C:/Users/ThinkPad/PycharmProjects/TabNet&AutoGluon/utils')
import utils.data_utils as data_utils

# pd.set_option('display.max_columns', None)
train_data = task.Dataset(file_path='../Data/5.Haberman/haberman.csv')
label_column = 'status'
dir = 'agModels-predictClass'  # specifies folder where to store trained models

# print(train_data.head(10))
# print(train_data.info())
# print(train_data.describe())

if __name__ == '__main__':
    # predictor = task.fit(train_data=train_data, label=label_column, output_directory=dir, time_limits=100)
    # results = predictor.fit_summary()
    # print("AutoGluon infers problem type is: ", predictor.problem_type)
    # print("AutoGluon identified the following types of features:")
    # print(predictor.feature_metadata)
    # # predictor.leaderboard(train_data, silent=True)
    # # print(results)
    pass  # all of the fitting code above is commented out
""" Example script for predicting columns of tables, demonstrating simple use-case """

from autogluon import TabularPrediction as task

# Training time:
train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500) # subsample for faster demo
print(train_data.head())
label_column = 'class' # specifies which column do we want to predict
savedir = 'ag_models/' # where to save trained models

predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir)
# NOTE: Default settings above are intended to ensure reasonable runtime at the cost of accuracy.
# To maximize predictive accuracy, do this instead:
#   predictor = task.fit(train_data=train_data, label=label_column, output_directory=savedir,
#                        presets='best_quality', eval_metric=YOUR_METRIC_NAME)
results = predictor.fit_summary()

# Inference time:
test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv') # another Pandas DataFrame
y_test = test_data[label_column]
test_data = test_data.drop(labels=[label_column],axis=1) # delete labels from test data since we wouldn't have them in practice
print(test_data.head())

predictor = task.load(savedir) # Unnecessary, we reload predictor just to demonstrate how to load previously-trained predictor from file
y_pred = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)