Code example #1
def test_mixed_column_type():
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sts/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sts/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]

    # Add more columns as features
    train_data = pd.DataFrame({'sentence1': train_data['sentence1'],
                               'sentence2': train_data['sentence2'],
                               'sentence3': train_data['sentence2'],
                               'categorical0': train_data['genre'],
                               'numerical0': train_data['score'],
                               'genre': train_data['genre'],
                               'score': train_data['score']})
    dev_data = pd.DataFrame({'sentence1': dev_data['sentence1'],
                             'sentence2': dev_data['sentence2'],
                             'sentence3': dev_data['sentence2'],
                             'categorical0': dev_data['genre'],
                             'numerical0': dev_data['score'],
                             'genre': dev_data['genre'],
                             'score': dev_data['score']})
    # Train Regression
    predictor1 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          label='score', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_score',
                          plot_results=False)
    dev_rmse = predictor1.evaluate(dev_data, metrics=['rmse'])
    dev_prediction = predictor1.predict(dev_data)

    # Train Classification
    predictor2 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          label='genre', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_genre',
                          plot_results=False)
    dev_rmse = predictor2.evaluate(dev_data, metrics=['acc'])
    dev_prediction = predictor2.predict(dev_data)

    # Specify the feature column
    predictor3 = task.fit(train_data,
                          hyperparameters=test_hyperparameters,
                          feature_columns=['sentence1', 'sentence3', 'categorical0'],
                          label='score', num_trials=1,
                          verbosity=4,
                          ngpus_per_trial=1,
                          output_directory='./sts_score',
                          plot_results=False)
    dev_rmse = predictor3.evaluate(dev_data, metrics=['rmse'])
    dev_prediction = predictor3.predict(dev_data)
Code example #2
def prepare_data(config, dataset):
    print('#################')
    print('Config:')
    print(config.__json__())
    print()
    print('Dataset:')
    print(dataset.__dict__)
    print('#################')

    metrics_mapping = dict(acc=metrics.accuracy,
                           auc=metrics.roc_auc,
                           f1=metrics.f1,
                           logloss=metrics.log_loss,
                           mae=metrics.mean_absolute_error,
                           mse=metrics.mean_squared_error,
                           r2=metrics.r2)

    perf_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if perf_metric is None:
        # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported.", config.metric)

    # un = dataset.train.path
    # print(un)
    # raw_data = loadarff(un)
    # df_data = pd.DataFrame(raw_data[0])

    X_train = dataset.train.X
    y_train = dataset.train.y
    X_test = dataset.test.X
    y_test = dataset.test.y

    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    # Save and load data to remove any pre-set dtypes, we want to observe performance from worst-case scenario: raw csv
    save_pd.save(path='tmp/tmp_file_train.csv', df=X_train)
    X_train = load_pd.load(path='tmp/tmp_file_train.csv')
    save_pd.save(path='tmp/tmp_file_test.csv', df=X_test)
    X_test = load_pd.load(path='tmp/tmp_file_test.csv')

    is_classification = config.type == 'classification'
    if is_classification:
        unique_vals = np.unique(y_train)
        if len(unique_vals) == 2:
            problem_type = BINARY
        else:
            problem_type = MULTICLASS
    else:
        problem_type = REGRESSION

    return X_train, y_train, X_test, y_test, problem_type, perf_metric
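
The CSV round-trip in prepare_data() above (save_pd.save followed by load_pd.load) is what strips any pre-set dtypes so that every framework starts from the worst-case raw CSV. A minimal sketch of the effect, using plain pandas and an in-memory buffer instead of the tmp files (an assumption for illustration, not part of the original code):

import io

import pandas as pd

df = pd.DataFrame({'genre': pd.Categorical(['news', 'fiction']), 'score': [1.0, 2.5]})
print(df.dtypes)        # genre: category, score: float64

buffer = io.StringIO()
df.to_csv(buffer, index=False)
buffer.seek(0)
reloaded = pd.read_csv(buffer)
print(reloaded.dtypes)  # genre: object, score: float64 -- the category dtype is gone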
Code example #3
def test_no_job_finished_raise():
    train_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                              'glue/sst/train.parquet')
    dev_data = load_pd.load('https://autogluon-text.s3-accelerate.amazonaws.com/'
                            'glue/sst/dev.parquet')
    with pytest.raises(RuntimeError):
        # Set a very small time limit so that no trial finishes, which should raise a RuntimeError
        predictor = task.fit(train_data, hyperparameters=test_hyperparameters,
                             label='label', num_trials=1,
                             ngpus_per_trial=0,
                             verbosity=4,
                             time_limits=10,
                             output_directory='./sst_raise',
                             plot_results=False)
Code example #4
def preprocess_openml_input(path,
                            framework_suffix=None,
                            framework_rename_dict=None,
                            folds_to_keep=None):
    raw_input = load_pd.load(path)
    raw_input = _rename_openml_columns(raw_input)
    if framework_rename_dict is not None:
        for key in framework_rename_dict.keys():
            raw_input[FRAMEWORK] = [
                framework_rename_dict[key] if framework == key else framework
                for framework in raw_input[FRAMEWORK]
            ]
    if framework_suffix is not None:
        raw_input[FRAMEWORK] = [
            framework + framework_suffix
            for framework in raw_input[FRAMEWORK]
        ]

    with_prob_type_input = _infer_problem_type(raw_input)
    fixed_input = _fix_results(with_prob_type_input)
    fixed_input[METRIC_ERROR] = [
        1 - result if ptype == BINARY else result for result, ptype in zip(
            fixed_input[METRIC_SCORE], fixed_input[PROBLEM_TYPE])
    ]
    cleaned_input = preprocess_utils.clean_result(fixed_input,
                                                  folds_to_keep=folds_to_keep,
                                                  remove_invalid=False)
    return cleaned_input
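
For reference, the METRIC_ERROR construction above assumes the recorded score for BINARY problems is a higher-is-better metric bounded by 1 (e.g. AUC), so it is converted via 1 - score, while other problem types are assumed to already report a loss. A small illustrative sketch with hypothetical values (column names and the constant value are placeholders):

import pandas as pd

BINARY = 'binary'  # assumed constant value
scores = pd.DataFrame({
    'metric_score': [0.93, 0.41, 5.20],                    # e.g. AUC, log loss, RMSE
    'problem_type': [BINARY, 'multiclass', 'regression'],
})
scores['metric_error'] = [
    1 - score if ptype == BINARY else score
    for score, ptype in zip(scores['metric_score'], scores['problem_type'])
]
print(scores)  # the binary row becomes 0.07; the loss-style rows pass through unchanged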
Code example #5
def test_sts():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/train.parquet')
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/sts/dev.parquet')
    rng_state = np.random.RandomState(123)
    train_perm = rng_state.permutation(len(train_data))
    valid_perm = rng_state.permutation(len(dev_data))
    train_data = train_data.iloc[train_perm[:100]]
    dev_data = dev_data.iloc[valid_perm[:10]]
    predictor = task.fit(train_data, hyperparameters=test_hyperparameters,
                         label='score', num_trials=1,
                         verbosity=4,
                         ngpus_per_trial=1,
                         output_directory='./sts',
                         plot_results=False)
    dev_rmse = predictor.evaluate(dev_data, metrics=['rmse'])
    dev_prediction = predictor.predict(dev_data)
Code example #6
def run():
    results_dir = 'data/results/'
    results_dir_input = results_dir + 'input/prepared/openml/'
    results_dir_output = results_dir + 'output/openml/ablation/'

    results_raw = load_pd.load(path=[
        results_dir_input + 'openml_core.csv', results_dir_input +
        'openml_autogluon_ablation.csv'
    ])

    frameworks_1h = [
        'autogluon_1h',
        'autogluon_nostack_1h',
        'autogluon_nobag_1h',
        'autogluon_norepeatbag_1h',
        'autogluon_nonn_1h',
        # 'autogluon_noknn_1h',
    ]

    frameworks_4h = [
        'autogluon_4h',
        'autogluon_nostack_4h',
        'autogluon_nobag_4h',
        'autogluon_norepeatbag_4h',
        'autogluon_nonn_4h',
        # 'autogluon_noknn_4h',
    ]

    run_path_prefix_list = ['1h/', '4h/', 'combined/']
    frameworks_compare_vs_all_list = [['autogluon_1h'], ['autogluon_4h'],
                                      ['autogluon_1h', 'autogluon_4h']]
    frameworks_run_list = [
        frameworks_1h, frameworks_4h, frameworks_1h + frameworks_4h
    ]
    folds_to_keep_list = [[0], [0], [0]]
    banned_datasets = []
    num_runs = len(run_path_prefix_list)
    for i in range(num_runs):
        run_path_prefix = run_path_prefix_list[i]
        frameworks_compare_vs_all = frameworks_compare_vs_all_list[i]
        frameworks_run = frameworks_run_list[i]
        folds_to_keep = folds_to_keep_list[i]

        results_ranked, results_ranked_by_dataset, results_ranked_all, results_ranked_by_dataset_all, results_pairs_merged_dict = evaluate_results.evaluate(
            results_raw=results_raw,
            frameworks=frameworks_run,
            banned_datasets=banned_datasets,
            folds_to_keep=folds_to_keep,
            columns_to_agg_extra=[
                # TIME_INFER_S,
                'acc',
                'auc',
                'logloss'
            ],
            frameworks_compare_vs_all=frameworks_compare_vs_all,
            output_dir=results_dir_output + run_path_prefix,
        )
Code example #7
def run_single_vs(results_dir_input, filename, col_name_comparison_str, framework_name_map=SYSTEM_NAMES):
    results_dir_output = results_dir_input + 'tex/'
    pairwise_vs_df = load_pd.load(results_dir_input + 'pairwise/' + filename + '.csv')
    textable_file = results_dir_output + 'pairwise/' + filename + ".tex"
    textab = generate_tex_pairwise_vs_table(pairwise_vs_df, col_name_comparison_str=col_name_comparison_str, framework_name_map=framework_name_map)
    os.makedirs(os.path.dirname(textable_file), exist_ok=True)
    with open(textable_file, 'w') as tf:
        tf.write(textab)
        print("saved tex table to: %s" % textable_file)
Code example #8
def run():
    results_dir = 'data/results/'
    results_dir_input = results_dir + 'input/prepared/openml/'
    results_dir_output = results_dir + 'output/openml/core_1h_vs_4h/'

    results_raw = load_pd.load(path=results_dir_input + 'openml_core.csv')

    frameworks = [
        'autogluon',
        'GCPTables',
        'H2OAutoML',
        'autosklearn',
        'TPOT',
        'AutoWEKA',
    ]

    folds_to_keep = [0]
    banned_datasets = []
    full_results_pairs_merged_dict = {}
    for framework in frameworks:
        run_path_prefix = framework + '/'
        framework_1h = framework + '_1h'
        framework_4h = framework + '_4h'

        results_ranked, results_ranked_by_dataset, results_ranked_all, results_ranked_by_dataset_all, results_pairs_merged_dict = evaluate_results.evaluate(
            results_raw=results_raw,
            frameworks=[framework_1h, framework_4h],
            banned_datasets=banned_datasets,
            folds_to_keep=folds_to_keep,
            columns_to_agg_extra=[
                # TIME_INFER_S,
                'acc',
                'auc',
                'logloss'
            ],
            frameworks_compare_vs_all=[framework_4h],
            output_dir=results_dir_output + run_path_prefix,
        )
        full_results_pairs_merged_dict.update(results_pairs_merged_dict)

    dfs = []
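    # For each framework, keep its 1h row from the 4h pairwise-comparison results and
    # relabel columns 1-3 (the vs-4h comparison columns) as '> 4h', '< 4h', '= 4h'.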
    for framework in frameworks:
        framework_1h = framework + '_1h'
        framework_4h = framework + '_4h'
        cur_df = full_results_pairs_merged_dict[framework_4h]
        cur_df = cur_df[cur_df[FRAMEWORK] == framework_1h]
        cur_columns = list(cur_df.columns)
        cur_columns[1] = '> 4h'
        cur_columns[2] = '< 4h'
        cur_columns[3] = '= 4h'
        cur_df.columns = cur_columns
        dfs.append(cur_df)
    df_final = pd.concat(dfs, ignore_index=True)
    print(df_final)
    save_pd.save(path=results_dir_output + 'pairwise/1h_vs_4h.csv',
                 df=df_final)
Code example #9
def run_single(results_dir_input, framework_compare_vs_all, drop_columns=None, framework_name_map=SYSTEM_NAMES, suffix=''):
    input_openml = results_dir_input + 'pairwise/' + framework_compare_vs_all + '.csv'
    results_dir_output = results_dir_input + 'tex/'
    pairwise_df = load_pd.load(input_openml)
    textab = generate_tex_pairwise_table(pairwise_df=pairwise_df, framework_compare_vs_all=framework_compare_vs_all, drop_columns=drop_columns, framework_name_map=framework_name_map)
    textable_file = results_dir_output + 'pairwise/' + framework_compare_vs_all + suffix + ".tex"
    os.makedirs(os.path.dirname(textable_file), exist_ok=True)
    with open(textable_file, 'w') as tf:
        tf.write(textab)
        print("saved tex table to: %s" % textable_file)
Code example #10
def test_mrpc():
    train_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/mrpc/train.parquet'
    )
    dev_data = load_pd.load(
        'https://autogluon-text.s3-accelerate.amazonaws.com/glue/mrpc/dev.parquet'
    )
    train_data = train_data.iloc[:100]
    dev_data = dev_data.iloc[:10]
    predictor = task.fit(train_data,
                         hyperparameters=test_hyperparameters,
                         label='label',
                         num_trials=1,
                         verbosity=4,
                         ngpus_per_trial=1,
                         output_directory='./mrpc',
                         plot_results=False)
    dev_acc = predictor.evaluate(dev_data, metrics=['acc'])
    dev_prediction = predictor.predict(dev_data)
    dev_pred_prob = predictor.predict_proba(dev_data)
Code example #11
def run():
    results_dir = 'data/results/'
    results_dir_input = results_dir + 'input/prepared/openml/'
    results_dir_output = results_dir + 'output/openml/core/'

    results_raw = load_pd.load(path=results_dir_input + 'openml_core.csv')

    frameworks_1h = [
        'autogluon_1h',
        'GCPTables_1h',
        'H2OAutoML_1h',
        'autosklearn_1h',
        'TPOT_1h',
        'AutoWEKA_1h',
    ]

    frameworks_4h = [
        'autogluon_4h',
        'GCPTables_4h',
        'H2OAutoML_4h',
        'autosklearn_4h',
        'TPOT_4h',
        'AutoWEKA_4h',
    ]

    run_path_prefix_list = ['1h/', '4h/']
    frameworks_compare_vs_all_list = [['autogluon_1h'], ['autogluon_4h']]
    frameworks_run_list = [frameworks_1h, frameworks_4h]
    folds_to_keep_list = [[0], [0]]
    banned_datasets = []
    num_runs = len(run_path_prefix_list)
    for i in range(num_runs):
        run_path_prefix = run_path_prefix_list[i]
        frameworks_compare_vs_all = frameworks_compare_vs_all_list[i]
        frameworks_run = frameworks_run_list[i]
        folds_to_keep = folds_to_keep_list[i]

        results_ranked, results_ranked_by_dataset, results_ranked_all, results_ranked_by_dataset_all, results_pairs_merged_dict = evaluate_results.evaluate(
            results_raw=results_raw,
            frameworks=frameworks_run,
            banned_datasets=banned_datasets,
            folds_to_keep=folds_to_keep,
            columns_to_agg_extra=[
                # TIME_INFER_S,
                'acc',
                'auc',
                'logloss'
            ],
            frameworks_compare_vs_all=frameworks_compare_vs_all,
            output_dir=results_dir_output + run_path_prefix,
        )
Code example #12
def run():
    results_dir = 'data/results/'
    results_dir_input = results_dir + 'output/'
    results_dir_output = results_dir + 'output/combined/4h/tables/'

    input_openml = results_dir_input + 'openml/core/4h/results_ranked_by_dataset_all.csv'
    input_kaggle = results_dir_input + 'kaggle/4h/results_ranked_by_dataset_all.csv'

    results_ranked_by_dataset_all = load_pd.load([input_openml, input_kaggle])
    print(results_ranked_by_dataset_all)

    result = generate_charts.compute_dataset_framework_df(results_ranked_by_dataset_all)
    print(result)

    save_pd.save(path=results_dir_output + 'dataset_x_framework.csv', df=result)
Code example #13
def aggregate(path_prefix: str, contains=None):
    bucket, prefix = s3_utils.s3_path_to_bucket_prefix(path_prefix)
    objects = list_bucket_prefix_suffix_contains_s3(
        bucket=bucket,
        prefix=prefix,
        suffix='scores/results.csv',
        contains=contains)
    print(objects)
    paths_full = [
        s3_utils.s3_bucket_prefix_to_path(bucket=bucket,
                                          prefix=file,
                                          version='s3') for file in objects
    ]
    print(paths_full)
    df = load_pd.load(paths_full)
    print(df)
    return df
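
A hypothetical usage sketch for aggregate(); the bucket, prefix, and contains filter below are placeholders rather than paths from the original benchmark:

# Collect every scores/results.csv found under one S3 prefix into a single DataFrame.
df = aggregate('s3://my-benchmark-bucket/runs/2020-01-15/', contains='autogluon')
df.to_csv('aggregated_results.csv', index=False)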
Code example #14
def run():
    results_dir = 'data/results/'
    results_dir_input = results_dir + 'input/prepared/openml/'
    results_dir_output = results_dir + 'output/openml/accuracy/'

    results_raw = load_pd.load(
        path=[
            results_dir_input + 'openml_core.csv',
            results_dir_input + 'openml_autopilot.csv'
        ],
        worker_count=1
    )

    valid_frameworks = [
        'autogluon_1h',
        'GCPTables_1h',
        'H2OAutoML_1h',
        'autosklearn_1h',
        'TPOT_1h',
        'AutoWEKA_1h',
        'AutoPilot_1h',
    ]

    results_raw[METRIC_SCORE] = results_raw['acc']
    results_raw[METRIC_ERROR] = 1 - results_raw[METRIC_SCORE]
    run_path_prefix = '1h/'

    banned_datasets = []

    folds_to_keep = [0]
    results_ranked, results_ranked_by_dataset, results_ranked_all, results_ranked_by_dataset_all, results_pairs_merged_dict = evaluate_results.evaluate(
        results_raw=results_raw,
        frameworks=valid_frameworks,
        banned_datasets=banned_datasets,
        folds_to_keep=folds_to_keep,
        columns_to_agg_extra=[
            # TIME_INFER_S,
            'acc',
        ],
        frameworks_compare_vs_all=['autogluon_1h', 'AutoPilot_1h'],
        output_dir=results_dir_output + run_path_prefix,
    )
Code example #15
def preprocess_kaggle_input(path,
                            framework_suffix=None,
                            framework_rename_dict=None):
    raw_input = load_pd.load(path)

    raw_input = _rename_kaggle_input(raw_input)
    raw_input[FOLD] = 0
    if METRIC_SCORE not in raw_input.columns:
        raw_input[METRIC_SCORE] = -raw_input[METRIC_ERROR]

    if framework_rename_dict is not None:
        for key in framework_rename_dict.keys():
            raw_input[FRAMEWORK] = [
                framework_rename_dict[key] if framework == key else framework
                for framework in raw_input[FRAMEWORK]
            ]
    if framework_suffix is not None:
        raw_input[FRAMEWORK] = [
            framework + framework_suffix
            for framework in raw_input[FRAMEWORK]
        ]
    cleaned_input = preprocess_utils.clean_result(raw_input, folds_to_keep=[0])
    return cleaned_input
Code example #16
def generate_tex_datasetXframework_table(results_dir_input, time_limit, method_order=None):
    """ # generate datasets x frameworks raw data dumps """
    results_dir_output = results_dir_input + 'tex/'
    results_raw = load_pd.load(
        path=[
            results_dir_input + 'results_ranked_by_dataset_all.csv',
        ]
    )
    if method_order is None:
        method_order = ['AutoWEKA', 'autosklearn', 'TPOT', 'H2OAutoML', 'GCPTables', 'autogluon']
    metric_error_df = generate_charts.compute_dataset_framework_df(results_raw)
    print("metric_error_df:")
    print(metric_error_df.head())
    metric_error_df[DATASET] = pd.Series([x[:17] for x in list(metric_error_df[DATASET])])
    df_ordered = metric_error_df.set_index(DATASET)
    df_ordered = df_ordered[[meth + "_" + time_limit for meth in method_order]].copy()
    df_ordered.rename(columns={'dataset': 'Dataset'}, inplace=True)
    df_ordered.rename(columns=NOTIME_NAMES, inplace=True)
    # save_pd.save(path=results_dir_output + "openml_datasetsXframeworks_" + time_limit + ".csv", df=df_ordered)
    textable_file = results_dir_output + "openml_alllosses_" + time_limit + ".tex"
    if not os.path.exists(results_dir_output):
        os.makedirs(results_dir_output)

    tex_table.tex_table(df_ordered, textable_file, bold='min', nan_char=" x ", max_digits=5)
Code example #17
def get_predictions(problem_type, weka_file, class_prefix, labels_are_int, eval_metric):
    # Load predictions:
    if not os.path.exists(weka_file):
        raise ValueError("AutoWEKA failed producing any prediction.")
    
    if problem_type in [BINARY, MULTICLASS]: # Load classification predictions:
        # class_labels = sorted(list(set(labels_train)))
        # class_order = [''] * len(class_labels) # will contain ordering of classes
        # remaining_classes = class_labels[:] # classes whose index we don't know yet
        with open(weka_file, 'r') as weka_file_io:
            probabilities = []
            predictions = []
            truth = []
            for line in weka_file_io.readlines()[1:-1]:
                inst, actual, predicted, error, *distribution = line.split(',')
                pred_probabilities = [float(pred_probability.replace('*', '').replace('\n', '')) for pred_probability in distribution]
                _, pred = predicted.split(':')
                _, tru = actual.split(':')
                pred = pred[pred.startswith(class_prefix) and len(class_prefix):]  # strip class_prefix from the label if present
                if labels_are_int:
                    pred = int(pred)
                probabilities.append(pred_probabilities)
                predictions.append(pred)
                truth.append(tru)
                class_index = np.argmax(pred_probabilities)
                """ # Old code to compute class order:
                if pred in remaining_classes:
                    remaining_classes.remove(pred)
                    class_order[class_index] = pred
                elif class_order[class_index] != pred:
                    raise ValueError("Class ordering cannot be determined due to ordering error")
                """
        """ # Old code to compute class order:
        if len(remaining_classes) > 1:
            raise ValueError("Class ordering cannot be determined because not all classes were predicted")
        elif len(remaining_classes) == 1:
            if '' not in class_order:
                raise ValueError("Class ordering cannot be determined due to error in remaining_classes")
            else:
                remain_idx = class_order.index('')
                class_order[remain_idx] = remaining_classes[0]
        """
        y_pred = pd.Series(predictions)
        y_prob = np.array(probabilities).astype('float')
        if eval_metric == 'log_loss': # ensure there are no probabilities = 0 which may cause infinite loss.
            EPS = 1e-8
            for i in range(len(y_prob)):
                prob_i = y_prob[i]
                extra_prob = 0.0 # additional probability mass.
                for j in range(len(prob_i)):
                    if prob_i[j] == 0.0:
                        prob_i[j] = EPS
                        extra_prob += EPS
                while extra_prob > 0:
                    ind = np.argmax(prob_i)
                    ind_prob = prob_i[ind]
                    if ind_prob > extra_prob:
                        prob_i[ind] = ind_prob - extra_prob
                        extra_prob = 0
                    else:
                        prob_i[ind] = ind_prob - EPS
                        extra_prob -= EPS
        
        y_probsums = np.sum(y_prob, axis=1)
        y_prob = y_prob / y_probsums[:,None] # ensure all probs sum to 1
    elif problem_type == REGRESSION: # Load regression predictions:
        pred_df = load_pd.load(weka_file)
        y_pred = pred_df['predicted']
        y_prob = None
        # class_order = None
    else:
        raise ValueError("Unknown problem_type specified")
    return (y_pred, y_prob)
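
A hypothetical call to get_predictions() illustrating the classification branch; the file path, class prefix, and metric below are placeholders:

# Parse AutoWEKA's CSV prediction dump into a label series plus a renormalized probability matrix.
y_pred, y_prob = get_predictions(
    problem_type=MULTICLASS,
    weka_file='autoweka_output/predictions.csv',  # placeholder path
    class_prefix='class_',                        # placeholder label prefix
    labels_are_int=False,
    eval_metric='log_loss',                       # triggers the zero-probability fix above
)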
Code example #18
def run():
    results_dir = 'data/results/'
    results_dir_input = results_dir + 'input/prepared/kaggle/'
    output_prefix = 'output/kaggle/'
    raw_kaggle_file = 'results_kaggle_wpercentile.csv'

    results_raw = load_pd.load(path=[
        results_dir_input + 'kaggle_core.csv',
    ])
    # First generate datasets x frameworks raw data dumps:
    metrics = ['LEADER_PERCENTILE', METRIC_SCORE]
    dataset_order = [
        'house-prices-advanced-regression-techniques',
        'mercedes-benz-greener-manufacturing',
        'santander-value-prediction-challenge', 'allstate-claims-severity',
        'bnp-paribas-cardif-claims-management',
        'santander-customer-transaction-prediction',
        'santander-customer-satisfaction',
        'porto-seguro-safe-driver-prediction', 'ieee-fraud-detection',
        'walmart-recruiting-trip-type-classification',
        'otto-group-product-classification-challenge'
    ]
    dataset_order = [KAGGLE_ABBREVS[dat] for dat in dataset_order]
    method_order = [
        'AutoWEKA', 'autosklearn', 'TPOT', 'H2OAutoML', 'GCPTables',
        'autogluon'
    ]
    time_limits = ['4h', '8h']
    results_raw2 = results_raw.drop(METRIC_ERROR, axis=1).copy()
    results_raw2['LEADER_PERCENTILE'] = 1 - results_raw2[
        'LEADER_PERCENTILE']  # convert to actual percentile
    results_raw2.rename(columns={'LEADER_PERCENTILE': METRIC_ERROR},
                        inplace=True)

    # loss_df = generate_charts.compute_dataset_framework_df(results_raw) # values = losses
    percentile_df = generate_charts.compute_dataset_framework_df(results_raw2)
    for time_limit in time_limits:
        methods_t = [meth + "_" + time_limit for meth in method_order]
        df_time = percentile_df[[DATASET] + methods_t].copy()
        df_time[DATASET] = df_time[DATASET].map(KAGGLE_ABBREVS)
        df_ordered = df_time.set_index(DATASET)
        df_ordered = df_ordered.reindex(dataset_order)
        # df_ordered.reset_index(inplace=True)
        # df_ordered.rename(columns={'dataset': 'Dataset'},inplace=True)
        df_ordered.rename(columns=NOTIME_NAMES, inplace=True)
        save_pd.save(path=results_dir + output_prefix + time_limit +
                     "/datasetsXframeworks.csv",
                     df=df_ordered)
        textable_file = results_dir + output_prefix + time_limit + "/allpercentiles.tex"
        tex_table.tex_table(df_ordered,
                            textable_file,
                            bold='max',
                            nan_char=" x ",
                            max_digits=5)

    # Next do pairwise comparisons:
    num_frameworks = 6
    valid_frameworks = [
        'autogluon_4h',
        'GCPTables_4h',
        'autosklearn_4h',
        'H2OAutoML_4h',
        'TPOT_4h',
        'AutoWEKA_4h',
        'autogluon_8h',
        'GCPTables_8h',
        'H2OAutoML_8h',
        'autosklearn_8h',
        'TPOT_8h',
        'AutoWEKA_8h',
    ]

    frameworks_compare_vs_all_list = [
        'autogluon_4h', 'autogluon_8h', 'autogluon_4h', 'autogluon_8h'
    ]
    results_dir_output_list = [
        '4h/', '8h/', 'allVautogluon_4h/', 'allVautogluon_8h/'
    ]
    results_dir_output_list = [
        results_dir + output_prefix + name for name in results_dir_output_list
    ]
    framework_compare_ind_list = [  # list of lists, each corresponding to indices of valid_frameworks that should be compared in a single table.
        list(range(num_frameworks)),
        list(range(num_frameworks, num_frameworks * 2)),
        range(num_frameworks * 2),
        range(num_frameworks * 2),
    ]

    for i in range(len(results_dir_output_list)):
        results_dir_output = results_dir_output_list[i]
        frameworks_to_compare = [
            valid_frameworks[j] for j in framework_compare_ind_list[i]
        ]
        framework_compare_vs_all = frameworks_compare_vs_all_list[i]
        results_ranked, results_ranked_by_dataset, results_ranked_all, results_ranked_by_dataset_all, results_pairs_merged_dict = evaluate_results.evaluate(
            results_raw=results_raw,
            frameworks=frameworks_to_compare,
            banned_datasets=[],
            folds_to_keep=None,
            frameworks_compare_vs_all=[framework_compare_vs_all],
            output_dir=results_dir_output,
            columns_to_agg_extra=['LEADER_PERCENTILE'],
        )
        textab = tex_pairwise_table(results_dir_output,
                                    framework_compare_vs_all)

    # Generate plots:
    producePlots(time_limits, results_dir, raw_kaggle_file)
Code example #19
# Set arguments:
output_directory = 'autosklearn_models/'  # where to save trained models
train_file = 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/train.csv'
test_file = 'https://autogluon.s3-us-west-2.amazonaws.com/datasets/Inc/test.csv'
predict_proba = False
pred_class_and_proba = True
runtime_sec = 120
num_cores = None

# Specify prediction problem:
label_column = 'class'  # specifies which column we want to predict
problem_type = BINARY
eval_metric = 'roc_auc'

# Load data:
train_data = load_pd.load(
    train_file)  # can be local CSV file as well, returns Pandas DataFrame
train_data = train_data.head(500)  # subsample for faster demo
print(train_data.head())

test_data = load_pd.load(
    test_file)  # can be local CSV file as well, returns Pandas DataFrame
y_test = test_data[label_column]
test_data.drop(
    [label_column], axis=1, inplace=True
)  # If you do not remove test-data labels, then predictAutoSklearn() may return fewer predictions than datapoints (preprocessing filters out rows with badly-formatted labels)

# Run auto-sklearn:
autosk = AutoSklearnBaseline()

num_models_trained, num_models_ensemble, fit_time = autosk.fit(
    train_data=train_data,
Code example #20
def gcptables_fit_predict(train_data,
                          test_data,
                          dataset_name,
                          label_column,
                          problem_type,
                          output_directory,
                          gcp_info,
                          eval_metric=None,
                          runtime_sec=3600,
                          fit_model=True,
                          model_name=None,
                          make_predictions=True):
    """ Use GCP AutoML tables for both fitting and prediction. 
        Returns all outputs of AbstractBaseline.fit(), AbstractBaseline.predict() as one big tuple, with one final element: class_order
        Also takes in the same arguments as these methods, except for num_cores.
        Other Args:
            dataset_name: str Name
                GCP data and outputs will be stored in GCS Storage Bucket under this name, should be unique for every GCP run on a new dataset.
            gcp_info: dict of critical information regarding GCP configuration, project, and access keys.
            fit_model: bool indicating whether or not to actually fit models using GCP AutoML Tables.
                If a previous run of this function crashed after the model had been trained, you can just produce predictions by setting:
                fit_model = False. Similarly, you can set this False in order to get predictions in a separate process from the fit() call.
                When False, you must specify: model_name as the string corresponding to the model.name entry from previous fit(),
                but without the project/path prefix (this thus matches the display name of the model in the GCP console).
            make_predictions: bool indicating whether or not we should return after fit() without making predictions.
    
        Note: For classification, your class labels cannot end with suffix: '_score'
    """

    train_data = train_data.copy()
    test_data = test_data.copy()

    # Reformat column names to only contain alphanumeric characters:
    label_column_index = train_data.columns.get_loc(label_column)
    train_data.columns = [
        re.sub(r'\W+', '_', col) for col in train_data.columns.tolist()
    ]  # ensure alphanumeric-only column-names
    test_data.columns = [
        re.sub(r'\W+', '_', col) for col in test_data.columns.tolist()
    ]  # ensure alphanumeric-only column-names
    label_column = train_data.columns[
        label_column_index]  # re-assign as it may have changed
    train_data[id_column] = list(train_data.index)
    test_data[id_column] = list(test_data.index)
    data_colnames = list(set(train_data.columns))

    # Drop test labels if they exist:
    if label_column in test_data.columns:
        test_data = test_data.drop([label_column], axis=1)

    og_dataset_name = dataset_name
    dataset_name = re.sub(
        r'\W+', '_', dataset_name)  # Ensure GCP will not complain about names
    dataset_name = dataset_name[:(GCP_DISPLAY_NAME_MAXCHARS -
                                  len(GCP_MODEL_PREFIX))]
    if model_name is None:
        model_display_name = GCP_MODEL_PREFIX + dataset_name
    else:
        model_display_name = model_name
    if og_dataset_name != dataset_name:
        print("GCP will complain about provided dataset_name, renamed to: %s" %
              dataset_name)

    PROJECT_ID = gcp_info['PROJECT_ID']
    BUCKET_NAME = gcp_info['BUCKET_NAME']
    COMPUTE_REGION = gcp_info['COMPUTE_REGION']
    GOOGLE_APPLICATION_CREDENTIALS = gcp_info['GOOGLE_APPLICATION_CREDENTIALS']
    num_models_trained = None
    num_models_ensemble = None
    fit_time = None
    y_pred = None
    y_prob = None
    predict_time = None
    class_order = None
    if len(train_data) < 1000:
        raise ValueError(
            "GCP AutoML tables can only be trained on datasets with >= 1000 rows"
        )

    # Create GCP clients:
    storage_client = storage.Client.from_service_account_json(
        GOOGLE_APPLICATION_CREDENTIALS)
    bucket = storage_client.get_bucket(BUCKET_NAME)
    credentials = service_account.Credentials.from_service_account_file(
        GOOGLE_APPLICATION_CREDENTIALS)
    automl_client = automl.AutoMlClient(credentials=credentials)
    tables_client = automl.TablesClient(project=PROJECT_ID,
                                        region=COMPUTE_REGION,
                                        credentials=credentials)

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    #  Upload training data to GCS:
    gcs_train_path = dataset_name + "/" + GCS_TRAIN_FILENAME  # target file-name
    train_file_exists = storage.Blob(
        bucket=bucket, name=gcs_train_path).exists(storage_client)
    if not train_file_exists:
        print('Uploading training data')
        train_file_path = output_directory + GCS_TRAIN_FILENAME
        train_data.to_csv(
            train_file_path,
            index=False)  # write reformatted train-data to CSV file.
        # Upload to GCS:
        blob = bucket.blob(gcs_train_path)
        blob.upload_from_filename(train_file_path)
    else:  # need to rename columns anyway to process predictions.
        print('Training data already uploaded')

    # Upload test data:
    gcs_test_path = dataset_name + "/" + GCS_TEST_FILENAME  # target file-name
    test_file_exists = storage.Blob(bucket=bucket,
                                    name=gcs_test_path).exists(storage_client)
    if not test_file_exists:
        print('Uploading test data')
        test_file_path = output_directory + GCS_TEST_FILENAME
        test_data.to_csv(
            test_file_path,
            index=False)  # write reformatted test-data to CSV file.
        # Upload to GCS:
        blob = bucket.blob(gcs_test_path)
        blob.upload_from_filename(test_file_path)
    else:
        print('Test data already uploaded')

    if not train_file_exists:
        os.remove(train_file_path)

    if not test_file_exists:
        os.remove(test_file_path)

    # print("train_data.columns", train_data.columns)
    # print("test_data.columns", test_data.columns) # TODO remove

    # Use AutoML-Tables to fit models with training data:
    dataset = tables_client.create_dataset(dataset_display_name=dataset_name)
    tables_dataset_name = dataset.name
    import_data_response = tables_client.import_data(
        dataset=dataset,
        gcs_input_uris=GCS_PREFIX + BUCKET_NAME + "/" + gcs_train_path)
    print('Dataset import operation: {}'.format(
        import_data_response.operation))
    print('Dataset import response: {}'.format(import_data_response.result())
          )  # print ensures block until dataset has been uploaded.
    list_table_specs_response = tables_client.list_table_specs(dataset=dataset)
    table_specs = [s for s in list_table_specs_response]
    print(table_specs)
    # list_column_specs_response = tables_client.list_column_specs(dataset=dataset)
    # column_specs = [s for s in list_column_specs_response]
    # label_spec = [column_specs[i] for i in range(len(column_specs)) if column_specs[i].display_name == label_column]
    # print(label_spec[0])

    # Set label column:
    if problem_type in [BINARY, MULTICLASS]:
        type_code = 'CATEGORY'
        update_column_response = tables_client.update_column_spec(
            dataset=dataset,
            column_spec_display_name=label_column,
            type_code=type_code,
            nullable=False,
        )  # ensure label_column is categorical
        print(update_column_response)

    update_dataset_response = tables_client.set_target_column(
        dataset=dataset,
        column_spec_display_name=label_column,
    )
    print(update_dataset_response)

    # Fit AutoML Tables:
    gcp_metric = None  # Metric passed to GCP as optimization_objective argument
    if fit_model:
        if eval_metric is not None:
            metrics_map = {  # Mapping of benchmark metrics to GCP AutoML Tables metrics: https://googleapis.dev/python/automl/latest/gapic/v1beta1/types.html
                'accuracy': 'MINIMIZE_LOG_LOSS',
                'f1': 'MAXIMIZE_AU_PRC',
                'log_loss': 'MINIMIZE_LOG_LOSS',
                'roc_auc': 'MAXIMIZE_AU_ROC',
                'balanced_accuracy': 'MAXIMIZE_AU_ROC',
                'precision': 'MAXIMIZE_PRECISION_AT_RECALL',
                'recall': 'MAXIMIZE_RECALL_AT_PRECISION',
                'mean_squared_error': 'MINIMIZE_RMSE',
                'median_absolute_error': 'MINIMIZE_MAE',
                'mean_absolute_error': 'MINIMIZE_MAE',
                'r2': 'MINIMIZE_RMSE',
            }
            if eval_metric in metrics_map:
                gcp_metric = metrics_map[eval_metric]
            else:
                warnings.warn(
                    "Unknown metric will not be used by GCP AutoML Tables: %s" %
                    eval_metric)
        t0 = time.time()
        model_train_hours = math.ceil(runtime_sec / 3600.)
        print('Training model for %s hours' % model_train_hours)
        print('Training model with name: %s' % model_display_name)
        # TODO FIXME TODO FIXME:
        #  exclude_column_spec_names (Optional[str]) – The list of the names of the columns you want to exclude and not train your model on.
        # FIXME: ADD AN ID COLUMN
        create_model_response = tables_client.create_model(
            model_display_name=model_display_name,
            dataset=dataset,
            train_budget_milli_node_hours=model_train_hours * 1000,
            optimization_objective=gcp_metric,
            exclude_column_spec_names=[id_column, label_column],
        )
        operation_id = create_model_response.operation.name
        print('Create GCP model operation: {}'.format(
            create_model_response.operation))
        check_interval = 60  # check for model status updates every check_interval seconds
        keep_checking = True
        check_time = time.time()
        while keep_checking:  # and time.time() - t0 <= runtime_sec: # check on current model status
            if time.time() - check_time > check_interval:
                api = operations_v1.OperationsClient(
                    channel=automl_client.transport.channel)
                status_update = api.get_operation(operation_id)
                print(
                    "Status update on GCP model: \n {}".format(status_update))
                print('Time Elapsed: %s of %s' %
                      ((time.time() - t0), runtime_sec))
                check_time = time.time()
                if hasattr(status_update, 'done') and status_update.done:
                    keep_checking = False

        # Waits until model training is done:
        model = create_model_response.result()
        model_name = model.name
        print("GCP training completed, produced model object with name: %s" %
              model_name)
        print(
            "You can use this trained model for batch prediction by specifying model_name=%s"
            % model_display_name)
        print(model)
        t1 = time.time()
        fit_time = t1 - t0
        print("GCP Tables Model fit complete, runtime: %s" % fit_time)
        print("GCP model name = %s" % model_name)
    else:  # skip model fitting
        fit_time = None
        print(
            "Skipping GCP Tables Model fit, just using trained model for prediction"
        )
        if model_name is None:
            raise ValueError(
                "When fit_model=False, model_name must be specified.")
        model = tables_client.get_model(model_display_name=model_name)

    # Automatically-generated held-out validation performance estimates:
    num_models_trained = -1
    num_models_ensemble = -1
    summary_list = tables_client.list_model_evaluations(model=model)
    model_eval_summaries = [s for s in summary_list]
    if problem_type in [BINARY, MULTICLASS]:
        log_losses = [
            model_eval_summaries[i +
                                 1].classification_evaluation_metrics.log_loss
            for i in range(len(model_eval_summaries) - 1)
        ]
        log_loss = np.mean(np.array(log_losses))
        print("Validation log_loss = %s" % log_loss)

    if problem_type == BINARY:
        auc_rocs = [
            model_eval_summaries[i +
                                 1].classification_evaluation_metrics.au_roc
            for i in range(len(model_eval_summaries) - 1)
        ]
        auc_roc = np.mean(np.array(auc_rocs))
        print("Validation AUC_ROC = %s" % auc_roc)

    if not make_predictions:
        print(
            "Skipping predictions, set model_name = %s to use this trained model for prediction later on"
            % model_name)
        return num_models_trained, num_models_ensemble, fit_time, y_pred, y_prob, predict_time, class_order

    # Predict (using batch inference, so no need to deploy model):
    t2 = time.time()
    preds_file_prefix = GCS_PREFIX + BUCKET_NAME + "/" + dataset_name + "/pred"
    batch_predict_response = tables_client.batch_predict(
        model=model,
        gcs_input_uris=GCS_PREFIX + BUCKET_NAME + "/" + gcs_test_path,
        gcs_output_uri_prefix=preds_file_prefix)
    print('Batch prediction operation: {}'.format(
        batch_predict_response.operation))

    # Wait until batch prediction is done.
    batch_predict_result = batch_predict_response.result()
    print(batch_predict_response.metadata)
    t3 = time.time()
    predict_time = t3 - t2

    # Fetch predictions from GCS bucket to local file:
    preds_gcs_folder = batch_predict_response.metadata.batch_predict_details.output_info.gcs_output_directory  # full path to GCS file containing predictions
    preds_gcs_filename = 'tables_1.csv'  # default file name created by GCP Tables.
    preds_gcs_file = preds_gcs_folder + "/" + preds_gcs_filename
    local_preds_file = output_directory + LOCAL_PREDS_FILENAME

    with open(local_preds_file, 'wb') as file_obj:
        storage_client.download_blob_to_file(preds_gcs_file, file_obj)

    # Load predictions into python and format:
    test_pred_df = load_pd.load(local_preds_file)
    same_cols = [col for col in test_pred_df.columns if col in data_colnames]
    keep_cols = [
        col for col in test_pred_df.columns if col not in data_colnames
    ]
    original_gcp_length = len(test_pred_df)
    original_test_length = len(test_data)
    print('test orig:')
    print(test_data)
    print('before dedupe...')
    print(test_pred_df)
    test_pred_df = test_pred_df.drop_duplicates(subset=[
        id_column
    ])  # drop any duplicate rows in predictions before join
    print('before merge...')
    print(test_pred_df)
    test_pred_df = test_data.merge(
        test_pred_df, on=[id_column],
        how='left')  # un-shuffle the predictions so order matches test data.
    print('after merge...')
    print(test_pred_df)
    test_pred_df = test_pred_df[keep_cols]
    if len(test_pred_df) != len(test_data):
        warnings.warn(
            "GCP failed to produce predictions for some test data rows")
        print('diff: %s | %s' % (len(test_pred_df), len(test_data)))
        print('DIFF ORIGINAL:')
        print(original_test_length)
        print(original_gcp_length)

    if problem_type != REGRESSION:
        gcp_classes = list(test_pred_df.columns)
        og_classes = list(train_data[label_column].unique())

        print('Num Classes orig:', len(og_classes))
        print('Num Classes GCP: ', len(gcp_classes))
        print('GCP Class Names                 : ', gcp_classes)
        print('Original Class Names            : ', og_classes)
        orig_colnames = [
            column[(len(label_column) + 1):-len('_score')]
            for column in gcp_classes
        ]
        print('Original Class Names (Reordered): ', orig_colnames)

        if len(gcp_classes) != len(og_classes):
            warnings.warn("GCP AutoML Tables predictions are missing classes")
            raise AssertionError(
                'GCP AutoML did not predict with all classes! GCP returned %s of %s classes!'
                % (len(gcp_classes), len(og_classes)))

        test_pred_df.columns = orig_colnames
    else:
        test_pred_df.columns = [label_column]

    if test_pred_df.isnull().values.any(
    ):  # Some missing predictions exist that need to be imputed.
        test_pred_df = impute_dummy_predictor(test_pred_df=test_pred_df,
                                              train_data=train_data,
                                              label_column=label_column,
                                              problem_type=problem_type)

    if problem_type == REGRESSION:
        if len(keep_cols) != 1:
            warnings.warn(
                "GCP AutoML Tables regression predictions are incorrectly formatted"
            )
            print('keep_cols:', keep_cols)
            raise AssertionError(
                'GCP AutoML did not return a valid regression prediction! GCP returned %s of %s classes!'
                % (len(keep_cols), 1))
        y_pred = test_pred_df[label_column]
        y_prob = None
        return num_models_trained, num_models_ensemble, fit_time, y_pred, y_prob, predict_time, class_order
    else:
        y_pred = test_pred_df.idxmax(axis=1)
        class_order = list(test_pred_df.columns)
        y_prob = np.array(test_pred_df)
        return num_models_trained, num_models_ensemble, fit_time, y_pred, y_prob, predict_time, class_order
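
A hypothetical sketch of the two-phase workflow described in the docstring: if a previous run crashed after training finished, predictions can be produced without refitting by passing fit_model=False together with the model display name printed by the earlier fit. Every argument value below is a placeholder:

outputs = gcptables_fit_predict(
    train_data=train_data,                      # DataFrames prepared by the caller
    test_data=test_data,
    dataset_name='adult_income_run1',           # placeholder dataset name
    label_column='class',
    problem_type=BINARY,
    output_directory='gcp_outputs/',
    gcp_info=gcp_info,                          # dict with PROJECT_ID, BUCKET_NAME, COMPUTE_REGION, GOOGLE_APPLICATION_CREDENTIALS
    eval_metric='roc_auc',
    runtime_sec=4 * 3600,
    fit_model=False,                            # reuse the already-trained model
    model_name='gcp_model_adult_income_run1',   # display name printed by the earlier fit
)
num_models_trained, num_models_ensemble, fit_time, y_pred, y_prob, predict_time, class_order = outputs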
Code example #21
def run():
    results_dir = 'data/results/'
    results_dir_input = results_dir + 'input/prepared/openml/'
    results_dir_output = results_dir + 'output/openml/orig_vs_core10fold/'

    results_raw = load_pd.load(path=[
        results_dir_input + 'openml_core.csv',
        results_dir_input + 'openml_original.csv',
    ])

    frameworks_1h = [
        'H2OAutoML_1h',
        'autosklearn_1h',
        'TPOT_1h',
        'AutoWEKA_1h',
    ]

    frameworks_4h = [
        'H2OAutoML_4h',
        'autosklearn_4h',
        'TPOT_4h',
        'AutoWEKA_4h',
    ]

    frameworks_run_list = [frameworks_1h, frameworks_4h]
    folds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    folds_to_keep_list = [folds, folds]
    banned_datasets_list = [DATASETS_LARGE, []]
    num_runs = len(frameworks_run_list)
    full_results_pairs_merged_dict = {}
    for i in range(num_runs):
        frameworks_run = frameworks_run_list[i]
        folds_to_keep = folds_to_keep_list[i]
        banned_datasets = banned_datasets_list[i]

        for framework in frameworks_run:
            run_path_prefix = framework + '/'
            orig_framework = 'orig_' + framework

            results_ranked, results_ranked_by_dataset, results_ranked_all, results_ranked_by_dataset_all, results_pairs_merged_dict = evaluate_results.evaluate(
                results_raw=results_raw,
                frameworks=[framework, orig_framework],
                banned_datasets=banned_datasets,
                folds_to_keep=folds_to_keep,
                columns_to_agg_extra=[
                    # TIME_INFER_S,
                    'acc',
                    'auc',
                    'logloss'
                ],
                frameworks_compare_vs_all=[orig_framework],
                output_dir=results_dir_output + run_path_prefix,
            )
            full_results_pairs_merged_dict.update(results_pairs_merged_dict)

    dfs = []
    frameworks_full = frameworks_1h + frameworks_4h
    for framework in frameworks_full:
        orig_framework = 'orig_' + framework
        cur_df = full_results_pairs_merged_dict[orig_framework]
        cur_df = cur_df[cur_df[FRAMEWORK] == framework]
        cur_columns = list(cur_df.columns)
        cur_columns[1] = '> Original'
        cur_columns[2] = '< Original'
        cur_columns[3] = '= Original'
        cur_df.columns = cur_columns
        dfs.append(cur_df)
    df_final = pd.concat(dfs, ignore_index=True)
    print(df_final)
    save_pd.save(path=results_dir_output + 'pairwise/new_vs_old.csv', df=df_final)