Example #1
def test_estimator():
    import os

    import numpy as np

    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    # `args` is the argparse namespace parsed by the enclosing script.
    rep_num = args.rep
    run_count = args.run_count
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        os.makedirs(result_dir, exist_ok=True)

        task_format = dataset + '_est_%d'
        X, y, _ = load_data(dataset)
        dm = DataManager(X, y)
        # np.random.random_integers was removed in recent NumPy; use randint.
        seed = np.random.randint(MAX_INT)
        for optimizer in ['smbo']:
            cls = Classifier(include_models=['gradient_boosting'],
                             optimizer=optimizer,
                             seed=seed).fit(dm,
                                            metric='accuracy',
                                            runcount=run_count,
                                            task_name=task_format)
            print(cls.predict(X))
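
Example #1 reads its settings from an argparse namespace `args` defined outside the function. Below is a minimal sketch of a parser that would satisfy those attribute accesses; the argument names are taken from the code above, but the defaults are placeholders, not the original script's values.

import argparse

# Hypothetical parser matching the attributes the example reads
# (args.rep, args.run_count, args.datasets); defaults are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument('--rep', type=int, default=5)
parser.add_argument('--run_count', type=int, default=200)
parser.add_argument('--datasets', type=str, default='iris')
args = parser.parse_args()
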
Example #2
def test_hyperspace():
    import numpy as np

    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        # `datasets`, `start_run`, `rep_num` and `run_count` are module-level
        # settings of the enclosing script.
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                seed = np.random.randint(MAX_INT)

                for update_mode in [2, 3]:
                    task_format = dataset + '_mode_%d_%d' % (update_mode,
                                                             run_id)
                    cls = Classifier(optimizer='ts_smbo',
                                     seed=seed).fit(dm,
                                                    metric='accuracy',
                                                    runcount=run_count,
                                                    task_name=task_format,
                                                    update_mode=update_mode)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
Example #3
def test_cash_module():
    import pandas as pd

    from alphaml.estimators.classifier import Classifier

    df = pd.read_csv("data/cls_data/santander/train.csv")
    df = df.drop(columns=["ID"])
    cls = Classifier(include_models=['xgboost', 'random_forest', 'decision_tree'],
                     optimizer='baseline',
                     ensemble_method='ensemble_selection',
                     ensemble_size=30,
                     ).fit(df, metric='auc', runcount=1)
    df = pd.read_csv("data/cls_data/santander/test.csv")
    # Keep the raw values (including the ID column) for the submission file.
    data = df.values
    df = df.drop(columns=["ID"])
    pred2 = cls.predict(df)
    print(pred2)

    import csv
    with open('data/cls_data/santander/submission.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ID', 'TARGET'])
        for i in range(len(pred2)):
            line = [int(data[i, 0]), pred2[i]]
            writer.writerow(line)
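
The csv.writer block can equivalently be written with pandas, which is already imported; a minimal sketch assuming `data` and `pred2` as computed above:

# Sketch: build the submission file with pandas instead of csv.writer.
submission = pd.DataFrame({'ID': data[:, 0].astype(int), 'TARGET': pred2})
submission.to_csv('data/cls_data/santander/submission.csv', index=False)
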
Example #4
def test_no_free_lunch():
    from sklearn.model_selection import train_test_split

    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data

    # `datasets`, `rep_num`, `algo_list`, `run_count` and `get_seeds` come
    # from the enclosing script.
    for dataset in datasets:
        seeds = get_seeds(dataset, rep_num)
        for run_id in range(rep_num):
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)
            for algo in algo_list:
                for optimizer in ['smbo']:
                    task_format = dataset + '_' + algo + '_%d_%d'
                    cls = Classifier(
                        include_models=[algo], optimizer=optimizer, seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count, task_name=task_format % (run_count, run_id))
                    print(cls.predict(X))
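
The loop above fits on the 80% training split but then predicts on the full X. To score on the held-out split instead, a small sketch using scikit-learn's accuracy_score (assuming the variables from the example):

from sklearn.metrics import accuracy_score

# Evaluate the fitted classifier on the 20% held-out split.
y_pred = cls.predict(X_test)
print('test accuracy:', accuracy_score(y_test, y_pred))
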
Example #5
def test_hyperspace():
    import numpy as np

    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        # `datasets`, `start_run` and `rep_num` are module-level settings of
        # the enclosing script.
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                seed = np.random.randint(MAX_INT)

                # `algo_list` is the full candidate algorithm list of the
                # enclosing script; grow the hyperspace a few models at a time.
                for n_est in [1, 2, 4, 8, 12]:
                    algos = algo_list[:n_est]
                    task_format = dataset + '_hp_%d_%d' % (n_est, run_id)
                    cls = Classifier(
                        include_models=algos, optimizer='smbo', seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count, task_name=task_format)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
Example #6
def test_claim():
    import pickle

    import numpy as np

    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    perfs_list = list()
    # `datasets`, `rep_num` and `run_count` are module-level settings of the
    # enclosing script.
    for dataset in datasets:
        for run_id in range(rep_num):
            X, y, _ = load_data(dataset)
            dm = DataManager(X, y)
            seed = np.random.randint(MAX_INT)
            task_format = dataset + '_claim_%d'

            for optimizer in ['smbo']:
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_format % run_id)
                print(cls.predict(X))

                file_id = 'data/%s/%s_claim_%d_%s.data' % (dataset, dataset,
                                                           run_id, 'smac')
                with open(file_id, 'rb') as f:
                    data = pickle.load(f)

                best_id = np.argmax(data['perfs'])
                best_value = data['perfs'][best_id]
                if data['perfs'].count(best_value) > 1:
                    # Tie: several configs reach the best value; count how
                    # often each estimator appears among them.
                    stats = dict()
                    for conf, perf in zip(data['configs'], data['perfs']):
                        if perf == best_value:
                            est = conf['estimator']
                            if est not in stats:
                                stats[est] = 0
                            stats[est] += 1
                    # dict views cannot be indexed via np.argmax; materialize
                    # the values first.
                    tmp_id = np.argmax(list(stats.values()))
                    best_estimator = list(stats.keys())[tmp_id]
                    print('=' * 20, best_value, stats)
                else:
                    best_estimator = data['configs'][best_id]['estimator']
                    print('=' * 20, data['perfs'][best_id],
                          data['configs'][best_id])

                run_cnts = len([
                    item for item in data['configs']
                    if item['estimator'] == best_estimator
                ])

                task_format = dataset + '_claim_single_%d'
                cls = Classifier(include_models=[best_estimator],
                                 optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_cnts,
                                                task_name=task_format % run_id)
                print(cls.predict(X))

                file_id = 'data/%s/%s_claim_single_%d_%s.data' % (
                    dataset, dataset, run_id, 'smac')
                with open(file_id, 'rb') as f:
                    data_s = pickle.load(f)
                print('=' * 20 + 'single', max(data_s['perfs']))
                perfs_list.append((data['perfs'], data_s['perfs']))

    for item in perfs_list:
        item1, item2 = item
        print(len(item1), max(item1), len(item2), max(item2))
    print('=' * 50)
    print(perfs_list)
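
For reference, the tie-breaking logic in the middle of this example (pick the estimator that attains the best observed performance most often) can be written more compactly with collections.Counter; a sketch assuming the `data` dict loaded above:

from collections import Counter

# Collect the estimators of all configs tied at the best performance value.
best_value = max(data['perfs'])
tied = [conf['estimator']
        for conf, perf in zip(data['configs'], data['perfs'])
        if perf == best_value]
best_estimator = Counter(tied).most_common(1)[0][0]
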
Example #7
df["Sex"] = df["Sex"].replace(["male", "female"], [0, 1])

df.drop(columns="Ticket", axis=1, inplace=True)

# Binary-encode Cabin; use .loc to avoid pandas chained-assignment issues.
for i in range(df.shape[0]):
    if df["Cabin"][i] == "C23 C25 C27":
        df.loc[i, "Cabin"] = 0
    else:
        df.loc[i, "Cabin"] = 1

df["Cabin"] = df["Cabin"].astype("float")

df = pd.get_dummies(df)

x = df.values

x_train = x[:train_size]
x_test = x[train_size:]

dm = DataManager()
dm.train_X = x_train
dm.train_y = y_train


clf = Classifier(optimizer="smbo")
clf.fit(dm, metric="accuracy", runcount=200)

submission = pd.read_csv(home_path + "/datasets/titanic/gender_submission.csv")
submission["Survived"] = clf.predict(x_test)
submission.to_csv(home_path + "/datasets/titanic/xgboost.csv", index=False)
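
The row-by-row Cabin loop in this example can also be vectorized; a one-line sketch producing the same 0/1 encoding (the != comparison yields 1.0 exactly where the loop assigned 1):

# Sketch: vectorized replacement for the Cabin encoding loop above.
df["Cabin"] = (df["Cabin"] != "C23 C25 C27").astype("float")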