Example #1
def test_AutoMLClassifier(get_data):
    data = get_data
    test_model = AutoMLClassifier(databunch=data, random_state=RANDOM_SEED)
    # opt() runs the optimization within the given timeout and returns test/train predictions
    predict_test, predict_train = test_model.opt(timeout=1500, verbose=0)
    assert predict_test is not None
    score = sklearn.metrics.roc_auc_score(data.y_test, predict_test)
    assert score is not None
    assert score >= 0.8
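
The test above receives its data through a pytest fixture named get_data and passes it to AutoMLClassifier as a databunch. A minimal sketch of such a fixture is shown below; only the y_test attribute is required by the test itself, and the remaining attribute names, the dataset, and the RANDOM_SEED value are assumptions for illustration, not part of the original test suite.

import pytest
from types import SimpleNamespace
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42  # assumed here; the original tests import this constant from their test configuration


@pytest.fixture
def get_data():
    # Hypothetical fixture: builds a small binary-classification "databunch"
    # holding a train/test split. The test above only reads .y_test directly;
    # the other attribute names are assumed.
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED)
    return SimpleNamespace(X_train=X_train, y_train=y_train,
                           X_test=X_test, y_test=y_test)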
Example #2
def test_automl_default_classification():
    for data_id in [
            179,
            4135,
    ]:
        dataset = fetch_openml(data_id=data_id, as_frame=True)
        dataset.target = dataset.target.astype('category').cat.codes
        # cap each dataset at 2000 rows to keep the test fast
        crop = min(len(dataset.data), 2000)
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.data[:crop],
            dataset.target[:crop],
            test_size=0.2,
            random_state=RANDOM_SEED,
        )
        model = AutoMLClassifier(random_state=RANDOM_SEED)
        model.fit(X_train, y_train, timeout=600)
        predicts = model.predict(X_test)

        score = round(sklearn.metrics.roc_auc_score(y_test, predicts), 4)
        assert score is not None
        assert 0.5 < score <= 1

        model.save('AutoML_model_1', folder=TMP_FOLDER)
        model_new = AutoMLClassifier(random_state=RANDOM_SEED)
        model_new = model_new.load('AutoML_model_1', folder=TMP_FOLDER)
        predicts = model_new.predict(X_test)
        score2 = round(sklearn.metrics.roc_auc_score(y_test, predicts), 4)
        assert score2 is not None
        assert 0.5 < score2 <= 1
        # the reloaded model must reproduce the original score exactly
        assert score == score2
Example #3
        # shuffle columns for extra randomization in the experiment
        columns_tmp = list(X.columns.values)
        np.random.shuffle(columns_tmp)
        X = X[columns_tmp]

        # Split
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
        #print(DATASET_NAME, X_train.shape, X_test.shape)

        # Auto_ml
        START_EXPERIMENT = time.time()

        model = AutoMLClassifier(X_train, y_train, X_test,
                                 #cat_encoder_names=['OneHotEncoder', 'FrequencyEncoder'],
                                 cat_features=cat_features, 
                                 random_state=RANDOM_SEED, 
                                 verbose=1)
        
        time.sleep(0.5)

        y_test_predict_proba, predict_train = model.opt(timeout=TIME_LIMIT, verbose=2)           

        #y_test_predict_proba, _ = model.fit_predict()
        #y_test_predict = automl.predict(X_test)
        print('*'*75)
        print('AUC: ', round(roc_auc_score(y_test, y_test_predict_proba),4))
        print('AUC mean models: ', round(roc_auc_score(y_test, model.stack_models_predicts['predict_test'].mean()),4))
        print('Model_0 FullX: ', round(roc_auc_score(y_test, model.predicts_model_0_full_x['predict_test'].mean()),4))
        print('Model_1 FullX: ', round(roc_auc_score(y_test, model.predicts_model_1_full_x['predict_test'].mean()),4))
        
Example #4
def test_automl_classifier_bench():
    for data_id in [
            # 179,
            # 4135,
            1461,
            # 1226,
            # 31,
            1471,
            151,
            # 1067,
            # 1046,
            1489,
            1494,
    ]:
        dataset = fetch_openml(data_id=data_id, as_frame=True)
        dataset.target = dataset.target.astype("category").cat.codes

        logger.info("=" * 75)
        logger.info("LOAD DATASET")
        logger.info(f"Dataset: {data_id} {dataset.data.shape}")

        y = dataset.target
        X = dataset.data

        skf = StratifiedKFold(n_splits=CV, shuffle=True, random_state=42)

        metrics = []

        for count, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            # if count > 3:
            #    continue
            logger.info(f"START FOLD {count}")
            RANDOM_SEED = count
            EXPERIMENT = count
            np.random.seed(RANDOM_SEED)

            # shuffle columns for extra randomization in the experiment
            columns_tmp = list(X.columns.values)
            np.random.shuffle(columns_tmp)
            X = X[columns_tmp]

            # Split
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

            START_EXPERIMENT = time.time()

            model = AutoMLClassifier(random_state=RANDOM_SEED)
            model.fit(X_train, y_train, timeout=TIME_LIMIT)

            predicts = model.predict(X_test)
            assert predicts is not None

            # model.save(f'AutoML_fold_{count}', folder='./result/')

            logger.info("*" * 75)
            logger.info(f"AUC: {round(roc_auc_score(y_test, predicts),4)}")

            logger.info(
                f"predict_model_1 AUC: {round(sklearn.metrics.roc_auc_score(y_test, model.predict_model_1),4)}"
            )
            logger.info(
                f"predict_model_2 AUC: {round(sklearn.metrics.roc_auc_score(y_test, model.predict_model_2),4)}"
            )
            # logger.info(f'predict_model_3 AUC: {round(sklearn.metrics.roc_auc_score(y_test, model.predict_model_3),4)}')
            # logger.info(f'predict_model_4 AUC: {round(sklearn.metrics.roc_auc_score(y_test, model.predict_model_4),4)}')
            # logger.info(f'predict_model_5 AUC: {round(sklearn.metrics.roc_auc_score(y_test, model.predict_model_5),4)}')
            logger.info("-" * 75)

            END_EXPERIMENT = time.time()

            metrics.append({
                "AUC": round(roc_auc_score(y_test, predicts), 4),
                "log_loss": round(log_loss(y_test, predicts), 4),
                "Accuracy": round(accuracy_score(y_test, predicts > 0.5), 4),
                "Time_min": (END_EXPERIMENT - START_EXPERIMENT) // 60,
                "Time": datetime.datetime.now(),
            })

            pd.DataFrame(metrics).to_csv(
                f"./result/{data_id}_metrics.csv",
                index=False,
            )
            model = None
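
Each fold of the benchmark above appends its scores to ./result/{data_id}_metrics.csv. A small follow-up sketch, assuming the column names written by that loop, that averages the per-fold results for one of the datasets (1461):

import pandas as pd

# Average the per-fold benchmark results for a single OpenML dataset.
# The file name and column names match what the loop above writes out.
fold_metrics = pd.read_csv("./result/1461_metrics.csv")
print(fold_metrics[["AUC", "log_loss", "Accuracy", "Time_min"]].mean().round(4))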
Example #5
def load_model(model_name, folder):
    model = AutoMLClassifier()
    model = model.load(model_name, folder=folder)
    return model
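
A hedged usage sketch of this helper, mirroring the save/load round trip from Example #2; the fit arguments, random_state, and folder path here are illustrative, not taken from the original code.

# Train, persist, then restore a model with the helper above.
model = AutoMLClassifier(random_state=42)
model.fit(X_train, y_train, timeout=600)        # X_train/y_train as in the earlier examples
model.save('AutoML_model_1', folder='./tmp/')   # illustrative folder path

restored = load_model('AutoML_model_1', folder='./tmp/')
predicts = restored.predict(X_test)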
Example #6
        # shuffle columns for extra randomization in the experiment
        columns_tmp = list(X.columns.values)
        np.random.shuffle(columns_tmp)
        X = X[columns_tmp]

        # Split
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
        #print(DATASET_NAME, X_train.shape, X_test.shape)

        # Auto_ml
        START_EXPERIMENT = time.time()

        model = AutoMLClassifier(X_train,
                                 y_train,
                                 X_test,
                                 random_state=RANDOM_SEED,
                                 verbose=1)

        time.sleep(0.5)

        y_test_predict_proba, predict_train = model.opt(timeout=TIME_LIMIT,
                                                        verbose=2)

        #y_test_predict_proba, _ = model.fit_predict()
        #y_test_predict = automl.predict(X_test)
        print('*' * 75)
        print('AUC: ', round(roc_auc_score(y_test, y_test_predict_proba), 4))
        print(
            'AUC mean models: ',
            round(
                roc_auc_score(
                    y_test,
                    model.stack_models_predicts['predict_test'].mean()), 4))