Example #1
    def operate(self, dm_list: typing.List, phase='train'):
        # The input of a NaiveSelectorOperator is a list of DataManagers
        self.check_phase(phase)

        x = None
        y = None
        if phase == 'train':
            for dm in dm_list:
                if x is None:
                    x = dm.train_X
                    y = dm.train_y
                else:
                    x = np.hstack((x, dm.train_X))
            x = self.selector.fit_transform(x, y)
            dm = DataManager(x, y, spilt=False)
        else:
            for dm in dm_list:
                if x is None:
                    x = dm.test_X
                else:
                    x = np.hstack((x, dm.test_X))
            x = self.selector.transform(x)
            dm = DataManager()
            dm.test_X = x
        return dm
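A note on the pattern above: feature blocks from several DataManagers are stacked horizontally, the selector is fitted once on the training matrix, and the fitted selector is reused at test time. A minimal self-contained sketch of the same idea with plain scikit-learn, using SelectKBest as a stand-in for self.selector:

import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

# Two feature blocks, as if produced by two upstream DataManagers.
train_blocks = [np.random.rand(20, 4), np.random.rand(20, 3)]
test_blocks = [np.random.rand(5, 4), np.random.rand(5, 3)]
y = np.random.randint(0, 2, size=20)

selector = SelectKBest(f_classif, k=5)
train_x = selector.fit_transform(np.hstack(train_blocks), y)  # 'train' phase
test_x = selector.transform(np.hstack(test_blocks))           # 'test' phase
print(train_x.shape, test_x.shape)  # (20, 5) (5, 5)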
Example #2
    def predict_from_dirctory(self,
                              dirname,
                              target_shape=(224, 224, 3),
                              **kwargs):
        img_data_manager = DataManager()
        img_data_manager.test_dir = dirname
        img_data_manager.target_shape = target_shape
        super().predict(img_data_manager, **kwargs)
        return self
Example #3
def test_cash_module():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.regressor import Regressor
    from alphaml.datasets.rgs_dataset.dataset_loader import load_data
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    import random
    result = []
    for i in range(1):
        x, y, _ = load_data('boston')
        train_x, test_x, train_y, test_y = train_test_split(x,
                                                            y,
                                                            test_size=0.2)
        dm = DataManager(train_x, train_y)
        cls = Regressor(
            exclude_models=['xgboost', 'mlp'],
            #include_models=['mlp'],
            optimizer='tpe',
            ensemble_method='blending',
            ensemble_size=args.ensemble_size,
        ).fit(dm, metric='mse', update_mode=2, runcount=args.run_count)

        pred = cls.predict(test_x)
        print(pred)
        result.append(mean_squared_error(test_y, pred))
        print(result)
Example #4
def test_hyperspace():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                seed = np.random.randint(MAX_INT)

                for update_mode in [2, 3]:
                    task_format = dataset + '_mode_%d_%d' % (update_mode,
                                                             run_id)
                    cls = Classifier(optimizer='ts_smbo',
                                     seed=seed).fit(dm,
                                                    metric='accuracy',
                                                    runcount=run_count,
                                                    task_name=task_format,
                                                    update_mode=update_mode)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
Example #5
def test_estimator():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    rep_num = args.rep
    run_count = args.run_count
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        task_format = dataset + '_est_%d'
        X, y, _ = load_data(dataset)
        dm = DataManager(X, y)
        seed = np.random.randint(MAX_INT)
        for optimizer in ['smbo']:
            cls = Classifier(include_models=['gradient_boosting'],
                             optimizer=optimizer,
                             seed=seed).fit(dm,
                                            metric='accuracy',
                                            runcount=run_count,
                                            task_name=task_format)
            print(cls.predict(X))
Example #6
    def operate(self, dm_list: typing.List, phase='train'):
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        if phase == 'train':
            x = dm.train_X
            newfeature = np.zeros((len(x), 1))
            for i, sample in enumerate(x):
                cnt = 0
                for column in sample:
                    if column == 0:
                        cnt += 1
                newfeature[i] = cnt
            result_dm = DataManager()
            result_dm.train_X = newfeature
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            newfeature = np.zeros((len(x), 1))
            for i, sample in enumerate(x):
                cnt = 0
                for column in sample:
                    if column == 0:
                        cnt += 1
                newfeature[i] = cnt
            result_dm = DataManager()
            result_dm.test_X = newfeature
        return result_dm
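The nested loops above count zero-valued columns per sample. As an aside (a behavior-equivalent sketch, not part of the original operator), the same (n, 1) feature can be built with one vectorized NumPy expression:

import numpy as np

x = np.array([[0, 1, 0], [2, 0, 3], [4, 5, 6]])
# Count zeros per row and keep a column shape, matching np.zeros((len(x), 1)).
newfeature = np.count_nonzero(x == 0, axis=1).reshape(-1, 1).astype(float)
print(newfeature)  # [[2.] [1.] [0.]]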
Example #7
    def operate(self, dm_list: typing.List, phase='train'):
        # The input of an AutoCrossOperator is a DataManager
        assert len(dm_list) == 1
        dm = dm_list[0]
        assert isinstance(dm, DataManager)
        self.check_phase(phase)

        feature_types = dm.feature_types
        onehot_index = [i for i in range(len(feature_types))
                        if feature_types[i] == "One-Hot"]
        numerical_index = [i for i in range(len(feature_types))
                           if feature_types[i] == 'Discrete' or feature_types[i] == 'Float']

        if phase == 'train':
            from sklearn.model_selection import train_test_split
            if self.stratify:
                train_x, val_x, train_y, val_y = train_test_split(dm.train_X, dm.train_y, test_size=0.2,
                                                                  stratify=dm.train_y)
            else:
                train_x, val_x, train_y, val_y = train_test_split(dm.train_X, dm.train_y, test_size=0.2)
            x = dm.train_X
            self.autocross.fit(train_x, val_x, train_y, val_y, onehot_index, numerical_index)
            result_dm = DataManager()
            result_dm.train_X = self.autocross.transform(x)
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            result_dm = DataManager()
            result_dm.test_X = self.autocross.transform(x)
        return result_dm
Example #8
def one_hot(dm: DataManager) -> DataManager:
    """
    Convert the categorical features to float with one-hot encoding
    :param dm:
    :return:
    """
    feature_types = dm.feature_types
    categorical_index = [
        i for i in range(len(feature_types))
        if feature_types[i] == "Categorical"
    ]
    other_index = [
        i for i in range(len(feature_types))
        if feature_types[i] != "Categorical"
    ]

    encoder = OneHotEncoder(handle_unknown="ignore")
    (train_x, _), (valid_x,
                   _), (test_x,
                        _) = dm.get_train(), dm.get_val(), dm.get_test()

    if train_x is None:
        raise ValueError("train_x has no value!!!")
    train_size = len(train_x)
    valid_size = 0
    test_size = 0
    if valid_x is not None and test_x is not None:
        x = np.concatenate([train_x, valid_x, test_x])
        valid_size = len(valid_x)
        test_size = len(test_x)
    elif valid_x is not None:
        x = np.concatenate([train_x, valid_x])
        valid_size = len(valid_x)
    else:
        x = train_x
    categorical_x = x[:, categorical_index]
    other_x = x[:, other_index]

    encoder.fit(categorical_x)
    categorical_x = encoder.transform(categorical_x).toarray()

    categorical_features = ["One-Hot"] * categorical_x.shape[1]
    other_features = [feature_types[i] for i in other_index]

    x = np.hstack((categorical_x, other_x)).astype(float)
    dm.feature_types = np.concatenate((categorical_features, other_features))

    train_x, valid_x, test_x = _split_data(x, train_size, valid_size,
                                           test_size)
    if valid_size == 0:
        valid_x = None
    if test_size == 0:
        test_x = None

    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x

    return dm
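_split_data is not shown in this listing. Assuming it simply slices the concatenated matrix back by the recorded row counts, which is what the surrounding bookkeeping implies, a hypothetical reconstruction would look like this:

import numpy as np

def _split_data_sketch(x, train_size, valid_size, test_size):
    # Hypothetical: undo np.concatenate by slicing on the recorded sizes.
    train_x = x[:train_size]
    valid_x = x[train_size:train_size + valid_size]
    test_x = x[train_size + valid_size:train_size + valid_size + test_size]
    return train_x, valid_x, test_x

x = np.arange(20).reshape(10, 2)
train_x, valid_x, test_x = _split_data_sketch(x, 6, 2, 2)
print(train_x.shape, valid_x.shape, test_x.shape)  # (6, 2) (2, 2) (2, 2)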
Example #9
def evaluate_c():
    rep_num = 10
    run_count = 500
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    task_id = 'exp5_eval_c'
    print(rep_num, run_count, datasets, task_id)

    for dataset in datasets:
        # Make directories.
        dataset_id = dataset.split('_')[0]
        save_dir = "data/%s/" % dataset_id
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        result = dict()
        seeds = get_seeds(dataset, start_id + rep_num)
        for run_id in range(start_id, start_id + rep_num):
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)

            # Test each optimizer algorithm:
            for p in [1, 4, 10, 14, 16, 20]:
                task_name = dataset + '_%s_%d_%d_%d' % (task_id, run_count,
                                                        run_id, p)
                mode = 3
                optimizer = 'mono_smbo'

                print('Test %s optimizer => %s' % (optimizer, task_name))

                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=p)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%d_%s' % (dataset, run_count, run_id, p,
                                             optimizer)
                result[key_id] = acc

            # Display and save the test result.
            print(result)
            with open(
                    'data/%s/%s_test_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset, task_id, run_count, rep_num, start_id),
                    'wb') as f:
                pickle.dump(result, f)
Example #10
def test_categorical_indexer():
    train_x = np.array([["a", 1, "python", 4.5], ["b", 2, "c++", 6.8],
                        ["c", 10, "java", 4.8]])

    valid_x = np.array([["a", 1, "scala", 4.5], ["c", 2, "c++", 6.8],
                        ["d", 10, "python", 4.8]])

    test_x = np.array([["a", 1, "scala", 4.5]])

    dm = DataManager()

    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]

    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x

    dm = categorical_indexer(dm)

    print(dm.feature_types)
    print(dm.train_X)
    print("----------------------------")
    print(dm.val_X)
    print("----------------------------")
    print(dm.test_X)
Example #11
    def operate(self, dm_list: typing.List, phase='train'):
        '''
        :return: a new DataManager whose data is split for training and validation
        '''
        x = None
        y = None
        if phase == 'train':
            for dm in dm_list:
                if x is None:
                    x = dm.train_X
                    y = dm.train_y
                else:
                    x = np.hstack((x, dm.train_X))
            self.selector.fit(x, y)
        else:
            for dm in dm_list:
                if x is None:
                    x = dm.test_X
                else:
                    x = np.hstack((x, dm.test_X))

        if self.model == self.RANDOM_FOREST:
            self.sorted_features = np.argsort(
                self.selector.feature_importances_)[::-1]
        elif self.model == self.LASSO_REGRESSION:
            if self.selector.coef_.ndim == 1:
                self.sorted_features = np.argsort(self.selector.coef_)[::-1]
            else:
                importances = np.linalg.norm(self.selector.coef_,
                                             axis=0,
                                             ord=1)
                self.sorted_features = np.argsort(importances)[::-1]
        x = x[:, self.sorted_features[:self.kbest]]
        dm = DataManager()
        if phase == 'train':
            dm.train_X = x
            dm.train_y = y
        else:
            dm.test_X = x
        return dm
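Stripped of the DataManager bookkeeping, the selection step is an argsort over an importance vector followed by a top-k column slice. A self-contained sketch of that step with a random forest standing in for self.selector:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

x = np.random.rand(50, 8)
y = np.random.randint(0, 2, size=50)
kbest = 3

rf = RandomForestClassifier(n_estimators=10).fit(x, y)
sorted_features = np.argsort(rf.feature_importances_)[::-1]  # best first
print(x[:, sorted_features[:kbest]].shape)  # (50, 3)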
Example #12
def test_impute_dm():
    train_x = np.array([["a", 1, "python", 4.5], ["b", 2, "c++", 6.8],
                        ["c", 10, "java", 4.8]])

    valid_x = np.array([["a", 1, "scala", 4.5], ["c", 2, "c++", 6.8],
                        ["d", 10, "python", 4.8]])

    test_x = np.array([["a", 1, "scala", 4.5]])

    train_x[2][0] = "???"
    train_x[2][2] = "???"
    valid_x[0][1] = np.nan
    test_x[0][-1] = np.nan

    dm = DataManager()

    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]

    dm.train_X = train_x.astype(object)
    dm.val_X = valid_x.astype(object)
    dm.test_X = test_x.astype(object)

    dm = impute_dm(dm, "???")

    print(dm.feature_types)
    print(dm.train_X)
    print("----------------------------")
    print(dm.val_X)
    print("----------------------------")
    print(dm.test_X)
Example #13
    def fit(self, data: DataManager, **kwargs):
        from alphaml.engine.evaluator.dl_evaluator import BaseImgEvaluator
        task_type = kwargs['task_type']
        if data.train_X is None and data.train_y is None:
            inputshape = data.target_shape
            classnum = None
        else:
            inputshape = data.train_X.shape[1:]
            classnum = None
            if task_type == 'img_multiclass':
                data.train_y, self.map_dict, self.rev_map_dict = map_label(data.train_y)
                data.train_y = to_categorical(data.train_y)
                data.val_y, _, _ = map_label(data.val_y, self.map_dict)
                data.val_y = to_categorical(data.val_y)
                classnum = len(self.rev_map_dict)

            elif task_type == 'img_binary':
                data.train_y, self.map_dict, self.rev_map_dict = map_label(data.train_y, if_binary=True)
                data.val_y, _, _ = map_label(data.val_y, self.map_dict, if_binary=True)
                classnum = 1

            elif task_type == 'img_multilabel-indicator':
                classnum = get_classnum(data.train_y)

        self.evaluator = BaseImgEvaluator(inputshape, classnum)

        return super().fit(data, **kwargs)
Example #14
    def operate(self, dm_list: typing.List, phase='train'):
        self.check_phase(phase)

        x = None
        y = None
        if phase == 'train':
            for dm in dm_list:
                if x is None:
                    x = dm.train_X
                    y = dm.train_y
                else:
                    x = np.hstack((x, dm.train_X))
            dm = DataManager(x, y, spilt=False)
        else:
            for dm in dm_list:
                if x is None:
                    x = dm.test_X
                else:
                    x = np.hstack((x, dm.test_X))
            dm = DataManager()
            dm.test_X = x
        return dm
Example #15
def test_cash_module():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    import random
    from sklearn.metrics import roc_auc_score
    result = []
    for i in range(1):
        import xlrd
        sheet = xlrd.open_workbook("lyqdata.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_train = []
        y_train = []
        for i in range(2, nrows):
            X_train.append(sheet.row_values(i, start_colx=1))
            y_train.append(int(sheet.cell_value(i, 0)))

        dm = DataManager(X_train, y_train)
        cls = Classifier(
            # include_models=['liblinear_svc', 'libsvm_svc', 'random_forest', 'logistic_regression', 'mlp'],
            include_models=['mlp'],
            optimizer='smbo',
            cross_valid=False,
            ensemble_method='ensemble_selection',
            ensemble_size=args.ensemble_size,
            save_dir='data/save_models'
        )
        cls.fit(dm, metric='auc', runcount=args.run_count)

        sheet = xlrd.open_workbook("lyqtestdata.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_test = []
        y_test = []
        for i in range(1, nrows):
            X_test.append(sheet.row_values(i, start_colx=1))
            y_test.append(int(sheet.cell_value(i, 0)))

        pred = cls.predict_proba(X_test)
        result.append(roc_auc_score(y_test, pred[:, 1:2]))
        print(result)

    import pickle
    with open('result.pkl', 'wb') as f:
        pickle.dump(result, f)
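One caveat with the loader above: xlrd 2.x dropped .xlsx support, so the sheet-reading loop fails on a current environment. A hedged pandas equivalent of the same loop, assuming the file name and layout from the example (rows from index 2 on, first column as the label):

import pandas as pd

df = pd.read_excel("lyqdata.xlsx", header=None, skiprows=2)
y_train = df.iloc[:, 0].astype(int).tolist()
X_train = df.iloc[:, 1:].values.tolist()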
Example #16
def test_cash_module():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    import random
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder

    result = []
    for i in range(1):
        import xlrd
        sheet = xlrd.open_workbook("ybai_Keratoconus_TJ_20190425.xlsx")
        sheet = sheet.sheet_by_index(0)
        nrows = sheet.nrows
        X_train = []
        y_train = []

        for i in range(1, nrows):
            X_train.append(sheet.row_values(i, start_colx=1))
            y_train.append(int(sheet.cell_value(i, 0)))

        encoder = OneHotEncoder()
        encoder.fit(np.reshape(y_train, (len(y_train), 1)))
        X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

        dm = DataManager(X_train, y_train)
        cls = Classifier(
            # include_models=['liblinear_svc', 'libsvm_svc', 'xgboost', 'random_forest', 'logistic_regression', 'mlp'],
            optimizer='smbo',
            ensemble_method='bagging',
            ensemble_size=args.ensemble_size,
        )
        cls.fit(dm, metric='auc', runcount=args.run_count)

        pred = cls.predict_proba(X_test)
        print(pred)
        y_test = encoder.transform(np.reshape(y_test, (len(y_test), 1))).toarray()
        result.append(roc_auc_score(y_test, pred))
        print(result)

        import pickle
        with open('result.pkl', 'wb') as f:
            pickle.dump(result, f)
Example #17
def test_no_free_lunch():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data

    for dataset in datasets:
        seeds = get_seeds(dataset, rep_num)
        for run_id in range(rep_num):
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)
            for algo in algo_list:
                for optimizer in ['smbo']:
                    task_format = dataset + '_' + algo + '_%d_%d'
                    cls = Classifier(
                        include_models=[algo], optimizer=optimizer, seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count, task_name=task_format % (run_count, run_id))
                    print(cls.predict(X))
Example #18
    def operate(self, dm_list: typing.List, phase='train'):
        # The input of a PCAOperator is a DataManager
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        feature_types = dm.feature_types
        numerical_index = [i for i in range(len(feature_types))
                           if feature_types[i] == "Float" or feature_types[i] == "Discrete"]
        if phase == 'train':
            x = dm.train_X
            result_dm = DataManager()
            result_dm.train_X = self.pca.fit_transform(x[:, numerical_index])
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            result_dm = DataManager()
            result_dm.test_X = self.pca.transform(x[:, numerical_index])  # reuse the PCA fitted in the train phase
        return result_dm
Example #19
    def fit_from_directory(self,
                           dirname,
                           target_shape=(224, 224, 3),
                           valid_split=0.1,
                           **kwargs):
        img_data_manager = DataManager()
        if isinstance(dirname, (list, tuple)):
            if len(dirname) != 2:
                raise ValueError(
                    "Expected one directory or a list or tuple of two directories for training and validation!"
                )
            if dirname[1] is None:
                img_data_manager.train_valid_dir = dirname[0]
            else:
                img_data_manager.train_dir = dirname[0]
                img_data_manager.valid_dir = dirname[1]
        else:
            img_data_manager.train_valid_dir = dirname
        img_data_manager.target_shape = target_shape
        img_data_manager.split_size = valid_split
        kwargs['task_type'] = 'img_multilabel-indicator'
        kwargs['metric'] = kwargs.get('metric', 'acc')
        super().fit(img_data_manager, **kwargs)
        return self
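The dirname argument above is overloaded: a single path means "split internally via valid_split", a (train, valid) pair means "use explicit directories", and (path, None) falls back to the internal split. A small self-contained sketch of that dispatch, under the same assumed semantics:

def resolve_dirs(dirname):
    # Mirrors the branching in fit_from_directory; returns which
    # DataManager attributes would be set for a given dirname argument.
    if isinstance(dirname, (list, tuple)):
        if len(dirname) != 2:
            raise ValueError("Expected one directory or a pair of directories!")
        train_dir, valid_dir = dirname
        if valid_dir is None:
            return {"train_valid_dir": train_dir}
        return {"train_dir": train_dir, "valid_dir": valid_dir}
    return {"train_valid_dir": dirname}

print(resolve_dirs("data/train"))
print(resolve_dirs(("data/train", "data/valid")))
print(resolve_dirs(("data/all", None)))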
Example #20
def test_hyperspace():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    try:
        for dataset in datasets:
            for run_id in range(start_run, rep_num):
                X, y, _ = load_data(dataset)
                dm = DataManager(X, y)
                seed = np.random.randint(MAX_INT)

                for n_est in [1, 2, 4, 8, 12]:
                    algos = algo_list[:n_est]
                    task_format = dataset + '_hp_%d_%d' % (n_est, run_id)
                    cls = Classifier(
                        include_models=algos, optimizer='smbo', seed=seed).fit(
                        dm, metric='accuracy', runcount=run_count, task_name=task_format)
                    print(cls.predict(X))
    except Exception as e:
        print(e)
        print('Exit!')
Example #21
    def operate(self, dm_list: typing.List, phase='train') -> DataManager:
        # The input of a PolynomialFeatureOperator is a DataManager
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        feature_types = dm.feature_types
        numerical_index = [i for i in range(len(feature_types))
                           if feature_types[i] == "Float" or feature_types[i] == "Discrete"]
        init_length = len(numerical_index) + 1
        if phase == 'train':
            x = dm.train_X
            newfeatures = self.polynomialfeatures.fit_transform(x[:, numerical_index])
            result_dm = DataManager()
            result_dm.train_X = newfeatures[:, init_length:]
            result_dm.train_y = dm.train_y
        else:
            x = dm.test_X
            newfeatures = self.polynomialfeatures.transform(x[:, numerical_index])
            result_dm = DataManager()
            result_dm.test_X = newfeatures[:, init_length:]
        return result_dm
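The init_length offset relies on PolynomialFeatures' column layout: with the default include_bias=True, column 0 is the bias term and the next len(numerical_index) columns repeat the inputs, so slicing from init_length keeps only the generated interaction and higher-order terms. A quick check with recent scikit-learn, assuming two numerical inputs:

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

x = np.array([[1.0, 2.0], [3.0, 4.0]])
poly = PolynomialFeatures(degree=2)  # include_bias=True by default
out = poly.fit_transform(x)
print(poly.get_feature_names_out())  # ['1' 'x0' 'x1' 'x0^2' 'x0 x1' 'x1^2']
init_length = x.shape[1] + 1         # bias + original columns
print(out[:, init_length:])          # only the new features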
Example #22
    def operate(self, dm_list: typing.List, phase='train'):
        # The input of an ImputeOperator is a pd.DataFrame
        assert len(dm_list) == 1 and isinstance(dm_list[0], pd.DataFrame)
        self.check_phase(phase)

        input_df = dm_list[0]
        df = self.impute_df(input_df)
        dm = DataManager()

        label_col = df.columns[self.label_col] if phase == 'train' else None
        dm.set_col_type(df, label_col)
        data = df.values
        if phase == 'train':
            # Move the label column to the last position.
            swap_list = list(range(data.shape[1]))
            del swap_list[self.label_col]
            swap_list.append(self.label_col)
            data = data[:, swap_list]
            dm.train_X = data[:, :-1]
            dm.train_y = data[:, -1]
        else:
            dm.test_X = data
        return dm
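The swap trick above moves the label column to the last position so the final slices split features from labels. A minimal demonstration, assuming the label sits in column 1:

import numpy as np

data = np.array([[10, 0, 11], [20, 1, 21], [30, 0, 31]])
label_col = 1

swap_list = list(range(data.shape[1]))  # [0, 1, 2]
del swap_list[label_col]                # [0, 2]
swap_list.append(label_col)             # [0, 2, 1]: label moved to the end

data = data[:, swap_list]
print(data[:, :-1])  # features: [[10 11] [20 21] [30 31]]
print(data[:, -1])   # labels:   [0 1 0]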
Example #23
import argparse
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from alphaml.engine.components.feature_engineering.auto_feature import AutoFeature
from alphaml.estimators.classifier import Classifier
from alphaml.engine.components.data_manager import DataManager
from alphaml.datasets.cls_dataset.dataset_loader import load_data  # assumed loader path
from time import time

warnings.filterwarnings("ignore")

parser = argparse.ArgumentParser()
parser.add_argument("--generated_feature", type=int, default=1)
parser.add_argument("--dataset", type=str)
args = parser.parse_args()

x, y, c = load_data(args.dataset)

dm = DataManager(x, y)

lr = LogisticRegression()
lr.fit(dm.train_X, dm.train_y)
y_pred = lr.predict(dm.val_X)
print("original lr accu:", accuracy_score(dm.val_y, y_pred), flush=True)

if args.generated_feature > 0:
    af = AutoFeature("accuracy", "auto_cross")
    af.fit(dm, args.generated_feature)
    dm = af.transform(dm)

clf = Classifier()
start_time = time()
clf.fit(dm, metric="accuracy", runcount=50)
print("alphaml time:", time() - start_time)
Example #24
def test_claim():
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier
    from alphaml.datasets.cls_dataset.dataset_loader import load_data
    from alphaml.utils.constants import MAX_INT

    perfs_list = list()
    for dataset in datasets:
        for run_id in range(rep_num):
            X, y, _ = load_data(dataset)
            dm = DataManager(X, y)
            seed = np.random.randint(MAX_INT)
            task_format = dataset + '_claim_%d'

            for optimizer in ['smbo']:
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_format % run_id)
                print(cls.predict(X))

                file_id = 'data/%s/%s_claim_%d_%s.data' % (dataset, dataset,
                                                           run_id, 'smac')
                with open(file_id, 'rb') as f:
                    data = pickle.load(f)

                best_id = np.argmax(data['perfs'])
                best_value = data['perfs'][best_id]
                if data['perfs'].count(best_value) > 1:
                    stats = dict()
                    for conf, perf in zip(data['configs'], data['perfs']):
                        if perf == best_value:
                            est = conf['estimator']
                            if est not in stats:
                                stats[est] = 0
                            stats[est] += 1
                    tmp_id = np.argmax(list(stats.values()))
                    best_estimator = list(stats.keys())[tmp_id]
                    print('=' * 20, best_value, stats)
                else:
                    best_estimator = data['configs'][best_id]['estimator']
                    print('=' * 20, data['perfs'][best_id],
                          data['configs'][best_id])

                run_cnts = len([
                    item for item in data['configs']
                    if item['estimator'] == best_estimator
                ])

                task_format = dataset + '_claim_single_%d'
                cls = Classifier(include_models=[best_estimator],
                                 optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_cnts,
                                                task_name=task_format % run_id)
                print(cls.predict(X))

                file_id = 'data/%s/%s_claim_single_%d_%s.data' % (
                    dataset, dataset, run_id, 'smac')
                with open(file_id, 'rb') as f:
                    data_s = pickle.load(f)
                print('=' * 20 + 'single', max(data_s['perfs']))
                perfs_list.append((data['perfs'], data_s['perfs']))

    for item in perfs_list:
        item1, item2 = item
        print(len(item1), max(item1), len(item2), max(item2))
    print('=' * 50)
    print(perfs_list)
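The tie-breaking block above (count which estimator appears most often among the tied best configurations) can be expressed more directly with collections.Counter; a sketch under the same assumptions about the configs/perfs lists:

from collections import Counter

# Toy stand-ins for data['configs'] and data['perfs'].
configs = [{'estimator': 'rf'}, {'estimator': 'svc'},
           {'estimator': 'rf'}, {'estimator': 'gbdt'}]
perfs = [0.9, 0.9, 0.9, 0.8]

best_value = max(perfs)
stats = Counter(conf['estimator']
                for conf, perf in zip(configs, perfs) if perf == best_value)
best_estimator = stats.most_common(1)[0][0]
print(best_estimator, stats)  # rf Counter({'rf': 2, 'svc': 1})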
Example #25
def test_auto():
    from sklearn.datasets import load_breast_cancer
    from alphaml.engine.components.data_manager import DataManager
    from alphaml.estimators.classifier import Classifier

    X, y = load_breast_cancer(return_X_y=True)
    # Classifier(exclude_models=['libsvm_svc']).fit(DataManager(X, y))

    for _ in range(5):
        Classifier(include_models=['adaboost', 'gradient_boosting', 'random_forest'], optimizer='ts_smac').fit(DataManager(X, y))

    for _ in range(5):
        Classifier(include_models=['adaboost', 'gradient_boosting', 'random_forest'], optimizer='smac').fit(DataManager(X, y))
Example #26
import os
import warnings

import pandas as pd

from alphaml.engine.components.data_manager import DataManager
from alphaml.engine.components.feature_engineering.auto_feature import AutoFeature

warnings.filterwarnings("ignore")

home_path = os.path.expanduser('~')
train_path = os.path.join(home_path, "datasets/santander/train.csv")
test_path = os.path.join(home_path, "datasets/santander/test.csv")

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

df_train.drop(labels=["ID_code"], axis=1, inplace=True)
df_test.drop(labels=["ID_code"], axis=1, inplace=True)

x_train = df_train.drop(labels=["target"], axis=1).values
y_train = df_train["target"].values
x_test = df_test.values

del df_train
del df_test

dm = DataManager(x_train, y_train)
dm.test_X = x_test

auto_feature = AutoFeature(metrics="auc")
dm = auto_feature.fit(dm, generated_num=100)
Example #27
def test_normalize(dm):
    dm = normalize(dm)

    print("after normalize rescale\n")

    print(dm.train_X)
    print(dm.val_X)
    print(dm.test_X)
    print(dm.feature_types)


if __name__ == '__main__':
    np.random.seed(19941125)

    dm = DataManager()
    dm.train_X = np.random.rand(5, 5)
    dm.val_X = np.random.rand(3, 5)
    dm.test_X = np.random.rand(2, 5)
    dm.feature_types = ["Discrete", "One-Hot", "Float", "Float", "Categorical"]

    print("Original data......\n")
    print(dm.train_X)
    print(dm.val_X)
    print(dm.test_X)
    print(dm.feature_types)

    print("start test MinMaxScaler.......\n")
    test_minmax(dm)

    print("start test StandardScaler......\n")
Example #28
def test_exp2_evaluation():
    rep_num = args.rep
    run_count = args.run_count

    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp_2_evaluation"

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        dm = DataManager(X_train, y_train)

        # optimizer_algos = ['cmab_ts', 'rl_1_0.3', 'rl_2_1', 'rl_3_0']
        optimizer_algos = ['cmab_ts', 'rl_2_1', 'rl_3_0']
        # Test each optimizer algorithm:
        for opt_algo in optimizer_algos:
            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            if opt_algo.startswith('rl'):
                if len(opt_algo.split('_')) == 3:
                    _, mode, eta = opt_algo.split('_')
                    mode = int(mode)
                    optimizer = 'rl_smbo'
                    eta = float(eta)
                else:
                    raise ValueError('Wrong params!')
            else:
                optimizer = opt_algo

            print('Test optimizer: %s' % optimizer)

            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                task_name = dataset + '_%s_%d_%d' % (task_id, run_count,
                                                     run_id)
                seed = seeds[run_id]

                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                runtime=None,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=eta)

                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                result[key_id] = [cash_test_acc]
                print(result)

            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset, opt_algo, task_id, run_count, rep_num,
                 start_id), 'wb') as f:
                pickle.dump(result, f)
Example #29
def test_exp4_runtime():
    rep_num = args.rep
    run_count = args.run_count
    B = args.B
    if B > 0:
        run_count = 0

    start_id = args.start_runid
    datasets = args.datasets.split(',')
    print(rep_num, run_count, datasets)
    task_id = "exp4_runtime"

    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        # Dataset partition.
        X, y, _ = load_data(dataset)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        dm = DataManager(X_train, y_train)

        runcount_dict = dict()
        tpe_runcount = 0.

        optimizer_algos = ['mono_smbo_4', 'smbo', 'tpe']
        # optimizer_algos = ['mono_smbo_3_0']
        # Test each optimizer algorithm:
        assert optimizer_algos[-1] == 'tpe'
        for opt_algo in optimizer_algos:
            # If the algorithm is TPE, estimate its run count within one hour.
            if opt_algo != 'tpe':
                runcount_dict[opt_algo] = list()
            else:
                count_list = list()
                for key in runcount_dict.keys():
                    count_list.append(np.mean(runcount_dict[key]))
                assert len(count_list) > 0
                tpe_runcount = np.min(count_list)
                print('=' * 50, tpe_runcount)

            result = dict()
            mode, eta = None, None
            # Parse the parameters for each optimizer.
            if opt_algo.startswith('mono_smbo'):
                mode = 2
                if len(opt_algo.split('_')) == 3:
                    _, _, mode = opt_algo.split('_')
                    mode = int(mode)
                    eta = 10
                    optimizer = 'mono_smbo'
            else:
                optimizer = opt_algo

            print('Test optimizer: %s' % optimizer)

            seeds = get_seeds(dataset, rep_num)
            for run_id in range(start_id, rep_num):
                if B > 0:
                    task_name = dataset + '_%s_%d_%d_%d' % (task_id, B,
                                                            run_count, run_id)
                else:
                    task_name = dataset + '_%s_%d_%d' % (task_id, run_count,
                                                         run_id)
                seed = seeds[run_id]

                runcount_const = run_count if opt_algo != 'tpe' else tpe_runcount
                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=runcount_const,
                                                runtime=B,
                                                task_name=task_name,
                                                update_mode=mode,
                                                param=eta)

                # Test the CASH performance on test set.
                cash_test_acc = cls.score(X_test, y_test)

                # Load CASH intermediate infos.
                if optimizer == 'smbo':
                    file_id = 'smac'
                elif optimizer == 'tpe':
                    file_id = 'hyperopt'
                elif optimizer == 'mono_smbo':
                    file_id = 'mm_bandit_%d_smac' % mode
                else:
                    raise ValueError('Invalid optimizer!')

                tmp_task_id = '%s_%d' % (task_id, B) if B > 0 else task_id
                tmp_configs, tmp_perfs = load_infos(dataset, tmp_task_id,
                                                    run_count, run_id, file_id)
                if opt_algo != 'tpe':
                    runcount_dict[opt_algo].append(len(tmp_configs))

                model_infos = (tmp_configs, tmp_perfs)
                ensemble_size = 50
                task_type = type_of_target(dm.train_y)
                if optimizer == 'tpe':
                    task_type = 'hyperopt_' + task_type
                metric = accuracy_score

                ensemble_model = EnsembleSelection(model_infos,
                                                   ensemble_size,
                                                   task_type,
                                                   metric,
                                                   n_best=20)
                ensemble_model.fit(dm)

                ens_val_pred = ensemble_model.predict(dm.val_X)
                ens_val_acc = accuracy_score(ens_val_pred, dm.val_y)

                ens_pred = ensemble_model.predict(X_test)
                ens_test_acc = accuracy_score(ens_pred, y_test)

                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                result[key_id] = [cash_test_acc, ens_val_acc, ens_test_acc]
                print(result)

            # Save the test result.
            with open(
                    'data/%s/%s_test_result_%s_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset, opt_algo, task_id, run_count, rep_num,
                 start_id), 'wb') as f:
                pickle.dump(result, f)
Example #30
def test_cash_module():
    rep_num = args.rep
    run_count = args.run_count
    start_id = args.start_runid
    datasets = args.datasets.split(',')
    optimizer_algos = args.opt_algo.split(',')
    task_id = args.task_id
    print(rep_num, run_count, datasets, optimizer_algos, task_id)

    result = dict()
    for dataset in datasets:
        dataset_id = dataset.split('_')[0]
        result_dir = 'data/' + dataset_id
        if not os.path.exists(result_dir):
            os.mkdir(result_dir)

        seeds = get_seeds(dataset, rep_num)
        for run_id in range(start_id, rep_num):
            task_name = dataset + '_%s_%d_%d' % (task_id, run_count, run_id)
            seed = seeds[run_id]

            # Dataset partition.
            X, y, _ = load_data(dataset)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y)
            dm = DataManager(X_train, y_train)

            # Test each optimizer algorithm:
            for optimizer in optimizer_algos:
                # Parse the parameters for each optimizer.
                mode = 2
                eta, r = 2, 2
                if optimizer.startswith('baseline'):
                    optimizer, mode = optimizer.split('_')
                    mode = 1 if mode == 'rand' else 2
                if optimizer.startswith('sh'):
                    if len(optimizer.split('_')) == 2:
                        optimizer, eta = optimizer.split('_')
                        eta = float(eta)
                    else:
                        raise ValueError('Wrong SH params!')
                if optimizer.startswith('rl'):
                    if len(optimizer.split('_')) == 3:
                        _, mode, eta = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'rl_smbo'
                    else:
                        raise ValueError('Wrong RL params!')
                if optimizer.startswith('ts_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'ts_smbo'
                if optimizer.startswith('mcmc_ts'):
                    _, _, mode, eta, r = optimizer.split('_')
                    mode = int(mode)
                    eta = int(eta)
                    r = int(r)
                    optimizer = 'mcmc_ts_smbo'

                if optimizer.startswith('ucb_smbo'):
                    mode = 1
                    if len(optimizer.split('_')) == 3:
                        _, _, mode = optimizer.split('_')
                        mode = int(mode)
                        optimizer = 'ucb_smbo'

                if optimizer.startswith('mono_smbo'):
                    mode = 2
                    if len(optimizer.split('_')) == 4:
                        _, _, mode, r = optimizer.split('_')
                        mode, r = int(mode), int(r)
                        eta = 10
                        optimizer = 'mono_smbo'

                print('Test %s optimizer => %s' % (optimizer, task_name))

                # Construct the AutoML classifier.
                cls = Classifier(optimizer=optimizer,
                                 seed=seed).fit(dm,
                                                metric='accuracy',
                                                runcount=run_count,
                                                task_name=task_name,
                                                update_mode=mode,
                                                eta=eta,
                                                r=r,
                                                param=eta)
                acc = cls.score(X_test, y_test)
                key_id = '%s_%d_%d_%s' % (dataset, run_count, run_id,
                                          optimizer)
                result[key_id] = acc

            # Display and save the test result.
            print(result)
            with open(
                    'data/%s/%s_test_result_%s_%d_%d_%d.pkl' %
                (dataset_id, dataset_id, task_id, run_count, rep_num,
                 start_id), 'wb') as f:
                pickle.dump(result, f)
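The optimizer strings above encode their parameters positionally, e.g. 'mono_smbo_3_5' means mode 3 and r 5. A compact sketch of that convention for the mono_smbo case, with defaults mirroring the code above:

def parse_mono_smbo(opt_algo):
    # Sketch of the 'mono_smbo_<mode>_<r>' convention used above.
    mode, r, eta = 2, 2, 10
    parts = opt_algo.split('_')
    if len(parts) == 4:
        mode, r = int(parts[2]), int(parts[3])
    return 'mono_smbo', mode, r, eta

print(parse_mono_smbo('mono_smbo_3_5'))  # ('mono_smbo', 3, 5, 10)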