Exemple #1
0
    def remove_uninf_cols(self, input_node: DataNode, train_phase=True):
        """Drop uninformative columns from the node's dataframe.

        In the train phase, a column is flagged uninformative when it is
        entirely NaN, or when it is CATEGORICAL and nearly unique per row
        (>= 80% distinct values, i.e. it behaves like an identifier). The
        flagged columns are cached on ``self`` so the identical columns are
        dropped again at prediction time.
        """
        frame = input_node.data[0]
        feat_types = input_node.feature_types
        if train_phase:
            bad_cols, bad_idx = [], []
            n_rows = frame.shape[0]
            for col_idx, col_name in enumerate(list(frame)):
                if frame[col_name].isnull().values.all():
                    # An all-missing column carries no signal.
                    bad_cols.append(col_name)
                    bad_idx.append(col_idx)
                elif feat_types[col_idx] == CATEGORICAL and \
                        len(set(frame[col_name])) >= int(0.8 * n_rows):
                    # Near-unique categorical column acts like an id field.
                    bad_cols.append(col_name)
                    bad_idx.append(col_idx)
            self.uninformative_columns, self.uninformative_idx = bad_cols, bad_idx

        # Filter the type list and the dataframe with the cached selections.
        input_node.feature_types = [
            ftype for i, ftype in enumerate(feat_types)
            if i not in self.uninformative_idx
        ]
        input_node.data[0] = frame.drop(self.uninformative_columns, axis=1)
        return input_node
Exemple #2
0
 def train_valid_split(self, node: DataNode):
     """Split a DataNode into stratified 80/20 train/validation DataNodes."""
     features, labels = node.copy_().data
     splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
     # n_splits=1, so this loop body executes exactly once.
     for tr_idx, va_idx in splitter.split(features, labels):
         x_tr, y_tr = features[tr_idx], labels[tr_idx]
         x_va, y_va = features[va_idx], labels[va_idx]
     train_node = DataNode(data=[x_tr, y_tr], feature_type=node.feature_types.copy())
     valid_node = DataNode(data=[x_va, y_va], feature_type=node.feature_types.copy())
     return train_node, valid_node
    def dec(*args, **kwargs):
        """Decorator wrapper: resolve which columns a transformer targets,
        apply it via the wrapped ``func``, and assemble the output DataNode
        according to the transformer's ``compound_mode``.

        Accepts either ``(trans, input)`` or ``(trans, input, target_fields)``
        positionally; ``target_fields`` may also arrive through kwargs.
        Raises ValueError for any other arity.
        """
        param_name = 'target_fields'
        target_fields = None
        if len(args) == 3:
            trans, input, target_fields = args
            # An explicitly empty list means "not specified".
            if type(target_fields) is list and len(target_fields) == 0:
                target_fields = None
        elif len(args) == 2:
            trans, input = args
            if param_name in kwargs and len(kwargs[param_name]) > 0:
                target_fields = kwargs[param_name]
        else:
            raise ValueError('The number of input is wrong!')

        if target_fields is None:
            # Default to every column whose type matches the transformer's
            # declared input type.
            target_fields = collect_fields(input.feature_types,
                                           trans.input_type)
        if len(target_fields) == 0:
            # Nothing to transform: hand back an untouched copy.
            return input.copy_()

        X, y = input.data
        if isinstance(X, pd.DataFrame):
            X = X.values

        args = (trans, input, target_fields)
        _X = func(*args)
        if isinstance(trans.output_type, list):
            # NOTE(review): collapses a list of output types to its first
            # element — assumes all new columns share one type; confirm.
            trans.output_type = trans.output_type[0]
        _types = [trans.output_type] * _X.shape[1]

        if trans.compound_mode == 'only_new':
            # Keep only the newly generated features.
            new_X = _X
            new_types = _types
        elif trans.compound_mode == 'concatenate':
            # Append the new features after the original ones.
            new_X = np.hstack((X, _X))
            new_types = input.feature_types.copy()
            new_types.extend(_types)
        elif trans.compound_mode == 'replace':
            # Append the new features, then delete the source columns
            # (from both the matrix and the type list).
            new_X = np.hstack((X, _X))
            new_types = input.feature_types.copy()
            new_types.extend(_types)
            new_X = np.delete(new_X, target_fields, axis=1)
            temp_array = np.array(new_types)
            new_types = list(np.delete(temp_array, target_fields))
        else:
            # In-place mode: overwrite the source columns with the outputs;
            # requires a one-to-one column correspondence.
            assert _X.shape[1] == len(target_fields)
            new_X = X.copy()
            new_X[:, target_fields] = _X
            new_types = input.feature_types.copy()

        output_datanode = DataNode((new_X, y), new_types, input.task_type)
        output_datanode.trans_hist = input.trans_hist.copy()
        output_datanode.trans_hist.append(trans.type)
        trans.target_fields = target_fields
        return output_datanode
Exemple #4
0
def load_train_test_data(dataset,
                         data_dir='./',
                         test_size=0.2,
                         random_state=45):
    """Load a dataset and return stratified (train_node, test_node) DataNodes."""
    features, labels, feat_types = load_data(dataset, data_dir, False)
    x_tr, x_te, y_tr, y_te = train_test_split(
        features, labels, test_size=test_size,
        random_state=random_state, stratify=labels)
    train_node = DataNode(data=[x_tr, y_tr], feature_type=feat_types.copy())
    test_node = DataNode(data=[x_te, y_te], feature_type=feat_types.copy())
    return train_node, test_node
def evaluate_feature_selectors():
    """Smoke-test GenericUnivariateSelector on a tiny hard-coded DataNode."""
    toy = (np.array([[0, 1.2, 2, 1], [0, 1, 2, 1], [0, 3, 2, 2],
                     [0, 5, 4, 5]]), np.array([1, 2, 3, 4]))
    toy_types = [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE]
    node = DataNode(toy, toy_types)

    selector = GenericUnivariateSelector()
    selector.concatenate = False
    # Alternatives kept for manual experimentation:
    # transformer = VarianceSelector()
    # transformer = ModelBasedSelector(param='rf')
    # output_datanode = transformer.operate([datanode])
    result = selector.operate(node)
    print(result)
    print(result.data)
    print(result.feature_types)
    def load_train_csv(self,
                       file_location,
                       label_col=-1,
                       drop_index=None,
                       keep_default_na=True,
                       na_values=None,
                       header='infer',
                       sep=','):
        """Read a training table from a csv/xls file, clean it, infer the
        feature types, and return it wrapped in a DataNode.

        :param file_location: path to a ``.csv`` or ``.xls`` file.
        :param label_col: index of the label column.
        :param drop_index: optional index column(s) handed to the cleaner.
        :param keep_default_na: forwarded to ``pandas.read_csv``.
        :param na_values: extra strings to treat as NA; merged into
            ``self.na_values`` for this and subsequent loads.
        :param header: forwarded to ``pandas.read_csv``.
        :param sep: field separator (csv only).
        :raises ValueError: if the file extension is unsupported.
        :return: DataNode holding ``[train_X, train_y]`` and feature types.
        """
        # Merge any user-supplied NA markers into the loader's NA set.
        if na_values is not None:
            na_set = set(self.na_values)
            for item in na_values:
                na_set.add(item)
            self.na_values = list(na_set)

        if file_location.endswith('csv'):
            df = pd.read_csv(file_location,
                             keep_default_na=keep_default_na,
                             na_values=self.na_values,
                             header=header,
                             sep=sep)
        elif file_location.endswith('xls'):
            # NOTE(review): .xls is parsed with read_csv (no `sep`), not
            # read_excel — confirm this is intended.
            df = pd.read_csv(file_location,
                             keep_default_na=keep_default_na,
                             na_values=self.na_values,
                             header=header)
        else:
            raise ValueError('Unsupported file format: %s!' %
                             file_location.split('.')[-1])

        # Drop rows that are entirely NaN. Bug fix: dropna() returns a new
        # frame, so the result must be re-assigned (the old call was a no-op).
        df = df.dropna(how='all')

        # Clean the data where the label columns have nans.
        self.clean_data_with_nan(df, label_col, drop_index=drop_index)

        # The columns with missing values.
        columns_missed = df.columns[df.isnull().any()].tolist()

        # Identify the feature types.
        self.set_feat_types(df, columns_missed)

        self.train_X = df
        data = [self.train_X, self.train_y]
        return DataNode(data, self.feature_types)
def test_additional_transformations():
    """Smoke-test one generator transformation (DiscreteCategorizer)."""
    toy = (np.array([[0, 1.2, 2, 1], [0.01, 1, 2, 1], [0.02, 3, 2, 2],
                     [0.015, 5, 4, 5], [0.12, 3, 2, 2],
                     [0.16, 5, 4, 5]]), np.array([1, 1, 2, 2, 3, 3]))
    toy_types = [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE]
    node = DataNode(toy, toy_types)
    from automlToolkit.components.feature_engineering.transformations.generator.arithmetic_transformer import \
        ArithmeticTransformation
    from automlToolkit.components.feature_engineering.transformations.generator.lda_decomposer import LdaDecomposer
    from automlToolkit.components.feature_engineering.transformations.continous_discretizer import KBinsDiscretizer
    from automlToolkit.components.feature_engineering.transformations.discrete_categorizer import DiscreteCategorizer
    # Alternative transformers kept for manual experimentation:
    # trans = ArithmeticTransformation()
    # trans = LdaDecomposer()
    # trans = KBinsDiscretizer()
    categorizer = DiscreteCategorizer()
    result = categorizer.operate(node)
    print(result)
    print(result.data)
    def load_test_csv(self,
                      file_location,
                      has_label=False,
                      label_col=-1,
                      drop_index=None,
                      keep_default_na=True,
                      header='infer',
                      sep=','):
        """Read a test csv, clean it, and return it wrapped in a DataNode.

        :param file_location: path to the csv file.
        :param has_label: whether the test file contains a label column.
        :param label_col: index of the label column (if present).
        :param drop_index: optional index column(s) handed to the cleaner.
        :param keep_default_na: forwarded to ``pandas.read_csv``.
        :param header: forwarded to ``pandas.read_csv``.
        :param sep: field separator.
        :return: DataNode holding ``[test_X, test_y]`` and the feature
            types inferred during training.
        """
        df = pd.read_csv(file_location,
                         keep_default_na=keep_default_na,
                         na_values=self.na_values,
                         header=header,
                         sep=sep)
        # Drop rows that are entirely NaN. Bug fix: dropna() returns a new
        # frame, so the result must be re-assigned (the old call was a no-op).
        df = df.dropna(how='all')
        self.clean_data_with_nan(df,
                                 label_col,
                                 phase='test',
                                 drop_index=drop_index,
                                 has_label=has_label)
        self.test_X = df

        data = [self.test_X, self.test_y]
        return DataNode(data, self.feature_types)
def test_selector():
    """Exercise several feature selectors against one shared toy DataNode."""
    toy = (np.array([[0, 1.2, 2, 1], [0, 1, 2, 1], [0, 3, 2, 2],
                     [0, 5, 4, 5]]), np.array([1, 2, 3, 4]))
    node = DataNode(toy, [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE])

    def run(selector):
        # Apply one selector and dump the resulting node.
        out = selector.operate(node)
        print(out)
        print(out.data)

    # Test generic univariate selector.
    selector = GenericUnivariateSelector()
    selector.concatenate = False
    run(selector)

    # Test percentile selector.
    from automlToolkit.components.feature_engineering.transformations.selector.percentile_selector import \
        PercentileSelector
    selector = PercentileSelector(percentile=25)
    selector.concatenate = False
    run(selector)

    # Test model based selector.
    from automlToolkit.components.feature_engineering.transformations.selector.model_based_selector import \
        ModelBasedSelector
    run(ModelBasedSelector(param='et'))

    # Test variance threshold.
    from automlToolkit.components.feature_engineering.transformations.selector.variance_selector import VarianceSelector
    run(VarianceSelector())
    def __init__(self, task_type, estimator_id: str, data: DataNode, metric,
                 share_fe=False, output_dir='logs',
                 per_run_time_limit=120,
                 per_run_mem_limit=5120,
                 dataset_id='default',
                 eval_type='holdout',
                 mth='rb', sw_size=3,
                 n_jobs=1, seed=1,
                 enable_intersection=True,
                 number_of_unit_resource=2):
        """Per-estimator two-armed bandit over feature engineering ('fe')
        and hyperparameter optimization ('hpo').

        Builds the estimator's configuration space from ``estimator_id``
        and ``task_type``, constructs one evaluator and one optimizer per
        arm, and initializes the reward/cost/incumbent bookkeeping.

        :param task_type: task constant; must be in CLS_TASKS or REG_TASKS.
        :param estimator_id: id of the algorithm this bandit tunes.
        :param data: training data; a private copy is kept.
        :param metric: scorer forwarded to both evaluators.
        :param share_fe: run the FE optimizer in shared mode.
        :param eval_type: resampling strategy (e.g. 'holdout').
        :param mth: bandit strategy name (e.g. 'rb').
        :param sw_size: sliding-window size for reward aggregation.
        :param number_of_unit_resource: multiplier on the per-iteration HPO
            trial budget (one unit == 5 trials).
        :raises ValueError: for an unknown estimator id or task type.
        """
        self.task_type = task_type
        self.metric = metric
        self.number_of_unit_resource = number_of_unit_resource
        # One unit of resource, that's, the number of trials per iteration.
        self.one_unit_of_resource = 5
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.estimator_id = estimator_id
        self.evaluation_type = eval_type
        self.original_data = data.copy_()
        self.share_fe = share_fe
        self.output_dir = output_dir
        self.mth = mth
        self.seed = seed
        self.n_jobs = n_jobs
        self.sliding_window_size = sw_size
        self.logger = get_logger('%s:%s-%d=>%s' % (
            __class__.__name__, dataset_id, seed, estimator_id))
        np.random.seed(self.seed)

        # Bandit settings: per-arm reward/cost histories and update flags.
        self.arms = ['fe', 'hpo']
        self.rewards = dict()
        self.optimizer = dict()
        self.evaluation_cost = dict()
        self.update_flag = dict()
        # Global incumbent.
        self.inc = dict()
        self.local_inc = dict()
        self.local_hist = {'fe': [], 'hpo': []}
        for arm in self.arms:
            self.rewards[arm] = list()
            self.update_flag[arm] = False
            self.evaluation_cost[arm] = list()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()
        # Start below any achievable score so the first result is accepted.
        self.incumbent_perf = float("-INF")
        self.early_stopped_flag = False
        self.enable_intersection = enable_intersection

        # Fetch hyperparameter space.
        if self.task_type in CLS_TASKS:
            from automlToolkit.components.models.classification import _classifiers, _addons
            if estimator_id in _classifiers:
                clf_class = _classifiers[estimator_id]
            elif estimator_id in _addons.components:
                clf_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = clf_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        elif self.task_type in REG_TASKS:
            from automlToolkit.components.models.regression import _regressors, _addons
            if estimator_id in _regressors:
                reg_class = _regressors[estimator_id]
            elif estimator_id in _addons.components:
                reg_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = reg_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        else:
            raise ValueError("Unknown task type %s!" % self.task_type)

        self.config_space = cs
        self.default_config = cs.get_default_configuration()
        self.config_space.seed(self.seed)

        # Build the Feature Engineering component.
        # The 'fe' evaluator varies data under the default config; the
        # 'hpo' evaluator varies configs over the original data.
        if self.task_type in CLS_TASKS:
            fe_evaluator = ClassificationEvaluator(self.default_config, scorer=self.metric,
                                                   name='fe', resampling_strategy=self.evaluation_type,
                                                   seed=self.seed)
            hpo_evaluator = ClassificationEvaluator(self.default_config, scorer=self.metric,
                                                    data_node=self.original_data, name='hpo',
                                                    resampling_strategy=self.evaluation_type,
                                                    seed=self.seed)
        elif self.task_type in REG_TASKS:
            fe_evaluator = RegressionEvaluator(self.default_config, scorer=self.metric,
                                               name='fe', resampling_strategy=self.evaluation_type,
                                               seed=self.seed)
            hpo_evaluator = RegressionEvaluator(self.default_config, scorer=self.metric,
                                                data_node=self.original_data, name='hpo',
                                                resampling_strategy=self.evaluation_type,
                                                seed=self.seed)
        else:
            raise ValueError('Invalid task type!')

        self.optimizer['fe'] = build_fe_optimizer(self.evaluation_type, self.task_type, self.original_data,
                                                  fe_evaluator, estimator_id, per_run_time_limit,
                                                  per_run_mem_limit, self.seed,
                                                  shared_mode=self.share_fe, n_jobs=n_jobs)

        self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

        # Build the HPO component.
        # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource

        self.optimizer['hpo'] = build_hpo_optimizer(self.evaluation_type, hpo_evaluator, cs, output_dir=output_dir,
                                                    per_run_time_limit=per_run_time_limit,
                                                    trials_per_iter=trials_per_iter,
                                                    seed=self.seed, n_jobs=n_jobs)

        # Seed the incumbents/history with the untransformed data and the
        # default configuration.
        self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
        self.local_hist['fe'].append(self.original_data)
        self.local_hist['hpo'].append(self.default_config)
Exemple #11
0
import numpy as np
import os
import sys

sys.path.append(os.getcwd())

from automlToolkit.components.feature_engineering.transformations.preprocessor.text2vector import \
    Text2VectorTransformation
from automlToolkit.components.feature_engineering.transformation_graph import DataNode
from automlToolkit.components.utils.constants import *

# Toy mixed-type dataset: numeric / text / text / discrete columns.
x = np.array([[1, 'I am good', 'I am right', 3],
              [2, 'He is silly', 'He is stupid', 4]])
y = np.array([0, 1])

feature_type = [NUMERICAL, TEXT, TEXT, DISCRETE]
data = (x, y)
datanode = DataNode(data, feature_type)

# Vectorize the text columns and show the transformed node.
t2v = Text2VectorTransformation()
print(t2v.operate(datanode))
    def __init__(self, trial_num, classifier_ids: List[str], data: DataNode,
                 per_run_time_limit=300, output_dir=None,
                 dataset_name='default_dataset',
                 tmp_directory='logs',
                 eval_type='holdout',
                 share_feature=False,
                 num_meta_configs=0,
                 n_jobs=1,
                 logging_config=None,
                 opt_algo='rb',
                 seed=1):
        """First-layer bandit: one arm (a SecondLayerBandit) per candidate
        classifier, sharing a single copy of the training data.

        :param trial_num: total trial budget for the bandit.
        :param classifier_ids: subset of {'adaboost','bernoulli_nb','decision_tree','extra_trees','gaussian_nb','gradient_boosting',
            'gradient_boosting','k_nearest_neighbors','lda','liblinear_svc','libsvm_svc','multinomial_nb','passive_aggressive','qda',
            'random_forest','sgd'}
        :param data: training data; a private copy is kept.
        :param num_meta_configs: number of meta-learned configs to fetch.
        :param opt_algo: strategy name forwarded to each sub-bandit as `mth`.
        """
        self.original_data = data.copy_()
        self.trial_num = trial_num
        self.n_jobs = n_jobs
        # NOTE(review): alpha/B are bandit exploration constants — their
        # exact role is defined where the selection rule uses them; confirm.
        self.alpha = 6
        self.B = 0.01
        self.seed = seed
        self.shared_mode = share_feature
        np.random.seed(self.seed)

        # Best configuration found so far (filled in during optimization).
        self.optimal_algo_id = None
        self.nbest_algo_ids = None
        self.best_lower_bounds = None

        # Set up backend (log/temp directory and logger).
        self.dataset_name = dataset_name
        self.tmp_directory = tmp_directory
        self.logging_config = logging_config
        if not os.path.exists(self.tmp_directory):
            os.makedirs(self.tmp_directory)
        logger_name = "%s-%s" % (__class__.__name__, self.dataset_name)
        self.logger = self._get_logger(logger_name)

        # Meta-learning setting.
        self.meta_configs = self.fetch_meta_configs(num_meta_configs)

        # Bandit settings: per-arm reward/cost/data histories.
        self.incumbent_perf = -1.
        self.arms = classifier_ids
        self.rewards = dict()
        self.sub_bandits = dict()
        self.evaluation_cost = dict()
        self.fe_datanodes = dict()
        self.eval_type = eval_type

        # One SecondLayerBandit per candidate algorithm, all sharing the
        # same copied training data.
        for arm in self.arms:
            self.rewards[arm] = list()
            self.evaluation_cost[arm] = list()
            self.fe_datanodes[arm] = list()
            self.sub_bandits[arm] = SecondLayerBandit(
                arm, self.original_data, output_dir=output_dir,
                per_run_time_limit=per_run_time_limit,
                share_fe=self.shared_mode,
                seed=self.seed,
                eval_type=eval_type,
                dataset_id=dataset_name,
                n_jobs=self.n_jobs,
                mth=opt_algo
            )

        self.action_sequence = list()
        self.final_rewards = list()
        self.start_time = time.time()
        self.time_records = list()
def test_generator():
    """Run each generator-type transformation over one shared toy DataNode."""
    toy = (np.array([[0, 1.2, 2, 1], [0, 1, 2, 1], [0, 3, 2, 2],
                     [0, 5, 4, 5]]), np.array([1, 2, 3, 4]))
    node = DataNode(toy, [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE])

    def run(generator):
        # Apply one generator and dump the resulting node.
        out = generator.operate(node)
        print(out)
        print(out.data)

    # Test SVD.
    from automlToolkit.components.feature_engineering.transformations.generator.svd_decomposer import SvdDecomposer
    gen = SvdDecomposer()
    gen.concatenate = False
    run(gen)

    # Test feature agglomerate.
    from automlToolkit.components.feature_engineering.transformations.generator.feature_agglomeration_decomposer import \
        FeatureAgglomerationDecomposer
    gen = FeatureAgglomerationDecomposer()
    gen.concatenate = False
    run(gen)

    # Test PCA.
    from automlToolkit.components.feature_engineering.transformations.generator.pca_decomposer import PcaDecomposer
    gen = PcaDecomposer()
    gen.concatenate = False
    run(gen)

    # Test kernel PCA.
    from automlToolkit.components.feature_engineering.transformations.generator.kernel_pca import KernelPCA
    gen = KernelPCA()
    gen.concatenate = False
    run(gen)

    # Test fast ICA.
    from automlToolkit.components.feature_engineering.transformations.generator.fast_ica_decomposer import \
        FastIcaDecomposer
    gen = FastIcaDecomposer()
    gen.concatenate = False
    run(gen)

    # Test LDA.
    # from components.transformers.generator.lda_decomposer import LdaDecomposer
    # scaler = LdaDecomposer(frac=0.3)
    # scaler.concatenate = False
    # output_datanode = scaler.operate(datanode)
    # print(output_datanode)
    # print(output_datanode.data)

    # Test random trees embedding (keeps its default concatenate setting).
    from automlToolkit.components.feature_engineering.transformations.generator.random_trees_embedding import \
        RandomTreesEmbeddingTransformation
    run(RandomTreesEmbeddingTransformation())
    def predict_proba(self, X_test, is_weighted=False):
        """Predict class probabilities for X_test with a four-model ensemble.

        The base models combine the two incumbents:
            model 1: local_inc['fe'] features + default config
            model 2: original features + local_inc['hpo'] config
            model 3: local_inc['fe'] features + local_inc['hpo'] config
            model 4: original features + default config

        :param X_test: raw test features; the incumbent FE pipeline is
            re-applied to them before models 1 and 3 predict.
        :param is_weighted: if True, learn ensemble-selection weights on a
            stratified held-out split; otherwise average the four models.
        :return: array of combined class probabilities.
        """
        X_train_ori, y_train_ori = self.original_data.data
        X_train_inc, y_train_inc = self.local_inc['fe'].data

        # Train the four base models on the full training data.
        model1_clf = fetch_predict_estimator(self.task_type, self.default_config, X_train_inc, y_train_inc)
        model2_clf = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train_ori, y_train_ori)
        model3_clf = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train_inc, y_train_inc)
        model4_clf = fetch_predict_estimator(self.task_type, self.default_config, X_train_ori, y_train_ori)

        if is_weighted:
            # Based on performance on the validation set
            # TODO: Save the results so that the models will not be trained again
            from automlToolkit.components.ensemble.ensemble_selection import EnsembleSelection
            from autosklearn.metrics import balanced_accuracy
            sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=1)
            X, y = X_train_ori.copy(), y_train_ori.copy()
            _X, _y = X_train_inc.copy(), y_train_inc.copy()
            # n_splits=1: the loop runs once and leaves the split in scope.
            # NOTE(review): the same indices index both feature sets —
            # assumes original and FE-transformed data are row-aligned.
            for train_index, test_index in sss.split(X, y):
                X_train, X_val, y_train, y_val = X[train_index], X[test_index], y[train_index], y[test_index]
                _X_train, _X_val, _y_train, _y_val = _X[train_index], _X[test_index], _y[train_index], _y[test_index]

            assert (y_val == _y_val).all()
            # Re-train the four models on the sub-split so the held-out
            # fold gives an unbiased weighting signal.
            model1_clf_temp = fetch_predict_estimator(self.task_type, self.default_config, _X_train, _y_train)
            model2_clf_temp = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], X_train, y_train)
            model3_clf_temp = fetch_predict_estimator(self.task_type, self.local_inc['hpo'], _X_train, _y_train)
            model4_clf_temp = fetch_predict_estimator(self.task_type, self.default_config, X_train, y_train)
            pred1 = model1_clf_temp.predict_proba(_X_val)
            pred2 = model2_clf_temp.predict_proba(X_val)
            pred3 = model3_clf_temp.predict_proba(_X_val)
            pred4 = model4_clf_temp.predict_proba(X_val)

            # Ensemble size is a hyperparameter
            es = EnsembleSelection(ensemble_size=20, task_type=1, metric=balanced_accuracy,
                                   random_state=np.random.RandomState(self.seed))
            es.fit([pred1, pred2, pred3, pred4], y_val, None)
            weights = es.weights_
            print("weights " + str(weights))

        # Make sure that the estimator has "predict_proba"
        # Apply the incumbent FE pipeline to the raw test features.
        _test_node = DataNode(data=[X_test, None], feature_type=self.original_data.feature_types.copy())
        _X_test = self.optimizer['fe'].apply(_test_node, self.local_inc['fe']).data[0]
        pred1 = model1_clf.predict_proba(_X_test)
        pred2 = model2_clf.predict_proba(X_test)
        pred3 = model3_clf.predict_proba(_X_test)
        pred4 = model4_clf.predict_proba(X_test)

        if is_weighted:
            final_pred = weights[0] * pred1 + weights[1] * pred2 + weights[2] * pred3 + weights[3] * pred4
        else:
            final_pred = (pred1 + pred2 + pred3 + pred4) / 4

        return final_pred
def evaluate_transformation_graph():
    """Build a small transformation graph (impute -> one-hot -> scale ->
    merge), print the intermediate nodes, then replay the recorded
    transformations on fresh test data in topological order.
    """
    data = (np.array([[np.nan, 2, 1], [1, 2, 2], [3, 4, 2],
                      [5, np.nan, 1]]), np.array([1, 2, 3, 4]))
    feature_type = [NUMERICAL, NUMERICAL, CATEGORICAL]
    datanode = DataNode(data, feature_type)

    graph = TransformationGraph()
    graph.add_node(datanode)

    # Impute the two numerical columns.
    transformer = ImputationTransformation()
    output_datanode1 = transformer.operate(datanode, target_fields=[0, 1])
    graph.add_node(output_datanode1)
    # NOTE(review): node_id() is used here but get_node_id() below —
    # confirm which accessor the graph API expects.
    graph.add_edge(datanode.node_id(), output_datanode1.node_id(), transformer)

    # One-hot encode the categorical column.
    transformer = OneHotTransformation()
    output_datanode2 = transformer.operate(output_datanode1)
    graph.add_node(output_datanode2)
    graph.add_edge(output_datanode1.get_node_id(),
                   output_datanode2.get_node_id(), transformer)

    # Scaled features concatenated onto the originals.
    transformer = ScaleTransformation()
    transformer.concatenate = True
    output_datanode3 = transformer.operate(output_datanode2)
    graph.add_node(output_datanode3)
    graph.add_edge(output_datanode2.get_node_id(),
                   output_datanode3.get_node_id(), transformer)

    print(output_datanode3)
    print(output_datanode3.data)

    # Scaled features replacing the originals.
    transformer = ScaleTransformation()
    transformer.concatenate = False
    output_datanode4 = transformer.operate(output_datanode2)
    graph.add_node(output_datanode4)
    graph.add_edge(output_datanode2.get_node_id(),
                   output_datanode4.get_node_id(), transformer)

    # Merge both scaled variants into a single node.
    transformer = Merger()
    output_datanode5 = transformer.operate(
        [output_datanode3, output_datanode4])
    graph.add_node(output_datanode5)
    graph.add_transformation(
        [output_datanode3.get_node_id(),
         output_datanode4.get_node_id()], output_datanode5.get_node_id(),
        transformer)

    print(output_datanode5)
    print(output_datanode5.data)

    order_ids = graph.topological_sort()
    print(order_ids)
    test_data = (np.array([[np.nan, 2, 1], [1, 2, 1], [3, 2, 1],
                           [3, np.nan, 1]]), None)
    # Bug fix: the local variable is `feature_type`; `feature_types` was an
    # undefined name and raised NameError at runtime.
    test_node = DataNode(test_data, feature_type)

    inputnode = graph.get_node(order_ids[0])
    inputnode.set_values(test_node)

    # Replay every recorded transformation on the test node, feeding each
    # graph node its predecessors' outputs in topological order.
    for idx in range(1, len(order_ids)):
        node_id = order_ids[idx]

        input_node_list = list()
        for input_id in graph.input_data_dict[node_id]:
            inputnode = graph.get_node(input_id)
            input_node_list.append(inputnode)
        # A single-input node gets the bare node; multi-input gets a list.
        inputnode = input_node_list[0] if len(
            input_node_list) == 1 else input_node_list

        edge = graph.get_edge(graph.input_edge_dict[node_id])
        outputnode = edge.transformer.operate(inputnode, edge.target_fields)
        graph.get_node(node_id).set_values(outputnode)
    output_node = graph.get_node(order_ids[-1])
    print(output_node)
    print(output_node.data)
Exemple #16
0
    def __init__(self, classifier_id: str, data: DataNode,
                 share_fe=False, output_dir='logs',
                 per_run_time_limit=120,
                 per_run_mem_limit=5120,
                 eval_type='cv', dataset_id='default',
                 mth='rb', sw_size=3, strategy='avg',
                 n_jobs=1, seed=1):
        """Per-classifier two-armed bandit over feature engineering ('fe')
        and hyperparameter optimization ('hpo'), backed by auto-sklearn's
        classifier configuration spaces and SMAC.

        :param classifier_id: key into auto-sklearn's ``_classifiers``.
        :param data: training data; a private copy is kept.
        :param share_fe: run the FE optimizer in shared mode.
        :param eval_type: resampling strategy (e.g. 'cv', 'holdout').
        :param mth: bandit strategy name (e.g. 'rb').
        :param sw_size: sliding-window size for reward aggregation.
        :param strategy: reward-aggregation strategy (e.g. 'avg').
        """
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.classifier_id = classifier_id
        self.evaluation_type = eval_type
        self.original_data = data.copy_()
        self.share_fe = share_fe
        self.output_dir = output_dir
        self.mth = mth
        self.strategy = strategy
        self.seed = seed
        self.sliding_window_size = sw_size
        self.logger = get_logger('%s:%s-%d=>%s' % (__class__.__name__, dataset_id, seed, classifier_id))
        np.random.seed(self.seed)

        # Bandit settings: per-arm reward/cost histories and incumbents.
        self.arms = ['fe', 'hpo']
        self.rewards = dict()
        self.optimizer = dict()
        self.evaluation_cost = dict()
        self.inc = dict()
        self.local_inc = dict()
        for arm in self.arms:
            self.rewards[arm] = list()
            self.evaluation_cost[arm] = list()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()
        self.incumbent_perf = -1.
        self.incumbent_source = None
        self.update_flag = dict()
        self.imp_rewards = dict()
        for arm in self.arms:
            self.update_flag[arm] = True
            self.imp_rewards[arm] = list()

        # Build the configuration space: the classifier's own space plus a
        # fixed "estimator" hyperparameter naming it.
        from autosklearn.pipeline.components.classification import _classifiers
        clf_class = _classifiers[classifier_id]
        cs = clf_class.get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", classifier_id)
        cs.add_hyperparameter(model)
        self.config_space = cs
        self.default_config = cs.get_default_configuration()
        self.config_space.seed(self.seed)

        # Build the Feature Engineering component.
        # The 'fe' evaluator varies data under the default config.
        fe_evaluator = Evaluator(self.default_config,
                                 name='fe', resampling_strategy=self.evaluation_type,
                                 seed=self.seed)
        self.optimizer['fe'] = EvaluationBasedOptimizer(
                self.original_data, fe_evaluator,
                classifier_id, per_run_time_limit, per_run_mem_limit, self.seed,
                shared_mode=self.share_fe, n_jobs=n_jobs)
        self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

        # Build the HPO component.
        # HPO gets half as many trials per iteration as FE has
        # transformation types.
        trials_per_iter = len(self.optimizer['fe'].trans_types)
        hpo_evaluator = Evaluator(self.default_config,
                                  data_node=self.original_data, name='hpo',
                                  resampling_strategy=self.evaluation_type,
                                  seed=self.seed)
        if n_jobs == 1:
            self.optimizer['hpo'] = SMACOptimizer(
                hpo_evaluator, cs, output_dir=output_dir, per_run_time_limit=per_run_time_limit,
                trials_per_iter=trials_per_iter // 2, seed=self.seed)
        else:
            self.optimizer['hpo'] = PSMACOptimizer(
                hpo_evaluator, cs, output_dir=output_dir, per_run_time_limit=per_run_time_limit,
                trials_per_iter=trials_per_iter // 2, seed=self.seed,
                n_jobs=n_jobs
            )
        self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
Exemple #17
0
 def get_data_node(self, X, y):
     """Wrap features/labels in a DataNode tagged with the stored feature types."""
     feat_types = self.feature_types
     if feat_types is None:
         raise ValueError("Feature type missing")
     return DataNode([X, y], feat_types)