def test_additional_transformations():
    data = (np.array([[0, 1.2, 2, 1], [0.01, 1, 2, 1], [0.02, 3, 2, 2],
                      [0.015, 5, 4, 5], [0.12, 3, 2, 2],
                      [0.16, 5, 4, 5]]), np.array([1, 1, 2, 2, 3, 3]))
    feature_type = [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE]
    datanode = DataNode(data, feature_type)
    from mindware.components.feature_engineering.transformations.discrete_categorizer import DiscreteCategorizer
    # trans = ArithmeticTransformation()
    # trans = LdaDecomposer()
    # trans = KBinsDiscretizer()
    trans = DiscreteCategorizer()
    output_datanode = trans.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)
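# Note: these standalone snippets assume a common preamble; the imports shown
# later in this listing (Example #9) suggest the following:
import numpy as np
from mindware.components.feature_engineering.transformation_graph import DataNode
from mindware.components.utils.constants import *  # NUMERICAL, DISCRETE, CATEGORICAL, TEXT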
Example #2
    def remove_uninf_cols(self, input_node: DataNode, train_phase=True):
        raw_dataframe = input_node.data[0]
        types = input_node.feature_types
        if train_phase:
            # Remove the uninformative columns.
            uninformative_columns, uninformative_idx = list(), list()
            for idx, column in enumerate(list(raw_dataframe)):
                if raw_dataframe[column].isnull().values.all():
                    uninformative_columns.append(column)
                    uninformative_idx.append(idx)
                    continue
                if types[idx] == CATEGORICAL:
                    num_sample = input_node.data[0].shape[0]
                    num_unique = len(set(input_node.data[0][column]))
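                    # Heuristic: a categorical column whose values are at
                    # least 80% unique is treated as ID-like and dropped.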
                    if num_unique >= int(0.8 * num_sample):
                        uninformative_columns.append(column)
                        uninformative_idx.append(idx)
            self.uninformative_columns, self.uninformative_idx = uninformative_columns, uninformative_idx

        input_node.feature_types = [types[idx] for idx in range(len(types)) if idx not in self.uninformative_idx]
        raw_dataframe = raw_dataframe.drop(self.uninformative_columns, axis=1)
        input_node.data[0] = raw_dataframe
        return input_node
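# A hedged sketch of what remove_uninf_cols drops (pandas usage is standard;
# the DataNode wiring follows this listing):
import pandas as pd
df = pd.DataFrame({'a': [None, None, None],   # entirely null -> dropped
                   'b': [1, 2, 3],            # informative -> kept
                   'c': ['u1', 'u2', 'u3']})  # categorical, 100% unique -> dropped
# With feature types [NUMERICAL, NUMERICAL, CATEGORICAL], columns 'a' and 'c'
# end up in uninformative_columns and are removed from the node.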
def evaluate_feature_selectors():
    data = (np.array([[0, 1.2, 2, 1], [0, 1, 2, 1], [0, 3, 2, 2],
                      [0, 5, 4, 5]]), np.array([1, 2, 3, 4]))
    feature_type = [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE]
    datanode = DataNode(data, feature_type)

    scaler = GenericUnivariateSelector()
    print(dir(scaler))
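    # concatenate appears to control whether operate() returns only the
    # transformed features (False) or appends them to the originals (True);
    # inferred from usage in this listing, not verified against mindware docs.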
    scaler.concatenate = False
    output_datanode = scaler.operate(datanode)
    # transformer = VarianceSelector()
    # transformer = ModelBasedSelector(param='rf')
    # output_datanode = transformer.operate([datanode])
    print(scaler.get_attributes())
    print(output_datanode)
    print(output_datanode.data)
    print(output_datanode.feature_types)
def test_selector():
    data = (np.array([[0, 1.2, 2, 1], [0, 1, 2, 1], [0, 3, 2, 2],
                      [0, 5, 4, 5]]), np.array([1, 2, 3, 4]))
    feature_type = [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE]
    datanode = DataNode(data, feature_type)

    # Test generic univariate selector.
    scaler = GenericUnivariateSelector()
    scaler.concatenate = False
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)

    # Test percentile selector.
    from mindware.components.feature_engineering.transformations.selector.percentile_selector import \
        PercentileSelector
    scaler = PercentileSelector(percentile=25)
    scaler.concatenate = False
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)

    # Test model based selector.
    from mindware.components.feature_engineering.transformations.selector.model_based_selector import \
        ModelBasedSelector
    scaler = ModelBasedSelector(param='et')
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)

    # Test variance threshold.
    from mindware.components.feature_engineering.transformations.selector.variance_selector import VarianceSelector
    scaler = VarianceSelector()
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)
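# These test helpers just print their results; a minimal driver for this
# snippet:
if __name__ == '__main__':
    test_selector()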
Example #5
    def __init__(self,
                 node_list,
                 node_index,
                 task_type,
                 timestamp,
                 fe_config_space: ConfigurationSpace,
                 cash_config_space: ConfigurationSpace,
                 data: DataNode,
                 fixed_config=None,
                 trial_num=0,
                 time_limit=None,
                 metric='acc',
                 optimizer='smac',
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 resampling_params=None,
                 n_jobs=1,
                 seed=1):
        # Tree setting
        self.node_list = node_list
        self.node_index = node_index

        # Set up backend.
        self.dataset_name = dataset_name
        self.trial_num = trial_num
        self.time_limit = time_limit
        self.per_run_time_limit = per_run_time_limit
        self.start_time = time.time()
        self.logger = get_logger('Soln-ml: %s' % dataset_name)

        # Basic settings.
        self.eval_type = eval_type
        self.resampling_params = resampling_params
        self.task_type = task_type
        self.timestamp = timestamp
        self.fe_config_space = fe_config_space
        self.cash_config_space = cash_config_space
        self.fixed_config = fixed_config
        self.original_data = data.copy_()
        self.metric = get_metric(metric)
        self.optimizer = optimizer
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.n_jobs = n_jobs
        self.seed = seed
        self.output_dir = output_dir

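        # Search state: early-stop/timeout flags and the incumbent (best
        # configuration and performance found so far).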
        self.early_stop_flag = False
        self.timeout_flag = False
        self.incumbent_perf = -float("INF")
        self.incumbent = None
        self.eval_dict = dict()

        if self.task_type in CLS_TASKS:
            self.if_imbal = is_imbalanced_dataset(self.original_data)
        else:
            self.if_imbal = False

        self.es = None
Example #6
    def get_data_node(self, X, y):
        if self.feature_types is None:
            raise ValueError("Feature type missing")
        return DataNode([X, y], self.feature_types, feature_names=self.feature_names)
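# Hedged usage sketch (this method lives on a mindware estimator; shapes and
# values below are illustrative):
X, y = np.zeros((4, 2)), np.array([0, 1, 0, 1])
node = DataNode([X, y], [NUMERICAL, NUMERICAL])
print(node)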
def test_generator():
    data = (np.array([[0, 1.2, 2, 1], [0, 1, 2, 1], [0, 3, 2, 2],
                      [0, 5, 4, 5]]), np.array([1, 2, 3, 4]))
    feature_type = [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE]
    datanode = DataNode(data, feature_type)

    # Test SVD.
    from mindware.components.feature_engineering.transformations.generator.svd_decomposer import SvdDecomposer
    scaler = SvdDecomposer()
    scaler.concatenate = False
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)

    # Test feature agglomerate.
    from mindware.components.feature_engineering.transformations.generator.feature_agglomeration_decomposer import \
        FeatureAgglomerationDecomposer
    scaler = FeatureAgglomerationDecomposer()
    scaler.concatenate = False
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)

    # Test PCA.
    from mindware.components.feature_engineering.transformations.generator.pca_decomposer import PcaDecomposer
    scaler = PcaDecomposer()
    scaler.concatenate = False
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)

    # Test kernel PCA.
    from mindware.components.feature_engineering.transformations.generator.kernel_pca import KernelPCA
    scaler = KernelPCA()
    scaler.concatenate = False
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)

    # Test fast ICA.
    from mindware.components.feature_engineering.transformations.generator.fast_ica_decomposer import \
        FastIcaDecomposer
    scaler = FastIcaDecomposer()
    scaler.concatenate = False
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)

    # Test LDA.
    # from components.transformers.generator.lda_decomposer import LdaDecomposer
    # scaler = LdaDecomposer(frac=0.3)
    # scaler.concatenate = False
    # output_datanode = scaler.operate(datanode)
    # print(output_datanode)
    # print(output_datanode.data)

    # Test random trees embedding.
    from mindware.components.feature_engineering.transformations.generator.random_trees_embedding import \
        RandomTreesEmbeddingTransformation
    scaler = RandomTreesEmbeddingTransformation()
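    # Unlike the decomposers above, this snippet leaves concatenate at its
    # default for the embedding transformer.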
    output_datanode = scaler.operate(datanode)
    print(output_datanode)
    print(output_datanode.data)
def evaluate_transformation_graph():
    data = (np.array([[np.nan, 2, 1], [1, 2, 2], [3, 4, 2],
                      [5, np.nan, 1]]), np.array([1, 2, 3, 4]))
    feature_type = [NUMERICAL, NUMERICAL, CATEGORICAL]
    datanode = DataNode(data, feature_type)

    graph = TransformationGraph()
    graph.add_node(datanode)

    transformer = ImputationTransformation()
    output_datanode1 = transformer.operate(datanode, target_fields=[0, 1])
    graph.add_node(output_datanode1)
    graph.add_edge(datanode.get_node_id(), output_datanode1.get_node_id(), transformer)

    transformer = OneHotTransformation()
    output_datanode2 = transformer.operate(output_datanode1)
    graph.add_node(output_datanode2)
    graph.add_edge(output_datanode1.get_node_id(),
                   output_datanode2.get_node_id(), transformer)

    transformer = ScaleTransformation()
    transformer.concatenate = True
    output_datanode3 = transformer.operate(output_datanode2)
    graph.add_node(output_datanode3)
    graph.add_edge(output_datanode2.get_node_id(),
                   output_datanode3.get_node_id(), transformer)

    print(output_datanode3)
    print(output_datanode3.data)

    transformer = ScaleTransformation()
    transformer.concatenate = False
    output_datanode4 = transformer.operate(output_datanode2)
    graph.add_node(output_datanode4)
    graph.add_edge(output_datanode2.get_node_id(),
                   output_datanode4.get_node_id(), transformer)

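    # Merge the two scaled variants (concatenated and stand-alone) produced
    # from output_datanode2 into a single node.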
    transformer = Merger()
    output_datanode5 = transformer.operate(
        [output_datanode3, output_datanode4])
    graph.add_node(output_datanode5)
    graph.add_transformation(
        [output_datanode3.get_node_id(), output_datanode4.get_node_id()],
        output_datanode5.get_node_id(), transformer)

    print(output_datanode5)
    print(output_datanode5.data)

    order_ids = graph.topological_sort()
    print(order_ids)
    test_data = (np.array([[np.nan, 2, 1], [1, 2, 1], [3, 2, 1],
                           [3, np.nan, 1]]), None)
    test_node = DataNode(test_data, feature_type)

    inputnode = graph.get_node(order_ids[0])
    inputnode.set_values(test_node)

    for idx in range(1, len(order_ids)):
        node_id = order_ids[idx]

        input_node_list = list()
        for input_id in graph.input_data_dict[node_id]:
            inputnode = graph.get_node(input_id)
            input_node_list.append(inputnode)
        inputnode = input_node_list[0] if len(input_node_list) == 1 else input_node_list

        edge = graph.get_edge(graph.input_edge_dict[node_id])
        outputnode = edge.transformer.operate(inputnode, edge.target_fields)
        graph.get_node(node_id).set_values(outputnode)
    output_node = graph.get_node(order_ids[-1])
    print(output_node)
    print(output_node.data)
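# Replay note: topological_sort fixes an execution order over the recorded
# graph; each node gathers its inputs via input_data_dict, and the stored edge
# transformers are re-applied to the test data (presumably reusing the state
# fitted on the training data).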
Example #9
import numpy as np
import os
import sys

sys.path.append(os.getcwd())

from mindware.components.feature_engineering.transformations.preprocessor.text2vector import \
    Text2VectorTransformation
from mindware.components.feature_engineering.transformation_graph import DataNode
from mindware.components.utils.constants import *
from mindware.estimators import Classifier

x = np.array([[1, 'I am good', 'I am right', 3],
              [2, 'He is good', 'He is ok', 4],
              [2.5, 'Everyone is good', 'Everyone is ok', 7],
              [1.3333, 'well', 'what', 5]])
y = np.array([0, 1, 0, 1])

t2v = Text2VectorTransformation()
data = (x, y)
feature_type = [NUMERICAL, TEXT, TEXT, DISCRETE]
datanode = DataNode(data, feature_type)

clf = Classifier(time_limit=20,
                 enable_meta_algorithm_selection=False,
                 include_algorithms=['random_forest'])

clf.fit(datanode, opt_strategy='combined')
print(clf.predict(datanode))
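# t2v above is constructed but never applied in this snippet; presumably it
# could also be used directly through the operate() API shared by the other
# transformers in this listing:
vec_node = t2v.operate(datanode)
print(vec_node.data)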
Example #10
    def __init__(self,
                 node_list,
                 node_index,
                 task_type,
                 timestamp,
                 fe_config_space: ConfigurationSpace,
                 cash_config_space: ConfigurationSpace,
                 data: DataNode,
                 fixed_config=None,
                 time_limit=None,
                 trial_num=0,
                 metric='acc',
                 optimizer='smac',
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 resampling_params=None,
                 n_jobs=1,
                 seed=1):
        super(AlternatingBlock,
              self).__init__(node_list,
                             node_index,
                             task_type,
                             timestamp,
                             fe_config_space,
                             cash_config_space,
                             data,
                             fixed_config=fixed_config,
                             time_limit=time_limit,
                             trial_num=trial_num,
                             metric=metric,
                             optimizer=optimizer,
                             ensemble_method=ensemble_method,
                             ensemble_size=ensemble_size,
                             per_run_time_limit=per_run_time_limit,
                             output_dir=output_dir,
                             dataset_name=dataset_name,
                             eval_type=eval_type,
                             resampling_params=resampling_params,
                             n_jobs=n_jobs,
                             seed=seed)

        self.arms = ['hpo', 'fe']
        self.optimal_algo_id = None
        self.first_start = True
        self.sub_bandits = dict()
        self.rewards = dict()
        self.evaluation_cost = dict()
        self.update_flag = dict()

        # Global incumbent.
        self.init_config = {
            'fe': fe_config_space.get_default_configuration().get_dictionary().copy(),
            'hpo': cash_config_space.get_default_configuration().get_dictionary().copy()
        }
        self.inc = {
            'fe': fe_config_space.get_default_configuration().get_dictionary().copy(),
            'hpo': cash_config_space.get_default_configuration().get_dictionary().copy()
        }
        self.local_inc = {
            'fe': fe_config_space.get_default_configuration().get_dictionary().copy(),
            'hpo': cash_config_space.get_default_configuration().get_dictionary().copy()
        }
        self.local_hist = {'fe': [], 'hpo': []}
        self.inc_record = {'fe': list(), 'hpo': list()}
        self.exp_output = dict()
        self.eval_dict = dict()
        self.arm_eval_dict = {'fe': dict(), 'hpo': dict()}
        for arm in self.arms:
            self.rewards[arm] = list()
            self.update_flag[arm] = False
            self.evaluation_cost[arm] = list()
            self.exp_output[arm] = dict()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()

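        # Build one child block per arm: the 'fe' arm searches the feature
        # engineering space with the hyperparameters pinned to the defaults,
        # and the 'hpo' arm searches the CASH space with the feature
        # engineering configuration pinned.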
        for arm in self.arms:
            if arm == 'fe':
                from mindware.blocks.block_utils import get_node_type
                child_type = get_node_type(node_list, node_index + 1)
                self.sub_bandits[arm] = child_type(
                    node_list,
                    node_index + 1,
                    task_type,
                    timestamp,
                    fe_config_space,
                    None,
                    data.copy_(),
                    fixed_config=self.init_config['hpo'],
                    time_limit=time_limit,
                    metric=metric,
                    optimizer=optimizer,
                    ensemble_method=ensemble_method,
                    ensemble_size=ensemble_size,
                    per_run_time_limit=per_run_time_limit,
                    output_dir=output_dir,
                    dataset_name=dataset_name,
                    eval_type=eval_type,
                    resampling_params=resampling_params,
                    n_jobs=n_jobs,
                    seed=seed)
            else:
                from mindware.blocks.block_utils import get_node_type
                child_type = get_node_type(node_list, node_index + 2)
                self.sub_bandits[arm] = child_type(
                    node_list,
                    node_index + 2,
                    task_type,
                    timestamp,
                    None,
                    cash_config_space,
                    data.copy_(),
                    fixed_config=self.init_config['fe'],
                    time_limit=time_limit,
                    metric=metric,
                    ensemble_method=ensemble_method,
                    ensemble_size=ensemble_size,
                    per_run_time_limit=per_run_time_limit,
                    output_dir=output_dir,
                    dataset_name=dataset_name,
                    eval_type=eval_type,
                    resampling_params=resampling_params,
                    n_jobs=n_jobs,
                    seed=seed)

        self.topk_saver = CombinedTopKModelSaver(k=50,
                                                 model_dir=self.output_dir,
                                                 identifier=self.timestamp)
Example #11
def parse_config(data_node: DataNode,
                 config: dict,
                 record=False,
                 skip_balance=False,
                 if_imbal=False):
    """
        Transform the data node based on the pipeline specified by configuration.
    :param data_node:
    :param config:
    :param record:
    :return: the resulting data node.
    """
    _preprocessor_candidates = get_combined_fe_candidtates(
        _preprocessor, _gen_addons)
    _preprocessor_candidates = get_combined_fe_candidtates(
        _preprocessor_candidates, _sel_addons)
    _rescaler_candidates = get_combined_fe_candidtates(_rescaler, _res_addons)

    if not if_imbal:
        _balancer_candidates = get_combined_fe_candidtates(
            _bal_balancer, _bal_addons)
    else:
        _balancer_candidates = get_combined_fe_candidtates(
            _imb_balancer, _bal_addons)

    # Remove the indicator in config_dict.
    config_dict = config.copy()

    image_pre_id = config_dict.get('image_preprocessor', None)
    if image_pre_id:
        config_dict.pop('image_preprocessor')
    text_pre_id = config_dict.get('text_preprocessor', None)
    if text_pre_id:
        config_dict.pop('text_preprocessor')

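    # Gather the hyperparameters addressed to transformer `id` (config keys
    # follow a '<transformer_id>:<param>' convention, matched here by
    # substring), instantiate it from tran_set, and apply it to the node.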
    def tran_operate(id, tran_set, config, node):
        _config = {}
        for key in config:
            if id in key:
                config_name = key.split(':')[1]
                _config[config_name] = config[key]
        tran = tran_set[id](**_config)
        output_node = tran.operate(node)
        return output_node, tran

    _node = data_node.copy_()
    tran_dict = dict()

    # Image preprocessor
    if image_pre_id:
        _node, image_tran = tran_operate(image_pre_id, _image_preprocessor,
                                         config_dict, _node)
        tran_dict['image_preprocessor'] = image_tran

    # Text preprocessor
    if text_pre_id:
        _node, text_tran = tran_operate(text_pre_id, _text_preprocessor,
                                        config_dict, _node)
        tran_dict['text_preprocessor'] = text_tran

    for stage in stage_list:
        if stage == 'balancer':
            if skip_balance:
                op_id = 'empty'
            else:
                if stage in config_dict:
                    op_id = config_dict[stage]
                    config_dict.pop(stage)
                else:
                    op_id = 'empty'
        else:
            op_id = config_dict[stage]
            config_dict.pop(stage)
        if stage == 'preprocessor':
            _node, tran = tran_operate(op_id, _preprocessor_candidates,
                                       config_dict, _node)
        elif stage == 'rescaler':
            _node, tran = tran_operate(op_id, _rescaler_candidates,
                                       config_dict, _node)
        elif stage == 'balancer':
            _node, tran = tran_operate(op_id, _balancer_candidates,
                                       config_dict, _node)
        else:
            # Third party stage
            _node, tran = tran_operate(op_id,
                                       thirdparty_candidates_dict[stage],
                                       config_dict, _node)

        tran_dict[stage] = tran

    _node.config = config
    if record:
        return _node, tran_dict
    return _node
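# Illustration of the key convention tran_operate relies on; 'smote' and
# 'k_neighbors' are illustrative names, not taken from mindware:
example_config = {'balancer': 'smote', 'smote:k_neighbors': 5}
params = {k.split(':')[1]: v for k, v in example_config.items() if 'smote' in k}
assert params == {'k_neighbors': 5}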