Ejemplo n.º 1
0
def evaluate(candidate: CandidateFeature, classifier, grid_search_parameters, preprocessed_folds, score, train_data, current_target, train_X_all, train_y_all_target, test_data, test_target):
    """Grid-search a classifier on top of one candidate's pipeline and record the results.

    The candidate's pipeline is wrapped in a single-entry ``FeatureUnion`` and
    chained with a fresh ``classifier()`` instance. The best cross-validation
    score and the best parameters are written to
    ``candidate.runtime_properties`` under ``'score'`` and
    ``'hyperparameters'``. ``'test_score'`` is the score on ``test_data`` when
    the config flag ``score.test`` is ``'True'`` and ``0.0`` otherwise.

    Returns:
        The same ``candidate`` object, mutated in place.
    """

    def flag(key):
        # Config flags are stored as the strings 'True' / 'False'.
        return Config.get_default(key, 'False') == 'True'

    def make_search(refit):
        return GridSearchCV(pipeline, grid_search_parameters,
                            cv=preprocessed_folds, scoring=score, iid=False,
                            error_score='raise', refit=refit)

    feature_step = FeatureUnion([(candidate.get_name(), candidate.pipeline)])
    pipeline = Pipeline([('features', feature_step),
                         ('classifier', classifier())])

    # Refit on the search data only when that refitted model is the one that
    # gets scored on the test set, i.e. test scoring without instance selection.
    refit_on_search_data = flag('score.test') and not flag('instance.selection')

    search = make_search(refit_on_search_data)
    search.fit(train_data, current_target)
    candidate.runtime_properties['score'] = search.best_score_
    candidate.runtime_properties['hyperparameters'] = search.best_params_

    if not flag('score.test'):
        candidate.runtime_properties['test_score'] = 0.0
        return candidate

    if flag('instance.selection'):
        # With instance selection the search ran on the reduced sample, so the
        # model used for test scoring is searched and refitted on all training data.
        search = make_search(True)
        search.fit(train_X_all, train_y_all_target)

    candidate.runtime_properties['test_score'] = search.score(test_data, test_target)
    return candidate
Ejemplo n.º 2
0
    def materialize_raw_features(self, candidate):
        """Fit a raw-feature candidate and cache its materialized values.

        The candidate is fitted on and applied to the training split; the
        result is then sliced once per cross-validation fold. The per-fold
        slices are stored in ``candidate.runtime_properties`` under
        ``'train_transformed'`` and ``'test_transformed'``. When the config
        flag ``score.test`` is ``'True'``, ``'training_all'`` and
        ``'one_test_set_transformed'`` are stored as well.
        """
        candidate.fit(self.dataset.splitted_values['train'])
        raw_feature = candidate.transform(self.dataset.splitted_values['train'])

        # Each fold is a (train_indices, test_indices) pair.
        train_transformed = [raw_feature[fold[0]] for fold in self.preprocessed_folds]
        test_transformed = [raw_feature[fold[1]] for fold in self.preprocessed_folds]

        if Config.get_default('score.test', 'False') == 'True':
            if Config.get_default('instance.selection', 'False') == 'True':
                # The training split was subsampled; refit on the full
                # training data before transforming it and the test set.
                candidate.fit(self.train_X_all)
                training_all = candidate.transform(self.train_X_all)
            else:
                training_all = raw_feature
            one_test_set_transformed = candidate.transform(
                self.dataset.splitted_values['test'])

            candidate.runtime_properties['training_all'] = training_all
            candidate.runtime_properties['one_test_set_transformed'] = one_test_set_transformed

        candidate.runtime_properties['train_transformed'] = train_transformed
        candidate.runtime_properties['test_transformed'] = test_transformed
Ejemplo n.º 3
0
 def __init__(
         self,
         dataset_config,
         classifier=XGBClassifier,
         grid_search_parameters=None,
         transformation_producer=get_transformation_for_feature_space,
         score=make_scorer(f1_score, average='micro'),
         reader=None,
         folds=10,
         max_complexity=3):
     """Store the configuration of the feature-construction run.

     Args:
         dataset_config: Dataset description; stored unchanged.
         classifier: Estimator class (not an instance); stored unchanged.
         grid_search_parameters: Hyperparameter grid. ``None`` (the default)
             selects the built-in XGBoost grid below. A fresh dict is built
             for every instance, so one instance mutating its grid can no
             longer affect the others.
         transformation_producer: Callable producing the transformations;
             stored unchanged.
         score: Scorer object; stored unchanged.
         reader: Optional pre-built reader; stored unchanged.
         folds: Number of cross-validation folds.
         max_complexity: Stored unchanged as ``self.max_complexity``.
     """
     if grid_search_parameters is None:
         # Built per call instead of once at definition time: a dict literal
         # used as a default argument is shared by every call. As a side
         # effect the ``parallelism`` setting is now read when the object is
         # constructed rather than when the module is imported.
         grid_search_parameters = {
             'classifier__min_child_weight': [1, 5, 10],
             'classifier__gamma': [0.5, 1, 1.5, 2, 5],
             'classifier__subsample': [0.6, 0.8, 1.0],
             'classifier__colsample_bytree': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
             'classifier__max_depth': [3, 4, 5],
             'classifier__learning_rate': [0.02],
             'classifier__n_estimators': [600],
             'classifier__objective': ['binary:logistic'],
             'classifier__silent': [True],
             'classifier__n_jobs':
             [int(Config.get_default("parallelism", mp.cpu_count()))]
         }

     self.dataset_config = dataset_config
     self.classifier = classifier
     self.grid_search_parameters = grid_search_parameters
     self.transformation_producer = transformation_producer
     self.score = score
     self.reader = reader
     self.folds = folds
     self.max_complexity = max_complexity
Ejemplo n.º 4
0
    def create_pipeline(self):
        """Build the pipeline that computes this candidate from its parents.

        With several parents, deep copies of their pipelines are combined in a
        ``FeatureUnion``; with one parent, a deep copy of its pipeline is used
        directly. If the combined parent object is itself a ``Pipeline``, this
        candidate's transformation is appended to it as a further step;
        otherwise a new two-step ``Pipeline`` is created. Step names get a
        ``time.time()`` suffix, except in the new two-step pipeline.

        Returns:
            The resulting ``Pipeline``.
        """
        parents = self.parents
        if len(parents) > 1:
            union_steps = []
            for parent in parents:
                step_name = parent.get_name() + str(time.time())
                union_steps.append((step_name, copy.deepcopy(parent.pipeline)))
            parent_features = FeatureUnion(union_steps)
        else:
            parent_features = copy.deepcopy(parents[0].pipeline)

        caching_enabled = Config.get_default('pipeline.caching', 'False') == 'True'
        memory = "/dev/shm" if caching_enabled else None

        if not isinstance(parent_features, Pipeline):
            return Pipeline(
                [('parents', parent_features),
                 (self.transformation.name, self.transformation)],
                memory=memory)

        # Extend the copied parent pipeline in place; the caching setting is
        # only applied when a new Pipeline is created above.
        own_step_name = self.transformation.name + str(time.time())
        parent_features.steps.append((own_step_name, self.transformation))
        return parent_features
Ejemplo n.º 5
0
    def generate(self, seed=42):
        """Load the dataset and remember the full training data.

        If no reader was supplied, a ``Reader`` is built from
        ``self.dataset_config`` with a 60/0/40 train/valid/test splitter:
        ``Splitter`` for classifiers, ``RandomSplitter`` for regressors and no
        splitter otherwise. The raw features come from ``self.dataset.read()``.

        When the config flag ``instance.selection`` is ``'True'``, deep copies
        of the full training split are kept in ``self.train_X_all`` /
        ``self.train_y_all`` and the dataset's training split is replaced by
        the output of ``sample_data_by_cnn``. Otherwise those two attributes
        reference the training split directly.

        Args:
            seed: Seed forwarded to the splitter.
        """
        if self.reader is None:
            split_kwargs = dict(train_fraction=[0.6, 10000000],
                                valid_fraction=0.0,
                                test_fraction=0.4,
                                seed=seed)
            splitter = None
            if isinstance(self.classifier(), ClassifierMixin):
                splitter = Splitter(**split_kwargs)
            elif isinstance(self.classifier(), RegressorMixin):
                splitter = RandomSplitter(**split_kwargs)

            self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], splitter)
        else:
            self.dataset = self.reader
        self.raw_features = self.dataset.read()

        values = self.dataset.splitted_values
        targets = self.dataset.splitted_target

        print("training:" + str(len(targets['train'])))
        print("test:" + str(len(targets['test'])))

        if Config.get_default('instance.selection', 'False') != 'True':
            self.train_X_all = values['train']
            self.train_y_all = targets['train']
            return

        # Keep untouched copies before the training split is subsampled.
        self.train_X_all = copy.deepcopy(values['train'])
        self.train_y_all = copy.deepcopy(targets['train'])

        values['train'], targets['train'] = sample_data_by_cnn(values['train'], targets['train'])
        print("training:" + str(len(targets['train'])))
Ejemplo n.º 6
0
    def generate_target(self):
        """Prepare the training target, the CV folds and the per-fold targets.

        For a classifier the training target is label-encoded and the folds
        come from ``StratifiedKFold``; for a regressor the target is used
        unchanged and the folds come from ``KFold``. In both cases
        ``self.test_target`` and ``self.train_y_all_target`` are always
        defined: they hold the (encoded) test and full-training targets when
        the config flag ``score.test`` is ``'True'`` and ``None`` otherwise.
        Previously the regressor path left them undefined when test scoring
        was disabled, so any later unconditional read of them raised
        ``AttributeError``.

        Sets ``self.preprocessed_folds`` (list of ``(train, test)`` index
        pairs), ``self.target_train_folds`` and ``self.target_test_folds``.
        If the estimator is neither a classifier nor a regressor, no folds are
        computed here and whatever ``self.preprocessed_folds`` already holds
        is used, as before.
        """
        current_target = self.dataset.splitted_target['train']

        # One instance is enough for both type checks.
        estimator = self.classifier()

        if isinstance(estimator, ClassifierMixin):
            label_encoder = LabelEncoder()
            label_encoder.fit(current_target)
            current_target = label_encoder.transform(current_target)

            self.test_target = None
            self.train_y_all_target = None
            if Config.get_default('score.test', 'False') == 'True':
                self.test_target = label_encoder.transform(
                    self.dataset.splitted_target['test'])
                self.train_y_all_target = label_encoder.transform(
                    self.train_y_all)

            # NOTE(review): shuffle is left at its default, so random_state
            # should have no effect here, and newer scikit-learn releases are
            # documented to reject this combination. Confirm against the
            # pinned version before upgrading.
            splitter = StratifiedKFold(n_splits=self.folds, random_state=42)
        elif isinstance(estimator, RegressorMixin):
            # Mirror the classifier path so both attributes always exist.
            self.test_target = None
            self.train_y_all_target = None
            if Config.get_default('score.test', 'False') == 'True':
                self.test_target = self.dataset.splitted_target['test']
                self.train_y_all_target = self.train_y_all

            splitter = KFold(n_splits=self.folds, random_state=42)
        else:
            splitter = None

        if splitter is not None:
            self.preprocessed_folds = []
            self.preprocessed_folds.extend(
                (train, test) for train, test in splitter.split(
                    self.dataset.splitted_values['train'], current_target))

        self.target_train_folds = [None] * self.folds
        self.target_test_folds = [None] * self.folds

        for fold, (train_idx, test_idx) in enumerate(self.preprocessed_folds):
            self.target_train_folds[fold] = current_target[train_idx]
            self.target_test_folds[fold] = current_target[test_idx]
Ejemplo n.º 7
0
def evaluate_candidates(
        candidates: List[CandidateFeature]) -> List[CandidateFeature]:
    """Evaluate all candidates in a process pool.

    The candidate list is published through
    ``my_globale_module.candidate_list_global`` and the workers receive only
    list indices. The pool size comes from the config key ``parallelism``
    (default: CPU count).

    With the progress display enabled (config key ``show_progess``, on by
    default) results are collected with ``imap_unordered`` and therefore
    arrive in completion order; otherwise ``map`` returns them in input order.

    Returns:
        The list of values returned by ``evaluate_catch``.
    """
    my_globale_module.candidate_list_global = candidates

    worker_count = int(Config.get_default("parallelism", mp.cpu_count()))
    with mp.Pool(processes=worker_count) as pool:
        candidate_ids = list(range(len(candidates)))

        if Config.get_default("show_progess", 'True') != 'True':
            results = pool.map(evaluate_catch, candidate_ids)
        else:
            progress = tqdm.tqdm(
                pool.imap_unordered(evaluate_catch, candidate_ids),
                total=len(candidate_ids))
            results = [outcome for outcome in progress]

    return results
    def create_pipeline(self):
        """Build a one-step pipeline that passes through this feature's column.

        The single step is named ``self.name`` and is a ``ColumnTransformer``
        that applies ``identity`` (via ``FunctionTransformer``) to the column
        ``self.column_id``. Pipeline caching under ``/dev/shm`` is enabled when
        the config flag ``pipeline.caching`` is ``'True'``.

        Returns:
            The resulting ``Pipeline``.
        """
        caching_enabled = Config.get_default('pipeline.caching', 'False') == 'True'
        memory = "/dev/shm" if caching_enabled else None

        passthrough = FunctionTransformer(identity, validate=False)
        column_selector = ColumnTransformer(
            [('identity', passthrough, [self.column_id])])

        return Pipeline([(self.name, column_selector)], memory=memory)
Ejemplo n.º 9
0
def nested_cv_score_parallel(
    candidates: List[CandidateFeature],
    splitted_values_train,
    splitted_target_train,
    n_jobs: int = int(Config.get_default("parallelism", mp.cpu_count()))
) -> List[CandidateFeature]:
    """Run ``run_nested_cross_validation_global`` for every candidate in a pool.

    The candidates and the training data are published through attributes of
    ``nested_my_globale_module``; the workers receive only list indices.

    With the progress display enabled (config key ``show_progess``, on by
    default) results are collected with ``imap_unordered`` and therefore
    arrive in completion order; otherwise ``map`` returns them in input order.

    Args:
        candidates: Candidates to score.
        splitted_values_train: Training values shared with the workers.
        splitted_target_train: Training targets shared with the workers.
        n_jobs: Number of worker processes. The default is computed once,
            when the function is defined.

    Returns:
        The list of values returned by the worker function.
    """
    shared = nested_my_globale_module
    shared.candidate_list_global = candidates
    shared.splitted_values_train = splitted_values_train
    shared.splitted_target_train = splitted_target_train

    with mp.Pool(processes=n_jobs) as pool:
        candidate_ids = list(range(len(candidates)))

        if Config.get_default("show_progess", 'True') != 'True':
            results = pool.map(run_nested_cross_validation_global,
                               candidate_ids)
        else:
            progress = tqdm.tqdm(
                pool.imap_unordered(run_nested_cross_validation_global,
                                    candidate_ids),
                total=len(candidate_ids))
            results = [outcome for outcome in progress]

    return results
Ejemplo n.º 10
0
    def generate_target(self):
        """Label-encode the training target and compute stratified CV folds.

        Sets ``self.current_target`` to the encoded training target and
        ``self.preprocessed_folds`` to a list of ``(train, test)`` index
        pairs. When the config flag ``score.test`` is ``'True'``, the encoded
        test target and the encoded full-training target are stored in
        ``self.test_target`` and ``self.train_y_all_target``.
        """
        raw_target = self.dataset.splitted_target['train']

        label_encoder = LabelEncoder()
        label_encoder.fit(raw_target)
        self.current_target = label_encoder.transform(raw_target)

        if Config.get_default('score.test', 'False') == 'True':
            self.test_target = label_encoder.transform(self.dataset.splitted_target['test'])
            self.train_y_all_target = label_encoder.transform(self.train_y_all)

        self.preprocessed_folds = []
        splitter = StratifiedKFold(n_splits=self.folds, random_state=42)
        self.preprocessed_folds.extend(
            (train, test) for train, test in splitter.split(
                self.dataset.splitted_values['train'], self.current_target))
Ejemplo n.º 11
0
def grid_search(train_transformed, test_transformed, training_all,
                one_test_set_transformed, grid_search_parameters, score,
                classifier, target_train_folds, target_test_folds,
                train_y_all_target, test_target):
    """Exhaustive grid search over pre-materialized cross-validation folds.

    Every combination of the values in ``grid_search_parameters`` is fitted
    and scored on each fold. The combination with the highest mean fold score
    wins; on ties the first one encountered is kept. Scores are computed from
    the scorer's internals as ``_sign * _score_func(y_true, y_pred, **_kwargs)``.

    When the config flag ``score.test`` is ``'True'``, a model with the best
    parameters is refitted on ``training_all`` and scored on the test set.

    Returns:
        Tuple ``(best_mean_cv_score, test_score, best_param, y_pred,
        best_score_list, clf)``. ``test_score`` is ``None`` without test
        scoring; in that case ``y_pred`` and ``clf`` are simply the ones left
        over from the last fold of the last combination evaluated.
    """

    def signed_score(y_true, y_hat):
        return score._sign * score._score_func(y_true, y_hat, **score._kwargs)

    param_names = list(grid_search_parameters.keys())
    value_lists = [grid_search_parameters[name] for name in param_names]

    scores_by_params = {}
    for combination in itertools.product(*value_lists):
        params = hashabledict(zip(param_names, combination))
        fold_scores = []
        scores_by_params[params] = fold_scores
        for fold, fold_train in enumerate(train_transformed):
            # clf and y_pred are deliberately plain locals of the outer
            # function: they are part of the return value below.
            clf = classifier(**params)
            clf.fit(fold_train, target_train_folds[fold])
            y_pred = clf.predict(test_transformed[fold])
            fold_scores.append(signed_score(target_test_folds[fold], y_pred))

    best_param = None
    best_mean_cross_val_score = -float("inf")
    best_score_list = []
    for params, fold_scores in scores_by_params.items():
        mean_score = np.mean(fold_scores)
        if mean_score > best_mean_cross_val_score:
            best_mean_cross_val_score = mean_score
            best_param = params
            best_score_list = copy.deepcopy(fold_scores)

    test_score = None
    if Config.get_default('score.test', 'False') == 'True':
        # Refit on the entire training data and evaluate on the test set.
        clf = classifier(**best_param)
        clf.fit(training_all, train_y_all_target)
        y_pred = clf.predict(one_test_set_transformed)
        test_score = signed_score(test_target, y_pred)

    return best_mean_cross_val_score, test_score, best_param, y_pred, best_score_list, clf
Ejemplo n.º 12
0
def grid(grid_search_parameters, preprocessed_folds, train_data,
         current_target, test_data, test_target, pipeline, classifier, score):
    """Exhaustive grid search that re-runs the feature pipeline on every fold.

    For each combination of the values in ``grid_search_parameters`` and each
    ``(train_id, test_id)`` fold, ``pipeline`` is fitted on the fold's
    training rows, a fresh classifier is trained on the transformed rows and
    scored on the fold's test rows. The combination with the highest mean
    fold score wins; on ties the first one encountered is kept.

    When the config flag ``score.test`` is ``'True'``, the pipeline and a
    model with the best parameters are refitted on all of ``train_data`` and
    scored on ``test_data``.

    Returns:
        Tuple ``(best_mean_cv_score, test_score, best_param, y_pred)``.
        ``test_score`` is ``None`` without test scoring; in that case
        ``y_pred`` is the prediction left over from the last fold evaluated.
    """

    def signed_score(y_true, y_hat):
        return score._sign * score._score_func(y_true, y_hat, **score._kwargs)

    param_names = list(grid_search_parameters.keys())
    value_lists = [grid_search_parameters[name] for name in param_names]

    scores_by_params = {}
    for combination in itertools.product(*value_lists):
        params = hashabledict(zip(param_names, combination))
        fold_scores = []
        scores_by_params[params] = fold_scores
        for train_id, test_id in preprocessed_folds:
            clf = classifier(**params)
            fold_features = pipeline.fit_transform(train_data[train_id],
                                                   current_target[train_id])
            clf.fit(fold_features, current_target[train_id])
            # y_pred stays a plain local: it is part of the return value.
            y_pred = clf.predict(pipeline.transform(train_data[test_id]))
            fold_scores.append(signed_score(current_target[test_id], y_pred))

    best_param = None
    best_mean_cross_val_score = -float("inf")
    for params, fold_scores in scores_by_params.items():
        mean_score = np.mean(fold_scores)
        if mean_score > best_mean_cross_val_score:
            best_mean_cross_val_score = mean_score
            best_param = params

    test_score = None
    if Config.get_default('score.test', 'False') == 'True':
        # Refit on the entire training data and evaluate on the test set.
        clf = classifier(**best_param)
        all_features = pipeline.fit_transform(train_data, current_target)
        clf.fit(all_features, current_target)
        y_pred = clf.predict(pipeline.transform(test_data))
        test_score = signed_score(test_target, y_pred)

    return best_mean_cross_val_score, test_score, best_param, y_pred
    def __init__(self, dataset_config, classifier=LogisticRegression, grid_search_parameters=None,
                 transformation_producer=get_transformation_for_division,
                 epsilon=0.0,
                 c_max=2,
                 folds=10,
                 score=make_scorer(f1_score, average='micro'),
                 max_seconds=None,
                 save_logs=False,
                 reader=None,
                 upload2openml=False,
                 remove_parents=True,
                 n_jobs=None,
                 max_feature_depth=np.inf
                 ):
        """Configure a complexity-driven feature-construction run.

        Args:
            dataset_config: Forwarded to the base-class constructor.
            classifier: Estimator class; forwarded to the base class.
            grid_search_parameters: Hyperparameter grid. ``None`` (the
                default) selects the built-in logistic-regression grid below.
                A fresh dict is built for every instance, so one instance
                mutating its grid can no longer affect the others.
            transformation_producer: Forwarded to the base class.
            epsilon: Stored as ``self.epsilon``.
            c_max: Stored as ``self.c_max``.
            folds: Number of cross-validation folds.
            score: Scorer object.
            max_seconds: Optional time budget in seconds; converted into the
                absolute deadline ``self.max_timestamp`` (``None`` = no limit).
            save_logs: Stored as ``self.save_logs``.
            reader: Optional pre-built reader.
            upload2openml: Stored as ``self.upload2openml``.
            remove_parents: Stored as ``self.remove_parents``.
            n_jobs: Number of worker processes; ``None`` reads the config key
                ``parallelism`` (default: CPU count).
            max_feature_depth: Stored as ``self.max_feature_depth``.
        """
        if grid_search_parameters is None:
            # Built per call instead of once at definition time: a dict
            # literal used as a default argument is shared by every call.
            grid_search_parameters = {
                'penalty': ['l2'],
                'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'solver': ['lbfgs'],
                'class_weight': ['balanced'],
                'max_iter': [10000],
                'multi_class': ['auto']
            }

        super(ComplexityDrivenFeatureConstruction, self).__init__(dataset_config, classifier, grid_search_parameters,
                                                        transformation_producer)
        self.epsilon = epsilon
        self.c_max = c_max
        self.folds = folds
        self.score = score
        self.save_logs = save_logs
        self.reader = reader
        self.upload2openml = upload2openml
        self.remove_parents = remove_parents

        # Absolute wall-clock deadline, or None when there is no time limit.
        self.max_timestamp = None
        if max_seconds is not None:
            self.max_timestamp = time.time() + max_seconds

        if n_jobs is None:
            self.n_jobs = int(Config.get_default("parallelism", mp.cpu_count()))
        else:
            self.n_jobs = n_jobs

        self.max_feature_depth = max_feature_depth
Ejemplo n.º 14
0
def evaluate(candidate_id: int):
    """Materialize, score and filter one candidate feature.

    All inputs other than the index are read from attributes of
    ``my_globale_module`` (candidate list, folds, fold targets, scorer,
    classifier, grid, thresholds).

    Steps:
      1. Abort if the global deadline has passed.
      2. Obtain the per-fold train/test values: raw features reuse the values
         cached in ``runtime_properties``; other candidates stack their
         parents' cached values and apply ``candidate.transformation``.
      3. For non-identity transformations, drop the candidate if it looks
         constant or if its flattened test-fold values are already in
         ``my_globale_module.materialized_set``.
      4. Either skip model fitting (certain subtraction/sum transformations
         combined with a linear model) or run ``grid_search``.
      5. Keep the candidate if it is a one-hot transformation, a raw feature,
         was not evaluated, or improves on its best parent by more than
         epsilon.

    Args:
        candidate_id: Index into ``my_globale_module.candidate_list_global``.

    Returns:
        The candidate with updated ``runtime_properties`` if it is kept,
        otherwise ``None``.

    Raises:
        RuntimeError: If ``my_globale_module.max_timestamp_global`` is set and
            has already been reached.
    """
    #process = psutil.Process(os.getpid())
    #print(str(process.memory_info().rss) + " " + str(sys.getsizeof(my_globale_module.candidate_list_global)))

    candidate: CandidateFeature = my_globale_module.candidate_list_global[
        candidate_id]

    # Enforce the global time budget, if one is set.
    if type(my_globale_module.max_timestamp_global) != type(
            None) and time.time() >= my_globale_module.max_timestamp_global:
        raise RuntimeError('Out of time!')

    # One slot per cross-validation fold.
    train_transformed = [None] * len(
        my_globale_module.preprocessed_folds_global)
    test_transformed = [None] * len(
        my_globale_module.preprocessed_folds_global)

    # Full-training / test-set values; stay None unless they are available.
    training_all = None
    one_test_set_transformed = None

    if isinstance(candidate, RawFeature):
        # Raw features: reuse the values already stored on the candidate.

        if 'training_all' in candidate.runtime_properties:
            training_all = candidate.runtime_properties['training_all']
            one_test_set_transformed = candidate.runtime_properties[
                'one_test_set_transformed']

        train_transformed = candidate.runtime_properties['train_transformed']
        test_transformed = candidate.runtime_properties['test_transformed']

    else:

        #print(self.name_to_train_transformed.keys())

        # Merge the parents' columns fold by fold, then fit the
        # transformation on the training part and apply it to both parts.
        # test_unique_values sums the number of distinct test values per fold.
        test_unique_values = 0
        for fold in range(len(my_globale_module.preprocessed_folds_global)):
            train_transformed_input = np.hstack([
                p.runtime_properties['train_transformed'][fold]
                for p in candidate.parents
            ])
            test_transformed_input = np.hstack([
                p.runtime_properties['test_transformed'][fold]
                for p in candidate.parents
            ])

            candidate.transformation.fit(
                train_transformed_input,
                my_globale_module.target_train_folds_global[fold])
            train_transformed[fold] = candidate.transformation.transform(
                train_transformed_input)
            test_transformed[fold] = candidate.transformation.transform(
                test_transformed_input)
            test_unique_values += len(np.unique(test_transformed[fold]))

        if not isinstance(candidate.transformation, IdentityTransformation):
            # Constant-feature check: the sum of distinct values equals the
            # number of folds, i.e. one distinct value per test fold.
            if test_unique_values == len(
                    my_globale_module.preprocessed_folds_global):
                return None

            # Check whether an equivalent feature was computed before: the
            # flattened test-fold values serve as the lookup key.
            materialized_all = []
            for fold_ii in range(
                    len(my_globale_module.preprocessed_folds_global)):
                materialized_all.extend(test_transformed[fold_ii].flatten())
            materialized = tuple(materialized_all)
            if materialized in my_globale_module.materialized_set:
                return None

        # If the first parent carries full-training values, build them for
        # this candidate too (refitting the transformation on all training data).
        if 'training_all' in list(candidate.parents)[0].runtime_properties:
            training_all_input = np.hstack([
                p.runtime_properties['training_all'] for p in candidate.parents
            ])
            one_test_set_transformed_input = np.hstack([
                p.runtime_properties['one_test_set_transformed']
                for p in candidate.parents
            ])

            candidate.transformation.fit(
                training_all_input,
                my_globale_module.train_y_all_target_global)
            training_all = candidate.transformation.transform(
                training_all_input)
            one_test_set_transformed = candidate.transformation.transform(
                one_test_set_transformed_input)

    evaluated = True
    # Skip model fitting for minus / nansum / subtract transformations when
    # the model is LogisticRegression or LinearRegression. The candidate
    # inherits the best parent score and is marked as passed.
    # NOTE(review): presumably because a linear model gains nothing from a
    # linear combination of its inputs - confirm with the author.
    if  (
            (
                    isinstance(candidate.transformation, MinusTransformation) or
                    (isinstance(candidate.transformation, HigherOrderCommutativeTransformation) and candidate.transformation.method == np.nansum) or
                    (isinstance(candidate.transformation, NonCommutativeBinaryTransformation) and candidate.transformation.method == np.subtract)
            ) \
                and \
            (
              my_globale_module.classifier_global == LogisticRegression or
              my_globale_module.classifier_global == LinearRegression
            )
        ):
        candidate.runtime_properties['score'] = np.max(
            [p.runtime_properties['score'] for p in candidate.parents])
        candidate.runtime_properties['test_score'] = -1.0
        candidate.runtime_properties['hyperparameters'] = None
        # NOTE(review): y_pred is not read again in this function.
        y_pred = None
        evaluated = False
        candidate.runtime_properties['passed'] = True
    else:
        # Run the grid search; its eight return values are unpacked straight
        # into runtime_properties, except the fold predictions and the model.
        candidate.runtime_properties['passed'] = False
        candidate.runtime_properties['score'], candidate.runtime_properties[
            'test_score'], candidate.runtime_properties[
                'hyperparameters'], test_fold_predictions, candidate.runtime_properties[
                    'fold_scores'], my_clf, candidate.runtime_properties[
                        'mean_scores'], candidate.runtime_properties[
                            'additional_metrics'] = grid_search(
                                candidate, train_transformed, test_transformed,
                                training_all, one_test_set_transformed,
                                my_globale_module.
                                grid_search_parameters_global,
                                my_globale_module.score_global,
                                my_globale_module.classifier_global,
                                my_globale_module.target_train_folds_global,
                                my_globale_module.target_test_folds_global,
                                my_globale_module.train_y_all_target_global,
                                my_globale_module.test_target_global)
        '''
        candidate.runtime_properties['score'], candidate.runtime_properties['test_score'], candidate.runtime_properties[
            'hyperparameters'], test_fold_predictions, candidate.runtime_properties['fold_scores'], my_clf, \
        candidate.runtime_properties['mean_scores'], candidate.runtime_properties['additional_metrics'] = hyperopt_search(
            train_transformed, test_transformed, training_all, one_test_set_transformed,
            my_globale_module.grid_search_parameters_global, my_globale_module.score_global,
            my_globale_module.classifier_global, my_globale_module.target_train_folds_global,
            my_globale_module.target_test_folds_global, my_globale_module.train_y_all_target_global,
            my_globale_module.test_target_global)
        '''

        #if True:
        #    candidate.runtime_properties['coef_'] = my_clf.coef_

        # Optionally keep the per-fold predictions on the candidate.
        if Config.get_default('store.predictions', 'False') == 'True':
            candidate.runtime_properties[
                'test_fold_predictions'] = test_fold_predictions
            '''
            ## check whether prediction already exists
            materialized_all = []
            for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                materialized_all.extend(candidate.runtime_properties['test_fold_predictions'][fold_ii].flatten())
            materialized = tuple(materialized_all)
            if materialized in my_globale_module.predictions_set:
                return None
            '''

    # Acceptance test. A candidate is kept if it is a one-hot transformation,
    # a raw feature, was not evaluated above, or if its score gain over the
    # best parent (divided by complexity_delta_global and multiplied by the
    # scorer's sign) exceeds epsilon_global. The gain is only computed when
    # none of the earlier conditions holds.
    if isinstance(
            candidate.transformation, OneHotTransformation
    ) or isinstance(candidate, RawFeature) or not evaluated or (
        (candidate.runtime_properties['score'] -
         np.max([p.runtime_properties['score'] for p in candidate.parents])) /
            my_globale_module.complexity_delta_global
    ) * my_globale_module.score_global._sign > my_globale_module.epsilon_global:
        candidate.runtime_properties['passed'] = True
        if not isinstance(candidate, RawFeature):
            # Cache the materialized values so children can build on them.
            candidate.runtime_properties[
                'train_transformed'] = train_transformed
            candidate.runtime_properties['test_transformed'] = test_transformed

            if Config.get_default(
                    'score.test',
                    'False') == 'True' and type(training_all) != type(None):
                candidate.runtime_properties['training_all'] = training_all
                candidate.runtime_properties[
                    'one_test_set_transformed'] = one_test_set_transformed

            # Derive feature properties from the first training fold.
            # NOTE(review): this isinstance check repeats the enclosing one.
            if not isinstance(candidate, RawFeature):
                candidate.derive_properties(
                    candidate.runtime_properties['train_transformed'][0])

                # Reject features with missing values when the model is
                # LogisticRegression (avoid NaN for specific ML models).
                if candidate.properties[
                        'missing_values'] and my_globale_module.classifier_global == LogisticRegression:
                    return None

        # Call the name / complexity / sympy accessors before the parents are
        # dropped below.
        # NOTE(review): presumably these cache values that need the parents - confirm.
        candidate.get_name()
        candidate.get_complexity()
        candidate.get_sympy_representation()
        if my_globale_module.remove_parents:
            candidate.parents = None
        return candidate

    # The candidate did not pass the acceptance test.
    return None
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        if self.save_logs:
            cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        self.complexity_delta = 1.0

        unique_raw_combinations = False


        baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        all_evaluated_features = set()

        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
        my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)

        my_globale_module.materialized_set = set()
        my_globale_module.predictions_set = set()

        number_of_multiple_cvs = 10
        nested_my_globale_module.splitting_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs)
        nested_my_globale_module.model_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs)

        #pickle.dump(my_globale_module.target_test_folds_global, open('/tmp/test_groundtruth.p', 'wb+'))


        c = 1
        while(True):
            current_layer: List[CandidateFeature] = []

            if c <= self.max_feature_depth:
                #0th
                if c == 1:
                    cost_2_raw_features[c]: List[CandidateFeature] = []
                    #print(self.raw_features)
                    for raw_f in self.raw_features:
                        sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                        raw_f.sympy_representation = sympy_representation
                        all_evaluated_features.add(sympy_representation)
                        if raw_f.is_numeric():
                            if raw_f.properties['missing_values']:
                                raw_f.runtime_properties['score'] = 0.0
                                cost_2_raw_features[c].append(raw_f)
                            else:
                                current_layer.append(raw_f)
                            #print("numeric: " + str(raw_f))
                        else:
                            raw_f.runtime_properties['score'] = 0.0
                            cost_2_raw_features[c].append(raw_f)
                            #print("nonnumeric: " + str(raw_f))

                        self.materialize_raw_features(raw_f)
                        #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

                # first unary
                # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?)
                unary_candidates_to_be_applied: List[CandidateFeature] = []
                if (c - 1) in cost_2_raw_features:
                    unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
                if (c - 1) in cost_2_unary_transformed:
                    unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
                if (c - 1) in cost_2_binary_transformed:
                    unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])

                all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
                current_layer.extend(all_unary_features)

                #second binary
                #get length 2 partitions for current cost
                partition = self.get_length_2_partition(c-1)
                #print("bin: c: " + str(c) + " partition" + str(partition))

                #apply cross product from partitions
                binary_candidates_to_be_applied: List[CandidateFeature] = []
                for p in partition:
                    lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                    for element in range(2):
                        if p[element] in cost_2_raw_features:
                            lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                        if p[element] in cost_2_unary_transformed:
                            lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                        if p[element] in cost_2_binary_transformed:
                            lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

                    for bt in binary_transformations:
                        list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                        #print(list_of_combinations)
                        for combo in list_of_combinations:
                            if bt.is_applicable(combo):
                                sympy_representation = bt.get_sympy_representation(
                                    [p.get_sympy_representation() for p in combo])
                                try:
                                    if len(sympy_representation.free_symbols) > 0:  # if expression is not constant
                                        if not sympy_representation in all_evaluated_features:
                                            bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                            bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                            all_evaluated_features.add(sympy_representation)
                                            binary_candidates_to_be_applied.append(bin_candidate)
                                        else:
                                            #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                            pass
                                    else:
                                        #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                        pass
                                except:
                                    pass
                current_layer.extend(binary_candidates_to_be_applied)

            #third: feature combinations
            #first variant: treat combination as a transformation
            #therefore, we can use the same partition as for binary data
            partition = self.get_length_2_partition(c)
            #print("combo c: " + str(c) + " partition" + str(partition))


            def filter_minus(features: List[CandidateFeature]):
                """Return the given features without MinusTransformation-based ones.

                The filter is only evaluated when the globally configured
                classifier is LogisticRegression; for every other classifier an
                empty list is returned.
                """
                # NOTE(review): returning [] instead of `features` for other
                # classifiers looks unintended -- confirm before re-enabling
                # the combination step that calls this helper.
                if my_globale_module.classifier_global != LogisticRegression:
                    return []
                return [
                    feature for feature in features
                    if not isinstance(feature.transformation, MinusTransformation)
                ]

            '''
            combinations_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(filter_minus(cost_2_unary_transformed[p[element]]))
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(filter_minus(cost_2_binary_transformed[p[element]]))
                    if p[element] in cost_2_combination:
                        lists_for_each_element[element].extend(cost_2_combination[p[element]])

                combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1])
            current_layer.extend(combinations_to_be_applied)
            '''



            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print("From " + str(length) + " combinations, we filter " +  str(length - len(current_layer)) + " nonunique raw feature combinations.")



            #now evaluate all from this layer
            #print(current_layer)

            print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
            results = evaluate_candidates_parallel(current_layer, self.n_jobs)
            print("----------- Evaluation Finished -----------")





            ##nested cv
            '''
            new_results_with_nested = []
            for r_result in results:
                if type(r_result) != type(None):
                    new_results_with_nested.append(r_result)
            #results = nested_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
            results = multiple_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
            for r_result in results:
                #print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(r_result.runtime_properties['test_score']) + ' nested: ' + str(r_result.runtime_properties['nested_cv_score']))
                print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(
                    r_result.runtime_properties['test_score']) + ' nested: ' + str(
                    r_result.runtime_properties['multiple_cv_score']))
            '''


            #print(results)

            layer_end_time = time.time() - self.global_starting_time

            #calculate whether we drop the evaluated candidate
            for candidate in results:

                ## check if we computed an equivalent feature before
                if type(candidate) != type(None) and not isinstance(candidate.transformation, IdentityTransformation):
                    materialized_all = []
                    for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                        materialized_all.extend(candidate.runtime_properties['test_transformed'][fold_ii].flatten())
                    materialized = tuple(materialized_all)
                    if materialized in my_globale_module.materialized_set:
                        candidate = None
                    else:
                        my_globale_module.materialized_set.add(materialized)

                '''
                ## check if predictions exist already
                if type(candidate) != type(None) and 'test_fold_predictions' in candidate.runtime_properties:
                    materialized_all = []
                    for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                        materialized_all.extend(candidate.runtime_properties['test_fold_predictions'][fold_ii].flatten())
                    materialized = tuple(materialized_all)
                    if materialized in my_globale_module.predictions_set:
                        candidate = None
                    else:
                        my_globale_module.predictions_set.add(materialized)
                '''



                if type(candidate) != type(None):
                    candidate.runtime_properties['layer_end_time'] = layer_end_time

                    #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))


                    if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                        max_feature = candidate

                    if candidate.runtime_properties['passed']:

                        if isinstance(candidate, RawFeature):
                            if not c in cost_2_raw_features:
                                cost_2_raw_features[c]: List[CandidateFeature] = []
                            cost_2_raw_features[c].append(candidate)
                        elif isinstance(candidate.transformation, UnaryTransformation):
                            if not c in cost_2_unary_transformed:
                                cost_2_unary_transformed[c]: List[CandidateFeature] = []
                            cost_2_unary_transformed[c].append(candidate)
                        elif isinstance(candidate.transformation, IdentityTransformation):
                            if not c in cost_2_combination:
                                cost_2_combination[c]: List[CandidateFeature] = []
                            cost_2_combination[c].append(candidate)
                        else:
                            if not c in cost_2_binary_transformed:
                                cost_2_binary_transformed[c]: List[CandidateFeature] = []
                            cost_2_binary_transformed[c].append(candidate)
                    else:
                        if self.save_logs:
                            if not c in cost_2_dropped_evaluated_candidates:
                                cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                            cost_2_dropped_evaluated_candidates[c].append(candidate)
            


            satisfied_count = 0
            if c in cost_2_raw_features:
                satisfied_count += len(cost_2_raw_features[c])
            if c in cost_2_unary_transformed:
                satisfied_count += len(cost_2_unary_transformed[c])
            if c in cost_2_binary_transformed:
                satisfied_count += len(cost_2_binary_transformed[c])
            if c in cost_2_combination:
                satisfied_count += len(cost_2_combination[c])

            all_count = len(current_layer)
            if c == 1:
                all_count = len(cost_2_raw_features[c])


            print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " did satisfy the epsilon threshold.")


            if len(current_layer) > 0:
                if 'test_score' in max_feature.runtime_properties:
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
                else:
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(
                        max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(
                        max_feature.runtime_properties['score']) + "\n")
                #print("hyper: " + str(max_feature.runtime_properties['hyperparameters']))

                #print(max_feature.runtime_properties['fold_scores'])

            # upload best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')


            if self.save_logs:
                try:
                    pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                except:
                    pickle.dump(cost_2_raw_features, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"),
                                protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_unary_transformed, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_unary.p",
                        "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_binary_transformed, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_binary.p",
                        "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_combination, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_combination.p",
                        "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                    pickle.dump(cost_2_dropped_evaluated_candidates, open(
                        Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p",
                        "wb"), protocol=pickle.HIGHEST_PROTOCOL)


            max_feature_per_complexity[c] = max_feature


            if type(self.c_max) == type(None) and c > 2:
                # calculate harmonic mean
                harmonic_means = [0.0]*3
                for h_i in range(len(harmonic_means)):
                    simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c-h_i].get_complexity(), c,
                                                                       cost_2_raw_features, cost_2_unary_transformed,
                                                                       cost_2_binary_transformed, cost_2_combination)
                    accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c-h_i].runtime_properties['score'], c,
                                                                   cost_2_raw_features, cost_2_unary_transformed,
                                                                   cost_2_binary_transformed, cost_2_combination)

                    harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)
                    #print(str(max_feature_per_complexity[c-h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i))

                if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                    print("Best Harmonic Mean: " + str(max_feature_per_complexity[c-2]))
                    break


            if type(self.max_timestamp) != type(None) and time.time() >= self.max_timestamp:
                break

            c += 1

            if type(self.c_max) != type(None) and self.c_max < c:
                break





        def extend_all(all_representations: List[CandidateFeature], new_llist):
            """Append every element of every list in new_llist to all_representations.

            all_representations is modified in place; nothing is returned.
            """
            all_representations.extend(
                representation
                for representations in new_llist
                for representation in representations
            )

        #get all representation
        all_representations: List[CandidateFeature] = []
        extend_all(all_representations, cost_2_raw_features.values())
        extend_all(all_representations, cost_2_unary_transformed.values())
        extend_all(all_representations, cost_2_binary_transformed.values())
        extend_all(all_representations, cost_2_combination.values())

        self.all_representations = all_representations

        '''

        #find top k based on cv score
        scores = [c.runtime_properties['score'] for c in all_representations]
        sorted_cv_score_ids = np.argsort(np.array(scores)*-1)
        checking_k = 50
        top_k_representations = [all_representations[sorted_id] for sorted_id in sorted_cv_score_ids[0:checking_k]]

        #from top k - select best based on nested cv score
        top_k_representations = multiple_cv_score_parallel(top_k_representations, self.reader.splitted_values['train'],
                                           self.reader.splitted_target['train'])

        scores = [c.runtime_properties['multiple_cv_score'] for c in top_k_representations]

        max_nested_cv_score = -1
        max_nested_rep = None
        for eval_candidate in top_k_representations:
            if eval_candidate.runtime_properties['multiple_cv_score'] > max_nested_cv_score:
                max_nested_cv_score = eval_candidate.runtime_properties['multiple_cv_score']
                max_nested_rep = eval_candidate

        print(max_nested_rep)
        max_feature = max_nested_rep
        '''

        '''
        all_features = list(max_feature_per_complexity.values())
        all_features = multiple_cv_score_parallel(all_features, self.reader.splitted_values['train'], self.reader.splitted_target['train'])

        best_multiple_cv_score = -np.inf
        best_multiple_cv_candidate = None
        for all_f in all_features:
            if all_f.runtime_properties['multiple_cv_score'] > best_multiple_cv_score:
                best_multiple_cv_score = all_f.runtime_properties['multiple_cv_score']
                best_multiple_cv_candidate = all_f

        #find the most simple representation that is within the best representation's std
        complexities = [all_f.get_complexity() for all_f in all_features]
        ids_complex = np.argsort(complexities)
        for all_f_i in range(len(all_features)):
            print(str(all_features[ids_complex[all_f_i]]) + ' mcv: ' + str(all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score']) + ' mcv_std: ' + str(
                all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score_std']))

            if all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score'] > best_multiple_cv_candidate.runtime_properties['multiple_cv_score'] - best_multiple_cv_candidate.runtime_properties['multiple_cv_score_std']:
                max_feature = all_features[ids_complex[all_f_i]]
                break

        print(max_feature)
        '''

        #min AICc selection
        min_aicc = np.inf
        min_aicc_feature = None

        all_aiccs = []
        for rep in list(max_feature_per_complexity.values()):
            all_aiccs.append(np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity']))

        def calculate_AIC_for_classification_paper(rss, n, k):
            """Return 2*k + n*ln(rss/n) for the given rss, n and k.

            n is converted to float before it is used.
            """
            sample_count = float(n)
            return 2 * k + sample_count * np.log(rss / sample_count)

        def calculate_AICc_for_classification_paper(rss, n, k):
            """Return the AIC value plus the correction term 2*k*(k+1) / (n-k-1).

            The AIC part is delegated to calculate_AIC_for_classification_paper.
            """
            base_aic = calculate_AIC_for_classification_paper(rss, n, k)
            correction = (2 * k * (k + 1)) / (n - k - 1)
            return base_aic + correction


        def calc_global_aicc(rep):
            """Return the AICc of a representation from its summed metrics.

            The 'rss' and 'n' entries of the representation's
            'additional_metrics' are each summed and passed, together with the
            representation's complexity, to
            calculate_AICc_for_classification_paper.
            """
            metrics = rep.runtime_properties['additional_metrics']
            total_rss = np.sum(metrics['rss'])
            total_n = np.sum(metrics['n'])
            return calculate_AICc_for_classification_paper(total_rss, total_n, rep.get_complexity())

        def is_better(old_aics, new_aics):
            """Tell whether new_aics is element-wise smaller than old_aics in a strict majority of positions.

            The number of positions where the new value is smaller is printed
            before the result is returned.
            """
            improved_count = np.sum(np.array(new_aics) < np.array(old_aics))
            print(improved_count)
            return improved_count > len(new_aics) / 2.0

        for rep in list(max_feature_per_complexity.values()):
            curr = np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity'])
            #print(str(rep) + ': ' + str(curr) + ' AICc min: ' + str(np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' AICc std: ' + str(np.std(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' P: ' + str(np.exp((min(all_aiccs) - curr)/2)) + ' CV AUC: ' + str(rep.runtime_properties['score']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['AICc_complexity']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['rss']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['n']))

            print(str(rep) + 'global_aicc: ' + str(calc_global_aicc(rep)))

            #if type(min_aicc_feature) == type(None) or is_better(min_aicc_feature.runtime_properties['additional_metrics']['AICc_complexity'], rep.runtime_properties['additional_metrics']['AICc_complexity']):
            if type(min_aicc_feature) == type(None) or calc_global_aicc(rep) < calc_global_aicc(min_aicc_feature):
                #min_aicc = np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])
                min_aicc_feature = rep
        max_feature = min_aicc_feature

        print(max_feature)

        return max_feature
Ejemplo n.º 16
0
def evaluate(candidate: CandidateFeature,
             classifier,
             grid_search_parameters,
             preprocessed_folds,
             score,
             train_data,
             current_target,
             train_X_all,
             train_y_all_target,
             test_data,
             test_target,
             cv_jobs=1):
    """Grid-search a classifier on one candidate feature and record the results.

    A pipeline consisting of the candidate's feature pipeline followed by a
    fresh ``classifier()`` instance is tuned with ``GridSearchCV`` on
    ``train_data`` / ``current_target``. The outcome is written into
    ``candidate.runtime_properties``:

    * ``'score'`` -- best cross-validation score of the search.
    * ``'hyperparameters'`` -- best parameter setting of the search.
    * ``'test_fold_predictions'`` -- one boolean array per fold telling which
      predictions on that fold's second index set were correct. Only set when
      the search was refit (see below), because an un-refitted search cannot
      predict.
    * ``'test_score'`` -- score on ``test_data`` / ``test_target`` when the
      config option ``score.test`` is ``'True'`` and ``test_data`` is
      non-empty, otherwise ``0.0``.

    Args:
        candidate: feature whose ``pipeline`` is evaluated.
        classifier: callable returning a new estimator instance.
        grid_search_parameters: parameter grid handed to ``GridSearchCV``.
        preprocessed_folds: sequence of folds used as ``cv``; element ``[1]``
            of each fold indexes ``train_data`` and ``current_target``
            (presumably the fold's validation indices -- confirm with caller).
        score: scoring passed to ``GridSearchCV``.
        train_data: training data for the search.
        current_target: targets matching ``train_data``.
        train_X_all: data used for the second search when the config option
            ``instance.selection`` is ``'True'``.
        train_y_all_target: targets matching ``train_X_all``.
        test_data: held-out data for the test score.
        test_target: targets matching ``test_data``.
        cv_jobs: ``n_jobs`` used for the grid searches.

    Returns:
        The same ``candidate`` object, with ``runtime_properties`` updated.
    """
    pipeline = Pipeline([('features',
                          FeatureUnion([(candidate.get_name(),
                                         candidate.pipeline)])),
                         ('classifier', classifier())])

    score_on_test = Config.get_default('score.test', 'False') == 'True'
    instance_selection = Config.get_default('instance.selection',
                                            'False') == 'True'

    # The best estimator is only refit when it is needed for test scoring and
    # no second search on train_X_all is going to replace it.
    refit = score_on_test and not instance_selection

    print(grid_search_parameters)
    # NOTE(review): `iid` was removed from recent scikit-learn releases;
    # kept as-is for the version this project targets.
    clf = GridSearchCV(pipeline,
                       grid_search_parameters,
                       cv=preprocessed_folds,
                       scoring=score,
                       iid=False,
                       error_score='raise',
                       refit=refit,
                       n_jobs=cv_jobs)
    clf.fit(train_data, current_target)  #dataset.splitted_values['train']
    candidate.runtime_properties['score'] = clf.best_score_
    candidate.runtime_properties['hyperparameters'] = clf.best_params_

    # GridSearchCV.predict is only available after refitting; calling it on a
    # search created with refit=False raises, so skip the per-fold
    # correctness vectors in that case instead of crashing.
    if refit:
        candidate.runtime_properties['test_fold_predictions'] = [
            clf.predict(train_data[fold[1]]) == current_target[fold[1]]
            for fold in preprocessed_folds
        ]

    if score_on_test and len(test_data) > 0:
        if instance_selection:
            # Search again on train_X_all and refit so that the resulting
            # model can be scored on the test data.
            clf = GridSearchCV(pipeline,
                               grid_search_parameters,
                               cv=preprocessed_folds,
                               scoring=score,
                               iid=False,
                               error_score='raise',
                               refit=True,
                               n_jobs=cv_jobs)

            clf.fit(train_X_all, train_y_all_target)
        candidate.runtime_properties['test_score'] = clf.score(
            test_data, test_target)  #self.dataset.splitted_values['test']
    else:
        candidate.runtime_properties['test_score'] = 0.0

    return candidate
Ejemplo n.º 17
0
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer()



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        complexity_delta = 1.0

        epsilon = self.epsilon
        limit_runs = self.c_max + 1  # 5
        unique_raw_combinations = False


        baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -2

        self.name_to_transfomed = {}

        for c in range(1, limit_runs):
            current_layer: List[CandidateFeature] = []

            #0th
            if c == 1:
                cost_2_raw_features[c]: List[CandidateFeature] = []
                for raw_f in self.raw_features:
                    if raw_f.is_numeric():
                        current_layer.append(raw_f)
                    else:
                        raw_f.runtime_properties['score'] = 0.0
                        cost_2_raw_features[c].append(raw_f)

            # first unary
            # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?)
            unary_candidates_to_be_applied: List[CandidateFeature] = []
            if (c - 1) in cost_2_raw_features:
                unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
            if (c - 1) in cost_2_unary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
            if (c - 1) in cost_2_binary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])


            current_layer.extend(self.generate_features(unary_transformations, unary_candidates_to_be_applied))

            #second binary
            #get length 2 partitions for current cost
            partition = self.get_length_2_partition(c-1)
            #print("bin: c: " + str(c) + " partition" + str(partition))

            #apply cross product from partitions
            binary_candidates_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                    for combo in list_of_combinations:
                        if bt.is_applicable(combo):
                            binary_candidates_to_be_applied.append(CandidateFeature(copy.deepcopy(bt), combo))
            current_layer.extend(binary_candidates_to_be_applied)

            #third: feature combinations
            #first variant: treat combination as a transformation
            #therefore, we can use the same partition as for binary data
            partition = self.get_length_2_partition(c)
            #print("combo c: " + str(c) + " partition" + str(partition))

            combinations_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])
                    if p[element] in cost_2_combination:
                        lists_for_each_element[element].extend(cost_2_combination[p[element]])


                list_of_combinations = self.generate_merge_for_combination(lists_for_each_element[0], lists_for_each_element[1])
                for combo in list_of_combinations:
                    if IdentityTransformation(None).is_applicable(list(combo)):
                        combinations_to_be_applied.append(CandidateFeature(IdentityTransformation(None), list(combo)))
            current_layer.extend(combinations_to_be_applied)



            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print("From " + str(length) + " combinations, we filter " +  str(length - len(current_layer)) + " nonunique raw feature combinations.")



            #now evaluate all from this layer
            #print(current_layer)
            print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
            results = self.evaluate_candidates(current_layer)
            print("----------- Evaluation Finished -----------")

            layer_end_time = time.time() - self.global_starting_time

            #calculate whether we drop the evaluated candidate
            for result in results:
                candidate: CandidateFeature = result['candidate']
                candidate.runtime_properties['score'] = result['score']
                candidate.runtime_properties['test_score'] = result['test_score']
                candidate.runtime_properties['execution_time'] = result['execution_time']
                candidate.runtime_properties['global_time'] = result['global_time']
                candidate.runtime_properties['hyperparameters'] = result['hyperparameters']
                candidate.runtime_properties['layer_end_time'] = layer_end_time

                #print(str(candidate) + " -> " + str(candidate.score))

                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate

                #calculate original score
                original_score = baseline_score #or zero??
                if not isinstance(candidate, RawFeature):
                    original_score = max([p.runtime_properties['score'] for p in candidate.parents])

                accuracy_delta = result['score'] - original_score

                if accuracy_delta / complexity_delta > epsilon:
                    if isinstance(candidate, RawFeature):
                        if not c in cost_2_raw_features:
                            cost_2_raw_features[c]: List[CandidateFeature] = []
                        cost_2_raw_features[c].append(candidate)
                    elif isinstance(candidate.transformation, UnaryTransformation):
                        if not c in cost_2_unary_transformed:
                            cost_2_unary_transformed[c]: List[CandidateFeature] = []
                        cost_2_unary_transformed[c].append(candidate)
                    elif isinstance(candidate.transformation, IdentityTransformation):
                        if not c in cost_2_combination:
                            cost_2_combination[c]: List[CandidateFeature] = []
                        cost_2_combination[c].append(candidate)
                    else:
                        if not c in cost_2_binary_transformed:
                            cost_2_binary_transformed[c]: List[CandidateFeature] = []
                        cost_2_binary_transformed[c].append(candidate)
                else:
                    if not c in cost_2_dropped_evaluated_candidates:
                        cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                    cost_2_dropped_evaluated_candidates[c].append(candidate)
            


            if c in cost_2_dropped_evaluated_candidates:
                print("Of " + str(len(current_layer)) + " candidate representations, " + str(len(cost_2_dropped_evaluated_candidates[c])) + " did not satisfy the epsilon threshold.")
            else:
                print("Of " + str(len(current_layer)) + " candidate representations, all satisfied the epsilon threshold.")


            print("Best representation found for complexity = " + str(c) + ": " + str(max_feature) + "\n")

            if self.save_logs:
                pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"))
                pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"))
                pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"))
                pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"))
                pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"))
Ejemplo n.º 18
0
    def nested_grid_search(self, train_transformed, validation_transformed, train_and_validation_transformed, test_transformed, training_all, one_test_set_transformed):
        """Run a nested cross-validated grid search on already transformed folds.

        Every fold serves once as the outer test fold. For that fold, each
        combination of ``self.grid_search_parameters`` is scored by fitting
        ``self.classifier`` on the inner training folds and computing the
        micro-averaged F1 score on the matching validation data. The
        combination with the highest mean score is refit on the outer
        training+validation data and scored on the outer test fold.

        Args:
            train_transformed: inner training data, indexed by the
                ``frozenset`` of inner training fold indices.
            validation_transformed: inner validation data, indexed by the same
                ``frozenset`` keys.
            train_and_validation_transformed: outer training data, indexed by
                the outer test fold index.
            test_transformed: outer test data, indexed by the outer test fold
                index.
            training_all: complete training data; only read when the
                ``score.test`` config flag equals ``'True'``.
            one_test_set_transformed: held-out test data; only read when the
                ``score.test`` config flag equals ``'True'``.

        Returns:
            Tuple ``(nested_cross_val_score, test_score, best_param, y_pred)``:
            the mean outer-fold F1 score, the F1 score on the held-out test
            set, the parameter set selected in the last outer fold, and the
            held-out predictions. ``test_score`` and ``y_pred`` are ``None``
            when ``score.test`` is not ``'True'``.
        """
        q = deque(range(self.folds))

        # The parameter grid does not depend on the fold, so build it once.
        my_keys = list(self.grid_search_parameters.keys())
        value_lists = [self.grid_search_parameters[k] for k in my_keys]

        nested_test_score = []

        for _ in range(self.folds):
            # The remaining folds in q are shared between training and validation.
            test_fold = q.pop()

            hyperparam_to_score_list = {}
            for parameter_combination in itertools.product(*value_lists):
                parameter_set = hashabledict(zip(my_keys, parameter_combination))
                scores = hyperparam_to_score_list[parameter_set] = []

                # q holds folds - 1 entries, so folds - 1 rotations visit every
                # entry once as validation fold (q[0]) and restore the order.
                for _ in range(self.folds - 1):
                    q.rotate(-1)
                    fold_key = frozenset(q[training_i] for training_i in range(1, self.folds - 1))

                    clf = self.classifier(**parameter_set)
                    clf.fit(train_transformed[fold_key], self.store_cv_sets_train_target[fold_key])
                    y_val_pred = clf.predict(validation_transformed[fold_key])
                    scores.append(
                        f1_score(self.store_cv_sets_validation_target[fold_key], y_val_pred, average='micro'))

            # find best parameter (strict '>' keeps the first of equally good sets)
            best_param = None
            best_mean_cross_val_score = -1
            for parameter_config, score_list in hyperparam_to_score_list.items():
                mean_score = np.mean(score_list)
                if mean_score > best_mean_cross_val_score:
                    best_param = parameter_config
                    best_mean_cross_val_score = mean_score

            # refit the winner on the outer training data and score the test fold
            clf = self.classifier(**best_param)
            clf.fit(train_and_validation_transformed[test_fold], self.store_cv_sets_train_and_validation_target[test_fold])
            y_test_pred = clf.predict(test_transformed[test_fold])

            nested_test_score.append(f1_score(self.store_cv_sets_test_target[test_fold], y_test_pred, average='micro'))

            q.appendleft(test_fold)

        nested_cross_val_score = np.mean(nested_test_score)

        # Both stay None when no held-out evaluation is requested; y_pred used
        # to be unbound in that case and the return statement raised.
        test_score = None
        y_pred = None
        if Config.get_default('score.test', 'False') == 'True':
            # refit to entire training and test on test set
            # NOTE: best_param is the parameter set chosen in the last outer fold.
            clf = self.classifier(**best_param)
            clf.fit(training_all, self.train_y_all_target)
            y_pred = clf.predict(one_test_set_transformed)

            test_score = f1_score(self.test_target, y_pred, average='micro')

        return nested_cross_val_score, test_score, best_param, y_pred
Ejemplo n.º 19
0
    def run(self):
        """Search feature representations layer by layer, ordered by complexity.

        For every complexity ``c`` (starting at 1) a layer of candidates is
        assembled from the candidates accepted so far:

        1. at ``c == 1`` the raw features: numeric ones without missing values
           are evaluated, all others are accepted directly with score 0.0,
        2. unary transformations applied to accepted candidates of complexity
           ``c - 1``,
        3. binary transformations applied to pairs drawn from the partitions
           returned by ``self.get_length_2_partition(c - 1)``,
        4. combinations of pairs drawn from the partitions returned by
           ``self.get_length_2_partition(c)``.

        The layer is scored by the module-level ``evaluate_candidates`` and
        every returned candidate whose ``runtime_properties['passed']`` flag is
        set is filed by type under complexity ``c``.

        The loop ends when the harmonic-mean criterion fires (only checked
        while ``self.c_max`` is ``None`` and ``c > 2``), when
        ``self.max_timestamp`` has been reached, or when ``c`` exceeds
        ``self.c_max``. The method has no return value.
        """
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)

        # Accepted candidates, keyed by complexity and separated by type.
        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}
        # Rejected candidates; only filled and dumped when self.save_logs is set.
        cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        # These dicts are only mutated in place, so the tuples stay valid.
        transformable_repositories = (cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed)
        combinable_repositories = transformable_repositories + (cost_2_combination,)

        self.complexity_delta = 1.0

        unique_raw_combinations = False

        # Placeholder that any evaluated candidate beats.
        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        # sympy expressions of everything generated so far, used to skip duplicates
        all_evaluated_features = set()

        # Publish the evaluation settings as module globals.
        # NOTE(review): presumably read by the module-level evaluate_candidates — confirm.
        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
        my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)

        c = 1
        while True:
            current_layer: List[CandidateFeature] = []

            # 0th: raw features
            if c == 1:
                cost_2_raw_features[c] = []
                for raw_f in self.raw_features:
                    sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                    raw_f.sympy_representation = sympy_representation
                    all_evaluated_features.add(sympy_representation)
                    if raw_f.is_numeric() and not raw_f.properties['missing_values']:
                        current_layer.append(raw_f)
                    else:
                        # non-numeric or incomplete features are accepted without evaluation
                        raw_f.runtime_properties['score'] = 0.0
                        cost_2_raw_features[c].append(raw_f)

                    self.materialize_raw_features(raw_f)

            # first unary
            # we apply all unary transformations to all candidates of cost c-1
            # (combinations are excluded)
            unary_candidates_to_be_applied: List[CandidateFeature] = []
            for repository in transformable_repositories:
                unary_candidates_to_be_applied.extend(repository.get(c - 1, []))

            all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
            current_layer.extend(all_unary_features)

            # second binary
            # apply the cross product of the length 2 partitions for the current cost
            binary_candidates_to_be_applied: List[CandidateFeature] = []
            for p in self.get_length_2_partition(c - 1):
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    for repository in transformable_repositories:
                        lists_for_each_element[element].extend(repository.get(p[element], []))

                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                    for combo in list_of_combinations:
                        if not bt.is_applicable(combo):
                            continue
                        sympy_representation = bt.get_sympy_representation(
                            [parent.get_sympy_representation() for parent in combo])
                        try:
                            # skip constant expressions and expressions seen before
                            if len(sympy_representation.free_symbols) > 0 and sympy_representation not in all_evaluated_features:
                                bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                all_evaluated_features.add(sympy_representation)
                                binary_candidates_to_be_applied.append(bin_candidate)
                        except Exception:
                            # Best effort: a candidate that cannot be built is skipped.
                            # KeyboardInterrupt / SystemExit are no longer swallowed.
                            pass
            current_layer.extend(binary_candidates_to_be_applied)

            # third: feature combinations
            # first variant: treat combination as a transformation
            # therefore, we can use the same kind of partition as for binary data
            combinations_to_be_applied: List[CandidateFeature] = []
            for p in self.get_length_2_partition(c):
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    for repository in combinable_repositories:
                        lists_for_each_element[element].extend(repository.get(p[element], []))

                # Accumulate across partitions; assigning here used to discard
                # the combinations of every partition but the last one.
                combinations_to_be_applied.extend(
                    self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1]))
            current_layer.extend(combinations_to_be_applied)

            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print(f"From {length} combinations, we filter {length - len(current_layer)} nonunique raw feature combinations.")

            # now evaluate all from this layer
            print(f"----------- Evaluation of {len(current_layer)} representations -----------")
            results = evaluate_candidates(current_layer)
            print("----------- Evaluation Finished -----------")

            layer_end_time = time.time() - self.global_starting_time

            # file every evaluated candidate as accepted or dropped
            for candidate in results:
                if candidate is None:
                    continue
                candidate.runtime_properties['layer_end_time'] = layer_end_time

                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate

                if candidate.runtime_properties['passed']:
                    if isinstance(candidate, RawFeature):
                        target_repository = cost_2_raw_features
                    elif isinstance(candidate.transformation, UnaryTransformation):
                        target_repository = cost_2_unary_transformed
                    elif isinstance(candidate.transformation, IdentityTransformation):
                        target_repository = cost_2_combination
                    else:
                        target_repository = cost_2_binary_transformed
                elif self.save_logs:
                    target_repository = cost_2_dropped_evaluated_candidates
                else:
                    continue
                target_repository.setdefault(c, []).append(candidate)

            satisfied_count = sum(len(repository.get(c, [])) for repository in combinable_repositories)

            all_count = len(current_layer)
            if c == 1:
                all_count = len(cost_2_raw_features[c])

            print(f"Of {all_count} candidate representations, {satisfied_count} did satisfy the epsilon threshold.")

            if len(current_layer) > 0:
                summary = (f"\nBest representation found for complexity = {c}: {max_feature!s}"
                           f"\nmean cross-validation score: {max_feature.runtime_properties['score']:.2f}")
                if Config.get_default('score.test', 'False') == 'True':
                    summary += f", score on test: {max_feature.runtime_properties['test_score']:.2f}"
                print(summary + "\n")

            # upload best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')

            if self.save_logs:
                tmp_folder = Config.get_default("tmp.folder", "/tmp")
                log_files = (
                    ("/data_raw.p", cost_2_raw_features),
                    ("/data_unary.p", cost_2_unary_transformed),
                    ("/data_binary.p", cost_2_binary_transformed),
                    ("/data_combination.p", cost_2_combination),
                    ("/data_dropped.p", cost_2_dropped_evaluated_candidates),
                )
                for file_name, repository in log_files:
                    # the context manager closes the handle even if pickling fails
                    with open(tmp_folder + file_name, "wb") as log_file:
                        pickle.dump(repository, log_file, protocol=pickle.HIGHEST_PROTOCOL)

            max_feature_per_complexity[c] = max_feature

            if self.c_max is None and c > 2:
                # harmonic mean of simplicity and accuracy for the best
                # representation of this layer (index 0) and the two before it
                harmonic_means = [0.0] * 3
                for h_i in range(len(harmonic_means)):
                    layer_best = max_feature_per_complexity[c - h_i]
                    simplicity_cum_score = self.getSimplicityScore(layer_best.get_complexity(), c,
                                                                   cost_2_raw_features, cost_2_unary_transformed,
                                                                   cost_2_binary_transformed, cost_2_combination)
                    accuracy_cum_score = self.getAccuracyScore(layer_best.runtime_properties['score'], c,
                                                               cost_2_raw_features, cost_2_unary_transformed,
                                                               cost_2_binary_transformed, cost_2_combination)

                    harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)

                # stop once the two most recent layers did not beat the layer c - 2
                if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                    print(f"Best Harmonic Mean: {max_feature_per_complexity[c - 2]!s}")
                    break

            if self.max_timestamp is not None and time.time() >= self.max_timestamp:
                break

            c += 1

            if self.c_max is not None and self.c_max < c:
                break
Ejemplo n.º 20
0
    def evaluate(self, candidate: CandidateFeature, score=make_scorer(f1_score, average='micro')):
        """Materialize a candidate on all folds and score it by nested grid search.

        Raw features reuse the arrays stored in the ``self.name_to_*`` lookups
        under ``str(candidate)``. For any other candidate the parents' arrays
        are stacked column-wise, ``candidate.transformation`` is fitted on the
        training part of each split and then applied to both parts.

        The scores, the selected hyperparameters and (when the
        ``store.predictions`` config flag is ``'True'``) the held-out
        predictions are written to ``candidate.runtime_properties``.

        Args:
            candidate: the feature to evaluate.
            score: not used by this method; kept for interface compatibility.

        Returns:
            Dict with the keys ``train_transformed``,
            ``validation_transformed``, ``train_and_validation_transformed``,
            ``test_transformed``, ``training_all`` and
            ``one_test_set_transformed``. Values are ``None`` unless the
            transformed data is kept: always for raw features, otherwise only
            when the score gain over the best parent, divided by
            ``self.complexity_delta``, exceeds ``self.epsilon``. The two
            held-out entries are additionally ``None`` whenever the
            ``score.test`` config flag is not ``'True'``.

        Raises:
            RuntimeError: if ``self.max_timestamp`` is set and already reached.
        """
        if self.max_timestamp is not None and time.time() >= self.max_timestamp:
            raise RuntimeError('Out of time!')

        is_raw = isinstance(candidate, RawFeature)
        test_on_holdout = Config.get_default('score.test', 'False') == 'True'

        # Every key is present from the start so callers can index the result
        # without guarding; 'training_all' used to be missing in some paths.
        result = {
            'train_transformed': None,
            'validation_transformed': None,
            'train_and_validation_transformed': None,
            'test_transformed': None,
            'training_all': None,
            'one_test_set_transformed': None,
        }

        train_transformed = {}
        validation_transformed = {}
        train_and_validation_transformed = [None] * self.folds
        test_transformed = [None] * self.folds

        # only filled when the held-out test set is scored
        training_all = None
        one_test_set_transformed = None

        if is_raw:
            name = str(candidate)

            if test_on_holdout:
                result['training_all'] = training_all = self.name_to_training_all[name]
                result['one_test_set_transformed'] = one_test_set_transformed = self.name_to_one_test_set_transformed[name]

            result['train_transformed'] = train_transformed = self.name_to_train_transformed[name]
            result['validation_transformed'] = validation_transformed = self.name_to_validation_transformed[name]
            result['train_and_validation_transformed'] = train_and_validation_transformed = self.name_to_train_and_validation_transformed[name]
            result['test_transformed'] = test_transformed = self.name_to_test_transformed[name]

        else:
            parent_names = [str(p) for p in candidate.parents]

            # inner splits: merge columns from parents, fit on train, apply to both
            for key in self.name_to_train_transformed[parent_names[0]]:
                train_transformed_input = np.hstack([self.name_to_train_transformed[name][key] for name in parent_names])
                validation_transformed_input = np.hstack([self.name_to_validation_transformed[name][key] for name in parent_names])

                candidate.transformation.fit(train_transformed_input)

                train_transformed[key] = candidate.transformation.transform(train_transformed_input)
                validation_transformed[key] = candidate.transformation.transform(validation_transformed_input)

            # outer splits: fit on train+validation, apply to the test fold
            for fold_i in range(self.folds):
                train_and_validation_transformed_input = np.hstack([self.name_to_train_and_validation_transformed[name][fold_i] for name in parent_names])
                test_transformed_input = np.hstack([self.name_to_test_transformed[name][fold_i] for name in parent_names])

                candidate.transformation.fit(train_and_validation_transformed_input)

                train_and_validation_transformed[fold_i] = candidate.transformation.transform(train_and_validation_transformed_input)
                test_transformed[fold_i] = candidate.transformation.transform(test_transformed_input)

            if test_on_holdout:
                training_all_input = np.hstack(
                    [self.name_to_training_all[name] for name in parent_names])
                one_test_set_transformed_input = np.hstack(
                    [self.name_to_one_test_set_transformed[name] for name in parent_names])

                candidate.transformation.fit(training_all_input)
                training_all = candidate.transformation.transform(training_all_input)
                one_test_set_transformed = candidate.transformation.transform(one_test_set_transformed_input)

        candidate.runtime_properties['score'], candidate.runtime_properties['test_score'], candidate.runtime_properties['hyperparameters'], y_pred = self.nested_grid_search(train_transformed, validation_transformed, train_and_validation_transformed, test_transformed, training_all, one_test_set_transformed)

        if Config.get_default('store.predictions', 'False') == 'True':
            candidate.runtime_properties['predictions'] = y_pred

        if not is_raw:
            # only save the transformed data if we need it in the future
            max_parent = np.max([p.runtime_properties['score'] for p in candidate.parents])
            accuracy_delta = candidate.runtime_properties['score'] - max_parent
            if accuracy_delta / self.complexity_delta > self.epsilon:
                result['train_transformed'] = train_transformed
                result['validation_transformed'] = validation_transformed
                result['train_and_validation_transformed'] = train_and_validation_transformed
                result['test_transformed'] = test_transformed

                result['training_all'] = training_all
                result['one_test_set_transformed'] = one_test_set_transformed

                # derive properties from the first fold's train+validation data
                candidate.derive_properties(train_and_validation_transformed[0])

        return result