Example #1
def evaluate(candidate: CandidateFeature, classifier, grid_search_parameters, preprocessed_folds, score, train_data, current_target, train_X_all, train_y_all_target, test_data, test_target):
    pipeline = Pipeline([
        ('features', FeatureUnion([
            (candidate.get_name(), candidate.pipeline)
        ])),
        ('classifier', classifier())
    ])

    refit = False
    if (Config.get_default('score.test', 'False') == 'True'
            and Config.get_default('instance.selection', 'False') != 'True'):
        refit = True

    # note: GridSearchCV's iid parameter only exists in older scikit-learn versions
    clf = GridSearchCV(pipeline, grid_search_parameters, cv=preprocessed_folds, scoring=score, iid=False,
                       error_score='raise', refit=refit)
    clf.fit(train_data, current_target) #dataset.splitted_values['train']
    candidate.runtime_properties['score'] = clf.best_score_
    candidate.runtime_properties['hyperparameters'] = clf.best_params_

    if Config.get_default('score.test', 'False') == 'True':
        if Config.get_default('instance.selection', 'False') == 'True':
            clf = GridSearchCV(pipeline, grid_search_parameters, cv=preprocessed_folds, scoring=score,
                               iid=False, error_score='raise', refit=True)

            clf.fit(train_X_all, train_y_all_target)
        candidate.runtime_properties['test_score'] = clf.score(test_data, test_target) #self.dataset.splitted_values['test']
    else:
        candidate.runtime_properties['test_score'] = 0.0

    return candidate
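
The evaluate() helper above wires a FeatureUnion of candidate pipelines into a GridSearchCV. A minimal, self-contained sketch of the same pattern with plain scikit-learn (the transformers and parameter grid are illustrative, and the deprecated iid argument is omitted):

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler

X, y = load_iris(return_X_y=True)
pipeline = Pipeline([
    ('features', FeatureUnion([('scaled', MinMaxScaler()),
                               ('selected', SelectKBest(k=2))])),
    ('classifier', LogisticRegression(max_iter=1000)),
])
param_grid = {'classifier__C': [0.1, 1.0, 10.0]}
clf = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy',
                   error_score='raise', refit=True)
clf.fit(X, y)
print(clf.best_score_, clf.best_params_)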
Example #2
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate(42)
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        level_scores: Dict[int, List[float]] = {}
        level_test_scores: Dict[int, List[float]] = {}

        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/eucalyptus')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/contraceptive')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/diabetes')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/credit')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/heart_new_all')
        #string2candidate = self.load_data_all('/tmp')

        baseline_features: List[CandidateFeature] = []
        for r in self.raw_features:
            if r.is_numeric() and not r.properties['categorical']:
                if not r.properties['missing_values']:
                    baseline_features.append(r)
                else:
                    baseline_features.append(
                        CandidateFeature(ImputationTransformation(), [r]))
            else:
                baseline_features.extend([
                    CandidateFeature(t, [r])
                    for t in OneHotGenerator(self.train_X_all, [r]).produce()
                ])

        #baseline_features.extend(self.get_interesting_features('/home/felix/phd/fastfeatures/results/heart_small', 24))
        #baseline_features.extend(self.get_interesting_features('/home/felix/phd/fastfeatures/results/heart_new_all', 10))
        #baseline_features.extend(self.get_interesting_features(string2candidate, 2))
        '''
        for c in baseline_features:
            if isinstance(c, RawFeature):
                print(str(c) + " complexity: " + str(c.get_complexity()))
            else:
                print('nr: ' + str(c) + " complexity: " + str(c.get_complexity()))
        '''

        # standardize
        scaled_baseline_features = []
        for c in baseline_features:
            scaled_baseline_features.append(
                CandidateFeature(MinMaxScalingTransformation(), [c]))

        #scaled_baseline_features = baseline_features

        combo = CandidateFeature(
            IdentityTransformation(len(baseline_features)),
            scaled_baseline_features)

        results = self.evaluate_candidates_detail([combo], myfolds, 1)

        print(str(results[0].runtime_properties))
Example #3
    def generate_features(
            self, transformations: List[Transformation],
            features: List[CandidateFeature],
            all_evaluated_features: Set) -> List[CandidateFeature]:
        generated_features: List[CandidateFeature] = []
        for t_i in transformations:
            for f_i in t_i.get_combinations(features):
                if t_i.is_applicable(f_i):
                    sympy_representation = t_i.get_sympy_representation(
                        [p.get_sympy_representation() for p in f_i])
                    try:
                        # skip constant expressions
                        if len(sympy_representation.free_symbols) > 0:
                            if sympy_representation not in all_evaluated_features:
                                # do we need a deep copy here?
                                candidate = CandidateFeature(copy.deepcopy(t_i), f_i)
                                candidate.sympy_representation = copy.deepcopy(
                                    sympy_representation)
                                generated_features.append(candidate)
                                all_evaluated_features.add(sympy_representation)
                            else:
                                #print("skipped: " + str(sympy_representation))
                                pass
                    except:
                        pass
        return generated_features
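
The deduplication in generate_features() works because sympy canonicalizes structurally equal expressions, so they hash identically in a set. A small self-contained illustration (symbol names are arbitrary):

import sympy

x, y = sympy.symbols('x y')
seen = set()
for expr in [x + y, y + x, x * y, 1 * x * y]:
    if len(expr.free_symbols) > 0 and expr not in seen:
        seen.add(expr)
print(len(seen))  # 2: 'x + y' and 'y + x' canonicalize to the same expression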
Example #4
    def run(self):

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        self.global_starting_time = time.time()

        for k in range(1, len(self.raw_features) + 1):

            all_f = CandidateFeature(IdentityTransformation(len(self.raw_features)), self.raw_features)

            t = CandidateFeature(SelectKBestTransformer(len(self.raw_features), k), [all_f])

            t.pipeline.fit(self.dataset.splitted_values['train'], self.current_target)
            X = t.transform(self.dataset.splitted_values['train'])
            X_test = t.transform(self.dataset.splitted_values['test'])

            print("time: " + str(time.time() - self.global_starting_time))

            clf = GridSearchCV(self.classifier(), self.grid_search_parameters, cv=self.preprocessed_folds, scoring=self.score, iid=False,
                               error_score='raise')
            clf.fit(X, self.current_target)

            print('test score: ' + str(clf.score(X_test, self.test_target)))
            print("\n\n")
Example #5
    def fit(self, X, y=None):
        fe = ComplexityDrivenFeatureConstruction(
            None,
            reader=ScikitReader(
                X,
                y,
                feature_names=self.feature_names,
                feature_is_categorical=self.feature_is_categorical),
            score=self.scoring,
            c_max=self.c_max,
            folds=self.cv,
            max_seconds=self.max_time_secs,
            classifier=self.model.__class__,
            grid_search_parameters=self.parameter_grid,
            n_jobs=self.n_jobs,
            epsilon=self.epsilon,
            remove_parents=False,
            transformation_producer=self.transformation_producer)

        fe.run()

        numeric_representations = []
        for r in fe.all_representations:
            if 'score' in r.runtime_properties:
                if 'object' not in str(r.properties['type']):
                    if not isinstance(r.transformation,
                                      MinMaxScalingTransformation):
                        #if not (isinstance(r.transformation, HigherOrderCommutativeTransformation) and r.transformation.method == np.nansum):
                        if isinstance(r.sympy_representation, sympy.Mul):
                            # skip products that carry a factor of -1
                            found = any(e == S.NegativeOne
                                        for e in r.sympy_representation.args)
                            if not found:
                                numeric_representations.append(r)
                        else:
                            numeric_representations.append(r)

        self.numeric_features = numeric_representations

        # persist the names of the constructed features
        my_list = []
        for ff in self.numeric_features:
            my_list.append(str(ff))

        with open('/tmp/names.pickle', 'wb') as f:
            pickle.dump(my_list, f, pickle.HIGHEST_PROTOCOL)

        all_features = CandidateFeature(IdentityTransformation(-1),
                                        numeric_representations)

        #all_imputation = CandidateFeature(ImputationTransformation(), [all_features])
        all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                            [all_features])

        self.pipeline_ = all_standardized.pipeline

        self.pipeline_.fit(X, y)
        return self
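
Since fit() stores the fitted pipeline in self.pipeline_, the surrounding estimator follows the usual scikit-learn fit/transform contract. A plausible companion transform method (an assumption; it is not shown in the source) would simply delegate to that pipeline:

    def transform(self, X):
        # assumes fit() has been called and self.pipeline_ exists
        return self.pipeline_.transform(X)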
Example #6
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        #working_features = self.filter_failing_in_parallel()
        #all_f = CandidateFeature(IdentityTransformation(len(working_features)), working_features)

        all_f = CandidateFeature(
            IdentityTransformation(len(self.raw_features)), self.raw_features)

        my_list = []

        for i in range(1, len(self.raw_features) + 1):
            my_list.append(
                CandidateFeature(
                    skrebateTransformer(len(self.raw_features), i), [all_f]))

        #my_list.append(CandidateFeature(SissoTransformer(len(self.raw_features)), [all_f]))

        results = self.evaluate_candidates(my_list)

        print(results)

        for r in range(len(results)):
            print("(" + str(r + 1) + "," + str(results[r]['test_score']) + ")")
Example #7
    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        #working_features = self.filter_failing_in_parallel()
        #all_f = CandidateFeature(IdentityTransformation(len(working_features)), working_features)

        all_f = CandidateFeature(
            IdentityTransformation(len(self.raw_features)), self.raw_features)

        my_list = []

        for i in range(1, len(self.raw_features) + 1):
            my_list.append(
                CandidateFeature(BorutaTransformer(len(self.raw_features), i),
                                 [all_f]))

        #my_list.append(CandidateFeature(SissoTransformer(len(self.raw_features)), [all_f]))

        results = self.evaluate_candidates(my_list)

        print(results)

        for r in range(len(results)):
            print("(" + str(r + 1) + "," + str(results[r]['score']) + ")")

        new_scores = [r['score'] for r in results]
        best_id = np.argmax(new_scores)

        print(results[best_id])
Example #8
    def nested(self, name2feature):
        nested_features = []
        nested_features.append(name2feature["chest"])
        nested_features.append(
            CandidateFeature(IdentityTransformation(2), [
                name2feature["exercise_induced_angina"],
                name2feature["number_of_major_vessels"]
            ]))
        nested_features.append(
            CandidateFeature(IdentityTransformation(3), [
                name2feature["slope"], name2feature["chest"],
                name2feature["number_of_major_vessels"]
            ]))
        nested_features.append(
            CandidateFeature(IdentityTransformation(4), [
                name2feature["slope"], name2feature["chest"],
                name2feature["resting_electrocardiographic_results"],
                name2feature["number_of_major_vessels"]
            ]))

        nested_features.append(
            CandidateFeature(IdentityTransformation(3), [
                name2feature["sex"], name2feature["chest"],
                CandidateFeature(
                    HigherOrderCommutativeTransformation(
                        np.nansum, sympy.Add, 2), [
                            name2feature["slope"],
                            name2feature["number_of_major_vessels"]
                        ])
            ]))
        return nested_features
Example #9
    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        working_features = self.filter_failing_features()

        all_f = CandidateFeature(IdentityTransformation(len(working_features)),
                                 working_features)

        selection = CandidateFeature(
            FeatureSelectionTransformation(
                1, 2,
                LogisticRegression(penalty='l2',
                                   solver='lbfgs',
                                   class_weight='balanced',
                                   max_iter=10000)), [all_f])

        results = self.evaluate_candidates([selection])

        new_scores = [r['score'] for r in results]
        best_id = np.argmax(new_scores)

        print(results[best_id])
Example #10
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate(42)
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        baseline_features: List[CandidateFeature] = []
        for r in self.raw_features:
            if r.is_numeric() and (not 'categorical' in r.properties
                                   or not r.properties['categorical']):
                if not r.properties['missing_values']:
                    baseline_features.append(r)
                else:
                    baseline_features.append(
                        CandidateFeature(ImputationTransformation(), [r]))
            else:
                baseline_features.extend([
                    CandidateFeature(t, [r])
                    for t in OneHotGenerator(self.train_X_all, [r]).produce()
                ])

        #scale everything
        for bf_i in range(len(baseline_features)):
            baseline_features[bf_i] = CandidateFeature(
                StandardScalingTransformation(), [baseline_features[bf_i]])

        print(len(baseline_features))

        combo = CandidateFeature(
            IdentityTransformation(len(baseline_features)), baseline_features)
        '''
        categorical_ids = []
        for r in self.raw_features:
            if 'categorical' in r.properties and r.properties['categorical']:
                categorical_ids.append(r.column_id)

        combo = CandidateFeature(IdentityTransformation(0), self.raw_features)
        if len(categorical_ids) >= 1:
            combo.pipeline = Pipeline(steps=[('imputation', SimpleImputer(strategy='mean')),
                                         ('onehot', OneHotEncoder(categorical_features=categorical_ids)), ('scaling', StandardScaler(with_mean=False))])
        else:
            combo.pipeline = Pipeline(steps=[('imputation', SimpleImputer(strategy='mean')), ('scaling', StandardScaler(with_mean=False))])
        '''

        results = self.evaluate_candidates([combo], myfolds)

        #print(results[0].runtime_properties)

        #candidate2openml(results[0], self.classifier, self.reader.task, 'RawFeatureBaseline')

        return results[0]
Example #11
    def sisso_transfusion_features_new(self, name2feature):
        sisso_features = []
        sisso_features.extend(self.raw_features)

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(
                np.divide), [name2feature['Frequency'], name2feature['Time']]))

        all_f = CandidateFeature(IdentityTransformation(len(sisso_features)),
                                 sisso_features)
        return [all_f]
Example #12
    def generate_features1(self, transformations: List[Transformation],
                           features: List[CandidateFeature]):
        generated_features: List[CandidateFeature] = []
        for t_i in transformations:
            for f_i in t_i.get_combinations(features):
                if t_i.is_applicable(f_i):
                    can = CandidateFeature(copy.deepcopy(t_i), f_i)  # do we need a deep copy here?
                    can.properties['type'] = 'float'
                    generated_features.append(can)
                    # if the output is multidimensional, adapt here
        return generated_features
Example #13
    def generate_merge_for_combination(self, all_evaluated_features, a: List[CandidateFeature], b: List[CandidateFeature]) -> List[CandidateFeature]:
        cat_candidates_to_be_applied = []
        id_t = IdentityTransformation(None)
        for f_a in a:
            for f_b in b:
                combo = [f_a, f_b]
                if id_t.is_applicable(combo):
                    sympy_representation = id_t.get_sympy_representation([p.get_sympy_representation() for p in combo])
                    if sympy_representation not in all_evaluated_features:
                        cat_candidate = CandidateFeature(copy.deepcopy(id_t), combo)
                        cat_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                        #all_evaluated_features.add(sympy_representation)
                        cat_candidates_to_be_applied.append(cat_candidate)

        return cat_candidates_to_be_applied
Example #14
    def nested_transfusion(self, name2feature):

        group_by = CandidateFeature(
            FastGroupByThenTransformation(np.nanmin, groupbythenmin),
            [name2feature["Time"], name2feature["Recency"]])

        my_sum = CandidateFeature(
            HigherOrderCommutativeTransformation(np.nansum, sympy.Add, 2),
            [name2feature["Recency"], group_by])

        return [
            CandidateFeature(
                NonCommutativeBinaryTransformation(np.divide, sympy_divide),
                [my_sum, name2feature["Monetary"]])
        ]
Example #15
    def generate_features(self, transformations: List[Transformation],
                          features: List[CandidateFeature]) -> List[CandidateFeature]:
        generated_features: List[CandidateFeature] = []
        for t_i in transformations:
            for f_i in t_i.get_combinations(features):
                if t_i.is_applicable(f_i):
                    # do we need a deep copy here?
                    generated_features.append(CandidateFeature(copy.deepcopy(t_i), f_i))
                    # if the output is multidimensional, adapt here
        return generated_features
Example #16
    def sisso_transfusion_features_new3(
            self, name2feature) -> List[CandidateFeature]:
        sisso_features = []
        sisso_features.extend(self.raw_features)

        squared_recency = CandidateFeature(
            HigherOrderCommutativeTransformation(np.prod, 2),
            [name2feature['Recency'], name2feature['Recency']])

        squared_monetary = CandidateFeature(
            HigherOrderCommutativeTransformation(np.prod, 2),
            [name2feature['Monetary'], name2feature['Monetary']])

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                             [name2feature['Recency'], name2feature['Time']]))

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                             [name2feature['Monetary'], name2feature['Time']]))

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                             [squared_monetary, name2feature['Time']]))

        sisso_features.append(
            CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                             [squared_recency, name2feature['Time']]))

        all_f = CandidateFeature(IdentityTransformation(len(sisso_features)),
                                 sisso_features)
        return [all_f]
Example #17
    def get_info_gain_of_feature(self, candidate: CandidateFeature):
        try:
            new_candidate = CandidateFeature(IdentityTransformation(2),
                                             [self.base_features, candidate])
            X = new_candidate.pipeline.fit_transform(
                self.dataset.splitted_values['train'], self.train_y_all_target)
            return mutual_info_classif(X, self.train_y_all_target)[-1]
        except:
            return 0.0
Example #18
    def generate_for_transformation(self, t_i):
        result_features = []
        for f_i in t_i.get_combinations(
                list(itertools.chain(*self.current_features))):
            if t_i.is_applicable(f_i):
                current_feature = CandidateFeature(copy.deepcopy(t_i), f_i)
                #print(current_feature)

                result_features.append(current_feature)
        return result_features
Example #19
    def explorekit_heart_features(self, name2feature):
        explore_kit_features = []
        explore_kit_features.extend(self.raw_features)

        # Discretize({Mean(age) GROUP BY Discretize(sex), Discretize(exercise_induced_angina)})
        discr_sex = CandidateFeature(PandasDiscretizerTransformation(10),
                                     [name2feature['sex']])
        discr_angina = CandidateFeature(
            PandasDiscretizerTransformation(10),
            [name2feature['exercise_induced_angina']])
        grouped = CandidateFeature(GroupByThenTransformation(
            np.mean, 3), [name2feature['age'], discr_sex, discr_angina])
        final = CandidateFeature(PandasDiscretizerTransformation(10),
                                 [grouped])

        explore_kit_features.append(final)

        all_f = CandidateFeature(
            IdentityTransformation(len(explore_kit_features)),
            explore_kit_features)
        return [all_f]
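
The ExploreKit feature above, Discretize({Mean(age) GROUP BY Discretize(sex), Discretize(exercise_induced_angina)}), can be illustrated with plain pandas. A toy sketch with made-up data (the original uses 10 bins; fewer are used here so the tiny sample fills them):

import pandas as pd

df = pd.DataFrame({'age': [40, 52, 61, 45, 58, 49],
                   'sex': [0, 1, 0, 1, 1, 0],
                   'exercise_induced_angina': [0, 0, 1, 1, 0, 1]})

# mean(age) grouped by the two (already discrete) columns, then discretized
grouped = df.groupby(['sex', 'exercise_induced_angina'])['age'].transform('mean')
feature = pd.cut(grouped, bins=3, labels=False)
print(feature.tolist())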
Example #20
    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        print([r.name for r in self.dataset.raw_features])

        plain_attributes = CandidateFeature(IdentityTransformation(len(self.dataset.raw_features)), self.dataset.raw_features)

        self.evaluate_candidates([plain_attributes])
Example #21
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate(42)
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        level_scores: Dict[int, List[float]] = {}
        level_test_scores: Dict[int, List[float]] = {}

        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/eucalyptus')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/contraceptive')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/diabetes')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/credit')
        #string2candidate = self.load_data_all('/home/felix/phd/fastfeatures/results/heart_new_all')
        #string2candidate = self.load_data_all('/tmp')

        with open('/tmp/cover_features.p', 'rb') as f:
            features = pickle.load(f)

        #apply minmax scaling
        new_features: List[CandidateFeature] = []
        for f in features:
            new_features.append(
                CandidateFeature(MinMaxScalingTransformation(), [f]))

        results = self.evaluate_candidates([
            CandidateFeature(IdentityTransformation(len(new_features)),
                             new_features)
        ], myfolds)

        print(results[0])
        print(results[0].runtime_properties)

        return results[0]
Example #22
    def run(self):
        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        all_f = CandidateFeature(
            IdentityTransformation(len(self.raw_features)), self.raw_features)

        feature_names = [str(r) for r in self.raw_features]

        t = CandidateFeature(
            SissoTransformer(len(self.raw_features), feature_names,
                             ["^2", "^3", "1/"]), [all_f])

        t.pipeline.fit(self.dataset.splitted_values['train'],
                       self.train_y_all_target)
        X = t.transform(self.dataset.splitted_values['train'])
        X_test = t.transform(self.dataset.splitted_values['test'])

        print("time: " + str(time.time() - self.global_starting_time))

        clf = GridSearchCV(self.classifier(),
                           self.grid_search_parameters,
                           cv=self.preprocessed_folds,
                           scoring=self.score,
                           iid=False,
                           error_score='raise')
        clf.fit(X, self.train_y_all_target)

        print(X_test)

        print('test score: ' + str(clf.score(X_test, self.test_target)))
        print("\n\n")
Example #23
def getSimplicityScore(current: CandidateFeature, complexity):
    count_greater_or_equal_v = 0
    count_all = 0

    for c in range(1, complexity + 1):
        if c >= current.get_number_of_transformations():
            count_greater_or_equal_v += len(cost_2_raw_features[c])
            count_greater_or_equal_v += len(cost_2_unary_transformed[c])
            count_greater_or_equal_v += len(cost_2_binary_transformed[c])
            count_greater_or_equal_v += len(cost_2_combination[c])

        count_all += len(cost_2_raw_features[c])
        count_all += len(cost_2_unary_transformed[c])
        count_all += len(cost_2_binary_transformed[c])
        count_all += len(cost_2_combination[c])

    return count_greater_or_equal_v / float(count_all)
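
getSimplicityScore() reads the cost_2_* repositories as module-level globals and returns the fraction of stored candidates whose complexity is at least that of the current one. A toy, self-contained recomputation of the same ratio (the dictionary contents are made up):

# toy repositories: complexity -> list of candidate names
cost_2_raw_features = {1: ['a', 'b'], 2: []}
cost_2_unary_transformed = {1: [], 2: ['log(a)']}
cost_2_binary_transformed = {1: [], 2: ['a+b']}
cost_2_combination = {1: [], 2: []}

def simplicity(number_of_transformations, complexity):
    repos = [cost_2_raw_features, cost_2_unary_transformed,
             cost_2_binary_transformed, cost_2_combination]
    count_ge = sum(len(repo[c]) for c in range(1, complexity + 1)
                   for repo in repos if c >= number_of_transformations)
    count_all = sum(len(repo[c]) for c in range(1, complexity + 1)
                    for repo in repos)
    return count_ge / float(count_all)

print(simplicity(2, 2))  # 0.5: two of the four stored candidates have complexity >= 2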
Example #24
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        if self.save_logs:
            cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        self.complexity_delta = 1.0

        unique_raw_combinations = False


        baseline_score = 0.0  # self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        all_evaluated_features = set()

        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)




        ############################

        # start

        ############################

        current_layer = []
        c = 1

        cost_2_raw_features[c]: List[CandidateFeature] = []
        # print(self.raw_features)
        for raw_f in self.raw_features:
            sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
            raw_f.sympy_representation = sympy_representation
            all_evaluated_features.add(sympy_representation)
            if raw_f.is_numeric():
                current_layer.append(raw_f)
                # print("numeric: " + str(raw_f))
            else:
                raw_f.runtime_properties['score'] = 0.0
                cost_2_raw_features[c].append(raw_f)
                # print("nonnumeric: " + str(raw_f))

            self.materialize_raw_features(raw_f)
            raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

        # now evaluate all from this layer
        # print(current_layer)
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = evaluate_candidates(current_layer)
        print("----------- Evaluation Finished -----------")

        layer_end_time = time.time() - self.global_starting_time

        # calculate whether we drop the evaluated candidate
        for candidate in results:
            if candidate is not None:
                candidate.runtime_properties['layer_end_time'] = layer_end_time

                # print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))

                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate

                if candidate.runtime_properties['passed']:
                    if isinstance(candidate, RawFeature):
                        if not c in cost_2_raw_features:
                            cost_2_raw_features[c]: List[CandidateFeature] = []
                        cost_2_raw_features[c].append(candidate)
                    elif isinstance(candidate.transformation, UnaryTransformation):
                        if not c in cost_2_unary_transformed:
                            cost_2_unary_transformed[c]: List[CandidateFeature] = []
                        cost_2_unary_transformed[c].append(candidate)
                    elif isinstance(candidate.transformation, IdentityTransformation):
                        if not c in cost_2_combination:
                            cost_2_combination[c]: List[CandidateFeature] = []
                        cost_2_combination[c].append(candidate)
                    else:
                        if not c in cost_2_binary_transformed:
                            cost_2_binary_transformed[c]: List[CandidateFeature] = []
                        cost_2_binary_transformed[c].append(candidate)
                else:
                    if self.save_logs:
                        if not c in cost_2_dropped_evaluated_candidates:
                            cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                        cost_2_dropped_evaluated_candidates[c].append(candidate)

        print(cost_2_raw_features[c])

        #select next representation

        #next_id = np.argmax([rf.runtime_properties['score'] for rf in cost_2_raw_features[1]])
        next_id = np.random.randint(len(cost_2_raw_features[1]))
        next_rep = cost_2_raw_features[c][next_id]

        max_rep = next_rep

        current_lambda = 0

        number_runs = 200

        rep_succession = []

        for runs in range(number_runs):
            rep_succession.append(next_rep)
            #print('next: ' + str(next_rep))

            #######################
            #create branch
            #######################
            current_layer = []
            # first unary
            if not isinstance(next_rep.transformation, IdentityTransformation):
                current_layer.extend(self.generate_features(unary_transformations, [next_rep], all_evaluated_features))

            # second binary
            if not isinstance(next_rep.transformation, IdentityTransformation):
                binary_candidates_to_be_applied = []
                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge([next_rep], cost_2_raw_features[1],
                                                               bt.parent_feature_order_matters,
                                                               bt.parent_feature_repetition_is_allowed)
                    # print(list_of_combinations)
                    for combo in list_of_combinations:
                        if bt.is_applicable(combo):
                            sympy_representation = bt.get_sympy_representation(
                                [p.get_sympy_representation() for p in combo])
                            try:
                                if len(sympy_representation.free_symbols) > 0:  # if expression is not constant
                                    if sympy_representation not in all_evaluated_features:
                                        bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                        bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                        binary_candidates_to_be_applied.append(bin_candidate)
                                    else:
                                        # print("skipped: " + str(sympy_representation))
                                        pass
                                else:
                                    # print("skipped: " + str(sympy_representation))
                                    pass
                            except:
                                pass
                current_layer.extend(binary_candidates_to_be_applied)

            # third: feature combinations
            '''
            combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, [next_rep], cost_2_raw_features[1])
            current_layer.extend(combinations_to_be_applied)
            '''
            #print(current_layer)

            # select next representation; stop if no candidate evaluates successfully
            shuffled_indices = np.arange(len(current_layer))
            np.random.shuffle(shuffled_indices)
            new_rep = None
            for rep_i in range(len(current_layer)):
                new_rep = current_layer[shuffled_indices[rep_i]]
                all_evaluated_features.add(next_rep.sympy_representation)

                new_rep = evaluate_candidates([new_rep])[0]
                if new_rep is not None:
                    break

            if new_rep is None:
                break

            print(str(new_rep) + " cv score: " + str(new_rep.runtime_properties['score']) + " test: " + str(
                new_rep.runtime_properties['test_score']))

            if new_rep.runtime_properties['score'] * self.score._sign > max_rep.runtime_properties['score']:
                max_rep = new_rep
                print("max representation: " + str(max_rep))

            if new_rep.runtime_properties['score'] * self.score._sign <= rep_succession[-1*(current_lambda+1)].runtime_properties['score']:
                current_lambda += 1
            if current_lambda >= self.lambda_threshold:
                next_rep = max_rep
                current_lambda = 0
            else:
                next_rep = new_rep
Example #25
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()

        for raw_f in self.raw_features:
            raw_f.properties['type'] = 'float'

        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        myfolds = copy.deepcopy(list(self.preprocessed_folds))

        R_w = 15000
        max_iterations = 15  #15
        threshold_f = 0.001
        epsilon_w = 0.01
        threshold_w = 0.0

        all_features = self.produce_features()

        print(len(all_features))

        self.base_features = CandidateFeature(
            IdentityTransformation(len(self.raw_features)), self.raw_features)

        results = {}

        for i in range(max_iterations):

            print("base features: " + str(self.base_features))

            results[i] = self.evaluate_candidates([self.base_features],
                                                  myfolds)[0]
            print(results[i])
            print(results[i].runtime_properties)

            feature_scores = self.evaluate_ranking(all_features)
            ids = np.argsort(np.array(feature_scores) * -1)
            print(feature_scores)

            best_improvement_so_far = np.NINF
            best_feature_so_far = None
            evaluated_candidate_features = 0
            for f_i in range(len(feature_scores)):
                if feature_scores[ids[f_i]] < threshold_f:
                    break

                current_feature_set = CandidateFeature(
                    IdentityTransformation(2),
                    [self.base_features, all_features[ids[f_i]]])
                print(current_feature_set)
                result = self.evaluate_candidates([current_feature_set],
                                                  myfolds)[0]
                evaluated_candidate_features += 1
                improvement = result.runtime_properties['score'] - results[
                    i].runtime_properties['score']

                print("Candidate: " + str(all_features[ids[f_i]]) +
                      " score: " + str(result.runtime_properties['score']) +
                      " info: " + str(feature_scores[ids[f_i]]))
                print("improvement: " + str(improvement))
                if improvement > best_improvement_so_far:
                    best_improvement_so_far = improvement
                    best_feature_so_far = result

                    results[i] = best_feature_so_far
                    results[i].runtime_properties[
                        'score_improvement'] = improvement
                    results[i].runtime_properties[
                        'info_gain'] = feature_scores[ids[f_i]]
                    results[i].runtime_properties['global time'] = time.time(
                    ) - self.global_starting_time

                    pickle.dump(
                        results,
                        open(
                            Config.get("tmp.folder") + "/explorekit_results.p",
                            "wb"))

                if improvement >= epsilon_w:
                    break
                if evaluated_candidate_features >= R_w:
                    break

            if best_improvement_so_far > threshold_w:
                self.base_features = best_feature_so_far
            else:
                return self.base_features

            # avoid shadowing the outer loop variable i
            all_features_new = []
            for f_idx in range(len(feature_scores)):
                if feature_scores[f_idx] >= 0:
                    all_features_new.append(all_features[f_idx])
            all_features = all_features_new
        return results
Example #26
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        if self.save_logs:
            cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        self.complexity_delta = 1.0

        unique_raw_combinations = False


        baseline_score = 0.0  # self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        all_evaluated_features = set()

        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
        my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)

        my_globale_module.materialized_set = set()
        my_globale_module.predictions_set = set()

        number_of_multiple_cvs = 10
        nested_my_globale_module.splitting_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs)
        nested_my_globale_module.model_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs)

        #pickle.dump(my_globale_module.target_test_folds_global, open('/tmp/test_groundtruth.p', 'wb+'))


        c = 1
        while True:
            current_layer: List[CandidateFeature] = []

            if c <= self.max_feature_depth:
                #0th
                if c == 1:
                    cost_2_raw_features[c]: List[CandidateFeature] = []
                    #print(self.raw_features)
                    for raw_f in self.raw_features:
                        sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                        raw_f.sympy_representation = sympy_representation
                        all_evaluated_features.add(sympy_representation)
                        if raw_f.is_numeric():
                            if raw_f.properties['missing_values']:
                                raw_f.runtime_properties['score'] = 0.0
                                cost_2_raw_features[c].append(raw_f)
                            else:
                                current_layer.append(raw_f)
                            #print("numeric: " + str(raw_f))
                        else:
                            raw_f.runtime_properties['score'] = 0.0
                            cost_2_raw_features[c].append(raw_f)
                            #print("nonnumeric: " + str(raw_f))

                        self.materialize_raw_features(raw_f)
                        #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

                # first unary
                # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?)
                unary_candidates_to_be_applied: List[CandidateFeature] = []
                if (c - 1) in cost_2_raw_features:
                    unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
                if (c - 1) in cost_2_unary_transformed:
                    unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
                if (c - 1) in cost_2_binary_transformed:
                    unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])

                all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
                current_layer.extend(all_unary_features)

                #second binary
                #get length 2 partitions for current cost
                partition = self.get_length_2_partition(c-1)
                #print("bin: c: " + str(c) + " partition" + str(partition))

                #apply cross product from partitions
                binary_candidates_to_be_applied: List[CandidateFeature] = []
                for p in partition:
                    lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                    for element in range(2):
                        if p[element] in cost_2_raw_features:
                            lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                        if p[element] in cost_2_unary_transformed:
                            lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                        if p[element] in cost_2_binary_transformed:
                            lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

                    for bt in binary_transformations:
                        list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                        #print(list_of_combinations)
                        for combo in list_of_combinations:
                            if bt.is_applicable(combo):
                                sympy_representation = bt.get_sympy_representation(
                                    [p.get_sympy_representation() for p in combo])
                                try:
                                    if len(sympy_representation.free_symbols) > 0:  # if expression is not constant
                                        if sympy_representation not in all_evaluated_features:
                                            bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                            bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                            all_evaluated_features.add(sympy_representation)
                                            binary_candidates_to_be_applied.append(bin_candidate)
                                        else:
                                            # print("skipped: " + str(sympy_representation))
                                            pass
                                    else:
                                        # print("skipped: " + str(sympy_representation))
                                        pass
                                except:
                                    pass
                current_layer.extend(binary_candidates_to_be_applied)

            #third: feature combinations
            #first variant: treat combination as a transformation
            #therefore, we can use the same partition as for binary data
            partition = self.get_length_2_partition(c)
            #print("combo c: " + str(c) + " partition" + str(partition))


            def filter_minus(features: List[CandidateFeature]):
                # for LogisticRegression, drop MinusTransformation features;
                # for all other classifiers, keep the list unchanged
                if my_globale_module.classifier_global != LogisticRegression:
                    return features
                filtered_features: List[CandidateFeature] = []
                for check_f in features:
                    if not isinstance(check_f.transformation, MinusTransformation):
                        filtered_features.append(check_f)
                return filtered_features

            '''
            combinations_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(filter_minus(cost_2_unary_transformed[p[element]]))
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(filter_minus(cost_2_binary_transformed[p[element]]))
                    if p[element] in cost_2_combination:
                        lists_for_each_element[element].extend(cost_2_combination[p[element]])

                combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1])
            current_layer.extend(combinations_to_be_applied)
            '''



            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print("From " + str(length) + " combinations, we filter " +  str(length - len(current_layer)) + " nonunique raw feature combinations.")



            #now evaluate all from this layer
            #print(current_layer)

            print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
            results = evaluate_candidates_parallel(current_layer, self.n_jobs)
            print("----------- Evaluation Finished -----------")





            ##nested cv
            '''
            new_results_with_nested = []
            for r_result in results:
                if type(r_result) != type(None):
                    new_results_with_nested.append(r_result)
            #results = nested_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
            results = multiple_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
            for r_result in results:
                #print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(r_result.runtime_properties['test_score']) + ' nested: ' + str(r_result.runtime_properties['nested_cv_score']))
                print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(
                    r_result.runtime_properties['test_score']) + ' nested: ' + str(
                    r_result.runtime_properties['multiple_cv_score']))
            '''


            #print(results)

            layer_end_time = time.time() - self.global_starting_time

            #calculate whether we drop the evaluated candidate
            for candidate in results:

                ## check if we computed an equivalent feature before
                if candidate is not None and not isinstance(candidate.transformation, IdentityTransformation):
                    materialized_all = []
                    for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                        materialized_all.extend(candidate.runtime_properties['test_transformed'][fold_ii].flatten())
                    materialized = tuple(materialized_all)
                    if materialized in my_globale_module.materialized_set:
                        candidate = None
                    else:
                        my_globale_module.materialized_set.add(materialized)

                '''
                ## check if predictions exist already
                if type(candidate) != type(None) and 'test_fold_predictions' in candidate.runtime_properties:
                    materialized_all = []
                    for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                        materialized_all.extend(candidate.runtime_properties['test_fold_predictions'][fold_ii].flatten())
                    materialized = tuple(materialized_all)
                    if materialized in my_globale_module.predictions_set:
                        candidate = None
                    else:
                        my_globale_module.predictions_set.add(materialized)
                '''



                if candidate is not None:
                    candidate.runtime_properties['layer_end_time'] = layer_end_time

                    #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))


                    if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                        max_feature = candidate

                    if candidate.runtime_properties['passed']:

                        if isinstance(candidate, RawFeature):
                            if not c in cost_2_raw_features:
                                cost_2_raw_features[c]: List[CandidateFeature] = []
                            cost_2_raw_features[c].append(candidate)
                        elif isinstance(candidate.transformation, UnaryTransformation):
                            if not c in cost_2_unary_transformed:
                                cost_2_unary_transformed[c]: List[CandidateFeature] = []
                            cost_2_unary_transformed[c].append(candidate)
                        elif isinstance(candidate.transformation, IdentityTransformation):
                            if not c in cost_2_combination:
                                cost_2_combination[c]: List[CandidateFeature] = []
                            cost_2_combination[c].append(candidate)
                        else:
                            if not c in cost_2_binary_transformed:
                                cost_2_binary_transformed[c]: List[CandidateFeature] = []
                            cost_2_binary_transformed[c].append(candidate)
                    else:
                        if self.save_logs:
                            if not c in cost_2_dropped_evaluated_candidates:
                                cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                            cost_2_dropped_evaluated_candidates[c].append(candidate)
            


            satisfied_count = 0
            if c in cost_2_raw_features:
                satisfied_count += len(cost_2_raw_features[c])
            if c in cost_2_unary_transformed:
                satisfied_count += len(cost_2_unary_transformed[c])
            if c in cost_2_binary_transformed:
                satisfied_count += len(cost_2_binary_transformed[c])
            if c in cost_2_combination:
                satisfied_count += len(cost_2_combination[c])

            all_count = len(current_layer)
            if c == 1:
                all_count = len(cost_2_raw_features[c])


            print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " did satisfy the epsilon threshold.")


            if len(current_layer) > 0:
                if 'test_score' in max_feature.runtime_properties:
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
                else:
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(
                        max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(
                        max_feature.runtime_properties['score']) + "\n")
                #print("hyper: " + str(max_feature.runtime_properties['hyperparameters']))

                #print(max_feature.runtime_properties['fold_scores'])

            # upload best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')


            if self.save_logs:
                tmp_folder = Config.get_default("tmp.folder", "/tmp")
                dumps = [("raw", cost_2_raw_features),
                         ("unary", cost_2_unary_transformed),
                         ("binary", cost_2_binary_transformed),
                         ("combination", cost_2_combination),
                         ("dropped", cost_2_dropped_evaluated_candidates)]
                try:
                    for name, data in dumps:
                        with open(tmp_folder + "/data_" + name + str(self.reader.rotate_test) + ".p", "wb") as f:
                            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
                except Exception:
                    # fall back to unsuffixed file names (e.g. if the reader has no rotate_test)
                    for name, data in dumps:
                        with open(tmp_folder + "/data_" + name + ".p", "wb") as f:
                            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)


            max_feature_per_complexity[c] = max_feature


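            # no maximal complexity given: decide whether to stop via the harmonic
            # mean of simplicity and accuracy over the last three complexity levels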
            if self.c_max is None and c > 2:
                # calculate harmonic mean
                harmonic_means = [0.0]*3
                for h_i in range(len(harmonic_means)):
                    simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c-h_i].get_complexity(), c,
                                                                       cost_2_raw_features, cost_2_unary_transformed,
                                                                       cost_2_binary_transformed, cost_2_combination)
                    accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c-h_i].runtime_properties['score'], c,
                                                                   cost_2_raw_features, cost_2_unary_transformed,
                                                                   cost_2_binary_transformed, cost_2_combination)

                    harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)
                    #print(str(max_feature_per_complexity[c-h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i))

                if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                    print("Best Harmonic Mean: " + str(max_feature_per_complexity[c-2]))
                    break


            if self.max_timestamp is not None and time.time() >= self.max_timestamp:
                break

            c += 1

            if self.c_max is not None and self.c_max < c:
                break





        def extend_all(all_representations: List[CandidateFeature], new_lists):
            for mylist in new_lists:
                all_representations.extend(mylist)

        #gather all representations from the four buckets
        all_representations: List[CandidateFeature] = []
        extend_all(all_representations, cost_2_raw_features.values())
        extend_all(all_representations, cost_2_unary_transformed.values())
        extend_all(all_representations, cost_2_binary_transformed.values())
        extend_all(all_representations, cost_2_combination.values())

        self.all_representations = all_representations

        '''

        #find top k based on cv score
        scores = [c.runtime_properties['score'] for c in all_representations]
        sorted_cv_score_ids = np.argsort(np.array(scores)*-1)
        checking_k = 50
        top_k_representations = [all_representations[sorted_id] for sorted_id in sorted_cv_score_ids[0:checking_k]]

        #from top k - select best based on nested cv score
        top_k_representations = multiple_cv_score_parallel(top_k_representations, self.reader.splitted_values['train'],
                                           self.reader.splitted_target['train'])

        scores = [c.runtime_properties['multiple_cv_score'] for c in top_k_representations]

        max_nested_cv_score = -1
        max_nested_rep = None
        for eval_candidate in top_k_representations:
            if eval_candidate.runtime_properties['multiple_cv_score'] > max_nested_cv_score:
                max_nested_cv_score = eval_candidate.runtime_properties['multiple_cv_score']
                max_nested_rep = eval_candidate

        print(max_nested_rep)
        max_feature = max_nested_rep
        '''

        '''
        all_features = list(max_feature_per_complexity.values())
        all_features = multiple_cv_score_parallel(all_features, self.reader.splitted_values['train'], self.reader.splitted_target['train'])

        best_multiple_cv_score = -np.inf
        best_multiple_cv_candidate = None
        for all_f in all_features:
            if all_f.runtime_properties['multiple_cv_score'] > best_multiple_cv_score:
                best_multiple_cv_score = all_f.runtime_properties['multiple_cv_score']
                best_multiple_cv_candidate = all_f

        #find the most simple representation that is within the best representation's std
        complexities = [all_f.get_complexity() for all_f in all_features]
        ids_complex = np.argsort(complexities)
        for all_f_i in range(len(all_features)):
            print(str(all_features[ids_complex[all_f_i]]) + ' mcv: ' + str(all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score']) + ' mcv_std: ' + str(
                all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score_std']))

            if all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score'] > best_multiple_cv_candidate.runtime_properties['multiple_cv_score'] - best_multiple_cv_candidate.runtime_properties['multiple_cv_score_std']:
                max_feature = all_features[ids_complex[all_f_i]]
                break

        print(max_feature)
        '''

        #min-AICc selection: keep the representation with the lowest global AICc
        min_aicc = np.inf
        min_aicc_feature = None

        all_aiccs = []
        for rep in list(max_feature_per_complexity.values()):
            all_aiccs.append(np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity']))

        def calculate_AIC_for_classification_paper(rss, n, k):
            AIC = 2 * k + float(n) * np.log(rss / float(n))
            return AIC

        def calculate_AICc_for_classification_paper(rss, n, k):
            AIC = calculate_AIC_for_classification_paper(rss, n, k)
            AICc = AIC + ((2 * k * (k + 1)) / (n - k - 1))
            return AICc
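
        # Worked example (hypothetical numbers): rss=25.0, n=100, k=3 gives
        # AIC = 2*3 + 100*ln(0.25) ≈ -132.6; the small-sample correction adds
        # 2*3*(3+1)/(100-3-1) = 0.25, so AICc ≈ -132.4.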


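        # global AICc: pool rss and sample counts across all folds and use the
        # representation's complexity as k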
        def calc_global_aicc(rep):
            return calculate_AICc_for_classification_paper(np.sum(rep.runtime_properties['additional_metrics']['rss']), np.sum(rep.runtime_properties['additional_metrics']['n']), rep.get_complexity())

        def is_better(old_aics, new_aics):
            print(np.sum(np.array(new_aics) < np.array(old_aics)))
            return np.sum(np.array(new_aics) < np.array(old_aics)) > len(new_aics) / 2.0

        for rep in list(max_feature_per_complexity.values()):
            curr = np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity'])
            #print(str(rep) + ': ' + str(curr) + ' AICc min: ' + str(np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' AICc std: ' + str(np.std(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' P: ' + str(np.exp((min(all_aiccs) - curr)/2)) + ' CV AUC: ' + str(rep.runtime_properties['score']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['AICc_complexity']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['rss']))
            print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['n']))

            print(str(rep) + 'global_aicc: ' + str(calc_global_aicc(rep)))

            #if type(min_aicc_feature) == type(None) or is_better(min_aicc_feature.runtime_properties['additional_metrics']['AICc_complexity'], rep.runtime_properties['additional_metrics']['AICc_complexity']):
            if min_aicc_feature is None or calc_global_aicc(rep) < calc_global_aicc(min_aicc_feature):
                #min_aicc = np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])
                min_aicc_feature = rep
        max_feature = min_aicc_feature

        print(max_feature)

        return max_feature
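
The min-AICc selection above needs only each representation's pooled rss, sample count, and complexity. A minimal standalone sketch of the same argmin logic, using hypothetical numbers and none of the CandidateFeature machinery:

import numpy as np

def aicc(rss, n, k):
    aic = 2 * k + n * np.log(rss / n)
    return aic + (2 * k * (k + 1)) / (n - k - 1)

# (rss, n, k) per candidate representation -- hypothetical values
candidates = [(30.0, 100, 1), (25.0, 100, 3), (24.5, 100, 9)]
print(min(candidates, key=lambda t: aicc(*t)))  # -> (25.0, 100, 3): k=9 does not pay off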
Example #27
0
    def evaluate(self, candidate: CandidateFeature, score=make_scorer(f1_score, average='micro')):

        if self.max_timestamp is not None and time.time() >= self.max_timestamp:
            raise RuntimeError('Out of time!')

        result = {}
        train_transformed = {}
        validation_transformed = {}
        train_and_validation_transformed = [None] * self.folds
        test_transformed = [None] * self.folds
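        # the train/validation transforms are dicts keyed by fold; the combined
        # train+validation and test transforms are per-fold lists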

        # holdout data for the final train/test evaluation (only filled when score.test is enabled)
        training_all = None
        one_test_set_transformed = None

        result['train_transformed'] = None
        result['validation_transformed'] = None
        result['train_and_validation_transformed'] = None
        result['test_transformed'] = None
        result['one_test_set_transformed'] = None

        if isinstance(candidate, RawFeature):

            if Config.get_default('score.test', 'False') == 'True':
                result['training_all'] = training_all = self.name_to_training_all[str(candidate)]
                result['one_test_set_transformed'] = one_test_set_transformed = self.name_to_one_test_set_transformed[str(candidate)]

            result['train_transformed'] = train_transformed = self.name_to_train_transformed[str(candidate)]
            result['validation_transformed'] = validation_transformed = self.name_to_validation_transformed[str(candidate)]
            result['train_and_validation_transformed'] = train_and_validation_transformed = self.name_to_train_and_validation_transformed[str(candidate)]
            result['test_transformed'] = test_transformed = self.name_to_test_transformed[str(candidate)]

        else:

            #print(self.name_to_train_transformed.keys())

            #merge columns from parents
            for key, value in self.name_to_train_transformed[str(list(candidate.parents)[0])].items():
                train_transformed_input = np.hstack([self.name_to_train_transformed[str(p)][key] for p in candidate.parents])
                validation_transformed_input = np.hstack([self.name_to_validation_transformed[str(p)][key] for p in candidate.parents])

                candidate.transformation.fit(train_transformed_input)

                train_transformed[key] = candidate.transformation.transform(train_transformed_input)
                validation_transformed[key] = candidate.transformation.transform(validation_transformed_input)

            for fold_i in range(self.folds):
                train_and_validation_transformed_input = np.hstack([self.name_to_train_and_validation_transformed[str(p)][fold_i] for p in candidate.parents])
                test_transformed_input = np.hstack([self.name_to_test_transformed[str(p)][fold_i] for p in candidate.parents])

                candidate.transformation.fit(train_and_validation_transformed_input)

                train_and_validation_transformed[fold_i] = candidate.transformation.transform(train_and_validation_transformed_input)
                test_transformed[fold_i] = candidate.transformation.transform(test_transformed_input)

            if Config.get_default('score.test', 'False') == 'True':
                training_all_input = np.hstack(
                    [self.name_to_training_all[str(p)] for p in candidate.parents])
                one_test_set_transformed_input = np.hstack(
                    [self.name_to_one_test_set_transformed[str(p)] for p in candidate.parents])

                candidate.transformation.fit(training_all_input)
                training_all = candidate.transformation.transform(training_all_input)
                one_test_set_transformed = candidate.transformation.transform(one_test_set_transformed_input)

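        # nested grid search returns the mean validation score, the test score,
        # the winning hyperparameters, and the test-set predictions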
        candidate.runtime_properties['score'], candidate.runtime_properties['test_score'], candidate.runtime_properties['hyperparameters'], y_pred = self.nested_grid_search(train_transformed, validation_transformed, train_and_validation_transformed, test_transformed, training_all, one_test_set_transformed)

        if Config.get_default('store.predictions', 'False') == 'True':
            candidate.runtime_properties['predictions'] = y_pred

        if not isinstance(candidate, RawFeature):
            #only save the transformed data if we need it in the future
            max_parent = np.max([p.runtime_properties['score'] for p in candidate.parents])
            accuracy_delta = candidate.runtime_properties['score'] - max_parent
            if accuracy_delta / self.complexity_delta > self.epsilon:

                result['train_transformed'] = train_transformed
                result['validation_transformed'] = validation_transformed
                result['train_and_validation_transformed'] = train_and_validation_transformed
                result['test_transformed'] = test_transformed

                result['training_all'] = training_all
                result['one_test_set_transformed'] = one_test_set_transformed

                # derive properties from the first fold's transformed data
                candidate.derive_properties(result['train_and_validation_transformed'][0])


        return result
Example #28
0
def evaluate(candidate: CandidateFeature,
             classifier,
             grid_search_parameters,
             preprocessed_folds,
             score,
             train_data,
             current_target,
             train_X_all,
             train_y_all_target,
             test_data,
             test_target,
             cv_jobs=1):
    pipeline = Pipeline([('features',
                          FeatureUnion([(candidate.get_name(),
                                         candidate.pipeline)])),
                         ('classifier', classifier())])

    refit = False
    if Config.get_default('score.test',
                          'False') == 'True' and not Config.get_default(
                              'instance.selection', 'False') == 'True':
        refit = True

    print(grid_search_parameters)
    clf = GridSearchCV(pipeline,
                       grid_search_parameters,
                       cv=preprocessed_folds,
                       scoring=score,
                       iid=False,
                       error_score='raise',
                       refit=refit,
                       n_jobs=cv_jobs)
    clf.fit(train_data, current_target)  #dataset.splitted_values['train']
    candidate.runtime_properties['score'] = clf.best_score_
    candidate.runtime_properties['hyperparameters'] = clf.best_params_

    # record, per validation fold, which predictions matched the target
    # (clf.predict requires the grid search to have been fit with refit=True)
    test_fold_predictions = []
    for fold in range(len(preprocessed_folds)):
        test_fold_predictions.append(
            clf.predict(train_data[preprocessed_folds[fold][1]]) ==
            current_target[preprocessed_folds[fold][1]])
    candidate.runtime_properties['test_fold_predictions'] = test_fold_predictions

    if Config.get_default('score.test',
                          'False') == 'True' and len(test_data) > 0:
        if Config.get_default('instance.selection', 'False') == 'True':
            clf = GridSearchCV(pipeline,
                               grid_search_parameters,
                               cv=preprocessed_folds,
                               scoring=score,
                               iid=False,
                               error_score='raise',
                               refit=True)

            clf.fit(train_X_all, train_y_all_target)
        candidate.runtime_properties['test_score'] = clf.score(
            test_data, test_target)  #self.dataset.splitted_values['test']
    else:
        candidate.runtime_properties['test_score'] = 0.0

    return candidate
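
A minimal sketch of the inputs this evaluate() variant expects, on an assumed binary toy dataset (names below are illustrative, not from the original source): precomputed (train_idx, test_idx) folds plus a grid that addresses the 'classifier' pipeline step.

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, 100)
preprocessed_folds = list(StratifiedKFold(n_splits=5, shuffle=True,
                                          random_state=42).split(X, y))
grid_search_parameters = {'classifier__C': [0.1, 1.0, 10.0]}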
Example #29
0
ground_truth = [28, 48, 64, 105, 128, 153, 241, 281, 318, 336, 338, 378, 433, 442, 451, 453, 455, 472, 475, 493]


print(len(ground_truth))

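# flag the numeric representations whose name matches a ground-truth feature 'V<id>'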
mask = np.zeros(len(numeric_representations), dtype=bool)
for i in range(len(numeric_representations)):
	for g in ground_truth:
		if str(numeric_representations[i]) == 'V' + str(g):
			mask[i] = True
			break

print(np.sum(mask))

all_features = CandidateFeature(IdentityTransformation(-1), numeric_representations)
all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features])

#foreigner = np.array(X_train[:,7])
#gender = np.array(['female' in personal_status for personal_status in X_train[:,15]])

scoring = {'auc': make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)}


#for count_i in range(10):
parameter_grid = {'model__penalty': ['l2'], 'model__C': [1], 'model__solver': ['lbfgs'],
				  'model__class_weight': ['balanced'], 'model__max_iter': [10000], 'model__multi_class': ['auto']}

my_pipeline = Pipeline([('features', all_standardized.pipeline),
						#('selection', L1Selection()),
						#('selection', SelectKBest(score_func=mutual_info_classif,k=10)),
						('model', LogisticRegression())])  # assumed completion: the 'model__*' grid above implies a sklearn.linear_model.LogisticRegression step named 'model'
Example #30
0
def run_multiple_cross_validation(feature: CandidateFeature,
                                  splitted_values_train, splitted_target_train,
                                  parameters, model, score):

    X_train = splitted_values_train
    y_train = splitted_target_train

    pipeline = generate_pipeline(feature, model)
    #pipeline = generate_smote_pipeline(feature, model)

    multiple_cv_score = []

    multiple_cv_complexity_score = []

    hyperparameters2count = {}

    #print(str(feature) + ' before: ' + str(feature.runtime_properties['hyperparameters']))

    for m_i in range(len(nested_my_globale_module.model_seeds)):
        preprocessed_folds = []
        for train, test in StratifiedKFold(
                n_splits=len(nested_my_globale_module.splitting_seeds),
                shuffle=True,
                random_state=nested_my_globale_module.splitting_seeds[m_i]
        ).split(splitted_values_train, splitted_target_train):
            preprocessed_folds.append((train, test))

        #prefix parameter keys with 'c__' so they address the pipeline's classifier step
        new_parameters = copy.deepcopy(parameters)
        new_parameters['random_state'] = [
            int(nested_my_globale_module.model_seeds[m_i])
        ]
        old_keys = list(new_parameters.keys())
        for k in old_keys:
            if not str(k).startswith('c__'):
                new_parameters['c__' + str(k)] = new_parameters.pop(k)

        scoring = {
            'accuracy': score,
            'complexity': make_scorer(customAICc,
                                      greater_is_better=False,
                                      needs_proba=True,
                                      k=feature.get_complexity())
        }

        cv = GridSearchCV(pipeline,
                          param_grid=new_parameters,
                          scoring=scoring,
                          cv=preprocessed_folds,
                          refit='accuracy')
        cv.fit(X_train, y_train)
        multiple_cv_score.append(cv.best_score_)

        multiple_cv_complexity_score.append(
            cv.cv_results_['mean_test_complexity'][cv.best_index_])

        key = hashabledict(cv.best_params_)
        hyperparameters2count[key] = hyperparameters2count.get(key, 0) + 1
        '''
			new_parameters = copy.deepcopy(feature.runtime_properties['hyperparameters'])
			new_parameters['random_state'] = int(nested_my_globale_module.model_seeds[m_i])
			old_keys = list(new_parameters.keys())
			for k in old_keys:
				if not str(k).startswith('c__'):
					new_parameters['c__' + str(k)] = new_parameters.pop(k)

			pipeline.set_params(**new_parameters)

			cv_results = cross_validate(pipeline, X_train, y_train, scoring=score, cv=preprocessed_folds)
			multiple_cv_score.append(np.mean(cv_results['test_score']))
			'''

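    # majority vote: keep the hyperparameter setting that won for the most seeds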
    feature.runtime_properties['hyperparameters'] = max(
        hyperparameters2count.items(), key=operator.itemgetter(1))[0]

    new_parameters = copy.deepcopy(
        feature.runtime_properties['hyperparameters'])
    old_keys = list(new_parameters.keys())
    for k in old_keys:
        if str(k).startswith('c__'):
            new_parameters[str(k[3:])] = new_parameters.pop(k)
    feature.runtime_properties['hyperparameters'] = new_parameters

    print(
        str(feature) + ' AICc: ' + str(np.mean(multiple_cv_complexity_score)))

    #print(str(feature) + ' after: ' + str(feature.runtime_properties['hyperparameters']))

    return np.mean(multiple_cv_score), np.std(multiple_cv_score)
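
The 'c__' prefixing above routes plain estimator parameters to the pipeline step named 'c', and the later stripping restores plain names. A standalone sketch of that round trip over a hypothetical parameter dict:

params = {'penalty': ['l2'], 'C': [1]}
prefixed = {(k if k.startswith('c__') else 'c__' + k): v for k, v in params.items()}
stripped = {(k[3:] if k.startswith('c__') else k): v for k, v in prefixed.items()}
assert stripped == params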