Beispiel #1
0
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        if self.save_logs:
            cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        self.complexity_delta = 1.0

        unique_raw_combinations = False


        baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        all_evaluated_features = set()

        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)




        ############################

        # start

        ############################

        current_layer = []
        c = 1

        cost_2_raw_features[c]: List[CandidateFeature] = []
        # print(self.raw_features)
        for raw_f in self.raw_features:
            sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
            raw_f.sympy_representation = sympy_representation
            all_evaluated_features.add(sympy_representation)
            if raw_f.is_numeric():
                current_layer.append(raw_f)
                # print("numeric: " + str(raw_f))
            else:
                raw_f.runtime_properties['score'] = 0.0
                cost_2_raw_features[c].append(raw_f)
                # print("nonnumeric: " + str(raw_f))

            self.materialize_raw_features(raw_f)
            raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

        # now evaluate all from this layer
        # print(current_layer)
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = evaluate_candidates(current_layer)
        print("----------- Evaluation Finished -----------")

        layer_end_time = time.time() - self.global_starting_time

        # calculate whether we drop the evaluated candidate
        for candidate in results:
            if type(candidate) != type(None):
                candidate.runtime_properties['layer_end_time'] = layer_end_time

                # print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))

                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate

                if candidate.runtime_properties['passed']:
                    if isinstance(candidate, RawFeature):
                        if not c in cost_2_raw_features:
                            cost_2_raw_features[c]: List[CandidateFeature] = []
                        cost_2_raw_features[c].append(candidate)
                    elif isinstance(candidate.transformation, UnaryTransformation):
                        if not c in cost_2_unary_transformed:
                            cost_2_unary_transformed[c]: List[CandidateFeature] = []
                        cost_2_unary_transformed[c].append(candidate)
                    elif isinstance(candidate.transformation, IdentityTransformation):
                        if not c in cost_2_combination:
                            cost_2_combination[c]: List[CandidateFeature] = []
                        cost_2_combination[c].append(candidate)
                    else:
                        if not c in cost_2_binary_transformed:
                            cost_2_binary_transformed[c]: List[CandidateFeature] = []
                        cost_2_binary_transformed[c].append(candidate)
                else:
                    if self.save_logs:
                        if not c in cost_2_dropped_evaluated_candidates:
                            cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                        cost_2_dropped_evaluated_candidates[c].append(candidate)

        print(cost_2_raw_features[c])

        #select next representation

        #next_id = np.argmax([rf.runtime_properties['score'] for rf in cost_2_raw_features[1]])
        next_id = np.random.randint(len(cost_2_raw_features[1]))
        next_rep = cost_2_raw_features[c][next_id]

        max_rep = next_rep

        current_lambda = 0

        number_runs= 200

        rep_succesion = []

        for runs in range(number_runs):
            rep_succesion.append(next_rep)
            #print('next: ' + str(next_rep))

            #######################
            #create branch
            #######################
            current_layer = []
            # first unary
            if not isinstance(next_rep.transformation, IdentityTransformation):
                current_layer.extend(self.generate_features(unary_transformations, [next_rep], all_evaluated_features))

            # second binary
            if not isinstance(next_rep.transformation, IdentityTransformation):
                binary_candidates_to_be_applied = []
                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge([next_rep], cost_2_raw_features[1],
                                                               bt.parent_feature_order_matters,
                                                               bt.parent_feature_repetition_is_allowed)
                    # print(list_of_combinations)
                    for combo in list_of_combinations:
                        if bt.is_applicable(combo):
                            sympy_representation = bt.get_sympy_representation(
                                [p.get_sympy_representation() for p in combo])
                            try:
                                if len(sympy_representation.free_symbols) > 0:  # if expression is not constant
                                    if not sympy_representation in all_evaluated_features:
                                        bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                        bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                        binary_candidates_to_be_applied.append(bin_candidate)
                                    else:
                                        # print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                        pass
                                else:
                                    # print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                    pass
                            except:
                                pass
                current_layer.extend(binary_candidates_to_be_applied)

            # third: feature combinations
            '''
            combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, [next_rep], cost_2_raw_features[1])
            current_layer.extend(combinations_to_be_applied)
            '''
            #print(current_layer)

            # select next representation
            shuffled_indices = np.arange(len(current_layer))
            np.random.shuffle(shuffled_indices)
            for rep_i in range(len(current_layer)):
                new_rep = current_layer[shuffled_indices[rep_i]]
                all_evaluated_features.add(next_rep.sympy_representation)

                new_rep = evaluate_candidates([new_rep])[0]
                if new_rep != None:
                    break

            print(str(new_rep) + " cv score: " + str(new_rep.runtime_properties['score']) + " test: " + str(
                new_rep.runtime_properties['test_score']))
            if new_rep == None:
                break

            if new_rep.runtime_properties['score'] * self.score._sign > max_rep.runtime_properties['score']:
                max_rep = new_rep
                print("max representation: " + str(max_rep))

            if new_rep.runtime_properties['score'] * self.score._sign <= rep_succesion[-1*(current_lambda+1)].runtime_properties['score']:
                current_lambda += 1
            if current_lambda >= self.lambda_threshold:
                next_rep = max_rep
                current_lambda = 0
            else:
                next_rep = new_rep
Beispiel #2
0
    def run(self):

        self.global_starting_time = time.time()

        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)



        cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
        cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
        cost_2_combination: Dict[int, List[CandidateFeature]] = {}

        if self.save_logs:
            cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

        self.complexity_delta = 1.0

        unique_raw_combinations = False


        baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
        #print("baseline: " + str(baseline_score))


        max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
        max_feature.runtime_properties['score'] = -float("inf")

        max_feature_per_complexity: Dict[int, CandidateFeature] = {}

        all_evaluated_features = set()

        my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
        my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
        my_globale_module.score_global = copy.deepcopy(self.score)
        my_globale_module.classifier_global = copy.deepcopy(self.classifier)
        my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
        my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
        my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
        my_globale_module.test_target_global = copy.deepcopy(self.test_target)
        my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
        my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
        my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
        my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
        my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)





        c = 1
        while(True):
            current_layer: List[CandidateFeature] = []

            #0th
            if c == 1:
                cost_2_raw_features[c]: List[CandidateFeature] = []
                #print(self.raw_features)
                for raw_f in self.raw_features:
                    sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                    raw_f.sympy_representation = sympy_representation
                    all_evaluated_features.add(sympy_representation)
                    if raw_f.is_numeric():
                        if raw_f.properties['missing_values']:
                            raw_f.runtime_properties['score'] = 0.0
                            cost_2_raw_features[c].append(raw_f)
                        else:
                            current_layer.append(raw_f)
                        #print("numeric: " + str(raw_f))
                    else:
                        raw_f.runtime_properties['score'] = 0.0
                        cost_2_raw_features[c].append(raw_f)
                        #print("nonnumeric: " + str(raw_f))

                    self.materialize_raw_features(raw_f)
                    #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

            # first unary
            # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?)
            unary_candidates_to_be_applied: List[CandidateFeature] = []
            if (c - 1) in cost_2_raw_features:
                unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
            if (c - 1) in cost_2_unary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
            if (c - 1) in cost_2_binary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])

            all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
            current_layer.extend(all_unary_features)

            #second binary
            #get length 2 partitions for current cost
            partition = self.get_length_2_partition(c-1)
            #print("bin: c: " + str(c) + " partition" + str(partition))

            #apply cross product from partitions
            binary_candidates_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                    #print(list_of_combinations)
                    for combo in list_of_combinations:
                        if bt.is_applicable(combo):
                            sympy_representation = bt.get_sympy_representation(
                                [p.get_sympy_representation() for p in combo])
                            try:
                                if len(sympy_representation.free_symbols) > 0:  # if expression is not constant
                                    if not sympy_representation in all_evaluated_features:
                                        bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                        bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                        all_evaluated_features.add(sympy_representation)
                                        binary_candidates_to_be_applied.append(bin_candidate)
                                    else:
                                        #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                        pass
                                else:
                                    #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                    pass
                            except:
                                pass
            current_layer.extend(binary_candidates_to_be_applied)

            #third: feature combinations
            #first variant: treat combination as a transformation
            #therefore, we can use the same partition as for binary data
            partition = self.get_length_2_partition(c)
            #print("combo c: " + str(c) + " partition" + str(partition))

            combinations_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])
                    if p[element] in cost_2_combination:
                        lists_for_each_element[element].extend(cost_2_combination[p[element]])

                combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1])
            current_layer.extend(combinations_to_be_applied)



            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print("From " + str(length) + " combinations, we filter " +  str(length - len(current_layer)) + " nonunique raw feature combinations.")



            #now evaluate all from this layer
            #print(current_layer)
            print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
            results = evaluate_candidates(current_layer)
            print("----------- Evaluation Finished -----------")

            #print(results)

            layer_end_time = time.time() - self.global_starting_time

            #calculate whether we drop the evaluated candidate
            for candidate in results:
                if type(candidate) != type(None):
                    candidate.runtime_properties['layer_end_time'] = layer_end_time

                    #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))


                    if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                        max_feature = candidate

                    if candidate.runtime_properties['passed']:
                        if isinstance(candidate, RawFeature):
                            if not c in cost_2_raw_features:
                                cost_2_raw_features[c]: List[CandidateFeature] = []
                            cost_2_raw_features[c].append(candidate)
                        elif isinstance(candidate.transformation, UnaryTransformation):
                            if not c in cost_2_unary_transformed:
                                cost_2_unary_transformed[c]: List[CandidateFeature] = []
                            cost_2_unary_transformed[c].append(candidate)
                        elif isinstance(candidate.transformation, IdentityTransformation):
                            if not c in cost_2_combination:
                                cost_2_combination[c]: List[CandidateFeature] = []
                            cost_2_combination[c].append(candidate)
                        else:
                            if not c in cost_2_binary_transformed:
                                cost_2_binary_transformed[c]: List[CandidateFeature] = []
                            cost_2_binary_transformed[c].append(candidate)
                    else:
                        if self.save_logs:
                            if not c in cost_2_dropped_evaluated_candidates:
                                cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                            cost_2_dropped_evaluated_candidates[c].append(candidate)
            


            satisfied_count = 0
            if c in cost_2_raw_features:
                satisfied_count += len(cost_2_raw_features[c])
            if c in cost_2_unary_transformed:
                satisfied_count += len(cost_2_unary_transformed[c])
            if c in cost_2_binary_transformed:
                satisfied_count += len(cost_2_binary_transformed[c])
            if c in cost_2_combination:
                satisfied_count += len(cost_2_combination[c])

            all_count = len(current_layer)
            if c == 1:
                all_count = len(cost_2_raw_features[c])


            print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " did satisfy the epsilon threshold.")


            if len(current_layer) > 0:
                if Config.get_default('score.test', 'False') == 'True':
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
                else:
                    print("\nBest representation found for complexity = " + str(c) + ": " + str(
                        max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(
                        max_feature.runtime_properties['score']) + "\n")
                #print("hyper: " + str(max_feature.runtime_properties['hyperparameters']))

                #print(max_feature.runtime_properties['fold_scores'])

            # upload best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')


            if self.save_logs:
                pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)


            max_feature_per_complexity[c] = max_feature


            if type(self.c_max) == type(None) and c > 2:
                # calculate harmonic mean
                harmonic_means = [0.0]*3
                for h_i in range(len(harmonic_means)):
                    simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c-h_i].get_complexity(), c,
                                                                       cost_2_raw_features, cost_2_unary_transformed,
                                                                       cost_2_binary_transformed, cost_2_combination)
                    accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c-h_i].runtime_properties['score'], c,
                                                                   cost_2_raw_features, cost_2_unary_transformed,
                                                                   cost_2_binary_transformed, cost_2_combination)

                    harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)
                    #print(str(max_feature_per_complexity[c-h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i))

                if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                    print("Best Harmonic Mean: " + str(max_feature_per_complexity[c-2]))
                    break


            if type(self.max_timestamp) != type(None) and time.time() >= self.max_timestamp:
                break

            c += 1

            if type(self.c_max) != type(None) and self.c_max < c:
                break