def generate_features(self, transformations: List[Transformation], features: List[CandidateFeature], all_evaluated_features: Set) -> List[CandidateFeature]:
    generated_features: List[CandidateFeature] = []
    for t_i in transformations:
        for f_i in t_i.get_combinations(features):
            if t_i.is_applicable(f_i):
                sympy_representation = t_i.get_sympy_representation(
                    [p.get_sympy_representation() for p in f_i])
                try:
                    if len(sympy_representation.free_symbols) > 0:  # the expression is not constant
                        if sympy_representation not in all_evaluated_features:
                            candidate = CandidateFeature(copy.deepcopy(t_i), f_i)  # do we need a deep copy here?
                            candidate.sympy_representation = copy.deepcopy(sympy_representation)
                            generated_features.append(candidate)
                            all_evaluated_features.add(sympy_representation)
                        else:
                            #print("skipped: " + str(sympy_representation))
                            pass
                except Exception:
                    # sympy can fail on exotic expressions; treat the candidate as not generable
                    pass
    return generated_features
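# ---------------------------------------------------------------------------
# Illustration (not part of the original code): the deduplication above relies
# on sympy canonicalizing algebraically equivalent expressions, so syntactic
# variants of the same feature collapse to one set entry. A minimal sketch,
# assuming only sympy; the symbols are hypothetical:
#
#   import sympy
#
#   X1, X2 = sympy.symbols('X1 X2')
#   seen = set()
#   for expr in [X1 + X2, X2 + X1, (X1 + X2) * 1]:
#       # all three reduce to the same canonical form X1 + X2,
#       # so only the first survives the membership test
#       if expr not in seen:
#           seen.add(expr)
#   assert len(seen) == 1
# ---------------------------------------------------------------------------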
def generate_merge_for_combination(self, all_evaluated_features, a: List[CandidateFeature], b: List[CandidateFeature]) -> List[CandidateFeature]:
    # note: this returns a list of candidates, not Set[Set[CandidateFeature]] as the annotation previously claimed
    cat_candidates_to_be_applied: List[CandidateFeature] = []
    id_t = IdentityTransformation(None)
    for a_i in range(len(a)):
        for b_i in range(len(b)):
            combo = [a[a_i], b[b_i]]
            if id_t.is_applicable(combo):
                sympy_representation = id_t.get_sympy_representation(
                    [p.get_sympy_representation() for p in combo])
                if sympy_representation not in all_evaluated_features:
                    cat_candidate = CandidateFeature(copy.deepcopy(id_t), combo)
                    cat_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                    #all_evaluated_features.add(sympy_representation)
                    cat_candidates_to_be_applied.append(cat_candidate)
    return cat_candidates_to_be_applied
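# ---------------------------------------------------------------------------
# Sketch (assumption, not the original generate_merge): the pairwise merges
# consumed above can be expressed with itertools.product. Whether (a, b) and
# (b, a) both appear is controlled by parent_feature_order_matters, and
# whether (a, a) appears by parent_feature_repetition_is_allowed:
#
#   import itertools
#
#   def merge_pairs(a, b, order_matters, repetition_allowed):
#       pairs, seen = [], set()
#       for x, y in itertools.product(a, b):
#           if not repetition_allowed and x == y:
#               continue  # drop repeated parents
#           key = (x, y) if order_matters else frozenset((x, y))
#           if key not in seen:
#               seen.add(key)
#               pairs.append([x, y])
#       return pairs
#
#   merge_pairs(['f1', 'f2'], ['f1', 'f2'],
#               order_matters=False, repetition_allowed=False)
#   # -> [['f1', 'f2']]
# ---------------------------------------------------------------------------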
def run(self):
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    #starting_feature_matrix = self.create_starting_features()
    self.generate_target()

    unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)

    cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
    cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_combination: Dict[int, List[CandidateFeature]] = {}

    if self.save_logs:
        cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

    self.complexity_delta = 1.0

    unique_raw_combinations = False

    baseline_score = 0.0  #self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
    #print("baseline: " + str(baseline_score))

    max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
    max_feature.runtime_properties['score'] = -float("inf")

    max_feature_per_complexity: Dict[int, CandidateFeature] = {}

    all_evaluated_features = set()

    # share the run configuration with the evaluation workers via module-level globals
    my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
    my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
    my_globale_module.score_global = copy.deepcopy(self.score)
    my_globale_module.classifier_global = copy.deepcopy(self.classifier)
    my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
    my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
    my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
    my_globale_module.test_target_global = copy.deepcopy(self.test_target)
    my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
    my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
    my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
    my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)

    ############################
    # start
    ############################

    current_layer = []
    c = 1

    cost_2_raw_features[c] = []
    #print(self.raw_features)
    for raw_f in self.raw_features:
        sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
        raw_f.sympy_representation = sympy_representation
        all_evaluated_features.add(sympy_representation)
        if raw_f.is_numeric():
            current_layer.append(raw_f)
            #print("numeric: " + str(raw_f))
        else:
            raw_f.runtime_properties['score'] = 0.0
            cost_2_raw_features[c].append(raw_f)
            #print("nonnumeric: " + str(raw_f))
            self.materialize_raw_features(raw_f)
            raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

    # now evaluate all candidates from this layer
    #print(current_layer)
    print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
    results = evaluate_candidates(current_layer)
    print("----------- Evaluation Finished -----------")

    layer_end_time = time.time() - self.global_starting_time

    # decide whether we keep or drop each evaluated candidate
    for candidate in results:
        if candidate is not None:
            candidate.runtime_properties['layer_end_time'] = layer_end_time
            #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))
            if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                max_feature = candidate
            if candidate.runtime_properties['passed']:
                if isinstance(candidate, RawFeature):
                    if c not in cost_2_raw_features:
                        cost_2_raw_features[c] = []
                    cost_2_raw_features[c].append(candidate)
                elif isinstance(candidate.transformation, UnaryTransformation):
                    if c not in cost_2_unary_transformed:
                        cost_2_unary_transformed[c] = []
                    cost_2_unary_transformed[c].append(candidate)
                elif isinstance(candidate.transformation, IdentityTransformation):
                    if c not in cost_2_combination:
                        cost_2_combination[c] = []
                    cost_2_combination[c].append(candidate)
                else:
                    if c not in cost_2_binary_transformed:
                        cost_2_binary_transformed[c] = []
                    cost_2_binary_transformed[c].append(candidate)
            else:
                if self.save_logs:
                    if c not in cost_2_dropped_evaluated_candidates:
                        cost_2_dropped_evaluated_candidates[c] = []
                    cost_2_dropped_evaluated_candidates[c].append(candidate)

    print(cost_2_raw_features[c])

    # select the starting representation at random
    #next_id = np.argmax([rf.runtime_properties['score'] for rf in cost_2_raw_features[1]])
    next_id = np.random.randint(len(cost_2_raw_features[1]))
    next_rep = cost_2_raw_features[c][next_id]
    max_rep = next_rep

    current_lambda = 0
    number_runs = 200
    rep_succession = []
    for runs in range(number_runs):
        rep_succession.append(next_rep)
        #print('next: ' + str(next_rep))

        #######################
        # create branch
        #######################
        current_layer = []

        # first: unary transformations
        if not isinstance(next_rep.transformation, IdentityTransformation):
            current_layer.extend(self.generate_features(unary_transformations, [next_rep], all_evaluated_features))

        # second: binary transformations
        if not isinstance(next_rep.transformation, IdentityTransformation):
            binary_candidates_to_be_applied = []
            for bt in binary_transformations:
                list_of_combinations = self.generate_merge([next_rep], cost_2_raw_features[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                #print(list_of_combinations)
                for combo in list_of_combinations:
                    if bt.is_applicable(combo):
                        sympy_representation = bt.get_sympy_representation(
                            [p.get_sympy_representation() for p in combo])
                        try:
                            if len(sympy_representation.free_symbols) > 0:  # the expression is not constant
                                if sympy_representation not in all_evaluated_features:
                                    bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                    bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                    binary_candidates_to_be_applied.append(bin_candidate)
                                else:
                                    #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                    pass
                            else:
                                #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                pass
                        except Exception:
                            pass
            current_layer.extend(binary_candidates_to_be_applied)

        # third: feature combinations
        '''
        combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, [next_rep], cost_2_raw_features[1])
        current_layer.extend(combinations_to_be_applied)
        '''
        #print(current_layer)

        # select the next representation: evaluate shuffled candidates until one succeeds
        shuffled_indices = np.arange(len(current_layer))
        np.random.shuffle(shuffled_indices)
        new_rep = None
        for rep_i in range(len(current_layer)):
            new_rep = current_layer[shuffled_indices[rep_i]]
            all_evaluated_features.add(next_rep.sympy_representation)
            new_rep = evaluate_candidates([new_rep])[0]
            if new_rep is not None:
                break

        # check for None before touching runtime_properties (the original printed first and would crash)
        if new_rep is None:
            break

        print(str(new_rep) + " cv score: " + str(new_rep.runtime_properties['score']) + " test: " + str(new_rep.runtime_properties['test_score']))

        if new_rep.runtime_properties['score'] * self.score._sign > max_rep.runtime_properties['score']:
            max_rep = new_rep
            print("max representation: " + str(max_rep))

        # restart from the best representation once we have stagnated for lambda_threshold steps
        if new_rep.runtime_properties['score'] * self.score._sign <= rep_succession[-1 * (current_lambda + 1)].runtime_properties['score']:
            current_lambda += 1
        if current_lambda >= self.lambda_threshold:
            next_rep = max_rep
            current_lambda = 0
        else:
            next_rep = new_rep
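# ---------------------------------------------------------------------------
# Sketch of the restart policy implemented at the end of run() above
# (hypothetical scores; walk() is not part of the original code): the walk
# counts steps that fail to beat the score from current_lambda + 1 steps back
# and, once lambda_threshold is reached, jumps back to the best-so-far.
#
#   def walk(scores, lambda_threshold=3):
#       best_i, current_lambda, restarts = 0, 0, 0
#       history = [scores[0]]
#       for i in range(1, len(scores)):
#           history.append(scores[i])
#           if scores[i] > scores[best_i]:
#               best_i = i
#           if scores[i] <= history[-1 * (current_lambda + 2)]:
#               current_lambda += 1
#           if current_lambda >= lambda_threshold:
#               restarts += 1  # jump back to the best representation
#               current_lambda = 0
#       return restarts
#
#   walk([0.60, 0.61, 0.60, 0.59, 0.58, 0.62])  # -> 1 restart
# ---------------------------------------------------------------------------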
def run(self):
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    #starting_feature_matrix = self.create_starting_features()
    self.generate_target()

    unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)

    cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
    cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_combination: Dict[int, List[CandidateFeature]] = {}

    if self.save_logs:
        cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

    self.complexity_delta = 1.0

    unique_raw_combinations = False

    baseline_score = 0.0  #self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
    #print("baseline: " + str(baseline_score))

    max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
    max_feature.runtime_properties['score'] = -float("inf")

    max_feature_per_complexity: Dict[int, CandidateFeature] = {}

    all_evaluated_features = set()

    # share the run configuration with the evaluation workers via module-level globals
    my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
    my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
    my_globale_module.score_global = copy.deepcopy(self.score)
    my_globale_module.classifier_global = copy.deepcopy(self.classifier)
    my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
    my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
    my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
    my_globale_module.test_target_global = copy.deepcopy(self.test_target)
    my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
    my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
    my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
    my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
    my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)

    my_globale_module.materialized_set = set()
    my_globale_module.predictions_set = set()

    number_of_multiple_cvs = 10
    nested_my_globale_module.splitting_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs)
    nested_my_globale_module.model_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs)

    #pickle.dump(my_globale_module.target_test_folds_global, open('/tmp/test_groundtruth.p', 'wb+'))

    c = 1
    while True:
        current_layer: List[CandidateFeature] = []

        if c <= self.max_feature_depth:
            #0th: raw features
            if c == 1:
                cost_2_raw_features[c] = []
                #print(self.raw_features)
                for raw_f in self.raw_features:
                    sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                    raw_f.sympy_representation = sympy_representation
                    all_evaluated_features.add(sympy_representation)
                    if raw_f.is_numeric():
                        if raw_f.properties['missing_values']:
                            raw_f.runtime_properties['score'] = 0.0
                            cost_2_raw_features[c].append(raw_f)
                        else:
                            current_layer.append(raw_f)
                        #print("numeric: " + str(raw_f))
                    else:
                        raw_f.runtime_properties['score'] = 0.0
                        cost_2_raw_features[c].append(raw_f)
                        #print("nonnumeric: " + str(raw_f))
                        self.materialize_raw_features(raw_f)
                        #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

            # first: unary transformations
            # we apply all unary transformations to all candidates of cost c-1 in the repository (except combinations and other unary?)
            unary_candidates_to_be_applied: List[CandidateFeature] = []
            if (c - 1) in cost_2_raw_features:
                unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
            if (c - 1) in cost_2_unary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
            if (c - 1) in cost_2_binary_transformed:
                unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])

            all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
            current_layer.extend(all_unary_features)

            # second: binary transformations
            # get length-2 partitions of the current cost
            partition = self.get_length_2_partition(c - 1)
            #print("bin: c: " + str(c) + " partition" + str(partition))

            # apply the cross product over the partitions
            binary_candidates_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

                for bt in binary_transformations:
                    list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                    #print(list_of_combinations)
                    for combo in list_of_combinations:
                        if bt.is_applicable(combo):
                            sympy_representation = bt.get_sympy_representation(
                                [p.get_sympy_representation() for p in combo])
                            try:
                                if len(sympy_representation.free_symbols) > 0:  # the expression is not constant
                                    if sympy_representation not in all_evaluated_features:
                                        bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                        bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                        all_evaluated_features.add(sympy_representation)
                                        binary_candidates_to_be_applied.append(bin_candidate)
                                    else:
                                        #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                        pass
                                else:
                                    #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                    pass
                            except Exception:
                                pass
            current_layer.extend(binary_candidates_to_be_applied)

            # third: feature combinations
            # first variant: treat the combination as a transformation;
            # therefore, we can use the same partition as for the binary case
            partition = self.get_length_2_partition(c)
            #print("combo c: " + str(c) + " partition" + str(partition))

            def filter_minus(features: List[CandidateFeature]):
                filtered_features: List[CandidateFeature] = []
                if my_globale_module.classifier_global == LogisticRegression:
                    for check_f in features:
                        if not isinstance(check_f.transformation, MinusTransformation):
                            filtered_features.append(check_f)
                return filtered_features

            '''
            combinations_to_be_applied: List[CandidateFeature] = []
            for p in partition:
                lists_for_each_element: List[List[CandidateFeature]] = [[], []]
                for element in range(2):
                    if p[element] in cost_2_raw_features:
                        lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                    if p[element] in cost_2_unary_transformed:
                        lists_for_each_element[element].extend(filter_minus(cost_2_unary_transformed[p[element]]))
                    if p[element] in cost_2_binary_transformed:
                        lists_for_each_element[element].extend(filter_minus(cost_2_binary_transformed[p[element]]))
                    if p[element] in cost_2_combination:
                        lists_for_each_element[element].extend(cost_2_combination[p[element]])

                combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1])
                current_layer.extend(combinations_to_be_applied)
            '''

            if unique_raw_combinations:
                length = len(current_layer)
                current_layer = self.filter_non_unique_combinations(current_layer)
                print("From " + str(length) + " combinations, we filter " + str(length - len(current_layer)) + " non-unique raw feature combinations.")

        # now evaluate all candidates from this layer
        #print(current_layer)
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = evaluate_candidates_parallel(current_layer, self.n_jobs)
        print("----------- Evaluation Finished -----------")

        ## nested cv
        '''
        new_results_with_nested = []
        for r_result in results:
            if type(r_result) != type(None):
                new_results_with_nested.append(r_result)

        #results = nested_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
        results = multiple_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
        for r_result in results:
            #print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(r_result.runtime_properties['test_score']) + ' nested: ' + str(r_result.runtime_properties['nested_cv_score']))
            print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(r_result.runtime_properties['test_score']) + ' nested: ' + str(r_result.runtime_properties['multiple_cv_score']))
        '''

        #print(results)

        layer_end_time = time.time() - self.global_starting_time

        # decide whether we keep or drop each evaluated candidate
        for candidate in results:
            ## check whether we materialized an equivalent feature before
            if candidate is not None and not isinstance(candidate.transformation, IdentityTransformation):
                materialized_all = []
                for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                    materialized_all.extend(candidate.runtime_properties['test_transformed'][fold_ii].flatten())
                materialized = tuple(materialized_all)
                if materialized in my_globale_module.materialized_set:
                    candidate = None
                else:
                    my_globale_module.materialized_set.add(materialized)

            '''
            ## check whether the predictions exist already
            if type(candidate) != type(None) and 'test_fold_predictions' in candidate.runtime_properties:
                materialized_all = []
                for fold_ii in range(len(my_globale_module.preprocessed_folds_global)):
                    materialized_all.extend(candidate.runtime_properties['test_fold_predictions'][fold_ii].flatten())
                materialized = tuple(materialized_all)
                if materialized in my_globale_module.predictions_set:
                    candidate = None
                else:
                    my_globale_module.predictions_set.add(materialized)
            '''

            if candidate is not None:
                candidate.runtime_properties['layer_end_time'] = layer_end_time
                #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))
                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate
                if candidate.runtime_properties['passed']:
                    if isinstance(candidate, RawFeature):
                        if c not in cost_2_raw_features:
                            cost_2_raw_features[c] = []
                        cost_2_raw_features[c].append(candidate)
                    elif isinstance(candidate.transformation, UnaryTransformation):
                        if c not in cost_2_unary_transformed:
                            cost_2_unary_transformed[c] = []
                        cost_2_unary_transformed[c].append(candidate)
                    elif isinstance(candidate.transformation, IdentityTransformation):
                        if c not in cost_2_combination:
                            cost_2_combination[c] = []
                        cost_2_combination[c].append(candidate)
                    else:
                        if c not in cost_2_binary_transformed:
                            cost_2_binary_transformed[c] = []
                        cost_2_binary_transformed[c].append(candidate)
                else:
                    if self.save_logs:
                        if c not in cost_2_dropped_evaluated_candidates:
                            cost_2_dropped_evaluated_candidates[c] = []
                        cost_2_dropped_evaluated_candidates[c].append(candidate)

        satisfied_count = 0
        if c in cost_2_raw_features:
            satisfied_count += len(cost_2_raw_features[c])
        if c in cost_2_unary_transformed:
            satisfied_count += len(cost_2_unary_transformed[c])
        if c in cost_2_binary_transformed:
            satisfied_count += len(cost_2_binary_transformed[c])
        if c in cost_2_combination:
            satisfied_count += len(cost_2_combination[c])

        all_count = len(current_layer)
        if c == 1:
            all_count = len(cost_2_raw_features[c])

        print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " satisfied the epsilon threshold.")

        if len(current_layer) > 0:
            if 'test_score' in max_feature.runtime_properties:
                print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature)
                      + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score'])
                      + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
            else:
                print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature)
                      + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + "\n")
            #print("hyper: " + str(max_feature.runtime_properties['hyperparameters']))
            #print(max_feature.runtime_properties['fold_scores'])

            # upload the best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')

        if self.save_logs:
            try:
                pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            except Exception:
                pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)

        max_feature_per_complexity[c] = max_feature

        if self.c_max is None and c > 2:
            # calculate the harmonic mean of simplicity and accuracy for the last three layers
            harmonic_means = [0.0] * 3
            for h_i in range(len(harmonic_means)):
                simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c - h_i].get_complexity(), c, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed, cost_2_combination)
                accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c - h_i].runtime_properties['score'], c, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed, cost_2_combination)
                harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)
                #print(str(max_feature_per_complexity[c - h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i))

            if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                print("Best Harmonic Mean: " + str(max_feature_per_complexity[c - 2]))
                break

        if self.max_timestamp is not None and time.time() >= self.max_timestamp:
            break

        c += 1

        if self.c_max is not None and self.c_max < c:
            break

    def extend_all(all_representations: List[CandidateFeature], new_llist):
        for mylist in new_llist:
            all_representations.extend(mylist)

    # collect all representations
    all_representations: List[CandidateFeature] = []
    extend_all(all_representations, cost_2_raw_features.values())
    extend_all(all_representations, cost_2_unary_transformed.values())
    extend_all(all_representations, cost_2_binary_transformed.values())
    extend_all(all_representations, cost_2_combination.values())
    self.all_representations = all_representations

    '''
    #find the top k based on the cv score
    scores = [c.runtime_properties['score'] for c in all_representations]
    sorted_cv_score_ids = np.argsort(np.array(scores) * -1)
    checking_k = 50
    top_k_representations = [all_representations[sorted_id] for sorted_id in sorted_cv_score_ids[0:checking_k]]

    #from the top k, select the best based on the nested cv score
    top_k_representations = multiple_cv_score_parallel(top_k_representations, self.reader.splitted_values['train'], self.reader.splitted_target['train'])
    scores = [c.runtime_properties['multiple_cv_score'] for c in top_k_representations]

    max_nested_cv_score = -1
    max_nested_rep = None
    for eval_candidate in top_k_representations:
        if eval_candidate.runtime_properties['multiple_cv_score'] > max_nested_cv_score:
            max_nested_cv_score = eval_candidate.runtime_properties['multiple_cv_score']
            max_nested_rep = eval_candidate

    print(max_nested_rep)
    max_feature = max_nested_rep
    '''

    '''
    all_features = list(max_feature_per_complexity.values())
    all_features = multiple_cv_score_parallel(all_features, self.reader.splitted_values['train'], self.reader.splitted_target['train'])

    best_multiple_cv_score = -np.inf
    best_multiple_cv_candidate = None
    for all_f in all_features:
        if all_f.runtime_properties['multiple_cv_score'] > best_multiple_cv_score:
            best_multiple_cv_score = all_f.runtime_properties['multiple_cv_score']
            best_multiple_cv_candidate = all_f

    #find the simplest representation that is within the best representation's std
    complexities = [all_f.get_complexity() for all_f in all_features]
    ids_complex = np.argsort(complexities)
    for all_f_i in range(len(all_features)):
        print(str(all_features[ids_complex[all_f_i]]) + ' mcv: ' + str(all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score']) + ' mcv_std: ' + str(all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score_std']))
        if all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score'] > best_multiple_cv_candidate.runtime_properties['multiple_cv_score'] - best_multiple_cv_candidate.runtime_properties['multiple_cv_score_std']:
            max_feature = all_features[ids_complex[all_f_i]]
            break

    print(max_feature)
    '''

    # min-AICc selection
    min_aicc = np.inf
    min_aicc_feature = None

    all_aiccs = []
    for rep in list(max_feature_per_complexity.values()):
        all_aiccs.append(np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity']))

    def calculate_AIC_for_classification_paper(rss, n, k):
        AIC = 2 * k + float(n) * np.log(rss / float(n))
        return AIC

    def calculate_AICc_for_classification_paper(rss, n, k):
        AIC = calculate_AIC_for_classification_paper(rss, n, k)
        AICc = AIC + ((2 * k * (k + 1)) / (n - k - 1))
        return AICc

    def calc_global_aicc(rep):
        return calculate_AICc_for_classification_paper(np.sum(rep.runtime_properties['additional_metrics']['rss']), np.sum(rep.runtime_properties['additional_metrics']['n']), rep.get_complexity())

    def is_better(old_aics, new_aics):
        print(np.sum(np.array(new_aics) < np.array(old_aics)))
        return np.sum(np.array(new_aics) < np.array(old_aics)) > len(new_aics) / 2.0

    for rep in list(max_feature_per_complexity.values()):
        curr = np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity'])
        #print(str(rep) + ': ' + str(curr) + ' AICc min: ' + str(np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' AICc std: ' + str(np.std(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' P: ' + str(np.exp((min(all_aiccs) - curr) / 2)) + ' CV AUC: ' + str(rep.runtime_properties['score']))
        print(str(rep) + ': ' + str(rep.runtime_properties['additional_metrics']['AICc_complexity']))
        print(str(rep) + ': ' + str(rep.runtime_properties['additional_metrics']['rss']))
        print(str(rep) + ': ' + str(rep.runtime_properties['additional_metrics']['n']))
        print(str(rep) + ' global_aicc: ' + str(calc_global_aicc(rep)))

        #if min_aicc_feature is None or is_better(min_aicc_feature.runtime_properties['additional_metrics']['AICc_complexity'], rep.runtime_properties['additional_metrics']['AICc_complexity']):
        if min_aicc_feature is None or calc_global_aicc(rep) < calc_global_aicc(min_aicc_feature):
            #min_aicc = np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])
            min_aicc_feature = rep

    max_feature = min_aicc_feature
    print(max_feature)

    return max_feature
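# ---------------------------------------------------------------------------
# Worked example for the AICc selection above (made-up numbers): a
# representation with a marginally lower residual sum of squares but many more
# parameters can still lose, because the 2k and 2k(k+1)/(n-k-1) terms penalize
# complexity.
#
#   import numpy as np
#
#   def aicc(rss, n, k):
#       aic = 2 * k + float(n) * np.log(rss / float(n))
#       return aic + (2 * k * (k + 1)) / (n - k - 1)
#
#   aicc(rss=40.0, n=100, k=2)   # ~ -87.5  (simple model wins: lower AICc)
#   aicc(rss=39.0, n=100, k=10)  # ~ -71.7  (slightly better fit, 5x parameters)
# ---------------------------------------------------------------------------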
def run(self):
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    #starting_feature_matrix = self.create_starting_features()
    self.generate_target()

    unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)

    cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
    cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_combination: Dict[int, List[CandidateFeature]] = {}

    if self.save_logs:
        cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

    self.complexity_delta = 1.0

    unique_raw_combinations = False

    baseline_score = 0.0  #self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
    #print("baseline: " + str(baseline_score))

    max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
    max_feature.runtime_properties['score'] = -float("inf")

    max_feature_per_complexity: Dict[int, CandidateFeature] = {}

    all_evaluated_features = set()

    # share the run configuration with the evaluation workers via module-level globals
    my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
    my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
    my_globale_module.score_global = copy.deepcopy(self.score)
    my_globale_module.classifier_global = copy.deepcopy(self.classifier)
    my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
    my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
    my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
    my_globale_module.test_target_global = copy.deepcopy(self.test_target)
    my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
    my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
    my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
    my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
    my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)

    c = 1
    while True:
        current_layer: List[CandidateFeature] = []

        #0th: raw features
        if c == 1:
            cost_2_raw_features[c] = []
            #print(self.raw_features)
            for raw_f in self.raw_features:
                sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                raw_f.sympy_representation = sympy_representation
                all_evaluated_features.add(sympy_representation)
                if raw_f.is_numeric():
                    if raw_f.properties['missing_values']:
                        raw_f.runtime_properties['score'] = 0.0
                        cost_2_raw_features[c].append(raw_f)
                    else:
                        current_layer.append(raw_f)
                    #print("numeric: " + str(raw_f))
                else:
                    raw_f.runtime_properties['score'] = 0.0
                    cost_2_raw_features[c].append(raw_f)
                    #print("nonnumeric: " + str(raw_f))
                    self.materialize_raw_features(raw_f)
                    #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

        # first: unary transformations
        # we apply all unary transformations to all candidates of cost c-1 in the repository (except combinations and other unary?)
        unary_candidates_to_be_applied: List[CandidateFeature] = []
        if (c - 1) in cost_2_raw_features:
            unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
        if (c - 1) in cost_2_unary_transformed:
            unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
        if (c - 1) in cost_2_binary_transformed:
            unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])

        all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
        current_layer.extend(all_unary_features)

        # second: binary transformations
        # get length-2 partitions of the current cost
        partition = self.get_length_2_partition(c - 1)
        #print("bin: c: " + str(c) + " partition" + str(partition))

        # apply the cross product over the partitions
        binary_candidates_to_be_applied: List[CandidateFeature] = []
        for p in partition:
            lists_for_each_element: List[List[CandidateFeature]] = [[], []]
            for element in range(2):
                if p[element] in cost_2_raw_features:
                    lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                if p[element] in cost_2_unary_transformed:
                    lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                if p[element] in cost_2_binary_transformed:
                    lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

            for bt in binary_transformations:
                list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                #print(list_of_combinations)
                for combo in list_of_combinations:
                    if bt.is_applicable(combo):
                        sympy_representation = bt.get_sympy_representation(
                            [p.get_sympy_representation() for p in combo])
                        try:
                            if len(sympy_representation.free_symbols) > 0:  # the expression is not constant
                                if sympy_representation not in all_evaluated_features:
                                    bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                    bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                    all_evaluated_features.add(sympy_representation)
                                    binary_candidates_to_be_applied.append(bin_candidate)
                                else:
                                    #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                    pass
                            else:
                                #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                pass
                        except Exception:
                            pass
        current_layer.extend(binary_candidates_to_be_applied)

        # third: feature combinations
        # first variant: treat the combination as a transformation;
        # therefore, we can use the same partition as for the binary case
        partition = self.get_length_2_partition(c)
        #print("combo c: " + str(c) + " partition" + str(partition))

        combinations_to_be_applied: List[CandidateFeature] = []
        for p in partition:
            lists_for_each_element: List[List[CandidateFeature]] = [[], []]
            for element in range(2):
                if p[element] in cost_2_raw_features:
                    lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                if p[element] in cost_2_unary_transformed:
                    lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                if p[element] in cost_2_binary_transformed:
                    lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])
                if p[element] in cost_2_combination:
                    lists_for_each_element[element].extend(cost_2_combination[p[element]])

            combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1])
            current_layer.extend(combinations_to_be_applied)

        if unique_raw_combinations:
            length = len(current_layer)
            current_layer = self.filter_non_unique_combinations(current_layer)
            print("From " + str(length) + " combinations, we filter " + str(length - len(current_layer)) + " non-unique raw feature combinations.")

        # now evaluate all candidates from this layer
        #print(current_layer)
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = evaluate_candidates(current_layer)
        print("----------- Evaluation Finished -----------")

        #print(results)

        layer_end_time = time.time() - self.global_starting_time

        # decide whether we keep or drop each evaluated candidate
        for candidate in results:
            if candidate is not None:
                candidate.runtime_properties['layer_end_time'] = layer_end_time
                #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))
                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate
                if candidate.runtime_properties['passed']:
                    if isinstance(candidate, RawFeature):
                        if c not in cost_2_raw_features:
                            cost_2_raw_features[c] = []
                        cost_2_raw_features[c].append(candidate)
                    elif isinstance(candidate.transformation, UnaryTransformation):
                        if c not in cost_2_unary_transformed:
                            cost_2_unary_transformed[c] = []
                        cost_2_unary_transformed[c].append(candidate)
                    elif isinstance(candidate.transformation, IdentityTransformation):
                        if c not in cost_2_combination:
                            cost_2_combination[c] = []
                        cost_2_combination[c].append(candidate)
                    else:
                        if c not in cost_2_binary_transformed:
                            cost_2_binary_transformed[c] = []
                        cost_2_binary_transformed[c].append(candidate)
                else:
                    if self.save_logs:
                        if c not in cost_2_dropped_evaluated_candidates:
                            cost_2_dropped_evaluated_candidates[c] = []
                        cost_2_dropped_evaluated_candidates[c].append(candidate)

        satisfied_count = 0
        if c in cost_2_raw_features:
            satisfied_count += len(cost_2_raw_features[c])
        if c in cost_2_unary_transformed:
            satisfied_count += len(cost_2_unary_transformed[c])
        if c in cost_2_binary_transformed:
            satisfied_count += len(cost_2_binary_transformed[c])
        if c in cost_2_combination:
            satisfied_count += len(cost_2_combination[c])

        all_count = len(current_layer)
        if c == 1:
            all_count = len(cost_2_raw_features[c])

        print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " satisfied the epsilon threshold.")

        if len(current_layer) > 0:
            if Config.get_default('score.test', 'False') == 'True':
                print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature)
                      + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score'])
                      + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
            else:
                print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature)
                      + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + "\n")
            #print("hyper: " + str(max_feature.runtime_properties['hyperparameters']))
            #print(max_feature.runtime_properties['fold_scores'])

            # upload the best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')

        if self.save_logs:
            pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)

        max_feature_per_complexity[c] = max_feature

        if self.c_max is None and c > 2:
            # calculate the harmonic mean of simplicity and accuracy for the last three layers
            harmonic_means = [0.0] * 3
            for h_i in range(len(harmonic_means)):
                simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c - h_i].get_complexity(), c, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed, cost_2_combination)
                accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c - h_i].runtime_properties['score'], c, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed, cost_2_combination)
                harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)
                #print(str(max_feature_per_complexity[c - h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i))

            if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                print("Best Harmonic Mean: " + str(max_feature_per_complexity[c - 2]))
                break

        if self.max_timestamp is not None and time.time() >= self.max_timestamp:
            break

        c += 1

        if self.c_max is not None and self.c_max < c:
            break
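# ---------------------------------------------------------------------------
# Sketch (assumption, not the original helper): get_length_2_partition(c), used
# by the binary and combination steps in the run() variants above, plausibly
# enumerates the unordered splits of the complexity budget c into two positive
# parts, so that the costs of the two parent lists always sum to c:
#
#   def get_length_2_partition(c):
#       return [[i, c - i] for i in range(1, c // 2 + 1)]
#
#   get_length_2_partition(4)  # -> [[1, 3], [2, 2]]
#   get_length_2_partition(5)  # -> [[1, 4], [2, 3]]
# ---------------------------------------------------------------------------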