def run(self):
    """Evaluate a series of Boruta-based candidates and print their scores.

    One candidate is built for every i in 1..len(self.raw_features); each
    wraps a ``BorutaTransformer(len(self.raw_features), i)`` around the
    identity combination of all raw features. The raw result list, one
    "(i,score)" line per candidate and finally the best-scoring result are
    printed. Nothing is returned.
    """
    # generate all candidates
    self.generate()
    self.generate_target()

    raw_count = len(self.raw_features)
    all_raw = CandidateFeature(IdentityTransformation(raw_count),
                               self.raw_features)

    candidates = [
        CandidateFeature(BorutaTransformer(raw_count, i), [all_raw])
        for i in range(1, raw_count + 1)
    ]

    results = self.evaluate_candidates(candidates)
    print(results)

    # One "(position,score)" line per evaluated candidate, 1-based.
    for position, outcome in enumerate(results, start=1):
        print("(" + str(position) + "," + str(outcome['score']) + ")")

    scores = [outcome['score'] for outcome in results]
    winner = np.argmax(scores)
    print(results[winner])
def run(self):
    """Grid-search the classifier on SelectKBest outputs for k = 1..#raw features.

    For every k a fresh ``SelectKBestTransformer(len(self.raw_features), k)``
    candidate is fitted on the training split, both splits are transformed,
    and a ``GridSearchCV`` over ``self.grid_search_parameters`` is fitted on
    the transformed training data. The elapsed time since the start of the
    sweep and the test score are printed for each k. Nothing is returned.
    """
    # generate all candidates
    self.generate()
    self.generate_target()

    self.global_starting_time = time.time()

    raw_count = len(self.raw_features)
    for k in range(1, raw_count + 1):
        # A new identity combination is built for every k, as before.
        all_raw = CandidateFeature(IdentityTransformation(raw_count),
                                   self.raw_features)
        selector = CandidateFeature(SelectKBestTransformer(raw_count, k),
                                    [all_raw])

        selector.pipeline.fit(self.dataset.splitted_values['train'],
                              self.current_target)
        train_matrix = selector.transform(
            self.dataset.splitted_values['train'])
        test_matrix = selector.transform(
            self.dataset.splitted_values['test'])

        elapsed = time.time() - self.global_starting_time
        print("time: " + str(elapsed))

        search = GridSearchCV(self.classifier(),
                              self.grid_search_parameters,
                              cv=self.preprocessed_folds,
                              scoring=self.score,
                              iid=False,
                              error_score='raise')
        search.fit(train_matrix, self.current_target)

        test_score = search.score(test_matrix, self.test_target)
        print('test score: ' + str(test_score))
        print("\n\n")
def run(self):
    """Evaluate a min-max scaled baseline built from the raw features only.

    Each raw feature is turned into baseline features as follows:

    * numeric and not flagged categorical, without missing values: used as is;
    * numeric and not flagged categorical, with missing values: wrapped in an
      ``ImputationTransformation``;
    * anything else: expanded through ``OneHotGenerator``.

    Every baseline feature is then wrapped in a ``MinMaxScalingTransformation``
    and all of them are combined by an ``IdentityTransformation``. The combined
    candidate is evaluated with ``evaluate_candidates_detail`` on a deep copy of
    the preprocessed folds and its runtime properties are printed.
    Nothing is returned.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate(42)
    self.generate_target()

    myfolds = copy.deepcopy(list(self.preprocessed_folds))

    baseline_features: List[CandidateFeature] = []
    for r in self.raw_features:
        # A missing 'categorical' property is treated as "not categorical",
        # instead of raising KeyError.
        is_categorical = ('categorical' in r.properties
                          and r.properties['categorical'])
        if r.is_numeric() and not is_categorical:
            if not r.properties['missing_values']:
                baseline_features.append(r)
            else:
                baseline_features.append(
                    CandidateFeature(ImputationTransformation(), [r]))
        else:
            baseline_features.extend(
                CandidateFeature(t, [r])
                for t in OneHotGenerator(self.train_X_all, [r]).produce())

    # standardize
    scaled_baseline_features = [
        CandidateFeature(MinMaxScalingTransformation(), [c])
        for c in baseline_features
    ]

    combo = CandidateFeature(IdentityTransformation(len(baseline_features)),
                             scaled_baseline_features)

    results = self.evaluate_candidates_detail([combo], myfolds, 1)
    print(str(results[0].runtime_properties))
def run(self):
    """Evaluate a logistic-regression-based feature selection and print the best result.

    The features returned by ``self.filter_failing_features()`` are combined
    by an ``IdentityTransformation`` and passed through a
    ``FeatureSelectionTransformation(1, 2, <LogisticRegression>)``. The single
    resulting candidate is evaluated and the highest-scoring result is printed.
    Nothing is returned.
    """
    # generate all candidates
    self.generate()
    self.generate_target()

    usable = self.filter_failing_features()
    combined = CandidateFeature(IdentityTransformation(len(usable)), usable)

    estimator = LogisticRegression(penalty='l2',
                                   solver='lbfgs',
                                   class_weight='balanced',
                                   max_iter=10000)
    selection = CandidateFeature(
        FeatureSelectionTransformation(1, 2, estimator), [combined])

    results = self.evaluate_candidates([selection])

    scores = [outcome['score'] for outcome in results]
    print(results[np.argmax(scores)])
def sisso_transfusion_features_new3(
        self, name2feature) -> List[CandidateFeature]:
    """Return one combined candidate: raw features plus four ratio features.

    The added features are, in this order, Recency / Time, Monetary / Time,
    Monetary * Monetary / Time and Recency * Recency / Time, where the
    products are built with ``HigherOrderCommutativeTransformation(np.prod, 2)``
    and the ratios with ``NonCommutativeBinaryTransformation(np.divide)``.

    :param name2feature: mapping that is indexed with the keys 'Recency',
        'Monetary' and 'Time'.
    :return: a single-element list holding the identity combination of all
        collected features.
    """

    def squared(key):
        # Product of the named feature with itself.
        return CandidateFeature(
            HigherOrderCommutativeTransformation(np.prod, 2),
            [name2feature[key], name2feature[key]])

    collected = list(self.raw_features)

    recency_sq = squared('Recency')
    monetary_sq = squared('Monetary')

    collected.append(
        CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                         [name2feature['Recency'], name2feature['Time']]))
    collected.append(
        CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                         [name2feature['Monetary'], name2feature['Time']]))
    collected.append(
        CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                         [monetary_sq, name2feature['Time']]))
    collected.append(
        CandidateFeature(NonCommutativeBinaryTransformation(np.divide),
                         [recency_sq, name2feature['Time']]))

    return [
        CandidateFeature(IdentityTransformation(len(collected)), collected)
    ]
def generate_merge_for_combination(
        self, all_evaluated_features, a: List[CandidateFeature],
        b: List[CandidateFeature]) -> List[CandidateFeature]:
    """Build identity-combination candidates for every new pair from a x b.

    Each pair ``[x, y]`` with ``x`` from ``a`` and ``y`` from ``b`` becomes a
    candidate when the identity transformation is applicable to it and the
    pair's sympy representation has not been seen before.

    :param all_evaluated_features: collection of sympy representations seen
        so far. It is mutated: the representation of every returned candidate
        is added to it.
    :param a: candidates used as first element of each pair.
    :param b: candidates used as second element of each pair.
    :return: list of the newly created combination candidates (the previous
        ``Set[Set[...]]`` annotation did not match the returned value).
    """
    id_t = IdentityTransformation(None)
    cat_candidates_to_be_applied = []

    for first in a:
        for second in b:
            combo = [first, second]
            if not id_t.is_applicable(combo):
                continue

            sympy_representation = id_t.get_sympy_representation(
                [p.get_sympy_representation() for p in combo])
            if sympy_representation in all_evaluated_features:
                continue

            # Each candidate gets its own copy of the transformation.
            cat_candidate = CandidateFeature(copy.deepcopy(id_t), combo)
            cat_candidate.sympy_representation = copy.deepcopy(
                sympy_representation)
            all_evaluated_features.add(sympy_representation)
            cat_candidates_to_be_applied.append(cat_candidate)

    return cat_candidates_to_be_applied
def fit(self, X, y=None):
    """Run feature construction on (X, y) and fit a pipeline on the kept features.

    A ``ComplexityDrivenFeatureConstruction`` run is executed on the data.
    From ``fe.all_representations`` a representation is kept when it

    * has a 'score' entry in its runtime properties,
    * does not have 'object' in the string form of its 'type' property,
    * is not produced by a ``MinMaxScalingTransformation``, and
    * is not a sympy product that has ``S.NegativeOne`` among its arguments.

    The kept representations are stored in ``self.numeric_features``, combined
    by an identity transformation, min-max scaled, and the resulting pipeline
    is fitted on (X, y) and stored in ``self.pipeline_``.

    Side effect: ``X`` is pickled to '/tmp/names.pickle'.

    :return: self
    """
    reader = ScikitReader(X,
                          y,
                          feature_names=self.feature_names,
                          feature_is_categorical=self.feature_is_categorical)
    fe = ComplexityDrivenFeatureConstruction(
        None,
        reader=reader,
        score=self.scoring,
        c_max=self.c_max,
        folds=self.cv,
        max_seconds=self.max_time_secs,
        classifier=self.model.__class__,
        grid_search_parameters=self.parameter_grid,
        n_jobs=self.n_jobs,
        epsilon=self.epsilon,
        remove_parents=False,
        transformation_producer=self.transformation_producer)
    fe.run()

    def is_kept(representation):
        # Only evaluated representations carry a score.
        if 'score' not in representation.runtime_properties:
            return False
        if 'object' in str(representation.properties['type']):
            return False
        if isinstance(representation.transformation,
                      MinMaxScalingTransformation):
            return False
        expression = representation.sympy_representation
        if isinstance(expression, sympy.Mul):
            # Drop products that contain the factor -1.
            return not any(factor == S.NegativeOne
                           for factor in expression._args)
        return True

    numeric_representations = [
        representation for representation in fe.all_representations
        if is_kept(representation)
    ]
    self.numeric_features = numeric_representations

    # NOTE(review): my_list (the feature names) is never read, and the file
    # called names.pickle receives X. Confirm which of the two was meant.
    my_list = [str(feature) for feature in self.numeric_features]

    with open('/tmp/names.pickle', 'wb') as f:
        pickle.dump(X, f, protocol=pickle.HIGHEST_PROTOCOL)

    all_features = CandidateFeature(IdentityTransformation(-1),
                                    numeric_representations)
    all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                        [all_features])

    self.pipeline_ = all_standardized.pipeline
    self.pipeline_.fit(X, y)
    return self
def get_info_gain_of_feature(self, candidate: CandidateFeature):
    """Return a mutual-information score for ``candidate`` next to the base features.

    ``self.base_features`` and ``candidate`` are combined by an
    ``IdentityTransformation(2)``, the combination is fit-transformed on the
    training split, and the last entry of ``mutual_info_classif`` on the
    result is returned.
    NOTE(review): the last entry presumably belongs to ``candidate`` because
    it is the last parent of the combination -- confirm against the pipeline.

    Best effort: any ordinary error during construction, transformation or
    scoring yields 0.0. Unlike a bare ``except:``, KeyboardInterrupt and
    SystemExit are no longer swallowed.

    :param candidate: the feature to score.
    :return: the mutual-information entry, or 0.0 on failure.
    """
    try:
        new_candidate = CandidateFeature(IdentityTransformation(2),
                                         [self.base_features, candidate])
        X = new_candidate.pipeline.fit_transform(
            self.dataset.splitted_values['train'], self.train_y_all_target)
        return mutual_info_classif(X, self.train_y_all_target)[-1]
    except Exception:
        return 0.0
def run(self):
    """Evaluate a standard-scaled baseline built from the raw features only.

    Raw features that are numeric and not flagged categorical are used as is,
    or wrapped in an ``ImputationTransformation`` when their 'missing_values'
    property is set; every other raw feature is expanded through
    ``OneHotGenerator``. All baseline features are wrapped in a
    ``StandardScalingTransformation`` and combined by an identity
    transformation. The number of baseline features is printed.

    :return: the first result of evaluating the combined candidate on a deep
        copy of the preprocessed folds.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate(42)
    self.generate_target()

    myfolds = copy.deepcopy(list(self.preprocessed_folds))

    baseline_features: List[CandidateFeature] = []
    for raw in self.raw_features:
        needs_one_hot = not raw.is_numeric() or (
            'categorical' in raw.properties and raw.properties['categorical'])
        if needs_one_hot:
            for transformation in OneHotGenerator(self.train_X_all,
                                                  [raw]).produce():
                baseline_features.append(
                    CandidateFeature(transformation, [raw]))
            continue

        if raw.properties['missing_values']:
            baseline_features.append(
                CandidateFeature(ImputationTransformation(), [raw]))
        else:
            baseline_features.append(raw)

    # scale everything
    baseline_features = [
        CandidateFeature(StandardScalingTransformation(), [feature])
        for feature in baseline_features
    ]

    print(len(baseline_features))

    combo = CandidateFeature(IdentityTransformation(len(baseline_features)),
                             baseline_features)

    # A hand-built sklearn pipeline (imputation / one-hot / scaling) was kept
    # here as a disabled alternative in the earlier version of this method.

    results = self.evaluate_candidates([combo], myfolds)
    return results[0]
def sisso_transfusion_features_new(self, name2feature):
    """Return one combined candidate: raw features plus Frequency / Time.

    :param name2feature: mapping that is indexed with the keys 'Frequency'
        and 'Time'.
    :return: a single-element list holding the identity combination of the
        raw features and the ratio feature.
    """
    frequency_per_time = CandidateFeature(
        NonCommutativeBinaryTransformation(np.divide),
        [name2feature['Frequency'], name2feature['Time']])

    collected = [*self.raw_features, frequency_per_time]

    combined = CandidateFeature(IdentityTransformation(len(collected)),
                                collected)
    return [combined]
def run(self):
    """Print the raw feature names and evaluate all raw features together.

    The dataset's raw features are combined by an ``IdentityTransformation``
    and handed to ``self.evaluate_candidates``. Nothing is returned.
    """
    # generate all candidates
    self.generate()
    self.generate_target()

    raw_features = self.dataset.raw_features
    print([feature.name for feature in raw_features])

    plain_attributes = CandidateFeature(
        IdentityTransformation(len(self.dataset.raw_features)),
        self.dataset.raw_features)

    self.evaluate_candidates([plain_attributes])
def explorekit_heart_features(self, name2feature):
    """Return one combined candidate: raw features plus one grouped feature.

    The added feature is
    Discretize({Mean(age) GROUP BY Discretize(sex),
    Discretize(exercise_induced_angina)}), built from
    ``PandasDiscretizerTransformation(10)`` and
    ``GroupByThenTransformation(np.mean, 3)``.

    :param name2feature: mapping that is indexed with the keys 'sex',
        'exercise_induced_angina' and 'age'.
    :return: a single-element list holding the identity combination of the
        raw features and the grouped feature.
    """

    def discretized(parent):
        return CandidateFeature(PandasDiscretizerTransformation(10), [parent])

    sex_bins = discretized(name2feature['sex'])
    angina_bins = discretized(name2feature['exercise_induced_angina'])

    mean_age_per_group = CandidateFeature(
        GroupByThenTransformation(np.mean, 3),
        [name2feature['age'], sex_bins, angina_bins])

    collected = list(self.raw_features)
    collected.append(discretized(mean_age_per_group))

    return [
        CandidateFeature(IdentityTransformation(len(collected)), collected)
    ]
def run(self):
    """Evaluate the min-max scaled features stored in '/tmp/cover_features.p'.

    The pickled features are loaded, each one is wrapped in a
    ``MinMaxScalingTransformation``, and their identity combination is
    evaluated on a deep copy of the preprocessed folds. The first result and
    its runtime properties are printed.

    :return: the first evaluation result.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate(42)
    self.generate_target()

    myfolds = copy.deepcopy(list(self.preprocessed_folds))

    # NOTE(review): unpickling executes arbitrary code, and /tmp is writable
    # by other local users -- only run this where that file is trusted.
    # The context manager closes the handle that the former
    # pickle.load(open(...)) left open.
    with open('/tmp/cover_features.p', "rb") as feature_file:
        features = pickle.load(feature_file)

    # apply minmax scaling
    new_features: List[CandidateFeature] = [
        CandidateFeature(MinMaxScalingTransformation(), [f]) for f in features
    ]

    results = self.evaluate_candidates([
        CandidateFeature(IdentityTransformation(len(new_features)),
                         new_features)
    ], myfolds)

    print(results[0])
    print(results[0].runtime_properties)
    return results[0]
def run(self):
    """Fit a Sisso transformer on the raw features and grid-search the classifier.

    A ``SissoTransformer`` over all raw features (operators "^2", "^3", "1/")
    is fitted on the training split; both splits are transformed, the elapsed
    time is printed, and a ``GridSearchCV`` over
    ``self.grid_search_parameters`` is fitted on the transformed training
    data. The transformed test data and the test score are printed.
    Nothing is returned.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    self.generate_target()

    raw_count = len(self.raw_features)
    all_raw = CandidateFeature(IdentityTransformation(raw_count),
                               self.raw_features)

    feature_names = [str(raw) for raw in self.raw_features]
    operators = ["^2", "^3", "1/"]
    sisso = CandidateFeature(
        SissoTransformer(len(self.raw_features), feature_names, operators),
        [all_raw])

    sisso.pipeline.fit(self.dataset.splitted_values['train'],
                       self.train_y_all_target)
    train_matrix = sisso.transform(self.dataset.splitted_values['train'])
    test_matrix = sisso.transform(self.dataset.splitted_values['test'])

    elapsed = time.time() - self.global_starting_time
    print("time: " + str(elapsed))

    search = GridSearchCV(self.classifier(),
                          self.grid_search_parameters,
                          cv=self.preprocessed_folds,
                          scoring=self.score,
                          iid=False,
                          error_score='raise')
    search.fit(train_matrix, self.train_y_all_target)

    print(test_matrix)
    test_score = search.score(test_matrix, self.test_target)
    print('test score: ' + str(test_score))
    print("\n\n")
# Load the persisted feature names and training data of the chosen experiment.
# Each file is opened in a context manager so its handle is closed again;
# the former pickle.load(open(...)) calls left all three handles open.
experiment_dir = "/home/felix/phd/feature_constraints/" + str(which_experiment)

with open(experiment_dir + "/names.p", "rb") as names_file:
    my_names: List[CandidateFeature] = pickle.load(names_file)
print(my_names)

with open(experiment_dir + "/X_train.p", "rb") as x_train_file:
    X_train = pickle.load(x_train_file)

with open(experiment_dir + "/y_train.p", "rb") as y_train_file:
    y_train = pickle.load(y_train_file)

# Combine all numeric representations and min-max scale the combination.
all_features = CandidateFeature(IdentityTransformation(-1),
                                numeric_representations)
all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                    [all_features])

# Candidate sensitive attributes, taken from columns 7 and 15 of X_train.
foreigner = np.array(X_train[:, 7])
gender = np.array(
    ['female' in personal_status for personal_status in X_train[:, 15]])

my_runner = Runner(c=1.0, sensitive=gender, labels=['bad', 'good'])
#my_runner = Runner(c=1.0, sensitive=foreigner, labels=['bad', 'good'])

model = xgb.XGBClassifier(objective="binary:logistic",
                          n_estimators=1000,
                          random_state=42)
def run(self):
    """Construct and evaluate candidate features layer by layer.

    For every layer ``c`` in ``1..self.c_max`` the candidates are collected
    from three sources:

    0. for ``c == 1`` only: the numeric raw features (non-numeric raw features
       get score 0.0 and go straight into the raw-feature repository);
    1. unary transformations applied to everything accepted in layer ``c - 1``;
    2. binary transformations applied to pairs taken from the length-2
       partitions of ``c - 1``;
    3. identity combinations of pairs taken from the length-2 partitions of
       ``c``.

    All candidates of the layer are evaluated. A candidate is accepted into
    the repository matching its kind when its score exceeds the best parent
    score (0.0 for raw features) by more than ``self.epsilon``; otherwise it
    is recorded as dropped. When ``self.save_logs`` is set, the repositories
    are pickled into the configured tmp folder.

    Nothing is returned; evaluation data is written to each candidate's
    ``runtime_properties``.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    self.generate_target()

    unary_transformations, binary_transformations = self.transformation_producer()

    # Repositories of candidates, keyed by the layer c in which they were evaluated.
    cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
    cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_combination: Dict[int, List[CandidateFeature]] = {}
    cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

    complexity_delta = 1.0
    epsilon = self.epsilon
    limit_runs = self.c_max + 1
    unique_raw_combinations = False

    baseline_score = 0.0

    # Placeholder that every evaluated candidate with a score above -2 replaces.
    max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
    max_feature.runtime_properties['score'] = -2

    self.name_to_transfomed = {}

    # Repositories whose members may be transformed further / combined.
    transformable = (cost_2_raw_features, cost_2_unary_transformed,
                     cost_2_binary_transformed)
    combinable = transformable + (cost_2_combination,)

    def gather(cost, repositories) -> List[CandidateFeature]:
        # Concatenate the entries stored under `cost` in each repository.
        gathered: List[CandidateFeature] = []
        for repository in repositories:
            if cost in repository:
                gathered.extend(repository[cost])
        return gathered

    for c in range(1, limit_runs):
        current_layer: List[CandidateFeature] = []

        # 0th: raw features
        if c == 1:
            cost_2_raw_features[c] = []
            for raw_f in self.raw_features:
                if raw_f.is_numeric():
                    current_layer.append(raw_f)
                else:
                    raw_f.runtime_properties['score'] = 0.0
                    cost_2_raw_features[c].append(raw_f)

        # first: unary
        # we apply all unary transformations to all c-1 in the repo
        unary_candidates_to_be_applied = gather(c - 1, transformable)
        current_layer.extend(
            self.generate_features(unary_transformations,
                                   unary_candidates_to_be_applied))

        # second: binary
        # get length 2 partitions for current cost and apply the cross product
        binary_candidates_to_be_applied: List[CandidateFeature] = []
        for p in self.get_length_2_partition(c - 1):
            lists_for_each_element = [
                gather(p[element], transformable) for element in range(2)
            ]
            for bt in binary_transformations:
                list_of_combinations = self.generate_merge(
                    lists_for_each_element[0], lists_for_each_element[1],
                    bt.parent_feature_order_matters,
                    bt.parent_feature_repetition_is_allowed)
                for combo in list_of_combinations:
                    if bt.is_applicable(combo):
                        binary_candidates_to_be_applied.append(
                            CandidateFeature(copy.deepcopy(bt), combo))
        current_layer.extend(binary_candidates_to_be_applied)

        # third: feature combinations
        # first variant: treat combination as a transformation,
        # therefore we can use the same kind of partition as for binary data
        combinations_to_be_applied: List[CandidateFeature] = []
        for p in self.get_length_2_partition(c):
            lists_for_each_element = [
                gather(p[element], combinable) for element in range(2)
            ]
            list_of_combinations = self.generate_merge_for_combination(
                lists_for_each_element[0], lists_for_each_element[1])
            for combo in list_of_combinations:
                if IdentityTransformation(None).is_applicable(list(combo)):
                    combinations_to_be_applied.append(
                        CandidateFeature(IdentityTransformation(None), list(combo)))
        current_layer.extend(combinations_to_be_applied)

        if unique_raw_combinations:
            length = len(current_layer)
            current_layer = self.filter_non_unique_combinations(current_layer)
            print("From " + str(length) + " combinations, we filter " + str(length - len(current_layer)) + " nonunique raw feature combinations.")

        # now evaluate all from this layer
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = self.evaluate_candidates(current_layer)
        print("----------- Evaluation Finished -----------")

        layer_end_time = time.time() - self.global_starting_time

        # calculate whether we drop the evaluated candidate
        for result in results:
            candidate: CandidateFeature = result['candidate']
            for key in ('score', 'test_score', 'execution_time',
                        'global_time', 'hyperparameters'):
                candidate.runtime_properties[key] = result[key]
            candidate.runtime_properties['layer_end_time'] = layer_end_time

            if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                max_feature = candidate

            # calculate original score: best parent score, baseline for raw features
            original_score = baseline_score
            if not isinstance(candidate, RawFeature):
                original_score = max([p.runtime_properties['score'] for p in candidate.parents])

            accuracy_delta = result['score'] - original_score

            if accuracy_delta / complexity_delta > epsilon:
                if isinstance(candidate, RawFeature):
                    repository = cost_2_raw_features
                elif isinstance(candidate.transformation, UnaryTransformation):
                    repository = cost_2_unary_transformed
                elif isinstance(candidate.transformation, IdentityTransformation):
                    repository = cost_2_combination
                else:
                    repository = cost_2_binary_transformed
            else:
                repository = cost_2_dropped_evaluated_candidates
            repository.setdefault(c, []).append(candidate)

        if c in cost_2_dropped_evaluated_candidates:
            print("Of " + str(len(current_layer)) + " candidate representations, " + str(len(cost_2_dropped_evaluated_candidates[c])) + " did not satisfy the epsilon threshold.")
        else:
            print("Of " + str(len(current_layer)) + " candidate representations, all satisfied the epsilon threshold.")

        print("Best representation found for complexity = " + str(c) + ": " + str(max_feature) + "\n")

        # NOTE(review): the logs are rewritten after every layer; confirm
        # this is the intended placement.
        if self.save_logs:
            tmp_folder = Config.get_default("tmp.folder", "/tmp")
            log_files = (
                ("/data_raw.p", cost_2_raw_features),
                ("/data_unary.p", cost_2_unary_transformed),
                ("/data_binary.p", cost_2_binary_transformed),
                ("/data_combination.p", cost_2_combination),
                ("/data_dropped.p", cost_2_dropped_evaluated_candidates),
            )
            for file_name, repository in log_files:
                # Close every log file deterministically; the former
                # pickle.dump(..., open(...)) calls never closed them.
                with open(tmp_folder + file_name, "wb") as log_file:
                    pickle.dump(repository, log_file)
def run_pipeline(self, which_features_to_use, runs=1):
    """Grid-search a pipeline on the selected representations and collect metrics.

    The entries of ``self.numeric_representations`` whose flag in
    ``which_features_to_use`` is truthy are combined, min-max scaled and
    followed by ``self.model()``. The grid search result is stored in
    ``self.pipeline``.

    * ``runs > 1``: the search is repeated ``runs`` times with
      ``random_state=42 + r`` for the folds and ``n_jobs=4``; every run adds
      ``self.auc(...)`` on the test data to the test scores.
    * otherwise: a single search with ``refit='auc'`` and ``n_jobs=1``; the
      mean test value of every key in ``self.scoring`` at the best index is
      copied into the result.

    :param which_features_to_use: sequence of flags, one per entry of
        ``self.numeric_representations``.
    :param runs: number of repeated cross-validations.
    :return: dict with 'complexity' (sum of ``get_complexity()`` over the
        selected representations), 'test_auc', 'cv_time', 'global_time' and,
        for a single run, one entry per scoring key.
    """
    results = {}
    start_time = time.time()

    # generate pipeline
    results['complexity'] = 0
    all_selected_features = []
    for position, is_selected in enumerate(which_features_to_use):
        if not is_selected:
            continue
        all_selected_features.append(self.numeric_representations[position])
        results['complexity'] += self.numeric_representations[
            position].get_complexity()

    all_features = CandidateFeature(IdentityTransformation(-1),
                                    all_selected_features)
    all_standardized = CandidateFeature(MinMaxScalingTransformation(),
                                        [all_features])

    steps = [('f', all_standardized.pipeline), ('c', self.model())]
    my_pipeline = Pipeline(steps)

    cv_scores = []
    test_scores = []
    pred_test = None
    proba_pred_test = None

    if runs <= 1:
        kfolds = StratifiedKFold(10, shuffle=True, random_state=42)
        self.pipeline = GridSearchCV(my_pipeline,
                                     self.parameter_grid,
                                     cv=kfolds.split(self.X_train,
                                                     self.y_train),
                                     scoring=self.scoring,
                                     n_jobs=1,
                                     refit='auc')
        self.pipeline.fit(self.X_train, pd.DataFrame(self.y_train))

        pred_test = self.pipeline.predict(self.X_test)
        proba_pred_test = self.pipeline.predict_proba(self.X_test)

        auc_scorer = make_scorer(roc_auc_score,
                                 greater_is_better=True,
                                 needs_threshold=True)
        test_auc = auc_scorer(self.pipeline, self.X_test, self.y_test)

        best = self.pipeline.best_index_
        for metric in self.scoring.keys():
            results[metric] = self.pipeline.cv_results_[
                'mean_test_' + str(metric)][self.pipeline.best_index_]

        loss = self.pipeline.cv_results_['mean_test_auc'][best]
        test_scores.append(test_auc)
    else:
        for r in range(runs):
            kfolds = StratifiedKFold(10, shuffle=True, random_state=42 + r)
            self.pipeline = GridSearchCV(my_pipeline,
                                         self.parameter_grid,
                                         cv=kfolds.split(
                                             self.X_train, self.y_train),
                                         scoring=self.scoring,
                                         n_jobs=4)
            self.pipeline.fit(self.X_train, self.y_train)

            pred_test = self.pipeline.predict(self.X_test)
            proba_pred_test = self.pipeline.predict_proba(self.X_test)

            test_auc = self.auc(self.pipeline, self.X_test, self.y_test)

            cv_scores.append(self.pipeline.best_score_)
            test_scores.append(test_auc)

        # Kept from the earlier version; neither value is part of the result.
        std_loss = np.std(cv_scores)
        loss = np.average(cv_scores)

    results['test_auc'] = np.average(test_scores)

    now = time.time()
    results['cv_time'] = now - start_time
    results['global_time'] = time.time() - self.global_starting_time

    return results
def run(self): self.global_starting_time = time.time() # generate all candidates self.generate() #starting_feature_matrix = self.create_starting_features() self.generate_target() unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features) cost_2_raw_features: Dict[int, List[CandidateFeature]] = {} cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {} cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {} cost_2_combination: Dict[int, List[CandidateFeature]] = {} if self.save_logs: cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {} self.complexity_delta = 1.0 unique_raw_combinations = False baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score'] #print("baseline: " + str(baseline_score)) max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]]) max_feature.runtime_properties['score'] = -float("inf") max_feature_per_complexity: Dict[int, CandidateFeature] = {} all_evaluated_features = set() my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time) my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters) my_globale_module.score_global = copy.deepcopy(self.score) my_globale_module.classifier_global = copy.deepcopy(self.classifier) my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds) my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds) my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target) my_globale_module.test_target_global = copy.deepcopy(self.test_target) my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp) my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds) my_globale_module.epsilon_global = copy.deepcopy(self.epsilon) 
my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta) my_globale_module.remove_parents = copy.deepcopy(self.remove_parents) my_globale_module.materialized_set = set() my_globale_module.predictions_set = set() number_of_multiple_cvs = 10 nested_my_globale_module.splitting_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs) nested_my_globale_module.model_seeds = np.random.randint(low=0, high=10000, size=number_of_multiple_cvs) #pickle.dump(my_globale_module.target_test_folds_global, open('/tmp/test_groundtruth.p', 'wb+')) c = 1 while(True): current_layer: List[CandidateFeature] = [] if c <= self.max_feature_depth: #0th if c == 1: cost_2_raw_features[c]: List[CandidateFeature] = [] #print(self.raw_features) for raw_f in self.raw_features: sympy_representation = sympy.Symbol('X' + str(raw_f.column_id)) raw_f.sympy_representation = sympy_representation all_evaluated_features.add(sympy_representation) if raw_f.is_numeric(): if raw_f.properties['missing_values']: raw_f.runtime_properties['score'] = 0.0 cost_2_raw_features[c].append(raw_f) else: current_layer.append(raw_f) #print("numeric: " + str(raw_f)) else: raw_f.runtime_properties['score'] = 0.0 cost_2_raw_features[c].append(raw_f) #print("nonnumeric: " + str(raw_f)) self.materialize_raw_features(raw_f) #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0]) # first unary # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?) 
unary_candidates_to_be_applied: List[CandidateFeature] = [] if (c - 1) in cost_2_raw_features: unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1]) if (c - 1) in cost_2_unary_transformed: unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1]) if (c - 1) in cost_2_binary_transformed: unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1]) all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features) current_layer.extend(all_unary_features) #second binary #get length 2 partitions for current cost partition = self.get_length_2_partition(c-1) #print("bin: c: " + str(c) + " partition" + str(partition)) #apply cross product from partitions binary_candidates_to_be_applied: List[CandidateFeature] = [] for p in partition: lists_for_each_element: List[List[CandidateFeature]] = [[], []] for element in range(2): if p[element] in cost_2_raw_features: lists_for_each_element[element].extend(cost_2_raw_features[p[element]]) if p[element] in cost_2_unary_transformed: lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]]) if p[element] in cost_2_binary_transformed: lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]]) for bt in binary_transformations: list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed) #print(list_of_combinations) for combo in list_of_combinations: if bt.is_applicable(combo): sympy_representation = bt.get_sympy_representation( [p.get_sympy_representation() for p in combo]) try: if len(sympy_representation.free_symbols) > 0: # if expression is not constant if not sympy_representation in all_evaluated_features: bin_candidate = CandidateFeature(copy.deepcopy(bt), combo) bin_candidate.sympy_representation = copy.deepcopy(sympy_representation) all_evaluated_features.add(sympy_representation) 
binary_candidates_to_be_applied.append(bin_candidate) else: #print(str(bin_candidate) + " skipped: " + str(sympy_representation)) pass else: #print(str(bin_candidate) + " skipped: " + str(sympy_representation)) pass except: pass current_layer.extend(binary_candidates_to_be_applied) #third: feature combinations #first variant: treat combination as a transformation #therefore, we can use the same partition as for binary data partition = self.get_length_2_partition(c) #print("combo c: " + str(c) + " partition" + str(partition)) def filter_minus(features: List[CandidateFeature]): filtered_features: List[CandidateFeature] = [] if my_globale_module.classifier_global == LogisticRegression: for check_f in features: if not isinstance(check_f.transformation, MinusTransformation): filtered_features.append(check_f) return filtered_features ''' combinations_to_be_applied: List[CandidateFeature] = [] for p in partition: lists_for_each_element: List[List[CandidateFeature]] = [[], []] for element in range(2): if p[element] in cost_2_raw_features: lists_for_each_element[element].extend(cost_2_raw_features[p[element]]) if p[element] in cost_2_unary_transformed: lists_for_each_element[element].extend(filter_minus(cost_2_unary_transformed[p[element]])) if p[element] in cost_2_binary_transformed: lists_for_each_element[element].extend(filter_minus(cost_2_binary_transformed[p[element]])) if p[element] in cost_2_combination: lists_for_each_element[element].extend(cost_2_combination[p[element]]) combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1]) current_layer.extend(combinations_to_be_applied) ''' if unique_raw_combinations: length = len(current_layer) current_layer = self.filter_non_unique_combinations(current_layer) print("From " + str(length) + " combinations, we filter " + str(length - len(current_layer)) + " nonunique raw feature combinations.") #now evaluate all from this layer 
#print(current_layer) print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------") results = evaluate_candidates_parallel(current_layer, self.n_jobs) print("----------- Evaluation Finished -----------") ##nested cv ''' new_results_with_nested = [] for r_result in results: if type(r_result) != type(None): new_results_with_nested.append(r_result) #results = nested_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train']) results = multiple_cv_score_parallel(new_results_with_nested, self.reader.splitted_values['train'], self.reader.splitted_target['train']) for r_result in results: #print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str(r_result.runtime_properties['test_score']) + ' nested: ' + str(r_result.runtime_properties['nested_cv_score'])) print(str(r_result) + ' cv: ' + str(r_result.runtime_properties['score']) + ' test: ' + str( r_result.runtime_properties['test_score']) + ' nested: ' + str( r_result.runtime_properties['multiple_cv_score'])) ''' #print(results) layer_end_time = time.time() - self.global_starting_time #calculate whether we drop the evaluated candidate for candidate in results: ## check if we computed an equivalent feature before if type(candidate) != type(None) and not isinstance(candidate.transformation, IdentityTransformation): materialized_all = [] for fold_ii in range(len(my_globale_module.preprocessed_folds_global)): materialized_all.extend(candidate.runtime_properties['test_transformed'][fold_ii].flatten()) materialized = tuple(materialized_all) if materialized in my_globale_module.materialized_set: candidate = None else: my_globale_module.materialized_set.add(materialized) ''' ## check if predictions exist already if type(candidate) != type(None) and 'test_fold_predictions' in candidate.runtime_properties: materialized_all = [] for fold_ii in range(len(my_globale_module.preprocessed_folds_global)): 
materialized_all.extend(candidate.runtime_properties['test_fold_predictions'][fold_ii].flatten()) materialized = tuple(materialized_all) if materialized in my_globale_module.predictions_set: candidate = None else: my_globale_module.predictions_set.add(materialized) ''' if type(candidate) != type(None): candidate.runtime_properties['layer_end_time'] = layer_end_time #print(str(candidate) + " -> " + str(candidate.runtime_properties['score'])) if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']: max_feature = candidate if candidate.runtime_properties['passed']: if isinstance(candidate, RawFeature): if not c in cost_2_raw_features: cost_2_raw_features[c]: List[CandidateFeature] = [] cost_2_raw_features[c].append(candidate) elif isinstance(candidate.transformation, UnaryTransformation): if not c in cost_2_unary_transformed: cost_2_unary_transformed[c]: List[CandidateFeature] = [] cost_2_unary_transformed[c].append(candidate) elif isinstance(candidate.transformation, IdentityTransformation): if not c in cost_2_combination: cost_2_combination[c]: List[CandidateFeature] = [] cost_2_combination[c].append(candidate) else: if not c in cost_2_binary_transformed: cost_2_binary_transformed[c]: List[CandidateFeature] = [] cost_2_binary_transformed[c].append(candidate) else: if self.save_logs: if not c in cost_2_dropped_evaluated_candidates: cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = [] cost_2_dropped_evaluated_candidates[c].append(candidate) satisfied_count = 0 if c in cost_2_raw_features: satisfied_count += len(cost_2_raw_features[c]) if c in cost_2_unary_transformed: satisfied_count += len(cost_2_unary_transformed[c]) if c in cost_2_binary_transformed: satisfied_count += len(cost_2_binary_transformed[c]) if c in cost_2_combination: satisfied_count += len(cost_2_combination[c]) all_count = len(current_layer) if c == 1: all_count = len(cost_2_raw_features[c]) print("Of " + str(all_count) + " candidate representations, " + 
str(satisfied_count) + " did satisfy the epsilon threshold.") if len(current_layer) > 0: if 'test_score' in max_feature.runtime_properties: print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n") else: print("\nBest representation found for complexity = " + str(c) + ": " + str( max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format( max_feature.runtime_properties['score']) + "\n") #print("hyper: " + str(max_feature.runtime_properties['hyperparameters'])) #print(max_feature.runtime_properties['fold_scores']) # upload best feature to OpenML if self.upload2openml: candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven') if self.save_logs: try: pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped" + str(self.reader.rotate_test) + ".p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) except: pickle.dump(cost_2_raw_features, open( Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) 
pickle.dump(cost_2_unary_transformed, open( Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(cost_2_binary_transformed, open( Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(cost_2_combination, open( Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(cost_2_dropped_evaluated_candidates, open( Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL) max_feature_per_complexity[c] = max_feature if type(self.c_max) == type(None) and c > 2: # calculate harmonic mean harmonic_means = [0.0]*3 for h_i in range(len(harmonic_means)): simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c-h_i].get_complexity(), c, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed, cost_2_combination) accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c-h_i].runtime_properties['score'], c, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed, cost_2_combination) harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score) #print(str(max_feature_per_complexity[c-h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i)) if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]: print("Best Harmonic Mean: " + str(max_feature_per_complexity[c-2])) break if type(self.max_timestamp) != type(None) and time.time() >= self.max_timestamp: break c += 1 if type(self.c_max) != type(None) and self.c_max < c: break def extend_all(all_representations: List[CandidateFeature], new_llist): for mylist in new_llist: all_representations.extend(mylist) #get all representation all_representations: List[CandidateFeature] = [] extend_all(all_representations, cost_2_raw_features.values()) extend_all(all_representations, 
cost_2_unary_transformed.values()) extend_all(all_representations, cost_2_binary_transformed.values()) extend_all(all_representations, cost_2_combination.values()) self.all_representations = all_representations ''' #find top k based on cv score scores = [c.runtime_properties['score'] for c in all_representations] sorted_cv_score_ids = np.argsort(np.array(scores)*-1) checking_k = 50 top_k_representations = [all_representations[sorted_id] for sorted_id in sorted_cv_score_ids[0:checking_k]] #from top k - select best based on nested cv score top_k_representations = multiple_cv_score_parallel(top_k_representations, self.reader.splitted_values['train'], self.reader.splitted_target['train']) scores = [c.runtime_properties['multiple_cv_score'] for c in top_k_representations] max_nested_cv_score = -1 max_nested_rep = None for eval_candidate in top_k_representations: if eval_candidate.runtime_properties['multiple_cv_score'] > max_nested_cv_score: max_nested_cv_score = eval_candidate.runtime_properties['multiple_cv_score'] max_nested_rep = eval_candidate print(max_nested_rep) max_feature = max_nested_rep ''' ''' all_features = list(max_feature_per_complexity.values()) all_features = multiple_cv_score_parallel(all_features, self.reader.splitted_values['train'], self.reader.splitted_target['train']) best_multiple_cv_score = -np.inf best_multiple_cv_candidate = None for all_f in all_features: if all_f.runtime_properties['multiple_cv_score'] > best_multiple_cv_score: best_multiple_cv_score = all_f.runtime_properties['multiple_cv_score'] best_multiple_cv_candidate = all_f #find the most simple representation that is within the best representation's std complexities = [all_f.get_complexity() for all_f in all_features] ids_complex = np.argsort(complexities) for all_f_i in range(len(all_features)): print(str(all_features[ids_complex[all_f_i]]) + ' mcv: ' + str(all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score']) + ' mcv_std: ' + str( 
all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score_std'])) if all_features[ids_complex[all_f_i]].runtime_properties['multiple_cv_score'] > best_multiple_cv_candidate.runtime_properties['multiple_cv_score'] - best_multiple_cv_candidate.runtime_properties['multiple_cv_score_std']: max_feature = all_features[ids_complex[all_f_i]] break print(max_feature) ''' #min AICc selection min_aicc = np.inf min_aicc_feature = None all_aiccs = [] for rep in list(max_feature_per_complexity.values()): all_aiccs.append(np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity'])) def calculate_AIC_for_classification_paper(rss, n, k): AIC = 2 * k + float(n) * np.log(rss / float(n)) return AIC def calculate_AICc_for_classification_paper(rss, n, k): AIC = calculate_AIC_for_classification_paper(rss, n, k) AICc = AIC + ((2 * k * (k + 1)) / (n - k - 1)) return AICc def calc_global_aicc(rep): return calculate_AICc_for_classification_paper(np.sum(rep.runtime_properties['additional_metrics']['rss']), np.sum(rep.runtime_properties['additional_metrics']['n']), rep.get_complexity()) def is_better(old_aics, new_aics): print(np.sum(np.array(new_aics) < np.array(old_aics))) return np.sum(np.array(new_aics) < np.array(old_aics)) > len(new_aics) / 2.0 for rep in list(max_feature_per_complexity.values()): curr = np.mean(rep.runtime_properties['additional_metrics']['AICc_complexity']) #print(str(rep) + ': ' + str(curr) + ' AICc min: ' + str(np.min(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' AICc std: ' + str(np.std(rep.runtime_properties['additional_metrics']['AICc_complexity'])) + ' P: ' + str(np.exp((min(all_aiccs) - curr)/2)) + ' CV AUC: ' + str(rep.runtime_properties['score'])) print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['AICc_complexity'])) print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['rss'])) print(str(rep) + ':' + str(rep.runtime_properties['additional_metrics']['n'])) 
print(str(rep) + 'global_aicc: ' + str(calc_global_aicc(rep))) #if type(min_aicc_feature) == type(None) or is_better(min_aicc_feature.runtime_properties['additional_metrics']['AICc_complexity'], rep.runtime_properties['additional_metrics']['AICc_complexity']): if type(min_aicc_feature) == type(None) or calc_global_aicc(rep) < calc_global_aicc(min_aicc_feature): #min_aicc = np.min(rep.runtime_properties['additional_metrics']['AICc_complexity']) min_aicc_feature = rep max_feature = min_aicc_feature print(max_feature) return max_feature
def run(self):
    """Search representations layer by layer in order of increasing complexity.

    Starting at complexity ``c = 1`` (the raw features), every iteration of
    the main loop builds the candidate layer for the current complexity from

    * unary transformations of everything kept at complexity ``c - 1``,
    * binary transformations over the length-2 partitions of ``c - 1``,
    * combinations over the length-2 partitions of ``c``,

    evaluates the layer with ``evaluate_candidates`` and files every
    evaluated candidate whose ``runtime_properties['passed']`` is truthy
    into the per-complexity table matching its kind (raw / unary / binary /
    combination).  Candidates that did not pass are only remembered when
    ``self.save_logs`` is set.

    The loop ends when one of the following holds:

    * ``self.c_max`` is ``None``, ``c > 2`` and the harmonic mean computed
      for the best representation of complexity ``c - 2`` is at least as
      large as the ones for ``c - 1`` and ``c``;
    * ``self.max_timestamp`` is set and has been reached;
    * ``self.c_max`` is set and the next complexity would exceed it.

    Side effects: sets ``self.global_starting_time`` and
    ``self.complexity_delta``, copies the evaluation configuration into
    ``my_globale_module``, prints progress, optionally uploads the best
    representation to OpenML (``self.upload2openml``) and pickles the
    per-complexity tables into the configured tmp folder
    (``self.save_logs``).

    Returns:
        None.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    self.generate_target()

    unary_transformations, binary_transformations = self.transformation_producer(
        self.train_X_all, self.raw_features)

    # complexity -> candidates of that complexity that were kept, by kind
    cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
    cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_combination: Dict[int, List[CandidateFeature]] = {}
    # only filled and written when self.save_logs is set
    cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

    self.complexity_delta = 1.0

    unique_raw_combinations = False

    # Placeholder with score -inf so that the first evaluated candidate
    # always replaces it.
    max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
    max_feature.runtime_properties['score'] = -float("inf")

    max_feature_per_complexity: Dict[int, CandidateFeature] = {}

    # sympy expressions of everything generated so far (duplicate filter)
    all_evaluated_features = set()

    # evaluate_candidates reads its configuration from this module
    my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
    my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
    my_globale_module.score_global = copy.deepcopy(self.score)
    my_globale_module.classifier_global = copy.deepcopy(self.classifier)
    my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
    my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
    my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
    my_globale_module.test_target_global = copy.deepcopy(self.test_target)
    my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
    my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
    my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
    my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
    my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)

    def kept_at(cost, *tables):
        """Concatenate, in table order, the candidates the tables hold at ``cost``."""
        found: List[CandidateFeature] = []
        for table in tables:
            found.extend(table.get(cost, []))
        return found

    def dump_table(table, file_name):
        """Pickle one table into the tmp folder and close the file again."""
        path = Config.get_default("tmp.folder", "/tmp") + file_name
        with open(path, "wb") as handle:
            pickle.dump(table, handle, protocol=pickle.HIGHEST_PROTOCOL)

    c = 1
    while True:
        current_layer: List[CandidateFeature] = []

        # 0th: raw features form the first layer
        if c == 1:
            cost_2_raw_features[c] = []
            for raw_f in self.raw_features:
                sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                raw_f.sympy_representation = sympy_representation
                all_evaluated_features.add(sympy_representation)
                if raw_f.is_numeric() and not raw_f.properties['missing_values']:
                    current_layer.append(raw_f)
                else:
                    # Non-numeric features and numeric ones with missing
                    # values are kept without evaluation, with score 0.
                    raw_f.runtime_properties['score'] = 0.0
                    cost_2_raw_features[c].append(raw_f)

                # NOTE(review): nesting recovered from flattened source;
                # assumed to run for every raw feature - confirm.
                self.materialize_raw_features(raw_f)

        # first: unary transformations of everything kept at c - 1
        # (combinations are not used as unary input)
        unary_candidates_to_be_applied = kept_at(
            c - 1, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed)
        current_layer.extend(self.generate_features(
            unary_transformations, unary_candidates_to_be_applied, all_evaluated_features))

        # second: binary transformations over the length-2 partitions of c - 1
        binary_candidates_to_be_applied: List[CandidateFeature] = []
        for p in self.get_length_2_partition(c - 1):
            lists_for_each_element = [
                kept_at(p[element], cost_2_raw_features, cost_2_unary_transformed,
                        cost_2_binary_transformed)
                for element in range(2)
            ]

            for bt in binary_transformations:
                list_of_combinations = self.generate_merge(
                    lists_for_each_element[0], lists_for_each_element[1],
                    bt.parent_feature_order_matters,
                    bt.parent_feature_repetition_is_allowed)
                for combo in list_of_combinations:
                    if not bt.is_applicable(combo):
                        continue
                    sympy_representation = bt.get_sympy_representation(
                        [parent.get_sympy_representation() for parent in combo])
                    try:
                        # skip constant expressions and known duplicates
                        if (len(sympy_representation.free_symbols) > 0
                                and sympy_representation not in all_evaluated_features):
                            bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                            bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                            all_evaluated_features.add(sympy_representation)
                            binary_candidates_to_be_applied.append(bin_candidate)
                    except Exception:
                        # Best effort: a candidate that cannot be built is
                        # skipped.  Unlike a bare ``except`` this lets
                        # KeyboardInterrupt / SystemExit propagate.
                        pass
        current_layer.extend(binary_candidates_to_be_applied)

        # third: feature combinations, treated like a transformation, over
        # the length-2 partitions of c itself
        for p in self.get_length_2_partition(c):
            lists_for_each_element = [
                kept_at(p[element], cost_2_raw_features, cost_2_unary_transformed,
                        cost_2_binary_transformed, cost_2_combination)
                for element in range(2)
            ]
            # NOTE(review): nesting recovered from flattened source; assumed
            # that every partition contributes its combinations - confirm.
            current_layer.extend(self.generate_merge_for_combination(
                all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1]))

        if unique_raw_combinations:
            length = len(current_layer)
            current_layer = self.filter_non_unique_combinations(current_layer)
            print("From " + str(length) + " combinations, we filter " + str(length - len(current_layer)) + " nonunique raw feature combinations.")

        # now evaluate all from this layer
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = evaluate_candidates(current_layer)
        print("----------- Evaluation Finished -----------")

        layer_end_time = time.time() - self.global_starting_time

        # decide for every evaluated candidate whether it is kept
        for candidate in results:
            if candidate is None:
                continue
            candidate.runtime_properties['layer_end_time'] = layer_end_time

            if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                max_feature = candidate

            if candidate.runtime_properties['passed']:
                # RawFeature is tested first, before .transformation is read
                if isinstance(candidate, RawFeature):
                    table = cost_2_raw_features
                elif isinstance(candidate.transformation, UnaryTransformation):
                    table = cost_2_unary_transformed
                elif isinstance(candidate.transformation, IdentityTransformation):
                    table = cost_2_combination
                else:
                    table = cost_2_binary_transformed
                table.setdefault(c, []).append(candidate)
            elif self.save_logs:
                cost_2_dropped_evaluated_candidates.setdefault(c, []).append(candidate)

        satisfied_count = len(kept_at(
            c, cost_2_raw_features, cost_2_unary_transformed,
            cost_2_binary_transformed, cost_2_combination))

        all_count = len(current_layer)
        if c == 1:
            all_count = len(cost_2_raw_features[c])

        print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " did satisfy the epsilon threshold.")

        if len(current_layer) > 0:
            if Config.get_default('score.test', 'False') == 'True':
                print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
            else:
                print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + "\n")

            # upload best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')

        if self.save_logs:
            dump_table(cost_2_raw_features, "/data_raw.p")
            dump_table(cost_2_unary_transformed, "/data_unary.p")
            dump_table(cost_2_binary_transformed, "/data_binary.p")
            dump_table(cost_2_combination, "/data_combination.p")
            dump_table(cost_2_dropped_evaluated_candidates, "/data_dropped.p")

        max_feature_per_complexity[c] = max_feature

        if self.c_max is None and c > 2:
            # harmonic mean of simplicity and accuracy for the best
            # representations of complexity c, c - 1 and c - 2
            harmonic_means = [0.0] * 3
            for h_i in range(len(harmonic_means)):
                best_h = max_feature_per_complexity[c - h_i]
                simplicity_cum_score = self.getSimplicityScore(
                    best_h.get_complexity(), c,
                    cost_2_raw_features, cost_2_unary_transformed,
                    cost_2_binary_transformed, cost_2_combination)
                accuracy_cum_score = self.getAccuracyScore(
                    best_h.runtime_properties['score'], c,
                    cost_2_raw_features, cost_2_unary_transformed,
                    cost_2_binary_transformed, cost_2_combination)
                harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)

            # stop once the two newest levels did not beat level c - 2
            if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                print("Best Harmonic Mean: " + str(max_feature_per_complexity[c-2]))
                break

        if self.max_timestamp is not None and time.time() >= self.max_timestamp:
            break

        c += 1

        if self.c_max is not None and self.c_max < c:
            break
def run(self):
    """Explore the representation space with a randomised local search.

    The numeric raw features are evaluated first; those whose
    ``runtime_properties['passed']`` is truthy (plus all non-numeric raw
    features, which are kept unevaluated with score 0) form the pool of
    complexity-1 features.  One of them is picked at random as the starting
    representation.

    Then, for at most 200 steps:

    1. unary transformations of the current representation and binary
       transformations of it with every kept raw feature are generated;
    2. the generated candidates are evaluated in random order until one
       evaluation returns a result;
    3. that result becomes the next representation, unless the number of
       non-improving steps has reached ``self.lambda_threshold``, in which
       case the search jumps back to the best representation seen so far.

    The search stops early when no generated candidate could be evaluated.

    Side effects: sets ``self.global_starting_time`` and
    ``self.complexity_delta``, copies the evaluation configuration into
    ``my_globale_module`` and prints progress.

    Returns:
        None.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    self.generate_target()

    unary_transformations, binary_transformations = self.transformation_producer(
        self.train_X_all, self.raw_features)

    # complexity -> candidates of that complexity that were kept, by kind
    cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
    cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_combination: Dict[int, List[CandidateFeature]] = {}
    # only filled when self.save_logs is set
    cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

    self.complexity_delta = 1.0

    # Placeholder with score -inf so that the first evaluated candidate
    # always replaces it.
    max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
    max_feature.runtime_properties['score'] = -float("inf")

    # sympy expressions of everything generated/evaluated so far
    all_evaluated_features = set()

    # evaluate_candidates reads its configuration from this module
    my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
    my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
    my_globale_module.score_global = copy.deepcopy(self.score)
    my_globale_module.classifier_global = copy.deepcopy(self.classifier)
    my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
    my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
    my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
    my_globale_module.test_target_global = copy.deepcopy(self.test_target)
    my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
    my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
    my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
    my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)

    ############################
    # start: evaluate the raw features
    ############################
    current_layer: List[CandidateFeature] = []
    c = 1

    cost_2_raw_features[c] = []
    for raw_f in self.raw_features:
        sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
        raw_f.sympy_representation = sympy_representation
        all_evaluated_features.add(sympy_representation)
        if raw_f.is_numeric():
            current_layer.append(raw_f)
        else:
            # non-numeric features are kept without evaluation, with score 0
            raw_f.runtime_properties['score'] = 0.0
            cost_2_raw_features[c].append(raw_f)

        # NOTE(review): nesting recovered from flattened source; assumed to
        # run for every raw feature - confirm.
        self.materialize_raw_features(raw_f)
        raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

    # now evaluate all from this layer
    print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
    results = evaluate_candidates(current_layer)
    print("----------- Evaluation Finished -----------")

    layer_end_time = time.time() - self.global_starting_time

    # decide for every evaluated candidate whether it is kept
    for candidate in results:
        if candidate is None:
            continue
        candidate.runtime_properties['layer_end_time'] = layer_end_time

        if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
            max_feature = candidate

        if candidate.runtime_properties['passed']:
            # RawFeature is tested first, before .transformation is read
            if isinstance(candidate, RawFeature):
                table = cost_2_raw_features
            elif isinstance(candidate.transformation, UnaryTransformation):
                table = cost_2_unary_transformed
            elif isinstance(candidate.transformation, IdentityTransformation):
                table = cost_2_combination
            else:
                table = cost_2_binary_transformed
            table.setdefault(c, []).append(candidate)
        elif self.save_logs:
            cost_2_dropped_evaluated_candidates.setdefault(c, []).append(candidate)

    print(cost_2_raw_features[c])

    # select the starting representation at random
    next_id = np.random.randint(len(cost_2_raw_features[1]))
    next_rep = cost_2_raw_features[c][next_id]

    max_rep = next_rep        # best representation seen so far
    current_lambda = 0        # non-improving steps since the last reset
    number_runs = 200
    rep_succesion = []        # representations visited, in order

    for _ in range(number_runs):
        rep_succesion.append(next_rep)

        #######################
        # create branch
        #######################
        current_layer = []

        # combinations (IdentityTransformation) are never expanded
        if not isinstance(next_rep.transformation, IdentityTransformation):
            # first: unary
            current_layer.extend(self.generate_features(
                unary_transformations, [next_rep], all_evaluated_features))

            # second: binary, pairing the current representation with every
            # kept raw feature
            binary_candidates_to_be_applied = []
            for bt in binary_transformations:
                list_of_combinations = self.generate_merge(
                    [next_rep], cost_2_raw_features[1],
                    bt.parent_feature_order_matters,
                    bt.parent_feature_repetition_is_allowed)
                for combo in list_of_combinations:
                    if not bt.is_applicable(combo):
                        continue
                    sympy_representation = bt.get_sympy_representation(
                        [parent.get_sympy_representation() for parent in combo])
                    try:
                        # skip constant expressions and known duplicates
                        if (len(sympy_representation.free_symbols) > 0
                                and sympy_representation not in all_evaluated_features):
                            bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                            bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                            binary_candidates_to_be_applied.append(bin_candidate)
                    except Exception:
                        # Best effort: a candidate that cannot be built is
                        # skipped.  Unlike a bare ``except`` this lets
                        # KeyboardInterrupt / SystemExit propagate.
                        pass
            current_layer.extend(binary_candidates_to_be_applied)

        # select next representation: evaluate candidates in random order
        # until one evaluation yields a result
        shuffled_indices = np.arange(len(current_layer))
        np.random.shuffle(shuffled_indices)

        # Reset explicitly so that an empty layer cannot leave new_rep
        # unbound or silently reuse the result of the previous step.
        new_rep = None
        for shuffled_id in shuffled_indices:
            trial = current_layer[shuffled_id]
            # Remember the candidate being evaluated (not its parent, which
            # is already known) so it is not generated again later.
            all_evaluated_features.add(trial.get_sympy_representation())
            new_rep = evaluate_candidates([trial])[0]
            if new_rep is not None:
                break

        # nothing could be evaluated: stop before touching new_rep
        if new_rep is None:
            break

        print(str(new_rep) + " cv score: " + str(new_rep.runtime_properties['score']) + " test: " + str(new_rep.runtime_properties['test_score']))

        signed_score = new_rep.runtime_properties['score'] * self.score._sign

        if signed_score > max_rep.runtime_properties['score']:
            max_rep = new_rep
            print("max representation: " + str(max_rep))

        # compare against the representation the current streak of
        # non-improving steps started from
        if signed_score <= rep_succesion[-1 * (current_lambda + 1)].runtime_properties['score']:
            current_lambda += 1

        if current_lambda >= self.lambda_threshold:
            # too many non-improving steps: restart from the best one
            next_rep = max_rep
            current_lambda = 0
        else:
            next_rep = new_rep
def run(self):
    """Grow ``self.base_features`` by at most one candidate feature per round.

    Every round scores the current base feature set with
    ``self.evaluate_candidates``, ranks the candidate pool with
    ``self.evaluate_ranking`` and then evaluates candidates on top of the
    base set in descending ranking order. The candidate with the largest
    score improvement over the base set is recorded in ``results``, and the
    dict is pickled to ``<tmp.folder>/explorekit_results.p`` each time a new
    best candidate is found.

    Returns:
        ``self.base_features`` as soon as a round ends without a candidate
        improving the score by more than ``threshold_w``; otherwise, after
        ``max_iterations`` rounds, the dict mapping round index to the best
        feature set evaluated in that round. Both return shapes are kept
        because callers may rely on either.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()

    for raw_f in self.raw_features:
        raw_f.properties['type'] = 'float'

    self.generate_target()

    myfolds = copy.deepcopy(list(self.preprocessed_folds))

    R_w = 15000           # max. number of candidates evaluated per round
    max_iterations = 15   # max. number of rounds
    threshold_f = 0.001   # a round stops once ranking scores drop below this
    epsilon_w = 0.01      # an improvement this large ends the round early
    threshold_w = 0.0     # min. improvement needed to accept the round's best

    all_features = self.produce_features()
    print(len(all_features))

    self.base_features = CandidateFeature(
        IdentityTransformation(len(self.raw_features)), self.raw_features)

    results = {}

    for i in range(max_iterations):
        print("base features: " + str(self.base_features))
        results[i] = self.evaluate_candidates([self.base_features], myfolds)[0]
        print(results[i])
        print(results[i].runtime_properties)

        # Freeze the reference score for this round. results[i] is replaced
        # by the best candidate further down, so reading the score from it
        # inside the loop would compare later candidates against an earlier
        # candidate instead of against the base feature set.
        base_score = results[i].runtime_properties['score']

        feature_scores = self.evaluate_ranking(all_features)
        ids = np.argsort(np.array(feature_scores) * -1)
        print(feature_scores)

        # -np.inf instead of np.NINF: the alias no longer exists in NumPy 2.
        best_improvement_so_far = -np.inf
        best_Feature_So_Far = None
        evaluated_candidate_features = 0

        for candidate_id in ids:
            ranking_score = feature_scores[candidate_id]
            if ranking_score < threshold_f:
                # ids is sorted by descending ranking score, so every
                # remaining candidate is below the threshold as well.
                break

            candidate = all_features[candidate_id]
            current_feature_set = CandidateFeature(
                IdentityTransformation(2), [self.base_features, candidate])
            print(current_feature_set)

            result = self.evaluate_candidates([current_feature_set], myfolds)[0]
            evaluated_candidate_features += 1
            improvement = result.runtime_properties['score'] - base_score

            print("Candidate: " + str(candidate) + " score: "
                  + str(result.runtime_properties['score']) + " info: "
                  + str(ranking_score))
            print("improvement: " + str(improvement))

            if improvement > best_improvement_so_far:
                best_improvement_so_far = improvement
                best_Feature_So_Far = result

                results[i] = result
                result.runtime_properties['score_improvement'] = improvement
                result.runtime_properties['info_gain'] = ranking_score
                result.runtime_properties['global time'] = (
                    time.time() - self.global_starting_time)

                # Checkpoint the progress; the context manager makes sure
                # the file is flushed and closed after every dump.
                with open(Config.get("tmp.folder") + "/explorekit_results.p",
                          "wb") as handle:
                    pickle.dump(results, handle)

            if improvement >= epsilon_w:
                break
            if evaluated_candidate_features >= R_w:
                break

        if best_improvement_so_far > threshold_w:
            self.base_features = best_Feature_So_Far
        else:
            return self.base_features

        # Drop candidates with a negative ranking score from later rounds.
        all_features = [
            feature
            for feature, score in zip(all_features, feature_scores)
            if score >= 0
        ]

    return results
# Script fragment (whitespace-collapsed onto one line): flags every entry of
# `numeric_representations` whose string form equals 'V<g>' for some g in
# `ground_truth`, wraps all representations in a min-max-scaled
# CandidateFeature, and sets up an AUC scorer and a logistic-regression
# parameter grid for a pipeline.
# NOTE(review): the trailing `Pipeline([` call is never closed in this view --
# confirm the remainder of the statement before relying on this fragment.
ground_truth = [28, 48, 64, 105, 128, 153, 241, 281, 318, 336, 338, 378, 433, 442, 451, 453, 455, 472, 475, 493] print(len(ground_truth)) mask = np.zeros(len(numeric_representations), dtype=bool) for i in range(len(numeric_representations)): for g in ground_truth: if str(numeric_representations[i]) == 'V' + str(g): mask[i] = True break print(np.sum(mask)) all_features = CandidateFeature(IdentityTransformation(-1), numeric_representations) all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features]) #foreigner = np.array(X_train[:,7]) #gender = np.array(['female' in personal_status for personal_status in X_train[:,15]]) scoring = {'auc': make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)} #for count_i in range(10): parameter_grid = {'model__penalty': ['l2'], 'model__C': [1], 'model__solver': ['lbfgs'], 'model__class_weight': ['balanced'], 'model__max_iter': [10000], 'model__multi_class': ['auto']} my_pipeline = Pipeline([('features', all_standardized.pipeline), #('selection', L1Selection()), #('selection', SelectKBest(score_func=mutual_info_classif,k=10)),
def run_pipeline(which_features_to_use, c=None, runs=1):
    """Cross-validate a logistic regression on a subset of the stored features.

    Loads the pickled feature representations and the train/test split of the
    experiment named by the module-level ``which_experiment``, keeps the
    representations whose flag in ``which_features_to_use`` is truthy,
    min-max scales them and grid-searches ``LogisticRegression`` over ``C``
    with 10-fold stratified cross-validation scored by ROC AUC.

    Args:
        which_features_to_use: Sequence of flags; a truthy flag at position
            ``i`` selects stored representation ``i``.
        c: Single value for ``C``. ``None`` searches the default grid
            ``[0.001, 0.01, 0.1, 1, 10, 100, 1000]``.
        runs: Number of repetitions. Values above 1 repeat the search with
            fold seeds ``42 + r``; any other value runs one search with
            seed 42.

    Returns:
        Tuple ``(loss, test_score, pred_test, std_loss, proba_pred_test)``.
        Despite its name, ``loss`` is a cross-validated AUC (higher is
        better). For ``runs > 1`` it is the mean of the per-run best scores
        and ``std_loss`` their standard deviation; otherwise both are the
        mean and standard deviation over the folds of the best setting.
        ``test_score`` is the average test AUC over all runs;
        ``pred_test`` and ``proba_pred_test`` come from the last fitted
        search.
    """
    if c is None:
        c = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    else:
        c = [c]

    parameter_grid = {'c__penalty': ['l2'],
                      'c__C': c,
                      'c__solver': ['lbfgs'],
                      'c__class_weight': ['balanced'],
                      'c__max_iter': [10000],
                      'c__multi_class': ['auto']}

    auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

    experiment_dir = "/home/felix/phd/feature_constraints/" + str(which_experiment)

    def load_artifact(file_name):
        # NOTE: unpickling runs arbitrary code stored in the file, so this
        # must only ever read trusted experiment folders.
        with open(experiment_dir + "/" + file_name, "rb") as handle:
            return pickle.load(handle)

    numeric_representations = load_artifact("features.p")
    X_train = load_artifact("X_train.p")
    X_test = load_artifact("X_test.p")
    y_train = load_artifact("y_train.p")
    y_test = load_artifact("y_test.p")

    # generate pipeline
    all_selected_features = [
        numeric_representations[position]
        for position, use_feature in enumerate(which_features_to_use)
        if use_feature
    ]

    all_features = CandidateFeature(IdentityTransformation(-1), all_selected_features)
    all_standardized = CandidateFeature(MinMaxScalingTransformation(), [all_features])

    my_pipeline = Pipeline([('f', all_standardized.pipeline),
                            ('c', LogisticRegression())])

    def fit_search(seed):
        # One grid search on freshly shuffled stratified folds.
        kfolds = StratifiedKFold(10, shuffle=True, random_state=seed)
        search = GridSearchCV(my_pipeline, parameter_grid,
                              cv=kfolds.split(X_train, y_train),
                              scoring=auc, n_jobs=4)
        search.fit(X_train, y_train)
        return search

    test_scores = []

    if runs > 1:
        cv_scores = []
        for r in range(runs):
            pipeline = fit_search(42 + r)
            cv_scores.append(pipeline.best_score_)
            test_scores.append(auc(pipeline, X_test, y_test))
        std_loss = np.std(cv_scores)
        loss = np.average(cv_scores)
    else:
        pipeline = fit_search(42)
        test_scores.append(auc(pipeline, X_test, y_test))
        std_loss = pipeline.cv_results_['std_test_score'][pipeline.best_index_]
        loss = pipeline.cv_results_['mean_test_score'][pipeline.best_index_]

    # Only the predictions of the last fitted search are returned, so they
    # are computed once here instead of once per repetition.
    pred_test = pipeline.predict(X_test)
    proba_pred_test = pipeline.predict_proba(X_test)

    print(pipeline.classes_)

    return loss, np.average(test_scores), pred_test, std_loss, proba_pred_test