# NOTE(review): whitespace-mangled excerpt. Triple-quoted strings are used to
# toggle dataset-loading blocks on and off; under sequential lexing of the
# quote markers as they appear here, the breastTumor block falls INSIDE a
# string literal while the others are live code. The exact intent depends on
# the original line breaks — confirm against version history. All quote
# markers and runtime strings are preserved verbatim.
''' '''
# promoters dataset: binary 'class' target; 'instance' is an id column and is dropped.
data = pd.read_csv(Config.get('data_path') + '/promoters/dataset_106_molecular-biology_promoters.csv', delimiter=',', header=0)
my_global_utils1.y_train = data['class'].values
my_global_utils1.X_train = data[data.columns.difference(['class', 'instance'])].values
my_global_utils1.data_name = 'promoters'
my_global_utils1.one_hot = True
''' '''
# leukemia dataset: 'CLASS' target.
data = pd.read_csv(Config.get('data_path') + '/leukemia/leukemia.csv', delimiter=',', header=0)
my_global_utils1.y_train = data['CLASS'].values
my_global_utils1.X_train = data[data.columns.difference(['CLASS'])].values
my_global_utils1.data_name = 'leukemia'
'''
data = pd.read_csv(Config.get('data_path') + '/breastTumor/breastTumor.csv', delimiter=',', header=0)
my_global_utils1.y_train = data['binaryClass'].values
my_global_utils1.X_train = data[data.columns.difference(['binaryClass' ])].values
my_global_utils1.data_name = 'breastTumor'
my_global_utils1.one_hot = True
'''
# coil2000 dataset: 'CARAVAN' target.
data = pd.read_csv(Config.get('data_path') + '/coil2000/coil2000.csv', delimiter=',', header=0)
my_global_utils1.y_train = data['CARAVAN'].values
my_global_utils1.X_train = data[data.columns.difference(['CARAVAN'])].values
my_global_utils1.data_name = 'coil2000'
''' '''
def run(self):
    """Complexity-driven feature-construction main loop.

    Enumerates candidate representations layer by layer, one layer per
    complexity cost ``c``: raw features (c == 1), unary transformations of
    cost c-1 candidates, binary transformations over length-2 partitions of
    c-1, and identity combinations over length-2 partitions of c. Each layer
    is evaluated via the module-level ``evaluate_candidates``; candidates
    that "passed" are filed into per-cost repositories and the best-scoring
    candidate overall is tracked in ``max_feature``.

    Stops when: (a) no explicit ``c_max`` is set and the harmonic mean of
    simplicity/accuracy peaks at complexity c-2, (b) the wall-clock deadline
    ``self.max_timestamp`` is reached, or (c) ``c`` exceeds ``self.c_max``.

    NOTE(review): formatting reconstructed from a whitespace-mangled excerpt;
    tokens preserved verbatim.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    #starting_feature_matrix = self.create_starting_features()
    self.generate_target()

    unary_transformations, binary_transformations = self.transformation_producer(self.train_X_all, self.raw_features)

    # Repositories of accepted candidates, keyed by complexity cost c.
    cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
    cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_combination: Dict[int, List[CandidateFeature]] = {}

    if self.save_logs:
        cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

    self.complexity_delta = 1.0

    unique_raw_combinations = False

    baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
    #print("baseline: " + str(baseline_score))

    # Running best candidate across all layers; -inf score so the first
    # evaluated candidate always becomes the incumbent.
    max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
    max_feature.runtime_properties['score'] = -float("inf")

    max_feature_per_complexity: Dict[int, CandidateFeature] = {}

    # Sympy representations of everything generated so far (duplicate filter).
    all_evaluated_features = set()

    # Publish evaluation state to the worker module as deep copies —
    # presumably so worker processes cannot mutate shared state (confirm).
    my_globale_module.global_starting_time_global = copy.deepcopy(self.global_starting_time)
    my_globale_module.grid_search_parameters_global = copy.deepcopy(self.grid_search_parameters)
    my_globale_module.score_global = copy.deepcopy(self.score)
    my_globale_module.classifier_global = copy.deepcopy(self.classifier)
    my_globale_module.target_train_folds_global = copy.deepcopy(self.target_train_folds)
    my_globale_module.target_test_folds_global = copy.deepcopy(self.target_test_folds)
    my_globale_module.train_y_all_target_global = copy.deepcopy(self.train_y_all_target)
    my_globale_module.test_target_global = copy.deepcopy(self.test_target)
    my_globale_module.max_timestamp_global = copy.deepcopy(self.max_timestamp)
    my_globale_module.preprocessed_folds_global = copy.deepcopy(self.preprocessed_folds)
    my_globale_module.epsilon_global = copy.deepcopy(self.epsilon)
    my_globale_module.complexity_delta_global = copy.deepcopy(self.complexity_delta)
    my_globale_module.remove_parents = copy.deepcopy(self.remove_parents)

    c = 1
    while(True):
        current_layer: List[CandidateFeature] = []

        #0th
        # Cost 1: seed with the raw features themselves.
        if c == 1:
            cost_2_raw_features[c]: List[CandidateFeature] = []
            #print(self.raw_features)
            for raw_f in self.raw_features:
                sympy_representation = sympy.Symbol('X' + str(raw_f.column_id))
                raw_f.sympy_representation = sympy_representation
                all_evaluated_features.add(sympy_representation)
                if raw_f.is_numeric():
                    if raw_f.properties['missing_values']:
                        # Numeric but with missing values: stored unevaluated
                        # with score 0, like non-numeric features.
                        raw_f.runtime_properties['score'] = 0.0
                        cost_2_raw_features[c].append(raw_f)
                    else:
                        current_layer.append(raw_f)
                    #print("numeric: " + str(raw_f))
                else:
                    raw_f.runtime_properties['score'] = 0.0
                    cost_2_raw_features[c].append(raw_f)
                    #print("nonnumeric: " + str(raw_f))

                self.materialize_raw_features(raw_f)
                #raw_f.derive_properties(raw_f.runtime_properties['train_transformed'][0])

        # first unary
        # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?)
        unary_candidates_to_be_applied: List[CandidateFeature] = []
        if (c - 1) in cost_2_raw_features:
            unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
        if (c - 1) in cost_2_unary_transformed:
            unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
        if (c - 1) in cost_2_binary_transformed:
            unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])

        all_unary_features = self.generate_features(unary_transformations, unary_candidates_to_be_applied, all_evaluated_features)
        current_layer.extend(all_unary_features)

        #second binary
        #get length 2 partitions for current cost
        partition = self.get_length_2_partition(c-1)
        #print("bin: c: " + str(c) + " partition" + str(partition))

        #apply cross product from partitions
        binary_candidates_to_be_applied: List[CandidateFeature] = []
        for p in partition:
            lists_for_each_element: List[List[CandidateFeature]] = [[], []]
            for element in range(2):
                if p[element] in cost_2_raw_features:
                    lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                if p[element] in cost_2_unary_transformed:
                    lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                if p[element] in cost_2_binary_transformed:
                    lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

            for bt in binary_transformations:
                list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                #print(list_of_combinations)
                for combo in list_of_combinations:
                    if bt.is_applicable(combo):
                        sympy_representation = bt.get_sympy_representation(
                            [p.get_sympy_representation() for p in combo])
                        try:
                            if len(sympy_representation.free_symbols) > 0: # if expression is not constant
                                if not sympy_representation in all_evaluated_features:
                                    bin_candidate = CandidateFeature(copy.deepcopy(bt), combo)
                                    bin_candidate.sympy_representation = copy.deepcopy(sympy_representation)
                                    all_evaluated_features.add(sympy_representation)
                                    binary_candidates_to_be_applied.append(bin_candidate)
                                else:
                                    #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                    pass
                            else:
                                #print(str(bin_candidate) + " skipped: " + str(sympy_representation))
                                pass
                        except:
                            # NOTE(review): bare except silently drops candidates
                            # whose sympy handling fails — looks like a deliberate
                            # best-effort filter, but confirm.
                            pass
        current_layer.extend(binary_candidates_to_be_applied)

        #third: feature combinations
        #first variant: treat combination as a transformation
        #therefore, we can use the same partition as for binary data
        partition = self.get_length_2_partition(c)
        #print("combo c: " + str(c) + " partition" + str(partition))

        combinations_to_be_applied: List[CandidateFeature] = []
        for p in partition:
            lists_for_each_element: List[List[CandidateFeature]] = [[], []]
            for element in range(2):
                if p[element] in cost_2_raw_features:
                    lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                if p[element] in cost_2_unary_transformed:
                    lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                if p[element] in cost_2_binary_transformed:
                    lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])
                if p[element] in cost_2_combination:
                    lists_for_each_element[element].extend(cost_2_combination[p[element]])

            # NOTE(review): this REASSIGNS (not extends) on every partition, so
            # only the last partition's combinations reach current_layer —
            # verify this is intended and not a bug.
            combinations_to_be_applied = self.generate_merge_for_combination(all_evaluated_features, lists_for_each_element[0], lists_for_each_element[1])
        current_layer.extend(combinations_to_be_applied)

        if unique_raw_combinations:
            length = len(current_layer)
            current_layer = self.filter_non_unique_combinations(current_layer)
            print("From " + str(length) + " combinations, we filter " + str(length - len(current_layer)) + " nonunique raw feature combinations.")

        #now evaluate all from this layer
        #print(current_layer)
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = evaluate_candidates(current_layer)
        print("----------- Evaluation Finished -----------")
        #print(results)

        layer_end_time = time.time() - self.global_starting_time

        #calculate whether we drop the evaluated candidate
        for candidate in results:
            if type(candidate) != type(None):
                candidate.runtime_properties['layer_end_time'] = layer_end_time
                #print(str(candidate) + " -> " + str(candidate.runtime_properties['score']))
                if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                    max_feature = candidate
                if candidate.runtime_properties['passed']:
                    # Accepted: file into the repository bucket matching its kind.
                    if isinstance(candidate, RawFeature):
                        if not c in cost_2_raw_features:
                            cost_2_raw_features[c]: List[CandidateFeature] = []
                        cost_2_raw_features[c].append(candidate)
                    elif isinstance(candidate.transformation, UnaryTransformation):
                        if not c in cost_2_unary_transformed:
                            cost_2_unary_transformed[c]: List[CandidateFeature] = []
                        cost_2_unary_transformed[c].append(candidate)
                    elif isinstance(candidate.transformation, IdentityTransformation):
                        if not c in cost_2_combination:
                            cost_2_combination[c]: List[CandidateFeature] = []
                        cost_2_combination[c].append(candidate)
                    else:
                        if not c in cost_2_binary_transformed:
                            cost_2_binary_transformed[c]: List[CandidateFeature] = []
                        cost_2_binary_transformed[c].append(candidate)
                else:
                    # Rejected candidates are only retained when logging.
                    if self.save_logs:
                        if not c in cost_2_dropped_evaluated_candidates:
                            cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                        cost_2_dropped_evaluated_candidates[c].append(candidate)

        satisfied_count = 0
        if c in cost_2_raw_features:
            satisfied_count += len(cost_2_raw_features[c])
        if c in cost_2_unary_transformed:
            satisfied_count += len(cost_2_unary_transformed[c])
        if c in cost_2_binary_transformed:
            satisfied_count += len(cost_2_binary_transformed[c])
        if c in cost_2_combination:
            satisfied_count += len(cost_2_combination[c])

        all_count = len(current_layer)
        if c == 1:
            # At cost 1 the stored raw features were never in current_layer.
            all_count = len(cost_2_raw_features[c])

        print("Of " + str(all_count) + " candidate representations, " + str(satisfied_count) + " did satisfy the epsilon threshold.")

        if len(current_layer) > 0:
            if Config.get_default('score.test', 'False') == 'True':
                print("\nBest representation found for complexity = " + str(c) + ": " + str(max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(max_feature.runtime_properties['score']) + ", score on test: " + "{0:.2f}".format(max_feature.runtime_properties['test_score']) + "\n")
            else:
                print("\nBest representation found for complexity = " + str(c) + ": " + str(
                    max_feature) + "\nmean cross-validation score: " + "{0:.2f}".format(
                    max_feature.runtime_properties['score']) + "\n")
            #print("hyper: " + str(max_feature.runtime_properties['hyperparameters']))
            #print(max_feature.runtime_properties['fold_scores'])

            # upload best feature to OpenML
            if self.upload2openml:
                candidate2openml(max_feature, my_globale_module.classifier_global, self.reader.task, 'ComplexityDriven')

        if self.save_logs:
            pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"), protocol=pickle.HIGHEST_PROTOCOL)

        max_feature_per_complexity[c] = max_feature

        # Stopping criterion 1 (no explicit c_max): harmonic mean of
        # simplicity and accuracy over the last three complexities peaks at
        # c-2 → stop and report that representation.
        if type(self.c_max) == type(None) and c > 2:
            # calculate harmonic mean
            harmonic_means = [0.0]*3
            for h_i in range(len(harmonic_means)):
                simplicity_cum_score = self.getSimplicityScore(max_feature_per_complexity[c-h_i].get_complexity(), c, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed, cost_2_combination)
                accuracy_cum_score = self.getAccuracyScore(max_feature_per_complexity[c-h_i].runtime_properties['score'], c, cost_2_raw_features, cost_2_unary_transformed, cost_2_binary_transformed, cost_2_combination)
                harmonic_means[h_i] = self.harmonic_mean(simplicity_cum_score, accuracy_cum_score)
                #print(str(max_feature_per_complexity[c-h_i]) + ": " + str(harmonic_means[h_i]) + " h: " + str(h_i))

            if harmonic_means[2] >= harmonic_means[1] and harmonic_means[2] >= harmonic_means[0]:
                print("Best Harmonic Mean: " + str(max_feature_per_complexity[c-2]))
                break

        # Stopping criterion 2: wall-clock deadline.
        if type(self.max_timestamp) != type(None) and time.time() >= self.max_timestamp:
            break

        c += 1

        # Stopping criterion 3: explicit complexity budget exhausted.
        if type(self.c_max) != type(None) and self.c_max < c:
            break
def generate_in_parallel(self, transformations, current_features):
    """Apply each transformation to ``current_features`` using a process pool.

    :param transformations: transformations to hand to
        ``self.generate_for_transformation``, one pool task each.
    :param current_features: stored on ``self.current_features`` so the
        (picklable) worker callable can see them.
    :return: flat list of all generated candidates from all transformations.
    """
    self.current_features = current_features
    # Fix: manage the pool with a ``with`` block so worker processes are
    # always terminated — the original never called close()/join() and
    # leaked a pool per invocation.
    with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
        results = pool.map(self.generate_for_transformation, transformations)
    return list(itertools.chain(*results))
# NOTE(review): this excerpt begins mid-method — the enclosing ``def`` (and
# its class header) lie outside this chunk; indentation is reconstructed.

        # Materialize train/valid/test splits for both target and features
        # via the configured splitter.
        self.splitted_values = {}
        self.splitted_target = {}
        self.splitted_target['train'], self.splitted_target['valid'], self.splitted_target['test'] = self.splitter.materialize_target(self.target_values)
        self.splitted_values['train'], self.splitted_values['valid'], self.splitted_values['test'] = self.splitter.materialize_values(self.dataframe)

        # Wrap every dataframe column as a RawFeature and derive its
        # per-column properties from the raw values.
        for attribute_i in range(self.dataframe.shape[1]):
            rf = RawFeature(self.dataframe.columns[attribute_i], attribute_i, {})
            rf.derive_properties(self.dataframe[self.dataframe.columns[attribute_i]].values)
            self.raw_features.append(rf)

        return self.raw_features


# Smoke test: read the house_price dataset and print each raw feature's
# derived properties.
if __name__ == '__main__':
    from fastsklearnfeature.splitting.RandomSplitter import RandomSplitter
    from fastsklearnfeature.configuration.Config import Config

    s = RandomSplitter()
    dataset = (Config.get('data_path') + '/house_price.csv', 79)

    r = Reader(dataset[0], dataset[1], s)
    r.read()

    for rf in r.raw_features:
        print(str(rf) + ": " + str(rf.properties))
import openml from fastsklearnfeature.configuration.Config import Config import pickle import numpy as np import random from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import OneHotEncoder from sklearn.compose import ColumnTransformer from fastsklearnfeature.configuration.Config import Config from sklearn import preprocessing import openml from sklearn.pipeline import FeatureUnion from sklearn.model_selection import train_test_split openml.config.apikey = Config.get('openML.apikey') unique_data = {} for _, data_info in openml.datasets.list_datasets().items(): if 'status' in data_info and data_info['status'] == 'active' \ and 'NumberOfClasses' in data_info and data_info['NumberOfClasses'] == 2 \ and 'NumberOfInstances' in data_info and data_info['NumberOfInstances'] > 250: try: dataset = openml.datasets.get_dataset(data_info['did']) print(data_info) continuous_columns = [] categorical_features = []
if __name__ == '__main__':
    # Dataset spec: (csv path, target column index). Alternatives that were
    # previously tried (heart-statlog, colic horse, cancer, indian liver,
    # credit-a, diabetes, credit-g, cmc, mammography, iris, banknote, ecoli,
    # abalone, breastcancer) live in version history.
    dataset = (Config.get('data_path') + '/transfusion.data', 4)

    # Imported here because they are only needed by this entry point.
    from fastsklearnfeature.reader.OnlineOpenMLReader import OnlineOpenMLReader
    from fastsklearnfeature.feature_selection.evaluation.openMLdict import openMLname2task

    # To run against an OpenML task instead of a local csv:
    #   task_id = openMLname2task['transfusion']
    #   selector = ExploreKitSelection_iterative_search(dataset, reader=OnlineOpenMLReader(task_id))
    # Or with an explicit classifier/grid:
    #   selector = ExploreKitSelection(dataset, KNeighborsClassifier(), {'n_neighbors': np.arange(3,10), 'weights': ['uniform','distance'], 'metric': ['minkowski','euclidean','manhattan']})
    selector = ExploreKitSelection_iterative_search(dataset)
    selector.run()
if __name__ == '__main__': #dataset = (Config.get('statlog_heart.csv'), 13) #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22) #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30) # dataset = ("/home/felix/datasets/ExploreKit/csv/phpOJxGL9_indianliver.csv", 10) # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15) #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8) # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv", 20) # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv", 9) # dataset = ("/home/felix/datasets/ExploreKit/csv/phpn1jVwe_mammography.csv", 6) #dataset = (Config.get('iris.csv'), 4) #dataset = (Config.get('banknote.csv'), 4) dataset = (Config.get('ecoli.csv'), 8) #dataset = (Config.get('abalone.csv'), 8) #dataset = (Config.get('breastcancer.csv'), 0) #dataset = (Config.get('transfusion.csv'), 4) #dataset = (Config.get('test_categorical.csv'), 4) #dataset = ('../configuration/resources/data/transfusion.data', 4) start = time.time() selector = SimpleFeatureConstruction(dataset, c_max=3, save_logs=True) ''' selector = SimpleFeatureConstruction(dataset, classifier=KNeighborsClassifier(), grid_search_parameters={'classifier__n_neighbors': np.arange(3,10), 'classifier__weights': ['uniform','distance'], 'classifier__metric': ['minkowski','euclidean','manhattan']}, c_max=3, save_logs=True)
hp.uniform('informative_specified', 0, 1), 'n_redundant': hp.uniform('redundant_specified', 0, 1), 'n_repeated': hp.uniform('repeated_specified', 0, 1), 'n_useless': hp.uniform('useless_specified', 0, 1), 'n_clusters_per_class': hp.randint('clusters_specified', 1, 10), } configurations = [] try: configurations = pickle.load( open( Config.get('data_path') + "/scaling_configurations_samples/scaling_configurations.pickle", "rb")) except: while len(configurations) < 100: my_config = hyperopt.pyll.stochastic.sample(space) try: generate_data(100, 50, my_config, 0) configurations.append(my_config) except: continue pickle.dump( configurations, open( Config.get('data_path') +
# NOTE(review): this excerpt begins mid-method — the enclosing ``def`` lies
# outside this chunk; indentation is reconstructed.

        # Stratified 2-fold split over the training values; only the first
        # (train, test) index pair from the generator is used.
        stratifier = StratifiedKFold(n_splits=2, random_state=42)
        self.train, self.test = next(
            stratifier.split(self.dataset.splitted_values['train'], self.current_target))

        results = self.evaluate_candidates(self.candidates)
        return results


#statlog_heart.csv=/home/felix/datasets/ExploreKit/csv/dataset_53_heart-statlog_heart.csv
#statlog_heart.target=13
if __name__ == '__main__':
    # Dataset spec: (csv path, target column index); alternatives kept below.
    dataset = (Config.get('statlog_heart.csv'), int(Config.get('statlog_heart.target')))

    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/phpOJxGL9_indianliver.csv", 10)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv", 20)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv", 9)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/phpn1jVwe_mammography.csv", 6)

    selector = ExploreKitSelection_iterative_search(dataset)
    #selector = ExploreKitSelection(dataset, KNeighborsClassifier(), {'n_neighbors': np.arange(3,10), 'weights': ['uniform','distance'], 'metric': ['minkowski','euclidean','manhattan']})

    results = selector.run()
def evaluate_candidates(self, candidates):
    """Score every candidate in a worker pool and return the result list.

    :param candidates: candidate features handed one-by-one to
        ``self.evaluate_single_candidate``.
    :return: list of per-candidate evaluation results, in input order.
    """
    # Fix: manage the pool with a ``with`` block so worker processes are
    # always terminated — the original never called close()/join() and
    # leaked a pool per invocation.
    with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
        results = pool.map(self.evaluate_single_candidate, candidates)
    return results
from sklearn.model_selection import cross_val_score from sklearn.preprocessing import MinMaxScaler import fastsklearnfeature.interactiveAutoML.feature_selection.WrapperBestK as wrap from sklearn.ensemble import ExtraTreesClassifier from hyperopt.fmin import generate_trials_to_calculate ''' data = pd.read_csv(Config.get('data_path') + '/breastTumor/breastTumor.csv', delimiter=',', header=0) y = data['binaryClass'].values X = data[data.columns.difference(['binaryClass'])].values data_name = 'breastTumor' one_hot = True ''' data = pd.read_csv(Config.get('data_path') + '/promoters/dataset_106_molecular-biology_promoters.csv', delimiter=',', header=0) y = data['class'].values X = data[data.columns.difference(['class', 'instance'])].values data_name = 'promoters' one_hot = True ''' X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:,0:500] y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values data_name = 'madelon' one_hot = False ''' ''' X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data', delimiter=' ', header=None).values[:,0:10000]
def evaluate_single_candidate(self, candidate):
    """Evaluate one candidate feature and return its score.

    Fix: removed the dead ``new_score = -1.0`` assignment that was
    immediately overwritten by the real evaluation result.
    """
    return self.evaluate(candidate)


def run(self):
    """Build the identity representation over all raw features and evaluate it."""
    # generate all candidates
    self.generate()
    #starting_feature_matrix = self.create_starting_features()
    self.generate_target()

    print([r.name for r in self.dataset.raw_features])

    # Single candidate: the identity transformation applied to every raw
    # feature, i.e. the plain attribute matrix.
    plain_attributes = CandidateFeature(
        IdentityTransformation(len(self.dataset.raw_features)), self.dataset.raw_features)
    self.evaluate_candidates([plain_attributes])


#statlog_heart.csv=/home/felix/datasets/ExploreKit/csv/dataset_53_heart-statlog_heart.csv
#statlog_heart.target=13
if __name__ == '__main__':
    # Dataset spec: (csv path, target column index).
    dataset = (Config.get('data_path') + '/dataset_53_heart-statlog_heart.csv', 13)

    selector = SissoExperiment(dataset)
    selector.run()
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import multiprocessing as mp
import itertools
from sklearn.ensemble import RandomForestRegressor
import scipy.special
import seaborn as sns
import matplotlib.pyplot as plt
from fastsklearnfeature.configuration.Config import Config

# Load a 100-row sample of the madelon training data: the first 500 columns
# of the space-separated .data file, with labels from the companion .labels
# file (first 100 rows of each).
X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:, 0:500][0:100, :]
y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values[0:100]

name = 'hyperopt'

# generate grid
# One complexity value per feature count, and an accuracy grid spanning
# [0, max_acc) in the same number of steps.
complexity_grid = np.arange(1, X_train.shape[1] + 1)
max_acc = 0.7
accuracy_grid = np.arange(0.0, max_acc, max_acc / len(complexity_grid))
#print(complexity_grid)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import multiprocessing as mp
import itertools
from sklearn.ensemble import RandomForestRegressor
import scipy.special
import seaborn as sns
import matplotlib.pyplot as plt
from fastsklearnfeature.configuration.Config import Config

#X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:,0:500][0:100,:]
#y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values[0:100]

# 100-row sample of the ARCENE training data (10000 feature columns), with
# labels from the companion .labels file.
X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data', delimiter=' ', header=None).values[:,0:10000][0:100,:]
y_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.labels', delimiter=' ', header=None).values[0:100]

data_name = 'ARCENE_sample'

# generate grid
# One complexity value per feature count; 100 accuracy steps over [0, 1).
complexity_grid = np.arange(1, X_train.shape[1]+1)
max_acc = 1.0
accuracy_grid = np.arange(0.0, max_acc, max_acc / 100.0)


def get_estimated_runtimes(old_model = "/tmp/model11_hyperopt.p"):
    # Build the (complexity, accuracy) meta-feature matrix used for runtime
    # estimation from the module-level grids.
    # NOTE(review): the function body continues beyond this excerpt — only
    # the visible statements are documented here.
    grid = list(itertools.product(complexity_grid, accuracy_grid))
    meta_X_data = np.matrix(grid)
def run(self):
    """Epsilon-threshold feature-construction loop (earlier variant).

    For each complexity cost ``c`` in ``[1, c_max]``: enumerate raw
    features, unary transformations, binary transformations, and identity
    combinations; evaluate the layer; keep a candidate only if its accuracy
    gain over its best parent, per unit of complexity, exceeds ``epsilon``.

    NOTE(review): formatting reconstructed from a whitespace-mangled excerpt;
    tokens preserved verbatim. The method may continue beyond this excerpt.
    """
    self.global_starting_time = time.time()

    # generate all candidates
    self.generate()
    #starting_feature_matrix = self.create_starting_features()
    self.generate_target()

    unary_transformations, binary_transformations = self.transformation_producer()

    # Repositories of accepted candidates, keyed by complexity cost c.
    cost_2_raw_features: Dict[int, List[CandidateFeature]] = {}
    cost_2_unary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_binary_transformed: Dict[int, List[CandidateFeature]] = {}
    cost_2_combination: Dict[int, List[CandidateFeature]] = {}
    cost_2_dropped_evaluated_candidates: Dict[int, List[CandidateFeature]] = {}

    complexity_delta = 1.0
    epsilon = self.epsilon
    limit_runs = self.c_max + 1 # 5
    unique_raw_combinations = False

    baseline_score = 0.0#self.evaluate_candidates([CandidateFeature(DummyOneTransformation(None), [self.raw_features[0]])])[0]['score']
    #print("baseline: " + str(baseline_score))

    # Running best candidate; score -2 so any real score replaces it.
    max_feature = CandidateFeature(IdentityTransformation(None), [self.raw_features[0]])
    max_feature.runtime_properties['score'] = -2

    self.name_to_transfomed = {}

    for c in range(1, limit_runs):
        current_layer: List[CandidateFeature] = []

        #0th
        # Cost 1: seed with the raw features; non-numeric ones are stored
        # unevaluated with score 0.
        if c == 1:
            cost_2_raw_features[c]: List[CandidateFeature] = []
            for raw_f in self.raw_features:
                if raw_f.is_numeric():
                    current_layer.append(raw_f)
                else:
                    raw_f.runtime_properties['score'] = 0.0
                    cost_2_raw_features[c].append(raw_f)

        # first unary
        # we apply all unary transformation to all c-1 in the repo (except combinations and other unary?)
        unary_candidates_to_be_applied: List[CandidateFeature] = []
        if (c - 1) in cost_2_raw_features:
            unary_candidates_to_be_applied.extend(cost_2_raw_features[c - 1])
        if (c - 1) in cost_2_unary_transformed:
            unary_candidates_to_be_applied.extend(cost_2_unary_transformed[c - 1])
        if (c - 1) in cost_2_binary_transformed:
            unary_candidates_to_be_applied.extend(cost_2_binary_transformed[c - 1])
        current_layer.extend(self.generate_features(unary_transformations, unary_candidates_to_be_applied))

        #second binary
        #get length 2 partitions for current cost
        partition = self.get_length_2_partition(c-1)
        #print("bin: c: " + str(c) + " partition" + str(partition))

        #apply cross product from partitions
        binary_candidates_to_be_applied: List[CandidateFeature] = []
        for p in partition:
            lists_for_each_element: List[List[CandidateFeature]] = [[], []]
            for element in range(2):
                if p[element] in cost_2_raw_features:
                    lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                if p[element] in cost_2_unary_transformed:
                    lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                if p[element] in cost_2_binary_transformed:
                    lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])

            for bt in binary_transformations:
                list_of_combinations = self.generate_merge(lists_for_each_element[0], lists_for_each_element[1], bt.parent_feature_order_matters, bt.parent_feature_repetition_is_allowed)
                for combo in list_of_combinations:
                    if bt.is_applicable(combo):
                        binary_candidates_to_be_applied.append(CandidateFeature(copy.deepcopy(bt), combo))
        current_layer.extend(binary_candidates_to_be_applied)

        #third: feature combinations
        #first variant: treat combination as a transformation
        #therefore, we can use the same partition as for binary data
        partition = self.get_length_2_partition(c)
        #print("combo c: " + str(c) + " partition" + str(partition))

        combinations_to_be_applied: List[CandidateFeature] = []
        for p in partition:
            lists_for_each_element: List[List[CandidateFeature]] = [[], []]
            for element in range(2):
                if p[element] in cost_2_raw_features:
                    lists_for_each_element[element].extend(cost_2_raw_features[p[element]])
                if p[element] in cost_2_unary_transformed:
                    lists_for_each_element[element].extend(cost_2_unary_transformed[p[element]])
                if p[element] in cost_2_binary_transformed:
                    lists_for_each_element[element].extend(cost_2_binary_transformed[p[element]])
                if p[element] in cost_2_combination:
                    lists_for_each_element[element].extend(cost_2_combination[p[element]])

            list_of_combinations = self.generate_merge_for_combination(lists_for_each_element[0], lists_for_each_element[1])
            for combo in list_of_combinations:
                if IdentityTransformation(None).is_applicable(list(combo)):
                    combinations_to_be_applied.append(CandidateFeature(IdentityTransformation(None), list(combo)))
        current_layer.extend(combinations_to_be_applied)

        if unique_raw_combinations:
            length = len(current_layer)
            current_layer = self.filter_non_unique_combinations(current_layer)
            print("From " + str(length) + " combinations, we filter " + str(length - len(current_layer)) + " nonunique raw feature combinations.")

        #now evaluate all from this layer
        #print(current_layer)
        print("----------- Evaluation of " + str(len(current_layer)) + " representations -----------")
        results = self.evaluate_candidates(current_layer)
        print("----------- Evaluation Finished -----------")

        layer_end_time = time.time() - self.global_starting_time

        #calculate whether we drop the evaluated candidate
        for result in results:
            # Copy the evaluation results onto the candidate's runtime state.
            candidate: CandidateFeature = result['candidate']
            candidate.runtime_properties['score'] = result['score']
            candidate.runtime_properties['test_score'] = result['test_score']
            candidate.runtime_properties['execution_time'] = result['execution_time']
            candidate.runtime_properties['global_time'] = result['global_time']
            candidate.runtime_properties['hyperparameters'] = result['hyperparameters']
            candidate.runtime_properties['layer_end_time'] = layer_end_time

            #print(str(candidate) + " -> " + str(candidate.score))
            if candidate.runtime_properties['score'] > max_feature.runtime_properties['score']:
                max_feature = candidate

            #calculate original score
            original_score = baseline_score #or zero??
            if not isinstance(candidate, RawFeature):
                original_score = max([p.runtime_properties['score'] for p in candidate.parents])

            accuracy_delta = result['score'] - original_score

            # Epsilon threshold: keep the candidate only if its accuracy gain
            # per unit of added complexity exceeds epsilon.
            if accuracy_delta / complexity_delta > epsilon:
                if isinstance(candidate, RawFeature):
                    if not c in cost_2_raw_features:
                        cost_2_raw_features[c]: List[CandidateFeature] = []
                    cost_2_raw_features[c].append(candidate)
                elif isinstance(candidate.transformation, UnaryTransformation):
                    if not c in cost_2_unary_transformed:
                        cost_2_unary_transformed[c]: List[CandidateFeature] = []
                    cost_2_unary_transformed[c].append(candidate)
                elif isinstance(candidate.transformation, IdentityTransformation):
                    if not c in cost_2_combination:
                        cost_2_combination[c]: List[CandidateFeature] = []
                    cost_2_combination[c].append(candidate)
                else:
                    if not c in cost_2_binary_transformed:
                        cost_2_binary_transformed[c]: List[CandidateFeature] = []
                    cost_2_binary_transformed[c].append(candidate)
            else:
                if not c in cost_2_dropped_evaluated_candidates:
                    cost_2_dropped_evaluated_candidates[c]: List[CandidateFeature] = []
                cost_2_dropped_evaluated_candidates[c].append(candidate)

        if c in cost_2_dropped_evaluated_candidates:
            print("Of " + str(len(current_layer)) + " candidate representations, " + str(len(cost_2_dropped_evaluated_candidates[c])) + " did not satisfy the epsilon threshold.")
        else:
            print("Of " + str(len(current_layer)) + " candidate representations, all satisfied the epsilon threshold.")

        print("Best representation found for complexity = " + str(c) + ": " + str(max_feature) + "\n")

        if self.save_logs:
            # Persist all repositories for offline analysis.
            pickle.dump(cost_2_raw_features, open(Config.get_default("tmp.folder", "/tmp") + "/data_raw.p", "wb"))
            pickle.dump(cost_2_unary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_unary.p", "wb"))
            pickle.dump(cost_2_binary_transformed, open(Config.get_default("tmp.folder", "/tmp") + "/data_binary.p", "wb"))
            pickle.dump(cost_2_combination, open(Config.get_default("tmp.folder", "/tmp") + "/data_combination.p", "wb"))
            pickle.dump(cost_2_dropped_evaluated_candidates, open(Config.get_default("tmp.folder", "/tmp") + "/data_dropped.p", "wb"))
# --- Meta-learning strategy imports ------------------------------------------
# NOTE(review): forward_selection / backward_selection are imported from the
# *floating* selection modules — verify the plain strategies really live there.
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.exhaustive import exhaustive
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.forward_floating_selection import forward_selection
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.backward_floating_selection import backward_selection
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.forward_floating_selection import forward_floating_selection
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.backward_floating_selection import backward_floating_selection
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.recursive_feature_elimination import recursive_feature_elimination

#static constraints: fairness, number of features (absolute and relative), robustness, privacy, accuracy
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.bench_utils import get_fair_data1
# NOTE(review): this shadows the builtin TimeoutError with the (deprecated
# alias) from concurrent.futures — presumably intentional for pebble's timeouts.
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired

#load list of viable datasets
# pickle.load of a repository-local file; acceptable only because the file is
# produced by this project and therefore trusted.
data_infos = pickle.load(open(Config.get('data_path') + '/openml_data/fitting_datasets.pickle', 'rb'))

# Wall-clock timestamp used as a unique identifier for this benchmark run.
current_run_time_id = time.time()

time_limit = 60 * 60 * 3  # 3 hours, in seconds
n_jobs = 20               # presumably the worker-pool size — confirm usage below
number_of_runs = 1        # repetitions per scenario

# Accumulators filled while the benchmark executes (meta-classifier training
# data plus per-scenario bookkeeping).
X_train_meta_classifier = []
y_train_meta_classifier = []
ranking_scores_info = []
acc_value_list = []
fair_value_list = []
def __init__(self, taskID, test_folds=1, rotate_test=0):
	"""Store the benchmark configuration for one OpenML task.

	Args:
		taskID: OpenML task identifier, kept for a later dataset fetch.
		test_folds: number of folds reserved for testing.
		rotate_test: which fold rotation to use for the test split.
	"""
	# Register the API key first so every later openml call is authenticated.
	openml.config.apikey = Config.get('openML.apikey')
	self.task_id = taskID
	self.test_folds = test_folds
	self.rotate_test = rotate_test
	# No raw features are known until the dataset has been loaded.
	self.raw_features: List[RawFeature] = []
from sklearn.ensemble import RandomForestClassifier
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.weighted_ranking import weighted_ranking
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.hyperparameter_optimization import hyperparameter_optimization
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.metalearning.strategies.evolution import evolution

#static constraints: fairness, number of features (absolute and relative), robustness, privacy, accuracy
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.bench_utils import get_data_openml
import multiprocessing as mp
import tqdm

#load list of viable datasets
# Trusted, repository-local pickle produced by this project.
data_infos = pickle.load( open( Config.get('data_path') + '/openml_data/fitting_datasets.pickle', 'rb'))

time_limit = 60 * 20   # 20 minutes, in seconds
n_jobs = 20            # presumably the worker-pool size — confirm usage below
number_of_runs = 2     # repetitions per scenario

# Meta-classifier trained on the benchmark outcomes collected below.
meta_classifier = RandomForestClassifier(n_estimators=1000)

# Accumulators filled while the benchmark executes: meta-features per scenario
# and several per-strategy aggregates (times, accuracy, fairness, robustness, k).
X_train_meta_classifier = []
y_train_meta_classifier = []
y_train_meta_classifier_avg_times = []
y_train_meta_classifier_avg_acc = []
y_train_meta_classifier_avg_fair = []
y_train_meta_classifier_avg_robust = []
y_train_meta_classifier_avg_k = []
# NOTE(review): the three statements below reference `r` and `results` from an
# enclosing loop whose header is not part of this fragment — layout was
# reconstructed; confirm indentation against the original file.
print("(" + str(r + 1) + "," + str(results[r]['score']) + ")")

# Report the best-scoring result across all evaluated representations.
new_scores = [r['score'] for r in results]
best_id = np.argmax(new_scores)
print(results[best_id])

#statlog_heart.csv=/home/felix/datasets/ExploreKit/csv/dataset_53_heart-statlog_heart.csv
#statlog_heart.target=13

if __name__ == '__main__':
    # Alternative datasets kept for quick switching during experiments;
    # each tuple is (csv path, target-column index).
    #dataset = (Config.get('statlog_heart.csv'), int(Config.get('statlog_heart.target')))
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/phpOJxGL9_indianliver.csv", 10)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv", 20)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv", 9)
    # dataset = ("/home/felix/datasets/ExploreKit/csv/phpn1jVwe_mammography.csv", 6)

    # Active dataset: blood transfusion, target column 4.
    dataset = (Config.get('transfusion.csv'), 4)

    selector = ExploreKitSelection_iterative_search(dataset)
    #selector = ExploreKitSelection(dataset, KNeighborsClassifier(), {'n_neighbors': np.arange(3,10), 'weights': ['uniform','distance'], 'metric': ['minkowski','euclidean','manhattan']})

    results = selector.run()

    # Persist all per-iteration results for offline analysis.
    pickle.dump(results, open("/tmp/all_data_iterations.p", "wb"))
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.model_selection import cross_val_score
from fastsklearnfeature.interactiveAutoML.fair_measure import true_positive_rate_score
from fastsklearnfeature.interactiveAutoML.new_bench.multiobjective.robust_measure import robust_score
import diffprivlib.models as models

# Presumably the column fairness is measured against — confirm with the
# true_positive_rate_score call sites.
sensitive_attribute = "sex"
n_estimators = 5

# Load the adult census dataset; 'class' is the target and is split off from
# the feature frame.
df = pd.read_csv(Config.get('data_path') + '/adult/dataset_183_adult.csv', delimiter=',', header=0)
y = df['class']
del df['class']
X = df

one_hot = True
limit = 1000  # only the first 1000 rows are used, keeping the experiment fast

# 50/50 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X.values[0:limit, :], y.values[0:limit], test_size=0.5, random_state=42)

# Column indices treated as continuous; the rest are categorical.
# NOTE(review): assumes the adult CSV column order — confirm against the file.
continuous_columns = [0, 2, 4, 10, 11, 12]
import itertools
from sklearn.ensemble import RandomForestRegressor
import scipy.special
import seaborn as sns
import matplotlib.pyplot as plt
from fastsklearnfeature.configuration.Config import Config

# Alternative datasets kept for quick switching during experiments.
#X_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.data', delimiter=' ', header=None).values[:,0:500] [0:100,:]
#y_train = pd.read_csv(Config.get('data_path') + '/madelon/madelon_train.labels', delimiter=' ', header=None).values [0:100]
'''
X_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.data', delimiter=' ', header=None).values[:,0:10000][0:100,:]
y_train = pd.read_csv(Config.get('data_path') + '/ARCENE/arcene_train.labels', delimiter=' ', header=None).values[0:100]
data_name = 'ARCENE_sample'
'''

# Active dataset: musk. Identifier columns are excluded from the features.
data = pd.read_csv(Config.get('data_path') + '/musk/musk.csv', delimiter=',', header=0)
y_train = data['class']
# NOTE(review): y_train stays a pandas Series while X_train is a numpy array
# (.values) — confirm downstream code accepts this mix.
X_train = data[data.columns.difference( ['class', 'ID', 'molecule_name', 'conformation_name'])].values
data_name = 'musk'

# generate grid
complexity_grid = np.arange(1, X_train.shape[1] + 1)       # one entry per feature count
max_acc = 1.0
accuracy_grid = np.arange(0.0, max_acc, max_acc / 100.0)   # 100 accuracy thresholds in [0, max_acc)

# NOTE(review): the body of this function continues beyond this fragment.
def get_estimated_runtimes(old_model="/tmp/model11_hyperopt.p"):
def filter_failing_in_parallel(self):
	"""Apply ``self.filter_candidate`` to every candidate using a process pool.

	The pool size comes from the ``parallelism`` config entry.

	Returns:
		list: the per-candidate result lists flattened into a single list.
	"""
	# Context manager guarantees the worker processes are cleaned up even if
	# filter_candidate raises; the previous version never closed the pool,
	# leaking worker processes on every call.
	with mp.Pool(processes=int(Config.get("parallelism"))) as pool:
		results = pool.map(self.filter_candidate, self.candidates)
	return list(itertools.chain(*results))
# NOTE(review): this fragment begins inside a hyperopt search-space dict whose
# opening lines were lost; the layout below is reconstructed — confirm against
# the original file.
		[ (0.0), (hp.uniform('robustness_specified', 0, 1)) ]),
	### dataset space
	'n_informative': hp.uniform('informative_specified', 0, 1),
	'n_redundant': hp.uniform('redundant_specified', 0, 1),
	'n_repeated': hp.uniform('repeated_specified', 0, 1),
	'n_useless': hp.uniform('useless_specified', 0, 1),
	'n_clusters_per_class': hp.randint('clusters_specified', 1,10),
}

configurations = []
try:
	# Reuse previously sampled configurations when the cache file exists.
	configurations = pickle.load(open(Config.get('data_path') + "/scaling_configurations_samples/scaling_configurations.pickle", "rb"))
except:
	# NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
	# catching (FileNotFoundError, pickle.UnpicklingError) would be safer.
	# Keep sampling until 100 configurations survive a trial data generation.
	while len(configurations) < 100:
		my_config = hyperopt.pyll.stochastic.sample(space)
		try:
			generate_data(100, 50, my_config, 0)
			configurations.append(my_config)
		except:
			# Configuration rejected by the generator — draw a new sample.
			continue
	# Cache the accepted configurations for subsequent runs.
	pickle.dump(configurations, open(Config.get('data_path') + "/scaling_configurations_samples/scaling_configurations.pickle", 'wb'))

how_many_samples = int(input('enter number of samples please: '))