    def generate(self):

        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)

        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], s)
        raw_features = self.dataset.read()
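For orientation, a minimal sketch of how this Splitter/Reader pair is typically driven, mirroring the __main__ block in the last example; the Reader and Config imports are assumed to come from the same fastsklearnfeature project.

from fastsklearnfeature.splitting.Splitter import Splitter

s = Splitter(train_fraction=[0.6, 10000000], seed=42)
dataset_config = (Config.get('statlog_heart.csv'),
                  int(Config.get('statlog_heart.target')))
dataset = Reader(dataset_config[0], dataset_config[1], s)
raw_features = dataset.read()                 # list of RawFeature objects
X_train = dataset.splitted_values['train']    # materialized training split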
Example 2
    def generate(self, seed=42):
        if self.reader is None:
            s = None
            if isinstance(self.classifier(), ClassifierMixin):
                s = Splitter(train_fraction=[0.6, 10000000], valid_fraction=0.0, test_fraction=0.4, seed=seed)
            elif isinstance(self.classifier(), RegressorMixin):
                s = RandomSplitter(train_fraction=[0.6, 10000000], valid_fraction=0.0, test_fraction=0.4, seed=seed)
            else:
                pass

            self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], s)
        else:
            self.dataset = self.reader
        self.raw_features = self.dataset.read()

        print("training:" + str(len(self.dataset.splitted_target['train'])))
        print("test:" + str(len(self.dataset.splitted_target['test'])))

        if Config.get_default('instance.selection', 'False') == 'True':
            self.train_X_all = copy.deepcopy(self.dataset.splitted_values['train'])
            self.train_y_all = copy.deepcopy(self.dataset.splitted_target['train'])

            self.dataset.splitted_values['train'], self.dataset.splitted_target['train'] = sample_data_by_cnn(self.dataset.splitted_values['train'], self.dataset.splitted_target['train'])
            print("training:" + str(len(self.dataset.splitted_target['train'])))
        else:
            self.train_X_all = self.dataset.splitted_values['train']
            self.train_y_all = self.dataset.splitted_target['train']
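The splitter choice above is driven purely by scikit-learn's estimator mixins; a small self-contained check of that dispatch (plain scikit-learn, no project code):

from sklearn.base import ClassifierMixin, RegressorMixin
from sklearn.linear_model import LogisticRegression, LinearRegression

print(isinstance(LogisticRegression(), ClassifierMixin))  # True  -> Splitter branch
print(isinstance(LinearRegression(), RegressorMixin))     # True  -> RandomSplitter branch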
Example 3
    def generate(self):

        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)

        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1],
                              s)
        raw_features = self.dataset.read()

        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))
Example 4
class ExploreKitSelection:
    def __init__(
        self,
        dataset_config,
        classifier=LogisticRegression(),
        grid_search_parameters={
            'classifier__penalty': ['l2'],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'classifier__solver': ['lbfgs']
        }):
        self.dataset_config = dataset_config
        self.classifier = classifier
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):

        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)

        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1],
                              s)
        raw_features = self.dataset.read()

        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))

    #rank and select features
    def random_select(self, k: int):
        arr = np.arange(len(self.candidates))
        np.random.shuffle(arr)
        return arr[0:k]

    def select_interpretable(self, k: int):
        inv_map = {v: k for k, v in self.candidate_id_to_ranked_id.items()}
        selected = []
        for i in range(k):
            selected.append(inv_map[i])
        return selected

    def generate_target(self):
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self, candidate: CandidateFeature, runs=10):
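        # timing-only evaluation: repeatedly fit and transform the candidate's pipeline; no score is computed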
        for i in range(runs):
            candidate.pipeline.fit(self.dataset.splitted_values['train'])
            candidate.pipeline.transform(self.dataset.splitted_values['train'])

    def create_starting_features(self):
        Fi: List[RawFeature] = self.dataset.raw_features

        #materialize the features and convert them to a NumPy matrix
        starting_feature_matrix = np.zeros(
            (Fi[0].materialize()['train'].shape[0], len(Fi)))
        for f_index in range(len(Fi)):
            starting_feature_matrix[:, f_index] = Fi[f_index].materialize(
            )['train']
        return starting_feature_matrix

    def my_arg_sort(self, seq):
        # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3383106#3383106
        # non-lambda version by Tony Veijalainen
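        # e.g. my_arg_sort([0.3, 0.1, 0.2]) -> [1, 2, 0] (indices in ascending value order)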
        return [i for (v, i) in sorted((v, i) for (i, v) in enumerate(seq))]

    def get_interpretability_ranking(self):
        #high interpretability -> low interpretability
        interpretability_ids = self.my_arg_sort(self.candidates)

        self.candidate_id_to_ranked_id = {}
        for i in range(len(interpretability_ids)):
            self.candidate_id_to_ranked_id[interpretability_ids[i]] = i

    def get_traceability_ranking(self):
        # high traceability -> low traceability
        self.traceability: List[float] = []
        for c_i in range(len(self.candidates)):
            self.traceability.append(
                self.candidates[c_i].calculate_traceability())
        ids = np.argsort(np.array(self.traceability) * -1)

        self.candidate_id_to_ranked_id = {}
        for i in range(len(ids)):
            self.candidate_id_to_ranked_id[ids[i]] = i

        all_data = {}
        all_data['my_dict'] = self.candidate_id_to_ranked_id
        all_data['traceability'] = self.traceability
        pickle.dump(all_data, open("/tmp/traceability.p", "wb"))

    def get_interpretability(self, candidate_id):
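        # rank 0 (most interpretable) scores 1 - 1/N, the last rank scores 0.0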
        return 1.0 - ((self.candidate_id_to_ranked_id[candidate_id] + 1) /
                      float(len(self.candidate_id_to_ranked_id)))

    def evaluate_candidates(self, candidates):
        self.preprocessed_folds = []
        # shuffle=True is required for random_state to have an effect in (Stratified)KFold
        for train, test in StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(
                self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))

        pool = mp.Pool(processes=int(Config.get("parallelism")))
        results = pool.map(self.evaluate_single_candidate, candidates)
        return results

    '''
    def evaluate_candidates(self, candidates):
        self.preprocessed_folds = []
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(self.dataset.splitted_values['train'],
                                                                               self.current_target):
            self.preprocessed_folds.append((train, test))

        results = []
        for c in candidates:
            results.append(self.evaluate_single_candidate(c))
        return results

    '''

    def evaluate_single_candidate(self, candidate):
        result = {}
        time_start_gs = time.time()
        runs = 10
        try:
            self.evaluate(candidate, runs)
            #print("feature: " + str(candidate) + " -> " + str(new_score))
        except Exception as e:
            print(str(candidate) + " -> " + str(e))
            pass
        result['candidate'] = candidate
        result['time'] = (time.time() - time_start_gs) / float(runs)
        return result

    '''
    def evaluate_single_candidate(self, candidate):
        new_score = -1.0
        new_score = self.evaluate(candidate)
        return new_score
    '''

    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        candidate_name_to_id = {}
        for c_i in range(len(self.candidates)):
            candidate_name_to_id[self.candidates[c_i].get_name()] = c_i

        pickle.dump(candidate_name_to_id, open("/tmp/name2id.p", "wb"))

        pickle.dump(self.candidates, open("/tmp/all_candidates.p", "wb"))

        self.get_interpretability_ranking()
        #self.get_traceability_ranking()

        #evaluate starting matrix
        #start_score = self.evaluate(starting_feature_matrix)
        start_score = -1
        print("start score: " + str(start_score))

        #get candidates that should be evaluated
        ranked_selected_candidate_ids = self.select_interpretable(
            len(self.candidates))

        start_time = time.time()

        results = self.evaluate_candidates(
            np.array(self.candidates)[ranked_selected_candidate_ids])

        print("evaluation time: " + str((time.time() - start_time) / 60) +
              " min")

        return start_score, results, ranked_selected_candidate_ids
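A hedged usage sketch for ExploreKitSelection above; the Config-based dataset tuple mirrors the pattern of the later examples and is an assumption here.

if __name__ == '__main__':
    dataset_config = (Config.get('statlog_heart.csv'),
                      int(Config.get('statlog_heart.target')))
    selection = ExploreKitSelection(dataset_config)
    start_score, results, ranked_ids = selection.run()
    print("evaluated candidates: " + str(len(results)))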
Example 5
all_data = pickle.load(open(file, "rb"))

feature_predictions = pickle.load(
    open('/home/felix/phd/feature_predictions/all_data_predictions.p', "rb"))

name2result_predictions = {}
for result in feature_predictions:
    name2result_predictions[str(result['candidate'])] = result

dataset_config = (Config.get('statlog_heart.csv'),
                  int(Config.get('statlog_heart.target')))

s = Splitter(train_fraction=[0.6, 10000000], seed=42)

dataset = Reader(dataset_config[0], dataset_config[1], s)
raw_features = dataset.read()
X = dataset.splitted_values['train']

#delta mean -> avg, min, max gain


def calculate_MSE(candidate: CandidateFeature, X):
    ys = []
    for p in candidate.parents:
        p.fit(X)
        y = p.transform(X)

        ys.append(y)

    #correlation
class SissoExperiment:
    def __init__(self, dataset_config, classifier=LogisticRegression(), grid_search_parameters={'classifier__penalty': ['l2'],
                                                                                                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                                                                                                'classifier__solver': ['lbfgs']}):
        self.dataset_config = dataset_config
        self.classifier = classifier
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):

        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)

        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1], s)
        raw_features = self.dataset.read()

        #g = Generator(raw_features)
        #self.candidates = g.generate_all_candidates()
        #print("Number candidates: " + str(len(self.candidates)))

    def generate_target(self):
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self, candidate, score=make_scorer(roc_auc_score, average='micro'), folds=10):
        parameters = self.grid_search_parameters


        '''
        if not isinstance(candidate, CandidateFeature):
            pipeline = Pipeline([('features',FeatureUnion(

                        [(p.get_name(), p.pipeline) for p in candidate]
                    )),
                ('classifier', self.classifier)
            ])
        else:
            pipeline = Pipeline([('features', FeatureUnion(
                [
                    (candidate.get_name(), candidate.pipeline)
                ])),
                 ('classifier', self.classifier)
                 ])
        '''

        result = {}

        '''
        clf = GridSearchCV(pipeline, parameters, cv=self.preprocessed_folds, scoring=score, iid=False, error_score='raise')
        clf.fit(self.dataset.splitted_values['train'], self.current_target)
        result['score'] = clf.best_score_
        result['hyperparameters'] = clf.best_params_
        '''

        feateng_cols = ['age', 'sex', 'chest', 'resting_blood_pressure', 'serum_cholestoral', 'fasting_blood_sugar',
                        'resting_electrocardiographic_results', 'maximum_heart_rate_achieved',
                        'exercise_induced_angina', 'oldpeak', 'slope', 'number_of_major_vessels', 'thal']


        print(self.current_target)

        '''
        df = pd.DataFrame(data=self.dataset.splitted_values['train'], columns=feateng_cols)
        df['id'] = pd.Series(range(len(df)), index=df.index)

        df_target = pd.DataFrame(data=self.current_target, columns=['target'])
        df_target['id'] = pd.Series(range(len(df)), index=df_target.index)

        print(df)


        es = ft.EntitySet("example")
        es.entity_from_dataframe(dataframe=df, entity_id="heart",index="id")
        es.entity_from_dataframe(dataframe=df_target, entity_id="target", index="id")

        new_relationship = ft.Relationship(es["heart"]["id"], es["target"]["id"])
        es = es.add_relationship(new_relationship)

        
        '''

        df = pd.DataFrame(data=self.dataset.splitted_values['train'], columns=feateng_cols)
        df['id'] = pd.Series(range(len(df)), index=df.index)
        df['target'] = pd.Series(self.current_target, index=df.index).map({0: 'healthy', 1: 'ill'})

        es = ft.EntitySet("example")
        es.entity_from_dataframe(dataframe=df, entity_id="heart", index="id")


        es.normalize_entity(base_entity_id='heart', new_entity_id='target_e', index='id',
                            additional_variables=['target'])

        print(es)

        feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='target_e', max_depth=6, verbose=1,
                                              n_jobs=4, max_features=2)

        # drop generated features that were derived from the target column (keep 'target' itself)
        drop_cols = []
        for col in feature_matrix:
            if col != 'target' and 'target' in col:
                drop_cols.append(col)

        feature_matrix = feature_matrix[[x for x in feature_matrix if x not in drop_cols]]
        print(feature_matrix.columns)

        return result

    '''
    def evaluate_candidates(self, candidates):
        self.preprocessed_folds = []
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))

        pool = mp.Pool(processes=int(Config.get("parallelism")))
        results = pool.map(self.evaluate_single_candidate, candidates)
        return results

    '''
    def evaluate_candidates(self, candidates):
        self.preprocessed_folds = []
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(self.dataset.splitted_values['train'],
                                                                               self.current_target):
            self.preprocessed_folds.append((train, test))

        results = []
        for c in candidates:
            results.append(self.evaluate_single_candidate(c))
        return results



    '''
    def evaluate_single_candidate(self, candidate):
        result = {}
        time_start_gs = time.time()
        try:
            result = self.evaluate(candidate)
            #print("feature: " + str(candidate) + " -> " + str(new_score))
        except Exception as e:
            print(str(candidate) + " -> " + str(e))
            result['score'] = -1.0
            result['hyperparameters'] = {}
            pass
        result['candidate'] = candidate
        result['time'] = time.time() - time_start_gs
        return result


    '''
    def evaluate_single_candidate(self, candidate):
        new_score = -1.0
        new_score = self.evaluate(candidate)
        return new_score



    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        print([r.name for r in self.dataset.raw_features])


        plain_attributes = CandidateFeature(IdentityTransformation(len(self.dataset.raw_features)), self.dataset.raw_features)


        self.evaluate_candidates([plain_attributes])
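A hedged usage sketch for this featuretools-based SissoExperiment variant, reusing the Config-based dataset tuple from the surrounding examples (an assumption here):

experiment = SissoExperiment((Config.get('statlog_heart.csv'),
                              int(Config.get('statlog_heart.target'))))
experiment.run()  # runs Deep Feature Synthesis on the identity representation of the raw features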
Example 7
class SissoExperiment:
    def __init__(
        self,
        dataset_config,
        classifier=LogisticRegression(),
        grid_search_parameters={
            'classifier__penalty': ['l2'],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'classifier__solver': ['lbfgs']
        }):
        self.dataset_config = dataset_config
        self.classifier = classifier
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):

        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)

        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1],
                              s)
        raw_features = self.dataset.read()

        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))

    def generate_target(self):
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self,
                 candidate,
                 score=make_scorer(roc_auc_score, average='micro'),
                 folds=10):
        parameters = self.grid_search_parameters
        '''
        if not isinstance(candidate, CandidateFeature):
            pipeline = Pipeline([('features',FeatureUnion(

                        [(p.get_name(), p.pipeline) for p in candidate]
                    )),
                ('classifier', self.classifier)
            ])
        else:
            pipeline = Pipeline([('features', FeatureUnion(
                [
                    (candidate.get_name(), candidate.pipeline)
                ])),
                 ('classifier', self.classifier)
                 ])
        '''

        result = {}
        '''
        clf = GridSearchCV(pipeline, parameters, cv=self.preprocessed_folds, scoring=score, iid=False, error_score='raise')
        clf.fit(self.dataset.splitted_values['train'], self.current_target)
        result['score'] = clf.best_score_
        result['hyperparameters'] = clf.best_params_
        '''

        feateng_cols = [
            'age', 'sex', 'chest', 'resting_blood_pressure',
            'serum_cholestoral', 'fasting_blood_sugar',
            'resting_electrocardiographic_results',
            'maximum_heart_rate_achieved', 'exercise_induced_angina',
            'oldpeak', 'slope', 'number_of_major_vessels', 'thal'
        ]

        print(self.current_target)

        afreg = AutoFeatRegression(n_jobs=4, feateng_cols=feateng_cols)
        #df = afreg.fit_transform(pd.DataFrame(data=self.dataset.splitted_values['train'], columns=feateng_cols), self.current_target)

        np.save('/tmp/X', self.dataset.splitted_values['train'])
        np.save('/tmp/y', self.current_target)
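        # np.save appends '.npy', so the arrays end up at /tmp/X.npy and /tmp/y.npy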

        return result

    '''
    def evaluate_candidates(self, candidates):
        self.preprocessed_folds = []
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))

        pool = mp.Pool(processes=int(Config.get("parallelism")))
        results = pool.map(self.evaluate_single_candidate, candidates)
        return results

    '''

    def evaluate_candidates(self, candidates):
        self.preprocessed_folds = []
        for train, test in StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(
                self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))

        results = []
        for c in candidates:
            results.append(self.evaluate_single_candidate(c))
        return results

    '''
    def evaluate_single_candidate(self, candidate):
        result = {}
        time_start_gs = time.time()
        try:
            result = self.evaluate(candidate)
            #print("feature: " + str(candidate) + " -> " + str(new_score))
        except Exception as e:
            print(str(candidate) + " -> " + str(e))
            result['score'] = -1.0
            result['hyperparameters'] = {}
            pass
        result['candidate'] = candidate
        result['time'] = time.time() - time_start_gs
        return result


    '''

    def evaluate_single_candidate(self, candidate):
        new_score = -1.0
        new_score = self.evaluate(candidate)
        return new_score

    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        print([r.name for r in self.dataset.raw_features])

        plain_attributes = CandidateFeature(
            IdentityTransformation(len(self.dataset.raw_features)),
            self.dataset.raw_features)

        self.evaluate_candidates([plain_attributes])
Example 8
class ExploreKitSelection_iterative_search:
    def __init__(
        self,
        dataset_config,
        classifier=LogisticRegression(),
        grid_search_parameters={
            'classifier__penalty': ['l2'],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'classifier__solver': ['lbfgs'],
            'classifier__class_weight': ['balanced'],
            'classifier__max_iter': [10000]
        }):
        self.dataset_config = dataset_config
        self.classifier = classifier
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):

        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)

        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1],
                              s)
        raw_features = self.dataset.read()

        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))

    #rank and select features
    def random_select(self, k: int):
        arr = np.arange(len(self.candidates))
        np.random.shuffle(arr)
        return arr[0:k]

    def generate_target(self):
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self,
                 candidate,
                 score=make_scorer(roc_auc_score, average='micro'),
                 folds=10):
        parameters = self.grid_search_parameters

        if not isinstance(candidate, CandidateFeature):
            pipeline = Pipeline([('features',
                                  FeatureUnion([(p.get_name(), p.pipeline)
                                                for p in candidate])),
                                 ('classifier', self.classifier)])
        else:
            pipeline = Pipeline([('features',
                                  FeatureUnion([(candidate.get_name(),
                                                 candidate.pipeline)])),
                                 ('classifier', self.classifier)])

        result = {}

        clf = GridSearchCV(pipeline,
                           parameters,
                           cv=self.preprocessed_folds,
                           scoring=score,
                           iid=False,
                           error_score='raise')
        clf.fit(self.dataset.splitted_values['train'], self.current_target)
        result['score'] = clf.best_score_
        result['hyperparameters'] = clf.best_params_

        return result

    def create_starting_features(self):
        Fi: List[RawFeature] = self.dataset.raw_features

        #materialize the features and convert them to a NumPy matrix
        starting_feature_matrix = np.zeros(
            (Fi[0].materialize()['train'].shape[0], len(Fi)))
        for f_index in range(len(Fi)):
            starting_feature_matrix[:, f_index] = Fi[f_index].materialize(
            )['train']
        return starting_feature_matrix

    def evaluate_candidates(self, candidates):
        self.preprocessed_folds = []
        for train, test in StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(
                self.dataset.splitted_values['train'], self.current_target):
            self.preprocessed_folds.append((train, test))

        pool = mp.Pool(processes=int(Config.get("parallelism")))
        results = pool.map(self.evaluate_single_candidate, candidates)
        return results

    '''
    def evaluate_candidates(self, candidates):
        self.preprocessed_folds = []
        for train, test in StratifiedKFold(n_splits=10, random_state=42).split(self.dataset.splitted_values['train'],
                                                                               self.current_target):
            self.preprocessed_folds.append((train, test))

        results = []
        for c in candidates:
            results.append(self.evaluate_single_candidate(c))
        return results

    '''

    def evaluate_single_candidate(self, candidate):
        result = {}
        time_start_gs = time.time()
        try:
            result = self.evaluate(candidate)
            #print("feature: " + str(candidate) + " -> " + str(new_score))
        except Exception as e:
            print(str(candidate) + " -> " + str(e))
            result['score'] = -1.0
            result['hyperparameters'] = {}
            pass
        result['candidate'] = candidate
        result['time'] = time.time() - time_start_gs
        return result

    '''
    def evaluate_single_candidate(self, candidate):
        new_score = -1.0
        new_score = self.evaluate(candidate)
        return new_score
    '''

    #https://stackoverflow.com/questions/10035752/elegant-python-code-for-integer-partitioning
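    #e.g. partition(4) -> {(4,), (1, 3), (2, 2), (1, 1, 2), (1, 1, 1, 1)}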
    def partition(self, number):
        answer = set()
        answer.add((number, ))
        for x in range(1, number):
            for y in self.partition(number - x):
                answer.add(tuple(sorted((x, ) + y)))
        return answer

    def get_all_features_below_n_cost(self, cost):
        filtered_candidates = []
        for i in range(len(self.candidates)):
            if (self.candidates[i].get_number_of_transformations() +
                    1) <= cost:
                filtered_candidates.append(self.candidates[i])
        return filtered_candidates

    def get_all_features_equal_n_cost(self, cost):
        filtered_candidates = []
        for i in range(len(self.candidates)):
            if (self.candidates[i].get_number_of_transformations() +
                    1) == cost:
                filtered_candidates.append(self.candidates[i])
        return filtered_candidates

    def get_all_possible_representations_for_step_x(self, x):

        all_representations = set()
        partitions = self.partition(x)

        #get candidates of partitions
        candidates_with_cost_x = {}
        for i in range(x + 1):
            candidates_with_cost_x[i] = self.get_all_features_equal_n_cost(i)

        for p in partitions:
            current_list = itertools.product(
                *[candidates_with_cost_x[pi] for pi in p])
            for c_output in current_list:
                if len(set(c_output)) == len(p):
                    all_representations.add(frozenset(c_output))

        return all_representations

    def filter_failing_features(self):
        working_features: List[CandidateFeature] = []
        for candidate in self.candidates:
            try:
                candidate.fit(self.dataset.splitted_values['train'])
                candidate.transform(self.dataset.splitted_values['train'])
            except:
                continue
            working_features.append(candidate)
        return working_features

    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        working_features = self.filter_failing_features()

        all_f = CandidateFeature(IdentityTransformation(len(working_features)),
                                 working_features)

        selection = CandidateFeature(
            FeatureSelectionTransformation(
                1, 2,
                LogisticRegression(penalty='l2',
                                   solver='lbfgs',
                                   class_weight='balanced',
                                   max_iter=10000)), [all_f])

        results = self.evaluate_candidates([selection])

        new_scores = [r['score'] for r in results]
        best_id = np.argmax(new_scores)

        print(results[best_id])
Example 9
class EvaluationFramework:
    def __init__(
            self,
            dataset_config,
            classifier=LogisticRegression,
            grid_search_parameters={
                'classifier__penalty': ['l2'],
                'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'classifier__solver': ['lbfgs'],
                'classifier__class_weight': ['balanced'],
                'classifier__max_iter': [10000],
                'classifier__multi_class': ['auto']
            },
            transformation_producer=get_transformation_for_feature_space):
        self.dataset_config = dataset_config
        self.classifier = classifier
        self.grid_search_parameters = grid_search_parameters
        self.transformation_producer = transformation_producer

    #generate all possible combinations of features
    def generate(self, seed=42):
        if self.reader is None:
            s = None
            if isinstance(self.classifier(), ClassifierMixin):
                s = Splitter(train_fraction=[0.6, 10000000],
                             valid_fraction=0.0,
                             test_fraction=0.4,
                             seed=seed)
            elif isinstance(self.classifier(), RegressorMixin):
                s = RandomSplitter(train_fraction=[0.6, 10000000],
                                   valid_fraction=0.0,
                                   test_fraction=0.4,
                                   seed=seed)
            else:
                pass

            self.dataset = Reader(self.dataset_config[0],
                                  self.dataset_config[1], s)
        else:
            self.dataset = self.reader
        self.raw_features = self.dataset.read()

        print("training:" + str(len(self.dataset.splitted_target['train'])))
        print("test:" + str(len(self.dataset.splitted_target['test'])))

        if Config.get_default('instance.selection', 'False') == 'True':
            self.train_X_all = copy.deepcopy(
                self.dataset.splitted_values['train'])
            self.train_y_all = copy.deepcopy(
                self.dataset.splitted_target['train'])

            #self.dataset.splitted_values['train'], self.dataset.splitted_target['train'] = sample_data_by_cnn(self.dataset.splitted_values['train'], self.dataset.splitted_target['train'])
            print("training:" +
                  str(len(self.dataset.splitted_target['train'])))
        else:
            self.train_X_all = self.dataset.splitted_values['train']
            self.train_y_all = self.dataset.splitted_target['train']

    #rank and select features
    def random_select(self, k: int):
        arr = np.arange(len(self.candidates))
        np.random.shuffle(arr)
        return arr[0:k]

    def generate_target(self):
        current_target = self.dataset.splitted_target['train']

        if isinstance(self.classifier(), ClassifierMixin):
            label_encoder = LabelEncoder()
            label_encoder.fit(current_target)

            current_target = label_encoder.transform(current_target)

            self.test_target = None
            self.train_y_all_target = None
            if Config.get_default('score.test', 'False') == 'True':
                self.test_target = label_encoder.transform(
                    self.dataset.splitted_target['test'])
                self.train_y_all_target = label_encoder.transform(
                    self.train_y_all)

            self.preprocessed_folds = []
            for train, test in StratifiedKFold(
                    n_splits=self.folds, shuffle=True, random_state=42).split(
                        self.dataset.splitted_values['train'], current_target):
                self.preprocessed_folds.append((train, test))
        elif isinstance(self.classifier(), RegressorMixin):

            if Config.get_default('score.test', 'False') == 'True':
                self.test_target = self.dataset.splitted_target['test']
                self.train_y_all_target = self.train_y_all

            self.preprocessed_folds = []
            for train, test in KFold(n_splits=self.folds, shuffle=True,
                                     random_state=42).split(
                                         self.dataset.splitted_values['train'],
                                         current_target):
                self.preprocessed_folds.append((train, test))
        else:
            pass

        self.target_train_folds = [None] * self.folds
        self.target_test_folds = [None] * self.folds

        for fold in range(len(self.preprocessed_folds)):
            self.target_train_folds[fold] = current_target[
                self.preprocessed_folds[fold][0]]
            self.target_test_folds[fold] = current_target[
                self.preprocessed_folds[fold][1]]

    '''
    def evaluate_candidates(self, candidates: List[CandidateFeature]) -> List[CandidateFeature]:
        pool = mp.Pool(processes=int(Config.get_default("parallelism", mp.cpu_count())))



        my_function = partial(evaluate, classifier=self.classifier,
                              grid_search_parameters=self.grid_search_parameters,
                              preprocessed_folds=self.preprocessed_folds,
                              score=self.score,
                              train_data=self.dataset.splitted_values['train'],
                              current_target=self.current_target,
                              train_X_all=self.train_X_all,
                              train_y_all_target=self.train_y_all_target,
                              test_data=self.dataset.splitted_values['test'],
                              test_target=self.test_target)

        if Config.get_default("show_progess", 'True') == 'True':
            results = []
            for x in tqdm.tqdm(pool.imap_unordered(my_function, candidates), total=len(candidates)):
                results.append(x)
        else:
            results = pool.map(my_function, candidates)


        return results
    '''

    def evaluate_candidates(self, candidates: List[CandidateFeature],
                            my_folds) -> List[CandidateFeature]:
        my_function = partial(
            evaluate,
            classifier=self.classifier,
            grid_search_parameters=self.grid_search_parameters,
            preprocessed_folds=my_folds,
            score=self.score,
            train_data=self.dataset.splitted_values['train'],
            current_target=self.train_y_all_target,
            train_X_all=self.train_X_all,
            train_y_all_target=self.train_y_all_target,
            test_data=self.dataset.splitted_values['test'],
            test_target=self.test_target)

        results = []
        for can in candidates:
            results.append(my_function(can))
        return results

    def evaluate_candidates_detail(self, candidates: List[CandidateFeature],
                                   my_folds,
                                   cv_jobs) -> List[CandidateFeature]:
        my_function = partial(
            evaluate,
            classifier=self.classifier,
            grid_search_parameters=self.grid_search_parameters,
            preprocessed_folds=my_folds,
            score=self.score,
            train_data=self.dataset.splitted_values['train'],
            current_target=self.train_y_all_target,
            train_X_all=self.train_X_all,
            train_y_all_target=self.train_y_all_target,
            test_data=self.dataset.splitted_values['test'],
            test_target=self.test_target,
            cv_jobs=cv_jobs)

        results = []
        for can in candidates:
            results.append(my_function(can))
        return results

    def evaluate_candidates_randomcv(self, candidates: List[CandidateFeature],
                                     my_folds,
                                     cv_jobs) -> List[CandidateFeature]:
        my_function = partial(
            evaluate_randomcv,
            classifier=self.classifier,
            grid_search_parameters=self.grid_search_parameters,
            preprocessed_folds=my_folds,
            score=self.score,
            train_data=self.dataset.splitted_values['train'],
            current_target=self.train_y_all_target,
            train_X_all=self.train_X_all,
            train_y_all_target=self.train_y_all_target,
            test_data=self.dataset.splitted_values['test'],
            test_target=self.test_target,
            cv_jobs=cv_jobs)

        results = []
        for can in candidates:
            results.append(my_function(can))
        return results
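The evaluate_candidates* methods above all bind their shared arguments once via functools.partial and then apply the bound function per candidate; a self-contained toy illustration of that pattern (the names here are illustrative, not the project's evaluate function):

from functools import partial

def toy_evaluate(candidate, classifier=None, folds=None):
    return (candidate, classifier, folds)

bound = partial(toy_evaluate, classifier='LogisticRegression', folds=10)
print(bound('candidate_a'))  # ('candidate_a', 'LogisticRegression', 10)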

    '''
class ExploreKitSelection_iterative_search:
    def __init__(
        self,
        dataset_config,
        classifier=LogisticRegression(),
        grid_search_parameters={
            'classifier__penalty': ['l2'],
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'classifier__solver': ['lbfgs'],
            'classifier__class_weight': ['balanced'],
            'classifier__max_iter': [10000]
        }):
        self.dataset_config = dataset_config
        self.classifier = classifier
        self.grid_search_parameters = grid_search_parameters

    #generate all possible combinations of features
    def generate(self):

        s = Splitter(train_fraction=[0.6, 10000000], seed=42)
        #s = Splitter(train_fraction=[0.1, 10000000], seed=42)

        self.dataset = Reader(self.dataset_config[0], self.dataset_config[1],
                              s)
        raw_features = self.dataset.read()

        g = Generator(raw_features)
        self.candidates = g.generate_all_candidates()
        print("Number candidates: " + str(len(self.candidates)))

    #rank and select features
    def random_select(self, k: int):
        arr = np.arange(len(self.candidates))
        np.random.shuffle(arr)
        return arr[0:k]

    def generate_target(self):
        current_target = self.dataset.splitted_target['train']
        self.current_target = LabelEncoder().fit_transform(current_target)

    def evaluate(self,
                 candidate,
                 score=make_scorer(roc_auc_score, average='micro'),
                 folds=10):

        pipeline = Pipeline([('feature',
                              FeatureUnion([(candidate.get_name(),
                                             candidate.pipeline)])),
                             ('classifier',
                              LogisticRegression(penalty='l2',
                                                 solver='lbfgs',
                                                 class_weight='balanced'))])

        result = {}

        pipeline.fit(self.dataset.splitted_values['train'][self.train],
                     self.current_target[self.train])
        result['probability_estimations_test'] = pipeline.predict_proba(
            self.dataset.splitted_values['train'][self.test])

        return result

    def create_starting_features(self):
        Fi: List[RawFeature] = self.dataset.raw_features

        #materialize the features and convert them to a NumPy matrix
        starting_feature_matrix = np.zeros(
            (Fi[0].materialize()['train'].shape[0], len(Fi)))
        for f_index in range(len(Fi)):
            starting_feature_matrix[:, f_index] = Fi[f_index].materialize(
            )['train']
        return starting_feature_matrix

    def evaluate_candidates(self, candidates):
        pool = mp.Pool(processes=int(Config.get("parallelism")))
        results = pool.map(self.evaluate_single_candidate, candidates)
        return results

    def evaluate_single_candidate(self, candidate):
        result = {}
        time_start_gs = time.time()
        try:
            result = self.evaluate(candidate)
            #print("feature: " + str(candidate) + " -> " + str(new_score))
        except Exception as e:
            print(str(candidate) + " -> " + str(e))
            result['score'] = -1.0
            result['hyperparameters'] = {}
            pass
        result['candidate'] = candidate
        result['time'] = time.time() - time_start_gs
        return result

    def filter_failing_features(self):
        working_features: List[CandidateFeature] = []
        for candidate in self.candidates:
            try:
                candidate.fit(self.dataset.splitted_values['train'])
                candidate.transform(self.dataset.splitted_values['train'])
            except:
                continue
            working_features.append(candidate)
        return working_features

    def run(self):
        # generate all candidates
        self.generate()
        #starting_feature_matrix = self.create_starting_features()
        self.generate_target()

        stratifier = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

        self.train, self.test = next(
            stratifier.split(self.dataset.splitted_values['train'],
                             self.current_target))
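        # only the first of the two folds is used, i.e. a single stratified 50/50 train/test split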

        results = self.evaluate_candidates(self.candidates)

        return results
Example 11
        print(len(all_representations))

        return all_representations


if __name__ == '__main__':
    from fastsklearnfeature.splitting.Splitter import Splitter
    import time

    s = Splitter(train_fraction=[0.6, 10000000])

    dataset = (Config.get('statlog_heart.csv'),
               int(Config.get('statlog_heart.target')))
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_27_colic_horse.csv", 22)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpAmSP4g_cancer.csv", 30)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpOJxGL9_indianliver.csv", 10)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_29_credit-a_credit.csv", 15)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_37_diabetes_diabetes.csv", 8)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv", 20)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv", 9)
    #dataset = ("/home/felix/datasets/ExploreKit/csv/phpn1jVwe_mammography.csv", 6)

    r = Reader(dataset[0], dataset[1], s)
    raw_features = r.read()

    g = TreeGenerator(raw_features)

    start_time = time.time()

    g.generate_candidates()