Example #1
def f_clf1(hps):
    # Assembling pipeline
    #weights = [hps['acc_w'], hps['fair_w'], hps['rob_w']]
    #weights = [0.0, 1.0, 0.0]
    #rankings = [accuracy_ranking, fairness_ranking, robustness_ranking]

    #weights = [hps['var_w']]
    #rankings = [variance_ranking]
    #weights = [hps['acc_w']]
    #rankings = [accuracy_ranking]
    #weights = [hps['fair_w']]
    #rankings = [fairness_ranking]

    weights = [hps['acc_w'], hps['fair_w'], hps['var_w']]
    rankings = [accuracy_ranking, fairness_ranking, variance_ranking]

    mask = np.zeros(len(hps) - 5, dtype=bool)
    for k, v in hps.items():
        if k.startswith('f_'):
            mask[int(k.split('_')[1])] = v

    clf = LogisticRegression()
    if privacy_epsilon is not None:
        clf = models.LogisticRegression(epsilon=privacy_epsilon)

    model = Pipeline([('selection',
                       WeightedRankingSelection(scores=rankings,
                                                weights=weights,
                                                k=hps['k'] + 1,
                                                names=np.array(names),
                                                hyperparameter_mask=mask)),
                      ('clf', clf)])

    return model
Example #2
def f_clf1(hps):
    # Assembling pipeline
    #weights = [hps['acc_w'], hps['fair_w'], hps['rob_w']]
    #weights = [0.0, 1.0, 0.0]
    #rankings = [accuracy_ranking, fairness_ranking, robustness_ranking]

    #weights = [hps['var_w']]
    #rankings = [variance_ranking]
    #weights = [hps['acc_w']]
    #rankings = [accuracy_ranking]
    #weights = [hps['fair_w']]
    #rankings = [fairness_ranking]

    weights = [hps['acc_w'], hps['fair_w'], hps['var_w']]
    rankings = [accuracy_ranking, fairness_ranking, variance_ranking]
    #weights = [hps['acc_w'], hps['fair_w'], hps['var_w'], hps['rob_w']]
    #rankings = [accuracy_ranking, fairness_ranking, variance_ranking, robustness_ranking]

    clf = LogisticRegression()
    if privacy_epsilon is not None:
        clf = models.LogisticRegression(epsilon=privacy_epsilon)

    model = Pipeline([('selection',
                       WeightedRankingSelection(scores=rankings,
                                                weights=weights,
                                                k=hps['k'] + 1,
                                                names=np.array(names))),
                      ('clf', clf)])

    return model
Example #3
def get_model(c):
    #return ('clf', LogisticRegression(class_weight='balanced', C=c))
    #return ('clf', RandomForestClassifier(class_weight='balanced', n_estimators=c))
    return ('clf',
            models.LogisticRegression(epsilon=0.00001,
                                      class_weight='balanced',
                                      C=c))
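The ('clf', ...) tuple returned by get_model is shaped like a scikit-learn Pipeline step. A minimal usage sketch under that assumption; the scaler step and the toy data below are illustrative and not part of the original example:
# Minimal usage sketch (assumes get_model and its imports from the example above).
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X = np.random.rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)

pipeline = Pipeline([('scaler', StandardScaler()), get_model(c=1.0)])
pipeline.fit(X, y)
print(pipeline.score(X, y))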
Example #4
def f_clf1(hps):
    # Assembling pipeline
    weights = [hps['acc_w'], hps['fair_w'], hps['rob_w']]
    #weights = [0.0, 1.0, 0.0]
    rankings = [accuracy_ranking, fairness_ranking, robustness_ranking]

    clf = LogisticRegression()
    if privacy_epsilon is not None:
        clf = models.LogisticRegression(
            epsilon=privacy_epsilon
        )  # The smaller the value, the stronger the privacy protection

    model = Pipeline([('selection',
                       WeightedRankingSelection(scores=rankings,
                                                weights=weights,
                                                k=hps['k'] + 1,
                                                names=np.array(names))),
                      ('clf', clf)])

    return model
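Examples #1-#4 swap sklearn's LogisticRegression for models.LogisticRegression whenever a privacy budget is given. A standalone sketch of just that classifier, assuming `models` refers to diffprivlib.models; the toy data is illustrative:
# Standalone sketch of the privacy-aware classifier used above (assumes diffprivlib).
import numpy as np
from diffprivlib import models

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

# Smaller epsilon = stronger privacy guarantee = noisier model.
# data_norm bounds the feature norm so no extra budget is spent estimating it.
clf = models.LogisticRegression(epsilon=1.0, data_norm=np.sqrt(5))
clf.fit(X, y)
print(clf.score(X, y))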
Example #5
        min_robustness = 0.0
        if most_uncertain_f['robustness_choice'][0]:
            min_robustness = most_uncertain_f['robustness_specified'][0]
        max_number_features = 1.0
        if most_uncertain_f['k_choice'][0]:
            max_number_features = most_uncertain_f['k_specified'][0]

        max_search_time = most_uncertain_f['search_time_specified'][0]

        # Execute each search strategy with a given time limit (in parallel)
        # maybe run multiple times to smooth stochasticity

        model = LogisticRegression(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = models.LogisticRegression(
                epsilon=most_uncertain_f['privacy_specified'][0],
                class_weight='balanced')
        mp_global.clf = model

        #define rankings
        rankings = [
            variance, chi2_score_wo, f_anova_wo, fcbf, my_fisher_score,
            mutual_info_classif, my_mcfs
        ]
        #rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
        #rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
        #rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
        rankings.append(partial(model_score,
                                estimator=ReliefF(n_neighbors=10)))  # relieff

        mp_global.min_accuracy = min_accuracy
Example #6
    def query(self,
              X_train,
              X_validation,
              X_test,
              y_train,
              y_validation,
              y_test,
              classifier=LogisticRegression(class_weight='balanced'),
              min_accuracy=0.5,
              sensitive_ids=None,
              min_fairness=0.0,
              min_safety=0.0,
              min_privacy=None,
              max_complexity=1.0,
              max_search_time=np.inf,
              feature_names=None):

        if isinstance(max_complexity, int):
            max_complexity = max_complexity / float(X_train.shape[1])

        X_train_val = np.vstack((X_train, X_validation))
        y_train_val = np.append(y_train, y_validation)

        self.feature_names = feature_names

        if min_privacy is not None:
            classifier = models.LogisticRegression(epsilon=min_privacy,
                                                   class_weight='balanced')

        self.stored_results_file = '/tmp/experiment' + str(
            time.time()) + '.pickle'

        mp_global.X_train = X_train
        mp_global.X_validation = X_validation
        mp_global.X_train_val = X_train_val
        mp_global.X_test = X_test
        mp_global.y_train = y_train
        mp_global.y_validation = y_validation
        mp_global.y_train_val = y_train_val
        mp_global.y_test = y_test
        mp_global.names = feature_names
        mp_global.sensitive_ids = sensitive_ids

        mp_global.min_accuracy = min_accuracy
        mp_global.min_fairness = min_fairness
        mp_global.min_robustness = min_safety
        mp_global.max_number_features = max_complexity
        mp_global.max_search_time = max_search_time
        mp_global.clf = classifier
        mp_global.log_file = self.stored_results_file

        configuration = {}
        configuration['ranking_functions'] = copy.deepcopy(
            self.ranking_functions)
        configuration['run_id'] = 0
        configuration['main_strategy'] = copy.deepcopy(self.selection_function)

        mp_global.configurations = [configuration]

        with ProcessPool(max_workers=1) as pool:
            future = pool.map(my_function,
                              range(len(mp_global.configurations)),
                              timeout=max_search_time)

            iterator = future.result()
            while True:
                try:
                    result = next(iterator)
                except StopIteration:
                    break
                except TimeoutError as error:
                    print("function took longer than %d seconds" %
                          error.args[1])
                except ProcessExpired as error:
                    print("%s. Exit code: %d" % (error, error.exitcode))
                except Exception as error:
                    print("function raised %s" % error)
                    #print(error.traceback)  # Python's traceback of remote process

        return self.get_satisfying_features()
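query() above drives the search through pebble's ProcessPool so that a strategy can be cut off after max_search_time. Below is a self-contained sketch of that timeout pattern; the worker function and its arguments are stand-ins, not part of the original code:
# Self-contained sketch of the pebble timeout pattern used in query() above.
import time
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired

def slow_worker(seconds):
    time.sleep(seconds)
    return seconds

if __name__ == '__main__':
    with ProcessPool(max_workers=2) as pool:
        future = pool.map(slow_worker, [1, 5], timeout=3)
        iterator = future.result()
        while True:
            try:
                print('finished after', next(iterator), 'seconds')
            except StopIteration:
                break
            except TimeoutError as error:
                print("function took longer than %d seconds" % error.args[1])
            except ProcessExpired as error:
                print("%s. Exit code: %d" % (error, error.exitcode))
            except Exception as error:
                print("function raised %s" % error)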
Example #7
results_heatmap = {}
for min_accuracy in np.arange(l_acc, u_acc, (u_acc - l_acc) / 10.0):
	for max_number_features in np.arange(start_features, 1.0 + (1.0 - start_features) / 10.0, (1.0 - start_features) / 10.0):
		i += 1

		min_robustness = 0.0
		max_search_time = 20 * 60
		privacy = None
		min_fairness = 0.0

		# Execute each search strategy with a given time limit (in parallel)
		# maybe run multiple times to smooth stochasticity

		model = LogisticRegression()
		if type(privacy) != type(None):
			model = models.LogisticRegression(epsilon=privacy)
		mp_global.clf = model

		#define rankings
		rankings = [variance,
					chi2_score_wo,
					fcbf,
					my_fisher_score,
					mutual_info_classif,
					my_mcfs]
		#rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
		#rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
		#rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
		rankings.append(partial(model_score, estimator=ReliefF(n_neighbors=10)))  # relieff

		mp_global.min_accuracy = min_accuracy
Example #8
def run_strategy(strategy_method, ranking_id, strategy_id):
    data_infos = pickle.load(
        open(
            Config.get('data_path') + '/openml_data/fitting_datasets.pickle',
            'rb'))

    time_limit = 60 * 20

    meta_classifier = RandomForestRegressor(n_estimators=1000)
    X_train_meta_classifier = []
    y_train_meta_classifier = []

    cv_splitter = StratifiedKFold(5, random_state=42)
    auc_scorer = make_scorer(roc_auc_score,
                             greater_is_better=True,
                             needs_threshold=True)

    acc_value_list = []
    fair_value_list = []
    robust_value_list = []
    success_value_list = []
    runtime_value_list = []
    dataset_did_list = []
    dataset_sensitive_attribute_list = []

    while True:
        X_train, X_test, y_train, y_test, names, sensitive_ids, data_did, sensitive_attribute_id = get_data_openml(
            data_infos)

        #run on tiny sample
        X_train_tiny, _, y_train_tiny, _ = train_test_split(X_train,
                                                            y_train,
                                                            train_size=100,
                                                            random_state=42,
                                                            stratify=y_train)

        fair_train_tiny = make_scorer(
            true_positive_rate_score,
            greater_is_better=True,
            sensitive_data=X_train_tiny[:, sensitive_ids[0]])

        def objective(hps):
            print(hps)

            try:

                cv_k = 1.0
                cv_privacy = hps['privacy']
                model = LogisticRegression()
                if cv_privacy is None:
                    cv_privacy = X_train_tiny.shape[0]
                else:
                    model = models.LogisticRegression(epsilon=cv_privacy)

                robust_scorer = make_scorer(robust_score,
                                            greater_is_better=True,
                                            X=X_train_tiny,
                                            y=y_train_tiny,
                                            model=model,
                                            feature_selector=None,
                                            scorer=auc_scorer)

                cv = GridSearchCV(model,
                                  param_grid={'C': [1.0]},
                                  scoring={
                                      'AUC': auc_scorer,
                                      'Fairness': fair_train_tiny,
                                      'Robustness': robust_scorer
                                  },
                                  refit=False,
                                  cv=cv_splitter)
                cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
                cv_acc = cv.cv_results_['mean_test_AUC'][0]
                cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
                cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

                small_start_time = time.time()

                cv = GridSearchCV(model,
                                  param_grid={'C': [1.0]},
                                  scoring={
                                      'AUC': auc_scorer,
                                      'Fairness': fair_train_tiny,
                                      'Robustness': robust_scorer
                                  },
                                  refit=False,
                                  cv=cv_splitter)
                cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
                cv_acc = cv.cv_results_['mean_test_AUC'][0]
                cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
                cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

                # construct feature vector
                feature_list = []
                # user-specified constraints
                feature_list.append(hps['accuracy'])
                feature_list.append(hps['fairness'])
                feature_list.append(hps['k'])
                feature_list.append(hps['k'] * X_train.shape[1])
                feature_list.append(hps['robustness'])
                feature_list.append(cv_privacy)
                feature_list.append(hps['search_time'])
                # differences to sample performance
                feature_list.append(cv_acc - hps['accuracy'])
                feature_list.append(cv_fair - hps['fairness'])
                feature_list.append(cv_k - hps['k'])
                feature_list.append((cv_k - hps['k']) * X_train.shape[1])
                feature_list.append(cv_robust - hps['robustness'])
                feature_list.append(time.time() - small_start_time)
                # privacy constraint is always satisfied => difference always zero => constant => unnecessary

                # metadata features
                feature_list.append(X_train.shape[0])  # number rows
                feature_list.append(X_train.shape[1])  # number columns

                features = np.array(feature_list)

                #predict the best model and calculate uncertainty

                loss = 0
                if hasattr(meta_classifier, 'estimators_'):
                    predictions = []
                    for tree in range(len(meta_classifier.estimators_)):
                        predictions.append(
                            meta_classifier.estimators_[tree].predict(
                                [features])[0])

                    stddev = np.std(np.array(predictions), axis=0)
                    print('stddev: ' + str(stddev))

                    loss = (stddev**2) * -1

                return {
                    'loss': loss,
                    'status': STATUS_OK,
                    'features': features
                }
            except:
                return {'loss': np.inf, 'status': STATUS_OK}

        space = {
            'k':
            hp.choice('k_choice', [(1.0), (hp.uniform('k_specified', 0, 1))]),
            'accuracy':
            hp.uniform('accuracy_specified', 0.5, 1),
            'fairness':
            hp.choice('fairness_choice',
                      [(0.0), (hp.uniform('fairness_specified', 0, 1))]),
            'privacy':
            hp.choice('privacy_choice',
                      [(None), (hp.lognormal('privacy_specified', 0, 1))]),
            'robustness':
            hp.choice('robustness_choice',
                      [(0.0), (hp.uniform('robustness_specified', 0, 1))]),
            'search_time':
            hp.uniform('search_time_specified', 10, time_limit),  # in seconds
        }

        trials = Trials()
        runs_per_dataset = 0
        i = 1
        while True:
            fmin(objective,
                 space=space,
                 algo=tpe.suggest,
                 max_evals=i,
                 trials=trials)
            i += 1

            if trials.trials[-1]['result']['loss'] == np.inf:
                break

            # break once the convergence tolerance is reached, and generate a new dataset
            if trials.trials[-1]['result']['loss'] == 0 or i % 20 == 0:
                best_trial = trials.trials[-1]
                if i % 20 == 0:
                    best_trial = trials.best_trial
                most_uncertain_f = best_trial['misc']['vals']
                #print(most_uncertain_f)

                min_accuracy = most_uncertain_f['accuracy_specified'][0]
                min_fairness = 0.0
                if most_uncertain_f['fairness_choice'][0]:
                    min_fairness = most_uncertain_f['fairness_specified'][0]
                min_robustness = 0.0
                if most_uncertain_f['robustness_choice'][0]:
                    min_robustness = most_uncertain_f['robustness_specified'][
                        0]
                max_number_features = X_train.shape[1]
                if most_uncertain_f['k_choice'][0]:
                    max_number_features = most_uncertain_f['k_specified'][0]

                max_search_time = most_uncertain_f['search_time_specified'][0]

                # Execute each search strategy with a given time limit (in parallel)
                # maybe run multiple times to smooth stochasticity

                model = LogisticRegression()
                if most_uncertain_f['privacy_choice'][0]:
                    model = models.LogisticRegression(
                        epsilon=most_uncertain_f['privacy_specified'][0])

                rankings = [variance, chi2_score_wo]  # simple rankings
                rankings.append(
                    partial(model_score,
                            estimator=ExtraTreesClassifier(
                                n_estimators=1000)))  # accuracy ranking
                rankings.append(
                    partial(robustness_score, model=model,
                            scorer=auc_scorer))  # robustness ranking
                rankings.append(
                    partial(fairness_score,
                            estimator=ExtraTreesClassifier(n_estimators=1000),
                            sensitive_ids=sensitive_ids))  # fairness ranking

                selected_rankings = rankings
                if ranking_id is not None:
                    selected_rankings = [rankings[ranking_id]]

                result = strategy_method(
                    X_train,
                    X_test,
                    y_train,
                    y_test,
                    names,
                    sensitive_ids,
                    ranking_functions=selected_rankings,
                    clf=model,
                    min_accuracy=min_accuracy,
                    min_fairness=min_fairness,
                    min_robustness=min_robustness,
                    max_number_features=max_number_features,
                    max_search_time=max_search_time,
                    cv_splitter=cv_splitter)

                # append ml data
                X_train_meta_classifier.append(
                    best_trial['result']['features'])
                y_train_meta_classifier.append(result['time'])

                try:
                    meta_classifier.fit(np.array(X_train_meta_classifier),
                                        y_train_meta_classifier)
                except:
                    pass

                #pickle everything and store it
                one_big_object = {}
                one_big_object['features'] = X_train_meta_classifier
                #one_big_object['best_strategy'] = y_train_meta_classifier

                runtime_value_list.append(result['time'])
                acc_value_list.append(result['cv_acc'])
                fair_value_list.append(result['cv_fair'])
                robust_value_list.append(result['cv_robust'])
                success_value_list.append(result['success'])

                dataset_did_list.append(data_did)
                dataset_sensitive_attribute_list.append(sensitive_attribute_id)

                one_big_object['times_value'] = runtime_value_list
                one_big_object['acc_value'] = acc_value_list
                one_big_object['fair_value'] = fair_value_list
                one_big_object['robust_value'] = robust_value_list
                one_big_object['success_value'] = success_value_list
                one_big_object['dataset_id'] = dataset_did_list
                one_big_object[
                    'sensitive_attribute_id'] = dataset_sensitive_attribute_list

                pickle.dump(
                    one_big_object,
                    open(
                        '/tmp/metalearning_data' + str(strategy_id) +
                        '.pickle', 'wb'))

                trials = Trials()
                i = 1
                runs_per_dataset += 1
                break
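Example #8 samples constraint vectors with hyperopt: each optional constraint is an hp.choice between "not specified" and a sampled value, and fmin with TPE drives the search. A minimal, self-contained sketch of that pattern; the toy objective and its loss are stand-ins, not the original one:
# Minimal sketch of the hyperopt pattern used above (optional constraint via hp.choice,
# TPE search, Trials bookkeeping). The toy objective is a stand-in.
import numpy as np
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

space = {
    'accuracy': hp.uniform('accuracy_specified', 0.5, 1),
    'privacy': hp.choice('privacy_choice',
                         [None, hp.lognormal('privacy_specified', 0, 1)]),
}

def toy_objective(hps):
    # stand-in loss: prefer strict accuracy targets under small privacy budgets
    eps = hps['privacy'] if hps['privacy'] is not None else np.inf
    loss = -hps['accuracy'] / (1.0 + min(eps, 10.0))
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()
best = fmin(toy_objective, space=space, algo=tpe.suggest,
            max_evals=20, trials=trials)
print(best)
print(trials.best_trial['result']['loss'])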
Example #9
    for fname_i in range(len(all_names)):
        if all_names[fname_i].startswith('onehot__x' +
                                         str(cat_sensitive_attribute_id) +
                                         '_'):
            sensitive_ids.append(fname_i)

    print(sensitive_ids)

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    cv_splitter = StratifiedKFold(5, random_state=42)

    model = models.LogisticRegression(epsilon=10)

    evolution(X_train,
              X_test,
              y_train,
              y_test,
              names,
              sensitive_ids,
              ranking_functions=[],
              clf=model,
              min_accuracy=1.0,
              min_fairness=0.0,
              min_robustness=0.0,
              max_number_features=1.0,
              cv_splitter=cv_splitter)
Example #10
def uncertainty_sampling(training_dataset_ids, all_current_models):
    time_limit = 30 * 60  #60 * 60 * 3

    training_dataset_ids = training_dataset_ids.tolist()

    if '1240' in training_dataset_ids:
        training_dataset_ids.remove('1240')
    if '42132' in training_dataset_ids:
        training_dataset_ids.remove('42132')

    def maximize_uncertainty(hps):
        print(hps)

        X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
            dataset_key=hps['data'])
        is_regression = False

        # run on tiny sample
        if X_train.shape[0] > 100:
            if is_regression:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(
                    X_train, y_train, train_size=100, random_state=42)
            else:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(
                    X_train,
                    y_train,
                    train_size=100,
                    random_state=42,
                    stratify=y_train)
        else:
            X_train_tiny = X_train
            y_train_tiny = y_train

        print(X_train.shape)

        if sensitive_ids is not None:
            fair_train_tiny = make_scorer(
                true_positive_rate_score,
                greater_is_better=True,
                sensitive_data=X_train_tiny[:, sensitive_ids[0]])

        mp_global.X_train = X_train
        mp_global.X_validation = X_validation
        mp_global.X_train_val = X_train_val
        mp_global.X_test = X_test
        mp_global.y_train = y_train
        mp_global.y_validation = y_validation
        mp_global.y_train_val = y_train_val
        mp_global.y_test = y_test
        mp_global.names = names
        mp_global.sensitive_ids = sensitive_ids

        mp_global.cv_splitter = StratifiedKFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(f1_score)

        mp_global.avoid_robustness = False

        cv_k = 1.0
        cv_privacy = hps['privacy']

        model = LogisticRegression(class_weight='balanced')
        if cv_privacy is not None:
            model = models.LogisticRegression(epsilon=cv_privacy,
                                              class_weight='balanced')

        if cv_privacy is None:
            cv_privacy = X_train_tiny.shape[0]

        robust_scorer = make_scorer(robust_score,
                                    greater_is_better=True,
                                    X=X_train_tiny,
                                    y=y_train_tiny,
                                    model=model,
                                    feature_selector=None,
                                    scorer=mp_global.accuracy_scorer)

        small_start_time = time.time()

        scoring = {'AUC': mp_global.accuracy_scorer}
        if not mp_global.avoid_robustness:
            scoring['Robustness'] = robust_scorer
        if sensitive_ids is not None:
            scoring['Fairness'] = fair_train_tiny

        cv = GridSearchCV(model,
                          param_grid={},
                          scoring=scoring,
                          refit=False,
                          cv=mp_global.cv_splitter)
        cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
        cv_acc = cv.cv_results_['mean_test_AUC'][0]

        cv_fair = 0.0
        if sensitive_ids is not None:
            cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]

        cv_robust = 0.0
        if not mp_global.avoid_robustness:
            cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

        cv_time = time.time() - small_start_time

        # construct feature vector
        feature_list = []
        # user-specified constraints
        feature_list.append(hps['accuracy'])
        feature_list.append(hps['fairness'])
        feature_list.append(hps['k'])
        feature_list.append(hps['k'] * X_train.shape[1])
        feature_list.append(hps['robustness'])
        feature_list.append(cv_privacy)
        feature_list.append(hps['search_time'])
        # differences to sample performance
        feature_list.append(cv_acc - hps['accuracy'])
        feature_list.append(cv_fair - hps['fairness'])
        feature_list.append(cv_k - hps['k'])
        feature_list.append((cv_k - hps['k']) * X_train.shape[1])
        feature_list.append(cv_robust - hps['robustness'])
        feature_list.append(cv_time)
        # privacy constraint is always satisfied => difference always zero => constant => unnecessary

        # metadata features
        feature_list.append(X_train.shape[0])  # number rows
        feature_list.append(X_train.shape[1])  # number columns

        #models
        feature_list.append(hps['model'] == 'Decision Tree')
        feature_list.append(hps['model'] == 'Gaussian Naive Bayes')
        feature_list.append(hps['model'] == 'Logistic Regression')

        features = np.array(feature_list).reshape(1, -1)

        # predict the best model and calculate uncertainty

        print(features)

        #now predict with models
        aggregated_certainty = 0
        print("uncertainty")
        for model_i in range(len(all_current_models)):
            certainty = np.abs(
                all_current_models[model_i].predict_proba(features)[0, 0] -
                0.5)
            aggregated_certainty += certainty

        print('Certainty: ' + str(aggregated_certainty))

        return {
            'loss': aggregated_certainty,
            'status': STATUS_OK,
            'features': features,
            'search_time': hps['search_time'],
            'constraints': hps
        }

    space = {
        'data':
        hp.choice('data_choice', training_dataset_ids),
        'model':
        hp.choice(
            'model_choice',
            [
                'Logistic Regression',
                'Gaussian Naive Bayes',
                'Decision Tree'  # , 'Random Forest'
            ]),
        'k':
        hp.choice('k_choice', [(1.0), (hp.uniform('k_specified', 0, 1))]),
        'accuracy':
        hp.uniform('accuracy_specified', 0.5, 1),
        'fairness':
        hp.choice('fairness_choice',
                  [(0.0), (hp.uniform('fairness_specified', 0.8, 1))]),
        'privacy':
        hp.choice('privacy_choice',
                  [(None), (hp.lognormal('privacy_specified', 0, 1))]),
        'robustness':
        hp.choice('robustness_choice',
                  [(0.0), (hp.uniform('robustness_specified', 0.8, 1))]),
        'search_time':
        hp.uniform('search_time_specified', 10, time_limit),  # in seconds
    }

    trials = Trials()
    fmin(maximize_uncertainty,
         space=space,
         algo=tpe.suggest,
         max_evals=100,
         trials=trials,
         show_progressbar=True)

    ### now run most uncertain trial

    number_of_runs = 1

    # break once the convergence tolerance is reached, and generate a new dataset
    last_trial = trials.best_trial
    most_uncertain_f = last_trial['misc']['vals']
    # print(most_uncertain_f)

    ## specify data
    run_counter = 0
    current_run_time_id = time.time()
    path = pathlib.Path('/tmp/experiment_uncertainty/run' + str(0))
    path.mkdir(parents=True, exist_ok=True)

    selected_dataset_id = training_dataset_ids[most_uncertain_f['data_choice']
                                               [0]]
    X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
        dataset_key=selected_dataset_id)
    is_regression = False

    mp_global.X_train = X_train
    mp_global.X_validation = X_validation
    mp_global.X_train_val = X_train_val
    mp_global.X_test = X_test
    mp_global.y_train = y_train
    mp_global.y_validation = y_validation
    mp_global.y_train_val = y_train_val
    mp_global.y_test = y_test
    mp_global.names = names
    mp_global.sensitive_ids = sensitive_ids

    if is_regression:
        mp_global.cv_splitter = KFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(r2_score)
    else:
        mp_global.cv_splitter = StratifiedKFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(f1_score)
    mp_global.avoid_robustness = False

    min_accuracy = most_uncertain_f['accuracy_specified'][0]
    min_fairness = 0.0
    if most_uncertain_f['fairness_choice'][0]:
        min_fairness = most_uncertain_f['fairness_specified'][0]
    min_robustness = 0.0
    if most_uncertain_f['robustness_choice'][0]:
        min_robustness = most_uncertain_f['robustness_specified'][0]
    max_number_features = 1.0
    if most_uncertain_f['k_choice'][0]:
        max_number_features = most_uncertain_f['k_specified'][0]

    max_search_time = most_uncertain_f['search_time_specified'][0]

    # Execute each search strategy with a given time limit (in parallel)
    # maybe run multiple times to smooth stochasticity

    model = None
    print(most_uncertain_f)
    if most_uncertain_f['model_choice'][0] == 0:
        model = LogisticRegression(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = models.LogisticRegression(
                epsilon=most_uncertain_f['privacy_specified'][0],
                class_weight='balanced')
    elif most_uncertain_f['model_choice'][0] == 1:
        model = GaussianNB()
        if most_uncertain_f['privacy_choice'][0]:
            model = models.GaussianNB(
                epsilon=most_uncertain_f['privacy_specified'][0])
    elif most_uncertain_f['model_choice'][0] == 2:
        model = DecisionTreeClassifier(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = PrivateRandomForest(
                n_estimators=1,
                epsilon=most_uncertain_f['privacy_specified'][0])
    elif most_uncertain_f['model_choice'][0] == 3:
        model = RandomForestClassifier(n_estimators=100,
                                       class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = PrivateRandomForest(
                n_estimators=100,
                epsilon=most_uncertain_f['privacy_specified'][0])

    print(model)

    mp_global.clf = model
    # define rankings
    rankings = [
        variance, chi2_score_wo, fcbf, my_fisher_score, mutual_info_classif,
        my_mcfs
    ]
    # rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
    # rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
    # rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
    rankings.append(partial(model_score,
                            estimator=ReliefF(n_neighbors=10)))  # relieff

    mp_global.min_accuracy = min_accuracy
    mp_global.min_fairness = min_fairness
    mp_global.min_robustness = min_robustness
    mp_global.max_number_features = max_number_features
    mp_global.max_search_time = max_search_time

    mp_global.configurations = []
    # add single rankings
    strategy_id = 1
    for r in range(len(rankings)):
        for run in range(number_of_runs):
            configuration = {}
            configuration['ranking_functions'] = copy.deepcopy([rankings[r]])
            configuration['run_id'] = copy.deepcopy(run)
            configuration['main_strategy'] = copy.deepcopy(weighted_ranking)
            configuration['strategy_id'] = copy.deepcopy(strategy_id)
            mp_global.configurations.append(configuration)
        strategy_id += 1

    main_strategies = [
        TPE, simulated_annealing, evolution, exhaustive, forward_selection,
        backward_selection, forward_floating_selection,
        backward_floating_selection, recursive_feature_elimination,
        fullfeatures
    ]

    # run main strategies

    for strategy in main_strategies:
        for run in range(number_of_runs):
            configuration = {}
            configuration['ranking_functions'] = []
            configuration['run_id'] = copy.deepcopy(run)
            configuration['main_strategy'] = copy.deepcopy(strategy)
            configuration['strategy_id'] = copy.deepcopy(strategy_id)
            mp_global.configurations.append(configuration)
        strategy_id += 1

    # 6#17
    with ProcessPool(max_workers=17) as pool:
        future = pool.map(my_function,
                          range(len(mp_global.configurations)),
                          timeout=max_search_time)

        iterator = future.result()
        while True:
            try:
                result = next(iterator)
            except StopIteration:
                break
            except TimeoutError as error:
                print("function took longer than %d seconds" % error.args[1])
            except ProcessExpired as error:
                print("%s. Exit code: %d" % (error, error.exitcode))
            except Exception as error:
                print("function raised %s" % error)
                print(error.traceback)  # Python's traceback of remote process

    #check which strategies were successful

    mappnames = {
        1: 'TPE(Variance)',
        2: r'TPE($\chi^2$)',
        3: 'TPE(FCBF)',
        4: 'TPE(Fisher)',
        5: 'TPE(MIM)',
        6: 'TPE(MCFS)',
        7: 'TPE(ReliefF)',
        8: 'TPE(NR)',
        9: 'SA(NR)',
        10: 'NSGA-II(NR)',
        11: 'ES(NR)',
        12: 'SFS(NR)',
        13: 'SBS(NR)',
        14: 'SFFS(NR)',
        15: 'SBFS(NR)',
        16: 'RFE(LR)',
        17: 'Complete Set'
    }

    def load_pickle(fname):
        data = []
        with open(fname, "rb") as f:
            while True:
                try:
                    data.append(pickle.load(f))
                except EOFError:
                    break
        return data

    def is_successfull_validation_and_test(exp_results):
        # constraints were also satisfied on the test set
        return (len(exp_results) > 0 and 'success_test' in exp_results[-1]
                and exp_results[-1]['success_test'] == True)

    def is_successfull_validation(exp_results):
        # constraints were satisfied on the validation set
        return (len(exp_results) > 0
                and 'Validation_Satisfied' in exp_results[-1])

    run_strategies_success_test = {}
    run_strategies_times = {}
    run_strategies_success_validation = {}

    rfolder = '/tmp/experiment_uncertainty/run' + str(0) + '/'

    validation_satisfied_by_any_strategy = False

    min_time = np.inf
    best_strategy = 0
    for s in range(1, len(mappnames) + 1):
        exp_results = []
        try:
            exp_results = load_pickle(rfolder + 'strategy' + str(s) +
                                      '.pickle')
        except:
            pass
        if is_successfull_validation_and_test(exp_results):
            runtime = exp_results[-1]['final_time']
            if runtime < min_time:
                min_time = runtime
                best_strategy = s

            run_strategies_success_test[s] = True
            run_strategies_times[s] = runtime
        else:
            run_strategies_success_test[s] = False

        run_strategies_success_validation[s] = is_successfull_validation(
            exp_results)
        if run_strategies_success_validation[s]:
            validation_satisfied_by_any_strategy = True

    strategy_success = np.zeros((1, len(mappnames)))
    for c_i in range(len(mappnames)):
        strategy_success[0, c_i] = run_strategies_success_test[c_i + 1]

    return last_trial['result']['features'], strategy_success
Example #11
    def objective(hps):
        print(hps)

        cv_k = 1.0
        cv_privacy = hps['privacy']
        model = LogisticRegression()
        if cv_privacy is None:
            cv_privacy = X_train_tiny.shape[0]
        else:
            model = models.LogisticRegression(epsilon=cv_privacy)

        robust_scorer = make_scorer(robust_score,
                                    greater_is_better=True,
                                    X=X_train_tiny,
                                    y=y_train_tiny,
                                    model=model,
                                    feature_selector=None,
                                    scorer=auc_scorer)

        cv = GridSearchCV(model,
                          param_grid={'C': [1.0]},
                          scoring={
                              'AUC': auc_scorer,
                              'Fairness': fair_train_tiny,
                              'Robustness': robust_scorer
                          },
                          refit=False,
                          cv=cv_splitter)
        cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
        cv_acc = cv.cv_results_['mean_test_AUC'][0]
        cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
        cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

        #construct feature vector
        feature_list = []
        #user-specified constraints
        feature_list.append(hps['accuracy'])
        feature_list.append(hps['fairness'])
        feature_list.append(hps['k'])
        feature_list.append(hps['k'] * X_train.shape[1])
        feature_list.append(hps['robustness'])
        feature_list.append(cv_privacy)
        #differences to sample performance
        feature_list.append(cv_acc - hps['accuracy'])
        feature_list.append(cv_fair - hps['fairness'])
        feature_list.append(cv_k - hps['k'])
        feature_list.append((cv_k - hps['k']) * X_train.shape[1])
        feature_list.append(cv_robust - hps['robustness'])
        #privacy constraint is always satisfied => difference always zero => constant => unnecessary

        #metadata features
        feature_list.append(X_train.shape[0])  #number rows
        feature_list.append(X_train.shape[1])  #number columns

        features = np.array(feature_list)

        #predict the best model and calculate uncertainty

        loss = 0
        try:
            proba_predictions = meta_classifier.predict_proba([features])[0]
            proba_predictions = np.sort(proba_predictions)

            print("predictions: " + str(proba_predictions))

            uncertainty = 1 - (proba_predictions[-1] - proba_predictions[-2])
            loss = -1 * uncertainty  # we want to maximize uncertainty
        except:
            pass

        return {'loss': loss, 'status': STATUS_OK, 'features': features}
Example #12
print(model.coef_)  # Print the coefficients for each independent variable.
# It is not obvious which coefficient corresponds to which column,
# so let us print the column names and coefficients together.
# pd.Series is a 1-D labeled array capable of holding any data type.
# The default index would be 0, 1, 2, 3, ..., but we overwrite it with the column names of X (the independent variables).
weights = pd.Series(model.coef_[0], index=X.columns.values)

print("Weights for each variable are as follows...")
print(weights)

# A positive value indicates that the variable has a positive impact.

baseline = model.score(X_test, y_test)

import diffprivlib.models as dp
dp_clf = dp.LogisticRegression()
dp_clf.fit(X_train, y_train)

print("Differentially private test accuracy (epsilon=%.2f): %.2f%%" %
     (dp_clf.epsilon, dp_clf.score(X_test, y_test) * 100))

dp_clf = dp.LogisticRegression(epsilon=float("inf"), data_norm=1e5)
dp_clf.fit(X_train, y_train)

print("Agreement between non-private and differentially private (epsilon=inf) classifiers: %.2f%%" % (dp_clf.score(X_test, model.predict(X_test)) * 100))

accuracy = []
epsilons = np.logspace(-3, 1, 500)

for eps in epsilons:
    dp_clf = dp.LogisticRegression(epsilon=eps, data_norm=100)
    dp_clf.fit(X_train, y_train)
    accuracy.append(dp_clf.score(X_test, y_test))
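A natural follow-up to the loop above (not shown in the snippet) is plotting the privacy/utility trade-off; a sketch assuming matplotlib:
# Possible follow-up: visualize test accuracy against epsilon on a log scale.
import matplotlib.pyplot as plt

plt.semilogx(epsilons, accuracy, label='differentially private')
plt.axhline(baseline, color='red', linestyle='--', label='non-private baseline')
plt.xlabel('epsilon')
plt.ylabel('test accuracy')
plt.legend()
plt.show()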
Example #13
            min_fairness = 0.0
            if most_uncertain_f['fairness_choice'][0]:
                min_fairness = most_uncertain_f['fairness_specified'][0]
            min_robustness = 0.0
            if most_uncertain_f['robustness_choice'][0]:
                min_robustness = most_uncertain_f['robustness_specified'][0]
            max_number_features = X_train.shape[1]
            if most_uncertain_f['k_choice'][0]:
                max_number_features = most_uncertain_f['k_specified'][0]

            # Execute each search strategy with a given time limit (in parallel)
            # maybe run multiple times to smooth stochasticity

            model = LogisticRegression()
            if most_uncertain_f['privacy_choice'][0]:
                model = models.LogisticRegression(
                    epsilon=most_uncertain_f['privacy_specified'][0])
            mp_global.clf = model

            #define rankings
            rankings = [variance, chi2_score_wo]  #simple rankings
            rankings.append(
                partial(model_score,
                        estimator=ExtraTreesClassifier(
                            n_estimators=1000)))  #accuracy ranking
            rankings.append(
                partial(robustness_score, model=model,
                        scorer=auc_scorer))  #robustness ranking
            rankings.append(
                partial(fairness_score,
                        estimator=ExtraTreesClassifier(n_estimators=1000),
                        sensitive_ids=sensitive_ids))  #fairness ranking
Example #14
        mp_global.y_train = y_train
        mp_global.y_test = y_test
        mp_global.names = []
        mp_global.sensitive_ids = None
        mp_global.cv_splitter = cv_splitter

        min_accuracy = config['accuracy']
        min_fairness = 0.0
        min_robustness = config['robustness']
        max_number_features = config['k']

        max_search_time = time_limit

        model = LogisticRegression()
        if config['privacy'] is not None:
            model = models.LogisticRegression(epsilon=config['privacy'])
        mp_global.clf = model

        #define rankings
        rankings = [
            variance, chi2_score_wo, fcbf, my_fisher_score,
            mutual_info_classif, my_mcfs
        ]
        #rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
        #rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
        #rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
        rankings.append(partial(model_score,
                                estimator=ReliefF(n_neighbors=10)))  # relieff

        mp_global.min_accuracy = min_accuracy
        mp_global.min_fairness = min_fairness
Example #15
    def get_estimated_best_strategy(self, X_train, y_train, min_accuracy,
                                    sensitive_ids, min_fairness, min_safety,
                                    privacy, max_complexity, max_search_time,
                                    classifier):

        start_time = time.time()

        selection_strategies = {}
        rankings = {}

        ranking_list = [
            variance, chi2_score_wo, fcbf, my_fisher_score,
            mutual_info_classif, my_mcfs
        ]
        ranking_list.append(
            partial(model_score, estimator=ReliefF(n_neighbors=10)))

        for my_strategy in range(1, 8):
            selection_strategies[my_strategy] = weighted_ranking
            rankings[my_strategy] = [ranking_list[my_strategy - 1]]

        main_strategies = [
            TPE, simulated_annealing, evolution, exhaustive, forward_selection,
            backward_selection, forward_floating_selection,
            backward_floating_selection, recursive_feature_elimination,
            fullfeatures
        ]

        for my_strategy in range(8, 18):
            selection_strategies[my_strategy] = main_strategies[my_strategy -
                                                                8]
            rankings[my_strategy] = None

        if isinstance(max_complexity, int):
            max_complexity = max_complexity / float(X_train.shape[1])

        auc_scorer = make_scorer(roc_auc_score,
                                 greater_is_better=True,
                                 needs_threshold=True)

        cv_splitter = StratifiedKFold(5, random_state=42)

        X_train_tiny, _, y_train_tiny, _ = train_test_split(X_train,
                                                            y_train,
                                                            train_size=100,
                                                            random_state=42,
                                                            stratify=y_train)

        cv_k = 1.0

        model = classifier
        if privacy is None:
            privacy = X_train_tiny.shape[0]
        else:
            if isinstance(model, LogisticRegression):
                model = models.LogisticRegression(epsilon=privacy,
                                                  class_weight='balanced')

        robust_scorer = make_scorer(robust_score,
                                    greater_is_better=True,
                                    X=X_train_tiny,
                                    y=y_train_tiny,
                                    model=model,
                                    feature_selector=None,
                                    scorer=auc_scorer)

        small_start_time = time.time()

        scoring_dict = {'AUC': auc_scorer, 'Robustness': robust_scorer}

        if sensitive_ids is not None:
            fair_train_tiny = make_scorer(
                true_positive_rate_score,
                greater_is_better=True,
                sensitive_data=X_train_tiny[:, sensitive_ids[0]])
            scoring_dict['Fairness'] = fair_train_tiny

        cv = GridSearchCV(model,
                          param_grid={'C': [1.0]},
                          scoring=scoring_dict,
                          refit=False,
                          cv=cv_splitter)
        cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
        cv_acc = cv.cv_results_['mean_test_AUC'][0]

        cv_fair = 0.0
        if sensitive_ids is not None:
            cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
        cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

        cv_time = time.time() - small_start_time

        # handle the case where a utility is defined: clamp the constraints to their valid ranges
        min_fairness_new = min_fairness
        if min_fairness_new > 1.0:
            min_fairness_new = 1.0

        min_accuracy_new = min_accuracy
        if min_accuracy_new > 1.0:
            min_accuracy_new = 1.0

        min_safety_new = min_safety
        if min_safety_new > 1.0:
            min_safety_new = 1.0

        max_complexity_new = max_complexity
        if max_complexity_new < 0.0:
            max_complexity_new = 0.0

        # construct feature vector
        feature_list = []
        # user-specified constraints
        feature_list.append(min_accuracy_new)
        feature_list.append(min_fairness_new)
        feature_list.append(max_complexity_new)
        feature_list.append(max_complexity_new * X_train.shape[1])
        feature_list.append(min_safety_new)
        feature_list.append(privacy)
        feature_list.append(max_search_time)
        # differences to sample performance
        feature_list.append(cv_acc - min_accuracy_new)
        feature_list.append(cv_fair - min_fairness_new)
        feature_list.append(cv_k - max_complexity_new)
        feature_list.append((cv_k - max_complexity_new) * X_train.shape[1])
        feature_list.append(cv_robust - min_safety_new)
        feature_list.append(cv_time)
        # privacy constraint is always satisfied => difference always zero => constant => unnecessary

        # metadata features
        feature_list.append(X_train.shape[0])  # number rows
        feature_list.append(X_train.shape[1])  # number columns

        feature_list.append(isinstance(model, DecisionTreeClassifier))
        feature_list.append(isinstance(model, GaussianNB))
        feature_list.append(isinstance(model, LogisticRegression))

        self.features = np.array(feature_list).reshape(1, -1)

        self.predicted_probabilities = np.zeros(len(self.mappnames))

        self.best_model = None
        best_score = -1
        for my_strategy in range(len(self.mappnames)):
            self.predicted_probabilities[my_strategy] = self.models[
                my_strategy].predict_proba(self.features)[:, 1]
            if self.predicted_probabilities[my_strategy] > best_score:
                best_score = self.predicted_probabilities[my_strategy]
                self.best_model = self.models[my_strategy]

        best_id = np.argmax(self.predicted_probabilities)

        self.selection_function = selection_strategies[best_id + 1]
        self.ranking_functions = rankings[best_id + 1]

        print("Within " + str(time.time() - start_time) +
              " seconds, the Optimizer chose to run " +
              str(self.mappnames[best_id + 1]))
Example #16
    def objective(hps):
        print(hps)

        try:
            cv_k = 1.0
            cv_privacy = hps['privacy']
            model = LogisticRegression(class_weight='balanced')
            if cv_privacy is None:
                cv_privacy = X_train_tiny.shape[0]
            else:
                model = models.LogisticRegression(epsilon=cv_privacy,
                                                  class_weight='balanced')

            robust_scorer = make_scorer(robust_score,
                                        greater_is_better=True,
                                        X=X_train_tiny,
                                        y=y_train_tiny,
                                        model=model,
                                        feature_selector=None,
                                        scorer=auc_scorer)

            small_start_time = time.time()

            cv = GridSearchCV(model,
                              param_grid={'C': [1.0]},
                              scoring={
                                  'AUC': auc_scorer,
                                  'Fairness': fair_train_tiny,
                                  'Robustness': robust_scorer
                              },
                              refit=False,
                              cv=cv_splitter)
            cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
            cv_acc = cv.cv_results_['mean_test_AUC'][0]
            cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
            cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

            cv_time = time.time() - small_start_time

            #construct feature vector
            feature_list = []
            #user-specified constraints
            feature_list.append(hps['accuracy'])
            feature_list.append(hps['fairness'])
            feature_list.append(hps['k'])
            feature_list.append(hps['k'] * X_train.shape[1])
            feature_list.append(hps['robustness'])
            feature_list.append(cv_privacy)
            feature_list.append(hps['search_time'])
            #differences to sample performance
            feature_list.append(cv_acc - hps['accuracy'])
            feature_list.append(cv_fair - hps['fairness'])
            feature_list.append(cv_k - hps['k'])
            feature_list.append((cv_k - hps['k']) * X_train.shape[1])
            feature_list.append(cv_robust - hps['robustness'])
            feature_list.append(cv_time)
            #privacy constraint is always satisfied => difference always zero => constant => unnecessary

            #metadata features
            feature_list.append(X_train.shape[0])  #number rows
            feature_list.append(X_train.shape[1])  #number columns

            features = np.array(feature_list)

            #predict the best model and calculate uncertainty

            loss = 0
            return {
                'loss': loss,
                'status': STATUS_OK,
                'features': features,
                'search_time': hps['search_time'],
                'constraints': hps
            }
        except:
            return {'loss': np.inf, 'status': STATUS_OK}
Example #17
def objective(hps):
    print(hps)

    cv_k = 1.0
    cv_privacy = hps['privacy']
    model = LogisticRegression()
    if cv_privacy is None:
        cv_privacy = X_train_tiny.shape[0]
    else:
        model = models.LogisticRegression(epsilon=cv_privacy)

    robust_scorer = make_scorer(robust_score,
                                greater_is_better=True,
                                X=X_train_tiny,
                                y=y_train_tiny,
                                model=model,
                                feature_selector=None,
                                scorer=auc_scorer)

    cv = GridSearchCV(model,
                      param_grid={'C': [1.0]},
                      scoring={
                          'AUC': auc_scorer,
                          'Fairness': fair_train_tiny,
                          'Robustness': robust_scorer
                      },
                      refit=False,
                      cv=cv_splitter)
    cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
    cv_acc = cv.cv_results_['mean_test_AUC'][0]
    cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
    cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

    #construct feature vector
    feature_list = []
    #user-specified constraints
    feature_list.append(hps['accuracy'])
    feature_list.append(hps['fairness'])
    feature_list.append(hps['k'])
    feature_list.append(hps['robustness'])
    feature_list.append(cv_privacy)
    #differences to sample performance
    feature_list.append(cv_acc - hps['accuracy'])
    feature_list.append(cv_fair - hps['fairness'])
    feature_list.append(cv_k - hps['k'])
    feature_list.append(cv_robust - hps['robustness'])
    #privacy constraint is always satisfied => difference always zero => constant => unnecessary

    #metadata features
    #feature_list.append(X_train.shape[0])#number rows
    #feature_list.append(X_train.shape[1])#number columns

    features = np.array(feature_list)

    #predict the best model and calculate uncertainty

    loss = 0
    if hasattr(meta_classifier, 'estimators_'):
        predictions = []
        for tree in range(len(meta_classifier.estimators_)):
            predictions.append(meta_classifier.estimators_[tree].predict(
                [features])[0])

        stddev = np.std(np.array(predictions), axis=0)

        # negative predictive variance: minimising this loss maximises the
        # disagreement (uncertainty) among the meta-classifier's trees
        loss = np.sum(stddev**2) * -1

    return {'loss': loss, 'status': STATUS_OK, 'features': features}
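The objective above follows hyperopt's convention of returning a dict with 'loss' and 'status' (STATUS_OK), so it can be driven directly by hyperopt's fmin. The following is only a minimal sketch of such a driver; the search-space bounds and the candidate epsilon values are assumptions for illustration and are not taken from the original experiments.

from hyperopt import fmin, tpe, hp, Trials

# illustrative search space over the user constraints consumed by objective();
# the ranges and the privacy choices below are assumed, not from the source
space = {
    'accuracy': hp.uniform('accuracy', 0.5, 1.0),
    'fairness': hp.uniform('fairness', 0.0, 1.0),
    'k': hp.uniform('k', 0.0, 1.0),
    'robustness': hp.uniform('robustness', 0.0, 1.0),
    'privacy': hp.choice('privacy', [None, 0.1, 1.0, 10.0]),
}

trials = Trials()
best = fmin(fn=objective,      # the objective defined above
            space=space,
            algo=tpe.suggest,  # Tree-structured Parzen Estimator
            max_evals=50,
            trials=trials)
print(best)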
Beispiel #18
0
        mp_global.X_train_val = X_train_val
        mp_global.y_validation = y_val
        mp_global.y_train_val = y_train_val

        min_accuracy = config['accuracy']
        min_fairness = 0.0
        min_robustness = config['robustness']
        max_number_features = config['k']

        max_search_time = time_limit

        model = None
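        # when a privacy budget is configured, each model family is swapped for
        # a differentially private variant parameterised by epsilon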
        if config['model'] == 'Logistic Regression':
            model = LogisticRegression(class_weight='balanced')
            if config['privacy'] is not None:
                model = models.LogisticRegression(epsilon=config['privacy'],
                                                  class_weight='balanced')
        elif config['model'] == 'Gaussian Naive Bayes':
            model = GaussianNB()
            if config['privacy'] is not None:
                model = models.GaussianNB(epsilon=config['privacy'])
        elif config['model'] == 'Decision Tree':
            model = DecisionTreeClassifier(class_weight='balanced')
            if config['privacy'] is not None:
                model = PrivateRandomForest(n_estimators=1,
                                            epsilon=config['privacy'])

        mp_global.clf = model
        # define rankings
        rankings = [
            variance, chi2_score_wo, fcbf, my_fisher_score,
            mutual_info_classif, my_mcfs
        ]
Beispiel #19
0
for min_accuracy in [0.5, 0.53, 0.56, 0.59, 0.62, 0.65, 0.68]:
    for privacy in [10.0, 7.0, 3.0, 1.0, 0.7, 0.3, 0.1, 0.07, 0.03]:

        success_per_strategy = np.zeros(18)
        time_per_strategy = np.zeros(18)
        for nruns_global in range(5):

            min_robustness = 0.0
            max_search_time = 20 * 60
            min_fairness = 0.0
            max_number_features = 1.0

            # Execute each search strategy with a given time limit (in parallel)
            # maybe run multiple times to smooth stochasticity

            model = models.LogisticRegression(epsilon=privacy,
                                              class_weight='balanced')
            mp_global.clf = model
            mp_global.model_hyperparameters = {
                'C': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]
            }
            mp_global.model_hyperparameters['epsilon'] = [privacy]
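            # only C is tuned; the privacy budget epsilon is pinned to the
            # value of the current outer loop iteration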

            #define rankings
            rankings = [
                variance, chi2_score_wo, fcbf, my_fisher_score,
                mutual_info_classif, my_mcfs
            ]
            #rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
            #rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
            #rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
            rankings.append(
Beispiel #20
0
    def maximize_uncertainty(hps):
        print(hps)

        X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
            dataset_key=hps['data'])
        is_regression = False
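        # within this function the regression flag only controls whether the
        # 100-row subsample below is drawn with stratification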

        # run on tiny sample
        if X_train.shape[0] > 100:
            if is_regression:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(
                    X_train, y_train, train_size=100, random_state=42)
            else:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(
                    X_train,
                    y_train,
                    train_size=100,
                    random_state=42,
                    stratify=y_train)
        else:
            X_train_tiny = X_train
            y_train_tiny = y_train

        print(X_train.shape)

        if sensitive_ids is not None:
            fair_train_tiny = make_scorer(
                true_positive_rate_score,
                greater_is_better=True,
                sensitive_data=X_train_tiny[:, sensitive_ids[0]])

        mp_global.X_train = X_train
        mp_global.X_validation = X_validation
        mp_global.X_train_val = X_train_val
        mp_global.X_test = X_test
        mp_global.y_train = y_train
        mp_global.y_validation = y_validation
        mp_global.y_train_val = y_train_val
        mp_global.y_test = y_test
        mp_global.names = names
        mp_global.sensitive_ids = sensitive_ids

        mp_global.cv_splitter = StratifiedKFold(5, shuffle=True, random_state=42)  # shuffle must be enabled when a random_state is set
        mp_global.accuracy_scorer = make_scorer(f1_score)

        mp_global.avoid_robustness = False

        cv_k = 1.0
        cv_privacy = hps['privacy']

        model = LogisticRegression(class_weight='balanced')
        if cv_privacy is not None:
            model = models.LogisticRegression(epsilon=cv_privacy,
                                              class_weight='balanced')
        else:
            # no privacy budget configured: keep the plain model and encode the
            # absent budget as the tiny-sample size for the meta-feature vector
            cv_privacy = X_train_tiny.shape[0]

        robust_scorer = make_scorer(robust_score,
                                    greater_is_better=True,
                                    X=X_train_tiny,
                                    y=y_train_tiny,
                                    model=model,
                                    feature_selector=None,
                                    scorer=mp_global.accuracy_scorer)

        small_start_time = time.time()

        scoring = {'AUC': mp_global.accuracy_scorer}
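        # note: the 'AUC' key simply labels mp_global.accuracy_scorer (an F1
        # scorer here), so 'mean_test_AUC' below really reports that metric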
        if not mp_global.avoid_robustness:
            scoring['Robustness'] = robust_scorer
        if sensitive_ids is not None:
            scoring['Fairness'] = fair_train_tiny

        cv = GridSearchCV(model,
                          param_grid={},
                          scoring=scoring,
                          refit=False,
                          cv=mp_global.cv_splitter)
        cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
        cv_acc = cv.cv_results_['mean_test_AUC'][0]

        cv_fair = 0.0
        if sensitive_ids is not None:
            cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]

        cv_robust = 0.0
        if not mp_global.avoid_robustness:
            cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

        cv_time = time.time() - small_start_time

        # construct feature vector
        feature_list = []
        # user-specified constraints
        feature_list.append(hps['accuracy'])
        feature_list.append(hps['fairness'])
        feature_list.append(hps['k'])
        feature_list.append(hps['k'] * X_train.shape[1])
        feature_list.append(hps['robustness'])
        feature_list.append(cv_privacy)
        feature_list.append(hps['search_time'])
        # differences to sample performance
        feature_list.append(cv_acc - hps['accuracy'])
        feature_list.append(cv_fair - hps['fairness'])
        feature_list.append(cv_k - hps['k'])
        feature_list.append((cv_k - hps['k']) * X_train.shape[1])
        feature_list.append(cv_robust - hps['robustness'])
        feature_list.append(cv_time)
        # privacy constraint is always satisfied => difference always zero => constant => unnecessary

        # metadata features
        feature_list.append(X_train.shape[0])  # number rows
        feature_list.append(X_train.shape[1])  # number columns

        # one-hot indicators of the chosen model family
        feature_list.append(hps['model'] == 'Decision Tree')
        feature_list.append(hps['model'] == 'Gaussian Naive Bayes')
        feature_list.append(hps['model'] == 'Logistic Regression')

        features = np.array(feature_list).reshape(1, -1)  # single sample, 2-D input for predict_proba

        # predict the best model and calculate uncertainty

        print(features)

        #now predict with models
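        # a predicted probability near 0.5 means a meta-model is undecided;
        # summing |p - 0.5| over all models yields a certainty score that is
        # returned as the loss, so minimising it selects the most uncertain
        # configuration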
        aggregated_certainty = 0
        print("uncertainty")
        for model_i in range(len(all_current_models)):
            certainty = np.abs(
                all_current_models[model_i].predict_proba(features)[0, 0] -
                0.5)
            aggregated_certainty += certainty

        print('Certainty: ' + str(aggregated_certainty))

        return {
            'loss': aggregated_certainty,
            'status': STATUS_OK,
            'features': features,
            'search_time': hps['search_time'],
            'constraints': hps
        }
Beispiel #21
0
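    # collect the indices of the one-hot encoded columns that belong to the
    # sensitive attribute (their names share the 'onehot__x<id>_' prefix)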
    for fname_i in range(len(all_names)):
        if all_names[fname_i].startswith('onehot__x' +
                                         str(cat_sensitive_attribute_id) +
                                         '_'):
            sensitive_ids.append(fname_i)

    print(sensitive_ids)

    le = preprocessing.LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    cv_splitter = StratifiedKFold(5, shuffle=True, random_state=42)  # shuffle must be enabled when a random_state is set

    model = models.LogisticRegression()

    start_time = time.time()

    evolution(X_train,
              X_test,
              y_train,
              y_test,
              names,
              sensitive_ids,
              ranking_functions=[chi2_score_wo],
              clf=model,
              min_accuracy=0.85,
              min_fairness=0.86,
              min_robustness=0.80,
              max_number_features=0.2,