Code example #1
def optimize_accuracy_under_constraints(
        trial, metafeature_values_hold):  #todo: transfer use features directly
    try:
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        search_time = trial.suggest_int('global_search_time_constraint',
                                        10,
                                        search_time_frozen,
                                        log=False)
        evaluation_time = trial.suggest_int(
            'global_evaluation_time_constraint', 10, search_time, log=False)
        memory_limit = trial.suggest_uniform('global_memory_constraint', 0.001,
                                             4)
        cv = trial.suggest_int('global_cv', 2, 20, log=False)
        number_of_cvs = trial.suggest_int('global_number_cv', 1, 10, log=False)

        my_list_constraints_values = [
            search_time, evaluation_time, memory_limit, cv, number_of_cvs
        ]
        features = space2features(space, my_list_constraints_values,
                                  metafeature_values_hold)
        trial.set_user_attr('features', features)

        return predict_range(model, features)
    except Exception as e:
        print(str(e) + 'except dataset _ accuracy: ' + '\n\n')
        return 0.0
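As a point of reference, an objective function like the one above would normally be handed to an Optuna study, for example via a lambda that binds the extra arguments. The following is only a minimal usage sketch, assuming that optimize_accuracy_under_constraints, metafeature_values_hold, model and search_time_frozen are already defined as in the examples on this page; n_trials=100 is an arbitrary choice.

import optuna

# Hypothetical driver code: maximize the surrogate-predicted accuracy.
study = optuna.create_study(direction='maximize')
study.optimize(
    lambda trial: optimize_accuracy_under_constraints(trial, metafeature_values_hold),
    n_trials=100)

print('best predicted accuracy: ' + str(study.best_trial.value))
print('sampled constraints: ' + str(study.best_trial.params))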
Code example #2
def optimize_uncertainty(trial):
    try:
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        search_time = trial.suggest_int('global_search_time_constraint',
                                        10,
                                        total_search_time,
                                        log=False)
        evaluation_time = trial.suggest_int(
            'global_evaluation_time_constraint', 10, search_time, log=False)
        memory_limit = trial.suggest_uniform('global_memory_constraint', 1.5,
                                             4)
        cv = trial.suggest_int('global_cv', 2, 20, log=False)
        number_of_cvs = trial.suggest_int('global_number_cv', 1, 10, log=False)

        dataset_id = trial.suggest_categorical('dataset_id',
                                               my_openml_datasets)

        my_random_seed = int(time.time())

        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(
            dataset_id, randomstate=my_random_seed)

        trial.set_user_attr('data_random_seed', my_random_seed)

        #add metafeatures of data

        my_list_constraints_values = [
            search_time, evaluation_time, memory_limit, cv, number_of_cvs
        ]

        metafeature_values = data2features(X_train, y_train,
                                           categorical_indicator)
        features = space2features(space, my_list_constraints_values,
                                  metafeature_values)

        trial.set_user_attr('features', features)

        predictions = []
        for tree in range(model.n_estimators):
            predictions.append(predict_range(model.estimators_[tree],
                                             features))

        stddev_pred = np.std(np.matrix(predictions).transpose(), axis=1)

        return stddev_pred[0]
    except Exception as e:
        print(
            str(e) + 'except dataset _ uncertainty: ' + str(dataset_id) +
            '\n\n')
        return 0.0
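The uncertainty returned above is simply the standard deviation of the per-tree predictions of a random-forest surrogate. A self-contained sketch of that idea, with a toy RandomForestRegressor and random data standing in for this page's model and feature vectors, might look like this:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Toy stand-ins for the surrogate model and the constraint/metafeature vectors.
rng = np.random.RandomState(0)
X_meta = rng.rand(200, 5)
y_meta = rng.rand(200)
surrogate = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_meta, y_meta)

features = rng.rand(1, 5)  # one candidate feature vector

# The spread of the per-tree predictions serves as the uncertainty estimate.
per_tree = np.array([tree.predict(features)[0] for tree in surrogate.estimators_])
print('predicted value: %.3f +/- %.3f' % (per_tree.mean(), per_tree.std()))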
Code example #3
def optimize_uncertainty(trial, dataset_id):
    dataset_id = str(dataset_id)
    try:
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        search_time, evaluation_time, memory_limit, privacy_limit, training_time_limit, inference_time_limit, pipeline_size_limit, cv, number_of_cvs, hold_out_fraction, sample_fraction, _ = generate_parameters(trial, total_search_time, my_openml_datasets)

        my_random_seed = int(time.time())

        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(dataset_id,
                                                                                            randomstate=my_random_seed)

        trial.set_user_attr('data_random_seed', my_random_seed)
        trial.set_user_attr('dataset_id', dataset_id)

        #add metafeatures of data
        my_list_constraints_values = [search_time,
                                      evaluation_time,
                                      memory_limit,
                                      cv,
                                      number_of_cvs,
                                      ifNull(privacy_limit, constant_value=1000),
                                      ifNull(hold_out_fraction),
                                      sample_fraction,
                                      training_time_limit,
                                      inference_time_limit,
                                      pipeline_size_limit]

        metafeature_values = data2features(X_train, y_train, categorical_indicator)
        features = space2features(space, my_list_constraints_values, metafeature_values)
        features = FeatureTransformations().fit(features).transform(features, feature_names=feature_names)

        trial.set_user_attr('features', features)

        model = mp_glob.ml_model
        trial.set_user_attr('predicted_target', model.predict(features))

        predictions = []
        for tree in range(model.n_estimators):
            predictions.append(predict_range(model.estimators_[tree], features))

        stddev_pred = np.std(np.matrix(predictions).transpose(), axis=1)

        return stddev_pred[0]
    except Exception as e:
        print(str(e) + 'except dataset _ uncertainty: ' + str(dataset_id) + '\n\n')
        return -np.inf
Code example #4
File: generate_ranges.py  Project: BigDaMa/DFS
def optimize_uncertainty(trial):
    try:
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        search_time, evaluation_time, memory_limit, privacy_limit, cv, number_of_cvs, hold_out_fraction, sample_fraction, dataset_id = generate_parameters(
            trial)

        my_random_seed = int(time.time())
        trial.set_user_attr('data_random_seed', my_random_seed)

        return 0
    except Exception as e:
        print(str(e) + 'except dataset _ uncertainty: ' + str(dataset_id) + '\n\n')
        return 0.0
Code example #5
                    search_time=search_time_frozen,
                    memory_limit=memory_budget,
                    pipeline_size_limit=pipeline_size)

                from fastsklearnfeature.declarative_automl.optuna_package.myautoml.utils_model import show_progress
                #show_progress(search, X_test_hold, y_test_hold, my_scorer)

                print("test result: " + str(result))
                current_dynamic.append(result)
            except:
                current_dynamic.append(0.0)

            print('dynamic: ' + str(current_dynamic))
            print('static: ' + str(current_static))

            gen_new = SpaceGenerator()
            space_new = gen_new.generate_params()
            for pre, _, node in RenderTree(space_new.parameter_tree):
                if node.status == True:
                    print("%s%s" % (pre, node.name))

            try:
                search = MyAutoML(n_jobs=1,
                                  time_search_budget=search_time_frozen,
                                  space=space_new,
                                  evaluation_budget=int(0.1 *
                                                        search_time_frozen),
                                  main_memory_budget_gb=memory_budget,
                                  pipeline_size_limit=pipeline_size,
                                  hold_out_fraction=0.33)
Code example #6
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    space = None
    search_time = None
    if not 'space' in trial.user_attrs:
        # which hyperparameters to use
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        search_time, evaluation_time, memory_limit, privacy_limit, training_time_limit, inference_time_limit, pipeline_size_limit, cv, number_of_cvs, hold_out_fraction, sample_fraction, dataset_id = generate_parameters(trial, total_search_time, my_openml_datasets)

    else:
        space = trial.user_attrs['space']

        print(trial.params)

        #make this a hyperparameter
        search_time = trial.params['global_search_time_constraint']

        evaluation_time = search_time
        if 'global_evaluation_time_constraint' in trial.params:
            evaluation_time = trial.params['global_evaluation_time_constraint']

        memory_limit = 10
        if 'global_memory_constraint' in trial.params:
            memory_limit = trial.params['global_memory_constraint']

        privacy_limit = None
        if 'privacy_constraint' in trial.params:
            privacy_limit = trial.params['privacy_constraint']

        training_time_limit = search_time
        if 'training_time_constraint' in trial.params:
            training_time_limit = trial.params['training_time_constraint']

        inference_time_limit = 60
        if 'inference_time_constraint' in trial.params:
            inference_time_limit = trial.params['inference_time_constraint']

        pipeline_size_limit = 350000000
        if 'pipeline_size_constraint' in trial.params:
            pipeline_size_limit = trial.params['pipeline_size_constraint']

        cv = 1
        number_of_cvs = 1
        hold_out_fraction = None
        if 'global_cv' in trial.params:
            cv = trial.params['global_cv']
            if 'global_number_cv' in trial.params:
                number_of_cvs = trial.params['global_number_cv']
        else:
            hold_out_fraction = trial.params['hold_out_fraction']

        sample_fraction = 1.0
        if 'sample_fraction' in trial.params:
            sample_fraction = trial.params['sample_fraction']

        if 'dataset_id' in trial.params:
            dataset_id = trial.params['dataset_id']
        else:
            dataset_id = trial.user_attrs['dataset_id']

    for pre, _, node in RenderTree(space.parameter_tree):
        if node.status == True:
            print("%s%s" % (pre, node.name))

    if type(X_train) == type(None):

        my_random_seed = int(time.time())
        if 'data_random_seed' in trial.user_attrs:
            my_random_seed = trial.user_attrs['data_random_seed']

        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(dataset_id, randomstate=my_random_seed)

        if not isinstance(trial, FrozenTrial):
            my_list_constraints_values = [search_time,
                                          evaluation_time,
                                          memory_limit, cv,
                                          number_of_cvs,
                                          ifNull(privacy_limit, constant_value=1000),
                                          ifNull(hold_out_fraction),
                                          sample_fraction,
                                          training_time_limit,
                                          inference_time_limit,
                                          pipeline_size_limit]

            metafeature_values = data2features(X_train, y_train, categorical_indicator)
            features = space2features(space, my_list_constraints_values, metafeature_values)
            features = FeatureTransformations().fit(features).transform(features, feature_names=feature_names)
            trial.set_user_attr('features', features)


    dynamic_params = []
    for random_i in range(5): #5
        search = MyAutoML(cv=cv,
                          number_of_cvs=number_of_cvs,
                          n_jobs=1,
                          evaluation_budget=evaluation_time,
                          time_search_budget=search_time,
                          space=space,
                          main_memory_budget_gb=memory_limit,
                          differential_privacy_epsilon=privacy_limit,
                          hold_out_fraction=hold_out_fraction,
                          sample_fraction=sample_fraction,
                          training_time_limit=training_time_limit,
                          inference_time_limit=inference_time_limit,
                          pipeline_size_limit=pipeline_size_limit)

        test_score = 0.0
        try:
            search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=my_scorer)

            best_pipeline = search.get_best_pipeline()
            if type(best_pipeline) != type(None):
                test_score = my_scorer(search.get_best_pipeline(), X_test, y_test)
        except:
            pass
        dynamic_params.append(test_score)

    count_success = 0
    for i_run in range(len(dynamic_params)):
        if dynamic_params[i_run] > 0.0:
            count_success += 1
    success_rate = float(count_success) / float(len(dynamic_params))

    return success_rate, search
Code example #7
File: budget_experiment.py  Project: BigDaMa/DFS
from fastsklearnfeature.declarative_automl.optuna_package.myautoml.Space_GenerationTree import SpaceGenerator

auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

#dataset = openml.datasets.get_dataset(40536)
dataset = openml.datasets.get_dataset(31)
#dataset = openml.datasets.get_dataset(1590)

X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array', target=dataset.default_target_attribute)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1, stratify=y, train_size=0.6)

gen = SpaceGenerator()
space = gen.generate_params()

print('number hyperparameters: ' + str(len(space.name2node)))

from anytree import Node, RenderTree

for pre, _, node in RenderTree(space.parameter_tree):
    print("%s%s: %s" % (pre, node.name, node.status))

my_study = optuna.create_study(direction='maximize')

validation_scores = []
test_scores = []

#add Caruana ensemble with replacement # save pipelines to disk
Code example #8
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    space = None
    search_time = None
    if not 'space' in trial.user_attrs:
        # which hyperparameters to use
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        # which constraints to use
        search_time = trial.suggest_int('global_search_time_constraint', 10, total_search_time, log=False)

        # how much time for each evaluation
        evaluation_time = trial.suggest_int('global_evaluation_time_constraint', 10, search_time, log=False)

        # how much memory is allowed
        memory_limit = trial.suggest_uniform('global_memory_constraint', 1.5, 4)

        # how many cvs should be used
        cv = trial.suggest_int('global_cv', 2, 20, log=False) #todo: calculate minimum number of splits based on y

        number_of_cvs = trial.suggest_int('global_number_cv', 1, 10, log=False)

        dataset_id = trial.suggest_categorical('dataset_id', my_openml_datasets)

    else:
        space = trial.user_attrs['space']

        print(trial.params)

        #make this a hyperparameter
        search_time = trial.params['global_search_time_constraint']
        evaluation_time = trial.params['global_evaluation_time_constraint']
        memory_limit = trial.params['global_memory_constraint']
        cv = trial.params['global_cv']
        number_of_cvs = trial.params['global_number_cv']

        if 'dataset_id' in trial.params:
            dataset_id = trial.params['dataset_id'] #get same random seed
        else:
            dataset_id = 31


    for pre, _, node in RenderTree(space.parameter_tree):
        print("%s%s: %s" % (pre, node.name, node.status))

    # which dataset to use
    #todo: add more datasets


    if type(X_train) == type(None):

        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(dataset_id, randomstate=int(time.time()))

        if not isinstance(trial, FrozenTrial):
            my_list_constraints_values = [search_time, evaluation_time, memory_limit, cv, number_of_cvs]

            metafeature_values = data2features(X_train, y_train, categorical_indicator)
            features = space2features(space, my_list_constraints_values, metafeature_values)
            trial.set_user_attr('features', features)

    search = MyAutoML(cv=cv,
                      number_of_cvs=number_of_cvs,
                      n_jobs=1,
                      evaluation_budget=evaluation_time,
                      time_search_budget=search_time,
                      space=space,
                      main_memory_budget_gb=memory_limit)
    search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=auc)

    best_pipeline = search.get_best_pipeline()

    test_score = 0.0
    if type(best_pipeline) != type(None):
        test_score = auc(search.get_best_pipeline(), X_test, y_test)


    return test_score
Code example #9
X_train_hold, X_test_hold, y_train_hold, y_test_hold, categorical_indicator_hold, attribute_names_hold = get_data(test_holdout_dataset_id, randomstate=42)


metafeature_values_hold = data2features(X_train_hold, y_train_hold, categorical_indicator_hold)

auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)



total_search_time = 60 * 60  # 10 * 60

my_openml_datasets = [3, 4, 13, 15, 24, 25, 29, 31, 37, 38, 40, 43, 44, 49, 50, 51, 52, 53, 55, 56, 59, 151, 152, 153, 161, 162, 164, 172, 179, 310, 311, 312, 316, 333, 334, 335, 336, 337, 346, 444, 446, 448, 450, 451, 459, 461, 463, 464, 465, 466, 467, 470, 472, 476, 479, 481, 682, 683, 747, 803, 981, 993, 1037, 1038, 1039, 1040, 1042, 1045, 1046, 1048, 1049, 1050, 1053, 1054, 1055, 1056, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1071, 1073, 1075, 1085, 1101, 1104, 1107, 1111, 1112, 1114, 1116, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1169, 1216, 1235, 1236, 1237, 1238, 1240, 1412, 1441, 1442, 1443, 1444, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1455, 1458, 1460, 1461, 1462, 1463, 1464, 1467, 1471, 1473, 1479, 1480, 1484, 1485, 1486, 1487, 1488, 1489, 1490, 1494, 1495, 1496, 1498, 1502, 1504, 1506, 1507, 1510, 1511, 1547, 1561, 1562, 1563, 1564, 1597, 4134, 4135, 4154, 4329, 4534, 23499, 40536, 40645, 40646, 40647, 40648, 40649, 40650, 40660, 40665, 40666, 40669, 40680, 40681, 40690, 40693, 40701, 40705, 40706, 40710, 40713, 40714, 40900, 40910, 40922, 40999, 41005, 41007, 41138, 41142, 41144, 41145, 41146, 41147, 41150, 41156, 41158, 41159, 41160, 41161, 41162, 41228, 41430, 41521, 41538, 41976, 42172, 42477]
my_openml_datasets.remove(test_holdout_dataset_id)


mgen = SpaceGenerator()
mspace = mgen.generate_params()

my_list = list(mspace.name2node.keys())
my_list.sort()

my_list_constraints = ['global_search_time_constraint', 'global_evaluation_time_constraint', 'global_memory_constraint', 'global_cv', 'global_number_cv']


def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    space = None
    search_time = None
    if not 'space' in trial.user_attrs:
        # which hyperparameters to use
        gen = SpaceGenerator()
        space = gen.generate_params()
Code example #10
File: utils_model.py  Project: BigDaMa/DFS
def optimize_accuracy_under_constraints(trial,
                                        metafeature_values_hold,
                                        search_time,
                                        model,
                                        memory_limit=10,
                                        privacy_limit=None,
                                        evaluation_time=None,
                                        hold_out_fraction=None):
    try:
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        if type(evaluation_time) == type(None):
            evaluation_time = search_time
            if trial.suggest_categorical('use_evaluation_time_constraint',
                                         [True, False]):
                evaluation_time = trial.suggest_int(
                    'global_evaluation_time_constraint',
                    10,
                    search_time,
                    log=False)
        else:
            trial.set_user_attr('evaluation_time', evaluation_time)

        # how many cvs should be used
        cv = 1
        number_of_cvs = 1
        if type(hold_out_fraction) == type(None):
            hold_out_fraction = None
            if trial.suggest_categorical('use_hold_out', [True, False]):
                hold_out_fraction = trial.suggest_uniform(
                    'hold_out_fraction', 0, 1)
            else:
                cv = trial.suggest_int(
                    'global_cv', 2, 20, log=False
                )  # todo: calculate minimum number of splits based on y
                number_of_cvs = 1
                if trial.suggest_categorical('use_multiple_cvs',
                                             [True, False]):
                    number_of_cvs = trial.suggest_int('global_number_cv',
                                                      2,
                                                      10,
                                                      log=False)
        else:
            trial.set_user_attr('hold_out_fraction', hold_out_fraction)

        sample_fraction = 1.0
        #if trial.suggest_categorical('use_sampling', [True, False]):
        #    sample_fraction = trial.suggest_uniform('sample_fraction', 0, 1)

        my_list_constraints_values = [
            search_time, evaluation_time, memory_limit, cv, number_of_cvs,
            ifNull(privacy_limit, constant_value=1000),
            ifNull(hold_out_fraction), sample_fraction
        ]

        features = space2features(space, my_list_constraints_values,
                                  metafeature_values_hold)
        feature_names, _ = get_feature_names()
        features = FeatureTransformations().fit(features).transform(
            features, feature_names=feature_names)
        trial.set_user_attr('features', features)

        return predict_range(model, features)
    except Exception as e:
        print(str(e) + 'except dataset _ accuracy: ' + '\n\n')
        return 0.0
Code example #11
File: generate_ranges.py  Project: BigDaMa/DFS
def run_AutoML(trial, X_train=None, X_test=None, y_train=None, y_test=None, categorical_indicator=None):
    space = None
    search_time = None
    if not 'space' in trial.user_attrs:
        # which hyperparameters to use
        gen = SpaceGenerator()
        space = gen.generate_params()
        space.sample_parameters(trial)

        trial.set_user_attr('space', copy.deepcopy(space))

        search_time, evaluation_time, memory_limit, privacy_limit, cv, number_of_cvs, hold_out_fraction, sample_fraction, dataset_id = generate_parameters(trial)

    else:
        space = trial.user_attrs['space']

        print(trial.params)

        #make this a hyperparameter
        search_time = total_search_time
        evaluation_time = search_time
        memory_limit = 4
        privacy_limit = None

        cv = 1
        number_of_cvs = 1
        hold_out_fraction = None
        if 'global_cv' in trial.params:
            cv = trial.params['global_cv']
            if 'global_number_cv' in trial.params:
                number_of_cvs = trial.params['global_number_cv']
        else:
            hold_out_fraction = trial.params['hold_out_fraction']

        sample_fraction = 1.0
        if 'sample_fraction' in trial.params:
            sample_fraction = trial.params['sample_fraction']

        if 'dataset_id' in trial.params:
            dataset_id = trial.params['dataset_id'] #get same random seed
        else:
            dataset_id = 31

    for pre, _, node in RenderTree(space.parameter_tree):
        if node.status == True:
            print("%s%s" % (pre, node.name))

    if type(X_train) == type(None):

        my_random_seed = int(time.time())
        if 'data_random_seed' in trial.user_attrs:
            my_random_seed = trial.user_attrs['data_random_seed']

        X_train, X_test, y_train, y_test, categorical_indicator, attribute_names = get_data(dataset_id, randomstate=my_random_seed)

        if not isinstance(trial, FrozenTrial):
            my_list_constraints_values = [search_time,
                                          evaluation_time,
                                          memory_limit, cv,
                                          number_of_cvs,
                                          ifNull(privacy_limit, constant_value=1000),
                                          ifNull(hold_out_fraction),
                                          sample_fraction]

            metafeature_values = data2features(X_train, y_train, categorical_indicator)
            features = space2features(space, my_list_constraints_values, metafeature_values)
            features = FeatureTransformations().fit(features).transform(features, feature_names=feature_names)
            trial.set_user_attr('features', features)

    search = MyAutoML(cv=cv,
                      number_of_cvs=number_of_cvs,
                      n_jobs=1,
                      evaluation_budget=evaluation_time,
                      time_search_budget=search_time,
                      space=space,
                      main_memory_budget_gb=memory_limit,
                      differential_privacy_epsilon=privacy_limit,
                      hold_out_fraction=hold_out_fraction,
                      sample_fraction=sample_fraction)
    search.fit(X_train, y_train, categorical_indicator=categorical_indicator, scorer=my_scorer)

    best_pipeline = search.get_best_pipeline()

    test_score = 0.0
    if type(best_pipeline) != type(None):
        test_score = my_scorer(search.get_best_pipeline(), X_test, y_test)


    return test_score, search