Example #1
def drop_weak_columns(ctx, feature_selector=None):
    import numpy as np
    import sklearn.feature_selection as fs

    if feature_selector is None and ctx.is_regression:
        feature_selector = fs.SelectFwe(fs.f_regression)

    if feature_selector is None and ctx.is_classification:
        feature_selector = fs.SelectFwe(fs.f_classif)

    X, y = ctx.training_data()
    X = X._float_array()

    feature_selector.fit(X, y)
    weak_cols = np.where(np.invert(feature_selector.get_support()))[0]

    # For now, ignore the selector if it wants to drop every column.
    if 0 < len(weak_cols) < len(ctx.matrix.columns):
        _drop_weak_columns(ctx, weak_cols.tolist())
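
For comparison, a self-contained sketch of the same SelectFwe pattern on a synthetic dataset (the project-specific ctx object is not reproduced here; the dataset parameters are arbitrary illustrations):

import numpy as np
import sklearn.feature_selection as fs
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=20, n_informative=5,
                           random_state=0)
selector = fs.SelectFwe(fs.f_classif).fit(X, y)
# Columns not supported by the selector are the "weak" ones.
weak_cols = np.where(~selector.get_support())[0]
print("columns SelectFwe would drop:", weak_cols.tolist())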
Example #2
import sys

import sklearn.feature_selection as fs_scikit
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


def get_fs_model(model, method, train, target=None, cv=None):
    """Connect the given model to the specified feature-selection method
    and train the resulting structure.
    """
    if method == "RFE":
        model = fs_scikit.RFE(model, 2, step=5)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    if method == "RFECV":
        model = fs_scikit.RFECV(model, 3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])

    # elif method == "Anova":
    # ANOVA SVM-C
    # anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    # model = Pipeline([
    #     ('feature_selection', anova_filter),
    #     ('data_mining', model)
    # ])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
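
A quick usage sketch (not from the source; the dataset and classifier are arbitrary illustrations). With "SelectFwe", get_fs_model returns an unfitted Pipeline, so it is fit afterwards:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)

# Wrap the classifier in a SelectFwe feature-selection pipeline, then fit it.
model = get_fs_model(RandomForestClassifier(n_estimators=50), "SelectFwe", X, y)
model.fit(X, y)
print(model.predict(X[:5]))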
def selection_process(b_data, labels):

    # scatter_variables(all_data, 'MONTH')

    # n_data = normalize(b_data)
    # The idea is to compare possible combinations of outcomes.

    # this works:
    # X_train, X_test, y_train, y_test = ms.train_test_split(b_data, labels, test_size=0.33, random_state=RANDOM_SEED, stratify=labels)
    # clf = Pipeline([('RandomForestClassifier', RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state=RANDOM_SEED))])
    # results = benchmark(clf, 'RandomForestClassifier', X_train, y_train, X_test, y_test)

    # for i in all_data.columns.values:
    #    print(all_data[i].min(), '-', all_data[i].max())
    # clf = svm_classify(X_train, y_train)
    # y_pred = clf.predict(X_test)
    # print(classification_report(y_test, y_pred))

    # this doesn't work:
    # attrib = attributes_selection(b_data, labels, k=50, invariant=True, function=f_classif) # mutual_info_classif
    # red_data = b_data[attrib[0]]
    # n_data = normalize(red_data)
    # X_train, X_test, y_train, y_test = ms.train_test_split(n_data, labels, test_size=0.33, random_state=RANDOM_SEED)
    #results = benchmark(clf, X_train, y_train, X_test, y_test)
    # sel_clf = svm_classify(X_train, y_train)
    # sel_y_pred = sel_clf.predict(X_test)
    # print(classification_report(y_test, sel_y_pred))

    # Split once up front (this is the split from the "this works" block
    # above; the live loop below needs X_train/X_test/y_train/y_test).
    X_train, X_test, y_train, y_test = ms.train_test_split(
        b_data, labels, test_size=0.33, random_state=RANDOM_SEED,
        stratify=labels)

    for slt_name, slt in (
        ('LinearSVCselection',
         SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        ('SelectKBest', SelectKBest(mutual_info_classif, k=50)),
        # SelectFwe needs a score function that returns p-values, so f_classif
        # is used here instead of mutual_info_classif.
        ('SelectFwe', fs.SelectFwe(f_classif, alpha=0.05)),
    ):
        print('&' * 80)
        print(slt_name)
        results = []
        for clf_name, clf in (("RidgeClassifier",
                               RidgeClassifier(tol=1e-2, solver="lsqr")),
                              ("Perceptron", Perceptron(max_iter=50)),
                              ("PassiveAggressive",
                               PassiveAggressiveClassifier(max_iter=50)),
                              ("kNN", KNeighborsClassifier(n_neighbors=10)),
                              ("RandomForest",
                               RandomForestClassifier(n_estimators=100))):
            print('=' * 80)
            print(clf_name)
            # Build a fresh two-step pipeline per classifier; appending to a
            # shared Pipeline would accumulate steps across iterations.
            pipe = Pipeline([('feature_selection', slt),
                             ('classifier', clf)])
            results.append(
                benchmark(pipe, clf_name, X_train, y_train, X_test, y_test))
Example #4
def select_best():
    df = pd.merge(
        acw.gen_long_data(tpt)
            .normalize(columns="metric")
            .add_net_meta(tpt.net_hierarchy(HierarchyName.RESTRICTED_PERIPHERY_CORE))
            .groupby(["task", "subject", "region", "net_meta"]).mean().reset_index()
            .rename(columns={"metric": "acw"}),
        acz.gen_long_data(tpt)
            .normalize(columns="metric")
            .add_net_meta(tpt.net_hierarchy(HierarchyName.RESTRICTED_PERIPHERY_CORE))
            .groupby(["task", "subject", "region", "net_meta"]).mean().reset_index()
            .rename(columns={"metric": "acz"}),
        on=["task", "subject", "region", "net_meta"], sort=False).and_filter(NOTnet_meta="M")

    X = df.iloc[:, -2:].values
    y = df.net_meta.map({"C": 0, "P": 1}).values

    functions = [fs.mutual_info_classif, fs.f_classif, fs.chi2]
    for func in functions:
        # Caveats: chi2 requires non-negative features, and mutual_info_classif
        # returns no p-values, so SelectFdr/SelectFpr/SelectFwe cannot use it.
        for method in [fs.SelectKBest(func, k=1), fs.SelectPercentile(func),
                       fs.SelectFdr(func), fs.SelectFpr(func),
                       fs.SelectFwe(func)]:
            method.fit(X, y)
            print(f'{str(method).split("(")[0]} {func.__name__}: {np.argmax(method.scores_) + 1}')
Example #5
def get_feature(n):
    x_train, y_train = train_data(n)
    # x_test, y_test = test_data()
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    # x_test = np.array(x_test)
    # y_test = np.array(y_test)

    kf = StratifiedKFold(n_splits=5, shuffle=True)
    p = 0
    # y_train=np.ravel(y_train)
    # res=sm.Logit(y_train,x_train).fit(method='bfgs')
    # print(res.summary())
    a=[]
    for train_index, test_index in kf.split(x_train,y_train):
        X_train1, X_test1 = x_train[train_index], x_train[test_index]
        y_train1, y_test1 = y_train[train_index], y_train[test_index]
        select_feature = fs.SelectFwe()
        select_feature.fit(X_train1, y_train1)
        print(select_feature.get_support(True))
        # get the reduced X arrays
        X_train1 = select_feature.transform(X_train1)
        X_test1 = select_feature.transform(X_test1)


        # y_train=np.ravel(y_train)

        clf = LogisticRegression()
        clf.fit(X_train1, y_train1)
        y_pred = clf.predict(X_test1)
        a.append(np.mean(y_pred == y_test1))
        p = p + np.mean(y_pred == y_test1)

    print(a)
    # np.mean(y_pred == y_test1) is accuracy, not precision
    print("mean accuracy:")
    print(p / 5)
    # The selector returned is the one fitted on the last fold.
    return select_feature
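
A possible follow-up (illustrative only; n=10 and the reuse of train_data are assumptions, not part of the source):

# Reuse the selector fitted on the last fold to reduce new data.
selector = get_feature(10)
X_new, _ = train_data(10)
X_new_reduced = selector.transform(np.array(X_new))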
Example #6

_feature_selectors = []
_feature_selectors.append((feature_selection.SelectKBest(k=1),
                           pd_feature_selection.SelectKBest(k=1), True))
_feature_selectors.append(
    (feature_selection.SelectKBest(k=1),
     pickle.loads(pickle.dumps(pd_feature_selection.SelectKBest(k=1))), True))
_feature_selectors.append((feature_selection.SelectKBest(k=2),
                           pd_feature_selection.SelectKBest(k=2), True))
_feature_selectors.append((feature_selection.SelectPercentile(),
                           pd_feature_selection.SelectPercentile(), True))
_feature_selectors.append(
    (feature_selection.SelectFdr(), pd_feature_selection.SelectFdr(), True))
_feature_selectors.append(
    (feature_selection.SelectFwe(), pd_feature_selection.SelectFwe(), True))
# Tmp Ami
if False:
    _feature_selectors.append(
        (feature_selection.RFE(linear_model.LogisticRegression()),
         pd_feature_selection.RFE(pd_linear_model.LogisticRegression()), True))

_keras_estimators = []
if _level > 0:
    _keras_estimators.append(
        (KerasClassifier(_build_classifier_nn, verbose=0),
         PdKerasClassifier(_build_classifier_nn,
                           _load_iris()[0]['class'].unique(),
                           verbose=0), False))
    _keras_estimators.append((KerasRegressor(_build_regressor_nn, verbose=0),
                              PdKerasRegressor(_build_regressor_nn,
                                               verbose=0), False))
Example #7
#
# MAIN
#
synonyms_filepath = io_utils.get_synonyms_filepath()

UNIVARIATE = {
    "uv_kbest_def": feature_selection.SelectKBest(f_classif, k=10),
    "uv_kbest_chi2_def": feature_selection.SelectKBest(chi2, k=10),
    "uv_percentile_def": feature_selection.SelectPercentile(f_classif, percentile=10),
    "uv_fpr_def": feature_selection.SelectFpr(f_classif),
    "uv_fwe_def": feature_selection.SelectFwe(f_classif),
}

print "Preparing Train Collection"
X_train, y_train = create_train_data(io_utils.get_train_vectors_list())
print "Preparing Test Collection"
X_test, test_collections = create_test_data(io_utils.get_train_vectors_list())

# Univariate
for univariate_model_name, model in UNIVARIATE.items():
    model.fit(X_train, y_train)
    X_train_new = model.transform(X_train)
    X_test_new = model.transform(X_test)
    for method_name in CLASSIFIERS:
        name = "{}_{}".format(method_name, univariate_model_name)
Example #8
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` in the search list requests estimator evaluation.
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular-expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (
            len(lst) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name] = ev
            else:
                # Estimator evaluation only; a trailing `-` on the parameter
                # name requests it.
                # TODO: maybe add a regular-expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO: regular-expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
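
The expected shape of params_builder is only implied by the code; a hypothetical input for get_search_params might look like this (the parameter name and values are arbitrary illustrations):

params_builder = {
    'param_set': [
        # Search over a final-estimator parameter.
        {'search_param_selector': {
            'search_p': 'C: [0.1, 1.0, 10.0]',
            'selected_param_type': 'final_estimator_p'}},
    ]
}
print(get_search_params(params_builder))
# expected: {'estimator__C': [0.1, 1.0, 10.0]}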
Example #10
import numpy as np
import sklearn.ensemble
import sklearn.naive_bayes
import sklearn.tree
from sklearn import feature_selection, linear_model, preprocessing
from sklearn.cluster import FeatureAgglomeration
from sklearn.metrics import balanced_accuracy_score


def fit(actions, dataset):

    X_train, X_test, y_train, y_test = dataset
    seq = {}
    #fit_transformer
    if actions[1].item() == 0:
        log1p_fit_transformer = preprocessing.FunctionTransformer()
        seq[1] = log1p_fit_transformer
    else:
        quantile_fit_transformer = preprocessing.QuantileTransformer(
            random_state=0)
        seq[1] = quantile_fit_transformer
    #scaler
    if actions[3].item() == 0:
        standard_scaler = preprocessing.StandardScaler()
        seq[2] = standard_scaler
    elif actions[3].item() == 1:
        robust_scaler = preprocessing.RobustScaler()
        seq[2] = robust_scaler
    else:
        min_max_scaler = preprocessing.MinMaxScaler()
        seq[2] = min_max_scaler
    # constructors
    if actions[5].item() == 0:
        seq[3] = preprocessing.PolynomialFeatures(interaction_only=True)
    else:
        seq[3] = FeatureAgglomeration(n_clusters=5)
    # selector
    if actions[7].item() == 0:
        selecter = feature_selection.SelectFwe()
        seq[4] = selecter
    elif actions[7].item() == 1:
        selecter = feature_selection.SelectPercentile()
        seq[4] = selecter
    elif actions[7].item() == 2:
        selecter = feature_selection.RFE(
            sklearn.ensemble.ExtraTreesClassifier())
        seq[4] = selecter
    else:
        selecter = feature_selection.SelectFromModel(
            sklearn.ensemble.ExtraTreesClassifier(), threshold="median")
        seq[4] = selecter
    #models
    if actions[-1].item() == 0:
        model = sklearn.naive_bayes.GaussianNB()
        seq[5] = model
    elif actions[-1].item() == 1:
        model = sklearn.ensemble.RandomForestClassifier()
        seq[5] = model
    elif actions[-1].item() == 2:
        model = sklearn.naive_bayes.BernoulliNB()
        seq[5] = model
    elif actions[-1].item() == 3:
        model = linear_model.LogisticRegression()
        seq[5] = model
    elif actions[-1].item() == 4:
        model = sklearn.tree.DecisionTreeClassifier()
        seq[5] = model
    else:
        model = sklearn.ensemble.ExtraTreesClassifier()
        seq[5] = model

    #connectivity
    transformed = {}
    #For Node 1
    transformed[1] = seq[1].fit_transform(X_train)
    #For Node 2
    if actions[2].item() == 0:
        transformed[2] = seq[2].fit_transform(X_train)
    elif actions[2].item() == 1:
        transformed[2] = seq[2].fit_transform(transformed[1])
    #For Node 3
    if actions[4].item() == 0:
        transformed[3] = seq[3].fit_transform(X_train)
    elif actions[4].item() == 1:
        transformed[3] = seq[3].fit_transform(transformed[1])
    elif actions[4].item() == 2:
        transformed[3] = seq[3].fit_transform(transformed[2])
    #For Node 4
    if actions[6].item() == 0:
        transformed[4] = seq[4].fit_transform(X_train, y_train)
    elif actions[6].item() == 1:
        transformed[4] = seq[4].fit_transform(transformed[1], y_train)
    elif actions[6].item() == 2:
        transformed[4] = seq[4].fit_transform(transformed[2], y_train)
    elif actions[6].item() == 3:
        transformed[4] = seq[4].fit_transform(transformed[3], y_train)

    # Leaf nodes: stages whose output is not consumed by any later stage.
    leaf_nodes = set(range(5)) - {i.item() for i in actions[0:-1:2]}
    # print(leaf_nodes)
    merge_data = np.concatenate([transformed[i] for i in leaf_nodes], axis=1)
    last_selecter = feature_selection.SelectFromModel(
        sklearn.ensemble.ExtraTreesClassifier(), threshold="median")
    merge_data = last_selecter.fit_transform(merge_data, y_train)
    clf = seq[5].fit(merge_data, y_train)

    #test data
    test_transformed = {}
    #For Node 1
    test_transformed[1] = seq[1].transform(X_test)
    #For Node 2
    if actions[2].item() == 0:
        test_transformed[2] = seq[2].transform(X_test)
    elif actions[2].item() == 1:
        test_transformed[2] = seq[2].transform(test_transformed[1])
    #For Node 3
    if actions[4].item() == 0:
        test_transformed[3] = seq[3].transform(X_test)
    elif actions[4].item() == 1:
        test_transformed[3] = seq[3].transform(test_transformed[1])
    elif actions[4].item() == 2:
        test_transformed[3] = seq[3].transform(test_transformed[2])
    #For Node 4
    if actions[6].item() == 0:
        test_transformed[4] = seq[4].transform(X_test)
    elif actions[6].item() == 1:
        test_transformed[4] = seq[4].transform(test_transformed[1])
    elif actions[6].item() == 2:
        test_transformed[4] = seq[4].transform(test_transformed[2])
    elif actions[6].item() == 3:
        test_transformed[4] = seq[4].transform(test_transformed[3])

    # print([test_transformed[i].shape for i in leaf_nodes])

    merge_data = np.concatenate([test_transformed[i] for i in leaf_nodes],
                                axis=1)
    merge_data = last_selecter.transform(merge_data)
    pred_test = clf.predict(merge_data)

    # reward = metrics.accuracy_score(y_test, pred_test)
    reward = balanced_accuracy_score(y_test, pred_test)
    # print(clf)
    # print(reward)
    # print('\nPrediction accuracy for the normal test dataset with log tranformer')
    # print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

    return reward
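
A hypothetical driver for fit(); the nine-element action layout is inferred from the indexing above, and the dataset and action values are arbitrary illustrations:

import torch
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
dataset = train_test_split(X, y, test_size=0.3, random_state=0)

# Even indices pick each stage's input node, odd indices pick the component,
# and the last entry picks the final model (here: SelectFwe + decision tree).
actions = torch.tensor([0, 0, 1, 1, 2, 0, 3, 0, 4])
print(fit(actions, dataset))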