Code example #1
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
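Note: the test snippets in this collection reference module-level X and y fixtures that are defined elsewhere in mlxtend's test module. A minimal sketch of the fixture they assume (code example #27 below loads the same data explicitly):

# Assumed fixture for the test snippets: the 150-sample Iris dataset,
# which is why the string labels above are assigned in blocks of 50.
from mlxtend.data import iris_data

X, y = iris_data()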
Code example #2
def votingEnsembleTest2ndLayer_Test(top_ensembles_dict, test_country_data):
    hit_count = 0
    for BC in top_ensembles_dict.keys():
        classifiers = [sub_list[1] for sub_list in top_ensembles_dict[BC]]
        _weights = np.asarray([1] * len(classifiers))
        vclf_layer2 = EnsembleVoteClassifier(clfs=classifiers,
                                             weights=_weights,
                                             refit=False)
        Y = test_country_data[BC]["Y"]
        X = test_country_data[BC]["X"]
        vclf_layer2.fit(X, Y)
        y_estimate = vclf_layer2.predict(X)
        print(
            "Mentality Cycle {} 2nd Layer Voting Classifier Ensemble has accuracy: {}"
            .format(BC, np.mean(Y == y_estimate)))
        hit_count = hit_count + np.sum(
            Y == y_estimate
        )  ##calc overall performance of top 3 classifiers for each region

    total_obvs = (test_country_data[1]["Y"].shape[0] +
                  test_country_data[2]["Y"].shape[0] +
                  test_country_data[3]["Y"].shape[0])
    overall_hit_rate = hit_count / total_obvs
    print("Aggregated accuracy of 2nd Layer Voting Classifiers is: {}".format(
        overall_hit_rate))
Code example #3
def test_no_weight_support_with_no_weight():
    logi = LogisticRegression()
    rf = RandomForestClassifier()
    gnb = GaussianNB()
    knn = KNeighborsClassifier()
    eclf = EnsembleVoteClassifier(clfs=[logi, rf, gnb, knn], voting='hard')
    eclf.fit(X, y)
Code example #4
def test_no_weight_support_with_no_weight():
    logi = LogisticRegression(solver='liblinear', multi_class='ovr')
    rf = RandomForestClassifier(n_estimators=10)
    gnb = GaussianNB()
    knn = KNeighborsClassifier()
    eclf = EnsembleVoteClassifier(clfs=[logi, rf, gnb, knn], voting='hard')
    eclf.fit(X, y)
Code example #5
def ensemble_train(feature, label):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from mlxtend.classifier import EnsembleVoteClassifier, StackingClassifier
    import xgboost as xgb
    label = transport_labels(label)
    X_train, X_test, Y_train, Y_test = train_test_split(feature,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=1000)
    clf1 = SVC(C=10, kernel='sigmoid', probability=True)
    clf2 = RandomForestClassifier(random_state=0)
    clf3 = LogisticRegression(random_state=0)
    clf4 = xgb.XGBClassifier(max_depth=8,
                             learning_rate=0.07,
                             n_estimators=35,
                             silent=True,
                             objective="binary:logistic",
                             booster='gbtree',
                             gamma=0,
                             min_child_weight=6,
                             subsample=0.8,
                             colsample_bytree=0.7,
                             reg_alpha=0.1,
                             seed=1000)
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf3, clf4], voting='soft')

    eclf.fit(X_train, Y_train)
    y_pred = eclf.predict(X_test)
    print('eclf accs=%f' %
          (sum(1 for i in range(len(y_pred)) if y_pred[i] == Y_test[i]) /
           float(len(y_pred))))
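A note on the closing expression: it is a hand-rolled accuracy. Assuming y_pred and Y_test are plain arrays (so positional indexing is valid), the same value can be computed in one vectorized line:

import numpy as np  # assumed import

print('eclf accs=%f' % np.mean(y_pred == np.asarray(Y_test)))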
Code example #6
def tri_train(domain,X_train,y_train,X_test,y_test,X_un,theta=0.5,dis=False):
    models = list()
    accs = list()
    for i in range(3):   
        X_split,y_split = bootstrap_sample(X_train,y_train)
        acc,clf_func = get_acc_clf(domain,X_split,y_split,X_test,y_test)
        models.append(clf_func)
        accs.append(acc)

    for (j,k) in itertools.combinations(models,2):
        # i_features = list()
        unlabelled_features = np.array(X_un)
        total = len(X_train)+len(X_un)
        t = 0
        count = 0
        X_i = X_train
        y_i = y_train
        # find current classifier
        clf_i = [x for x in models if x!=j and x!=k][0]
        index_i = models.index(clf_i)
        print "***classifier %d***"%index_i
        while count < total and len(unlabelled_features)!=0:
            t += 1            
            X_tgt,y_tgt = get_features(unlabelled_features,j,k,clf_i,models,theta=theta,dis=dis)
            if len(X_tgt)==0 and t>1:
                print("no new features added")
                break
            
            X_i = concatenate(X_i,X_tgt)
            y_i = concatenate(y_i,y_tgt)
            count = len(X_i)
            print "%d %d %d"%(t,count,total)
            # clf_i.fit(X_i,y_i)
            # update classifier
            acc,clf_i = get_acc_clf(domain,X_i,y_i,X_test,y_test)
            if accs[index_i]<acc:
                accs[index_i] = acc
                # best_clf = clf_i
                print "*NEW BEST! best acc:", acc
                models[index_i] = clf_i
            else:
                print "no improvement..skip.."
                break
            if count == total:
                print "reach end.."
                break
            # update the unlabelled features for speed-up
            print(np.array(X_tgt).shape)
            X_tgt = [list(x) for x in X_tgt]
            unlabelled_features = [x for x in unlabelled_features if list(x) not in X_tgt]
            print(np.array(unlabelled_features).shape)
    # majority vote classifiers
    eclf = EnsembleVoteClassifier(clfs=models, weights=[1, 1, 1], refit=False)
    eclf.fit(X_test, y_test)  # refit=False: this fit only sets up the label encoder
    # tmp_name = domain.upper()[0] if "large" not in domain else "large/"+domain.upper()[6]
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test,pred) if "large" not in domain else f1_score(y_test,pred,average='macro')

    print "acc:%s theta:%s"%(acc,theta),"seprate accs:",accs
    return acc,eclf
Code example #7
    def make_model(self):
        #---------------------------------------------------------------------------------------------
        #                       TREE BASED ALGORITHMS
        #---------------------------------------------------------------------------------------------

        #--Choosing the random_state parameter
        #------Basically, a sub-optimal greedy algorithm is repeated a number of times using----------
        #------random selections of features and samples (a similar technique used in random----------
        #------forests). The 'random_state' parameter allows controlling these random choices---------

        #--n_estimators = no of decision trees to be created in forest

        model_rf = RandomForestClassifier(n_estimators=145,
                                          random_state=10,
                                          n_jobs=-1)
        model_rf.fit(train_feats2, target)

        # Note: GradientBoostingClassifier and AdaBoostClassifier do not accept
        # an n_jobs parameter (boosting is inherently sequential).
        model_gb = GradientBoostingClassifier(n_estimators=145,
                                              random_state=11)
        model_gb.fit(train_feats2, target)

        model_ab = AdaBoostClassifier(n_estimators=145,
                                      random_state=12)
        model_ab.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               LOGISTIC REGRESSION
        #--------------------------------------------------------------------------------------------

        model_lr = LogisticRegression(random_state=1)
        model_lr.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               NAIVE BAYES
        #--------------------------------------------------------------------------------------------

        model_nb = MultinomialNB()
        model_nb.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               VOTING ENSEMBLE OF ALL MODELS
        #--------------------------------------------------------------------------------------------

        clf = [model_rf, model_lr, model_gb, model_ab, model_nb]
        eclf = EnsembleVoteClassifier(
            clfs=clf, weights=[1, 2, 1, 1, 1],
            refit=False)  #weights can be decided by stacking!!
        eclf.fit(train_feats2, target)
        print("model created")
        preds = eclf.predict(test_feats2)
        sub3 = pd.DataFrame({'User_ID': test_df.User_ID, 'Is_Response': preds})
        sub3['Is_Response'] = sub3['Is_Response'].map(
            lambda x: functions.to_labels(self, x))
        sub3 = sub3[['User_ID', 'Is_Response']]
        sub3.to_csv('D:\\New folder\\f2c2f440-8-dataset_he\\SUB_TEST.csv',
                    index=False)
        print("prediction saved")
        return eclf
Code example #8
def majority_vote(target):
    X_test = load_obj("%s/X_test"%target)
    y_test = load_obj("%s/y_test"%target)

    domains = []
    if "mlp" in target:
        domains = ["mlp/books","mlp/dvd","mlp/electronics","mlp/kitchen"]
    else:
        if "large" not in target:
            domains = ["books","dvd","electronics","kitchen"]
            if target not in domains:
                return
        else:
            domains =["large/baby","large/cell_phone","large/imdb","large/yelp2014"]

    models = []
    for source in domains:
        if target == source:
            continue
        else:
            print(source)
            clf_func = load_obj("%s/self_clf"%source)
            models.append(clf_func)


    eclf = EnsembleVoteClassifier(clfs=models, refit=False)  # weights=[1,1,1],
    eclf.fit(X_test, y_test)  # refit=False: this fit only sets up the label encoder
    if "large" in target:
        tmp_name = "large/" + target.upper()[6]
    elif "mlp" in target:
        tmp_name = "mlp/" + target.upper()[4]
    else:
        tmp_name = target.upper()[0]
    save_obj(eclf, '%s_eclf'%(tmp_name))
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test,pred) if "large" not in target else f1_score(y_test,pred,average='macro')
    print('self-train', acc)
Code example #9
 def __init__(self, X, y, x_test, model_lists):
     self.model = EnsembleVoteClassifier(clfs=model_lists,
                                         weights=[1, 1, 1],
                                         refit=False,
                                         voting='soft')
     self.X = X
     self.y = y
     self.X_test = x_test
Code example #10
def train_knn_model(assts, n_macroepochs=100, n_epochs=10):
    TUNE = False
    #we start by fitting pca across the whole population (random sample)
    sgen = xy_generator(assts, batch_size=5000)
    pca = PCA(n_components=48)
    for _,X,y,_,_,_,_ in sgen:
        print("fitting PCA...")
        X = numpy.array(X, dtype=numpy.int8)
        y = numpy.array(y).ravel()
        pca.fit_transform(X)
        # if TUNE:
        #     tuned_parameters = [{'n_neighbors': [1, 20, 50, 100],
        #                          'weights': ['distance', 'uniform'],
        #                          'algorithm': ['ball_tree', 'kd_tree', 'brute']
        #                          }]
        #     scores = ['f1_macro', 'f1_micro', 'accuracy']
        #     # scores = ['accuracy']
        #     performances = []
        #     print("Tuning")
        #     for score in scores:
        #         print("# Tuning hyper-parameters for %s" % score)
        #         clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=5, scoring=score, verbose=0, n_jobs=7)
        #         clf.fit(X, y)
        #         print("Best parameters set found on development set:")
        #         print(clf.best_estimator_)
        #         print("Grid scores on development set:")
        #         for params, mean_score, scores in clf.grid_scores_:
        #             print("%0.3f (+/-%0.03f) for %r"
        #                   % (mean_score, scores.std() / 2, params))
        #
        break  # half-loop just to get one sample batch from sgen
    del sgen
    print("fitted")

    gc.collect()

    xygen = xy_generator(assts, batch_size=5000)  # make generator object
    clfs = []
    i = 0
    for S,X, y, yc, yt, ylv, yv in xygen:
        X = numpy.array(X, dtype=numpy.int8)
        y = numpy.array(y)
        X = pca.transform(X)
        voter = SVC()
        voter.fit(X,y)
        clfs.append(voter)
        i += 1
    model = EnsembleVoteClassifier(clfs=clfs, refit=False)

    X_for_classes = []
    y_for_classes = []
    for classlabel in all_page_ids:
        X_for_classes.append(numpy.zeros(256))
        y_for_classes.append(classlabel)

    model.fit(X_for_classes,y_for_classes)
    return model, pca, None, None #, sscaler, levscaler, volscaler
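A note on the final fit: passing zero vectors works only because refit=False. In that mode, as far as mlxtend's implementation goes, EnsembleVoteClassifier.fit does not touch the features at all and merely builds the label encoding over y, so all_page_ids simply enumerates the classes the ensemble must know about.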
Code example #11
def test_no_weight_support():
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])
    logi = LogisticRegression(solver='liblinear', multi_class='ovr')
    rf = RandomForestClassifier(n_estimators=10)
    gnb = GaussianNB()
    knn = KNeighborsClassifier()
    eclf = EnsembleVoteClassifier(clfs=[logi, rf, gnb, knn], voting='hard')
    eclf.fit(X, y, sample_weight=w)
Code example #12
File: MulticriteriaEnsemble.py  Project: flokos/MuCE
 def _fit_wmv(self):
     #Merge the base learners and the produced models (extra)
     models = list(self.bootstrap_models.values())
     #Define the Weighted Majority Voting model
     self.wmv_model = EnsembleVoteClassifier(clfs=models,
                                             weights=self.weights,
                                             voting=self.voting,
                                             refit=False)
     #Fit the WMV model
     self.wmv_model.fit(self.dataset.X_train, self.dataset.y_train)
Code example #13
    def ensemble(self, folds_limit=42):
        answers = []

        # clf1 = ExtraTreesClassifier(max_features=0.4, min_samples_leaf=1, min_samples_split=4,
        #                             n_estimators=1000, n_jobs=self.cpu)
        # clf2 = ExtraTreesClassifier(criterion="gini", max_features=0.4, min_samples_split=6, n_estimators=1000,
        #                             n_jobs=self.cpu)
        # clf3 = ExtraTreesClassifier(max_features=0.55, min_samples_leaf=1, min_samples_split=4, n_estimators=1000,
        #                             n_jobs=self.cpu)
        # clf4 = ExtraTreesClassifier(max_features=0.45, min_samples_leaf=1, min_samples_split=5, n_estimators=1000,
        #                             n_jobs=self.cpu)

        # default 0.6742 on seed=42 for full set (search_best_3)
        clf1 = ExtraTreesClassifier(max_features=0.4537270875668709,
                                    criterion='entropy',
                                    min_samples_leaf=1,
                                    min_samples_split=2,
                                    n_estimators=3138,
                                    n_jobs=self.cpu)

        # clf1 = RandomForestClassifier(max_features=0.34808889858456293, criterion='entropy',
        #                               min_samples_split=2, n_estimators=4401, n_jobs=self.cpu)

        # default
        # clf1 = ExtraTreesClassifier(max_features=0.4, min_samples_leaf=1, min_samples_split=2, n_estimators=1000,
        #                             n_jobs=self.cpu)

        self.pipeline = EnsembleVoteClassifier(clfs=[clf1],
                                               weights=[1],
                                               voting='soft')

        for iteration in range(folds_limit):
            np.random.seed(42 + iteration)

            x_train, y_train, x_test, y_test = self.get_fold(
                self.default_columns)
            self.pipeline.fit(x_train, y_train)
            preds = self.pipeline.predict(x_test)

            # print(confusion_matrix(y_test, preds))
            matrix_ = confusion_matrix(y_test, preds)
            correct_answers = (matrix_[0][0] + matrix_[1][1] + matrix_[2][2] +
                               matrix_[3][3] + matrix_[4][4])
            print('   Correct answers count: ', correct_answers,
                  ' [it: %s]' % iteration)
            answers.append(int(correct_answers))
            if iteration % 5 == 0 and iteration > 0:
                print('Params: mean: %s std: %s best: %s' %
                      (np.mean(answers), np.std(answers), max(answers)))
        print('Params: mean: %s std: %s best: %s' %
              (np.mean(answers), np.std(answers), max(answers)))
Code example #14
def test_sample_weight():
    # with no weight
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard')
    prob1 = eclf.fit(X, y).predict_proba(X)

    # with weight = 1
    w = np.ones(len(y))
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard')
    prob2 = eclf.fit(X, y, sample_weight=w).predict_proba(X)

    # with random weight
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard')
    prob3 = eclf.fit(X, y, sample_weight=w).predict_proba(X)

    diff12 = np.max(np.abs(prob1 - prob2))
    diff23 = np.max(np.abs(prob2 - prob3))
    assert diff12 < 1e-3, "max diff is %.4f" % diff12
    assert diff23 > 1e-3, "max diff is %.4f" % diff23
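The three blocks verify sample_weight handling: fitting with uniform weights must reproduce the unweighted fit (diff12 below 1e-3), while random per-sample weights must actually change the fitted models and hence the predicted probabilities (diff23 above 1e-3).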
Code example #15
def test_1model_probas():
    clf = LogisticRegression(multi_class='multinomial',
                             solver='newton-cg', random_state=123)
    ens_clf_1 = EnsembleVoteClassifier(clfs=[clf], voting='soft', weights=None)
    ens_clf_2 = EnsembleVoteClassifier(clfs=[clf], voting='soft', weights=[1.])

    pred_e1 = ens_clf_1.fit(X, y).predict_proba(X)
    pred_e2 = ens_clf_2.fit(X, y).predict_proba(X)
    pred_e3 = clf.fit(X, y).predict_proba(X)

    np.testing.assert_almost_equal(pred_e1, pred_e2, decimal=8)
    np.testing.assert_almost_equal(pred_e1, pred_e3, decimal=8)
Code example #16
def test_get_params():
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1, n_estimators=10)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3])

    got = sorted(list({s.split('__')[0] for s in eclf.get_params().keys()}))
    expect = [
        'clfs', 'gaussiannb', 'kneighborsclassifier', 'randomforestclassifier',
        'refit', 'verbose', 'voting', 'weights'
    ]
    assert got == expect, got
Code example #17
def majority_vote_mlp(target):
    X_test = load_obj("%s/X_test"%target)
    y_test = load_obj("%s/y_test"%target)

    # domains = ["mlp/books","mlp/dvd","mlp/electronics","mlp/kitchen"]
    data_name = ["books", "dvd", "electronics", "kitchen"]
    X_joint = load_obj("%s/X_joint"%target)
    y_joint = load_obj("%s/y_joint"%target)
    temp_un = load_obj("%s/X_un"%target)
    meta_sources = []
    for i in range(len(data_name)):
        if 'mlp/'+data_name[i] != target:
            meta_sources.append(data_name[i])
    # print meta_sources
    models = []
    for j in range(len(meta_sources)):
        temp_X = X_joint[j]
        temp_y = y_joint[j]
        thetas = [0.5,0.6,0.7,0.8,0.9]
        best_acc = 0.0
        best_clf =""
        best_theta = 0.0
        resFile = open("../work/params/%s_theta_self-%s.csv"%(target,meta_sources[j].upper()[0]),"w")
        resFile.write("theta, acc\n")
        for theta in thetas:
            print "##############################"
            print "start with theta=%s"%theta
            print "##############################"
            acc,clf_func = self_train(target,temp_X,temp_y,X_test,y_test,temp_un,theta=theta)
            
            if best_acc<acc:
                best_acc = acc
                best_clf = clf_func
                best_theta = theta

            resFile.write("%f, %f\n"%(theta,acc))
            resFile.flush()
        resFile.close()
        print "##############################"
        print "best_theta:",best_theta,"best_acc:",best_acc
        models.append(best_clf)

    eclf = EnsembleVoteClassifier(clfs=models, refit=False)  # weights=[1,1,1],
    eclf.fit(X_test, y_test)  # refit=False: this fit only sets up the label encoder
    # tmp_name = target.upper()[0] if "large" not in target else "large/"+target.upper()[6]
    # tmp_name = 'mlp/'+target.upper()[4]
    save_obj(eclf, "%s/self_clf"%target)
    pred = eclf.predict(X_test)
    # print pred
    acc = accuracy_score(y_test,pred)
    print('self-train', acc)
Code example #18
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
Code example #19
def votingEnsembleTest(all_country_data_with_algos, test_country_data_US):
    print(
        " \n For each training-set country, for each sub dataset (split by Mentality Cycle): the top n trained algorithms form a Voting Classifier. Each Voting Classifier is then tested on its corresponding US sub dataset. An aggregate score for each training-set country is calculated by aggregating its 3 Voting Classifiers' performances"
    )
    _all_country_data_with_trained_algos = copy.deepcopy(
        all_country_data_with_algos)

    for country in _all_country_data_with_trained_algos.keys():
        country_level_total_hits = 0
        for BC in _all_country_data_with_trained_algos[country].keys():
            classifiers = copy.deepcopy(
                _all_country_data_with_trained_algos[country][BC].get(
                    'trained algos'))

            clf_weights = np.asarray([1, 1, 1], dtype=int)

            Y = test_country_data_US[BC].get("Y")
            X = test_country_data_US[BC].get("X")

            vclf = EnsembleVoteClassifier(clfs=classifiers,
                                          weights=clf_weights,
                                          refit=False,
                                          voting='hard')  # voting='soft'

            vclf.fit(X, Y)
            y_estimate = vclf.predict(np.array(X))
            print(
                "Voting Classifier trained on {} Mentality Cycle {} has accuracy: {}"
                .format(country, BC, np.mean(Y == pd.Series(y_estimate))))

            ##saving Country-BC split accuracy and instance of Voting Classifier score to all_country... dictionary
            _all_country_data_with_trained_algos[country][BC][
                'accuracy'] = np.mean(Y == y_estimate)
            _all_country_data_with_trained_algos[country][BC][
                'votingclassifier'] = vclf
            country_level_total_hits = country_level_total_hits + np.sum(
                Y == y_estimate)

        record_count = (test_country_data_US[1]["Y"].shape[0] +
                        test_country_data_US[2]["Y"].shape[0] +
                        test_country_data_US[3]["Y"].shape[0])
        _all_country_data_with_trained_algos[country]['accuracy'] = (
            country_level_total_hits / record_count)
        print("Aggregated Classifier trained on {} has accuracy: {} \n".format(
            country,
            _all_country_data_with_trained_algos[country]['accuracy']))

    return _all_country_data_with_trained_algos
Code example #20
def test_get_params():
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3])

    got = sorted(list({s.split('__')[0] for s in eclf.get_params().keys()}))
    expect = ['clfs',
              'gaussiannb',
              'kneighborsclassifier',
              'randomforestclassifier',
              'refit',
              'verbose',
              'voting',
              'weights']
    assert got == expect, got
Code example #21
def test4():
    # Example 2 - Grid Search
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft')

    params = {
        'logisticregression__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200],
    }

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid.fit(iris.data, iris.target)

    cv_keys = ('mean_test_score', 'std_test_score', 'params')

    for r, _ in enumerate(grid.cv_results_['mean_test_score']):
        print(
            "%0.3f +/- %0.2f %r" %
            (grid.cv_results_[cv_keys[0]][r], grid.cv_results_[cv_keys[1]][r] /
             2.0, grid.cv_results_[cv_keys[2]][r]))
Code example #22
def test5():
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf1, clf2], voting='soft')

    # If the EnsembleClassifier is initialized with multiple similar estimator
    # objects, the estimator names are modified with consecutive integer
    # indices, for example:
    params = {
        'logisticregression-1__C': [1.0, 100.0],
        'logisticregression-2__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200],
    }

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid = grid.fit(iris.data, iris.target)

    cv_keys = ('mean_test_score', 'std_test_score', 'params')

    for r, _ in enumerate(grid.cv_results_['mean_test_score']):
        print(
            "%0.3f +/- %0.2f %r" %
            (grid.cv_results_[cv_keys[0]][r], grid.cv_results_[cv_keys[1]][r] /
             2.0, grid.cv_results_[cv_keys[2]][r]))
Code example #23
def test_fit_base_estimators_false():
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()

    clf1.fit(X, y)
    clf2.fit(X, y)
    clf3.fit(X, y)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  fit_base_estimators=False)

    eclf.fit(X, y)
    assert round(eclf.score(X, y), 2) == 0.97
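Note: fit_base_estimators plays the same role here as the refit flag in the surrounding snippets; newer mlxtend releases renamed the parameter, which is why this test otherwise mirrors the refit=False tests above.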
Code example #24
def ensembleClassifier(X_train, X_test, y_train, y_test, X_1_df, Y_1_df):
    print('-----------------------------')
    print('Ensemble Vote Classifier was called. Wait...')

    clf1 = LogisticRegression(C=5.0,
                              class_weight='balanced',
                              max_iter=10000,
                              random_state=1)  # C = 5.0
    clf2 = SVC(kernel='linear', C=1.0, random_state=1)  # linear SVM C = 1.0
    clf3 = KNeighborsClassifier(n_neighbors=1)  # optimum_k = 1
    clf4 = DecisionTreeClassifier(max_depth=23, criterion='gini')  #

    labels = [
        'Logistic Regression', 'Support Vector Machine', 'K Nearest Neighbor',
        'Decision Tree', 'Ensemble'
    ]

    start = time.time()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4],
                                  weights=[1, 1, 1, 1])

    for clf, label in zip([clf1, clf2, clf3, clf4, eclf], labels):
        clf.fit(X_1_df, Y_1_df)

        scores = cross_val_score(clf,
                                 X_1_df,
                                 Y_1_df.values.ravel(),
                                 cv=20,
                                 scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))

    end = time.time()
    print("Running time %.3f" % (end - start))
    return
Code example #25
def test_use_clones():
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], use_clones=True).fit(X, y)

    assert_raises(
        exceptions.NotFittedError,
        "This RandomForestClassifier instance is not fitted yet."
        " Call 'fit' with appropriate arguments"
        " before using this estimator.", clf2.predict, X)

    EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], use_clones=False).fit(X, y)

    clf2.predict(X)
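Put differently: with use_clones=True the ensemble fits internal copies and leaves the passed-in estimators untouched, which is why clf2 is still unfitted and raises NotFittedError; with use_clones=False the original objects are fitted in place, so clf2.predict(X) succeeds afterwards.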
Code example #26
def test_EnsembleVoteClassifier_gridsearch():

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft')

    params = {
        'logisticregression__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200]
    }

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid.fit(iris.data, iris.target)

    if Version(sklearn_version) < '0.18':
        mean_scores = []
        for params, mean_score, scores in grid.grid_scores_:
            mean_scores.append(round(mean_score, 2))
    else:
        mean_scores = [
            round(s, 2) for s in grid.cv_results_['mean_test_score']
        ]

    assert mean_scores == [0.95, 0.96, 0.96, 0.95]
Code example #27
def test_EnsembleVoteClassifier_gridsearch():

    clf1 = LogisticRegression(solver='liblinear',
                              multi_class='ovr',
                              random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft')

    params = {
        'logisticregression__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200]
    }

    if Version(sklearn_version) < '0.24.1':
        grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, iid=False)
    else:
        grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)

    X, y = iris_data()
    grid.fit(X, y)

    mean_scores = [round(s, 2) for s in grid.cv_results_['mean_test_score']]

    assert mean_scores == [0.95, 0.96, 0.96, 0.95]
Code example #28
class VotingModel:
    def __init__(self, X, y, x_test, model_lists):
        self.model = EnsembleVoteClassifier(clfs=model_lists,
                                            weights=[1, 1, 1],
                                            refit=False,
                                            voting='soft')
        self.X = X
        self.y = y
        self.X_test = x_test

    def train(self):
        self.model.fit(self.X, self.y)

    def predict(self):
        return self.model.predict(self.X_test)

    def predict_proba(self):
        return self.model.predict_proba(self.X_test)
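A hypothetical usage of this wrapper, assuming clf_a, clf_b, and clf_c are classifiers already fitted elsewhere (with refit=False, train() only sets up the label encoding and does not refit the base models):

# Hypothetical usage sketch; the three pre-fitted classifiers must all
# support predict_proba, since voting='soft' averages probabilities.
vm = VotingModel(X_train, y_train, X_test, [clf_a, clf_b, clf_c])
vm.train()
labels = vm.predict()
probas = vm.predict_proba()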
Code example #29
def test_clone():

    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)
    clone(eclf)
Code example #30
def test_clone():

    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)
    clone(eclf)
Code example #31
 def fit(self, X, y):
     """
     Train all the models in the ensemble.
     :param X: Features values of trainset
     :param y: Target values of trainset
     :return: ---
     """
     # self._commit_models(X, y)
     if self.parallel:
         pool = multiprocessing.Pool(processes=None)
         f = partial(self._fit_single_model, X, y)
         self.models = pool.map(f, self.models)
         pool.close()
         pool.join()
     else:
         for model in self.models:
             self._fit_single_model(X, y, model)
     self.votingClassifier = EnsembleVoteClassifier(clfs=self._get_models(), voting=self.voting, refit=False)
     self.votingClassifier.fit(X, y)
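The _fit_single_model helper is not shown. Because worker processes do not share memory with the parent, pool.map only has an effect if the helper returns the fitted model; a minimal sketch under that assumption:

 def _fit_single_model(self, X, y, model):
     # Hypothetical helper inferred from the pool.map() call above: fit one
     # base model and return the fitted copy so the parent can collect it.
     # (In the sequential branch the return value is simply ignored.)
     model.fit(X, y)
     return model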
Code example #32
    def fit(self, X, values):
        #hard prediction
        for train_index, validation_index in KFold(n_splits=self.n_folds).split(X):
            train_set = X[train_index]
            train_values = values[train_index]

            validation_set = X[validation_index]
            validation_values = values[validation_index]

            fold_model = clone(self.template_model)
            fold_model.fit(train_set, train_values) #retrains a brand new model for the fold

            fold_regressor = KNeighborsRegressor(weights=self.weights, n_neighbors=self.n_neighbors)
            fold_regressor.fit(validation_set, fold_model.predict(validation_set) == validation_values)
            self.fold_regressions.append(fold_regressor)
            self.fold_models.append(fold_model)

        self.bagger = EnsembleVoteClassifier(self.fold_models, voting="soft", refit=False)
        self.bagger.fit(X, values) #trivial fit
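For context: each fold_regressor is a kNN regressor fitted on correctness booleans, so it estimates, per region of feature space, how often that fold's model was right on held-out data. The EnsembleVoteClassifier itself just soft-votes the fold models; with refit=False, the "trivial fit" on the last line only sets up the label encoding.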
Code example #33
def test_EnsembleVoteClassifier():

    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard')

    scores = cross_val_score(eclf, X, y, cv=5, scoring='accuracy')
    scores_mean = (round(scores.mean(), 2))
    assert (scores_mean == 0.94)
Code example #34
File: vote.py  Project: clover9gu/simplemining
# Majority voting with classifiers trained on different feature subsets
#

from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector

sfs1 = SequentialFeatureSelector(clf1,
                                 k_features=4,
                                 floating=False,
                                 scoring='accuracy',
                                 print_progress=False,
                                 cv=0)
clf1_pipe = Pipeline([('sfs', sfs1),
                      ('logreg', clf1)])

eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3], voting='soft')

params = {'pipeline__sfs__k_features': [1, 2, 3],
          #'pipeline__logreg__C': [1.0, 100.0],
          'randomforestclassifier__n_estimators': [20, 200]}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid.fit(iris.data, iris.target)

for mean_score, std_score, params_ in zip(grid.cv_results_['mean_test_score'],
                                          grid.cv_results_['std_test_score'],
                                          grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, std_score / 2, params_))

print(grid.best_params_)

eclf = eclf.set_params(**grid.best_params_)
print(eclf.fit(X, y).predict(X[[1, 51, 149]]))