def test_no_weight_support_with_no_weight():
    logi = LogisticRegression()
    rf = RandomForestClassifier()
    gnb = GaussianNB()
    knn = KNeighborsClassifier()
    eclf = EnsembleVoteClassifier(clfs=[logi, rf, gnb, knn], voting='hard')
    eclf.fit(X, y)
Example #2
def test_sample_weight():
    # with no weight
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard')
    prob1 = eclf.fit(X, y).predict_proba(X)

    # with weight = 1
    w = np.ones(len(y))
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard')
    prob2 = eclf.fit(X, y, sample_weight=w).predict_proba(X)

    # with random weight
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard')
    prob3 = eclf.fit(X, y, sample_weight=w).predict_proba(X)

    diff12 = np.max(np.abs(prob1 - prob2))
    diff23 = np.max(np.abs(prob2 - prob3))
    assert diff12 < 1e-3, "max diff is %.4f" % diff12
    assert diff23 > 1e-3, "max diff is %.4f" % diff23
def emsembal_train(feature, label):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from mlxtend.classifier import EnsembleVoteClassifier, StackingClassifier
    import xgboost as xgb
    label = transport_labels(label)
    X_train, X_test, Y_train, Y_test = train_test_split(feature,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=1000)
    clf1 = SVC(C=10, kernel='sigmoid', probability=True)
    clf2 = RandomForestClassifier(random_state=0)
    clf3 = LogisticRegression(random_state=0)
    clf4 = xgb.XGBClassifier(max_depth=8,
                             learning_rate=0.07,
                             n_estimators=35,
                             silent=True,
                             objective="binary:logistic",
                             booster='gbtree',
                             gamma=0,
                             min_child_weight=6,
                             subsample=0.8,
                             colsample_bytree=0.7,
                             reg_alpha=0.1,
                             seed=1000)
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf3, clf4], voting='soft')

    eclf.fit(X_train, Y_train)
    y_pred = eclf.predict(X_test)
    print('eclf accs=%f' %
          (sum(1 for i in range(len(y_pred)) if y_pred[i] == Y_test[i]) /
           float(len(y_pred))))
Example #4
def test_no_weight_support_with_no_weight():
    logi = LogisticRegression(solver='liblinear', multi_class='ovr')
    rf = RandomForestClassifier(n_estimators=10)
    gnb = GaussianNB()
    knn = KNeighborsClassifier()
    eclf = EnsembleVoteClassifier(clfs=[logi, rf, gnb, knn], voting='hard')
    eclf.fit(X, y)
Example #5
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
    def make_model(self):
        #---------------------------------------------------------------------------------------------
        #                       TREE BASED ALGORITHMS
        #---------------------------------------------------------------------------------------------

        #--Choosing the random_state parameter
        #------Basically, a sub-optimal greedy algorithm is repeated a number of times using----------
        #------random selections of features and samples (a similar technique used in random----------
        #------forests). The 'random_state' parameter allows controlling these random choices---------

        #--n_estimators = no of decision trees to be created in forest

        model_rf = RandomForestClassifier(n_estimators=145,
                                          random_state=10,
                                          n_jobs=-1)
        model_rf.fit(train_feats2, target)

        model_gb = GradientBoostingClassifier(n_estimators=145,
                                              random_state=11)
        model_gb.fit(train_feats2, target)

        model_ab = AdaBoostClassifier(n_estimators=145,
                                      random_state=12)
        model_ab.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               LOGISTIC REGRESSION
        #--------------------------------------------------------------------------------------------

        model_lr = LogisticRegression(random_state=1)
        model_lr.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               NAIVE BAYES
        #--------------------------------------------------------------------------------------------

        model_nb = MultinomialNB()
        model_nb.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               VOTING ENSEMBLE OF ALL MODELS
        #--------------------------------------------------------------------------------------------

        clf = [model_rf, model_lr, model_gb, model_ab, model_nb]
        eclf = EnsembleVoteClassifier(
            clfs=clf, weights=[1, 2, 1, 1, 1],
            refit=False)  # weights could also be chosen by stacking or a grid search (see the tuning sketch after this example)
        eclf.fit(train_feats2, target)
        print("model created")
        preds = eclf.predict(test_feats2)
        sub3 = pd.DataFrame({'User_ID': test_df.User_ID, 'Is_Response': preds})
        sub3['Is_Response'] = sub3['Is_Response'].map(
            lambda x: functions.to_labels(self, x))
        sub3 = sub3[['User_ID', 'Is_Response']]
        sub3.to_csv('D:\\New folder\\f2c2f440-8-dataset_he\\SUB_TEST.csv',
                    index=False)
        print("prediction saved")
        return eclf
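
# A minimal sketch (not part of the original make_model code) of choosing the
# ensemble weights by grid search instead of fixing them by hand; iris is used
# here only as a stand-in for train_feats2/target, and the two base models are
# placeholders.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import EnsembleVoteClassifier

X_demo, y_demo = load_iris(return_X_y=True)
eclf_demo = EnsembleVoteClassifier(
    clfs=[LogisticRegression(max_iter=1000), GaussianNB()],
    voting='soft')
grid_demo = GridSearchCV(eclf_demo,
                         param_grid={'weights': [[1, 1], [2, 1], [1, 2]]},
                         cv=3)
grid_demo.fit(X_demo, y_demo)
print(grid_demo.best_params_)  # the best-scoring weight combination, e.g. {'weights': [2, 1]}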
Example #7
def votingEnsembleTest2ndLayer_Test(top_ensembles_dict, test_country_data):
    hit_count = 0
    for BC in top_ensembles_dict.keys():
        classifiers = [sub_list[1] for sub_list in top_ensembles_dict[BC]]
        _weights = np.asarray([1] * len(classifiers))
        vclf_layer2 = EnsembleVoteClassifier(clfs=classifiers,
                                             weights=_weights,
                                             refit=False)
        Y = test_country_data[BC]["Y"]
        X = test_country_data[BC]["X"]
        vclf_layer2.fit(X, Y)
        y_estimate = vclf_layer2.predict(X)
        print(
            "Mentality Cycle {} 2nd Layer Voting Classifier Ensemble has accuracy: {}"
            .format(BC, np.mean(Y == y_estimate)))
        hit_count = hit_count + np.sum(
            Y == y_estimate
        )  ##calc overall performance of top 3 classifiers for each region

    total_obvs = test_country_data[1]["Y"].shape[0] + test_country_data[2][
        "Y"].shape[0] + test_country_data[3]["Y"].shape[0]
    overall_hit_rate = hit_count / total_obvs
    print("Aggregated accuracy of 2nd Layer Voting Classifiers is: {}".format(
        overall_hit_rate))
def test_string_labels_refit_false():
    np.random.seed(123)
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()

    y_str = y.copy()
    y_str = y_str.astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'

    clf1.fit(X, y_str)
    clf2.fit(X, y_str)
    clf3.fit(X, y_str)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='soft',
                                  refit=False)

    eclf.fit(X, y_str)
    assert round(eclf.score(X, y_str), 2) == 0.97
def tri_train(domain,X_train,y_train,X_test,y_test,X_un,theta=0.5,dis=False):
    models = list()
    accs = list()
    for i in range(3):   
        X_split,y_split = bootstrap_sample(X_train,y_train)
        acc,clf_func = get_acc_clf(domain,X_split,y_split,X_test,y_test)
        models.append(clf_func)
        accs.append(acc)

    for (j,k) in itertools.combinations(models,2):
        # i_features = list()
        unlabelled_features = np.array(X_un)
        total = len(X_train)+len(X_un)
        t = 0
        count = 0
        X_i = X_train
        y_i = y_train
        # find current classifier
        clf_i = [x for x in models if x!=j and x!=k][0]
        index_i = models.index(clf_i)
        print "***classifier %d***"%index_i
        while count < total and len(unlabelled_features)!=0:
            t += 1            
            X_tgt,y_tgt = get_features(unlabelled_features,j,k,clf_i,models,theta=theta,dis=dis)
            if len(X_tgt)==0 and t>1:
                print "no new features added"
                break
            
            X_i = concatenate(X_i,X_tgt)
            y_i = concatenate(y_i,y_tgt)
            count = len(X_i)
            print "%d %d %d"%(t,count,total)
            # clf_i.fit(X_i,y_i)
            # update classifier
            acc,clf_i = get_acc_clf(domain,X_i,y_i,X_test,y_test)
            if accs[index_i]<acc:
                accs[index_i] = acc
                # best_clf = clf_i
                print "*NEW BEST! best acc:", acc
                models[index_i] = clf_i
            else:
                print "no improvement..skip.."
                break
            if count == total:
                print "reach end.."
                break
            # update the unlabelled features for speed-up
            print(np.array(X_tgt).shape)
            X_tgt = [list(x) for x in X_tgt]
            unlabelled_features = [x for x in unlabelled_features if list(x) not in X_tgt]
            print(np.array(unlabelled_features).shape)
    # majority vote over the three classifiers
    eclf = EnsembleVoteClassifier(clfs=models, weights=[1, 1, 1], refit=False)
    eclf.fit(X_test, y_test)  # refit=False: this only fits the label encoder, not the base models
    # tmp_name = domain.upper()[0] if "large" not in domain else "large/"+domain.upper()[6]
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test,pred) if "large" not in domain else f1_score(y_test,pred,average='macro')

    print "acc:%s theta:%s"%(acc,theta),"seprate accs:",accs
    return acc,eclf
def majority_vote(target):
    X_test = load_obj("%s/X_test"%target)
    y_test = load_obj("%s/y_test"%target)

    domains = []
    if "mlp" in target:
        domains = ["mlp/books","mlp/dvd","mlp/electronics","mlp/kitchen"]
    else:
        if "large" not in target:
            domains = ["books","dvd","electronics","kitchen"]
            if target not in domains:
                return
        else:
            domains =["large/baby","large/cell_phone","large/imdb","large/yelp2014"]

    models = []
    for source in domains:
        if target == source:
            continue
        else:
            print(source)
            clf_func = load_obj("%s/self_clf"%source)
            models.append(clf_func)


    eclf = EnsembleVoteClassifier(clfs=models, refit=False)  # weights default to uniform
    eclf.fit(X_test, y_test)  # refit=False: this only fits the label encoder, not the base models
    if "large" in target:
        tmp_name = "large/" + target.upper()[6]
    elif "mlp" in target:
        tmp_name = "mlp/" + target.upper()[4]
    else:
        tmp_name = target.upper()[0]
    save_obj(eclf, '%s_eclf'%(tmp_name))
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test,pred) if "large" not in target else f1_score(y_test,pred,average='macro')
    print('self-train', acc)
    pass
Example #11
def train_knn_model(assts, n_macroepochs=100, n_epochs=10):
    TUNE = False
    #we start by fitting pca across the whole population (random sample)
    sgen = xy_generator(assts, batch_size=5000)
    pca = PCA(n_components=48)
    for _,X,y,_,_,_,_ in sgen:
        print("fitting PCA...")
        X = numpy.array(X, dtype=numpy.int8)
        y = numpy.array(y).ravel()
        pca.fit_transform(X)
        # if TUNE:
        #     tuned_parameters = [{'n_neighbors': [1, 20, 50, 100],
        #                          'weights': ['distance', 'uniform'],
        #                          'algorithm': ['ball_tree', 'kd_tree', 'brute']
        #                          }]
        #     scores = ['f1_macro', 'f1_micro', 'accuracy']
        #     # scores = ['accuracy']
        #     performances = []
        #     print("Tuning")
        #     for score in scores:
        #         print("# Tuning hyper-parameters for %s" % score)
        #         clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=5, scoring=score, verbose=0, n_jobs=7)
        #         clf.fit(X, y)
        #         print("Best parameters set found on development set:")
        #         print(clf.best_estimator_)
        #         print("Grid scores on development set:")
        #         for params, mean_score, scores in clf.grid_scores_:
        #             print("%0.3f (+/-%0.03f) for %r"
        #                   % (mean_score, scores.std() / 2, params))
        #
        break  # half-loop: fit the PCA on just one sample batch from sgen
    del sgen
    print("fitted")

    gc.collect()

    xygen = xy_generator(assts, batch_size=5000)  # make generator object
    clfs = []
    i = 0
    for S,X, y, yc, yt, ylv, yv in xygen:
        X = numpy.array(X, dtype=numpy.int8)
        y = numpy.array(y)
        X = pca.transform(X)
        voter = SVC()
        voter.fit(X,y)
        clfs.append(voter)
        i += 1
    model = EnsembleVoteClassifier(clfs=clfs, refit=False)

    X_for_classes = []
    y_for_classes = []
    for classlabel in all_page_ids:
        X_for_classes.append(numpy.zeros(256))
        y_for_classes.append(classlabel)

    model.fit(X_for_classes,y_for_classes)
    return model, pca, None, None #, sscaler, levscaler, volscaler
Example #12
def test_no_weight_support():
    random.seed(87)
    w = np.array([random.random() for _ in range(len(y))])
    logi = LogisticRegression(solver='liblinear', multi_class='ovr')
    rf = RandomForestClassifier(n_estimators=10)
    gnb = GaussianNB()
    knn = KNeighborsClassifier()
    eclf = EnsembleVoteClassifier(clfs=[logi, rf, gnb, knn], voting='hard')
    # KNeighborsClassifier.fit() does not accept sample_weight, so fitting is expected to raise
    with pytest.raises(TypeError):
        eclf.fit(X, y, sample_weight=w)
def test_1model_probas():
    clf = LogisticRegression(multi_class='multinomial',
                             solver='newton-cg', random_state=123)
    ens_clf_1 = EnsembleVoteClassifier(clfs=[clf], voting='soft', weights=None)
    ens_clf_2 = EnsembleVoteClassifier(clfs=[clf], voting='soft', weights=[1.])

    pred_e1 = ens_clf_1.fit(X, y).predict_proba(X)
    pred_e2 = ens_clf_2.fit(X, y).predict_proba(X)
    pred_e3 = clf.fit(X, y).predict_proba(X)

    np.testing.assert_almost_equal(pred_e1, pred_e2, decimal=8)
    np.testing.assert_almost_equal(pred_e1, pred_e3, decimal=8)
def majority_vote_mlp(target):
    X_test = load_obj("%s/X_test"%target)
    y_test = load_obj("%s/y_test"%target)

    # domains = ["mlp/books","mlp/dvd","mlp/electronics","mlp/kitchen"]
    data_name = ["books", "dvd", "electronics", "kitchen"]
    X_joint = load_obj("%s/X_joint"%target)
    y_joint = load_obj("%s/y_joint"%target)
    temp_un = load_obj("%s/X_un"%target)
    meta_sources = []
    for i in range(len(data_name)):
        if 'mlp/'+data_name[i] != target:
            meta_sources.append(data_name[i])
    # print meta_sources
    models = []
    for j in range(len(meta_sources)):
        temp_X = X_joint[j]
        temp_y = y_joint[j]
        thetas = [0.5,0.6,0.7,0.8,0.9]
        best_acc = 0.0
        best_clf =""
        best_theta = 0.0
        resFile = open("../work/params/%s_theta_self-%s.csv"%(target,meta_sources[j].upper()[0]),"w")
        resFile.write("theta, acc\n")
        for theta in thetas:
            print "##############################"
            print "start with theta=%s"%theta
            print "##############################"
            acc,clf_func = self_train(target,temp_X,temp_y,X_test,y_test,temp_un,theta=theta)
            
            if best_acc<acc:
                best_acc = acc
                best_clf = clf_func
                best_theta = theta

            resFile.write("%f, %f\n"%(theta,acc))
            resFile.flush()
        resFile.close()
        print "##############################"
        print "best_theta:",best_theta,"best_acc:",best_acc
        models.append(best_clf)

    eclf = EnsembleVoteClassifier(clfs=models, refit=False)  # weights default to uniform
    eclf.fit(X_test, y_test)  # refit=False: this only fits the label encoder, not the base models
    # tmp_name = target.upper()[0] if "large" not in target else "large/"+target.upper()[6]
    # tmp_name = 'mlp/'+target.upper()[4]
    save_obj(eclf, "%s/self_clf"%target)
    pred = eclf.predict(X_test)
    # print pred
    acc = accuracy_score(y_test,pred)
    print('self-train', acc)
    pass
Example #15
def test_1model_labels():
    clf = LogisticRegression(multi_class='multinomial',
                             solver='newton-cg',
                             random_state=123)
    ens_clf_1 = EnsembleVoteClassifier(clfs=[clf], voting='soft', weights=None)
    ens_clf_2 = EnsembleVoteClassifier(clfs=[clf], voting='soft', weights=[1.])

    pred_e1 = ens_clf_1.fit(X, y).predict(X)
    pred_e2 = ens_clf_2.fit(X, y).predict(X)
    pred_e3 = clf.fit(X, y).predict(X)

    np.testing.assert_equal(pred_e1, pred_e2)
    np.testing.assert_equal(pred_e1, pred_e3)
Example #16
def votingEnsembleTest(all_country_data_with_algos, test_country_data_US):
    print(
        " \n For each training-set country and each sub-dataset (split by Mentality Cycle), the top n trained algorithms form a Voting Classifier. This Voting Classifier is then tested on its corresponding US sub-dataset. An aggregate score for each training-set country is calculated by aggregating its 3 Voting Classifiers' performances"
    )
    _all_country_data_with_trained_algos = copy.deepcopy(
        all_country_data_with_algos)

    for country in _all_country_data_with_trained_algos.keys():
        country_level_total_hits = 0
        for BC in _all_country_data_with_trained_algos[country].keys():
            classifiers = copy.deepcopy(
                _all_country_data_with_trained_algos[country][BC].get(
                    'trained algos'))

            clf_weights = np.asarray([1, 1, 1], dtype=int)

            Y = test_country_data_US[BC].get("Y")
            X = test_country_data_US[BC].get("X")

            vclf = EnsembleVoteClassifier(clfs=classifiers,
                                          weights=clf_weights,
                                          refit=False,
                                          voting='hard')  # voting='soft'

            vclf.fit(X, Y)
            y_estimate = vclf.predict(np.array(X))
            print(
                "Voting Classifier trained on {} Mentality Cycle {} has accuracy: {}"
                .format(country, BC, np.mean(Y == pd.Series(y_estimate))))

            ##saving Country-BC split accuracy and instance of Voting Classifier score to all_country... dictionary
            _all_country_data_with_trained_algos[country][BC][
                'accuracy'] = np.mean(Y == y_estimate)
            _all_country_data_with_trained_algos[country][BC][
                'votingclassifier'] = vclf
            country_level_total_hits = country_level_total_hits + np.sum(
                Y == y_estimate)

        record_count = test_country_data_US[1]["Y"].shape[
            0] + test_country_data_US[2]["Y"].shape[0] + test_country_data_US[
                3]["Y"].shape[0]
        _all_country_data_with_trained_algos[country]['accuracy'] = (
            country_level_total_hits / record_count)
        print("Aggregated Classifier trained on {} has accuracy: {} \n".format(
            country,
            _all_country_data_with_trained_algos[country]['accuracy']))

    return _all_country_data_with_trained_algos
Example #17
def test_fit_base_estimators_false():
    np.random.seed(123)
    clf1 = LogisticRegression(solver='liblinear', multi_class='ovr')
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()

    clf1.fit(X, y)
    clf2.fit(X, y)
    clf3.fit(X, y)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                  voting='hard',
                                  fit_base_estimators=False)

    eclf.fit(X, y)
    assert round(eclf.score(X, y), 2) == 0.97
Example #18
class VotingModel:
    def __init__(self, X, y, x_test, model_lists):
        self.model = EnsembleVoteClassifier(clfs=model_lists,
                                            weights=[1, 1, 1],
                                            refit=False,
                                            voting='soft')
        self.X = X
        self.y = y
        self.X_test = x_test

    def train(self):
        self.model.fit(self.X, self.y)

    def predict(self):
        return self.model.predict(self.X_test)

    def predict_proba(self):
        return self.model.predict_proba(self.X_test)
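
# A minimal usage sketch (assumed, not from the original source): because the
# wrapper passes refit=False, the three base models must already be fitted
# before VotingModel.train() is called; iris here is only placeholder data.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

X_iris, y_iris = load_iris(return_X_y=True)
base_models = [LogisticRegression(max_iter=1000),
               GaussianNB(),
               DecisionTreeClassifier(random_state=0)]
for m in base_models:
    m.fit(X_iris, y_iris)          # pre-fit, since refit=False skips refitting

vm = VotingModel(X_iris, y_iris, X_iris[:5], base_models)
vm.train()                         # only fits the vote wrapper's label encoder
print(vm.predict())                # class labels for the 5 held-out rows
print(vm.predict_proba())          # soft-vote class probabilities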
Example #19
def meta_ensemble():
    #ensemble learning (mlxtend)
    eclf1 = EnsembleVoteClassifier(clfs=[model1, model2, model3],
                                   weights=weight_base,
                                   voting='soft',
                                   refit=True)
    eclf1.fit(train_x_dtm, train_y)
    print('ensemble1 fitted.')

    eclf2 = EnsembleVoteClassifier(clfs=[model1, model2, model3],
                                   weights=weight_base,
                                   voting='soft',
                                   refit=True)
    eclf2.fit(X_resampled, y_resampled)
    print('ensemble2 fitted.')

    eclf3 = EnsembleVoteClassifier(clfs=[eclf1, eclf2],
                                   weights=weight_meta,
                                   voting='soft',
                                   refit=False)
    apply_model(eclf3)
def meta_ensemble_model():
    # ensemble learning (mlxtend)
    ensemble1 = EnsembleVoteClassifier(clfs=[mnb, lr, rf],
                                       weights=weight_base,
                                       voting='soft',
                                       refit=True)
    ensemble2 = EnsembleVoteClassifier(clfs=[mnb, lr, rf],
                                       weights=weight_base,
                                       voting='soft',
                                       refit=True)
    meta_ensemble = EnsembleVoteClassifier(clfs=[ensemble1, ensemble2],
                                           weights=weight_meta,
                                           voting='soft',
                                           refit=False)

    ensemble1.fit(train_x_dtm, train_y)
    print('ensemble1 fitted.')
    ensemble2.fit(x_resampled, y_resampled)
    print('ensemble2 fitted.')

    return meta_ensemble
Example #21
class ModelTrustRegression:
    def __init__(self, model, n_neighbors=20, weights='uniform', n_folds=5):
        self.template_model = model
        self.n_neighbors = n_neighbors
        self.weights=weights
        self.n_folds = n_folds
        self.fold_regressions=[]
        self.fold_models=[]
        self.bagger = None

    def fit(self, X, values):
        # per fold: train a model, then train a kNN regressor to predict whether its hard predictions are correct
        for train_index, validation_index in KFold(n_splits=self.n_folds).split(X):
            train_set = X[train_index]
            train_values = values[train_index]

            validation_set = X[validation_index]
            validation_values = values[validation_index]

            fold_model = clone(self.template_model)
            fold_model.fit(train_set, train_values) #retrains a brand new model for the fold

            fold_regressor = KNeighborsRegressor(weights=self.weights, n_neighbors=self.n_neighbors)
            fold_regressor.fit(validation_set, fold_model.predict(validation_set) == validation_values)
            self.fold_regressions.append(fold_regressor)
            self.fold_models.append(fold_model)

        self.bagger = EnsembleVoteClassifier(self.fold_models, voting="soft", refit=False)
        self.bagger.fit(X, values) #trivial fit

    def predict(self, X):
        return np.mean([fm.predict(X) for fm in self.fold_regressions], axis=0)

    def predict_proba(self, X):
        return self.bagger.predict_proba(X)

    def get_bagger(self):
        return self.bagger
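
# A minimal usage sketch (assumed, not from the original source; it presumes the
# imports ModelTrustRegression itself relies on -- np, clone, KFold,
# KNeighborsRegressor, EnsembleVoteClassifier -- are already in scope, and an
# mlxtend version that accepts refit=): fit the trust regressors, then query
# per-sample trust scores and soft-vote class probabilities.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=300, n_features=10, random_state=0)
trust = ModelTrustRegression(LogisticRegression(max_iter=1000), n_neighbors=15)
trust.fit(X_demo, y_demo)
trust_scores = trust.predict(X_demo)        # mean "was the fold model right?" estimate per sample
class_probas = trust.predict_proba(X_demo)  # soft-vote probabilities from the fold models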
Example #22
def test6():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([1, 1, 1, 2, 2, 2])
    eclf1 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                   voting='hard',
                                   verbose=1)
    eclf1 = eclf1.fit(X, y)
    print(eclf1.predict(X))
    eclf2 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft')
    eclf2 = eclf2.fit(X, y)
    print(eclf2.predict(X))
    eclf3 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                   voting='soft',
                                   weights=[2, 1, 1])
    eclf3 = eclf3.fit(X, y)
    print(eclf3.predict(X))
Example #23
def meta_ensemble():
    #ensemble learning (mlxtend)
    eclf1 = EnsembleVoteClassifier(clfs=[model1, model2, model3],
                                   weights=weight_base,
                                   voting='soft',
                                   refit=True)
    eclf1.fit(train_x_dtm, train_y)
    print('ensemble1 fitted.')

    eclf2 = EnsembleVoteClassifier(clfs=[model1, model2, model3],
                                   weights=weight_base,
                                   voting='soft',
                                   refit=True)
    eclf2.fit(scipy.sparse.load_npz('train_x_dtm_us_smote.npz'), [
        int(str(line).replace('\n', ''))
        for line in open('train_y_us_smote', 'r')
    ])
    print('ensemble2 fitted.')

    eclf3 = EnsembleVoteClassifier(clfs=[eclf1, eclf2],
                                   weights=weight_meta,
                                   voting='soft',
                                   refit=False)
    apply_model(eclf3)
Example #24
#

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector

sfs1 = SequentialFeatureSelector(clf1,
                                 k_features=4,
                                 floating=False,
                                 scoring='accuracy',
                                 print_progress=False,
                                 cv=0)
clf1_pipe = Pipeline([('sfs', sfs1),
                      ('logreg', clf1)])

eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3], voting='soft')

params = {'pipeline__sfs__k_features': [1, 2, 3],
          #'pipeline__logreg__C': [1.0, 100.0],
          'randomforestclassifier__n_estimators': [20, 200]}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid.fit(iris.data, iris.target)

cv_results = grid.cv_results_
for params, mean_score, std_score in zip(cv_results['params'],
                                         cv_results['mean_test_score'],
                                         cv_results['std_test_score']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, std_score / 2, params))

print(grid.best_params_)

eclf = eclf.set_params(**grid.best_params_)
print(eclf.fit(X, y).predict(X[[1, 51, 149]]))
        logging.info(f'Training {classifier_name}...')

        clf.fit(X_train, y_train)

        score = balanced_accuracy_score(y_test, clf.predict(X_test))

        logging.info(f'{classifier_name} BAC = {score:.4f}')

        probabilities = clf.predict_proba(X_test)
        np.save(PROBABILITIES_PATH / f'{classifier_name}.cv.{args.fold}.npy',
                probabilities)

        results.append([classifier_name, score])

    ensemble = EnsembleVoteClassifier(list(classifiers.values()),
                                      voting='soft',
                                      fit_base_estimators=False)
    ensemble.fit(X_train, y_train)

    score = balanced_accuracy_score(y_test, ensemble.predict(X_test))

    logging.info(f'Ensemble BAC = {score:.4f}')

    results.append(['Ensemble', score])

    with open(MODELS_PATH / f'ensemble.cv.{args.fold}.pickle', 'wb') as f:
        pickle.dump(ensemble, f)

    df = pd.DataFrame(results, columns=['Classifier', 'BAC'])
    df.to_csv(RESULTS_PATH / f'{args.fold}.csv', index=False)
    pd.concat([x_valid_0, x_valid_1, x_valid_2, x_valid_3, x_valid_4], axis=0))
y_valid = pd.DataFrame()
y_valid['target'] = x_valid['target']
x_valid.drop('target', axis=1, inplace=True)
x_train_0 = pd.DataFrame(X[X['target'] == 0][:90])
x_train_1 = pd.DataFrame((X[X['target'] == 1][:900]))
x_train_2 = pd.DataFrame(X[X['target'] == 2][:1300])
x_train_3 = pd.DataFrame(X[X['target'] == 3][:420])
x_train_4 = pd.DataFrame(X[X['target'] == 4][:90])
x_train = pd.DataFrame(
    pd.concat([x_train_0, x_train_1, x_train_2, x_train_3, x_train_4], axis=0))
y_train = pd.DataFrame()
y_train['target'] = x_train['target']
x_train.drop('target', axis=1, inplace=True)

eclf.fit(x_train[best_columns], y_train['target'])
preds = eclf.predict(x_valid[best_columns])
print('Confusion matrix:\n')
print(confusion_matrix(y_valid['target'].values, preds))
matrix_ = confusion_matrix(y_valid['target'].values, preds)
correct_answers = matrix_[0][0] + matrix_[1][1] + matrix_[2][2] + matrix_[3][
    3] + matrix_[4][4]
print('Correct answers count: ', correct_answers)

# --- answer module ---
eclf.fit(X[best_columns], Y['target'])
score_dataset = pd.read_csv('original_data/x_test.csv',
                            delimiter=';',
                            names=names)
y_pred = eclf.predict(score_dataset[best_columns])
pd.Series(y_pred).to_csv('data/answer.csv', index=False)
x_train, x_test, y_train, y_test = train_test_split(dataset_api, dataset_label, test_size=0.1)

vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)

train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
clf4 = MultinomialNB(alpha=.01)
clf5 = xgb.XGBClassifier()
eclif = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4, clf5],
                              weights=[2, 4, 2, 4, 7], voting='soft')
eclif.fit(train_vectors, y_train)

pred = eclif.predict(test_vectors)

f_1 = sklearn.metrics.f1_score(y_test, pred, average='weighted')
print "f_1 is " + str(f_1)

with open(f_1_f, "w") as f:
    f.write("f_1 is " + str(f_1))

c = make_pipeline(vectorizer, eclif)

nb_success = 0
nb_fail = 0

result_list = []
clf_DT = DecisionTreeClassifier()
#clf_MNB= MNB()
eclf = EnsembleVoteClassifier(clfs=[clf_RF, clf_ET, clf_svc, clf_DT],
                              weights=[1, 1, 1, 1])

labels = [
    'Random Forest', 'Extra Trees', 'Support Vector', 'Decision Tree',
    'Ensemble Vote'
]
for clf, label in zip([clf_RF, clf_ET, clf_svc, clf_DT, eclf], labels):

    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))

eclf.fit(X_train, y_train)
confidence = eclf.score(X_test, y_test)
print(confidence)

example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])
example_measures = example_measures.reshape(len(example_measures), -1)
prediction = eclf.predict(example_measures)
print(prediction)

col_dict = dict(list(enumerate(df.columns)))
col_dict

X = np.array(df.drop(['class'], axis=1), dtype=np.float64)
y = np.array(df['class'], dtype=np.int64)
plot_decision_regions(
    X=X,
#cl4=XGBClassifier()
clf4 = GradientBoostingClassifier()

print('10-fold cross validation:\n')

#np.random.seed(123)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4],
                              weights=[1, 1, 1, 1],
                              voting='soft')
#from sklearn.model_selection import ShuffleSplit
#for clf, label in zip([clf1, clf2, clf3], ['Logistic Regression', 'Random Forest', 'SVM']
#for clf, label in zip([clf1, clf3, cl4,eclf], ['Logistic Regression','RandomForest','SVM','Xgboost','Voting Ensemble']):

#    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
#    print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean()*100, scores.std(), label))
eclf.fit(X_train, Y_train)
y_pred = eclf.predict(X_test)
print(accuracy_score(Y_test, y_pred) * 100)
X = np.concatenate((X_train, X_test), 0)
Y = np.concatenate((Y_train, Y_test), 0)

#    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
#    scores=cross_val_score(clf, X, y, cv=cv)
#    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
#    accuracies=cross_val_score(estimator=clf,X=X,y=Y,cv=10)
#    print(accuracies.mean()*100,accuracies.std()*100)

#    print("Accuracy: %0.4f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Mamun_confusion_matrix = confusion_matrix(Y_test,
                                          y_pred,
                                          labels=[1, 2, 3, 4, 5, 6, 12, 13])
Example #30
                            n_estimators=100)
# current best
clf6 = ExtraTreesClassifier(max_features=0.45,
                            min_samples_leaf=1,
                            min_samples_split=5,
                            n_estimators=100)

eclf = EnsembleVoteClassifier(clfs=[clf3, clf4, clf5, clf6],
                              weights=[1, 1, 1, 1],
                              voting='soft')

labels = ['Trees_3', 'Trees_4', 'Trees_5', 'Trees_6', 'Ensemble']

for clf, label in zip([clf3, clf4, clf5, clf6, eclf], labels):

    scores = model_selection.cross_val_score(clf,
                                             X[best_columns],
                                             Y['target'],
                                             cv=4,
                                             scoring='neg_log_loss')
    print("Log Loss: %0.3f (+/- %0.3f) [%s]" %
          (scores.mean(), scores.std(), label))

# --- answer module ---
eclf.fit(X[best_columns], Y['target'])
score_dataset = pd.read_csv('original_data/x_test.csv',
                            delimiter=';',
                            names=names)
y_pred = eclf.predict(score_dataset[best_columns])
pd.Series(y_pred).to_csv('data/answer.csv', index=False)
Example #31
    'nthread': 4,
    'silent': 1,
    'subsample': 0.6,
    'reg_lambda': 0.89,
    'gamma': 0.1,
    'min_child_weight': 49.8,
    'colsample_bytree': 0.8,
    'n_estimators': 2790,
}
clf_2 = xgb.XGBClassifier(**clf_2_params)

clf_3_params = {
    'learning_rate': 0.0065,
    'max_depth': 5,
    'nthread': 4,
    'silent': 1,
    'subsample': 0.621,
    'reg_lambda': 0.726,
    'gamma': 0.053,
    'min_child_weight': 30.8,
    'colsample_bytree': 0.905,
    'n_estimators': 958,
}
clf_3 = xgb.XGBClassifier(**clf_3_params)

pipeline = EnsembleVoteClassifier(clfs=[clf_0, clf_1, clf_2, clf_3], weights=[1, 1, 1, 1], voting='soft')
pipeline.fit(train, Y)

y_pred = pipeline.predict_proba(test[test.columns])
pd.Series(y_pred[:, 1]).to_csv('answer.csv', index=False)
Example #32
clf5_pipe,clf5_avg_f1 = set_pipe(clf5, avg_feats, 'c45_')
list_of_cv_acc.append(clf5_avg_f1)


# In[50]:


clf6_pipe,clf6_avg_f1 = set_pipe(clf6, mi_feats, 'knn_')
list_of_cv_acc.append(clf6_avg_f1)


# In[51]:


enclf = EnsembleVoteClassifier(clfs=(clf1_pipe, clf2_pipe, clf3_pipe, clf4_pipe,
                                     clf5_pipe, clf6_pipe), refit=False)
enclf.fit(X_train, y_train)
y_pred = enclf.predict(X_test)
con_mat = confusion_matrix(y_test, y_pred)
    
#print("Cross Val acc score:         ", (model_selection.cross_val_score(enclf, X_train, y_train, cv = 5,)).mean())
#print("Cross Val f1  score:         ", (model_selection.cross_val_score(enclf, X_train, y_train, cv = 5, scoring = 'f1')).mean())
print()
print("Overall Acc score:           ", accuracy_score(y_test, y_pred))
print("Recall score (Tru Pos Rate): ", recall_score(y_test, y_pred))
print("Precision score:             ", precision_score(y_test, y_pred))
print("Neg Predictive Val:          ", con_mat[0][0] / (con_mat[0][1] + con_mat[0][0]))
print("Tru Neg Rate(Specifi):       ", con_mat[0][0] / (con_mat[1][0] + con_mat[0][0]))
print("F1 score:                    ", f1_score(y_test, y_pred))
print("Auc score:                   ", roc_auc_score(y_test, y_pred))
print(con_mat)
print()
#print(X_train_counts.toarray()[0])

tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

feature_names = count_vect.get_feature_names()
ch2 = SelectKBest(chi2, k=1500)
X_train = ch2.fit_transform(X_train_tfidf, newsgroups_train.target)

selected_feature_names = [
    feature_names[i] for i in ch2.get_support(indices=True)
]

#clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.3,max_depth=3, random_state=0)
clf1 = MultinomialNB(alpha=0.1)
#clf2 = svm.LinearSVC(max_iter = 2000,probability=True,random_state=0)
clf2 = SVC(kernel='linear', probability=True)
#clf3 = SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")
clf = EnsembleVoteClassifier(clfs=[clf1, clf2], weights=[2, 1], voting='soft')

clf.fit(X_train, newsgroups_train.target)

#pred_t = clf.predict(X_train)
#print(metrics.precision_score(newsgroups_train.target, pred_t, average='macro'))

vectors_test2 = count_vect.transform(newsgroups_test.data)
vectors_test = tfidf_transformer.transform(vectors_test2)
X_test = ch2.transform(vectors_test)
pred = clf.predict(X_test)
print(metrics.precision_score(newsgroups_test.target, pred, average='macro'))