def make_model(self):
        #---------------------------------------------------------------------------------------------
        #                       TREE-BASED ALGORITHMS
        #---------------------------------------------------------------------------------------------

        #--Choosing the random_state parameter
        #------Basically, a sub-optimal greedy algorithm is repeated a number of times using----------
        #------random selections of features and samples (a technique similar to the one used---------
        #------in random forests). The 'random_state' parameter makes these random choices------------
        #------reproducible.--------------------------------------------------------------------------

        #--n_estimators = number of decision trees to build in the forest

        model_rf = RandomForestClassifier(n_estimators=145,
                                          random_state=10,
                                          n_jobs=-1)
        model_rf.fit(train_feats2, target)

        # Note: GradientBoostingClassifier and AdaBoostClassifier fit their trees
        # sequentially, so unlike RandomForestClassifier they accept no n_jobs parameter.
        model_gb = GradientBoostingClassifier(n_estimators=145,
                                              random_state=11)
        model_gb.fit(train_feats2, target)

        model_ab = AdaBoostClassifier(n_estimators=145,
                                      random_state=12)
        model_ab.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               LOGISTIC REGRESSION
        #--------------------------------------------------------------------------------------------

        model_lr = LogisticRegression(random_state=1)
        model_lr.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               NAIVE BAYES
        #--------------------------------------------------------------------------------------------

        model_nb = MultinomialNB()
        model_nb.fit(train_feats2, target)

        #--------------------------------------------------------------------------------------------
        #               VOTING ENSEMBLE OF ALL MODELS
        #--------------------------------------------------------------------------------------------

        clf = [model_rf, model_lr, model_gb, model_ab, model_nb]
        eclf = EnsembleVoteClassifier(
            clfs=clf, weights=[1, 2, 1, 1, 1],
            refit=False)  # weights could also be learned via stacking
        eclf.fit(train_feats2, target)
        print("model created")
        preds = eclf.predict(test_feats2)
        sub3 = pd.DataFrame({'User_ID': test_df.User_ID, 'Is_Response': preds})
        sub3['Is_Response'] = sub3['Is_Response'].map(
            lambda x: functions.to_labels(self, x))
        sub3 = sub3[['User_ID', 'Is_Response']]
        sub3.to_csv('D:\\New folder\\f2c2f440-8-dataset_he\\SUB_TEST.csv',
                    index=False)
        print("prediction saved")
        return eclf
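The [1, 2, 1, 1, 1] weights above are hand-picked. As the inline comment hints, they could instead be derived from model quality; a minimal sketch (assumed to run inside make_model, where train_feats2 and target are in scope) that weights each member by its mean cross-validated accuracy:

# Sketch only: weight each base model by mean 3-fold CV accuracy.
from sklearn.model_selection import cross_val_score

base_models = [model_rf, model_lr, model_gb, model_ab, model_nb]
cv_scores = [cross_val_score(m, train_feats2, target, cv=3).mean()
             for m in base_models]
weights = [s / sum(cv_scores) for s in cv_scores]  # normalise so weights sum to 1
eclf = EnsembleVoteClassifier(clfs=base_models, weights=weights, refit=False)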
def tri_train(domain,X_train,y_train,X_test,y_test,X_un,theta=0.5,dis=False):
    models = list()
    accs = list()
    for i in range(3):   
        X_split,y_split = bootstrap_sample(X_train,y_train)
        acc,clf_func = get_acc_clf(domain,X_split,y_split,X_test,y_test)
        models.append(clf_func)
        accs.append(acc)

    for (j,k) in itertools.combinations(models,2):
        # i_features = list()
        unlabelled_features = np.array(X_un)
        total = len(X_train)+len(X_un)
        t = 0
        count = 0
        X_i = X_train
        y_i = y_train
        # find current classifier
        clf_i = [x for x in models if x!=j and x!=k][0]
        index_i = models.index(clf_i)
        print "***classifier %d***"%index_i
        while count < total and len(unlabelled_features)!=0:
            t += 1            
            X_tgt,y_tgt = get_features(unlabelled_features,j,k,clf_i,models,theta=theta,dis=dis)
            if len(X_tgt)==0 and t>1:
                print "no new features added"
                break
            
            X_i = concatenate(X_i,X_tgt)
            y_i = concatenate(y_i,y_tgt)
            count = len(X_i)
            print "%d %d %d"%(t,count,total)
            # clf_i.fit(X_i,y_i)
            # update classifier
            acc,clf_i = get_acc_clf(domain,X_i,y_i,X_test,y_test)
            if accs[index_i]<acc:
                accs[index_i] = acc
                # best_clf = clf_i
                print "*NEW BEST! best acc:", acc
                models[index_i] = clf_i
            else:
                print "no improvement..skip.."
                break
            if count == total:
                print "reach end.."
                break
            # update the unlabelled features for speed-up
            print(np.array(X_tgt).shape)
            X_tgt = [list(x) for x in X_tgt]
            unlabelled_features = [x for x in unlabelled_features if list(x) not in X_tgt]
            print(np.array(unlabelled_features).shape)
    # majority vote over the three classifiers
    eclf = EnsembleVoteClassifier(clfs=models, weights=[1, 1, 1], refit=False)
    eclf.fit(X_test, y_test)  # with refit=False this only sets up the label encoder
    # tmp_name = domain.upper()[0] if "large" not in domain else "large/"+domain.upper()[6]
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test,pred) if "large" not in domain else f1_score(y_test,pred,average='macro')

    print "acc:%s theta:%s"%(acc,theta),"seprate accs:",accs
    return acc,eclf
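Several snippets on this page pair refit=False with a fit() call flagged as "not doing work". A standalone toy sketch of what that call actually does under this page's mlxtend API (newer versions spell the flag fit_base_estimators): the pre-trained members are left untouched, and fit() only derives the label encoding from y, which is why it must still be called once before predict():

import numpy as np
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import EnsembleVoteClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

pretrained = LogisticRegression().fit(X, y)            # trained elsewhere
eclf = EnsembleVoteClassifier(clfs=[pretrained], refit=False)
eclf.fit(X, y)                                         # label setup only, no retraining
print(eclf.predict(X))                                 # -> [0 0 1 1]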
Example 3
def votingEnsembleTest2ndLayer_Test(top_ensembles_dict, test_country_data):
    hit_count = 0
    for BC in top_ensembles_dict.keys():
        classifiers = [sub_list[1] for sub_list in top_ensembles_dict[BC]]
        _weights = np.asarray([1] * len(classifiers))
        vclf_layer2 = EnsembleVoteClassifier(clfs=classifiers,
                                             weights=_weights,
                                             refit=False)
        Y = test_country_data[BC]["Y"]
        X = test_country_data[BC]["X"]
        vclf_layer2.fit(X, Y)
        y_estimate = vclf_layer2.predict(X)
        print(
            "Mentality Cycle {} 2nd Layer Voting Classifier Ensemble has accuracy: {}"
            .format(BC, np.mean(Y == y_estimate)))
        hit_count = hit_count + np.sum(
            Y == y_estimate
        )  ##calc overall performance of top 3 classifiers for each region

    total_obvs = test_country_data[1]["Y"].shape[0] + test_country_data[2][
        "Y"].shape[0] + test_country_data[3]["Y"].shape[0]
    overall_hit_rate = hit_count / total_obvs
    print("Aggregated accuracy of 2nd Layer Voting Classifiers is: {}".format(
        overall_hit_rate))
Example 4
def emsembal_train(feature, label):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from mlxtend.classifier import EnsembleVoteClassifier, StackingClassifier
    label = transport_labels(label)
    X_train, X_test, Y_train, Y_test = train_test_split(feature,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=1000)
    clf1 = SVC(C=10, kernel='sigmoid', probability=True)
    clf2 = RandomForestClassifier(random_state=0)
    clf3 = LogisticRegression(random_state=0)
    clf4 = xgb.XGBClassifier(max_depth=8,
                             learning_rate=0.07,
                             n_estimators=35,
                             silent=True,
                             objective="binary:logistic",
                             booster='gbtree',
                             gamma=0,
                             min_child_weight=6,
                             subsample=0.8,
                             colsample_bytree=0.7,
                             reg_alpha=0.1,
                             seed=1000)
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf3, clf4], voting='soft')

    eclf.fit(X_train, Y_train)
    y_pred = eclf.predict(X_test)
    print('eclf accs=%f' %
          (sum(1 for i in range(len(y_pred)) if y_pred[i] == Y_test[i]) /
           float(len(y_pred))))
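Soft voting averages the members' predicted class probabilities, which is why the SVC above is constructed with probability=True; without it the ensemble's predict would fail. A standalone toy sketch:

import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import EnsembleVoteClassifier

X = np.array([[-3.0], [-2.5], [-2.0], [-1.0], [-0.5],
              [0.5], [1.0], [2.0], [2.5], [3.0]])
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
eclf = EnsembleVoteClassifier(
    clfs=[SVC(probability=True), LogisticRegression()],  # both expose predict_proba
    voting='soft')
eclf.fit(X, y)
print(eclf.predict_proba(X))  # averaged class probabilities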
def majority_vote(target):
    X_test = load_obj("%s/X_test"%target)
    y_test = load_obj("%s/y_test"%target)

    domains = []
    if "mlp" in target:
        domains = ["mlp/books","mlp/dvd","mlp/electronics","mlp/kitchen"]
    else:
        if "large" not in target:
            domains = ["books","dvd","electronics","kitchen"]
            if target not in domains:
                return
        else:
            domains =["large/baby","large/cell_phone","large/imdb","large/yelp2014"]

    models = []
    for source in domains:
        if target == source:
            continue
        else:
            print(source)
            clf_func = load_obj("%s/self_clf"%source)
            models.append(clf_func)


    eclf = EnsembleVoteClassifier(clfs=models, refit=False)  # uniform weights by default
    eclf.fit(X_test, y_test)  # with refit=False this only sets up the label encoder
    if "mlp" in target:
        tmp_name = "mlp/" + target.upper()[4]
    elif "large" in target:
        tmp_name = "large/" + target.upper()[6]
    else:
        tmp_name = target.upper()[0]
    save_obj(eclf, '%s_eclf' % tmp_name)
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test, pred) if "large" not in target else f1_score(y_test, pred, average='macro')
    print('self-train', acc)
def majority_vote_mlp(target):
    X_test = load_obj("%s/X_test"%target)
    y_test = load_obj("%s/y_test"%target)

    # domains = ["mlp/books","mlp/dvd","mlp/electronics","mlp/kitchen"]
    data_name = ["books", "dvd", "electronics", "kitchen"]
    X_joint = load_obj("%s/X_joint"%target)
    y_joint = load_obj("%s/y_joint"%target)
    temp_un = load_obj("%s/X_un"%target)
    meta_sources = []
    for i in range(len(data_name)):
        if 'mlp/'+data_name[i] != target:
            meta_sources.append(data_name[i])
    # print meta_sources
    models = []
    for j in range(len(meta_sources)):
        temp_X = X_joint[j]
        temp_y = y_joint[j]
        thetas = [0.5,0.6,0.7,0.8,0.9]
        best_acc = 0.0
        best_clf = ""
        best_theta = 0.0
        resFile = open("../work/params/%s_theta_self-%s.csv"%(target,meta_sources[j].upper()[0]),"w")
        resFile.write("theta, acc\n")
        for theta in thetas:
            print "##############################"
            print "start with theta=%s"%theta
            print "##############################"
            acc,clf_func = self_train(target,temp_X,temp_y,X_test,y_test,temp_un,theta=theta)
            
            if best_acc<acc:
                best_acc = acc
                best_clf = clf_func
                best_theta = theta

            resFile.write("%f, %f\n"%(theta,acc))
            resFile.flush()
        resFile.close()
        print "##############################"
        print "best_theta:",best_theta,"best_acc:",best_acc
        models.append(best_clf)

    eclf = EnsembleVoteClassifier(clfs=models, refit=False)  # uniform weights by default
    eclf.fit(X_test, y_test)  # with refit=False this only sets up the label encoder
    # tmp_name = target.upper()[0] if "large" not in target else "large/"+target.upper()[6]
    # tmp_name = 'mlp/'+target.upper()[4]
    save_obj(eclf, "%s/self_clf"%target)
    pred = eclf.predict(X_test)
    # print pred
    acc = accuracy_score(y_test,pred)
    print('self-train', acc)
Example 7
def test6():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([1, 1, 1, 2, 2, 2])
    eclf1 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                   voting='hard',
                                   verbose=1)
    eclf1 = eclf1.fit(X, y)
    print(eclf1.predict(X))
    eclf2 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft')
    eclf2 = eclf2.fit(X, y)
    print(eclf2.predict(X))
    eclf3 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                                   voting='soft',
                                   weights=[2, 1, 1])
    eclf3 = eclf3.fit(X, y)
    print(eclf3.predict(X))
Example 8
def votingEnsembleTest(all_country_data_with_algos, test_country_data_US):
    print(
        "\nFor each training-set country and each sub-dataset (split by Mentality Cycle), the top n trained algorithms form a Voting Classifier. Each Voting Classifier is then tested on its corresponding US sub-dataset. An aggregate score for each training-set country is calculated by aggregating its 3 Voting Classifiers' performances."
    )
    _all_country_data_with_trained_algos = copy.deepcopy(
        all_country_data_with_algos)

    for country in _all_country_data_with_trained_algos.keys():
        country_level_total_hits = 0
        for BC in _all_country_data_with_trained_algos[country].keys():
            classifiers = copy.deepcopy(
                _all_country_data_with_trained_algos[country][BC].get(
                    'trained algos'))

            clf_weights = np.asarray([1, 1, 1], dtype=int)

            Y = test_country_data_US[BC].get("Y")
            X = test_country_data_US[BC].get("X")

            vclf = EnsembleVoteClassifier(clfs=classifiers,
                                          weights=clf_weights,
                                          refit=False,
                                          voting='hard')  # voting='soft'

            vclf.fit(X, Y)
            y_estimate = vclf.predict(np.array(X))
            print(
                "Voting Classifier trained on {} Mentality Cycle {} has accuracy: {}"
                .format(country, BC, np.mean(Y == pd.Series(y_estimate))))

            ##saving Country-BC split accuracy and instance of Voting Classifier score to all_country... dictionary
            _all_country_data_with_trained_algos[country][BC][
                'accuracy'] = np.mean(Y == y_estimate)
            _all_country_data_with_trained_algos[country][BC][
                'votingclassifier'] = vclf
            country_level_total_hits = country_level_total_hits + np.sum(
                Y == y_estimate)

        record_count = test_country_data_US[1]["Y"].shape[
            0] + test_country_data_US[2]["Y"].shape[0] + test_country_data_US[
                3]["Y"].shape[0]
        _all_country_data_with_trained_algos[country]['accuracy'] = (
            country_level_total_hits / record_count)
        print("Aggregated Classifier trained on {} has accuracy: {} \n".format(
            country,
            _all_country_data_with_trained_algos[country]['accuracy']))

    return _all_country_data_with_trained_algos
Example 9
class VotingModel:
    def __init__(self, X, y, x_test, model_lists):
        self.model = EnsembleVoteClassifier(clfs=model_lists,
                                            weights=[1, 1, 1],
                                            refit=False,
                                            voting='soft')
        self.X = X
        self.y = y
        self.X_test = x_test

    def train(self):
        self.model.fit(self.X, self.y)

    def predict(self):
        return self.model.predict(self.X_test)

    def predict_proba(self):
        return self.model.predict_proba(self.X_test)
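A hedged usage sketch of the wrapper above (toy data; any three pre-fitted classifiers exposing predict_proba work, since voting='soft' with refit=False expects already-trained members):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

X = np.array([[-2.0], [-1.0], [1.0], [2.0]])
y = np.array([0, 0, 1, 1])
fitted = [m.fit(X, y) for m in (LogisticRegression(),
                                GaussianNB(),
                                DecisionTreeClassifier())]

vm = VotingModel(X, y, x_test=X, model_lists=fitted)
vm.train()
print(vm.predict())
print(vm.predict_proba())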
#print(X_train_counts.toarray()[0])

tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

feature_names = count_vect.get_feature_names()
ch2 = SelectKBest(chi2, k=1500)
X_train = ch2.fit_transform(X_train_tfidf, newsgroups_train.target)

selected_feature_names = [
    feature_names[i] for i in ch2.get_support(indices=True)
]

#clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.3,max_depth=3, random_state=0)
clf1 = MultinomialNB(alpha=0.1)
#clf2 = svm.LinearSVC(max_iter = 2000,probability=True,random_state=0)
clf2 = SVC(kernel='linear', probability=True)
#clf3 = SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")
clf = EnsembleVoteClassifier(clfs=[clf1, clf2], weights=[2, 1], voting='soft')

clf.fit(X_train, newsgroups_train.target)

#pred_t = clf.predict(X_train)
#print(metrics.precision_score(newsgroups_train.target, pred_t, average='macro'))

vectors_test2 = count_vect.transform(newsgroups_test.data)
vectors_test = tfidf_transformer.transform(vectors_test2)
X_test = ch2.transform(vectors_test)
pred = clf.predict(X_test)
print(metrics.precision_score(newsgroups_test.target, pred, average='macro'))
        logging.info(f'Training {classifier_name}...')

        clf.fit(X_train, y_train)

        score = balanced_accuracy_score(y_test, clf.predict(X_test))

        logging.info(f'{classifier_name} BAC = {score:.4f}')

        probabilities = clf.predict_proba(X_test)
        np.save(PROBABILITIES_PATH / f'{classifier_name}.cv.{args.fold}.npy',
                probabilities)

        results.append([classifier_name, score])

    ensemble = EnsembleVoteClassifier(list(classifiers.values()),
                                      voting='soft',
                                      fit_base_estimators=False)
    ensemble.fit(X_train, y_train)

    score = balanced_accuracy_score(y_test, ensemble.predict(X_test))

    logging.info(f'Ensemble BAC = {score:.4f}')

    results.append(['Ensemble', score])

    with open(MODELS_PATH / f'ensemble.cv.{args.fold}.pickle', 'wb') as f:
        pickle.dump(ensemble, f)

    df = pd.DataFrame(results, columns=['Classifier', 'BAC'])
    df.to_csv(RESULTS_PATH / f'{args.fold}.csv', index=False)
y_valid = pd.DataFrame()
y_valid['target'] = x_valid['target']
x_valid.drop('target', axis=1, inplace=True)
x_train_0 = pd.DataFrame(X[X['target'] == 0][:90])
x_train_1 = pd.DataFrame((X[X['target'] == 1][:900]))
x_train_2 = pd.DataFrame(X[X['target'] == 2][:1300])
x_train_3 = pd.DataFrame(X[X['target'] == 3][:420])
x_train_4 = pd.DataFrame(X[X['target'] == 4][:90])
x_train = pd.DataFrame(
    pd.concat([x_train_0, x_train_1, x_train_2, x_train_3, x_train_4], axis=0))
y_train = pd.DataFrame()
y_train['target'] = x_train['target']
x_train.drop('target', axis=1, inplace=True)

eclf.fit(x_train[best_columns], y_train['target'])
preds = eclf.predict(x_valid[best_columns])
print('Confusion matrix:\n')
print(confusion_matrix(y_valid['target'].values, preds))
matrix_ = confusion_matrix(y_valid['target'].values, preds)
correct_answers = matrix_[0][0] + matrix_[1][1] + matrix_[2][2] + matrix_[3][
    3] + matrix_[4][4]
print('Correct answers count: ', correct_answers)

# --- answer module ---
eclf.fit(X[best_columns], Y['target'])
score_dataset = pd.read_csv('original_data/x_test.csv',
                            delimiter=';',
                            names=names)
y_pred = eclf.predict(score_dataset[best_columns])
pd.Series(y_pred).to_csv('data/answer.csv', index=False)
clf4 = GradientBoostingClassifier()

print('10-fold cross validation:\n')

#np.random.seed(123)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4],
                              weights=[1, 1, 1, 1],
                              voting='soft')
#from sklearn.model_selection import ShuffleSplit
#for clf, label in zip([clf1, clf2, clf3], ['Logistic Regression', 'Random Forest', 'SVM']
#for clf, label in zip([clf1, clf3, cl4,eclf], ['Logistic Regression','RandomForest','SVM','Xgboost','Voting Ensemble']):

#    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
#    print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean()*100, scores.std(), label))
eclf.fit(X_train, Y_train)
y_pred = eclf.predict(X_test)
print(accuracy_score(Y_test, y_pred) * 100)
X = np.concatenate((X_train, X_test), 0)
Y = np.concatenate((Y_train, Y_test), 0)

#    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
#    scores=cross_val_score(clf, X, y, cv=cv)
#    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
#    accuracies=cross_val_score(estimator=clf,X=X,y=Y,cv=10)
#    print(accuracies.mean()*100,accuracies.std()*100)

#    print("Accuracy: %0.4f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Mamun_confusion_matrix = confusion_matrix(Y_test,
                                          y_pred,
                                          labels=[1, 2, 3, 4, 5, 6, 12, 13])
# NOTE: this rebinds the name of sklearn's confusion_matrix function, so the
# function is no longer callable under that name in this scope.
confusion_matrix = Mamun_confusion_matrix
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)

train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
clf4 = MultinomialNB(alpha=.01)
clf5 = xgb.XGBClassifier()
eclif = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4, clf5],
                               weights=[2, 4, 2, 4, 7],
                               voting='soft')
eclif.fit(train_vectors, y_train)

pred = eclif.predict(test_vectors)

f_1 = sklearn.metrics.f1_score(y_test, pred, average='weighted')
print "f_1 is " + str(f_1)

with open(f_1_f, "w") as f:
    f.write("f_1 is " + str(f_1))

c = make_pipeline(vectorizer, eclif)

nb_success = 0
nb_fail = 0

result_list = []
result_label = []
result_accepted_list_ml = []
trained = util.load_pickle(name='fs_1', path='..\\pickles\\feature_sets\\')
print('trained', size(trained))
test = util.load_pickle(name='fs_test_1', path='..\\pickles\\test_features\\')
print('test', size(test))

test_data = test['data_set']
featureset = 'fs_words_bigrams_pos'

X_train, y_train = trained[featureset], trained['labels']
X_test, y_test = test[featureset], test['labels']
feat_size = X_train.shape[1]
x = load_from_file()
svm = x['svm']
xgb = x['xgb']
knn = x['knn']
nb = x['nb']
dt = x['dt']
rf = x['rf']
nn = x['nn']
mc = x['mc']
estimators = [svm.clf, xgb.clf, nb.clf, dt.clf, rf.clf]  # mc.clf and nn.clf left out
#y_pred = predict_from_multiple_estimator(estimators, X_test)

from mlxtend.classifier import EnsembleVoteClassifier

combined = EnsembleVoteClassifier(clfs=estimators, voting='hard', refit=False)
combined.fit(X_train, y_train)
y_pred = combined.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))
Example 16
                            n_estimators=100)
# current best
clf6 = ExtraTreesClassifier(max_features=0.45,
                            min_samples_leaf=1,
                            min_samples_split=5,
                            n_estimators=100)

eclf = EnsembleVoteClassifier(clfs=[clf3, clf4, clf5, clf6],
                              weights=[1, 1, 1, 1],
                              voting='soft')

labels = ['Trees_3', 'Trees_4', 'Trees_5', 'Trees_6', 'Ensemble']

for clf, label in zip([clf3, clf4, clf5, clf6, eclf], labels):

    scores = model_selection.cross_val_score(clf,
                                             X[best_columns],
                                             Y['target'],
                                             cv=4,
                                             scoring='neg_log_loss')
    print("Log Loss: %0.3f (+/- %0.3f) [%s]" %
          (scores.mean(), scores.std(), label))

# --- answer module ---
eclf.fit(X[best_columns], Y['target'])
score_dataset = pd.read_csv('original_data/x_test.csv',
                            delimiter=';',
                            names=names)
y_pred = eclf.predict(score_dataset[best_columns])
pd.Series(y_pred).to_csv('data/answer.csv', index=False)
Example 17
list_of_cv_acc.append(clf5_avg_f1)


# In[50]:


clf6_pipe,clf6_avg_f1 = set_pipe(clf6, mi_feats, 'knn_')
list_of_cv_acc.append(clf6_avg_f1)


# In[51]:


enclf = EnsembleVoteClassifier(
    (clf1_pipe, clf2_pipe, clf3_pipe, clf4_pipe, clf5_pipe, clf6_pipe),
    refit=False)
enclf.fit(X_train, y_train)
y_pred = enclf.predict(X_test)
con_mat = confusion_matrix(y_test, y_pred)
    
#print("Cross Val acc score:         ", (model_selection.cross_val_score(enclf, X_train, y_train, cv = 5,)).mean())
#print("Cross Val f1  score:         ", (model_selection.cross_val_score(enclf, X_train, y_train, cv = 5, scoring = 'f1')).mean())
print()
print("Overall Acc score:           ", accuracy_score(y_test, y_pred))
print("Recall score (Tru Pos Rate): ", recall_score(y_test, y_pred))
print("Precision score:             ", precision_score(y_test, y_pred))
print("Neg Predictive Val:          ", con_mat[0][0] / (con_mat[0][1] + con_mat[0][0]))
print("Tru Neg Rate(Specifi):       ", con_mat[0][0] / (con_mat[1][0] + con_mat[0][0]))
print("F1 score:                    ", f1_score(y_test, y_pred))
print("Auc score:                   ", roc_auc_score(y_test, y_pred))
print(con_mat)
print()
(pd.DataFrame(y_pred)).to_csv('maj_vote' + 'y_pred_avg_filt.csv')
y_valid = pd.DataFrame()
y_valid['target'] = x_valid['target']
x_valid.drop('target', axis=1, inplace=True)
x_train_0 = pd.DataFrame(X[X['target'] == 0][:90])
x_train_1 = pd.DataFrame((X[X['target'] == 1][:900]))
x_train_2 = pd.DataFrame(X[X['target'] == 2][:1300])
x_train_3 = pd.DataFrame(X[X['target'] == 3][:420])
x_train_4 = pd.DataFrame(X[X['target'] == 4][:90])
x_train = pd.DataFrame(
    pd.concat([x_train_0, x_train_1, x_train_2, x_train_3, x_train_4], axis=0))
y_train = pd.DataFrame()
y_train['target'] = x_train['target']
x_train.drop('target', axis=1, inplace=True)

eclf.fit(x_train[best_columns], y_train['target'])
preds = eclf.predict(x_valid[best_columns])
print('Confusion matrix:\n')
print(confusion_matrix(y_valid['target'].values, preds))
matrix_ = confusion_matrix(y_valid['target'].values, preds)
print(type(matrix_))
correct_answers = matrix_[0][0] + matrix_[1][1] + matrix_[2][2] + matrix_[3][
    3] + matrix_[4][4]
print('Correct answers count: ', correct_answers)

for iteration in range(10):
    best_score = 1
    best_feature = ''
    for feature in good_features_drop_no_corr:
        my_columns = best_columns[:]
        my_columns.append(feature)
        if feature in best_columns:
plot_conf(svc)

results_acc['svc'] = accscorsv
results_f1['svc']  = f1scorsv
#########################Boosting#################################

log = LogisticRegression(solver='lbfgs', class_weight='balanced')
ada = AdaBoostClassifier(n_estimators=5, base_estimator=log)
grad_boost = GradientBoostingClassifier(n_estimators=100)
xgb = XGBClassifier(max_depth=8, learning_rate=0.001, use_label_encoder=False)

ensemble = EnsembleVoteClassifier(clfs = [ada, grad_boost, xgb], voting='hard')

ensemble.fit(X_train, y_train)

y_preden = ensemble.predict(X_test)
f1scoren = metrics.f1_score(y_test, y_preden)
accscoren = ensemble.score(X_test, y_test)
results_acc['ensemble'] = accscoren
results_f1['ensemble']  = f1scoren

print(classification_report(y_test, y_preden))
plot_conf(ensemble)
###############################################################################

naive = GaussianNB(var_smoothing=2e-9)
naive.fit(X_train, y_train)

y_pred  = naive.predict(X_test)
f1scornb = metrics.f1_score(y_test, y_pred)
accscornb = naive.score(X_test, y_test)
Example 20
class MulticriteriaEnsemble(object):
    def __init__(self,
                 models=OrderedDict({}),
                 dataset=None,
                 pickle_path=None,
                 crit_metrics=None,
                 global_metric=None,
                 delta=None,
                 epsilon=None,
                 a=None,
                 bootstrap_models=OrderedDict({}),
                 n_splits=5,
                 voting='soft',
                 jenks=True,
                 jenks_limit=2,
                 refit=False):
        self.models = models
        self.bootstrap_models = bootstrap_models
        self.dataset = dataset
        self.crit_metrics = crit_metrics
        self.global_metric = global_metric
        self.delta = delta
        self.best_delta = None
        self.epsilon = epsilon
        self.a = a
        self.voting = voting
        self.n_splits = n_splits
        self.refit = refit
        self.pickle_path = self.dataset.path + 'base_learners/'
        self.multicriteria_table = None
        self.meta_table = None
        self.utastar_model = None
        self.wmv_model = None
        self.natural_breaks = None
        self.weights = []
        self.global_utilities = []
        self.kfold_indices = []
        self.test_kfold_indices = []
        self.global_metrics = []
        self.is_fit = {
            'wmv': False,
            'clfs': not self.refit,
            'utastar': False,
        }
        self.jenks = jenks
        self.jenks_limit = jenks_limit
        if not self.models and refit == True:
            raise Exception('Base learners are not provided.')
        elif self.models and refit == False:
            raise Exception(
                'Models parameter should not be set to anything while refit=False'
            )
        if self.dataset == None:
            raise Exception('Dataset is not provided.')
        if self.crit_metrics == None:
            raise Exception('Performance estimators are not provided.')
        if self.global_metric == None:
            raise Exception('Global Performance estimator is not provided.')
        if self.delta == None or self.a == None or self.epsilon == None:
            raise Exception(
                'One or more utastar model parameters is/are not provided.')

    def _pso_cost(self, x):
        self.delta = x[0]
        self.epsilon = x[1]
        if self.is_fit['wmv']:
            self.fit(mtable=False)
        else:
            self.fit()
        return 1 - self.score()

    def pso(self, bounds, num_particles, w, c1, c2, maxiter, threshold):
        psopt(self._pso_cost, bounds, num_particles, w, c1, c2, maxiter,
              threshold)

    def _save_model(self, model, file_name):
        print "Saving Model!"
        if os.path.isfile(self.pickle_path + file_name):
            if not os.path.exists(self.pickle_path + 'Archive/'):
                os.makedirs(self.pickle_path + 'Archive/')
            archived_file_name = self.pickle_path + 'Archive/' + file_name.replace(
                '.pkl', '_') + datetime.datetime.today().strftime(
                    "%m-%d-%Y-%H%M%S") + '.pkl'
            shutil.move(self.pickle_path + file_name, archived_file_name)
            joblib.dump(model, self.pickle_path + file_name)
            print "Model Saved!!!"
        else:
            print "Model Saved!!!"
            joblib.dump(model, self.pickle_path + file_name)

    #Reinitialize crucial variables
    def _reset(self):
        self.global_utilities = []
        self.weights = []
        self.kfold_indices = []
        if self.refit == True:
            self.bootstrap_models = OrderedDict({})
            print('Multicriteria Table Deleted!!!')
            self.multicriteria_table = None
            self.meta_table = None

    #Split dataset to k stratified folds and save the indices
    def _skfold(self, n_splits):
        skf = StratifiedKFold(n_splits=n_splits,
                              shuffle=True,
                              random_state=12345)
        for train_index, test_index in skf.split(self.dataset.X_train,
                                                 self.dataset.y_train):
            self.kfold_indices.append(train_index.tolist())
            self.test_kfold_indices.append(test_index.tolist())

    #Fit the base learners
    def _fit_clfs(self):
        #If the path that the models will be saved does not exist create it
        if not os.path.exists(self.pickle_path):
            os.makedirs(self.pickle_path)
        #For every fold
        for k_idx, k in enumerate(self.kfold_indices):
            #Make a copy of the base learners
            temp_models = OrderedDict(
                zip(self.models.keys(), clone(list(self.models.values()))))
            #For every model in the base learners create a separate model , train it on the current fold and save it
            for model in temp_models.keys():
                model_name = '%s_%s_FOLD%i' % (model.replace(
                    '_' + self.dataset.name, ''), self.dataset.name, k_idx)
                temp_models[model].fit(self.dataset.X_train.iloc[k],
                                       self.dataset.y_train.iloc[k])
                file_name = model_name + '.pkl'
                self._save_model(temp_models[model], file_name)
                self.bootstrap_models[model_name] = temp_models[model]

        #Rename the base learners to include the dataset name, fit the models and save them
        if not all(self.dataset.name in m for m in self.models):
            self.models = self._rename_models(self.models)
        for model in self.models.keys():
            self.models[model].fit(self.dataset.X_train, self.dataset.y_train)
            self._save_model(self.models[model], model + '.pkl')

    #Fit the utastar model
    def _fit_utastar(self):
        #Define the Utastar model
        self.utastar_model = Utastar(self.multicriteria_table, self.meta_table,
                                     self.delta, self.epsilon)
        #Fit the Utastar model
        self.utastar_model.solve()

    def _get_global_utilities(self):
        metrics = self._get_metrics(self.bootstrap_models, on='test')
        self._utastar_predict(metrics)

    #Fit the Weighted Majority Voting model
    def _fit_wmv(self):
        #Merge the base learners and the produced models(extra)
        models = list(self.bootstrap_models.values())
        #Define the Weighted Majority Voting model
        self.wmv_model = EnsembleVoteClassifier(clfs=models,
                                                weights=self.weights,
                                                voting=self.voting,
                                                refit=False)
        #Fit the WMV model
        self.wmv_model.fit(self.dataset.X_train, self.dataset.y_train)

    #Fit the Multicriteria Ensemble Model
    def fit(self, mtable=True):
        #Reinitialize crucial variables
        self._reset()
        #Get Stratified K-Fold indices
        self._skfold(self.n_splits)
        #if refit is needed,fit the models
        if self.refit:
            self._fit_clfs()
            self.is_fit['clfs'] = True
        else:
            #Check that saved base learners exist on disk and load them
            try:
                for base_learner in next(os.walk(self.pickle_path))[2]:
                    if 'FOLD' in base_learner:
                        self.bootstrap_models[base_learner.replace(
                            '.pkl', '')] = joblib.load(self.pickle_path +
                                                       '%s' % base_learner)
                    else:
                        self.models[base_learner.replace(
                            '.pkl', '')] = joblib.load(self.pickle_path +
                                                       '%s' % base_learner)
                #Fails if nothing was loaded
                dummy_var = list(self.bootstrap_models.keys())[1]
            except Exception:
                raise AttributeError(
                    'Refit is set to False but no models are given.')
        if mtable == False and self.multicriteria_table is None:
            raise Exception(
                'Multicriteria table not found. Please run fit(mtable=True) at least once.'
            )
        elif mtable == True:
            print('Multicriteria table formed!!!')
            self._get_meta_table()
            self._get_multicriteria_table()
        self._fit_utastar()
        self._get_global_utilities()
        self._get_clfs_weights()
        self._fit_wmv()
        self.is_fit['wmv'] = True

    def predict(self, X):
        return self.wmv_model.predict(X)

    def predict_proba(self, X):
        return self.wmv_model.predict_proba(X)

    def _get_clfs_weights(self):
        gu = self.global_utilities
        if self.jenks == True:
            self.natural_breaks = jenkspy.jenks_breaks(gu, nb_class=5)
            gu = [
                i if i >= self.natural_breaks[-self.jenks_limit] else 0
                for i in gu
            ]
        gu_sum = sum(gu)
        for value in gu:
            self.weights.append(value / gu_sum)

    def add_clfs(self, clfs, refit=False):
        clfs = self._rename_models(clfs)
        if set(self.models.keys()).isdisjoint(clfs.keys()):
            if not refit:
                metrics = self._get_metrics(clfs)
                self.models.update(clfs)
            else:
                temp_models = {}
                for clf in clfs.keys():
                    temp_models[clf] = clone(clfs[clf])
                    temp_models[clf].fit(self.dataset.X_train,
                                         self.dataset.y_train)
                metrics = self._get_metrics(temp_models)
                self.models.update(temp_models)
            self._utastar_predict(metrics)
            self.weights = []
            self._get_clfs_weights()
            self._fit_wmv()
        else:
            raise Exception('One or more models are already in the ensemble.')

    def score(self):
        return self._get_global_metrics({'wmv': self.wmv_model}, on='test')[0]

    def _utastar_predict(self, metrics):
        for clf_metrics in metrics:
            pred_partial_util = []
            for crit in self.utastar_model.criteria:
                X = self.utastar_model.intervals[crit]
                y = self.utastar_model.marginal_post[crit]
                pred_partial_util.append(
                    np.interp(
                        clf_metrics[
                            self.utastar_model.criteria.tolist().index(crit) +
                            1], X, y))
            pred_global_util = np.array(pred_partial_util).dot(
                np.array(clf_metrics[1:]))
            self.global_utilities.append(pred_global_util)

    def _rename_models(self, models):
        for model in models.keys():
            model_name = '%s_%s' % (model, self.dataset.name)
            models[model_name] = models.pop(model)
        return models

    def plot_partial_utilities(self):
        n = len(self.utastar_model.criteria)
        rows = n // 2 if n % 2 == 0 else n // 2 + 1
        fig1, axs = plt.subplots(rows, 2, figsize=(18, 18))
        for i in range(n):
            crit = self.utastar_model.criteria[i]
            y = self.utastar_model.marginal_post[crit]
            x = self.utastar_model.intervals[crit]
            ax = axs[i // 2, i % 2]
            #Discrete criteria (type 1) are drawn dashed, continuous ones solid
            style = '--ok' if self.utastar_model.get_type(crit) == 1 else '-ok'
            ax.plot(x, y, style)
            ax.set_title(crit)
            ax.set_xticks(x)
            ax.set_xlim(x[0], x[-1])
            ax.set_ylabel(r'$u_{%d}(g_{%d})$' % ((i + 1), (i + 1)))
            ax.yaxis.grid(False)
            #Reverse the x-axis for criteria that are minimised
            if self.utastar_model.get_monotonicity(crit) == 1:
                ax.set_xlim(x[-1], x[0])
        if n % 2 != 0:
            for l in axs[i // 2 - 1, 1].get_xaxis().get_majorticklabels():
                l.set_visible(True)
            fig1.delaxes(axs[i // 2, 1])
        #plt.subplots_adjust(wspace = 0.3,hspace = 0.3)
        plt.tight_layout()
        plt.show()

    def plot_global_utilities(self):
        fig4 = plt.figure(4)
        ax = fig4.gca()
        ax.barh(range(len(self.utastar_model.global_utilities_post))[::-1],
                list(self.utastar_model.global_utilities_post.values()),
                align='center',
                color='grey',
                alpha=0.8)
        plt.yticks(
            range(len(self.utastar_model.global_utilities_post))[::-1],
            list(self.utastar_model.global_utilities_post.keys()))
        ax.plot(list(self.utastar_model.global_utilities_post.values()),
                range(len(self.utastar_model.global_utilities_post))[::-1],
                linestyle='--',
                color='black',
                alpha=0.8)
        plt.xlim(0, 1)
        plt.title('Ranking')
        plt.tight_layout()
        plt.show()

    def plot_global_utilities_pred(self):
        fig4 = plt.figure(4)
        ax = fig4.gca()
        ax.barh(range(len(self.global_utilities))[::-1],
                self.global_utilities,
                align='center',
                color='grey',
                alpha=0.8)
        plt.yticks(
            range(len(self.global_utilities))[::-1],
            list(self.bootstrap_models.keys()))
        ax.plot(self.global_utilities,
                range(len(self.global_utilities))[::-1],
                linestyle='--',
                color='black',
                alpha=0.8)
        plt.xlim(0, 1)
        plt.title('Ranking')
        plt.tight_layout()
        plt.show()

    def plot_criteria_weights(self):
        variables = list(self.utastar_model.model_weights_post.keys())
        data = list(self.utastar_model.model_weights_post.values())
        ranges = [
            (0.00001,
             0.00001 + max(self.utastar_model.model_weights_post.values()))
        ] * len(self.utastar_model.criteria)
        fig1 = plt.figure(figsize=(10, 10))
        radar = ComplexRadar(fig1, variables, ranges, 7)
        radar.plot(data)
        #dradar.fill(data, alpha=0.2, color='grey')
        plt.show()

    def plot_model_weights(self, title):
        sns.set(style="whitegrid")
        f, ax = plt.subplots(figsize=(10, 4))
        variables = dict(
            sorted(zip(self.bootstrap_models.keys(), self.weights)))
        sns.set_color_codes("pastel")
        f = sns.barplot(x=list(variables.keys()), y=list(variables.values()),
                        color="b").set_title(title)
        ax.set_xticklabels(ax.get_xticklabels(),
                           rotation=45,
                           fontdict={
                               'verticalalignment': 'baseline',
                               'horizontalalignment': 'right'
                           })
        ax.set(xlim=(-1, 30), ylabel="Weight", xlabel="Models")
        sns.despine(left=True, bottom=True)

    def _get_meta_table(self):
        columns = [
            'Cri/atributes', 'Monotonicity', 'Type', 'Worst', 'Best', 'a'
        ]
        meta_table = []
        for metric in self.crit_metrics.keys():
            monotonicity = 1
            if self.crit_metrics[metric][0]._sign == -1:
                monotonicity = 0
                self.crit_metrics[metric][0]._sign = 1
            mt_metric = [
                metric, monotonicity, 0, self.crit_metrics[metric][1],
                self.crit_metrics[metric][2], self.a
            ]
            meta_table.append(mt_metric)
        self.meta_table = pd.DataFrame(meta_table, columns=columns)

    def _get_multicriteria_table(self):
        criteria = self.crit_metrics.keys()
        columns = ['Alt/Cri ']
        columns.extend(criteria)
        #metrics_orig = self._get_metrics(self.models)
        metrics_bootstrap = self._get_metrics(self.bootstrap_models,
                                              on='validation')
        metrics = metrics_bootstrap
        multicriteria_table = pd.DataFrame(metrics, columns=columns)
        ranking = self._get_init_ranking()
        ranking = pd.DataFrame(ranking, columns=['Ranking'])
        self.multicriteria_table = multicriteria_table.join(ranking).copy(
            deep=True)

    def _get_dataset(self, model, on='test'):
        if on == 'test':
            X, y = self.dataset.X_test.copy(), self.dataset.y_test.copy()
        elif on == 'validation':
            X, y = self.dataset.X_train.copy(), self.dataset.y_train.copy()
            if 'FOLD' in model:
                fold_idx = int(re.search(r'(?<=FOLD)[0-9]', model).group(0))
                indices = self.test_kfold_indices[fold_idx]
                X, y = X.iloc[indices], y.iloc[indices]
        elif on == 'train':
            X, y = self.dataset.X_train, self.dataset.y_train
            if 'FOLD' in model:
                fold_idx = int(re.search(r'(?<=FOLD)[0-9]', model).group(0))
                indices = self.kfold_indices[fold_idx]
                X, y = X.iloc[indices], y.iloc[indices]
        else:
            raise Exception('Unexpected input for argument on.')
        return X, y

    def _get_global_metrics(self, models, on='test'):
        global_metrics = []
        for model in models.keys():
            X, y = self._get_dataset(model, on=on)
            global_metrics.append(self.global_metric(models[model], X, y))
        return global_metrics

    def _get_init_ranking(self):
        #gm_orig = self._get_global_metrics(self.models,on='validation')
        gm_bootstrap = self._get_global_metrics(self.bootstrap_models,
                                                on='validation')
        #gm = gm_orig + gm_bootstrap
        self.global_metrics = gm_bootstrap
        if self.global_metric._sign == 1:
            ranking = len(self.global_metrics) - scipy.stats.rankdata(
                self.global_metrics, method='max')
        else:
            ranking = scipy.stats.rankdata(self.global_metrics, method='max')
        return ranking

    def _get_metrics(self, models, on='test'):
        metrics = []
        for model in models.keys():
            model_metrics = [model]
            X, y = self._get_dataset(model, on=on)
            for metric in self.crit_metrics.keys():
                mes = self.crit_metrics[metric][0](models[model], X, y)
                #Takes care of negative values on the multicriteria table and replaces them with 0
                if mes > 0:
                    model_metrics.append(mes)
                else:
                    model_metrics.append(0)
            metrics.append(model_metrics)
        return metrics
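A hedged, self-contained illustration (the utility values are invented) of the weighting step in _get_clfs_weights above, using jenkspy's nb_class argument as the class does: utilities below the second-highest Jenks natural-breaks threshold are zeroed, and the survivors are normalised into voting weights.

# Standalone sketch of the Jenks-based weighting; the utilities are made up.
import jenkspy

gu = [0.92, 0.88, 0.81, 0.44, 0.37, 0.21, 0.10]
breaks = jenkspy.jenks_breaks(gu, nb_class=5)   # natural-breaks thresholds
gu = [u if u >= breaks[-2] else 0 for u in gu]  # jenks_limit=2 keeps the top class
weights = [u / sum(gu) for u in gu]             # normalise to sum to 1
print(weights)                                  # zeros for the pruned models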
eclf_soft = EnsembleVoteClassifier(clfs=[
    lgbm_calibrator, knn_calibrator, rf_calibrator, ada_calibrator,
    extra_calibrator
],
                                   weights=weights,
                                   refit=False,
                                   voting='soft',
                                   verbose=1)

print('fitting')
#eclf_hard.fit(train_xm, train_y)
eclf_soft.fit(train_xm, train_y)
print('predicting')
#eclf_hard_pred = eclf_hard.predict(val_xm)
#eclf_hard_pred_pr = eclf_hard.predict_proba(val_xm)

eclf_soft_pred = eclf_soft.predict(val_xm)
eclf_soft_pred_pr = eclf_soft.predict_proba(val_xm)

#evaluating majority voting
voter_pred = eclf_soft_pred
voter_pred_pr = eclf_soft_pred_pr

acc_voter = accuracy_score(val_y, voter_pred)
roc_voter = roc_auc_score(val_y, voter_pred_pr[:, 1])
f1_voter = f1_score(val_y, voter_pred)
precision_voter = precision_score(val_y, voter_pred)
recall_voter = recall_score(val_y, voter_pred)
log_loss_voter = log_loss(val_y, voter_pred_pr)
print('accuracy voter: ', acc_voter)
print('roc voter: ', roc_voter)
print('f1 voter: ', f1_voter)
Example 22
classifier = EnsembleVoteClassifier(clfs=(
    xgb.XGBClassifier(
        n_estimators=150, max_depth=4, subsample=0.7, colsample_bytree=0.4),
    RandomForestClassifier(n_estimators=500,
                           max_features='auto',
                           min_samples_split=20,
                           min_samples_leaf=5),
    GradientBoostingClassifier(
        n_estimators=250, max_features=5, min_samples_leaf=5),
    ExtraTreesClassifier(n_estimators=75, min_samples_leaf=5)))

# classifier = StackingClassifier(
#     classifiers=(
#         xgb.XGBClassifier(n_estimators=150, max_depth=4, subsample=0.7, colsample_bytree=0.4),
#         RandomForestClassifier(n_estimators=500, max_features='auto', min_samples_split=20, min_samples_leaf=5),
#         GradientBoostingClassifier(n_estimators=250, max_features=5, min_samples_leaf=5),
#         ExtraTreesClassifier(n_estimators=75, min_samples_leaf=5)
#     ),
#     meta_classifier=LogisticRegression()
# )

# scores = cross_val_score(voting_classifier, getX(data), data['Survived'], cv=5, scoring='accuracy')
# print np.mean(scores)

classifier.fit(getX(data), data['Survived'])

data = pd.read_csv('titanic-test.csv')
data = preprocess(data)

data['Survived'] = classifier.predict(getX(data))

data.to_csv('titanic-submission.csv',
            index=False,
            columns=['PassengerId', 'Survived'])
    clf.fit(X, y)

# In[31]:

from mlxtend.classifier import EnsembleVoteClassifier
import copy

eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                              weights=[1, 1, 1],
                              refit=False)

labels = ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']

eclf.fit(X, y)

print('accuracy:', np.mean(y == eclf.predict(X)))

# ## Example 6 - Ensembles of Classifiers that Operate on Different Feature Subsets

# In[32]:

from sklearn.datasets import load_iris
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X = iris.data
y = iris.target
clf1 = bestLogReg_model
clf2 = bestSVC_model
clf3 = bestnn_model
clf4 = best_rf
clf5 = bestgrad_model

from mlxtend.classifier import EnsembleVoteClassifier
import copy
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4, clf5],
                              weights=[2, 2, 2, 2, 1],
                              refit=False)
eclf.fit(X_test_scaled, y_test)

print('Ensemble Model accuracy:',
      np.mean(y_test == eclf.predict(X_test_scaled)) * 100, "%")

# # Models - Visualizations

# In[28]:

import itertools
clfs = [clf1, clf2, clf3, clf4, clf5, eclf]
labels_eclf = [
    "Logistic Regression", 'SVC', "Neural Network", 'Random Forest',
    'GradientBoosting', "Ensemble Model"
]
pca = PCA(n_components=2)
x_train2 = pca.fit_transform(X_test_scaled)

gs = gridspec.GridSpec(2, 2)
Example 25
class ExtendedBaggingClassifier:
    def __init__(self, voting="hard", verbose=False, parallel=True, target_name='target'):
        self.models = []
        self.temporary_models = []
        self.voting = voting
        self.predictions = []
        self.votingClassifier = None
        self.verbose = verbose
        self.parallel = parallel
        self.target_name = target_name

    def _get_models(self):
        base_models = []
        for model in self.models:
            base_models.append(model.model)
        return base_models

    def add_models(self, model, params):
        """
        Create all the possible combinations of the model with given parameters.
        Usage example:
            params = {
                'C': np.logspace(0, 4, num=10),
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }

            custom_bagging = ExtendedBaggingClassifier(verbose=True, parallel=True)
            custom_bagging.add_models(LogisticRegression, params)

        :param model: The name of the model (passed without calling the constructor) that is intended to be used
        :param params: key-value pairs of hyperparameters that will be used to generate all the possible models
        :return: the number of models of the ensemble
        """
        if self.votingClassifier is not None:
            self.votingClassifier = None
        keys = list(params)
        for values in itertools.product(*map(params.get, keys)):
            model_instance = model(**dict(zip(keys, values)))
            self.temporary_models.append((str(model_instance), model_instance))
        return len(self.temporary_models)

    def add_model(self, model):
        """
        Add a model to the ensemble
        :param model: instance of the model
        :return: the number of models in the ensemble
        See also :add_models.
        """
        if self.votingClassifier is not None:
            self.votingClassifier = None
        self.temporary_models.append((str(model), model))
        return len(self.temporary_models)

    def _commit_single_model(self, n_samples, temp_model):
        """
        train_set, oob_set = self._generate_bootstrap_sample(Xy)
        return BaseModel(temp_model[0], temp_model[1], train_set, oob_set, self.target_name)
        """
        sampled_idx, unsampled_idx = self._generate_indexes(len(self.temporary_models), n_samples)
        return BaseModelIdx(temp_model[0], temp_model[1], sampled_idx, unsampled_idx, self.target_name)

    def _commit_models(self, X, y):
        """
        Create indexes sets for train and oob validation sets.
        """
        if X.shape[0] != y.shape[0]:
            raise ValueError('It seems that target values (y) are not the same as feature values (X)')

        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._commit_single_model, X.shape[0])
            self.models = pool.map(f, self.temporary_models)
            pool.close()
            pool.join()
        else:
            for temp_model in self.temporary_models:
                self.models.append(self._commit_single_model(X.shape[0], temp_model))

    def _fit_single_model(self, X, y, single_model):
        return single_model.fit(X, y)

    def fit(self, X, y):
        """
        Train all the models in the ensemble.
        :param X: Features values of trainset
        :param y: Target values of trainset
        :return: ---
        """
        # self._commit_models(X, y)
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._fit_single_model, X, y)
            self.models = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                self._fit_single_model(X, y, model)
        self.votingClassifier = EnsembleVoteClassifier(clfs=self._get_models(), voting=self.voting, refit=False)
        self.votingClassifier.fit(X, y)

    def _predict_single_model(self, X, model):
        return model.name, model.predict(X)

    def predict_each_model(self, X):
        """
        Perform a prediction for each model in the ensemble. NOTE! fit(X,y) is required before.
        :param X: Features dataframe to be used for predictions
        :return: List of predictions with model name associated
        """
        if len(self.models) == 0:
            raise ValueError('Probably fit(X,y) method was not called before. Call it!')
        predictions = []
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._predict_single_model, X)
            predictions = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                predictions.append(self._predict_single_model(X, model))
        return predictions

    def score(self, X, y):
        """
        Get the score given X as features values and y as target values. Useful for validation/testing purposes.
        :param X: Features dataframe of trainset
        :param y: Target dataframe of trainset
        :return: score
        """
        return self.votingClassifier.score(X, y)

    def predict(self, X):
        """
        Perform a prediction considering the models as an ensemble. NOTE! train_models() must be called before getting the
        predictions
        :param X: input values to be used for predictions
        :return: list of predictions with model name associated
        """
        return self.votingClassifier.predict(X)

    def _get_single_oob(self, X, y, model):
        return model.name, model.score(X, y)

    def models_oob_score(self, X, y):
        '''
        Compute the OOB score for each model in the ensemble.
        :param X: features dataframe
        :param y: target dataframe
        :return: list of (model name, OOB score) pairs, one per model
        '''

        oob_scores = []
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._get_single_oob, X, y)
            oob_scores = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                oob_scores.append((self._get_single_oob(X, y, model)))
        return oob_scores

    def _ret_accuracy(self, array):
        return array[1]

    def best_model(self, X, y):
        '''
        Find the best model by comparing performances on the OOB set.
        :return: the (model name, score) pair with the highest OOB score
        '''
        performances = self.models_oob_score(X, y)
        return max(performances, key=self._ret_accuracy)

    '''def _generate_bootstrap_sample(self, X):
        df_boot = X.sample(n=X.shape[0], replace=True, random_state=randint(0, 10000))
        oob = pd.concat([df_boot, X]).drop_duplicates(keep=False)
        if self.verbose is True:
            print("OOB set size: %.2f" % float(oob.shape[0] / df_boot.shape[0] * 100), "%")
            print("OOB set abs.:   %i" % oob.shape[0])
        return df_boot, oob'''

    def _generate_indexes(self, num_models, n_samples):
        # The bootstrap seed is drawn from the small range [0, num_models],
        # so with few staged models only a handful of distinct bootstrap
        # samples are possible.
        rand_state = randint(0, num_models)
        sampled_idxs = self._generate_sample_indices(rand_state, n_samples)
        unsampled_idxs = self._generate_unsampled_indices(rand_state, n_samples)
        return sampled_idxs, unsampled_idxs

    def _generate_unsampled_indices(self, random_state, n_samples):
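        # A row is out-of-bag iff it was never drawn: count how many times
        # each index appears in the bootstrap sample and keep the zeros.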
        sample_indices = self._generate_sample_indices(random_state, n_samples)
        sample_counts = np.bincount(sample_indices, minlength=n_samples)
        unsampled_mask = sample_counts == 0
        indices_range = np.arange(n_samples)
        unsampled_indices = indices_range[unsampled_mask]
        return unsampled_indices

    def _generate_sample_indices(self, random_state, n_samples):
        # Bootstrap sample: n_samples indices drawn with replacement.
        random_instance = self._check_random_state(random_state)
        sample_indices = random_instance.randint(0, n_samples, n_samples)
        return sample_indices

    def _check_random_state(self, seed):
        if isinstance(seed, numbers.Integral):
            return np.random.RandomState(seed)
        if isinstance(seed, np.random.RandomState):
            return seed
        raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                         ' instance' % seed)
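The two index helpers above mirror scikit-learn's internal bootstrap bookkeeping: draw n indices with replacement, then treat the rows that were never drawn as the out-of-bag (OOB) validation set. A minimal standalone sketch of the same idea (variable names are illustrative, not from the class above):

import numpy as np

rng = np.random.RandomState(0)
n_samples = 10
# Bootstrap sample: n_samples indices drawn with replacement.
sampled = rng.randint(0, n_samples, n_samples)
# Rows that were never drawn form the OOB set; on average about
# 1/e (~36.8%) of the rows end up out-of-bag.
oob = np.arange(n_samples)[np.bincount(sampled, minlength=n_samples) == 0]
print(sampled, oob)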
labels = [
    'Random Forest', 'Extra Trees', 'Support Vector', 'Decision Tree',
    'Ensemble Vote'
]
for clf, label in zip([clf_RF, clf_ET, clf_svc, clf_DT, eclf], labels):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))

eclf.fit(X_train, y_train)
confidence = eclf.score(X_test, y_test)
print(confidence)

example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])
# Already shaped (1, 9), so this reshape is a no-op kept for safety.
example_measures = example_measures.reshape(len(example_measures), -1)
prediction = eclf.predict(example_measures)
print(prediction)

# Map column positions to names, for reference.
col_dict = dict(enumerate(df.columns))
col_dict

X = np.array(df.drop(columns=['class']), dtype=np.float64)
y = np.array(df['class'], dtype=np.int64)
# plot_decision_regions can only draw two feature axes: pick the two features
# to plot and pin every remaining feature to a fixed numeric value (here the
# column mean) with a tolerance range. Passing column names, as the original
# col_dict did, would fail because filler values must be numbers.
filler_values = {i: X[:, i].mean() for i in range(2, X.shape[1])}
filler_ranges = {i: X[:, i].std() for i in range(2, X.shape[1])}
plot_decision_regions(
    X=X,
    y=y,
    clf=eclf,
    feature_index=[0, 1],
    filler_feature_values=filler_values,
    filler_feature_ranges=filler_ranges,
)
Example 27
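The example below wires a single classifier into EnsembleVoteClassifier with voting='soft'. For contrast, here is a minimal hard-vs-soft voting sketch on a toy dataset (the dataset and member models are illustrative, not part of the example): hard voting takes the majority class label, while soft voting averages predicted class probabilities.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import EnsembleVoteClassifier

X_toy, y_toy = load_iris(return_X_y=True)
members = [LogisticRegression(max_iter=1000),
           DecisionTreeClassifier(random_state=0)]
# 'hard' = majority label vote; 'soft' = argmax of averaged probabilities.
hard = EnsembleVoteClassifier(clfs=members, voting='hard').fit(X_toy, y_toy)
soft = EnsembleVoteClassifier(clfs=members, voting='soft').fit(X_toy, y_toy)
print(hard.score(X_toy, y_toy), soft.score(X_toy, y_toy))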
class GetThatEnsemble:
    def __init__(self, cpu):
        self.names = ['f_' + str(i) for i in range(223)]
        self.X = pd.read_csv('original_data/x_train.csv',
                             delimiter=';',
                             names=self.names)
        self.Y = pd.read_csv('original_data/y_train.csv',
                             names=['target'],
                             delimiter=';')

        # Feature generator block: pairwise interaction features.
        # Note that mul_10 duplicates mul_2 (both are f_11 * f_200).
        self.interaction_pairs = [
            ('f_138', 'f_96'), ('f_138', 'f_156'), ('f_11', 'f_200'),
            ('f_96', 'f_83'), ('f_200', 'f_83'), ('f_200', 'f_156'),
            ('f_76', 'f_156'), ('f_76', 'f_131'), ('f_76', 'f_182'),
            ('f_41', 'f_182'), ('f_11', 'f_200'),
        ]
        for i, (a, b) in enumerate(self.interaction_pairs):
            self.X['mul_%d' % i] = self.X[a] * self.X[b]

        # self.X['mul'] = self.X['f_84'] * self.X['f_182']

        self.default_columns = [
            'f_138',
            'f_11',
            'f_96',
            'f_200',
            'f_76',
            'f_41',
            'f_83',
            'f_156',
            'f_131',
            'f_84',
            'f_182',
            'mul_0',
            'mul_1',
            'mul_2',
            'mul_3',
            'mul_4',
            'mul_5',
            'mul_6',
            'mul_7',
            'mul_8',
            'mul_9',
            'mul_10',
        ]

        self.kf = None
        self.cpu = cpu
        self.pipeline = None

    def get_fold(self, columns, fold_amount=5):
        self.kf = StratifiedKFold(n_splits=fold_amount, shuffle=True)

        # pandas' as_matrix() was removed in modern pandas; use to_numpy().
        features = self.X[columns].to_numpy()
        targets = self.Y['target'].to_numpy()
        for train_index, test_index in self.kf.split(features, targets):
            x_train, x_test = features[train_index], features[test_index]
            y_train, y_test = targets[train_index], targets[test_index]
            # Only the first fold is used per call; shuffle=True plus the
            # caller's reseeding makes each call draw a different split.
            return x_train, y_train, x_test, y_test

    def ensemble(self, folds_limit=42):
        answers = []

        # clf1 = ExtraTreesClassifier(max_features=0.4, min_samples_leaf=1, min_samples_split=4,
        #                             n_estimators=1000, n_jobs=self.cpu)
        # clf2 = ExtraTreesClassifier(criterion="gini", max_features=0.4, min_samples_split=6, n_estimators=1000,
        #                             n_jobs=self.cpu)
        # clf3 = ExtraTreesClassifier(max_features=0.55, min_samples_leaf=1, min_samples_split=4, n_estimators=1000,
        #                             n_jobs=self.cpu)
        # clf4 = ExtraTreesClassifier(max_features=0.45, min_samples_leaf=1, min_samples_split=5, n_estimators=1000,
        #                             n_jobs=self.cpu)

        # default 0.6742 on seed=42 for full set (search_best_3)
        clf1 = ExtraTreesClassifier(max_features=0.4537270875668709,
                                    criterion='entropy',
                                    min_samples_leaf=1,
                                    min_samples_split=2,
                                    n_estimators=3138,
                                    n_jobs=self.cpu)

        # clf1 = RandomForestClassifier(max_features=0.34808889858456293, criterion='entropy',
        #                               min_samples_split=2, n_estimators=4401, n_jobs=self.cpu)

        # default
        # clf1 = ExtraTreesClassifier(max_features=0.4, min_samples_leaf=1, min_samples_split=2, n_estimators=1000,
        #                             n_jobs=self.cpu)

        self.pipeline = EnsembleVoteClassifier(clfs=[clf1],
                                               weights=[1],
                                               voting='soft')
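        # With a single clf and weight, soft voting reduces to that one
        # classifier's argmax of its own predicted probabilities.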

        for iteration in range(folds_limit):
            np.random.seed(42 + iteration)
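            # Reseeding the global NumPy RNG changes the StratifiedKFold
            # shuffle inside get_fold (its random_state is None), so each
            # iteration evaluates a fresh split.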

            x_train, y_train, x_test, y_test = self.get_fold(
                self.default_columns)
            self.pipeline.fit(x_train, y_train)
            preds = self.pipeline.predict(x_test)

            # print(confusion_matrix(y_test, preds))
            matrix_ = confusion_matrix(y_test, preds)
            # The diagonal of the confusion matrix counts correct predictions.
            correct_answers = np.trace(matrix_)
            print('   Correct answers count: ', correct_answers,
                  ' [it: %s]' % iteration)
            answers.append(int(correct_answers))
            if iteration % 5 == 0 and iteration > 0:
                print('Params: mean: %s std: %s best: %s' %
                      (np.mean(answers), np.std(answers), max(answers)))
        print('Params: mean: %s std: %s best: %s' %
              (np.mean(answers), np.std(answers), max(answers)))

    def answers(self, iter_limit=10):
        self.pipeline.fit(self.X[self.default_columns], self.Y['target'])
        score_dataset = pd.read_csv('original_data/x_test.csv',
                                    delimiter=';',
                                    names=self.names)
        # Feature generator block: same interactions as in __init__.
        for i, (a, b) in enumerate(self.interaction_pairs):
            score_dataset['mul_%d' % i] = score_dataset[a] * score_dataset[b]

        predicts = pd.DataFrame()
        for iteration in range(iter_limit):
            if iteration > 0 and iteration % 5 == 0:
                print('[Predict: %s]' % iteration)
            # NOTE: predict() on the already-fitted pipeline is deterministic,
            # so reseeding here does not change the predictions; the columns
            # would only differ if the pipeline were refit inside this loop.
            np.random.seed(42 + iteration)
            y_pred = self.pipeline.predict(score_dataset[self.default_columns])
            predicts[iteration] = y_pred

        # Majority vote across the prediction columns, row by row.
        vote_answer = []
        for pos in range(len(predicts)):
            row_dict = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
            for column in predicts.columns:
                row_dict[str(predicts[column].iloc[pos])] += 1
            best_answer_count = 0
            for key in row_dict.keys():
                if row_dict[key] > best_answer_count:
                    best_answer_count = row_dict[key]
                    best_answer = int(key)
            vote_answer.append(best_answer)

        predicts['votes'] = vote_answer
        predicts['diff'] = predicts['votes'] - predicts[0]
        print(predicts[predicts['diff'] != 0])
        print(predicts[predicts['diff'] != 0].shape)

        predicts['votes'].to_csv('data/answer.csv', index=False)
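The hand-rolled voting loop above can also be expressed with pandas' row-wise mode. A minimal sketch (the toy frame is illustrative; note that mode's tie-breaking picks the smallest label, which may differ from the dict-iteration order above):

import pandas as pd

preds = pd.DataFrame({0: [1, 2, 2], 1: [1, 2, 3], 2: [0, 2, 3]})
# mode(axis=1) returns the most frequent value per row; [0] keeps the
# first mode, which resolves ties toward the smallest label.
votes = preds.mode(axis=1)[0].astype(int)
print(votes.tolist())  # [1, 2, 3]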