def make_model(self):
    # -----------------------------------------------------------------------
    # TREE-BASED ALGORITHMS
    # -----------------------------------------------------------------------
    # Choosing the random_state parameter:
    # a sub-optimal greedy algorithm is repeated a number of times using
    # random selections of features and samples (a technique similar to
    # random forests). The 'random_state' parameter controls these random
    # choices.
    # n_estimators = number of decision trees to be created in the forest.
    model_rf = RandomForestClassifier(n_estimators=145, random_state=10, n_jobs=-1)
    model_rf.fit(train_feats2, target)

    # GradientBoostingClassifier and AdaBoostClassifier accept no n_jobs
    # parameter (boosting is sequential), so it is not passed here.
    model_gb = GradientBoostingClassifier(n_estimators=145, random_state=11)
    model_gb.fit(train_feats2, target)

    model_ab = AdaBoostClassifier(n_estimators=145, random_state=12)
    model_ab.fit(train_feats2, target)

    # -----------------------------------------------------------------------
    # LOGISTIC REGRESSION
    # -----------------------------------------------------------------------
    model_lr = LogisticRegression(random_state=1)
    model_lr.fit(train_feats2, target)

    # -----------------------------------------------------------------------
    # NAIVE BAYES
    # -----------------------------------------------------------------------
    model_nb = MultinomialNB()
    model_nb.fit(train_feats2, target)

    # -----------------------------------------------------------------------
    # VOTING ENSEMBLE OF ALL MODELS
    # -----------------------------------------------------------------------
    clf = [model_rf, model_lr, model_gb, model_ab, model_nb]
    eclf = EnsembleVoteClassifier(clfs=clf,
                                  weights=[1, 2, 1, 1, 1],
                                  refit=False)  # weights can be decided by stacking!
    eclf.fit(train_feats2, target)
    print("model created")

    preds = eclf.predict(test_feats2)
    sub3 = pd.DataFrame({'User_ID': test_df.User_ID, 'Is_Response': preds})
    sub3['Is_Response'] = sub3['Is_Response'].map(
        lambda x: functions.to_labels(self, x))
    sub3 = sub3[['User_ID', 'Is_Response']]
    sub3.to_csv('D:\\New folder\\f2c2f440-8-dataset_he\\SUB_TEST.csv', index=False)
    print("prediction saved")
    return eclf
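# A minimal sketch of deriving the ensemble weights from data instead of the
# hard-coded [1, 2, 1, 1, 1] above. Here each member is weighted by its mean
# cross-validated accuracy; a true stacking approach (as the comment hints)
# would fit a meta-learner on out-of-fold predictions instead. train_feats2
# and target are assumed to exist as in make_model().
from sklearn.model_selection import cross_val_score
from mlxtend.classifier import EnsembleVoteClassifier

def cv_weighted_ensemble(clfs, X, y, cv=5):
    # one weight per classifier: its mean CV accuracy on the training data
    weights = [cross_val_score(c, X, y, cv=cv, scoring='accuracy').mean()
               for c in clfs]
    eclf = EnsembleVoteClassifier(clfs=clfs, weights=weights)
    return eclf.fit(X, y)

# e.g. eclf = cv_weighted_ensemble([model_rf, model_lr, model_gb, model_ab, model_nb],
#                                  train_feats2, target)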
def tri_train(domain, X_train, y_train, X_test, y_test, X_un, theta=0.5, dis=False):
    models = list()
    accs = list()
    for i in range(3):
        X_split, y_split = bootstrap_sample(X_train, y_train)
        acc, clf_func = get_acc_clf(domain, X_split, y_split, X_test, y_test)
        models.append(clf_func)
        accs.append(acc)

    for (j, k) in itertools.combinations(models, 2):
        unlabelled_features = np.array(X_un)
        total = len(X_train) + len(X_un)
        t = 0
        count = 0
        X_i = X_train
        y_i = y_train
        # the current classifier is the one not in the pair (j, k)
        clf_i = [x for x in models if x != j and x != k][0]
        index_i = models.index(clf_i)
        print "***classifier %d***" % index_i
        while count < total and len(unlabelled_features) != 0:
            t += 1
            X_tgt, y_tgt = get_features(unlabelled_features, j, k, clf_i, models,
                                        theta=theta, dis=dis)
            if len(X_tgt) == 0 and t > 1:
                print "no new features added"
                break
            X_i = concatenate(X_i, X_tgt)
            y_i = concatenate(y_i, y_tgt)
            count = len(X_i)
            print "%d %d %d" % (t, count, total)
            # update the classifier on the enlarged training set
            acc, clf_i = get_acc_clf(domain, X_i, y_i, X_test, y_test)
            if accs[index_i] < acc:
                accs[index_i] = acc
                # best_clf = clf_i
                print "*NEW BEST! best acc:", acc
                models[index_i] = clf_i
            else:
                print "no improvement..skip.."
                break
            if count == total:
                print "reach end.."
                break
            # remove the newly labelled points from the unlabelled pool for speed-up
            print np.array(X_tgt).shape
            X_tgt = [list(x) for x in X_tgt]
            unlabelled_features = [x for x in unlabelled_features
                                   if list(x) not in X_tgt]
            print np.array(unlabelled_features).shape

    # majority vote over the three classifiers
    eclf = EnsembleVoteClassifier(clfs=models, weights=[1, 1, 1], refit=False)
    eclf.fit(X_test, y_test)  # with refit=False this only records the class labels
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test, pred) if "large" not in domain \
        else f1_score(y_test, pred, average='macro')
    print "acc:%s theta:%s" % (acc, theta), "separate accs:", accs
    return acc, eclf
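# A minimal, self-contained sketch of the agreement step that get_features()
# above is assumed to perform in tri-training: the two classifiers (j, k)
# label an unlabelled point for the third classifier only when they agree and
# both are confident at least theta. The helper here is illustrative, not the
# original implementation; X_un is assumed to be a numpy array.
import numpy as np

def agree_and_label(X_un, clf_j, clf_k, theta=0.5):
    pj, pk = clf_j.predict(X_un), clf_k.predict(X_un)
    conf_j = clf_j.predict_proba(X_un).max(axis=1)
    conf_k = clf_k.predict_proba(X_un).max(axis=1)
    mask = (pj == pk) & (conf_j >= theta) & (conf_k >= theta)
    return X_un[mask], pj[mask]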
def votingEnsembleTest2ndLayer_Test(top_ensembles_dict, test_country_data):
    hit_count = 0
    for BC in top_ensembles_dict.keys():
        # each entry's second element is the trained classifier
        classifiers = [sub_list[1] for sub_list in top_ensembles_dict[BC]]
        _weights = np.asarray([1] * len(classifiers))
        vclf_layer2 = EnsembleVoteClassifier(clfs=classifiers,
                                             weights=_weights,
                                             refit=False)
        Y = test_country_data[BC]["Y"]
        X = test_country_data[BC]["X"]
        vclf_layer2.fit(X, Y)
        y_estimate = vclf_layer2.predict(X)
        print("Mentality Cycle {} 2nd Layer Voting Classifier Ensemble has accuracy: {}"
              .format(BC, np.mean(Y == y_estimate)))
        # accumulate hits to compute the overall performance of the top
        # classifiers for each region
        hit_count = hit_count + np.sum(Y == y_estimate)

    total_obvs = (test_country_data[1]["Y"].shape[0]
                  + test_country_data[2]["Y"].shape[0]
                  + test_country_data[3]["Y"].shape[0])
    overall_hit_rate = hit_count / total_obvs
    print("Aggregated accuracy of 2nd Layer Voting Classifiers is: {}".format(
        overall_hit_rate))
def ensemble_train(feature, label):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from mlxtend.classifier import EnsembleVoteClassifier
    import xgboost as xgb

    label = transport_labels(label)
    X_train, X_test, Y_train, Y_test = train_test_split(feature, label,
                                                        test_size=0.2,
                                                        random_state=1000)
    clf1 = SVC(C=10, kernel='sigmoid', probability=True)
    clf2 = RandomForestClassifier(random_state=0)  # defined but left out of the vote below
    clf3 = LogisticRegression(random_state=0)
    clf4 = xgb.XGBClassifier(max_depth=8, learning_rate=0.07, n_estimators=35,
                             silent=True, objective="binary:logistic",
                             booster='gbtree', gamma=0, min_child_weight=6,
                             subsample=0.8, colsample_bytree=0.7,
                             reg_alpha=0.1, seed=1000)
    eclf = EnsembleVoteClassifier(clfs=[clf1, clf3, clf4], voting='soft')
    eclf.fit(X_train, Y_train)
    y_pred = eclf.predict(X_test)
    print('eclf acc=%f' % (sum(1 for i in range(len(y_pred))
                               if y_pred[i] == Y_test[i]) / float(len(y_pred))))
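# What voting='soft' in ensemble_train() computes, spelled out: an (optionally
# weighted) average of the members' class-probability estimates followed by an
# argmax. A minimal sketch; mlxtend additionally maps the argmax index back to
# the original class labels.
import numpy as np

def soft_vote(clfs, X, weights=None):
    probas = np.asarray([clf.predict_proba(X) for clf in clfs])  # (n_clf, n, n_classes)
    avg = np.average(probas, axis=0, weights=weights)
    return np.argmax(avg, axis=1)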
def majority_vote(target):
    X_test = load_obj("%s/X_test" % target)
    y_test = load_obj("%s/y_test" % target)
    domains = []
    if "mlp" in target:
        domains = ["mlp/books", "mlp/dvd", "mlp/electronics", "mlp/kitchen"]
    else:
        if "large" not in target:
            domains = ["books", "dvd", "electronics", "kitchen"]
            if target not in domains:
                return
        else:
            domains = ["large/baby", "large/cell_phone", "large/imdb",
                       "large/yelp2014"]

    models = []
    for source in domains:
        if target == source:
            continue
        else:
            print source
            clf_func = load_obj("%s/self_clf" % source)
            models.append(clf_func)

    eclf = EnsembleVoteClassifier(clfs=models, refit=False)  # weights=[1, 1, 1]
    eclf.fit(X_test, y_test)  # with refit=False this only records the class labels
    # the two original one-liners overwrote each other and lost the "large/"
    # prefix; an if/elif chain keeps all three cases distinct
    if "large" in target:
        tmp_name = "large/" + target.upper()[6]
    elif "mlp" in target:
        tmp_name = "mlp/" + target.upper()[4]
    else:
        tmp_name = target.upper()[0]
    save_obj(eclf, '%s_eclf' % (tmp_name))
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test, pred) if "large" not in target \
        else f1_score(y_test, pred, average='macro')
    print 'self-train', acc
def majority_vote_mlp(target):
    X_test = load_obj("%s/X_test" % target)
    y_test = load_obj("%s/y_test" % target)
    # domains = ["mlp/books", "mlp/dvd", "mlp/electronics", "mlp/kitchen"]
    data_name = ["books", "dvd", "electronics", "kitchen"]
    X_joint = load_obj("%s/X_joint" % target)
    y_joint = load_obj("%s/y_joint" % target)
    temp_un = load_obj("%s/X_un" % target)

    meta_sources = []
    for i in range(len(data_name)):
        if 'mlp/' + data_name[i] != target:
            meta_sources.append(data_name[i])

    models = []
    for j in range(len(meta_sources)):
        temp_X = X_joint[j]
        temp_y = y_joint[j]
        thetas = [0.5, 0.6, 0.7, 0.8, 0.9]
        best_acc = 0.0
        best_clf = ""
        best_theta = 0.0
        resFile = open("../work/params/%s_theta_self-%s.csv"
                       % (target, meta_sources[j].upper()[0]), "w")
        resFile.write("theta, acc\n")
        for theta in thetas:
            print "##############################"
            print "start with theta=%s" % theta
            print "##############################"
            acc, clf_func = self_train(target, temp_X, temp_y, X_test, y_test,
                                       temp_un, theta=theta)
            if best_acc < acc:
                best_acc = acc
                best_clf = clf_func
                best_theta = theta
            resFile.write("%f, %f\n" % (theta, acc))
            resFile.flush()
        resFile.close()
        print "##############################"
        print "best_theta:", best_theta, "best_acc:", best_acc
        models.append(best_clf)

    eclf = EnsembleVoteClassifier(clfs=models, refit=False)  # weights=[1, 1, 1]
    eclf.fit(X_test, y_test)  # with refit=False this only records the class labels
    save_obj(eclf, "%s/self_clf" % target)
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print 'self-train', acc
def test6():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([1, 1, 1, 2, 2, 2])

    eclf1 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='hard',
                                   verbose=1)
    eclf1 = eclf1.fit(X, y)
    print(eclf1.predict(X))

    eclf2 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft')
    eclf2 = eclf2.fit(X, y)
    print(eclf2.predict(X))

    eclf3 = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft',
                                   weights=[2, 1, 1])
    eclf3 = eclf3.fit(X, y)
    print(eclf3.predict(X))
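# A small follow-up sketch: score each member and the voting ensemble from
# test6() with cross-validation, mirroring the usual mlxtend docs pattern.
# cv=3 because the toy data above has only three samples per class.
from sklearn.model_selection import cross_val_score

def compare_members(clfs, labels, X, y):
    for clf, label in zip(clfs, labels):
        scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]"
              % (scores.mean(), scores.std(), label))

# e.g. compare_members([clf1, clf2, clf3, eclf1],
#                      ['Logistic Regression', 'Random Forest', 'Naive Bayes',
#                       'Ensemble'], X, y)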
def votingEnsembleTest(all_country_data_with_algos, test_country_data_US):
    print(
        "\nFor each training-set country and each sub-dataset (split by "
        "Mentality Cycle), the top n trained algorithms form a Voting "
        "Classifier. Each Voting Classifier is then tested on its "
        "corresponding US sub-dataset. An aggregate score for each "
        "training-set country is calculated by aggregating its 3 Voting "
        "Classifiers' performances.")
    _all_country_data_with_trained_algos = copy.deepcopy(
        all_country_data_with_algos)
    for country in _all_country_data_with_trained_algos.keys():
        country_level_total_hits = 0
        for BC in _all_country_data_with_trained_algos[country].keys():
            classifiers = copy.deepcopy(
                _all_country_data_with_trained_algos[country][BC].get(
                    'trained algos'))
            clf_weights = np.asarray([1, 1, 1], dtype=int)
            Y = test_country_data_US[BC].get("Y")
            X = test_country_data_US[BC].get("X")
            vclf = EnsembleVoteClassifier(clfs=classifiers,
                                          weights=clf_weights,
                                          refit=False,
                                          voting='hard')  # voting='soft'
            vclf.fit(X, Y)
            y_estimate = vclf.predict(np.array(X))
            print("Voting Classifier trained on {} Mentality Cycle {} has accuracy: {}"
                  .format(country, BC, np.mean(Y == pd.Series(y_estimate))))
            # save the Country-BC split accuracy and the Voting Classifier
            # instance back into the dictionary
            _all_country_data_with_trained_algos[country][BC]['accuracy'] = \
                np.mean(Y == y_estimate)
            _all_country_data_with_trained_algos[country][BC]['votingclassifier'] = vclf
            country_level_total_hits = country_level_total_hits + np.sum(
                Y == y_estimate)

        record_count = (test_country_data_US[1]["Y"].shape[0]
                        + test_country_data_US[2]["Y"].shape[0]
                        + test_country_data_US[3]["Y"].shape[0])
        _all_country_data_with_trained_algos[country]['accuracy'] = (
            country_level_total_hits / record_count)
        print("Aggregated Classifier trained on {} has accuracy: {} \n".format(
            country, _all_country_data_with_trained_algos[country]['accuracy']))
    return _all_country_data_with_trained_algos
class VotingModel:
    def __init__(self, X, y, x_test, model_lists):
        # expects exactly three pre-fitted models (weights are hard-coded)
        self.model = EnsembleVoteClassifier(clfs=model_lists,
                                            weights=[1, 1, 1],
                                            refit=False,
                                            voting='soft')
        self.X = X
        self.y = y
        self.X_test = x_test

    def train(self):
        self.model.fit(self.X, self.y)

    def predict(self):
        return self.model.predict(self.X_test)

    def predict_proba(self):
        return self.model.predict_proba(self.X_test)
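# Hypothetical usage of VotingModel above: since it is built with refit=False,
# the three members must be fitted before they are passed in, and all of them
# need predict_proba for voting='soft'. Data and models here are placeholders.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

iris = load_iris()
X, y = iris.data, iris.target
members = [m.fit(X, y) for m in (LogisticRegression(max_iter=1000),
                                 GaussianNB(),
                                 RandomForestClassifier(random_state=0))]
vm = VotingModel(X, y, X, members)  # predicting back on X purely for illustration
vm.train()
print(vm.predict()[:5])
print(vm.predict_proba()[:2])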
# print(X_train_counts.toarray()[0])
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
feature_names = count_vect.get_feature_names()

ch2 = SelectKBest(chi2, k=1500)
X_train = ch2.fit_transform(X_train_tfidf, newsgroups_train.target)
selected_feature_names = [feature_names[i]
                          for i in ch2.get_support(indices=True)]

# clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.3, max_depth=3, random_state=0)
clf1 = MultinomialNB(alpha=0.1)
# clf2 = svm.LinearSVC(max_iter=2000, random_state=0)  # LinearSVC has no
# predict_proba, so an SVC with a linear kernel is used for soft voting:
clf2 = SVC(kernel='linear', probability=True)
# clf3 = SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")
clf = EnsembleVoteClassifier(clfs=[clf1, clf2], weights=[2, 1], voting='soft')
clf.fit(X_train, newsgroups_train.target)
# pred_t = clf.predict(X_train)
# print(metrics.precision_score(newsgroups_train.target, pred_t, average='macro'))

vectors_test2 = count_vect.transform(newsgroups_test.data)
vectors_test = tfidf_transformer.transform(vectors_test2)
X_test = ch2.transform(vectors_test)
pred = clf.predict(X_test)
print(metrics.precision_score(newsgroups_test.target, pred, average='macro'))
logging.info(f'Training {classifier_name}...')
clf.fit(X_train, y_train)
score = balanced_accuracy_score(y_test, clf.predict(X_test))
logging.info(f'{classifier_name} BAC = {score:.4f}')
probabilities = clf.predict_proba(X_test)
np.save(PROBABILITIES_PATH / f'{classifier_name}.cv.{args.fold}.npy', probabilities)
results.append([classifier_name, score])

# soft-vote over the already-fitted classifiers; fit_base_estimators=False
# keeps them as-is, so the fit call below only records the class labels
ensemble = EnsembleVoteClassifier(list(classifiers.values()),
                                  voting='soft',
                                  fit_base_estimators=False)
ensemble.fit(X_train, y_train)
score = balanced_accuracy_score(y_test, ensemble.predict(X_test))
logging.info(f'Ensemble BAC = {score:.4f}')
results.append(['Ensemble', score])

with open(MODELS_PATH / f'ensemble.cv.{args.fold}.pickle', 'wb') as f:
    pickle.dump(ensemble, f)

df = pd.DataFrame(results, columns=['Classifier', 'BAC'])
df.to_csv(RESULTS_PATH / f'{args.fold}.csv', index=False)
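# A sketch of reusing the ensemble pickled above in a later scoring run.
# MODELS_PATH and the fold-based file naming follow the snippet; X_new is a
# placeholder for fresh feature rows with the same columns as X_train.
import pickle

def load_ensemble_and_score(fold, X_new):
    with open(MODELS_PATH / f'ensemble.cv.{fold}.pickle', 'rb') as f:
        ensemble = pickle.load(f)
    return ensemble.predict_proba(X_new)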
y_valid = pd.DataFrame()
y_valid['target'] = x_valid['target']
x_valid.drop('target', axis=1, inplace=True)

# class-balanced training subset: cap the number of rows taken per class
x_train_0 = pd.DataFrame(X[X['target'] == 0][:90])
x_train_1 = pd.DataFrame(X[X['target'] == 1][:900])
x_train_2 = pd.DataFrame(X[X['target'] == 2][:1300])
x_train_3 = pd.DataFrame(X[X['target'] == 3][:420])
x_train_4 = pd.DataFrame(X[X['target'] == 4][:90])
x_train = pd.DataFrame(
    pd.concat([x_train_0, x_train_1, x_train_2, x_train_3, x_train_4], axis=0))
y_train = pd.DataFrame()
y_train['target'] = x_train['target']
x_train.drop('target', axis=1, inplace=True)

eclf.fit(x_train[best_columns], y_train['target'])
preds = eclf.predict(x_valid[best_columns])
print('Confusion matrix:\n')
print(confusion_matrix(y_valid['target'].values, preds))
matrix_ = confusion_matrix(y_valid['target'].values, preds)
# correct answers = trace of the confusion matrix
correct_answers = (matrix_[0][0] + matrix_[1][1] + matrix_[2][2]
                   + matrix_[3][3] + matrix_[4][4])
print('Correct answers count: ', correct_answers)

# --- answer module ---
eclf.fit(X[best_columns], Y['target'])
score_dataset = pd.read_csv('original_data/x_test.csv', delimiter=';', names=names)
y_pred = eclf.predict(score_dataset[best_columns])
pd.Series(y_pred).to_csv('data/answer.csv', index=False)
clf4 = GradientBoostingClassifier()

print('10-fold cross validation:\n')
# np.random.seed(123)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4],
                              weights=[1, 1, 1, 1],
                              voting='soft')
# from sklearn.model_selection import ShuffleSplit
# for clf, label in zip([clf1, clf2, clf3, clf4, eclf],
#                       ['Logistic Regression', 'Random Forest', 'SVM',
#                        'Xgboost', 'Voting Ensemble']):
#     scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
#     print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean() * 100, scores.std(), label))

eclf.fit(X_train, Y_train)
y_pred = eclf.predict(X_test)
print(accuracy_score(Y_test, y_pred) * 100)

X = np.concatenate((X_train, X_test), 0)
Y = np.concatenate((Y_train, Y_test), 0)
# cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
# scores = cross_val_score(clf, X, y, cv=cv)
# accuracies = cross_val_score(estimator=clf, X=X, y=Y, cv=10)
# print(accuracies.mean() * 100, accuracies.std() * 100)

Mamun_confusion_matrix = confusion_matrix(Y_test, y_pred,
                                          labels=[1, 2, 3, 4, 5, 6, 12, 13])
# use a separate name; rebinding `confusion_matrix` would shadow the
# sklearn function imported above
final_confusion_matrix = Mamun_confusion_matrix
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
clf4 = MultinomialNB(alpha=.01)
clf5 = xgb.XGBClassifier()
eclif = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4, clf5],
                               weights=[2, 4, 2, 4, 7],
                               voting='soft')
eclif.fit(train_vectors, y_train)
pred = eclif.predict(test_vectors)
f_1 = sklearn.metrics.f1_score(y_test, pred, average='weighted')
print "f_1 is " + str(f_1)
with open(f_1_f, "w") as f:
    f.write("f_1 is " + str(f_1))

c = make_pipeline(vectorizer, eclif)
nb_success = 0
nb_fail = 0
result_list = []
result_label = []
result_accepted_list_ml = []
trained = util.load_pickle(name='fs_1', path='..\\pickles\\feature_sets\\')
print('trained', size(trained))
test = util.load_pickle(name='fs_test_1', path='..\\pickles\\test_features\\')
print('test', size(test))
test_data = test['data_set']

featureset = 'fs_words_bigrams_pos'
X_train, y_train = trained[featureset], trained['labels']
X_test, y_test = test[featureset], test['labels']
feat_size = X_train.shape[1]

x = load_from_file()
svm = x['svm']
xgb = x['xgb']
knn = x['knn']
nb = x['nb']
dt = x['dt']
rf = x['rf']
nn = x['nn']
mc = x['mc']
estimators = [svm.clf, xgb.clf, nb.clf, dt.clf, rf.clf]  # , mc.clf, nn.clf

# y_pred = predict_from_multiple_estimator(estimators, X_test)
from mlxtend.classifier import EnsembleVoteClassifier

# hard majority vote over the pre-fitted classifiers
combined = EnsembleVoteClassifier(clfs=estimators, voting='hard', refit=False)
combined.fit(X_train, y_train)
y_pred = combined.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))
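# What the hard-voting, refit=False ensemble above computes, spelled out:
# a per-sample majority vote over the pre-fitted members' label predictions.
# A minimal sketch assuming integer class labels; mlxtend additionally
# handles per-classifier weights and label encoding.
import numpy as np

def hard_vote(estimators, X):
    preds = np.asarray([est.predict(X) for est in estimators])  # (n_clf, n_samples)
    return np.apply_along_axis(lambda col: np.bincount(col).argmax(), 0, preds)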
    n_estimators=100)  # current best (tail of the clf5 definition; earlier arguments elided)
clf6 = ExtraTreesClassifier(max_features=0.45,
                            min_samples_leaf=1,
                            min_samples_split=5,
                            n_estimators=100)
eclf = EnsembleVoteClassifier(clfs=[clf3, clf4, clf5, clf6],
                              weights=[1, 1, 1, 1],
                              voting='soft')

labels = ['Trees_3', 'Trees_4', 'Trees_5', 'Trees_6', 'Ensemble']
for clf, label in zip([clf3, clf4, clf5, clf6, eclf], labels):
    scores = model_selection.cross_val_score(clf,
                                             X[best_columns],
                                             Y['target'],
                                             cv=4,
                                             scoring='neg_log_loss')
    print("Log Loss: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))

# --- answer module ---
eclf.fit(X[best_columns], Y['target'])
score_dataset = pd.read_csv('original_data/x_test.csv', delimiter=';', names=names)
y_pred = eclf.predict(score_dataset[best_columns])
pd.Series(y_pred).to_csv('data/answer.csv', index=False)
list_of_cv_acc.append(clf5_avg_f1)

# In[50]:
clf6_pipe, clf6_avg_f1 = set_pipe(clf6, mi_feats, 'knn_')
list_of_cv_acc.append(clf6_avg_f1)

# In[51]:
enclf = EnsembleVoteClassifier((clf1_pipe, clf2_pipe, clf3_pipe,
                                clf4_pipe, clf5_pipe, clf6_pipe),
                               refit=False)
enclf.fit(X_train, y_train)
y_pred = enclf.predict(X_test)
con_mat = confusion_matrix(y_test, y_pred)
# print("Cross Val acc score: ", (model_selection.cross_val_score(enclf, X_train, y_train, cv=5)).mean())
# print("Cross Val f1 score: ", (model_selection.cross_val_score(enclf, X_train, y_train, cv=5, scoring='f1')).mean())
print()
print("Overall Acc score: ", accuracy_score(y_test, y_pred))
print("Recall score (True Pos Rate): ", recall_score(y_test, y_pred))
print("Precision score: ", precision_score(y_test, y_pred))
print("Neg Predictive Val: ", con_mat[0][0] / (con_mat[0][1] + con_mat[0][0]))
print("True Neg Rate (Specificity): ", con_mat[0][0] / (con_mat[1][0] + con_mat[0][0]))
print("F1 score: ", f1_score(y_test, y_pred))
print("AUC score: ", roc_auc_score(y_test, y_pred))
print(con_mat)
print()

pd.DataFrame(y_pred).to_csv('maj_vote' + 'y_pred_avg_filt.csv')
y_valid = pd.DataFrame()
y_valid['target'] = x_valid['target']
x_valid.drop('target', axis=1, inplace=True)

x_train_0 = pd.DataFrame(X[X['target'] == 0][:90])
x_train_1 = pd.DataFrame(X[X['target'] == 1][:900])
x_train_2 = pd.DataFrame(X[X['target'] == 2][:1300])
x_train_3 = pd.DataFrame(X[X['target'] == 3][:420])
x_train_4 = pd.DataFrame(X[X['target'] == 4][:90])
x_train = pd.DataFrame(
    pd.concat([x_train_0, x_train_1, x_train_2, x_train_3, x_train_4], axis=0))
y_train = pd.DataFrame()
y_train['target'] = x_train['target']
x_train.drop('target', axis=1, inplace=True)

eclf.fit(x_train[best_columns], y_train['target'])
preds = eclf.predict(x_valid[best_columns])
print('Confusion matrix:\n')
print(confusion_matrix(y_valid['target'].values, preds))
matrix_ = confusion_matrix(y_valid['target'].values, preds)
print(type(matrix_))
correct_answers = (matrix_[0][0] + matrix_[1][1] + matrix_[2][2]
                   + matrix_[3][3] + matrix_[4][4])
print('Correct answers count: ', correct_answers)

# greedy forward feature selection (the loop body continues beyond this excerpt)
for iteration in range(10):
    best_score = 1
    best_feature = ''
    for feature in good_features_drop_no_corr:
        my_columns = best_columns[:]
        my_columns.append(feature)
        if feature in best_columns:
plot_conf(svc)
results_acc['svc'] = accscorsv
results_f1['svc'] = f1scorsv

######################### Boosting #################################
log = LogisticRegression(solver='lbfgs', class_weight='balanced')
ada = AdaBoostClassifier(n_estimators=5, base_estimator=log)
grad_boost = GradientBoostingClassifier(n_estimators=100)
xgb = XGBClassifier(max_depth=8, learning_rate=0.001, use_label_encoder=False)
ensemble = EnsembleVoteClassifier(clfs=[ada, grad_boost, xgb], voting='hard')
ensemble.fit(X_train, y_train)
y_preden = ensemble.predict(X_test)
f1scoren = metrics.f1_score(y_test, y_preden)
accscoren = ensemble.score(X_test, y_test)
results_acc['ensemble'] = accscoren
results_f1['ensemble'] = f1scoren
# report on the ensemble's own predictions
print(classification_report(y_test, y_preden))
plot_conf(ensemble)

###############################################################################
naive = GaussianNB(var_smoothing=2e-9)
naive.fit(X_train, y_train)
y_pred = naive.predict(X_test)
f1scornb = metrics.f1_score(y_test, y_pred)
accscornb = naive.score(X_test, y_test)
class MulticriteriaEnsemble(object):
    def __init__(self, models=OrderedDict({}), dataset=None, pickle_path=None,
                 crit_metrics=None, global_metric=None, delta=None,
                 epsilon=None, a=None, bootstrap_models=OrderedDict({}),
                 n_splits=5, voting='soft', jenks=True, jenks_limit=2,
                 refit=False):
        self.models = models
        self.bootstrap_models = bootstrap_models
        self.dataset = dataset
        self.crit_metrics = crit_metrics
        self.global_metric = global_metric
        self.delta = delta
        self.best_delta = None
        self.epsilon = epsilon
        self.a = a
        self.voting = voting
        self.n_splits = n_splits
        self.refit = refit
        self.pickle_path = self.dataset.path + 'base_learners/'
        self.multicriteria_table = None
        self.meta_table = None
        self.utastar_model = None
        self.wmv_model = None
        self.natural_breaks = None
        self.weights = []
        self.global_utilities = []
        self.kfold_indices = []
        self.test_kfold_indices = []
        self.global_metrics = []
        self.is_fit = {
            'wmv': False,
            'clfs': not self.refit,
            'utastar': False,
        }
        self.jenks = jenks
        self.jenks_limit = jenks_limit

        if not self.models and refit == True:
            raise Exception('Base learners are not provided.')
        elif self.models and refit == False:
            raise Exception(
                'Models parameter should not be set to anything while refit=False')
        if self.dataset == None:
            raise Exception('Dataset is not provided.')
        if self.crit_metrics == None:
            raise Exception('Performance estimators are not provided.')
        if self.global_metric == None:
            raise Exception('Global Performance estimator is not provided.')
        if self.delta == None or self.a == None or self.epsilon == None:
            raise Exception(
                'One or more utastar model parameters is/are not provided.')

    def _pso_cost(self, x):
        self.delta = x[0]
        self.epsilon = x[1]
        if self.is_fit['wmv']:
            self.fit(mtable=False)
        else:
            self.fit()
        return 1 - self.score()

    def pso(self, bounds, num_particles, w, c1, c2, maxiter, threshold):
        psopt(self._pso_cost, bounds, num_particles, w, c1, c2, maxiter,
              threshold)

    def _save_model(self, model, file_name):
        print "Saving Model!"
        # archive an existing pickle of the same name before overwriting it
        if os.path.isfile(self.pickle_path + file_name):
            if not os.path.exists(self.pickle_path + 'Archive/'):
                os.makedirs(self.pickle_path + 'Archive/')
            archived_file_name = (self.pickle_path + 'Archive/'
                                  + file_name.replace('.pkl', '_')
                                  + datetime.datetime.today().strftime("%m-%d-%Y-%H%M%S")
                                  + '.pkl')
            shutil.move(self.pickle_path + file_name, archived_file_name)
            joblib.dump(model, self.pickle_path + file_name)
            print "Model Saved!!!"
        else:
            print "Model Saved!!!"
            joblib.dump(model, self.pickle_path + file_name)

    # Reinitialize crucial variables
    def _reset(self):
        self.global_utilities = []
        self.weights = []
        self.kfold_indices = []
        if self.refit == True:
            self.bootstrap_models = OrderedDict({})
            print 'Multicriteria Table Deleted!!!'
            self.multicriteria_table = None
            self.meta_table = None

    # Split the dataset into k stratified folds and save the indices
    def _skfold(self, n_splits):
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True,
                              random_state=12345)
        for train_index, test_index in skf.split(self.dataset.X_train,
                                                 self.dataset.y_train):
            self.kfold_indices.append(train_index.tolist())
            self.test_kfold_indices.append(test_index.tolist())

    # Fit the base learners
    def _fit_clfs(self):
        # If the path where the models will be saved does not exist, create it
        if not os.path.exists(self.pickle_path):
            os.makedirs(self.pickle_path)
        # For every fold
        for k_idx, k in enumerate(self.kfold_indices):
            # Make a copy of the base learners
            temp_models = OrderedDict(
                zip(self.models.keys(), clone(self.models.values())))
            # For every base learner, create a separate model, train it on
            # the current fold and save it
            for model in temp_models.keys():
                model_name = '%s_%s_FOLD%i' % (
                    model.replace('_' + self.dataset.name, ''),
                    self.dataset.name, k_idx)
                temp_models[model].fit(self.dataset.X_train.iloc[k],
                                       self.dataset.y_train.iloc[k])
                file_name = model_name + '.pkl'
                self._save_model(temp_models[model], file_name)
                self.bootstrap_models[model_name] = temp_models[model]
        # Rename the base learners to include the dataset name, then fit the
        # models on the full training set and save them
        if not self.dataset.name in model:
            self.models = self._rename_models(self.models)
        for model in self.models.keys():
            self.models[model].fit(self.dataset.X_train, self.dataset.y_train)
            self._save_model(self.models[model], model + '.pkl')

    # Fit the utastar model
    def _fit_utastar(self):
        self.utastar_model = Utastar(self.multicriteria_table, self.meta_table,
                                     self.delta, self.epsilon)
        self.utastar_model.solve()

    def _get_global_utilities(self):
        metrics = self._get_metrics(self.bootstrap_models, on='test')
        self._utastar_predict(metrics)

    # Fit the Weighted Majority Voting model
    def _fit_wmv(self):
        models = self.bootstrap_models.values()
        self.wmv_model = EnsembleVoteClassifier(clfs=models,
                                                weights=self.weights,
                                                voting=self.voting,
                                                refit=False)
        self.wmv_model.fit(self.dataset.X_train, self.dataset.y_train)

    # Fit the Multicriteria Ensemble model
    def fit(self, mtable=True):
        # Reinitialize crucial variables
        self._reset()
        # Get stratified k-fold indices
        self._skfold(self.n_splits)
        # If refit is needed, fit the models
        if self.refit:
            self._fit_clfs()
            self.is_fit['clfs'] = True
        else:
            # Otherwise load the pickled models from disk
            try:
                for base_learner in os.walk(self.pickle_path).next()[2]:
                    if 'FOLD' in base_learner:
                        self.bootstrap_models[base_learner.replace('.pkl', '')] = \
                            joblib.load(self.pickle_path + '%s' % base_learner)
                    else:
                        self.models[base_learner.replace('.pkl', '')] = \
                            joblib.load(self.pickle_path + '%s' % base_learner)
                dummy_var = self.bootstrap_models.keys()[1]
            except:
                raise AttributeError(
                    'Refit is set to False but no models are given.')
        if mtable == False and self.multicriteria_table is None:
            raise Exception(
                'Multicriteria table not found. Please run fit(mtable=True) at least once.')
        elif mtable == True:
            print 'Multicriteria table formed!!!'
            self._get_meta_table()
            self._get_multicriteria_table()
        self._fit_utastar()
        self._get_global_utilities()
        self._get_clfs_weights()
        self._fit_wmv()
        self.is_fit['wmv'] = True

    def predict(self, X):
        return self.wmv_model.predict(X)

    def predict_proba(self, X):
        return self.wmv_model.predict_proba(X)

    def _get_clfs_weights(self):
        gu = self.global_utilities
        if self.jenks == True:
            # zero out utilities below the chosen Jenks natural break
            self.natural_breaks = jenkspy.jenks_breaks(gu, nb_class=5)
            gu = [i if i >= self.natural_breaks[-self.jenks_limit] else 0
                  for i in gu]
        gu_sum = sum(gu)
        for value in gu:
            self.weights.append(value / gu_sum)

    def add_clfs(self, clfs, refit=False):
        clfs = self._rename_models(clfs)
        if set(self.models.keys()).isdisjoint(clfs.keys()):
            if not refit:
                metrics = self._get_metrics(clfs)
                self.models.update(clfs)
            else:
                temp_models = {}
                for clf in clfs.keys():
                    temp_models[clf] = clone(clfs[clf])
                    temp_models[clf].fit(self.dataset.X_train,
                                         self.dataset.y_train)
                metrics = self._get_metrics(temp_models)
                self.models.update(temp_models)
            self._utastar_predict(metrics)
            self.weights = []
            self._get_clfs_weights()
            self._fit_wmv()
        else:
            raise Exception('One or more models are already in the ensemble.')

    def score(self):
        return self._get_global_metrics({'wmv': self.wmv_model}, on='test')[0]

    def _utastar_predict(self, metrics):
        for clf_metrics in metrics:
            pred_partial_util = []
            for crit in self.utastar_model.criteria:
                X = self.utastar_model.intervals[crit]
                y = self.utastar_model.marginal_post[crit]
                pred_partial_util.append(
                    np.interp(
                        clf_metrics[self.utastar_model.criteria.tolist().index(crit) + 1],
                        X, y))
            pred_global_util = np.array(pred_partial_util).dot(
                np.array(clf_metrics[1:]))
            self.global_utilities.append(pred_global_util)

    def _rename_models(self, models):
        for model in models.keys():
            model_name = '%s_%s' % (model, self.dataset.name)
            models[model_name] = models.pop(model)
        return models

    def plot_partial_utilities(self):
        n = len(self.utastar_model.criteria)
        if n % 2 == 0:
            fig1, axs = plt.subplots(n / 2, 2, figsize=(18, 18))
        else:
            fig1, axs = plt.subplots(n / 2 + 1, 2, figsize=(18, 18))
        for i in range(n):
            crit = self.utastar_model.criteria[i]
            y = self.utastar_model.marginal_post[crit]
            x = self.utastar_model.intervals[crit]
            # criteria of type 1 are drawn dashed, the rest solid;
            # even i goes to the left column, odd i to the right
            style = '--ok' if self.utastar_model.get_type(crit) == 1 else '-ok'
            ax = axs[i / 2, i % 2]
            ax.plot(x, y, style)
            ax.set_title(crit)
            ax.set_xticks(x)
            ax.set_xlim(x[0], x[-1])
            ax.set_ylabel(r'$u_{%d}(g_{%d})$' % ((i + 1), (i + 1)))
            ax.yaxis.grid(False)
            if self.utastar_model.get_monotonicity(crit) == 1:
                ax.set_xlim(x[-1], x[0])
        if n % 2 != 0:
            # re-enable the tick labels above the deleted axis and drop the
            # unused subplot
            for l in axs[i / 2 - 1, 1].get_xaxis().get_majorticklabels():
                l.set_visible(True)
            fig1.delaxes(axs[i / 2, 1])
        plt.tight_layout()
        plt.show()

    def plot_global_utilities(self):
        fig4 = plt.figure(4)
        ax = fig4.gca()
        ax.barh(range(len(self.utastar_model.global_utilities_post))[::-1],
                self.utastar_model.global_utilities_post.values(),
                align='center', color='grey', alpha=0.8)
        plt.yticks(range(len(self.utastar_model.global_utilities_post))[::-1],
                   self.utastar_model.global_utilities_post.keys())
        ax.plot(self.utastar_model.global_utilities_post.values(),
                range(len(self.utastar_model.global_utilities_post))[::-1],
                linestyle='--', color='black', alpha=0.8)
        plt.xlim(0, 1)
        plt.title('Ranking')
        plt.tight_layout()
        plt.show()

    def plot_global_utilities_pred(self):
        fig4 = plt.figure(4)
        ax = fig4.gca()
        ax.barh(range(len(self.global_utilities))[::-1],
                self.global_utilities,
                align='center', color='grey', alpha=0.8)
        plt.yticks(range(len(self.global_utilities))[::-1],
                   self.bootstrap_models.keys())
        ax.plot(self.global_utilities,
                range(len(self.global_utilities))[::-1],
                linestyle='--', color='black', alpha=0.8)
        plt.xlim(0, 1)
        plt.title('Ranking')
        plt.tight_layout()
        plt.show()

    def plot_criteria_weights(self):
        variables = self.utastar_model.model_weights_post.keys()
        data = self.utastar_model.model_weights_post.values()
        ranges = [(0.00001,
                   0.00001 + max(self.utastar_model.model_weights_post.values()))
                  ] * len(self.utastar_model.criteria)
        fig1 = plt.figure(figsize=(10, 10))
        radar = ComplexRadar(fig1, variables, ranges, 7)
        radar.plot(data)
        # radar.fill(data, alpha=0.2, color='grey')
        plt.show()

    def plot_model_weights(self, title):
        sns.set(style="whitegrid")
        f, ax = plt.subplots(figsize=(10, 4))
        variables = dict(sorted(zip(self.bootstrap_models.keys(), self.weights)))
        sns.set_color_codes("pastel")
        f = sns.barplot(x=variables.keys(), y=variables.values(),
                        color="b").set_title(title)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                           fontdict={'verticalalignment': 'baseline',
                                     'horizontalalignment': 'right'})
        ax.set(xlim=(-1, 30), ylabel="Weight", xlabel="Models")
        sns.despine(left=True, bottom=True)

    def _get_meta_table(self):
        columns = ['Cri/attributes', 'Monotonicity', 'Type', 'Worst', 'Best', 'a']
        meta_table = []
        for metric in self.crit_metrics.keys():
            monotonicity = 1
            if self.crit_metrics[metric][0]._sign == -1:
                monotonicity = 0
                self.crit_metrics[metric][0]._sign = 1
            mt_metric = [metric, monotonicity, 0,
                         self.crit_metrics[metric][1],
                         self.crit_metrics[metric][2], self.a]
            meta_table.append(mt_metric)
        self.meta_table = pd.DataFrame(meta_table, columns=columns)

    def _get_multicriteria_table(self):
        criteria = self.crit_metrics.keys()
        columns = ['Alt/Cri ']
        columns.extend(criteria)
        metrics = self._get_metrics(self.bootstrap_models, on='validation')
        multicriteria_table = pd.DataFrame(metrics, columns=columns)
        ranking = self._get_init_ranking()
        ranking = pd.DataFrame(ranking, columns=['Ranking'])
        self.multicriteria_table = multicriteria_table.join(ranking).copy(deep=True)

    def _get_dataset(self, model, on='test'):
        if on == 'test':
            X, y = self.dataset.X_test.copy(), self.dataset.y_test.copy()
        elif on == 'validation':
            X, y = self.dataset.X_train.copy(), self.dataset.y_train.copy()
            if 'FOLD' in model:
                fold_idx = int(re.search(r'(?<=FOLD)[0-9]', model).group(0))
                indices = self.test_kfold_indices[fold_idx]
                X, y = X.iloc[indices], y.iloc[indices]
        elif on == 'train':
            X, y = self.dataset.X_train, self.dataset.y_train
            if 'FOLD' in model:
                fold_idx = int(re.search(r'(?<=FOLD_)[0-9]', model).group(0))
                indices = self.kfold_indices[fold_idx]
                X, y = X.iloc[indices], y.iloc[indices]
        else:
            raise Exception('Unexpected input for argument on.')
        return X, y

    def _get_global_metrics(self, models, on='test'):
        global_metrics = []
        for model in models.keys():
            X, y = self._get_dataset(model, on=on)
            global_metrics.append(self.global_metric(models[model], X, y))
        return global_metrics

    def _get_init_ranking(self):
        gm_bootstrap = self._get_global_metrics(self.bootstrap_models,
                                                on='validation')
        self.global_metrics = gm_bootstrap
        if self.global_metric._sign == 1:
            ranking = len(self.global_metrics) - scipy.stats.rankdata(
                self.global_metrics, method='max')
        else:
            ranking = scipy.stats.rankdata(self.global_metrics, method='max')
        return ranking

    def _get_metrics(self, models, on='test'):
        metrics = []
        for model in models.keys():
            model_metrics = [model]
            X, y = self._get_dataset(model, on=on)
            for metric in self.crit_metrics.keys():
                mes = self.crit_metrics[metric][0](models[model], X, y)
                # clamp negative values on the multicriteria table to 0
                if mes > 0:
                    model_metrics.append(mes)
                else:
                    model_metrics.append(0)
            metrics.append(model_metrics)
        return metrics
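# A standalone sketch of the weighting scheme _get_clfs_weights() applies:
# utilities below the second-highest Jenks natural break are zeroed and the
# rest are normalised to sum to 1. jenkspy is assumed available, as in the
# class above; gu must contain more distinct values than nb_class.
import jenkspy

def utilities_to_weights(gu, jenks_limit=2, nb_class=5):
    breaks = jenkspy.jenks_breaks(gu, nb_class=nb_class)
    kept = [u if u >= breaks[-jenks_limit] else 0 for u in gu]
    total = float(sum(kept))
    return [u / total for u in kept]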
# (head of this snippet elided) the soft-voting ensemble over the pre-fitted,
# calibrated members:
eclf_soft = EnsembleVoteClassifier(clfs=[lgbm_calibrator, knn_calibrator,
                                         rf_calibrator, ada_calibrator,
                                         extra_calibrator],
                                   weights=weights,
                                   refit=False,
                                   voting='soft',
                                   verbose=1)

print('fitting')
# eclf_hard.fit(train_xm, train_y)
eclf_soft.fit(train_xm, train_y)

print('predicting')
# eclf_hard_pred = eclf_hard.predict(val_xm)
# eclf_hard_pred_pr = eclf_hard.predict_proba(val_xm)
eclf_soft_pred = eclf_soft.predict(val_xm)
eclf_soft_pred_pr = eclf_soft.predict_proba(val_xm)

# evaluating majority voting
voter_pred = eclf_soft_pred
voter_pred_pr = eclf_soft_pred_pr
acc_voter = accuracy_score(val_y, voter_pred)
roc_voter = roc_auc_score(val_y, voter_pred_pr[:, 1])
f1_voter = f1_score(val_y, voter_pred)
precision_voter = precision_score(val_y, voter_pred)
recall_voter = recall_score(val_y, voter_pred)
log_loss_voter = log_loss(val_y, voter_pred_pr)
print('accuracy voter: ', acc_voter)
print('roc voter: ', roc_voter)
print('f1 voter: ', f1_voter)
        RandomForestClassifier(n_estimators=500, max_features='auto',
                               min_samples_split=20, min_samples_leaf=5),
        GradientBoostingClassifier(n_estimators=250, max_features=5,
                                   min_samples_leaf=5),
        ExtraTreesClassifier(n_estimators=75, min_samples_leaf=5)))
# classifier = StackingClassifier(
#     classifiers=(
#         xgb.XGBClassifier(n_estimators=150, max_depth=4, subsample=0.7, colsample_bytree=0.4),
#         RandomForestClassifier(n_estimators=500, max_features='auto', min_samples_split=20, min_samples_leaf=5),
#         GradientBoostingClassifier(n_estimators=250, max_features=5, min_samples_leaf=5),
#         ExtraTreesClassifier(n_estimators=75, min_samples_leaf=5)
#     ),
#     meta_classifier=LogisticRegression()
# )
# scores = cross_val_score(voting_classifier, getX(data), data['Survived'], cv=5, scoring='accuracy')
# print np.mean(scores)

classifier.fit(getX(data), data['Survived'])

data = pd.read_csv('titanic-test.csv')
data = preprocess(data)
data['Survived'] = classifier.predict(getX(data))
data.to_csv('titanic-submission.csv', index=False,
            columns=['PassengerId', 'Survived'])
clf.fit(X, y)

# In[31]:
from mlxtend.classifier import EnsembleVoteClassifier
import copy

eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],
                              weights=[1, 1, 1],
                              refit=False)
labels = ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']
eclf.fit(X, y)
print('accuracy:', np.mean(y == eclf.predict(X)))

# ## Example 6 - Ensembles of Classifiers that Operate on Different Feature Subsets

# In[32]:
from sklearn.datasets import load_iris
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X = iris.data
y = iris.target
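# The feature-subset ensemble this Example 6 cell builds next, following the
# mlxtend documentation pattern: each pipeline selects a subset of the iris
# columns via ColumnSelector before its LogisticRegression, and the two
# pipelines are then voted together (np is assumed imported as above).
pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)),
                      LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)),
                      LogisticRegression())

eclf_fs = EnsembleVoteClassifier(clfs=[pipe1, pipe2])
eclf_fs.fit(X, y)
print('accuracy:', np.mean(y == eclf_fs.predict(X)))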
clf1 = bestLogReg_model
clf2 = bestSVC_model
clf3 = bestnn_model
clf4 = best_rf
clf5 = bestgrad_model

from mlxtend.classifier import EnsembleVoteClassifier
import copy

eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4, clf5],
                              weights=[2, 2, 2, 2, 1],
                              refit=False)
eclf.fit(X_test_scaled, y_test)
print('Ensemble Model accuracy:',
      np.mean(y_test == eclf.predict(X_test_scaled)) * 100, "%")

# # Models - Visualizations

# In[28]:
import itertools

clfs = [clf1, clf2, clf3, clf4, clf5, eclf]
labels_eclf = ["Logistic Regression", 'SVC', "Neural Network",
               'Random Forest', 'GradientBoosting', "Ensemble Model"]
pca = PCA(n_components=2)
x_train2 = pca.fit_transform(X_test_scaled)
gs = gridspec.GridSpec(2, 2)
class ExtendedBaggingClassifier:
    def __init__(self, voting="hard", verbose=False, parallel=True,
                 target_name='target'):
        self.models = []
        self.temporary_models = []
        self.voting = voting
        self.predictions = []
        self.votingClassifier = None
        self.verbose = verbose
        self.parallel = parallel
        self.target_name = target_name

    def _get_models(self):
        base_models = []
        for model in self.models:
            base_models.append(model.model)
        return base_models

    def add_models(self, model, params):
        """
        Create all the possible combinations of the model with the given
        parameters. Usage example:
            params = {
                'C': np.logspace(0, 4, num=10),
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }
            custom_bagging = CustomBaggingClassifier(verbose=True, parallel=True)
            custom_bagging.add_models(LogisticRegression, params)
        :param model: the model class (passed without calling the constructor)
        :param params: key-value pairs of hyperparameters used to generate
            all the possible models
        :return: the number of models in the ensemble
        """
        if self.votingClassifier is not None:
            self.votingClassifier = None
        keys = list(params)
        for values in itertools.product(*map(params.get, keys)):
            model_instance = model(**dict(zip(keys, values)))
            self.temporary_models.append((str(model_instance), model_instance))
        return len(self.temporary_models)

    def add_model(self, model):
        """
        Add a single model instance to the ensemble.
        :param model: instance of the model
        :return: the number of models in the ensemble
        See also :add_models.
        """
        if self.votingClassifier is not None:
            self.votingClassifier = None
        self.temporary_models.append((str(model), model))
        return len(self.temporary_models)

    def _commit_single_model(self, n_samples, temp_model):
        sampled_idx, unsampled_idx = self._generate_indexes(
            len(self.temporary_models), n_samples)
        return BaseModelIdx(temp_model[0], temp_model[1], sampled_idx,
                            unsampled_idx, self.target_name)

    def _commit_models(self, X, y):
        """Create index sets for the train and OOB validation sets."""
        if X.shape[0] != y.shape[0]:
            raise ValueError('It seems that target values (y) are not the '
                             'same length as feature values (X)')
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._commit_single_model, X.shape[0])
            self.models = pool.map(f, self.temporary_models)
            pool.close()
            pool.join()
        else:
            for temp_model in self.temporary_models:
                self.models.append(
                    self._commit_single_model(X.shape[0], temp_model))

    def _fit_single_model(self, X, y, single_model):
        return single_model.fit(X, y)

    def fit(self, X, y):
        """
        Train all the models in the ensemble.
        :param X: feature values of the trainset
        :param y: target values of the trainset
        """
        # self._commit_models(X, y)
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._fit_single_model, X, y)
            self.models = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                self._fit_single_model(X, y, model)
        self.votingClassifier = EnsembleVoteClassifier(clfs=self._get_models(),
                                                       voting=self.voting,
                                                       refit=False)
        self.votingClassifier.fit(X, y)

    def _predict_single_model(self, X, model):
        return model.name, model.predict(X)

    def predict_each_model(self, X):
        """
        Perform a prediction with each model in the ensemble.
        NOTE: fit(X, y) is required before.
        :param X: features dataframe to be used for predictions
        :return: list of predictions with the model name associated
        """
        if len(self.models) == 0:
            raise ValueError('Probably fit(X, y) was not called before. Call it!')
        predictions = []
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._predict_single_model, X)
            predictions = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                predictions.append(self._predict_single_model(X, model))
        return predictions

    def score(self, X, y):
        """
        Score the ensemble given X as feature values and y as target values.
        Useful for validation/testing purposes.
        """
        return self.votingClassifier.score(X, y)

    def predict(self, X):
        """
        Perform a prediction considering the models as an ensemble.
        NOTE: fit(X, y) must be called before getting the predictions.
        """
        return self.votingClassifier.predict(X)

    def _get_single_oob(self, X, y, model):
        return model.name, model.score(X, y)

    def models_oob_score(self, X, y):
        """Compute the OOB score for each model in the ensemble."""
        oob_scores = []
        if self.parallel:
            pool = multiprocessing.Pool(processes=None)
            f = partial(self._get_single_oob, X, y)
            oob_scores = pool.map(f, self.models)
            pool.close()
            pool.join()
        else:
            for model in self.models:
                oob_scores.append(self._get_single_oob(X, y, model))
        return oob_scores

    def _ret_accuracy(self, array):
        return array[1]

    def best_model(self, X, y):
        """Find the model with the best OOB score."""
        performances = self.models_oob_score(X, y)
        performances.sort(key=self._ret_accuracy, reverse=False)
        return performances.pop()

    def _generate_indexes(self, num_models, n_samples):
        rand_state = randint(0, num_models)
        sampled_idxs = self._generate_sample_indices(rand_state, n_samples)
        unsampled_idxs = self._generate_unsampled_indices(rand_state, n_samples)
        return sampled_idxs, unsampled_idxs

    def _generate_unsampled_indices(self, random_state, n_samples):
        sample_indices = self._generate_sample_indices(random_state, n_samples)
        sample_counts = np.bincount(sample_indices, minlength=n_samples)
        unsampled_mask = sample_counts == 0
        indices_range = np.arange(n_samples)
        unsampled_indices = indices_range[unsampled_mask]
        return unsampled_indices

    def _generate_sample_indices(self, random_state, n_samples):
        random_instance = self._check_random_state(random_state)
        sample_indices = random_instance.randint(0, n_samples, n_samples)
        return sample_indices

    def _check_random_state(self, seed):
        if isinstance(seed, numbers.Integral):
            return np.random.RandomState(seed)
        if isinstance(seed, np.random.RandomState):
            return seed
        raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                         ' instance' % seed)
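# Quick check of the bootstrap/OOB index helpers above: rows drawn with
# replacement plus the never-drawn rows partition range(n_samples), and on
# average about 36.8% of rows end up out-of-bag.
import numpy as np

n = 1000
rng = np.random.RandomState(0)
sampled = rng.randint(0, n, n)  # same draw as _generate_sample_indices
oob = np.arange(n)[np.bincount(sampled, minlength=n) == 0]
print(len(oob) / float(n))  # ~0.368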
labels = ['Random Forest', 'Extra Trees', 'Support Vector', 'Decision Tree',
          'Ensemble Vote']
for clf, label in zip([clf_RF, clf_ET, clf_svc, clf_DT, eclf], labels):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

eclf.fit(X_train, y_train)
confidence = eclf.score(X_test, y_test)
print(confidence)

example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])
example_measures = example_measures.reshape(len(example_measures), -1)
prediction = eclf.predict(example_measures)
print(prediction)

col_dict = dict(list(enumerate(df.columns)))
col_dict
X = np.array(df.drop(['class'], 1), dtype=np.float64)
y = np.array(df['class'], dtype=np.int64)
plot_decision_regions(
    X=X,
    y=y,
    clf=clf,
    filler_feature_values=col_dict,
    filler_feature_ranges=col_dict,
)
class GetThatEnsemble:
    def __init__(self, cpu):
        self.names = ['f_' + str(i) for i in range(223)]
        self.X = pd.read_csv('original_data/x_train.csv', delimiter=';',
                             names=self.names)
        self.Y = pd.read_csv('original_data/y_train.csv', names=['target'],
                             delimiter=';')
        # feature generator block: pairwise products of selected raw features
        self.X['mul_0'] = self.X['f_138'] * self.X['f_96']
        self.X['mul_1'] = self.X['f_138'] * self.X['f_156']
        self.X['mul_2'] = self.X['f_11'] * self.X['f_200']
        self.X['mul_3'] = self.X['f_96'] * self.X['f_83']
        self.X['mul_4'] = self.X['f_200'] * self.X['f_83']
        self.X['mul_5'] = self.X['f_200'] * self.X['f_156']
        self.X['mul_6'] = self.X['f_76'] * self.X['f_156']
        self.X['mul_7'] = self.X['f_76'] * self.X['f_131']
        self.X['mul_8'] = self.X['f_76'] * self.X['f_182']
        self.X['mul_9'] = self.X['f_41'] * self.X['f_182']
        self.X['mul_10'] = self.X['f_11'] * self.X['f_200']
        # self.X['mul'] = self.X['f_84'] * self.X['f_182']
        self.default_columns = [
            'f_138', 'f_11', 'f_96', 'f_200', 'f_76', 'f_41', 'f_83',
            'f_156', 'f_131', 'f_84', 'f_182',
            'mul_0', 'mul_1', 'mul_2', 'mul_3', 'mul_4', 'mul_5',
            'mul_6', 'mul_7', 'mul_8', 'mul_9', 'mul_10',
        ]
        self.kf = None
        self.cpu = cpu
        self.pipeline = None

    def get_fold(self, columns, fold_amount=5):
        self.kf = StratifiedKFold(n_splits=fold_amount, shuffle=True)
        self.kf.get_n_splits(self.X[columns], self.Y['target'])
        # returns only the first train/test split
        for train_index, test_index in self.kf.split(self.X[columns],
                                                     self.Y['target']):
            x_train, x_test = (self.X.as_matrix(columns)[train_index],
                               self.X.as_matrix(columns)[test_index])
            y_train, y_test = (self.Y.as_matrix()[train_index],
                               self.Y.as_matrix()[test_index])
            return x_train, y_train, x_test, y_test

    def ensemble(self, folds_limit=42):
        answers = []
        # clf1 = ExtraTreesClassifier(max_features=0.4, min_samples_leaf=1, min_samples_split=4,
        #                             n_estimators=1000, n_jobs=self.cpu)
        # clf2 = ExtraTreesClassifier(criterion="gini", max_features=0.4, min_samples_split=6,
        #                             n_estimators=1000, n_jobs=self.cpu)
        # clf3 = ExtraTreesClassifier(max_features=0.55, min_samples_leaf=1, min_samples_split=4,
        #                             n_estimators=1000, n_jobs=self.cpu)
        # clf4 = ExtraTreesClassifier(max_features=0.45, min_samples_leaf=1, min_samples_split=5,
        #                             n_estimators=1000, n_jobs=self.cpu)

        # default 0.6742 on seed=42 for the full set (search_best_3)
        clf1 = ExtraTreesClassifier(max_features=0.4537270875668709,
                                    criterion='entropy',
                                    min_samples_leaf=1,
                                    min_samples_split=2,
                                    n_estimators=3138,
                                    n_jobs=self.cpu)
        # clf1 = RandomForestClassifier(max_features=0.34808889858456293, criterion='entropy',
        #                               min_samples_split=2, n_estimators=4401, n_jobs=self.cpu)
        # default:
        # clf1 = ExtraTreesClassifier(max_features=0.4, min_samples_leaf=1, min_samples_split=2,
        #                             n_estimators=1000, n_jobs=self.cpu)
        self.pipeline = EnsembleVoteClassifier(clfs=[clf1], weights=[1],
                                               voting='soft')
        for iteration in range(folds_limit):
            np.random.seed(42 + iteration)
            x_train, y_train, x_test, y_test = self.get_fold(self.default_columns)
            self.pipeline.fit(x_train, y_train)
            preds = self.pipeline.predict(x_test)
            # print(confusion_matrix(y_test, preds))
            matrix_ = confusion_matrix(y_test, preds)
            correct_answers = (matrix_[0][0] + matrix_[1][1] + matrix_[2][2]
                               + matrix_[3][3] + matrix_[4][4])
            print(' Correct answers count: ', correct_answers,
                  ' [it: %s]' % iteration)
            answers.append(int(correct_answers))
            if iteration % 5 == 0 and iteration > 0:
                print('Params: mean: %s std: %s best: %s'
                      % (np.mean(answers), np.std(answers), max(answers)))
        print('Params: mean: %s std: %s best: %s'
              % (np.mean(answers), np.std(answers), max(answers)))

    def answers(self, iter_limit=10):
        self.pipeline.fit(self.X[self.default_columns], self.Y['target'])
        score_dataset = pd.read_csv('original_data/x_test.csv', delimiter=';',
                                    names=self.names)
        # feature generator block (mirrors __init__)
        score_dataset['mul_0'] = score_dataset['f_138'] * score_dataset['f_96']
        score_dataset['mul_1'] = score_dataset['f_138'] * score_dataset['f_156']
        score_dataset['mul_2'] = score_dataset['f_11'] * score_dataset['f_200']
        score_dataset['mul_3'] = score_dataset['f_96'] * score_dataset['f_83']
        score_dataset['mul_4'] = score_dataset['f_200'] * score_dataset['f_83']
        score_dataset['mul_5'] = score_dataset['f_200'] * score_dataset['f_156']
        score_dataset['mul_6'] = score_dataset['f_76'] * score_dataset['f_156']
        score_dataset['mul_7'] = score_dataset['f_76'] * score_dataset['f_131']
        score_dataset['mul_8'] = score_dataset['f_76'] * score_dataset['f_182']
        score_dataset['mul_9'] = score_dataset['f_41'] * score_dataset['f_182']
        score_dataset['mul_10'] = score_dataset['f_11'] * score_dataset['f_200']

        predicts = pd.DataFrame()
        for iteration in range(iter_limit):
            if iteration > 0 and iteration % 5 == 0:
                print('[Predict: %s]' % iteration)
            np.random.seed(42 + iteration)
            y_pred = self.pipeline.predict(score_dataset[self.default_columns])
            predicts[iteration] = y_pred

        # per-row majority vote over the repeated predictions
        vote_answer = []
        for pos in range(len(predicts[0])):
            row_dict = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
            for column in predicts.columns:
                row_dict[str(predicts[column].iloc[pos])] += 1
            best_answer_count = 0
            for key in row_dict.keys():
                if row_dict[key] > best_answer_count:
                    best_answer_count = row_dict[key]
                    best_answer = int(key)
            vote_answer.append(best_answer)

        predicts['votes'] = vote_answer
        predicts['diff'] = predicts['votes'] - predicts[0]
        print(predicts[predicts['diff'] != 0])
        print(predicts[predicts['diff'] != 0].shape)
        pd.Series(predicts['votes']).to_csv('data/answer.csv', index=False)
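# The dict-based counting vote at the end of answers() can be replaced by a
# single vectorized pass; a sketch assuming `predicts` holds one integer
# prediction column per iteration, as above.
import numpy as np

def column_vote(pred_df):
    arr = pred_df.values  # shape (n_rows, n_iterations)
    return np.apply_along_axis(lambda r: np.bincount(r).argmax(), 1, arr)

# e.g. predicts['votes'] = column_vote(predicts[list(range(iter_limit))])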