def test_no_weight_support_with_no_weight():
    """Smoke test: fit a hard-voting ensemble without sample weights.

    The ensemble mixes LogisticRegression, RandomForestClassifier,
    GaussianNB and KNeighborsClassifier, all with default settings.
    There is no assertion; the test passes when ``fit`` returns.
    """
    base_estimators = [
        LogisticRegression(),
        RandomForestClassifier(),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    ensemble = EnsembleVoteClassifier(clfs=base_estimators, voting='hard')
    ensemble.fit(X, y)
def test_sample_weight():
    """Check that ``sample_weight`` given to ``fit`` influences the ensemble.

    Three identically seeded ensembles are fitted: without weights, with
    all-ones weights and with random weights.  The first two must agree on
    ``predict_proba`` (max abs diff < 1e-3); the random-weight run must
    differ from the unit-weight run (max abs diff > 1e-3).
    """

    def _fit_and_predict_proba(sample_weight=None):
        # Every run re-seeds NumPy before building its estimators.
        np.random.seed(123)
        members = [
            LogisticRegression(solver='liblinear', multi_class='ovr'),
            RandomForestClassifier(n_estimators=10),
            GaussianNB(),
        ]
        ensemble = EnsembleVoteClassifier(clfs=members, voting='hard')
        if sample_weight is None:
            fitted = ensemble.fit(X, y)
        else:
            fitted = ensemble.fit(X, y, sample_weight=sample_weight)
        return fitted.predict_proba(X)

    # with no weight
    unweighted = _fit_and_predict_proba()

    # with weight = 1
    unit_weighted = _fit_and_predict_proba(np.ones(len(y)))

    # with random weight
    random.seed(87)
    random_weights = np.array([random.random() for _ in range(len(y))])
    random_weighted = _fit_and_predict_proba(random_weights)

    diff12 = np.max(np.abs(unweighted - unit_weighted))
    diff23 = np.max(np.abs(unit_weighted - random_weighted))
    assert diff12 < 1e-3, "max diff is %.4f" % diff12
    assert diff23 > 1e-3, "max diff is %.4f" % diff23
def emsembal_train(feature, label):
    """Train a soft-voting ensemble and print its hold-out accuracy.

    The labels are first passed through ``transport_labels``; the data is
    then split 80/20 (``random_state=1000``).  An SVC, a logistic
    regression and an XGBoost classifier are combined with soft voting.

    Args:
        feature: feature matrix handed to ``train_test_split``.
        label: raw labels; converted by ``transport_labels`` before use.

    Returns:
        None.  The accuracy on the hold-out split is printed.
    """
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from mlxtend.classifier import EnsembleVoteClassifier

    label = transport_labels(label)
    X_train, X_test, Y_train, Y_test = train_test_split(
        feature, label, test_size=0.2, random_state=1000)

    clf1 = SVC(C=10, kernel='sigmoid', probability=True)
    clf3 = LogisticRegression(random_state=0)
    clf4 = xgb.XGBClassifier(max_depth=8,
                             learning_rate=0.07,
                             n_estimators=35,
                             silent=True,
                             objective="binary:logistic",
                             booster='gbtree',
                             gamma=0,
                             min_child_weight=6,
                             subsample=0.8,
                             colsample_bytree=0.7,
                             reg_alpha=0.1,
                             seed=1000)

    eclf = EnsembleVoteClassifier(clfs=[clf1, clf3, clf4], voting='soft')
    eclf.fit(X_train, Y_train)
    y_pred = eclf.predict(X_test)

    # Compare positionally: indexing Y_test[i] is label-based on a pandas
    # Series and breaks once train_test_split has shuffled the index.
    hits = np.asarray(y_pred).ravel() == np.asarray(Y_test).ravel()
    print('eclf accs=%f' % float(np.mean(hits)))
def test_no_weight_support_with_no_weight():
    """Smoke test: fit a hard-voting ensemble without sample weights.

    The ensemble contains a KNeighborsClassifier next to three other
    estimators.  There is no assertion; the test passes when ``fit``
    returns.
    """
    members = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    ensemble = EnsembleVoteClassifier(clfs=members, voting='hard')
    ensemble.fit(X, y)
def test_string_labels_refit_false():
    """Score pre-fitted estimators on string labels with ``refit=False``.

    The base estimators are fitted on the string labels up front; both the
    hard- and the soft-voting ensemble must then reach a rounded training
    score of 0.97.
    """
    np.random.seed(123)
    members = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
    ]

    # Replace the labels by three string classes, 50 samples each.
    y_str = y.copy().astype(str)
    for label, (start, stop) in zip('abc', [(0, 50), (50, 100), (100, 150)]):
        y_str[start:stop] = label

    for member in members:
        member.fit(X, y_str)

    for voting_mode in ('hard', 'soft'):
        ensemble = EnsembleVoteClassifier(clfs=list(members),
                                          voting=voting_mode,
                                          refit=False)
        ensemble.fit(X, y_str)
        assert round(ensemble.score(X, y_str), 2) == 0.97
def make_model(self):
    """Fit five classifiers, combine them by weighted vote and save predictions.

    Reads the module-level ``train_feats2``/``target`` (training data) and
    ``test_feats2``/``test_df`` (test data).  The predictions are mapped
    through ``functions.to_labels`` and written to a fixed CSV path.

    Returns:
        The fitted ``EnsembleVoteClassifier``.
    """
    # ------------------------------------------------------------------
    # TREE BASED ALGORITHMS
    # ------------------------------------------------------------------
    # random_state pins the random feature/sample selections these
    # estimators make, so that runs are repeatable.
    # n_estimators is the number of trees / boosting stages.
    model_rf = RandomForestClassifier(n_estimators=145, random_state=10, n_jobs=-1)
    model_rf.fit(train_feats2, target)

    # GradientBoostingClassifier and AdaBoostClassifier have no ``n_jobs``
    # parameter; passing it raises TypeError at construction time.
    model_gb = GradientBoostingClassifier(n_estimators=145, random_state=11)
    model_gb.fit(train_feats2, target)

    model_ab = AdaBoostClassifier(n_estimators=145, random_state=12)
    model_ab.fit(train_feats2, target)

    # ------------------------------------------------------------------
    # LOGISTIC REGRESSION
    # ------------------------------------------------------------------
    model_lr = LogisticRegression(random_state=1)
    model_lr.fit(train_feats2, target)

    # ------------------------------------------------------------------
    # NAIVE BAYES
    # ------------------------------------------------------------------
    model_nb = MultinomialNB()
    model_nb.fit(train_feats2, target)

    # ------------------------------------------------------------------
    # VOTING ENSEMBLE OF ALL MODELS
    # ------------------------------------------------------------------
    clf = [model_rf, model_lr, model_gb, model_ab, model_nb]
    # Weights can be decided by stacking; logistic regression counts double.
    eclf = EnsembleVoteClassifier(clfs=clf, weights=[1, 2, 1, 1, 1], refit=False)
    eclf.fit(train_feats2, target)
    print("model created")

    preds = eclf.predict(test_feats2)
    sub3 = pd.DataFrame({'User_ID': test_df.User_ID, 'Is_Response': preds})
    sub3['Is_Response'] = sub3['Is_Response'].map(
        lambda x: functions.to_labels(self, x))
    sub3 = sub3[['User_ID', 'Is_Response']]
    sub3.to_csv('D:\\New folder\\f2c2f440-8-dataset_he\\SUB_TEST.csv', index=False)
    print("prediction saved")
    return eclf
def votingEnsembleTest2ndLayer_Test(top_ensembles_dict, test_country_data):
    """Evaluate a second-layer voting ensemble per Mentality Cycle.

    For every key ``BC`` of ``top_ensembles_dict`` the second element of
    each entry is taken as a pre-fitted classifier, the classifiers are
    combined with equal weights (``refit=False``) and scored on
    ``test_country_data[BC]``.  Per-cycle and aggregated accuracy are
    printed.

    Args:
        top_ensembles_dict: maps a cycle key to a sequence whose items hold
            a classifier at index 1.
        test_country_data: maps the same keys to a dict with "X" and "Y".

    Returns:
        None.
    """
    hit_count = 0
    total_obvs = 0
    for BC, ranked_entries in top_ensembles_dict.items():
        classifiers = [sub_list[1] for sub_list in ranked_entries]
        _weights = np.asarray([1] * len(classifiers))
        vclf_layer2 = EnsembleVoteClassifier(clfs=classifiers,
                                             weights=_weights,
                                             refit=False)
        Y = test_country_data[BC]["Y"]
        X = test_country_data[BC]["X"]
        vclf_layer2.fit(X, Y)
        y_estimate = vclf_layer2.predict(X)
        print(
            "Mentality Cycle {} 2nd Layer Voting Classifier Ensemble has accuracy: {}"
            .format(BC, np.mean(Y == y_estimate)))
        hit_count = hit_count + np.sum(Y == y_estimate)
        # Count observations for exactly the cycles that were scored, so
        # numerator and denominator always cover the same data.
        total_obvs = total_obvs + Y.shape[0]

    # Overall performance across all evaluated cycles.
    overall_hit_rate = hit_count / total_obvs if total_obvs else float('nan')
    print("Aggregated accuracy of 2nd Layer Voting Classifiers is: {}".format(
        overall_hit_rate))
def test_string_labels_refit_false():
    """Score pre-fitted default estimators on string labels with ``refit=False``.

    The base estimators are fitted on the string labels up front; both the
    hard- and the soft-voting ensemble must then reach a rounded training
    score of 0.97.
    """
    np.random.seed(123)
    members = [LogisticRegression(), RandomForestClassifier(), GaussianNB()]

    # Replace the labels by three string classes, 50 samples each.
    y_str = y.copy().astype(str)
    for label, (start, stop) in zip('abc', [(0, 50), (50, 100), (100, 150)]):
        y_str[start:stop] = label

    for member in members:
        member.fit(X, y_str)

    for voting_mode in ('hard', 'soft'):
        ensemble = EnsembleVoteClassifier(clfs=list(members),
                                          voting=voting_mode,
                                          refit=False)
        ensemble.fit(X, y_str)
        assert round(ensemble.score(X, y_str), 2) == 0.97
def tri_train(domain, X_train, y_train, X_test, y_test, X_un, theta=0.5, dis=False):
    """Tri-training: three bootstrapped classifiers label data for each other.

    Three classifiers are trained on bootstrap samples.  For every pair
    (j, k) the remaining classifier is retrained on the labelled data plus
    whatever ``get_features`` selects from the unlabelled pool, and is kept
    only while its accuracy improves.  The three final models are combined
    in an equally weighted ``EnsembleVoteClassifier``.

    Args:
        domain: forwarded to ``get_acc_clf``; a name containing "large"
            switches the final metric to macro F1.
        X_train, y_train: labelled training data.
        X_test, y_test: evaluation data.
        X_un: unlabelled feature vectors.
        theta: forwarded to ``get_features``.
        dis: forwarded to ``get_features``.

    Returns:
        Tuple ``(acc, eclf)``: final score and the voting ensemble.
    """
    # All output goes through single-argument print(...) calls, which emit
    # the same text under Python 2 and Python 3.
    models = list()
    accs = list()
    for _ in range(3):
        X_split, y_split = bootstrap_sample(X_train, y_train)
        acc, clf_func = get_acc_clf(domain, X_split, y_split, X_test, y_test)
        models.append(clf_func)
        accs.append(acc)

    for (j, k) in itertools.combinations(models, 2):
        unlabelled_features = np.array(X_un)
        total = len(X_train) + len(X_un)
        t = 0
        count = 0
        X_i = X_train
        y_i = y_train
        # find current classifier: the first model that is neither j nor k
        clf_i = [x for x in models if x != j and x != k][0]
        index_i = models.index(clf_i)
        print("***classifier %d***" % index_i)
        while count < total and len(unlabelled_features) != 0:
            t += 1
            X_tgt, y_tgt = get_features(unlabelled_features, j, k, clf_i, models,
                                        theta=theta, dis=dis)
            if len(X_tgt) == 0 and t > 1:
                print("no new features added")
                break
            X_i = concatenate(X_i, X_tgt)
            y_i = concatenate(y_i, y_tgt)
            count = len(X_i)
            print("%d %d %d" % (t, count, total))
            # update classifier
            acc, clf_i = get_acc_clf(domain, X_i, y_i, X_test, y_test)
            if accs[index_i] < acc:
                accs[index_i] = acc
                print("*NEW BEST! best acc: %s" % (acc,))
                models[index_i] = clf_i
            else:
                print("no improvement..skip..")
                break
            if count == total:
                print("reach end..")
                break
            # update the unlabelled features for speed-up
            print(np.array(X_tgt).shape)
            X_tgt = [list(x) for x in X_tgt]
            unlabelled_features = [x for x in unlabelled_features
                                   if list(x) not in X_tgt]
            print(np.array(unlabelled_features).shape)

    # majority vote classifiers
    eclf = EnsembleVoteClassifier(clfs=models, weights=[1, 1, 1], refit=False)
    # refit=False: the base models are not retrained by this call.
    eclf.fit(X_test, y_test)
    pred = eclf.predict(X_test)
    if "large" not in domain:
        acc = accuracy_score(y_test, pred)
    else:
        acc = f1_score(y_test, pred, average='macro')
    print("acc:%s theta:%s seprate accs: %s" % (acc, theta, accs))
    return acc, eclf
def majority_vote(target):
    """Combine the self-trained classifiers of all other domains by majority vote.

    Loads ``<source>/self_clf`` for every domain in the target's family
    except the target itself, wraps them in an ``EnsembleVoteClassifier``
    (``refit=False``), saves the ensemble and prints its score on the
    target's test set (macro F1 for "large" targets, accuracy otherwise).

    Args:
        target: domain name; "mlp/..." and "large/..." select their own
            domain families, anything else must be one of the four plain
            domains or the function returns without doing anything.

    Returns:
        None.
    """
    X_test = load_obj("%s/X_test" % target)
    y_test = load_obj("%s/y_test" % target)

    if "mlp" in target:
        domains = ["mlp/books", "mlp/dvd", "mlp/electronics", "mlp/kitchen"]
    elif "large" not in target:
        domains = ["books", "dvd", "electronics", "kitchen"]
        if target not in domains:
            return
    else:
        domains = ["large/baby", "large/cell_phone", "large/imdb", "large/yelp2014"]

    models = []
    for source in domains:
        if target == source:
            continue
        print(source)
        models.append(load_obj("%s/self_clf" % source))

    eclf = EnsembleVoteClassifier(clfs=models, refit=False)
    # refit=False: the loaded models are not retrained by this call.
    eclf.fit(X_test, y_test)

    # The original assigned tmp_name twice; the second assignment overwrote
    # the "large" branch, so every large/* target was saved as "L_eclf".
    if "mlp" in target:
        tmp_name = "mlp/" + target.upper()[4]
    elif "large" in target:
        tmp_name = "large/" + target.upper()[6]
    else:
        tmp_name = target.upper()[0]
    save_obj(eclf, '%s_eclf' % (tmp_name))

    pred = eclf.predict(X_test)
    if "large" not in target:
        acc = accuracy_score(y_test, pred)
    else:
        acc = f1_score(y_test, pred, average='macro')
    print('self-train %s' % (acc,))
def train_knn_model(assts, n_macroepochs=100, n_epochs=10): TUNE = False #we start by fitting pca across the whole population (random sample) sgen = xy_generator(assts, batch_size=5000) pca = PCA(n_components=48) for _,X,y,_,_,_,_ in sgen: print("fitting PCA...") X = numpy.array(X, dtype=numpy.int8) y = numpy.array(y).ravel() pca.fit_transform(X) # if TUNE: # tuned_parameters = [{'n_neighbors': [1, 20, 50, 100], # 'weights': ['distance', 'uniform'], # 'algorithm': ['ball_tree', 'kd_tree', 'brute'] # }] # scores = ['f1_macro', 'f1_micro', 'accuracy'] # # scores = ['accuracy'] # performances = [] # print("Tuning") # for score in scores: # print("# Tuning hyper-parameters for %s" % score) # clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=5, scoring=score, verbose=0, n_jobs=7) # clf.fit(X, y) # print("Best parameters set found on development set:") # print(clf.best_estimator_) # print("Grid scores on development set:") # for params, mean_score, scores in clf.grid_scores_: # print("%0.3f (+/-%0.03f) for %r" # % (mean_score, scores.std() / 2, params)) # # break #half-loop just to get one sample from sgen exit() del sgen print("fitted") gc.collect() xygen = xy_generator(assts, batch_size=5000) # make generator object clfs = [] i = 0 for S,X, y, yc, yt, ylv, yv in xygen: X = numpy.array(X, dtype=numpy.int8) y = numpy.array(y) X = pca.transform(X) voter = SVC() voter.fit(X,y) clfs.append(voter) i += 1 model = EnsembleVoteClassifier(clfs=clfs, refit=False) X_for_classes = [] y_for_classes = [] for classlabel in all_page_ids: X_for_classes.append(numpy.zeros(256)) y_for_classes.append(classlabel) model.fit(X_for_classes,y_for_classes) return model, pca, None, None #, sscaler, levscaler, volscaler
def test_no_weight_support():
    """Fit a hard-voting ensemble containing KNeighborsClassifier with sample weights.

    Random weights (seed 87) are passed as ``sample_weight`` to ``fit``.
    NOTE(review): there is no assertion here; any expected exception must
    be declared outside this function (e.g. by a decorator) -- confirm.
    """
    random.seed(87)
    weights = np.array([random.random() for _ in range(len(y))])

    members = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
        KNeighborsClassifier(),
    ]
    ensemble = EnsembleVoteClassifier(clfs=members, voting='hard')
    ensemble.fit(X, y, sample_weight=weights)
def test_1model_probas():
    """A one-member soft-voting ensemble must reproduce its member's probabilities.

    The ensemble is built once with ``weights=None`` and once with
    ``weights=[1.]``; both must match each other and the plain classifier
    to 8 decimals.
    """
    clf = LogisticRegression(multi_class='multinomial',
                             solver='newton-cg',
                             random_state=123)
    ensembles = [
        EnsembleVoteClassifier(clfs=[clf], voting='soft', weights=member_weights)
        for member_weights in (None, [1.])
    ]
    ensemble_probas = [ens.fit(X, y).predict_proba(X) for ens in ensembles]
    plain_probas = clf.fit(X, y).predict_proba(X)

    np.testing.assert_almost_equal(ensemble_probas[0], ensemble_probas[1], decimal=8)
    np.testing.assert_almost_equal(ensemble_probas[0], plain_probas, decimal=8)
def majority_vote_mlp(target):
    """Self-train one classifier per source domain and combine them by vote.

    For every domain other than ``target`` the best ``theta`` out of
    0.5..0.9 is chosen by the accuracy ``self_train`` reports, and each
    (theta, acc) pair is logged to a CSV file.  The best classifiers are
    wrapped in an ``EnsembleVoteClassifier`` (``refit=False``), saved as
    ``<target>/self_clf`` and scored on the target's test set.

    Args:
        target: domain name of the form "mlp/<name>".

    Returns:
        None.
    """
    X_test = load_obj("%s/X_test" % target)
    y_test = load_obj("%s/y_test" % target)
    data_name = ["books", "dvd", "electronics", "kitchen"]
    X_joint = load_obj("%s/X_joint" % target)
    y_joint = load_obj("%s/y_joint" % target)
    temp_un = load_obj("%s/X_un" % target)

    meta_sources = [name for name in data_name if 'mlp/' + name != target]

    thetas = [0.5, 0.6, 0.7, 0.8, 0.9]
    models = []
    for j, source_name in enumerate(meta_sources):
        temp_X = X_joint[j]
        temp_y = y_joint[j]
        best_acc = 0.0
        # NOTE(review): stays "" when no theta scores above 0.0, and that
        # placeholder is then appended to the ensemble -- confirm intent.
        best_clf = ""
        best_theta = 0.0
        res_path = "../work/params/%s_theta_self-%s.csv" % (target, source_name.upper()[0])
        # The with-block closes the log even when self_train raises.
        with open(res_path, "w") as resFile:
            resFile.write("theta, acc\n")
            for theta in thetas:
                print("##############################")
                print("start with theta=%s" % theta)
                print("##############################")
                acc, clf_func = self_train(target, temp_X, temp_y, X_test, y_test,
                                           temp_un, theta=theta)
                if best_acc < acc:
                    best_acc = acc
                    best_clf = clf_func
                    best_theta = theta
                resFile.write("%f, %f\n" % (theta, acc))
                resFile.flush()
        print("##############################")
        print("best_theta: %s best_acc: %s" % (best_theta, best_acc))
        models.append(best_clf)

    eclf = EnsembleVoteClassifier(clfs=models, refit=False)
    # refit=False: the self-trained models are not retrained by this call.
    eclf.fit(X_test, y_test)
    save_obj(eclf, "%s/self_clf" % target)
    pred = eclf.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print('self-train %s' % (acc,))
def test_1model_labels():
    """A one-member soft-voting ensemble must reproduce its member's labels.

    The ensemble is built once with ``weights=None`` and once with
    ``weights=[1.]``; both must predict the same labels as each other and
    as the plain classifier.
    """
    clf = LogisticRegression(multi_class='multinomial',
                             solver='newton-cg',
                             random_state=123)
    ensembles = [
        EnsembleVoteClassifier(clfs=[clf], voting='soft', weights=member_weights)
        for member_weights in (None, [1.])
    ]
    ensemble_labels = [ens.fit(X, y).predict(X) for ens in ensembles]
    plain_labels = clf.fit(X, y).predict(X)

    np.testing.assert_equal(ensemble_labels[0], ensemble_labels[1])
    np.testing.assert_equal(ensemble_labels[0], plain_labels)
def votingEnsembleTest(all_country_data_with_algos, test_country_data_US):
    """Score per-country voting ensembles on the US test data.

    For every country and every Mentality Cycle ``BC`` the stored
    'trained algos' are combined into an equally weighted hard-voting
    ensemble (``refit=False``) and evaluated on ``test_country_data_US[BC]``.
    The work is done on a deep copy of the input, which is returned with
    an 'accuracy' and a 'votingclassifier' entry per cycle and an
    aggregated 'accuracy' per country.

    Args:
        all_country_data_with_algos: ``{country: {BC: {'trained algos': [...]}}}``.
        test_country_data_US: ``{BC: {"X": ..., "Y": ...}}``.

    Returns:
        The enriched deep copy of ``all_country_data_with_algos``.
    """
    print(
        " \n For each training set country for each sub dataset (split by Mentality Cycle): the top n trained algorithms form a Voting Classifiers. This Voting Classifiers is then tested on its corresponding US sub data set. An aggregate scocre for each trainging set country is calculated through an Aggregation of its 3 Voting Classifiers' performances"
    )
    _all_country_data_with_trained_algos = copy.deepcopy(
        all_country_data_with_algos)

    for country in list(_all_country_data_with_trained_algos.keys()):
        country_data = _all_country_data_with_trained_algos[country]
        country_level_total_hits = 0
        record_count = 0
        for BC in list(country_data.keys()):
            classifiers = copy.deepcopy(country_data[BC].get('trained algos'))
            # One unit weight per classifier instead of a fixed three.
            clf_weights = np.ones(len(classifiers), dtype=int)
            Y = test_country_data_US[BC].get("Y")
            X = test_country_data_US[BC].get("X")
            vclf = EnsembleVoteClassifier(clfs=classifiers,
                                          weights=clf_weights,
                                          refit=False,
                                          voting='hard')
            vclf.fit(X, Y)
            y_estimate = vclf.predict(np.array(X))

            # Compare positionally once. Wrapping y_estimate in pd.Series
            # aligned on the index and failed for a non-default index.
            hits = np.asarray(Y).ravel() == np.asarray(y_estimate).ravel()
            accuracy = np.mean(hits)
            print(
                "Voting Classifier trained on {} Mentality Cycle {} has accuracy: {}"
                .format(country, BC, accuracy))

            # Save the per-cycle accuracy and the ensemble itself.
            country_data[BC]['accuracy'] = accuracy
            country_data[BC]['votingclassifier'] = vclf
            country_level_total_hits = country_level_total_hits + np.sum(hits)
            record_count = record_count + hits.shape[0]

        country_data['accuracy'] = (
            country_level_total_hits / record_count if record_count else float('nan'))
        print("Aggregated Classifier trained on {} has accuracy: {} \n".format(
            country, country_data['accuracy']))
    return _all_country_data_with_trained_algos
def test_fit_base_estimators_false():
    """Pre-fitted estimators with ``fit_base_estimators=False`` must score 0.97.

    The three base estimators are fitted before they are handed to a
    hard-voting ensemble; the rounded training score is then checked.
    """
    np.random.seed(123)
    members = [
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        RandomForestClassifier(n_estimators=10),
        GaussianNB(),
    ]
    for member in members:
        member.fit(X, y)

    ensemble = EnsembleVoteClassifier(clfs=members,
                                      voting='hard',
                                      fit_base_estimators=False)
    ensemble.fit(X, y)
    assert round(ensemble.score(X, y), 2) == 0.97
class VotingModel:
    """Soft-voting wrapper around a list of models, with equal weights.

    The wrapped ``EnsembleVoteClassifier`` is created with ``refit=False``.
    """

    def __init__(self, X, y, x_test, model_lists):
        """Store the data and build the ensemble.

        Args:
            X: training features handed to ``fit`` by ``train``.
            y: training labels handed to ``fit`` by ``train``.
            x_test: features used by ``predict`` and ``predict_proba``.
            model_lists: sequence of models to combine.
        """
        # One unit weight per model; a fixed [1, 1, 1] only worked for
        # exactly three models.
        self.model = EnsembleVoteClassifier(clfs=model_lists,
                                            weights=[1] * len(model_lists),
                                            refit=False,
                                            voting='soft')
        self.X = X
        self.y = y
        self.X_test = x_test

    def train(self):
        """Call ``fit`` on the ensemble with the stored training data."""
        self.model.fit(self.X, self.y)

    def predict(self):
        """Return the ensemble's predictions for the stored test features."""
        return self.model.predict(self.X_test)

    def predict_proba(self):
        """Return the ensemble's class probabilities for the stored test features."""
        return self.model.predict_proba(self.X_test)
def meta_ensemble():
    """Fit two soft-voting ensembles and hand a meta-ensemble of both to ``apply_model``.

    The first ensemble is fitted on ``train_x_dtm``/``train_y``, the second
    on ``X_resampled``/``y_resampled``; all data, models and weights are
    module-level names.

    NOTE(review): the meta-ensemble is passed on without ``fit`` being
    called on it here -- confirm ``apply_model`` takes care of that.
    """
    # ensemble learning (mlxtend)
    eclf1 = EnsembleVoteClassifier(clfs=[model1, model2, model3],
                                   weights=weight_base,
                                   voting='soft',
                                   refit=True)
    eclf1.fit(train_x_dtm, train_y)
    # print(...) with one argument emits the same text on Python 2 and 3.
    print('ensemble1 fitted.')

    eclf2 = EnsembleVoteClassifier(clfs=[model1, model2, model3],
                                   weights=weight_base,
                                   voting='soft',
                                   refit=True)
    eclf2.fit(X_resampled, y_resampled)
    print('ensemble2 fitted.')

    eclf3 = EnsembleVoteClassifier(clfs=[eclf1, eclf2],
                                   weights=weight_meta,
                                   voting='soft',
                                   refit=False)
    apply_model(eclf3)
def meta_ensemble_model():
    """Build a soft-voting meta-ensemble over two fitted base ensembles.

    Both base ensembles combine ``mnb``, ``lr`` and ``rf``; the first is
    fitted on ``train_x_dtm``/``train_y``, the second on
    ``x_resampled``/``y_resampled``.  The meta-ensemble is created with
    ``refit=False`` and returned without ``fit`` being called on it.

    Returns:
        The meta ``EnsembleVoteClassifier``.
    """
    # ensemble learning (mlxtend)
    base_settings = dict(clfs=[mnb, lr, rf],
                         weights=weight_base,
                         voting='soft',
                         refit=True)
    first = EnsembleVoteClassifier(**base_settings)
    second = EnsembleVoteClassifier(**base_settings)
    combined = EnsembleVoteClassifier(clfs=[first, second],
                                      weights=weight_meta,
                                      voting='soft',
                                      refit=False)

    first.fit(train_x_dtm, train_y)
    print('ensemble1 fitted.')
    second.fit(x_resampled, y_resampled)
    print('ensemble2 fitted.')
    return combined
class ModelTrustRegression:
    """Estimate where a model can be trusted, using K-fold clones and KNN regressors.

    For every fold a clone of the template model is trained on the training
    part, and a ``KNeighborsRegressor`` is fitted on the validation part
    with "prediction == true value" as its target.  The fold models are
    also combined into a soft-voting ensemble (the "bagger").
    """

    def __init__(self, model, n_neighbors=20, weights='uniform', n_folds=5):
        """Store the configuration; nothing is fitted yet.

        Args:
            model: template estimator, cloned once per fold.
            n_neighbors: ``n_neighbors`` of each fold's KNeighborsRegressor.
            weights: ``weights`` of each fold's KNeighborsRegressor.
            n_folds: number of K-fold splits.
        """
        self.template_model = model
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.n_folds = n_folds
        self.fold_regressions = []
        self.fold_models = []
        self.bagger = None

    def fit(self, X, values):
        """Train the per-fold models and regressors and build the bagger.

        Args:
            X: feature array; must support indexing with an index array.
            values: target array, indexed the same way as ``X``.
        """
        # Start from a clean slate so that calling fit() again does not
        # mix models from the previous run into the results.
        self.fold_regressions = []
        self.fold_models = []

        # hard prediction
        for train_index, validation_index in KFold(n_splits=self.n_folds).split(X):
            train_set = X[train_index]
            train_values = values[train_index]
            validation_set = X[validation_index]
            validation_values = values[validation_index]

            # A brand new model is trained for every fold.
            fold_model = clone(self.template_model)
            fold_model.fit(train_set, train_values)

            # The regressor's target is whether the fold model was right.
            fold_regressor = KNeighborsRegressor(weights=self.weights,
                                                 n_neighbors=self.n_neighbors)
            fold_regressor.fit(
                validation_set,
                fold_model.predict(validation_set) == validation_values)

            self.fold_regressions.append(fold_regressor)
            self.fold_models.append(fold_model)

        self.bagger = EnsembleVoteClassifier(self.fold_models, voting="soft", refit=False)
        self.bagger.fit(X, values)  # trivial fit: refit=False

    def predict(self, X):
        """Return the mean of the fold regressors' predictions for ``X``."""
        return np.mean([fm.predict(X) for fm in self.fold_regressions], axis=0)

    def predict_proba(self, X):
        """Return the bagger's class probabilities for ``X``."""
        return self.bagger.predict_proba(X)

    def get_bagger(self):
        """Return the soft-voting ensemble of fold models (None before ``fit``)."""
        return self.bagger
def test6():
    """Demo: fit three voting ensembles on a six-point toy set and print their predictions.

    The configurations are hard voting (``verbose=1``), soft voting, and
    soft voting with weights ``[2, 1, 1]``; all share the same three base
    classifiers.
    """
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([1, 1, 1, 2, 2, 2])

    configurations = (
        {'voting': 'hard', 'verbose': 1},
        {'voting': 'soft'},
        {'voting': 'soft', 'weights': [2, 1, 1]},
    )
    for options in configurations:
        ensemble = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], **options)
        ensemble = ensemble.fit(X, y)
        print(ensemble.predict(X))
def meta_ensemble():
    """Fit two soft-voting ensembles and hand a meta-ensemble of both to ``apply_model``.

    The first ensemble is fitted on ``train_x_dtm``/``train_y``.  The
    second is fitted on the sparse matrix in 'train_x_dtm_us_smote.npz'
    with the integer labels read line by line from 'train_y_us_smote'.

    NOTE(review): the meta-ensemble is passed on without ``fit`` being
    called on it here -- confirm ``apply_model`` takes care of that.
    """
    # ensemble learning (mlxtend)
    eclf1 = EnsembleVoteClassifier(clfs=[model1, model2, model3],
                                   weights=weight_base,
                                   voting='soft',
                                   refit=True)
    eclf1.fit(train_x_dtm, train_y)
    # print(...) with one argument emits the same text on Python 2 and 3.
    print('ensemble1 fitted.')

    eclf2 = EnsembleVoteClassifier(clfs=[model1, model2, model3],
                                   weights=weight_base,
                                   voting='soft',
                                   refit=True)
    # Read the labels inside a with-block so the file handle is closed;
    # int() already ignores the trailing newline of each line.
    with open('train_y_us_smote', 'r') as label_file:
        resampled_labels = [int(line) for line in label_file]
    eclf2.fit(scipy.sparse.load_npz('train_x_dtm_us_smote.npz'), resampled_labels)
    print('ensemble2 fitted.')

    eclf3 = EnsembleVoteClassifier(clfs=[eclf1, eclf2],
                                   weights=weight_meta,
                                   voting='soft',
                                   refit=False)
    apply_model(eclf3)
# Grid search over an ensemble whose first member is a
# feature-selection + logistic-regression pipeline.
# NOTE(review): the original line began with "#"; the imports are kept
# live because Pipeline and SequentialFeatureSelector are used below.
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import SequentialFeatureSelector

sfs1 = SequentialFeatureSelector(clf1,
                                 k_features=4,
                                 floating=False,
                                 scoring='accuracy',
                                 print_progress=False,
                                 cv=0)
clf1_pipe = Pipeline([('sfs', sfs1), ('logreg', clf1)])
eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3], voting='soft')

params = {'pipeline__sfs__k_features': [1, 2, 3],
          #'pipeline__logreg__C': [1,0, 100.0],
          'randomforestclassifier__n_estimators': [20, 200]}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid.fit(iris.data, iris.target)

# Note: this loop rebinds the name ``params``, as the original did.
for params, mean_score, scores in grid.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

# Single-argument print(...) emits the same text on Python 2 and 3.
print(grid.best_params_)
eclf = eclf.set_params(**grid.best_params_)
print(eclf.fit(X, y).predict(X[[1, 51, 149]]))
logging.info(f'Training {classifier_name}...') clf.fit(X_train, y_train) score = balanced_accuracy_score(y_test, clf.predict(X_test)) logging.info(f'{classifier_name} BAC = {score:.4f}') probabilities = clf.predict_proba(X_test) np.save(PROBABILITIES_PATH / f'{classifier_name}.cv.{args.fold}.npy', probabilities) results.append([classifier_name, score]) ensemble = EnsembleVoteClassifier(list(classifiers.values()), voting='soft', fit_base_estimators=False) ensemble.fit(X_train, y_train) score = balanced_accuracy_score(y_test, ensemble.predict(X_test)) logging.info(f'Ensemble BAC = {score:.4f}') results.append(['Ensemble', score]) with open(MODELS_PATH / f'ensemble.cv.{args.fold}.pickle', 'wb') as f: pickle.dump(ensemble, f) df = pd.DataFrame(results, columns=['Classifier', 'BAC']) df.to_csv(RESULTS_PATH / f'{args.fold}.csv', index=False)
pd.concat([x_valid_0, x_valid_1, x_valid_2, x_valid_3, x_valid_4], axis=0)) y_valid = pd.DataFrame() y_valid['target'] = x_valid['target'] x_valid.drop('target', axis=1, inplace=True) x_train_0 = pd.DataFrame(X[X['target'] == 0][:90]) x_train_1 = pd.DataFrame((X[X['target'] == 1][:900])) x_train_2 = pd.DataFrame(X[X['target'] == 2][:1300]) x_train_3 = pd.DataFrame(X[X['target'] == 3][:420]) x_train_4 = pd.DataFrame(X[X['target'] == 4][:90]) x_train = pd.DataFrame( pd.concat([x_train_0, x_train_1, x_train_2, x_train_3, x_train_4], axis=0)) y_train = pd.DataFrame() y_train['target'] = x_train['target'] x_train.drop('target', axis=1, inplace=True) eclf.fit(x_train[best_columns], y_train['target']) preds = eclf.predict(x_valid[best_columns]) print('Confusion matrix:\n') print(confusion_matrix(y_valid['target'].values, preds)) matrix_ = confusion_matrix(y_valid['target'].values, preds) correct_answers = matrix_[0][0] + matrix_[1][1] + matrix_[2][2] + matrix_[3][ 3] + matrix_[4][4] print('Correct answers count: ', correct_answers) # --- answer module --- eclf.fit(X[best_columns], Y['target']) score_dataset = pd.read_csv('original_data/x_test.csv', delimiter=';', names=names) y_pred = eclf.predict(score_dataset[best_columns]) pd.Series(y_pred).to_csv('data/answer.csv', index=False)
# Hold out 10% of the data, vectorise the text with TF-IDF and train a
# weighted soft-voting ensemble of five classifiers.
x_train, x_test, y_train, y_test = train_test_split(dataset_api, dataset_label, test_size=0.1)
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
clf4 = MultinomialNB(alpha=.01)
clf5 = xgb.XGBClassifier()
eclif = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4, clf5],
                               weights=[2, 4, 2, 4, 7],
                               voting='soft')
eclif.fit(train_vectors, y_train)

# Weighted F1 on the hold-out split, printed and written to ``f_1_f``.
pred = eclif.predict(test_vectors)
f_1 = sklearn.metrics.f1_score(y_test, pred, average='weighted')
# Single-argument print(...) emits the same text on Python 2 and 3.
print("f_1 is " + str(f_1))
with open(f_1_f, "w") as f:
    f.write("f_1 is " + str(f_1))

# Pipeline that maps raw text straight to ensemble predictions.
c = make_pipeline(vectorizer, eclif)
nb_success = 0
nb_fail = 0
result_list = []
clf_DT = DecisionTreeClassifier() #clf_MNB= MNB() eclf = EnsembleVoteClassifier(clfs=[clf_RF, clf_ET, clf_svc, clf_DT], weights=[1, 1, 1, 1]) labels = [ 'Random Forest', 'Extra Trees', 'Support Vector', 'Decision Tree', 'Ensemble Vote' ] for clf, label in zip([clf_RF, clf_ET, clf_svc, clf_DT, eclf], labels): scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy') print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) eclf.fit(X_train, y_train) confidence = eclf.score(X_test, y_test) print(confidence) example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]]) example_measures = example_measures.reshape(len(example_measures), -1) prediction = eclf.predict(example_measures) print(prediction) col_dict = dict(list(enumerate(df.columns))) col_dict X = np.array(df.drop(['class'], 1), dtype=np.float64) y = np.array(df['class'], dtype=np.int64) plot_decision_regions( X=X,
#cl4=XGBClassifier() clf4 = GradientBoostingClassifier() print('10-fold cross validation:\n') #np.random.seed(123) eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4], weights=[1, 1, 1, 1], voting='soft') #from sklearn.model_selection import ShuffleSplit #for clf, label in zip([clf1, clf2, clf3], ['Logistic Regression', 'Random Forest', 'SVM'] #for clf, label in zip([clf1, clf3, cl4,eclf], ['Logistic Regression','RandomForest','SVM','Xgboost','Voting Ensemble']): # scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy') # print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean()*100, scores.std(), label)) eclf.fit(X_train, Y_train) y_pred = eclf.predict(X_test) print(accuracy_score(Y_test, y_pred) * 100) X = np.concatenate((X_train, X_test), 0) Y = np.concatenate((Y_train, Y_test), 0) # cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0) # scores=cross_val_score(clf, X, y, cv=cv) # print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) # accuracies=cross_val_score(estimator=clf,X=X,y=Y,cv=10) # print(accuracies.mean()*100,accuracies.std()*100) # print("Accuracy: %0.4f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) Mamun_confusion_matrix = confusion_matrix(Y_test, y_pred, labels=[1, 2, 3, 4, 5, 6, 12, 13])
n_estimators=100) # current best clf6 = ExtraTreesClassifier(max_features=0.45, min_samples_leaf=1, min_samples_split=5, n_estimators=100) eclf = EnsembleVoteClassifier(clfs=[clf3, clf4, clf5, clf6], weights=[1, 1, 1, 1], voting='soft') labels = ['Trees_3', 'Trees_4', 'Trees_5', 'Trees_6', 'Ensemble'] for clf, label in zip([clf3, clf4, clf5, clf6, eclf], labels): scores = model_selection.cross_val_score(clf, X[best_columns], Y['target'], cv=4, scoring='neg_log_loss') print("Log Loss: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label)) # --- answer module --- eclf.fit(X[best_columns], Y['target']) score_dataset = pd.read_csv('original_data/x_test.csv', delimiter=';', names=names) y_pred = eclf.predict(score_dataset[best_columns]) pd.Series(y_pred).to_csv('data/answer.csv', index=False)
'nthread': 4, 'silent': 1, 'subsample': 0.6, 'reg_lambda': 0.89, 'gamma': 0.1, 'min_child_weight': 49.8, 'colsample_bytree': 0.8, 'n_estimators': 2790, } clf_2 = xgb.XGBClassifier(**clf_2_params) clf_3_params = { 'learning_rate': 0.0065, 'max_depth': 5, 'nthread': 4, 'silent': 1, 'subsample': 0.621, 'reg_lambda': 0.726, 'gamma': 0.053, 'min_child_weight': 30.8, 'colsample_bytree': 0.905, 'n_estimators': 958, } clf_3 = xgb.XGBClassifier(**clf_3_params) pipeline = EnsembleVoteClassifier(clfs=[clf_0, clf_1, clf_2, clf_3], weights=[1, 1, 1, 1], voting='soft') pipeline.fit(train, Y) y_pred = pipeline.predict_proba(test[test.columns]) pd.Series(y_pred[:, 1]).to_csv('answer.csv', index=False)
clf5_pipe, clf5_avg_f1 = set_pipe(clf5, avg_feats, 'c45_')
list_of_cv_acc.append(clf5_avg_f1)

# In[50]:

clf6_pipe, clf6_avg_f1 = set_pipe(clf6, mi_feats, 'knn_')
list_of_cv_acc.append(clf6_avg_f1)

# In[51]:

# Combine the six pipelines; refit=False means fit() does not retrain them.
enclf = EnsembleVoteClassifier(
    (clf1_pipe, clf2_pipe, clf3_pipe, clf4_pipe, clf5_pipe, clf6_pipe),
    refit=False)
enclf.fit(X_train, y_train)
y_pred = enclf.predict(X_test)
con_mat = confusion_matrix(y_test, y_pred)
#print("Cross Val acc score: ", (model_selection.cross_val_score(enclf, X_train, y_train, cv = 5,)).mean())
#print("Cross Val f1 score: ", (model_selection.cross_val_score(enclf, X_train, y_train, cv = 5, scoring = 'f1')).mean())
print()
print("Overall Acc score: ", accuracy_score(y_test, y_pred))
print("Recall score (Tru Pos Rate): ", recall_score(y_test, y_pred))
print("Precision score: ", precision_score(y_test, y_pred))
# sklearn's confusion matrix has true classes as rows and predictions as
# columns, so for a binary problem with the negative class first:
# con_mat[0][0] = TN, con_mat[0][1] = FP, con_mat[1][0] = FN.
# NPV = TN / (TN + FN); specificity = TN / (TN + FP). The two formulas
# were swapped before.
print("Neg Predictive Val: ", con_mat[0][0] / (con_mat[1][0] + con_mat[0][0]))
print("Tru Neg Rate(Specifi): ", con_mat[0][0] / (con_mat[0][1] + con_mat[0][0]))
print("F1 score: ", f1_score(y_test, y_pred))
print("Auc score: ", roc_auc_score(y_test, y_pred))
print(con_mat)
print()
#print(X_train_counts.toarray()[0]) tfidf_transformer = TfidfTransformer(use_idf=True) X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) feature_names = count_vect.get_feature_names() ch2 = SelectKBest(chi2, k=1500) X_train = ch2.fit_transform(X_train_tfidf, newsgroups_train.target) selected_feature_names = [ feature_names[i] for i in ch2.get_support(indices=True) ] #clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.3,max_depth=3, random_state=0) clf1 = MultinomialNB(alpha=0.1) #clf2 = svm.LinearSVC(max_iter = 2000,probability=True,random_state=0) clf2 = SVC(kernel='linear', probability=True) #clf3 = SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet") clf = EnsembleVoteClassifier(clfs=[clf1, clf2], weights=[2, 1], voting='soft') clf.fit(X_train, newsgroups_train.target) #pred_t = clf.predict(X_train) #print(metrics.precision_score(newsgroups_train.target, pred_t, average='macro')) vectors_test2 = count_vect.transform(newsgroups_test.data) vectors_test = tfidf_transformer.transform(vectors_test2) X_test = ch2.transform(vectors_test) pred = clf.predict(X_test) print(metrics.precision_score(newsgroups_test.target, pred, average='macro'))