def test_sample_weight():
    """Exercise the ``sample_weight`` argument of ``VotingClassifier.fit``."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)

    # Fitting with all-ones weights must match fitting without any weights.
    members = [('lr', clf1), ('rf', clf2), ('svc', clf3)]
    weighted = VotingClassifier(estimators=list(members), voting='soft')
    weighted.fit(X, y, sample_weight=np.ones((len(y),)))
    unweighted = VotingClassifier(estimators=list(members), voting='soft')
    unweighted.fit(X, y)
    assert_array_equal(weighted.predict(X), unweighted.predict(X))
    assert_array_equal(weighted.predict_proba(X),
                       unweighted.predict_proba(X))

    # A single-member ensemble must agree with that member fitted directly
    # with the same (positional) sample weights.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    solo = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    solo.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(solo.predict(X), clf1.predict(X))
    assert_array_equal(solo.predict_proba(X), clf1.predict_proba(X))

    # The expected failure message names the 'knn' member.
    clf4 = KNeighborsClassifier()
    with_knn = VotingClassifier(
        estimators=[('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ('Underlying estimator \'knn\' does not support sample weights.')
    assert_raise_message(ValueError, msg, with_knn.fit, X, y, sample_weight)
def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset.

    Each base classifier, and both a hard-voting and a soft-voting ensemble
    of the three with equal weights, is fitted on six samples and must
    reproduce the training labels exactly.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2],
                  [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]])
    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    # Compare the full label sequences.  Wrapping both sides in ``all()``
    # reduces them to ``True == True`` (every label is non-zero), which
    # passes regardless of what the classifier predicts.
    assert_equal(clf1.fit(X, y).predict(X).tolist(), expected)
    assert_equal(clf2.fit(X, y).predict(X).tolist(), expected)
    assert_equal(clf3.fit(X, y).predict(X).tolist(), expected)

    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='hard',
        weights=[1, 1, 1])
    assert_equal(eclf.fit(X, y).predict(X).tolist(), expected)

    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        weights=[1, 1, 1])
    assert_equal(eclf.fit(X, y).predict(X).tolist(), expected)
def voting_fit(X, y, RESULT_TEST_PATH,RESULT_PATH):
    """Fit a soft-voting ensemble on (X, y) and write test predictions to CSV.

    The base models come from the ``fit_*`` helpers.  The CSV at
    ``RESULT_TEST_PATH`` is read, every column is fed to the ensemble, and a
    two-column frame (``PassengerId``, ``Survived``) is written to
    ``RESULT_PATH`` without the index.
    """
    adaboost = fit_adaboost(X, y)
    extra_trees = fit_extratree(X, y)
    forest = fit_rf(X, y)
    boosted = fit_xgboost(X, y)
    # NOTE(review): the SVC is fitted here but is not one of the ensemble
    # members below -- confirm whether that is intentional.
    svc_best = fit_svc(X, y)
    logreg = fit_lr(X, y)

    members = [
        ('rfc', forest),
        ('extc', extra_trees),
        ('lr', logreg),
        ('adac', adaboost),
        ('gbc', boosted),
    ]
    ensemble = VotingClassifier(estimators=members, voting='soft', n_jobs=4)
    ensemble.fit(X, y)

    frame = pd.read_csv(RESULT_TEST_PATH)
    predictions = ensemble.predict(np.array(frame))

    # Append the predictions as the last column, then keep only the two
    # columns that are written out.
    frame.insert(frame.columns.size, 'Survived', predictions)
    frame = frame[['PassengerId', 'Survived']]
    frame['PassengerId'] = frame['PassengerId'].apply(np.int64)
    frame.to_csv(RESULT_PATH, index=False)
    print("finish!")
def main(directory, tools_directory, non_tools_dir):
    """Classify the publications in ``directory`` and sort them into two folders.

    A three-member ``VotingClassifier`` built from ``classifier_list`` is
    fitted on the data returned by ``fetch_from_file`` and applied to the
    test set returned by ``get_test_set``.  Both output directories are
    deleted and recreated; every file with a truthy prediction is copied to
    ``tools_directory`` and every other file to ``non_tools_dir``.

    Exits with status 1 when ``directory`` is ``None`` or not a directory.
    Sets the module-level ``path`` to ``sys.path[0]`` as a side effect.
    """
    global path
    path = sys.path[0]
    start = time.time()
    if directory is None or not os.path.isdir(directory):
        print("Please input directory containing pdf publications to classify")
        sys.exit(1)

    x_train, y_train = fetch_from_file()
    x_test, test_files = get_test_set(directory)

    # Just for testing, update machine learning part later
    x_train, x_test = normalize_scale(x_train, x_test)

    # Every member needs its own name: duplicate names are rejected by
    # current scikit-learn and make per-estimator parameters ambiguous.
    classifier = VotingClassifier(
        [("first", classifier_list[0]),
         ("second", classifier_list[1]),
         ("third", classifier_list[2])]
    )
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Start from empty output directories.
    for out_dir in (tools_directory, non_tools_dir):
        if os.path.isdir(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)

    # NOTE(review): plain string concatenation is kept; it presumably relies
    # on the directory arguments ending with a path separator -- confirm.
    for num, pub in zip(y_pred, test_files):
        target_dir = tools_directory if num else non_tools_dir
        shutil.copy2(directory + pub, target_dir + pub)

    print("Classification: Seconds taken: " + str(time.time() - start))
def process_cell(self, df_cell_train, df_cell_test, window):
    """Return the three most probable ``place_id`` values per test row.

    Training rows whose ``place_id`` occurs fewer than ``th`` times are
    dropped, a soft-voting ensemble (KNN + random forest) is fitted on the
    rest, and the labels are ranked by predicted probability.  ``window`` is
    accepted but not used here.

    Returns ``(pred_labels, row_ids)`` where ``row_ids`` is the index of
    ``df_cell_test``.
    """
    # Keep only the places that are frequent enough in this cell.
    occurrences = df_cell_train.place_id.value_counts()
    frequent = (occurrences[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[frequent]

    row_ids = df_cell_test.index

    # Encode the labels and cast the features to integer arrays.
    encoder = LabelEncoder()
    y = encoder.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', ], axis=1).values.astype(int)
    X_test = df_cell_test.values.astype(int)

    knn = KNeighborsClassifier(n_neighbors=50, weights='distance',
                               metric='manhattan')
    forest = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    voter = VotingClassifier(estimators=[('knn', knn), ('rf', forest)],
                             voting='soft')
    voter.fit(X, y)
    proba = voter.predict_proba(X_test)

    # Class indices sorted by descending probability, truncated to three.
    top_three = np.argsort(proba, axis=1)[:, ::-1][:, :3]
    pred_labels = encoder.inverse_transform(top_three)
    return pred_labels, row_ids
def test_set_params():
    """Estimators and their parameters can be replaced through set_params."""
    logreg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    first = VotingClassifier([('lr', logreg), ('rf', forest)],
                             voting='soft', weights=[1, 2])

    # Before fitting: access by name and by attribute give the same object.
    assert_true('lr' in first.named_estimators)
    assert_true(first.named_estimators.lr is first.estimators[0][1])
    assert_true(first.named_estimators.lr is first.named_estimators['lr'])

    # After fitting: the same holds for the fitted counterparts.
    first.fit(X, y)
    assert_true('lr' in first.named_estimators_)
    assert_true(first.named_estimators_.lr is first.estimators_[0])
    assert_true(first.named_estimators_.lr is first.named_estimators_['lr'])

    # Swap the 'nb' member for the forest; the ensemble must then behave
    # like the first one.
    second = VotingClassifier([('lr', logreg), ('nb', bayes)],
                              voting='soft', weights=[1, 2])
    second.set_params(nb=forest).fit(X, y)
    assert_false(hasattr(second, 'nb'))

    assert_array_equal(first.predict(X), second.predict(X))
    assert_array_almost_equal(first.predict_proba(X),
                              second.predict_proba(X))
    assert_equal(second.estimators[0][1].get_params(), logreg.get_params())
    assert_equal(second.estimators[1][1].get_params(), forest.get_params())

    # Nested parameters are reachable with the <name>__<param> syntax.
    first.set_params(lr__C=10.0)
    second.set_params(nb__max_depth=5)

    assert_true(first.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(second.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(first.get_params()["lr__C"],
                 first.get_params()["lr"].get_params()['C'])
def test_estimator_weights_format():
    # Weights given as a list and as an ndarray must yield the same
    # predicted probabilities.
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)

    list_weighted = VotingClassifier(
        estimators=[("lr", logreg), ("rf", forest)],
        weights=[1, 2],
        voting="soft")
    array_weighted = VotingClassifier(
        estimators=[("lr", logreg), ("rf", forest)],
        weights=np.array((1, 2)),
        voting="soft")

    list_weighted.fit(X, y)
    array_weighted.fit(X, y)
    assert_array_equal(list_weighted.predict_proba(X),
                       array_weighted.predict_proba(X))
def process_one_cell(df_train, df_test, x_min, x_max, y_min, y_max):
    """Predict the three most probable ``place_id`` values for one grid cell.

    Training rows are taken from the cell enlarged by a fixed border on each
    side; test rows are taken from the exact cell.  Places with fewer than
    ``th`` training rows are discarded, ``x`` and ``y`` are scaled by
    ``fw[0]`` and ``fw[1]``, and a soft-voting ensemble (bagged KNN + random
    forest) ranks the labels by predicted probability.

    Returns ``(pred_labels, row_ids)``, or ``(None, None)`` when either the
    training or the test selection is empty.
    """
    x_border_augment = 0.025
    y_border_augment = 0.0125

    # Working on df_train: cell plus border, then drop the rare places.
    df_cell_train = df_train[(df_train['x'] >= x_min - x_border_augment) &
                             (df_train['x'] < x_max + x_border_augment) &
                             (df_train['y'] >= y_min - y_border_augment) &
                             (df_train['y'] < y_max + y_border_augment)]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    # Explicit copy: the frame is modified in place below, and the caller's
    # frame must stay untouched.
    df_cell_train = df_cell_train.loc[mask].copy()

    # Working on df_test: exact cell only.
    df_cell_test = df_test[(df_test['x'] >= x_min) &
                           (df_test['x'] < x_max) &
                           (df_test['y'] >= y_min) &
                           (df_test['y'] < y_max)].copy()
    row_ids = df_cell_test.index

    if len(df_cell_train) == 0 or len(df_cell_test) == 0:
        return None, None

    # Feature engineering on x and y
    df_cell_train.loc[:, 'x'] *= fw[0]
    df_cell_train.loc[:, 'y'] *= fw[1]
    df_cell_test.loc[:, 'x'] *= fw[0]
    df_cell_test.loc[:, 'y'] *= fw[1]

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values.astype(float)

    if 'place_id' in df_cell_test.columns:
        cols = df_cell_test.columns.drop('place_id')
        X_test = df_cell_test[cols].values.astype(float)
    else:
        X_test = df_cell_test.values.astype(float)

    # Applying the classifier
    clf1 = BaggingClassifier(
        KNeighborsClassifier(n_neighbors=26, weights='distance',
                             metric='manhattan'),
        n_jobs=-1, n_estimators=50)
    clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    # Soft voting is required: predict_proba is not available on a
    # hard-voting ensemble and raises AttributeError.
    eclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)],
                            voting='soft')
    eclf.fit(X, y)

    y_pred = eclf.predict_proba(X_test)
    # Class indices by descending probability, top three per row.
    pred_labels = le.inverse_transform(
        np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
def classify(): train_X,Y = load_svmlight_file('data/train_last') test_X,test_Y = load_svmlight_file('data/test_last') train_X = train_X.toarray() test_X = test_X.toarray() Y = [int(y) for y in Y] # print 'Y:',len(Y) rows = pd.read_csv('data/log_test2.csv',index_col=0).sort_index().index.unique() train_n = train_X.shape[0] m = train_X.shape[1] test_n = test_X.shape[0] print train_n,m,#test_n # 先用训练集训练出所有的分类器 print 'train classify...' clf1 = LinearDiscriminantAnalysis() clf2 = GaussianNB() clf3 = LogisticRegression() clf4 = RandomForestClassifier() clf5 = KNeighborsClassifier(n_neighbors=12) clf6 = AdaBoostClassifier() # x_train,x_test,y_train,y_test = train_test_split(train_X,Y,test_size=0.2) # 对训练集进行划分 # print x_train.shape # print x_test.shape # clf.fit(train_X,Y) clf = VotingClassifier(estimators=[('la',clf1),('nb',clf2),('lr',clf3),('rf',clf4),('nn',clf5),('ac',clf6)], voting='soft', weights=[1.5,1,1,1,1,1]) # clf1.fit(x_train,y_train) # clf2.fit(x_train,y_train) # clf3.fit(x_train,y_train) # clf4.fit(x_train,y_train) clf.fit(train_X,Y) print 'end train classify' print 'start classify....' # print metrics.classification_report(Y,predict_Y) # clf2.fit(train_X,Y) # print 'clf2 fited...' # clf3.fit(train_X,Y) # print 'clf3 fited...' # clf4.fit(train_X,Y) # print 'clf4 fited...' # clf1.fit(train_X,Y) # print 'clf1 fited...' # 第一个分类结果 predict_Y = clf.predict(train_X) # predict_Y = clf.predict(train_X) print 'classify result:' print metrics.classification_report(Y,predict_Y) predict_Y = clf.predict(test_X) # print predict_Y,len(predict_Y) print 'end classify...' # predict_Y = clf.predict(X[cnt_train:]) # 训练注释这一行,输出测试集打开这一行,注释之后的print metric # predict_Y = clf.predict(test_X) # 训练注释这一行,输出测试集打开这一行,注释之后的print metric DataFrame(predict_Y,index=rows).to_csv('data/info_test2.csv', header=False)
def test_multilabel():
    """Fit on multilabel targets, tolerating NotImplementedError.

    The test passes both when ``fit`` succeeds and when it raises
    ``NotImplementedError``; any other exception propagates.
    """
    X, y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=False,
                                          random_state=123)
    one_vs_rest = OneVsRestClassifier(SVC(kernel="linear"))
    voter = VotingClassifier(estimators=[("ovr", one_vs_rest)],
                             voting="hard")

    try:
        voter.fit(X, y)
    except NotImplementedError:
        pass
def test_predict_for_hard_voting():
    # Hard voting must cope with a member that predicts non-integer (float)
    # labels: fitting and predicting should simply not raise.
    members = [
        ('fsvc', FaultySVC(random_state=123)),
        ('gnb', GaussianNB()),
        ('svc', SVC(probability=True, random_state=123)),
    ]
    voter = VotingClassifier(estimators=members, weights=[1, 2, 3],
                             voting='hard')

    voter.fit(X, y)
    voter.predict(X)
def train(self):
    """Fit one VotingClassifier per xy bin and dump each one to disk.

    For every bin id in ``self.xy_bins`` (ascending), the rows of
    ``self.df`` belonging to that bin are used to fit a fresh ensemble built
    from ``self.models``.  The fitted model is written with ``joblib`` to the
    path produced by ``xybins_file_name_str.format(bin_id)``.
    """
    for bin_id in sorted(self.xy_bins):
        file_name = xybins_file_name_str.format(bin_id)
        print('Training model: {} of {}'.format(bin_id, max(self.xy_bins)))

        rows_in_bin = self.df[self.df.xy_bin == bin_id]
        features = rows_in_bin[self.features]
        targets = rows_in_bin.place_id

        voter = VotingClassifier(self.models)
        voter.fit(features, targets)
        joblib.dump(voter, file_name, compress=3)
def test_sample_weight_kwargs():
    """VotingClassifier must forward sample_weight as a keyword argument."""

    class MockClassifier(BaseEstimator, ClassifierMixin):
        """Stub whose fit only verifies that sample_weight arrived by keyword."""

        def fit(self, X, y, *args, **sample_weight):
            assert_true('sample_weight' in sample_weight)

    voter = VotingClassifier(estimators=[('mock', MockClassifier())],
                             voting='soft')

    # Should not raise an error.
    unit_weights = np.ones((len(y),))
    voter.fit(X, y, sample_weight=unit_weights)
def main(path,filename):
    """Train a large voting ensemble on the feature batches and pickle it.

    Labels come from the ``'clases'`` batch (flattened by one level) and the
    features from every batch listed in ``batchs``; both are loaded with
    ``load_batch``, which fills the list passed to it.  The fitted ensemble
    is written to ``clf_grande.p`` in the current working directory.

    Returns ``None``.

    NOTE(review): the hold-out and cross-validation code that used to follow
    the first unconditional ``return`` could never run and has been dropped.
    """
    batchs = ['histogramaByN', 'histogramaColor', 'patronesCirculaesByN_2_5',
              'patrones2x2ByN', 'patrones3x3ByN', 'patronesCirculaesByN_2_9',
              'patronesCirculaesByN_3_9', 'patronesCirculaesByN_5_9',
              'patronesCirculaesByN_3_5', 'patronesCirculaesByN_6_12',
              'patronesCirculaesByN_8_12']

    X = []
    y = []
    load_batch(y, path, 'clases', filename)
    y = [j for i in y for j in i]
    for batch in batchs:
        load_batch(X, path, batch, filename)

    # presumably each factory returns a (name, estimator) pair, as
    # VotingClassifier expects -- verify against the helper definitions.
    est = [RandomForest(), Boosting()]
    for i in xrange(0, 15):
        est.append(Gradient(i))
    for i in xrange(0, 4):
        est.append(SVM(i))

    clf = VotingClassifier(estimators=est)
    clf.fit(X, y)

    # Close the file deterministically so the pickle is fully flushed.
    with open("clf_grande.p", "wb") as handle:
        pickle.dump(clf, handle)
    return
def train_classifier(algorithm, features, train):
    """Fit a soft-voting ensemble of the models named in ``algorithm``.

    A member is added for each of the keys ``'rf'``, ``'lr'`` and ``'mb'``
    that is contained in ``algorithm`` (membership test with ``in``).  The
    ensemble is fitted on ``features`` against ``train['sentiment']`` and
    returned.
    """
    print('Train classifier ({})...'.format(algorithm))

    # Key -> factory, in the order the members are added to the ensemble.
    factories = (
        ('rf', lambda: RandomForestClassifier(n_estimators=100)),
        ('lr', LogisticRegression),
        ('mb', MultinomialNB),
    )
    estimators = [(key, build()) for key, build in factories
                  if key in algorithm]

    # Training
    classifier = VotingClassifier(estimators=estimators, voting='soft')
    classifier.fit(features, train['sentiment'])
    return classifier
def voting_class(X,training_target,Y):
    """Fit a soft-voting ensemble on ``X`` and classify the rows of ``Y``.

    Only the first six columns of ``X`` and ``Y`` are used as features.
    The ensemble combines logistic regression, a random forest and Gaussian
    naive Bayes.

    Returns ``(labels, proba)``: the predicted class labels and the
    predicted class probabilities for ``Y``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(
        estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft')
    eclf.fit(X[:, 0:6], training_target)

    test_features = Y[:, 0:6]
    proba = eclf.predict_proba(test_features)
    # predict() needs the samples to classify; calling it without arguments
    # raises TypeError.
    labels = eclf.predict(test_features)
    return labels, proba
def test_predict_proba_on_toy_problem():
    """Compare soft-voting probabilities with hand-computed weighted means."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    # Reference probabilities of the three members.
    clf1_res = np.array([[0.59790391, 0.40209609],
                         [0.57622162, 0.42377838],
                         [0.50728456, 0.49271544],
                         [0.40241774, 0.59758226]])

    clf2_res = np.array([[0.8, 0.2],
                         [0.8, 0.2],
                         [0.2, 0.8],
                         [0.3, 0.7]])

    clf3_res = np.array([[0.9985082, 0.0014918],
                         [0.99845843, 0.00154157],
                         [0., 1.],
                         [0., 1.]])

    def expected(row, col):
        # Weighted mean with weights 2, 1, 1.
        return (2*clf1_res[row][col] + clf2_res[row][col] +
                clf3_res[row][col]) / 4

    t00 = expected(0, 0)
    t11 = expected(1, 1)
    t21 = expected(2, 1)
    t31 = expected(3, 1)

    soft_voter = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        weights=[2, 1, 1])
    eclf_res = soft_voter.fit(X, y).predict_proba(X)

    assert_almost_equal(t00, eclf_res[0][0], decimal=1)
    assert_almost_equal(t11, eclf_res[1][1], decimal=1)
    assert_almost_equal(t21, eclf_res[2][1], decimal=1)
    assert_almost_equal(t31, eclf_res[3][1], decimal=1)

    # With hard voting the whole construct/fit/predict_proba sequence is
    # expected to end in AttributeError.
    attribute_error_raised = False
    try:
        hard_voter = VotingClassifier(estimators=[
            ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
            voting='hard')
        hard_voter.fit(X, y).predict_proba(X)
    except AttributeError:
        attribute_error_raised = True

    if not attribute_error_raised:
        raise AssertionError('AttributeError for voting == "hard"'
                             ' and with predict_proba not raised')
def run_voting(training_set, train_set_labels, validation_set, validation_set_labels):
    """Fit a hard-voting ensemble and print its validation accuracy.

    Both input sets are passed through ``standard_data`` first.  The
    ensemble combines KNN, logistic regression and an SVC; labels are
    flattened with ``ravel()`` before use.  Nothing is returned.
    """
    from sklearn.ensemble import VotingClassifier

    train_inputs = standard_data(training_set)
    valid_inputs = standard_data(validation_set)

    knn = KNeighborsClassifier(weights='uniform', n_neighbors=5)
    logreg = sklearn.linear_model.LogisticRegression(
        penalty='l2', dual=False, tol=0.01, C=1.0, fit_intercept=True,
        intercept_scaling=1, class_weight=None, random_state=None,
        solver='newton-cg', max_iter=100, multi_class='ovr', verbose=0,
        warm_start=False, n_jobs=2)
    support_vectors = svm.SVC(decision_function_shape='ovo', tol=0.001)

    members = [('knn', knn), ('lr', logreg), ('svm', support_vectors)]
    voter = VotingClassifier(estimators=members, voting='hard')
    voter.fit(train_inputs, train_set_labels.ravel())

    accuracy = voter.score(valid_inputs, validation_set_labels.ravel())
    print(accuracy)
def acc_VotingClassifier():
    """Cross-validate a weighted hard-voting ensemble on the global ``X``/``y``.

    Runs the 10 folds produced by ``KFold(900, n_folds=10, shuffle=True)``,
    accumulating per-fold accuracy and a 10x10 confusion matrix.  After each
    fold the matching global validation set is predicted, chosen by the
    feature count of ``X``: 13 -> ``validation_set_mfcc``, 1000 ->
    ``validation_set_fft``, 100 -> ``validation_set_kpca``.

    Returns ``(mean_accuracy, accuracy_from_confusion, valid_mfcc,
    valid_fft, valid_kpca)``.  Validation predictions whose branch did not
    run are ``None``.
    """
    kf = KFold(900, n_folds=10,shuffle=True)
    acc = 0.0
    temp = 1
    conf_mat = [[0 for i in range(10)] for j in range(10)]

    # Only one of these is filled, depending on the feature count; the others
    # must still exist for the return statement.
    valid_mfcc = valid_fft = valid_kpca = None

    clf1 = GaussianNB()
    clf2 = RandomForestClassifier(n_estimators=20, max_features=None,
                                  class_weight="balanced_subsample")
    clf4 = LogisticRegression()
    eclf = VotingClassifier(
        estimators=[('gnb', clf1), ('rf', clf2), ('lr', clf4)],
        voting='hard', weights=[1,3,3])

    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        eclf = eclf.fit(X_train, y_train)
        y_predict = eclf.predict(X_test)
        acc_loop = getAccuracy(y_predict, y_test)
        conf_mat = buildConfusionMatrix(conf_mat, y_predict, y_test)
        print("*** Accuracy*** for "+str(temp)+"th time: "+str(acc_loop))
        acc += acc_loop
        temp += 1

        # Checking if the data set is transformed into MFCC(13) or FFT(1000)
        # or KPCA(100) features.  Each fold overwrites the previous result.
        if X.shape[1] == 13:
            print('In 13 features if')
            valid_mfcc = eclf.predict(validation_set_mfcc)
        elif X.shape[1] == 1000:
            print('In 1000 features elif')
            valid_fft = eclf.predict(validation_set_fft)
        elif X.shape[1] == 100:
            print('In KPCA features else')
            valid_kpca = eclf.predict(validation_set_kpca)

    acc = (acc/10.0)
    printConfusionMatrix(conf_mat)
    return acc, getAccuracyFromConfusion(conf_mat), valid_mfcc, valid_fft, valid_kpca
def do_ml(ticker):
    """Train a voting ensemble for ``ticker`` and return its test accuracy.

    Feature sets come from ``extract_featuresets``; 25% of the samples are
    held out for scoring.  The accuracy and the distribution of predicted
    labels on the held-out part are printed.
    """
    X, y, _ = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25)

    members = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    voter = VotingClassifier(members)
    voter.fit(X_train, y_train)

    confidence = voter.score(X_test, y_test)
    print('Accuracy', confidence)

    predicted = voter.predict(X_test)
    print('Predicted spread:', Counter(predicted))

    return confidence
def test_tie_situation():
    """On a tie, hard voting must pick the smaller class label."""
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(random_state=123)
    voter = VotingClassifier(estimators=[("lr", logreg), ("rf", forest)],
                             voting="hard")

    # Sample 73 is where the two members are expected to disagree.
    tied = 73
    assert_equal(logreg.fit(X, y).predict(X)[tied], 2)
    assert_equal(forest.fit(X, y).predict(X)[tied], 1)
    assert_equal(voter.fit(X, y).predict(X)[tied], 1)
def test_tie_situation():
    """On a tie, hard voting must pick the smaller class label."""
    logreg = LogisticRegression(random_state=123, solver='liblinear')
    forest = RandomForestClassifier(random_state=123)
    voter = VotingClassifier(estimators=[('lr', logreg), ('rf', forest)],
                             voting='hard')

    # Sample 73 is where the two members are expected to disagree.
    tied = 73
    assert_equal(logreg.fit(X, y).predict(X)[tied], 2)
    assert_equal(forest.fit(X, y).predict(X)[tied], 1)
    assert_equal(voter.fit(X, y).predict(X)[tied], 1)
def test_sample_weight():
    """Exercise the ``sample_weight`` argument of ``VotingClassifier.fit``."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)

    # Fitting with all-ones weights must match fitting without any weights.
    members = [('lr', clf1), ('rf', clf2), ('svc', clf3)]
    weighted = VotingClassifier(estimators=list(members), voting='soft')
    weighted.fit(X, y, sample_weight=np.ones((len(y),)))
    unweighted = VotingClassifier(estimators=list(members), voting='soft')
    unweighted.fit(X, y)
    assert_array_equal(weighted.predict(X), unweighted.predict(X))
    assert_array_almost_equal(weighted.predict_proba(X),
                              unweighted.predict_proba(X))

    # A single-member ensemble must agree with that member fitted directly
    # with the same (positional) sample weights.
    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    solo = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    solo.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(solo.predict(X), clf1.predict(X))
    assert_array_almost_equal(solo.predict_proba(X), clf1.predict_proba(X))

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    clf4 = KNeighborsClassifier()
    with_knn = VotingClassifier(
        estimators=[('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = ('Underlying estimator KNeighborsClassifier does not support '
           'sample weights.')
    with pytest.raises(ValueError, match=msg):
        with_knn.fit(X, y, sample_weight)

    # check that _parallel_fit_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
        def fit(self, X, y, sample_weight):
            raise TypeError('Error unrelated to sample_weight.')

    failing = ClassifierErrorFit()
    with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
        failing.fit(X, y, sample_weight=sample_weight)
class VtClassifier(Model):
    '''
    Hard-voting ensemble over a selection of the project's model wrappers.

    Members are chosen by name; valid names are the entries of
    ``modelIndex``.  ``models`` holds one wrapper instance per requested
    name and ``estimators`` the ``(name, estimator)`` pairs handed to
    ``VotingClassifier``.
    '''

    def __init__(self, *args):
        '''
        Build the ensemble from the given model names.

        Raises ValueError (from ``list.index``) for a name that is not in
        ``modelIndex``.
        '''
        Model.__init__(self)
        self.modelIndex = ['GNB', 'SVClassifier', 'LRModel', 'ABClassifier', 'GBClassifier']
        # Wrapper class for each entry of modelIndex, position by position.
        wrapper_classes = [Model, SVClassifier, LRModel, ABClassifier, GBClassifier]
        self.models = []
        self.estimators = []
        for arg in args:
            wrapper_cls = wrapper_classes[self.modelIndex.index(arg)]
            self.models.append(wrapper_cls())
            # A second, independent instance supplies the estimator for the
            # ensemble, as before.
            self.estimators.append((arg, wrapper_cls().model))
        self.model = VotingClassifier(estimators=self.estimators, voting='hard')

    def train(self, data, target):
        '''Train every wrapper individually, then fit the voting ensemble.'''
        for model in self.models:
            model.train(data, target)
        self.model.fit(data, target)

    def predict(self, test):
        '''
        Return the majority-vote class labels for ``test``.

        A hard-voting ensemble has no ``predict_proba`` (accessing it raises
        AttributeError), so labels are returned.
        NOTE(review): if callers need probabilities, the ensemble has to use
        voting='soft' and every member must support predict_proba.
        '''
        return self.model.predict(test)
def buildVoting( features, label, params, verbose=False ):
    '''
    git description
+ __buildVoting__( features, label, params, verbose=False ) :
    + _does_ : Fits a soft-voting classifier combining a Random Forest, a KNN and an SVM (weights 2, 1, 2) on ("features", "label") data
    + _returns_ : Fitted model (a _sklearn_ __VotingClassifier__)
    + _called by_ : __buildModel__
    + _calls_ : __sklearn.ensemble.RandomForestClassifier__, __sklearn.svm.SVC__, __sklearn.neighbors.KNeighborsClassifier__, __sklearn.ensemble.VotingClassifier__
    + _arguments_ :

| type | name | description |
| --- | --- | --- |
| _list_ | features | List of train features to fit the model |
| _list_ of _int_ | label | List of associated labels |
| _list_ | params | List of model parameters [n_estimators, n_neighbors, kernel] |
| _boolean_ | verbose | Controls console outputs |
    '''
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier

    n_estimators = params[0]
    n_neighbors = params[1]
    kernel = params[2]

    forest = RandomForestClassifier( n_estimators = n_estimators )
    knn = KNeighborsClassifier( n_neighbors = n_neighbors )
    svc = SVC( kernel = kernel, probability = True )

    t = time()
    if verbose:
        banner = "".join( [
            "Training a voting classifier from following models : \n",
            " - Random Forest (", str(n_estimators), " estimators) - weight = 2 \n",
            " - ", str(n_neighbors), "-Nearest Neighbors - weight = 1 \n",
            " - SVM ('", str(kernel), "' kernel) - weight = 2 \n\n",
            "Please wait...\n",
        ] )
        print( banner )

    agg_clf = VotingClassifier(
        estimators=[ ('rf', forest), ('knn', knn), ('svm', svc) ],
        voting='soft',
        weights=[2,1,2] )
    agg_clf.fit( features, label )

    if verbose:
        print( "Completed in " + str( time()-t ) + " seconds.\n" )

    return agg_clf
def train_assembling_average(categories, comments, badwords):
    """Fit a soft-voting ensemble of two text pipelines and return it.

    One pipeline vectorizes with TF-IDF (word 1- to 3-grams), the other with
    ``CustomTransformer(badwords)``; both end in an identically configured
    ``SGDClassifier``.  The TF-IDF pipeline is weighted 3, the custom one 1.
    The ensemble is fitted on ``comments`` against ``categories``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import VotingClassifier

    def new_sgd():
        # Both pipelines use the same SGD configuration.
        return SGDClassifier(loss='log', penalty='l2', alpha=1e-3,
                             n_iter=5, random_state=42)

    tfidf = TfidfVectorizer(lowercase=True, ngram_range=(1, 3),
                            analyzer="word", min_df=3)
    text_pipeline = Pipeline([('vect', tfidf), ('clf', new_sgd())])

    badword_features = CustomTransformer(badwords)
    custom_pipeline = Pipeline([('vect', badword_features),
                                ('clf', new_sgd())])

    voter = VotingClassifier(
        estimators=[('text', text_pipeline), ('custom', custom_pipeline)],
        voting='soft', weights=[3,1])
    return voter.fit(comments, categories)
def combine_voting_NB_classifier(X_train, X_test, y_train, y_test,X_train_meta, X_test_meta, y_train_meta, y_test_meta):
    """Fit a hard-voting ensemble on the meta features and report on it.

    The ensemble combines a Bernoulli naive Bayes model and a nearest
    centroid model.  Predictions for ``X_test_meta`` are written to
    ``oto_wyniki.csv`` and a classification report against ``y_test_meta``
    is printed.  The non-meta arguments are not used here.
    """
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import NearestCentroid
    from sklearn.ensemble import VotingClassifier
    from sklearn.svm import SVC

    bernoulli = BernoulliNB(alpha=0.1).fit(X_train_meta, y_train_meta)
    # NOTE(review): this SVC is fitted but is not a member of the ensemble.
    clf_2 = SVC(C=100, gamma=0.1).fit(X_train_meta, y_train_meta)
    centroid = NearestCentroid().fit(X_train_meta, y_train_meta)

    voter = VotingClassifier(
        estimators=[('nb1', bernoulli), ('nb2', centroid)],
        voting='hard')
    voter = voter.fit(X_train_meta, y_train_meta)

    y_voting_predicted = voter.predict(X_test_meta)
    np.savetxt('oto_wyniki.csv', y_voting_predicted, delimiter=',')

    print("\n Here is the classification report for Voting classifier:")
    print(metrics.classification_report(y_test_meta, y_voting_predicted))
def all_classifer(X_train,y_train,X_test,y_test):
    """Score four tree ensembles and a soft-voting combination of them.

    Each model is fitted on the training data and handed to ``scores``
    together with its test predictions and the second column of its
    predicted probabilities.  The voting ensemble weights each member by the
    first element of that member's score.

    Returns ``[score1, score2, score3, score5, score7]`` (random forest,
    gradient boosting, extra trees, AdaBoost, voting).
    """
    def evaluate(model, tag):
        # Fit first, then predict labels, then probabilities.
        predicted = model.fit(X_train, y_train).predict(X_test)
        second_column = model.predict_proba(X_test)[:, 1]
        return scores(y_test, predicted, second_column, tag)

    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
    score1 = evaluate(rf, 'RT')

    # This model is fitted once on construction and again inside evaluate().
    gbc = GradientBoostingClassifier(n_estimators=50,
                                     learning_rate=0.05).fit(X_train, y_train)
    score2 = evaluate(gbc, 'gbc')

    ets = ExtraTreesClassifier(n_estimators=100, max_depth=None,
                               min_samples_split=1, random_state=0)
    score3 = evaluate(ets, 'ets')

    ab = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=50,
                            learning_rate=0.7)
    score5 = evaluate(ab, 'abboost')

    member_weights = [score1[0], score2[0], score3[0], score5[0]]
    eclf = VotingClassifier(
        estimators=[('rf', rf), ('gd', gbc), ('ETs', ets), ('ab', ab)],
        voting='soft', weights=member_weights)
    score7 = evaluate(eclf, 'voting')
    print(eclf)

    return [score1, score2, score3, score5, score7]
def mutipleClf(label_clfset, data, features, votingType='soft', weight=None,
               testData=None, testFeatures=None):
    """Fit a ``VotingClassifier`` and optionally score it on test data.

    Args:
        label_clfset: list of ``(name, estimator)`` pairs for the ensemble.
        data: training samples.
        features: training targets (passed as ``y`` to ``fit``).
        votingType: ``'soft'`` or any other value accepted by
            ``VotingClassifier(voting=...)``.
        weight: per-estimator weights for soft voting.  When omitted (or an
            empty list) each weight is the mean 10-fold cross-validation
            score of the corresponding estimator; an empty list passed by
            the caller is filled in place.  Ignored unless ``votingType`` is
            ``'soft'``.
        testData: optional samples to predict after fitting.
        testFeatures: expected targets for ``testData``.

    Returns:
        tuple: ``(fitted_ensemble, accuracy)``; ``accuracy`` is ``0.0`` when
        no ``testData`` is given, otherwise the result of ``getAccuracy``.
    """
    # A fresh list per call: the former ``weight=[]`` default was shared
    # between calls and kept growing.
    if weight is None:
        weight = []
    needs_weights = weight == []

    print("======================================\n")
    print("Start at: " + time.strftime("%H:%M:%S") + "\n")

    if votingType == 'soft':
        if needs_weights:
            # Ten-fold CV score of every member becomes its voting weight.
            # Skipped entirely when the caller supplied weights.
            for label_clf in label_clfset:
                scores = cross_validation.cross_val_score(
                    label_clf[1], data, features, cv=10)
                weight.append(scores.mean())
        eclf = VotingClassifier(estimators=label_clfset, voting=votingType,
                                weights=weight)
    else:
        eclf = VotingClassifier(estimators=label_clfset, voting=votingType)

    result = eclf.fit(data, features)

    accuracy = 0.0
    # Identity test: ``!= None`` on an array compares element-wise and
    # cannot be used as a condition.
    if testData is not None:
        testResult = eclf.predict(testData)
        accuracy = getAccuracy(testResult, testFeatures)

    print("End at: " + time.strftime("%H:%M:%S") + "\n")
    print("======================================\n")
    return result, accuracy
# Calculate accuracy accuracy = accuracy_score(y_test, y_pred) # Evaluate clf's accuracy on the test set print('{:s} : {:.3f}'.format(clf_name, accuracy)) # --------------------------------------------------------- # Import VotingClassifier from sklearn.ensemble from sklearn.ensemble import VotingClassifier # Instantiate a VotingClassifier vc vc = VotingClassifier(estimators=classifiers) # Fit vc to the training set vc.fit(X_train, y_train) # Evaluate the test set predictions y_pred = vc.predict(X_test) # Calculate accuracy score accuracy = accuracy_score(y_test, y_pred) print('Voting Classifier: {:.3f}'.format(accuracy)) # --------------------------------------------------------- # Import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier # Import BaggingClassifier from sklearn.ensemble import BaggingClassifier
Y, test_size=0.20, random_state=44) # group / ensemble of models estimator = [] estimator.append(('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200))) estimator.append(('SVC', SVC(gamma='auto', probability=True))) estimator.append(('DTC', DecisionTreeClassifier())) # Voting Classifier with hard voting vot_hard = VotingClassifier(estimators=estimator, voting='hard') vot_hard.fit(X_train, y_train) y_pred = vot_hard.predict(X_test) # using accuracy_score metric to predict accuracy score = accuracy_score(y_test, y_pred) print("DONALD TRUP % d" % score) # Voting Classifier with soft voting vot_soft = VotingClassifier(estimators=estimator, voting='soft') vot_soft.fit(X_train, y_train) y_pred = vot_soft.predict(X_test) # using accuracy_score score = accuracy_score(y_test, y_pred) print("JOE BIDEN % d" % score)
# ROC plot for the SVM: clean test curve, adversarial curve and the
# chance diagonal, saved to ROC_SVM.png
plt.plot(fpr_sv, tpr_sv, color='darkorange', lw=lw, label="ROC Curve (area = %0.2f)" % roc_auc_sv)
plt.plot(fpr_sv_adv, tpr_sv_adv, color='green', lw=lw, label="ROC Curve adv. (area = %0.2f)" % roc_auc_sv_adv)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC SVM (class=Normal)")
plt.legend(loc="lower right")
plt.savefig('ROC_SVM.png')

print()
print()
print("=============================== Voting CLassifier ==============================")

# Hard-voting ensemble of the decision tree, random forest and SVM
vot = VotingClassifier(estimators=[('dt', dt), ('rf', rf), ('sv', sv)], voting='hard')
vot.fit(X_train_scaled, y_train_l)
y_pred = vot.predict(X_test_scaled)

# Calculate FPR for normal class only
# (roc_curve is fed the predicted labels here, not continuous scores)
fpr_vot, tpr_vot, _ = roc_curve(y_test_l, y_pred, pos_label=1, drop_intermediate=False)
roc_auc_vot = auc(fpr_vot, tpr_vot)
print("Accuracy score: {}".format(accuracy_score(y_test_l, y_pred)))
print("F1 Score: {}".format(f1_score(y_test_l, y_pred, average='micro')))
print("AUC score: {}".format(roc_auc_vot))

# Predict using adversarial test samples; they are scored against the same
# y_test_l labels as the clean samples
y_pred_adv = vot.predict(X_adv)
fpr_vot_adv, tpr_vot_adv, _ = roc_curve(y_test_l, y_pred_adv, pos_label=1, drop_intermediate=False)
roc_auc_vot_adv = auc(fpr_vot_adv, tpr_vot_adv)
print("Accuracy score adversarial: {}".format(accuracy_score(y_test_l, y_pred_adv)))
clf2 = xgb.XGBClassifier()  # Xgboost

# Soft-voting ensemble of Logistic Regression and Xgboost
eclf = VotingClassifier(
    estimators=[('lr', clf1), ('xgb', clf2)],
    voting='soft')

# Cross Validation
# for clf, label in zip([clf1, clf2, eclf], ['Logistic Regression', 'Xgboost', 'Ensemble']):
#     scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='log_loss')
#     print("Log loss: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# Read test data: skip the header row, then keep every line minus its last
# two characters as the id
with open('test.csv', 'r') as f:
    next(f)
    test_ids = [line[:-2] for line in f]

eclf.fit(X_train, y_train)
y_pred = eclf.predict_proba(X_test)

# Write predictions to a file: a header row of "Article" followed by the
# class labels, then one row of class probabilities per test id
with open('sample_submission.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(["Article"] + eclf.classes_.tolist())
    for i, test_id in enumerate(test_ids):
        writer.writerow([test_id] + y_pred[i, :].tolist())
print("\t\t* {0}: {1}".format(par, dict_clf[clf]['best_par'][par]))

# ## 3.c. Test set predictions
# Our three classifiers have equivalent accuracy over the evaluation set.
# We can then let them vote using `VotingClassifier`.

# In[47]:

from sklearn.ensemble import VotingClassifier

# One (label, tuned estimator) pair per model family
estimators = [(key, dict_clf[key]['best_clf']) for key in ('RF', 'GB', 'ADB')]

# Instantiate the VotingClassifier using the soft voting, fit it and
# predict integer labels for the test set
voter = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
pred = voter.fit(X_train, y_train).predict(X_test).astype(int)

# In[48]:

# Calculate the known survival rate in the training set
known = train.Survived.values
nb_survived = sum(1 for flag in known if flag == 1)
print("Number of survivors in training set: {0} over {1} "
      "({2:.2%})".format(nb_survived, len(known), nb_survived / len(known)))

# Calculate the predicted survival rate in the test set
# 10-fold cross-validated accuracy of the non-scaled voting classifier
VT_NonScaled_cross_val_scores = cross_val_score(VT_classifier_nonscaled,
                                                X_train,
                                                y_train,
                                                cv=10,
                                                scoring='accuracy')
print(
    "The 10 fold cross validation score based on Voting Classifier(Non-Scaled) is: %0.3f(+/-%0.3f)"
    % (VT_NonScaled_cross_val_scores.mean(),
       VT_NonScaled_cross_val_scores.std() * 2))

# In[31]:

# A mean CV accuracy above 0.97 is reported as overfitting and the model is
# not fitted; otherwise it is fitted and evaluated on the test set.
# NOTE(review): the extent of the else-branch is reconstructed from the
# statement order -- confirm against the original notebook.
if VT_NonScaled_cross_val_scores.mean() > 0.97:
    print("The Voting Classifier (Non Scaled) is overfitting in this case.")
else:
    VT_classifier_nonscaled.fit(X_train, y_train)
    VT_NonScaled_predicted = VT_classifier_nonscaled.predict(X_test)
    # Sum of the predictions divided by their count
    # (presumably labels are 0/1 so this is the predicted default rate -- verify)
    VT_NonScaled_prob_default = np.sum(VT_NonScaled_predicted) / len(
        VT_NonScaled_predicted)
    print(
        "The Default Probability based on Voting Classifier(Non Scaled) is :",
        '%.3f' % VT_NonScaled_prob_default)
    VT_NonScaled_accuracy = VT_classifier_nonscaled.score(X_test, y_test)
    print("The accuracy of Voting Classifier(Non Scaled) on test set is : ",
          '%.3f' % VT_NonScaled_accuracy)

# In[32]:

#output the result into the existing evaluation dataframe to compare with other models
new_evaluation = pd.DataFrame({
    'Model': ["Voting Classifier_NonScaled"],
# stack base predicts for training meta model #stacked_predictions = np.column_stack((rf_fit.predict(x_train),et_fit.predict(x_train),ada_fit.predict(x_train),gb_fit.predict(x_train),svc_fit.predict(x_train))) polymetamnalicac # train meta model from sklearn.linear_model import LinearRegression #meta_model = LinearRegression() #meta_model.fit(stacked_predictions, t_train) from sklearn import preprocessing satsuki = pd.read_csv('haruten.csv', index_col=0) mm = preprocessing.MinMaxScaler() # インスタンスの作成 satsuki_seiki = mm.fit_transform(satsuki) arima = pd.read_csv('arima.csv', index_col=0) from sklearn.ensemble import VotingClassifier estimators = [ ('svc', SVC()), ('rf', RandomForestClassifier()), ('et', ExtraTreesClassifier()), ('ada', AdaBoostClassifier()), ('gb', GradientBoostingClassifier()), ] sum = 0 buy = 0 voting = VotingClassifier(estimators) voting.fit(x, t) print(voting.predict(satsuki_seiki))
# hard voting # moon data set X, y = make_moons(n_samples=500, noise=0.30, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) # build models log_clf = LogisticRegression(random_state=42) rnd_clf = RandomForestClassifier(random_state=42) svm_clf = SVC(random_state=42) voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting='hard') # hard voting print(voting_clf.fit(X_train, y_train)) ''' VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,penalty='l2', random_state=42, solver='liblinear', tol=0.0001,verbose=0, warm_start=False)), ('rf', RandomFor...f', max_iter=-1, probability=False, random_state=42, shrinking=True,tol=0.001, verbose=False))], flatten_transform=None, n_jobs=1, voting='hard', weights=None) ''' # RandomForests classifier details info needed later # show each classifier's accuarcy score for clf in (log_clf, rnd_clf, svm_clf, voting_clf): clf.fit(X_train, y_train) y_pred = clf.predict(X_test)
clf = RandomizedSearchCV(model, param_distributions=param, cv=5, verbose=0, n_jobs=-1, n_iter=200) print("Training model: {}".format(model.__class__.__name__)) lst_best_models.append((model_name, clf.fit(X_train_fit, y_train))) else: lst_best_models.append((model_name, model.fit(X_train_fit, y_train))) # Ensemble of models from sklearn.ensemble import VotingClassifier from sklearn.cross_validation import cross_val_score eclf = VotingClassifier(estimators=lst_best_models, voting='soft') eclf.fit(X_train_fit, y_train) scores = cross_val_score(eclf, X_train_fit, y_train, cv=5, scoring='accuracy') print(scores) ''' clf = RandomizedSearchCV(model, param_distributions=params, cv=5, verbose=1, n_jobs=-1, n_iter=100) logging.info("Training...") best_model = clf.fit(X_train_fit, y_train) # Print best results (rank by high test score and low std) train_result = pd.DataFrame(clf.cv_results_) train_result.sort_values(by=['mean_test_score', 'std_test_score'], ascending=[False, True], inplace=True) print(train_result[['mean_test_score', 'std_test_score']].head()) for param in params: print('Parameter: {}, best value={}'.format(param, best_model.best_estimator_.get_params()[param]))
summary['Median'] = summary.median(1)
summary.sort_values('Median', ascending=False)

# In[ ]:

# Weighted hard-voting ensemble of the six tuned classifiers
clf_vote = VotingClassifier(estimators=[
    ('knn', clf_knn),
    ('svm', clf_svm),
    ('extra', clf_ext),
    ('xgb', clf_xgb),
    ('percep', clf_pctr),
    ('logistics', clf_log),
],
                            weights=[2, 2, 3, 3, 1, 2],
                            voting='hard')
clf_vote.fit(X, y)

# 5-fold cross-validated accuracy of the ensemble
scores = cross_val_score(clf_vote, X, y, cv=5, scoring='accuracy')
print('Voting: Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()))

# In[ ]:

train = X
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
# shuffle keeps its default (False), so a random_state has no effect on the
# folds; current scikit-learn raises ValueError when one is passed anyway.
kf = KFold(n_splits=NFOLDS)
# Each model is fitted on the training split and scored on the validation
# split; fit() returns the estimator, so fit and predict are chained.
gnb_pred_y = gnb.fit(train_x, train_y).predict(val_x)
print('GaussianNB Accuracy:', metrics.accuracy_score(val_y, gnb_pred_y))

bern = BernoulliNB()
bern_pred_y = bern.fit(train_x, train_y).predict(val_x)
print('BernoulliNB Accuracy:', metrics.accuracy_score(val_y, bern_pred_y))

multi = MultinomialNB()
multi_pred_y = multi.fit(train_x, train_y).predict(val_x)
print('MultinomialNB Accuracy:', metrics.accuracy_score(val_y, multi_pred_y))

log = LogisticRegression()
log_pred_y = log.fit(train_x, train_y).predict(val_x)
print('Logistic Regression Accuracy:', metrics.accuracy_score(val_y, log_pred_y))

sgd = SGDClassifier()
sgd_pred_y = sgd.fit(train_x, train_y).predict(val_x)
print('SGDClassifier Accuracy:', metrics.accuracy_score(val_y, sgd_pred_y))

# Ensemble of the five models above (no voting mode given, so the library
# default is used)
vote = VotingClassifier([('gnb', gnb),
                         ('bern', bern),
                         ('multi', multi),
                         ('lr', log),
                         ('sgd', sgd)])
vote_pred_y = vote.fit(train_x, train_y).predict(val_x)
print('Vote Accuracy:', metrics.accuracy_score(val_y, vote_pred_y))
min_samples_split=2, n_estimators='warn', random_state=42, verbose=0, warm_start=False) # Ensemble: Bagging bagging = BaggingClassifier(rf, n_estimators=500, max_samples=1.0, random_state=42) # Ensemble: Weighted Voting - not included in the report eclf3 = VotingClassifier(estimators=[ ('CART', c), ('ANN', ann), ('BAG', bagging)], voting='soft', weights=[14,3,3], flatten_transform=True) # Training models H = model.fit(x_train, y_train2, validation_data=(x_test, y_test2), epochs=120, batch_size=100) c.fit(x_train, y_train) ann.fit(x_train, y_train) eclf3 = eclf3.fit(x_train, y_train) bagging.fit(x_train, y_train) rf.fit(x_train, y_train) # Plot CART rules feature_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal'] target_names = ['0', '1', '2', '3', '4'] dot_data = tree.export_graphviz(c, out_file='tree.dot', feature_names=feature_names, class_names=target_names, filled=True, rounded=True, special_characters=True) Source.from_file('tree.dot') graph = graphviz.Source(dot_data)
## Bagging ##
# Five stratified folds; each fold reports ROC AUC on the held-out part
# using the predicted probability of the second class.
skf = StratifiedKFold(target, n_folds = 5)
for train, test in skf:
    clf = BaggingClassifier(base_estimator = RandomForestClassifier(class_weight = "balanced_subsample"), n_estimators = 250, bootstrap = True, bootstrap_features = True, n_jobs = -1)
    clf.fit(matrix[train], target[train])
    yPred = clf.predict_proba(matrix[test])[:,1]
    print("%s %s" % (roc_auc_score(target[test], yPred), "BaggingRandomForest"))

## Voting ##
skf = StratifiedKFold(target, n_folds = 5)
for train, test in skf:
    clf1 = RandomForestClassifier(class_weight = "balanced_subsample", n_jobs = -1)
    clf2 = svm.SVC(kernel = "linear", class_weight = "balanced", probability = True, C = 10)
    vclf = VotingClassifier(estimators = [('rf',clf1),('svc', clf2)], voting = "soft")
    vclf.fit(matrix[train], target[train])
    # Score on probabilities like the other sections do: ROC AUC computed
    # from hard 0/1 predictions only evaluates a single threshold.
    yPred = vclf.predict_proba(matrix[test])[:,1]
    print(roc_auc_score(target[test], yPred))

## Extra Trees ##
skf = StratifiedKFold(target, n_folds = 5)
for train, test in skf:
    clf = ExtraTreesClassifier(n_estimators = 100, class_weight = "balanced_subsample", n_jobs = -1, bootstrap = True)
    clf.fit(matrix[train], target[train])
    yPred = clf.predict_proba(matrix[test])[:,1]
    print("%s %s" % (roc_auc_score(target[test], yPred), "ERTrees"))

## Gradient Boost ##
skf = StratifiedKFold(target, n_folds = 5)
for train, test in skf:
    clf = GradientBoostingClassifier(n_estimators = 250, max_features = "auto", init = RandomForestClassifier(class_weight = "balanced_subsample", n_jobs = -1))
# Accuracy table of the individual models
models = pd.DataFrame({'Model': ["Support Vector Machine", "KNN", "Logistic Regression", "Decision Tree",
                                 "Perceptron"],
                       'Score': [score_svm, score_knn, score_lgr, score_dtree, score_pctr]})
models.sort_values(by='Score', axis=0, ascending=False)

# Ensemble methods: weighted hard vote over the five classifiers
classifier_vote = VotingClassifier(estimators=[
    ('knn', classifier_knn),
    ('svm', classifier_svm),
    ("logistic", classifier_lgr),
    ("decisiontree", classifier_dtree),
    ("perceptron", classifier_pctr)],
    weights=[2, 3, 2, 3, 1],
    voting='hard'
)
classifier_vote.fit(X, y)

# 5-fold cross-validated accuracy of the ensemble
score_votes = cross_val_score(classifier_vote, X, y, cv=5, scoring='accuracy')
print("Voting: Accuracy: %0.2f (+/- %0.2f)" % (score_votes.mean(), score_votes.std()))

train = X
test = test_set
n_train = train.shape[0]
n_test = test.shape[0]
SEED = 0
NFOLDS = 5
# shuffle keeps its default (False), so a random_state has no effect on the
# folds; current scikit-learn raises ValueError when one is passed anyway.
kf = KFold(n_splits=NFOLDS)


def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((n_train,))
    oof_test = np.zeros((n_test,))
x_train, x_valid, x_test, y_train, y_valid, y_test = generate.get_data()

# Flatten every sample to a 1-D vector
x_train = [x.reshape(1, -1)[0] for x in x_train]
x_valid = [x.reshape(1, -1)[0] for x in x_valid]
x_test = [x.reshape(1, -1)[0] for x in x_test]

# PCA is fitted on the training set only and then applied to the test set.
# NOTE(review): x_valid is flattened but never projected here.
pca = PCA(svd_solver='randomized', n_components=n_component)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

# Load the stored base models; the files are closed as soon as they are read
with open('model_NB.pkl', 'rb') as model_file:
    NB = pickle.load(model_file)
KNN = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(x_train, y_train)
with open('model_Dtree.pkl', 'rb') as model_file:
    Dtree = pickle.load(model_file)

# Weighted hard-voting ensemble of the three base models
model = VotingClassifier(estimators=[('NB', NB), ('KNN', KNN), ('Dtree', Dtree)], voting='hard', weights=[0.32, 0.31, 0.37])
model.fit(x_train, y_train)
y_pred = np.array(model.predict(x_test))

print(y_test)
print(y_pred)

# Persist the fitted voting model (default pickle protocol)
with open('model_Vote.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print('Confusion matrix: ', confusion_matrix(y_test, y_pred))
print('Accuracy score: ', accuracy_score(y_test, y_pred))
print('Precision score: ', precision_score(y_test, y_pred, average='weighted'))
print('Recall score: ', recall_score(y_test, y_pred, average='weighted'))
print('F1 score: ', f1_score(y_test, y_pred, average='weighted'))
# Creating the VotingClassifier using soft voting as the sub classifiers are # well trained to the data due to gridsearchcv. softVoteC_drop = VotingClassifier(estimators=[ ('rfc', drop_rf), ('dt', drop_dt), ('ada', drop_ada), ('bag', drop_bag), ('grad', drop_grad), ('xgb', drop_xgb), ('et', drop_et) ], voting='soft', n_jobs=-1) # Repeat for other Data set softVoteC_per = VotingClassifier(estimators=[('rfc', per_rf), ('dt', per_dt), ('ada', per_ada), ('bag', per_bag), ('grad', per_grad), ('xgb', per_xgb), ('et', per_et)], voting='soft', n_jobs=-1) # Fitting the VoteClassifiers softVoteC_drop = softVoteC_drop.fit(drop_f_train, drop_l_train) softVoteC_per = softVoteC_per.fit(per_f_train, per_l_train) # Dumping the voteClassifiers to pickle files for saving and downloading with open('Soft_voteC_drop.pkl', 'wb') as pf: pickle.dump(softVoteC_drop, pf) with open('Soft_voteC_per.pkl', 'wb') as pf: pickle.dump(softVoteC_per, pf) print('Done!!!!')
shuffle=True) mlp = SVC(C=1.0, kernel='linear', probability=True, gamma=0.1, tol=0.001) mlp.fit(xTrain, yTrain) score = str(mlp.score(xTest, yTest)) print('SVC Score ' + score) clf = AdaBoostClassifier(n_estimators=100, random_state=0) clf.fit(xTrain, yTrain) score = str(clf.score(xTest, yTest)) print('AdaBoostClassifier Score ' + score) rclf = DecisionTreeClassifier(max_depth=20, random_state=0) rclf.fit(xTrain, yTrain) score = str(rclf.score(xTest, yTest)) print('DecisionTreeClassifier Score ' + score) eclf2 = VotingClassifier(estimators=[('svc', mlp), ('adaboost', clf), ('rf', rclf)], voting='soft') eclf2.fit(xTrain, yTrain) score = str(eclf2.score(xTest, yTest)) print('VotingClassifier Score ' + score) import pickle # save the model to disk filename = 'eclf2.sav' pickle.dump(eclf2, open(filename, 'wb'))
# Correlation between the test-set predictions of the five tuned models
ensemble_results = pd.concat([
    test_Survived_RFC, test_Survived_ExtC, test_Survived_AdaC,
    test_Survived_GBC, test_Survived_SVMC
],
                             axis=1)
g = sns.heatmap(ensemble_results.corr(), annot=True)

#########################################################################################
# Soft-voting ensemble of the tuned models, fitted on the full training data
votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
                                       ('svc', SVMC_best), ('adac', ada_best),
                                       ('gbc', GBC_best)],
                           voting='soft',
                           n_jobs=1)
votingC = votingC.fit(all_X, all_y)
#########################################################################################

test_Survived = pd.Series(votingC.predict(test[predictors]), name="Survived")
submit("ensemble_python_voting.csv", votingC)

################################## Logistic Regression ##################################
logreg = LogisticRegression(random_state=0)
logreg.fit(all_X, all_y)

# Keep the three highest-ranked features.  n_features_to_select is passed by
# keyword: current scikit-learn no longer accepts it positionally.
rfe = RFE(logreg, n_features_to_select=3)
rfe = rfe.fit(all_X, all_y)

# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
svm_predictions = svm_model_linear.predict(X_test)

# model accuracy for X_test
accuracy = svm_model_linear.score(X_test, y_test)

# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predictions)
print(cm)

clf = RandomForestClassifier(n_estimators=5, max_depth=5, random_state=0)
clf.fit(X_train, y_train)
#print(clf.predict([['2018', '04', '01']]))
print(clf.score(X_test, y_test, sample_weight=None))

clf2 = GradientBoostingClassifier(n_estimators=10,
                                  learning_rate=1.0,
                                  max_depth=5,
                                  random_state=0).fit(X_train, y_train)
print(clf2.score(X_test, y_test))
#print(clf2.predict([['2018', '04', '01']]))

# Hard-voting ensemble of the linear SVM, random forest and gradient boosting
eclf1 = VotingClassifier(estimators=[('lsvm', svm_model_linear), ('rf', clf),
                                     ('gbc', clf2)],
                         voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
print(eclf1.score(X_test, y_test, sample_weight=None))
#print(eclf1.predict([['2018', '04', '01']]))

# Persist the fitted ensemble; the context manager closes (and flushes)
# the file
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(eclf1, model_file)
train_size = 10000

# TF-IDF over uni-, bi- and tri-grams, capped at 30000 features; the
# vectorizer is fitted on (and applied to) the whole text column
tf = TfidfVectorizer(max_features=30000, ngram_range=(1, 3), stop_words='english')
tf.fit(data.text)
transformed = tf.transform(data.text)

# Only the first train_size rows are used for training; the feature rows
# are converted to a dense array
x_data = transformed[:train_size].toarray()
y_data = data.polarity[:train_size].values

# No `voting` argument is passed, so scikit-learn's default applies
voting = VotingClassifier([('LR', LogisticRegression()), ('NB', MultinomialNB()), ('Ridge', RidgeClassifier())])
voting.fit(x_data, y_data)


# Define the streaming classifier
class StreamClassifier(StreamListener):
    def __init__(self, classifier, vectorizer, api=None):
        super().__init__(api)
        # classifier and vectorizer are stored for use by the handlers
        self.clf = classifier
        self.vec = vectorizer

    # What to do when a tweet arrives
    def on_data(self, data):
        # Create a json object
        json_format = json.loads(data)
        # Get the tweet's text
        text = json_format['text']
svc = SVC(kernel="linear")
# NOTE(review): lr is a LinearRegression, i.e. a regressor; its score()
# below is presumably not an accuracy like the classifiers' -- confirm
# before comparing the numbers.
lr = LinearRegression(normalize=True)
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
rfc = RandomForestClassifier(n_estimators=10)
lor = LogisticRegression(random_state=1)
gnb = GaussianNB()

# Hard-voting ensemble; svc and lr are not members of it
vot = VotingClassifier(estimators=[('lr', lor), ('rf', rfc), ('gnb', gnb), ('knn', knn)], voting='hard')

# Fit every model on the training split
lr.fit(x_train, y_train)
svc.fit(x_train, y_train)
knn.fit(x_train, y_train)
rfc.fit(x_train, y_train)
lor.fit(x_train, y_train)
gnb.fit(x_train, y_train)
vot.fit(x_train, y_train)

# Report each model's score on the test split
print("LogisticRegression", lor.score(x_test, y_test))
print("GaussianNB", gnb.score(x_test, y_test))
print("RandomForestClassifier ", rfc.score(x_test, y_test))
print("KNeighborsClassifier ", knn.score(x_test, y_test))
print("SVC ", svc.score(x_test, y_test))
print("LinearRegression ", lr.score(x_test, y_test))
print('VotingClassifier', vot.score(x_test, y_test))

N = 7
x = range(N)
y = [
    lor.score(x_test, y_test),
    gnb.score(x_test, y_test),
    rfc.score(x_test, y_test),
    knn.score(x_test, y_test),
    svc.score(x_test, y_test),
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier

# Candidate models, all with default hyper-parameters
Classification_models=[('LogisticRegression',LogisticRegression()),('StochasticGDC',SGDClassifier()),('KNC',KNeighborsClassifier()),('SVC',SVC()),
                       ('LinearSVC',LinearSVC()),('GNaiveBayes',GaussianNB()),('MNaiveBayes',MultinomialNB()),('DTree',DecisionTreeClassifier()),
                       ('MLPerceptronC',MLPClassifier()),('RF',RandomForestClassifier()),('ET',ExtraTreesClassifier()),('AdaBoostC',AdaBoostClassifier()),
                       ('GBC',GradientBoostingClassifier()),('XGBC',XGBClassifier())]

# Mean 5-fold cross-validated accuracy per candidate
result=[]
names=[]
for name,model in Classification_models:
    cvresult=cross_val_score(model,X,y,cv=5,n_jobs=-1,scoring = 'accuracy')
    result.append(cvresult.mean())
    names.append(name)
    print("%s gives %f " % (name, cvresult.mean()))

# Grid search over SVC hyper-parameters
params={'C':[0.01,0.1,1],'gamma':[1,0.1,0.01],'kernel':['linear', 'poly', 'rbf'] }
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(SVC(),param_grid=params,n_jobs=-1,cv=5)
gridfit=grid.fit(X,y)
# Bare expressions: they only display a value in an interactive session
gridfit.best_score_
gridfit.best_params_

# Ensemble of an SVC with fixed parameters (written out literally, not read
# from gridfit) and Gaussian naive Bayes; no `voting` argument is passed
vc=VotingClassifier(estimators=[('Support Vector Classifier',SVC(C=1, gamma=1, kernel='linear')),
                                ('Gaussian Naive Bayes',GaussianNB())])
vc.fit(X,y)
predictions = vc.predict(test1)

# Build the submission file, mapping encoded labels back with `label`
submission = pd.DataFrame({'id':test['id'], 'type':predictions})
submission['type']=label.inverse_transform(submission['type'])
submission.to_csv('submission.csv', index=False)
# Fit NN, then report its accuracy (as a percentage) and confusion matrix
# on the test data
NN.fit(trainData_x, trainData_y)
predict = NN.predict(testData_x)
score = NN.score(testData_x, testData_y)
S = 'Overall Accuracy: ' + repr(score * 100) + ' %' + '\n'
print(S)
confusionMatrix = confusion_matrix(testData_y, predict)
print('Confusion Matrix: ')
print(confusionMatrix)
print('\n')

#Unweighted Voting Classifier (voting='soft', no weights passed)
print("********TASK 2.2: VOTING CLASSIFIER: UNWEIGHTED******** \n")
VCU = VotingClassifier(estimators=[('gnb', NB), ('lr', LR), ('dt', DT), ('knn', KNN), ('mlp', NN)], voting='soft')
VCU.fit(trainData_x, trainData_y)
predict = VCU.predict(testData_x)
score = VCU.score(testData_x, testData_y)
S = 'Overall Accuracy: ' + repr(score * 100) + ' %' + '\n'
print(S)
confusionMatrix = confusion_matrix(testData_y, predict)
print('Confusion Matrix: ')
print(confusionMatrix)
print('\n')

#Weighted Voting Classifier
print("********TASK 2.3: VOTING CLASSIFIER: WEIGHTED******** \n")
# One variable per ensemble member plus the best score seen so far
# (presumably the candidate weights of the search below -- verify)
p, q, r, s, t = 0, 0, 0, 0, 0
maxScore = 0
#Grid search for weights
presort=False, random_state=None, splitter='best'), bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=0.5, n_estimators=20, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)

# Score of bg on the (data_pd1, Y_test) pair, then on (data_pd, Y_train)
print(bg.score(data_pd1, Y_test))
print(bg.score(data_pd, Y_train))

lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf1 = RandomForestClassifier()
svm = SVC(kernel='poly', degree=2)

# Hard-voting ensemble of the four models, scored on the same two pairs
evc = VotingClassifier(estimators=[('lr', lr), ('dt', dt), ('rf1', rf1), ('svm', svm)], voting='hard')
evc.fit(data_pd, Y_train)
print(evc.score(data_pd1, Y_test))
print(evc.score(data_pd, Y_train))
# In[110]:

""" train an ensemble model using the previous models """
from sklearn.ensemble import VotingClassifier

#create a list of (name, model) pairs for our models
estimators = [('blob', blob_classifier), ('rf', text_classifier), ('net', net),
              ('features', feature_classifier), ('net2', net2)]

#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')

# In[111]:

#fit model to training data
ensemble.fit(processed_features, labels)
ensemble_pred = ensemble.predict(processed_features)

#accuracy on the training data (the same data the model was fitted on)
print(accuracy_score(labels, ensemble_pred))

# In[112]:

test_predictions = ensemble.predict(X_test)

# In[113]:

# Take an explicit copy: adding a column to a selection of df_test
# triggers pandas' SettingWithCopyWarning.
final_predictions = df_test[['review_id']].copy()

# In[114]:

final_predictions['is_good_rating'] = test_predictions
#create function aliases
knc = KNeighborsClassifier(n_neighbors=19)
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(random_state=226)
bc = BaggingClassifier(DecisionTreeClassifier(), n_estimators=9)
adc = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=5, learning_rate=1)
lr = LogisticRegression()

#get the VotingClassifier Function
vc = VotingClassifier(estimators=[('lr', lr), ('dtc', dtc), ('knc', knc),
                                  ('rfc', rfc), ('bc', bc), ('adc', adc)],
                      voting='hard')
vc.fit(X_train, y_train.values.ravel())  #create model using train set
y_pred = vc.predict(X_test)  #predict the values
cm = confusion_matrix(y_test, y_pred)  #get the confusion matrix

# Accuracy = correctly classified samples (the diagonal) / all samples.
# Computed from the whole matrix, so it works for any number of classes
# instead of assuming exactly three.
diag_sum = cm.trace()
total = cm.sum()
accuracy = diag_sum / float(total)  # accuracy of the confusion matrix

#get accuracy and confusion matrix
print(accuracy)
print(cm)
# pcafeatures = pca.transform(features) features = sc.fit_transform(features) train, test, train_labels, test_labels = train_test_split(features, labels, test_size=0.2) # Train our classifier model2 = clf1.fit(train, train_labels) model3 = clf2.fit(train, train_labels) model4 = lda.fit(train, train_labels) model5 = kneigh.fit(train, train_labels) model7 = lr.fit(train, train_labels) modelVoting = voting.fit(train, train_labels) modelEnsemble1 = ensemble1.fit(train, train_labels) modelEnsemble2 = ensemble2.fit(train, train_labels) modelEnsemble3 = ensemble3.fit(train, train_labels) # Make predictions preds2 = clf1.predict(test) preds3 = clf2.predict(test) preds4 = lda.predict(test) preds5 = kneigh.predict(test) preds7 = lr.predict(test) predVoting = voting.predict(test) predEnsemble1 = ensemble1.predict(test)
# Use the three test prediction frames as input columns for a random forest
df_test = pd.concat([test_pred1, test_pred2,test_pred3], axis=1)
ytest=pd.DataFrame(y_test)
model = RandomForestClassifier(random_state=1,criterion="entropy")
model.fit(df_train,y_train)
# Bare expression: the score is only displayed in an interactive session
model.score(df_test,y_test)

#Bagging
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
bg = BaggingClassifier(clf3, max_samples= 0.5, max_features = 1.0, n_estimators = 20)
bg.fit(x_train,y_train)
bg.score(x_test,y_test)

#Boosting - Ada Boost
adb = AdaBoostClassifier(LogisticRegression(),n_estimators = 5, learning_rate = 1)
adb.fit(x_train,y_train)
adb.score(x_test,y_test)

# Hard-voting ensemble of five previously defined classifiers
evc = VotingClassifier( estimators= [('svm',clf),('dt',clf1),('nb',clf2),('lr',clf3),('knn',clf6)], voting = 'hard')
evc.fit(x_train,y_train)
evc.score(x_test, y_test)
x_train[0]

###working directory
import joblib

# Save the fitted voting classifier to file
joblib_file = "model.pkl"
joblib.dump(evc, joblib_file)
def select_model(df, features):
    """Grid-search three classifiers and combine the winners by soft voting.

    For each candidate model a 10-fold ``GridSearchCV`` is run on
    ``df[features]`` against ``df["Survived"]``; the best parameters, score
    and estimator are stored back into the model's dictionary and printed.
    The best estimators are then combined in a soft-voting
    ``VotingClassifier``, which is fitted on all data and scored with
    10-fold cross-validation.

    Args:
        df: DataFrame holding the feature columns and a ``"Survived"`` column.
        features: column labels to use as model inputs.

    Returns:
        list[dict]: one dictionary per grid-searched model (keys ``name``,
        ``estimator``, ``hyperparameters``, ``best_params``, ``best_score``,
        ``best_model``) followed by one for the voting ensemble (keys
        ``name``, ``best_model``, ``best_score``).
    """
    all_X = df[features]
    all_y = df["Survived"]

    # List of dictionaries, each containing a model name,
    # its estimator and a dict of hyperparameters
    models = [{
        "name": "LogisticRegression",
        "estimator": LogisticRegression(),
        "hyperparameters": {
            "solver": ["newton-cg", "lbfgs", "liblinear"]
        }
    }, {
        "name": "KNeighborsClassifier",
        "estimator": KNeighborsClassifier(),
        "hyperparameters": {
            "n_neighbors": range(1, 30, 2),
            "weights": ["distance", "uniform"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "p": [1, 2]
        }
    }, {
        "name": "RandomForestClassifier",
        "estimator": RandomForestClassifier(random_state=1),
        "hyperparameters": {
            "n_estimators": [4, 6, 9, 15],
            "criterion": ["entropy", "gini"],
            "max_depth": [2, 5, 10],
            "max_features": ["log2", "sqrt"],
            "min_samples_leaf": [1, 5, 8],
            "min_samples_split": [2, 3, 5],
        }
    }]

    for model in models:
        print(model['name'])
        print('-' * len(model['name']))

        grid = GridSearchCV(model["estimator"],
                            param_grid=model["hyperparameters"],
                            cv=10,
                            n_jobs=3)
        grid.fit(all_X, all_y)
        model["best_params"] = grid.best_params_
        model["best_score"] = grid.best_score_
        model["best_model"] = grid.best_estimator_

        print("Best Score: {}".format(model["best_score"]))
        print("Best Parameters: {}\n".format(model["best_params"]))

    #create ensemble of best models
    votingC = VotingClassifier(estimators=[(model["name"], model["best_model"])
                                           for model in models],
                               voting='soft',
                               n_jobs=4)
    # Fitted here so the returned best_model is usable; cross_val_score
    # below works on clones and leaves this instance untouched.
    votingC.fit(all_X, all_y)
    scores = cross_val_score(votingC, all_X, all_y, cv=10)
    accuracy = np.mean(scores)

    ensemble = {
        "name": "VotingClassifier",
        "best_model": votingC,
        "best_score": accuracy
    }
    models.append(ensemble)

    #print results to screen
    # Refer to the entry just appended rather than to a fixed position, so
    # the report stays correct if the candidate list changes length.
    print(ensemble['name'])
    print('-' * len(ensemble['name']))
    print("Best Score: {}".format(ensemble["best_score"]))

    return models
def _practice_set_classifier_factories():
    """Return zero-argument classifier constructors in their evaluation order.

    Callables are returned instead of instances so that only the classifiers
    the caller actually asks for are ever constructed.
    """
    def _voting_ensemble():
        # votingClassifiers combine completely different machine learning
        # classifiers and use a majority vote
        return VotingClassifier(estimators=[
            ('svc', SVC()),
            ('rfc', RFC(bootstrap=False)),
            ('etc', ETC()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('qda', quadda()),
        ])

    return (
        _voting_ensemble,
        lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        lambda: ETC(),
        lambda: BaggingClassifier(ETC()),
        lambda: SVC(),
        # Quadratic discriminant analysis - classifier with quadratic decision boundary
        lambda: quadda(),
        lambda: DTC(),
        # classifies based on the k nearest neighbors (library default k here)
        lambda: neighbors.KNeighborsClassifier(),
        # linear discriminant analysis - classifier with linear decision boundary
        lambda: linda(),
        lambda: RFC(),
        lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        lambda: RFC(bootstrap=False),
        lambda: GBC(),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
        lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        lambda: NearestCentroid(),
        lambda: ABC(),
    )


def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    """Fit the first ``numfiers`` classifiers and reduce their predictions per recording.

    Parameters
    ----------
    numfiers : int
        How many classifiers (taken in a fixed order) to train. Values larger
        than the number of available classifiers are clamped to that number.
    xtrain, ytrain :
        Training features and labels; ``ytrain`` is flattened with ``np.ravel``.
    xtltrain :
        Indices of the recordings behind ``xtrain``/``ytrain``. Not used here;
        kept so existing callers keep working.
    xtltest :
        Recording indices for ``xtest``; forwarded to the window mode finders.
    xtest :
        Test feature matrix. It is passed through ``removeNanAndInf`` first.
    ytarget : optional
        Known labels for ``xtest``; flattened with ``np.ravel``.
    testing : bool, optional
        Selects the ``temppred*`` helpers (which also receive ``grids``)
        instead of the ``pred*`` helpers.
    grids : str, optional
        Forwarded to the ``temppred*`` helpers when ``testing`` is true.

    Returns
    -------
    tuple
        ``(predictionStringMat, targetStringMat, finalPredMat)``. The first
        two are lists with one entry per trained classifier; the third is a
        flat list of ints concatenating every classifier's reduced predictions.
    """
    # NOTE we might not need xtltrain

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)

    # Record whether targets exist *before* flattening: np.ravel(None) is a
    # one-element object array, so a later comparison with None can no longer
    # tell "no targets" apart and is ambiguous for longer arrays.
    has_targets = ytarget is not None

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    factories = _practice_set_classifier_factories()

    # If xtest is an NxM matrix, predictionMat is N x (number of classifiers),
    # each column holding one classifier's prediction vector. The width is
    # clamped so np.empty never leaves uninitialised columns behind.
    n_columns = min(numfiers, len(factories))
    predictionMat = np.empty((xtest.shape[0], n_columns))

    for column, make_classifier in enumerate(factories[:n_columns]):
        classifier = make_classifier()
        classifier.fit(xtrain, ytrain)
        predictionMat[:, column] = classifier.predict(xtest)

    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)

        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat.extend(int(value) for value in modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    if not testing and has_targets and targets1:
        # Debugging aid only: the matrix for the first classifier is computed
        # but not returned or printed.
        confusionme = confusion_matrix(targets1[0], predictions1[0])

    return predictionStringMat, targetStringMat, finalPredMat
g = plot_learning_curve(gsExtC.best_estimator_,"ExtC ExtraTrees learning curves",X_train,Y_train,cv=kfold) g = plot_learning_curve(gsGBC.best_estimator_,"GBC Gradient Boost learning curves",X_train,Y_train,cv=kfold) g = plot_learning_curve(gsrandom_forest.best_estimator_,"RandomForest learning curves",X_train,Y_train,cv=kfold) g = plot_learning_curve(gsSVMC.best_estimator_,"SVMC learning curves",X_train,Y_train,cv=kfold) test_Survived_AdaDTC = pd.Series(adaDTC_best.predict(X_test), name="AdaDTC") test_Survived_ExtC = pd.Series(ExtC_best.predict(X_test), name="ExtC") test_Survived_GBC = pd.Series(GBC_best.predict(X_test), name="GBC") test_Survived_SVMC = pd.Series(SVMC_best.predict(X_test), name="SVMC") test_Survived_random_forest = pd.Series(random_forest_best.predict(X_test), name="random_forest") # Concatenate all classifier results ensemble_results = pd.concat([test_Survived_AdaDTC, test_Survived_ExtC, test_Survived_GBC,test_Survived_SVMC,test_Survived_random_forest],axis=1) g= sns.heatmap(ensemble_results.corr(),annot=True) VotingPredictor = VotingClassifier(estimators=[('ExtC', ExtC_best), ('GBC',GBC_best), ('SVMC', SVMC_best), ('random_forest', random_forest_best)], voting='soft', n_jobs=4) VotingPredictor = VotingPredictor.fit(X_train, Y_train) VotingPredictor_predictions = VotingPredictor.predict(test) test_Survived = pd.Series(VotingPredictor_predictions, name="Survived") # Preparing data for Submission 3 test_Survived = pd.Series(VotingPredictor_predictions, name="Survived") Submission3 = pd.concat([PassengerId,test_Survived],axis=1) Submission3.head(15) nrows = ncols = 2 fig, axes = plt.subplots(nrows = nrows, ncols = ncols, sharex="all", figsize=(15,7)) names_classifiers = [("AdaBoosting", adaDTC_best),("ExtraTrees",ExtC_best), ("GradientBoosting",GBC_best), ("RandomForest",random_forest_best)] nclassifier = 0 for row in range(nrows): for col in range(ncols):