def test_warm_start_equal_n_estimators():
    """Refitting a warm-started ensemble without more estimators is a no-op.

    The second ``fit`` must emit a ``UserWarning`` and leave the predictions
    on the test split unchanged.
    """
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=43)

    ensemble = BaggingClassifier(n_estimators=5, warm_start=True,
                                 random_state=83)
    ensemble.fit(X_train, y_train)
    baseline = ensemble.predict(X_test)

    # Shift the training data in place; since no estimator is added, the
    # refit below must not pick the modified values up.
    X_train += 1.

    expected_msg = "Warm-start fitting without increasing n_estimators does not"
    assert_warns_message(UserWarning, expected_msg,
                         ensemble.fit, X_train, y_train)
    assert_array_equal(baseline, ensemble.predict(X_test))
def bagging(X_train, X_test, y_train, y_test, n_est):
    """Plot train/test accuracy of bagged decision trees against ensemble size.

    One ``BaggingClassifier`` is fitted for every ensemble size in
    ``range(1, 51)``; its accuracy on the test and on the training split is
    collected and both curves are shown in a matplotlib figure.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param n_est: currently ignored, see the note below
    :return: None (the figure is displayed with ``plt.show()``)
    """
    # NOTE(review): the caller-supplied ``n_est`` is overwritten, exactly as in
    # the original code, so existing callers keep getting the 1..50 sweep.
    # Confirm whether the argument was meant to control the upper bound.
    n_est = 51
    estimators = range(1, n_est)
    decision_clf = DecisionTreeClassifier()

    # The score lists are created per call so that they always have one entry
    # per ensemble size (the original appended to names it never defined).
    scores1 = []  # accuracy on the test split
    scores2 = []  # accuracy on the training split

    for est in estimators:
        bagging_clf = BaggingClassifier(decision_clf, n_estimators=est,
                                        max_samples=0.67, max_features=0.67,
                                        bootstrap=True, random_state=9)
        bagging_clf.fit(X_train, y_train)

        # test line
        y_pred_bagging1 = bagging_clf.predict(X_test)
        scores1.append(accuracy_score(y_test, y_pred_bagging1))

        # train line
        y_pred_bagging2 = bagging_clf.predict(X_train)
        scores2.append(accuracy_score(y_train, y_pred_bagging2))

    plt.figure(figsize=(10, 6))
    plt.title('Bagging Info')
    plt.xlabel('Estimators')
    plt.ylabel('Scores')
    plt.plot(estimators, scores1, 'g', label='test line', linewidth=3)
    plt.plot(estimators, scores2, 'c', label='train line', linewidth=3)
    plt.legend()
    plt.show()
def main():
    '''Fit a bagged decision tree on iris and print its score on two samples.'''
    model = BaggingClassifier(DecisionTreeClassifier())
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    # The whole data set is used for training; no hold-out split is made.
    model.fit(features, labels)

    first_two_x, first_two_y = features[:2], labels[:2]
    model.predict(first_two_x)  # result intentionally unused, as before
    print(model.score(first_two_x, first_two_y))
def bagging_with_base_estimator(base_estimator, x_train, x_test, y_train, y_test, rands = None):
    """
    Fit a 300-estimator Bagging Classifier around ``base_estimator`` and
    report its f1-scores, using random fractions for the sample size and the
    number of features.

    ARGS:

        - base_estimator: the estimator wrapped by the BaggingClassifier

        - x_train: :class:`pandas.DataFrame` of the x_training data

        - y_train: :class:`pandas.Series` of the y_training data

        - x_test: :class:`pandas.DataFrame` of the x_testing data

        - y_test: :class:`pandas.Series` of the y_testing data

        - rands: a :class:`tuple` ``(rs, rf)`` used as ``max_samples`` and
          ``max_features``. If falsy, both are drawn with
          ``numpy.random.rand()`` and reported in the returned `Series`

    RETURNS:

        :class:`pandas.Series` with keys ``train-f1``, ``test-f1``, ``rs``
        and ``rf``
    """
    if rands:
        rs, rf = rands[0], rands[1]
    else:
        # Draw the sample fraction first, then the feature fraction; the
        # feature fraction is redrawn until it is at least 0.1.
        rs = numpy.random.rand()
        rf = numpy.random.rand()
        while rf < 0.1:
            rf = numpy.random.rand()

    bc = BaggingClassifier(base_estimator = base_estimator,
                           n_estimators = 300,
                           max_samples = rs,
                           max_features = rf,
                           n_jobs = 1)
    bc.fit(x_train, y_train)

    train_f1 = f1_score(y_train, bc.predict(x_train))
    test_f1 = f1_score(y_test, bc.predict(x_test))

    return pandas.Series({'train-f1': train_f1,
                          'test-f1': test_f1,
                          'rs': rs,
                          'rf': rf})
def baggedDecisionTree(X_train, y_train, X_test, y_test, nEstimators):
    """Fit a bagged decision tree and print its diagnostics.

    Prints the ensemble size, the out-of-bag score, the accuracy on the test
    split and the out-of-bag decision function.

    :param X_train: training features
    :param y_train: training labels
    :param X_test: test features
    :param y_test: test labels
    :param nEstimators: number of trees in the ensemble
    :return: None
    """
    print("\n### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###")
    print("baggedDecisionTree()\n")

    ensemble = BaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        n_estimators=nEstimators,
        bootstrap=True,
        oob_score=True,  # needed for the oob_* attributes printed below
        n_jobs=-1,       # use all available cores
    )
    ensemble.fit(X_train, y_train)
    test_predictions = ensemble.predict(X_test)

    print("nEstimators: " + str(nEstimators))
    print("out-of-bag score: " + str(ensemble.oob_score_))
    print("accuracy score: " + str(accuracy_score(y_test, test_predictions)))
    print("out-of-bag decision function:")
    print(str(ensemble.oob_decision_function_))

    return None
class ADABoost(Base):
    """Bagged, scaled linear classifier (hinge-loss SGD).

    Despite the class name, the fitted model is a ``BaggingClassifier`` of 50
    pipelines, each made of a ``StandardScaler`` and an ``SGDClassifier``.
    """

    def train(self, data = None, plugin=None):
        """ With dataframe train mllib """
        super(ADABoost, self).train(data, plugin)

        # Last column of self.X_train is the target, the rest are features.
        frame = self.X_train
        features = frame.iloc[:, :-1]
        target = frame.iloc[:, -1]

        self.scaler = StandardScaler().fit(features)
        scaled = self.scaler.transform(features)

        # The scaler is also the first pipeline step, so the already-scaled
        # data passes through a scaling step again inside the ensemble;
        # predict() mirrors this by scaling before calling the ensemble.
        pipeline = Pipeline([("Scaler", self.scaler),
                             ("svm", SGDClassifier(loss='hinge'))])
        self.clf = BaggingClassifier(pipeline, n_estimators=50)
        self.clf.fit(scaled, target)

    def predict(self, file, plugin=None):
        """Scale ``file.vector[plugin]`` and return the tag for the prediction."""
        super(ADABoost, self).predict(file, plugin)
        scaled = self.scaler.transform(file.vector[plugin])
        return self.getTag(self.clf.predict(scaled))
def test_bagging_classifier_with_missing_inputs():
    """BaggingClassifier must pass missing/infinite values on to its estimator.

    With a pipeline that cleans the input the ensemble fits and predicts; with
    a bare decision tree the ``ValueError`` of the wrapped estimator must
    surface through the ensemble.
    """
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y = np.array([3, 6, 6, 6, 6])

    # Pipeline whose first step is the `replace` transformer: usable alone ...
    cleaning_pipeline = make_pipeline(
        FunctionTransformer(replace, validate=False),
        DecisionTreeClassifier(),
    )
    cleaning_pipeline.fit(X, y).predict(X)

    # ... and inside a bagging ensemble.
    bagged = BaggingClassifier(cleaning_pipeline)
    bagged.fit(X, y)
    y_hat = bagged.predict(X)
    assert_equal(y.shape, y_hat.shape)
    bagged.predict_log_proba(X)
    bagged.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    raw_pipeline = make_pipeline(DecisionTreeClassifier())
    assert_raises(ValueError, raw_pipeline.fit, X, y)
    bagged_raw = BaggingClassifier(raw_pipeline)
    assert_raises(ValueError, bagged_raw.fit, X, y)
def train_classifiers(data): train_vars = [ 'X', 'Y', 'Darkness', 'Moon', 'Hour', 'DayOfWeekInt', 'Day', 'Month', 'Year', 'PdDistrictInt', 'TemperatureC', 'Precipitationmm', 'InPdDistrict', 'Conditions', 'AddressCode', ] weather_mapping = { 'Light Drizzle': 1, 'Drizzle': 2, 'Light Rain': 3, 'Rain': 4, 'Heavy Rain': 5, 'Thunderstorm': 6, } data.Precipitationmm = data.Precipitationmm.fillna(-1) data.Conditions = data.Conditions.map(weather_mapping).fillna(0) train, test = split(data) X_train = train[train_vars] y_train = train.CategoryInt X_test = test[train_vars] y_test = test.CategoryInt bdt_real_2 = AdaBoostClassifier( DecisionTreeClassifier(max_depth=8), n_estimators=10, learning_rate=1 ) #bdt_real = DecisionTreeClassifier(max_depth=None, min_samples_split=1, #random_state=6065) bdt_real = BaggingClassifier(base_estimator=bdt_real_2, random_state=6065, n_estimators=100) #bdt_real = RandomForestClassifier(random_state=6065, #n_estimators=200) #bdt_real = ExtraTreesClassifier(random_state=6065, #min_samples_split=5, #n_estimators=200) bdt_real.fit(X_train, y_train) y_predict = pandas.Series(bdt_real.predict(X_test)) print len(y_predict[y_predict == y_test]) print len(y_predict) return bdt_real
def classification(self, x_train, y_train): ml = BaggingClassifier(DecisionTreeClassifier()) ml.fit(x_train, y_train) # print y_train[0] # print x_train[0] y_pred = ml.predict(x_train) print 'y_train ',y_train print 'y_pred ',y_pred.tolist()
def test_warm_start_equivalence():
    """A warm-started 5+5 ensemble must predict like a cold 10-estimator one."""
    data, target = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, random_state=43)

    # Grow the ensemble in two steps: 5 estimators, then 5 more.
    warm = BaggingClassifier(n_estimators=5, warm_start=True,
                             random_state=3141)
    warm.fit(X_train, y_train)
    warm.set_params(n_estimators=10)
    warm.fit(X_train, y_train)
    warm_predictions = warm.predict(X_test)

    # Reference: all 10 estimators fitted in one go with the same seed.
    cold = BaggingClassifier(n_estimators=10, warm_start=False,
                             random_state=3141)
    cold.fit(X_train, y_train)
    cold_predictions = cold.predict(X_test)

    assert_array_almost_equal(warm_predictions, cold_predictions)
def test_sparse_classification():
    """Bagging on sparse input must match dense input and stay sparse.

    For CSC and CSR input and several sampling settings, predictions must
    equal those obtained from dense data, and every base estimator must have
    been fitted on the sparse matrix type.
    """

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super(CustomSVC, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    def new_ensemble(settings):
        # Same seed for the sparse and the dense run so they are comparable.
        return BaggingClassifier(base_estimator=CustomSVC(),
                                 random_state=1,
                                 **settings)

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:
            # Trained on sparse format
            sparse_classifier = new_ensemble(params).fit(X_train_sparse,
                                                         y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = new_ensemble(params).fit(X_train, y_train)
            dense_results = dense_classifier.predict(X_test)

            sparse_type = type(X_train_sparse)
            seen_types = [est.data_type_
                          for est in sparse_classifier.estimators_]

            assert_array_equal(sparse_results, dense_results)
            assert all(seen == sparse_type for seen in seen_types)
def adaboost_train(train_file,test_file): _,x,y = readFile(train_file) print 'reading done.' ts = x.shape[0] id,x2 = readFile(test_file) print x.shape print x2.shape x = np.concatenate((x,x2)) print 'concatenate done.' from sklearn.preprocessing import scale x = scale(x,with_mean=False) print 'scale done.' x2 = x[ts:] x=x[0:ts] from sklearn.feature_selection import SelectKBest,chi2 x = SelectKBest(chi2,k=50000).fit_transform(x,y) from sklearn.cross_validation import train_test_split tmp_array = np.arange(x.shape[0]) train_i, test_i = train_test_split(tmp_array, train_size = 0.8, random_state = 500) train_x = x[train_i] test_x = x[test_i] train_y = y[train_i] test_y = y[test_i] from sklearn.ensemble import BaggingClassifier bagging = BaggingClassifier(LR(penalty='l2',dual=True),n_estimators = 10,max_samples=0.6,max_features=0.6) bagging.fit(train_x,train_y) print 'train done.' res = bagging.predict(train_x) print res from sklearn.metrics import roc_auc_score score = roc_auc_score(train_y,res) res = bagging.predict_proba(train_x) print res score = roc_auc_score(train_y,res[:,1]) print score print '-----------------------------------------' print res[:,1] res = bagging.predict_proba(test_x) score = roc_auc_score(test_y,res[:,1]) print score y=bagging.predict_proba(x2) output = pd.DataFrame( data={"id":id, "sentiment":y[:,1]} ) output.to_csv( "/home/chuangxin/Bagging_result.csv", index=False, quoting=3 ) return bagging
class BaggingLearner(AbstractLearner):
    """Learner backed by a ``BaggingClassifier`` of k-nearest-neighbour models."""

    def __init__(self):
        base = KNeighborsClassifier()
        self.learner = BaggingClassifier(base)

    def _train(self, x_train, y_train):
        """Fit the ensemble and keep the object returned by ``fit``."""
        fitted = self.learner.fit(x_train, y_train)
        self.learner = fitted

    def _predict(self, x):
        """Return the predicted labels for ``x``."""
        return self.learner.predict(x)

    def _predict_proba(self, x):
        """Return the predicted class probabilities for ``x``."""
        return self.learner.predict_proba(x)
class BaggingDecisionTrees(object):
    """Thin wrapper around a default ``BaggingClassifier``.

    Features are accessed through ``.values`` and the target is read from
    the ``'y'`` entry of ``ys``.
    """

    def __init__(self, n_estimators):
        self.classifier = BaggingClassifier(n_estimators=n_estimators)

    def fit(self, xs, ys):
        """Fit on ``xs.values`` against ``ys['y']``; returns None."""
        features = xs.values
        target = ys['y']
        self.classifier.fit(features, target)

    def predict(self, xs):
        """Return the predicted labels for ``xs.values``."""
        return self.classifier.predict(xs.values)
class SVMBag(DMCClassifier):
    """Bagging ensemble of one-vs-one SVCs operating on densified input."""

    classifier = None
    # Ensemble settings handed to BaggingClassifier in __init__.
    estimators = 10
    max_features = .5
    max_samples = .5

    def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False):
        super().__init__(X, Y, tune_parameters)
        # Stored densified: X is converted with toarray().
        self.X = X.toarray()
        self.Y = Y
        self.classifier = SVC(decision_function_shape='ovo')
        self.clf = BaggingClassifier(
            self.classifier,
            n_estimators=self.estimators,
            n_jobs=8,
            max_samples=self.max_samples,
            max_features=self.max_features,
        )

    def predict(self, X: csr_matrix):
        """Densify ``X`` and return the ensemble's predictions."""
        dense = X.toarray()
        return self.clf.predict(dense)
class BaggingClassifier(Classifier): def __init__(self, matrixdatabase): self._matrix_database = matrixdatabase self._has_fit = False self._bc = BC(n_estimators=10) def learn(self, ingredients, cuisine): return def classify(self, ingredients): if not self._has_fit: matrix, classes = self._matrix_database.make_train_matrix() self._bc = self._bc.fit(matrix, classes) print "Fitting complete..." self._has_fit = True output = self._bc.predict(self._matrix_database.make_row_from_recipe(ingredients)) return output[0]
def make_bagging_test(): from sklearn.ensemble import BaggingClassifier x,y,dates,movies = load_data() x = add_missed_value_indicator(x) test_x, train_x, test_y, train_y = create_test_train_set(x, y) clf = BaggingClassifier(n_estimators=100, max_features=1.0,\ max_samples=0.8).fit(train_x, train_y.ix[:,0]) pred = clf.predict(test_x) print "mse:", np.sqrt(np.mean((pred-test_y.ix[:,0])**2)) return pred
def RandomNbSGD(data_train,labels_train,data_test,labels_test,show_infos,n_estima=10): from sklearn.ensemble import BaggingClassifier from sklearn.linear_model import SGDClassifier as SGD from sklearn import cross_validation t1 = time() base_model = SGD(loss = 'modified_huber') # n_estimator = 100 pour perf max clf = BaggingClassifier(base_estimator=base_model, n_estimators=n_estima) y_score3 = clf.fit(data_train, labels_train) labels_predicted= clf.predict(data_test) t2=time() -t1 if(show_infos == True): print "-------------------Vectorizing and fitting the SGD with a modified_huber loss took %s"%t2,"sec---------------" print "classification report" print classification_report(labels_test, labels_predicted) print "the accuracy score on the test data is :", accuracy_score(labels_test, labels_predicted) scores = cross_validation.cross_val_score(clf, data_train, labels_train, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def svm_class_and_score(
        X_train, y_train, X_test, y_test, labels,
        search_type=RandomizedSearchCV,
        parameter_space={
            'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['auto', 1e-3, 1e-4],
            'C': [0.01, .1, 1, 10, 100, 1000],
            'class_weight': [
                {0: 0.01}, {1: 1}, {1: 2}, {1: 10}, {1: 50}, 'balanced']},
        score='recall_weighted', iid=True, bagged=False, svm_results=True):
    """Tune an SVM by hyper-parameter search and print its scoring metrics.

    The search object is fitted on the training data and scored on the test
    data with ``clf_scoring``. With ``bagged=True`` the fitted search object
    is additionally wrapped in a ``BaggingClassifier``, which is fitted and
    scored the same way.

    Returns the fitted search object, or the tuple
    ``(search object, bagging classifier)`` when ``bagged is True``.
    """
    def report(title, predictions):
        # Two blank lines, the heading, then the project's scoring output.
        print()
        print()
        print(title)
        clf_scoring(y_test, predictions, labels)

    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Find the hyper-parameters and build the SVM
    clf = search_type(SVC(C=1), parameter_space, cv=10,
                      scoring=score, iid=iid)
    clf.fit(X_train, y_train)
    print("Hyperparameters found:")
    print(clf.best_params_)

    report("Results for basic SVM", clf.predict(X_test))

    if bagged is not True:
        return clf

    bgg = BaggingClassifier(base_estimator=clf)
    bgg.fit(X_train, y_train)
    report("Results for bagging:", bgg.predict(X_test))
    return clf, bgg
).fit(x_local_train, y_local_train) else: vprint(verbose, "[-] task not recognized") break vprint(verbose, "[+] Fitting success, time spent so far %5.2f sec" % (time.time() - start)) # Make predictions on local validation set if task == 'binary.classification': y_local_valid_pred = M.predict_proba(x_local_valid)[:, 1] elif task == 'multiclass.classification': y_local_valid_pred = M.predict_proba(x_local_valid).T elif task == 'multilabel.classification': y_local_valid_pred = np.array([Ms[i].predict_proba(x_local_valid)[:, 1] for i in range(K)]).T elif task == 'regression': y_local_valid_pred = M.predict(x_local_valid) # Local validation # x_local_valid, y_local_valid metric_type = D.info['metric'] if 'f1_metric' == metric_type: metric = f1_metric(y_local_valid, y_local_valid_pred) elif 'r2_metric' == metric_type: metric = r2_metric(y_local_valid, y_local_valid_pred) elif 'bac_metric' == metric_type: metric = bac_metric(y_local_valid, y_local_valid_pred) elif 'auc_metric' == metric_type: metric = auc_metric(y_local_valid, y_local_valid_pred) elif 'pac_metric' == metric_type: metric = pac_metric(y_local_valid, y_local_valid_pred)
# Report and confusion matrix for the predictions in `ada_predictions_valid`,
# which are computed before this fragment.
print(
    classification_report(Test_Y, ada_predictions_valid, target_names=target_names))
plot_confusion_matrix(Test_Y, ada_predictions_valid, class_names, title='Confusion matrix, without normalization')
plt.show()

# Bagging: 100 copies of `clf`, each fitted on half of the training samples
# and all of the features.
bag = BaggingClassifier(n_estimators=100, base_estimator=clf, max_samples=0.5, max_features=1.0)
bag.fit(Train_X_Count, Train_Y)
bag_predictions_valid = bag.predict(Test_X_Count)
# Accuracy is reported as a percentage.
print("Bagging Score -> ", accuracy_score(bag_predictions_valid, Test_Y) * 100)
print(
    classification_report(Test_Y, bag_predictions_valid, target_names=target_names))
plot_confusion_matrix(Test_Y, bag_predictions_valid, class_names, title='Confusion matrix, without normalization')
plt.show()
# F1 is the harmonic mean of the precision and recall computed earlier.
f1 = 2*(precision*recall)/(precision + recall)
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F1: " + str(f1))
tn, fp, fn, tp = confusion_matrix(y_test, final_pred).ravel()
print(classification_report(y_test, final_pred))

# Bagged decision trees: 20 trees, each on half of the samples, all features.
bg = BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5, max_features=1.0, n_estimators=20)
bg.fit(x_train,y_train)
# Each score is computed once, where it is printed (the original also
# evaluated both scores a first time and discarded the results).
print('train bagging score: ', bg.score(x_train,y_train))
print('test bagging score: ', bg.score(x_test,y_test))
final_pred = bg.predict(x_test)

from sklearn.metrics import roc_curve, classification_report
from sklearn.metrics import auc
# A ROC curve needs a continuous score: hard 0/1 predictions collapse the
# curve to a single operating point. Use the probability of the second class
# (classes_[1]) instead; `final_pred` keeps the hard labels for other uses.
bagging_scores = bg.predict_proba(x_test)[:, 1]
false_positive, true_positive, threshold = roc_curve(y_test, bagging_scores)
roc_auc = auc(false_positive, true_positive)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive, true_positive, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
def on_click(self):
    """Train six classifiers on the well-log data and report the best one.

    Reads ``training-data/Well-all.csv``, drops incomplete rows and holds out
    20% of the rows for testing. For each model (bagged k-nearest neighbours,
    random forest, Gaussian naive Bayes, decision tree, logistic regression,
    SVM) the method

    * fits the model and shows its test accuracy in the model's text box,
    * predicts every row of ``10thwell/Well-10_log_data.csv`` and writes the
      result to a CSV file under ``output/``.

    Finally the best-scoring model is shown in ``self.textbox6``. If several
    models share the top accuracy, the first one in the order KNN, random
    forest, logistic regression, SVM, decision tree, naive Bayes is reported
    (the original chain of strict comparisons reported naive Bayes on any tie,
    even when it was the worst model).
    """
    df = pd.read_csv('training-data/Well-all.csv')
    df.dropna(inplace=True)

    x = np.array(df.drop(['Lithology'], axis=1))
    y = np.array(df['Lithology'])

    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.20)  # 20% test data

    # The prediction well is the same for every model: load it once.
    well_rows = np.array(
        pd.read_csv('10thwell/Well-10_log_data.csv').drop(['LITHOLOGY'],
                                                          axis=1))

    def _write_predictions(model, out_path):
        """Predict each row of the prediction well and write it as CSV."""
        with open(out_path, "w") as f:
            f.write("Lithology,DEPTH,SGR,NPHI,RHOB,DT\n")
            for sample in well_rows:
                # Columns 0-3 are the model inputs; column 4 is written in
                # the DEPTH position of the output row.
                example_measures = np.array(
                    [sample[0], sample[1], sample[2], sample[3]])
                example_measures = example_measures.reshape(1, -1)
                prediction = model.predict(example_measures)
                f.write(
                    f"{prediction[0]},{sample[4]},{sample[0]},{sample[1]},{sample[2]},{sample[3]}\n"
                )

    # (display name, model, accuracy text box, prediction file), listed in
    # the order in which the models are trained.
    runs = [
        ("K- Nearest Neighbours",
         BaggingClassifier(neighbors.KNeighborsClassifier(),
                           max_samples=0.5, max_features=0.5),
         self.textbox, "output/KNNprediction.csv"),
        ("Random Forest",
         RandomForestClassifier(n_estimators=100),
         self.textbox1, "output/RFprediction.csv"),
        ("Naive Bayes",
         GaussianNB(),
         self.textbox2, "output/NBprediction.csv"),
        ("Decision Tree",
         DecisionTreeClassifier(criterion="gini", random_state=100,
                                max_depth=3, min_samples_leaf=5),
         self.textbox3, "output/DecisionTreeprediction.csv"),
        ("Logistic Regression",
         LogisticRegression(),
         self.textbox4, "output/LRprediction.csv"),
        ("Support Vector Machine",
         svm.SVC(gamma='auto'),
         self.textbox5, "output/SVMprediction.csv"),
    ]

    accuracy = {}
    for name, estimator, textbox, out_path in runs:
        estimator.fit(x_train, y_train)
        accuracy[name] = estimator.score(x_test, y_test)
        textbox.setText(f"{accuracy[name]*100:1.4f} %")
        _write_predictions(estimator, out_path)

    # max() returns the first maximal element, so this list also defines the
    # tie-break priority.
    priority = [
        "K- Nearest Neighbours",
        "Random Forest",
        "Logistic Regression",
        "Support Vector Machine",
        "Decision Tree",
        "Naive Bayes",
    ]
    best_name = max(priority, key=accuracy.__getitem__)
    best = f"{best_name} with Accuracy : {accuracy[best_name]*100:1.4f} %"
    self.textbox6.setText(best)
# In[ ]:

from sklearn.metrics import accuracy_score

# Baseline: a single decision tree.
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print("decision tree train/test accuracies {} / {}".format(
    tree_train, tree_test))

# In[ ]:

# Bagging ensemble. The training predictions must be stored in
# `y_train_pred`: the original assigned them to the misspelled
# `y_trian_pred`, so the bagging train accuracy was computed from the
# decision tree's predictions.
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)

bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(bag_test)
print("bagging train/test accuracies {} / {}".format(bag_train, bag_test))

# In[ ]:

# Plot range: one unit of margin around the first two feature columns.
X_min = X_train[:, 0].min() - 1
X_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(X_min, X_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# NOTE(review): the next statement is cut off in the source; it is kept here
# as a comment so the cell stays syntactically valid:
# f, axarr = plt.subplots(nrows=1,
# training scores
print("Training scores...")
print(bdt.score(x_train, y_train))
print(bagged.score(x_train, y_train))
print(rfc.score(x_train, y_train))

# score the classifier on the test set
# print "Scoring..."
# print bdt.score(x_test, y_test)
# print bagged.score(x_test, y_test)
# print rfc.score(x_test, y_test)

# print "Writing predictions..."
predictions1 = bdt.predict(x_test)
predictions2 = bagged.predict(x_test)
predictions3 = rfc.predict(x_test)

# Majority vote of the three models: label 1 when the three predictions sum
# to more than 1. Iterating over the predictions themselves replaces the
# hard-coded 100, so the vote covers however many test rows there are.
predictions = [
    1 if p1 + p2 + p3 > 1 else 0
    for p1, p2, p3 in zip(predictions1, predictions2, predictions3)
]

# The context manager closes (and thereby flushes) the file; the original
# left the handle open.
with open('/Users/LeiyaMa/Desktop/binary/predictions.csv', 'w') as f:
    f.write('SID,Label\n')
    for i, label in enumerate(predictions):
        f.write('Sbj' + str(i + 1) + ',' + str(int(label)) + '\n')

################################################################################
# 10-fold cross-validation of a bagging ensemble of logistic regressions.
# The last column of `data` is the target; `X` is defined before this
# fragment.
y=data[:,-1]
acc=[]
kf=KFold(n_splits=10)
i=0
tp=[]
tn=[]
fp=[]
fn=[]
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = linear_model.LogisticRegression()
    model = BaggingClassifier(base_estimator=clf,n_estimators=10,max_features=24)
    model.fit(X_train, y_train)
    y_pred= model.predict(X_test)
    acc.append(metrics.accuracy_score(y_test,y_pred)*100)
    # Compute the confusion matrix once per fold instead of once per cell.
    # Rows are true labels, columns predicted labels; the names below keep
    # the original mapping, which treats the first label as the positive one.
    cm = metrics.confusion_matrix(y_test,y_pred)
    tp.append(cm[0][0])
    tn.append(cm[1][1])
    fp.append(cm[1][0])
    fn.append(cm[0][1])
acc=np.array(acc)
tp=np.array(tp)
tn=np.array(tn)
fp=np.array(fp)
fn=np.array(fn)
# Averages over the 10 folds.
print("Accuracy",acc.mean())
print('tp',tp.mean())
print('tn',tn.mean())
print('fp',fp.mean())
softXValScore # ============================================================================= # ============================================================================= # ============================================================================= # # # Bagging # ============================================================================= # ============================================================================= # ============================================================================= bc = BaggingClassifier(base_estimator=vc, n_estimators=300, n_jobs=-1) bc.fit(X_train, y_train) y_pred = bc.predict(X_test) calculateTestAccuracy(bc) calculateTrainAccuracy(bc)
def model(boosting_name, data_name, classifier_name, cv_name, mode):
    """
    Template method: cross-validate one ensemble method on one data set.

    For every fold the data is min-max scaled (fitted on the training part),
    the selected classifier is trained, precision / recall / f1 / AUC /
    g-mean are recorded through ``fill_dic`` and the fold's ROC curve is
    interpolated onto a common grid. Afterwards the mean ROC curve is written
    to ``./ROC/<data_name>/<mode>/<classifier_name>/<boosting_name>.csv``.
    'EasyEnsemble' and 'BalanceCascade' are delegated to ``model_under`` per
    fold and produce no ROC file here.

    :param boosting_name: ensemble learning method
    :param data_name: data set name
    :param classifier_name: base classifier to use ('CART' or 'svm')
    :param cv_name: cross-validation scheme
    :param mode: sampling mode (only used in the output path)
    :return: None
    """
    # Load the data. fetch_datasets() is called once and reused; the
    # original called it twice, once for the membership test and once for
    # the lookup.
    benchmark_sets = fetch_datasets()
    if data_name in benchmark_sets:
        dataset = benchmark_sets[data_name]
        X = dataset.data
        y = dataset.target
        print(Counter(y))
    else:
        # Load a custom data set: last column is the target.
        df = pd.read_csv('../imbalanced_data/%s.csv' % data_name, header=None)
        array = df.values.astype(float)
        X = array[:, 0:array.shape[1] - 1]
        y = array[:, -1]
        print(Counter(y))

    base = None
    if classifier_name == 'CART':
        base = tree.DecisionTreeClassifier(max_depth=8, random_state=42, min_samples_split=10)
    elif classifier_name == 'svm':
        # NOTE(review): SVC() is created without probability=True while
        # predict_proba is called on the classifier below; confirm that
        # every ensemble used with 'svm' copes with that.
        base = svm.SVC()

    # Start time
    start_time = time.time()

    cv = None
    if cv_name == 'StratifiedKFold':
        cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    elif cv_name == 'RepeatedStratifiedKFold':
        cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=42)

    mean_tpr = 0.0
    # Interpolation grid, so every fold contributes tpr at the same fpr values.
    mean_fpr = np.linspace(0, 1, 100)
    aucs = []

    for train, test in cv.split(X, y):
        # Preprocessing: scaler fitted on the training part only.
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])

        classifier = None
        if boosting_name == 'CART':
            classifier = base
        elif boosting_name == 'Bagging':
            classifier = BaggingClassifier(base_estimator=base, n_estimators=40)
        elif boosting_name == 'BalancedBagging':
            classifier = BalancedBaggingClassifier(base_estimator=base,
                                                   ratio='auto',
                                                   replacement=True,
                                                   random_state=42)
        elif boosting_name == 'Adaboost':
            classifier = AdaBoostClassifier(base_estimator=base, n_estimators=40)
        elif boosting_name == 'Random Forest':
            classifier = RandomForestClassifier(max_depth=8, min_samples_split=10, n_estimators=40,
                                                random_state=42)
        elif boosting_name == 'EasyEnsemble':
            model_under(boosting_name, X_train_minmax, y[train], X_test_minmax, y[test])
            continue
        elif boosting_name == 'BalanceCascade':
            model_under(boosting_name, X_train_minmax, y[train], X_test_minmax, y[test])
            continue
        elif boosting_name == 'SMOTEBoost':
            classifier = SMOTEBoost(rate=100, n_estimators=40, weak_estimator=base,
                                    random_state=42, class_dist=False)
        elif boosting_name == 'RUSBoost':
            classifier = RUSBoost(ratio=50, n_estimators=40, weak_estimator=base,
                                  random_state=42, class_dist=False)

        classifier.fit(X_train_minmax, y[train])  # sampling happens here
        predict = classifier.predict(X_test_minmax)

        probability = classifier.predict_proba(X_test_minmax)[:, 1]

        # Metrics
        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            # Avoid 0/0 when both precision and recall are zero.
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        accuracy = metrics.accuracy_score(y[test], predict)

        # Step 6: points of this fold's ROC curve.
        fpr, tpr, thresholds = metrics.roc_curve(y[test], probability)
        # Interpolate tpr onto the common mean_fpr grid and accumulate.
        mean_tpr += interp(mean_fpr, fpr, tpr)
        # Pin the first point of the accumulated curve to tpr = 0 at fpr = 0.
        mean_tpr[0] = 0.0
        roc_auc = metrics.auc(fpr, tpr)
        aucs.append(roc_auc)

        # Record the fold's metrics
        fill_dic('precision', boosting_name, precision)
        fill_dic('recall', boosting_name, recall)
        fill_dic('f1', boosting_name, f1)
        fill_dic('auc', boosting_name, auc)
        fill_dic('gmean', boosting_name, gmean)

    if boosting_name != 'EasyEnsemble' and boosting_name != 'BalanceCascade':
        # Average the accumulated tpr over all splits at each of the 100
        # mean_fpr points, then write fpr/tpr to file.
        mean_tpr /= cv.get_n_splits()
        # The last point of the curve is (1, 1).
        mean_tpr[-1] = 1.0
        # Mean AUC
        mean_auc = metrics.auc(mean_fpr, mean_tpr)

        filename = './ROC/{data_name}/{mode}/{base_classifier}/{sampler}.csv'.format(
            data_name=data_name, mode=mode,
            base_classifier=classifier_name, sampler=boosting_name)
        # Create the (possibly nested) output directory if it is missing.
        file_dir = os.path.split(filename)[0]
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)

        # Two columns: mean fpr, mean tpr. Named `roc_points` rather than
        # `all` so the builtin is not shadowed.
        roc_points = np.c_[mean_fpr, mean_tpr]
        np.savetxt(filename, roc_points, delimiter=',', fmt='%f')

    print('%s building id transforming took %fs!' % (boosting_name, time.time() - start_time))
n_jobs=1, random_state=None, verbose=0): """ algo = BaggingClassifier(base_estimator=dtree, n_estimators=10, oob_score=True) # 模型训练 algo.fit(X_train, Y_train) # 模型效果评估 print('训练集上的准确率:{}'.format(algo.score(X_train, Y_train))) print('测试集上的准确率:{}'.format(algo.score(X_test, Y_test))) # 查看下API属性 X_test = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]] print('样本的预测值:') print(algo.predict(X_test)) print('样本预测值概率:') print(algo.predict_log_proba(X_test)) print('样本预测概率值的Log转换:') print(algo.predict_log_proba(X_test)) # print('训练好的所有子模型:{}'.format(algo.estimators_)) for index, estimators in enumerate(algo.estimators_): print('第{}个子模型对于数据的预测值为:{}'.format(index + 1, algo.predict(X_test))) # 就是有放回的抽样获取的数据子集 print('每个子模型的训练数据:\n{}'.format(algo.estimators_samples_)) print('每个子模型的训练数据使用的特征属性:\n{}'.format(algo.estimators_features_)) print('Bagging模型的袋外准确率:\n{}'.format(algo.oob_score_)) # 所有子模型可视化 for index, estimators in enumerate(algo.estimators_):
def _fit_and_report(name, banner, model, splits, f1_average='macro',
                    fitted_msg=None):
    """Fit one model, print its metrics and return ``(result_row, y_pred)``.

    Args:
        name: Label stored as the first element of the result row.
        banner: Text printed before fitting starts.
        model: Unfitted estimator (or GridSearchCV wrapper) with
            ``fit``/``predict``/``score``.
        splits: Tuple ``(X_train, X_test, y_train, y_test)``.
        f1_average: ``average`` argument forwarded to ``metrics.f1_score``.
        fitted_msg: Optional text printed right after fitting finishes.

    Returns:
        ``([name, accuracy_train, accuracy_test, f1score, roc, precision,
        recall], y_pred)``.
    """
    X_train, X_test, y_train, y_test = splits
    print(banner)
    fitted = model.fit(X_train, y_train)
    if fitted_msg is not None:
        print(fitted_msg)
    y_pred = fitted.predict(X_test)

    # NOTE(review): for the GridSearchCV-wrapped models ``score`` uses the
    # configured scoring ('roc_auc'), so these two values are presumably ROC
    # AUC rather than accuracy there -- confirm against the scikit-learn docs.
    accuracy_train = fitted.score(X_train, y_train)
    accuracy_test = fitted.score(X_test, y_test)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1score = metrics.f1_score(y_test, y_pred, average=f1_average)
    # ROC AUC is computed from hard predictions, as in the original code.
    roc = metrics.roc_auc_score(y_test, y_pred)

    print('accuracy_train:' + str(accuracy_train))
    print('accuracy_test:' + str(accuracy_test))
    print('f1score:' + str(f1score))
    print('roc_auc_score:' + str(roc))
    print('recall:' + str(recall))
    print('precision:' + str(precision))

    row = [name, accuracy_train, accuracy_test, f1score, roc, precision,
           recall]
    return row, y_pred


def machineRun(balancing):
    """Train a fixed battery of classifiers on crowd-labelled abstracts.

    Texts come from ``_load_data``; labels come from ``crowd_main(0)`` (the
    first question's label), keyed by pmid. The texts are TF-IDF vectorised,
    split 50/50 (stratified, ``random_state=42``) and every model is fitted on
    the training half and evaluated on the test half.

    Args:
        balancing: When greater than 0, keep every label-1 item and a random
            sample of ``len(label-1 items) * balancing`` label-0 items.
            ``random.sample`` raises ``ValueError`` if more items are
            requested than exist.

    Returns:
        ``(result, classifiers, y_test)`` where ``result`` is a list of
        ``[name, accuracy_train, accuracy_test, f1score, roc, precision,
        recall]`` rows, ``classifiers`` maps ``'0'``..``'8'`` to the test-set
        predictions of each recorded model, and ``y_test`` is the held-out
        label array.

    Raises:
        KeyError: If a crowd-vote pmid is missing from the loaded data.
    """
    # The labels stored in the CSV are ignored; crowd votes replace them.
    texts1, _file_labels, pmids1 = _load_data(
        '../output_data/proton-beam-merged.csv')

    getcrowdvotequestion = crowd_main(
        0)  # change the label with first question label!
    pmids = list(getcrowdvotequestion.keys())
    labels = [getcrowdvotequestion[pmid] for pmid in pmids]

    # First row of every pmid, so each lookup is O(1) instead of a list scan.
    first_row = {}
    for row_idx, pmid in enumerate(pmids1):
        first_row.setdefault(pmid, row_idx)
    texts = [texts1[first_row[pmid]] for pmid in pmids]

    if balancing > 0:
        Outscope = [i for i, lab in enumerate(labels) if lab == 0]
        Inscope = [i for i, lab in enumerate(labels) if lab == 1]
        sample = len(Inscope) * balancing
        candid = random.sample(Outscope, sample)  # random sample from out
        # In-scope rows first, then the sampled out-of-scope rows, both in
        # their original relative order.
        keep = Inscope + sorted(candid)
        texts = [texts[i] for i in keep]
        labels = [labels[i] for i in keep]
        pmids = [pmids[i] for i in keep]

    vectorizer = TfidfVectorizer(stop_words="english",
                                 min_df=3,
                                 max_features=50000,
                                 norm='l2')
    X = vectorizer.fit_transform(texts).toarray()
    y = np.array(labels)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        stratify=y,
                                                        test_size=0.5)
    splits = (X_train, X_test, y_train, y_test)

    result = []
    classifiers = {}

    def run(key, name, banner, model, **options):
        """Evaluate one model and record its row (and predictions if keyed)."""
        row, y_pred = _fit_and_report(name, banner, model, splits, **options)
        result.append(row)
        if key is not None:
            classifiers[key] = y_pred

    # Baselines.
    run('0', 'DumClassifierStratified', 'DummyClassifier_stratified',
        DummyClassifier(strategy='stratified', random_state=42))
    run('1', 'DumClassifierMostfrequent', 'DummyClassifier_most_frequent',
        DummyClassifier(strategy='most_frequent', random_state=42))

    # Naive Bayes.
    run('2', 'MultinomialNB', 'Machine 1 MultinomialNaiveBase',
        MultinomialNB())
    run('3', 'BernoulliNB', 'Machine 1 BernoulliNB', BernoulliNB())

    # Linear models trained with SGD, alpha tuned by grid search.
    params_d = {"alpha": 10.0**-np.arange(1, 7)}
    sgd = SGDClassifier(class_weight={1: 2}, random_state=42, penalty='l2')
    run('4', 'SGDl2{1:2}', 'Machine 2 SGD',
        GridSearchCV(sgd, params_d, scoring='roc_auc', cv=3))
    sgd = SGDClassifier(class_weight={1: 1}, random_state=42, penalty='l1')
    run('5', 'SGDl1{1:1}', 'Machine 3 SGD',
        GridSearchCV(sgd, params_d, scoring='roc_auc', cv=3))

    # Random forest.
    RF_clf = RandomForestClassifier(class_weight={1: 5}, random_state=42)
    parameters_RF = {
        'n_estimators': [300],  # 300 is enough
        'max_depth': [20]  # this is good fit
    }
    run('6', 'RF{1:5}', 'Machine 4 RandomForrest',
        GridSearchCV(RF_clf,
                     parameters_RF,
                     n_jobs=-1,
                     scoring='roc_auc',
                     cv=3),
        fitted_msg='RF fitted!')

    # k-nearest neighbours.
    # NOTE(review): unlike every other model, KNN predictions are not stored
    # in ``classifiers`` and its F1 uses the default binary average; both are
    # kept as in the original -- confirm whether that was intended.
    knn_clf = KNeighborsClassifier(weights='uniform')
    parameters_knn = {'n_neighbors': [2, 3, 4]}
    run(None, 'KNN', 'Machine 5 KNN',
        GridSearchCV(knn_clf,
                     parameters_knn,
                     scoring='roc_auc',
                     n_jobs=-1,
                     cv=3),
        f1_average='binary')

    # Gradient boosting.
    GB_clf = GradientBoostingClassifier(random_state=42, max_features=0.1)
    parameters_GB = {'n_estimators': [200], 'learning_rate': [0.1]}
    run('7', 'GB', 'Machine 6 GB',
        GridSearchCV(GB_clf,
                     parameters_GB,
                     scoring='roc_auc',
                     n_jobs=-1,
                     cv=3),
        fitted_msg='GB fitted!')

    # Bagged linear SVCs; each estimator sees 1/n_estimators of the samples.
    n_estimators = 10
    run('8', 'SVCBagging{1:10}', 'Machine 6 baggingWithSVC',
        BaggingClassifier(base_estimator=SVC(kernel='linear',
                                             class_weight={1: 10}),
                          n_estimators=n_estimators,
                          max_samples=1.0 / n_estimators,
                          random_state=42,
                          max_features=0.3),
        fitted_msg='baggingWithSVC fitted!')

    print('*******************************')
    return result, classifiers, y_test
n_estimators=50, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1, random_state=42) # In[ ]: bag_clf.fit(X_train, y_train.ravel()) # In[ ]: print_score(bag_clf, X_train, y_train, X_test, y_test, train=True) print_score(bag_clf, X_train, y_train, X_test, y_test, train=False) # In[ ]: Y_pred = bag_clf.predict(test_df.drop('PassengerId', axis=1)) Y_pred submission = pd.DataFrame({ "PassengerId": test_df["PassengerId"], "Survived": Y_pred }) submission.to_csv('submissions_bag_last.csv', index=False) # In[ ]:
hw5_run_test.py This program runs the identified best classifier on the test dataset Bagging w/ Decision Trees (31 estimators) @author: HyunJae Pi, [email protected] """ import numpy as np import pandas as pd from sklearn import preprocessing #from sklearn import svm from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import BaggingClassifier # training data df0 = pd.read_csv("./training2b.csv", header=None) n_features = df0.shape[1] - 1 X_training = preprocessing.scale(df0.loc[:, 0:n_features - 1].values) y_training = df0.loc[:, n_features].values # test data df1 = pd.read_csv("./test2b.csv", header=None) X_test = preprocessing.scale(df1.loc[:, 0:n_features - 1].values) clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=31).fit(X_training, y_training) y_test = clf.predict(X_test).astype(int) # save np.savetxt('./hw5_prediction.txt', y_test, fmt='%d')
# Load the Covertype data and hold out 20% of the rows for evaluation.
X, y = datasets.fetch_covtype(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Scaling data: the scaler is fitted on the training rows only and then
# applied to both partitions.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

# Plain k-nearest-neighbours reference model.
n_neighbors = 5
modelo = KNeighborsClassifier(n_neighbors)
modelo.fit(X_train, y_train)
print(modelo.predict(X_test))

# Bagging: KNN base estimators, each trained on 30% of the rows and
# 30% of the features.
modeloB = BaggingClassifier(
    KNeighborsClassifier(n_neighbors),
    max_samples=0.3,
    max_features=0.3,
)
modeloB.fit(X_train, y_train)
print(modeloB.predict(X_test))

# Bagging 2: same sampling fractions with the default base estimator.
modeloB2 = BaggingClassifier(
    n_estimators=10,
    max_samples=0.3,
    max_features=0.3,
)
modeloB2.fit(X_train, y_train)
print(modeloB2.predict(X_test))

# True labels followed by the test score of every model, in fitting order.
print(y_test)
for fitted_model in (modelo, modeloB, modeloB2):
    print(fitted_model.score(X_test, y_test))
# Fraction of correct predictions from the preceding cell.
print(n_correct / len(y_pred))

# In[72]:

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# In[73]:

# 500 bagged decision trees, each fitted on 100 bootstrapped samples.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
bag_clf.fit(X_train, y_train)
# Predictions are made on the training data itself.
y_pred = bag_clf.predict(X_train)

# In[74]:

# Same ensemble size, now with out-of-bag scoring enabled.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    oob_score=True,
    bootstrap=True,
    n_jobs=-1,
)

# In[75]:

bag_clf.fit(X_train, y_train)

# In[76]:
for clf in best_pool: results = defaultdict(float) y = clf.predict(x_test.reshape(1, -1)) results[y[0]] += 1 y = max(results.iteritems(), key=operator.itemgetter(1))[0] y_pred.append(y) return y_pred if __name__ == '__main__': X, y = make_classification(n_samples=1000, n_features=20, class_sep=0.7, flip_y=0.03) x_test, y_test = X[0], y[0] X, y = X[1:], y[1:] X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2) X_train, X_test, y_train, y_test = train_test_split(X_train, y_train \ , test_size=0.2) bag = BaggingClassifier(n_estimators=200) bag.fit(X_train, y_train) knora = KNORA(ensemble_clf=bag, knn=8, X_val=X_val, y_val=y_val) meta = IbepMlc(ensemble_clf=bag, knn=8, X_val=X_val, y_val=y_val) print accuracy_score(bag.predict(X_test), y_test) print accuracy_score(knora.predict(X_test), y_test) print accuracy_score(meta.predict(X_test), y_test)
n_estimators=1500) results = model_selection.cross_val_score(bg, X_final_train, y, cv=5) print(results.mean()) # print(bg.score(X_final_train, y)) # Timer stops stop = timeit.default_timer() print("Time Execution: {}".format(stop - start)) #------------------------------End of Baggin classifier---------------------- #-----------------------------FINAL TEST PURPOSE ONLY----------------------- X_final_train_cv = stemmed_cv.fit_transform(X) X_final_train = tfidf_vectorizer.fit_transform(X_final_train_cv) df_final = pd.read_csv("reddit_test.csv") X_final_test = df_final["comments"].values X_final_test_cv = stemmed_cv.transform(X_final_test) X_final_test = tfidf_vectorizer.transform(X_final_test_cv) # mnb.fit(X_final_train, y) # y_final = mnb.predict(X_final_test) bg.fit(X_final_train, y) y_final = bg.predict(X_final_test) predict_arr = np.c_[df_final["id"], y_final] predict_dataset = pd.DataFrame({ "Id": predict_arr[:, 0], "Category": predict_arr[:, 1] }) predict_dataset.to_csv("out_mnb2.csv", index=False) #--------------------------END OF FINAL TEST-----------------------------------
print(roc_score)

# Code ends here

# --------------
# Import Bagging Classifier
from sklearn.ensemble import BaggingClassifier

# Code starts here
# 100 bagged decision trees, each trained on 100 samples.
bagging_clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=100,
    random_state=0,
)
bagging_clf.fit(X_train, y_train)

# Hard class predictions and mean accuracy on the test split.
y_pred = bagging_clf.predict(X_test)
score_bagging = bagging_clf.score(X_test, y_test)
print(score_bagging)

# The ROC AUC here is computed from the hard predictions above.
roc_score = roc_auc_score(y_test, y_pred)
print(roc_score)

# Code ends here

# --------------
# Import libraries
from sklearn.ensemble import VotingClassifier

# Various models
clf_1 = LogisticRegression()
def test():
    """Train a bagged SVC per training language and score it on every test set.

    Feature files are read as one JSON object per line with the keys
    'feature', 'wordform' and 'label'. For every training language (plus the
    pooled 'c7' group) a classifier is fitted on the first 17 feature columns
    and evaluated against every test language and token count. Per-word
    predictions go to './predictlabel/...' files and the accuracy tables are
    appended, one JSON line per training language, to '<token count>result'.

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    text; the placement of the flush/close calls and of the final fb.write is
    the most plausible reading -- confirm against the upstream file.
    """
    traintokenCnt = [20000000]
    testtokenCnt = [500000,1000000,5000000,10000000,20000000]
    trainFeatDic = dict()
    # Languages whose training rows are additionally pooled under key 'c7'.
    c7 = set(['bg','ca','de','el','hu','tr','hi'])
    # --- Load the training features, keyed by token count, then language. ---
    for totaltoken in traintokenCnt:
        c7Feature = []
        c7Label = []
        c7Word = []
        trainFeatDic[totaltoken] = dict()
        for trainlangKey in __W2cTrainCorpusDic:
            trainfilepath = '../feature/train/' + trainlangKey + '/' + str(totaltoken) + '.txt'
            trainFeatDic[totaltoken][trainlangKey] = dict()
            trainFeatDic[totaltoken][trainlangKey]['feature'] = []
            trainFeatDic[totaltoken][trainlangKey]['wordform'] = []
            trainFeatDic[totaltoken][trainlangKey]['label'] = []
            # NOTE(review): the file handle is never closed explicitly.
            for line in open(trainfilepath):
                feat = json.loads(line.strip())
                trainFeatDic[totaltoken][trainlangKey]['feature'].append(feat['feature'])
                trainFeatDic[totaltoken][trainlangKey]['wordform'].append(feat['wordform'])
                trainFeatDic[totaltoken][trainlangKey]['label'].append(feat['label'])
                if trainlangKey in c7:
                    c7Feature.append(feat['feature'])
                    c7Label.append(feat['label'])
                    c7Word.append(feat['wordform'])
        trainFeatDic[totaltoken]['c7'] = dict()
        trainFeatDic[totaltoken]['c7']['feature'] = c7Feature
        trainFeatDic[totaltoken]['c7']['wordform'] = c7Word
        trainFeatDic[totaltoken]['c7']['label'] = c7Label
    # --- Load the test features with the same layout. ---
    testFeatDic = dict()
    for totaltoken in testtokenCnt:
        testFeatDic[totaltoken] = dict()
        for testlangKey in __W2cTestCorpusDic:
            testfilepath = '../feature/test/' + testlangKey + '/' + str(totaltoken) + '.txt'
            testFeatDic[totaltoken][testlangKey] = dict()
            testFeatDic[totaltoken][testlangKey]['feature'] = []
            testFeatDic[totaltoken][testlangKey]['wordform'] = []
            testFeatDic[totaltoken][testlangKey]['label'] = []
            # NOTE(review): the file handle is never closed explicitly.
            for line in open(testfilepath):
                feat = json.loads(line.strip())
                testFeatDic[totaltoken][testlangKey]['feature'].append(feat['feature'])
                testFeatDic[totaltoken][testlangKey]['wordform'].append(feat['wordform'])
                testFeatDic[totaltoken][testlangKey]['label'].append(feat['label'])
    # --- Train one model per training language and evaluate it everywhere. ---
    for traintotaltoken in trainFeatDic:
        fb = open(str(traintotaltoken) + 'result','w')
        for trainlangKey in trainFeatDic[traintotaltoken]:
            correctDic = dict()
            correctDic[trainlangKey] = dict()
            # Only the first 17 feature columns are used.
            trainX = np.array(trainFeatDic[traintotaltoken][trainlangKey]['feature'])[:,:17]
            scaler = preprocessing.StandardScaler().fit(trainX)
            trainX_scaled = scaler.transform(np.array(trainX))
            trainY = trainFeatDic[traintotaltoken][trainlangKey]['label']
            #clf = BaggingClassifier(KNeighborsClassifier(), max_features=9,\
            #                        bootstrap_features=True)
            clf = BaggingClassifier(svm.SVC(),max_features = 12, bootstrap_features=True)
            clf = clf.fit(trainX_scaled,trainY)
            for testtotaltoken in testFeatDic:
                predictYDic = dict()
                for testlangKey in testFeatDic[testtotaltoken]:
                    testX = np.array(testFeatDic[testtotaltoken][testlangKey]['feature'])[:,:17]
                    testY = testFeatDic[testtotaltoken][testlangKey]['label']
                    # Test rows are scaled with the training scaler.
                    testX_scaled = scaler.transform(np.array(testX))
                    predictY = clf.predict(testX_scaled)
                    # Count predictions that match the gold label.
                    correctCnt = 0
                    for index,labelY in enumerate(predictY):
                        if testY[index] == labelY:
                            correctCnt += 1
                    predictfilename = './predictlabel/' + trainlangKey + '_' + testlangKey + '_'\
                        + str(traintotaltoken) + '_' + str(testtotaltoken) +'.txt'
                    tmpfb = open(predictfilename,'w')
                    # One line per word: wordform, gold label, empty column, prediction.
                    for index,labelY in enumerate(predictY):
                        tmpfb.write(testFeatDic[testtotaltoken][testlangKey]['wordform'][index]\
                            + '\t' + testY[index] + '\t' + '\t' + labelY + '\n')
                    tmpfb.flush()
                    tmpfb.write('\n\nout of vocabulary word prediction\n')
                    oovfile = '../feature/test/' + testlangKey + '/' + str(testtotaltoken) +'_oov.txt'
                    oovcorrectcnt = 0
                    oovtotalcnt = 0
                    # Out-of-vocabulary words are not sent to the classifier:
                    # only digit-pattern words tagged NUM and words tagged
                    # NOUN are counted as correct and written out.
                    for line in open(oovfile):
                        oovtotalcnt += 1
                        word,pos = line.strip().split('\t')[0],line.strip().split('\t')[1]
                        if __digitPatt.match(word) and pos == 'NUM':
                            oovcorrectcnt += 1
                            tmpfb.write(word + '\t' + pos + '\t' + 'NUM' + '\n')
                            tmpfb.flush()
                            continue
                        elif pos == 'NOUN':
                            oovcorrectcnt += 1
                            tmpfb.write(word + '\t' + pos + '\t' + 'NOUN' + '\n')
                    tmpfb.close()
                    # Accuracy over in-vocabulary and OOV words combined.
                    predictYDic[testlangKey] = (correctCnt + oovcorrectcnt) / ((len(testY) + oovtotalcnt) * 1.0)
                correctDic[trainlangKey][testtotaltoken] = predictYDic
            # One JSON line per training language.
            fb.write(json.dumps(correctDic,ensure_ascii=False)+'\n')
            fb.flush()
        fb.close()
# The test file is read up front so that every column's encoder can be fitted
# on the categories of BOTH files. Fitting a fresh encoder on the test frame
# (as the previous version did) assigns integer codes independently of the
# training codes, so the model would see inconsistent feature values.
testDF = pd.read_csv(test)

labelProcessor = preprocessing.LabelEncoder()
for i in range(14):
    # Training feature column i lines up with test column i + 1; test column 0
    # is left out of the prediction slice below.
    labelProcessor.fit(
        pd.concat([df.iloc[:, i], testDF.iloc[:, i + 1]], ignore_index=True))
    df.iloc[:, i] = labelProcessor.transform(df.iloc[:, i])
    testDF.iloc[:, i + 1] = labelProcessor.transform(testDF.iloc[:, i + 1])

Y = df.iloc[:, -1]
X = df.iloc[:, 0:14]

bagger = BaggingClassifier(n_estimators=100, bootstrap_features=True)
bagger = bagger.fit(X, Y)

# Accuracy on the training data itself (not a held-out estimate).
test_predictions = bagger.predict(X)
print(accuracy_score(Y, test_predictions))

# Predict the test rows and write them out next to their IDs.
predictions = bagger.predict(testDF.iloc[:, 1:15])
predictionDF = pd.DataFrame(predictions)
predictionDF["ID"] = testDF["ID"].values
predictionDF.to_csv('Predictions_bagging.csv', index=False,
                    header=['Prediction', 'ID'])
bagging3.fit(df_input3_data,numpy.ravel(df_input3_target)) pickle.dump(bagging3, open('model_bagging_t3.pkl', 'wb')) bagging4 = BaggingClassifier(KNeighborsClassifier(n_neighbors=2),max_samples=0.3, max_features=0.1) bagging4.fit(df_input4_data,numpy.ravel(df_input4_target)) pickle.dump(bagging4, open('model_bagging_t4.pkl', 'wb')) bagging5 = BaggingClassifier(KNeighborsClassifier(n_neighbors=2),max_samples=0.3, max_features=0.1) bagging5.fit(df_input5_data,numpy.ravel(df_input5_target)) pickle.dump(bagging5, open('model_bagging_t5.pkl', 'wb')) # bagging = KMeans(n_clusters=5, random_state=RandomState(9) # bagging.fit(df_input_data,numpy.ravel(df_input_target)) # pickle.dump(bagging, open('model_bagging_train.pkl', 'wb')) predicted1 = bagging1.predict(df_input1_data) predicted2 = bagging2.predict(df_input2_data) predicted3 = bagging3.predict(df_input3_data) predicted4 = bagging4.predict(df_input4_data) predicted5 = bagging5.predict(df_input5_data) # predicted = bagging.predict(df_input_data) matches1 = (predicted1 == [item for sublist in df_input1_target for item in sublist]) matches2 = (predicted2 == [item for sublist in df_input2_target for item in sublist]) matches3 = (predicted3 == [item for sublist in df_input3_target for item in sublist]) matches4 = (predicted4 == [item for sublist in df_input4_target for item in sublist]) matches5 = (predicted5 == [item for sublist in df_input5_target for item in sublist]) # matches = (predicted == [item for sublist in df_input_target for item in sublist]) print 'using excess rock & uncats removed' print "Accuracy of T1 : ", (matches1.sum() / float(len(matches1)))
# Render the exported decision-tree graph description to a PDF file.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("决策树.pdf")

# Disabled experiment: a small dense neural network.
# model=Sequential()
# model.add(Dense(2*(X_train.shape[1]),input_shape=((X_train.shape[1]),)))
# model.add(Activation('relu'))
# model.add(Dense(1))
# model.add((Dropout(0.3)))
# model.compile(loss='mean_squared_error', optimizer='adam')
# model.summary()
#
# model.fit(X_train,y_train,epochs=10000,batch_size=50 )

# Disabled experiment: a support vector classifier.
# svmmodel=SVC()
# svmmodel.fit(X_train,y_test)

# Predict the test set with each fitted model and persist the model to disk
# right after its predictions are taken.
t=bagging_clf.predict(X_test)
joblib.dump(bagging_clf,'clf.model')
z=treemodel.predict(X_test)
joblib.dump(treemodel,'treemodel.model')
w=randomtree.predict(X_test)
joblib.dump(randomtree,'randomtree.model')
s=sgd.predict(X_test)
joblib.dump(sgd,'sgd.model')
# m=model.predict(X_test)
# model.save('NNmodel.h5')
rate1=0
# Re-weight the raw term counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

seed = 7
# The folds are not shuffled, so no random_state is passed: recent
# scikit-learn releases reject random_state when shuffle is False, and
# earlier ones ignored it, so the splits are unchanged.
kfold = model_selection.KFold(n_splits=10)

# 100 bagged decision trees, fitted on the full training matrix so the model
# can be used for the predictions further down.
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart,
                          n_estimators=num_trees,
                          random_state=seed).fit(X_train_tfidf, y)

# 10-fold cross-validated score, reported as a percentage.
results = model_selection.cross_val_score(model, X_train_tfidf, y, cv=kfold)
print(results.mean() * 100)

# Load the documents to classify.
url1 = (
    "C:\\Users\\sidharth.m\\Desktop\\Project_sid_35352\\outputkrithika.csv")
documents1 = pd.read_csv(url1)
array1 = documents1.values
#choose tweet column
#x1 = array1[0:, 2]
x2 = (documents1['tweet']).astype(str)

# Reuse the fitted vectoriser and TF-IDF transformer; they are not re-fitted.
X_test = count_vect.transform(x2)
#print(X_test.shape)
test = tfidf_transformer.transform(X_test)
#print(test.shape)
predicted = model.predict(test)
print(predicted)
return cc if __name__ == '__main__': X, y = make_classification(n_samples=300) x_test, y_test = X[0], y[0] X, y = X[1:], y[1:] X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=0.2) X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train \ , test_size=0.2) bag = BaggingClassifier(n_estimators=30) bag.fit(X_train, Y_train) Y_bag = bag.predict(X_test) desCV = DesCV(ensemble_clf=bag, X_val=X_val, y_val=Y_val) #y_pred = desCV.predict_pattern(x_test) #print y_pred #print y_test Y_pred = desCV.predict(X_test) print Y_pred print Y_test print accuracy_score(Y_pred, Y_test) print accuracy_score(Y_bag, Y_test)
# --
# Fit ASE

# Embed the graph's adjacency matrix, then stack the returned pieces
# column-wise into one feature matrix.
A = nx.to_numpy_array(G)
X_hat = AdjacencySpectralEmbed(algorithm='full').fit_transform(A)
X_hat = np.column_stack(X_hat)

# --
# Train classifiers

# scores[i, j] is the binary F1 of label i on random split j.
scores = np.zeros((n_class, args.n_iters))
for label_idx, label in enumerate(tqdm(ulabels)):
    # One-vs-rest target for the current label.
    is_label = (y == label)
    for iter_idx in range(args.n_iters):
        X_train, X_test, y_train, y_test = train_test_split(
            X_hat,
            is_label,
            train_size=args.p_train,
            test_size=1 - args.p_train,
        )

        model = BaggingClassifier(DecisionTreeClassifier())
        model = model.fit(X_train, y_train)
        y_hat = model.predict(X_test)

        fold_f1 = metrics.f1_score(y_test, y_hat, average='binary')
        scores[label_idx, iter_idx] = fold_f1

# Per-label mean and spread across the repeated splits.
print('f1.mean', scores.mean(axis=-1))
print('f1.std', scores.std(axis=-1))
###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check whether an ensemble of
# decision trees can alleviate the issue induced by the class imbalance.
# First, we will use a bagging classifier and its counterpart, which
# internally uses random under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(
    n_estimators=50,
    random_state=0,
    n_jobs=-1,
)
balanced_bagging = BalancedBaggingClassifier(
    n_estimators=50,
    random_state=0,
    n_jobs=-1,
)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample significantly increases the balanced
# accuracy and the geometric mean.

print('Bagging classifier performance:')
bc_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_bc)
bc_geometric_mean = geometric_mean_score(y_test, y_pred_bc)
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(bc_balanced_accuracy, bc_geometric_mean))

# Confusion matrix of the plain bagging model, drawn in the left-hand axis.
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(
    cm_bagging,
    classes=np.unique(satimage.target),
    ax=ax[0],
    title='Bagging',
)
# Absolute logistic-regression coefficients, labelled by feature name and
# drawn as a sorted horizontal bar chart.
coeff = np.abs(lr.coef_[0])
names = X_train.columns
coefficients = pd.Series(coeff, index=names)
sorted_coefficients = coefficients.sort_values()

plt.clf()
plt.tight_layout()
sorted_coefficients.plot(kind='barh', color='lightgreen')
plt.show()

#Bagging - decision tree
dt = DecisionTreeClassifier(
    max_depth=20,
    min_samples_leaf=0.01,
    random_state=1,
)
bc = BaggingClassifier(base_estimator=dt, n_estimators=300, n_jobs=-1)
bc.fit(X_train, y_train)

y_pred = bc.predict(X_test)
# Probability of the second class, used for the ROC AUC below.
y_pred_prob = bc.predict_proba(X_test)[:, 1]

# These three values are computed but not stored or printed.
accuracy_score(y_test, y_pred)
roc_auc_score(y_test, y_pred_prob)
f1_score(y_test, y_pred)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#Random forest
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=30,
    min_samples_leaf=0.0001,
    max_features=12,
    random_state=1,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
table = []

# Bagged gradient-boosting model evaluated on the held-out rows.
clf = BaggingClassifier(GradientBoostingClassifier())
clf.fit(train_[cols], train_["TripType"])

# Predict once and reuse the result for every report below; the fitted model
# returns the same predictions on every call.
test_pred = clf.predict(test_[cols])
preds = clf.predict_proba(test_[cols])

#print(confusion_matrix(test['class'], clf.predict(test[cols])))
print (pd.crosstab(test_['TripType'], test_pred,
                   rownames=["Actual"], colnames=["Predicted"]))
print (classification_report(test_['TripType'], test_pred))

score = accuracy_score(test_['TripType'], test_pred)
table.append([score])
print (table)

# Soft-voting ensemble; the bagged KNN carries seven times the weight of each
# of the other two members.
eclf = VotingClassifier(
    estimators=[('BaggingKNN', BaggingClassifier(KNeighborsClassifier(20))),
                ('RandomForest', RandomForestClassifier(10)),
                ('BaggingCART', BaggingClassifier(DecisionTreeClassifier()))],
    voting='soft', weights=[7, 1, 1])
# NOTE(review): this part uses `train`/`test` while the part above uses
# `train_`/`test_` -- confirm both pairs are the intended frames.
eclf.fit(train[cols], train["TripType"])

#use the classifier to predict
predicted = eclf.predict(test[cols])
class SearchEngine():
    """Text classifier: bag-of-words counts -> TF-IDF -> bagging ensemble.

    Call ``fit`` before ``predict``, ``predict_single`` or ``report``.
    """

    def __init__(self, label_names, X_train, y_train):
        """Store the training documents and labels; nothing is fitted yet.

        Args:
            label_names: Display names; ``predict_single`` indexes this with
                the classifier's class values.
            X_train: Training documents.
            y_train: Training labels, one per document.
        """
        # NOTE(review): the original comment calls this "the number of
        # classes", but len(y_train) is the number of training labels.
        # Within this class it is only referenced by the commented-out KNN
        # alternative in ``fit`` -- confirm the intent.
        self.k = len(y_train)
        self.label_names = label_names
        self.X_train, self.y_train = X_train, y_train

    def fit(self):
        """Fit the vectorizer, the TF-IDF transformer and the classifier."""
        # min_df: minimum number of documents that must contain a term.
        # max_df: drop terms that occur in more than this fraction of the
        #         documents (0.4 here).
        self.vectorizer = CountVectorizer(
            ngram_range=(1, 1),
            max_features=1500,
            min_df=5,
            max_df=0.4,
            stop_words=stopwords.words('english'))
        X_train_vect = self.vectorizer.fit_transform(self.X_train)

        self.tfidf_transformer = TfidfTransformer()
        X_train_trans = self.tfidf_transformer.fit_transform(X_train_vect)

        # Print TF and TFIDF
        #print(*list(X_train_vect.toarray()), sep = "\n")
        #print(*list(X_train_trans.toarray()), sep = "\n")

        # Uncomment the model to use
        #self.classifier = KNeighborsClassifier(n_neighbors=self.k)
        #self.classifier = RandomForestClassifier(n_estimators=500, max_features=0.25, criterion="entropy", class_weight="balanced")
        self.classifier = BaggingClassifier(n_estimators=25, max_features=0.25)
        #self.classifier = GradientBoostingClassifier(n_estimators =100, learning_rate =0.1, max_depth=6, min_samples_leaf =1, max_features=1.0)
        #self.classifier = MultinomialNB()

        self.classifier.fit(X_train_trans, self.y_train)

    def predict(self, X_test):
        """Return the predicted label for every document in ``X_test``."""
        X_test_vect = self.vectorizer.transform(X_test)
        X_test_trans = self.tfidf_transformer.transform(X_test_vect)
        y_pred = self.classifier.predict(X_test_trans)
        return y_pred

    def predict_single(self, doc):
        """Return ``(label_name, probability)`` pairs, most probable first."""
        X_test_vect = self.vectorizer.transform([doc])
        X_test_trans = self.tfidf_transformer.transform(X_test_vect)
        y_pred = zip(self.classifier.classes_,
                     self.classifier.predict_proba(X_test_trans)[0])
        # The class value itself is used as the index into label_names.
        y_pred = sorted([(self.label_names[ind], score)
                         for ind, score in y_pred],
                        key=lambda x: -x[1])
        return y_pred

    @staticmethod
    def _count_matches(y_test, y_pred):
        """Return ``(total, same)``: label count and positional matches."""
        # Pairing by position avoids label-based ``y_test[i]`` lookups, which
        # break for a pandas Series with a non-default index.
        same = sum(1 for truth, guess in zip(y_test, y_pred) if truth == guess)
        return len(y_test), same

    def report(self, X_test, y_test, y_pred):
        """Print the classification report and the total / correct counts.

        ``X_test`` is accepted for interface compatibility but not used.
        """
        print(
            classification_report(y_test,
                                  y_pred,
                                  target_names=self.label_names,
                                  digits=4))
        total, same = self._count_matches(y_test, y_pred)
        print(total, same)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix

# NOTE(review): `l` is never filled and the confusion matrix computed in the
# loop is discarded -- presumably the matrices were meant to be collected here;
# confirm against the rest of the script.
l = list()
for i in range(0, 10):
    # New random 80/20 split on every iteration (no random_state is set).
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.2)
    # The default base estimator is used. The explicit `base_estimator=None`
    # was dropped: it was the default anyway, and the keyword was renamed to
    # `estimator` in newer scikit-learn, so passing it fails there.
    classifier = BaggingClassifier()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    confusion_matrix(y_test, y_pred)
voting_clf.fit(X, y)

# Fit each model on the training split and print its test accuracy.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

# Bagging of 500 decision trees, bootstrap samples of full size.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=1,
)
bag_clf.fit(X_train, y_train)

y_pred = bag_clf.predict(X_test)
print(y_pred)
y_pred_proba = bag_clf.predict_proba(X_test)
print(y_pred_proba)
print(accuracy_score(y_test, y_pred))

# Same ensemble again, this time also computing the out-of-bag score.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=1,
    oob_score=True,
)
bag_clf.fit(X_train, y_train)
print(bag_clf.oob_score_)

y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
# Compare a single decision tree with a bagging ensemble of such trees on the
# wine dataset, then draw both decision regions over the first two features.
wine = datasets.load_wine()
X = wine.data
y = wine.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

tree = DecisionTreeClassifier(criterion='entropy', max_depth=None)
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn, and the positional
# form works with both.
bag = BaggingClassifier(tree,
                        n_estimators=500,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=-1,
                        random_state=1)

# Single tree: train/test accuracy.
tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
tree_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Decision tree train/test accuracy {0:.3f}/{1:.3f}".format(tree_train, tree_test))

# Bagging ensemble: train/test accuracy.
bag.fit(X_train, y_train)
y_train_pred_bag = bag.predict(X_train)
y_test_pred_bag = bag.predict(X_test)
bag_train = accuracy_score(y_true=y_train, y_pred=y_train_pred_bag)
bag_test = accuracy_score(y_true=y_test, y_pred=y_test_pred_bag)
print("Bagging train/test accuracy {0:.3f}/{1:.3f}".format(bag_train, bag_test))

# Grid over the first two features, padded by 1 on each side, step 0.1.
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(nrows=1, ncols=2, sharex='col', sharey='row', figsize=(8, 3))
for idx, clf, tt in zip([0, 1], [tree, bag], ['Decision Tree', 'Bagging']):
    # Both models are refitted on the first two features only so that their
    # predictions can be evaluated on the 2-D grid.
    clf.fit(X_train[:, 0:2], y_train)
    z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    axarr[idx].contourf(xx, yy, z, alpha=0.3)
# TF-IDF weighting of the training count matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#print(X_train_tfidf.shape)

seed = 7
# shuffle=True is required for random_state to take effect; newer scikit-learn
# raises ValueError when random_state is given with shuffle left at False.
# NOTE(review): the folds are now shuffled, so scores will differ from an
# unshuffled run.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
# The base estimator is passed positionally: the `base_estimator` keyword was
# renamed to `estimator` in newer scikit-learn; positional works with both.
model = BaggingClassifier(cart,
                          n_estimators=num_trees,
                          random_state=seed).fit(X_train_tfidf, y)

# Mean 10-fold cross-validation score, as a percentage.
results = model_selection.cross_val_score(model, X_train_tfidf, y, cv=kfold)
print(results.mean() * 100)

# Accuracy on the training data itself, as a percentage.
predicted = model.predict(X_train_tfidf)
acc = accuracy_score(y, predicted) * 100
print(acc)

# Load the test CSV: column 1 is used as the text, column 0 as the label.
url1 = ("C:\\Users\\sidharth.m\\Desktop\\Project_sid_35352\\Test.csv")
documents1 = pd.read_csv(url1)
array1 = documents1.values
#choose tweet column
x1 = array1[0:, 1]
#x2= (documents1['tweet']).astype(str)
y1 = array1[0:, 0]
X_test = count_vect.transform(x1)
#print(X_test.shape)
def _fit_and_report(classifier, dataset, label):
    """Fit ``classifier`` on a dataset's training split and report on its test split.

    ``dataset`` is the 4-item result of one of the ``tts_*`` helpers, unpacked
    as (data_train, data_test, target_train, target_test). The predictions for
    the test data are passed to ``display_similarity`` together with the test
    targets and ``label``. Returns the fitted classifier.
    """
    data_train, data_test, target_train, target_test = dataset
    classifier.fit(data_train, target_train)
    predictions = classifier.predict(data_test)
    display_similarity(predictions, target_test, label)
    return classifier


def main():
    """Train several classifiers on the chess, iris and letter datasets.

    For each dataset three "regular" learners (MLP, decision tree, KNN) are
    tried first, then bagging, AdaBoost and a random forest. Every result is
    reported through ``display_similarity``; the AdaBoost runs additionally
    print the estimator parameters.
    """
    # preprocess, then train, test, and split
    chess = tuple(tts_chess_numeric())
    iris = tuple(tts_iris_numeric())
    letter = tuple(tts_letter_numeric())

    # For each dataset
    ## Try at least 3 different "regular" learning algorithms and note the results.
    ### DS1 - chess
    print("")
    _fit_and_report(
        MLPClassifier(solver='adam', alpha=1e-5,
                      hidden_layer_sizes=(40, 30), random_state=1),
        chess, "Chess - Neural Network")
    _fit_and_report(DecisionTreeClassifier(random_state=0),
                    chess, "Chess - Decision Tree")
    _fit_and_report(KNeighborsClassifier(n_neighbors=7),
                    chess, "Chess - KNN")

    ### DS2 - iris
    print("")
    _fit_and_report(
        MLPClassifier(solver='adam', alpha=1e-5,
                      hidden_layer_sizes=(10, 7), random_state=1),
        iris, "Iris - Neural Network")
    # A disabled experiment used to live here: GridSearchCV(MLPClassifier(),
    # cv=3, scoring='accuracy') over activation (identity/logistic/tanh/relu),
    # solver (lbfgs/sgd/adam) and two-layer hidden_layer_sizes with the first
    # layer in 9..11 and the second in 1..12 (excluding 9), printing
    # best_params_.
    _fit_and_report(DecisionTreeClassifier(), iris, "Iris - Decision Tree")
    _fit_and_report(KNeighborsClassifier(n_neighbors=3), iris, "Iris - KNN")

    ### DS3
    print("")
    _fit_and_report(
        MLPClassifier(solver='adam', alpha=1e-5,
                      hidden_layer_sizes=(40, 30), random_state=1),
        letter, "Letter - Neural Network")
    _fit_and_report(DecisionTreeClassifier(), letter, "Letter - Decision Tree")
    _fit_and_report(KNeighborsClassifier(n_neighbors=3), letter, "Letter - KNN")

    print("")
    ## Use Bagging and note the results. (Play around with a few different options)
    _fit_and_report(BaggingClassifier(bootstrap=True, n_estimators=20),
                    chess, "BAGGING - Chess")
    _fit_and_report(BaggingClassifier(bootstrap=True),
                    iris, "BAGGING - Iris")
    _fit_and_report(BaggingClassifier(bootstrap=True, n_estimators=20),
                    letter, "BAGGING - Letter")

    print("")
    ## Use AdaBoost and note the results. (Play around with a few different options)
    adaboost_runs = (
        (AdaBoostClassifier(), chess, "ADABOOST - Chess"),
        (AdaBoostClassifier(learning_rate=0.3), iris, "ADABOOST - Iris"),
        (AdaBoostClassifier(n_estimators=200), letter, "ADABOOST - Letter"),
    )
    for classifier, dataset, label in adaboost_runs:
        fitted = _fit_and_report(classifier, dataset, label)
        params = fitted.get_params()
        print(params)

    print("")
    ## Use a random forest and note the results. (Play around with a few different options)
    _fit_and_report(
        RandomForestClassifier(criterion='entropy', bootstrap=False,
                               n_estimators=30),
        chess, "RANDOM FOREST - Chess")
    _fit_and_report(RandomForestClassifier(), iris, "RANDOM FOREST - Iris")
    _fit_and_report(RandomForestClassifier(bootstrap=False),
                    letter, "RANDOM FOREST - Letter")
# Accuracy of both models on the training data ("학습 데이터") and on the
# test data ("테스트 데이터"); "정확도" means accuracy.
print("model_1 정확도(학습 데이터) :", model_1.score(X_train, y_train))
print("model_2 정확도(학습 데이터) :", model_2.score(X_train, y_train))
print("model_1 정확도(테스트 데이터) :", model_1.score(X_test, y_test))
print("model_2 정확도(테스트 데이터) :", model_2.score(X_test, y_test))

# Confusion matrix and classification report for model_1 on the test data.
predicted_1 = model_1.predict(X_test)
print('Confusion Matrix - 1:')
print(confusion_matrix(y_test, predicted_1))
print('Classification Report - 1 :')
print(classification_report(y_test, predicted_1))

# Same for model_2. These headings used to say "1", which made the two
# reports indistinguishable in the output.
predicted_2 = model_2.predict(X_test)
print('Confusion Matrix - 2:')
print(confusion_matrix(y_test, predicted_2))
print('Classification Report - 2 :')
print(classification_report(y_test, predicted_2))
def test_parallel():
    """Check parallel computations.

    An ensemble fitted with ``n_jobs`` in (-1, 3) must give the same
    predictions when queried with ``n_jobs=1`` and ``n_jobs=2``, and the same
    predictions as an ensemble fitted with ``n_jobs=1`` and the same seed.
    Covers ``predict_proba`` and ``decision_function`` for the classifier and
    ``predict`` for the regressor.
    """
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        # Use the loop variable: this was hard-coded to n_jobs=3, so the
        # n_jobs=-1 case was never exercised for the regressor.
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=n_jobs,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
else: estimator = SVC() clf = SVC() clf.fit(X_train, y_train) y_pred_tree = clf.predict(X_test) bag_clf = BaggingClassifier(estimator, n_estimators=n_estimators, max_samples=max_samples, bootstrap=bootstrap_samples, max_features=max_features, bootstrap_features=bootstrap_features, random_state=42) bag_clf.fit(X_train, y_train) y_pred = bag_clf.predict(X_test) orig.empty() fig, ax = plt.subplots() fig1, ax1 = plt.subplots() XX, YY, input_array = draw_meshgrid() labels = clf.predict(input_array) labels1 = bag_clf.predict(input_array) col1, col2 = st.beta_columns(2) with col1: st.header(estimators) ax.scatter(X.T[0], X.T[1], c=y, cmap='rainbow') ax.contourf(XX,
from sklearn.ensemble import BaggingClassifier

tree = DecisionTreeClassifier(criterion='entropy',
                              random_state=1,
                              max_depth=None)
# The base estimator is passed positionally: the `base_estimator` keyword was
# renamed to `estimator` in newer scikit-learn; positional works with both.
bag = BaggingClassifier(tree,
                        n_estimators=50,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)

##
from sklearn.metrics import accuracy_score

# Single decision tree: accuracy on the training and the test split.
tree = tree.fit(X_train, y_train)
t_train_pred = tree.predict(X_train)
t_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, t_train_pred)
tree_test = accuracy_score(y_test, t_test_pred)
print('Decision tree train/test accuracy %.3f/%.3f' % (tree_train, tree_test))

##
# Bagging ensemble: accuracy on the training and the test split.
bag = bag.fit(X_train, y_train)
b_train_pred = bag.predict(X_train)
b_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, b_train_pred)
bag_test = accuracy_score(y_test, b_test_pred)
print('Bag train/test accuracy %.3f/%.3f' % (bag_train, bag_test))
from sklearn.svm import SVC
clf5 = SVC(kernel='rbf')

import xgboost as xgb
model = xgb.XGBClassifier(random_state=1, learning_rate=0.01)

from sklearn.ensemble import BaggingClassifier
# Bagging around `clf1` (defined earlier in the script). The estimator is
# passed positionally: the `base_estimator` keyword was renamed to `estimator`
# in newer scikit-learn; positional works with both.
clf = BaggingClassifier(clf1, n_estimators=30, random_state=0)
#x1,y1=SMOTE().fit_resample(x1, y1)

print("Starting... ")
clf.fit(x1, y1)
o = clf.predict(x2)
print("End... ")
pred_aud = clf.predict_proba(x2)

cou = 0
tol = 0
# One zeroed slot per column of predict_proba.
pos = [0] * len(pred_aud[0])
pos1 = [0] * len(pred_aud[0])
pos2 = [0] * len(pred_aud[0])
# NOTE(review): the loop body is cut off in this view; only the two statements
# below are visible, and both are assumed to be inside the loop.
for i in tqdm(range(len(o))):
    tol += 1.0
    # Count the samples per true label; y2[i] is used directly as a list index.
    pos1[y2[i]] += 1
def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    """Train up to ``numfiers`` classifiers and post-process their predictions.

    Args:
        numfiers: how many classifiers, taken from a fixed ordered list, to
            train. Values above the list length are capped at the list length
            (the surplus columns used to be left uninitialised).
        xtrain, ytrain: the training set.
        xtltrain: indices of the recordings behind ``xtrain``/``ytrain``
            (unused here; NOTE: we might not need it).
        xtltest: indices of the recordings behind ``xtest``.
        xtest: the testing set; NaN/Inf values are removed with
            ``removeNanAndInf`` first.
        ytarget: optional targets for ``xtest``.
        testing: when truthy, the grid-aware helpers
            (``temppredWindowVecModeFinder`` / ``temppredVec2Str``) are used
            with ``grids``; otherwise the plain ones.
        grids: passed through to the grid-aware helpers.

    Returns:
        ``(predictionStringMat, targetStringMat, finalPredMat)``: per
        classifier the prediction string and the target string, plus one flat
        list of the integer mode predictions of all classifiers.
    """
    # Remember whether targets were supplied: after np.ravel below, ytarget is
    # always an ndarray, so it can no longer be compared with None.
    has_target = ytarget is not None

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    def make_voting():
        # votingClassifiers combine completely different machine learning
        # classifiers and use a majority vote
        return VotingClassifier(estimators=[
            ('svc', SVC()),
            ('rfc', RFC(bootstrap=False)),
            ('etc', ETC()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('qda', quadda()),
        ])

    # Classifier factories in the order they are tried. They are callables so
    # that only the classifiers actually used get constructed.
    factories = [
        make_voting,
        lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        lambda: ETC(),
        lambda: BaggingClassifier(ETC()),
        lambda: SVC(),
        # Quadratic discriminant analysis - quadratic decision boundary
        lambda: quadda(),
        lambda: DTC(),
        # k nearest neighbors with the default k
        lambda: neighbors.KNeighborsClassifier(),
        # linear discriminant analysis - linear decision boundary
        lambda: linda(),
        lambda: RFC(),
        lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        lambda: RFC(bootstrap=False),
        lambda: GBC(),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
        lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
        lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        lambda: NearestCentroid(),
        lambda: ABC(),
    ]

    # if xtest is NxM, predictionMat is N x (number of classifiers used); each
    # column is one classifier's prediction vector
    n_used = min(numfiers, len(factories))
    predictionMat = np.empty((xtest.shape[0], n_used))
    for column, make_classifier in enumerate(factories[:n_used]):
        classifier = make_classifier()
        classifier.fit(xtrain, ytrain)
        predictionMat[:, column] = classifier.predict(xtest)

    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)

        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat.extend(int(value) for value in modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    # The old check `ytarget != None` ran on an ndarray, which is an ambiguous
    # truth value on current NumPy. `targets1` is empty when no classifier ran.
    if not testing and has_target and targets1:
        # The result is not used yet: the prints that consumed it are disabled.
        confusion_matrix(targets1[0], predictions1[0])

    return predictionStringMat, targetStringMat, finalPredMat
# -*- coding: utf-8 -*- from sklearn.ensemble import BaggingClassifier from sklearn.neighbors import KNeighborsClassifier clf_bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5) clf_bagging.predict()