class Ensemble:
    """Majority-vote ensemble of four classifiers (random forest, LDA,
    decision tree, AdaBoost).

    Fitting and prediction happen immediately in __init__; the final
    per-sample votes are stored in ``self.pred``.
    """

    def __init__(self, data):
        # Hyper-parameters kept from the original tuning.
        self.rf = RandomForestClassifier(n_estimators=80, n_jobs=-1,
                                         min_samples_split=45,
                                         criterion='entropy')
        self.lda = LDA()
        self.dec = DecisionTreeClassifier(criterion='entropy')
        self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)
        self.make_prediction(data)

    def make_prediction(self, data):
        '''
        Make an ensemble prediction
        '''
        # Fit every member on the same training split.
        for clf in (self.rf, self.lda, self.dec, self.ada):
            clf.fit(data.features_train, data.labels_train)

        rf_pred = self.rf.predict(data.features_test)
        lda_pred = self.lda.predict(data.features_test)
        dec_pred = self.dec.predict(data.features_test)
        ada_pred = self.ada.predict(data.features_test)

        # Majority vote per sample.  Counter.most_common breaks ties by
        # first occurrence in the votes tuple, which matches the original
        # stable sort-by-count behaviour (rf, lda, dec, ada order) while
        # avoiding the O(k^2) list.count-inside-sort idiom.
        self.pred = []
        for votes in zip(rf_pred, lda_pred, dec_pred, ada_pred):
            self.pred.append(Counter(votes).most_common(1)[0][0])
def plot_adaboost():
    """Visualise four hand-driven AdaBoost iterations on the two-moons data.

    Drives the private ``_boost`` loop manually so the decision surface and
    the per-sample weights can be drawn after each of the first 4 rounds.
    Shows a 1x4 matplotlib figure; returns nothing.
    """
    X, y = make_moons(noise=0.3, random_state=0)

    # Create and fit an AdaBoosted decision tree
    est = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             algorithm="SAMME.R",
                             n_estimators=200)

    # Uniform initial sample weights.  Builtin float: np.float was
    # deprecated in NumPy 1.20 and removed in 1.24.
    sample_weight = np.empty(X.shape[0], dtype=float)
    sample_weight[:] = 1. / X.shape[0]

    est._validate_estimator()
    est.estimators_ = []
    # Only 4 boosting rounds are actually run below (one per subplot).
    est.estimator_weights_ = np.zeros(4, dtype=float)
    est.estimator_errors_ = np.ones(4, dtype=float)

    plot_step = 0.02

    # Plot the decision boundaries
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    fig, axes = plt.subplots(1, 4, figsize=(14, 4), sharey=True)

    colors = ['#d7191c', '#fdae61', '#ffffbf', '#abd9e9', '#2c7bb6']
    # Rescale 0-254 RGB triples to 0-1 floats for matplotlib.
    c = lambda a, b, c: map(lambda x: x / 254.0, [a, b, c])
    colors = [c(215, 25, 28),
              c(253, 174, 97),
              c(255, 255, 191),
              c(171, 217, 233),
              c(44, 123, 182), ]

    for i, ax in enumerate(axes):
        # Run one boosting iteration by hand and renormalise the weights.
        sample_weight, estimator_weight, estimator_error = \
            est._boost(i, X, y, sample_weight)
        est.estimator_weights_[i] = estimator_weight
        est.estimator_errors_[i] = estimator_error
        sample_weight /= np.sum(sample_weight)

        Z = est.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z,
                    cmap=matplotlib.colors.ListedColormap([colors[1], colors[-2]]),
                    alpha=1.0)
        ax.axis("tight")

        # Plot the training points; marker size reflects the current weight.
        ax.scatter(X[:, 0], X[:, 1],
                   c=np.array([colors[0], colors[-1]])[y],
                   s=20 + (200 * sample_weight) ** 2,
                   cmap=plt.cm.Paired)
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xlabel('$x_0$')
        if i == 0:
            ax.set_ylabel('$x_1$')

    plt.tight_layout()
    plt.show()
def cvalidate():
    """Cross-validate an AdaBoost-over-KNN classifier on train.csv.

    Reads the CSV, imputes missing values, PCA-projects the features,
    trains on a 70/30 split and prints the held-out accuracy.
    """
    from sklearn import cross_validation
    # Skip the CSV header row.
    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:]
    # Columns 1..7 are features; column 8 is the label.
    X = np.array([x[1:8] for x in trainset])
    y = np.array([x[8] for x in trainset])
    #print X,y
    import math
    # Replace NaNs with the constant 26.6 — presumably a mean value for
    # this dataset; TODO confirm where this number comes from.
    for i, x in enumerate(X):
        for j, xx in enumerate(x):
            if(math.isnan(xx)):
                X[i][j] = 26.6
    #print X[0:3]
    #print y[0:3]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)
    # Project both splits into the same PCA space (helper defined elsewhere
    # in this file).
    X_train, X_test = decomposition_pca(X_train, X_test)

    bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'),
                             algorithm="SAMME", n_estimators = 200)
    bdt.fit(X_train, y_train)
    print bdt.score(X_test, y_test)
def Adaboost(TrainData,TestData):
    """Train AdaBoost (decision-tree base, 30 estimators) on increasing
    train-set fractions and print accuracy/precision/recall/log-loss on the
    corresponding validation split."""
    features=['Time','Season','Hour','Minute','District']
    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(),n_estimators=30)
    # Fractions of TrainData used for training in each round.
    size=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
    for i in range(0,len(size)):
        train,validation= train_test_split(TrainData, train_size=size[i])
        # Resample until both splits contain the same number of distinct
        # classes, so the per-class metrics below line up.
        while len(set(train['Category'])) != len(set(validation['Category'])):
            train,validation= train_test_split(TrainData, train_size=size[i])
        clf = clf.fit(train[features], train['Category'])
        """stop = timeit.default_timer()
        print "Runnin time adaboost is ", stop-start"""
        predicted=np.array(clf.predict_proba(validation[features]))
        model=clf.predict(train[features])
        model1=clf.predict(validation[features])
        #scores = cross_val_score(clf, validation[features], validation['Category'])
        #print "Scores mean is",scores.mean()
        #accuracy
        print "Training accuracy is", accuracy_score(train['Category'].values.tolist(),model)
        print "Validation accuracy is",accuracy_score(validation['Category'].values.tolist(),model1)
        print "Precision is ",precision_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Recall is ",recall_score(validation['Category'].values.tolist(),model1,average='macro')
        print "Log loss is", log_loss(validation['Category'].values.tolist(),predicted,eps=1e-15, normalize=True, sample_weight=None)
        #writing to file
        # NOTE(review): the string below is unterminated here — the rest of
        # this function appears to be truncated in this chunk.
        """Category_new=[]
def runAdaBoost(arr):#depth, n_est, lrn_rate=1.0):
    """Objective function for a scipy optimiser: train AdaBoost with the
    hyper-parameters packed in *arr* and return the negated AMS score
    (negated because the optimiser minimises).

    arr[0]*100 -> tree depth, arr[1]*100 -> n_estimators, arr[2] -> learning
    rate.  Reads/writes globals file_dir, nEvents, solutionFile, counter;
    writes a solution file and appends the score to ``logfile``.
    """
    # removing filename for the scipy optimise thing
    '''filename,'''
    #ada = AdaBoostClassifier(n_estimators=100)
    global file_dir, nEvents, solutionFile, counter
    print 'iteration number ' + str(counter)
    counter+=1
    # Decode the optimiser's unit-scaled parameters.
    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    # Penalise invalid (non-positive) parameters with a large objective.
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        return 100
    fname = 'ada_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate)
    filename = fname
    # NOTE(review): lrn_rate is encoded in the filename but never passed to
    # AdaBoostClassifier — confirm whether learning_rate should be set here.
    ada = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=depth),
                             algorithm="SAMME",
                             n_estimators=n_est)#,n_jobs=4)
    print "AdaBoost training"
    ada.fit(sigtr[train_input].values,sigtr['Label'].values)
    print "AdaBoost testing"
    ada_pred = ada.predict(sigtest[train_input].values)
    solnFile(filename,ada_pred,sigtest['EventId'].values)#
    print "AdaBoost finished"
    # added for teh scipy optimise thing
    ams_score = ams.AMS_metric(solutionFile, file_dir+fname+'.out', nEvents)
    print ams_score
    logfile.write(fname + ': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score) # since we are minimising
def main(): trainset = np.genfromtxt(open('train.csv','r'), delimiter=',')[1:] X = np.array([x[1:8] for x in trainset]) y = np.array([x[8] for x in trainset]) #print X,y import math for i, x in enumerate(X): for j, xx in enumerate(x): if(math.isnan(xx)): X[i][j] = 26.6 testset = np.genfromtxt(open('test.csv','r'), delimiter = ',')[1:] test = np.array([x[1:8] for x in testset]) for i, x in enumerate(test): for j, xx in enumerate(x): if(math.isnan(xx)): test[i][j] = 26.6 X, test = decomposition_pca(X, test) bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=20, algorithm = 'auto'), algorithm="SAMME", n_estimators = 200) bdt.fit(X, y) print 'PassengerId,Survived' for i, t in enumerate(test): print '%d,%d' % (i + 892, int(bdt.predict(t)[0]))
def ada_prediction(features_train, labels_train, features_test, ids):
    """Fit AdaBoost over a RandomForest base estimator on the full training
    data and write positive-class probabilities to a submission CSV.

    Writes ``data/canivel_ada_forest.csv`` with header ID,TARGET.
    """
    # Held-out split retained for local validation experiments (the fit
    # below deliberately uses the full training data).
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        features_train, labels_train, random_state=1301,
        stratify=labels_train, test_size=0.3)

    clf = AdaBoostClassifier(RandomForestClassifier(bootstrap=True,
                                                    criterion='entropy',
                                                    max_depth=None,
                                                    max_features=2,
                                                    max_leaf_nodes=16,
                                                    min_samples_split=10,
                                                    n_estimators=1000,
                                                    n_jobs=-1,
                                                    oob_score=False),
                             algorithm="SAMME",
                             n_estimators=200)

    clf = clf.fit(features_train, labels_train)
    # Column 1 = probability of the positive class.
    pred = clf.predict_proba(features_test)[:, 1]

    # Context manager guarantees the file is closed even if a write fails.
    with open("data/canivel_ada_forest.csv", "wb") as predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["ID", "TARGET"])
        predictions_file_object.writerows(zip(ids, pred))
def train_adaboost(features, labels, learning_rate, n_lab, n_runs, n_estim, n_samples):
    """One-vs-rest AdaBoost training.

    For each of the first *n_lab* unique labels, fit *n_runs* boosted
    ensembles on binary subsets drawn by ``get_binary_sets``.  Returns
    (allLearners, used_labels) where allLearners[i] is the list of fitted
    ensembles for used_labels[i].
    """
    uniqLabels = np.unique(labels)
    print 'Taking ', str(n_lab), ' labels'
    # Cap the number of target labels (for speed).
    uniqLabels = uniqLabels[:n_lab]
    used_labels = uniqLabels
    pbar = start_progressbar(len(uniqLabels), 'training adaboost for %i labels' %len(uniqLabels))

    allLearners = []
    for yy ,targetLab in enumerate(uniqLabels):
        runs=[]
        for rrr in xrange(n_runs):
            #import ipdb;ipdb.set_trace()
            # Binary (target-vs-rest) training set — presumably sampled per
            # run; see get_binary_sets for the exact semantics.
            feats,labs = get_binary_sets(features, labels, targetLab, n_samples)
            #print 'fitting stump'
            #import ipdb;ipdb.set_trace()
            # Pre-fitted shallow tree used as the boosting base estimator.
            baseClf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, min_samples_split=10)
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier( base_estimator=baseClf, learning_rate=learning_rate, n_estimators=n_estim, algorithm="SAMME.R")
            #import ipdb;ipdb.set_trace()
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)
        update_progressbar(pbar, yy)
    end_progressbar(pbar)

    return allLearners, used_labels
def adaBoost(n,x,t,x_test,t_test):
    """Fit an n-estimator AdaBoost classifier on (x, t) and return the
    misclassification rate on (x_test, t_test).

    Assumes a binary problem (2x2 confusion matrix).
    """
    clf = AdaBoostClassifier(n_estimators = n)
    clf.fit(x, t)
    predictions = clf.predict(x_test)
    X = confusion_matrix(t_test,predictions)
    # float() guards against Python 2 integer division: the confusion
    # matrix holds ints, so the original expression truncated the rate
    # to 0 or 1.
    classificationRate = float(X[1,1]+X[0,0]) / sum(sum(X))
    return(1-classificationRate)
def test_oneclass_adaboost_proba():
    """predict_proba must return all-ones when trained on a single class.

    Regression test for scikit-learn issue #7501:
    https://github.com/scikit-learn/scikit-learn/issues/7501
    """
    single_class_labels = np.ones(len(X))
    model = AdaBoostClassifier().fit(X, single_class_labels)
    expected = np.ones((len(X), 1))
    assert_array_almost_equal(model.predict_proba(X), expected)
def train_classifiers(X_data, y_data):
    """Fit seven classifiers on the same data and return them in a fixed
    order: (LinearSVM, MultinomialNB, RandomForest, ExtraTrees, AdaBoost,
    rbf-SVM, GradientBoosting).

    The numbers in the section comments are accuracies recorded by the
    author; they are not recomputed here.
    """
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel = 'linear')
    clf_LSVM.fit(X_data, y_data)

    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)

    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)

    ############ Extra Tree: 0.915 ##################
    # NOTE(review): min_samples_split=1 is rejected by modern scikit-learn
    # (must be >= 2) — confirm which sklearn version this targets.
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)

    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)

    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)

    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)

    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC
def createAdaBoostClassifier(trainingVectors, targetValues):
    """Fit and return an AdaBoost classifier with default hyper-parameters.

    :param trainingVectors: training feature matrix
    :param targetValues: training labels, also used (scaled) as weights
    :return: the fitted AdaBoostClassifier
    """
    clf = AdaBoostClassifier(base_estimator=None, n_estimators=50,
                             learning_rate=1.0, algorithm='SAMME.R',
                             random_state=None)
    # Passed by keyword for clarity (the third positional argument of
    # fit() is sample_weight).
    # NOTE(review): weighting samples by targetValues*10000 gives weight 0
    # to class-0 samples when targetValues is a 0/1 array — confirm this
    # is intentional.
    clf.fit(trainingVectors, targetValues, sample_weight=targetValues*10000)
    return(clf)
class DomainTypeClassifier(object):
    """Predict a domain type for a sequence using AdaBoost over depth-2
    decision trees, aggregating per-position predictions by majority vote.

    NOTE(review): the exact semantics of ``encode``/``create_region`` are
    defined elsewhere in this project.
    """
    def __init__(self, radius, window_mode=False):
        # radius: half-width of the context around each position.
        # window_mode: if True, slide a full window of size 2*radius+1
        # instead of building a region around every position.
        self.classifier = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=2),
            n_estimators=20,
            learning_rate=1,
            algorithm="SAMME")
        # svm.SVC(kernel='rbf')
        self.radius = radius
        self.window_mode = window_mode

    def train(self, dataset):
        """Fit the boosted classifier on (input, output) pairs produced by
        ``dataset.getData``."""
        k = self.radius if not self.window_mode else 2 * self.radius + 1
        rin, rout = dataset.getData(k, self.window_mode)
        print("fitting", len(rin))
        self.classifier.fit(np.asarray(rin, float), np.asarray(rout, float))

    def predict(self, ns):
        """Return a single predicted type for sequence *ns*: the most common
        class over all per-position (or per-window) predictions."""
        k = self.radius if not self.window_mode else 2 * self.radius + 1
        to_predict = []
        for i in range(len(ns)):
            if not self.window_mode:
                to_predict.append(encode(create_region(ns, i, k)))
            else:
                # Stop once the remaining tail is shorter than the window.
                if i > len(ns) - k:
                    break
                to_predict.append(encode(ns[i:i+k]))
        return int(Counter(self.classifier.predict(
            np.asarray(to_predict, float))).most_common(1)[0][0])
def cvalidate():
    """Grid-search an RBF SVM, boost the best estimator with AdaBoost
    (SAMME) and print the held-out accuracy on a 70/30 split."""
    targetset = np.genfromtxt(open('trainLabels.csv','r'), dtype='f16')
    y = [x for x in targetset]
    trainset = np.genfromtxt(open('train.csv','r'), delimiter=',', dtype='f16')
    X = np.array([x for x in trainset])

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.3, random_state = 0)
    # Project both splits into the same PCA space (helper defined elsewhere
    # in this file).
    X_train, X_test = decomposition_pca(X_train, X_test)

    #SVM
    # Log-spaced hyper-parameter grids.
    c_range = 10.0 ** np.arange(6.5,7.5,.25)
    gamma_range = 10.0 ** np.arange(-2.5,0.5,.25)
    parameters = {'kernel':['rbf'], 'C':c_range, 'gamma':gamma_range}
    svr = SVC()
    clf = grid_search.GridSearchCV(svr, parameters)
    clf.fit(X_train, y_train)

    # Discrete SAMME boosting over the tuned SVM (SAMME.R would need
    # predict_proba, which this SVC is not configured for).
    bdt = AdaBoostClassifier(base_estimator = clf.best_estimator_, algorithm="SAMME", n_estimators=100)
    #bdt = AdaBoostClassifier(base_estimator = KNeighborsClassifier(n_neighbors=10))
    bdt.fit(X_train, y_train)
    print bdt.score(X_test, y_test)
def ANGEL_training(cds_filename, utr_filename, output_pickle, num_workers=3):
    """Train the coding-vs-UTR AdaBoost classifier and pickle it.

    Positive examples: frame 0 of the CDS sequences; negative examples:
    all three frames of the UTR sequences.  The fitted classifier and the
    background feature object are pickled to *output_pickle*.
    Returns (data, target, bdt).
    """
    coding = [ r for r in SeqIO.parse(open(cds_filename), 'fasta') ]
    utr = [ r for r in SeqIO.parse(open(utr_filename), 'fasta') ]
    o_all = c_ORFscores.CDSWindowFeat()
    # Accumulate background statistics from both sequence sets.
    add_to_background(o_all, coding)
    add_to_background(o_all, utr)

    data_pos = get_data_parallel(o_all, coding, [0], num_workers)
    data_neg = get_data_parallel(o_all, utr, [0, 1, 2], num_workers)
    # Labels: 0 = UTR (negative), 1 = coding (positive).
    data = data_neg + data_pos
    target = [0]*len(data_neg) + [1]*len(data_pos)
    data = np.array(data)
    print >> sys.stderr, "data prep done, running classifier...."

    bdt = AdaBoostClassifier(n_estimators=50)
    bdt.fit(data, target)

    print >> sys.stderr, "classifier trained. putting pickle to", output_pickle
    with open(output_pickle, 'wb') as f:
        dump({'bdt':bdt, 'o_all':o_all}, f)

    return data, target, bdt
def some(X, Y, X_test, Y_test): ada = AdaBoostClassifier() print "Train Model ---" t1 = time() ada.fit(X, Y) t2 = time() print "Model Trained ----------", t2 - t1 test_errors = [] cur = 1 Y_test2 = [] for k in Y_test: Y_test2.append(k[0]) print "Testing: " print Y_test2 pred = ada.predict(X_test) print pred accu = 1. - accuracy_score(y_true= Y_test2, y_pred= pred) print accu print "STAGED _____________" for test_predict in ( ada.staged_predict(X_test)): test_errors.append( 1. - accuracy_score(test_predict, Y_test2)) print "errorss : " print test_errors
def ada(xtrain, ytrain, train_weight, tests, test_weight):
    """Fit AdaBoost and evaluate it on grouped test vectors, returning the
    total weighted misclassification cost.

    Relies on module-level ``pen`` (penalty matrix indexed by
    [true class][predicted class]) and ``Segments``.  train_weight is
    accepted for interface symmetry but not used here.
    """
    #Initiate the training model
    clf = AdaBoostClassifier()
    mistakes = 0
    cost = 0
    #Fit the model
    clf.fit(xtrain, ytrain)
    vector_count = 0
    #Iterate over the tests
    for i in range(len(tests)):
        #Get the number of elements in each test
        vector_count += len(tests[i])
        test_count = 0
        #Iterate over each feature vector in the tests
        for vector in tests[i]:
            # Wrap the single sample in a list: scikit-learn expects a 2-D
            # (n_samples, n_features) array, and a bare 1-D vector is
            # rejected by modern versions.
            prediction = clf.predict([vector])
            #Determine the cost from the penalty matrix
            cost += test_weight[i][test_count] * pen[i][prediction[0]]
            #Count the number of mistakes (any prediction with penalty > 0)
            if pen[i][prediction[0]] > 0:
                #print("Incorrectly Predicted " + str(Segments.reverse_mapping[i]) + " as " + str(Segments.reverse_mapping[prediction[0]]))
                mistakes += 1
            test_count += 1
    print("Number of mistakes: " + str(mistakes) + " of " + \
          str(vector_count) + ", " + \
          str((1.-float(mistakes)/float(vector_count))*100) + \
          "% accurate")
    return cost
def ada_boost_dt():
    """
    Submission: ada_boost_dt_0707_03.csv
    E_val: 0.854350
    E_in: 0.889561
    E_out: 0.8832315976033993
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    # Standardise the raw features before boosting.
    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    ab = AdaBoostClassifier(n_estimators=300)
    # 5-fold CV estimate of the validation error (E_val above).
    scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', scores)
    logger.debug('E_val: %f', sum(scores) / len(scores))

    ab.fit(X_scaled, y)
    logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))
    # Persist scaler + model as a single pipeline for submission.
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('ab', ab)]), 'ada_boost_dt_0707_03')
class AdaBoost:
    """Thin wrapper around sklearn's AdaBoostClassifier for a
    (features, weights, labels) dataset split by ``split_dataset``."""

    def __init__(self, data, n_estimators=50, learning_rate=1.0):
        features, weights, labels = data
        self.clf = AdaBoostClassifier(n_estimators=n_estimators,
                                      learning_rate=learning_rate)
        self.predictions = None
        self.trnaccuracy = None
        self.tstaccuracy = None
        self.dataset = split_dataset(features, weights, labels)

    def train(self):
        """Fit the boosted classifier on the training split."""
        training = self.dataset['training']
        self.clf = self.clf.fit(training['features'], training['labels'])

    def predict(self):
        """Store predictions for the test split in ``self.predictions``."""
        test_features = self.dataset['test']['features']
        self.predictions = self.clf.predict(test_features)

    def evaluate(self):
        """Compute weighted accuracy on both the training and test splits."""
        for split, attr in (('training', 'trnaccuracy'),
                            ('test', 'tstaccuracy')):
            part = self.dataset[split]
            score = self.clf.score(part['features'],
                                   part['labels'],
                                   sample_weight=part['weights'])
            setattr(self, attr, score)
def ab_predictedValue():
    """Fit AdaBoost on the module-level training frame and return the
    positive-class probabilities for the test frame.

    Reads globals: train_df, test_df, features, NoOfEstimators.
    """
    print '----------AdaBoost----------'
    ab_clf = AdaBoostClassifier(n_estimators = NoOfEstimators)
    ab_clf.fit(train_df[features], train_df['SeriousDlqin2yrs'])
    ab_predictedValue = ab_clf.predict_proba(test_df[features])
    print 'Feature Importance = %s' % ab_clf.feature_importances_
    # Column 1 = probability of the positive class.
    return ab_predictedValue[:,1]
def main():
    """Grid-search an AdaBoost-over-RandomForest classifier and write
    predictions for the test set to submit3.csv.

    NOTE(review): the banner says "gradient boosting" but the model is
    AdaBoost — confirm the intended wording.
    """
    print("gradient boosting classifier!")
    X,Y,Xtest = importdata()
    print(Y.shape)
    # Grid over both the AdaBoost ensemble size and the embedded
    # random-forest hyper-parameters.
    param_grid={
        "n_estimators":[10,100,200,2000,20000],
        "base_estimator__n_estimators":[10,20,50,100,200],
        "base_estimator__min_samples_split":[5,10,20,50]
    }
    ab=AdaBoostClassifier(RandomForestClassifier())
    # NOTE(review): `ab` is used for prediction below, which assumes
    # Gridsearch_impl fits it in place — verify its implementation.
    Gridsearch_impl(X,Y,ab,param_grid,5)

    # for i in range(10,11,5):
    #     clf = DecisionTreeClassifier(min_samples_split=i)
    #     rf = RandomForestClassifier(n_estimators = 100,random_state=0,min_samples_split=i)
    #     ab = AdaBoostClassifier(rf,n_estimators = 10)
    #ab = GradientBoostingClassifier(n_estimators = 100)
    #     score = cross_validation.cross_val_score(ab,X,Y,cv=3)
    #     print(score)
    #     print("average score %f"%np.mean(score))
    #     print("std %f"%np.std(score))
    #     ab.fit(X,Y)

    Ytest = ab.predict(Xtest)
    output(Ytest,'submit3.csv')
def adaboost(df,label_name,feature_names,features_len,ifeat,n_estimators=100):
    """Fit an AdaBoost classifier on the given dataframe columns and return
    (E_in, E_out): the in-sample error and a 5-fold CV error estimate."""
    # TODO: just copied from RF, needs real code
    from sklearn.ensemble import AdaBoostClassifier
    print('---------------------------------------------------')
    print(ifeat,features_len,'Adaboost, features:',feature_names)

    labels = df[label_name].values.ravel()  # turn from 2D to 1D
    samples = df[feature_names].values

    model = AdaBoostClassifier(n_estimators=n_estimators)
    model = model.fit(samples, labels)

    # 'in sample' error
    err_in = round(1. - model.score(samples, labels), 5)

    # ----- Kfold as estimator for 'out of sample' error
    folds = skl.cross_validation.KFold(n=len(samples), n_folds=5)
    fold_scores = skl.cross_validation.cross_val_score(model, samples,
                                                       y=labels, cv=folds)
    err_out = round(1. - np.mean(fold_scores), 5)

    return err_in, err_out
def trainClassifier(dataDir, trialName, NUMFISH):
    """Train an AdaBoost (decision-stump) classifier to identify individual
    fish from circular-HOG features, pickle the fitted model and print the
    training-set error.

    Expects per-fish image directories under
    ``dataDir/process/trialName/FR_ID<tr>/``.
    """
    ch = circularHOGExtractor(6,4,3)
    # Feature vector = circular HOG descriptor + mean image intensity.
    nFeats = ch.getNumFields()+1
    trainData = np.array([])
    targetData = np.array([])
    for tr in range(NUMFISH):
        directory = dataDir + '/process/' + trialName + '/FR_ID' + str(tr) + '/'
        files = [name for name in os.listdir(directory)]
        thisData = np.zeros((len(files),nFeats))
        # Class label for every sample in this directory is the fish id.
        thisTarget = tr*np.ones(len(files))
        i = 0
        for imName in files:
            sample = cv2.imread(directory + imName)
            thisIm = cv2.cvtColor(sample, cv2.COLOR_BGR2GRAY)
            thisData[i,:] = np.hstack((ch.extract(thisIm), np.mean(thisIm)))
            i = i + 1
        trainData = np.vstack((trainData, thisData)) if trainData.size else thisData
        targetData = np.hstack((targetData, thisTarget)) if targetData.size else thisTarget

    #clf = svm.SVC()
    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME",n_estimators=50)
    # fit() returns the estimator itself, not predictions — the original
    # assigned it to y_pred and immediately overwrote it below.
    clf.fit(trainData,targetData)
    pickle.dump(clf, open( dataDir + '/process/' + trialName + '/boost' + trialName + '.p',"wb"))
    y_pred = clf.predict(trainData)
    print("Number of mislabeled points out of a total %d points : %d" % (trainData.shape[0],(targetData != y_pred).sum()))
def __init__(self,n_estimators=50, learning_rate=1.0, algorithm='SAMME.R',\
             criterion='gini', splitter='best', max_depth=5, min_samples_split=2, min_samples_leaf=1,\
             max_features=None, random_state=None, min_density=None, compute_importances=None):
    """Initialise an AdaBoost subclass whose base estimator is a
    DecisionTreeClassifier built from the tree hyper-parameters given here.

    All arguments are stored as attributes; the decision tree is then
    reconstructed from them and passed to AdaBoostClassifier.__init__.
    NOTE(review): min_density and compute_importances exist only in old
    scikit-learn versions — confirm the pinned version.
    """
    base_estimator=DecisionTreeClassifier()
    self.base_estimator = base_estimator
    self.base_estimator_class = self.base_estimator.__class__
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.algorithm = algorithm
    self.splitter = splitter
    self.max_depth = max_depth
    self.criterion = criterion
    self.max_features = max_features
    self.min_density = min_density
    self.random_state = random_state
    self.min_samples_split = min_samples_split
    self.min_samples_leaf = min_samples_leaf
    self.compute_importances = compute_importances
    # Rebuild the actual base estimator from the stored tree parameters.
    self.estimator = self.base_estimator_class(criterion=self.criterion, splitter=self.splitter, max_depth=self.max_depth,\
        min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, max_features=self.max_features,\
        random_state=self.random_state, min_density=self.min_density, compute_importances=self.compute_importances)
    # Delegate to the parent (this class subclasses AdaBoostClassifier).
    AdaBoostClassifier.__init__(self, base_estimator=self.estimator, n_estimators=self.n_estimators, learning_rate=self.learning_rate, algorithm=self.algorithm)
def do_all_study(X,y):
    """Plot learning curves for several classifiers and validation curves
    for GBC and AdaBoost, printing their held-out ROC-AUC.

    NOTE(review): the X, y parameters are never used — every call below
    reads module-level X_train/X_test/y_train/y_test; confirm intent.
    """
    names = [ "Decision Tree","Gradient Boosting",
              "Random Forest", "AdaBoost", "Naive Bayes"]
    classifiers = [
        #SVC(),
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier()]
    # NOTE(review): "Naive Bayes" has no matching entry in classifiers —
    # zip stops at the shorter list, so it is silently skipped.
    for name, clf in zip(names, classifiers):
        estimator,score = plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

    # Validation curve + held-out ROC-AUC for gradient boosting.
    clf_GBC = GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1)
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]
    plot_validation_curve(clf_GBC, X_train, y_train,
                          param_name, param_range, scoring='roc_auc')
    clf_GBC.fit(X_train,y_train)
    y_pred_GBC = clf_GBC.predict_proba(X_test)[:,1]
    print("ROC AUC GradientBoostingClassifier: %0.4f" % roc_auc_score(y_test, y_pred_GBC))

    # Same for AdaBoost.
    clf_AB = AdaBoostClassifier()
    param_name = 'n_estimators'
    param_range = [1, 5, 10, 20,40]
    plot_validation_curve(clf_AB, X_train, y_train,
                          param_name, param_range, scoring='roc_auc')
    clf_AB.fit(X_train,y_train)
    y_pred_AB = clf_AB.predict_proba(X_test)[:,1]
    print("ROC AUC AdaBoost: %0.4f" % roc_auc_score(y_test, y_pred_AB))
def train_adaboost(features, labels):
    """One-vs-rest AdaBoost training over the first N_LAB unique labels.

    For each target label, fits N_RUNS boosted ensembles of decision stumps
    on binary subsets from ``get_binary_sets``.  Reads module-level N_LAB,
    N_RUNS, N_ESTIM and learning_rate.  Returns the list of per-label runs.
    """
    uniqLabels = np.unique(labels)
    print 'TAKING ONLY ', str(N_LAB), ' LABELS FOR SPEED '
    uniqLabels = uniqLabels[:N_LAB]

    allLearners = []
    for targetLab in uniqLabels:
        print 'processing for label ', str(targetLab)
        runs=[]
        #import ipdb;ipdb.set_trace()
        for rrr in xrange(N_RUNS):
            #import ipdb;ipdb.set_trace()
            # Binary (target-vs-rest) training set — see get_binary_sets.
            feats,labs = get_binary_sets(features, labels, targetLab)
            #print 'fitting stump'
            #import ipdb;ipdb.set_trace()
            # Pre-fitted decision stump used as the boosting base estimator.
            baseClf = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
            baseClf.fit(feats, labs)
            ada_real = AdaBoostClassifier( base_estimator=baseClf, learning_rate=learning_rate, n_estimators=N_ESTIM, algorithm="SAMME.R")
            #import ipdb;ipdb.set_trace()
            runs.append(ada_real.fit(feats, labs))
        allLearners.append(runs)

    return allLearners
def prediction(feat,label):
    """Sweep the base-tree depth 1..9 for AdaBoost (100 estimators) and
    return, per depth, the depths, test accuracies and test ROC-AUCs on a
    75/25 split.

    :return: (num_leaves, accuracy_scores, auc_scores) — three parallel lists
    """
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        feat, label, test_size = 0.25, random_state = 0)
    num_leaves = []
    # Renamed from `accuracy_score` / `auc_score` so the local result lists
    # no longer shadow sklearn's metric-function names.
    accuracy_scores = []
    auc_scores = []

    for depth in range(1,10):
        clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth = depth),
                                 n_estimators = 100)
        clf.fit(x_train,y_train)
        predictions = clf.predict(x_test)
        accuracy = clf.score(x_test,y_test)
        auc = metrics.roc_auc_score(y_test,predictions)
        num_leaves.append(depth)
        accuracy_scores.append(accuracy)
        auc_scores.append(auc)

    return num_leaves, accuracy_scores, auc_scores
def runAdaReal(arr):#depth, n_est, filename, lrn_rate=1.0):
    """Objective function for a scipy optimiser: train real (SAMME.R-style)
    AdaBoost with the hyper-parameters packed in *arr* and return the
    negated AMS score (negated because the optimiser minimises).

    arr[0]*100 -> tree depth, arr[1]*100 -> n_estimators, arr[2] ->
    learning rate.  Reads/writes globals file_dir, nEvents, solutionFile,
    counter; writes a solution file and logs the score.
    """
    global file_dir, nEvents, solutionFile, counter
    # Decode the optimiser's unit-scaled parameters.
    depth = int(arr[0]*100)
    n_est = int(arr[1]*100)
    lrn_rate = arr[2]
    print 'iteration number ' + str(counter)
    counter+=1
    # Penalise invalid (non-positive) parameters with a large objective.
    if depth <= 0 or n_est <= 0 or lrn_rate <= 0:
        print 'return 100'
        return 100
    filename = 'adar_dep'+str(depth)+'_est'+str(n_est)+'_lrn'+str(lrn_rate) # low
    bdt_real = AdaBoostClassifier(
        tree.DecisionTreeClassifier(max_depth=depth),
        n_estimators=n_est,
        learning_rate=lrn_rate)
    print "AdaBoostReal training"
    bdt_real.fit(sigtr[train_input].values,sigtr['Label'].values)
    print "AdaBoostReal testing"
    bdt_real_pred = bdt_real.predict(sigtest[train_input].values)
    solnFile(filename,bdt_real_pred,sigtest['EventId'].values)#
    print "AdaBoostReal finished"
    ams_score = ams.AMS_metric(solutionFile, file_dir+filename+'.out', nEvents)
    print ams_score
    logfile.write(filename+': ' + str(ams_score)+'\n')
    return -1.0*float(ams_score)
def classify(x, y, cv, n_estimator=50):
    """Cross-validate AdaBoost over entropy decision stumps.

    Returns (accuracy, precision, recall, f1) where the first three are
    means over the CV folds and f1 is the harmonic mean of the averaged
    precision/recall (raises ZeroDivisionError if both are zero).
    The fitted classifier is published via the module-level ``clf``.
    """
    acc, prec, recall = [], [], []
    # Depth-1 "stump" base learner.  NOTE(review): compute_importances and
    # min_density exist only in old scikit-learn versions.
    base_clf = DecisionTreeClassifier(
        compute_importances=None,
        criterion="entropy",
        max_depth=1,
        max_features=None,
        max_leaf_nodes=None,
        min_density=None,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=None,
        splitter="best",
    )
    global clf
    clf = AdaBoostClassifier(base_estimator=base_clf, n_estimators=n_estimator)
    # cv yields (train_indices, test_indices) pairs.
    for train, test in cv:
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        clf = clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
    a = np.mean(acc)
    p = np.mean(prec)
    r = np.mean(recall)
    f = 2 * p * r / (p + r)
    return a, p, r, f
def AB_results(): # AdaBoostClassifier print "--------------AdaBoostClassifier-----------------" rang = [60, 80] # print "--------------With HOG-----------------" # ans = [] # print "n_estimators Accuracy" # for i in rang: # clf = AdaBoostClassifier(n_estimators=i) # clf.fit(X_train_hog, y_train) # mean_accuracy = clf.score(X_test_hog, y_test) # print i, " ", mean_accuracy # ans.append('('+str(i)+", "+str(mean_accuracy)+')') # print ans # plt.plot(rang, ans, linewidth=2.0) # plt.xlabel("n_estimators") # plt.ylabel("mean_accuracy") # plt.savefig("temp_hog.png") print "\n--------------Without HOG-----------------" ans = [] print "n_estimators Accuracy" for i in rang: clf = AdaBoostClassifier(n_estimators=i) clf.fit(X_train, y_train) mean_accuracy = clf.score(X_test, y_test) print i, " ", mean_accuracy ans.append('('+str(i)+", "+str(mean_accuracy)+')') print ans plt.plot(rang, ans, linewidth=2.0) plt.xlabel("n_estimators") plt.ylabel("mean_accuracy") plt.savefig("temp_plain.png")
# Boosted decision tree: AdaBoost over depth-3 entropy trees.
from sklearn.ensemble import AdaBoostClassifier

# Hold out 20% of the data for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y_class, test_size=0.2)
decision_tree = AdaBoostClassifier(DecisionTreeClassifier(criterion="entropy", random_state=0, max_depth=3),
                                   n_estimators=20)
decision_tree = decision_tree.fit(x_train, y_train)
# Accuracy on both splits.
train_accuracy = decision_tree.score(x_train, y_train)
test_accuracy = decision_tree.score(x_test, y_test)
# Random-forest submission.
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
classifier.fit(train, yTrain)
y_pred = classifier.predict(test)
result = gen_result(test_id, y_pred)
result.to_csv('./data/submission3.csv', index=False)
print('random forest finished!')

'''
Ada boost
'''
ada_params = {
    'n_estimators': 200,
    'learning_rate' : 0.75
}
clf = AdaBoostClassifier(**ada_params)
clf.fit(train, yTrain)
y_pred = clf.predict(test)
result = gen_result(test_id, y_pred)
result.to_csv('./data/submission4.csv', index=False)
print('adaboost finished!')

# Vote for the result
# Reload all four submissions to combine them by voting.
res1 = pd.read_csv('./data/submission1.csv')
res2 = pd.read_csv('./data/submission2.csv')
res3 = pd.read_csv('./data/submission3.csv')
res4 = pd.read_csv('./data/submission4.csv')
# Encode country labels as integer columns for the vote (lbl is a label
# encoder fitted elsewhere).
label1 = np.array(lbl.transform(list(res1.country.values))).reshape(-1, 1)
# Compare several classifiers on the same train/test split; score_report
# is defined elsewhere in this file.
model = RandomForestClassifier(n_estimators=2, random_state=1)
model = model.fit(X_train, y_train)
score_report(X_test, y_test)

from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=1, random_state=1)
model = model.fit(X_train, y_train)
score_report(X_test, y_test)

from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=5, random_state=1)
model = model.fit(X_train, y_train)
score_report(X_test, y_test)

from sklearn import svm
model = svm.LinearSVC(C=0.05)
model = model.fit(X_train, y_train)
score_report(X_test, y_test)

model = svm.SVC(kernel='linear', C=0.4)
model = model.fit(X_train, y_train)
score_report(X_test, y_test)
# NOTE(review): score_report is called without the model — presumably it
# reads the global `model`; confirm against its definition.
#ensemble models models = {} print "Training on all features" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1010) models['RFC'] = RandomForestClassifier(n_estimators=300) models['XGB'] = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05) models['GBC'] = GradientBoostingClassifier() models['ABC'] = AdaBoostClassifier() models['ETC'] = ExtraTreesClassifier() for name, model in models.iteritems(): model.fit(X_train, y_train) print name print classification_report(y_test, model.predict(X_test)) print "Accuracy: ", accuracy_score(y_test, model.predict(X_test)) print '\n' feature_importances = pd.DataFrame() for name, model in models.iteritems(): df = pd.DataFrame(data=model.feature_importances_, index=X_test.columns, columns=[name]).transpose()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

#t0 = time()
#knnClf = KNeighborsClassifier()
#knnClf.fit(features_train, labels_train)
#print "default knn training time:", round(time()-t0, 3), "s"

# Train AdaBoost and time the fit.
t0 = time()
adaBoostClf = AdaBoostClassifier(n_estimators=30,learning_rate=0.4)
adaBoostClf.fit(features_train, labels_train)
print "default adaBoost training time:", round(time()-t0, 3), "s"

#t0 = time()
#rfClf = RandomForestClassifier()
#rfClf.fit(features_train, labels_train)
#print "default randomForest training time:", round(time()-t0, 3), "s"

#knnPred = knnClf.predict(features_test)
#knnacc = accuracy_score(knnPred, labels_test)

# Test-set accuracy of the AdaBoost model.
adaBoostPred = adaBoostClf.predict(features_test)
adaBoostacc = accuracy_score(adaBoostPred, labels_test)

#rfPred = rfClf.predict(features_test)
# Location of the corpus text files and the vectoriser vocabulary cap.
DATA_DIRECTORY = '../data/full'
MAX_FEATURES = 2500
CORPUS = []
# Bag-of-words vectoriser capped at MAX_FEATURES terms.
VECTORIZER = CountVectorizer(max_features=MAX_FEATURES)
# Candidate models keyed by display name.
CLASSIFIERS = {
    'GaussianNB': GaussianNB(),
    'MultinomialNB': MultinomialNB(),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'LogisticRegression': LogisticRegression(),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
    'AdaBoost': AdaBoostClassifier(),
    'MLP': MLPClassifier(max_iter=500),
    'SVC(linear, C=0.025)': SVC(kernel="linear", C=0.025, probability=True)
}
ANEW_EMOTION_DICTIONARY = common_utils.get_anew_emotion_dictionary()


def text_id_to_filename(text_id):
    """
    Creates the full filename for the text_id
    :param text_id: the id
    :return: the full filename
    """
    return DATA_DIRECTORY + '/' + text_id + '.txt'
# EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],voting='soft', verbose=0), # SVC(kernel="linear", C=0.025), ExtraTreesClassifier(n_estimators=150, criterion="entropy", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_split=1e-7, bootstrap=False, oob_score=False, n_jobs=1, random_state=410, verbose=0, warm_start=False, class_weight=None), RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=1, oob_score=True, random_state=410, verbose=0, warm_start=False), AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion="entropy", splitter="best", max_depth=1, min_samples_split=2, min_samples_leaf=1,min_weight_fraction_leaf=0., max_features=None, random_state=None,max_leaf_nodes=None, min_impurity_split=1e-7, class_weight=None, presort=False), n_estimators=100, learning_rate=0.1,algorithm='SAMME.R', random_state=410), GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_split=1e-7, init=None, random_state=410, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'), LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.001, priors=None, n_components=410, store_covariance=False, tol=1e-4)] # Logging for Visual Comparison log_cols = ["Classifier", "Accuracy", "Log Loss"] log = pd.DataFrame(columns=log_cols)
# k-nearest neighbor from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=1) neigh.fit(features_train, labels_train) print "KNN Accuracy:", neigh.score(features_test, labels_test) # Random Forest from sklearn.ensemble import RandomForestClassifier rfc = RandomForestClassifier(n_estimators=200) rfc.fit(features_train, labels_train) print "Random Forest Accuracy:", rfc.score(features_test, labels_test) # AdaBoost from sklearn.ensemble import AdaBoostClassifier abc = AdaBoostClassifier() abc.fit(features_train, labels_train) print "AdaBoost Accuracy:", abc.score(features_test, labels_test) prettyPicture(neigh, features_test, labels_test, "neigh.png") prettyPicture(rfc, features_test, labels_test, "rfc.png") prettyPicture(abc, features_test, labels_test, "abc.png") # for clf in [neigh, rfc, abc]: # try: # print "plotting" # prettyPicture(clf, features_test, labels_test) # except NameError: # print "passed" # pass
def init_model(self, settings):
    """Build an AdaBoost classifier configured from *settings*.

    settings is a mapping that must contain an "n_estimators" entry; all
    other AdaBoost parameters are left at their defaults.
    """
    estimator_count = settings["n_estimators"]
    return AdaBoostClassifier(n_estimators=estimator_count)
# Append the predicted sentiment as an extra feature column alongside the
# LDA topic distributions for both splits.
topic_dist_train_all_stars['Sentiment'] = sentiment_predicted_train
topic_dist_test_all_stars['Sentiment'] = sentiment_predicted_test
# Feed in the predicted sentiment as the feature along with the topic distribution (From LDA), for the model to train on
# Use the model to predict star rating from the topic distribution and sentiment of the testing reviews
train_features = topic_dist_train_all_stars
train_lables = stars_label_train_all_stars  # NOTE(review): 'lables' typo is pre-existing; kept for compatibility
test_features = topic_dist_test_all_stars
test_lables = stars_label_test_all_stars
# Candidate models, evaluated in order; classifiers_names is index-aligned.
classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    RandomForestClassifier(n_estimators=100, n_jobs=2),
    AdaBoostClassifier(n_estimators=100)
]
classifiers_names = [
    'Multinomial Naive Bayes', 'Logistic Regression', 'Random Forest',
    'AdaBoost'
]
LdaSentimentResults = {}
for (i, classifier) in enumerate(classifiers):
    model = classifier.fit(train_features, train_lables)
    preds = model.predict(test_features)
    # Default (binary) averaging — presumably binary labels; TODO confirm.
    precision = metrics.precision_score(test_lables, preds)
    recall = metrics.recall_score(test_lables, preds)
    F1 = metrics.f1_score(test_lables, preds)
    accuracy = accuracy_score(test_lables, preds)
adasyn = over_sampling.ADASYN() adasyn_x, adasyn_y = adasyn.fit_sample(train_x,train_y) print adasyn_x.shape models = [] models.append(("LR",LogisticRegression())) models.append(("LDA",LinearDiscriminantAnalysis())) models.append(("KNN",KNeighborsClassifier())) models.append(("DCT",DecisionTreeClassifier())) models.append(("GNB",GaussianNB())) models.append(("SVC",SVC())) models.append(("GPC",GaussianProcessClassifier(1.0*RBF(1.0)))) models.append(("MLP",MLPClassifier())) models.append(("ADB",AdaBoostClassifier())) for name, model in models: training("Normal",name,train_x,train_y) training("ROS",name,ROS_x,ROS_y) training("SMOTE",name,smote_x,smote_y) training("ADASYN",name,adasyn_x,adasyn_y) print "----------------------------------------------" print np.unique(train_y) weight=class_weight.compute_class_weight("balanced",np.unique(train_y),train_y) print weight result = SVC(class_weight={0:weight[0],1:weight[1],2:weight[2]}).fit(train_x,train_y) print "class_weight metod and svc models result : ",result.score(test_x,test_y)
# Scores for the Naive Bayes predictions computed just above this chunk.
print('Naive Bayes Accuracy score: ', accuracy_score(Y_test, predictions))
print('Naive Bayes Recall score: ', recall_score(Y_test, predictions))
print('Naive Bayes F measure: ', f1_score(Y_test, predictions))
print('Naive Bayes precision score: ', precision_score(Y_test, predictions))
print()

# Fit the remaining single-model baselines on X_train_cv and report test
# accuracy for each, in the original order.
decision_tree = DecisionTreeClassifier()
adaboost = AdaBoostClassifier()
k_nearest = KNeighborsClassifier()
for title, model in (('Decision Tree', decision_tree),
                     ('Adaboost', adaboost),
                     ('K-Nearest Neighbour', k_nearest)):
    model.fit(X_train_cv, Y_train)
    predictions = model.predict(X_test_cv)
    print(title + ' Accuracy score: ', accuracy_score(Y_test, predictions))
    print()

# Random Forest (its predictions are evaluated after this chunk).
random_forest = RandomForestClassifier()
random_forest.fit(X_train_cv, Y_train)
anon_text = doc.create_text(anon=True) blob = TextBlob(anon_text) anon_scores.append(blob.sentiment.subjectivity) return pure_scores, anon_scores if __name__ == "__main__": clf_names = [ "AdaBoostClassifier", "LogisticRegression", "SGDClassifier", "LinearSVC", "RandomForest", "GradientBoosting" ] classifiers = [ AdaBoostClassifier(), LogisticRegression(class_weight="balanced"), SGDClassifier(class_weight="balanced"), LinearSVC(class_weight="balanced"), RandomForestClassifier(class_weight="balanced"), GradientBoostingClassifier() ] """ test_polarity_azure(clf_path="binary_classification/models/" + clf_names[0] + "_default_flair", clf=classifiers[0], save_path="results/sentiment/azure_baseline_polarity.csv", anon=False) #for clf, clf_name in zip(classifiers, clf_names): # print("\nWorking on " + clf_name + "\n") # test_polarity_azure(clf_path="binary_classification/models/" + clf_name + "_default_flair", clf=clf, # save_path="results/sentiment/azure_" + clf_name + "_polarity.csv")
path = "C://Users//Arushi//PycharmProjects//Final_Thesis_chap1//9//"
# Sweep decision thresholds; for each threshold, average the metrics over
# `iter` pre-saved resampled folds.  (Fragment: the outer loop body
# continues past this chunk.)
for threshold in thresholdRange:
    print(threshold)
    overallPrecision = 0
    overallRecall = 0
    overallAuauc = 0
    overallAccuracy = 0
    overallMc = 0
    for i in range(iter):
        # Pre-transformed folds stored as .npy; labels cast float -> int.
        X_train = np.load(path + 'transformed_train_data_' + str(i) + '.npy').astype(float)
        Y_train = np.load(path + 'transformed_train_labels_' + str(i) + '.npy'). astype(float).astype(int)
        X_test = np.load(path + 'transformed_test_data_' + str(i) + '.npy').astype(float)
        Y_test = np.load(path + 'transformed_test_labels_' + str(i) + '.npy').astype(float).astype(int)
        # Per-fold tuned hyper-parameters selected earlier (task-specific).
        bp = best_params[i]
        clf = AdaBoostClassifier(base_estimator=bp['base_estimator'],
                                 n_estimators=bp['n_estimators'],
                                 algorithm=bp['algorithm'],
                                 random_state=seed)
        # Platt (sigmoid) calibration so predict_proba is meaningful for
        # thresholding below.
        clf_sigmoid = CalibratedClassifierCV(clf, cv=cvCount, method='sigmoid').fit(X_train, Y_train.ravel())
        predictionsProb = clf_sigmoid.predict_proba(X_test)
        predictions = getPredictionsGivenThreshold(predictionsProb, threshold)
        precision = precision_score(Y_test, predictions)
        recall = recall_score(Y_test, predictions)
        # AUROC uses the positive-class probability column directly.
        auroc = roc_auc_score(Y_test, predictionsProb[:, 1])
        accuracy = accuracy_score(Y_test, predictions)
        matthewsCoeff = matthews_corrcoef(Y_test, predictions)
        # Accumulate; presumably divided by `iter` after this chunk — TODO confirm.
        overallPrecision += precision
        overallRecall += recall
        overallAuauc += auroc
        overallAccuracy +=accuracy
        overallMc += matthewsCoeff
    thresholdList.append(threshold)
    [int(bagb_pred[i] != y_test[i]) for i in range(0, ts)])
# (fragment) the comprehension above closes a np.asarray(...) call that
# starts before this chunk; bagb_verror is a 0/1 per-sample error vector.
bagb_error = np.sum(bagb_verror)
bagb_ccidx = np.where(bagb_verror == 0)  # correctly classified indices
bagb_mcidx = np.where(bagb_verror == 1)  # misclassified indices
print("🌲 ----------Decision Tree Classfication + Bagging----------")
# NOTE(review): bagb_error / ts is a fraction, not a percentage, despite "%".
print(bagb_error, "misclassified data out of", ts, "(", bagb_error / ts, "%)\n")

'''-------------------- CART (Decision Tree) + Boosting http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html --------------------'''

# adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=20)
# adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=20,learning_rate=1.5,algorithm="SAMME")
#[dodiku] there are many more parameters we can play with
# Boost the shared base tree `dtc` (defined elsewhere) with SAMME.R AdaBoost.
adab = AdaBoostClassifier(dtc, n_estimators=20, learning_rate=1.5, algorithm="SAMME.R")
# adab = GradientBoostingClassifier(max_depth=5, n_estimators=30)
adab.fit(x_training, y_training)

# Predicting
adab_pred = adab.predict(x_test)

# Finding mispredicted samples (same 0/1 error-vector pattern as bagging above)
adab_verror = np.asarray(
    [int(adab_pred[i] != y_test[i]) for i in range(0, ts)])
adab_error = np.sum(adab_verror)
adab_ccidx = np.where(adab_verror == 0)  # correctly classified indices
adab_mcidx = np.where(adab_verror == 1)  # misclassified indices
def btnConvert_click(self): msgBox = QMessageBox() try: FoldFrom = np.int32(ui.txtFoldFrom.text()) FoldTo = np.int32(ui.txtFoldTo.text()) except: print("Please check fold parameters!") return if FoldTo < FoldFrom: print("Please check fold parameters!") return # Algorithm Algorithm = ui.cbAlgorithm.currentText() # NEstimators try: NEstimators = np.int(ui.txtNEstimators.text()) except: msgBox.setText("Number of Estimators is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # LearningRate try: LearningRate = np.float(ui.txtLearningRate.text()) except: msgBox.setText("Learning Rate is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Filter try: Filter = ui.txtFilter.text() if not len(Filter): Filter = None else: Filter = Filter.replace("\'", " ").replace(",", " ").replace( "[", "").replace("]", "").split() Filter = np.int32(Filter) except: print("Filter is wrong!") return # OutFile OutFile = ui.txtOutFile.text() if not len(OutFile): msgBox.setText("Please enter out file!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False Fold = list() accuracy = list() precision = list() average_precision = list() f1score = list() recall = list() accuracyTr = list() precisionTr = list() average_precisionTr = list() f1scoreTr = list() recallTr = list() InFileList = list() OutData = dict() OutData["ModelAnalysis"] = "AdaBoost" for fold in range(FoldFrom, FoldTo + 1): # OutModel OutModel = ui.txtOutModel.text() if not len(OutModel): OutModel = None else: OutModel = OutModel.replace("$FOLD$", str(fold)) # InFile InFile = ui.txtInFile.text() InFile = InFile.replace("$FOLD$", str(fold)) InFileList.append(InFile) if not len(InFile): msgBox.setText("Please enter input file!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not 
os.path.isfile(InFile): msgBox.setText("Input file not found!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False InData = io.loadmat(InFile) # Data if not len(ui.txtITrData.currentText()): msgBox.setText("Please enter Input Train Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeData.currentText()): msgBox.setText("Please enter Input Test Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Label if not len(ui.txtITrLabel.currentText()): msgBox.setText("Please enter Train Input Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not len(ui.txtITeLabel.currentText()): msgBox.setText("Please enter Test Input Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False TrX = InData[ui.txtITrData.currentText()] TeX = InData[ui.txtITeData.currentText()] TrL = InData[ui.txtITrLabel.currentText()][0] TeL = InData[ui.txtITeLabel.currentText()][0] try: if Filter is not None: for fil in Filter: # Remove Training Set labelIndx = np.where(TrL == fil)[0] TrL = np.delete(TrL, labelIndx, axis=0) TrX = np.delete(TrX, labelIndx, axis=0) # Remove Testing Set labelIndx = np.where(TeL == fil)[0] TeL = np.delete(TeL, labelIndx, axis=0) TeX = np.delete(TeX, labelIndx, axis=0) print("Class ID = " + str(fil) + " is removed from data.") if ui.cbScale.isChecked(): TrX = preprocessing.scale(TrX) TeX = preprocessing.scale(TeX) print( "Whole of data is scaled Train~N(0,1) and Test~N(0,1)." 
) except: print("Cannot load data or label") return # FoldID if not len(ui.txtFoldID.currentText()): msgBox.setText("Please enter FoldID variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: currFID = InData[ui.txtFoldID.currentText()][0][0] Fold.append(currFID) except: print("Cannot load Fold ID!") return try: allvars = dict(locals(), **globals()) exec(ui.txtBase.toPlainText(), allvars, allvars) base = allvars['base'] except Exception as e: print("Event codes generated following error:") print(e) msgBox = QMessageBox() msgBox.setText(str(e)) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return try: clf = AdaBoostClassifier(base_estimator=base, n_estimators=NEstimators,\ learning_rate=LearningRate,algorithm=Algorithm) print("FoldID = " + str(currFID) + " is training ...") clf.fit(TrX, TrL) if OutModel is not None: joblib.dump(clf, OutModel) print("FoldID = " + str(currFID) + " Model is saved: " + OutModel) print("FoldID = " + str(currFID) + " is testing ...") PeL = clf.predict(TeX) PrL = clf.predict(TrX) OutData["confusion_matrix"] = confusion_matrix( TeL, PeL, np.unique(TeL)) except Exception as e: print(e) msgBox = QMessageBox() msgBox.setText(str(e)) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return if ui.cbAverage.isChecked(): acc = accuracy_score(TeL, PeL) accTr = accuracy_score(TrL, PrL) accuracy.append(acc) accuracyTr.append(accTr) print( "FoldID = {:d}, Average Train {:5.2f} Test {:5.2f}" .format(currFID, accTr * 100, acc * 100)) if ui.cbPrecision.isChecked(): pre = precision_score(TeL, PeL, average=ui.cbPrecisionAvg.currentData()) preTr = precision_score( TrL, PrL, average=ui.cbPrecisionAvg.currentData()) precision.append(pre) precisionTr.append(preTr) print( "FoldID = {:d}, Precision Train {:5.2f} Test {:5.2f}" .format(currFID, preTr * 100, pre * 100)) if ui.cbAPrecision.isChecked(): 
prA = average_precision_score( TeL, PeL, average=ui.cbAPrecisionAvg.currentData()) prATr = average_precision_score( TrL, PrL, average=ui.cbAPrecisionAvg.currentData()) average_precision.append(prA) average_precisionTr.append(prATr) print( "FoldID = {:d}, Average Precision: Train {:5.2f} Test {:5.2f}" .format(currFID, prATr * 100, prA * 100)) if ui.cbRecall.isChecked(): rec = recall_score(TeL, PeL, average=ui.cbRecallAvg.currentData()) recTr = recall_score(TrL, PrL, average=ui.cbRecallAvg.currentData()) recall.append(rec) recallTr.append(recTr) print( "FoldID = {:d}, Recall: Train {:5.2f} Test {:5.2f}" .format(currFID, recTr * 100, rec * 100)) if ui.cbF1.isChecked(): f1 = f1_score(TeL, PeL, average=ui.cbF1Avg.currentData()) f1Tr = f1_score(TrL, PrL, average=ui.cbF1Avg.currentData()) f1score.append(f1) f1scoreTr.append(f1Tr) print( "FoldID = {:d}, F1: Train {:5.2f} Test {:5.2f}" .format(currFID, f1Tr * 100, f1 * 100)) print("FoldID = " + str(currFID) + " is analyzed!") if ui.cbAverage.isChecked(): OutData["FoldAccuracy"] = accuracy MeanAcc = np.mean(accuracy) OutData["MeanTestAccuracy"] = MeanAcc STDAcc = np.std(accuracy) OutData["StdTestAccuracy"] = STDAcc MeanAccTr = np.mean(accuracyTr) OutData["MeanTrainAccuracy"] = MeanAccTr STDAccTr = np.std(accuracyTr) OutData["StdTrainAccuracy"] = STDAccTr print( "Accuracy: Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}" .format(MeanAccTr * 100, STDAccTr, MeanAcc * 100, STDAcc)) if ui.cbPrecision.isChecked(): OutData["ModePrecision"] = ui.cbPrecisionAvg.currentText() OutData["FoldPrecision"] = precision MeanPre = np.mean(precision) OutData["MeanTrainPrecision"] = MeanPre STDPre = np.std(precision) OutData["StdTrainPrecision"] = STDPre MeanPreTr = np.mean(precisionTr) OutData["MeanTestPrecision"] = MeanPreTr STDPreTr = np.std(precisionTr) OutData["StdTestPrecision"] = STDPreTr print( "Precision: Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}" .format(MeanPreTr * 100, STDPreTr, MeanPre * 100, STDPre)) if 
ui.cbAPrecision.isChecked(): OutData["ModeAveragePrecision"] = ui.cbAPrecisionAvg.currentText() OutData["FoldAveragePrecision"] = average_precision MeanAPre = np.mean(average_precision) OutData["MeanTrainAveragePrecision"] = MeanAPre STDAPre = np.std(average_precision) OutData["StdTestAveragePrecision"] = STDAPre MeanAPreTr = np.mean(average_precisionTr) OutData["MeanTrainAveragePrecision"] = MeanAPreTr STDAPreTr = np.std(average_precisionTr) OutData["StdTrainAveragePrecision"] = STDAPreTr print( "AveragePrecision: Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}" .format(MeanAPreTr * 100, STDAPreTr, MeanAPre * 100, STDAPre)) if ui.cbRecall.isChecked(): OutData["ModeRecall"] = ui.cbRecallAvg.currentText() OutData["FoldRecall"] = recall MeanRec = np.mean(recall) OutData["MeanTestRecall"] = MeanRec STDRec = np.std(recall) OutData["StdTestRecall"] = STDRec MeanRecTr = np.mean(recallTr) OutData["MeanTrainRecall"] = MeanRecTr STDRecTr = np.std(recallTr) OutData["StdTrainRecall"] = STDRecTr print( "Recall: Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}" .format(MeanRecTr * 100, STDRecTr, MeanRec * 100, STDRec)) if ui.cbF1.isChecked(): OutData["ModeF1"] = ui.cbF1Avg.currentText() OutData["FoldF1"] = f1score MeanF1 = np.mean(f1score) OutData["MeanTestF1"] = MeanF1 STDF1 = np.std(f1score) OutData["StdTestF1"] = STDF1 MeanF1Tr = np.mean(f1scoreTr) OutData["MeanTrainF1"] = MeanF1Tr STDF1Tr = np.std(f1scoreTr) OutData["StdTrainF1"] = STDF1Tr print( "F1: Train {:5.2f} +/- {:4.2f} Test {:5.2f} +/- {:4.2f}" .format(MeanF1Tr * 100, STDF1Tr, MeanF1 * 100, STDF1)) OutData["InputFiles"] = InFileList print("Saving ...") io.savemat(OutFile, mdict=OutData) print("DONE.") msgBox.setText("AdaBoost Classification is done.") msgBox.setIcon(QMessageBox.Information) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_()
# Load the held-out test set and select the model features.
test = pd.read_csv("test.csv")
X_test = pd.DataFrame(test.loc[:, features].values)
# Targets for the test split come from the sample-submission file.
submission = pd.read_csv("gender_submission.csv")
Y_test = submission.loc[:, "Survived"].values
# Split the training frame into features and target.
X_train = pd.DataFrame(train.loc[:, features].values)
Y_train = train.loc[:, "Survived"].values
# Encode categorical features as integers.  BUG FIX: the encoder must be
# fitted on train and test jointly — previously `le.fit_transform` was
# re-run independently on the test frame, so the same category could map
# to different integers in each split (and unseen test categories crashed).
for col in X_train.columns:
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([X_train[col], X_test[col]], axis=0))
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
# Fit AdaBoost and report accuracy on the test split.
classifier = AdaBoostClassifier()
classifier.fit(X_train, Y_train)
score = classifier.score(X_test, Y_test)
print(score)
def train(): # if os.path.exists('dataset/per_feature_matrix'): # per_feature_matrix = pickle.load(open('dataset/per_feature_matrix', 'rb')) # else: start = time.time() print "extracting feature matrix..." if 1: per_feature_matrix = {} for each in os.listdir('dataset/per_feature'): path = os.path.join('dataset/per_feature/', each) per_feature_matrix = dict(pickle.load(open(path, 'rb')), **per_feature_matrix) per_feature_matrix = per_feature_matrix.values() pickle.dump(per_feature_matrix, open('dataset/per_feature_matrix', 'wb')) # if os.path.exists('dataset/api_feature_matrix'): # api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb')) # else: if 1: api_feature_matrix = {} for each in os.listdir('dataset/api_feature'): path = os.path.join('dataset/api_feature/', each) api_feature_matrix = dict(pickle.load(open(path, 'rb')), **api_feature_matrix) api_feature_matrix = api_feature_matrix.values() pickle.dump(api_feature_matrix, open('dataset/api_feature_matrix', 'wb')) # if os.path.exists('dataset/ngram_feature_matrix'): # ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb')) # else: if 1: ngram_feature_matrix = {} for each in os.listdir('dataset/ngram_feature'): path = os.path.join('dataset/ngram_feature/', each) ngram_feature_matrix = dict(pickle.load(open(path, 'rb')), **ngram_feature_matrix) ngram_feature_matrix = ngram_feature_matrix.values() pickle.dump(ngram_feature_matrix, open('dataset/ngram_feature_matrix', 'wb')) classification = pickle.load(open('dataset/classification', 'rb')) if per_feature_matrix is not None and api_feature_matrix is not None and ngram_feature_matrix is not None: feature_matrix = _concatenate(per_feature_matrix, api_feature_matrix, ngram_feature_matrix) elif per_feature_matrix is not None: feature_matrix = per_feature_matrix elif api_feature_matrix is not None: feature_matrix = api_feature_matrix elif ngram_feature_matrix is not None: feature_matrix = ngram_feature_matrix else: return print 
"extracting feature matrix done." print "处理前样本总数:%d" % len(feature_matrix) #print len(feature_matrix) #print len(classification) features = 400 fsmodel = SelectKBest(chi2, k=features) raw_feature_matrix = feature_matrix feature_matrix = fsmodel.fit_transform(feature_matrix, classification) pickle.dump(fsmodel, open('dataset/fsmodel', 'wb')) features = 300 svc = SVC(kernel="linear", C=1) fsmodel2 = RFE(estimator=svc, n_features_to_select=features, step=1) ######################### DEBUG ############################ #classification = classification[7:] ################################################################## feature_matrix = fsmodel2.fit_transform(feature_matrix, classification) pickle.dump(fsmodel2, open('dataset/fsmodel2', 'wb')) ######################### DEBUG ############################ b_s = 5 #改这里也要改dl.py里面的默认值 length = len(feature_matrix) feature_matrix = feature_matrix[length % b_s:] raw_feature_matrix = raw_feature_matrix[length % b_s:] classification = classification[length % b_s:] print "处理后样本总数:%d" % len(feature_matrix) ################################################################## ######################### DEBUG ############################ fs_vec = [] for i in range(len(raw_feature_matrix[0])): fs_vec.append(i) #构造值等于编号的特殊向量 fs_vec = fsmodel.transform(fs_vec) #print fs_vec fs_vec = fsmodel2.transform(fs_vec) #print fs_vec feature_matrix_dl = [x for x in range(len(raw_feature_matrix))] for i in range(len(feature_matrix_dl)): feature_matrix_dl[i] = [ x for x in range(len(raw_feature_matrix[0]) - features) ] temp = 0 for i in range(len(raw_feature_matrix[0])): if i not in fs_vec: print "第%d列特征没有选用" % i for j in range(len(feature_matrix_dl)): feature_matrix_dl[j][temp] = raw_feature_matrix[j][i] temp = temp + 1 #print "行数%d" % len(feature_matrix_dl) #print "列数%d" % len(feature_matrix_dl[0]) #print feature_matrix_dl ################################################################## #hiddeny, da = test_dA(feature_matrix_dl, 
len(feature_matrix_dl[0])) # hiddeny2, test = test_dA(feature_matrix,len(feature_matrix[0]), batch_size=6, da_object = da) hiddeny, da = test_rbm(feature_matrix_dl, len(feature_matrix_dl[0])) #print len(feature_matrix) print "浅度特征数:%d" % len(feature_matrix[0]) #print len(hiddeny) print "深度特征数:%d" % len(hiddeny[0]) # print (hiddeny == hiddeny2).all() #固化深度训练器 pickle.dump(da, open('dataset/rbmmodel', 'wb')) # 深度特征融合 feature_matrix = numpy.concatenate((feature_matrix, hiddeny), axis=1) Z = [] count = 0 for i in feature_matrix: Z.append([]) for j in i: Z[count].append(j) count += 1 feature_matrix = Z # print feature_matrix Z = [] for i in classification: Z.append(int(i)) classification = Z if 1: per_feature_matrix2 = {} for each in os.listdir('test/per_feature'): path = os.path.join('test/per_feature/', each) per_feature_matrix2 = dict(pickle.load(open(path, 'rb')), **per_feature_matrix2) per_feature_matrix2 = per_feature_matrix2.values() pickle.dump(per_feature_matrix2, open('test/per_feature_matrix', 'wb')) # if os.path.exists('dataset/api_feature_matrix'): # api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb')) # else: if 1: api_feature_matrix2 = {} for each in os.listdir('test/api_feature'): path = os.path.join('test/api_feature/', each) api_feature_matrix2 = dict(pickle.load(open(path, 'rb')), **api_feature_matrix2) api_feature_matrix2 = api_feature_matrix2.values() pickle.dump(api_feature_matrix2, open('test/api_feature_matrix', 'wb')) # if os.path.exists('dataset/ngram_feature_matrix'): # ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb')) # else: if 1: ngram_feature_matrix2 = {} for each in os.listdir('test/ngram_feature'): path = os.path.join('test/ngram_feature/', each) ngram_feature_matrix2 = dict(pickle.load(open(path, 'rb')), **ngram_feature_matrix2) ngram_feature_matrix2 = ngram_feature_matrix2.values() pickle.dump(ngram_feature_matrix2, open('test/ngram_feature_matrix', 'wb')) classification2 = 
pickle.load(open('test/classification', 'rb')) if per_feature_matrix2 is not None and api_feature_matrix2 is not None and ngram_feature_matrix2 is not None: feature_matrix2 = _concatenate(per_feature_matrix2, api_feature_matrix2, ngram_feature_matrix2) elif per_feature_matrix2 is not None: feature_matrix2 = per_feature_matrix2 elif api_feature_matrix2 is not None: feature_matrix2 = api_feature_matrix2 elif ngram_feature_matrix2 is not None: feature_matrix2 = ngram_feature_matrix2 else: return print "extracting feature matrix done." print "处理前样本总数:%d" % len(feature_matrix2) #print len(feature_matrix) #print len(classification) features = 400 fsmodel2 = SelectKBest(chi2, k=features) raw_feature_matrix2 = feature_matrix2 feature_matrix2 = fsmodel.fit_transform(feature_matrix2, classification2) features2 = 300 svc = SVC(kernel="linear", C=1) fsmodel2 = RFE(estimator=svc, n_features_to_select=features2, step=1) feature_matrix2 = fsmodel2.fit_transform(feature_matrix2, classification2) ######################### DEBUG ############################ b_s = 5 #改这里也要改dl.py里面的默认值 length = len(feature_matrix2) feature_matrix2 = feature_matrix2[length % b_s:] raw_feature_matrix2 = raw_feature_matrix2[length % b_s:] classification2 = classification2[length % b_s:] print "处理后样本总数:%d" % len(feature_matrix2) ################################################################## ######################### DEBUG ############################ fs_vec2 = [] for i in range(len(raw_feature_matrix2[0])): fs_vec2.append(i) #构造值等于编号的特殊向量 fs_vec2 = fsmodel.transform(fs_vec2) #print fs_vec fs_vec2 = fsmodel2.transform(fs_vec2) #print fs_vec feature_matrix_dl2 = [x for x in range(len(raw_feature_matrix2))] for i in range(len(feature_matrix_dl2)): feature_matrix_dl2[i] = [ x for x in range(len(raw_feature_matrix2[0]) - features2) ] temp = 0 for i in range(len(raw_feature_matrix2[0])): if i not in fs_vec2: print "第%d列特征没有选用" % i for j in range(len(feature_matrix_dl2)): feature_matrix_dl2[j][temp] = 
raw_feature_matrix2[j][i] temp = temp + 1 hiddeny2, da = test_rbm(feature_matrix_dl2, len(feature_matrix_dl2[0])) #print len(feature_matrix) print "浅度特征数:%d" % len(feature_matrix2[0]) #print len(hiddeny) print "深度特征数:%d" % len(hiddeny2[0]) # print (hiddeny == hiddeny2).all() # 深度特征融合 feature_matrix2 = numpy.concatenate((feature_matrix2, hiddeny2), axis=1) Z = [] count = 0 for i in feature_matrix2: Z.append([]) for j in i: Z[count].append(j) count += 1 feature_matrix2 = Z # print feature_matrix Z = [] for i in classification2: Z.append(int(i)) classification2 = Z ''' kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True) print "\nlearning with RF..." rf = RandomForestClassifier(n_estimators=300, min_samples_split=10) rf.fit(feature_matrix2, classification2) print "Cross Validating..." scores = cross_validation.cross_val_score(rf, feature_matrix2, classification2, cv=kf) print scores print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)) print "learning with RF done.\n" pickle.dump(rf, open('dataset/model', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with GBDT..." gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0, max_depth=100, min_samples_split=10, random_state=0) gbdt.fit(feature_matrix2, classification2) print "Cross Validating..." scores = cross_validation.cross_val_score(gbdt, feature_matrix2, classification2, cv=kf) print scores print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)) print "learning with GBDT done.\n" pickle.dump(gbdt, open('dataset/model2', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with AdaBoost..." ada = AdaBoostClassifier(n_estimators=300) ada.fit(feature_matrix2, classification2) print "Cross Validating..." 
scores = cross_validation.cross_val_score(ada, feature_matrix2, classification2, cv=kf) print scores print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)) print "learning with AdaBoost done.\n" pickle.dump(ada, open('dataset/model3', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with LogisticRegression..." lr = LogisticRegression() lr.fit(feature_matrix2, classification2) print "Cross Validating..." scores = cross_validation.cross_val_score(lr, feature_matrix2, classification2, cv=kf) print scores print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2)) print "learning with LogisticRegression done.\n" pickle.dump(lr, open('dataset/model4', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True) print "\nlearning with RF..." rf = RandomForestClassifier(n_estimators=300, min_samples_split=10) rf.fit(feature_matrix2, classification2) print "Cross Validating..." predicted = cross_validation.cross_val_predict(rf, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with RF done.\n" pickle.dump(rf, open('dataset/model', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with GBDT..." gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0, max_depth=100, min_samples_split=10, random_state=0) gbdt.fit(feature_matrix2, classification2) print "Cross Validating..." 
predicted = cross_validation.cross_val_predict(gbdt, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with GBDT done.\n" pickle.dump(gbdt, open('dataset/model2', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with AdaBoost..." ada = AdaBoostClassifier(n_estimators=300) ada.fit(feature_matrix2, classification2) print "Cross Validating..." predicted = cross_validation.cross_val_predict(ada, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with AdaBoost done.\n" pickle.dump(ada, open('dataset/model3', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) print "learning with LogisticRegression..." lr = LogisticRegression() lr.fit(feature_matrix2, classification2) print "Cross Validating..." 
predicted = cross_validation.cross_val_predict(lr, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with LogisticRegression done.\n" pickle.dump(lr, open('dataset/model4', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) ''' ''' kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True) print "\nlearning with SVC..." slffork=SVC(kernel='rbf',probability = True) slffork.fit(feature_matrix2, classification2) print "Cross Validating..." predicted = cross_validation.cross_val_predict(slffork, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with SVC done.\n" pickle.dump(slffork, open('dataset/model2', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) ''' ''' print "learning with BaggingClassifier..." kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True) baggingfork = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5,max_features=0.5) baggingfork.fit(feature_matrix2, classification2) print "Cross Validating..." 
predicted = cross_validation.cross_val_predict(baggingfork, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) print "learning with BaggingClassifier done.\n" pickle.dump(baggingfork, open('dataset/model2', 'wb')) # 固化训练结果 #print 'time :%f'% (time.time() - start) ''' '''kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)''' rf = RandomForestClassifier(n_estimators=300, min_samples_split=10) gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0, max_depth=100, min_samples_split=10, random_state=0) ada = AdaBoostClassifier(n_estimators=300) #slf1=SVC(kernel='rbf',probability = True) bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5) print "learning with Voting Classifier..." vc = VotingClassifier(estimators=[('rf', rf), ('ada', ada), ('bagging', bagging), ('gbdt', gbdt)], voting='soft', weights=[1.5, 1.5, 1.3, 1.5]) vc.fit(feature_matrix, classification) ''' print "Cross Validating..." 
predicted = cross_validation.cross_val_predict(vc, feature_matrix2, classification2, cv=kf) print "Confusion matrix: " print metrics.confusion_matrix(classification2, predicted) print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted)) print "Precision: " print metrics.precision_score(classification2, predicted, average=None) print "Recall: " print metrics.recall_score(classification2, predicted, average=None) print "F1 " print metrics.f1_score(classification2, predicted, average=None) ''' print "learning with Ensemble Classifier done.\n" pickle.dump(vc, open('dataset/model_final', 'wb')) # 固化训练结果 print 'time :%f' % (time.time() - start)
def class34(filename, i):
    ''' This function performs experiment 3.4

    Compares the five classifiers with 5-fold cross-validation and runs a
    paired t-test between the supposed best classifier and each of the others,
    writing the per-fold accuracies, the p-values and a short comment to
    a1_3.4.csv.

    Parameters
       filename : string, the name of the npz file from Task 2
       i: int, the 1-based index of the supposed best classifier (from task 3.1)
    '''
    i = i - 1  # convert to a 0-based index into `classifiers`
    data = np.load(filename)["arr_0"]
    # Columns 0..172 are the features, column 173 the class label.
    # (Vectorized slicing replaces the previous per-row Python loop.)
    data = np.asarray(data)
    X = data[:, 0:173]
    y = data[:, 173]
    classifiers = [
        SVC(kernel='linear', max_iter=1000),
        SVC(gamma=2, max_iter=1000),
        RandomForestClassifier(max_depth=5, n_estimators=10),
        MLPClassifier(alpha=0.05),
        AdaBoostClassifier()
    ]
    kf = KFold(n_splits=5, shuffle=True)
    # global list to store result: one accuracy row per fold
    fold_test_result_list = []
    p_values = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        accuracy_list = []
        for clf in classifiers:
            classifier = clone(clf)  # fresh, unfitted copy for every fold
            classifier.fit(X_train, y_train)
            prediction = classifier.predict(X_test)
            c_m = confusion_matrix(y_test, prediction)
            accuracy_list.append(accuracy(c_m))
        fold_test_result_list.append(accuracy_list)
    # Transpose so that rows index classifiers and columns index folds.
    vertical_result = np.transpose(fold_test_result_list)
    # compare the result with the best classifier
    for j in range(len(classifiers)):
        if i != j:
            S = stats.ttest_rel(vertical_result[i], vertical_result[j])
            p_values.append(S[1])  # S[1] is the two-tailed p-value
    with open('a1_3.4.csv', 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        for result in fold_test_result_list:
            spamwriter.writerow(result)
        spamwriter.writerow(p_values)
        # ("trianing" typo in the report text fixed to "training".)
        spamwriter.writerow([
            "The accuracy of the cross-validation's result may lead different result as part3.1 "
            + "It could be caused by the variance of the data. In the 3.1, there are only one set of training"
            " and testing data. The form of the training set may lead to bias."
        ])
    # Tail of a counting helper whose `def` line precedes this chunk; by
    # symmetry with getFN below it counts true negatives (prediction == label
    # == 0). NOTE(review): confirm against the full file.
    for pre, real in zip(predictions, input_y):
        if pre == real and real == 0:
            count += 1
    return count


def getFN(predictions,input_y):
    """Count false negatives: entries predicted 0 whose true label is 1."""
    count = 0
    for pre, real in zip(predictions, input_y):
        if pre == 0 and real == 1:
            count += 1
    return count


if __name__ == "__main__":
    # Base models whose out-of-fold predictions feed a stacking ensemble.
    rf_model = RandomForestClassifier()
    adb_model = AdaBoostClassifier()
    gdbc_model = GradientBoostingClassifier()
    et_model = ExtraTreesClassifier()
    svc_model = SVC()
    for myK in range(5):
        train_x, test_x, train_y, test_y = get_dataset('../Features/features2.txt',myK=myK)
        train_sets = []
        test_sets = []
        # Collect each base model's stacked train/test predictions.
        for clf in [rf_model, adb_model, gdbc_model, et_model, svc_model]:
            train_set, test_set = get_stacking(clf, train_x, train_y, test_x)
            train_sets.append(train_set)
            test_sets.append(test_set)
        # NOTE(review): the loop body continues beyond this chunk.
# Hold-out features: the rows of `data` following the training span, with the
# target column dropped.
test_x = data[len(train):(len(train) + len(test))].drop(target, axis=1)

##################
#2 Model data
###################
# An automated meta-model selection helper was tried here and kept only as a
# reference:
#   from meta_predictor import BestRegressor
#   BestRegressor(train_x, train_y, 4, 'r2', 0).evaluate()

# Candidate classifiers, each scored below with 7-fold cross-validation.
regs = [
    DecisionTreeClassifier(),
    AdaBoostClassifier(n_estimators=120, learning_rate=0.2),
    RandomForestClassifier(n_estimators=50, max_depth=8),
    GradientBoostingClassifier(n_estimators=150, max_depth=3),
    XGBClassifier(n_estimators=275, max_depth=3, early_stopping_rounds=5),
]
# Human-readable label for each model, taken from its class name.
infos = [r.__class__.__name__ for r in regs]

train_y = train_y.astype(bool)  # binary target

qmetric = 'accuracy'
for reg, info in zip(regs, infos):
    scores = cross_val_score(reg, train_x, train_y, cv=7, scoring=qmetric)
    print("%s: %0.3f (+/- %0.2f) [%s]"%(qmetric,scores.mean(),scores.std(),info))
def class31(filename):
    ''' This function performs experiment 3.1

    Parameters
       filename : string, the name of the npz file from Task 2

    Returns:
       X_train: NumPy array, with the selected training features
       X_test: NumPy array, with the selected testing features
       y_train: NumPy array, with the selected training classes
       y_test: NumPy array, with the selected testing classes
       i: int, the index of the supposed best classifier
    '''
    data = np.load(filename)["arr_0"]
    # BUG FIX: random.shuffle corrupts 2-D NumPy arrays -- the tuple swap goes
    # through row views, so swapped rows end up duplicated. NumPy's own
    # in-place shuffle permutes the rows correctly.
    np.random.shuffle(data)
    # Columns 0..172 are the features, column 173 the class label; slicing
    # (instead of the old per-row append loop) also makes the returned splits
    # actual NumPy arrays, as the docstring promises.
    X = data[:, 0:173]
    y = data[:, 173]
    # splits data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8)
    classifiers = [
        SVC(kernel='linear', max_iter=1000),
        SVC(gamma=2, max_iter=1000),
        RandomForestClassifier(max_depth=5, n_estimators=10),
        MLPClassifier(alpha=0.05),
        AdaBoostClassifier()
    ]
    accuracy_list = []
    recall_list = []
    precision_list = []
    cm_list = []
    for classifier in classifiers:
        classifier.fit(X_train, y_train)
        prediction = classifier.predict(X_test)
        c_m = confusion_matrix(y_test, prediction)
        cm_list.append(c_m)
        accuracy_list.append(accuracy(c_m))
        recall_list.append(recall(c_m))
        precision_list.append(precision(c_m))
    with open('a1_3.1.csv', 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        # One row per classifier: 1-based index, accuracy, recalls,
        # precisions, then the flattened confusion matrix.
        for i in range(len(accuracy_list)):
            spamwriter.writerow([i + 1] + [accuracy_list[i]] + recall_list[i]
                                + precision_list[i] + cm_list[i].ravel().tolist())
    iBest = np.argmax(accuracy_list) + 1  # 1-based index of the most accurate model
    return (X_train, X_test, y_train, y_test, iBest)
X_train, X_test, y_train, y_test = train_test_split(X, y) #ID Models I want to run model_list = [ LogisticRegression(), LogisticRegression(penalty='l1'), RandomForestClassifier(n_jobs=-1), RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_leaf=2), GradientBoostingClassifier(), GradientBoostingClassifier(n_estimators=100, min_samples_leaf=2, max_depth=5, learning_rate=.01), AdaBoostClassifier(), AdaBoostClassifier(n_estimators=100, learning_rate=.01) ] title_list = [ 'Logistic Regression Base Model', 'Logistic Regression - L1 Penalty Added', 'Random Forest Classifier Base Model', 'Random Forest - Higher N-estimators, pruning trees', 'Gradient Boosting Classifier Base Model', 'Gradient Boosting Classifier - Higher n_estimators and min_samples changed', 'Adaptive Boosting Classifier Base Model', 'Adaptive Boosting Classifier - Higher n_estimators and learning rate changed' ] ##Start NLP modeling using ridenotes column
def adaboost(train_x, train_y, test_x, test_y, msno_df):
    """Train-and-evaluate helper for AdaBoost.

    Builds an AdaBoost ensemble of 200 logistic-regression base learners
    (SAMME.R boosting, learning rate 1.0) and delegates fitting and scoring
    to checkResult, passing the train/test splits and the msno frame through.
    """
    print("Adaboost")
    booster = AdaBoostClassifier(
        base_estimator=LogisticRegression(),
        learning_rate=1.0,
        n_estimators=200,
        algorithm='SAMME.R',
    )
    checkResult(booster, "Adaboost", train_x, train_y, test_x, test_y, msno_df)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Synthetic 3-class, 10-feature problem (nested Gaussian quantile shells).
X, y = make_gaussian_quantiles(n_samples=13000, n_features=10, n_classes=3, random_state=1)

# First 3000 samples train, the remaining 10000 test.
n_split = 3000
X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]
print(X_train.shape)
print(y_train.shape)

# Real-valued SAMME.R boosting vs. discrete SAMME boosting of depth-2 trees.
bdt_real = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1)
bdt_discrete = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1.5,
    algorithm="SAMME")
bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_errors = []
discrete_test_errors = []

# Walk the two boosting chains in lockstep over their staged test predictions.
# NOTE(review): the loop body continues beyond this chunk.
for real_test_predict, discrete_train_predict in zip(bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
def setup_clf_list():
    """ Instantiates all classifiers of interstes to be used.

    Returns a list of (classifier, parameter-grid) tuples, in the order:
    decision tree, linear SVM, AdaBoost, random forest, k-NN, logistic
    regression, LDA, and an RBM -> logistic-regression pipeline.
    """
    # GaussianNB was evaluated earlier and dropped (it has nothing to tune):
    #   clf_list.append((GaussianNB(), {}))
    rbm_pipeline = Pipeline(steps=[('rbm', BernoulliRBM()),
                                   ('logistic', LogisticRegression())])
    return [
        (DecisionTreeClassifier(), {
            "min_samples_split": [2, 5, 10, 20],
            "criterion": ('gini', 'entropy')
        }),
        (LinearSVC(), {
            "C": [0.5, 1, 5, 10, 100, 10**10],
            "tol": [10**-1, 10**-10]
            # ,"class_weight": ['auto']
        }),
        (AdaBoostClassifier(), {"n_estimators": [20, 25, 30, 40, 50, 100]}),
        (RandomForestClassifier(), {
            "n_estimators": [2, 3, 5],
            "criterion": ('gini', 'entropy')
        }),
        (KNeighborsClassifier(), {"n_neighbors": [2, 5], "p": [2, 3]}),
        (LogisticRegression(), {
            "C": [0.05, 0.5, 1, 10, 10**2, 10**5, 10**10, 10**20],
            "tol": [10**-1, 10**-5, 10**-10]
            # ,"class_weight": ['auto']
        }),
        (LinearDiscriminantAnalysis(), {"n_components": [0, 1, 2, 5, 10]}),
        (rbm_pipeline, {
            "logistic__tol": [10**-10, 10**-20],
            "logistic__C": [0.05, 0.5, 1, 10, 10**2, 10**5, 10**10, 10**20]
            # ,"logistic__class_weight": ['auto']
            , "rbm__n_components": [2, 3, 4]
        }),
    ]
    def execute(self, parameters, messages):
        """ Model Train tool

        Trains one of the predefined models with their respective parameters.
        This tool should be executed from a python toolbox.

        Currently supports for Adaboost(SAMME), BrownBoost, logistic
        regression, random forest and support vector machine. New models need
        to have implemented the methods fit, get_params, predict and one of
        predict_proba or decision_function. Additionally, it can implement
        feature_importances_.

        :param parameters: parameters object with all the parameters from the
            python-tool. It necessarily contains
                train_points: (Points) Points that will be used for the training
                train_regressors: (Field) Name of the regressors fields that will be used for the training
                train_response: (Field) Name of the response/class field that will be used for the training
                output_model: (File path) Name of the file where the model will be stored
                leave_one_out: (Boolean) Choose between test with leave-one-out (true) or 3-fold
                    cross-validation (false)
                classifier_name: (String) Name of the model to be trained
        :param messages: messages object to print in the console, must implement AddMessage
        :return: None
        """
        global MESSAGES
        MESSAGES = messages

        # Print parameters for debugging purposes
        print_parameters(parameters)

        # Decompose the parameters object and assign the value to variables
        parameter_dic = {par.name: par for par in parameters}
        classifier_name = parameter_dic["classifier_name"].valueAsText
        train_points = parameter_dic["train_points"].valueAsText
        train_regressors_name = parameter_dic["train_regressors"].valueAsText.split(";")
        train_response_name = parameter_dic["train_response"].valueAsText
        output_model = parameter_dic["output_model"].valueAsText
        leave_one_out = parameter_dic["leave_one_out"].value

        # Check for correctness in the parameters
        _input_validation(parameters)

        # Extract the regressor/response columns from the point data.
        train_regressors = _get_fields(train_points, train_regressors_name)
        train_response = _get_fields(train_points, train_response_name)

        # Choice of the model type, the specific parameters are then passed to variables
        if classifier_name == "Adaboost":
            """
            Parameters:
                num_estimators: (Integer) Number of estimators to be used
                learning_rate: (Float) Learning rate of the model
            For more information about the model visit
            http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
            """
            _verbose_print("Adaboost selected")
            num_estimators = parameter_dic["num_estimators"].value
            learning_rate = parameter_dic["learning_rate"].value
            classifier = AdaBoostClassifier(base_estimator=None, n_estimators=num_estimators,
                                            learning_rate=learning_rate, algorithm='SAMME.R',
                                            random_state=None)
        elif classifier_name == "Logistic Regression":
            """
            Parameters:
                deposit_weight: (Integer) weight to be given to the deposits to deal with unbalanced data
                penalty: (string) type of norm for the penalty
                random_state: (Integer) seed for random generator, useful to obtain reproducible results
            For more information about the model visit
            http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
            """
            _verbose_print("Logistic Regression selected")
            penalty = parameter_dic["penalty"].valueAsText
            deposit_weight = parameter_dic["deposit_weight"].value
            random_state = parameter_dic["random_state"].value
            if deposit_weight is None:
                _verbose_print("deposit_weight is None, balanced wighting will be used")
                class_weight = "balanced"
            else:
                # Classes are labelled +1 (deposit) / -1; the two weights sum to 100.
                class_weight = {1: float(deposit_weight), -1: (100-float(deposit_weight))}
            classifier = LogisticRegression(penalty=penalty, dual=False, tol=0.0001, C=1, fit_intercept=True,
                                            intercept_scaling=1, class_weight=class_weight,
                                            random_state=random_state, solver='liblinear', max_iter=100,
                                            multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
        elif classifier_name == "Brownboost":
            """
            Parameters:
                countdown: (Float) Initial value of the countdown timer
            """
            _verbose_print("BrownBoost selected")
            countdown = parameter_dic["countdown"].value
            classifier = BrownBoostClassifier(base_estimator=None, n_estimators=1000, learning_rate=1,
                                              algorithm='BROWNIAN', random_state=None, countdown = countdown)
        elif classifier_name == "SVM":
            """
            Parameters:
                kernel: (String) Kernel to be used
                deposit_weight: (Integer) weight to be given to the deposits to deal with unbalanced data
                penalty: (string) type of norm for the penalty
                random_state:(Integer) seed for random generator, useful to obtain reproducible results
                normalize: (Boolean) Indicates if the data needs to be normalized (True) or not (False).
                    Notice that SVM is sensitive linear transformations
            For more information about the model visit
            http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
            """
            penalty = parameter_dic["penalty"].value
            kernel = str(parameter_dic["kernel"].valueAsText)
            random_state = parameter_dic["random_state"].value
            deposit_weight = parameter_dic["deposit_weight"].value
            if deposit_weight is None:
                _verbose_print("deposit_weight is None, balanced wighting will be used")
                class_weight = "balanced"
            else:
                class_weight = {1: float(deposit_weight), -1: (100-float(deposit_weight))}
            classifier = SVC(C=penalty, kernel=kernel, degree=3, gamma='auto', coef0=0.0, shrinking=True,
                             probability=True, tol=0.001, cache_size=200, class_weight=class_weight,
                             verbose=False, max_iter=-1, decision_function_shape='ovr',
                             random_state=random_state)
        elif classifier_name == "Random Forest":
            """
            Parameters:
                num_estimators: (Integer) Number of trees to be trained
                max_depth: (Integer) max depth of the trained trees
                deposit_weight: (Integer) weight to be given to the deposits to deal with unbalanced data
                random_state:(Integer) seed for random generator, useful to obtain reproducible results
            For more information about the model visit
            http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
            """
            _verbose_print("Random Forest selected")
            num_estimators = parameter_dic["num_estimators"].value
            max_depth = parameter_dic["max_depth"].value
            random_state = parameter_dic["random_state"].value
            deposit_weight = parameter_dic["deposit_weight"].value
            if deposit_weight is None:
                _verbose_print("deposit_weight is None, balanced wighting will be used")
                class_weight = "balanced"
            else:
                class_weight = {1: float(deposit_weight), -1: (100-float(deposit_weight))}
            classifier = RandomForestClassifier(n_estimators=num_estimators, criterion='gini',
                                                max_depth=max_depth, min_samples_split=2, min_samples_leaf=1,
                                                min_weight_fraction_leaf=0.0, max_features='auto',
                                                max_leaf_nodes=None, min_impurity_split=1e-07, bootstrap=True,
                                                oob_score=False, n_jobs=1, random_state=random_state,
                                                verbose=0, warm_start=False, class_weight=class_weight)
        else:
            raise NotImplementedError("Not implemented classifier: {}".format(classifier_name))

        # Some classifiers need the data be normalized before training, this is done here
        if classifier_name in ["SVM"]:
            normalize = parameter_dic["normalize"].value
            if normalize:
                scaler = StandardScaler().fit(train_regressors)
                train_regressors = scaler.transform(train_regressors)
                MESSAGES.AddMessage("Data normalized")
                if output_model is not None:
                    # Save the information of the normalize transformation
                    joblib.dump(scaler, output_model.replace(".pkl", "_scale.pkl"))

        # train the model
        start = timer()
        classifier.fit(train_regressors, train_response)
        end = timer()
        MESSAGES.AddMessage("Training time: {:.3f} seconds".format(end-start))

        if output_model is not None:
            _save_model(classifier_name, classifier, output_model, train_points, train_regressors_name,
                        train_response_name)
        else:
            _verbose_print("No output model selected")

        # Report training performance (leave-one-out or 3-fold CV per flag).
        _print_train_results(classifier_name, classifier, train_regressors, train_response,
                             train_regressors_name, leave_one_out)
        return
# End timing for the RBF SVM section started earlier in the file.
# NOTE(review): Python 2 script (print statements); time.clock() measures
# CPU time here and was removed in Python 3.8.
end = time.clock()
print "rbf support vector machine accuracy on titanic dataset: %.2f%%" % accuracy
print "time to train rbf support vector machine: %.2f seconds\n" % (end - start)

#random forest on titanic
start = time.clock()
clf = RandomForestClassifier(100)  # 100 trees (positional n_estimators)
clf.fit(X_train,y_train)
accuracy = clf.score(X_test, y_test) * 100.0  # mean accuracy as a percentage
end = time.clock()
print "random forest accuracy on titanic dataset: %.2f%%" % accuracy
print "time to train random forest: %.2f seconds\n" % (end - start)

#adaboost on titanic
start = time.clock()
clf = AdaBoostClassifier()
clf.fit(X_train,y_train)
accuracy = clf.score(X_test, y_test) * 100.0
end = time.clock()
print "adaboost accuracy on titanic dataset: %.2f%%" % accuracy
print "time to train adaboost: %.2f seconds\n" % (end - start)

#k nearest neighbors w/ euclidean distance on titanic
start = time.clock()
clf = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
clf.fit(X_train,y_train)
accuracy = clf.score(X_test, y_test) * 100.0
end = time.clock()
print "euclidean k nearest neighbors accuracy on titanic dataset: %.2f%%" % accuracy
print "time to train euclidean k nearest neighbors: %.2f seconds\n" % (end - start)
def third_generation(X, y, size=200, seed=None):
    """Build a large, diverse pool of fitted classifiers and sample a subset.

    Constructs MLPs, k-NN, polynomial/RBF SVMs, (extra) decision trees,
    AdaBoost and bagging ensembles (plus their individual fitted members), a
    random forest's trees, logistic regressions and SGD classifiers; fits any
    model not already fitted, then returns `size` of them chosen by a seeded
    random permutation.

    Parameters
        X, y : training data used to fit every pool member
        size : number of estimators sampled from the pool
        seed : seed for the sampling permutation (None -> nondeterministic)

    Returns
        (estimators, pool_name) where `estimators` is the sampled fitted
        subset and `pool_name` names the FULL pool.  NOTE(review): pool_name
        is neither permuted nor truncated to `size`, so it does not index
        `estimators` directly -- confirm callers expect this.
    """
    # --- multi-layer perceptrons over (hidden units, momentum, learning rate)
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [MLPClassifier(hidden_layer_sizes=(h, ), momentum=m, learning_rate_init=a)
               for (h, m, a) in mlp_parameters]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    # --- k-nearest neighbours over 40 neighbourhood sizes x 2 weightings
    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [KNeighborsClassifier(n_neighbors=nn, weights=w)
               for (nn, w) in itertools.product(neigbhors_number, weighting_methods)]
    knn_name = ['knn_{0}_{1}'.format(*param)
                for param in itertools.product(neigbhors_number, ['uniform', 'distance'])]

    # --- SVMs: polynomial (C x degree) and RBF (C x gamma)
    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [SVC(C=c, kernel='poly', degree=d)
                    for (c, d) in itertools.product(C, degree)]
    svm_clf_poly_name = ['svm_poly_{0}_{1}'.format(*param)
                         for param in itertools.product(C, degree)]
    svm_clf_rbf = [SVC(C=c, kernel='rbf', gamma=g)
                   for (c, g) in itertools.product(C, gamma)]
    svm_clf_rbf_name = ['svm_rbf_{0}_{1}'.format(*param)
                        for param in itertools.product(C, gamma)]

    # --- decision trees / extra trees over the same parameter grid
    dt_params = list(itertools.product(['gini', 'entropy'],
                                       [1, 2, 3, 4, 5, None],
                                       [None, 'sqrt', 'log2'],
                                       ['best', 'random']))
    dt_clf = [DecisionTreeClassifier(criterion=c, max_depth=d, max_features=f, splitter=s)
              for (c, d, f, s) in dt_params]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]
    et_clf = [ExtraTreeClassifier(criterion=c, max_depth=d, max_features=f, splitter=s)
              for (c, d, f, s) in dt_params]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    # --- AdaBoost over (n_estimators = 2..8192, base-tree depth 1..3)
    ada_params = list(itertools.product([2**i for i in range(1, 14)], [1, 2, 3]))
    ada_dt_clf = [AdaBoostClassifier(n_estimators=n,
                                     base_estimator=DecisionTreeClassifier(max_depth=m))
                  for (n, m) in ada_params]
    ada_et_clf = [AdaBoostClassifier(n_estimators=n,
                                     base_estimator=ExtraTreeClassifier(max_depth=m))
                  for (n, m) in ada_params]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    # --- bagging ensembles; their fitted members also join the pool one by one
    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est, base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est, base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(n_estimators=nb_bag_stumps,
                                   base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = ['stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps)]
    bag_dt_clf = [bag_dt]
    # BUG FIX: this used to be [bag_dt], so the extra-trees bagging ensemble
    # itself never entered the pool even though its name did.
    bag_et_clf = [bag_et]
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(200))]

    # --- a random forest's individual fitted trees
    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    # --- logistic regressions over (penalty, C, intercept)
    log_parameters = list(itertools.product(['l1', 'l2'],
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [LogisticRegression(penalty=l, C=c, fit_intercept=f)
               for (l, c, f) in log_parameters]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    # --- SGD classifiers over (loss, elasticnet penalty, intercept, l1_ratio)
    sgd_parameters = list(
        itertools.product([
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
               for (l, p, f, l1) in sgd_parameters]
    sgd_name = ['sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + ada_dt_clf + ada_et_clf + \
        dt_bag_clf + et_bag_clf + stump_bag_clf + bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + \
        log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_name + et_name + ada_dt_name + \
        ada_et_name + dt_bag_name + et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
        bag_stump_name + dt_rf_name + log_name + sgd_name

    # Fit whatever is not fitted yet (bagging/forest members already are).
    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    # Seeded random sample of `size` estimators from the pool.
    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]
    return estimators, pool_name
### Try a variety of classifiers # Import classifiers from sklearn.naive_bayes import GaussianNB from sklearn import tree #from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import AdaBoostClassifier # Initialize classifiers clf_NB = GaussianNB() clf_DT = tree.DecisionTreeClassifier(min_samples_split=5,criterion='entropy') #clf_SVC = SVC() clf_KN = KNeighborsClassifier() clf_RF = RandomForestClassifier() clf_AB = AdaBoostClassifier() # Leverage tester.py to fit and test the classifiers test_classifier(clf_NB, my_dataset, features_list) #test_classifier(clf_DT, my_dataset, features_list) #test_classifier(clf_SVC, my_dataset, features_list) #test_classifier(clf_KN, my_dataset, features_list) #test_classifier(clf_RF, my_dataset, features_list) #test_classifier(clf_AB, my_dataset, features_list) # Apply Grid Search to fine tune the parameters from sklearn import grid_search # Set the parameters for my two chosen classifiers parameters_DT = {'min_samples_split':[2,5,10,15,20], 'criterion':('gini','entropy')}