def variables_relevantes_arbol(X, Y, alpha=None): if len(X) == 0: logger.info("No se ingreso informacion de variables") return [] features = list(X.columns) if alpha == None: alpha = 1.0 / len(features) logger.info( 'Se calcula el valor minimo de aceptacion de importancia: {0}'. format(alpha)) try: model = ExtraTreeClassifier() model.fit(X, Y) importance = model.feature_importances_ relevant_features = [] for i in range(len(features)): if importance[i] > alpha: relevant_features.append(features[i]) except Exception as e: logger.info( 'Error con el metodo de arboles, no se determinaron variables relevantes: {0}' .format(e)) relevant_features = [] return importance, relevant_features
def dTree(data, labels, test, impurity="gini", mdepth=None):
    newData = pd.DataFrame()
    newTest = pd.DataFrame()
    le = LabelEncoder()
    # NOTE: the encoder is refit on each column and separately on train and
    # test, so the integer codes only line up if both sets share the same
    # category values.
    for datum in data:
        newData[datum] = le.fit_transform(data[datum])
    for testItem in test:
        newTest[testItem] = le.fit_transform(test[testItem])

    tree1 = DecisionTreeClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree2 = ExtraTreeClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree3 = RandomForestClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree1.fit(newData, labels)
    tree2.fit(newData, labels)
    tree3.fit(newData, labels)

    predict1 = tree1.predict(newTest)
    print("tree1", evaluate(predict1, validation_genres))
    predict2 = tree2.predict(newTest)
    print("tree2", evaluate(predict2, validation_genres))
    predict3 = tree3.predict(newTest)
    print("tree3", evaluate(predict3, validation_genres))

    combined_prediction = voting([predict1, predict2, predict3], [1, 1, 1])
    return combined_prediction
def variables_relevantes_arbol(X, Y, alpha=None): if len(X) == 0: logger.info("No information was passed") return [] features = list(X.columns) if alpha == None: alpha = 1.0 / len(features) logger.info( 'Aceptance threshold for variable importance is calculated: {0}'. format(alpha)) try: model = ExtraTreeClassifier() model.fit(X, Y) importance = model.feature_importances_ relevant_features = [] for i in range(len(features)): if importance[i] > alpha: relevant_features.append(features[i]) except Exception as e: logger.info( 'Error with the tree based model, : There was not relevant variables found{0}' .format(e)) relevant_features = [] return importance, relevant_features
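A minimal usage sketch for variables_relevantes_arbol above. The iris data, the logger setup, and the variable names are illustrative assumptions, not part of the original source.

import logging
from sklearn.datasets import load_iris
from sklearn.tree import ExtraTreeClassifier

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # the function expects a module-level logger

# hypothetical demo data: a DataFrame of features and an integer label vector
iris = load_iris(as_frame=True)
importance, relevant = variables_relevantes_arbol(iris.data, iris.target)
print(relevant)  # columns whose importance exceeds the 1/n_features threshold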
def apply_extra_trees_classifier(trainData, targetTrain, testData, targetTest):
    """
    Fits an extremely randomized tree classifier to the dataset.

    Args:
        trainData, targetTrain: training features and class labels.
        testData, targetTest: test features and class labels.
    """
    # fit a CART-style model to the data
    etc = ExtraTreeClassifier(class_weight=None, criterion='gini',
                              max_depth=None, max_features='auto',
                              max_leaf_nodes=None, min_samples_leaf=1,
                              min_samples_split=2,
                              min_weight_fraction_leaf=0.0,
                              random_state=None, splitter='random')
    etc.fit(trainData, targetTrain)
    print(etc)
    # make predictions
    expected = targetTest
    predicted = etc.predict(testData)
    # summarize the fit of the model
    print(accuracy_score(expected, predicted))
def extratree(typ, X_train, Y_train, X_test, Y_test, text):
    text.delete(1.0, tk.END)
    text.insert(tk.END,
                "\n\nIMPORTING ExtraTree"
                "\nProcessing, this might take a while...", "bold")
    text.update_idletasks()

    from sklearn.tree import ExtraTreeClassifier
    ETC = ExtraTreeClassifier()
    ETC.fit(X_train, Y_train)
    Y_pred = ETC.predict(X_test)

    # y_true comes first in the sklearn metric functions
    text.insert(tk.END,
                "\n\nExtra Tree Classifier report \n" +
                classification_report(Y_test, Y_pred), "bold")
    text.insert(tk.END,
                "*****roc_auc_score: %0.3f*****\n" %
                roc_auc_score(Y_test, Y_pred), "bold")
    text.insert(tk.END,
                "Extra Tree Classifier confusion matrix \n" +
                str(confusion_matrix(Y_test, Y_pred)), "bold")
    # was accuracy_score(Y_pred, Y_pred), which is always 1.0
    score = accuracy_score(Y_test, Y_pred)
    text.insert(tk.END, "Extra tree score= {0}".format(score))
    text.update_idletasks()

    roc_curve_acc(Y_test, Y_pred, 'ETC')
    if typ == "s":
        plt.show()
    elif typ == "a":
        pass
class ExtraTreeClassifierTestCase(SchemaValidationTestCase, unittest.TestCase):

    def setUp(self):
        super().setUp()
        self.model = ExtraTreeClassifier()
        iris = load_iris()
        X = iris.data.astype(np.float32)
        y = iris.target.astype(np.int32)
        self.model.fit(X, y)
def tree_select(self):
    # Embedded feature selection with a tree model
    clf = ExtraTreeClassifier(max_depth=7)
    clf.fit(self.X, self.y.ravel())
    feature_var = list(clf.feature_importances_)
    features = dict(zip(self.feature_names, feature_var))
    # print(features)
    # Sort by importance in descending order and keep the top features
    features = list(dict(sorted(features.items(), key=lambda d: d[1], reverse=True)).keys())[:self.select_feature_num]
    return set(features)
def get_feature_relevance_tree(X, y):
    vect = DictVectorizer()
    X = vect.fit_transform(X)
    tree = ExtraTreeClassifier(criterion='entropy')
    tree.fit(X, y)
    # inverse_transform expects a (n_samples, n_features) array, so the
    # importances are passed as a single row
    return zip(['general'], vect.inverse_transform(tree.feature_importances_.reshape(1, -1)))
def get_decision_tree(X, y, depth=None):
    vect = DictVectorizer()
    X = vect.fit_transform(X)
    tree = ExtraTreeClassifier(max_depth=depth)
    tree.fit(X, y)
    # export_graphviz expects class names as strings
    return export_graphviz(tree, feature_names=vect.feature_names_,
                           class_names=[str(c) for c in tree.classes_],
                           filled=True)
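A hedged sketch of rendering get_decision_tree's DOT output with the graphviz package; the dict records and labels are hypothetical stand-ins for the project's real DictVectorizer input.

import graphviz

records = [{'color': 'red', 'size': 1.0},
           {'color': 'blue', 'size': 2.5},
           {'color': 'red', 'size': 3.0}]
labels = ['a', 'b', 'b']
dot_source = get_decision_tree(records, labels, depth=3)
# writes tree_demo.pdf and removes the intermediate .dot file
graphviz.Source(dot_source).render('tree_demo', format='pdf', cleanup=True)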
def trees_models(x_train, y_train):
    from sklearn.tree import DecisionTreeClassifier
    classifier1 = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier1.fit(x_train, y_train)

    from sklearn.tree import ExtraTreeClassifier
    classifier2 = ExtraTreeClassifier()
    classifier2.fit(x_train, y_train)

    print('DecisionTreeClassifier training accuracy: ', classifier1.score(x_train, y_train))
    print('ExtraTreeClassifier training accuracy: ', classifier2.score(x_train, y_train))
    return classifier1, classifier2
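A minimal sketch of calling trees_models above; iris stands in for whatever feature matrix and labels the surrounding project uses.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf1, clf2 = trees_models(x_tr, y_tr)
# training accuracy is printed inside trees_models; held-out accuracy shown here
print('held-out accuracy:', clf1.score(x_te, y_te), clf2.score(x_te, y_te))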
def test_extra_tree_clf():
    X, y = load_iris(return_X_y=True)
    X_ = X.tolist()
    for y_ in [y, (y == 0).astype(int), (y == 2).astype(int)]:
        for max_depth in [5, 10, None]:
            # exercise each max_depth setting
            clf = ExtraTreeClassifier(max_depth=max_depth)
            clf.fit(X, y_)
            clf_ = convert_estimator(clf)

            for method in METHODS:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    scores = getattr(clf, method)(X)
                    scores_ = getattr(clf_, method)(X_)
                assert np.allclose(scores.shape, shape(scores_))
                assert np.allclose(scores, scores_, equal_nan=True)
def RecommendByET(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label classification"""
    clf = ExtraTreeClassifier()
    clf.fit(train_data, train_data_y)
    predictions = clf.predict_proba(test_data)

    # Convert the predicted probabilities to a data array
    predictions = DataProcessUtils.convertMultilabelProbaToDataArray(predictions)
    print(predictions)

    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1), recommendNum)
    answerList = test_data_y
    print(predictions)
    print(test_data_y)
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def extra_tree_classifier(self):
    self.log.writeToLog('Running Extra Tree Classifier Model...')
    X_train, X_test, y_train, y_test = self.train_test_split()
    et = ExtraTreeClassifier()
    trained_model = et.fit(X_train, y_train)
    self.save_pickle(trained_model)
    y_pred = et.predict(X_test)
    self.model_auc_roc(y_test, y_pred, "Extra Tree Classifier Model")
    self.model_evaluation(y_test, y_pred, "Extra Tree Classifier Model")
def extra_tree(self):
    x_train, x_test, y_train, y_test = self.preprocessing()
    extra_tree_model = ExtraTreeClassifier()
    y_pred = extra_tree_model.fit(x_train, y_train).predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print('Extra Tree Classifier:- ', acc)
    conf = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='micro')
    print('and its f1 score- ', f1)
    print('confusion matrix: \n', conf)
def fit(self, X, y):
    """Build a random decision tree based classifier from the training set (X, y)."""
    # Remove protected features
    X_protect = np.delete(X, [self.prot_class], axis=1)
    num_tr = len(y)
    num_prot_1 = sum(X[:, self.prot_class])
    num_prot_0 = num_tr - num_prot_1
    #X_protect = X
    i = 0
    fair_trees = []
    predictions = []

    # Pick up fair trees
    while i < self.num_fair_trees:
        new_tree = ExtraTreeClassifier(max_depth=self.max_depth,
                                       min_samples_leaf=self.min_samples_leaf,
                                       max_features=1)
        new_tree.fit(X_protect, y)
        new_prediction = new_tree.predict(X_protect)
        # Calculate the probability we predict someone will drop out between
        # groups (statistical parity)
        num_pred_1 = len([e for e in range(0, num_tr)
                          if new_prediction[e] == 0 and X[e, self.prot_class] == 1])
        num_pred_0 = len([e for e in range(0, num_tr)
                          if new_prediction[e] == 0 and X[e, self.prot_class] == 0])
        stat_parity = abs(num_pred_1 / num_prot_1 - num_pred_0 / num_prot_0)
        if stat_parity < self.rho:
            i += 1
            fair_trees.append(new_tree)
            predictions.append(new_prediction)

    self.ridge_model.fit(np.transpose(np.asarray(predictions)), y)
    self.decision_trees = fair_trees
def read_results(data, model_name):
    with open('data.json') as data_json:
        data_params = json.load(data_json)

    # Prepare data
    data_path = os.path.join(DATA_PATH, data_params['data'][data]['file_name'])
    print('Read file: {}'.format(data_path))
    X, y = load_csv(data_path)

    # Apply scaling
    scaler = MinMaxScaler().fit(X)
    X = scaler.transform(X)

    n_test = data_params['data'][data]['n_test']
    random_state = RANDOM_STATE
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, random_state=random_state)

    model = ExtraTreeClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    acc_train = model.score(X_train, y_train)
    acc_test = model.score(X_test, y_test)
    print(('Train Acc: {:.4f}, ' + 'Test Acc: {:.4f}').format(acc_train, acc_test))

    df = pd.DataFrame(columns=COLUMNS)
    for attack in ATTACKS_NUM:
        for defence in DEFENCES_NUM:
            try:
                df = get_dataframe_sklearn(df, model, data, model_name, attack, defence)
            except FileNotFoundError as err:
                print(err)
                continue

    # These attacks have no hyperparameter
    df.loc[(df['Attack'] == 'boundary') | (df['Attack'] == 'tree'), 'Adv_param'] = np.nan

    output_file = os.path.join(OUTPUT_PATH, '{}_{}_{}.csv'.format(data, model_name, VERSION))
    df.to_csv(output_file)
    print('Save to:', output_file)
def train_extratree_model():
    results_extratree_model = {}
    results_extratree_model['acc'] = []
    results_extratree_model['p_r_f1_s'] = []
    for i in range(30):
        train_features, train_labels = get_train_data()
        test_features, test_labels = get_test_data()
        clf = ExtraTreeClassifier()
        clf.fit(train_features, train_labels)
        predictions = clf.predict(test_features)
        p_r_f1_s = precision_recall_fscore_support(test_labels, predictions)
        acc = accuracy_score(test_labels, predictions)
        print("ExtraTree Model Classifier : ", acc)
        print("ExtraTree Model Classifier Precision, Recall, F1-Score, Support: ", p_r_f1_s)
        results_extratree_model['acc'].append(acc)
        results_extratree_model['p_r_f1_s'].append(p_r_f1_s)
        time.sleep(10)
    pickle.dump(results_extratree_model, open('results_extratree_model.pkl', 'wb'))
class ExtraTreeClassifier(Classifier):

    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._etc = ETC()

    def learn(self, ingredients, cuisine):
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._etc = self._etc.fit(matrix, classes)
            print('Fitting complete...')
            self._has_fit = True
        output = self._etc.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
"去除低方差特征,统计学方法" # np.random.seed(10) # arr = np.random.random((5, 6)) # var = 0.4 # arrnew = VarianceThreshold(0.4 * var * (1 - var)).fit(arr).transform(arr) # print(arr) # print(arrnew) "单变量特征选择,统计学方法" X, y = load_iris(return_X_y=True) print(X.shape, y.shape) "SelectFromModel 只包含两种方法" clf = ExtraTreeClassifier() clf.fit(X, y) print(clf.feature_importances_) model = SelectFromModel(clf, prefit=True) X_new = model.transform(X) print(X_new.shape) # "基于原始数据" # DTC = RandomForestClassifier(n_estimators=20) # DTC.fit(X, y) # score1 = cross_val_score(DTC, X, y, cv=5) # print(np.mean(score1)) # # "单变量特征选择,统计学方法" # Xnew = SelectKBest(chi2, k=2).fit_transform(X, y) # 保留特征的个数
#zero variance removal
#from sklearn.feature_selection import VarianceThreshold
#var=VarianceThreshold()
#
#train_X=var.fit_transform(train_X)
#train_X[train_X._get_numeric_data().columns]=var.fit_transform(train_X._get_numeric_data())

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier

clf = ExtraTreeClassifier(criterion="entropy", random_state=100)
#clf=RandomForestClassifier(n_estimators=100,criterion="entropy",random_state=100)
clf.fit(train_X, train_y)
print(clf.feature_importances_)

for name, importance in zip(train_X.columns, clf.feature_importances_):
    print(name, "=", importance)

imp_datafram = pd.DataFrame(list(zip(train_X.columns, clf.feature_importances_)))

#import matplotlib.pyplot as plt
#
#y_pos = np.arange(len(imp_datafram[0]))
#performance = imp_datafram[1]
#
#plt.bar(y_pos, performance, align='center', alpha=0.5)
#plt.xticks(y_pos, imp_datafram[0])
def readData(fname):
    res = []
    labels = []
    # data = ''
    with open(fname) as f:
        for s in f:
            tmp = list(map(int, s.split()))
            labels.append(tmp[-1])
            res.append(tmp[:-1])
            # data += (str(tmp)[1:-1]).replace(',', '') + '\n'
    # with open('out.txt', 'w') as o:
    #     o.write(str(data)[1:-1])
    return res, labels

X, Y = readData('german.data-numeric.txt')
Xt = X[:-200]; Yt = Y[:-200]
XT = X[-200:]; YT = Y[-200:]
print(len(Xt))

clf = ExtraTreeClassifier(max_depth=None, random_state=0)
clf = clf.fit(Xt, Yt)
#proba = clf.predict_proba(XT)
#print(len(proba))
#print(proba)

err = 0
for i, x in enumerate(XT):
    # predict expects a 2-D array, so wrap the single sample in a list
    if clf.predict([x])[0] != YT[i]:
        prob = clf.predict_proba([x])
        # print(prob)
        err += 1
print(err)
def myclassify(numfiers=5, xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest):
    count = 0

    bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
    bagging2.fit(xtrain, ytrain)
    #print(bagging2.score(xtest,ytest))
    count += 1
    classifiers = [bagging2.score(xtest, ytest)]

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        #print(tree2.fit(xtrain,ytrain))
        #print(tree2.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, tree2.score(xtest, ytest))
        print("1")
        print(tree2.score(xtest, ytest))

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        #print(bagging1.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, bagging1.score(xtest, ytest))
        print("2")
        print(bagging1.score(xtest, ytest))

#    if count < numfiers:
#        # votingClassifiers combine completely different machine learning classifiers and use a majority vote
#        clff1 = SVC()
#        clff2 = RFC(bootstrap=False)
#        clff3 = ETC()
#        clff4 = neighbors.KNeighborsClassifier()
#        clff5 = quadda()
#        print("3")
#        eclf = VotingClassifier(estimators=[('svc', clff1), ('rfc', clff2), ('etc', clff3), ('knn', clff4), ('qda', clff5)])
#        eclf = eclf.fit(xtrain, ytrain)
#        #print(eclf.score(xtest,ytest))
#        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
#        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
#        #     print()
#        count += 1
#        classifiers = np.append(classifiers, eclf.score(xtest, ytest))

#    if count < numfiers:
#        svc1 = SVC()
#        svc1.fit(xtrain, ytrain)
#        dec = svc1.score(xtest, ytest)
#        count += 1
#        classifiers = np.append(classifiers, svc1.score(xtest, ytest))
#        print("3")

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        #print(qda.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, qda.score(xtest, ytest))
        print("4")

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        #print(tree1.fit(xtrain,ytrain))
        #print(tree1.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, tree1.score(xtest, ytest))

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier()  # this classifies based on the
        # k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain, ytrain)
        #print(knn1.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, knn1.score(xtest, ytest))

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        #print(lda.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, lda.score(xtest, ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        #print(tree3.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, tree3.score(xtest, ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        #print(bagging3.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, bagging3.score(xtest, ytest))

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        #print(bagging4.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, bagging4.score(xtest, ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        #print(tree4.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, tree4.score(xtest, ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        #print(tree6.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, tree6.score(xtest, ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        #print(knn2.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, knn2.score(xtest, ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        #print(knn3.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, knn3.score(xtest, ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        #print(knn4.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, knn4.score(xtest, ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        #print(knn5.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, knn5.score(xtest, ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        #print(ncc1.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, ncc1.score(xtest, ytest))

    if count < numfiers:
        # Nearest shrunken centroid
        for shrinkage in [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
            ncc2 = NearestCentroid(shrink_threshold=shrinkage)
            ncc2.fit(xtrain, ytrain)
            #print(ncc2.score(xtest,ytest))
            count += 1
            classifiers = np.append(classifiers, ncc2.score(xtest, ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        #print(tree5.score(xtest,ytest))
        count += 1
        classifiers = np.append(classifiers, tree5.score(xtest, ytest))

    classifierlabel = ["BaggingETC (with bootstraps set to false)", "ETC", "BaggingETC", "Voting Classifier", "svm", "QDA", "DTC", "KNN (default)", "LDA", "RFC",
                       "BaggingRFC (with bootstraps set to false)", "BaggingSVC (with bootstraps set to false)", "RFC (bootstrap false)", "GBC",
                       "knn (n_neighbors = 10)", "knn (n_neighbors = 3)", "knn (ball tree algorithm)", "knn (kd_tree algorithm)",
                       "Nearest Centroid", "Shrunken Centroid?", "ABC"]
    classifierlabel = classifierlabel[:len(classifiers)]
    #print(len(classifiers))
    #print(classifiers)

    for i in range(len(classifiers)):
        print("{} classifier has percent correct {}".format(classifierlabel[i], classifiers[i]))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat)) # In[ ]: DTree = DecisionTreeClassifier(max_depth=3) DTree.fit(x_train, y_train) yhat = DTree.predict(x_test) print("DecisionTreeClassifier") print("Train set Accuracy: ", metrics.accuracy_score(y_train, DTree.predict(x_train))) print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat)) # In[ ]: ETree = ExtraTreeClassifier(max_depth=3) ETree.fit(x_train, y_train) yhat = ETree.predict(x_test) print("ExtraTreeClassifier") print("Train set Accuracy: ", metrics.accuracy_score(y_train, ETree.predict(x_train))) print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat)) # In[ ]: Ada = AdaBoostClassifier() Ada.fit(x_train, y_train) yhat = Ada.predict(x_test) print("AdaBoostClassifier") print("Train set Accuracy: ", metrics.accuracy_score(y_train, Ada.predict(x_train))) print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
def main():
    # filepath: sentence data file path
    # vecfile: word vector file path pre-generated elsewhere
    # vectype: compression method. Average, avg+tf-idf one line, agg+tf-idf whole data
    # vec_path: vector file save path
    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/stem_testdata'  # 'data/data_test'
    vecfile = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt'
    vec_files = [
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.100d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.200d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.42B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.840B.300d.txt'
    ]
    # don't know why yet, but a relative file path gives permission denied,
    # so we're using an absolute path for now
    vec_path = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/word_vector/'

    # Here, we can choose the type of vectorization;
    # there are 6 word vector files downloaded from GloVe
    """
    vectype = 1
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 2
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_OnelineTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 3
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_WholeDataTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))
    """
    # from here on could be erased.
    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    #filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/hyp1-hyp2-ref'
    vectype = 1
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_diffOrder'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 2
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_OnelineTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 3
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_WholeDataTF'
    #print(name, 'vectorization in process')
    #word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    #print("--- %s seconds ---" % (time.time() - start_time))

    vec_path = 'data/word_vector/glove.6B.50d_vec_diffOrder'
    wvec = load_wordvec(vec_path)
    target_path = 'data/dev.answers'
    answer = load_target(target_path)

    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import ExtraTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import NuSVC
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    clf1 = KNeighborsClassifier()
    clf2 = DecisionTreeClassifier()
    clf3 = ExtraTreeClassifier()
    clf4 = MLPClassifier()
    clf5nu = NuSVC()
    clf6lin = LinearSVC()  # solvers: 'sag', 'saga' and 'lbfgs'

    print("Training Starts")
    X_train, X_test, y_train, y_test = train_test_split(wvec, answer, test_size=0.10, random_state=42)
    clf1.fit(X_train, y_train)
    print('KNeighborsClassifier score 50d', clf1.score(X_test, y_test))
    clf2.fit(X_train, y_train)
    print('DecisionTreeClassifier score 50d', clf2.score(X_test, y_test))
    clf3.fit(X_train, y_train)
    print('ExtraTreeClassifier score 50d', clf3.score(X_test, y_test))
    clf4.fit(X_train, y_train)
    print('MLPClassifier score 50d', clf4.score(X_test, y_test))

    clf1 = OneVsRestClassifier(KNeighborsClassifier())
    clf2 = OneVsRestClassifier(DecisionTreeClassifier())
    clf3 = OneVsRestClassifier(ExtraTreeClassifier())
    clf4 = OneVsRestClassifier(MLPClassifier())
    clf5 = OneVsOneClassifier(NuSVC())
    clf6 = OneVsRestClassifier(LinearSVC())

    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    clf7 = OneVsRestClassifier(SGDClassifier())
    clf8 = OneVsRestClassifier(Perceptron())
    clf9 = OneVsRestClassifier(PassiveAggressiveClassifier())

    print('One vs Rest methods case::')
    print('KNeighborsClassifier score 50d', clf1.fit(X_train, y_train).score(X_test, y_test))
    print('DecisionTreeClassifier score 50d', clf2.fit(X_train, y_train).score(X_test, y_test))
    print('ExtraTreeClassifier score 50d', clf3.fit(X_train, y_train).score(X_test, y_test))
    print('MLPClassifier score 50d', clf4.fit(X_train, y_train).score(X_test, y_test))
    print('SGDClassifier score 50d', clf7.fit(X_train, y_train).score(X_test, y_test))
    print('Perceptron score 50d', clf8.fit(X_train, y_train).score(X_test, y_test))
    print('PassiveAggressiveClassifier score 50d', clf9.fit(X_train, y_train).score(X_test, y_test))
    print('NuSVC score 50d', clf5.fit(X_train, y_train).score(X_test, y_test))
    print('LinearSVC score 50d', clf6.fit(X_train, y_train).score(X_test, y_test))

    clf5nu.fit(X_train, y_train)
    print('NuSVC score 50d', clf5nu.score(X_test, y_test))
    clf6lin.fit(X_train, y_train)
    print('LinearSVC score 50d', clf6lin.score(X_test, y_test))

    from sklearn.datasets import make_friedman1
    from sklearn.feature_selection import RFECV
    from sklearn.neighbors import KNeighborsClassifier
    estimator = DecisionTreeClassifier()
test_set = df.iloc[train_data_len:, :]
#print(train_set.head(5))
train_x = train_set.iloc[:, 0:6]
train_y = train_set.iloc[:, 6:]
#print(type(train_y))
#train_y.reshape(len(train_y), )
#print(train_y.head(5))
test_x = test_set.iloc[:, 0:6]
test_y = test_set.iloc[:, 6:]
#test_y.reshape(len(test_y), )
#print(train_x.head(5))
#print(train_y.head(5))

from sklearn.tree import ExtraTreeClassifier
classifier = ExtraTreeClassifier(random_state=0, criterion="entropy", splitter="best")
classifier.fit(train_x, train_y.values.ravel())
info = classifier.score(test_x, test_y.values.ravel())
print(info)
#model = Sequential()
f1_score(y_test, smote_pred)
recall_score(y_test, smote_pred)

###################################################################
########################## Feature Selection ######################
###################################################################

# Feature selection using a tree classifier
a = r2.iloc[:, 0:19]  # independent columns
b = r2.iloc[:, -1]    # target column
model = ExtraTreeClassifier()
model.fit(a, b)
# use the built-in feature_importances_ attribute of tree-based classifiers
print(model.feature_importances_)

# plot a graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=a.columns)
feat_importances.nlargest(19).plot(kind='barh')

###############################################################
####################### Cross Validation ######################
###############################################################

colnames = list(r2.columns)
predictors = colnames[:19]
def myclassify_AudPow(numfiers, xtrain_1, xtrain_2, ytrain_1, ytrain_2, xtest):
    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest = xtest[~np.isnan(xtest).any(axis=1), :]
    xtest = xtest[~np.isinf(xtest).any(axis=1), :]

    xtrain = np.append(xtrain_1, xtrain_2, 0)
    ytrain = np.append(ytrain_1, ytrain_2)
    ytrain = np.ravel(ytrain)

    xtrunclength = sio.loadmat('../Files/xtrunclength.mat')
    xtrunclength = xtrunclength['xtrunclength'][0]

    # if xtest is an NxM matrix, returns an N x numfiers matrix where each
    # column corresponds to a classifier's prediction vector
    count = 0
    # print(numfiers)
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []

    bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
    bagging2.fit(xtrain, ytrain)
    #print(bagging2.score(xtest,ytest))
    ytest = bagging2.predict(xtest)
    predictionMat[:, count] = ytest
    count += 1

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        #print(bagging1.score(xtest,ytest))
        ytest = bagging1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        # votingClassifiers combine completely different machine learning
        # classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()

        eclf = VotingClassifier(estimators=[('svc', clff1), ('rfc', clff2), ('etc', clff3), ('knn', clff4), ('qda', clff5)])
        eclf = eclf.fit(xtrain, ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print()
        ytest = eclf.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain, ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier()  # this classifies based on the
        # k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain, ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        modeCol = predWindowVecModeFinder(tempCol, xtrunclength)
        modeStr = predVec2Str(modeCol)
        predictionStringMat.append(modeStr)
        finalPredMat += list(map(int, modeCol))

    return predictionStringMat, finalPredMat
export_graphviz(clf_cart, out_file='cart.dot',
                feature_names=attribute_names,
                class_names=bi_class_target_attrs,
                filled=True, rounded=True, special_characters=True)
print(check_output('dot -Tpdf cart.dot -o cart.pdf', shell=True))
print("Accuracy = %s" % accuracy_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("Precision = %s" % precision_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("Recall = %s" % recall_score(rnd_test_y, clf_cart.predict(rnd_test_X)))
print("F = %s" % fbeta_score(rnd_test_y, clf_cart.predict(rnd_test_X), beta=1))
print("Confusion matrix = %s" % confusion_matrix(rnd_test_y, clf_cart.predict(rnd_test_X)))

roc_auc_scorer = get_scorer("roc_auc")
print("ROC AUC = %s" % roc_auc_scorer(clf_cart, rnd_test_X, rnd_test_y))
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_cart.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label='CART-2')

## randomized tree with default settings
clf_rnd_tree = ExtraTreeClassifier()
clf_rnd_tree.fit(rnd_training_X, rnd_training_y)
export_graphviz(clf_rnd_tree, out_file='default_rnd_tree.dot',
                feature_names=attribute_names,
                class_names=bi_class_target_attrs,
                filled=True, rounded=True, special_characters=True)
print(check_output('dot -Tpdf default_rnd_tree.dot -o default_rnd_tree.pdf', shell=True))
print("Accuracy = %s" % accuracy_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("Precision = %s" % precision_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("Recall = %s" % recall_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
print("F = %s" % fbeta_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X), beta=1))
print("Confusion matrix = %s" % confusion_matrix(rnd_test_y, clf_rnd_tree.predict(rnd_test_X)))
fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_rnd_tree.predict_proba(rnd_test_X)[:, 1])
axes_roc.plot(fpr, tpr, label="Randomized tree-1")
axes_roc.set_title("ROC of CART and a randomized tree")
axes_roc.set_xlabel("FPR")
# separate the data from the target attributes
test_data = dataset2.drop('change_id', axis=1)
test_data = test_data.drop('411_commit_time', axis=1)
test_data = test_data.drop('412_full_path', axis=1)
# remove unnecessary features
#test_data = test_data.drop('File', axis=1)

# the labels of the test data
test_target = dataset2.Buggy
#print(test_target)

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(train_data, train_target)
test_data_resampled, test_target_resampled = ros.fit_resample(test_data, test_target)

clf = ExtraTreeClassifier(splitter='best')
test_pred = clf.fit(X_resampled, y_resampled).predict(test_data_resampled)

file.write(classification_report(test_target_resampled, test_pred, labels=[0, 1]))
file.write("\n")
file.close()
def build_separate_tree(X, y, max_features, max_depth, min_samples_split):
    clf = ExtraTreeClassifier(max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split)
    clf = clf.fit(X, y)
    return clf
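A hedged sketch of how build_separate_tree above might be combined into a hand-rolled ensemble with a majority vote; the hyperparameter values and iris data are illustrative assumptions only.

import numpy as np
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
trees = [build_separate_tree(X, y, max_features='sqrt', max_depth=5, min_samples_split=2)
         for _ in range(10)]
votes = np.stack([t.predict(X) for t in trees])  # shape (n_trees, n_samples)
# per-sample majority vote across the individual randomized trees
majority = np.apply_along_axis(lambda col: np.bincount(col.astype(int)).argmax(), 0, votes)
print('training accuracy of the vote:', (majority == y).mean())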
mix_estimators = [
    ('le', LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='multinomial')),
    ('te', rn.choice(tree_estimators)[1]),
    *rn.sample(boost_estimators, 4),
    *rn.sample(nb_estimators, 2)]
voting_classifier_mix = VotingClassifier(estimators=mix_estimators)

all_estimators = [
    ('lgr', LogisticRegression(solver='lbfgs', max_iter=1000, multi_class='multinomial')),
    *tree_estimators,
    *boost_estimators,
    *nb_estimators]
voting_classifier_all = VotingClassifier(estimators=all_estimators)

# Train all models (it will take some time)
logistic_regression.fit(x_train, y_train)
tree_classifier.fit(x_train, y_train)
extra_tree_classifier.fit(x_train, y_train)
adaboost_classifier.fit(x_train, y_train)
extra_trees_classifier.fit(x_train, y_train)
bagging_classifier.fit(x_train, y_train)
gradient_boost_classifier.fit(x_train, y_train)
mlp_classifier.fit(x_train, y_train)
gaussian_nb.fit(x_train, y_train)
voting_classifier_tree.fit(x_train, y_train)
voting_classifier_boost.fit(x_train, y_train)
voting_classifier_nb.fit(x_train, y_train)
voting_classifier_mix.fit(x_train, y_train)
voting_classifier_all.fit(x_train, y_train)
#data = np.array(data)
print('Finish Label Encode')

clf = ExtraTreeClassifier(random_state=103, splitter='random', max_features=9)

## Get dataset
X = np.array(traindata.iloc[:, :10])
y = np.array(traindata.iloc[:, 10])

## Build decision tree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=11)
clf.fit(X_train, y_train)
print('Finish Extra tree training')
predicttest = clf.predict(X_test)

## Count clicks (0 or 1)
countClick = [0, 0]
for i in predicttest:
    if i == 0:
        countClick[0] += 1
    else:
        countClick[1] += 1
print(countClick)

## Get accuracy, precision, recall, f_measure
dfscore.plot(kind='barh')

# Label Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y.iloc[:] = le.fit_transform(y.iloc[:])

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x.iloc[:, :] = sc.fit_transform(x.iloc[:, :])

# Feature Importance
from sklearn.tree import ExtraTreeClassifier
classifier = ExtraTreeClassifier()
classifier.fit(x, y)
importance = pd.Series(classifier.feature_importances_, index=x.columns)
importance.plot(kind='barh')

# Segregating training & testing data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Modelling
# Bagging with decision trees:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
gb = BaggingClassifier(DecisionTreeClassifier(), n_estimators=20, max_samples=0.5, max_features=1)
gb.fit(x_train, y_train)
gb.score(x_train, y_train)
gb.score(x_test, y_test)
print("Feature ranking:") for f in range(X.shape[1]): print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) # Plotting featre importance plt.figure(figsize=(10,5)) plt.title("Feature importances") plt.bar(range(X.shape[1]), importances[indices], color="g", yerr=std[indices], align="center") plt.xticks(range(X.shape[1]), features,rotation=90) plt.xlim([-1, X.shape[1]]) plt.show() # Importing,intitiating and fitting Extra trees classifier from sklearn.tree import ExtraTreeClassifier extree = ExtraTreeClassifier(max_features=11,min_samples_split=21, random_state=101,max_depth =28) extree.fit(X_train_sm1,y_train_sm1) extree_predict=extree.predict(X_test) #checking performacne of the extra trees classifier print(confusion_matrix(y_test,extree_predict)) print(classification_report(y_test,extree_predict)) #Importing test data test=pd.read_csv('FIA_predictions.csv') # getting columns same as training data test=test.iloc[:,0:33] #converting data type for categorical variables test['NAICS2']=test['NAICS2'].astype('category') test['NAICS4']=test['NAICS4'].astype('category') test['NAICS_CD']=test['NAICS_CD'].astype('category') test['Restricted_Vertical']=test['Restricted_Vertical'].astype('category') test['LCTN_TYP_VAL']=test['LCTN_TYP_VAL'].astype('category') test['srvc_five_dgt_zip']=test['srvc_five_dgt_zip'].astype('category')
def extra_tree(self):
    x_train, x_test, y_train, y_test = self.preprocessing()
    extra_tree_model = ExtraTreeClassifier()
    y_pred = extra_tree_model.fit(x_train, y_train).predict(x_test)
    self.printing(y_test, y_pred, 'Extra Tree')
### TREESSSSS
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTC

tree1 = DTC()
print(tree1)
tree1.fit(xtrain, ytrain1)
print(tree1.fit(xtrain, ytrain1))
print(tree1.score(xtest, ytest1))

# In[22]:

from sklearn.tree import ExtraTreeClassifier as ETC

tree2 = ETC()
print(tree2)
tree2.fit(xtrain, ytrain1)
print(tree2.fit(xtrain, ytrain1))
print(tree2.score(xtest, ytest1))

# In[23]:

from sklearn.ensemble import BaggingClassifier

bagging1 = BaggingClassifier(ETC())
bagging1.fit(xtrain, ytrain1)
print(bagging1.score(xtest, ytest1))

# In[24]:

from sklearn.ensemble import BaggingClassifier
def myclassify_practice_set(numfiers, xtrain, ytrain, xtltrain, xtltest, xtest, ytarget=None, testing=False, grids='ABCDEFGHI'):
    # NOTE we might not need xtltrain
    # xtrain and ytrain are your training set. xtltrain is the indices of
    # corresponding recordings in xtrain and ytrain. these will always be present.
    # xtest is your testing set. xtltest is the corresponding indices of the
    # recording. for the practice set xtltest = xtrunclength.
    # ytarget is optional and depends on if you are using a testing set or the practice set.

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)
    # print('finished removal of Nans')

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    # if xtest is an NxM matrix, returns an N x numfiers matrix where each
    # column corresponds to a classifier's prediction vector
    count = 0
    # print(numfiers)
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # svc1 = SVC()
    # svc1.fit(xtrain,ytrain)
    # ytest = svc1.predict(xtest)
    # predictionMat[:,count] = ytest
    # count+=1

    if count < numfiers:
        # votingClassifiers combine completely different machine learning
        # classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()

        eclf = VotingClassifier(estimators=[('svc', clff1), ('rfc', clff2), ('etc', clff3), ('knn', clff4), ('qda', clff5)])
        eclf = eclf.fit(xtrain, ytrain)
        #print(eclf.score(xtest,ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
        #     print()
        ytest = eclf.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
        bagging2.fit(xtrain, ytrain)
        #print(bagging2.score(xtest,ytest))
        ytest = bagging2.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        #print(bagging1.score(xtest,ytest))
        ytest = bagging1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain, ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        #print(qda.score(xtest,ytest))
        ytest = qda.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier()  # this classifies based on the
        # k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain, ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:, count] = ytest
        count += 1

    # print(xtltest)
    # print(len(ytest))
    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)
        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += list(map(int, modeCol))
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    if testing == False:
        # comparing a numpy array to None with != is ambiguous, so use `is not`
        if ytarget is not None:
            #print(targets1)
            #print("")
            #print(predictions1)
            confusionme = confusion_matrix(targets1[0], predictions1[0])
            #print("Confusion Matrix is: ")
            #print(confusionme)

    return predictionStringMat, targetStringMat, finalPredMat
print("CV error = %f +-%f" % (np.mean(scores), np.std(scores))) # print "Cross validation" scores = cross_val_score(RandomForestClassifier(), training, classes, cv=KFold(n=len(training), n_folds=5, random_state=42), scoring="accuracy") print("CV error = %f +-%f" % (1. - np.mean(scores), np.std(scores))) print("Accuracy =", accuracy_score(y_test, tlf.predict(X_test))) print("Precision =", precision_score(y_test, tlf.predict(X_test))) print("Recall =", recall_score(y_test, tlf.predict(X_test))) print("F =", fbeta_score(y_test, tlf.predict(X_test), beta=1)) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print "Extra Tree classifier" rlf = ExtraTreeClassifier() rlf.fit(training, classes) print("Training error =", zero_one_loss(classes, rlf.predict(training))) X_train, X_test, y_train, y_test = train_test_split(training, classes) rlf = ExtraTreeClassifier() rlf.fit(X_train, y_train) print("Training error =", zero_one_loss(y_train, rlf.predict(X_train))) print("Test error =", zero_one_loss(y_test, rlf.predict(X_test))) scores = [] print "K-fold cross validation" for train, test in KFold(n=len(training), n_folds=5, random_state=42): X_train, y_train = training[train], classes[train] X_test, y_test = training[test], classes[test] rlf = ExtraTreeClassifier().fit(X_train, y_train)