def read_results(data, model_name):
    with open('data.json') as data_json:
        data_params = json.load(data_json)

    # Prepare data
    data_path = os.path.join(DATA_PATH, data_params['data'][data]['file_name'])
    print('Read file: {}'.format(data_path))
    X, y = load_csv(data_path)

    # Apply scaling
    scaler = MinMaxScaler().fit(X)
    X = scaler.transform(X)

    n_test = data_params['data'][data]['n_test']
    random_state = RANDOM_STATE
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=n_test, random_state=random_state)

    model = ExtraTreeClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    acc_train = model.score(X_train, y_train)
    acc_test = model.score(X_test, y_test)
    print(('Train Acc: {:.4f}, ' +
           'Test Acc: {:.4f}').format(acc_train, acc_test))

    df = pd.DataFrame(columns=COLUMNS)
    for attack in ATTACKS_NUM:
        for defence in DEFENCES_NUM:
            try:
                df = get_dataframe_sklearn(
                    df, model, data, model_name, attack, defence)
            except FileNotFoundError as err:
                print(err)
                continue

    # These attacks have no hyperparameter
    df.loc[(df['Attack'] == 'boundary') |
           (df['Attack'] == 'tree'), 'Adv_param'] = np.nan

    output_file = os.path.join(
        OUTPUT_PATH, '{}_{}_{}.csv'.format(data, model_name, VERSION))
    df.to_csv(output_file)
    print('Save to:', output_file)
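# read_results above leans on module-level configuration (DATA_PATH, OUTPUT_PATH,
# RANDOM_STATE, VERSION, COLUMNS, ATTACKS_NUM, DEFENCES_NUM) plus the helpers
# load_csv and get_dataframe_sklearn, all defined elsewhere in its project.
# A minimal sketch of those constants with purely illustrative placeholder
# values (not the project's real configuration):
DATA_PATH = 'data'                              # assumed input directory
OUTPUT_PATH = 'results'                         # assumed output directory
RANDOM_STATE = 2020                             # assumed seed
VERSION = 'v1'                                  # assumed results-file tag
COLUMNS = ['Attack', 'Defence', 'Adv_param']    # assumed result schema
ATTACKS_NUM = ['fgsm', 'boundary', 'tree']      # assumed attack names
DEFENCES_NUM = ['baseline', 'squeezing']        # assumed defence names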
def trees_models(x_train, y_train):
    from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

    classifier1 = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier1.fit(x_train, y_train)

    classifier2 = ExtraTreeClassifier()
    classifier2.fit(x_train, y_train)

    print('DecisionTreeClassifier training accuracy: ',
          classifier1.score(x_train, y_train))
    print('ExtraTreeClassifier training accuracy: ',
          classifier2.score(x_train, y_train))
    return classifier1, classifier2
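# A minimal usage sketch for trees_models, with scikit-learn's iris data as a
# stand-in for the caller's real x_train/y_train:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
dt, et = trees_models(x_tr, y_tr)
print('DecisionTreeClassifier test accuracy: ', dt.score(x_te, y_te))
print('ExtraTreeClassifier test accuracy: ', et.score(x_te, y_te))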
def myclassify(numfiers=5, xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest):
    # Trains up to `numfiers` classifiers in a fixed order and collects their
    # test-set accuracies. Assumes module-level aliases: ETC/DTC/RFC/GBC/ABC for
    # the sklearn tree and ensemble classifiers, quadda/linda for QDA/LDA, plus
    # neighbors, NearestCentroid, BaggingClassifier, SVC and np.
    count = 0

    bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
    bagging2.fit(xtrain, ytrain)
    count += 1
    classifiers = [bagging2.score(xtest, ytest)]

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree2.score(xtest, ytest))

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging1.score(xtest, ytest))

    # A VotingClassifier ensemble (majority vote over SVC, RFC, ETC, KNN, QDA)
    # and a plain SVC were tried at this point but are commented out; their
    # labels are omitted below so that labels stay aligned with scores.

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, qda.score(xtest, ytest))

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree1.score(xtest, ytest))

    if count < numfiers:
        # k-nearest neighbours: classifies based on the k nearest neighbors,
        # where k is defined by the user
        knn1 = neighbors.KNeighborsClassifier()
        knn1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn1.score(xtest, ytest))

    if count < numfiers:
        # Linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, lda.score(xtest, ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree3.score(xtest, ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging3.score(xtest, ytest))

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging4.score(xtest, ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree4.score(xtest, ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree6.score(xtest, ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn2.score(xtest, ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn3.score(xtest, ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn4.score(xtest, ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn5.score(xtest, ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, ncc1.score(xtest, ytest))

    shrinkages = [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
    if count < numfiers:
        # Nearest shrunken centroid: one score per shrink threshold
        for shrinkage in shrinkages:
            ncc2 = NearestCentroid(shrink_threshold=shrinkage)
            ncc2.fit(xtrain, ytrain)
            count += 1
            classifiers = np.append(classifiers, ncc2.score(xtest, ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree5.score(xtest, ytest))

    # One label per score appended above; the shrunken-centroid loop adds one
    # score per threshold, so its labels are generated to match.
    classifierlabel = (
        ["BaggingETC (bootstrap=False)", "ETC", "BaggingETC", "QDA", "DTC",
         "KNN (default)", "LDA", "RFC", "BaggingRFC (bootstrap=False)",
         "BaggingSVC (bootstrap=False)", "RFC (bootstrap=False)", "GBC",
         "KNN (n_neighbors=10)", "KNN (n_neighbors=3)", "KNN (ball_tree)",
         "KNN (kd_tree)", "Nearest Centroid"]
        + ["Shrunken Centroid (threshold={})".format(s) for s in shrinkages]
        + ["ABC"])
    classifierlabel = classifierlabel[:len(classifiers)]
    for i in range(len(classifiers)):
        print("{} classifier has percent correct {}".format(
            classifierlabel[i], classifiers[i]))
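# Design note: keeping a separate label list, as myclassify does, is fragile;
# collecting (label, score) pairs keeps the two in sync by construction.
# A sketch of the alternative pattern, not a change to the function:
results = []
results.append(('ETC', 0.93))   # e.g. ('ETC', tree2.score(xtest, ytest))
for label, score in results:
    print("{} classifier has percent correct {}".format(label, score))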
def main():
    # filepath: sentence data file path
    # vecfile:  word vector file path, pre-generated elsewhere
    # vectype:  compression method - 1: average; 2: avg + tf-idf per line;
    #           3: avg + tf-idf over the whole data
    # vec_path: vector file save path
    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/stem_testdata'  # 'data/data_test'
    vecfile = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt'
    vec_files = [
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.50d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.100d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.200d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.6B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.42B.300d.txt',
        '/home/junlinux/Desktop/CSCI544_Last/hw7/data/glove.6B/glove.840B.300d.txt'
    ]
    # Relative file paths were hitting permission-denied errors (cause unknown),
    # so absolute paths are used for now.
    vec_path = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/word_vector/'

    # Here we can choose the type of vectorization; there are six GloVe word
    # vector files downloaded.
    """
    vectype = 1
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 2
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_OnelineTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))

    vectype = 3
    for v in vec_files:
        start_time = time.time()
        name = v.split('/')[-1][:-4] + '_vec_WholeDataTF'
        print(name, 'vectorization in process')
        word_vec_gen(filepath, v, vectype, vec_path + name)
        print("--- %s seconds ---" % (time.time() - start_time))
    """
    # From here on, to be erased.
    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    # filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/hyp1-hyp2-ref'
    vectype = 1
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_diffOrder'
    # print(name, 'vectorization in process')
    # word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    # print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 2
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_OnelineTF'
    # print(name, 'vectorization in process')
    # word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    # print("--- %s seconds ---" % (time.time() - start_time))

    filepath = '/home/junlinux/Desktop/CSCI544_Last/hw7/data/data_test'  # 'data/stem_testdata'
    vectype = 3
    start_time = time.time()
    name = vecfile.split('/')[-1][:-4] + '_vec_WholeDataTF'
    # print(name, 'vectorization in process')
    # word_vec_gen(filepath, vecfile, vectype, vec_path + name)
    # print("--- %s seconds ---" % (time.time() - start_time))

    vec_path = 'data/word_vector/glove.6B.50d_vec_diffOrder'
    wvec = load_wordvec(vec_path)
    target_path = 'data/dev.answers'
    answer = load_target(target_path)

    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree import ExtraTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import NuSVC
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    clf1 = KNeighborsClassifier()
    clf2 = DecisionTreeClassifier()
    clf3 = ExtraTreeClassifier()
    clf4 = MLPClassifier()
    clf5nu = NuSVC()
    clf6lin = LinearSVC()  # solvers: 'sag', 'saga' and 'lbfgs'

    print("Training Starts")
    X_train, X_test, y_train, y_test = train_test_split(
        wvec, answer, test_size=0.10, random_state=42)

    clf1.fit(X_train, y_train)
    print('KNeighborsClassifier score 50d', clf1.score(X_test, y_test))
    clf2.fit(X_train, y_train)
    print('DecisionTreeClassifier score 50d', clf2.score(X_test, y_test))
    clf3.fit(X_train, y_train)
    print('ExtraTreeClassifier score 50d', clf3.score(X_test, y_test))
    clf4.fit(X_train, y_train)
    print('MLPClassifier score 50d', clf4.score(X_test, y_test))

    clf1 = OneVsRestClassifier(KNeighborsClassifier())
    clf2 = OneVsRestClassifier(DecisionTreeClassifier())
    clf3 = OneVsRestClassifier(ExtraTreeClassifier())
    clf4 = OneVsRestClassifier(MLPClassifier())
    clf5 = OneVsOneClassifier(NuSVC())
    clf6 = OneVsRestClassifier(LinearSVC())

    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    clf7 = OneVsRestClassifier(SGDClassifier())
    clf8 = OneVsRestClassifier(Perceptron())
    clf9 = OneVsRestClassifier(PassiveAggressiveClassifier())

    print('One vs Rest methods case:')
    print('KNeighborsClassifier score 50d', clf1.fit(X_train, y_train).score(X_test, y_test))
    print('DecisionTreeClassifier score 50d', clf2.fit(X_train, y_train).score(X_test, y_test))
    print('ExtraTreeClassifier score 50d', clf3.fit(X_train, y_train).score(X_test, y_test))
    print('MLPClassifier score 50d', clf4.fit(X_train, y_train).score(X_test, y_test))
    print('SGDClassifier score 50d', clf7.fit(X_train, y_train).score(X_test, y_test))
    print('Perceptron score 50d', clf8.fit(X_train, y_train).score(X_test, y_test))
    print('PassiveAggressiveClassifier score 50d', clf9.fit(X_train, y_train).score(X_test, y_test))
    print('NuSVC score 50d', clf5.fit(X_train, y_train).score(X_test, y_test))
    print('LinearSVC score 50d', clf6.fit(X_train, y_train).score(X_test, y_test))

    clf5nu.fit(X_train, y_train)
    print('NuSVC score 50d', clf5nu.score(X_test, y_test))
    clf6lin.fit(X_train, y_train)
    print('LinearSVC score 50d', clf6lin.score(X_test, y_test))

    from sklearn.datasets import make_friedman1
    from sklearn.feature_selection import RFECV
    from sklearn.neighbors import KNeighborsClassifier
    estimator = DecisionTreeClassifier()
from sklearn.tree import DecisionTreeClassifier as DTC

tree1 = DTC()
print(tree1)
tree1.fit(xtrain, ytrain1)
print(tree1.fit(xtrain, ytrain1))
print(tree1.score(xtest, ytest1))


# In[22]:

from sklearn.tree import ExtraTreeClassifier as ETC

tree2 = ETC()
print(tree2)
tree2.fit(xtrain, ytrain1)
print(tree2.fit(xtrain, ytrain1))
print(tree2.score(xtest, ytest1))


# In[23]:

from sklearn.ensemble import BaggingClassifier

bagging1 = BaggingClassifier(ETC())
bagging1.fit(xtrain, ytrain1)
print(bagging1.score(xtest, ytest1))


# In[24]:

bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
bagging2.fit(xtrain, ytrain1)
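# The cells above depend on the notebook's xtrain/ytrain1 arrays; here is a
# self-contained sketch of the same ETC-vs-bagged-ETC comparison on iris:
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier

X_tr, X_te, y_tr, y_te = train_test_split(*load_iris(return_X_y=True),
                                          random_state=0)
single = ExtraTreeClassifier(random_state=0).fit(X_tr, y_tr)
bagged = BaggingClassifier(ExtraTreeClassifier(), random_state=0).fit(X_tr, y_tr)
print(single.score(X_te, y_te), bagged.score(X_te, y_te))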
train_set = df.iloc[:train_data_len, :]   # complement of the test split below
test_set = df.iloc[train_data_len:, :]
# print(train_set.head(5))

train_x = train_set.iloc[:, 0:6]
train_y = train_set.iloc[:, 6:]
test_x = test_set.iloc[:, 0:6]
test_y = test_set.iloc[:, 6:]

from sklearn.tree import ExtraTreeClassifier

classifier = ExtraTreeClassifier(random_state=0, criterion="entropy", splitter="best")
classifier.fit(train_x, train_y.values.ravel())
info = classifier.score(test_x, test_y.values.ravel())
print(info)

# model = Sequential()
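# Why .values.ravel() above: slicing a single label column as a DataFrame
# yields an (n, 1) column vector, and scikit-learn warns unless labels are
# 1-D. A tiny check:
import pandas as pd

y_df = pd.DataFrame({'label': [0, 1, 0]})
print(y_df.values.shape)           # (3, 1) - column vector
print(y_df.values.ravel().shape)   # (3,)   - what fit()/score() expect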
from AppleStore_Milestone2 import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import ExtraTreeClassifier
import joblib
import time

print('\t\t\t Extra Tree Classifier Model \t\t\t\n',
      '*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')

start_t = time.time()
ExtraTreeClassifierModel = ExtraTreeClassifier(random_state=0, max_depth=12)
ExtraTreeClassifierModel = BaggingClassifier(
    ExtraTreeClassifierModel, random_state=0).fit(X_train, Y_train)
end_t = time.time()

max_d = 12
print('max depth = ', max_d,
      '\n accuracy of training is : ', ExtraTreeClassifierModel.score(X_train, Y_train),
      '\n training time = ', end_t - start_t)

start_t = time.time()
acc = ExtraTreeClassifierModel.score(X_test, Y_test)
end_t = time.time()
print('accuracy of testing is : ', acc, '\n testing time = ', end_t - start_t)

joblib.dump(ExtraTreeClassifierModel, 'joblib_ExtraTreeClassifierModel.pkl')
# loaded_model = joblib.load('joblib_ExtraTreeClassifierModel.pkl')
# predict = loaded_model.predict(X_test)
# accuracy = loaded_model.score(X_test, Y_test)
# print('Decision tree accuracy test : ' + str(accuracy), '\n')
def run(data, classifications, scoring_data, scoring_classifications):
    classifier = ExtraTreeClassifier()
    classifier.fit(data, classifications)
    accuracy = classifier.score(scoring_data, scoring_classifications)
    return accuracy
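# Hypothetical call of run(); load_iris and train_test_split stand in for the
# caller's real data pipeline:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
print('ExtraTree accuracy: ', run(X_tr, y_tr, X_te, y_te))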
print("keys: ", iris.keys()) print("data: ", iris["data"]) print("target_names: ", iris["target_names"]) print("target: ", iris["target"]) print("feature_names: ", iris["feature_names"]) """Divide la data en set de entrenamiento y de test""" X_train, X_test, Y_train, Y_test = train_test_split(iris["data"], iris["target"]) print("X_train: ", X_train.shape) print("X_test: ", X_test.shape) print("Y_train: ", Y_train.shape) print("Y_test: ", Y_test.shape) """__________________""" arbol = ExtraTreeClassifier(max_depth=3) print("Entreno: ", arbol.fit(X_train, Y_train)) print("Comprobacion1: ", arbol.score(X_test, Y_test)) print("Comprobacion2: ", arbol.score(X_train, Y_train)) """__________________""" G = export_graphviz(arbol, out_file='arbol.dot', class_names=iris.target_names, feature_names=iris.feature_names, impurity=False, filled=True) with open('arbol.dot') as f: dot_graph = f.read() graphviz.Source(dot_graph).render('arbol', view=False, format='png') # graphviz.Source(dot_graph).view() # graph=graphviz.Source(dot_graph)
sgd.score(x_test_3, y_test_3)

sgd = SGDClassifier(loss='log', shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)

sgd = SGDClassifier(shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)

submission = pd.DataFrame({'Id': test.Id, 'Cover_Type': ensemble_test_pred})
submission.head()
submission.to_csv('submission.csv', index=False)

submission_tree = pd.DataFrame({'Id': test.Id, 'Cover_Type': tree_test_pred})
submission_tree.head()
submission_tree.to_csv('submission2.csv', index=False)

# ExtraTreeClassifier is a tree-based model for classification problems
et = ExtraTreeClassifier()
et.fit(x_train_3, y_train_3)
et.predict(x_train_3)
et.score(x_test_3, y_test_3)

from sklearn.semi_supervised import LabelPropagation
lb = LabelPropagation()
lb.fit(x_train_3, y_train_3)
lb.predict(x_train_3)
lb.score(x_test_3, y_test_3)

from sklearn.neighbors import KNeighborsClassifier
knng = KNeighborsClassifier()
knng.fit(x_train_3, y_train_3)
knng.predict(x_train_3)
knng.score(x_test_3, y_test_3)
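# Compatibility note for the SGDClassifier cells above: scikit-learn removed
# the 'log' loss name in 1.3, so on current versions the logistic variant reads:
sgd = SGDClassifier(loss='log_loss', shuffle=True, random_state=171)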