def evaluatePOIidentifier():
    import pickle
    import sys
    sys.path.append("../tools/")
    from feature_format import featureFormat, targetFeatureSplit

    data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))

    ### first element is our labels, any added elements are predictor
    ### features. Keep this the same for the mini-project, but you'll
    ### have a different feature list when you do the final project.
    features_list = ["poi", "salary"]

    data = featureFormat(data_dict, features_list,
                         sort_keys='../tools/python2_lesson14_keys.pkl')
    # data = featureFormat(data_dict, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    ### it's all yours from here forward!

    ### Decision Tree
    from time import time
    from sklearn import tree

    ### Using min_samples_split = 2, accuracy = 90.8%
    ### Using min_samples_split = 50, accuracy = 91.2%
    # clf = tree.DecisionTreeClassifier(min_samples_split=40)
    clf = tree.DecisionTreeClassifier()
    t0 = time()
    clf.fit(features, labels)
    print("training time for all data:", round(time() - t0, 3), "s")

    ### print accuracy
    print("all data accuracy: ", clf.score(features, labels))

    # from email_preprocess import preprocess
    from classifyDT import classify
    from sklearn.model_selection import train_test_split

    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively
    ### labels_train and labels_test are the corresponding item labels
    # features_train, features_test, labels_train, labels_test = preprocess()

    clf = classify(features_train, labels_train, features_test, labels_test)

    ### expected result was 0.724
    print("#Features in data: ", len(features_train[0]))
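### Note: classifyDT.py itself is not included in this section. The four-argument call
### above, classify(features_train, labels_train, features_test, labels_test),
### assumes a classify() that fits a decision tree and reports timing and test accuracy.
### A minimal sketch under that assumption (hypothetical, not the course-provided file):
def classify(features_train, labels_train, features_test, labels_test):
    from time import time
    from sklearn import tree

    clf = tree.DecisionTreeClassifier()
    t0 = time()
    clf.fit(features_train, labels_train)                    # train on the training split only
    print("training time:", round(time() - t0, 3), "s")
    print("test accuracy:", clf.score(features_test, labels_test))  # accuracy on held-out data
    return clf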
import sys
sys.path.append("../tools/")
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from classifyDT import classify

features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--fill in this function in the file 'classifyDT.py'!
clf = classify(features_train, labels_train)

#### store your predictions in a list named pred
pred = clf.predict(features_test)

from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, labels_test)

#### grader code, do not modify below this line
prettyPicture(clf, features_test, labels_test)

def submitAccuracy():
    return acc

print(submitAccuracy())
import sys
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from classifyDT import classify

features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--it's your job to fill this in!
clf = classify(features_train, labels_train)

#### grader code, do not modify below this line
prettyPicture(clf, features_test, labels_test)
output_image("test.png", "png", open("test.png", "rb").read())

print(clf.score(features_test, labels_test))
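### The two-argument calls above, classify(features_train, labels_train), assume a
### classifyDT.classify() that simply builds a default decision tree, fits it, and
### returns it. A minimal sketch under that assumption (hypothetical reconstruction,
### not the graded solution):
def classify(features_train, labels_train):
    from sklearn import tree

    clf = tree.DecisionTreeClassifier()      # default parameters, i.e. min_samples_split=2
    clf.fit(features_train, labels_train)    # learn the decision boundary from the terrain data
    return clf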
#!/usr/bin/python

"""
    This is the code to accompany the Lesson 3 (decision tree) mini-project.

    Use a Decision Tree to identify emails from the Enron corpus by author:
    Sara has label 0
    Chris has label 1
"""

import sys
sys.path.append("../tools/")
from email_preprocess import preprocess
from classifyDT import classify

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

clf = classify(features_train, labels_train, features_test, labels_test)

print("#Features in data: ", len(features_train[0]))

#########################################################
### your code goes here ###
#########################################################
#!/usr/bin/python

""" lecture and example code for decision tree unit """

import sys
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from classifyDT import classify
from sklearn.metrics import accuracy_score

features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--fill in this function in the file 'classifyDT.py'!
clf2 = classify(features_train, labels_train, 2)
clf50 = classify(features_train, labels_train, 50)

#### grader code, do not modify below this line
prettyPicture(clf2, features_test, labels_test)
output_image("test.png", "png", open("test.png", "rb").read())

acc_min_samples_split_2 = accuracy_score(clf2.predict(features_test), labels_test)
acc_min_samples_split_50 = accuracy_score(clf50.predict(features_test), labels_test)

print("acc_min_samples_split_2:", round(acc_min_samples_split_2, 3))
print("acc_min_samples_split_50:", round(acc_min_samples_split_50, 3))
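### The three-argument calls, classify(features_train, labels_train, min_samples_split),
### assume a classifyDT.classify() that forwards the split threshold to the tree. A
### minimal sketch under that assumption (note that in current scikit-learn,
### min_samples_split must be an int >= 2 or a float fraction in (0.0, 1.0]):
def classify(features_train, labels_train, min_samples_split=2):
    from sklearn import tree

    # a larger min_samples_split stops the tree from splitting small groups of points,
    # giving a simpler, smoother decision boundary and usually less overfitting
    clf = tree.DecisionTreeClassifier(min_samples_split=min_samples_split)
    clf.fit(features_train, labels_train)
    return clf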
from prep_terrain_data import makeTerrainData

features_train, labels_train, features_test, labels_test = makeTerrainData()

def submitAccuracies():
    return {
        "acc_min_samples_split_2": round(acc_min_samples_split_2, 3),
        "acc_min_samples_split_50": round(acc_min_samples_split_50, 3)
    }

########################## DECISION TREE #################################
### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
from classifyDT import classify
from sklearn.metrics import accuracy_score

clf = classify(features_train, labels_train, 50)
pred = clf.predict(features_test)
acc_min_samples_split_50 = accuracy_score(labels_test, pred)  ### you fill this in!

clf = classify(features_train, labels_train, 2)
pred = clf.predict(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred)  ### you fill this in!

print(submitAccuracies())
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from classifyDT import classify
from sklearn.metrics import accuracy_score

features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--fill in this function in the file 'classifyDT.py'!
minsplit = 2
clf1 = classify(features_train, labels_train, minsplit)

minsplit = 50
clf2 = classify(features_train, labels_train, minsplit)

''' different ways to print the accuracy of the decision tree classifier '''
# print(clf1.score(features_test, labels_test))

# find the prediction accuracy on the test data
pred1 = clf1.predict(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred1)
print(acc_min_samples_split_2)

# print(clf2.score(features_test, labels_test))
pred2 = clf2.predict(features_test)
acc_min_samples_split_50 = accuracy_score(labels_test, pred2)
from prep_terrain_data import makeTerrainData

features_train, labels_train, features_test, labels_test = makeTerrainData()

def submitAccuracies():
    return {"acc_min_samples_split_2": round(acc_min_samples_split_2, 3),
            "acc_min_samples_split_50": round(acc_min_samples_split_50, 3)}

########################## DECISION TREE #################################
### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
from classifyDT import classify
from sklearn.metrics import accuracy_score

clf = classify(features_train, labels_train, 50)
pred = clf.predict(features_test)
acc_min_samples_split_50 = accuracy_score(labels_test, pred)  ### you fill this in!

clf = classify(features_train, labels_train, 2)
pred = clf.predict(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred)  ### you fill this in!

print(submitAccuracies())
import sys
import matplotlib.pyplot as plt
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData
from classifyDT import classify

def RepresentsInt(s):
    # helper: report whether the string s can be parsed as an integer
    try:
        int(s)
        return True
    except ValueError:
        return False

features_train, labels_train, features_test, labels_test = makeTerrainData()

# take a parameter from the command line so different min_samples_split values can be tried
s = sys.argv[1] if len(sys.argv) > 1 else ""   # the first parameter, if one was given
if RepresentsInt(s):
    min_sample_split = int(s)
else:
    min_sample_split = 2   # default value

### the classify() function in classifyDT is where the magic
### happens--it's your job to fill this in!
clf = classify(features_train, labels_train, min_sample_split)

# getting the accuracy
pred = clf.predict(features_test)
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, labels_test)

prettyPicture(clf, features_test, labels_test)
output_image("test.png", "png", open("test.png", "rb").read())
plt.show()   # show the matplotlib figure
# the accuracy is printed after the figure window is closed
print('Accuracy from decision tree = ', acc)

# print('Number of arguments:', len(sys.argv), 'arguments.')
# print('Argument List:', str(sys.argv))
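### Usage sketch for the script above (the file name here is hypothetical, it is not
### given in this section): running "python dt_min_split_cli.py 50" trains the tree
### with min_samples_split=50, while running it with no argument, or with a
### non-integer argument, falls back to the default min_samples_split=2.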
import sys
from time import time
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
import numpy as np
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()

#################################################################################
########################## DECISION TREE #################################

#### your code goes here
from classifyDT import classify

Tree_Test = classify(features_train, labels_train)

t0 = time()
terrain_pred = Tree_Test.predict(features_test)
print("prediction time:", round(time() - t0, 3), "s")
print(terrain_pred)

from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, terrain_pred)
### be sure to compute the accuracy on the test set

def submitAccuracies():
    return {"acc": round(acc, 3)}