def evaluatePOIidentifier():
    """Train and evaluate decision-tree POI classifiers on the salary feature.

    Loads the final-project data set, builds a ``["poi", "salary"]`` feature
    matrix, fits a default ``DecisionTreeClassifier`` on all of the data and
    prints its training time and its accuracy on that same data.  The data
    is then split 70/30 (``random_state=42``) and the split is handed to
    ``classifyDT.classify``; finally the number of features per training
    sample is printed.

    Returns:
        None.  All results are reported via ``print``.
    """
    import pickle
    import sys
    from time import time

    # The helper modules are looked up via this extra search-path entry, so
    # it has to be appended before they are imported.
    sys.path.append("../tools/")
    from feature_format import featureFormat, targetFeatureSplit
    from classifyDT import classify
    from sklearn import tree
    from sklearn.model_selection import train_test_split

    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    # The context manager closes the file handle once the data is loaded.
    with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
        data_dict = pickle.load(dataset_file)

    ### first element is our labels, any added elements are predictor
    ### features. Keep this the same for the mini-project, but you'll
    ### have a different feature list when you do the final project.
    features_list = ["poi", "salary"]

    data = featureFormat(data_dict,
                         features_list,
                         sort_keys='../tools/python2_lesson14_keys.pkl')
    labels, features = targetFeatureSplit(data)

    ### Decision Tree fitted on (and scored against) the complete data set
    ### Using min_samples_split = 2 accuracy = 90.8%
    ### Using min_samples_split = 50 accuracy = 91.2%
    full_data_clf = tree.DecisionTreeClassifier()
    t0 = time()
    full_data_clf.fit(features, labels)
    print("training time for all data:", round(time() - t0, 3), "s")

    ### print accuracy
    print("all data accuracy: ", full_data_clf.score(features, labels))

    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively
    ### labels_train and labels_test are the corresponding item labels
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # The returned classifier is not needed afterwards, so it is not bound.
    classify(features_train, labels_train, features_test, labels_test)
    ### expected result was 0.724
    print("#Features in data: ", len(features_train[0]))
Ejemplo n.º 2
0
sys.path.append("../tools/")
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from classifyDT import classify

# Load the terrain data; the four returned pieces are unpacked in this order.
features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--fill in this function in the file 'classifyDT.py'!
clf = classify(features_train, labels_train)

#### store your predictions in a list named pred
pred = clf.predict(features_test)

# accuracy_score's documented signature is (y_true, y_pred): the ground-truth
# labels go first, the predictions second.
from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, pred)

#### grader code, do not modify below this line

prettyPicture(clf, features_test, labels_test)

def submitAccuracy():
    """Return the module-level accuracy value ``acc`` unchanged."""
    accuracy = acc
    return accuracy

# Print the accuracy value returned by submitAccuracy().
print(submitAccuracy())
import sys
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from classifyDT import classify

# Load the terrain data; the four returned pieces are unpacked in this order.
features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--it's your job to fill this in!
clf = classify(features_train, labels_train)

#### grader code, do not modify below this line

prettyPicture(clf, features_test, labels_test)
output_image("test.png", "png", open("test.png", "rb").read())

# A parenthesised single-argument print produces the same output on Python 2
# and is valid on Python 3, unlike the bare print statement.
print(clf.score(features_test, labels_test))
Ejemplo n.º 4
0
#!/usr/bin/python
""" 
    This is the code to accompany the Lesson 3 (decision tree) mini-project.

    Use a Decision Tree to identify emails from the Enron corpus by author:    
    Sara has label 0
    Chris has label 1
"""

import sys
sys.path.append("../tools/")
from email_preprocess import preprocess
from classifyDT import classify

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

# classify() is given both the training and the testing split; whatever it
# returns is kept in clf for the code that follows.
clf = classify(features_train, labels_train, features_test, labels_test)

# Length of the first training sample, i.e. the number of features per sample.
print("#Features in data: ", len(features_train[0]))
#########################################################
### your code goes here ###

#########################################################
Ejemplo n.º 5
0
#!/usr/bin/python

""" lecture and example code for decision tree unit """

import sys
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from classifyDT import classify
from sklearn.metrics import accuracy_score

# Load the terrain data; the four returned pieces are unpacked in this order.
features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--fill in this function in the file 'classifyDT.py'!
# The third argument is presumably min_samples_split (going by the result
# names below) -- confirm against classifyDT.py.
clf2 = classify(features_train, labels_train, 2)
clf50 = classify(features_train, labels_train, 50)
#### grader code, do not modify below this line

prettyPicture(clf2, features_test, labels_test)
output_image("test.png", "png", open("test.png", "rb").read())
# accuracy_score's documented signature is (y_true, y_pred): ground truth first.
acc_min_samples_split_2 = accuracy_score(labels_test, clf2.predict(features_test))
acc_min_samples_split_50 = accuracy_score(labels_test, clf50.predict(features_test))

# Formatting into one string keeps the "label: value" output identical on
# Python 2 and makes the statements valid on Python 3 as well.
print("acc_min_samples_split_2: %s" % round(acc_min_samples_split_2, 3))
print("acc_min_samples_split_50: %s" % round(acc_min_samples_split_50, 3))
Ejemplo n.º 6
0
def submitAccuracies():
    """Return both test accuracies, each rounded to three decimals.

    Reads the module-level ``acc_min_samples_split_2`` and
    ``acc_min_samples_split_50`` values at call time and returns them in a
    dict keyed by those same names.
    """
    rounded_split_2 = round(acc_min_samples_split_2, 3)
    rounded_split_50 = round(acc_min_samples_split_50, 3)
    return {"acc_min_samples_split_2": rounded_split_2,
            "acc_min_samples_split_50": rounded_split_50}


########################## DECISION TREE #################################

### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
from classifyDT import classify
# accuracy_score lives in the public sklearn.metrics namespace; the nested
# sklearn.metrics.metrics module path is not importable in current releases.
from sklearn.metrics import accuracy_score

# Integer split sizes: scikit-learn interprets a float min_samples_split as a
# fraction of the samples, so 50.0 / 2.0 would not mean "50 / 2 samples".
# (Assumes classify() forwards its third argument as min_samples_split --
# confirm against classifyDT.py.)
clf = classify(features_train, labels_train, 50)
# Score hard class labels from predict(); accuracy_score compares labels,
# not the class probabilities that predict_proba() returns.
pred = clf.predict(features_test)
acc_min_samples_split_50 = accuracy_score(labels_test, pred)

clf = classify(features_train, labels_train, 2)
pred = clf.predict(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print(submitAccuracies())
Ejemplo n.º 7
0
from class_vis import prettyPicture, output_image
from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from classifyDT import classify

# Load the terrain data; the four returned pieces are unpacked in this order.
features_train, labels_train, features_test, labels_test = makeTerrainData()

### the classify() function in classifyDT is where the magic
### happens--fill in this function in the file 'classifyDT.py'!
minsplit = 2
clf1 = classify(features_train, labels_train, minsplit)
minsplit = 50
clf2 = classify(features_train, labels_train, minsplit)

# The accuracy of each decision tree classifier is computed below with
# accuracy_score on the test predictions; calling
# clfN.score(features_test, labels_test) is the alternative way to get it.
from sklearn.metrics import accuracy_score

pred1 = clf1.predict(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred1)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(acc_min_samples_split_2)

pred2 = clf2.predict(features_test)
acc_min_samples_split_50 = accuracy_score(labels_test, pred2)
# Load the terrain data; the four returned pieces are unpacked in this order.
features_train, labels_train, features_test, labels_test = makeTerrainData()

def submitAccuracies():
    """Collect the two test accuracies, rounded to three decimal places.

    The values are read from the module-level names
    ``acc_min_samples_split_2`` and ``acc_min_samples_split_50`` when the
    function is called; the returned dict uses those names as keys.
    """
    results = {}
    results["acc_min_samples_split_2"] = round(acc_min_samples_split_2, 3)
    results["acc_min_samples_split_50"] = round(acc_min_samples_split_50, 3)
    return results


########################## DECISION TREE #################################


### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
from classifyDT import classify
# accuracy_score lives in the public sklearn.metrics namespace; the nested
# sklearn.metrics.metrics module path is not importable in current releases.
from sklearn.metrics import accuracy_score

# Integer split sizes: scikit-learn interprets a float min_samples_split as a
# fraction of the samples, so 50.0 / 2.0 would not mean "50 / 2 samples".
# (Assumes classify() forwards its third argument as min_samples_split --
# confirm against classifyDT.py.)
clf = classify(features_train, labels_train, 50)
# Score hard class labels from predict(); accuracy_score compares labels,
# not the class probabilities that predict_proba() returns.
pred = clf.predict(features_test)
acc_min_samples_split_50 = accuracy_score(labels_test, pred)

clf = classify(features_train, labels_train, 2)
pred = clf.predict(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print(submitAccuracies())
Ejemplo n.º 9
0
    except ValueError:
        return False

# Load the terrain data; the four returned pieces are unpacked in this order.
features_train, labels_train, features_test, labels_test = makeTerrainData()

# Take the minimum split sample size from the first command-line argument.
# When no argument is given, or it does not represent an integer, fall back
# to the default instead of failing with an IndexError on sys.argv[1].
s = sys.argv[1] if len(sys.argv) > 1 else None
if s is not None and RepresentsInt(s):
    min_sample_split = int(s)
else:
    min_sample_split = 2  # default value

### the classify() function in classifyDT is where the magic
### happens--it's your job to fill this in!
clf = classify(features_train, labels_train, min_sample_split)

# getting the accuracy; accuracy_score's documented signature is
# (y_true, y_pred), so the ground-truth labels go first
pred = clf.predict(features_test)
from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, pred)

prettyPicture(clf, features_test, labels_test)
output_image("test.png", "png", open("test.png", "rb").read())
plt.show()  # added to show the image from matplotlib

# the accuracy result will be shown after the show window is closed
print ('Accuacy from decision tree = ', acc)

# print 'Number of arguments:', len(sys.argv), 'arguments.'
# print 'Argument List:', str(sys.argv)
Ejemplo n.º 10
0
import sys
import time
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData

import numpy as np
import pylab as pl

# Load the terrain data; the four returned pieces are unpacked in this order.
features_train, labels_train, features_test, labels_test = makeTerrainData()

#################################################################################

########################## DECISION TREE #################################

#### your code goes here
from classifyDT import classify
Tree_Test = classify(features_train, labels_train)
# This script does `import time` (the module), so the clock function is
# time.time(); calling the module object itself raises TypeError.
t0 = time.time()
terrain_pred = Tree_Test.predict(features_test)
print("prediction time:", round(time.time() - t0, 3), "s")
print(terrain_pred)

from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, terrain_pred)

### be sure to compute the accuracy on the test set

def submitAccuracies():
    """Return the module-level ``acc`` rounded to three decimals, keyed as ``"acc"``."""
    rounded_acc = round(acc, 3)
    return {"acc": rounded_acc}