Example #1

"""
    This is the code to accompany the Lesson 3 (decision tree) mini-project.

    Use a Decision Tree to identify emails from the Enron corpus by author:
    Sara has label 0
    Chris has label 1

"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(percentile=1)
clf = DecisionTreeClassifier(min_samples_split=40)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc = accuracy_score(labels_test, pred)
print("Decision Tree accuracy: %r" % acc)

"""
	
    You found in the SVM mini-project that parameter tuning can significantly
    affect the training time of a machine learning algorithm. A general rule is
    that the parameters tune the complexity of the algorithm, with more
    complex algorithms generally running more slowly.

    Another way to control the complexity of an algorithm is via the number of
    features that you use in training/testing. The more features the algorithm
    has available, the more potential there is for a complex fit.
"""
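# Added sketch (not part of the original snippet): how the percentile passed
# to preprocess() trades feature count against training time. It assumes
# preprocess() accepts a `percentile` keyword, as in the call above.
for pct in (1, 10):
    f_train, f_test, l_train, l_test = preprocess(percentile=pct)
    clf = DecisionTreeClassifier(min_samples_split=40)
    t0 = time()
    clf.fit(f_train, l_train)
    print("percentile=%d: %d features, %.3fs training, accuracy %.3f"
          % (pct, len(f_train[0]), time() - t0, clf.score(f_test, l_test)))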
Example #2

"""
    This is the code to accompany the Lesson 3 (decision tree) mini-project.

    Use a Decision Tree to identify emails from the Enron corpus by author:
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()



#########################################################
### your code goes here ###

from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf = clf.fit(features_train, labels_train)

pred = clf.predict(features_test)

#########################################################

from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, pred)
print("accuracy:", acc)
Example #3

"""
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use a SVM to identify emails from the Enron corpus by their authors:
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("C:/WorkSpace/MachineLearning/MachineLearning/tools")
from email_preprocess import preprocess
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(words_file = "C:/WorkSpace/MachineLearning/MachineLearning/tools/word_data.pkl", authors_file="C:/WorkSpace/MachineLearning/MachineLearning/tools/email_authors.pkl")




#########################################################
### your code goes here ###
def classify(features_train, labels_train, features_test, labels_test):
    # just training 1% of the full training set
    #features_train = features_train[:len(features_train)//100]
    #labels_train = labels_train[:len(labels_train)//100]
    #clf = SVC(kernel="linear")
    clf = SVC(kernel="rbf", C=10000.0)
    t0 = time()
    clf.fit(features_train, labels_train)
    print("training time:", round(time()-t0, 3), "s")

    t0 = time()
    pred = clf.predict(features_test)
    print("prediction time:", round(time()-t0, 3), "s")

    print("accuracy:", accuracy_score(labels_test, pred))
    return clf
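# Call added so the truncated snippet actually runs end to end.
clf = classify(features_train, labels_train, features_test, labels_test)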
Example #4

"""
    This is the code to accompany the Lesson 1 (Naive Bayes) mini-project.

    Use a Naive Bayes Classifier to identify emails by their
    authors and labels:
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.naive_bayes import GaussianNB


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()


#########################################################
### your code goes here ###
print(type(preprocess()))
print(type(features_train))
print(len(features_train))


clf = GaussianNB()

# counting time for fitting model 
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time()-t0, 3), "s")

# score the fitted model (completing the truncated snippet)
t0 = time()
print("accuracy:", clf.score(features_test, labels_test))
print("predict time:", round(time()-t0, 3), "s")

Example #5

"""
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use a SVM to identify emails from the Enron corpus by their authors:
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

# features_train and features_test are the features for the training
# and testing datasets, respectively
# labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
from sklearn.svm import SVC

# features_train = features_train[:len(features_train) // 100]
# labels_train = labels_train[:len(labels_train) // 100]

# Optimize C Parameter
"""
for i in range(1, 5):
    c = 10**i
    print("C=" + str(c))
    clf = SVC(kernel="rbf", C=c)
    clf.fit(features_train, labels_train)
    print("accuracy:", clf.score(features_test, labels_test))
"""
Example #6

""" 
    This is the code to accompany the Lesson 3 (decision tree) mini-project.

    Use a Decision Tree to identify emails from the Enron corpus by author:    
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
import email_preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = email_preprocess.preprocess()

from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40)

t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time()-t0, 3), "s")

t0 = time()
acc = clf.score(features_test, labels_test)
print("predict time:", round(time()-t0, 3), "s")

print("accuracy:", acc)
Example #7

# Fragment from the original file; its enclosing class statement was cut off
# in this listing, so the wrapper below is a reconstruction (name is a placeholder).
class EmailData:
    def __init__(self, words_file, authors_file):
        self.words_file = "../tools/word_data.pkl"
        self.authors_file = "../tools/email_authors.pkl"
        ### features_train and features_test are the features for the training
        ### and testing datasets, respectively
        ### labels_train and labels_test are the corresponding item labels
        self.features_train, self.features_test, self.labels_train, self.labels_test = preprocess(
            words_file=self.words_file, authors_file=self.authors_file)

"""
    This is the code to accompany the Lesson 3 (decision tree) mini-project.

    Use a Decision Tree to identify emails from the Enron corpus by author:
    Sara has label 0
    Chris has label 1
"""

import sys
from time import time
sys.path.append("./tools/")
from email_preprocess import preprocess
from sklearn.tree import DecisionTreeClassifier

# features_train and features_test are the features for the training
# and testing datasets, respectively
# labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess(
    words_file="./tools/word_data.pkl",
    authors_file="./tools/email_authors.pkl")

# Number of features: 3785
print("Number of features:", len(features_train[0]))

# clf 1
# feature  : percentile=10
# accuracy : 0.978
# time     : 62.838
# clf = DecisionTreeClassifier(min_samples_split=40)
# t0 = time()
# clf.fit(features_train, labels_train)
# print("training time:", round(time() - t0, 3), "s")
# print("accuracy:", clf.score(features_test, labels_test))
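# Added: an active run of the "clf 1" configuration commented out above,
# so the snippet trains and scores instead of only printing the feature count.
clf = DecisionTreeClassifier(min_samples_split=40)
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time() - t0, 3), "s")
print("accuracy:", clf.score(features_test, labels_test))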
Example #9

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from data_formating import format_mails

#Formatting mails (converting into pickle files) from the csv database file
format_mails()

#features_train is a numpy array containing emails for training
#features_test is a numpy array containing emails for testing
#labels_train and labels_test are numpy arrays containing the labels (spam/ham)
#final_transformed is a numpy array containing the processed email to be checked as spam
features_train, features_test, labels_train, labels_test, final_transformed = preprocess(
)

#Random Forest algorithm to train classifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=75,
                             criterion='entropy',
                             min_samples_split=3)
t = time()
clf.fit(features_train, labels_train)
print("Training Time:", round(time() - t, 3), "s")
t = time()
pred = clf.predict(features_test)
print("Prediction Time:", round(time() - t, 3), "s")

#code to check accuracy
from sklearn.metrics import accuracy_score
acc = accuracy_score(labels_test, pred)
print("Accuracy:", acc)
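# Added sketch: classify the processed message described in the comments above,
# assuming final_transformed is vectorized the same way as features_test.
print("final_transformed prediction:", clf.predict(final_transformed))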
Example #10

    
import sys
# from time import time
import time

# sys.path.append("../tools/")

# from email_preprocess import preprocess
import email_preprocess

import sklearn.svm 

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = email_preprocess.preprocess()

# Using only 1% of the training set (len(features_train) // 100 examples)
# speeds up training considerably at the cost of some accuracy.
print("len(features_test) - {}".format(len(features_test)))
# print("len(features_train) - {}".format(len(features_train)))
# print("len(features_train) / 100 - {}".format(len(features_train) / 100))
# print("round(len(features_train) / 100) - {}".format(round(len(features_train) / 100)))
# features_train = features_train[:round(len(features_train)/100)] 
# labels_train = labels_train[:round(len(labels_train)/100)]
 
#########################################################
### your code goes here ###
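# A minimal sketch of the missing classifier, assuming the rbf-kernel SVC
# settings used in the other SVM examples of this listing; uncomment the
# slicing below to train on 1% of the data, as the comments above describe.
# features_train = features_train[:len(features_train) // 100]
# labels_train = labels_train[:len(labels_train) // 100]
clf = sklearn.svm.SVC(kernel="rbf", C=10000.0)
t0 = time.time()
clf.fit(features_train, labels_train)
print("training time - {} s".format(round(time.time() - t0, 3)))
print("accuracy - {}".format(clf.score(features_test, labels_test)))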

#########################################################
Example #11

"""
    This is the code to accompany the Lesson 1 (Naive Bayes) mini-project.

    Use a Naive Bayes Classifier to identify emails by their
    authors and labels:
    Sara has label 0
    Chris has label 1
"""
    
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from utils import execute


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = execute(lambda : preprocess(), "Process data")




#########################################################
### your code goes here ###
# Imports
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Visualize data


# Create classifier
clf = GaussianNB()
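# Completion sketch (the listing is truncated here): fit and score the
# classifier, reusing the execute() timing helper imported above.
execute(lambda: clf.fit(features_train, labels_train), "Train classifier")
pred = execute(lambda: clf.predict(features_test), "Predict")
print("accuracy:", metrics.accuracy_score(labels_test, pred))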