def output(partId):
    """Return the formatted grader output for one exercise part.

    ``partId`` is compared against the strings '1' through '4'; each part
    calls the matching exercise function on fixed inputs and passes the
    result to ``formatter``.

    NOTE: for any other ``partId`` no branch assigns the result, so the final
    ``return`` raises ``UnboundLocalError``.
    """
    # Fixed test inputs derived from sin/cos of 1..10.
    steps = np.arange(1, 11)
    x1 = np.sin(steps)
    x2 = np.cos(steps)
    ec = 'the quick brown fox jumped over the lazy dog'
    base = np.abs(np.round(x1 * 1863)).astype(int)
    wi = np.concatenate([base, base])

    if partId == '1':
        out = formatter('%0.5f ', gaussianKernel(x1, x2, 2))
    elif partId == '2':
        data = scipy.io.loadmat('ex6data3.mat')
        C, sigma = dataset3Params(
            data['X'],
            data['y'].ravel(),
            data['Xval'],
            data['yval'].ravel(),
        )
        out = formatter('%0.5f ', C)
        out += formatter('%0.5f ', sigma)
    elif partId == '3':
        # The +1 presumably turns 0-based indices into 1-based ones for the
        # grader -- TODO confirm against processEmail.
        out = formatter('%d ', processEmail(ec) + 1)
    elif partId == '4':
        out = formatter('%d ', emailFeatures(wi))
    return out
from emailFeatures import emailFeatures
from getVocabList import getVocabList

# raw_input only exists on Python 2; fall back to input on Python 3 so the
# pause prompt works on both.
try:
    _pause = raw_input
except NameError:
    _pause = input

## ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
#  to convert each email into a vector of features. In this part, you will
#  implement the preprocessing steps for each email. You should
#  complete the code in processEmail to produce a word indices vector
#  for a given email.

# print is written in call form with a single argument so it behaves the same
# on Python 2 and Python 3.
print('Preprocessing sample email (emailSample1.txt)')

# Extract Features
# The context manager closes the handle even if reading fails; the name
# `file` and the list-of-lines `file_contents` are kept as before.
with open('emailSample1.txt', 'r') as file:
    file_contents = file.readlines()
word_indices = processEmail(''.join(file_contents))

# Print Stats
print('Word Indices: ')
print(word_indices)

_pause("Program paused. Press Enter to continue...")

## ==================== Part 2: Feature Extraction ====================
#  Now, you will convert each email into a vector of features in R^n.
#  You should complete the code in emailFeatures to produce a feature
#  vector for a given email.

print('Extracting features from sample email (emailSample1.txt)')

# Extract Features
from linearKernel import linearKernel
from svmPredict import svmPredict

# raw_input only exists on Python 2; fall back to input on Python 3 so the
# pause works on both.
try:
    _pause = raw_input
except NameError:
    _pause = input

## ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
#  to convert each email into a vector of features. In this part, you will
#  implement the preprocessing steps for each email. You should
#  complete the code in processEmail.py to produce a word indices vector
#  for a given email.

# print is written in call form with a single argument so it behaves the same
# on Python 2 and Python 3.
print('Preprocessing sample email (emailSample1.txt)')

# Extract Features
with open('emailSample1.txt') as f:
    file_contents = f.read()
word_indices = processEmail(file_contents)

# Print Stats
print('Word Indices:')
print(word_indices)
# A bare `print` emits a blank line on Python 2 but does nothing on Python 3;
# printing an empty string gives the blank line on both.
print('')

print('Program paused. Press enter to continue.')
_pause()

## ==================== Part 2: Feature Extraction ====================
#  Now, you will convert each email into a vector of features in R^n.
#  You should complete the code in emailFeatures.py to produce a feature
#  vector for a given email.

print('\nExtracting features from sample email (emailSample1.txt)')
import numpy as np
import scipy.io as sio  # Used to load the OCTAVE *.mat files
from sklearn import svm
from processEmail import processEmail, getVocabList

## ==================== Part 1: Email Preprocessing ====================
#  Each email has to be turned into a feature vector before an SVM can
#  classify it as Spam or Non-Spam. Here processEmail is applied to the raw
#  text of the sample email and a short summary of the result is printed.

print("Preprocessing and extracting features sample email (emailSample1.txt)")

# Extract Features
with open('emailSample1.txt', 'r') as sample:
    email_text = sample.read()
features = processEmail(email_text)

summary = 'length of vector = {}\nnum of non-zero = {}'
print(summary.format(len(features), int(features.sum())))
print(features.shape)

## =========== Part 2: Train Linear SVM for Spam Classification ========
#  Load the spam training set; X and y are taken from the 'X' and 'y'
#  entries of the .mat file.

mat = sio.loadmat('spamTrain.mat')
X = mat['X']
y = mat['y']

print("\nTraining Linear SVM (Spam Classification)")
from emailFeatures import emailFeatures

# 2.1 Read the sample email and the raw vocabulary file. Context managers
# make sure both handles are closed (previously they were left open).
with open("emailSample1.txt", "r") as email_file:
    email_contents = email_file.read()
with open("vocab.txt", "r") as vocab_file:
    vocabList = vocab_file.read()

# 2.1.1 Build the vocabulary lookup. Each entry is two tab-separated fields;
# the second field becomes the dictionary key and the first its value.
# Empty entries are dropped instead of blindly discarding the last element,
# so a file without a trailing newline no longer loses its final entry.
vocabList = [entry for entry in vocabList.split("\n") if entry]
vocabList_d = {}
for ea in vocabList:
    value, key = ea.split("\t")
    vocabList_d[key] = value

word_indices = processEmail(email_contents, vocabList_d)

# 2.2 Turn the word indices into a feature vector.
featureVector = emailFeatures(word_indices)

# 2.3 Load the training data and pull out the 'X' and 'y' entries.
spamTrainData = loadmat('spamTrain.mat')
X = spamTrainData['X']
y = spamTrainData['y']
print('> ', spamTrainData)
import numpy as np


def emailFeatures(word_indices, n=1899):
    """Build a binary feature row vector from a collection of word indices.

    Parameters
    ----------
    word_indices : iterable of int
        Positions to set in the feature vector.
    n : int, optional
        Length of the feature vector. Defaults to 1899, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n)`` holding 1.0 at every listed index and 0.0
        elsewhere. Repeated indices still yield a single 1.

    Raises
    ------
    IndexError
        If an index falls outside the vector.
    """
    x = np.zeros((n, 1))
    for word_index in word_indices:
        x[word_index] = 1
    # Built as a column, returned as a row.
    return x.T


if __name__ == '__main__':
    from processEmail import processEmail

    # Context manager so the sample file is closed after reading.
    with open('emailSample1.txt') as f:
        email_contents = f.read()
    word_indices = processEmail(email_contents)
    # Call-form print with a single argument works on Python 2 and Python 3;
    # the former print statements were a SyntaxError on Python 3.
    print(word_indices)
    print(len(word_indices))
    features = emailFeatures(word_indices)
    print(len(features))
    print(np.sum(features))
def extractFeature(mails):
    """Run ``processEmail`` on every mail and stack the results in an array.

    Each element of ``mails`` is passed to ``processEmail`` together with the
    module-level ``vocabList``; the per-mail results are collected in order
    and wrapped in a single ``np.array``.
    """
    rows = []
    for mail in mails:
        rows.append(processEmail(mail, vocabList))
    return np.array(rows)
# ==================== Part 1: Email Preprocessing ====================
#  Before an SVM can classify emails as Spam or Non-Spam, each email has to
#  be converted into a vector of features. This part reads the sample email
#  and hands its lines to processEmail to obtain the word indices.

# NOTE(review): absolute, machine-specific data directory.
ml_dir = '/Users/gregory/Desktop/me/coursera/machine_learning/ml_python/machine-learning-ex6/ex6/'
fname = '{}{}'.format(ml_dir, 'emailSample1.txt')

# The file is read as a list of lines, which is what processEmail receives.
with open(fname) as sample_file:
    file_contents = list(sample_file)

# Extract Features
word_indices = processEmail(file_contents)

# Print Stats
print('Word Indices: \n')
print(word_indices)

# ==================== Part 2: Feature Extraction ====================
#  Next, each email is converted into a vector of features in R^n.
#  The file contents and word indices from Part 1 are reused here rather
#  than being read and processed a second time.

# Extract Features
vocab_length = len(getVocabList())
def main():
    """Run the spam-classification walkthrough and print the results.

    Steps, in order:
      1. read ``emailSample1.txt`` from ``<cwd>/dataSets`` and turn it into
         word indices and a feature vector;
      2. print the size of the feature vector and the sum of its entries;
      3. fit a linear ``svm.SVC`` (C = 0.1) on ``spamTrain.mat`` and print
         the training accuracy;
      4. print the accuracy on ``spamTest.mat``;
      5. print the 15 vocabulary words with the largest learned weights;
      6. classify ``spamSample1.txt`` with the trained model.

    Takes no arguments and returns ``None``; all output goes to stdout.
    """
    path = os.path.join(os.getcwd(), 'dataSets')

    # =============== Part 1 ====================
    # To use an SVM to classify emails into Spam v.s. Non-Spam, we first need
    # to convert each email into a vector of features.
    # The context manager closes the file even if reading raises.
    with open(os.path.join(path, "emailSample1.txt"), 'r') as f:
        email_contents = f.read()
    print(email_contents)
    word_indices = processEmail.processEmail(email_contents)
    features = emailFeatures(word_indices)
    print('Word Indices :\n')
    print(word_indices, "\n")

    # ============= Part 2 =======================
    # Print Stats
    # numpy.size / numpy.sum give the same numbers as len / sum for a 1-D
    # vector and also work if emailFeatures returns a 2-D row or column.
    print('Length of feature vector: %d\n' % numpy.size(features))
    print('Number of non-zero entries: %d\n' % numpy.sum(features))

    # ============= Part 3 ======================
    # Train a linear classifier to decide whether an email is Spam or
    # Not-Spam.
    print('\n\nRunning SVM on training set...')
    mat = io.loadmat(os.path.join(path, 'spamTrain.mat'))
    X = mat['X']
    y = numpy.ravel(mat['y'])
    model = svm.SVC(C=0.1, kernel='linear')
    model.fit(X, y)
    # score() predicts internally, so no separate predict() call is needed.
    accuracy = model.score(X, y) * 100.0
    print('\nTraining Accuracy: %.2f' % accuracy)

    # ================ Part 4 ========================
    # Evaluate on the held-out set stored under 'Xtest' / 'ytest'.
    mat = io.loadmat(os.path.join(path, 'spamTest.mat'))
    XTest = mat['Xtest']
    yTest = numpy.ravel(mat['ytest'])
    accuracy = model.score(XTest, yTest) * 100.0
    print('\nTest Accuracy: %.2f' % accuracy)

    # ================ Part 5 ============================
    # Since the model is a linear SVM, its weights can be inspected to see
    # which words it treats as the strongest indicators of spam.
    print('\nTop spam predictors (keywords) \n')
    z = numpy.ravel(model.coef_)
    vocabList = getVocabList.getVocabList()
    # vocabList is indexed by position rather than iterated, because only
    # vocabList[i] is known to be supported here.
    dic = {vocabList[i]: weight for i, weight in enumerate(z)}
    for w in sorted(dic, key=dic.get, reverse=True)[:15]:
        print('{0:10} - {1:10f}'.format(w, dic[w]))
    print('\n\n')

    # ============ Part 6: Test a sample Email =====================
    # Use the trained classifier on one of the sample emails.
    with open(os.path.join(path, "spamSample1.txt"), 'r') as f:
        email_contents = f.read()
    print('Sample Email : ')
    print(email_contents)
    word_indices = processEmail.processEmail(email_contents)
    # The features are computed once; predict() expects a 2-D array with one
    # row per sample, so the vector is reshaped to a single row.
    features = emailFeatures(word_indices)
    X = numpy.reshape(features, (1, -1))
    p = model.predict(X)
    # predict() returns one label per row; format the single label, not the
    # array.
    print('\nEmail Processed\n\nSpam Classification: %d\n' % p[0])
    print('(1 indicates spam, 0 indicates not spam)\n\n')
from emailFeatures import emailFeatures
from getVocabList import getVocabList

## ==================== Part 1: Email Preprocessing ====================
#  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
#  to convert each email into a vector of features. In this part, you will
#  implement the preprocessing steps for each email. You should
#  complete the code in processEmail to produce a word indices vector
#  for a given email.

# print is written in call form with a single argument so it behaves the same
# on Python 2 and Python 3.
print('Preprocessing sample email (emailSample1.txt)')

# Extract Features
# The context manager closes the handle even if reading fails; the name
# `file` and the list-of-lines `file_contents` are kept as before.
with open('ex6/emailSample1.txt', 'r') as file:
    file_contents = file.readlines()
word_indices = processEmail(''.join(file_contents))

# Print Stats
print('Word Indices: ')
print(word_indices)

## ==================== Part 2: Feature Extraction ====================
#  Now, you will convert each email into a vector of features in R^n.
#  You should complete the code in emailFeatures to produce a feature
#  vector for a given email.

print('Extracting features from sample email (emailSample1.txt)')

# Extract Features
# Vocabulary lookup: every entry of vocabList is split on the tab character
# into exactly two fields; the second field is the key, the first the value.
vocabList_d = {
    word: index
    for index, word in (entry.split("\t") for entry in vocabList)
}

print(file_contents)

# Convert the email text into word indices, then into a feature vector.
word_indices = processEmail.processEmail(file_contents, vocabList_d)
features = emailFeatures.emailFeatures(word_indices, vocabList_d)

print("Length of feature vector: ", len(features))
print("Number of non-zero entries: ", np.sum(features))

# Training data: the 'X' and 'y' entries of the .mat file.
spam_mat = loadmat("Data/spamTrain.mat")
X_train, y_train = spam_mat["X"], spam_mat["y"]
from processEmail import processEmail

# Read the whole sample in one call. The context manager closes the handle
# once the text is in memory (it was previously left open).
with open('data/spamSample2.txt', mode='r') as file:
    all_of_it = file.read()

processEmail(all_of_it)
def ex6_spam():
    """Run Exercise 6 (spam classification with SVMs) from start to finish.

    The function takes no arguments and returns nothing; every result is
    printed. It relies on helpers defined elsewhere: ``readFile``,
    ``processEmail``, ``formatter``, ``emailFeatures``, ``svmTrain``,
    ``svmPredict``, ``linearKernel`` and ``getVocabList``.

    Parts, in order:
      1. preprocess ``emailSample1.txt`` into word indices;
      2. convert those indices into a feature vector;
      3. train a linear SVM on ``spamTrain.mat``;
      4. evaluate it on ``spamTest.mat``;
      5. list the 15 words with the largest weights;
      6. classify ``spamSample1.txt``.
    """
    # ---------------- Part 1: Email Preprocessing ----------------
    print('\nPreprocessing sample email (emailSample1.txt)')
    indices = processEmail(readFile('emailSample1.txt'))

    print('Word Indices: ')
    # Printed with an offset of one -- presumably to show 1-based indices;
    # TODO confirm against processEmail.
    shifted = np.array(indices) + 1
    print(formatter(' %d', shifted))
    print('\n')
    print('Program paused. Press enter to continue.')

    # ---------------- Part 2: Feature Extraction ----------------
    print('\nExtracting features from sample email (emailSample1.txt)')
    # The sample is read and processed again, exactly as in Part 1.
    indices = processEmail(readFile('emailSample1.txt'))
    features = emailFeatures(indices)

    print('Length of feature vector: %d' % features.size)
    positive_count = np.sum(features > 0)
    print('Number of non-zero entries: %d' % positive_count)
    print('Program paused. Press enter to continue.')

    # -------- Part 3: Train Linear SVM for Spam Classification --------
    train = scipy.io.loadmat('spamTrain.mat')
    X, y = train['X'].astype(float), train['y'][:, 0]

    print('\nTraining Linear SVM (Spam Classification)\n')
    print('(this may take 1 to 2 minutes) ...\n')

    model = svmTrain(X, y, 0.1, linearKernel)
    train_pred = svmPredict(model, X)
    train_acc = np.mean(train_pred == y) * 100
    print('Training Accuracy: %f' % train_acc)

    # ---------------- Part 4: Test Spam Classification ----------------
    test = scipy.io.loadmat('spamTest.mat')
    Xtest, ytest = test['Xtest'].astype(float), test['ytest'][:, 0]

    print('\nEvaluating the trained Linear SVM on a test set ...\n')
    test_pred = svmPredict(model, Xtest)
    test_acc = np.mean(test_pred == ytest) * 100
    print('Test Accuracy: %f\n' % test_acc)

    # ---------------- Part 5: Top Predictors of Spam ----------------
    # Sort the weights and obtain the vocabulary list; the last 15 positions
    # of the ascending order, reversed, are the largest weights first.
    weights = model['w']
    ranked = np.argsort(weights)[-15:][::-1]
    vocab = getVocabList()

    print('\nTop predictors of spam: ')
    top_words = np.array(vocab)[ranked]
    top_weights = weights[ranked]
    for word, weight in zip(top_words, top_weights):
        print(' %-15s (%f)' % (word, weight))

    print('\n')
    print('\nProgram paused. Press enter to continue.')

    # ---------------- Part 6: Try Your Own Emails ----------------
    # Change the file name to spamSample2.txt, emailSample1.txt or
    # emailSample2.txt to see predictions for the other bundled samples.
    filename = 'spamSample1.txt'

    # Read and predict
    x = emailFeatures(processEmail(readFile(filename)))
    p = svmPredict(model, x.ravel())

    print('\nProcessed %s\n\nSpam Classification: %d' % (filename, p))
    print('(1 indicates spam, 0 indicates not spam)\n')