def data_info(enron_data):
    num_people = len(enron_data)
    print("Size of Enron dataset: " + str(num_people))
    # Every person should have the same number of features; deduplicate the
    # per-person counts to confirm that.
    num_features = [len(v) for v in enron_data.itervalues()]
    num_features = list(dict.fromkeys(num_features))
    print("Total features in Enron dataset: " + str(num_features))
    features_list = next(enron_data.itervalues()).keys()
    print("Features:")
    for f in features_list:
        print("\t" + f)
    poi_list = poiEmails()
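# A minimal usage sketch for data_info (the dataset path follows the ud120
# layout and is an assumption; data_info only prints, so there is nothing to
# capture):
import pickle

enron_data = pickle.load(
    open("../final_project/final_project_dataset.pkl", "r"))
data_info(enron_data)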
def poiFlagEmail(f):
    """ given an email file f, return a trio of booleans for whether that email
        is to, from, or cc'ing a poi """
    import sys
    sys.path.append("../final_project/")
    from poi_email_addresses import poiEmails

    to_emails, from_emails, cc_emails = getToFromStrings(f)

    ### poiEmails() returns a list of all POIs' email addresses.
    poi_email_list = poiEmails()

    to_poi = False
    from_poi = False
    cc_poi = False

    ### to_poi and cc_poi are boolean variables which flag whether the email
    ### under inspection is addressed to a POI, or if a POI is in cc,
    ### respectively. You don't have to change this code at all.

    ### There can be many "to" emails, but only one "from", so the
    ### "to" processing needs to be a little more complicated
    if to_emails:
        ctr = 0
        while not to_poi and ctr < len(to_emails):
            if to_emails[ctr] in poi_email_list:
                to_poi = True
            ctr += 1

    if cc_emails:
        ctr = 0
        while not cc_poi and ctr < len(cc_emails):
            if cc_emails[ctr] in poi_email_list:
                cc_poi = True
            ctr += 1

    #################################
    ######## your code below ########
    ### set from_poi to True if #####
    ### the email is from a POI #####
    #################################
    if from_emails:
        ctr = 0
        while not from_poi and ctr < len(from_emails):
            if from_emails[ctr] in poi_email_list:
                from_poi = True
            ctr += 1
    #################################

    return to_poi, from_poi, cc_poi
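# A hedged usage sketch: tally the to/from/cc flags over a handful of
# messages. The file paths below are hypothetical placeholders, and
# getToFromStrings is assumed to be defined in the same module, as in the
# ud120 starter code.
email_file_list = ["path/to/email_1.txt", "path/to/email_2.txt"]  # hypothetical
to_count, from_count, cc_count = 0, 0, 0
for path in email_file_list:
    f = open(path, "r")
    to_poi, from_poi, cc_poi = poiFlagEmail(f)
    f.close()
    to_count += int(to_poi)
    from_count += int(from_poi)
    cc_count += int(cc_poi)
print to_count, from_count, cc_count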
def text_classifier(data_dict):
    '''
    This is the main driver function; it calls out to the helpers in the
    dependencies folder. It first builds a dict covering the emails of every
    POI and non-POI, then iterates through that dict and feeds the result to
    email_open, which returns a list of email paths. email_process turns those
    paths into a list of emails, which is finally fed to a vectorizing
    function that stems the words and returns them for classification. The
    function then classifies the list of emails and evaluates the result with
    several metrics.
    '''
    email_dict = poiEmails()
    for key in email_dict:
        email_dict[key] = {"email": email_dict[key], "poi": True}
    for key in data_dict:
        if key in email_dict:
            continue
        else:
            email_dict[key] = {
                "email": [data_dict[key]["email_address"]],
                "poi": data_dict[key]["poi"]
            }
    # Sanity check: every entry's "email" field should be iterable.
    for key in email_dict:
        try:
            for i in email_dict[key]["email"]:
                pass
        except TypeError:
            print key

    email_list, poi_list = email_open(email_dict)
    word_list = email_process(email_list)
    features_train, features_test, labels_train, labels_test = email_vectorizer(
        word_list, poi_list)

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    predict = list(clf.predict(features_test))
    print "Number of POI emails in test set: " + str(predict.count(1.))
    print "Number of emails in test set: " + str(len(features_test))
    print "Number of real POI emails in test set: " + str(labels_test.count(1))
    print "Precision is: " + str(metrics.precision_score(labels_test, predict))
    print "Recall is: " + str(metrics.recall_score(labels_test, predict))
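# Hypothetical driver for text_classifier, assuming the ud120 layout (the
# dataset path is an assumption) and that the helpers used above are importable.
import pickle

data_dict = pickle.load(
    open("../final_project/final_project_dataset.pkl", "r"))
text_classifier(data_dict)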
def process_text_learning_features():
    poi_emails = poiEmails()
    _all = False
    data_dict = pickle.load(open("final_project_dataset.pkl", "r"))
    emls = []
    eml_key = {}
    for key in data_dict.keys():
        emls.append(data_dict[key]['email_address'])
        eml_key[data_dict[key]['email_address']] = key

    author_dict = get_text_from_emails(emls, eml_key, poi_emails, data_dict,
                                       from_pkl=True, combined=_all)
    print 'Got the Author Text Dictionary!!!'
    #look_for_common_words(author_dict, 'body', data_dict)

    subj_scrub = []
    body_scrub = ['ddelainnsf', 'delainey', 'regard', 'david', 'houect',
                  'delaineyhouect', 'christoph', 'jeff', 'product', 'kitchen',
                  'jdasovicnsf', 'allegheni', 'pastoria', 'jdasovicnsf',
                  'neuner', 'jacobi', 'catalytica', 'calpin', 'ect', 'dave',
                  'tim', 'ena', '62602pst', 'belden', 'guy', 'chris', 'calger',
                  'valu', 'salisburi', 'swerzbin', 'kelli', 'paula', 'motley',
                  'chapman', 'johnson', 'frevertna', 'presto', 'ben', 'ray',
                  'janet', 'wes', 'dietrich', 'deal', 'holden', 'kay', 'floor',
                  'thxs', 'portland', 'manag', 'plan', 'turbin', 'enron',
                  'board', 'meet', 'forward', 'year', 'term', 'pdx', 'market',
                  'goal', 'lavoratocorp', '2702pst', 'desk', 'unit', 'discuss',
                  'mid', '2000', 'kellycom', '7138532534', '7138536485',
                  'execut', 'power', 'cost', 'busi', 'complet', 'ensur',
                  'howev', 'provid', 'sheet', 'short', 'right', 'structur',
                  'trade', 'organ', 'peopl', 'jskillinpst', 'corp', 'generat',
                  'kevin', 'dash', 'rob', 'view', 'sale', 'need']
    #body_scrub = ['ddelainnsf','delaineyhouect','hang','court','advis','delainey','david'
    #              'ect','0921','ray','todaytonight','lavoratoenron','let']
    #body_scrub = ['pastoria','calpin','amita','ecc','turbin','las','calgerpdxectect','calgerpdxect','allegheni',
    #              'dwr','catalytica','cdwr','2mm','qf','02062001','vega','creditworthi','psco','calger',
    #              '5mm','jacobi','erc','01262000','7mm','10mm','jdasovicnsf','pcg','parquet','goldendal',
    #              'ae','eix','neuner','4mm','5034643735','helpdesk','christoph','louis','product','kitchen',
    #              'christoph','jeff','david','delainey','ben']

    s_vectorizer, scrubs_subj, recur, workset = vectorize_text_first(
        author_dict, 'subject', subj_scrub, test_size=0.4, all_together=_all)
    store_results(workset, author_dict, 'text_subj_score')
    b_vectorizer, scrubs_body, recur, workset = vectorize_text_first(
        author_dict, 'body', body_scrub, test_size=0.4, all_together=_all)
    store_results(workset, author_dict, 'text_score')
    print scrubs_subj
    print scrubs_body
    pickle.dump({'all': _all, 'dict': author_dict}, open('author_dict.pkl', "w"))
# print "lenron_data=", enron_data print "len(enron_data)=", len(enron_data) print "# of features=", len(enron_data.items()[0][1]) cnt = 1 for f in enron_data.items()[0][1]: print cnt, "-", f cnt += 1 import sys sys.path.append("../final_project/") from poi_email_addresses import poiEmails emails = poiEmails() cnt = 0 tscnt = 0 ptcnt = 0 ecnt = 0 scnt = 0 mcnt = 0 pcnt = 0 for i in enron_data.items(): if enron_data[i[0]]["poi"] == True: cnt += 1 if enron_data[i[0]]["total_payments"] == "NaN": ptcnt +=1 if enron_data[i[0]]["total_stock_value"] == "NaN": tscnt +=1
print len(enron_data.keys())
print len(enron_data[enron_data.keys()[0]].keys())

poi_count = 0
for person in enron_data:
    if enron_data[person]['poi'] == True:
        poi_count += 1
print poi_count

from poi_email_addresses import poiEmails
print len(poiEmails())

print enron_data['PRENTICE JAMES']['total_stock_value']
print enron_data['COLWELL WESLEY']['from_this_person_to_poi']
print enron_data['SKILLING JEFFREY K']['exercised_stock_options']
print enron_data['FASTOW ANDREW S']['total_payments']
print enron_data['LAY KENNETH L']['total_payments']
print enron_data['SKILLING JEFFREY K']['total_payments']

salary_count = 0
email_address_count = 0
for person in enron_data:
count = 0
for (key, val) in enron_data.items():
    if (val['poi']):
        count = count + 1
print 'Number of poi: ', count

# avoid the loop and use a lambda instead
newDict = dict(filter(lambda (key, val): val['poi'], enron_data.items()))
count = len(newDict)
print 'Number of poi: ', count

import sys
sys.path.append('../final_project/')
from poi_email_addresses import poiEmails
print '# of Poi emails: ', len(poiEmails())

names = open("../final_project/poi_names.txt", "r")
# the first two lines are a source URL and a blank line, not names
print "Number of names ", len(names.readlines()[2:])

print "James Prentice, total stock value: ", enron_data['PRENTICE JAMES'][
    'total_stock_value']
# find the keys
print enron_data['PRENTICE JAMES'].keys()
print "Email from this to poi: ", enron_data['COLWELL WESLEY'][
    'from_this_person_to_poi']
print enron_data['SKILLING JEFFREY K']
enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000 """ import sys import pickle import pandas as pd import numpy sys.path.append("../final_project/") from poi_email_addresses import poiEmails enron_data = pickle.load( open("../final_project/final_project_dataset.pkl", "rb")) x = enron_data["ALLEN PHILLIP K"] y = poiEmails() jp = enron_data["PRENTICE JAMES"]["total_stock_value"] wc = enron_data["COLWELL WESLEY"]["from_this_person_to_poi"] jks = enron_data["SKILLING JEFFREY K"]["exercised_stock_options"] names = ["SKILLING JEFFREY K", "LAY KENNETH L", "FASTOW ANDREW S"] for n in names: earnings = enron_data[n]["total_payments"] print(earnings) enron_pd = pd.DataFrame.from_dict(enron_data) enron_pd = enron_pd.T
def getNumberOfPersonsOfInterestsInTxt():
    f = open("../final_project/poi_names.txt", "r")
    lines = f.readlines()
    f.close()
    # The first two lines of poi_names.txt are a source URL and a blank line;
    # every remaining line is one POI name.
    return len(lines[2:])
        salry_cnt += 1
    if enron_data[x]['email_address'] != 'NaN':
        email_cnt += 1

# In[49]:

print(salry_cnt)
print(email_cnt)

# In[40]:

poiEmails_list = poiEmails()
poiEmails_list

# In[43]:

len(poiEmails_list)

# In[45]:

i = 0
lenth = 0
for x in enron_data:
def dump_email_data(data_dict):
    directory = "emails_by_address/"
    counter = 0
    word_data = {}

    # First pass: the known POI addresses.
    ls = poiEmails()
    for key in ls:
        email = ls[key]
        path1 = directory + "from_" + email + ".txt"
        path2 = directory + "to_" + email + ".txt"
        words = ""
        try:
            f1 = open(path1, "r")
            ls1 = f1.readlines()
            f1.close()
            for path in ls1:
                path = "../" + path[:-1]
                f2 = open(path, "r")
                words = words + " " + (parseOutText(f2))
                f2.close()
        except Exception:
            pass
        try:
            f1 = open(path2, "r")
            ls1 = f1.readlines()
            f1.close()
            for path in ls1:
                path = "../" + path[:-1]
                f2 = open(path, "r")
                words = words + " " + (parseOutText(f2))
                f2.close()
        except Exception:
            pass
        if words != "":
            if key in word_data:
                word_data[key] = word_data[key] + " " + words
            else:
                word_data[key] = words
        del words

    # Second pass: every address in the financial dataset.
    for key in data_dict:
        email = data_dict[key]['email_address']
        path1 = directory + "from_" + email + ".txt"
        path2 = directory + "to_" + email + ".txt"
        words = ""
        try:
            f1 = open(path1, "r")
            ls1 = f1.readlines()
            f1.close()
            for path in ls1:
                path = "../" + path[:-1]
                f2 = open(path, "r")
                words = words + " " + (parseOutText(f2))
                f2.close()
        except Exception:
            pass
        try:
            f1 = open(path2, "r")
            ls1 = f1.readlines()
            f1.close()
            for path in ls1:
                path = "../" + path[:-1]
                f2 = open(path, "r")
                words = words + " " + (parseOutText(f2))
                f2.close()
        except Exception:
            pass
        if words != "":
            if key in word_data:
                word_data[key] = word_data[key] + " " + words
            else:
                word_data[key] = words
        del words
        counter += 1
        print counter

    pickle_out = open("email_data.pkl", "wb")
    pickle.dump(word_data, pickle_out)
    pickle_out.close()
    print "\nSuccess!\nEmail Data Fetched."
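# Hedged follow-up: reload the pickled word data written above and check what
# was captured (email_data.pkl is the file dump_email_data produces).
import pickle

pickle_in = open("email_data.pkl", "rb")
word_data = pickle.load(pickle_in)
pickle_in.close()
print len(word_data), "people with non-empty email text"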
The dataset has the form:
enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }

{features_dict} is a dictionary of features associated with that person.
You should explore features_dict as part of the mini-project,
but here's an example to get you started:

enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000

"""

import pickle
import math
from poi_email_addresses import poiEmails

knownEmails = poiEmails()
enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "r"))

print "Number of People:", len(enron_data)
print "Number of Features:", len(enron_data["SKILLING JEFFREY K"])
print "Available features:", enron_data["SKILLING JEFFREY K"].keys()

poiNum = 0
peopleWithSalary = 0
peopleWithKnownEmail = 0
peopleWithTotalPayments = 0
poiWithTotalPayments = 0
for people in enron_data:
    if (enron_data[people]["poi"] == 1):
        poiNum += 1
    if (enron_data[people]["salary"] != "NaN"):
        peopleWithSalary += 1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from parse_out_email_text import parseOutText
from poi_email_addresses import poiEmails
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
from collections import defaultdict

# In[3]:

poi_emails = poiEmails()
from_emails_folder = r'C:\Users\jcsmi329\Documents\Homework\Udacity\Projects\Machine Learning\ud120-projects-master\final_project\emails_by_address'
base_folder = r'C:\Users\jcsmi329\Documents\Homework\Udacity\Projects\Machine Learning\ud120-projects-master'
word_data = defaultdict(list)
final_project_emails = []
email_names = set()

### Load the dictionary containing the dataset
with open(
        r"C:\Users\jcsmi329\Documents\Homework\Udacity\Projects\Machine Learning\final_project\final_project_dataset.pkl",
        "rb") as data_file:
    data_dict = pickle.load(data_file)

for x in data_dict.values():
#### parameters to be tuned #########
debug_mode = True
debug_counter = 200
person_filter_percentage = 0
nonpoi_filter_percentage = 60
poi_filter_percentage = 40
email_feature_percentage = 10
n_pca_components = 40
n_kfolds = 10
######################################

addr_list = []
label_list = []
email_list = []
poiEmailList = poiEmails()
path = 'emails_by_address'
t0 = time()

# generate label_list and email_list for training
for file_name in os.listdir(path):
    # for debug run only
    if debug_mode:
        print "debug_counter = ", debug_counter
        debug_counter -= 1
        if (debug_counter < 0):
            break
    # go to each person's folder, get his label, and the path to his emails
    try:
        ToOrFrom, email_addr = file_name.split('_', 1)
""" #%% import pickle import sys sys.path.append("../final_project/") from poi_email_addresses import poiEmails enron_data = pickle.load( open("../final_project/final_project_dataset_unix.pkl", "rb")) poi_count = {k: v for k, v in enron_data.items() if v['poi'] == 1} print('POI Count: ', len(poi_count)) print('Email Count: ', len(poiEmails())) print( f"James Prentice stock value: ${enron_data['PRENTICE JAMES']['total_stock_value']}" ) print( f"Skilling profits: ${enron_data['SKILLING JEFFREY K']['total_payments']}") print(f"Lay profits: ${enron_data['LAY KENNETH L']['total_payments']}") print(f"Fastow profits: ${enron_data['FASTOW ANDREW S']['total_payments']}") with_salary = {k: v for k, v in enron_data.items() if v['salary'] != 'NaN'} print('Employees w/ quantified salary: ', len(with_salary)) with_email = { k: v
import sys
sys.path.append("../final_project/")
from poi_email_addresses import poiEmails

enron_data = pickle.load(
    open("../final_project/final_project_dataset.pkl", "rb"))

# poi_names.txt is plain text, so open it in text mode
f = open("../final_project/poi_names.txt", "r")
poi_names = []
poi_lines = f.readlines()

print(enron_data["SKILLING JEFFREY K"]["bonus"])
print(len(enron_data))
print(list(enron_data.keys()))
print(len(enron_data["SKILLING JEFFREY K"].keys()))

pois = [x for x, y in enron_data.items() if y['poi']]
print('Number of POI\'s: {0}'.format(len(pois)))

email_lst = poiEmails()
print(email_lst)

for line in poi_lines:
    line = line.rstrip()
    print(line)

print(enron_data['PRENTICE JAMES']['total_stock_value'])
print(enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
print(enron_data['SKILLING JEFFREY K']['exercised_stock_options'])
print(enron_data['SKILLING JEFFREY K']['total_payments'])
print(enron_data['FASTOW ANDREW S']['total_payments'])
print(enron_data['LAY KENNETH L']['total_payments'])
print(enron_data)

count_salary = 0
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary']  # You will need to use more features

### Load the dictionary containing the dataset
with open("../final_project/final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

print(len(data_dict))
poi_email_list = poiEmails()  # call once instead of once per person
print(len(poi_email_list))

poi_email_in_data = []
all_email_in_data = []
for name, features in data_dict.iteritems():
    print(name)
    email = features['email_address']
    all_email_in_data.append(email)
    if (features['poi']):
        if (email in poi_email_list):
            poi_email_in_data.append(email)
            #print(email)
            #print('1: poi in poi_email_list')
        else:
            print('2: poi not in poi_email_list')
salary_exist_count = 0
email_exist_count = 0
none_count = 0
true_count = 0
true_contain_count = 0
false_count = 0
poi_email_list = poiEmails()  # call once instead of once per person
for name in enron_data:
    row = enron_data[name]
    if row['salary'] != 'NaN':
        salary_exist_count += 1
    if row['email_address'] != 'NaN':
        email_exist_count += 1
    if row['poi'] is None:
        none_count += 1
    elif row['poi']:
        if row['email_address'] in poi_email_list:
            true_contain_count += 1
        true_count += 1
    else:
        false_count += 1

print("none_count:{}, true_count:{}, false_count:{}, total_count:{}".format(
    none_count, true_count, false_count,
    none_count + true_count + false_count))
print("true_contain_count:{}".format(true_contain_count))
print(enron_data['PRENTICE JAMES']['total_stock_value'])
print(enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
print(enron_data['SKILLING JEFFREY K']['exercised_stock_options'])
# print(enron_data.keys())
        'loan_advances'] >= 1000000:
    print key, value['long_term_incentive'], value['loan_advances']
print(
    "******** End Possible Outliers and POIs - ( Long Term Incentive & loan Advance) **********"
)

# REMOVING THE OUTLIER
data_dict.pop("TOTAL", 0)

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
import os
from poi_email_addresses import poiEmails

EMAILS = "./emails_by_address/"
POI_EMAILS = poiEmails()


def process_emails():
    '''
    extract the filename of each file in the emails_by_address folder and
    pass it to the process_email method
    '''
    for _, _, files in os.walk(EMAILS):
        for filename in files:
            process_email(filename)


def process_email(email):
    '''
    Extracts the address from the file title, determines whether this person
    was the sender or a recipient, then opens the file and counts how many
    of that person's emails were deleted.
    '''
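# The excerpt cuts off before the body of process_email. A minimal sketch of
# what the docstring describes might look like the following; the filename
# format ("from_<address>.txt" / "to_<address>.txt") matches the
# emails_by_address folder, but the "deleted_items" heuristic is an assumption.
def process_email_sketch(filename):
    direction, address = filename.split("_", 1)
    address = address[:-len(".txt")]
    deleted = 0
    f = open(EMAILS + filename, "r")
    for path in f.readlines():
        # count messages that were filed under a deleted_items folder
        if "deleted_items" in path:
            deleted += 1
    f.close()
    return direction, address, deleted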
{features_dict} is a dictionary of features associated with that person.
You should explore features_dict as part of the mini-project,
but here's an example to get you started:

enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000

"""

import cPickle
import sys
sys.path.append("/Users/shubhamgupta/shubham/udacity_ml/final_project")
from poi_email_addresses import poiEmails
import pdb

enron_data = cPickle.load(
    open("../final_project/final_project_dataset.pkl", "r"))
address_list = poiEmails()

valid_email_count = 0
valid_salary_count = 0
valid_poi = 0
for obj in enron_data.iteritems():
    x, y = obj
    if y.get("email_address") in address_list:
        valid_poi += 1
    if y.get("email_address") != "NaN":
        valid_email_count += 1
    if y.get("salary") != "NaN":
        valid_salary_count += 1

print(len(enron_data))
print(valid_poi)
print(valid_email_count)
with open(line, "r") as text:  # open email message
    msg = email.message_from_file(text)
    # consider names of the email sender and recipient as stopwords
    match1 = re.findall('\w+', msg['X-From'])
    match2 = re.findall('\w+', msg['X-To'])
    # ignore data that does not include a name in 'X-From' and 'X-To'
    if len(match1) >= 2:
        stp_words.append(match1[0])
        stp_words.append(match1[1])
    if len(match2) >= 2:
        stp_words.append(match2[0])
        stp_words.append(match2[1])
    for key, value in msg.items():
        # condition for the email to have been sent to a poi
        if key == 'To':
            if value in poiEmails():
                parsed_text = parseOutText(text)
                # strip each stopword from the parsed text in turn
                for item in stp_words:
                    parsed_text = parsed_text.replace(item, '')
                merge_text.append(parsed_text)
                wrd = ' '.join(merge_text)
                wrd = ''.join(i for i in wrd if not i.isdigit())
                word_data.update({name: wrd})
            else:  #
""" Starter code for exploring the Enron dataset (emails + finances); loads up the dataset (pickled dict of dicts). The dataset has the form: enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict } {features_dict} is a dictionary of features associated with that person. You should explore features_dict as part of the mini-project, but here's an example to get you started: enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000 """ import pickle from poi_email_addresses import poiEmails enron_data = pickle.load( open("../final_project/final_project_dataset.pkl", "rb")) email_addresses = poiEmails() counter = 0 for x in enron_data: if (enron_data[x]["poi"] == 1): counter = counter + 1 print "POIs: {}".format(counter) print "POI Emails: {}".format(len(email_addresses))
from_email = '(?<=from_)' + data_point['email_address']
for i in list_of_emails: