Esempio n. 1
0
def data_info(enron_data):
    # Print headline statistics for the Enron dataset (a dict mapping
    # person name -> feature dict).  NOTE(review): the body appears
    # truncated at the snippet boundary -- `poi_list` is assigned but
    # never used in this view.
    num_people = len(enron_data)
    print("Size of Enron dataset: " + str(num_people))
    num_features = [len(v) for v in enron_data.itervalues()]  # Python 2 dict API
    num_features = list(dict.fromkeys(num_features))  # de-duplicate counts, preserving order
    print("Total features in Enron dataset: " + str(num_features))
    # Feature names taken from an arbitrary person's record.
    features_list = [v for v in enron_data.itervalues()][0].keys()
    print("Features:")
    for f in features_list:
        print("\t" + f)

    poi_list = poiEmails()  # poiEmails is a project helper imported elsewhere
Esempio n. 2
0
def poiFlagEmail(f):
    """Given an open email file f, return a trio of booleans saying
    whether the message is addressed to, sent from, or cc'ing a
    person of interest (POI)."""
    import sys
    sys.path.append("../final_project/")
    from poi_email_addresses import poiEmails

    # Pull the to/from/cc address lists out of the message.
    to_emails, from_emails, cc_emails = getToFromStrings(f)

    ### poi_emails.poiEmails() returns a list of all POIs' email addresses.
    poi_email_list = poiEmails()

    def _any_poi(addresses):
        # True as soon as any address in the (possibly None/empty) list
        # belongs to a POI; False otherwise.
        if not addresses:
            return False
        return any(addr in poi_email_list for addr in addresses)

    # "to" and "cc" may list many recipients; "from" normally has one
    # sender, but the same membership test covers all three cases.
    to_poi = _any_poi(to_emails)
    cc_poi = _any_poi(cc_emails)
    from_poi = _any_poi(from_emails)

    return to_poi, from_poi, cc_poi
Esempio n. 3
0
def text_classifier(data_dict):
    '''
    Main driver for the text-based classification pipeline.

    Builds a dict of email addresses for every POI and non-POI person,
    feeds it to email_open() to collect email file paths, passes those to
    email_process() to obtain message texts, vectorizes/stems the words,
    then fits a DecisionTreeClassifier and prints precision/recall.
    '''
    email_dict = poiEmails()  # project helper; values become the "email" entries below
    for key in email_dict:
        email_dict[key] = {"email": email_dict[key], "poi": True}
    # Add everyone from the dataset who is not already a known POI.
    for key in data_dict:
        if key in email_dict:
            continue
        else:
            email_dict[key] = {
                "email": [data_dict[key]["email_address"]],
                "poi": data_dict[key]["poi"]
            }

    # Sanity check: print the key of any entry whose "email" value is not
    # iterable.  NOTE(review): the bare except hides the real error type and
    # the `None` expression is a no-op (effectively `pass`) -- kept as-is.
    for key in email_dict:
        try:
            for i in email_dict[key]["email"]:
                None
        except:
            print key
    email_list, poi_list = email_open(email_dict)
    word_list = email_process(email_list)
    features_train, features_test, labels_train, labels_test = email_vectorizer(
        word_list, poi_list)

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    predict = list(clf.predict(features_test))

    # Python 2 print statements; class label 1 / 1.0 marks a POI email.
    print "Number of POI emails in test set: " + str(predict.count(1.))
    print "Number of emails in test set: " + str(len(features_test))
    print "Number of real POI emails in test set: " + str(labels_test.count(1))
    print "Precision is: " + str(metrics.precision_score(labels_test, predict))
    print "Recall is: " + str(metrics.recall_score(labels_test, predict))
def process_text_learning_features():
    """Build text-learning features per author from the Enron emails.

    Loads the project dataset, maps each email address back to its person
    key, gathers each author's email text (get_text_from_emails), vectorizes
    subject and body text separately using hand-tuned scrub/stop-word lists,
    stores the resulting scores back on the dataset, and pickles the author
    dictionary.
    """
    poi_emails = poiEmails()
    _all = False  # when True, all emails are processed combined rather than per author
    data_dict = pickle.load(open("final_project_dataset.pkl", "r") )  # Python 2 text-mode pickle
    emls = []
    eml_key = {}
    # Map every email address back to the person key that owns it.
    for key in data_dict.keys():
        emls.append(data_dict[key]['email_address'])
        eml_key[data_dict[key]['email_address']] = key

    author_dict = get_text_from_emails(emls,eml_key,poi_emails, data_dict,from_pkl=True, combined=_all)
    print 'Got the Author Text Dictionary!!!'

    #look_for_common_words(author_dict, 'body', data_dict)
    # Hand-tuned scrub lists: names, abbreviations and boilerplate stems that
    # would otherwise dominate the vectorizer's vocabulary.
    subj_scrub = []
    body_scrub = ['ddelainnsf','delainey','regard','david','houect','delaineyhouect','christoph', 'jeff', 'product', 'kitchen', 
                  'jdasovicnsf','allegheni','pastoria','jdasovicnsf','neuner','jacobi','catalytica','calpin',
                  'ect','dave','tim','ena','62602pst','belden','guy','chris','calger','valu','salisburi',
                  'swerzbin','kelli','paula','motley','chapman','johnson','frevertna','presto','ben','ray','janet','wes','dietrich',
                  'deal','holden','kay','floor','thxs','portland','manag','plan','turbin','enron','board','meet','forward','year',
                  'term','pdx','market','goal','lavoratocorp','2702pst','desk','unit','discuss','mid','2000','kellycom','7138532534',
                  '7138536485','execut','power','cost','busi','complet','ensur','howev','provid','sheet','short','right','structur',
                  'trade','organ','peopl','jskillinpst','corp','generat','kevin','dash','rob','view','sale','need']
    # Earlier scrub-list experiments, kept for reference:
    #body_scrub = ['ddelainnsf','delaineyhouect','hang','court','advis','delainey','david'
    #              'ect','0921','ray','todaytonight','lavoratoenron','let'
    #              ]
    #body_scrub = ['pastoria','calpin','amita','ecc','turbin','las','calgerpdxectect','calgerpdxect','allegheni', 
    #'dwr','catalytica','cdwr','2mm','qf','02062001','vega','creditworthi','psco','calger', 
    #'5mm','jacobi','erc','01262000','7mm','10mm','jdasovicnsf','pcg','parquet','goldendal', 
    #'ae','eix','neuner','4mm','5034643735','helpdesk','christoph','louis', 'product', 'kitchen', 
    #'christoph', 'jeff', 'david', 'delainey', 'ben']
    # Vectorize subjects and bodies independently; store scores on the dataset.
    s_vectorizer, scrubs_subj, recur, workset = vectorize_text_first(author_dict, 'subject',subj_scrub,test_size=0.4, all_together=_all)
    store_results(workset, author_dict, 'text_subj_score')
    b_vectorizer, scrubs_body, recur, workset = vectorize_text_first(author_dict, 'body',body_scrub,test_size=0.4, all_together=_all)
    store_results(workset, author_dict, 'text_score')
    print scrubs_subj
    print scrubs_body
    pickle.dump({'all':_all,'dict':author_dict}, open('author_dict.pkl', "w") )
# --- Exploratory Python 2 script over the Enron dataset (dict of dicts) ---
# print "lenron_data=", enron_data
print "len(enron_data)=", len(enron_data)
# .items() returns a list in Python 2, so [0][1] is one person's feature dict.
print "# of features=", len(enron_data.items()[0][1])

# Enumerate the feature names of the first person's record.
cnt = 1
for f in enron_data.items()[0][1]:
    print cnt, "-", f
    cnt += 1

import sys
sys.path.append("../final_project/")

from poi_email_addresses import poiEmails

emails = poiEmails()

# Counters: cnt = POIs, ptcnt = POIs with missing total_payments,
# tscnt = POIs with missing total_stock_value; the rest stay unused here.
cnt = 0
tscnt = 0
ptcnt = 0
ecnt = 0
scnt = 0
mcnt = 0
pcnt = 0
for i in enron_data.items():
    if enron_data[i[0]]["poi"] == True:
        cnt += 1
        if enron_data[i[0]]["total_payments"] == "NaN":
            ptcnt +=1
        if enron_data[i[0]]["total_stock_value"] == "NaN":
            tscnt +=1
print len(enron_data.keys())

print len(enron_data[enron_data.keys()[0]].keys())

# Count flagged persons of interest.
poi_count = 0
for person in enron_data:
    if enron_data[person]['poi'] == True:
        poi_count += 1
    else:
        pass

print poi_count

from poi_email_addresses import poiEmails

print len(poiEmails())

# Lookups for specific course-quiz questions.
print enron_data['PRENTICE JAMES']['total_stock_value']

print enron_data['COLWELL WESLEY']['from_this_person_to_poi']

print enron_data['SKILLING JEFFREY K']['exercised_stock_options']

print enron_data['FASTOW ANDREW S']['total_payments']
print enron_data['LAY KENNETH L']['total_payments']
print enron_data['SKILLING JEFFREY K']['total_payments']
type(poi_count)

salary_count = 0
email_address_count = 0
for person in enron_data:
for (key, val) in enron_data.items():
    if (val['poi']):
        count = count + 1
print 'Numbe of poi: ', count

#avoid loop and use lambda
newDict = dict(filter(lambda (key, val): val['poi'], enron_data.items()))
count = len(newDict)

print 'Numbe of poi: ', count

import sys
sys.path.append('../final_project/')
from poi_email_addresses import poiEmails

print '# of Poi emails: ', len(poiEmails())

names = open("../final_project/poi_names.txt", "r")
#first three lines are headers
print "Number of names ", len(names.readlines()[2:])

print "James Prince, total stock value: ", enron_data['PRENTICE JAMES'][
    'total_stock_value']

#find the keys
print enron_data['PRENTICE JAMES'].keys()

print "Email from this to poi: ", enron_data['COLWELL WESLEY'][
    'from_this_person_to_poi']

print enron_data['SKILLING JEFFREY K']
Esempio n. 8
0
    enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
    
"""

import sys
import pickle
import pandas as pd
import numpy
sys.path.append("../final_project/")
from poi_email_addresses import poiEmails

# Load the pickled dict-of-dicts Enron dataset (binary mode).
enron_data = pickle.load(
    open("../final_project/final_project_dataset.pkl", "rb"))

x = enron_data["ALLEN PHILLIP K"]  # one sample person record
y = poiEmails()  # known POI email addresses

# Course-quiz lookups.
jp = enron_data["PRENTICE JAMES"]["total_stock_value"]

wc = enron_data["COLWELL WESLEY"]["from_this_person_to_poi"]

jks = enron_data["SKILLING JEFFREY K"]["exercised_stock_options"]

names = ["SKILLING JEFFREY K", "LAY KENNETH L", "FASTOW ANDREW S"]
for n in names:
    earnings = enron_data[n]["total_payments"]
    print(earnings)

# Transpose so each row is a person and columns are features.
enron_pd = pd.DataFrame.from_dict(enron_data)
enron_pd = enron_pd.T
def getNumberOfPersonsOfInterestsInTxt():
    """Return the number of known POI email addresses.

    NOTE(review): despite the name, this returns len(poiEmails()), not the
    count of names in poi_names.txt -- the file is read but its line count
    is discarded.  Confirm which count callers actually want before
    changing the return value.
    """
    # Context manager fixes the original's leaked file handle (the file was
    # opened and never closed).
    with open("../final_project/poi_names.txt", "r") as f:
        lines = f.readlines()  # read but unused in the original -- kept for parity
    return len(poiEmails())
Esempio n. 10
0
       salry_cnt +=  1
    if enron_data[x]['email_address'] !='NaN':
       email_cnt +=  1


# In[49]:


# Counts produced by a loop in an earlier (not shown) notebook cell.
print(salry_cnt)
print(email_cnt)


# In[40]:


poiEmails_list = poiEmails()
poiEmails_list  # notebook cell: display the list


# In[43]:


len(poiEmails_list)  # notebook cell: display the count


# In[45]:


# NOTE(review): the trailing semicolons are redundant and "lenth" is
# presumably a typo for "length" -- left unchanged.
i = 0;
lenth = 0;
for x in enron_data:
Esempio n. 11
0
def dump_email_data(data_dict):

	directory = "emails_by_address/"
	counter = 0
	word_data = {}


	ls = poiEmails()
	for key in ls:
		email = ls[key]
		path1 = directory + "from_" + email + ".txt"
		path2 = directory + "to_" + email + ".txt"
		words = ""
		
		try:
			f1 = open(path1, "r")
			ls1 = f1.readlines()
			f1.close()
			for path in ls1:
				path = "../" + path[:-1]
				f2 = open(path, "r")
				words = words + " " + (parseOutText(f2))
				f2.close()

		except Exception:
			pass

		try:
			f1 = open(path2, "r")
			ls1 = f1.readlines()
			f1.close()
			for path in ls1:
				path = "../" + path[:-1]
				f2 = open(path, "r")
				words = words + " " + (parseOutText(f2))
				f2.close()

		except Exception:
			pass

		if words != "":
			if key in word_data:
				word_data[key] = word_data[key] + " " + words
			else:
				word_data[key] = words
			del words

	for key in data_dict:
		email = data_dict[key]['email_address']
		path1 = directory + "from_" + email + ".txt"
		path2 = directory + "to_" + email + ".txt"
		words = ""

		try:
			f1 = open(path1, "r")
			ls = f1.readlines()
			f1.close()
			for path in ls:
				path = "../" + path[:-1]
				f2 = open(path, "r")
				words = words + " " + (parseOutText(f2))
				f2.close()
		except Exception:
			pass

		try:
			f1 = open(path2, "r")
			ls = f1.readlines()
			f1.close()
			for path in l1:
				path = "../" + path[:-1]
				f2 = open(path, "r")
				words = words + " " + (parseOutText(f2))
				f2.close()
		except Exception:
			pass

		if words != "":
			if key in word_data:
				word_data[key] = word_data[key] + " " + words
			else:
				word_data[key] = words
			del words
		
		counter += 1
		print counter

	pickle_out = open("email_data.pkl", "wb")
	pickle.dump(word_data, pickle_out)
	pickle_out.close()

	print "\nSuccess!\nEmail Data Fetched."
    The dataset has the form:
    enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }

    {features_dict} is a dictionary of features associated with that person.
    You should explore features_dict as part of the mini-project,
    but here's an example to get you started:

    enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000

"""

import pickle
import math
from poi_email_addresses import poiEmails

# Python 2 exploration: load the dataset and print headline statistics.
knownEmails = poiEmails()
enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "r"))
print "Number of People:", len(enron_data)
print "Number of Features:", len(enron_data["SKILLING JEFFREY K"])
# NOTE(review): "Avaliable" is a typo in the runtime output string; left
# unchanged since it is program-emitted text.
print "Avaliable features:", dict.keys(enron_data["SKILLING JEFFREY K"])

# Feature-availability counters; only the first two are filled in this view.
poiNum = 0
peopleWithSalary = 0
peopleWithKnownEmail = 0
peopleWithTotalPayments = 0
poiWithTotalPayments = 0
for people in enron_data:
    if (enron_data[people]["poi"] == 1):
        poiNum+=1
    if (enron_data[people]["salary"] != "NaN"):
        peopleWithSalary+=1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

from parse_out_email_text import parseOutText
from poi_email_addresses import poiEmails

from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

from collections import defaultdict

# In[3]:

poi_emails = poiEmails()  # list of known POI email addresses

# Absolute Windows paths into the course project -- machine specific.
from_emails_folder = r'C:\Users\jcsmi329\Documents\Homework\Udacity\Projects\Machine Learning\ud120-projects-master\final_project\emails_by_address'
base_folder = r'C:\Users\jcsmi329\Documents\Homework\Udacity\Projects\Machine Learning\ud120-projects-master'

word_data = defaultdict(list)  # address/person -> list of parsed email texts
final_project_emails = []
email_names = set()

### Load the dictionary containing the dataset
with open(
        r"C:\Users\jcsmi329\Documents\Homework\Udacity\Projects\Machine Learning\final_project\final_project_dataset.pkl",
        "rb") as data_file:
    data_dict = pickle.load(data_file)

for x in data_dict.values():
Esempio n. 14
0
#### parameters to be tuned #########
debug_mode = True  # when True, only `debug_counter` files are processed
debug_counter = 200
person_filter_percentage = 0
nonpoi_filter_percentage = 60
poi_filter_percentage = 40
email_feature_percentage = 10
n_pca_components = 40
n_kfolds = 10
######################################

# Accumulators for the training corpus built from emails_by_address/.
addr_list = []
label_list = []
email_list = []
poiEmailList = poiEmails()
path = 'emails_by_address'
t0 = time()  # timing start -- assumes `time` was imported earlier in the file

# generate label_list and email_list for training
for file_name in os.listdir(path):
    # for debug run only
    if debug_mode:
        print "debug_counter = ", debug_counter
        debug_counter -= 1
        if (debug_counter < 0):
            break

    # go to each person's folder, get his label, and the path to his emails
    try:
        ToOrFrom, email_addr = file_name.split('_', 1)
Esempio n. 15
0
    
"""
#%%
import pickle
import sys

sys.path.append("../final_project/")
from poi_email_addresses import poiEmails

# Python 3 variant: load the Unix-line-endings dataset in binary mode.
enron_data = pickle.load(
    open("../final_project/final_project_dataset_unix.pkl", "rb"))

# Dict comprehension keeps only flagged persons of interest.
poi_count = {k: v for k, v in enron_data.items() if v['poi'] == 1}
print('POI Count: ', len(poi_count))

print('Email Count: ', len(poiEmails()))

print(
    f"James Prentice stock value: ${enron_data['PRENTICE JAMES']['total_stock_value']}"
)

print(
    f"Skilling profits: ${enron_data['SKILLING JEFFREY K']['total_payments']}")
print(f"Lay profits: ${enron_data['LAY KENNETH L']['total_payments']}")
print(f"Fastow profits: ${enron_data['FASTOW ANDREW S']['total_payments']}")

# People with a quantified salary (the dataset stores missing values as 'NaN').
with_salary = {k: v for k, v in enron_data.items() if v['salary'] != 'NaN'}
print('Employees w/ quantified salary: ', len(with_salary))

with_email = {
    k: v
Esempio n. 16
0
import sys
sys.path.append("../final_project/")
from poi_email_addresses import poiEmails
# Load dataset and the POI names text file.
# NOTE(review): the file handle is never closed, `poi_names` stays empty,
# and opening in "rb" makes poi_lines a list of bytes on Python 3.
enron_data = pickle.load(
    open("../final_project/final_project_dataset.pkl", "rb"))
f = open("../final_project/poi_names.txt", "rb")
poi_names = []
poi_lines = f.readlines()

print(enron_data["SKILLING JEFFREY K"]["bonus"])
print(len(enron_data))
print(list(enron_data.keys()))
print(len(enron_data["SKILLING JEFFREY K"].keys()))
# Persons flagged as POI in the dataset.
pois = [x for x, y in enron_data.items() if y['poi']]
print('Number of POI\'s: {0}'.format(len(pois)))
email_lst = poiEmails()
print(email_lst)
for line in poi_lines:
    line = line.rstrip()
    print(line)

# Course-quiz lookups.
print(enron_data['PRENTICE JAMES']['total_stock_value'])
print(enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
print(enron_data['SKILLING JEFFREY K']['exercised_stock_options'])

print(enron_data['SKILLING JEFFREY K']['total_payments'])
print(enron_data['FASTOW ANDREW S']['total_payments'])
print(enron_data['LAY KENNETH L']['total_payments'])
print(enron_data)

count_salary = 0  # NOTE(review): never used in this snippet
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features

### Load the dictionary containing the dataset
with open("../final_project/final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

print(len(data_dict))

print(len(poiEmails()))

# Cross-check: which POIs in the dataset also appear in the known POI
# email list?  NOTE(review): poiEmails() is re-evaluated inside the loop.
poi_email_in_data=[]
all_email_in_data=[]
for name, features in data_dict.iteritems():
    print(name)
    email = features['email_address']
    all_email_in_data.append(email)

    if(features['poi']):
        if(email in poiEmails()):
            poi_email_in_data.append(email)
            #print(email)
            #print('1: poi in poi_email_list')
        else:
            print('2: poi not in poi_email_list')
# Tally feature availability and POI labels across the dataset.
# BUG FIX(review): `salary_exist_count` was incremented below but never
# initialised in this snippet (NameError on the first person with a
# salary); it is now set to 0 alongside the other counters.
salary_exist_count = 0
email_exist_count = 0
none_count = 0
true_count = 0
true_contain_count = 0
false_count = 0
for name in enron_data:
    row = enron_data[name]
    if row['salary'] != 'NaN':
        salary_exist_count += 1
    if row['email_address'] != 'NaN':
        email_exist_count += 1

    if row['poi'] is None:
        none_count += 1
    elif row['poi']:
        # NOTE(review): poiEmails() is re-evaluated for every POI hit;
        # kept as-is to preserve behaviour, hoist if this mattered.
        if row['email_address'] in poiEmails():
            true_contain_count += 1
        true_count += 1
    else:
        false_count += 1

print("none_count:{}, true_count:{}, false_count:{}, total_count:{}".format(none_count, true_count, false_count,
                                                                            none_count + true_count + false_count))

print("true_contain_count:{}".format(true_contain_count))

# Course-quiz lookups.
print(enron_data['PRENTICE JAMES']['total_stock_value'])
print(enron_data['COLWELL WESLEY']['from_this_person_to_poi'])
print(enron_data['SKILLING JEFFREY K']['exercised_stock_options'])

# print(enron_data.keys())
Esempio n. 19
0
            'loan_advances'] >= 1000000:
        print key, value['long_term_incentive'], value['loan_advances']
print(
    "******** End Possible Outliers and POIs - ( Long Term Incentive & loan Advance) **********"
)

# REMOVING THE OUTLIER
data_dict.pop("TOTAL", 0)  # spreadsheet total row, not a real person

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
import os
from poi_email_addresses import poiEmails
EMAILS = "./emails_by_address/"  # folder of per-address email index files

POI_EMAILS = poiEmails()  # known POI email addresses


def process_emails():
    """Walk the emails_by_address directory tree and hand every file
    name it contains to process_email()."""
    for _dirpath, _subdirs, file_names in os.walk(EMAILS):
        for file_name in file_names:
            process_email(file_name)


def process_email(email):
    ''' Will extract from title and verify if it was a sender or recipient
        It will open the file and count the amount of deleted emails by that person
    '''
    {features_dict} is a dictionary of features associated with that person.
    You should explore features_dict as part of the mini-project,
    but here's an example to get you started:

    enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000

"""

import cPickle
import sys
sys.path.append("/Users/shubhamgupta/shubham/udacity_ml/final_project")
from poi_email_addresses import poiEmails
import pdb
# Python 2 snippet using cPickle; counts POIs (by email match) and people
# with a known email address / salary.
enron_data = cPickle.load(
    open("../final_project/final_project_dataset.pkl", "r"))
address_list = poiEmails()  # known POI email addresses
valid_email_count = 0
valid_salary_count = 0
valid_poi = 0
for obj in enron_data.iteritems():
    x, y = obj  # x = person name, y = feature dict
    if y.get("email_address") in address_list:
        valid_poi += 1
    if y.get("email_address") != "NaN":
        valid_email_count += 1
    if y.get("salary") != "NaN":
        valid_salary_count += 1

print(len(enron_data))
print(valid_poi)
print(valid_email_count)
                         with open(line, "r") as text:
                             #open email message                            
                         
                             msg = email.message_from_file(text)
                             match1 = re.findall('\w+',msg['X-From']) # considering names of email sender
                             match2 = re.findall('\w+',msg['X-To'])   # and recipient as stopwords
                             if len(match1) >= 2:
                                 #condition to ignore data that do not in include name in 'X-From' and 'X-To'
                                 stp_words.append(match1[0])
                                 stp_words.append(match1[1])
                             if len(match2) >=2:
                                 stp_words.append(match2[0])
                                 stp_words.append(match2[1])
                             for key, value in msg.items():  # condition for the email to be send to poi
                                 if key == 'To':
                                     if value in poiEmails():
                                         
                                         parsed_text = parseOutText(text)
                                         for item in stp_words:
                                             text = parsed_text.replace(item, '')
                                         merge_text.append(text)
                                         
                                             
                    
     
     wrd = ' '.join([merge_text[i] for i in range(len(merge_text))])
     wrd = ''.join(i for i in wrd if not i.isdigit())
     word_data.update({name: wrd})
     
 else:
     #
""" 
    Starter code for exploring the Enron dataset (emails + finances);
    loads up the dataset (pickled dict of dicts).

    The dataset has the form:
    enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }

    {features_dict} is a dictionary of features associated with that person.
    You should explore features_dict as part of the mini-project,
    but here's an example to get you started:

    enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
    
"""

import pickle
from poi_email_addresses import poiEmails

enron_data = pickle.load(
    open("../final_project/final_project_dataset.pkl", "rb"))
email_addresses = poiEmails()  # known POI email addresses

counter = 0

# Count persons flagged as POI in the dataset.
for x in enron_data:
    if (enron_data[x]["poi"] == 1):
        counter = counter + 1

print "POIs: {}".format(counter)  # Python 2 print statements
print "POI Emails: {}".format(len(email_addresses))
                                    '\w+', msg['X-From']
                                )  # considering names of email sender
                                match2 = re.findall(
                                    '\w+',
                                    msg['X-To'])  # and recipient as stopwords
                                if len(match1) >= 2:
                                    #condition to ignore data that do not in include name in 'X-From' and 'X-To'
                                    stp_words.append(match1[0])
                                    stp_words.append(match1[1])
                                if len(match2) >= 2:
                                    stp_words.append(match2[0])
                                    stp_words.append(match2[1])
                                for key, value in msg.items(
                                ):  # condition for the email to be send to poi
                                    if key == 'To':
                                        if value in poiEmails():

                                            parsed_text = parseOutText(text)
                                            for item in stp_words:
                                                text = parsed_text.replace(
                                                    item, '')
                                            merge_text.append(text)

        wrd = ' '.join([merge_text[i] for i in range(len(merge_text))])
        wrd = ''.join(i for i in wrd if not i.isdigit())
        word_data.update({name: wrd})

    else:
        #
        from_email = '(?<=from_)' + data_point['email_address']
        for i in list_of_emails: