import pickle
import sys

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

sys.path.append("../tools/")
from dos2unix import pkl_formatting


def preprocess(words_file="../tools/word_data_unix.pkl", authors_file="../tools/email_authors.pkl"):
    """ 
        this function takes a pre-made list of email texts (by default word_data_unix.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nicely with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    original_word_file = words_file.rsplit("_unix.pkl", 1)[0] + ".pkl"
    pkl_formatting(original_word_file)

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "rb") as words_file_handler:
        word_data = pickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = train_test_split(
        word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print("no. of Chris training emails:", sum(labels_train))
    print("no. of Sara training emails:", len(labels_train)-sum(labels_train))
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test
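
### A minimal usage sketch, assuming the default pickle files exist under
### ../tools/ as in the course repo:
if __name__ == "__main__":
    features_train, features_test, labels_train, labels_test = preprocess()
    print("training features shape:", features_train.shape)
    print("testing features shape:", features_test.shape)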
Example #2
    # 'email_address',
    'from_poi_to_this_person',
    'from_messages',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',

    ### Custom Features (Features I created)
    'from_poi_ratio',
    'to_poi_ratio',
]

### Load the dictionary containing the dataset
dataset_file_original = "final_project_dataset.pkl"
pkl_formatting(dataset_file_original)
dataset_file_unix_format = "final_project_dataset_unix.pkl"
with open(dataset_file_unix_format, "rb") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
data_dict.pop("TOTAL", 0)

### Task 3: Create new feature(s)
for person in data_dict:
    if data_dict[person]['from_poi_to_this_person'] == 'NaN' or data_dict[
            person]['from_messages'] == 'NaN':
        data_dict[person]['from_poi_ratio'] = 0
    else:
        data_dict[person]['from_poi_ratio'] = float(
            data_dict[person]['from_poi_to_this_person']) / float(
                data_dict[person]['from_messages'])
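
### the features list also names 'to_poi_ratio'; a sketch of the analogous
### computation (the 'to_messages' field is assumed here -- it exists in the
### Enron dataset but is not shown in this snippet):
for person in data_dict:
    if data_dict[person]['from_this_person_to_poi'] == 'NaN' or data_dict[
            person]['to_messages'] == 'NaN':
        data_dict[person]['to_poi_ratio'] = 0
    else:
        data_dict[person]['to_poi_ratio'] = float(
            data_dict[person]['from_this_person_to_poi']) / float(
                data_dict[person]['to_messages'])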
Example #3
import numpy as np

def featureFormat(dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys=False):
    """ convert dictionary to numpy array of features
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).
        NOTE: first feature is assumed to be 'poi' and is not checked for
            removal for zero or missing values.
    """


    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        try:
            keys = pickle.load(open(sort_keys, "rb"))
        except pickle.UnpicklingError:
            import sys
            sys.path.append("../tools/")
            from dos2unix import pkl_formatting
            pkl_formatting(sort_keys)
            modified_sort_keys = sort_keys.rsplit('.pkl', 1)[0] + '_unix.pkl'
            keys = pickle.load(open(modified_sort_keys, "rb"))
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            if feature not in dictionary[key]:
                print("error: key ", feature, " not present")
                return
            value = dictionary[key][feature]
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))

        # Logic for deciding whether or not to add the data point.
        append = True
        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = False
            for item in test_list:
                if item != 0 and item != "NaN":
                    append = True
                    break
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes:
            if 0 in test_list or "NaN" in test_list:
                append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append(np.array(tmp_list))

    return np.array(return_list)
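
### A minimal usage sketch on a hypothetical toy dict (not the Enron data),
### showing the 'poi'-first convention and the default all-zeroes removal:
if __name__ == "__main__":
    toy = {
        "PERSON A": {"poi": 1, "salary": 100.0, "bonus": 50.0},
        "PERSON B": {"poi": 0, "salary": "NaN", "bonus": 0.0},
    }
    print(featureFormat(toy, ["poi", "salary", "bonus"], sort_keys=True))
    # PERSON B is dropped: every non-poi feature is zero after NaN conversion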
Example #4
#!/usr/bin/python

# For compatibility between python 2 and 3
from __future__ import print_function

import pickle
import sys
import matplotlib.pyplot as plt
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from dos2unix import pkl_formatting

### read in data dictionary, convert to numpy array
data_dict_file_original = "../final_project/final_project_dataset.pkl"
pkl_formatting(data_dict_file_original)
data_dict_file_unix_format = "../final_project/final_project_dataset_unix.pkl"

data_dict = pickle.load(open(data_dict_file_unix_format, "rb"))
data_dict.pop("TOTAL", 0)
target_feature = "salary"
input_feature = "bonus"
features = [target_feature, input_feature]
data = featureFormat(data_dict, features)

### your code below
for point in data:
    salary = point[0]
    bonus = point[1]
    plt.scatter(salary, bonus)
plt.xlabel("salary")
    {features_dict} is a dictionary of features associated with that person.
    You should explore features_dict as part of the mini-project,
    but here's an example to get you started:

    enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
"""

import pickle
import sys
sys.path.append("../tools/")
from dos2unix import pkl_formatting
from feature_format import featureFormat, targetFeatureSplit

enron_file = "../final_project/final_project_dataset_unix.pkl"
original_enron_file = "../final_project/final_project_dataset.pkl"
pkl_formatting(original_enron_file)

enron_data = pickle.load(open(enron_file, "rb"))

print("Number of data points :", len(enron_data))
search_keyword = "fastow".upper()
print("Search for", search_keyword, ":",
      [name for name in enron_data if search_keyword in name])
print()
print("Number of features :", len(next(iter(enron_data.values()))))
print("Features :", list(next(iter(enron_data.values())).keys()))
print()
print("Number of people of interest :",
      sum(feature['poi'] == 1 for feature in enron_data.values()))
print()
with open("../final_project/poi_names.txt") as f:
    (why modified?  we've removed some trouble points
    that you'll find yourself in the outliers mini-project).

    Draws a little scatterplot of the training/testing data

    You fill in the regression code where indicated:
"""

import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from dos2unix import pkl_formatting
final_project_dataset_modified_original = "../final_project/final_project_dataset_modified.pkl"
final_project_dataset_modified_unix_format = "../final_project/final_project_dataset_modified_unix.pkl"
pkl_formatting(final_project_dataset_modified_original)
dictionary = pickle.load(open(final_project_dataset_modified_unix_format,
                              "rb"))

### list the features you want to look at--first item in the
### list will be the "target" feature
input_feature = "salary"
target_feature = "bonus"
features_list = [target_feature, input_feature]
data = featureFormat(dictionary,
                     features_list,
                     remove_any_zeroes=True,
                     sort_keys='../tools/python2_lesson06_keys.pkl')
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
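### a minimal sketch of that split (test_size and random_state here are
### assumptions; the original snippet is cut off at this point):
from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)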
Example #7
from __future__ import print_function

import random
import numpy
import matplotlib.pyplot as plt
import pickle

from outlier_cleaner import outlierCleaner

import sys
sys.path.append("../tools/")
from dos2unix import pkl_formatting

### load up some practice data with outliers in it
ages_file_original = "practice_outliers_ages.pkl"
pkl_formatting(ages_file_original)
ages_file_unix_format = "practice_outliers_ages_unix.pkl"

net_worths_file_original = "practice_outliers_net_worths.pkl"
pkl_formatting(net_worths_file_original)
net_worths_file_unix_format = "practice_outliers_net_worths_unix.pkl"

ages = pickle.load(open(ages_file_unix_format, "rb"))
net_worths = pickle.load(open(net_worths_file_unix_format, "rb"))

### ages and net_worths need to be reshaped into 2D numpy arrays
### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
### by convention, n_rows is the number of data points
### and n_columns is the number of features
ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))
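
### with both arrays shaped (n_points, 1) they drop straight into sklearn;
### a minimal continuation sketch (the split parameters and regression fit
### are assumptions -- the original snippet ends at the reshape):
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(
    ages, net_worths, test_size=0.1, random_state=42)
reg = LinearRegression().fit(ages_train, net_worths_train)
print("slope:", reg.coef_[0][0])
print("test r^2:", reg.score(ages_test, net_worths_test))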