def preprocess(words_file="../tools/word_data_unix.pkl", authors_file="../tools/email_authors.pkl"):
    """
    Turn the pickled email texts and author labels into train/test arrays.

    Takes a pre-made list of email texts (by default word_data_unix.pkl)
    and the corresponding authors (by default email_authors.pkl) and
    performs a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into a tf-idf matrix
        -- selects/keeps the most helpful features (top 1 percentile)

    After this, the features are converted to dense numpy arrays, which
    play nicely with sklearn functions.

    4 objects are returned, in this order:
        -- training features, testing features
        -- training labels, testing labels
    """
    # Derive the name of the original pickle ("..._unix.pkl" -> "....pkl") and
    # run it through pkl_formatting first.
    # NOTE(review): pkl_formatting is defined elsewhere; presumably it writes
    # the "*_unix.pkl" copy that is loaded below -- confirm.
    original_word_file = words_file.rsplit("_unix.pkl", 1)[0] + ".pkl"
    pkl_formatting(original_word_file)

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    # `with` guarantees the handles are closed even if unpickling fails.
    # NOTE(review): pickle.load must only be used on trusted files.
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    # Use the same `pickle` module as above (the original called `cPickle`
    # here, which does not exist on Python 3).
    with open(words_file, "rb") as words_file_handler:
        word_data = pickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    # NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
    # sklearn.model_selection.train_test_split is its replacement. Switching
    # requires changing the file-level import, which is outside this block.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    # Fit the vocabulary on the training set only, then reuse it for testing.
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    # The counts below rely on the labels being 0/1 so that sum() counts the 1s.
    print("no. of Chris training emails:", sum(labels_train))
    print("no. of Sara training emails:", len(labels_train)-sum(labels_train))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
# , 'email_address' , 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi' ### Custom Features (Features I created) , 'from_poi_ratio', 'to_poi_ratio' ] ### Load the dictionary containing the dataset dataset_file_original = "final_project_dataset.pkl" pkl_formatting(dataset_file_original) dataset_file_unix_format = "final_project_dataset_unix.pkl" with open(dataset_file_unix_format, "rb") as data_file: data_dict = pickle.load(data_file) ### Task 2: Remove outliers data_dict.pop("TOTAL", 0) ### Task 3: Create new feature(s) for person in data_dict: if data_dict[person]['from_poi_to_this_person'] == 'NaN' or data_dict[ person]['from_messages'] == 'NaN': data_dict[person]['from_poi_ratio'] = 0 else: data_dict[person]['from_poi_ratio'] = float( data_dict[person]['from_poi_to_this_person']) / float(
def featureFormat(dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys=False):
    """ Convert a dictionary of per-key feature dictionaries to a numpy array.

        dictionary: maps each key to a dict of feature name -> value.
        features: list of feature names to extract, in output column order.
        remove_NaN = True will convert "NaN" string to 0.0
        remove_all_zeroes = True will omit any data points for which
            all the features you seek are 0.0
        remove_any_zeroes = True will omit any data points for which
            any of the features you seek are 0.0
        sort_keys = True sorts keys by alphabetical order. Setting the value as
            a string opens the corresponding pickle file with a preset key
            order (this is used for Python 3 compatibility, and sort_keys
            should be left as False for the course mini-projects).

        Returns a 2D numpy array with one row per retained key, or None
        (after printing an error) if a requested key/feature is missing.

        NOTE: first feature is assumed to be 'poi' and is not checked for
            removal for zero or missing values.
        NOTE: with remove_NaN=False, "NaN" strings become float nan, which
            compares unequal to 0 and therefore counts as a non-zero value
            for both zero filters.
    """
    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        try:
            # `with` closes the handle even when unpickling fails.
            with open(sort_keys, "rb") as key_file:
                keys = pickle.load(key_file)
        except pickle.UnpicklingError:
            # Fall back to a reformatted "*_unix.pkl" copy of the key file.
            import sys
            sys.path.append("../tools/")
            from dos2unix import pkl_formatting
            pkl_formatting(sort_keys)
            modified_sort_keys = sort_keys.rsplit('.pkl', 1)[0] + '_unix.pkl'
            with open(modified_sort_keys, "rb") as key_file:
                keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    # The 'poi' class label (when it is the first feature) is excluded from
    # the zero checks below.
    skip_first = bool(features) and features[0] == 'poi'

    return_list = []
    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                print("error: key ", feature, " not present")
                return None
            if value == "NaN" and remove_NaN:
                value = 0
            tmp_list.append(float(value))

        # Logic for deciding whether or not to add the data point.
        test_list = tmp_list[1:] if skip_first else tmp_list
        append = True

        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = any(item != 0 for item in test_list)

        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes and 0 in test_list:
            append = False

        ### Append the data point if flagged for addition.
        if append:
            return_list.append(np.array(tmp_list))

    return np.array(return_list)
#!/usr/bin/python

# Scatter-plot script: loads the final-project dataset dictionary, extracts
# the "salary" and "bonus" features and plots one against the other.

# For compatibility between python 2 and 3
from __future__ import print_function

import pickle
import sys
import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from dos2unix import pkl_formatting

### read in data dictionary, convert to numpy array
data_dict_file_original = "../final_project/final_project_dataset.pkl"
# NOTE(review): pkl_formatting presumably writes the "*_unix.pkl" copy that is
# loaded below -- confirm against ../tools/dos2unix.py.
pkl_formatting(data_dict_file_original)
data_dict_file_unix_format = "../final_project/final_project_dataset_unix.pkl"
# Use a context manager so the file handle is closed after loading.
with open(data_dict_file_unix_format, "rb") as data_file:
    data_dict = pickle.load(data_file)

# Drop the "TOTAL" entry if present (the default of 0 avoids a KeyError).
data_dict.pop("TOTAL", 0)

target_feature = "salary"
input_feature = "bonus"
features = [target_feature, input_feature]
data = featureFormat(data_dict, features)

### your code below
plt = matplotlib.pyplot
# Column 0 is salary and column 1 is bonus, matching the order of `features`.
for point in data:
    salary = point[0]
    bonus = point[1]
    plt.scatter(salary, bonus)

plt.xlabel("salary")
{features_dict} is a dictionary of features associated with that person. You should explore features_dict as part of the mini-project, but here's an example to get you started: enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000 """ import pickle import sys sys.path.append("../tools/") from dos2unix import pkl_formatting from feature_format import featureFormat, targetFeatureSplit enron_file = "../final_project/final_project_dataset_unix.pkl" original_enron_file = "../final_project/final_project_dataset.pkl" pkl_formatting(original_enron_file) enron_data = pickle.load(open(enron_file, "rb")) print("Number of data points :", len(enron_data)) search_keyword = "fastow".upper() print("Search for", search_keyword, ":", [name for name in enron_data if search_keyword in name]) print() print("Number of features :", len(next(iter(enron_data.values())))) print("Features :", list(next(iter(enron_data.values())).keys())) print() print("Number of people of interest :", sum(feature['poi'] == 1 for feature in enron_data.values())) print() with open("../final_project/poi_names.txt") as f:
(why modified? we've removed some trouble points that you'll find yourself in the outliers mini-project). Draws a little scatterplot of the training/testing data You fill in the regression code where indicated: """ import sys import pickle sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit from dos2unix import pkl_formatting final_project_dataset_modified_original = "../final_project/final_project_dataset_modified.pkl" final_project_dataset_modified_unix_format = "../final_project/final_project_dataset_modified_unix.pkl" pkl_formatting(final_project_dataset_modified_original) dictionary = pickle.load(open(final_project_dataset_modified_unix_format, "rb")) ### list the features you want to look at--first item in the ### list will be the "target" feature input_feature = "salary" target_feature = "bonus" features_list = [target_feature, input_feature] data = featureFormat(dictionary, features_list, remove_any_zeroes=True, sort_keys='../tools/python2_lesson06_keys.pkl') target, features = targetFeatureSplit(data) ### training-testing split needed in regression, just like classification
from __future__ import print_function

import random
import numpy
import matplotlib.pyplot as plt
import pickle

from outlier_cleaner import outlierCleaner

import sys
sys.path.append("../tools/")
from dos2unix import pkl_formatting

### load up some practice data with outliers in it
# NOTE(review): pkl_formatting presumably writes the "*_unix.pkl" copies that
# are loaded below -- confirm against ../tools/dos2unix.py.
ages_file_original = "practice_outliers_ages.pkl"
pkl_formatting(ages_file_original)
ages_file_unix_format = "practice_outliers_ages_unix.pkl"

net_worths_file_original = "practice_outliers_net_worths.pkl"
pkl_formatting(net_worths_file_original)
net_worths_file_unix_format = "practice_outliers_net_worths_unix.pkl"

# Context managers make sure both pickle files are closed after loading.
with open(ages_file_unix_format, "rb") as ages_file:
    ages = pickle.load(ages_file)
with open(net_worths_file_unix_format, "rb") as net_worths_file:
    net_worths = pickle.load(net_worths_file)

### ages and net_worths need to be reshaped into 2D numpy arrays
### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
### by convention, n_rows is the number of data points
### and n_columns is the number of features
ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))