def test_classifier(clf, dataset, feature_list, folds = 1000): data = featureFormat(dataset, feature_list, sort_keys = True) labels, features = targetFeatureSplit(data) cv = StratifiedShuffleSplit(labels, folds, random_state = 42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append( features[ii] ) labels_train.append( labels[ii] ) for jj in test_idx: features_test.append( features[jj] ) labels_test.append( labels[jj] ) ### fit the classifier using training set, and test on test set clf.fit(features_train, labels_train) predictions = clf.predict(features_test) for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 elif prediction == 1 and truth == 1: true_positives += 1 else: print "Warning: Found a predicted label not == 0 or 1." print "All predictions should take value 0 or 1." 
print "Evaluating performance for processed predictions:" break try: total_predictions = true_negatives + false_negatives + false_positives + true_positives accuracy = 1.0*(true_positives + true_negatives)/total_predictions precision = 1.0*true_positives/(true_positives+false_positives) recall = 1.0*true_positives/(true_positives+false_negatives) f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives) f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall) print clf print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5) print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives) print "" except: print "Got a divide by zero when trying out:", clf print "Precision or recall may be undefined due to a lack of true positive predicitons."
def get_k_best_features(data, features_list, k=10):
    """Score features with SelectKBest and rank them, best first.

    data          -- project data dictionary (passed to featureFormat)
    features_list -- feature names; entry 0 must be the 'poi' label
    k             -- number of features SelectKBest keeps when fitting

    Returns a list of (feature_name, score) tuples for every candidate
    feature (the 'poi' label excluded), sorted by score in descending order.
    """
    # Set up the label and features; avoid rebinding the `data` parameter.
    formatted = featureFormat(data, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(formatted)

    # Apply SelectKBest
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_

    # Pair each score with its feature name, skipping features_list[0],
    # which is the 'poi' label rather than a feature.
    pairs = zip(features_list[1:], scores)

    # sorted() already returns a list, so the former list() wrapper
    # around it was redundant and has been dropped.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
Draws a little scatterplot of the training/testing data

You fill in the regression code where indicated:
"""
import sys
import pickle
sys.path.append("../tools/")
from tools.feature_format import featureFormat, targetFeatureSplit

# Modified Enron dataset prepared for the regression mini-project.
dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r") )

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
# remove_any_zeroes=True drops any point where either feature is zero/missing.
data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit( data )

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
# Colors used by the plotting code below (not shown in this chunk):
# "b" for training points, "r" for test points.
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.
from sklearn.linear_model import LinearRegression
'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees'] email features: ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'poi', 'shared_receipt_with_poi'] ''' features_list = ['poi', 'salary', 'bonus', 'expenses', 'exercised_stock_options'] # You will need to use more features ### Load the dictionary containing the dataset with open("final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) ### Task 2: Remove outliers ### Task 3: Create new feature(s) ### Store to my_dataset for easy export below. data = featureFormat(data_dict, features_list) my_dataset = data_dict ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html # Provided to give you a starting point. Try a variety of classifiers. from sklearn.naive_bayes import GaussianNB
import sys
from sklearn import tree
from sklearn.metrics import accuracy_score
sys.path.append("../tools/")
from tools.feature_format import featureFormat, targetFeatureSplit
from sklearn.cross_validation import train_test_split

# NOTE(review): pickle is used below but not imported in this chunk --
# confirm it is imported elsewhere in the file, otherwise this raises
# NameError at runtime.
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") )

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

### it's all yours from here forward!
# Hold out 30% of the data for testing; fixed seed keeps the split reproducible.
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc = accuracy_score(labels_test, pred)
print 'accuracy:{}'.format(acc)
You fill in the regression code where indicated: """ import pickle import sys sys.path.append("../tools/") from tools.feature_format import featureFormat, targetFeatureSplit dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r") ) ### list the features you want to look at--first item in the ### list will be the "target" feature features_list = ["bonus", "salary"] data = featureFormat( dictionary, features_list, remove_any_zeroes=True) target, features = targetFeatureSplit( data ) ### training-testing split needed in regression, just like classification from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "b" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points.
#!/usr/bin/python """ Starter code for the validation mini-project. The first step toward building your POI identifier! Start by loading/formatting the data After that, it's not our code anymore--it's yours! """ import pickle import sys from tools.feature_format import featureFormat, targetFeatureSplit data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r")) ### first element is our labels, any added elements are predictor ### features. Keep this the same for the mini-project, but you'll ### have a different feature list when you do the final project. features_list = ["poi", "salary"] data = featureFormat(data_dict, features_list) labels, features = targetFeatureSplit(data) ### it's all yours from here forward!
### Debug the data to have a better understanding on the dataset analyse(data_dict) ### Next we fix some of the identified issues data_dict = fix(data_dict) ### Task 2: Remove outliers outliers = find_outliers(data_dict) my_dataset = remove_outliers(data_dict, outliers) ### Task 3: Create new feature(s) ### Store to my_dataset for easy export below. my_dataset = create_new_features(my_dataset) ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html ### Task 5: Tune your classifier to achieve better than .3 precision and recall ### using our testing script. Check the tester.py script in the final project ### folder for details on the evaluation method, especially the test_classifier ### function. Because of the small size of the dataset, the script uses ### stratified shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html