""" import pickle from sklearn.metrics import recall_score, precision_score from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier from enron.feature_format import featureFormat, targetFeatureSplit data_dict = pickle.load(open("../../resources/enron/enron_dataset.pkl", "rb")) # add more features to features_list! features_list = ["poi", "salary"] data = featureFormat(data_dict, features_list) labels, features = targetFeatureSplit(data) data = featureFormat( data_dict, features_list, sort_keys='../../resources/enron/python2_lesson13_keys.pkl') labels, features = targetFeatureSplit(data) X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42) dtc = DecisionTreeClassifier() dtc.fit(X_train, y_train)
# Task 1: Select what features you'll use.
# features_list holds feature names as strings, and its first entry
# must be "poi".
features_list = ["poi", "salary"]  # You will need to use more features

# Load the dictionary containing the dataset.
with open("../../resources/enron/enron_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

# Task 2: Remove outliers
# Task 3: Create new feature(s)

# Keep the (possibly modified) dataset under my_dataset for easy export below.
my_dataset = data_dict

# Extract features and labels from the dataset for local testing.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Task 4: Try a variety of classifiers.
# Name the classifier clf for easy export below.
# PCA or other multi-stage operations need a Pipeline; for more info see
# http://scikit-learn.org/stable/modules/pipeline.html

# Starting point only -- try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

# Task 5: Tune your classifier to achieve better than .3 precision and recall
# using our testing script. Check the tester.py script in the enron project
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from enron.feature_format import featureFormat, targetFeatureSplit

# Read the dataset through a context manager so the file handle is closed
# right after unpickling rather than being left open.
# NOTE(review): unpickling runs arbitrary code; only load trusted files.
with open("../../resources/enron/dataset_modified.pkl", "rb") as dataset_file:
    dictionary = pickle.load(dataset_file)

# list the features you want to look at--first item in the
# list will be the "target" feature
features_list = ["bonus", "salary"]
# Alternative pairing to experiment with:
# features_list = ["bonus", "long_term_incentive"]

data = featureFormat(
    dictionary,
    features_list,
    remove_any_zeroes=True,
    sort_keys='../../resources/enron/python2_lesson06_keys.pkl')
target, features = targetFeatureSplit(data)

# Split evenly into train and test halves; the fixed seed keeps it repeatable.
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)

# Not used in the visible part of the script; presumably consumed by plotting
# code further down -- confirm before removing.
train_color = "b"
test_color = "r"

# Fit an ordinary least-squares line of the target on the single feature.
reg = LinearRegression()
reg.fit(feature_train, target_train)

# Slope, intercept, then the score on the test and training halves.
print(reg.coef_)
print(reg.intercept_)
print(reg.score(feature_test, target_test))
print(reg.score(feature_train, target_train))
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` on ``dataset`` and print aggregate metrics.

    The data is extracted with ``featureFormat``/``targetFeatureSplit``,
    split ``folds`` times with a seeded stratified shuffle split, and the
    classifier is re-fitted on every training split. Confusion-matrix counts
    are accumulated over all folds and then turned into accuracy, precision,
    recall, F1 and F2, which are printed using the module-level
    ``PERF_FORMAT_STRING`` and ``RESULTS_FORMAT_STRING`` templates.

    Args:
        clf: Estimator exposing ``fit(features, labels)`` and
            ``predict(features)``. It is fitted in place.
        dataset: Dataset passed straight to ``featureFormat``.
        feature_list: Feature names passed to ``featureFormat``; the label
            is whatever ``targetFeatureSplit`` splits off.
        folds: Number of shuffle-split iterations.

    Returns:
        None. Results (or a divide-by-zero notice) are printed.
    """
    # Function-scope import so the block is self-contained and uses the
    # same sklearn.model_selection API as the rest of the file. The legacy
    # cross_validation class took the labels in its constructor and was
    # iterated directly; the current one takes n_splits and exposes split().
    from sklearn.model_selection import StratifiedShuffleSplit

    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        # fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stop scoring this fold only; later folds are still run.
                break

    try:
        total_predictions = (true_negatives + false_negatives
                             + false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (
            2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Only a zero denominator is the expected failure here; any other
        # exception is a real bug and is allowed to propagate.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of "
            "true positive predicitons."
        )
    else:
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
# Widen the running salary bounds with every person that has a salary.
for person in data_dict.values():
    salary = person['salary']
    if salary == 'NaN':
        # The string 'NaN' marks a missing salary; skip it.
        continue
    salary_max = max(salary_max, salary)
    salary_min = min(salary_min, salary)
print('salary_max:', salary_max, ';salary_min', salary_min)

# the input features we want to use
# can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = 'total_payments'
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)

# Rescale columns 1 and 2 (the first two entries after "poi" in
# features_list); column 3 is left as extracted.
for column in (1, 2):
    data[:, column] = minmax_scale(data[:, column])

print('salary_scale:', 200000 / salary_max)
print('exercised_stock_options_scale:', 100 / stock_min)

# From here on poi holds the first value returned by targetFeatureSplit,
# no longer the feature name.
poi, finance_features = targetFeatureSplit(data)

# Each row is unpacked into three values; only the first two are plotted.
for f1, f2, _ in finance_features:
    plt.scatter(f1, f2)
plt.show()