import pickle

from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from enron.feature_format import featureFormat, targetFeatureSplit

# Load the Enron dataset. The context manager closes the file handle as soon
# as unpickling is done instead of leaving it open until garbage collection.
# NOTE(review): pickle can execute arbitrary code on load -- only point this
# at trusted files.
with open("../../resources/enron/enron_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

# add more features to features_list!
features_list = ["poi", "salary"]

# Build the feature array once, passing the lesson-13 key file as sort_keys
# (presumably this pins the row order -- confirm in featureFormat).
data = featureFormat(
    data_dict,
    features_list,
    sort_keys='../../resources/enron/python2_lesson13_keys.pkl')
labels, features = targetFeatureSplit(data)

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
# print(dtc.score(X_test,y_test))
from enron.feature_format import featureFormat, targetFeatureSplit dictionary = pickle.load( open("../../resources/enron/dataset_modified.pkl", "rb")) # list the features you want to look at--first item in the # list will be the "target" feature features_list = ["bonus", "salary"] # features_list = ["bonus", "long_term_incentive"] data = featureFormat( dictionary, features_list, remove_any_zeroes=True, sort_keys='../../resources/enron/python2_lesson06_keys.pkl') target, features = targetFeatureSplit(data) feature_train, feature_test, target_train, target_test = train_test_split( features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" reg = LinearRegression() reg.fit(feature_train, target_train) print(reg.coef_) print(reg.intercept_) print(reg.score(feature_test, target_test)) print(reg.score(feature_train, target_train)) # draw the scatterplot, with color-coded training and testing points for feature, target in zip(feature_test, target_test):
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` and print its aggregate performance metrics.

    The dataset is converted with ``featureFormat``/``targetFeatureSplit``,
    split repeatedly with ``StratifiedShuffleSplit`` (seeded with 42), and the
    classifier is re-fitted on every training split.  Predictions from all
    splits are pooled into one confusion matrix, from which accuracy,
    precision, recall, F1 and F2 are computed and printed.

    Args:
        clf: Classifier exposing ``fit(features, labels)`` and
            ``predict(features)``.
        dataset: Raw dataset handed to ``featureFormat``.
        feature_list: Feature names handed to ``featureFormat``; the first
            column of the result is split off as the label by
            ``targetFeatureSplit``.
        folds: Second positional argument given to ``StratifiedShuffleSplit``.

    Returns:
        None.  Results (or a divide-by-zero notice) are printed.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # NOTE(review): passing the labels to the constructor and iterating the
    # splitter directly matches the legacy sklearn.cross_validation API.  The
    # sklearn.model_selection class instead takes n_splits and is iterated via
    # .split(features, labels) -- confirm which one this module imports.
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        # fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stop counting this split only; later splits still run.
                break

    # Only the arithmetic can divide by zero, so only that is guarded.  Any
    # other failure (e.g. a bad format string) now surfaces as itself instead
    # of being reported as a divide by zero.
    try:
        total_predictions = (true_negatives + false_negatives
                             + false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (
            2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of "
            "true positive predicitons."
        )
    else:
        print(clf)
        print(
            PERF_FORMAT_STRING.format(
                accuracy, precision, recall, f1, f2, display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(
                total_predictions, true_positives, false_positives,
                false_negatives, true_negatives))
        print("")
print('salary_max:', salary_max, ';salary_min', salary_min)

# the input features we want to use
# can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = 'total_payments'
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)

# Rescale columns 1 and 2 in place with minmax_scale (presumably salary and
# exercised_stock_options, following features_list order -- confirm against
# featureFormat).
# NOTE(review): column 3 is left unscaled although KMeans below is fitted on
# all three features -- confirm this is intended.
data[:, 1] = minmax_scale(data[:, 1])
data[:, 2] = minmax_scale(data[:, 2])
print('salary_scale:', 200000 / salary_max)
print('exercised_stock_options_scale:', 100 / stock_min)

# From here on `poi` holds the label column rather than the feature name.
poi, finance_features = targetFeatureSplit(data)

# Each row carries three features; only the first two are plotted, so the
# third is unpacked into a throwaway name.
for f1, f2, _ in finance_features:
    plt.scatter(f1, f2)
plt.show()

# cluster here; create predictions of the cluster labels
# for the data and store them to a list called pred
pred = KMeans(3).fit_predict(finance_features)

# rename the "name" parameter when you change the number of features