import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, r2_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# BASE_PATH (the project root) is assumed to be defined elsewhere in the
# project; sketches of the feature_format / target_feature_split helpers
# this module relies on follow predict() below.


def predict():
    # load the dict of dicts containing all the data on each person
    # (assumed here to be the same dataset file the other functions use)
    dictionary = pickle.load(
        open(
            os.path.join(BASE_PATH, "final_project/final_project_dataset.pkl"),
            "rb"))

    # list the features you want to look at--first item in the
    # list will be the "target" feature
    features_list = ["bonus", "salary"]
    data = feature_format(dictionary, features_list, remove_any_zeroes=True)
    target, features = target_feature_split(data)

    # training-testing split needed in regression, just like classification
    feature_train, feature_test, target_train, target_test = train_test_split(
        features, target, test_size=0.5, random_state=42)
    train_color = "b"
    test_color = "r"

    reg = LinearRegression()
    reg.fit(feature_train, target_train)
    prediction = reg.predict(feature_test)
    print("slope: {}".format(reg.coef_))
    print("intercept: {}".format(reg.intercept_))
    print("r2 score: {}".format(r2_score(target_test, prediction)))
    print()

    # draw the scatterplot, with color-coded training and testing points
    for feature, target in zip(feature_test, target_test):
        plt.scatter(feature, target, color=test_color)
    for feature, target in zip(feature_train, target_train):
        plt.scatter(feature, target, color=train_color)

    # labels for the legend
    plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
    plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")

    # draw the regression line, once it's coded
    try:
        plt.plot(feature_test, prediction)
    except NameError:
        pass

    # refit on the test data to see how sensitive the fit is to outliers,
    # then draw the new line over the training points
    reg.fit(feature_test, target_test)
    prediction = reg.predict(feature_train)
    plt.plot(feature_train, prediction, color="b")
    print("slope: {}".format(reg.coef_))
    print("intercept: {}".format(reg.intercept_))
    print("r2 score: {}".format(r2_score(target_train, prediction)))

    plt.xlabel(features_list[1])
    plt.ylabel(features_list[0])
    plt.legend()
    plt.show()
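# feature_format and target_feature_split aren't defined in this excerpt;
# they come from the course's helper tools. A minimal sketch of the behavior
# the code in this module assumes (the real helpers support more options):
def feature_format(dictionary, features, remove_any_zeroes=False):
    # build one row per person, mapping the dataset's "NaN" strings to 0.0
    rows = []
    for key in dictionary:
        row = [0.0 if dictionary[key][f] == "NaN" else float(dictionary[key][f])
               for f in features]
        # optionally drop any point where one of the requested features is zero
        if remove_any_zeroes and 0.0 in row:
            continue
        rows.append(row)
    return np.array(rows)


def target_feature_split(data):
    # first column is the target; the remaining columns are the features
    target = [row[0] for row in data]
    features = [list(row[1:]) for row in data]
    return target, features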
def cluster():
    # load in the dict of dicts containing all the data on each person in the dataset
    data_dict = pickle.load(
        open(
            os.path.join(BASE_PATH, "final_project/final_project_dataset.pkl"),
            "rb"))
    # there's an outlier--remove it!
    data_dict.pop("TOTAL", 0)

    # the input features we want to use
    # can be any key in the person-level dictionary (salary, director_fees, etc.)
    feature_1 = "salary"
    feature_2 = "exercised_stock_options"
    feature_3 = "total_payments"
    poi = "poi"
    features_list = [poi, feature_1, feature_2, feature_3]
    data = feature_format(data_dict, features_list)
    poi, finance_features = target_feature_split(data)

    # this loop unpacks three features per point (the "clustering with
    # 3 features" part of the mini-project); with only two features in
    # features_list you'd write it as
    #     for f1, f2 in finance_features:
    for f1, f2, _ in finance_features:
        plt.scatter(f1, f2)
    plt.show()

    # uncomment to actually run the clustering and define pred:
    # k_means = KMeans(n_clusters=2, random_state=0)
    # k_means.fit(finance_features)
    # pred = k_means.predict(finance_features)

    # rename the "name" parameter when you change the number of features
    # so that the figure gets saved to a different file
    try:
        draw(
            pred,
            finance_features,
            poi,
            mark_poi=False,
            name="clusters-3.pdf",
            f1_name=feature_1,
            f2_name=feature_2,
        )
    except NameError:
        print("no predictions object named pred found, no clusters to plot")
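# salary and exercised_stock_options differ by orders of magnitude, so
# unscaled k-means distance is dominated by the larger feature. A minimal
# sketch of rescaling before clustering--an illustration using sklearn's
# MinMaxScaler, not part of the original cluster() function:
def cluster_scaled(finance_features, n_clusters=2):
    from sklearn.preprocessing import MinMaxScaler

    # squash every feature into [0, 1] so each contributes comparably
    scaled = MinMaxScaler().fit_transform(finance_features)
    return KMeans(n_clusters=n_clusters, random_state=0).fit_predict(scaled)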
def clean_outliers():
    # read in data dictionary, convert to numpy array
    data_dict = pickle.load(
        open(
            os.path.join(BASE_PATH, "final_project/final_project_dataset.pkl"),
            "rb"))
    # the "TOTAL" spreadsheet row is an outlier--remove it
    del data_dict["TOTAL"]
    features = ["salary", "bonus"]
    data = feature_format(data_dict, features)

    for point in data:
        salary = point[0]
        bonus = point[1]
        plt.scatter(salary, bonus)

    plt.xlabel("salary")
    plt.ylabel("bonus")
    plt.show()
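# Even after dropping "TOTAL", a few legitimate data points still sit far
# from the main cloud in the scatterplot above. A minimal sketch for naming
# them--the threshold values are illustrative assumptions, not part of the
# original code:
def print_big_earners(data_dict, min_salary=1e6, min_bonus=5e6):
    # walk the person-level dicts and print anyone above both thresholds
    for name, row in data_dict.items():
        salary, bonus = row.get("salary"), row.get("bonus")
        if salary == "NaN" or bonus == "NaN" or salary is None or bonus is None:
            continue
        if salary > min_salary and bonus > min_bonus:
            print(name, salary, bonus)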
def poi_identifier():
    data_dict = pickle.load(
        open(
            os.path.join(BASE_PATH, "final_project/final_project_dataset.pkl"),
            "rb"))

    # add more features to features_list!
    features_list = ["poi", "salary"]
    data = feature_format(data_dict, features_list)
    labels, features = target_feature_split(data)

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.30, random_state=42)

    # decision tree
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    prediction = clf.predict(features_test)
    print("accuracy:", accuracy_score(labels_test, prediction))

    # evaluation
    values, counts = np.unique(prediction, return_counts=True)
    test_size = len(features_test)
    print("predicted label counts:", list(zip(values, counts)))
    print("total number in test set:", test_size)
    # baseline: a classifier that always predicts poi=0 scores the
    # fraction of true non-POIs in the test set
    non_pois = sum(1 for label in labels_test if label == 0)
    print("accuracy if all poi=0:", float(non_pois) / float(test_size))

    true_positives = 0
    for actual, predicted in zip(labels_test, prediction):
        if actual == 1 and predicted == 1:
            true_positives += 1
    print("true positives:", true_positives)
    print("precision score:", precision_score(labels_test, prediction))
    print("recall score:", recall_score(labels_test, prediction))

    # hand-made predictions and true labels from the evaluation exercise
    prediction_labels = [
        0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1
    ]
    true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
    calculate_precision_and_recall(true_labels, prediction_labels)
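# calculate_precision_and_recall isn't defined in this excerpt. A minimal
# sketch of what it's assumed to do--compute precision and recall by hand
# from true-positive, false-positive, and false-negative counts:
def calculate_precision_and_recall(true_labels, prediction_labels):
    pairs = list(zip(true_labels, prediction_labels))
    true_positives = sum(1 for t, p in pairs if t == 1 and p == 1)
    false_positives = sum(1 for t, p in pairs if t == 0 and p == 1)
    false_negatives = sum(1 for t, p in pairs if t == 1 and p == 0)
    # precision: of everything flagged as a POI, how much was right?
    precision = float(true_positives) / (true_positives + false_positives)
    # recall: of the actual POIs, how many did we catch?
    recall = float(true_positives) / (true_positives + false_negatives)
    print("precision:", precision)
    print("recall:", recall)
    return precision, recall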
def validate_poi_identifier():
    # renamed from poi_identifier to avoid silently shadowing the
    # evaluation version above
    data_dict = pickle.load(
        open(
            os.path.join(BASE_PATH, "final_project/final_project_dataset.pkl"),
            "rb"))

    # first element is our labels, any added elements are predictor
    # features. Keep this the same for the mini-project, but you'll
    # have a different feature list when you do the final project.
    features_list = ["poi", "salary"]
    data = feature_format(data_dict, features_list)
    labels, features = target_feature_split(data)

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    prediction = clf.predict(features_test)
    print("accuracy: {}".format(accuracy_score(labels_test, prediction)))
def draw(pred, features, poi, mark_poi=False, name="image.pdf",
         f1_name="feature 1", f2_name="feature 2"):
    # plot each cluster with a different color--add more colors if you
    # cluster into more than five groups
    colors = ["b", "c", "k", "m", "g"]
    for ii, pp in enumerate(pred):
        plt.scatter(features[ii][0], features[ii][1],
                    color=colors[int(pp) % len(colors)])
    # optionally place red stars over points that are POIs
    if mark_poi:
        for ii, pp in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1],
                            color="r", marker="*")
    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    plt.savefig(name)
    plt.show()


# module-level script: two-feature version of the clustering above
data_dict = pd.read_pickle('final_project_dataset.pkl')
data_dict.pop('TOTAL', 0)  # remove the spreadsheet-total outlier
df = pd.DataFrame.from_dict(data_dict, orient='index')  # tabular view (not used below)

features_list = ['poi', 'salary', 'exercised_stock_options']
data = feature_format(data_dict, features_list)

# feature target split: first column is the poi label,
# the remaining columns are the finance features
poi, finance_features = target_feature_split(data)

for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

clt = KMeans(n_clusters=2)
clt.fit(finance_features)
pred = clt.predict(finance_features)

try:
    draw(
        pred,
        finance_features,
        poi,
        mark_poi=False,
        name="clusters.pdf",
        f1_name=features_list[1],
        f2_name=features_list[2],
    )
except NameError:
    print("no predictions object named pred found, no clusters to plot")
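# A minimal driver--an addition, not part of the original module--so each
# mini-project function can also be run directly:
if __name__ == "__main__":
    clean_outliers()
    predict()
    cluster()
    poi_identifier()
    validate_poi_identifier()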