Exemple #1
0
def predict():
    """Regress bonus on salary, print fit statistics and plot the result.

    Reads the module-level ``dictionary``, splits the formatted data in half
    (fixed ``random_state``), fits a linear regression on the training half
    and reports slope, intercept and r2 on the test half.  The model is then
    refit on the test half and scored against the training half, and both
    regression lines are drawn over the colour-coded scatter plot.
    """
    # list the features you want to look at--first item in the
    # list will be the "target" feature
    features_list = ["bonus", "salary"]
    data = feature_format(dictionary, features_list, remove_any_zeroes=True)
    target, features = target_feature_split(data)

    # training-testing split needed in regression, just like classification
    feature_train, feature_test, target_train, target_test = train_test_split(
        features, target, test_size=0.5, random_state=42
    )
    train_color = "b"
    test_color = "r"

    reg = LinearRegression()
    reg.fit(feature_train, target_train)
    prediction = reg.predict(feature_test)

    print("slope: {}".format(reg.coef_))
    print("intercept: {}".format(reg.intercept_))
    print("r2 score: {}".format(r2_score(target_test, prediction)))
    print("\n")

    # Draw each split with a single labelled scatter call.  This gives the
    # legend its entries directly, so no extra "legend only" point has to be
    # plotted (the old one painted a test point in the training colour).
    plt.scatter(feature_test, target_test, color=test_color, label="test")
    plt.scatter(feature_train, target_train, color=train_color, label="train")

    # regression line of the model fitted on the training split
    plt.plot(feature_test, prediction)

    # Refit on the test split and evaluate against the training split to
    # compare how the fitted line changes with the data it was trained on.
    reg.fit(feature_test, target_test)
    prediction = reg.predict(feature_train)
    plt.plot(feature_train, prediction, color="b")
    print("slope: {}".format(reg.coef_))
    print("intercept: {}".format(reg.intercept_))
    print("r2 score: {}".format(r2_score(target_train, prediction)))

    plt.xlabel(features_list[1])
    plt.ylabel(features_list[0])
    plt.legend()
    plt.show()
Exemple #2
0
def cluster():
    """Plot salary against exercised stock options and draw clusters if any.

    Loads the project dataset, drops the "TOTAL" outlier row, extracts the
    poi label plus three financial features and scatter-plots the first two
    features.  ``draw`` is only reached when a ``pred`` object exists;
    otherwise a message is printed instead.
    """
    # load in the dict of dicts containing all the data on each person in
    # the dataset.  Pickle data must be read in binary mode, and the context
    # manager makes sure the file handle is closed again.
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    dataset_path = os.path.join(
        BASE_PATH, "final_project/final_project_dataset.pkl")
    with open(dataset_path, "rb") as dataset_file:
        data_dict = pickle.load(dataset_file)

    # there's an outlier--remove it!
    data_dict.pop("TOTAL", 0)

    # the input features we want to use
    # can be any key in the person-level dictionary (salary, director_fees, etc.)
    feature_1 = "salary"
    feature_2 = "exercised_stock_options"
    feature_3 = "total_payments"
    poi_key = "poi"
    features_list = [poi_key, feature_1, feature_2, feature_3]

    data = feature_format(data_dict, features_list)
    poi, finance_features = target_feature_split(data)

    # three features are unpacked per point, but only the first two are
    # plotted; one scatter call per point keeps the per-point colours
    for f1, f2, _ in finance_features:
        plt.scatter(f1, f2)
    plt.show()

    # clustering step is currently disabled, so no local ``pred`` exists:
    # k_means = KMeans(n_clusters=2, random_state=0)
    # k_means.fit(finance_features)
    # pred = k_means.predict(finance_features)

    # rename the "name" parameter when you change the number of features
    # so that the figure gets saved to a different file
    try:
        draw(
            pred,
            finance_features,
            poi,
            mark_poi=False,
            name="clusters-3.pdf",
            f1_name=feature_1,
            f2_name=feature_2,
        )
    except NameError:
        print("no predictions object named pred found, no clusters to plot")
def clean_outliers():
    """Scatter-plot salary against bonus with the "TOTAL" row removed."""
    # Pickle data must be read in binary mode; the context manager closes
    # the file handle once the dictionary has been loaded.
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    dataset_path = os.path.join(
        BASE_PATH, "final_project/final_project_dataset.pkl")
    with open(dataset_path, "rb") as dataset_file:
        data_dict = pickle.load(dataset_file)

    # drop the spreadsheet total row; tolerate a dataset that lacks it
    data_dict.pop("TOTAL", None)

    features = ["salary", "bonus"]
    data = feature_format(data_dict, features)

    # columns follow the order of ``features``: salary first, then bonus
    for salary, bonus in data:
        plt.scatter(salary, bonus)

    plt.xlabel("salary")
    plt.ylabel("bonus")
    plt.show()
def poi_identifier():
    """Train a decision tree on salary to predict poi and print metrics.

    Prints accuracy, the distribution of predicted labels, the baseline
    accuracy of always predicting 0, the number of true positives, and the
    precision and recall scores.  Finally runs
    ``calculate_precision_and_recall`` on a fixed pair of label lists.
    """
    # Pickle data must be read in binary mode; the context manager closes
    # the file handle once the dictionary has been loaded.
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    dataset_path = os.path.join(
        BASE_PATH, "final_project/final_project_dataset.pkl")
    with open(dataset_path, "rb") as dataset_file:
        data_dict = pickle.load(dataset_file)

    # add more features to features_list!
    features_list = ["poi", "salary"]

    data = feature_format(data_dict, features_list)
    labels, features = target_feature_split(data)

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.30, random_state=42)

    # Decision tree
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    prediction = clf.predict(features_test)
    print("accuracy:", accuracy_score(labels_test, prediction))

    # evaluation
    values, counts = np.unique(prediction, return_counts=True)
    test_size = len(features_test)
    # materialise the pairs so the values are printed, not a zip object
    print("predicted POIs:", list(zip(values, counts)))
    print("total no in test set:", test_size)
    # the all-zero baseline depends on the TRUE labels, not on how many
    # zeros the classifier happened to predict
    non_poi_total = sum(1 for actual in labels_test if actual == 0)
    print("accuracy if all poi=0:", float(non_poi_total) / float(test_size))

    true_positives = sum(
        1
        for actual, predicted in zip(labels_test, prediction)
        if actual == 1 and predicted == 1
    )

    print("true positives:", true_positives)
    print("precision score:", precision_score(labels_test, prediction))
    print("recall score:", recall_score(labels_test, prediction))

    # fixed example vectors for the hand-computed precision/recall exercise
    prediction_labels = [
        0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1
    ]
    true_labels = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]

    calculate_precision_and_recall(true_labels, prediction_labels)
def poi_identifier():
    """Train a decision tree on salary to predict poi and print its accuracy.

    Uses a 70/30 train/test split with a fixed ``random_state`` and reports
    the accuracy measured on the held-out 30%.
    """
    # Pickle data must be read in binary mode; the context manager closes
    # the file handle once the dictionary has been loaded.
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    dataset_path = os.path.join(
        BASE_PATH, "final_project/final_project_dataset.pkl")
    with open(dataset_path, "rb") as dataset_file:
        data_dict = pickle.load(dataset_file)

    # first element is our labels, any added elements are predictor
    # features. Keep this the same for the mini-project, but you'll
    # have a different feature list when you do the final project.
    features_list = ["poi", "salary"]

    data = feature_format(data_dict, features_list)
    labels, features = target_feature_split(data)

    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)

    prediction = clf.predict(features_test)

    # true labels first, predictions second (the documented argument order)
    print("accuracy: {}".format(accuracy_score(labels_test, prediction)))
Exemple #6
0
        for ii, pp in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0],
                            features[ii][1],
                            color="r",
                            marker="*")
    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    plt.show()


# Load the dataset, keep a DataFrame view of the raw dictionary (built before
# the "TOTAL" row is dropped), then extract the features used for clustering.
data_dict = pd.read_pickle('final_project_dataset.pkl')
df = pd.DataFrame.from_dict(data_dict, orient='index')
data_dict.pop('TOTAL', 0)
features_list = ['poi', 'salary', 'exercised_stock_options']
data = feature_format(data_dict, features_list)

# feature target split: select COLUMNS, not rows.  Indexing data[0], data[1]
# and data[2] picks three individual samples instead of the poi labels and
# the two financial features.
# assumes feature_format returns one row per person with columns in
# features_list order -- TODO confirm
samples = np.asarray(data)
poi = samples[:, 0]

finance_features = samples[:, 1:]

# one scatter call per point, as before, so the plot's colouring is unchanged
for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

clt = KMeans(n_clusters=2)
clt.fit(finance_features)
pred = clt.predict(finance_features)

try: