Example #1

import pickle

from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from enron.feature_format import featureFormat, targetFeatureSplit

with open("../../resources/enron/enron_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# add more features to features_list!
features_list = ["poi", "salary"]

# featureFormat turns the person-level dictionary into a numpy array of
# the requested features; sort_keys pins the record ordering so results
# stay reproducible across runs
data = featureFormat(
    data_dict,
    features_list,
    sort_keys='../../resources/enron/python2_lesson13_keys.pkl')
labels, features = targetFeatureSplit(data)

X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.3,
                                                    random_state=42)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
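# The precision/recall imports above suggest the snippet is meant to end
# by scoring the tree on the held-out split; a minimal completion:
pred = dtc.predict(X_test)
print("precision:", precision_score(y_test, pred))
print("recall:", recall_score(y_test, pred))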
Example #2

import pickle

from enron.feature_format import featureFormat, targetFeatureSplit

# Task 1: Select what features you'll use.
# features_list is a list of strings, each of which is a feature name.
# The first feature must be "poi".
features_list = ['poi', 'salary']  # You will need to use more features

# Load the dictionary containing the dataset
with open("../../resources/enron/enron_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# Task 2: Remove outliers
# Task 3: Create new feature(s)
# Store to my_dataset for easy export below.
my_dataset = data_dict

# Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Task 4: Try a variety of classifiers
# Please name your classifier clf for easy export below.
# Note that if you want to do PCA or other multi-stage operations,
# you'll need to use Pipelines. For more info:
# http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
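# A minimal sketch of the multi-stage Pipeline approach mentioned above;
# the step names and the PCA setting are illustrative assumptions, and
# pipeline_clf is a hypothetical alternative to the plain GaussianNB
# (n_components can be at most the number of input features):
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipeline_clf = Pipeline([
    ("scale", MinMaxScaler()),
    ("reduce", PCA(n_components=1)),
    ("classify", GaussianNB()),
])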

# Task 5: Tune your classifier to achieve better than .3 precision and recall
# using our testing script. Check the tester.py script in the enron project
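# One hedged way to do the tuning: grid-search a DecisionTreeClassifier
# over a small parameter grid (the grid below is an illustrative
# assumption), scoring on F1 so both precision and recall matter:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid={"min_samples_split": [2, 10, 20],
                "max_depth": [None, 3, 5]},
    scoring="f1",
    cv=StratifiedShuffleSplit(n_splits=100, random_state=42))
search.fit(features, labels)
clf = search.best_estimator_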
Example #3

import pickle

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from enron.feature_format import featureFormat, targetFeatureSplit

with open("../../resources/enron/dataset_modified.pkl", "rb") as data_file:
    dictionary = pickle.load(data_file)

# list the features you want to look at--first item in the
# list will be the "target" feature
features_list = ["bonus", "salary"]
# features_list = ["bonus", "long_term_incentive"]
data = featureFormat(
    dictionary,
    features_list,
    remove_any_zeroes=True,
    sort_keys='../../resources/enron/python2_lesson06_keys.pkl')
target, features = targetFeatureSplit(data)

feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

reg = LinearRegression()
reg.fit(feature_train, target_train)
print("slope:", reg.coef_)
print("intercept:", reg.intercept_)
print("r-squared score on test data:", reg.score(feature_test, target_test))
print("r-squared score on training data:",
      reg.score(feature_train, target_train))
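# train_color and test_color above are otherwise unused; a sketch of the
# scatter-plus-regression-line plot this mini-project script ends with:
for feature, target in zip(feature_test, target_test):
    plt.scatter(feature, target, color=test_color)
for feature, target in zip(feature_train, target_train):
    plt.scatter(feature, target, color=train_color)
plt.plot(feature_test, reg.predict(feature_test), color="black")
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.show()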
Example #4

from sklearn.model_selection import StratifiedShuffleSplit

from enron.feature_format import featureFormat, targetFeatureSplit

# PERF_FORMAT_STRING and RESULTS_FORMAT_STRING are the format-string
# constants defined alongside this function in tester.py

def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    # modern sklearn API: build the splitter, then iterate over .split()
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        # fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of "
              "true positive predictions.")
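# Hypothetical usage, assuming the clf, my_dataset and features_list
# objects built in Example #2 are in scope:
test_classifier(clf, my_dataset, features_list)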
Example #5

import pickle

import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale

from enron.feature_format import featureFormat, targetFeatureSplit

with open("../../resources/enron/enron_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# track the extremes of "salary" and "exercised_stock_options",
# skipping the 'NaN' strings the dataset uses for missing values
salary_max = float("-inf")
salary_min = float("inf")
stock_max = float("-inf")
stock_min = float("inf")
for value in data_dict.values():
    if value['salary'] != 'NaN':
        salary_max = max(salary_max, value['salary'])
        salary_min = min(salary_min, value['salary'])
    if value['exercised_stock_options'] != 'NaN':
        stock_max = max(stock_max, value['exercised_stock_options'])
        stock_min = min(stock_min, value['exercised_stock_options'])
print('salary_max:', salary_max, '; salary_min:', salary_min)

# the input features we want to use
# can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = 'total_payments'
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)
data[:, 1] = minmax_scale(data[:, 1])
data[:, 2] = minmax_scale(data[:, 2])
# min-max rescaled value of a $200,000 salary and (per the mini-project
# quiz) a $1,000,000 exercised_stock_options grant, matching the
# minmax_scale transformation applied above
print('salary_scale:', (200000 - salary_min) / (salary_max - salary_min))
print('exercised_stock_options_scale:',
      (1000000 - stock_min) / (stock_max - stock_min))

poi, finance_features = targetFeatureSplit(data)

# in the "clustering with 3 features" part of the mini-project,
# you'll want to change this line to
# for f1, f2, _ in finance_features:
# (as it's currently written, the line below assumes 2 features)
for f1, f2, f3 in finance_features:
    plt.scatter(f1, f2)
plt.show()
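# The comments above reference the k-means clustering mini-project; a
# minimal sketch over the same data (n_clusters=2 is an illustrative
# assumption):
from sklearn.cluster import KMeans

pred = KMeans(n_clusters=2, random_state=42).fit_predict(finance_features)
for (f1, f2, _), cluster in zip(finance_features, pred):
    plt.scatter(f1, f2, color=["b", "r"][cluster])
plt.xlabel(feature_1)
plt.ylabel(feature_2)
plt.show()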