Example #1
def test_classifier(clf, dataset, feature_list, folds=1000):
    # requires: from sklearn.model_selection import StratifiedShuffleSplit
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.")
Example #2
def evaluation(clf, features_list, folds=1000):
    """calculate the precision, recall and f1 of a classifier, using k-fold"""

    data = featureFormat(my_dataset, features_list, sort_keys=True)
    target, features = targetFeatureSplit(data)

    precision = []
    recall = []
    f1 = []

    # requires: from sklearn.model_selection import StratifiedShuffleSplit
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    for train_idx, test_idx in cv.split(features, target):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(target[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(target[jj])

        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        precision.append(precision_score(labels_test, predictions))
        recall.append(recall_score(labels_test, predictions))
        f1.append(f1_score(labels_test, predictions))

    print "Precision: ", round(np.mean(precision), 4)
    print "Recall: ", round(np.mean(recall), 4)
    print "f1: ", round(np.mean(f1), 4)
Example #3
def main():
    keys_path = os.path.join(os.path.dirname(tools.__file__),
                             'python2_lesson14_keys.pkl')

    data_dict = load_pickle(
        os.path.join(os.path.dirname(final_project.__file__),
                     'final_project_dataset.pkl'))

    # first element is our labels, any added elements are predictor
    # features. Keep this the same for the mini-project, but you'll
    # have a different feature list when you do the final project.
    features_list = ["poi", "salary"]

    data = featureFormat(data_dict, features_list, sort_keys=keys_path)
    labels, features = targetFeatureSplit(data)

    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # it's all yours from here forward!
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    print('Accuracy', clf.score(features_test, labels_test))
    predictions = clf.predict(features_test)
    conf_matrix = confusion_matrix(labels_test,
                                   predictions,
                                   labels=[1, 0])  # POI row/column first
    print('Confusion matrix')
    print(conf_matrix)
    precision = precision_score(labels_test, predictions)
    print('Precision', precision)
    recall = recall_score(labels_test, predictions)
    print('Recall', recall)
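    # reading the matrix (a sketch added for reference, not in the original):
    # with labels=[1, 0], row 0 is actual POI and row 1 is actual non-POI
    tp, fn = conf_matrix[0]
    fp, tn = conf_matrix[1]
    print('TP', tp, 'FN', fn, 'FP', fp, 'TN', tn)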
Example #4
def main():
    # load in the dict of dicts containing all the data
    # on each person in the dataset
    file_path = os.path.join(os.path.dirname(final_project.__file__),
                             'final_project_dataset.pkl')
    data_dict = load_pickle(file_path)
    # there's an outlier--remove it!
    del data_dict["TOTAL"]

    # the input features we want to use
    # can be any key in the person-level dictionary
    # (salary, director_fees, etc.)
    feature_1 = "salary"
    feature_2 = "exercised_stock_options"
    # feature_3 = "total_payments"
    poi = "poi"
    features_list = [poi, feature_1, feature_2]  # , feature_3]
    data = featureFormat(data_dict, features_list)
    poi, finance_features = targetFeatureSplit(data)
    scaler = MinMaxScaler()
    finance_features = scaler.fit_transform(finance_features)
    print(scaler.transform([[2e5, 1e6]]))

    # in the "clustering with 3 features" part of the mini-project,
    # you'll want to change this line to
    # for f1, f2, _ in finance_features:
    # (as it's currently written, the line below assumes 2 features)
    for f1, f2 in finance_features:
        plt.scatter(f1, f2)
    plt.xlabel(feature_1)
    plt.ylabel(feature_2)
    plt.show()

    # cluster here; create predictions of the cluster labels
    # for the data and store them to a list called pred
    clustering = KMeans(n_clusters=2)
    clustering.fit(X=finance_features)
    pred = clustering.predict(finance_features)

    # rename the "name" parameter when you change the number of features
    # so that the figure gets saved to a different file
    try:
        draw(pred,
             finance_features,
             poi,
             mark_poi=False,
             name="clusters3.pdf",
             f1_name=feature_1,
             f2_name=feature_2)
    except NameError:
        print("no predictions object named pred found, no clusters to plot")
Example #5
def test_classifier(clf, dataset, feature_list, folds=1000):
    # requires: from sklearn.model_selection import StratifiedShuffleSplit
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.")
Example #6
def prepare_data(input_data, features_list):
    """
    Prepare the features/target data needed by the classifier.
    """
    data_format = featureFormat(input_data, features_list)
    targets, features = targetFeatureSplit(data_format)
    # note: this hold-out split is immediately overwritten by the KFold loop below
    features_train, features_test, target_train, target_test = train_test_split(
        features, targets, test_size=0.3, random_state=42)

    from sklearn.model_selection import KFold
    kf = KFold(n_splits=3)
    for train_indices, test_indices in kf.split(features):
        # make training and testing sets
        features_train = [features[ii] for ii in train_indices]
        features_test = [features[ii] for ii in test_indices]
        target_train = [targets[ii] for ii in train_indices]
        target_test = [targets[ii] for ii in test_indices]
    # only the final fold's split is returned
    return features_train, features_test, target_train, target_test
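# usage sketch (not in the original): my_dataset and features_list are assumed
# from the surrounding project script
from sklearn.naive_bayes import GaussianNB
features_train, features_test, target_train, target_test = \
    prepare_data(my_dataset, features_list)
clf = GaussianNB().fit(features_train, target_train)
print(clf.score(features_test, target_test))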
Example #7
def get_k_best_features(data, features_list, k=10):
    # Setup the label and features
    data = featureFormat(data, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # Apply SelectKBest
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_

    # pair up with feature name, ignore the first one, since
    # that is the 'poi' label
    unsorted_pairs = zip(features_list[1:], scores)

    # Sort based on score
    sorted_pairs = list(sorted(unsorted_pairs, key=lambda x: x[1], reverse=True))

    return sorted_pairs
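# usage sketch (not in the original); my_dataset and features_list come from the
# surrounding project script, with 'poi' as the first entry
for feature, score in get_k_best_features(my_dataset, features_list, k=5):
    print(feature, round(score, 2))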
Example #8
def main():
    keys_path = os.path.join(os.path.dirname(tools.__file__),
                             'python2_lesson13_keys.pkl')

    data_dict = load_pickle(
        os.path.join(os.path.dirname(final_project.__file__),
                     'final_project_dataset.pkl'))

    # first element is our labels, any added elements are predictor
    # features. Keep this the same for the mini-project, but you'll
    # have a different feature list when you do the final project.
    features_list = ["poi", "salary"]

    data = featureFormat(data_dict, features_list, sort_keys=keys_path)
    labels, features = targetFeatureSplit(data)

    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # it's all yours from here forward!
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    print(clf.score(features_test, labels_test))
Example #9
    You fill in the regression code where indicated:
"""

import sys
import pickle

from tools.feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load(
    open("../final_project/final_project_dataset_modified.pkl", "rb"))

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.model_selection import train_test_split

feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "b"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.

### draw the scatterplot, with color-coded training and testing points
Example #10
import sys
import pickle
sys.path.append("../tools/")
from tools.feature_format import featureFormat, targetFeatureSplit

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)

pred = clf.predict(features_test)
print(accuracy_score(pred, labels_test))


### it's all yours from here forward!  
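# continuing the mini-project (a sketch, not in the original): precision and
# recall for the POI identifier on the same held-out split
from sklearn.metrics import precision_score, recall_score
print(precision_score(labels_test, pred))
print(recall_score(labels_test, pred))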


Example #11
salary = [min(salary), 200000.0, max(salary)]
salary = numpy.array([[e] for e in salary])
salary_scaler = MinMaxScaler()
rescaled_salary = salary_scaler.fit_transform(salary)
print "Rescaled salary:", rescaled_salary

# the input features we want to use
# can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = "total_payments"
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

# in the "clustering with 3 features" part of the mini-project,
# you'll want to change this line to
# for f1, f2, _ in finance_features:
# (as it's currently written, the line below assumes 2 features)
for f1, f2, _ in finance_features:
    plt.scatter(f1, f2)
plt.show()

# cluster here; create predictions of the cluster labels
# for the data and store them to a list called pred

kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300)
pred = kmeans.fit_predict(finance_features)
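# visualizing the result (a sketch, not in the original): color the same
# scatterplot by predicted cluster
for (f1, f2, _), cluster in zip(finance_features, pred):
    plt.scatter(f1, f2, color="b" if cluster == 0 else "g")
plt.show()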
Example #12
# FEATURE REMOVAL
for feature in ['loan_advances', 'total_payments']:
    features_list.remove(feature)

# Task 3: Create new feature(s)
new_features_list = [
    'poi',
    'shared_receipt_with_poi',
    'expenses',
    'from_this_person_to_poi',
    'from_poi_to_this_person',
]

new_data = featureFormat(data_dict, new_features_list)
new_labels, new_features = targetFeatureSplit(new_data)

# Store to my_dataset for easy export below.
my_dataset = data_dict
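# a sketch for Task 3 (not in the original): a commonly derived feature here is the
# share of outgoing mail sent to POIs, assuming the dataset's standard from_messages
# and from_this_person_to_poi fields and guarding against its 'NaN' strings
for person in my_dataset.values():
    sent = person.get('from_messages')
    to_poi = person.get('from_this_person_to_poi')
    if sent not in (0, 'NaN', None) and to_poi != 'NaN':
        person['fraction_to_poi'] = float(to_poi) / sent
    else:
        person['fraction_to_poi'] = 0.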

# Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
# Task 4: Try a variety of classifiers
# Please name your classifier clf for easy export below.
# Note that if you want to do PCA or other multi-stage operations,
# you'll need to use Pipelines. For more info:
# http://scikit-learn.org/stable/modules/pipeline.html
# Provided to give you a starting point. Try a variety of classifiers.

# 1. Decision Tree Classifier
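# the listing cuts off here; a sketch of the first candidate (the parameters are
# illustrative, not from the original)
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(min_samples_split=4, random_state=42)
clf.fit(features, labels)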
Example #13
    You fill in the regression code where indicated:
"""    


import sys
import pickle
sys.path.append("../tools/")
from tools.feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load(open("../final_project/final_project_dataset_modified.pkl", "rb"))

### list the features you want to look at--first item in the 
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"



### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and 
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
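# the snippet ends before fitting; the natural next lines, sketched with the
# variable names defined above
reg.fit(feature_train, target_train)
print("slope:", reg.coef_, "intercept:", reg.intercept_)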
Example #14
features_list = ['poi', 'salary', 'bonus', 'expenses', 'exercised_stock_options']  # You will need to use more features

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
data = featureFormat(data_dict, features_list)

my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
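# a sketch for Task 5 (not in the original): tune with a small grid search scored
# on F1; the grid and the switch to a DecisionTreeClassifier are illustrative
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

param_grid = {'min_samples_split': [2, 4, 8], 'max_depth': [None, 3, 5]}
cv = StratifiedShuffleSplit(n_splits=100, random_state=42)
search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid, scoring='f1', cv=cv)
search.fit(features, labels)
clf = search.best_estimator_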
Example #15
print("data[0:3] -> ")  # features_list = ["bonus", "salary"], not ["bonus", "long_term_incentive"]
print(data[0:3])
# [[  600000.   365788.]
#  [ 1200000.   267102.]
#  [  350000.   170941.]

# print(data[0:2]) # first two rows. two columns
# print(data[0:2,0]) # first two rows, column 1 of 2 only - zero based indexing
# print(data[0:2,1]) # first two rows, column 2 of 2 only - zero based indexing
# print(data[0:2,0:1]) # first two rows, column 1 of 1 only - zero based indexing
# print(data[0:2,0:2]) # first two rows, two columns - zero based indexing - this is a good example

# target, features = targetFeatureSplit( data )
target, features = feature_format.targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.

reg = linear_model.LinearRegression()
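# completing the fragment (a sketch): fit, then draw the line over the points;
# plt and the "from sklearn import linear_model" import are assumed earlier in the file
reg.fit(feature_train, target_train)
plt.scatter(feature_train, target_train, color=train_color)
plt.scatter(feature_test, target_test, color=test_color)
plt.plot(feature_test, reg.predict(feature_test))
plt.show()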
Example #16
### load in the dict of dicts containing all the data on each person in the dataset
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))
### there's an outlier--remove it! 
data_dict.pop("TOTAL", 0)


### the input features we want to use 
### can be any key in the person-level dictionary (salary, director_fees, etc.) 
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = "total_payments"
poi  = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)



options = []
salary = []
for k, v in data_dict.items():
    if v['exercised_stock_options'] != 'NaN':
        options.append(v['exercised_stock_options'])
    if v['salary'] != 'NaN':
        salary.append(v['salary'])

print('maximum options: {} minimum options: {}'.format(max(options), min(options)))
print('maximum salary: {} minimum salary: {}'.format(max(salary), min(salary)))
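# a sketch of the mini-project's follow-up question (not in the original): what do a
# $200,000 salary and $1,000,000 in exercised_stock_options rescale to?
import numpy as np
from sklearn.preprocessing import MinMaxScaler

salary_scaler = MinMaxScaler().fit(np.array(salary, dtype=float).reshape(-1, 1))
options_scaler = MinMaxScaler().fit(np.array(options, dtype=float).reshape(-1, 1))
print(salary_scaler.transform([[200000.]]))
print(options_scaler.transform([[1000000.]]))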
Example #17
def main():
    dictionary_path = os.path.join(os.path.dirname(final_project.__file__),
                                   "final_project_dataset_modified.pkl")
    with io.open(dictionary_path, 'rb') as f:
        dictionary = pickle.load(f)

    ### list the features you want to look at--first item in the
    ### list will be the "target" feature
    features_list = ["bonus", "salary"]
    keys_path = os.path.join(os.path.dirname(tools.__file__),
                             'python2_lesson06_keys.pkl')
    data = featureFormat(dictionary,
                         features_list,
                         remove_any_zeroes=True,
                         sort_keys=keys_path)
    target, features = targetFeatureSplit(data)

    ### training-testing split needed in regression, just like classification
    feature_train, feature_test, target_train, target_test = train_test_split(
        features, target, test_size=0.5, random_state=42)
    train_color = "b"
    test_color = "r"

    ### Your regression goes here!
    ### Please name it reg, so that the plotting code below picks it up and
    ### plots it correctly. Don't forget to change the test_color above from "b" to
    ### "r" to differentiate training points from test points.

    reg = LinearRegression()
    reg.fit(feature_train, target_train)

    print('test score', reg.score(feature_test, target_test))
    print('train score', reg.score(feature_train, target_train))

    print('coef', reg.coef_, 'intercept', reg.intercept_)

    ### draw the scatterplot, with color-coded training and testing points

    for feature, target in zip(feature_test, target_test):
        plt.scatter(feature, target, color=test_color)
    for feature, target in zip(feature_train, target_train):
        plt.scatter(feature, target, color=train_color)

    ### labels for the legend
    plt.scatter(feature_test[0],
                target_test[0],
                color=test_color,
                label="test")
    plt.scatter(feature_train[0],
                target_train[0],
                color=train_color,
                label="train")

    ### draw the regression line, once it's coded
    plt.plot(feature_test, reg.predict(feature_test))

    # refit on the test points to compare slopes (the mini-project's outlier check)
    reg.fit(feature_test, target_test)
    plt.plot(feature_train, reg.predict(feature_train), color="y")
    print('coef', reg.coef_, 'intercept', reg.intercept_)

    plt.xlabel(features_list[1])
    plt.ylabel(features_list[0])
    plt.legend()
    plt.show()