def precompute_recall_precision(features_list, sum = False):
    """Rank features with SelectKBest, then grid-search and test every method.

    Args:
        features_list: feature names without the 'poi' label, which is
            prepended here.
        sum: when falsy, the feature-correlation plot is drawn as well.  The
            name shadows the builtin but is kept for existing callers.

    Depends on names defined outside this function: ``my_dataset``,
    ``methods``, ``sk_fold``, ``plot_feature_correlation`` and
    ``my_test_classifier``.
    """
    features_list_all = ['poi'] + features_list
    data = featureFormat(my_dataset, features_list_all, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # NOTE(review): this scaled copy is not consumed below; kept in case the
    # grid search was meant to run on it -- confirm and drop if not.
    standardized = MinMaxScaler().fit_transform(features)

    # Score the features using f_classif
    sel = SelectKBest(k='all', score_func=f_classif)
    sel.fit(features, labels)
    kbest = [(features_list[i], score, i) for i, score in enumerate(sel.scores_)]
    sorted_kbest = sorted(kbest, key=operator.itemgetter(1), reverse=True)
    print("Feature Set( {0} ) List and K-best scores:".format(len(kbest)))
    for name, score, index in sorted_kbest:
        print("{0} \t {1} {2}".format(index, name, score))
    if not sum:
        plot_feature_correlation(features, len(kbest))

    for i, method in enumerate(methods):
        pipe, params = method()
        grid_searcher = GridSearchCV(pipe, param_grid=params, cv=sk_fold, scoring='recall')
        # best_estimator_ only exists once the search has been fitted.
        grid_searcher.fit(features, labels)
        clf = grid_searcher.best_estimator_

        # The tester re-extracts features and labels from the dataset itself.
        my_test_classifier(clf, my_dataset, features_list_all, i)
# Example no. 2
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Fit ``clf`` and evaluate it on disjoint test and validation subsets.

    The data is split with StratifiedShuffleSplit (80% train / 20% held out).
    Held-out samples with an even index go to the validation subset and the
    others to the test subset.  Only the subsets of the final split are
    passed to ``fit_and_test_classifier``.

    Args:
        clf: classifier handed to ``fit_and_test_classifier``.
        dataset: data dictionary accepted by ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle-split iterations.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # configure split of test_size and train_size
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42,
                                test_size = .2, train_size = .8)

    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test       = []
        features_validation = []
        labels_test         = []
        labels_validation   = []
        for jj in test_idx:
            if jj % 2 == 0:
                features_validation.append( features[jj] )
                labels_validation.append( labels[jj] )
            else:
                # Without this branch every even sample landed in both
                # subsets and the odd ones were dropped.
                features_test.append( features[jj] )
                labels_test.append( labels[jj] )

    # Determine size of training & test sets
    fit_and_test_classifier(clf, features_train, labels_train, features_test, labels_test)
    fit_and_test_classifier(clf, features_train, labels_train, features_validation, labels_validation)
    print("features_train: {0} labels_train: {1}".format(len(features_train), len(labels_train)))
    print("features_test: {0} labels_test: {1}".format(len(features_test), len(labels_test)))
    print("features_validation: {0} labels_validation: {1}".format(
        len(features_validation), len(labels_validation)))
def test_classifier(clf, dataset, feature_list, folds = 1000,scale_features = True, std_features = False):
    """Cross-validate ``clf`` with stratified shuffle splits and report metrics.

    Confusion counts are accumulated over all folds, then accuracy,
    precision, recall, F1 and F2 are printed.

    Args:
        clf: classifier exposing ``fit`` and ``predict``.
        dataset: data dictionary accepted by ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle-split iterations.
        scale_features: apply min-max scaling to the features first.
        std_features: standardize the features (e.g. ahead of PCA).

    Returns:
        The F1 score, or None when a metric is undefined (division by zero).
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    #scale features if necessary
    if scale_features:
        features = preprocessing.MinMaxScaler().fit_transform(features)

    #standardize features for pca if necessary
    if std_features:
        features = preprocessing.StandardScaler().fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: {0}".format(clf))
        return None

    print(clf)
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
    print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
    print("")
    return f1
def makeData(dataset, feature_list, folds = 1000):
    """Make and return dataset prepared for training.

    The train/test samples of every stratified shuffle split are pooled, so
    each returned list holds the concatenation over all ``folds`` splits.

    Keyword arguments:
    dataset --- dict of dict
    feature_list --- list of strings
    folds --- int

    Returns:
    features_train, features_test, labels_train, labels_test (lists)
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for train_idx, test_idx in cv:
        features_train.extend(features[ii] for ii in train_idx)
        labels_train.extend(labels[ii] for ii in train_idx)
        features_test.extend(features[jj] for jj in test_idx)
        labels_test.extend(labels[jj] for jj in test_idx)
    return features_train, features_test, labels_train, labels_test
def test_training_stratified_split(dataset, features_list, testsize=0.2):
    """
    For E+F dataset, split dataset into the training and test
    set using stratified method.

    Args:
        dataset: data in dictionary format
        features_list: the full list of features to select from, label first
        testsize: the proportion of the dataset to include in the test split

    Returns:
        labels_train, labels_test, features_train, features_test (plain lists)
    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # Arrays allow the fancy indexing used below; labels are cast to int.
    labels = np.array([int(label) for label in labels])
    features = np.array(features)

    ### Split data into test set and training set
    sss = StratifiedShuffleSplit(labels, 1, test_size=testsize, random_state=0)

    # A single iteration was requested, so the loop body runs once.
    for train_index, test_index in sss:
        labels_train = labels[train_index].tolist()
        labels_test = labels[test_index].tolist()
        features_train = features[train_index].tolist()
        features_test = features[test_index].tolist()
    return labels_train, labels_test, features_train, features_test
def main():
    """Print feature importances of the stored classifier averaged over CV splits.

    The classifier is refitted on each stratified shuffle split and its
    ``feature_importances_`` are summed, then divided by the split count.
    """
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    n_splits = 1000
    cv = StratifiedShuffleSplit(labels, n_splits, random_state=42)

    # One running total per real feature; feature_list[0] is the label and
    # has no importance entry.
    totals = [0.0] * (len(feature_list) - 1)

    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]

        clf.fit(features_train, labels_train)
        for i, importance in enumerate(clf.feature_importances_):
            totals[i] += importance

    totals = [total / n_splits for total in totals]

    # Display results
    print("Feature list:  {0}".format(feature_list[1:]))
    print("Importances:  {0}".format(totals))
def univariateFeatureSelection(f_list, my_dataset):
    """Score each feature on its own with a Gaussian Naive Bayes classifier.

    For every feature in ``f_list`` the missing values are replaced by 0
    (this mutates ``my_dataset`` in place), the train/test samples of 1000
    stratified shuffle splits are pooled, and the pooled predictions are
    scored with the module-level ``score_func``.

    Returns:
        A list of tuples sorted descending by their element at index 3.
    """
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.naive_bayes import GaussianNB

    result = []
    for feature in f_list:
        # Replace 'NaN' (and any other falsy value) with 0
        for name in my_dataset:
            data_point = my_dataset[name]
            if not data_point[feature] or data_point[feature] == 'NaN':
                data_point[feature] = 0

        data = featureFormat(my_dataset, ['poi', feature], sort_keys = True, remove_all_zeroes = False)
        labels, features = targetFeatureSplit(data)
        features = [abs(x) for x in features]
        cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for train_idx, test_idx in cv:
            features_train.extend(features[ii] for ii in train_idx)
            labels_train.extend(labels[ii] for ii in train_idx)
            features_test.extend(features[jj] for jj in test_idx)
            labels_test.extend(labels[jj] for jj in test_idx)

        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        score = score_func(labels_test, predictions)
        # NOTE(review): the original append was lost.  This assumes score_func
        # returns a sequence of at least three values, so that index 3 used
        # by the sort below exists -- confirm against score_func.
        result.append((feature,) + tuple(score))
    result = sorted(result, reverse=True, key=lambda x: x[3])
    return result
def get_most_important_features(dataset, features_list):
    """Calculates the feature importances.

    Takes as input a dataset and a list of features.
    Creates an overfit Decision Tree and calculates the feature importances.
    Returns a list with the feature importances.
    """
    # creating an overfitted decision tree
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # new features filtered, NaN values removed
    # NOTE(review): the trailing arguments of this call were lost; test_size
    # and random_state below are placeholders -- confirm the intended values.
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    acc = accuracy_score(labels_test, pred)
    # uncomment to print the accuracy score
    #print("overfitted accuracy {0}".format(acc))

    # calculating feature importances; entry i belongs to features_list[i+1]
    # because features_list[0] is the label
    feat_imp = clf.feature_importances_
    return feat_imp
def prep_features(df, features_list, feature_scaled):
    """Build features and labels both as arrays and as pandas objects.

    Args:
        df: pandas DataFrame holding at least the columns in features_list.
        features_list: column names to keep; must include 'poi'.
        feature_scaled: when truthy, both feature outputs go through
            ``scale_features``.

    Returns:
        (features, labels, features_df_scaled, labels_df): features and
        labels as produced by ``targetFeatureSplit``, followed by the
        feature DataFrame (scaled or not) and the 'poi' column.
    """
    from feature_format import featureFormat, targetFeatureSplit

    # for pandas dataframe
    selected = df[features_list]
    features_df = selected.drop('poi', axis=1)
    labels_df = selected['poi']
    if feature_scaled:
        features_df_scaled = scale_features(features_df)
    else:
        # Without this branch the scaled frame was always overwritten.
        features_df_scaled = features_df

    # for dictionary
    data_dict_new = selected.T.to_dict()
    features_list_new = ['poi'] + list(features_df.columns)
    data = featureFormat(data_dict_new, features_list_new, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    if feature_scaled:
        features = scale_features(features)

    return features, labels, features_df_scaled, labels_df
def selectKBest(previous_result, data):
    """Score Gaussian Naive Bayes on the top-k features for k = 1..10.

    Args:
        previous_result: ranked rows whose first element is a feature name
            (presumably the output of a univariate ranking -- confirm).
        data: not read by this function; the module-level ``my_dataset`` is
            used instead.  NOTE(review): confirm which one is intended.

    Returns:
        One tuple per k, starting with k followed by the score values.
    """
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.naive_bayes import GaussianNB

    # remove 'restricted_stock_deferred' and 'director_fees'
    excluded = ('restricted_stock_deferred', 'director_fees')
    ranked = [row for row in previous_result if row[0] not in excluded]

    result = []
    _k = 10
    for k in range(0, _k):
        feature_list = ['poi'] + [row[0] for row in ranked[:k + 1]]

        formatted = featureFormat(my_dataset, feature_list, sort_keys = True, remove_all_zeroes = False)
        labels, features = targetFeatureSplit(formatted)
        features = [abs(x) for x in features]
        cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for train_idx, test_idx in cv:
            features_train.extend(features[ii] for ii in train_idx)
            labels_train.extend(labels[ii] for ii in train_idx)
            features_test.extend(features[jj] for jj in test_idx)
            labels_test.extend(labels[jj] for jj in test_idx)

        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        score = score_func(labels_test, predictions)
        # NOTE(review): the original append was lost; this assumes
        # score_func returns a sequence -- confirm the row layout.
        result.append((k + 1,) + tuple(score))
    return result
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` and print metrics plus feature importances.

    Confusion counts are accumulated over all stratified shuffle splits.
    When the fitted classifier exposes ``feature_importances_`` they are
    printed, sorted from most to least important.

    Args:
        clf: classifier exposing ``fit`` and ``predict``.
        dataset: data dictionary accepted by ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle-split iterations.
    """
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:  {0}".format(clf))
        return

    print(clf)
    # Not every estimator exposes importances.
    importance = getattr(clf, 'feature_importances_', None)

    if importance is not None:
        print("Importance:  {0}".format(importance))
        # feature_list[0] is the label, so importances pair with the rest.
        imp = sorted(zip(feature_list[1:], importance), key=lambda tup: tup[1], reverse=True)
        print("Most Important Variables: " + str(imp))

    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
    print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives,
                                       true_negatives))
    print("")
def cluster2Features():
    """Scatter-plot two finance features and draw a 2-cluster KMeans result.

    Reads the module-level ``data_dict`` and relies on the ``plt`` and
    ``Draw`` names defined elsewhere.
    """
    ### the input features we want to use
    ### can be any key in the person-level dictionary (salary, director_fees, etc.)
    feature_1 = "salary"
    feature_2 = "exercised_stock_options"
    poi  = "poi"
    features_list = [poi, feature_1, feature_2]
    data = featureFormat(data_dict, features_list )
    poi, finance_features = targetFeatureSplit( data )

    ### in the "clustering with 3 features" part of the mini-project,
    ### you'll want to change this line to
    ### for f1, f2, _ in finance_features:
    ### (as it's currently written, the line below assumes 2 features)
    for f1, f2 in finance_features:
        plt.scatter( f1, f2)

    ### cluster here; create predictions of the cluster labels
    ### for the data and store them to a list called pred
    from sklearn.cluster import KMeans
    # The estimator has to be fitted before it can predict, and it must see
    # only the finance features: `data` still carries the poi label column.
    pred = KMeans(n_clusters=2).fit_predict(finance_features)

    ### rename the "name" parameter when you change the number of features
    ### so that the figure gets saved to a different file
    try:
        Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
    except NameError:
        print("no predictions object named pred found, no clusters to plot")
def algorithm(data_dict, features_list):
    """Build an AdaBoost classifier and pickle it with the dataset and features.

    Writes ``my_classifier.pkl``, ``my_dataset.pkl`` and
    ``my_feature_list.pkl`` into the current working directory.

    Args:
        data_dict: the dataset dictionary.
        features_list: feature names; "poi" must come first.
    """
    from feature_format import featureFormat
    from feature_format import targetFeatureSplit
    ### store to my_dataset for easy export below
    my_dataset = data_dict
    data = featureFormat(my_dataset, features_list)

    ### split into labels and features (this line assumes that the first
    ### feature in the array is the label, which is why "poi" must always
    ### be first in features_list
    labels, features = targetFeatureSplit(data)

    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators = 1000, random_state = 202,
                             learning_rate = 1.0, algorithm = "SAMME.R")
    # NOTE(review): the classifier is dumped without being fitted on
    # features/labels; presumably the tester fits it -- confirm.

    ### dump your classifier, dataset and features_list so
    ### anyone can run/check your results
    # Binary mode and context managers: handles are closed deterministically
    # and the pickles are written byte-exact on every platform.
    with open("my_classifier.pkl", "wb") as clf_file:
        pickle.dump(clf, clf_file)
    with open("my_dataset.pkl", "wb") as dataset_file:
        pickle.dump(data_dict, dataset_file)
    with open("my_feature_list.pkl", "wb") as features_file:
        pickle.dump(features_list, features_file)
# Example no. 14
def find_best_parameters(pipeline, parameters, score_func, dataset, 
                         feature_list, test_size=0.2, n_iter=10):
    """
    find best parameter by using GridSearchCV with given scoring function.

    A single stratified split first sets aside ``test_size`` of the data;
    the grid search then cross-validates on the remaining training part only.

    returns GridSearchCV object that has best parameters.
    """
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)

    # One iteration requested, so this loop body runs once.
    cv = StratifiedShuffleSplit(labels, 1, test_size=test_size, random_state = 42)
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]

    sss = StratifiedShuffleSplit(labels_train, n_iter=n_iter , test_size=test_size, random_state=42)

    clf = GridSearchCV(pipeline, parameters, scoring=score_func, cv=sss, n_jobs=-1)
    clf.fit(features_train, labels_train)

    return clf
def test_classifier(clf, dataset, feature_list, scaling = False, folds = 1000):
    """Print accuracy, recall and precision of ``clf`` averaged over CV splits.

    Args:
        clf: classifier exposing ``fit`` and ``predict``.
        dataset: data dictionary accepted by ``featureFormat``.
        feature_list: feature names, label first.
        scaling: apply min-max scaling to the features first.
        folds: number of stratified shuffle-split iterations.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score

    score_all = []
    precision_all = []
    recall_all = []
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    if scaling:
        min_max_scaler = preprocessing.MinMaxScaler()
        features = min_max_scaler.fit_transform(features)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    for train_indices, test_indices in cv:
        features_train = [features[ii] for ii in train_indices]
        features_test = [features[ii] for ii in test_indices]
        labels_train = [labels[ii] for ii in train_indices]
        labels_test = [labels[ii] for ii in test_indices]
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)

        # NOTE(review): the per-fold bookkeeping was lost; these metric
        # calls are reconstructed from the averages computed below.
        score_all.append(accuracy_score(labels_test, pred))
        precision_all.append(precision_score(labels_test, pred))
        recall_all.append(recall_score(labels_test, pred))

    precision = numpy.average(precision_all)
    recall = numpy.average(recall_all)
    score = numpy.average(score_all)
    print("Score: " + str(score))
    # The two labels used to be swapped relative to the values they printed.
    print("Recall: " + str(recall))
    print("Precision: " + str(precision))
def get_k_best(df, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection
        returns dict where keys=features, values=scores

    Args:
        df: pandas DataFrame containing the columns in features_list.
        features_list: column names, label first.
        k: number of top-scoring features to return.
    """
    from feature_format import featureFormat, targetFeatureSplit
    from sklearn import preprocessing
    from sklearn.feature_selection import SelectKBest

    data_dict_new = df[features_list].T.to_dict()

    data = featureFormat(data_dict_new, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    # features_list[0] is the label, so scores pair with the remaining names.
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = sorted(unsorted_pairs, key=lambda x: x[1])[::-1]
    k_best_features = dict(sorted_pairs[:k])

    return k_best_features
# Example no. 17
def validation(clf, dataset, feature_list, test_size=0.2, n_iter=1000):
    """
    validate given classifier using stratified shuffle split cross validation.

    The classifier is refitted on every split; precision and recall are
    computed per split and then averaged.

    returns average precision and recall
    """
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)

    precision = []
    recall = []

    cv = StratifiedShuffleSplit(labels, n_iter, test_size=test_size, random_state = 42)
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        precision.append(precision_score(labels_test, predictions))
        recall.append(recall_score(labels_test, predictions))

    return np.mean(precision), np.mean(recall)
# Example no. 18
def main():
    """Load the stored classifier, dataset and feature list and run the tester."""
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    ### Run testing script
    # test_classifier formats the dataset itself, so it receives the raw
    # dataset and feature list rather than pre-split features and labels.
    test_classifier(clf, dataset, feature_list)
def get_k_best_features(data_dict, features_list, k):
    """
    runs scikit-learn's SelectKBest feature selection to get k best features

    Args:
        data_dict: data dictionary for enron
        features_list: a list of features with first feature as target label
        k: Number of best features which need to be selected

    Returns:
        a list of k best features and list of lists where inner list's
        first element is feature and the second element is feature score
    """
    data = featureFormat(data_dict, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = sorted(unsorted_pairs, key=lambda x: x[1])[::-1]
    k_best_features = dict(sorted_pairs[:k])
    # Materialise real lists so the result is the same on Python 2 and 3.
    return list(k_best_features.keys()), [list(pair) for pair in sorted_pairs]
def regressionBonusAndLongTermInc():
    """Fit a linear regression of bonus on long_term_incentive and print its test score.

    Reads the module-level ``dictionary`` dataset.
    """
    ### list the features you want to look at--first item in the
    ### list will be the "target" feature
    features_list = ["bonus", "long_term_incentive"]
    data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
    target, features = targetFeatureSplit( data )

    ### training-testing split needed in regression, just like classification
    from sklearn.cross_validation import train_test_split
    feature_train, feature_test, target_train, target_test = train_test_split(
        features, target, test_size=0.5, random_state=42)

    from sklearn import linear_model
    ### name your regression reg
    reg = linear_model.LinearRegression()
    # The model has to be fitted before it can be scored.
    reg.fit(feature_train, target_train)

    #find the score on the test data
    print(reg.score(feature_test, target_test))
# Example no. 21
def tune_classifier(clf_name, clf, dataset, features_list, scores, folds = 1000):
    """Grid-search hyper-parameters for a named classifier.

    Args:
        clf_name: 'kNN' or 'Decision Tree'; selects the parameter grid.
        clf: the estimator to tune.
        dataset: data dictionary accepted by ``featureFormat``.
        features_list: feature names, label first.
        scores: metric names; each is searched as "<name>_weighted".
        folds: number of stratified shuffle-split iterations.

    Returns:
        The best parameters found for the last entry of ``scores``, or None
        when ``scores`` is empty.

    Raises:
        ValueError: if no parameter grid is defined for ``clf_name``.
    """
    parameter_grids = {
        'kNN': [{'p': [1, 2, 3],
                 'n_neighbors': [1, 5, 7, 10, 15],
                 'leaf_size': [30, 50, 70, 100]}],
        'Decision Tree': [{'min_samples_split': [2, 3, 4, 5],
                           'min_samples_leaf': [2, 3, 4, 5],
                           'splitter': ['random', 'best']}],
    }
    if clf_name not in parameter_grids:
        # Fail with a clear message instead of an unbound-name error later.
        raise ValueError("No parameter grid defined for classifier {0!r}".format(clf_name))
    parameter_grid = parameter_grids[clf_name]

    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    if clf_name in {'kNN', 'SVM', 'kNN (hand-tuned)'}:
        # Perform feature scaling
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    best_params = None
    for score in scores:
        grid_clf = GridSearchCV(clf, parameter_grid, cv=cv,
                                scoring="{0}_weighted".format(score))
        grid_clf.fit(features, labels)
        best_params = grid_clf.best_params_
    print("Classifier {0} has tuned parameters {1}".format(clf_name, best_params))
    return best_params
# Example no. 22
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` on min-max scaled data and print its metrics.

    Every column of the formatted data is rescaled to [0, 1] before the
    stratified shuffle splits are drawn.  Confusion counts are accumulated
    over all folds.

    Args:
        clf: classifier exposing ``fit`` and ``predict``.
        dataset: data dictionary accepted by ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle-split iterations.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    ## Tester lacks feature scaling, lets put it here:

    # Scale features:
    mins = np.min(data, axis=0)
    maxs = np.max(data, axis=0)
    ranges = maxs - mins
    # A constant column has zero range; dividing by 1 maps it to 0 rather
    # than filling it with NaN.
    ranges = np.where(ranges == 0, 1.0, ranges)
    data = (data - mins) / ranges

    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stop tallying this fold once an invalid label shows up.
                break

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: {0}".format(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
        return

    print(clf)
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
    print(RESULTS_FORMAT_STRING.format(
        total_predictions, true_positives, false_positives, false_negatives, true_negatives
    ))
    print("")
def select_k_best_features(data, feature_list, k):
    """
    For E+F dataset, select k best features based on SelectKBest.

    Args:
        data: data in dictionary format
        feature_list: the full list of features to select from, 'poi' first
        k: the number of features to keep

    Returns:
        the list of length of k+1 with the first element as 'poi' and other
        k best features
    """
    # Use the dataset that was passed in rather than a module-level global.
    formatted = featureFormat(data, feature_list)
    labels, features = targetFeatureSplit(formatted)
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    impt_unsorted = zip(feature_list[1:], k_best.scores_)
    impt_sorted = sorted(impt_unsorted, key=lambda x: x[1], reverse=True)
    k_best_features = [elem[0] for elem in impt_sorted][:k]
    print("{0} best features:".format(k))
    print(k_best_features)
    return ['poi'] + k_best_features
# Example no. 24
def tuner(clf, parameters, data):
    """Grid-search a scale -> select -> classify pipeline and print the result.

    Args:
        clf: the final estimator of the pipeline.
        parameters: parameter grid passed to GridSearchCV.
        data: formatted array accepted by ``targetFeatureSplit``.

    The scorer is built from the module-level ``my_score_func``.
    """
    from sklearn.model_selection import GridSearchCV
    labels, features = targetFeatureSplit(data)
    scaler = MinMaxScaler()
    select = SelectKBest()

    steps = [("scale", scaler),
             ("select", select),
             ("classifier", clf)]
    pipeline = Pipeline(steps)

    # NOTE(review): the tail of this call was lost; random_state=42 is a
    # placeholder for reproducibility -- confirm the intended value.
    shuffle = StratifiedShuffleSplit(n_splits=1000, test_size=0.3,
                                     random_state=42)
    my_scorer = make_scorer(my_score_func)
    scoring_metric = my_scorer
    grid_searcher = GridSearchCV(pipeline, param_grid=parameters,
                                 cv=shuffle, scoring=scoring_metric)

    # NOTE(review): features are pre-selected on the full data before the
    # search, ahead of the pipeline's own selection step -- confirm intent.
    features = select.fit_transform(features, labels)
    grid_searcher.fit(features, labels)

    print("Cross-validated {0} score: {1}".format(scoring_metric,
                                                  grid_searcher.best_score_))

    print("Params:  {0}".format(grid_searcher.best_params_))
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Cross-validate ``clf`` with stratified shuffle splits and print metrics.

    Confusion counts are accumulated over all folds, then accuracy,
    precision, recall, F1 and F2 are printed.

    Args:
        clf: classifier exposing ``fit`` and ``predict``.
        dataset: data dictionary accepted by ``featureFormat``.
        feature_list: feature names, label first.
        folds: number of shuffle-split iterations.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stop tallying this fold once an invalid label shows up.
                break

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: {0}".format(clf))
        return

    print(clf)
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
    print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
    print("")
# Example no. 26
def ptest(clf, dataset, feature_list, folds = 1000):
    """Return the precision of ``clf`` pooled over stratified shuffle splits.

    The classifier is re-fitted on the training part of every split, its
    predictions on the test part are tallied, and precision is computed once
    from the pooled counts.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        dataset: data dictionary handed to ``featureFormat``.
        feature_list: feature names handed to ``featureFormat``; labels and
            features are then separated by ``targetFeatureSplit``.
        folds: number of splits passed to ``StratifiedShuffleSplit``.

    Returns:
        ``true_positives / (true_positives + false_positives)`` as a float.

    Raises:
        ZeroDivisionError: if the classifier never predicts the label 1.
    """
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    # Only the counts that precision needs are tracked.
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 1 and truth == 1:
                true_positives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
    precision = 1.0*true_positives/(true_positives+false_positives)
    return precision
def make_feature_histograms(dataset, features_list):
    """Save one POI vs. non-POI histogram per feature into a ``hists`` folder.

    For every feature after the first entry of ``features_list`` the non-zero
    values are split by label (0.0 / 1.0), plotted with shared bins, and
    annotated with summary statistics and up to five outliers on each side.
    The figure is written to ``hists/<feature>_histogram.png`` next to this
    file.

    Args:
        dataset: data dictionary handed to ``featureFormat``.
        features_list: feature names; the first entry is skipped when naming
            the plotted columns.
    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    hist_dir = os.path.join(os.path.dirname(__file__), 'hists')
    if not os.path.exists(hist_dir):
        os.makedirs(hist_dir)
    for feature, i in zip(features_list[1:], range(len(features[0]))):
        feature_values_non_poi = [f[i] for f, l in zip(features, labels) if l == 0.0]
        feature_values_poi = [f[i] for f, l in zip(features, labels) if l == 1.0]
        feature_values = feature_values_non_poi + feature_values_poi
        non_zero_values_non_poi = [x for x in feature_values_non_poi if x != 0.0]
        non_zero_values_poi = [x for x in feature_values_poi if x != 0.0]
        non_zero_values = non_zero_values_non_poi + non_zero_values_poi
        if not non_zero_values:
            # Nothing to plot; max()/min() below would raise on an empty list.
            continue
        q1, q3 = np.percentile(non_zero_values, [25, 75])
        iqr = q3 - q1
        outliers_hi = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x > q3]
        outliers_lo = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x < q1]
        # get same binwidth for both POI and non-POI
        bins = np.histogram(non_zero_values, bins=50)[1]
        plt.hist(non_zero_values_poi, bins=bins, alpha=.5, lw=0, color='r', label='POIs')
        plt.hist(non_zero_values_non_poi, bins=bins, alpha=.5, lw=0, color='b', label='Non-POIs')
        msg = ('Maximum %s: %d\n' % (feature, max(non_zero_values)) +
               'Minimum %s: %d\n' % (feature, min(non_zero_values)) +
               'Mean %s: %.5f\n' % (feature, np.mean(non_zero_values)) +
               'Median %s: %d\n' % (feature, np.median(non_zero_values)) +
               '\nTotal Number of Values: %d\n' % len(feature_values) +
               'Total Number of Non-Zero Values: %d\n' % len(non_zero_values))
        # print out some outlier values if they exist
        for outliers, which_ols in zip([outliers_hi, outliers_lo], ['Top', 'Bottom']):
            if outliers:
                # Show at most five values per side.
                top_n = min(5, len(outliers))
                outliers = sorted(outliers)
                ol_line = q1 - 1.5*iqr
                if which_ols == 'Top':
                    # Largest values first for the upper tail.
                    outliers = list(reversed(outliers))
                    ol_line = q3 + 1.5*iqr
                msg += '\n%s %d Outliers: ' % (which_ols, top_n)
                msg += ', '.join('%d' % value for value in outliers[:top_n])
                plt.axvline(ol_line, lw=.5, ls='--', c='r')
        plt.figtext(.3, .4, msg)
        plt.title("%s histogram (non-zero values)" % feature)
        # Save into the directory created above, then reset the figure so the
        # next feature does not draw on top of this one.
        figname = os.path.join(hist_dir, '%s_histogram.png' % feature)
        plt.savefig(figname)
        plt.clf()
def get_k_best(dictionary, features_list, k):
    """Score features with scikit-learn's SelectKBest and return the top ``k``.

    Args:
        dictionary: data dictionary handed to ``featureFormat`` and
            ``get_nan_counts``.
        features_list: feature names; the first entry is skipped when the
            scores are paired with names.
        k: number of features to keep (also passed to ``SelectKBest``).

    Returns:
        The first ``k`` rows of a DataFrame sorted by descending ``score``,
        merged on ``feature`` with the output of ``get_nan_counts`` and with
        infinite scores removed.
    """
    data = featureFormat(dictionary, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    # list() so the pairs are materialised on Python 3 as well.
    pairs = list(zip(features_list[1:], scores))
    #combined scores and features into a pandas dataframe then sort
    k_best_features = pd.DataFrame(pairs, columns=['feature', 'score'])
    # DataFrame.sort() was removed from pandas; sort_values() is its replacement.
    k_best_features = k_best_features.sort_values('score', ascending=False)
    #merge with null counts
    df_nan_counts = get_nan_counts(dictionary)
    k_best_features = pd.merge(k_best_features, df_nan_counts, on='feature')
    #eliminate infinite values
    k_best_features = k_best_features[~np.isinf(k_best_features.score)]
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('Feature Selection by k_best_features\n')
    print("{0} best features in descending order: {1}\n".format(k, k_best_features.feature.values[:k]))
    print('{0}\n'.format(k_best_features[:k]))
    return k_best_features[:k]
def select_k_best(data_dict, features_list, k):
    """Return a ``{feature: score}`` dict of the best SelectKBest features.

    Features are min-max scaled before scoring.

    Args:
        data_dict: data dictionary handed to ``featureFormat``.
        features_list: feature names; the first entry is skipped when the
            scores are paired with names.
        k: number of features to keep, or the string ``"all"`` to keep every
            scored feature.

    Returns:
        dict mapping feature name to its SelectKBest score.
    """
    # Create dataset from feature list
    data = featureFormat(data_dict, features_list)
    # Split dataset into labels and features
    labels, features = targetFeatureSplit(data)
    # Scale features to a common range before scoring
    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)
    # Create and fit the k_best feature selection
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    # Get k_best scores
    scores = k_best.scores_
    # Pair feature names with scores, highest score first
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    # "all" cannot be used as a slice bound, so it gets its own branch
    if k == "all":
        k_best_features = dict(sorted_pairs)
    else:
        k_best_features = dict(sorted_pairs[:k])
    return k_best_features
Esempio n. 30
# Each article is stored as a [label, article] pair; both sources below are
# labelled 1 and end up in right_wing_data.
breitbart_data = []
with open('breitbart_articles.pkl', 'rb') as f:
    breitbart = pickle.load(f)
for article in breitbart:
    breitbart_data.append([1, article])

newsmax_data = []
with open('newsmax_articles.pkl', 'rb') as f:
    newsmax = pickle.load(f)
for article in newsmax:
    newsmax_data.append([1, article])

right_wing_data = fox_data + observer_data + breitbart_data + newsmax_data

data = left_wing_data + right_wing_data
labels, articles = targetFeatureSplit(data)
# NOTE(review): test_size=0.001 leaves almost nothing for testing -- confirm
# this is intentional.
articles_train, articles_test, labels_train, labels_test = train_test_split(
    articles, labels, test_size=0.001, random_state=42)

flat_train = []
flat_test = []

# The split yields nested sequences (one sub-list per sample); flatten them so
# the vectorizer receives the articles themselves.
for sublist in articles_train:
    for article in sublist:
        flat_train.append(article)
for sublist in articles_test:
    for article in sublist:
        flat_test.append(article)

vectorizer = TfidfVectorizer(strip_accents="unicode", lowercase=False)
vectors = vectorizer.fit_transform(flat_train)
Esempio n. 31

### load in the dict of dicts containing all the data on each person in the dataset
# A context manager closes the file handle once the pickle has been read.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)
### there's an outlier--remove it!
data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
poi = "poi"
features_list = [poi, feature_1, feature_2]
data = featureFormat(data_dict, features_list)
# NOTE: from here on ``poi`` holds the label values, not the feature name.
poi, finance_features = targetFeatureSplit(data)

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
    plt.scatter(f1, f2)

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
model = cluster.KMeans(n_clusters=2)
# The model has to be fitted before predict() can assign cluster labels.
model.fit(finance_features)
pred = model.predict(finance_features)
Esempio n. 32
    if data_dict[keys]["from_poi_to_this_person"] == "NaN" or data_dict[keys][
            "from_this_person_to_poi"] == "NaN":
        data_dict[keys]["contact_poi"] = "NaN"
        data_dict[keys]["contact_poi"] = int(
            data_dict[keys]["from_poi_to_this_person"]) + int(

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Use SelectKBest to choose which feature to use for machine learning

k = 5
selKBest = SelectKBest(f_regression, k=k)
# Fit in place so get_support() and scores_ are available below.
selKBest.fit(features, labels)

# Boolean mask of the k selected features.
mask = selKBest.get_support()

# Pair every candidate feature (label name skipped) with its score and order
# the pairs from highest to lowest score.
scores = selKBest.scores_
feature_score = zip(features_list[1:], scores)
feature_score = list(reversed(sorted(feature_score, key=lambda x: x[1])))
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` and print its pooled performance metrics.

    The classifier is re-fitted on every stratified shuffle split; the
    confusion counts of all splits are pooled and accuracy, precision,
    recall, F1 and F2 are printed through ``PERF_FORMAT_STRING`` and
    ``RESULTS_FORMAT_STRING``.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        dataset: data dictionary handed to ``featureFormat``.
        feature_list: feature names handed to ``featureFormat``.
        folds: number of splits for ``StratifiedShuffleSplit``.

    Returns:
        None. Results are printed; a zero denominator is reported instead of
        raised.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # The inital script raised an error : StratifiedShuffleSplit not iterable
    # I rewrote the cv StratifiedShuffleSplit object with the same parameters according to sklearn doc:
    #cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    #for train_idx, test_idx in cv:
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                # Single-argument print(...) gives the same output on
                # Python 2 and Python 3.
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy,
                                        precision,
                                        recall,
                                        f1,
                                        f2,
                                        display_precision=5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: %s" % (clf,))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
Esempio n. 34
def main():
    """Run the POI-identifier workflow end to end.

    Steps, in order: build the feature list, load the pickled dataset, print
    summary statistics, drop three outlier records, add two email-ratio
    features, score features with SelectKBest, try four classifiers with
    ``run_classifier``, grid-search four pipelines with
    ``fine_tune_algorithm`` and dump the tuned GaussianNB pipeline with
    ``dump_classifier_and_data``.
    """
    ### Task 1: Select what features you'll use.
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    financial_features = ['salary', 'deferral_payments', 'total_payments',
                          'loan_advances', 'bonus', 'restricted_stock_deferred',
                          'deferred_income', 'total_stock_value', 'expenses',
                          'exercised_stock_options', 'other', 'long_term_incentive',
                          'restricted_stock', 'director_fees'] #(all units are in US dollars)

    email_features = ['to_messages', 'from_poi_to_this_person',
                      'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
    #(units are generally number of emails messages; notable exception is 'email_address',
    # which is a text string)
    #email_address feature was removed from list

    poi_label = ['poi'] ###(boolean, represented as integer)

    features_list = poi_label + email_features + financial_features

    ### Load the dictionary containing the dataset
    with open("final_project_dataset_unix.pkl", "rb") as data_file:
        data_dict = pickle.load(data_file)
    #convert to a pandas dataframe for exploratory analysis
    df = pd.DataFrame.from_dict(data_dict, orient='index')

    #convert string 'NaN' to actual np.nan
    for label in df.columns:
        if label == 'email_address':
            # Text column: replace the placeholder by value. Assigning through
            # df[label][value] would index by the string 'NaN' instead.
            df[label] = df[label].replace('NaN', np.nan)
        else:
            # Numeric columns: anything non-numeric (including 'NaN') -> NaN.
            df[label] = pd.to_numeric(df[label], errors='coerce')

    ### Investigate contents of dataset:
    # Total Number of data points
    total_people = df.shape[0]
    print('The total number of data points (people) in our data set is {}.\n'
          .format(total_people))

    # Total Number of Features Used
    all_features = df.shape[1]
    print('There are {} features for each person in our dataset.\n'
          .format(all_features))

    # Total Number of Persons Of Interest (POIs)
    poi_count = df['poi'][(df['poi'] == True)].count()
    print('Our dataset has {} persons of interest.\n'.format(poi_count))

    # Total Number of Non-POIs
    non_poi_count = total_people - poi_count
    print('Our dataset has {} Non persons of interest.\n'.format(non_poi_count))

    # Features with missing values?
    print('The following categories have missing values (NaN values)\n')
    print(df.isna().sum())

    ### Task 2: Remove outliers

    #visualize_features('salary', 'bonus', data_dict)
    #visualize_features('from_poi_to_this_person', 'from_this_person_to_poi', data_dict)
    #visualize_features('loan_advances', 'total_stock_value', data_dict)

    print('Searching for Outliers...')
    find_outlier('salary', df)
    print()
    find_outlier('bonus', df)
    find_outlier('from_poi_to_this_person', df)
    print()
    find_outlier('from_this_person_to_poi', df)
    print()
    find_outlier('loan_advances', df)
    print()
    find_outlier('total_stock_value', df)

    #get a count of number of NaN columns for each person
    nan_count = df.isna().sum(axis=1)

    print('\nThe top 5 people by number of NaN columns are:\n')
    print(nan_count.sort_values(ascending=False).head(5))

    print('\nLooking closer at Eugene Lockhart...\n')
    print(df.loc['LOCKHART EUGENE E'])

    print('\nLooking closer at THE TRAVEL AGENCY IN THE PARK...\n')
    print(df.loc['THE TRAVEL AGENCY IN THE PARK'])

    ### Remove outliers
    df = df.drop(['TOTAL'], axis=0)
    df = df.drop(["LOCKHART EUGENE E"], axis=0)
    df = df.drop(["THE TRAVEL AGENCY IN THE PARK"], axis=0)

    #replace NaN with 0
    df = df.fillna(0)

    ### Task 3: Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = df.to_dict('index')

    for person in my_dataset:
        record = my_dataset[person]
        to_poi_count = record['from_this_person_to_poi']
        from_poi_count = record['from_poi_to_this_person']
        # NOTE(review): going by the field names, 'from_messages' counts mail
        # sent by the person and 'to_messages' mail received -- confirm against
        # the dataset documentation.
        total_sent_emails = record['from_messages']
        total_received_emails = record['to_messages']
        # Guard the divisions: NaN counts were replaced by 0 above.
        if total_sent_emails > 0:
            record['to_poi_ratio'] = float(to_poi_count) / total_sent_emails
        else:
            record['to_poi_ratio'] = 0
        if total_received_emails > 0:
            record['from_poi_ratio'] = float(from_poi_count) / total_received_emails
        else:
            record['from_poi_ratio'] = 0

    features_list = features_list + ['to_poi_ratio', 'from_poi_ratio']

    ### Preprocessing

    ### Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    #Scaling features (normalizing all features)
    min_max_scaler = MinMaxScaler()
    features = min_max_scaler.fit_transform(features)

    ### Select the best features:
    # Removes all but the k highest scoring features
    n = 6 # adjust for optimization
    skb = SelectKBest(f_classif, k=n)
    skb.fit_transform(features, labels)
    #pprint(sorted(skb.scores_, reverse=True))

    #skip poi feature and combine with returned scores (key:value --> feature:score)
    scores = zip(features_list[1:], skb.scores_)

    #sort by highest scoring feature from scores
    sorted_scores = sorted(scores, key = lambda x: x[1], reverse=True)
    #add k highest scoring features to create new features_list
    new_features_list = poi_label + [name for name, _ in sorted_scores[:n]]

    ### Extract features and labels from dataset using optimized features_list
    data = featureFormat(my_dataset, new_features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    ### Task 4: Try a variety of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines.

    print('\nRunning GaussianNB classifier...')
    run_classifier(GaussianNB(), features, labels)

    print('\nRunning SVM classifier...')
    run_classifier(SVC(), features, labels)

    print('\nRunning AdaBoost classifier...')
    run_classifier(AdaBoostClassifier(), features, labels)

    print('\nRunning DecisionTree classifier...')
    run_classifier(DecisionTreeClassifier(), features, labels)

    ### Task 5: Tune your classifier to achieve better than .3 precision and recall
    ### using our testing script. Check the script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation.

    ### Re-Extract features and labels from dataset for local testing
    # The full features_list is used again here: each pipeline below does its
    # own scaling and SelectKBest step.
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # Candidate values of k shared by every SelectKBest pipeline step
    skb_k_values = [2, 3, 4, 5, 6, 7, 8, 9, 10]

    # Adjust SVM parameters to refine accuracy
    # variables will be passed to fine_tune_algorithm to use in a Pipeline
    print('\nThe best fit SVM has the following scores:\n')
    svm_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                 ('SVM', SVC())]
    svm_parameters = {'SVM__kernel': ('linear', 'rbf'),
                      'SVM__C': [0.001, 0.01, .1, 1, 10, 100, 1000],
                      'SVM__gamma': [0.01, .1, 1, 10, 100, 1000],
                      'SKB__k': list(skb_k_values)}
    svm_clf = fine_tune_algorithm(svm_steps, svm_parameters, features, labels)

    # Adjust DecisionTreeClassifier parameters to refine accuracy
    print('\nThe best fit DecisionTreeClassifer has the following scores:\n')
    dt_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('DT', DecisionTreeClassifier())]
    dt_parameters = {'DT__criterion': ('gini', 'entropy'),
                     'SKB__k': list(skb_k_values)}
    dt_clf = fine_tune_algorithm(dt_steps, dt_parameters, features, labels)

    # Adjust AdaBoostClassifier parameters to refine accuracy
    # variables will be passed to fine_tune_algorithm to use in a Pipeline
    print('\nThe best fit AdaBoostClassifier has the following scores:\n')
    ab_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('AB', AdaBoostClassifier())]
    ab_parameters = {'AB__algorithm': ('SAMME', 'SAMME.R'),
                     'AB__learning_rate': [.5, .6, .7, .8, .9, 1],
                     'SKB__k': list(skb_k_values)}
    ada_clf = fine_tune_algorithm(ab_steps, ab_parameters, features, labels)

    # Adjust GaussianNB parameters to refine accuracy
    print('\nThe best fit GaussianNB Classifier has the following scores:\n')
    nb_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('NB', GaussianNB())]
    nb_parameters = {'SKB__k': list(skb_k_values)}
    nb_clf = fine_tune_algorithm(nb_steps, nb_parameters, features, labels)

    #final best fitting classifier
    clf = nb_clf

    ### Task 6: Dump your classifier, dataset, and features_list so anyone can
    ### check your results. You do not need to change anything below, but make sure
    ### that the version of that you submit can be run on its own and
    ### generates the necessary .pkl files for validating your results.

    dump_classifier_and_data(clf, my_dataset, features_list)
Esempio n. 35
# In[37]:

#With all features
from time import time
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# In[38]:

# Baseline split: the original feature set only (no engineered features).
data_old = featureFormat(my_dataset, old_features, sort_keys=True)
labels_old, features_old = targetFeatureSplit(data_old)
(features_train_old, features_test_old,
 labels_train_old, labels_test_old) = train_test_split(
    features_old,
    labels_old,
    test_size=0.3,
    random_state=42,
)

# In[39]:

# Same 70/30 split with the same seed, this time on the full features_list.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

# In[40]:

from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
Esempio n. 36
        my_dataset[names]['messages_from_poi'] = 0
    if float(to_messages) != 0 and deferral_payments != 0:
            'messages_to_poi/deferral_payments'] = from_this_person_to_poi / float(
                to_messages * deferral_payments)
        my_dataset[names]['messages_to_poi/deferral_payments'] = 0

# NOTE(review): the bracketed list was empty in this copy although the comment
# below speaks of two new features; 'messages_from_poi' is the other key this
# script writes into my_dataset -- confirm it is the intended entry.
features_list_new = POI_label + financial_features + email_features_number + [
    'messages_from_poi'
] + ['messages_to_poi/deferral_payments']
#print "The List with all features with 2 new ones is:", features_list_new

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list_new, sort_keys=True)
labels, features = targetFeatureSplit(data)

from sklearn.feature_selection import SelectKBest, f_classif
featureSelecting = SelectKBest(f_classif, k=10)
# Fit in place so get_support() and scores_ are available below.
featureSelecting.fit(features, labels)
featureSelected = featureSelecting.get_support()
# (score, feature name, selected?) triples, highest score first.
scores = zip(featureSelecting.scores_, features_list_new[1:], featureSelected)
scoresSorted = sorted(scores, reverse=True)
#print "Scroes are:", scoresSorted
# Recorded output of an earlier run (first rows only):
#     scoresSorted =
#     [(25.09754152873549, 'exercised_stock_options', True),
#     (24.4676540475264, 'total_stock_value', True),
#     (21.06000170753657, 'bonus', True),
#     (18.575703268041785, 'salary', True),
#     (11.5955476597306, 'deferred_income', True),
Esempio n. 37
new_feature_2_inputs_add('total_poi_emails', 'to_and_from_poi_emails',
                            'from_this_person_to_poi', 'to_messages')
                            'from_poi_to_this_person', 'from_messages')
new_feature_4_inputs_divide('percent_poi_emails', 'from_poi_to_this_person',
                            'from_this_person_to_poi', 'to_messages',

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

#Draw a plot comparing two features: f1_name and f2_name, along with their prediction line: pred.
def Draw(pred,
         f1_name="feature 1",
         f2_name="feature 2"):

    #plot each cluster with a different color--add more colors for
    #drawing more than five clusters
    colors = ["b", "c", "k", "m", "g"]
    for ii, pp in enumerate(pred):
Esempio n. 38

# The project requires 'poi' to be the first element of the final feature list.
# NOTE(review): only 'poi' is placed in the list here; whatever appends the
# selected features is not visible in this excerpt -- confirm before relying
# on the extraction below.
theFinalList = ['poi']

sep("*", "the final feature list")
print theFinalList

# features_list is now final and is what the feature_format helpers consume.
features_list = theFinalList

### Extract features and labels from dataset for local testing
data = feature_format.featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = feature_format.targetFeatureSplit(data)

### Task 4: Try a varity of classifiers

def use_decision_tree_clf():
    This function uses Decision tree classifier in addition to Grid search cross validation
    print "This is the use_decision_tree_clf() method"
    from sklearn import tree
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import precision_recall_fscore_support

    param = {
        'max_depth': [1, 2, 3, 9],
def estimator_evaluator1(clf, dataset, feature_list, folds):
    """Evaluate ``clf`` with stratified k-fold CV and print pooled metrics.

    The classifier is re-fitted on every fold; the confusion counts of all
    folds are pooled and accuracy, precision, recall, F1 and F2 are computed
    from them. The classifier and ``RESULTS_FORMAT_STRING`` (the pooled
    counts) are printed.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        dataset: data dictionary handed to ``featureFormat``.
        feature_list: feature names handed to ``featureFormat``.
        folds: number of folds for ``StratifiedKFold``.

    Returns:
        None. Results are printed; a zero denominator is reported instead of
        raised.
    """
    from feature_format import featureFormat, targetFeatureSplit
    # NOTE: sklearn.cross_validation is the pre-0.18 API and was removed in
    # scikit-learn 0.20; this function needs an older release as written.
    from sklearn.cross_validation import StratifiedKFold
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedKFold(labels, n_folds=folds, random_state=30)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        # The individual metrics are evaluated so that an undefined one
        # triggers the divide-by-zero report below.
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
Esempio n. 40
### Task 2: Remove outliers
# Drop the 'TOTAL' record; a missing key raises KeyError.
data_dict.pop('TOTAL')

### Task 3: Create new feature(s)
### The augmented dictionary is also stored as my_dataset for easy export below.
my_dataset = data_dict = hf.add_poi_mail_features(data_dict)
# features_list.append('from_poi_pct')
# features_list.append('to_poi_pct')

### Extract all features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Optional diagnostics, left disabled:
### dimensions of the data
# print np.array(data).shape
# print np.sum(labels)

### plot all original variables
# hf.plot_features(features_list, data_all)

### top 5 extreme observations for "loan_advances" and "total_payments"
# pprint.pprint(hf.return_sorted_values(data_dict, "loan_advances", 5))
# pprint.pprint(hf.return_sorted_values(data_dict, "total_payments", 5))

### Lasso selection plot
# hf.lasso_selection(features, labels, features_list)
Esempio n. 41
new_data = pd.DataFrame(my_dataset.values())[features_list]
new_data.index = my_dataset.keys()
new_data['new_total_stock'] = new_data['exercised_stock_options'] + new_data[
# Convert the DataFrame back into a {row label: {column: value}} dictionary.
# The column list is built once and each row is read once, instead of
# re-reading the row for every column.
new_dataset = {}
key = list(new_data.index)
key_v = list(new_data.columns.values)
for person in key:
    new_dataset[person] = dict(zip(key_v, new_data.loc[person]))
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
features_list_new = [
    'poi', 'bonus', 'exercised_stock_options', 'expenses', 'from_messages',
    'from_poi_to_this_person', 'from_this_person_to_poi', 'other',
    'restricted_stock', 'salary', 'shared_receipt_with_poi', 'to_messages',
data_new = featureFormat(new_dataset, features_list_new, sort_keys=True)
labels_new, features_new = targetFeatureSplit(data_new)
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)
features_new = scaler.fit_transform(features_new)
from numpy import mean
from sklearn import cross_validation
from sklearn.metrics import accuracy_score, precision_score, recall_score
Esempio n. 42
    for k, v in data_dict[key].items():
        if v == 'NaN':
            data_dict[key][k] = 0

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)

from sklearn import preprocessing

# Split first, then scale the features only. Fitting the scaler on the full
# matrix would also centre and rescale the label column whenever that column's
# median or inter-quartile range is non-zero.
labels, features = targetFeatureSplit(data)
scaler = preprocessing.RobustScaler()
features = scaler.fit_transform(features)
# for point in data:
#     salary = point[4]
#     bonus = point[2]
#     matplotlib.pyplot.scatter( salary, bonus )
# matplotlib.pyplot.xlabel("salary")
# matplotlib.pyplot.ylabel("bonus")

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
def test_classifier(clf, dataset, feature_list, folds=1000):
    # extract the features specified in features_list
    data = featureFormat(dataset, feature_list, sort_keys=True)
    # split into labels and features (this line assumes that the first
    # feature in the array is the label, which is why "poi" must always
    # be first in the features list
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
        for jj in test_idx:

        ### fit the classifier using training set, and test on test set, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        # print clf
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
        return clf
        print("Got a divide by zero when trying out:", clf)
            "Precision or recall may be undefined due to a lack of true positive predicitons."
Esempio n. 44
            'from_this_person_to_poi'] / data_dict[i]['from_messages']
        data_dict[i]['from_this_person_to_poi_ratio'] = 'NaN'

features_list = [
    'poi', 'salary', 'deferral_payments', 'loan_advances', 'bonus',
    'restricted_stock_deferred', 'deferred_income', 'expenses',
    'exercised_stock_options', 'other', 'long_term_incentive',
    'restricted_stock', 'director_fees', 'to_messages',
    'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
    'shared_receipt_with_poi', 'from_poi_to_this_person_ratio',

# Build the numeric array for the selected features and separate the target
# (first entry of features_list) from the remaining feature columns.
data_array = featureFormat(data_dict, features_list)
poi, features = targetFeatureSplit(data_array)

# Data Split for train and test: hold out 30% of the samples, fixed seed.

features_train, features_test, labels_train, labels_test = train_test_split(
    features, poi, test_size=0.3, random_state=42)

# Feature scaling
# Fit the scaler on the training set only, then apply the same min/max to
# the test set. Refitting on the test set would scale the two sets
# inconsistently and leak test-set statistics into the evaluation.
scaler = MinMaxScaler()
rescaled_features_train = scaler.fit_transform(features_train)
rescaled_features_test = scaler.transform(features_test)

# Feature selection with SelectKBest

from sklearn.feature_selection import SelectKBest
Esempio n. 45
def my_test_classifier(clf, dataset, feature_list, folds=1000):
    from sklearn.cross_validation import StratifiedShuffleSplit
    \tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
    Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"

    RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
    \tFalse negatives: {:4d}\tTrue negatives: {:4d}"

    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
        for jj in test_idx:

        ### fit the classifier using training set, and test on test set, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"

        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        results = (clf,
                                                true_negatives), precision,
                   recall, accuracy, f1, f2)
        results = (
            clf, "Got a divide by zero when trying out:",
            "Precision or recall may be undefined due to a lack of true positive predicitons.",
            0, 0)

    return results
df['to_poi_rate'] = df['from_this_person_to_poi'] / df['from_messages']
df['from_poi_rate'] = df['from_poi_to_this_person'] / df['to_messages']
new_feat_list = ['from_messages_median_pubIndex', 'to_poi_median_pubIndex']
df = pd.concat([df, df_new], axis=1)
df[new_feat_list] = df[new_feat_list].fillna(

# Assemble the full feature list by concatenating the label list, the
# financial and email feature lists, the two rate features and new_feat_list.
features_list = (poi_label + financial_feat_list + email_feat_list +
                 ['to_poi_rate', 'from_poi_rate'] + new_feat_list)
# The count is reduced by one; presumably this excludes the single label
# entry contributed by poi_label - confirm against its definition.
print("Total number of features: ", len(features_list) - 1)

### Store to my_dataset for easy export below.
# orient='index' produces {row label: {column name: value}}.
my_dataset = df.to_dict(orient='index')

# Convert the nested dict into an array for the listed features, then split
# it into labels and features.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
# Hold out 30% of the samples for testing; fixed seed for reproducibility.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:

# Provided to give you a starting point. Try a variety of classifiers.
clf = AdaBoostClassifier(random_state=45), labels_train)

feat_importance = []
for i in range(len(clf.feature_importances_)):
Esempio n. 47
#         mail = mail + 1
# print salary
# print mail

# peeps = enron_data.keys()
# print len(peeps)
# payments = 0
# for peep in peeps:
#     if enron_data[peep]['total_payments'] == 'NaN':
#         payments = payments + 1
# print payments
# print 100 * float(payments) / float(len(peeps))

feature_list = ["poi", "total_payments"]
data_array = featureFormat(enron_data, feature_list)
label, features = targetFeatureSplit(data_array)

print label
print len(label) + 10
# i = 0
# for lab in label:
#     # print lab
#     if lab == 1.0:
#         print features[i]
#         i = i + 1

#         nada = nada + 1
# totes = len(label)
# percent_losers = 100 * float(nada) / float(totes)
# print percent_losers
            'from_this_person_to_poi'] / data_dict[e]['from_messages']
        data_dict[e]['from_this_person_to_poi_ratio'] = 'NaN'

features_list = [
    'poi', 'salary', 'deferral_payments', 'loan_advances', 'bonus',
    'restricted_stock_deferred', 'deferred_income', 'expenses',
    'exercised_stock_options', 'other', 'long_term_incentive',
    'restricted_stock', 'director_fees', 'to_messages',
    'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
    'shared_receipt_with_poi', 'from_poi_to_this_person_ratio',

# Build the numeric array for the selected features and separate the target
# (first entry of features_list) from the remaining feature columns.
data_array = featureFormat(data_dict, features_list)
poi, features = targetFeatureSplit(data_array)

### split the data into train and test (30% held out, fixed seed)

features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features, poi, test_size=0.3, random_state=42)

### Feature scaling before selection, necessary for SVM.
# Fit the scaler on the training set only, then apply the same min/max to
# the test set. Refitting on the test set would scale the two sets
# inconsistently and leak test-set statistics into the evaluation.
scaler = MinMaxScaler()
rescaled_features_train = scaler.fit_transform(features_train)
rescaled_features_test = scaler.transform(features_test)

### Univariate selection
### Using SelectKBest, I can select features according to the k highest scores.

from sklearn.feature_selection import SelectKBest
Esempio n. 49

import sys
import pickle

from feature_format import featureFormat, targetFeatureSplit

# Pickle data is binary: open the file in "rb" mode (text mode fails on
# Python 3 and can corrupt the stream on Windows) and use a context manager
# so the handle is closed as soon as the data has been loaded.
with open("../final_project/final_project_dataset_modified.pkl",
          "rb") as dataset_file:
    dictionary = pickle.load(dataset_file)

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split

# Half of the points go to each set; fixed seed for reproducibility.
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.
from sklearn import linear_model
Esempio n. 50
def selectKBest_f1_scores(clf, dataset, n_kbest_features, folds = 1000):
    """ Verifica os scores do numero de features selecionadas.
    Responsavel por selecionar o score F1 de 2 ate n_kbest_features.
        clf: classificador utilizado para a analise
        dataset: dados utilizados
        n_kbest_features: numero de maximo de features permitido.
        retorno1: Lista de valores K
        retorno2: Lista de Scores F1
    graficoX = []
    graficoY = []
    for k in range(2, n_kbest_features):
        features_selected = select_best_features(k)
        features_selected.insert(0, "poi")
        data = featureFormat(dataset, features_selected, sort_keys = True)
        labels, features = targetFeatureSplit(data)
        cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
        true_negatives = 0
        false_negatives = 0
        true_positives = 0
        false_positives = 0
        for train_idx, test_idx in cv: 
            features_train = []
            features_test  = []
            labels_train   = []
            labels_test    = []
            for ii in train_idx:
                features_train.append( features[ii] )
                labels_train.append( labels[ii] )
            for jj in test_idx:
                features_test.append( features[jj] )
                labels_test.append( labels[jj] )

  , labels_train)
            predictions = clf.predict(features_test)
            for prediction, truth in zip(predictions, labels_test):
                if prediction == 0 and truth == 0:
                    true_negatives += 1
                elif prediction == 0 and truth == 1:
                    false_negatives += 1
                elif prediction == 1 and truth == 0:
                    false_positives += 1
                elif prediction == 1 and truth == 1:
                    true_positives += 1
                    print "Warning: Found a predicted label not == 0 or 1."
                    print "All predictions should take value 0 or 1."
                    print "Evaluating performance for processed predictions:"
            f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
            print "Got a divide by zero when trying out:", clf
            print "Precision or recall may be undefined due to a lack of true positive predicitons."
    return  graficoX, graficoY
Esempio n. 51
features_list += [
    'salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus',
    'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
    'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
    'restricted_stock', 'director_fees'
# email feature
features_list += [
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi'

print len(features_list)

### Load the dictionary containing the dataset
y, X = targetFeatureSplit(featureFormat(data_dict, features_list))
X = np.array(X)

from sklearn import tree

clf = tree.DecisionTreeClassifier(random_state=12)
clf =, y)
print "clf.feature_importances_ : ", clf.feature_importances_
idx_feature_importances = np.argsort(clf.feature_importances_)[::-1]
for i in range(10):
    idx = idx_feature_importances[i]
    print "importance ", i, " - ", features_list[
        idx + 1], " - ", clf.feature_importances_[idx]

new_features_list = ['poi']
for i in range(10):
from feature_format import featureFormat, targetFeatureSplit
## The dataset includes financial information, the email address, and how
## many emails each person sent to or received from a POI.
# Pickle data is binary: open the file in "rb" mode (text mode fails on
# Python 3 and can corrupt the stream on Windows) and use a context manager
# so the handle is closed as soon as the data has been loaded.
with open("../final_project/final_project_dataset_modified.pkl",
          "rb") as dataset_file:
    dictionary = pickle.load(dataset_file)

## create another dictionary holding only the POI entries of the original one
# The explicit comparison with True is kept on purpose so that truthy
# non-boolean placeholders are not counted as POIs.
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
poi_dictionary = {k: v for k, v in dictionary.items() if v['poi'] == True}

## change this list, to see result with other values
## (first item is the target, the rest are the input features)
features_list = ["bonus", "exercised_stock_options"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
poi_data = featureFormat( poi_dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit( data )
poi_target, poi_features = targetFeatureSplit( poi_data )

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
poi_color = "b"
all_color = "r"

### draw the scatterplot of every data point in the "all" colour
import matplotlib.pyplot as plt
for feature, targe in zip(features, target):
    plt.scatter( feature, targe, color=all_color )
data = featureFormat(data_dict, features_list)

# Fit a scaler on the original data (So we don't get deceived by the "NaN" points)
feature_1_scaler = fit_scaler_on_original_data(data_dict, feature_1)
feature_2_scaler = fit_scaler_on_original_data(data_dict, feature_2)

# Rescale the data using the appropriate scaler
rescaled_data = data
              1] = feature_1_scaler.transform(rescaled_data[:,
                                                            1].reshape(1, -1))
              2] = feature_2_scaler.transform(rescaled_data[:,
                                                            2].reshape(1, -1))

poi, finance_features = targetFeatureSplit(rescaled_data)

kmeans_model = KMeans(n_clusters=2)
pred = kmeans_model.fit_predict(rescaled_data)

except NameError:
    print "no predictions object named pred found, no clusters to plot"
Esempio n. 54
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []

        for ii in train_idx:
        for jj in test_idx:

        ### fit the classifier using training set, and test on test set, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)

        ## modified the function so that it returns the results instead of printing them
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,

        return accuracy, precision, recall
        print("Got a divide by zero when trying out:", clf)
            "Precision or recall may be undefined due to a lack of true positive predicitons."
Esempio n. 55
    if msg_from_poi != "NaN" and to_msg != "NaN":
        my_dataset[person]['msg_from_poi_ratio'] = msg_from_poi/float(to_msg)
        my_dataset[person]['msg_from_poi_ratio'] = 0

    msg_to_poi = my_dataset[person]['from_this_person_to_poi']
    from_msg = my_dataset[person]['from_messages']
    if msg_to_poi != "NaN" and from_msg != "NaN":
        my_dataset[person]['msg_to_poi_ratio'] = msg_to_poi/float(from_msg)
        my_dataset[person]['msg_to_poi_ratio'] = 0
new_features_list = features_list + ['msg_to_poi_ratio', 'msg_from_poi_ratio']

## Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, new_features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

#Select the best features:
#Drop every feature whose variance is below .8 * (1 - .8) = 0.16. For a
#boolean feature this means it has the same value in more than 80% of samples.
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
features = sel.fit_transform(features)
#Names of the columns that survived the variance filter. Entry 0 of
#new_features_list is the label, so it is skipped. Without this step the
#scores below would be paired with the wrong feature names whenever the
#filter removes a column.
kept_feature_names = [
    name for name, kept in zip(new_features_list[1:], sel.get_support())
    if kept
]

#Removes all but the k highest scoring features
from sklearn.feature_selection import f_classif
k = 7
selector = SelectKBest(f_classif, k=k)
selector.fit_transform(features, labels)
print("Best features:")
#Pair each surviving feature name with its ANOVA F-score, best first.
scores = zip(kept_feature_names, selector.scores_)
sorted_scores = sorted(scores, key = lambda x: x[1], reverse=True)
Esempio n. 56
def FeatureSelection(data_dict, features_list):
    # Convert dictionary to numpy array, converts NaN to 0.0
    data = featureFormat(data_dict, features_list, \
                         sort_keys = True, remove_all_zeroes = False)
    # Separate into labels = 'poi' and features = rest of features_list
    labels, features = targetFeatureSplit(data)

    from sklearn.feature_selection import RFECV
    # Recursive Feature Elimination with Cross Validation
    from sklearn.svm import SVC
    # Support Vector Classifier to estimate fit coefficients for each feature
    from sklearn.cross_validation import StratifiedShuffleSplit
    # cross validation maintain roughly equal number of POIs in each split

    ### Create Estimator
    # which will update the coefficients with each iteration
    # class weight is set to auto because of unbalanced data classes
    # weight will be inversely proportional to class size
    svc = SVC(kernel='linear', class_weight='auto', random_state=42)
    ############## Scale features ######################
    # SVC algorithm requires use scaled features
    # missing values are coded 0.0, so MinMax will preserve those zero values
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    ### Select cross-validation method
    # StratifiedShuffleSplit keeps roughly the same number of POIs in each split
    sss = StratifiedShuffleSplit(labels, 100, test_size=0.3, random_state=42)
    ### Select evaluation metric
    # Evaluate model using f1 = 2 * (precision * recall) / (precision + recall)
    # Model should be able to predict POIs, which are a small percentage of cases
    metric = 'f1'
    # run the feature eliminater
    rfecv = RFECV(estimator=svc, cv=sss, scoring=metric, step=1)
    rfecv =, labels)

    # view results
    import matplotlib.pyplot as plt
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score using F1 (precision&recall)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    #    plt.savefig('featureSelection.png', transparent=True)
    print("Optimal number of features is %d" % rfecv.n_features_)
        'Features selected by recursive feature elimination with cross validation:'
    F1_score = round(rfecv.grid_scores_[rfecv.n_features_], 3)
    print('F1 score from optimal features: %r' % F1_score)
    selection = rfecv.get_support()
    selected_features = ['poi']
    rejected_features = []
    for i in range(len(selection)):
        if selection[i]:
                features_list[i + 1])  # first feature is 'poi'=the label
            rejected_features.append(features_list[i + 1])
    print('Features eliminated:')
    return selected_features, F1_score
Esempio n. 57
    You fill in the regression code where indicated:

import sys
import pickle
from feature_format import featureFormat, targetFeatureSplit
# Pickle data is binary: open the file in "rb" mode (text mode fails on
# Python 3 and can corrupt the stream on Windows) and use a context manager
# so the handle is closed as soon as the data has been loaded.
with open("../final_project/final_project_dataset_modified.pkl",
          "rb") as dataset_file:
    dictionary = pickle.load(dataset_file)

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
# Half of the points go to each set; fixed seed for reproducibility.
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    all_importance = []  #for holding feature importance from each fold
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
        for jj in test_idx:

        ### fit the classifier using training set, and test on test set, labels_train)
        importance = clf.feature_importances_
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        avg_importance = np.mean(all_importance, axis=0)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        cm = [[true_negatives, false_positives],
              [false_negatives, true_positives]]
        print clf
        print "Feature importances", avg_importance
        print PERF_FORMAT_STRING.format(accuracy,
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
        print ""
        return cm
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
Esempio n. 59
         (this should be the quantity you want to predict) return targets and features as separate lists
         (sklearn can generally handle both lists and numpy arrays as input formats when training/predicting)
     target = []
     features = []
     for item in data:
         target.append( item[0] )
         features.append( item[1:] )
     return target, features

# Convert my_dataset into the array form for the features in features_list.
# NOTE(review): despite its name, data_dict now holds featureFormat's output
# rather than a dictionary - confirm nothing later expects a dict here.
data_dict = featureFormat(my_dataset, features_list, sort_keys = True)
#print "\n data_dict:", data_dict
# Split each row into its first element (label) and the remaining elements
# (features).
labels, features = targetFeatureSplit(data_dict)
#print "\n The Labels are     :      ", labels
#print "\n The Features are   :      ", features

# scale features via min-max
# The scaler is fitted on all samples at once; `features` is replaced by the
# scaled array that fit_transform returns.
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)

Esempio n. 60
def test_stratified_shuffle_split(clf, dataset, feature_list, folds = 1000, scale_features = True):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # Scale features
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        ### fit the classifier using training set, and test on test set, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print 'Total predictions: '+str(total_predictions)
        print 'Accuracy: '+str(accuracy)
        print 'Precision: '+str(precision)
        print 'Recall: '+str(recall)
        print 'F1: '+str(f1)
        print 'F2: '+str(f2)
        print ""
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."