def precompute_recall_precision(features_list, summary=False):  # renamed 'sum' to avoid shadowing the builtin
    features_list_all = ['poi'] + features_list
    data = featureFormat(my_dataset, features_list_all, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # Scale features to [0, 1] before scoring
    standardized = MinMaxScaler().fit_transform(features)
    # Score the features using f_classif (ANOVA F-values)
    sel = SelectKBest(k='all', score_func=f_classif)
    sel.fit(standardized, labels)
    kbest = [(features_list[i], score, i) for i, score in enumerate(sel.scores_)]
    sorted_kbest = sorted(kbest, key=operator.itemgetter(1), reverse=True)
    print "Feature Set(", len(kbest), ") List and K-best scores:"
    for tup in sorted_kbest:
        print tup[2], "\t", tup[0], tup[1]
    if not summary:
        plot_feature_correlation(features, len(kbest))
    for i, method in enumerate(methods):
        pipe, params = method()
        grid_searcher = GridSearchCV(pipe, param_grid=params, cv=sk_fold, scoring='recall')
        grid_searcher.fit(features, labels)
        clf = grid_searcher.best_estimator_

        ### features and labels were already extracted above; evaluate with
        ### the project's tester on the full feature list
        my_test_classifier(clf, my_dataset, features_list_all, i)
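The function above depends on several module-level names (methods, sk_fold, my_dataset, my_test_classifier, plot_feature_correlation) defined elsewhere in its source project. As a minimal sketch of the assumed contract, each entry in methods is a zero-argument factory returning a (pipeline, param_grid) pair that GridSearchCV can consume; the factory below is hypothetical, not the original project's:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB

def nb_method():
    # hypothetical factory matching the (pipe, params) contract used above
    pipe = Pipeline([('scale', MinMaxScaler()),
                     ('clf', GaussianNB())])
    params = {}  # nothing to tune for GaussianNB; real entries add grids here
    return pipe, params

methods = [nb_method]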
Example #2
def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # configure split of test_size and train_size
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42, 
                                test_size = .2, train_size = .8)
#    print cv

    for train_idx, test_idx in cv: 
        features_train      = []
        features_test       = []
        features_validation = []
        labels_train        = []
        labels_test         = []
        labels_validation   = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            if jj % 2 == 0: 
                features_validation.append( features[jj] )
                labels_validation.append( labels[jj] )
            else: 
                features_test.append( features[jj] )
                labels_test.append( labels[jj] )
                
    # fit on the training set, then evaluate on the test and validation sets
    fit_and_test_classifier(clf, features_train, labels_train, features_test, labels_test)
    fit_and_test_classifier(clf, features_train, labels_train, features_validation, labels_validation)
    # report the sizes of the training, test, and validation sets
    print "features_train:", len(features_train), "labels_train:", len(labels_train)
    print "features_test:", len(features_test), "labels_test:", len(labels_test)
    print "features_validation:", len(features_validation), "labels_validation:", len(labels_validation)
def test_classifier(clf, dataset, feature_list, folds=1000, scale_features=True, std_features=False):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # scale features if necessary
    if scale_features:
        scaler = preprocessing.MinMaxScaler()
        features = scaler.fit_transform(features)

    # standardize features for PCA if necessary
    if std_features:
        std = preprocessing.StandardScaler()
        features = std.fit_transform(features)


    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
        return f1
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
def makeData(dataset, feature_list, folds = 1000):
    """Make and return dataset prepared for training.

    Keyword arguments:
    dataset --- dict of dict
    feature_list --- list of strings
    folds --- int
    
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    
    for train_idx, test_idx in cv:  # note: train/test examples accumulate across all folds
        
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
    return features_train, features_test, labels_train, labels_test
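A usage sketch for makeData, assuming the tester-style my_dataset and a features_list with 'poi' first; note the returned sets aggregate examples from every fold:

features_train, features_test, labels_train, labels_test = \
    makeData(my_dataset, features_list, folds=100)
print("train size: %d, test size: %d" % (len(features_train), len(features_test)))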
def test_training_stratified_split(dataset, features_list, testsize=0.2):
    """
    For E+F dataset, split dataset into the training and test 
    set using stratified method.

    Input:
    dataset: data in dictionary format 
    features_list: the full list of features to selection from 
    testsize: the proportion of the dataset to include in the test split

    Return:
    labels_train, labels_test, features_train, features_test

    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    labels = np.array([int(label) for label in labels])
    features = np.array(features)
    ### Split data into test set and training set
    sss = StratifiedShuffleSplit(labels, 1, test_size=testsize, random_state=0)

    for train_index, test_index in sss:
        labels_train, labels_test = labels[train_index].tolist(), labels[test_index].tolist()
        features_train, features_test = features[train_index].tolist(), features[test_index].tolist()
    return labels_train, labels_test, features_train, features_test
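For example, assuming the same my_dataset and features_list conventions as above:

labels_train, labels_test, features_train, features_test = \
    test_training_stratified_split(my_dataset, features_list, testsize=0.3)
print("train: %d, test: %d" % (len(labels_train), len(labels_test)))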
def main():
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, 1000, random_state=42)

    # Build an empty feature importance totals array for calculating average importance;
    # importances exclude the 'poi' label, hence len(feature_list) - 1 entries
    totals = [0] * (len(feature_list) - 1)

    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])
        clf = clf.fit(features_train, labels_train)
        for i in range(len(clf.feature_importances_)):
            totals[i] += clf.feature_importances_[i]
        # print clf.feature_importances_

    for i in range(len(totals)):
        totals[i] /= 1000  # average importance over the 1000 CV folds

    # Display results
    print "Feature list: ", feature_list[1:]
    print "Importances: ", totals
def univariateFeatureSelection(f_list, my_dataset):
	result = []
	for feature in f_list:
		# Replace 'NaN' with 0
		for name in my_dataset:
			data_point = my_dataset[name]
			if not data_point[feature]:
				data_point[feature] = 0
			elif data_point[feature] == 'NaN':
				data_point[feature] = 0

		data = featureFormat(my_dataset, ['poi',feature], sort_keys = True, remove_all_zeroes = False)
		labels, features = targetFeatureSplit(data)
		features = [abs(x) for x in features]
		from sklearn.cross_validation import StratifiedShuffleSplit
		cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
		features_train = []
		features_test  = []
		labels_train   = []
		labels_test    = []
		for train_idx, test_idx in cv:
			for ii in train_idx:
				features_train.append( features[ii] )
				labels_train.append( labels[ii] )
			for jj in test_idx:
				features_test.append( features[jj] )
				labels_test.append( labels[jj] )
		from sklearn.naive_bayes import GaussianNB
		clf = GaussianNB()
		clf.fit(features_train, labels_train)
		predictions = clf.predict(features_test)
		score = score_func(labels_test,predictions)
		result.append((feature,score[0],score[1],score[2]))
	result = sorted(result, reverse=True, key=lambda x: x[3])
	return result
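score_func is not defined in this snippet; the tuple indexing (score[0] through score[2], with results sorted on the last element) suggests it returns (precision, recall, f1). A plausible stand-in, not necessarily the original helper:

from sklearn.metrics import precision_score, recall_score, f1_score

def score_func(y_true, y_pred):
    # assumed helper returning (precision, recall, f1) for the POI class
    return (precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred))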
def get_most_important_features(dataset, features_list):
    """Calculates the feature importances.
    Takes as input a dataset and a list of features.
    Creates an overfit Decision Tree and calculates the feature importances.
    Returns a list with the feature importances.
    """
    # creating an overfitted decision tree
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # new features filtered, NaN values removed
    features_train, features_test, labels_train, labels_test = train_test_split(features,
                                                                                    labels,
                                                                                    test_size=0.3,
                                                                                    random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    acc = accuracy_score(labels_test, pred)
    # uncomment to print the accuracy score
    #print "overfitted accuracy", acc

    # calculating feature importances
    feat_imp = clf.feature_importances_
    # uncomment to print the most important (common) ones
    #print feat_imp
    #for index, feature in enumerate(feat_imp):
    #    if feature > 0.2:
    #        print "spot:", index, ":", features_list[index+1], " | value:", feature
    return feat_imp
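A usage sketch, assuming my_dataset and a features_list whose first entry is 'poi':

importances = get_most_important_features(my_dataset, features_list)
# pair importances with their feature names (importances skip the 'poi' label)
for name, imp in sorted(zip(features_list[1:], importances),
                        key=lambda t: t[1], reverse=True):
    print("%s: %.3f" % (name, imp))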
def prep_features(df, features_list, feature_scaled):

    """
    Arguments:
        load dataframe (or dictionary), and features_list
    return
        scaled features, labels in numpy.ndarray, and
        scaled features, labels in pandas dataframe
    """
    from feature_format import featureFormat, targetFeatureSplit
    import pandas as pd
    # for pandas dataframe
    df1 = df[features_list]
    features_df = df1.drop('poi', axis=1)#.astype(float)  # new features (pandas dataframe)
    labels_df = df1['poi']  # new labels (pandas dataframe)
    if feature_scaled ==  True:
        features_df_scaled = scale_features(features_df) # scale features
    else:
        features_df_scaled = features_df
    # for dictionary
    df2 = df[features_list]
    data_dict_new = df2.T.to_dict()  # data_dict (final)
    features_dic = features_df.copy()
    X_features = list(features_dic.columns)
    features_list_new = ['poi'] + X_features  # selected features list (final)
    data = featureFormat(data_dict_new, features_list_new, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    if feature_scaled == True:
        features = scale_features(features)

    return features, labels, features_df_scaled, labels_df
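scale_features is defined elsewhere in the source project; given the MinMaxScaler used by the sibling functions, a minimal stand-in might be:

from sklearn.preprocessing import MinMaxScaler

def scale_features(features):
    # assumed helper: min-max scale each column; accepts list-of-lists,
    # ndarrays, or pandas DataFrames and returns a scaled ndarray
    return MinMaxScaler().fit_transform(features)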
def selectKBest(previous_result, data):  # note: rebuilds data from the global my_dataset below
	# remove 'restricted_stock_deferred' and 'director_fees'
	previous_result.pop(4)
	previous_result.pop(4)

	result = []
	_k = 10
	for k in range(0,_k):
		feature_list = ['poi']
		for n in range(0,k+1):
			feature_list.append(previous_result[n][0])

		data = featureFormat(my_dataset, feature_list, sort_keys = True, remove_all_zeroes = False)
		labels, features = targetFeatureSplit(data)
		features = [abs(x) for x in features]
		from sklearn.cross_validation import StratifiedShuffleSplit
		cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
		features_train = []
		features_test  = []
		labels_train   = []
		labels_test    = []
		for train_idx, test_idx in cv:
			for ii in train_idx:
				features_train.append( features[ii] )
				labels_train.append( labels[ii] )
			for jj in test_idx:
				features_test.append( features[jj] )
				labels_test.append( labels[jj] )
		from sklearn.naive_bayes import GaussianNB
		clf = GaussianNB()
		clf.fit(features_train, labels_train)
		predictions = clf.predict(features_test)
		score = score_func(labels_test,predictions)
		result.append((k+1,score[0],score[1],score[2]))
	return result
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)

        print clf
        #print "Best Params: ", clf.best_params_
        #print "Best Estimator: ", clf.best_estimator_
        #current_classifier = clf.best_estimator_
        importance = None


        if importance is not None:
            print "Importance: ", importance
            imp = sorted(zip(feature_list, importance), key=lambda tup: tup[1], reverse=True)
            print "Most Important Variables: " + str(imp)


        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives,
                                           true_negatives)
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out: ", clf
def cluster2Features():
    ### the input features we want to use
    ### can be any key in the person-level dictionary (salary, director_fees, etc.)
    feature_1 = "salary"
    feature_2 = "exercised_stock_options"
    poi  = "poi"
    features_list = [poi, feature_1, feature_2]
    data = featureFormat(data_dict, features_list )
    poi, finance_features = targetFeatureSplit( data )

    ### in the "clustering with 3 features" part of the mini-project,
    ### you'll want to change this line to
    ### for f1, f2, _ in finance_features:
    ### (as it's currently written, the line below assumes 2 features)
    #print finance_features
    for f1, f2 in finance_features:
        plt.scatter( f1, f2)
    plt.show()

    ### cluster here; create predictions of the cluster labels
    ### for the data and store them to a list called pred
    from sklearn.cluster import KMeans
    estimators = {'k_means_2': KMeans(n_clusters=2)}
    # fit on the feature columns only; 'data' still contains the poi label
    estimators['k_means_2'].fit(finance_features)
    pred = estimators['k_means_2'].predict(finance_features)

    ### rename the "name" parameter when you change the number of features
    ### so that the figure gets saved to a different file
    try:
        Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
    except NameError:
        print "no predictions object named pred found, no clusters to plot"
def algorithm(data_dict, features_list):

    from feature_format import featureFormat
    from feature_format import targetFeatureSplit
   
    ### store to my_dataset for easy export below
    my_dataset = data_dict
    data = featureFormat(my_dataset, features_list)

    # scale features
    #data = scaleFeatures(data)
    
    ### split into labels and features (this line assumes that the first
    ### feature in the array is the label, which is why "poi" must always
    ### be first in features_list
    labels, features = targetFeatureSplit(data)

    
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators = 1000, random_state = 202, \
    		learning_rate = 1.0, algorithm = "SAMME.R")
    
    ### dump your classifier, dataset and features_list so
    ### anyone can run/check your results
    pickle.dump(clf, open("my_classifier.pkl", "wb"))
    pickle.dump(data_dict, open("my_dataset.pkl", "wb"))
    pickle.dump(features_list, open("my_feature_list.pkl", "wb"))
def find_best_parameters(pipeline, parameters, score_func, dataset, 
                         feature_list, test_size=0.2, n_iter=10):
    """
    find best parameter by using GridSearchCV with given scoring function.

    returns GridSearchCV object that has best parameters.
    """

    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, 1, test_size=test_size, random_state = 42)
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )

    sss = StratifiedShuffleSplit(labels_train, n_iter=n_iter , test_size=test_size, random_state=42)

    clf = GridSearchCV(pipeline, parameters, scoring=score_func, cv=sss, n_jobs=-1)
    clf.fit(features_train, labels_train)

    return clf
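A usage sketch for find_best_parameters with an illustrative scaler-plus-SVM pipeline; the pipeline and grid here are assumptions, and my_dataset / features_list are the project's usual globals:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

pipe = Pipeline([('scale', MinMaxScaler()), ('svm', SVC())])
params = {'svm__C': [1, 10, 100], 'svm__gamma': [0.01, 0.1]}
best = find_best_parameters(pipe, params, 'f1', my_dataset, features_list)
print(best.best_params_)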
def test_classifier(clf, dataset, feature_list, scaling = False, folds = 1000):
    score_all = []
    precision_all = []
    recall_all = []
    
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    
    if scaling:
        min_max_scaler = preprocessing.MinMaxScaler()
        features = min_max_scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    for train_indices, test_indices in cv:
        features_train = [features[ii] for ii in train_indices]
        features_test = [features[ii] for ii in test_indices]
        labels_train = [labels[ii] for ii in train_indices]
        labels_test = [labels[ii] for ii in test_indices]
        
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        score_all.append(clf.score(features_test,labels_test))
        precision_all.append(precision_score(labels_test,pred))
        recall_all.append(recall_score(labels_test,pred))

    precision = numpy.average(precision_all)
    recall = numpy.average(recall_all)
    score = numpy.average(score_all)
        
    print "Score: " + str(score)
    print "Recall: " + str(precision)
    print "Precision: " + str(recall)	
def get_k_best(df, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection
        returns dict where keys=features, values=scores
    """
    # feature, label = feature_format_scale(data_dict, features_list)
    from poi_dataprocess import *
    from feature_format import featureFormat, targetFeatureSplit

    data_dict_new = df[features_list].T.to_dict()

    data = featureFormat(data_dict_new, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # df = df[features_list]
    # features = df.drop('poi', axis=1)#.astype(float)
    # labels = df['poi']

    from sklearn import preprocessing

    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)

    from sklearn.feature_selection import SelectKBest

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    k_best_features = dict(sorted_pairs[:k])

    return k_best_features
def validation(clf, dataset, feature_list, test_size=0.2, n_iter=1000):
    '''
    validate the given classifier using stratified shuffle split cross-validation.
    returns average precision and recall
    '''
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)

    precision = []
    recall = []

    cv = StratifiedShuffleSplit(labels, n_iter, test_size=test_size, random_state = 42)
    for train_idx, test_idx in cv:
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )

        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        precision.append(precision_score(labels_test, predictions))
        recall.append(recall_score(labels_test, predictions))

    return np.mean(precision), np.mean(recall)
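For example, assuming the usual project globals:

from sklearn.naive_bayes import GaussianNB

avg_precision, avg_recall = validation(GaussianNB(), my_dataset,
                                       features_list, n_iter=100)
print("precision: %.3f, recall: %.3f" % (avg_precision, avg_recall))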
Example #18
def main():
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    ### Run testing script
    test_classifier(clf, features, labels)
def get_k_best_features(data_dict, features_list, k):

  """
  runs scikit-learn's SelectKBest feature selection to get k best features
    
  Args:
    data_dict: data dictionary for enron
    feature_list: a list of features with first feature as target label
    k: Number of best features which need to be selected

  Returns:
    returns a list of k best features and list of lists where inner list's 
    first element is feature and the second element is feature score
  """

  data = featureFormat(data_dict, features_list)
  labels, features = targetFeatureSplit(data)

  k_best = SelectKBest(k=k)
  k_best.fit(features, labels)
  scores = k_best.scores_
  unsorted_pairs = zip(features_list[1:], scores)
  sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
  k_best_features = dict(sorted_pairs[:k])
  return k_best_features.keys(), map(list, sorted_pairs)
def regressionBonusAndLongTermInc():
    ### list the features you want to look at--first item in the
    ### list will be the "target" feature
    features_list = ["bonus", "long_term_incentive"]
    data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
    #, sort_keys = '../../tools/python2_lesson06_keys.pkl'
    target, features = targetFeatureSplit( data )
    #print target
    #print features

    ### training-testing split needed in regression, just like classification
    from sklearn.cross_validation import train_test_split
    feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
    train_color = "b"
    test_color = "r"

    ### Your regression goes here!
    ### Please name it reg, so that the plotting code below picks it up and
    ### plots it correctly. Don't forget to change the test_color above from "b" to
    ### "r" to differentiate training points from test points.
    from sklearn import linear_model
    ### name your regression reg
    reg = linear_model.LinearRegression()

    ### your code goes here!
    reg.fit(feature_train, target_train)
    #find the score on the test data
    print reg.score(feature_test, target_test)
Example #21
def tune_classifier(clf_name, clf, dataset, features_list, scores, folds = 1000):
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    scale = clf_name in {'kNN', 'SVM', 'kNN (hand-tuned)'}
    if scale:
        # Perform feature scaling 
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    if clf_name == 'kNN':
        parameter_grid = [{'p': [1, 2, 3], 
                           'n_neighbors': [1, 5, 7, 10, 15],
                           'leaf_size': [30, 50, 70, 100]}]
    elif clf_name == 'Decision Tree':
        parameter_grid = [{'min_samples_split': [2, 3, 4, 5],
                           'min_samples_leaf': [2, 3, 4, 5],
                           'splitter': ['random', 'best']}]
    else:
        parameter_grid = [{}]  # no grid defined for this classifier; search its defaults
    best_params = {}
    for score in scores:
        grid_clf = GridSearchCV(clf, parameter_grid, cv=cv, 
                                scoring="{0}_weighted".format(score))
        grid_clf.fit(features, labels)
        best_params = grid_clf.best_params_
        #print("Grid scores:")
        #for params, mean_score, scores in grid_clf.grid_scores_:
        #    print("{:0.3f} {:+0.03f} for {!r}".format(mean_score, scores.std() * 2, params))
    print("Classifier {0} has tuned parameters {1}".format(clf_name, best_params))
    return best_params
Example #22
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    ## Tester lacks feature scaling, lets put it here:

    # Scale features:
    mins = np.min(data, axis=0)
    maxs = np.max(data, axis=0)
    data = (data - mins) / (maxs - mins)

    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5)
        print RESULTS_FORMAT_STRING.format(
            total_predictions, true_positives, false_positives, false_negatives, true_negatives
        )
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
def select_k_best_features(data, feature_list, k):
    """
    For E+F dataset, select k best features based on SelectKBest from 
    sklearn.feature_selection

    Input:
    data: data in dictionary format 
    feature_list: the full list of features to selection from 
    k: the number of features to keep

    Return:
    the list of length of k+1 with the first element as 'poi' and other 
    k best features 

    """
    data = featureFormat(data, feature_list)  # use the 'data' argument, not the global data_dict
    labels, features = targetFeatureSplit(data)
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    impt_unsorted = zip(feature_list[1:], k_best.scores_)
    impt_sorted = list(sorted(impt_unsorted, key=lambda x: x[1], reverse=True))
    k_best_features = [elem[0] for elem in impt_sorted][:k]
    print k, "best features:"
    print k_best_features
    return ['poi'] + k_best_features
Example #24
def tuner(clf, parameters, data):
    from sklearn.model_selection import GridSearchCV
    labels, features = targetFeatureSplit(data)
    scaler = MinMaxScaler()
    select = SelectKBest()

    steps = [("scale", scaler),
             ("select", select),
             ("classifier", clf)]
    
    pipeline = Pipeline(steps)

    shuffle = StratifiedShuffleSplit(n_splits=1000, test_size=0.3,
                                     random_state=42)
    
    my_scorer = make_scorer(my_score_func)
    scoring_metric = my_scorer
    
    grid_searcher = GridSearchCV(pipeline, param_grid=parameters,
                                 cv=shuffle, scoring=scoring_metric)

    # the pipeline's SelectKBest step performs selection inside the grid
    # search, so no standalone fit_transform is applied here

    grid_searcher.fit(features, labels)

    print("Cross-validated {0} score: {1}".format(scoring_metric,
                                                  grid_searcher.best_score_))

    print("Params: ", grid_searcher.best_params_)
def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    #uncomment to do feature scaling
    #scaler = MinMaxScaler()
    #features = scaler.fit_transform(features)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )

        ### fit the classifier using training set, and test on test set

        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        #print predictions
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
Example #26
def ptest(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    # guard against divide-by-zero when the classifier never predicts a POI
    if true_positives + false_positives == 0:
        return 0.0
    precision = 1.0*true_positives/(true_positives+false_positives)
    return precision
def make_feature_histograms(dataset, features_list):
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    plt.ioff()
    if not os.path.exists(os.path.join(os.path.dirname(__file__), 'hists')):
        os.makedirs(os.path.join(os.path.dirname(__file__), 'hists'))
    for feature, i in zip(features_list[1:], range(len(features[0]))):
        plt.figure()
        feature_values_non_poi = [f[i] for f, l in zip(features, labels) if l == 0.0]
        feature_values_poi = [f[i] for f, l in zip(features, labels) if l == 1.0]
        feature_values = feature_values_non_poi + feature_values_poi
        non_zero_values_non_poi = [x for x in feature_values_non_poi if x != 0.0]
        non_zero_values_poi = [x for x in feature_values_poi if x != 0.0]
        non_zero_values = non_zero_values_non_poi + non_zero_values_poi
        q1, q3 = np.percentile(non_zero_values, [25, 75])
        iqr = q3 - q1
        outliers_hi = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x > q3]
        outliers_lo = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x < q1]
        # get same binwidth for both POI and non-POI
        bins = np.histogram(non_zero_values, bins=50)[1]
        plt.hist(non_zero_values_poi, bins=bins, alpha=.5, lw=0, color='r', label='POIs')
        plt.hist(non_zero_values_non_poi, bins=bins, alpha=.5, lw=0, color='b', label='Non-POIs')
        msg = ('Maximum %s: %d\n' % (feature, max(non_zero_values)) +
               'Minimum %s: %d\n' % (feature, min(non_zero_values)) +
               'Mean %s: %.5f\n' % (feature, np.mean(non_zero_values)) +
               'Median %s: %d\n' % (feature, np.median(non_zero_values)) +
               '\nTotal Number of Values: %d\n' % len(feature_values) +
               'Total Number of Non-Zero Values: %d\n' % len(non_zero_values))
        
        # see which features have low number of non-zero values
        #if float(len(non_zero_values)) / len(feature_values) < 0.5:
        #    print feature
        
        # print out some outlier values if they exist
        for outliers, which_ols in zip([outliers_hi, outliers_lo], ['Top', 'Bottom']):
            if outliers:
                if len(outliers) >= 5:
                    top_n = 5
                else:
                    top_n = len(outliers)
                outliers = sorted(outliers)
                ol_line = q1 - 1.5*iqr
                if which_ols == 'Top':
                    outliers = list(reversed(outliers))
                    ol_line = q3 + 1.5*iqr
                msg += '\n%s %d Outliers: ' % (which_ols, top_n)
                for i in range(top_n):
                    if i != top_n - 1:
                        msg += '%d, ' % outliers[i]
                    else:
                        msg += '%d' % outliers[i]
                plt.axvline(ol_line, lw=.5, ls='--', c='r')
        
        plt.figtext(.3, .4, msg)
        #plt.grid(axis='y')
        plt.title("%s histogram (non-zero values)" % feature)
        plt.legend()
        figname = 'hists/%s_histogram.png' % feature
        plt.savefig(figname)
        plt.close()
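is_outlier is defined elsewhere; the 1.5 * IQR fences drawn with axvline suggest the standard Tukey rule, sketched here under that assumption:

def is_outlier(x, q1, q3, iqr):
    # assumed helper: Tukey's rule, matching the 1.5*iqr fences plotted above
    return x < q1 - 1.5 * iqr or x > q3 + 1.5 * iqr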
def get_k_best(dictionary, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection returning:
    {feature:score}
    """
    data = featureFormat(dictionary, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    pairs = zip(features_list[1:], scores)
    #combine scores and features into a pandas dataframe then sort
    k_best_features = pd.DataFrame(pairs, columns=['feature', 'score'])
    k_best_features = k_best_features.sort_values('score', ascending=False)
    
    
    #merge with null counts    
    df_nan_counts = get_nan_counts(dictionary)
    k_best_features = pd.merge(k_best_features,df_nan_counts,on= 'feature')  
    
    #eliminate infinite values
    k_best_features = k_best_features[np.isinf(k_best_features.score)==False]
    print 'Feature Selection by k_best_features\n'
    print "{0} best features in descending order: {1}\n".format(k, k_best_features.feature.values[:k])
    print '{0}\n'.format(k_best_features[:k])
    
    
    return k_best_features[:k]
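get_nan_counts is external; judging from the merge on 'feature', it returns a DataFrame with a 'feature' column plus NaN counts. A minimal sketch under that assumption:

import pandas as pd

def get_nan_counts(dictionary):
    # assumed helper: count 'NaN' strings per feature across all people
    df = pd.DataFrame.from_dict(dictionary, orient='index')
    counts = (df == 'NaN').sum()
    return pd.DataFrame({'feature': counts.index, 'nan_count': counts.values})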
def select_k_best(data_dict, features_list, k):
    # Create dataset from feature list
    data = featureFormat(data_dict, features_list)
    # Split dataset into labels and features
    labels, features = targetFeatureSplit(data)
    # Create Min/Max Scaler
    scaler = preprocessing.MinMaxScaler()
    # Scale Features
    features = scaler.fit_transform(features)
    # Create k_best feature selection
    k_best = SelectKBest(k=k)
    # Fit k_best
    k_best.fit(features, labels)
    # Get k_best scores
    scores = k_best.scores_
    # Create list with features and scores
    unsorted_pairs = zip(features_list[1:], scores)
    # Sort list
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    # Create dict
    if k == "all":
        k_best_features = dict(sorted_pairs)
    else:
        k_best_features = dict(sorted_pairs[:k])
    return k_best_features
Example #30
breitbart_data = []
with open('breitbart_articles.pkl', 'rb') as f:
    breitbart = pickle.load(f)
for article in breitbart:
    breitbart_data.append([1, article])

newsmax_data = []
with open('newsmax_articles.pkl', 'rb') as f:
    newsmax = pickle.load(f)
for article in newsmax:
    newsmax_data.append([1, article])

right_wing_data = fox_data + observer_data + breitbart_data + newsmax_data

data = left_wing_data + right_wing_data
labels, articles = targetFeatureSplit(data)
articles_train, articles_test, labels_train, labels_test = train_test_split(
    articles, labels, test_size=0.001, random_state=42)

flat_train = []
flat_test = []

for sublist in articles_train:
    for article in sublist:
        flat_train.append(article)
for sublist in articles_test:
    for article in sublist:
        flat_test.append(article)

vectorizer = TfidfVectorizer(strip_accents="unicode", lowercase=False)
vectors = vectorizer.fit_transform(flat_train)
Example #31

### load in the dict of dicts containing all the data on each person in the dataset
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl",
                             "rb"))
### there's an outlier--remove it!
data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
poi = "poi"
features_list = [poi, feature_1, feature_2]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
model = cluster.KMeans(n_clusters=2)
model.fit(finance_features)
pred = model.predict(finance_features)
Example #32
    if data_dict[keys]["from_poi_to_this_person"] == "NaN" or data_dict[keys][
            "from_this_person_to_poi"] == "NaN":
        data_dict[keys]["contact_poi"] = "NaN"
    else:
        data_dict[keys]["contact_poi"] = int(
            data_dict[keys]["from_poi_to_this_person"]) + int(
                data_dict[keys]["from_this_person_to_poi"])

features_list.append("income")
features_list.append("contact_poi")
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Use SelectKBest to choose which feature to use for machine learning

k = 5
selKBest = SelectKBest(f_regression, k=k)

selKBest.fit(features, labels)

selKBest.transform(features).shape

mask = selKBest.get_support()

scores = selKBest.scores_
feature_score = zip(features_list[1:], scores)
feature_score = list(reversed(sorted(feature_score, key=lambda x: x[1])))
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # The inital script raised an error : StratifiedShuffleSplit not iterable
    # I rewrote the cv StratifiedShuffleSplit object with the same parameters according to sklearn doc:
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
    #cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    #for train_idx, test_idx in cv:
    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy,
                                        precision,
                                        recall,
                                        f1,
                                        f2,
                                        display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives)
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
Example #34
def main():
    ### Task 1: Select what features you'll use.
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    financial_features = ['salary', 'deferral_payments', 'total_payments', \
                         'loan_advances', 'bonus', 'restricted_stock_deferred',\
                         'deferred_income', 'total_stock_value', 'expenses', \
                         'exercised_stock_options', 'other', 'long_term_incentive', \
                         'restricted_stock', 'director_fees'] #(all units are in US dollars)

    email_features = ['to_messages', 'from_poi_to_this_person',
                     'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
    #(units are generally number of emails messages; notable exception is ‘email_address’, 
    # which is a text string)
    #email_address feature was removed from list

    poi_label = ['poi'] ###(boolean, represented as integer)

    features_list = poi_label + email_features + financial_features

    ### Load the dictionary containing the dataset
    with open("final_project_dataset_unix.pkl", "rb") as data_file:
        data_dict = pickle.load(data_file)
      
    #convert to a pandas dataframe for exploratory analysis
    df = pd.DataFrame.from_dict(data_dict, orient='index')

    #convert string 'NaN' to actual np.nan; numeric columns are coerced wholesale
    for label in df.columns:
        if label == 'email_address':
            df[label] = df[label].replace('NaN', np.nan)
        else:
            df[label] = pd.to_numeric(df[label], errors='coerce')


    ### Investigate contents of dataset:
            
    # Total Number of data points
    total_people = df.shape[0]
    print('The total number of data points (people) in our data set is {}.\n'\
        .format(total_people))

    # Total Number of Features Used
    all_features = df.shape[1]
    print('There are {} features for each person in our dataset.\n'\
        .format(all_features))

    # Total Number of Persons Of Interest (POIs)
    poi_count = df['poi'][(df['poi'] == True)].count()
    print('Our dataset has {} persons of interest.\n'.format(poi_count))

    # Total Number of Non-POIs
    non_poi_count = total_people - poi_count
    print('Our dataset has {} Non persons of interest.\n'.format(non_poi_count))

    # Features with missing values?
    print('The following categories have missing values (NaN values)\n')
    print (df.isna().sum())


    ### Task 2: Remove outliers

    #visualize_features('salary', 'bonus', data_dict)
    #visualize_features('from_poi_to_this_person', 'from_this_person_to_poi', data_dict)
    #visualize_features('loan_advances', 'total_stock_value', data_dict)


    print()
    print('Searching for Outliers...')
    find_outlier('salary', df)
    print()
    find_outlier('bonus', df)
    print()
    find_outlier('from_poi_to_this_person', df)
    print()
    find_outlier('from_this_person_to_poi', df)
    print()
    find_outlier('loan_advances', df)
    print()
    find_outlier('total_stock_value', df)


    #get a count of number of NaN columns for each person
    nan_count = df.isna().sum(axis=1)


    print('\nThe top 5 people by number of NaN columns are:\n')
    print (nan_count.sort_values(ascending=False).head(5))

    print('\nLooking closer at Eugene Lockhart...\n')
    print( df.loc['LOCKHART EUGENE E'])

    print ('\nLooking closer at THE TRAVEL AGENCY IN THE PARK...\n')
    print (df.loc['THE TRAVEL AGENCY IN THE PARK'])


    ### Remove outliers
    df = df.drop(['TOTAL'], axis=0)
    df = df.drop(["LOCKHART EUGENE E"], axis=0)
    df = df.drop(["THE TRAVEL AGENCY IN THE PARK"], axis=0)

    #replace NaN with 0
    df = df.fillna(0)


    ### Task 3: Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = df.to_dict('index')

    for person in my_dataset:
        to_poi_count = my_dataset[person]['from_this_person_to_poi']
        from_poi_count = my_dataset[person]['from_poi_to_this_person']
        # 'from_messages' counts emails this person sent; 'to_messages' counts
        # emails they received (the original assignments were swapped)
        total_sent_emails = my_dataset[person]['from_messages']
        total_received_emails = my_dataset[person]['to_messages']

        try:
            my_dataset[person]['to_poi_ratio'] = float(to_poi_count) /\
                float(total_sent_emails)
        except ZeroDivisionError:
            my_dataset[person]['to_poi_ratio'] = 0
        try:
            my_dataset[person]['from_poi_ratio'] = float(from_poi_count) /\
                float(total_received_emails)
        except ZeroDivisionError:
            my_dataset[person]['from_poi_ratio'] = 0

    features_list = features_list + ['to_poi_ratio', 'from_poi_ratio']

    ### Preprocessing

    ### Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    #Scaling features (normalizing all features)
    min_max_scaler = MinMaxScaler()
    features = min_max_scaler.fit_transform(features)

    ### Select the best features:
    # Removes all but the k highest scoring features
    n = 6 # adjust for optimization
    skb = SelectKBest(f_classif, k=n)
    skb.fit(features, labels)  # only the scores_ attribute is used below
    #pprint(sorted(skb.scores_, reverse=True))

    #skip poi feature and combine with returned scores (key:value --> feature:score)
    scores = zip(features_list[1:], skb.scores_)

    #sort by highest scoring feature from scores
    sorted_scores = sorted(scores, key = lambda x: x[1], reverse=True)
    #print '\nOur {} highest feature scores are:'.format(n)
    #pprint(sorted_scores[:n])
                                          
    #add k highest scoring features to create new features_list
    new_features_list = poi_label + list(map(lambda x: x[0], sorted_scores))[:n]
    #print '\nOur new features list includes: '
    #pprint(new_features_list)

    ### Extract features and labels from dataset using optimized features_list
    data = featureFormat(my_dataset, new_features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)


    ### Task 4: Try a variety of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines. For more info:
    ### http://scikit-learn.org/stable/modules/pipeline.html


    print ('\nRunning GaussianNB classifier...')
    run_classifier(GaussianNB(), features, labels)

    print ('\nRunning SVM classifier...')
    run_classifier(SVC(), features, labels)

    print ('\nRunning AdaBoost classifier...')
    run_classifier(AdaBoostClassifier(), features, labels)

    print ('\nRunning DecisionTree classifier...')
    run_classifier(DecisionTreeClassifier(), features, labels)



    ### Task 5: Tune your classifier to achieve better than .3 precision and recall 
    ### using our testing script. Check the tester.py script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation. For more info: 
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    ### Re-Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)


    # Adjust SVM parameters to refine accuracy
    # variables will be passed to fine_tune_algorithm to use in a Pipeline
    print ('\nThe best fit SVM has the following scores:\n')
    svm_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                 ('SVM', SVC())]
    svm_parameters = {'SVM__kernel': ('linear', 'rbf'), 
                  'SVM__C':[0.001, 0.01, .1, 1, 10, 100, 1000], 
                  'SVM__gamma':[0.01, .1, 1, 10, 100, 1000],
                     'SKB__k': [2,3,4,5,6,7,8,9,10]}
    svm_clf = fine_tune_algorithm(svm_steps, svm_parameters, features, labels)


    # Adjust DecisionTreeClassifier parameters to refine accuracy
    print ('\nThe best fit DecisionTreeClassifer has the following scores:\n')
    dt_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()), 
                ('DT', DecisionTreeClassifier())]
    dt_parameters = {'DT__criterion': ('gini', 'entropy'), 
                  'DT__min_samples_split':[2,3,4,5,6,7,8,9,10],
                     'DT__random_state':[13],
                     'SKB__k': [2,3,4,5,6,7,8,9,10]}
    dt_clf = fine_tune_algorithm(dt_steps, dt_parameters, features, labels)


    # Adjust AdaBoostClassifier parameters to refine accuracy
    # variables will be passed to fine_tune_algorithm to use in a Pipeline
    print ('\nThe best fit AdaBoostClassifier has the following scores:\n')
    ab_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('AB', AdaBoostClassifier())]
    ab_parameters = {'AB__algorithm': ('SAMME', 'SAMME.R'), 
                  'AB__learning_rate':[.5, .6, .7, .8, .9,1],
                     'SKB__k': [2,3,4,5,6,7,8,9,10]}
    ada_clf = fine_tune_algorithm(ab_steps, ab_parameters, features, labels)

    # Adjust GaussianNB parameters to refine accuracy
    print ('\nThe best fit GaussianNB Classifier has the following scores:\n')
    nb_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('NB', GaussianNB())]
    nb_parameters = {'SKB__k': [2,3,4,5,6,7,8,9,10]}
    nb_clf = fine_tune_algorithm(nb_steps, nb_parameters, features, labels)

    # final best-fitting classifier
    clf = nb_clf

    ### Task 6: Dump your classifier, dataset, and features_list so anyone can
    ### check your results. You do not need to change anything below, but make sure
    ### that the version of poi_id.py that you submit can be run on its own and
    ### generates the necessary .pkl files for validating your results.

    dump_classifier_and_data(clf, my_dataset, features_list)
Example #35
# In[37]:

#With all features
from time import time
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# In[38]:

# without engineered features
data_old = featureFormat(my_dataset, old_features, sort_keys=True)
labels_old, features_old = targetFeatureSplit(data_old)
features_train_old, features_test_old, labels_train_old, labels_test_old = train_test_split(
    features_old, labels_old, test_size=0.3, random_state=42)

# In[39]:

data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

# In[40]:

from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
Example #36
        my_dataset[names]['messages_from_poi'] = 0
    if float(to_messages) != 0 and deferral_payments != 0:
        my_dataset[names][
            'messages_to_poi/deferral_payments'] = from_this_person_to_poi / float(
                to_messages * deferral_payments)
    else:
        my_dataset[names]['messages_to_poi/deferral_payments'] = 0

features_list_new = POI_label + financial_features + email_features_number + [
    'messages_from_poi'
] + ['messages_to_poi/deferral_payments']
#print "The List with all features with 2 new ones is:", features_list_new

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list_new, sort_keys=True)
labels, features = targetFeatureSplit(data)

from sklearn.feature_selection import SelectKBest, f_classif
featureSelecting = SelectKBest(f_classif, k=10)
featureSelecting.fit(features, labels)
featureSelected = featureSelecting.get_support()
scores = zip(featureSelecting.scores_, features_list_new[1:], featureSelected)
scoresSorted = sorted(scores, reverse=True)
#print "Scroes are:", scoresSorted
''' 
    scoresSorted =
    [(25.09754152873549, 'exercised_stock_options', True),
    (24.4676540475264, 'total_stock_value', True),
    (21.06000170753657, 'bonus', True),
    (18.575703268041785, 'salary', True),
    (11.5955476597306, 'deferred_income', True),
Example #37
new_feature_2_inputs_add('total_poi_emails', 'to_and_from_poi_emails',
                         'shared_receipt_with_poi')
new_feature_2_inputs_divide('percent_of_poi_to_emails',
                            'from_this_person_to_poi', 'to_messages')
new_feature_2_inputs_divide('percent_of_poi_from_emails',
                            'from_poi_to_this_person', 'from_messages')
new_feature_4_inputs_divide('percent_poi_emails', 'from_poi_to_this_person',
                            'from_this_person_to_poi', 'to_messages',
                            'from_messages')

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)


#Draw a plot comparing two features: f1_name and f2_name, along with their prediction line: pred.
def Draw(pred,
         features,
         poi,
         mark_poi=False,
         name="image.png",
         f1_name="feature 1",
         f2_name="feature 2"):

    #plot each cluster with a different color--add more colors for
    #drawing more than five clusters
    colors = ["b", "c", "k", "m", "g"]
    for ii, pp in enumerate(pred):
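        # [the excerpt is cut off at this point; the rest below is a
        # reconstruction of the course's standard Draw helper, assuming
        # matplotlib.pyplot is imported as plt -- not part of the original]
        plt.scatter(features[ii][0], features[ii][1], color=colors[int(pp)])
    # mark POIs with a red star so they stand out against the clusters
    if mark_poi:
        for ii, _ in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")
    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    plt.savefig(name)
    plt.show()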
Example #38
'''

# Now we prepare the final feature list; per the project requirement, its first element must be 'poi'
theFinalList = ['poi']
theFinalList.extend(myList)

sep("*", "the final feature list")
print theFinalList

#Now features_list is finalized and will be utilized by the feature_format module
features_list = theFinalList

### Extract features and labels from dataset for local testing
data = feature_format.featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = feature_format.targetFeatureSplit(data)

### Task 4: Try a variety of classifiers


def use_decision_tree_clf():
    '''
    This function uses a decision tree classifier together with grid search cross-validation
    '''
    print "This is the use_decision_tree_clf() method"
    from sklearn import tree
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import precision_recall_fscore_support

    param = {
        'max_depth': [1, 2, 3, 9],
def estimator_evaluator1(clf, dataset, feature_list, folds):
    from feature_format import featureFormat, targetFeatureSplit
    from sklearn.cross_validation import StratifiedKFold
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedKFold(labels, n_folds=folds, random_state=30)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
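        # F2 is the F-beta score with beta = 2: (1 + 2^2) * P * R / (2^2 * P + R),
        # i.e. recall is weighted more heavily than precision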
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
    except:
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
Example #40
### Task 2: Remove outliers
del data_dict['TOTAL']
del data_dict['THE TRAVEL AGENCY IN THE PARK']

### Task 3: Create new feature(s)
data_dict = hf.add_poi_mail_features(data_dict)
# features_list.append('from_poi_pct')
# features_list.append('to_poi_pct')

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract all features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Dimensions of the data are computed below
# print np.array(data).shape
# print np.sum(labels)

### Plot all original variables
# hf.plot_features(features_list, data_all)

### Print top 5 extreme observations for "loan_advances" and "total_payments"
# pprint.pprint(hf.return_sorted_values(data_dict, "loan_advances", 5))
# pprint.pprint(hf.return_sorted_values(data_dict, "total_payments", 5))

### Plot Lasso selection
# hf.lasso_selection(features, labels, features_list)
Example #41
new_data = pd.DataFrame(my_dataset.values())[features_list]
new_data.index = my_dataset.keys()
new_data['new_total_stock'] = new_data['exercised_stock_options'] + new_data[
    'restricted_stock']
new_dataset = {}
key = list(new_data.index)
for j in range(len(key)):
    v = {}
    key_v = list(new_data.columns.values)
    for i in range(len(key_v)):
        value_v = list(new_data.loc[key[j]])
        v[key_v[i]] = value_v[i]
    new_dataset[key[j]] = v
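# (equivalent, more idiomatic pandas: new_dataset = new_data.to_dict(orient='index'))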
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
features_list_new = [
    'poi', 'bonus', 'exercised_stock_options', 'expenses', 'from_messages',
    'from_poi_to_this_person', 'from_this_person_to_poi', 'other',
    'restricted_stock', 'salary', 'shared_receipt_with_poi', 'to_messages',
    'new_total_stock'
]
data_new = featureFormat(new_dataset, features_list_new, sort_keys=True)
labels_new, features_new = targetFeatureSplit(data_new)
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)
features_new = scaler.fit_transform(features_new)
from numpy import mean
from sklearn import cross_validation
from sklearn.metrics import accuracy_score, precision_score, recall_score
Example #42
    for k, v in data_dict[key].items():
        if v == 'NaN':
            data_dict[key][k] = 0

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)

from sklearn import preprocessing

scaler = preprocessing.RobustScaler()
data_scaled = scaler.fit_transform(data)
labels, features = targetFeatureSplit(data_scaled)
# for point in data:
#     salary = point[4]
#     bonus = point[2]
#     matplotlib.pyplot.scatter( salary, bonus )
#
# matplotlib.pyplot.xlabel("salary")
# matplotlib.pyplot.ylabel("bonus")
# matplotlib.pyplot.show()

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
def test_classifier(clf, dataset, feature_list, folds=1000):
    # extract the features specified in features_list
    data = featureFormat(dataset, feature_list, sort_keys=True)
    # split into labels and features (this line assumes that the first
    # feature in the array is the label, which is why "poi" must always
    # be first in the features list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        # print clf
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
        return clf
    except:
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
Example #44
            'from_this_person_to_poi'] / data_dict[i]['from_messages']
    else:
        data_dict[i]['from_this_person_to_poi_ratio'] = 'NaN'

features_list = [
    'poi', 'salary', 'deferral_payments', 'loan_advances', 'bonus',
    'restricted_stock_deferred', 'deferred_income', 'expenses',
    'exercised_stock_options', 'other', 'long_term_incentive',
    'restricted_stock', 'director_fees', 'to_messages',
    'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
    'shared_receipt_with_poi', 'from_poi_to_this_person_ratio',
    'from_this_person_to_poi_ratio'
]

data_array = featureFormat(data_dict, features_list)
poi, features = targetFeatureSplit(data_array)

# Data Split for train and test

features_train, features_test, labels_train, labels_test = train_test_split(
    features, poi, test_size=0.3, random_state=42)

# Feature scaling
scaler = MinMaxScaler()
rescaled_features_train = scaler.fit_transform(features_train)
rescaled_features_test = scaler.transform(features_test)  # transform only; the scaler is already fit on the training data

# Feature selection with SelectKBest

from sklearn.feature_selection import SelectKBest
Example #45
def my_test_classifier(clf, dataset, feature_list, folds=1000):
    from sklearn.cross_validation import StratifiedShuffleSplit
    PERF_FORMAT_STRING = "\
    \tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
    Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"

    RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
    \tFalse negatives: {:4d}\tTrue negatives: {:4d}"

    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        results = (clf,
                   PERF_FORMAT_STRING.format(accuracy,
                                             precision,
                                             recall,
                                             f1,
                                             f2,
                                             display_precision=5),
                   RESULTS_FORMAT_STRING.format(total_predictions,
                                                true_positives,
                                                false_positives,
                                                false_negatives,
                                                true_negatives), precision,
                   recall, accuracy, f1, f2)
    except:
        # pad with zeros so the tuple unpacks the same way as the success case
        results = (
            clf, "Got a divide by zero when trying out:",
            "Precision or recall may be undefined due to a lack of true positive predictions.",
            0, 0, 0, 0, 0)

    return results
df['to_poi_rate'] = df['from_this_person_to_poi'] / df['from_messages']
df['from_poi_rate'] = df['from_poi_to_this_person'] / df['to_messages']
new_feat_list = ['from_messages_median_pubIndex', 'to_poi_median_pubIndex']
df = pd.concat([df, df_new], axis=1)
df[new_feat_list] = df[new_feat_list].fillna(
    df.groupby("poi")[new_feat_list].transform("median"))

features_list = (poi_label + financial_feat_list + email_feat_list +
                 ['to_poi_rate', 'from_poi_rate'] + new_feat_list)
print("Total number of features: ", len(features_list) - 1)

### Store to my_dataset for easy export below.
my_dataset = df.to_dict(orient='index')

data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
clf = AdaBoostClassifier(random_state=45)
clf.fit(features_train, labels_train)

feat_importance = []
for i in range(len(clf.feature_importances_)):
#         mail = mail + 1
# print salary
# print mail

# peeps = enron_data.keys()
# print len(peeps)
# payments = 0
# for peep in peeps:
#     if enron_data[peep]['total_payments'] == 'NaN':
#         payments = payments + 1
# print payments
# print 100 * float(payments) / float(len(peeps))

feature_list = ["poi", "total_payments"]
data_array = featureFormat(enron_data, feature_list)
label, features = targetFeatureSplit(data_array)

print label
print len(label) + 10
# i = 0
# for lab in label:
#     # print lab
#     if lab == 1.0:
#         print features[i]
#         i = i + 1

#         nada = nada + 1
# totes = len(label)
# percent_losers = 100 * float(nada) / float(totes)
#
# print percent_losers
            'from_this_person_to_poi'] / data_dict[e]['from_messages']
    else:
        data_dict[e]['from_this_person_to_poi_ratio'] = 'NaN'

features_list = [
    'poi', 'salary', 'deferral_payments', 'loan_advances', 'bonus',
    'restricted_stock_deferred', 'deferred_income', 'expenses',
    'exercised_stock_options', 'other', 'long_term_incentive',
    'restricted_stock', 'director_fees', 'to_messages',
    'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
    'shared_receipt_with_poi', 'from_poi_to_this_person_ratio',
    'from_this_person_to_poi_ratio'
]

data_array = featureFormat(data_dict, features_list)
poi, features = targetFeatureSplit(data_array)

### split the data into train and test

features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features, poi, test_size=0.3, random_state=42)

###Feature scaling before selection, necessary for SVM.
scaler = MinMaxScaler()
rescaled_features_train = scaler.fit_transform(features_train)
rescaled_features_test = scaler.transform(features_test)  # transform only; the scaler is already fit on the training data

### Univariate selection
### Using SelectKBest, I can select features according to the k highest scores.

from sklearn.feature_selection import SelectKBest
Example #49
"""

import sys
import pickle

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load(
    open("../final_project/final_project_dataset_modified.pkl", "r"))

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split

feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.
from sklearn import linear_model
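
# A minimal completion under the exercise's naming convention (the excerpt is
# cut off here, so this is an assumption, not part of the original):
reg = linear_model.LinearRegression()
reg.fit(feature_train, target_train)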
Example #50
def selectKBest_f1_scores(clf, dataset, n_kbest_features, folds = 1000):
    """ Verifica os scores do numero de features selecionadas.
    
    Responsavel por selecionar o score F1 de 2 ate n_kbest_features.
    
    Args: 
        clf: classificador utilizado para a analise
        dataset: dados utilizados
        n_kbest_features: numero de maximo de features permitido.
        
    Returns:
        retorno1: Lista de valores K
        retorno2: Lista de Scores F1
    """
    graficoX = []
    graficoY = []
    for k in range(2, n_kbest_features):
        features_selected = select_best_features(k)
        features_selected.insert(0, "poi")
        data = featureFormat(dataset, features_selected, sort_keys = True)
        labels, features = targetFeatureSplit(data)
        cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
        true_negatives = 0
        false_negatives = 0
        true_positives = 0
        false_positives = 0
        for train_idx, test_idx in cv: 
            features_train = []
            features_test  = []
            labels_train   = []
            labels_test    = []
            for ii in train_idx:
                features_train.append( features[ii] )
                labels_train.append( labels[ii] )
            for jj in test_idx:
                features_test.append( features[jj] )
                labels_test.append( labels[jj] )

            clf.fit(features_train, labels_train)
            predictions = clf.predict(features_test)
            for prediction, truth in zip(predictions, labels_test):
                if prediction == 0 and truth == 0:
                    true_negatives += 1
                elif prediction == 0 and truth == 1:
                    false_negatives += 1
                elif prediction == 1 and truth == 0:
                    false_positives += 1
                elif prediction == 1 and truth == 1:
                    true_positives += 1
                else:
                    print "Warning: Found a predicted label not == 0 or 1."
                    print "All predictions should take value 0 or 1."
                    print "Evaluating performance for processed predictions:"
                    break
        try:
            f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
            graficoY.append(f1)
            graficoX.append(k)
        except:
            print "Got a divide by zero when trying out:", clf
            print "Precision or recall may be undefined due to a lack of true positive predicitons."
    return  graficoX, graficoY
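# Example usage (illustrative; assumes clf and my_dataset exist as in the rest
# of this script): k_values, f1_scores = selectKBest_f1_scores(clf, my_dataset, 10)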
Example #51
features_list += [
    'salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus',
    'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
    'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
    'restricted_stock', 'director_fees'
]
# email feature
features_list += [
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi'
]

print len(features_list)

### Load the dictionary containing the dataset
y, X = targetFeatureSplit(featureFormat(data_dict, features_list))
X = np.array(X)

from sklearn import tree

clf = tree.DecisionTreeClassifier(random_state=12)
clf = clf.fit(X, y)
print "clf.feature_importances_ : ", clf.feature_importances_
idx_feature_importances = np.argsort(clf.feature_importances_)[::-1]
for i in range(10):
    idx = idx_feature_importances[i]
    print "importance ", i, " - ", features_list[
        idx + 1], " - ", clf.feature_importances_[idx]

new_features_list = ['poi']
for i in range(10):
from feature_format import featureFormat, targetFeatureSplit
## this includes financial information and email address, plus how many emails were sent to or received from POIs
dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r") )


## create another dictionary to put only POI from the original dictionary
poi_dictionary = {}
for k, v in dictionary.iteritems():
    if v['poi'] == True:
        poi_dictionary[k] = v

## change this list to see results with other values
features_list = ["bonus", "exercised_stock_options"]
data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
poi_data = featureFormat( poi_dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit( data )
poi_target, poi_features = targetFeatureSplit( poi_data )



### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
poi_color = "b"
all_color = "r"


### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target_value in zip(features, target):
    plt.scatter( feature, target_value, color=all_color )
data = featureFormat(data_dict, features_list)
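
# `fit_scaler_on_original_data` is not shown in this excerpt; a plausible
# implementation, matching the comment below (fit the scaler only on values
# that are actually present, skipping the "NaN" placeholders), might be:
def fit_scaler_on_original_data(data_dict, feature_name):
    from sklearn.preprocessing import MinMaxScaler
    values = [[float(v[feature_name])] for v in data_dict.values()
              if v[feature_name] != "NaN"]
    return MinMaxScaler().fit(values)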

# Fit a scaler on the original data (So we don't get deceived by the "NaN" points)
feature_1_scaler = fit_scaler_on_original_data(data_dict, feature_1)
feature_2_scaler = fit_scaler_on_original_data(data_dict, feature_2)

# Rescale the data using the appropriate scaler
rescaled_data = data
# transform() expects one sample per row, hence reshape(-1, 1); ravel()
# flattens the transformed column back into the 1-D slice
rescaled_data[:, 1] = feature_1_scaler.transform(
    rescaled_data[:, 1].reshape(-1, 1)).ravel()
rescaled_data[:, 2] = feature_2_scaler.transform(
    rescaled_data[:, 2].reshape(-1, 1)).ravel()

poi, finance_features = targetFeatureSplit(rescaled_data)

kmeans_model = KMeans(n_clusters=2)
pred = kmeans_model.fit_predict(finance_features)  # cluster on the two finance features, not the label column

try:
    Draw(pred,
         finance_features,
         poi,
         mark_poi=False,
         name="clusters_with_feature_scaling.pdf",
         f1_name=feature_1,
         f2_name=feature_2)
except NameError:
    print "no predictions object named pred found, no clusters to plot"
Example #54
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []

        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)

        ## modified the function so it returns the results instead of only printing them
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")

        return accuracy, precision, recall
    except:
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
Example #55
    if msg_from_poi != "NaN" and to_msg != "NaN":
        my_dataset[person]['msg_from_poi_ratio'] = msg_from_poi/float(to_msg)
    else:
        my_dataset[person]['msg_from_poi_ratio'] = 0

    msg_to_poi = my_dataset[person]['from_this_person_to_poi']
    from_msg = my_dataset[person]['from_messages']
    if msg_to_poi != "NaN" and from_msg != "NaN":
        my_dataset[person]['msg_to_poi_ratio'] = msg_to_poi/float(from_msg)
    else:
        my_dataset[person]['msg_to_poi_ratio'] = 0
new_features_list = features_list + ['msg_to_poi_ratio', 'msg_from_poi_ratio']

## Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, new_features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

#Select the best features:
#Remove near-constant features: threshold .8*(1-.8) drops boolean-like
#features that take the same value in more than 80% of the samples
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
features = sel.fit_transform(features)
#keep the names of the surviving features so the SelectKBest scores
#below line up with the right feature names
retained_features = [f for f, keep in zip(new_features_list[1:], sel.get_support()) if keep]

#Removes all but the k highest scoring features
from sklearn.feature_selection import f_classif
k = 7
selector = SelectKBest(f_classif, k=k)
selector.fit_transform(features, labels)
print("Best features:")
scores = zip(retained_features, selector.scores_)
sorted_scores = sorted(scores, key = lambda x: x[1], reverse=True)
Example #56
def FeatureSelection(data_dict, features_list):
    # Convert dictionary to numpy array, converts NaN to 0.0
    data = featureFormat(data_dict, features_list, \
                         sort_keys = True, remove_all_zeroes = False)
    # Separate into labels = 'poi' and features = rest of features_list
    labels, features = targetFeatureSplit(data)

    from sklearn.feature_selection import RFECV
    # Recursive Feature Elimination with Cross Validation
    from sklearn.svm import SVC
    # Support Vector Classifier to estimate fit coefficients for each feature
    from sklearn.cross_validation import StratifiedShuffleSplit
    # cross validation maintain roughly equal number of POIs in each split

    ### Create Estimator
    # which will update the coefficients with each iteration
    # class weight is set to auto because of unbalanced data classes
    # weight will be inversely proportional to class size
    svc = SVC(kernel='linear', class_weight='auto', random_state=42)
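    # note: class_weight='auto' was deprecated in scikit-learn 0.17 in favour
    # of class_weight='balanced'; this snippet targets the older API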
    ############## Scale features ######################
    # SVC algorithm requires use scaled features
    # missing values are coded 0.0, so MinMax will preserve those zero values
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    ### Select cross-validation method
    # StratifiedShuffleSplit keeps roughly the same number of POIs in each split
    sss = StratifiedShuffleSplit(labels, 100, test_size=0.3, random_state=42)
    ### Select evaluation metric
    # Evaluate model using f1 = 2 * (precision * recall) / (precision + recall)
    # Model should be able to predict POIs, which are a small percentage of cases
    metric = 'f1'
    # run the feature eliminater
    rfecv = RFECV(estimator=svc, cv=sss, scoring=metric, step=1)
    rfecv = rfecv.fit(features, labels)

    # view results
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score using F1 (precision&recall)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    #    plt.savefig('featureSelection.png', transparent=True)
    plt.show()
    print("Optimal number of features is %d" % rfecv.n_features_)
    print(
        'Features selected by recursive feature elimination with cross validation:'
    )
    F1_score = round(rfecv.grid_scores_[rfecv.n_features_ - 1], 3)  # grid_scores_[i] is the score with i+1 features
    print('F1 score from optimal features: %r' % F1_score)
    selection = rfecv.get_support()
    selected_features = ['poi']
    rejected_features = []
    for i in range(len(selection)):
        if selection[i]:
            selected_features.append(
                features_list[i + 1])  # first feature is 'poi'=the label
        else:
            rejected_features.append(features_list[i + 1])
    print(selected_features[1:])
    print('Features eliminated:')
    print(rejected_features)
    return selected_features, F1_score
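# Example usage (illustrative; assumes data_dict and features_list are built
# as in the earlier tasks of this script):
# selected_features, F1_score = FeatureSelection(data_dict, features_list)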
Example #57
    You fill in the regression code where indicated:
"""

import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load(
    open("../final_project/final_project_dataset_modified.pkl", "r"))

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
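# fit on the training split so the plotting code the comments refer to can
# draw the regression line (a minimal completion; the excerpt ends here)
reg.fit(feature_train, target_train)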
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    all_importance = []  #for holding feature importance from each fold
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        importance = clf.feature_importances_
        all_importance.append(importance)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        avg_importance = np.mean(all_importance, axis=0)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        cm = [[true_negatives, false_positives],
              [false_negatives, true_positives]]
        print clf
        print "Feature importances", avg_importance
        print PERF_FORMAT_STRING.format(accuracy,
                                        precision,
                                        recall,
                                        f1,
                                        f2,
                                        display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives)
        print ""
        return cm
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
Example #59
        (this should be the quantity you want to predict) return targets and features as separate lists
        (sklearn can generally handle both lists and numpy arrays as input formats when training/predicting)
    """
    target = []
    features = []
    for item in data:
        target.append( item[0] )
        features.append( item[1:] )

    return target, features



data_dict = featureFormat(my_dataset, features_list, sort_keys = True)
#print "\n data_dict:", data_dict
labels, features = targetFeatureSplit(data_dict)
#print "\n The Labels are     :      ", labels
#print "\n The Labels are     :      ", features


# scale features via min-max
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)






#==============================================================================
Example #60
def test_stratified_shuffle_split(clf, dataset, feature_list, folds = 1000, scale_features = True):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
 
    # Scale features
    if(scale_features):
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print 'Total predictions: '+str(total_predictions)
        print 'Accuracy: '+str(accuracy)
        print 'Precision: '+str(precision)
        print 'Recall: '+str(recall)
        print 'F1: '+str(f1)
        print 'F2: '+str(f2)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."