def precompute_recall_precision(features_list, summary=False):
    features_list_all = ['poi'] + features_list
    data = featureFormat(my_dataset, features_list_all, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # Scaled copy of the features (not used below; f_classif's F-scores are
    # scale-invariant, so the raw features are scored directly)
    standardized = MinMaxScaler().fit_transform(features)
    # Score the features using f_classif
    sel = SelectKBest(k='all', score_func=f_classif)
    sel.fit(features, labels)
    kbest = [(features_list[i], score, i) for i, score in enumerate(sel.scores_)]
    sorted_kbest = sorted(kbest, key=operator.itemgetter(1), reverse=True)
    print "Feature Set(", len(kbest), ") List and K-best scores:"
    for tup in sorted_kbest:
        print tup[2], "\t", tup[0], tup[1]
    if not summary:
        plot_feature_correlation(features, len(kbest))
    # `methods` and `sk_fold` are defined elsewhere in the original script
    for i, method in enumerate(methods):
        pipe, params = method()
        grid_searcher = GridSearchCV(pipe, param_grid=params, cv=sk_fold, scoring='recall')
        grid_searcher.fit(features, labels)
        clf = grid_searcher.best_estimator_

        ### Extract features and labels from dataset for local testing
        data = featureFormat(my_dataset, features_list_all, sort_keys = True)
        labels, features = targetFeatureSplit(data)
        my_test_classifier(clf, my_dataset, features_list_all, i)
Example #2
def check_enron_outliers(data_dict):
    fname="enron_salary_outlier.png"
    features_list = ["poi", "salary", "exercised_stock_options"]
    #features_list = ["poi", "from_this_person_to_poi", "shared_receipt_with_poi"]

    data = featureFormat(data_dict, features_list)
    midx = data[:, 2].argmax()
    feature_2_max = max(data[:, 2])
    print "idx of max ", features_list[2], " = ", midx
    print "max " , features_list[2], " = ", feature_2_max, ", ", data[:, 2][midx]

    plt.subplot(1,2,1)
    colors=map(lambda x: 'red' if x else 'grey', data[:, 0])
    plt.scatter(data[:, 1], data[:, 2], s=40+data[:,0], c=colors, alpha=0.5, lw=0.)
    plt.xlabel(features_list[1])
    plt.ylabel(features_list[2])

    # Now remove one outlier
    data_dict.pop("TOTAL", 0)
    data = featureFormat(data_dict, features_list)
    plt.subplot(1,2,2)
    colors=map(lambda x: 'red' if x else 'grey', data[:, 0])
    plt.scatter(data[:, 1], data[:, 2], s=40+data[:,0], c=colors, alpha=0.5, lw=0.)
    #plt.axis([-0.2e7, 1.2e7, -0.5, 4.0])
    plt.ticklabel_format(useOffset=True)
    plt.xlabel(features_list[1])
    plt.ylabel(features_list[2])

    plt.title("{0} vs {1} Plots before and after Outlier Removal.".format(features_list[1], features_list[2]), x=-0.1, y=1.05)
    plt.show()
Example #3
def dataset_explore():
    enron_data = pickle.load( open("../final_project/final_project_dataset.pkl", "r") )
    features = ["salary", "bonus"]

    # === Complete dataset ===
    data = featureFormat(enron_data, features)
    features_plot(data, 0, 1, 'salary', 'bonus', 'Complete dataset')
    
    # === Dataset without outliers ===
    dataset_outlier_cleaner(enron_data)
    data = featureFormat(enron_data, features)
    features_plot(data, 0, 1, 'salary', 'bonus', 'Dataset without outliers')

    return
def make_feature_histograms(dataset, features_list):
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    plt.ioff()
    if not os.path.exists(os.path.join(os.path.dirname(__file__), 'hists')):
        os.makedirs(os.path.join(os.path.dirname(__file__), 'hists'))
    for feature, i in zip(features_list[1:], range(len(features[0]))):
        plt.figure()
        feature_values_non_poi = [f[i] for f, l in zip(features, labels) if l == 0.0]
        feature_values_poi = [f[i] for f, l in zip(features, labels) if l == 1.0]
        feature_values = feature_values_non_poi + feature_values_poi
        non_zero_values_non_poi = [x for x in feature_values_non_poi if x != 0.0]
        non_zero_values_poi = [x for x in feature_values_poi if x != 0.0]
        non_zero_values = non_zero_values_non_poi + non_zero_values_poi
        q1, q3 = np.percentile(non_zero_values, [25, 75])
        iqr = q3 - q1
        outliers_hi = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x > q3]
        outliers_lo = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x < q1]
        # get same binwidth for both POI and non-POI
        bins = np.histogram(non_zero_values, bins=50)[1]
        plt.hist(non_zero_values_poi, bins=bins, alpha=.5, lw=0, color='r', label='POIs')
        plt.hist(non_zero_values_non_poi, bins=bins, alpha=.5, lw=0, color='b', label='Non-POIs')
        msg = ('Maximum %s: %d\n' % (feature, max(non_zero_values)) +
               'Minimum %s: %d\n' % (feature, min(non_zero_values)) +
               'Mean %s: %.5f\n' % (feature, np.mean(non_zero_values)) +
               'Median %s: %d\n' % (feature, np.median(non_zero_values)) +
               '\nTotal Number of Values: %d\n' % len(feature_values) +
               'Total Number of Non-Zero Values: %d\n' % len(non_zero_values))
        
        # see which features have low number of non-zero values
        #if float(len(non_zero_values)) / len(feature_values) < 0.5:
        #    print feature
        
        # print out some outlier values if they exist
        for outliers, which_ols in zip([outliers_hi, outliers_lo], ['Top', 'Bottom']):
            if outliers:
                if len(outliers) >= 5:
                    top_n = 5
                else:
                    top_n = len(outliers)
                outliers = sorted(outliers)
                ol_line = q1 - 1.5*iqr
                if which_ols == 'Top':
                    outliers = list(reversed(outliers))
                    ol_line = q3 + 1.5*iqr
                msg += '\n%s %d Outliers: ' % (which_ols, top_n)
                for i in range(top_n):
                    if i != top_n - 1:
                        msg += '%d, ' % outliers[i]
                    else:
                        msg += '%d' % outliers[i]
                plt.axvline(ol_line, lw=.5, ls='--', c='r')
        
        plt.figtext(.3, .4, msg)
        #plt.grid(axis='y')
        plt.title("%s histogram (non-zero values)" % feature)
        plt.legend()
        figname = 'hists/%s_histogram.png' % feature
        plt.savefig(figname)
        plt.close()
Example #5
def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # configure split of test_size and train_size
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42, 
                                test_size = .2, train_size = .8)
#    print cv

    for train_idx, test_idx in cv: 
        features_train      = []
        features_test       = []
        features_validation = []
        labels_train        = []
        labels_test         = []
        labels_validation   = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            if jj % 2 == 0: 
                features_validation.append( features[jj] )
                labels_validation.append( labels[jj] )
            else: 
                features_test.append( features[jj] )
                labels_test.append( labels[jj] )
                
    # NOTE: only the split from the final CV fold is used below, since the
    # loop above overwrites these lists on every iteration
    fit_and_test_classifier(clf, features_train, labels_train, features_test, labels_test)
    fit_and_test_classifier(clf, features_train, labels_train, features_validation, labels_validation)
    # Determine size of training, test & validation sets
    print "features_train:", len(features_train), "labels_train:", len(labels_train)
    print "features_test:", len(features_test), "labels_test:", len(labels_test)
    print "features_validation:", len(features_validation), "labels_validation:", len(labels_validation)
def cluster2Features():
    ### the input features we want to use
    ### can be any key in the person-level dictionary (salary, director_fees, etc.)
    feature_1 = "salary"
    feature_2 = "exercised_stock_options"
    poi  = "poi"
    features_list = [poi, feature_1, feature_2]
    data = featureFormat(data_dict, features_list )
    poi, finance_features = targetFeatureSplit( data )

    ### in the "clustering with 3 features" part of the mini-project,
    ### you'll want to change this line to
    ### for f1, f2, _ in finance_features:
    ### (as it's currently written, the line below assumes 2 features)
    #print finance_features
    for f1, f2 in finance_features:
        plt.scatter( f1, f2)
    plt.show()

    ### cluster here; create predictions of the cluster labels
    ### for the data and store them to a list called pred
    from sklearn.cluster import KMeans
    estimators = {'k_means_2': KMeans(n_clusters=2)}
    # cluster on the features only; the first column of `data` is the poi label
    estimators['k_means_2'].fit(finance_features)
    pred = estimators['k_means_2'].predict(finance_features)

    ### rename the "name" parameter when you change the number of features
    ### so that the figure gets saved to a different file
    try:
        Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
    except NameError:
        print "no predictions object named pred found, no clusters to plot"
def makeData(dataset, feature_list, folds = 1000):
    """Make and return dataset prepared for training.

    Arguments:
    dataset --- dict of dict
    feature_list --- list of strings
    folds --- int
    
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    
    for train_idx, test_idx in cv: 
        
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
    return features_train, features_test, labels_train, labels_test
def test_training_stratified_split(dataset, features_list, testsize=0.2):
    """
    For E+F dataset, split dataset into the training and test 
    set using stratified method.

    Input:
    dataset: data in dictionary format 
    features_list: the full list of features to selection from 
    test: the proportion of the dataset to include in the test split

    Return:
    labels_train, labels_test, features_train, features_test

    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    labels = np.array([int(label) for label in labels])
    features = np.array(features)
    ### Split data into test set and training set
    sss = StratifiedShuffleSplit(labels, 1, test_size=testsize, random_state=0)

    for train_index, test_index in sss:
        labels_train, labels_test = labels[train_index].tolist(), labels[test_index].tolist()
        features_train, features_test = features[train_index].tolist(), features[test_index].tolist()
    return labels_train, labels_test, features_train, features_test
def main():
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, 1000, random_state=42)

    # Feature importance totals for averaging across folds; one slot per
    # feature, excluding the leading 'poi' label
    totals = [0] * (len(feature_list) - 1)

    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])
        clf = clf.fit(features_train, labels_train)
        for i in range(len(clf.feature_importances_)):
            totals[i] += clf.feature_importances_[i]
        # print clf.feature_importances_

    for i in range(len(totals)):
        totals[i] /= 1000

    # Display results
    print "Feature list: ", feature_list[1:]
    print "Importances: ", totals
def univariateFeatureSelection(f_list, my_dataset):
	result = []
	for feature in f_list:
		# Replace 'NaN' with 0
		for name in my_dataset:
			data_point = my_dataset[name]
			if not data_point[feature]:
				data_point[feature] = 0
			elif data_point[feature] == 'NaN':
				data_point[feature] = 0

		data = featureFormat(my_dataset, ['poi',feature], sort_keys = True, remove_all_zeroes = False)
		labels, features = targetFeatureSplit(data)
		features = [abs(x) for x in features]
		from sklearn.cross_validation import StratifiedShuffleSplit
		cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
		features_train = []
		features_test  = []
		labels_train   = []
		labels_test    = []
		for train_idx, test_idx in cv:
			for ii in train_idx:
				features_train.append( features[ii] )
				labels_train.append( labels[ii] )
			for jj in test_idx:
				features_test.append( features[jj] )
				labels_test.append( labels[jj] )
		from sklearn.naive_bayes import GaussianNB
		clf = GaussianNB()
		clf.fit(features_train, labels_train)
		predictions = clf.predict(features_test)
		# score_func is defined elsewhere in the original project; it is assumed
		# to return per-metric scores such as (precision, recall, fscore, ...)
		score = score_func(labels_test, predictions)
		result.append((feature, score[0], score[1], score[2]))
	result = sorted(result, reverse=True, key=lambda x: x[3])
	return result
def get_most_important_features(dataset, features_list):
    """Calculates the feature importances.
    Takes as input a dataset and a list of features.
    Creates an overfit Decision Tree and calculates the feature importances.
    Returns a list with the feature importances.
    """
    # creating an overfitted decision tree
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # new features filtered, NaN values removed
    features_train, features_test, labels_train, labels_test = train_test_split(features,
                                                                                    labels,
                                                                                    test_size=0.3,
                                                                                    random_state=42)

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    acc = accuracy_score(labels_test, pred)
    # uncomment to print the accuracy score
    #print "overfitted accuracy", acc

    # calculating feature importances
    feat_imp = clf.feature_importances_
    # uncomment to print the most important (common) ones
    #print feat_imp
    #for index, feature in enumerate(feat_imp):
    #    if feature > 0.2:
    #        print "spot:", index, ":", features_list[index+1], " | value:", feature
    return feat_imp
def prep_features(df, features_list, feature_scaled):

    """
    Arguments:
        load dataframe (or dictionary), and features_list
    return
        scaled features, labels in numpy.ndarray, and
        scaled features, labels in pandas dataframe
    """
    from feature_format import featureFormat, targetFeatureSplit
    import pandas as pd
    # for pandas dataframe
    df1 = df[features_list]
    features_df = df1.drop('poi', axis=1)#.astype(float)  # new features (pandas dataframe)
    labels_df = df1['poi']  # new labels (pandas dataframe)
    if feature_scaled:
        features_df_scaled = scale_features(features_df) # scale features
    else:
        features_df_scaled = features_df
    # for dictionary
    df2 = df[features_list]
    data_dict_new = df2.T.to_dict()  # data_dict (final)
    features_dic = features_df.copy()
    X_features = list(features_dic.columns)
    features_list_new = ['poi'] + X_features  # selected features list (final)
    data = featureFormat(data_dict_new, features_list_new, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    if feature_scaled == True:
        features = scale_features(features)

    return features, labels, features_df_scaled, labels_df
def selectKBest(previous_result, data):
	# remove 'restricted_stock_deferred' and 'director_fees'
	previous_result.pop(4)
	previous_result.pop(4)

	result = []
	_k = 10
	for k in range(0,_k):
		feature_list = ['poi']
		for n in range(0,k+1):
			feature_list.append(previous_result[n][0])

		data = featureFormat(my_dataset, feature_list, sort_keys = True, remove_all_zeroes = False)
		labels, features = targetFeatureSplit(data)
		features = [abs(x) for x in features]
		from sklearn.cross_validation import StratifiedShuffleSplit
		cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
		features_train = []
		features_test  = []
		labels_train   = []
		labels_test    = []
		for train_idx, test_idx in cv:
			for ii in train_idx:
				features_train.append( features[ii] )
				labels_train.append( labels[ii] )
			for jj in test_idx:
				features_test.append( features[jj] )
				labels_test.append( labels[jj] )
		from sklearn.naive_bayes import GaussianNB
		clf = GaussianNB()
		clf.fit(features_train, labels_train)
		predictions = clf.predict(features_test)
		score = score_func(labels_test,predictions)
		result.append((k+1,score[0],score[1],score[2]))
	return result
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)

        print clf
        #print "Best Params: ", clf.best_params_
        #print "Best Estimator: ", clf.best_estimator_
        #current_classifier = clf.best_estimator_
        # placeholder: set this from clf.feature_importances_ (or from the
        # commented-out best_estimator_ above) when available
        importance = None


        if importance is not None:
            print "Importance: ", importance
            imp = sorted(zip(feature_list, importance), key=lambda tup: tup[1], reverse=True)
            print "Most Important Variables: " + str(imp)


        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives,
                                           true_negatives)
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out: ", clf
def algorithm(data_dict, features_list):

    from feature_format import featureFormat
    from feature_format import targetFeatureSplit
   
    ### store to my_dataset for easy export below
    my_dataset = data_dict
    data = featureFormat(my_dataset, features_list)

    # scale features
    #data = scaleFeatures(data)
    
    ### split into labels and features (this line assumes that the first
    ### feature in the array is the label, which is why "poi" must always
    ### be first in features_list
    labels, features = targetFeatureSplit(data)

    
    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators = 1000, random_state = 202, \
    		learning_rate = 1.0, algorithm = "SAMME.R")
    
    ### dump your classifier, dataset and features_list so 
    ### anyone can run/check your results
    pickle.dump(clf, open("my_classifier.pkl", "w") )
    pickle.dump(data_dict, open("my_dataset.pkl", "w") )
    pickle.dump(features_list, open("my_feature_list.pkl", "w") )
Example #16
def __saveSelectedDataToCsv(self, features_list):
    print "Save selected data to csv"
    data = featureFormat(self.data_dict, features_list, sort_keys=True)
    df = pd.DataFrame(data, columns=features_list)
    df.to_csv('selecteddata.csv')
    print df.describe()
    return
def test_classifier(clf, dataset, feature_list, scaling = False, folds = 1000):
    score_all = []
    precision_all = []
    recall_all = []
    
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    
    if scaling == True:
        min_max_scaler = preprocessing.MinMaxScaler()
        features = min_max_scaler.fit_transform(features)
		
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    for train_indices, test_indices in cv: 
        features_train= [features[ii] for ii in train_indices]
        features_test= [features[ii] for ii in test_indices]
        labels_train=[labels[ii] for ii in train_indices]
        labels_test=[labels[ii] for ii in test_indices]
        
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        score_all.append(clf.score(features_test,labels_test))
        precision_all.append(precision_score(labels_test,pred))
        recall_all.append(recall_score(labels_test,pred))

    precision = numpy.average(precision_all)
    recall = numpy.average(recall_all)
    score = numpy.average(score_all)
        
    print "Score: " + str(score)
    print "Recall: " + str(precision)
    print "Precision: " + str(recall)	
def validation(clf, dataset, feature_list, test_size=0.2, n_iter=1000):
    '''
    Validate the given classifier using stratified shuffle split cross-validation.
    Returns the average precision and recall.
    '''
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)

    precision = []
    recall = []

    cv = StratifiedShuffleSplit(labels, n_iter, test_size=test_size, random_state = 42)
    for train_idx, test_idx in cv:
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )

        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        precision.append(precision_score(labels_test, predictions))
        recall.append(recall_score(labels_test, predictions))

    return np.mean(precision), np.mean(recall)
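
# Hypothetical usage sketch for validation(); `clf` and `my_dataset` are
# assumed to be the classifier and dataset dictionary defined elsewhere in
# the project, and the three-feature list is illustrative:
avg_precision, avg_recall = validation(clf, my_dataset, ["poi", "salary", "bonus"])
print "precision:", avg_precision, "recall:", avg_recall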
def find_best_parameters(pipeline, parameters, score_func, dataset, 
                         feature_list, test_size=0.2, n_iter=10):
    """
    find best parameter by using GridSearchCV with given scoring function.

    returns GridSearchCV object that has best parameters.
    """

    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, 1, test_size=test_size, random_state = 42)
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )

    sss = StratifiedShuffleSplit(labels_train, n_iter=n_iter, test_size=test_size, random_state=42)

    clf = GridSearchCV(pipeline, parameters, scoring=score_func, cv=sss, n_jobs=-1)
    clf.fit(features_train, labels_train)

    return clf
def get_k_best_features(data_dict, features_list, k):

  """
  runs scikit-learn's SelectKBest feature selection to get k best features
    
  Args:
    data_dict: data dictionary for enron
    feature_list: a list of features with first feature as target label
    k: Number of best features which need to be selected

  Returns:
    returns a list of k best features and list of lists where inner list's 
    first element is feature and the second element is feature score
  """

  data = featureFormat(data_dict, features_list)
  labels, features = targetFeatureSplit(data)

  k_best = SelectKBest(k=k)
  k_best.fit(features, labels)
  scores = k_best.scores_
  unsorted_pairs = zip(features_list[1:], scores)
  sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
  k_best_features = dict(sorted_pairs[:k])
  return k_best_features.keys(), map(list, sorted_pairs)
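
# Hypothetical usage sketch; `data_dict` and `features_list` stand in for the
# project's dataset dictionary and full feature list:
best_features, scored_pairs = get_k_best_features(data_dict, features_list, 10)
print "10 best features:", best_features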
def get_k_best(df, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection
        returns dict where keys=features, values=scores
    """
    # feature, label = feature_format_scale(data_dict, features_list)
    from poi_dataprocess import *
    from feature_format import featureFormat, targetFeatureSplit

    data_dict_new = df[features_list].T.to_dict()

    data = featureFormat(data_dict_new, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # df = df[features_list]
    # features = df.drop('poi', axis=1)#.astype(float)
    # labels = df['poi']

    from sklearn import preprocessing

    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)

    from sklearn.feature_selection import SelectKBest

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    k_best_features = dict(sorted_pairs[:k])

    return k_best_features
def select_k_best(data_dict, features_list, k):
    # Create dataset from feature list
    data = featureFormat(data_dict, features_list)
    # Split dataset into labels and features
    labels, features = targetFeatureSplit(data)
    # Create Min/Max Scaler
    scaler = preprocessing.MinMaxScaler()
    # Scale Features
    features = scaler.fit_transform(features)
    # Create k_best feature selection
    k_best = SelectKBest(k=k)
    # Fit k_best
    k_best.fit(features, labels)
    # Get k_best scores
    scores = k_best.scores_
    # Create list with features and scores
    unsorted_pairs = zip(features_list[1:], scores)
    # Sort list
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    # Create dict
    if k == "all":
        k_best_features = dict(sorted_pairs)
    else:
        k_best_features = dict(sorted_pairs[:k])
    return k_best_features
Example #23
def main():
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    ### Run testing script
    test_classifier(clf, features, labels)
def regressionBonusAndLongTermInc():
    ### list the features you want to look at--first item in the
    ### list will be the "target" feature
    features_list = ["bonus", "long_term_incentive"]
    data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
    #, sort_keys = '../../tools/python2_lesson06_keys.pkl'
    target, features = targetFeatureSplit( data )
    #print target
    #print features

    ### training-testing split needed in regression, just like classification
    from sklearn.cross_validation import train_test_split
    feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
    train_color = "b"
    test_color = "r"

    ### Your regression goes here!
    ### Please name it reg, so that the plotting code below picks it up and
    ### plots it correctly. Don't forget to change the test_color above from "b" to
    ### "r" to differentiate training points from test points.
    from sklearn import linear_model
    ### name your regression reg
    reg = linear_model.LinearRegression()

    ### your code goes here!
    reg.fit(feature_train, target_train)
    #find the score on the test data
    print reg.score(feature_test, target_test)
Example #25
def ptest(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    precision = 1.0*true_positives/(true_positives+false_positives)
    return precision
def get_k_best(dictionary, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection returning:
    {feature:score}
    """
    data = featureFormat(dictionary, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    pairs = zip(features_list[1:], scores)
    #combined scores and features into a pandas dataframe then sort 
    k_best_features = pd.DataFrame(pairs,columns = ['feature','score'])
    k_best_features = k_best_features.sort('score',ascending = False)
    
    
    #merge with null counts    
    df_nan_counts = get_nan_counts(dictionary)
    k_best_features = pd.merge(k_best_features,df_nan_counts,on= 'feature')  
    
    #eliminate infinite values
    k_best_features = k_best_features[np.isinf(k_best_features.score)==False]
    print 'Feature Selection by k_best_features\n'
    print "{0} best features in descending order: {1}\n".format(k, k_best_features.feature.values[:k])
    print '{0}\n'.format(k_best_features[:k])
    
    
    return k_best_features[:k]
Example #27
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    ## Tester lacks feature scaling, let's add it here:

    # Scale features:
    mins = np.min(data, axis=0)
    maxs = np.max(data, axis=0)
    data = (data - mins) / (maxs - mins)

    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5)
        print RESULTS_FORMAT_STRING.format(
            total_predictions, true_positives, false_positives, false_negatives, true_negatives
        )
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
Example #28
def tune_classifier(clf_name, clf, dataset, features_list, scores, folds = 1000):
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    scale = clf_name in {'kNN', 'SVM', 'kNN (hand-tuned)'}
    if scale:
        # Perform feature scaling 
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    if clf_name == 'kNN':
        parameter_grid = [{'p': [1, 2, 3], 
                           'n_neighbors': [1, 5, 7, 10, 15],
                           'leaf_size': [30, 50, 70, 100]}]
    elif clf_name == 'Decision Tree':
        parameter_grid = [{'min_samples_split': [2, 3, 4, 5], 
                           'min_samples_leaf': [2, 3, 4, 5], 
                           'splitter': ['random', 'best']}]
    else:
        # fall back to the classifier's current settings for any other name
        parameter_grid = [{}]
    best_params = {}
    for score in scores:
        grid_clf = GridSearchCV(clf, parameter_grid, cv=cv, 
                                scoring="{0}_weighted".format(score))
        grid_clf.fit(features, labels)
        best_params = grid_clf.best_params_
        #print("Grid scores:")
        #for params, mean_score, scores in grid_clf.grid_scores_:
        #    print("{:0.3f} {:+0.03f} for {!r}".format(mean_score, scores.std() * 2, params))
    print("Classifier {0} has tuned parameters {1}".format(clf_name, best_params))
    return best_params
def select_k_best_features(data, feature_list, k):
    """
    For E+F dataset, select k best features based on SelectKBest from 
    sklearn.feature_selection

    Input:
    data: data in dictionary format 
    feature_list: the full list of features to selection from 
    k: the number of features to keep

    Return:
    the list of length of k+1 with the first element as 'poi' and other 
    k best features 

    """
    data = featureFormat(data, feature_list)
    labels, features = targetFeatureSplit(data)
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    impt_unsorted = zip(feature_list[1:], k_best.scores_)
    impt_sorted = list(sorted(impt_unsorted, key=lambda x: x[1], reverse=True))
    k_best_features = [elem[0] for elem in impt_sorted][:k]
    print k, "best features:"
    print k_best_features
    return ['poi'] + k_best_features
Example #30
# Supplied in the zip file from Udacity was a list of Persons of Interest containing 35 individuals, sourced from a
# USA Today article (http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm).
# We identify a discrepancy between what was provided in that source file and the final_project_dataset.pkl file.
# Is the dataset a better indicator of POI?
with open("poi_names.txt") as f:
    poi_list_usat = len(f.readlines()[2:])
print 'Number of POIs from USA Today:', (poi_list_usat)
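
# For comparison, a minimal sketch (assuming data_dict is already loaded)
# counting the POIs flagged in the project dataset itself:
poi_count_dataset = sum(1 for name in data_dict if data_dict[name]['poi'])
print 'Number of POIs in final_project_dataset.pkl:', poi_count_dataset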

### Task 2: Remove outliers

# We can visualize some of the features we think may be indicators of fraud to get a
# good idea of what the data looks like, potentially identifying some outliers.
# Using the enron61702insiderpay.pdf, we see high dollar values for feature "bonus" and
# "total_stock_value".
features_outlier_viz = ['bonus', 'total_stock_value']
features = featureFormat(data_dict, features_outlier_viz, sort_keys=True)

for point in features:
    bonus = point[0]
    total_stock_value = point[1]
    plt.scatter(bonus, total_stock_value)

plt.xlabel("Bonus")
plt.ylabel("Total Stock Value")
plt.show()

# In the Outlier Mini-Project we identified "TOTAL" as an important outlier for removal.
# We will include the removal of this "individual" as well as "The Travel Agency in the
# Park" because they are not really individuals working at Enron. We will also remove
# individuals with no data (NaN) for all features, which seemed out of place. These
# outliers are fairly easy to identify and remove.
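
# A minimal sketch of that removal (assuming a record counts as "no data"
# when every feature besides 'poi' is 'NaN'):
for key in ["TOTAL", "THE TRAVEL AGENCY IN THE PARK"]:
    data_dict.pop(key, 0)
empty_records = [name for name, row in data_dict.items()
                 if all(v == 'NaN' for f, v in row.items() if f != 'poi')]
for key in empty_records:
    data_dict.pop(key, 0)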
Example #31
                         'from_this_person_to_poi')
new_feature_2_inputs_add('total_poi_emails', 'to_and_from_poi_emails',
                         'shared_receipt_with_poi')
new_feature_2_inputs_divide('percent_of_poi_to_emails',
                            'from_this_person_to_poi', 'to_messages')
new_feature_2_inputs_divide('percent_of_poi_from_emails',
                            'from_poi_to_this_person', 'from_messages')
new_feature_4_inputs_divide('percent_poi_emails', 'from_poi_to_this_person',
                            'from_this_person_to_poi', 'to_messages',
                            'from_messages')

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)


#Draw a plot comparing two features: f1_name and f2_name, along with their prediction line: pred.
def Draw(pred,
         features,
         poi,
         mark_poi=False,
         name="image.png",
         f1_name="feature 1",
         f2_name="feature 2"):

    #plot each cluster with a different color--add more colors for
    #drawing more than five clusters
    colors = ["b", "c", "k", "m", "g"]
Example #32
]  # You will need to use more features
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

data_dict.pop('TOTAL', 0)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0)

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
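
# A minimal sketch of the Pipeline approach mentioned above (PCA feeding the
# same classifier); the component count here is an illustrative assumption:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
clf_pipeline = Pipeline([('pca', PCA(n_components=5)),
                         ('nb', GaussianNB())])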

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    all_importance = []  #for holding feature importance from each fold
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        importance = clf.feature_importances_
        all_importance.append(importance)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        avg_importance = np.mean(all_importance, axis=0)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        cm = [[true_negatives, false_positives],
              [false_negatives, true_positives]]
        print clf
        print "Feature importances", avg_importance
        print PERF_FORMAT_STRING.format(accuracy,
                                        precision,
                                        recall,
                                        f1,
                                        f2,
                                        display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives)
        print ""
        return cm
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
Example #34
"""

import sys
from sklearn import datasets, linear_model

import pickle
sys.path.append("tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load(open("tools/final_project_dataset_modified.pkl", "r"))
### list the features you want to look at--first item in the
### list will be the "target" feature
#features_list = ["bonus", "long_term_incentive"]
features_list = ["bonus", "salary"]

data = featureFormat(dictionary,
                     features_list,
                     remove_any_zeroes=True,
                     sort_keys="tools/python2_lesson06_keys.pkl")
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

reg = linear_model.LinearRegression()

# Train the model using the training sets
reg.fit(feature_train, target_train)
prediction = reg.predict(feature_train)
print ''

for user in data_dict:
    # store the ratio as a float so it survives featureFormat's numeric handling
    data_dict[user]['ratio_to_poi'] = (
        float(data_dict[user]['from_this_person_to_poi']) /
        float(data_dict[user]['from_messages']))

### Store to my_dataset for easy export below.
my_dataset = data_dict

print 'Extract features and labels from dataset'
print ''

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset,
                     features_list,
                     sort_keys=True,
                     remove_NaN=False)
labels, features = targetFeatureSplit(data)
#%%

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
Example #36
#==============================================================================
# # ### Task3 : Plotting of features
#==============================================================================
#==============================================================================
from pandas.plotting import scatter_matrix
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

##Scaling the features
scaler = preprocessing.MinMaxScaler()


data1,data2,list_base,list_base_2,df,df_2,FinalDf,FinalDf_2,df2_mean,X,Y,Y_dict,Y_list = {},{},{},{},{},{},{},{},{},{},{},{},{}
for i in Retailers:
    # featureFormat_nan is a project-specific variant defined elsewhere
    data1[i] = featureFormat(my_dataset[i], valid_[i])
    data2[i] = featureFormat_nan(my_dataset[i], valid_[i])
    data1[i] = scaler.fit_transform(data1[i])
    #list_base = {}
    #for i in Retailers:
    list_ = []
    for k in range(len(valid_[i])):
        j = []
        for point in data1[i]:
            j.append(point[k])
        list_.append(j)
    list_base.update({i: list_})

    df[i] = pd.DataFrame(list_base[i], index=valid_[i])
    FinalDf[i] = df[i].transpose()
"""
    Draws a little scatterplot of the training/testing data

    You fill in the regression code where indicated:
"""

import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
dictionary = pickle.load(
    open("../final_project/final_project_dataset_modified.pkl", "r"))

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.

from sklearn import linear_model
Example #38
#!/usr/bin/python
"""
    Starter code for the validation mini-project.
    The first step toward building your POI identifier!

    Start by loading/formatting the data

    After that, it's not our code anymore--it's yours!
"""

import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

data_dict = pickle.load(open("../final_project/final_project_dataset.pkl",
                             "r"))

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict,
                     features_list,
                     sort_keys='../tools/python2_lesson13_keys.pkl')
labels, features = targetFeatureSplit(data)

### it's all yours from here forward!
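
# A minimal sketch of one way to continue (the usual next step in this
# mini-project); the split sizes and classifier are illustrative assumptions:
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
print accuracy_score(labels_test, clf.predict(features_test))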
# `vectorizer` is the TfidfVectorizer fitted earlier in the original script
word_features = vectorizer.get_feature_names()

sys.stdout.write("Done\n")

sys.stdout.write("Preprocessing data...   ")
sys.stdout.flush()

# Here I'm not adding all the word features into data_dict or my_dataset,
# that will waste too much time and they'll be extremely large. Instead,
# I preprocess the data_dict with featureFormat, then concatenate with
# the matrix generated by TfidfVectorizer.
# After that, do feature scaling and selection, then transform the
# final numpy array into original dict format
data = featureFormat(data_dict,
                     all_features,
                     remove_all_zeroes=False,
                     sort_keys=True)

# Concatenate the two arrays horizontally (column-wise) with np.hstack
data = np.hstack((data, tf.toarray()))

labels, features = targetFeatureSplit(data)

# Feature scaling
# Note that this applies to the final dataset used by tester.py,
# see L110~L115 and L178~L185
features = MinMaxScaler().fit_transform(features)

# Add an underscore before every word feature name to avoid ambiguity with
# original features
for feature in word_features:
Example #40
#	else: # to avoid this person from being removed from the master list
#		my_dataset[name][fname] = 0.0
#features_list.append(fname)
### add a feature for Total_stock/total_payments to see who had most to gain from the stock
fname = 'total_stock_to_payments'
for name in keys:
	if (my_dataset[name]['total_stock_value'] != 'NaN' and my_dataset[name]['total_payments'] != 'NaN'):
		my_dataset[name][fname] = float(my_dataset[name]['total_stock_value'])/my_dataset[name]['total_payments']
	else: # to avoid this person from being removed from the master list
		my_dataset[name][fname] = 0.0
features_list.append(fname)

nfeat = len(features_list)
print "number of features:", nfeat
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, remove_all_zeroes=False, sort_keys = True)
labels, features = targetFeatureSplit(data)
# scale features to normalize them
#features = preprocessing.scale(np.array(features))
features = np.array(features)	
labels = np.array(labels)

nsample = len(labels)
print "number of keys:", len(keys)
print "number of samples:", nsample
print "emp name                 ",
for feature in features_list:
    print '{:>10}'.format(feature),
print ''
i = 0
select_feature = 1
Example #41

print ('new score = {0}'.format(reg2.score(ages_test, net_worths_test)))


# ## Enron Outliers

# In[20]:


from feature_format import featureFormat, targetFeatureSplit

### read in data dictionary, convert to numpy array
data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "rb") )
features = ["salary", "bonus"]
data = featureFormat(data_dict, features)

plt.scatter(data[:,0], data[:,1])
plt.xlabel("salary")
plt.ylabel("bonus")

import pandas as pd
df = pd.DataFrame(data_dict)
df.loc['salary',:] = pd.to_numeric(df.loc['salary',:], errors='coerce')
df.loc['bonus',:] = pd.to_numeric(df.loc['bonus',:], errors='coerce')
x = df.loc['salary',:].astype('float64')
print(x.idxmax(axis=1))


# ## Any More Outliers?
    'shared_receipt_with_poi'
]
enron_pd = pd.DataFrame.from_dict(data_dict, orient='index')
enron_pd[all_features] = enron_pd[all_features].astype(float)
print enron_pd.describe()

# how many POIs
poi_count = int(enron_pd['poi'].sum())
print "There are", poi_count, "POI (persons of interest) and", len(enron_pd) - poi_count, "non-POI"

# REMOVE "TOTAL", "THE TRAVEL AGENCY IN THE PARK" rows
# code from Lesson: Enron Outliers to plot
data_out = featureFormat(data_dict, features_list)
for point in data_out:
    salary = point[1]
    bonus = point[2]
    #matplotlib.pyplot.scatter( salary, bonus )
    plt.scatter(salary, bonus)

plt.xlabel("salary")
plt.ylabel("bonus")
plt.show()


# 2.2 Function to remove outliers
def remove_outlier(dict_object, keys):
    ### removes list of outliers keys from dict object
    for key in keys:
        dict_object.pop(key, 0)
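
# Hypothetical usage, mirroring the outliers identified earlier:
remove_outlier(data_dict, ["TOTAL", "THE TRAVEL AGENCY IN THE PARK"])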
Example #43
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []

        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)

        ## modified the function to return the results instead of printing them
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")

        return accuracy, precision, recall
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predictions."
        )
Example #44
0
]

# In[183]:

from sklearn.preprocessing import scale
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
import numpy as np
import tester

# In[184]:

from sklearn import preprocessing

data = featureFormat(my_dataset, feature_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)
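# MinMaxScaler rescales each feature to [0, 1]; this matters for distance-based
# learners such as KMeans, while tree-based models and GaussianNB are largely
# insensitive to monotonic feature rescaling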

# In[185]:

# dt_clf number of features
n_features = np.arange(1, len(feature_list))

dt_pipe = Pipeline([('select_features', SelectKBest()),
                    ('classify', DecisionTreeClassifier())])

param_grid = [{'select_features__k': n_features}]

dt_clf = GridSearchCV(dt_pipe, param_grid=param_grid, scoring='f1', cv=10)
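# A sketch (not part of the original snippet) of how the grid search above
# would typically be used with the scaled features/labels from the cell above:
#dt_clf.fit(features, labels)
#print(dt_clf.best_params_)      # e.g. the k chosen for SelectKBest
#best_dt = dt_clf.best_estimator_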
Example #45
0
from time import time

import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

data_dict = pickle.load(open("../final_project/final_project_dataset.pkl",
                             "r"))

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)  #original
#data = featureFormat(data_dict, features_list, sort_keys = '../tools/python2_lesson13_keys.pkl')
#

labels, features = targetFeatureSplit(data)
print "type(labels)=", type(labels), "len(labels)=", len(labels)
print "type(features)=", type(features), "len(features)=", len(features)

### it's all yours from here forward!
from sklearn import tree

start_time = time()
clf = tree.DecisionTreeClassifier()
print("--- time to initialise tree.DecisionTreeClassifier %s seconds ---" %
      (time() - start_time))
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # The initial script raised an error: StratifiedShuffleSplit is not iterable.
    # I rewrote the cv StratifiedShuffleSplit object with the same parameters according to the sklearn doc:
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
    #cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
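    # Under the old sklearn.cross_validation API the splitter itself was
    # iterable; under sklearn.model_selection the train/test indices come from
    # cv.split(features, labels), as used in the loop below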

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    #for train_idx, test_idx in cv:
    for train_idx, test_idx in cv.split(features, labels):
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy,
                                        precision,
                                        recall,
                                        f1,
                                        f2,
                                        display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."
Example #47
0
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import matplotlib.pyplot

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
outlier_check = ['salary', 'bonus']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)
my_data = data_dict
data = featureFormat(my_data, outlier_check, sort_keys=True)

### Task 2: Remove outliers

for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter(salary, bonus)

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

my_data.pop('TOTAL', 0)
my_data.pop('THE TRAVEL AGENCY IN THE PARK', 0)
Example #48
0
# In[10]:

#For convenience of data cleaning (removing NaN), use pandas (Ref: Data visualization)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# In[11]:

unwanted_features = ["poi", "email_address"]

features_temp = [ele for ele in features_temp if ele not in unwanted_features]
# to make the first element in the list "poi"
features_temp = ["poi"] + features_temp
feature_data = featureFormat(data_dict, features_temp, remove_NaN=False)

# In[12]:

temp_df = pd.DataFrame(data=feature_data,
                       columns=features_temp,
                       index=name_data_point)
print "With NaN"
print temp_df.info()

# In[13]:

poi = temp_df['poi'] == 1
temp_df[poi].count()

# In[14]:
Example #49
0
def main():
    ### Task 1: Select what features you'll use.
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    financial_features = ['salary', 'deferral_payments', 'total_payments', \
                         'loan_advances', 'bonus', 'restricted_stock_deferred',\
                         'deferred_income', 'total_stock_value', 'expenses', \
                         'exercised_stock_options', 'other', 'long_term_incentive', \
                         'restricted_stock', 'director_fees'] #(all units are in US dollars)

    email_features = ['to_messages', 'from_poi_to_this_person',
                     'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
    #(units are generally the number of email messages; the notable exception is 'email_address',
    # which is a text string)
    #the email_address feature was removed from this list

    poi_label = ['poi'] ###(boolean, represented as integer)

    features_list = poi_label + email_features + financial_features

    ### Load the dictionary containing the dataset
    with open("final_project_dataset_unix.pkl", "rb") as data_file:
        data_dict = pickle.load(data_file)
      
    #convert to a pandas dataframe for exploratory analysis
    df = pd.DataFrame.from_dict(data_dict, orient='index')

    #iterate df and convert the string 'NaN' to an actual np.nan
    for label in df.columns:
        if label == 'email_address':
            df[label] = df[label].replace('NaN', np.nan)
        else:
            df[label] = pd.to_numeric(df[label], errors='coerce')


    ### Investigate contents of dataset:
            
    # Total Number of data points
    total_people = df.shape[0]
    print('The total number of data points (people) in our data set is {}.\n'\
        .format(total_people))

    # Total Number of Features Used
    all_features = df.shape[1]
    print('There are {} features for each person in our dataset.\n'\
        .format(all_features))

    # Total Number of Persons Of Interest (POIs)
    poi_count = df['poi'][(df['poi'] == True)].count()
    print('Our dataset has {} persons of interest.\n'.format(poi_count))

    # Total Number of Non-POIs
    non_poi_count = total_people - poi_count
    print('Our dataset has {} Non persons of interest.\n'.format(non_poi_count))

    # Features with missing values?
    print('The following categories have missing values (NaN values)\n')
    print (df.isna().sum())


    ### Task 2: Remove outliers

    #visualize_features('salary', 'bonus', data_dict)
    #visualize_features('from_poi_to_this_person', 'from_this_person_to_poi', data_dict)
    #visualize_features('loan_advances', 'total_stock_value', data_dict)


    print()
    print('Searching for Outliers...')
    find_outlier('salary', df)
    print()
    find_outlier('bonus', df)
    print()
    find_outlier('from_poi_to_this_person', df)
    print()
    find_outlier('from_this_person_to_poi', df)
    print()
    find_outlier('loan_advances', df)
    print()
    find_outlier('total_stock_value', df)


    #get a count of number of NaN columns for each person
    nan_count = df.isna().sum(axis=1)


    print('\nThe top 5 people by number of NaN columns are:\n')
    print(nan_count.sort_values(ascending=False).head(5))

    print('\nLooking closer at Eugene Lockhart...\n')
    print(df.loc['LOCKHART EUGENE E'])

    print('\nLooking closer at THE TRAVEL AGENCY IN THE PARK...\n')
    print(df.loc['THE TRAVEL AGENCY IN THE PARK'])


    ### Remove outliers
    df = df.drop(['TOTAL'], axis=0)
    df = df.drop(["LOCKHART EUGENE E"], axis=0)
    df = df.drop(["THE TRAVEL AGENCY IN THE PARK"], axis=0)

    #replace NaN with 0
    df = df.fillna(0)


    ### Task 3: Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = df.to_dict('index')

    for person in my_dataset:
        to_poi_count = my_dataset[person]['from_this_person_to_poi']
        from_poi_count = my_dataset[person]['from_poi_to_this_person']
        # in this dataset 'from_messages' counts emails the person sent and
        # 'to_messages' counts emails the person received
        total_sent_emails = my_dataset[person]['from_messages']
        total_received_emails = my_dataset[person]['to_messages']

        try:
            my_dataset[person]['to_poi_ratio'] = float(to_poi_count) /\
                float(total_sent_emails)
        except ZeroDivisionError:
            my_dataset[person]['to_poi_ratio'] = 0
        try:
            my_dataset[person]['from_poi_ratio'] = float(from_poi_count) /\
                float(total_received_emails)
        except ZeroDivisionError:
            my_dataset[person]['from_poi_ratio'] = 0
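    # For example, a person with 'from_messages' == 20 who sent 2 of those
    # emails to POIs gets to_poi_ratio = 2 / 20 = 0.1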

    features_list = features_list + ['to_poi_ratio', 'from_poi_ratio']

    ### Preprocessing

    ### Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    #Scaling features (normalizing all features)
    min_max_scaler = MinMaxScaler()
    features = min_max_scaler.fit_transform(features)

    ### Select the best features:
    # Removes all but the k highest scoring features
    n = 6 # adjust for optimization
    skb = SelectKBest(f_classif, k=n)
    skb.fit_transform(features, labels)
    #pprint(sorted(skb.scores_, reverse=True))

    #skip poi feature and combine with returned scores (key:value --> feature:score)
    scores = zip(features_list[1:], skb.scores_)

    #sort by highest scoring feature from scores
    sorted_scores = sorted(scores, key = lambda x: x[1], reverse=True)
    #print '\nOur {} highest feature scores are:'.format(n)
    #pprint(sorted_scores[:n])
                                          
    #add k highest scoring features to create new features_list
    new_features_list = poi_label + list(map(lambda x: x[0], sorted_scores))[:n]
    #print '\nOur new features list includes: '
    #pprint(new_features_list)
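    # An equivalent way to recover the selected names (a sketch, not from the
    # original) uses SelectKBest's boolean support mask:
    #mask = skb.get_support()
    #new_features_list = poi_label + [f for f, keep in zip(features_list[1:], mask) if keep]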

    ### Extract features and labels from dataset using optimized features_list
    data = featureFormat(my_dataset, new_features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)


    ### Task 4: Try a variety of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines. For more info:
    ### http://scikit-learn.org/stable/modules/pipeline.html


    print ('\nRunning GaussianNB classifier...')
    run_classifier(GaussianNB(), features, labels)

    print ('\nRunning SVM classifier...')
    run_classifier(SVC(), features, labels)

    print ('\nRunning AdaBoost classifier...')
    run_classifier(AdaBoostClassifier(), features, labels)

    print ('\nRunning DecisionTree classifier...')
    run_classifier(DecisionTreeClassifier(), features, labels)



    ### Task 5: Tune your classifier to achieve better than .3 precision and recall 
    ### using our testing script. Check the tester.py script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation. For more info: 
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    ### Re-Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)


    # Adjust SVM parameters to refine accuracy
    # variables will be passed to fine_tune_algorithm to use in a Pipeline
    print ('\nThe best fit SVM has the following scores:\n')
    svm_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                 ('SVM', SVC())]
    svm_parameters = {'SVM__kernel': ('linear', 'rbf'), 
                  'SVM__C':[0.001, 0.01, .1, 1, 10, 100, 1000], 
                  'SVM__gamma':[0.01, .1, 1, 10, 100, 1000],
                     'SKB__k': [2,3,4,5,6,7,8,9,10]}
    svm_clf = fine_tune_algorithm(svm_steps, svm_parameters, features, labels)
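    # fine_tune_algorithm is defined outside this snippet; a plausible sketch
    # (an assumption, not the author's code) is a GridSearchCV wrapper over the
    # given Pipeline steps that returns the best estimator:
    #
    #def fine_tune_algorithm(steps, parameters, features, labels):
    #    from sklearn.pipeline import Pipeline
    #    from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
    #    cv = StratifiedShuffleSplit(n_splits=100, random_state=42)
    #    grid = GridSearchCV(Pipeline(steps), parameters, scoring='f1', cv=cv)
    #    grid.fit(features, labels)
    #    print(grid.best_params_)
    #    return grid.best_estimator_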


    # Adjust DecisionTreeClassifier parameters to refine accuracy
    print ('\nThe best fit DecisionTreeClassifer has the following scores:\n')
    dt_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()), 
                ('DT', DecisionTreeClassifier())]
    dt_parameters = {'DT__criterion': ('gini', 'entropy'), 
                  'DT__min_samples_split':[2,3,4,5,6,7,8,9,10],
                     'DT__random_state':[13],
                     'SKB__k': [2,3,4,5,6,7,8,9,10]}
    dt_clf = fine_tune_algorithm(dt_steps, dt_parameters, features, labels)


    # Adjust AdaBoostClassifier parameters to refine accuracy
    # variables will be passed to fine_tune_algorithm to use in a Pipeline
    print ('\nThe best fit AdaBoostClassifier has the following scores:\n')
    ab_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('AB', AdaBoostClassifier())]
    ab_parameters = {'AB__algorithm': ('SAMME', 'SAMME.R'), 
                  'AB__learning_rate':[.5, .6, .7, .8, .9,1],
                     'SKB__k': [2,3,4,5,6,7,8,9,10]}
    ada_clf = fine_tune_algorithm(ab_steps, ab_parameters, features, labels)

    # Adjust GaussianNB parameters to refine accuracy
    print ('\nThe best fit GaussianNB Classifier has the following scores:\n')
    nb_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('NB', GaussianNB())]
    nb_parameters = {'SKB__k': [2,3,4,5,6,7,8,9,10]}
    nb_clf = fine_tune_algorithm(nb_steps, nb_parameters, features, labels)

    #final best fitting classifier
    clf = nb_clf

    ### Task 6: Dump your classifier, dataset, and features_list so anyone can
    ### check your results. You do not need to change anything below, but make sure
    ### that the version of poi_id.py that you submit can be run on its own and
    ### generates the necessary .pkl files for validating your results.

    dump_classifier_and_data(clf, my_dataset, features_list)
Example #50
0
### "NaN" values for this feature, then this feature will not be considered.
### So email_address, loan advances, deferral_payments, director_fees

features_list = [
    'poi', 'to_messages', 'expenses', 'deferred_income', 'long_term_incentive',
    'fraction_from_poi', 'shared_receipt_with_poi', 'from_messages', 'bonus',
    'total_stock_value', 'from_poi_to_this_person', 'from_this_person_to_poi',
    'restricted_stock', 'salary', 'total_payments', 'fraction_to_poi',
    'exercised_stock_options'
]

###############################################################################
### Task 2: Remove outliers
# Look for outliers points by salary and bonus values
features = ["salary", "bonus", "poi"]
data = featureFormat(data_dict, features)
max_salary = 0
max_bonus = 0
for point in data:
    salary = point[0]
    bonus = point[1]
    poi = point[2]
    if poi:
        plt.scatter(salary, bonus, c="r")
    else:
        plt.scatter(salary, bonus)
    if point[0] > max_salary:
        max_salary = point[0]
    if point[1] > max_bonus:
        max_bonus = point[1]
plt.xlabel("salary")
Example #51
0
    plt.show()


### load in the dict of dicts containing all the data on each person in the dataset
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl",
                             "r"))
### there's an outlier--remove it!
data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
poi = "poi"
features_list = [poi, feature_1, feature_2]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
from sklearn.cluster import KMeans
data2 = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data2)
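# a minimal completion (sketch) of the clustering step requested above:
# two clusters, with the predicted cluster labels stored in `pred`
kmeans = KMeans(n_clusters=2)
pred = kmeans.fit_predict(finance_features)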
Example #52
0
"""

import sys
import pickle

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load(
    open("../final_project/final_project_dataset_modified.pkl", "r"))

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]  # salary
data = featureFormat(dictionary, features_list, remove_any_zeroes=True
                     )  #, "long_term_incentive"], remove_any_zeroes=True )
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split

feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### your regression goes here!
### please name it reg, so that the plotting code below picks it up and
### plots it correctly

##
#==============================================================================
#==============================================================================
# # ### Task3 : Plotting of features
#==============================================================================
#==============================================================================
from pandas.plotting import scatter_matrix
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

##Scaling the features
scaler = preprocessing.MinMaxScaler()

#data1,data2,list_base,list_base_2,df,df_2,FinalDf,FinalDf_2,df2_mean,X,Y,Y_dict,Y_list = {},{},{},{},{},{},{},{},{},{},{},{},{}

data1 = featureFormat(my_dataset, valid_)
data2 = featureFormat_nan(my_dataset, valid_)
data1 = scaler.fit_transform(data1)
#list_base = {}
#for i in Retailers:
list_base = []
for k in range(len(valid_)):
    j = []
    for point in data1:
        j.append(point[k])
    list_base.append(j)
#      list_base.update({i:list_})
#df = {}
#FinalDf = {}
#for i in Retailers:
df = pd.DataFrame(list_base, index=valid_)
Example #54
0
new_data = pd.DataFrame(my_dataset.values())[features_list]
new_data.index = my_dataset.keys()
new_data['new_total_stock'] = new_data['exercised_stock_options'] + new_data[
    'restricted_stock']
# rebuild a dict of dicts keyed by person name: {person: {feature: value}}
new_dataset = new_data.to_dict('index')
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
features_list_new = [
    'poi', 'bonus', 'exercised_stock_options', 'expenses', 'from_messages',
    'from_poi_to_this_person', 'from_this_person_to_poi', 'other',
    'restricted_stock', 'salary', 'shared_receipt_with_poi', 'to_messages',
    'new_total_stock'
]
data_new = featureFormat(new_dataset, features_list_new, sort_keys=True)
labels_new, features_new = targetFeatureSplit(data_new)
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)
features_new = scaler.fit_transform(features_new)
from numpy import mean
from sklearn import cross_validation
Example #55
0
print "The Number of Users: ", len(
    data_dict.keys())  # There are 146 users in dataset.
# The number of poi
count_poi = 0
for POIs in data_dict:
    if data_dict[POIs]['poi'] == True:
        count_poi += 1
    # Replace 'NaN' values to 0s
    for NA_keys in data_dict[POIs]:
        if data_dict[POIs][NA_keys] == 'NaN':
            data_dict[POIs][NA_keys] = 0
print "The Number of POIs: ", count_poi  # There are 18 POI in dataset.

### Task 2: Remove outliers
outlier_tester = ["salary", "bonus", "poi"]
outlier_data = featureFormat(data_dict, outlier_tester)

from operator import itemgetter
for point in outlier_data:
    if point[2] == False:
        salary = point[0]
        bonus = point[1]
        matplotlib.pyplot.scatter(salary, bonus)
matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

# Remove the outlier(s)
data_dict.pop("TOTAL")

#print len(data_dict.keys())
Example #56
0

### load in the dict of dicts containing all the data on each person in the dataset
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl",
                             "r"))
### there's an outlier--remove it!
data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
#feature_3 = "total_payments"
#poi  = "poi"
features_list = [feature_1, feature_2]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

filterdata = {k: v for k, v in data_dict.iteritems() if v['salary'] != 'NaN'}

min_num = min(filterdata.values(), key=lambda x: x['salary'])
max_num = max(filterdata.values(), key=lambda x: x['salary'])

print "min_value =", min_num['salary']
print "max_num =", max_num['salary']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = scaler.fit_transform(data)  # transform returns a new array, so keep the result
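# a quick sanity check (a sketch, not in the original): rescale a hypothetical
# query point of $200,000 salary and $1,000,000 in exercised stock options
print "rescaled query point =", scaler.transform([[200000., 1000000.]])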
Example #57
0
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

with open("../submission/my_dataset.pkl", "r") as data_file:
    my_dataset = pickle.load(data_file)

with open("../submission/my_feature_list.pkl", "r") as data_file:
    features_list = pickle.load(data_file)

data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

lass_clf = Lasso(alpha=1, tol=1)
lass_clf.fit(features_train, labels_train)
feature_weights = {}
feature_weight_normalizer = 0

for i in range(len(features_list[1:])):
    feature_weights.update({features_list[i + 1]: lass_clf.coef_[i]})
    feature_weight_normalizer += lass_clf.coef_[i]
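# With L1 regularization, uninformative coefficients are driven exactly to
# zero, so the nonzero entries of coef_ act as an implicit feature selector;
# note that the raw sum above can be negative, so normalizing by
# sum(abs(coef_)) would be a safer choice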

for feat in feature_weights:
    feature_weights.update(
Example #58
0
### Now add these above features + some more additional features to the feature_list
features1 = features_list + [
    'fraction_from_poi', 'fraction_to_poi', 'shared_receipt_with_poi',
    'expenses', 'loan_advances', 'long_term_incentive', 'restricted_stock',
    'salary', 'total_stock_value', 'exercised_stock_options', 'total_payments',
    'bonus', 'wealth'
]

print ""
print "Two new features succesfully added to the feature list - 'fraction_from_poi', 'fraction_to_poi' and 'wealth'"
print ""
print "Selected Feature list - before Feature_Selection", features1

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features1, sort_keys=True)
labels, features = targetFeatureSplit(data)

### We do not know yet whether feature scaling and k-best feature filtering will benefit our model.
### But let's try them anyway

# Scale features
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# K-best features - choosing 6 features for a trial
k_best = SelectKBest(k=6)
k_best.fit(features, labels)

result_list = zip(k_best.get_support(), features1[1:], k_best.scores_)
result_list = sorted(result_list, key=lambda x: x[2], reverse=True)
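# a sketch (not in the original snippet) of inspecting the ranked features:
#for selected, feature, score in result_list:
#    print feature, "score:", score, "selected:", selected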
Example #59
0
    for name in tfidf_dict:
        if tfidf_dict[name]['poi'] == True and tfidf_dict[name][feature] != 0.0:
            poi_count += 1
        if tfidf_dict[name]['poi'] == False and tfidf_dict[name][feature] != 0.0:
            npoi_count += 1
    if poi_count == 12 and npoi_count < 50:
        poi_relevant_features.append(feature)
list_of_features = (list(set(list_of_features) - set(poi_relevant_features)))
list_of_features.insert(0, 'poi')

for name in tfidf_dict:
    if tfidf_dict[name]['poi'] != 0.0 and tfidf_dict[name]['poi'] != 1:
        tfidf_dict[name]['poi'] = 0

data = featureFormat(tfidf_dict, list_of_features)
labels, features = targetFeatureSplit(data)
selector = SelectKBest(k=50)
# With SelectKBest, k=50: Accuracy = 0.9082, Precision = 0.83676, Recall = 0.387
# With SelectPercentile, percentile=10: Accuracy = 0.83733, Precision = 0.24419, Recall = 0.105
# That is a significant decrease.
# With SelectPercentile, percentile=5: Accuracy = 0.83773, Precision = 0.21144, Recall = 0.07950
# That was another significant decrease.

selector.fit(features, labels)

selected = selector.get_support()
list_of_features.pop(0)
list_of_features = np.array(list_of_features)
selected_features = list_of_features[selected]
for feature in selected_features:
Example #60
-1
def test_stratified_shuffle_split(clf, dataset, feature_list, folds = 1000, scale_features = True):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
 
    # Scale features
    if(scale_features):
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print 'Total predictions: '+str(total_predictions)
        print 'Accuracy: '+str(accuracy)
        print 'Precision: '+str(precision)
        print 'Recall: '+str(recall)
        print 'F1: '+str(f1)
        print 'F2: '+str(f2)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."