def precompute_recall_precision(features_list, sum = False): features_list_all = ['poi'] + features_list data = featureFormat(my_dataset, features_list_all, sort_keys = True) labels, features = targetFeatureSplit(data) standardized = MinMaxScaler().fit_transform(features) # Score the features using f_classif sel = SelectKBest(k='all', score_func=f_classif) sel.fit_transform(features, labels) kbest = [(features_list[i], score, i) for i, score in enumerate(sel.scores_)] sorted_kbest = sorted(kbest, key=operator.itemgetter(1), reverse=True) print "Feature Set(", len(kbest), ") List and K-best scores:" for tup in sorted_kbest: print tup[2], "\t", tup[0], tup[1] if not sum: plot_feature_correlation(features, len(kbest)) for i, method in enumerate(methods): pipe, params = method() grid_searcher = GridSearchCV(pipe, param_grid=params, cv=sk_fold, scoring='recall') grid_searcher.fit(features, labels) clf = grid_searcher.best_estimator_ ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list_all, sort_keys = True) labels, features = targetFeatureSplit(data) my_test_classifier(clf, my_dataset, features_list_all, i)
def test_classifier(clf, dataset, feature_list, folds = 1000): data = featureFormat(dataset, feature_list, sort_keys = True) labels, features = targetFeatureSplit(data) # configure split of test_size and train_size cv = StratifiedShuffleSplit(labels, folds, random_state = 42, test_size = .2, train_size = .8) # print cv for train_idx, test_idx in cv: features_train = [] features_test = [] features_validation = [] labels_train = [] labels_test = [] labels_validation = [] for ii in train_idx: features_train.append( features[ii] ) labels_train.append( labels[ii] ) for jj in test_idx: if jj % 2 == 0: features_validation.append( features[jj] ) labels_validation.append( labels[jj] ) else: features_test.append( features[jj] ) labels_test.append( labels[jj] ) # Determine size of training & test sets fit_and_test_classifier(clf, features_train, labels_train, features_test, labels_test) fit_and_test_classifier(clf, features_train, labels_train, features_validation, labels_validation) print "features_train:", len(features_train), "labels_train:", len(labels_train) print "features_test:", len(features_test), "labels_test:", len(labels_test) print "features_validation:", len(features_validation), "labels_validation:", len(labels_validation)
def test_classifier(clf, dataset, feature_list, folds = 1000,scale_features = True, std_features = False): data = featureFormat(dataset, feature_list, sort_keys = True) labels, features = targetFeatureSplit(data) #scale features if necessary if scale_features == True: scaler = preprocessing.MinMaxScaler() features = scaler.fit_transform(features) #standardize features for pca if necessary if std_features == True: std = preprocessing.StandardScaler() features = preprocessing.StandardScaler().fit_transform(features) cv = StratifiedShuffleSplit(labels, folds, random_state = 42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append( features[ii] ) labels_train.append( labels[ii] ) for jj in test_idx: features_test.append( features[jj] ) labels_test.append( labels[jj] ) ### fit the classifier using training set, and test on test set clf.fit(features_train, labels_train) predictions = clf.predict(features_test) for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 else: true_positives += 1 try: total_predictions = true_negatives + false_negatives + false_positives + true_positives accuracy = 1.0*(true_positives + true_negatives)/total_predictions precision = 1.0*true_positives/(true_positives+false_positives) recall = 1.0*true_positives/(true_positives+false_negatives) f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives) f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall) print clf print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5) print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives) print "" 
return f1 except: print "Got a divide by zero when trying out:", clf
def makeData(dataset, feature_list, folds = 1000):
    """Build pooled train/test lists from stratified shuffle splits.

    Keyword arguments:
    dataset --- dict of dict
    feature_list --- list of strings
    folds --- int, number of shuffle splits

    Returns (features_train, features_test, labels_train, labels_test).
    The lists are created once and extended on every fold, so they hold
    the concatenation of all folds rather than a single split.

    NOTE(review): pooling across folds means the same sample can appear
    in both the train and the test list -- confirm this is intended.
    Layout was reconstructed from a whitespace-flattened source; the
    return is assumed to follow the fold loop.
    """
    labels, features = targetFeatureSplit(
        featureFormat(dataset, feature_list, sort_keys = True))
    splitter = StratifiedShuffleSplit(labels, folds, random_state = 42)

    features_train, features_test = [], []
    labels_train, labels_test = [], []
    for train_idx, test_idx in splitter:
        features_train.extend(features[ii] for ii in train_idx)
        labels_train.extend(labels[ii] for ii in train_idx)
        features_test.extend(features[jj] for jj in test_idx)
        labels_test.extend(labels[jj] for jj in test_idx)

    return features_train, features_test, labels_train, labels_test
def test_training_stratified_split(dataset, features_list, testsize=0.2):
    """
    For E+F dataset, split the data into a training and a test set
    with a single stratified shuffle split.

    Input:
    dataset: data in dictionary format
    features_list: the full list of features to select from
    testsize: the proportion of the dataset to include in the test split

    Return:
    labels_train, labels_test, features_train, features_test
    (plain Python lists)
    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    raw_labels, raw_features = targetFeatureSplit(data)

    # Arrays allow the whole split to be taken by index in one step.
    labels = np.array([int(label) for label in raw_labels])
    features = np.array(raw_features)

    ### Split data into test set and training set
    sss = StratifiedShuffleSplit(labels, 1, test_size=testsize, random_state=0)
    # Exactly one split is requested, so take it directly.
    train_index, test_index = next(iter(sss))

    labels_train = labels[train_index].tolist()
    labels_test = labels[test_index].tolist()
    features_train = features[train_index].tolist()
    features_test = features[test_index].tolist()
    return labels_train, labels_test, features_train, features_test
def main(): ### load up student's classifier, dataset, and feature_list clf, dataset, feature_list = load_classifier_and_data() data = featureFormat(dataset, feature_list, sort_keys = True) labels, features = targetFeatureSplit(data) cv = StratifiedShuffleSplit(labels, 1000, random_state=42) # Build an empty feature importance totals array for calculating average importance totals = [] for each_feature in feature_list: totals.append(0) for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append(features[ii]) labels_train.append(labels[ii]) for jj in test_idx: features_test.append(features[jj]) labels_test.append(labels[jj]) clf = clf.fit(features_train, labels_train) for i in range(len(clf.feature_importances_)): totals[i] += clf.feature_importances_[i] # print clf.feature_importances_ for i in range(len(totals)): totals[i] /= 1000 # Display results print "Feature list: ", feature_list[1:] print "Importances: ", totals
def univariateFeatureSelection(f_list, my_dataset):
    """Score each feature on its own with a Gaussian naive Bayes model.

    For every name in f_list the falsy and 'NaN' entries of my_dataset
    are overwritten with 0 (the dataset is modified in place), a
    GaussianNB is trained on the absolute values of that single feature
    and score_func is applied to its predictions.

    Returns a list of (feature, score[0], score[1], score[2]) tuples
    sorted by the last element, highest first.

    NOTE(review): train and test samples are pooled over all 1000
    shuffle splits before the single fit -- confirm this is intended.
    """
    result = []
    for feature in f_list:
        from sklearn.cross_validation import StratifiedShuffleSplit
        from sklearn.naive_bayes import GaussianNB

        # Replace missing values and 'NaN' with 0
        for name in my_dataset:
            record = my_dataset[name]
            if not record[feature] or record[feature] == 'NaN':
                record[feature] = 0

        data = featureFormat(my_dataset, ['poi', feature], sort_keys = True,
                             remove_all_zeroes = False)
        labels, features = targetFeatureSplit(data)
        features = [abs(x) for x in features]

        cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
        features_train, features_test = [], []
        labels_train, labels_test = [], []
        for train_idx, test_idx in cv:
            features_train.extend(features[ii] for ii in train_idx)
            labels_train.extend(labels[ii] for ii in train_idx)
            features_test.extend(features[jj] for jj in test_idx)
            labels_test.extend(labels[jj] for jj in test_idx)

        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        score = score_func(labels_test, predictions)
        result.append((feature, score[0], score[1], score[2]))

    result.sort(key=lambda entry: entry[3], reverse=True)
    return result
def get_most_important_features(dataset, features_list):
    """Return the feature importances of an unconstrained decision tree.

    The data selected by features_list is split 70/30, a default
    DecisionTreeClassifier is fit on the training part and its
    feature_importances_ array is returned. The classifier is created
    without a random_state, so results may vary between runs.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score

    labels, features = targetFeatureSplit(
        featureFormat(dataset, features_list, sort_keys = True))

    split = train_test_split(features, labels, test_size=0.3, random_state=42)
    features_train, features_test, labels_train, labels_test = split

    tree = DecisionTreeClassifier()
    tree.fit(features_train, labels_train)

    # Hold-out accuracy, kept for ad-hoc debugging:
    #print "overfitted accuracy", acc
    acc = accuracy_score(labels_test, tree.predict(features_test))

    return tree.feature_importances_
def prep_features(df, features_list, feature_scaled):
    """Prepare features and labels in featureFormat and DataFrame form.

    Arguments:
    df --- pandas DataFrame holding the columns named in features_list
    features_list --- column names to keep; must include 'poi'
    feature_scaled --- when true, both feature sets go through
                       scale_features

    Returns (features, labels, features_df_scaled, labels_df): the
    first two come from featureFormat/targetFeatureSplit, the last two
    are the DataFrame features (scaled on request) and the 'poi' column.
    """
    from feature_format import featureFormat, targetFeatureSplit

    # One column selection serves both the DataFrame outputs and the
    # dictionary handed to featureFormat.
    selected = df[features_list]
    features_df = selected.drop('poi', axis=1)
    labels_df = selected['poi']

    if feature_scaled:
        features_df_scaled = scale_features(features_df)
    else:
        features_df_scaled = features_df

    # featureFormat expects a dict of dicts with the label listed first.
    data_dict_new = selected.T.to_dict()
    features_list_new = ['poi'] + list(features_df.columns)
    data = featureFormat(data_dict_new, features_list_new, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    if feature_scaled:
        features = scale_features(features)

    return features, labels, features_df_scaled, labels_df
def selectKBest(previous_result, data):
    """Score GaussianNB on the top 1..10 features of previous_result.

    previous_result is a ranked list of tuples whose first element is a
    feature name. The entries at positions 4 and 5 are removed from it
    in place (per the original note: 'restricted_stock_deferred' and
    'director_fees').

    Returns a list of (feature_count, score[0], score[1], score[2]).

    NOTE(review): the `data` argument is never read; features are built
    from the module-level my_dataset. Confirm which was intended.
    """
    # remove 'restricted_stock_deferred' and 'director_fees'
    previous_result.pop(4)
    previous_result.pop(4)

    max_features = 10
    result = []
    for count in range(1, max_features + 1):
        from sklearn.cross_validation import StratifiedShuffleSplit
        from sklearn.naive_bayes import GaussianNB

        feature_list = ['poi'] + [previous_result[n][0] for n in range(count)]
        data = featureFormat(my_dataset, feature_list, sort_keys = True,
                             remove_all_zeroes = False)
        labels, features = targetFeatureSplit(data)
        features = [abs(x) for x in features]

        # Samples of all 1000 splits are pooled before the single fit.
        cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
        features_train, features_test = [], []
        labels_train, labels_test = [], []
        for train_idx, test_idx in cv:
            features_train.extend(features[ii] for ii in train_idx)
            labels_train.extend(labels[ii] for ii in train_idx)
            features_test.extend(features[jj] for jj in test_idx)
            labels_test.extend(labels[jj] for jj in test_idx)

        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        score = score_func(labels_test, predictions)
        result.append((count, score[0], score[1], score[2]))
    return result
def test_classifier(clf, dataset, feature_list, folds=1000): data = featureFormat(dataset, feature_list) labels, features = targetFeatureSplit(data) cv = StratifiedShuffleSplit(labels, folds, random_state=42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append(features[ii]) labels_train.append(labels[ii]) for jj in test_idx: features_test.append(features[jj]) labels_test.append(labels[jj]) ### fit the classifier using training set, and test on test set clf.fit(features_train, labels_train) predictions = clf.predict(features_test) for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 else: true_positives += 1 try: total_predictions = true_negatives + false_negatives + false_positives + true_positives accuracy = 1.0 * (true_positives + true_negatives) / total_predictions precision = 1.0 * true_positives / (true_positives + false_positives) recall = 1.0 * true_positives / (true_positives + false_negatives) f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives) f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall) print clf #print "Best Params: ", clf.best_params_ #print "Best Estimator: ", clf.best_estimator_ #current_classifier = clf.best_estimator_ importance = None if importance is not None: print "Importance: ", importance imp = sorted(zip(feature_list, importance), key=lambda tup: tup[1], reverse=True) print "Most Important Variables: " + str(imp) print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5) print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives) print "" except: print 
"Got a divide by zero when trying out: ", clf
def cluster2Features(): ### the input features we want to use ### can be any key in the person-level dictionary (salary, director_fees, etc.) feature_1 = "salary" feature_2 = "exercised_stock_options" poi = "poi" features_list = [poi, feature_1, feature_2] data = featureFormat(data_dict, features_list ) poi, finance_features = targetFeatureSplit( data ) ### in the "clustering with 3 features" part of the mini-project, ### you'll want to change this line to ### for f1, f2, _ in finance_features: ### (as it's currently written, the line below assumes 2 features) #print finance_features for f1, f2 in finance_features: plt.scatter( f1, f2) plt.show() ### cluster here; create predictions of the cluster labels ### for the data and store them to a list called pred from sklearn.cluster import KMeans estimators = {'k_means_2': KMeans(n_clusters=2)} estimators['k_means_2'].fit(data) pred = estimators['k_means_2'].predict(data) ### rename the "name" parameter when you change the number of features ### so that the figure gets saved to a different file try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) except NameError: print "no predictions object named pred found, no clusters to plot"
def algorithm(data_dict, features_list):
    """Build the AdaBoost classifier and pickle it with its inputs.

    Writes my_classifier.pkl, my_dataset.pkl and my_feature_list.pkl to
    the current directory. The classifier is dumped unfitted.
    """
    from feature_format import featureFormat
    from feature_format import targetFeatureSplit

    ### store to my_dataset for easy export below
    my_dataset = data_dict

    # The extracted arrays are not used further here; the calls are
    # kept so any error featureFormat raises for this feature list
    # still surfaces before anything is written.
    data = featureFormat(my_dataset, features_list)
    ### split into labels and features (this line assumes that the first
    ### feature in the array is the label, which is why "poi" must always
    ### be first in features_list
    labels, features = targetFeatureSplit(data)

    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators = 1000, random_state = 202,
                             learning_rate = 1.0, algorithm = "SAMME.R")

    ### dump your classifier, dataset and features_list so
    ### anyone can run/check your results
    # `with` closes each file even if pickling fails part-way.
    with open("my_classifier.pkl", "w") as clf_file:
        pickle.dump(clf, clf_file)
    with open("my_dataset.pkl", "w") as dataset_file:
        pickle.dump(data_dict, dataset_file)
    with open("my_feature_list.pkl", "w") as features_file:
        pickle.dump(features_list, features_file)
def find_best_parameters(pipeline, parameters, score_func, dataset,
                         feature_list, test_size=0.2, n_iter=10):
    """
    Grid-search the pipeline parameters with the given scoring function.

    One stratified shuffle split holds out test_size of the data; the
    grid search then runs on the remaining training part with its own
    n_iter-fold stratified shuffle split.
    returns the fitted GridSearchCV object.
    """
    labels, features = targetFeatureSplit(featureFormat(dataset, feature_list))

    outer_split = StratifiedShuffleSplit(labels, 1, test_size=test_size,
                                         random_state = 42)
    for train_idx, _ in outer_split:
        # The held-out part of the outer split is not used here.
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]

    inner_split = StratifiedShuffleSplit(labels_train, n_iter=n_iter,
                                         test_size=test_size, random_state=42)
    searcher = GridSearchCV(pipeline, parameters, scoring=score_func,
                            cv=inner_split, n_jobs=-1)
    searcher.fit(features_train, labels_train)
    return searcher
def test_classifier(clf, dataset, feature_list, scaling = False, folds = 1000): score_all = [] precision_all = [] recall_all = [] data = featureFormat(dataset, feature_list, sort_keys = True) labels, features = targetFeatureSplit(data) if scaling == True: min_max_scaler = preprocessing.MinMaxScaler() features = min_max_scaler.fit_transform(features) cv = StratifiedShuffleSplit(labels, folds, random_state = 42) for train_indices, test_indices in cv: features_train= [features[ii] for ii in train_indices] features_test= [features[ii] for ii in test_indices] labels_train=[labels[ii] for ii in train_indices] labels_test=[labels[ii] for ii in test_indices] clf.fit(features_train, labels_train) pred = clf.predict(features_test) score_all.append(clf.score(features_test,labels_test)) precision_all.append(precision_score(labels_test,pred)) recall_all.append(recall_score(labels_test,pred)) precision = numpy.average(precision_all) recall = numpy.average(recall_all) score = numpy.average(score_all) print "Score: " + str(score) print "Recall: " + str(precision) print "Precision: " + str(recall)
def get_k_best(df, features_list, k):
    """
    runs scikit-learn's SelectKBest feature selection on min-max
    scaled features

    df --- pandas DataFrame holding the columns named in features_list
    features_list --- column names, label first
    k --- number of features to keep

    returns dict where keys=features, values=scores
    """
    from feature_format import featureFormat, targetFeatureSplit
    from sklearn import preprocessing
    from sklearn.feature_selection import SelectKBest

    # featureFormat works on a dict of dicts, so transpose the frame first.
    data_dict_new = df[features_list].T.to_dict()
    data = featureFormat(data_dict_new, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_

    # features_list[0] is skipped when pairing names with scores.
    unsorted_pairs = zip(features_list[1:], scores)
    # Ascending sort followed by a reversal keeps the original order of ties.
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    k_best_features = dict(sorted_pairs[:k])
    return k_best_features
def validation(clf, dataset, feature_list, test_size=0.2, n_iter=1000):
    '''
    Validate the given classifier with stratified shuffle split
    cross-validation.
    returns the mean precision and mean recall over the n_iter folds
    '''
    labels, features = targetFeatureSplit(featureFormat(dataset, feature_list))
    splitter = StratifiedShuffleSplit(labels, n_iter, test_size=test_size,
                                      random_state = 42)

    fold_precision = []
    fold_recall = []
    for train_idx, test_idx in splitter:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        fold_precision.append(precision_score(labels_test, predictions))
        fold_recall.append(recall_score(labels_test, predictions))

    return np.mean(fold_precision), np.mean(fold_recall)
def main():
    """Load the saved classifier and data, then run the testing script.

    NOTE(review): test_classifier is called with the split features and
    labels; confirm that matches the signature of the test_classifier
    this module actually uses.
    """
    ### load up student's classifier, dataset, and feature_list
    clf, dataset, feature_list = load_classifier_and_data()
    labels, features = targetFeatureSplit(
        featureFormat(dataset, feature_list, sort_keys = True))

    ### Run testing script
    test_classifier(clf, features, labels)
def get_k_best_features(data_dict, features_list, k):
    """
    runs scikit-learn's SelectKBest feature selection to get k best features

    Args:
        data_dict: data dictionary for enron
        features_list: a list of features with first feature as target label
        k: Number of best features which need to be selected

    Returns:
        the names of the k best features (taken from a dict's keys, so
        not guaranteed to be in ranking order) and a list of
        [feature, score] lists for all features, best first
    """
    labels, features = targetFeatureSplit(featureFormat(data_dict, features_list))

    selector = SelectKBest(k=k)
    selector.fit(features, labels)

    # Pair every non-label feature name with its score, then rank
    # descending (ascending sort followed by an in-place reversal).
    ranked = sorted(zip(features_list[1:], selector.scores_),
                    key=lambda pair: pair[1])
    ranked.reverse()

    top_k = dict(ranked[:k])
    return top_k.keys(), map(list, ranked)
def regressionBonusAndLongTermInc(): ### list the features you want to look at--first item in the ### list will be the "target" feature features_list = ["bonus", "long_term_incentive"] data = featureFormat( dictionary, features_list, remove_any_zeroes=True) #, sort_keys = '../../tools/python2_lesson06_keys.pkl' target, features = targetFeatureSplit( data ) #print target #print features ### training-testing split needed in regression, just like classification from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points. from sklearn import linear_model ### name your regression reg reg = linear_model.LinearRegression() ### your code goes here! reg.fit(feature_train, target_train) #find the score on the test data print reg.score(feature_test, target_test)
def tune_classifier(clf_name, clf, dataset, features_list, scores, folds = 1000):
    """Grid-search hyper-parameters for a named classifier.

    Arguments:
    clf_name --- 'kNN' or 'Decision Tree' select a parameter grid;
                 'kNN', 'SVM' and 'kNN (hand-tuned)' also turn on
                 min-max scaling of the features
    clf --- the estimator to tune
    dataset, features_list --- passed straight to featureFormat
    scores --- metric names; each is searched as "<name>_weighted"
    folds --- number of stratified shuffle splits

    Returns the best parameters found for the LAST entry of scores
    ({} when scores is empty). Raises ValueError when a search is
    requested for a clf_name that has no parameter grid.

    NOTE(review): layout was reconstructed from a whitespace-flattened
    source; the summary print is assumed to follow the score loop.
    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    scale = clf_name in {'kNN', 'SVM', 'kNN (hand-tuned)'}
    if scale:
        # Perform feature scaling
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)

    if clf_name == 'kNN':
        parameter_grid = [{'p': [1, 2, 3],
                           'n_neighbors': [1, 5, 7, 10, 15],
                           'leaf_size': [30, 50, 70, 100]}]
    elif clf_name == 'Decision Tree':
        parameter_grid = [{'min_samples_split': [2, 3, 4, 5],
                           'min_samples_leaf': [2, 3, 4, 5],
                           'splitter': ['random', 'best']}]
    else:
        parameter_grid = None

    best_params = {}
    for score in scores:
        if parameter_grid is None:
            # Fail with a clear message instead of an unbound-name error.
            raise ValueError(
                "no parameter grid defined for classifier {0!r}".format(clf_name))
        grid_clf = GridSearchCV(clf, parameter_grid, cv=cv,
                                scoring="{0}_weighted".format(score))
        grid_clf.fit(features, labels)
        best_params = grid_clf.best_params_

    print("Classifier {0} has tuned parameters {1}".format(clf_name, best_params))
    return best_params
def test_classifier(clf, dataset, feature_list, folds=1000): data = featureFormat(dataset, feature_list, sort_keys=True) ## Tester lacks feature scaling, lets put it here: # Scale features: mins = np.min(data, axis=0) maxs = np.max(data, axis=0) data = (data - mins) / (maxs - mins) labels, features = targetFeatureSplit(data) cv = StratifiedShuffleSplit(labels, folds, random_state=42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append(features[ii]) labels_train.append(labels[ii]) for jj in test_idx: features_test.append(features[jj]) labels_test.append(labels[jj]) ### fit the classifier using training set, and test on test set clf.fit(features_train, labels_train) predictions = clf.predict(features_test) for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 elif prediction == 1 and truth == 1: true_positives += 1 else: print "Warning: Found a predicted label not == 0 or 1." print "All predictions should take value 0 or 1." 
print "Evaluating performance for processed predictions:" break try: total_predictions = true_negatives + false_negatives + false_positives + true_positives accuracy = 1.0 * (true_positives + true_negatives) / total_predictions precision = 1.0 * true_positives / (true_positives + false_positives) recall = 1.0 * true_positives / (true_positives + false_negatives) f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives) f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall) print clf print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5) print RESULTS_FORMAT_STRING.format( total_predictions, true_positives, false_positives, false_negatives, true_negatives ) print "" except: print "Got a divide by zero when trying out:", clf print "Precision or recall may be undefined due to a lack of true positive predicitons."
def select_k_best_features(data, feature_list, k): """ For E+F dataset, select k best features based on SelectKBest from sklearn.feature_selection Input: data: data in dictionary format feature_list: the full list of features to selection from k: the number of features to keep Return: the list of length of k+1 with the first element as 'poi' and other k best features """ data = featureFormat(data_dict, feature_list) labels, features = targetFeatureSplit(data) k_best = SelectKBest(k=k) k_best.fit(features, labels) impt_unsorted = zip(feature_list[1:], k_best.scores_) impt_sorted = list(sorted(impt_unsorted, key=lambda x: x[1], reverse=True)) k_best_features = [elem[0] for elem in impt_sorted][:k] print k, "best features:" print k_best_features return ['poi'] + k_best_features
def tuner(clf, parameters, data):
    """Grid-search a scale -> select -> classify pipeline and print the result.

    Arguments:
    clf --- the final estimator of the pipeline
    parameters --- param_grid for GridSearchCV; keys address the steps
                   "scale", "select" and "classifier"
    data --- array accepted by targetFeatureSplit (label column first)

    Prints the best cross-validated score and the matching parameters.
    """
    from sklearn.model_selection import GridSearchCV

    labels, features = targetFeatureSplit(data)

    steps = [("scale", MinMaxScaler()),
             ("select", SelectKBest()),
             ("classifier", clf)]
    pipeline = Pipeline(steps)

    shuffle = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=42)
    scoring_metric = make_scorer(my_score_func)
    grid_searcher = GridSearchCV(pipeline, param_grid=parameters,
                                 cv=shuffle, scoring=scoring_metric)

    # Feature selection runs only inside the pipeline, so it is refit
    # on each training fold. Selecting on the full data beforehand
    # would leak label information into the held-out folds and cap the
    # "select" step at the columns that survived the pre-selection.
    grid_searcher.fit(features, labels)

    print("Cross-validated {0} score: {1}".format(scoring_metric, grid_searcher.best_score_))
    print("Params: ", grid_searcher.best_params_)
def test_classifier(clf, dataset, feature_list, folds = 1000): data = featureFormat(dataset, feature_list, sort_keys = True) labels, features = targetFeatureSplit(data) #uncomment to do features scaling """ scaler=MinMaxScaler() features=scaler.fit_transform(features) """ cv = StratifiedShuffleSplit(labels, folds, random_state = 42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 gg=0 for train_idx, test_idx in cv: gg+=1 features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append( features[ii] ) labels_train.append( labels[ii] ) for jj in test_idx: features_test.append( features[jj] ) labels_test.append( labels[jj] ) ### fit the classifier using training set, and test on test se clf.fit(features_train, labels_train) predictions = clf.predict(features_test) #print predictions for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 elif prediction == 1 and truth == 1: true_positives += 1 else: print "Warning: Found a predicted label not == 0 or 1." print "All predictions should take value 0 or 1." 
print "Evaluating performance for processed predictions:" break try: total_predictions = true_negatives + false_negatives + false_positives + true_positives accuracy = 1.0*(true_positives + true_negatives)/total_predictions precision = 1.0*true_positives/(true_positives+false_positives) recall = 1.0*true_positives/(true_positives+false_negatives) f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives) f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall) print clf print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5) print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives) print "" except: print "Got a divide by zero when trying out:", clf
def ptest(clf, dataset, feature_list, folds = 1000):
    """Return the precision of clf pooled over stratified shuffle splits.

    Predictions of all folds are counted into one confusion matrix and
    precision = TP / (TP + FP) is returned. Raises ZeroDivisionError
    when no positive prediction was counted.
    """
    data = featureFormat(dataset, feature_list)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        # Predict once per fold; the result feeds the counts below.
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                # Every remaining combination is counted as a true positive.
                true_positives += 1

    precision = 1.0*true_positives/(true_positives+false_positives)
    return precision
def make_feature_histograms(dataset, features_list):
    """Save one POI / non-POI histogram per feature into a ``hists`` folder.

    For each feature after the first entry of ``features_list`` the non-zero
    values are plotted as two overlaid histograms (label 1.0 in red, label
    0.0 in blue) sharing the same bins. Summary statistics and up to five
    high / low outliers (as decided by ``is_outlier``) are written onto the
    figure, and a dashed line is drawn at the 1.5*IQR fence of each side that
    has outliers.

    Figures are written to the ``hists`` directory next to this file, which
    is created on demand. Features without any non-zero value are skipped.

    Args:
        dataset: data container forwarded to ``featureFormat``.
        features_list: feature names; the first entry is treated as the
            label column and is not plotted.
    """
    data = featureFormat(dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    plt.ioff()

    # Create and save into the same directory, whatever the current working
    # directory is.
    hist_dir = os.path.join(os.path.dirname(__file__), 'hists')
    if not os.path.exists(hist_dir):
        os.makedirs(hist_dir)

    for column, feature in enumerate(features_list[1:]):
        feature_values_non_poi = [row[column] for row, label in zip(features, labels) if label == 0.0]
        feature_values_poi = [row[column] for row, label in zip(features, labels) if label == 1.0]
        feature_values = feature_values_non_poi + feature_values_poi

        non_zero_values_non_poi = [x for x in feature_values_non_poi if x != 0.0]
        non_zero_values_poi = [x for x in feature_values_poi if x != 0.0]
        non_zero_values = non_zero_values_non_poi + non_zero_values_poi
        if not non_zero_values:
            # Nothing to plot; percentiles and min/max are undefined.
            continue

        plt.figure()
        q1, q3 = np.percentile(non_zero_values, [25, 75])
        iqr = q3 - q1
        outliers_hi = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x > q3]
        outliers_lo = [x for x in non_zero_values if is_outlier(x, q1, q3, iqr) and x < q1]

        # get same binwidth for both POI and non-POI
        bins = np.histogram(non_zero_values, bins=50)[1]
        plt.hist(non_zero_values_poi, bins=bins, alpha=.5, lw=0, color='r', label='POIs')
        plt.hist(non_zero_values_non_poi, bins=bins, alpha=.5, lw=0, color='b', label='Non-POIs')

        msg = ('Maximum %s: %d\n' % (feature, max(non_zero_values)) +
               'Minimum %s: %d\n' % (feature, min(non_zero_values)) +
               'Mean %s: %.5f\n' % (feature, np.mean(non_zero_values)) +
               'Median %s: %d\n' % (feature, np.median(non_zero_values)) +
               '\nTotal Number of Values: %d\n' % len(feature_values) +
               'Total Number of Non-Zero Values: %d\n' % len(non_zero_values))

        # print out some outlier values if they exist
        for outliers, which_ols in zip([outliers_hi, outliers_lo], ['Top', 'Bottom']):
            if not outliers:
                continue
            top_n = min(5, len(outliers))
            if which_ols == 'Top':
                # Largest first, fence above the third quartile.
                outliers = sorted(outliers, reverse=True)
                ol_line = q3 + 1.5*iqr
            else:
                # Smallest first, fence below the first quartile.
                outliers = sorted(outliers)
                ol_line = q1 - 1.5*iqr
            msg += '\n%s %d Outliers: ' % (which_ols, top_n)
            msg += ', '.join('%d' % value for value in outliers[:top_n])
            plt.axvline(ol_line, lw=.5, ls='--', c='r')

        plt.figtext(.3, .4, msg)
        plt.title("%s histogram (non-zero values)" % feature)
        plt.legend()
        figname = os.path.join(hist_dir, '%s_histogram.png' % feature)
        plt.savefig(figname)
        plt.close()
def get_k_best(dictionary, features_list, k):
    """Rank features with scikit-learn's SelectKBest and report the top ``k``.

    The scores are paired with ``features_list[1:]`` (the first entry is the
    label), sorted in descending order, merged with the frame returned by
    ``get_nan_counts(dictionary)`` on the ``feature`` column, and rows with
    an infinite score are dropped. The result is printed and returned.

    Args:
        dictionary: data container forwarded to ``featureFormat`` and
            ``get_nan_counts``.
        features_list: feature names, label first.
        k: number of features to keep; forwarded to ``SelectKBest`` and used
            to slice the ranked frame.

    Returns:
        pandas.DataFrame: the first ``k`` rows of the ranked frame, with the
        ``feature`` and ``score`` columns plus whatever ``get_nan_counts``
        contributes.
    """
    data = featureFormat(dictionary, features_list)
    labels, features = targetFeatureSplit(data)

    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator.
    pairs = list(zip(features_list[1:], k_best.scores_))

    # combine scores and features into a pandas dataframe, then sort
    # (DataFrame.sort no longer exists; sort_values is its replacement)
    k_best_features = pd.DataFrame(pairs, columns=['feature', 'score'])
    k_best_features = k_best_features.sort_values(by='score', ascending=False)

    # merge with null counts
    df_nan_counts = get_nan_counts(dictionary)
    k_best_features = pd.merge(k_best_features, df_nan_counts, on='feature')

    # eliminate infinite values
    k_best_features = k_best_features[~np.isinf(k_best_features.score)]

    print('Feature Selection by k_best_features\n')
    print("{0} best features in descending order: {1}\n".format(k, k_best_features.feature.values[:k]))
    print('{0}\n'.format(k_best_features[:k]))

    return k_best_features[:k]
def select_k_best(data_dict, features_list, k):
    """Score min-max scaled features with SelectKBest.

    Returns a dict mapping feature name to score: every feature when ``k``
    is the string "all", otherwise the ``k`` highest-scoring ones.
    """
    # Build the label / feature arrays (first entry of features_list is the label).
    labels, features = targetFeatureSplit(featureFormat(data_dict, features_list))

    # Bring every feature into the same range before scoring.
    features = preprocessing.MinMaxScaler().fit_transform(features)

    selector = SelectKBest(k=k)
    selector.fit(features, labels)

    # Ascending sort, then flipped, so the best score comes first.
    ranked = sorted(zip(features_list[1:], selector.scores_),
                    key=lambda pair: pair[1])[::-1]

    if k == "all":
        return dict(ranked)
    return dict(ranked[:k])
# Load the pickled Breitbart articles and tag each one with label 1.
breitbart_data = []
with open('breitbart_articles.pkl', 'rb') as f:
    breitbart = pickle.load(f)
for article in breitbart:
    breitbart_data.append([1, article])

# Same for the Newsmax articles, also tagged with label 1.
newsmax_data = []
with open('newsmax_articles.pkl', 'rb') as f:
    newsmax = pickle.load(f)
for article in newsmax:
    newsmax_data.append([1, article])

# fox_data, observer_data and left_wing_data are built earlier in the script.
right_wing_data = fox_data + observer_data + breitbart_data + newsmax_data
data = left_wing_data + right_wing_data

# presumably splits each [label, article] row into the label and the
# remaining columns -- verify against targetFeatureSplit
labels, articles = targetFeatureSplit(data)

# test_size=0.001 keeps almost every sample in the training split.
articles_train, articles_test, labels_train, labels_test = train_test_split(
    articles, labels, test_size=0.001, random_state=42)

# Each entry of the split is itself iterable; flatten one level so the
# vectorizer receives a flat list of documents.
flat_train = []
flat_test = []
for sublist in articles_train:
    for article in sublist:
        flat_train.append(article)
for sublist in articles_test:
    for article in sublist:
        flat_test.append(article)

# TF-IDF vocabulary and weights are learned from the training documents only.
vectorizer = TfidfVectorizer(strip_accents="unicode", lowercase=False)
vectors = vectorizer.fit_transform(flat_train)
### load in the dict of dicts containing all the data on each person in the dataset
### (the context manager closes the file handle as soon as the pickle is read)
### NOTE(review): pickle.load can execute code from the file -- only load trusted data
with open("../final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)
### there's an outlier--remove it!
data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
poi = "poi"
features_list = [poi, feature_1, feature_2]
data = featureFormat(data_dict, features_list)
### note: ``poi`` is rebound here from the feature name to the label values
poi, finance_features = targetFeatureSplit(data)

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
model = cluster.KMeans(n_clusters=2)
model.fit(finance_features)
pred = model.predict(finance_features)
if data_dict[keys]["from_poi_to_this_person"] == "NaN" or data_dict[keys][ "from_this_person_to_poi"] == "NaN": data_dict[keys]["contact_poi"] = "NaN" else: data_dict[keys]["contact_poi"] = int( data_dict[keys]["from_poi_to_this_person"]) + int( data_dict[keys]["from_this_person_to_poi"]) features_list.append("income") features_list.append("contact_poi") ### Store to my_dataset for easy export below. my_dataset = data_dict ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) ### Use SelectKBest to choose which feature to use for machine learning k = 5 selKBest = SelectKBest(f_regression, k) selKBest.fit(features, labels) selKBest.transform(features).shape mask = selKBest.get_support() scores = selKBest.scores_ feature_score = zip(features_list[1:], scores) feature_score = list(reversed(sorted(feature_score, key=lambda x: x[1])))
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` and print aggregate classification metrics.

    Uses the ``StratifiedShuffleSplit(n_splits=...)`` / ``cv.split(...)``
    form of the scikit-learn API. For every split the classifier is re-fit
    on the training indices and its test predictions are pooled into one
    confusion matrix, from which accuracy, precision, recall, F1 and F2 are
    printed.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        dataset: data container forwarded to ``featureFormat``.
        feature_list: feature names forwarded to ``featureFormat``.
        folds: number of re-shuffling iterations (``n_splits``).

    Returns:
        None. Results are printed.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # The original call StratifiedShuffleSplit(labels, folds, random_state=42)
    # raised "StratifiedShuffleSplit not iterable"; the splitter is now
    # configured with n_splits and iterated through split(). See:
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Only the tally of the current split is abandoned.
                break

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                        display_precision=5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives))
        print("")
    except ZeroDivisionError:
        # Catch only the zero-denominator case the message describes; other
        # errors must surface instead of being mislabelled.
        print("Got a divide by zero when trying out: %s" % (clf,))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
def main():
    """Run the whole POI-identifier workflow and dump the final classifier.

    Steps, in order:
      1. build the feature list and load the pickled dataset;
      2. explore it through a pandas DataFrame (counts, missing values,
         outlier search) and drop three records;
      3. add the ``to_poi_ratio`` / ``from_poi_ratio`` features;
      4. score features with ``SelectKBest`` and try four untuned
         classifiers on the six best ones;
      5. grid-search four scaler -> SelectKBest -> classifier pipelines on
         the full feature list via ``fine_tune_algorithm``;
      6. dump the tuned GaussianNB pipeline, dataset and feature list with
         ``dump_classifier_and_data``.
    """
    ### Task 1: Select what features you'll use.
    ### features_list is a list of strings, each of which is a feature name.
    ### The first feature must be "poi".
    financial_features = ['salary', 'deferral_payments', 'total_payments',
                          'loan_advances', 'bonus', 'restricted_stock_deferred',
                          'deferred_income', 'total_stock_value', 'expenses',
                          'exercised_stock_options', 'other', 'long_term_incentive',
                          'restricted_stock', 'director_fees']
    #(all units are in US dollars)

    email_features = ['to_messages', 'from_poi_to_this_person', 'from_messages',
                      'from_this_person_to_poi', 'shared_receipt_with_poi']
    #(units are generally number of emails messages; notable exception is 'email_address',
    # which is a text string)
    #email_address feature was removed from list

    poi_label = ['poi']  ###(boolean, represented as integer)

    features_list = poi_label + email_features + financial_features

    ### Load the dictionary containing the dataset
    with open("final_project_dataset_unix.pkl", "rb") as data_file:
        data_dict = pickle.load(data_file)

    #convert to a pandas dataframe for exploratory analysis
    df = pd.DataFrame.from_dict(data_dict, orient='index')

    #convert the string 'NaN' to an actual np.nan, column by column
    for label in list(df.columns):
        if label == 'email_address':
            # Text column: replace the placeholder in place. The previous
            # loop indexed the column by the *value* 'NaN', so no existing
            # cell was ever changed.
            df[label] = df[label].replace('NaN', np.nan)
        else:
            df[label] = pd.to_numeric(df[label], errors='coerce')

    ### Investigate contents of dataset:

    # Total Number of data points
    total_people = df.shape[0]
    print('The total number of data points (people) in our data set is {}.\n'
          .format(total_people))

    # Total Number of Features Used
    all_features = df.shape[1]
    print('There are {} features for each person in our dataset.\n'
          .format(all_features))

    # Total Number of Persons Of Interest (POIs)
    poi_count = df['poi'][(df['poi'] == True)].count()
    print('Our dataset has {} persons of interest.\n'.format(poi_count))

    # Total Number of Non-POIs
    non_poi_count = total_people - poi_count
    print('Our dataset has {} Non persons of interest.\n'.format(non_poi_count))

    # Features with missing values?
    print('The following categories have missing values (NaN values)\n')
    print(df.isna().sum())

    ### Task 2: Remove outliers
    #visualize_features('salary', 'bonus', data_dict)
    #visualize_features('from_poi_to_this_person', 'from_this_person_to_poi', data_dict)
    #visualize_features('loan_advances', 'total_stock_value', data_dict)

    print()
    print('Searching for Outliers...')
    outlier_columns = ['salary', 'bonus', 'from_poi_to_this_person',
                       'from_this_person_to_poi', 'loan_advances',
                       'total_stock_value']
    for position, column in enumerate(outlier_columns):
        if position:
            # blank line between consecutive reports
            print()
        find_outlier(column, df)

    #get a count of number of NaN columns for each person
    nan_count = df.isna().sum(axis=1)
    print('\nThe top 5 people by number of NaN columns are:\n')
    print(nan_count.sort_values(ascending=False).head(5))

    print('\nLooking closer at Eugene Lockhart...\n')
    print(df.loc['LOCKHART EUGENE E'])

    print('\nLooking closer at THE TRAVEL AGENCY IN THE PARK...\n')
    print(df.loc['THE TRAVEL AGENCY IN THE PARK'])

    ### Remove outliers
    df = df.drop(['TOTAL', "LOCKHART EUGENE E", "THE TRAVEL AGENCY IN THE PARK"],
                 axis=0)

    #replace NaN with 0
    df = df.fillna(0)

    ### Task 3: Create new feature(s)
    ### Store to my_dataset for easy export below.
    my_dataset = df.to_dict('index')
    for person in my_dataset:
        record = my_dataset[person]
        to_poi_count = record['from_this_person_to_poi']
        from_poi_count = record['from_poi_to_this_person']
        # 'from_messages' counts mail sent BY the person and 'to_messages'
        # mail sent TO the person; the two totals used to be swapped.
        total_sent_emails = record['from_messages']
        total_received_emails = record['to_messages']
        try:
            record['to_poi_ratio'] = float(to_poi_count) / float(total_sent_emails)
        except ZeroDivisionError:
            # no sent mail recorded
            record['to_poi_ratio'] = 0
        try:
            record['from_poi_ratio'] = float(from_poi_count) / float(total_received_emails)
        except ZeroDivisionError:
            # no received mail recorded
            record['from_poi_ratio'] = 0

    features_list = features_list + ['to_poi_ratio', 'from_poi_ratio']

    ### Preprocessing

    ### Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    #Scaling features (normalizing all features)
    min_max_scaler = MinMaxScaler()
    features = min_max_scaler.fit_transform(features)

    ### Select the best features:
    # Removes all but the k highest scoring features
    n = 6  # adjust for optimization
    skb = SelectKBest(f_classif, k=n)
    # Only the fitted scores are used; the transformed array is discarded.
    skb.fit_transform(features, labels)

    #skip poi feature and combine with returned scores (key:value --> feature:score)
    scores = zip(features_list[1:], skb.scores_)
    #sort by highest scoring feature from scores
    sorted_scores = sorted(scores, key = lambda x: x[1], reverse=True)

    #add k highest scoring features to create new features_list
    new_features_list = poi_label + list(map(lambda x: x[0], sorted_scores))[:n]

    ### Extract features and labels from dataset using optimized features_list
    data = featureFormat(my_dataset, new_features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    ### Task 4: Try a variety of classifiers
    ### Please name your classifier clf for easy export below.
    ### Note that if you want to do PCA or other multi-stage operations,
    ### you'll need to use Pipelines. For more info:
    ### http://scikit-learn.org/stable/modules/pipeline.html
    print('\nRunning GaussianNB classifier...')
    run_classifier(GaussianNB(), features, labels)

    print('\nRunning SVM classifier...')
    run_classifier(SVC(), features, labels)

    print('\nRunning AdaBoost classifier...')
    run_classifier(AdaBoostClassifier(), features, labels)

    print('\nRunning DecisionTree classifier...')
    run_classifier(DecisionTreeClassifier(), features, labels)

    ### Task 5: Tune your classifier to achieve better than .3 precision and recall
    ### using our testing script. Check the tester.py script in the final project
    ### folder for details on the evaluation method, especially the test_classifier
    ### function. Because of the small size of the dataset, the script uses
    ### stratified shuffle split cross validation. For more info:
    ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

    ### Re-Extract features and labels from dataset for local testing
    ### (full feature list: selection now happens inside each pipeline)
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)

    # Adjust SVM parameters to refine accuracy
    # variables will be passed to fine_tune_algorithm to use in a Pipeline
    print('\nThe best fit SVM has the following scores:\n')
    svm_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()), ('SVM', SVC())]
    svm_parameters = {'SVM__kernel': ('linear', 'rbf'),
                      'SVM__C': [0.001, 0.01, .1, 1, 10, 100, 1000],
                      'SVM__gamma': [0.01, .1, 1, 10, 100, 1000],
                      'SKB__k': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
    svm_clf = fine_tune_algorithm(svm_steps, svm_parameters, features, labels)

    # Adjust DecisionTreeClassifier parameters to refine accuracy
    print('\nThe best fit DecisionTreeClassifer has the following scores:\n')
    dt_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('DT', DecisionTreeClassifier())]
    dt_parameters = {'DT__criterion': ('gini', 'entropy'),
                     'DT__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                     'DT__random_state': [13],
                     'SKB__k': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
    dt_clf = fine_tune_algorithm(dt_steps, dt_parameters, features, labels)

    # Adjust AdaBoostClassifier parameters to refine accuracy
    # variables will be passed to fine_tune_algorithm to use in a Pipeline
    print('\nThe best fit AdaBoostClassifier has the following scores:\n')
    ab_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()),
                ('AB', AdaBoostClassifier())]
    ab_parameters = {'AB__algorithm': ('SAMME', 'SAMME.R'),
                     'AB__learning_rate': [.5, .6, .7, .8, .9, 1],
                     'SKB__k': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
    ada_clf = fine_tune_algorithm(ab_steps, ab_parameters, features, labels)

    # Adjust GaussianNB parameters to refine accuracy
    print('\nThe best fit GaussianNB Classifier has the following scores:\n')
    nb_steps = [('scaler', MinMaxScaler()), ('SKB', SelectKBest()), ('NB', GaussianNB())]
    nb_parameters = {'SKB__k': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
    nb_clf = fine_tune_algorithm(nb_steps, nb_parameters, features, labels)

    #final best fitting classifier
    clf = nb_clf

    ### Task 6: Dump your classifier, dataset, and features_list so anyone can
    ### check your results. You do not need to change anything below, but make sure
    ### that the version of poi_id.py that you submit can be run on its own and
    ### generates the necessary .pkl files for validating your results.
    dump_classifier_and_data(clf, my_dataset, features_list)
# In[37]: #With all features from time import time from sklearn.model_selection import cross_validate from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report from sklearn.metrics import confusion_matrix # In[38]: # without engineered features data_old = featureFormat(my_dataset, old_features, sort_keys=True) labels_old, features_old = targetFeatureSplit(data_old) features_train_old, features_test_old, labels_train_old, labels_test_old = train_test_split( features_old, labels_old, test_size=0.3, random_state=42) # In[39]: data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) features_train, features_test, labels_train, labels_test = train_test_split( features, labels, test_size=0.3, random_state=42) # In[40]: from sklearn.svm import SVC from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import GridSearchCV
my_dataset[names]['messages_from_poi'] = 0 if float(to_messages) != 0 and deferral_payments != 0: my_dataset[names][ 'messages_to_poi/deferral_payments'] = from_this_person_to_poi / float( to_messages * deferral_payments) else: my_dataset[names]['messages_to_poi/deferral_payments'] = 0 features_list_new = POI_label + financial_features + email_features_number + [ 'messages_from_poi' ] + ['messages_to_poi/deferral_payments'] #print "The List with all features with 2 new ones is:", features_list_new ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list_new, sort_keys=True) labels, features = targetFeatureSplit(data) from sklearn.feature_selection import SelectKBest, f_classif featureSelecting = SelectKBest(f_classif, k=10) featureSelecting.fit(features, labels) featureSelected = featureSelecting.get_support() scores = zip(featureSelecting.scores_, features_list_new[1:], featureSelected) scoresSorted = sorted(scores, reverse=True) #print "Scroes are:", scoresSorted ''' scoresSorted = [(25.09754152873549, 'exercised_stock_options', True), (24.4676540475264, 'total_stock_value', True), (21.06000170753657, 'bonus', True), (18.575703268041785, 'salary', True), (11.5955476597306, 'deferred_income', True),
new_feature_2_inputs_add('total_poi_emails', 'to_and_from_poi_emails', 'shared_receipt_with_poi') new_feature_2_inputs_divide('percent_of_poi_to_emails', 'from_this_person_to_poi', 'to_messages') new_feature_2_inputs_divide('percent_of_poi_from_emails', 'from_poi_to_this_person', 'from_messages') new_feature_4_inputs_divide('percent_poi_emails', 'from_poi_to_this_person', 'from_this_person_to_poi', 'to_messages', 'from_messages') ### Store to my_dataset for easy export below. my_dataset = data_dict ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) #Draw a plot comparing two features: f1_name and f2_name, along with their prediction line: pred. def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature 1", f2_name="feature 2"): #plot each cluster with a different color--add more colors for #drawing more than five clusters colors = ["b", "c", "k", "m", "g"] for ii, pp in enumerate(pred):
''' # Now we are preparing to make our finalList, but as the project requirement its first element should be 'poi' theFinalList = ['poi'] theFinalList.extend(myList) sep("*", "the final feature list") print theFinalList #Now features_list is finalized and will be utilized by the feature_format module features_list = theFinalList ### Extract features and labels from dataset for local testing data = feature_format.featureFormat(my_dataset, features_list, sort_keys=True) labels, features = feature_format.targetFeatureSplit(data) ### Task 4: Try a varity of classifiers def use_decision_tree_clf(): ''' This function uses Decision tree classifier in addition to Grid search cross validation ''' print "This is the use_decision_tree_clf() method" from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import precision_recall_fscore_support param = { 'max_depth': [1, 2, 3, 9],
def estimator_evaluator1(clf, dataset, feature_list, folds):
    """Evaluate ``clf`` with stratified k-fold CV and print pooled metrics.

    For each of the ``folds`` folds the classifier is re-fit on the training
    indices and its predictions on the held-out fold are tallied into one
    confusion matrix. Accuracy, precision, recall, F1 and F2 computed from
    the pooled counts are printed.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        dataset: data container forwarded to ``featureFormat``.
        feature_list: feature names forwarded to ``featureFormat``.
        folds: number of folds (``n_folds``).

    Returns:
        None. Results are printed.
    """
    from feature_format import featureFormat, targetFeatureSplit
    from sklearn.cross_validation import StratifiedKFold

    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    # NOTE(review): shuffle is not enabled here, so random_state presumably
    # has no effect on the folds -- confirm against the installed sklearn.
    cv = StratifiedKFold(labels, n_folds=folds, random_state=30)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Only the tally of the current fold is abandoned.
                break

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
    except ZeroDivisionError:
        # Catch only the zero-denominator case the message describes; other
        # errors must surface instead of being mislabelled.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
### Task 2: Remove outliers del data_dict['TOTAL'] del data_dict['THE TRAVEL AGENCY IN THE PARK'] ### Task 3: Create new feature(s) data_dict = hf.add_poi_mail_features(data_dict) # features_list.append('from_poi_pct') # features_list.append('to_poi_pct') ### Store to my_dataset for easy export below. my_dataset = data_dict ### Extract all features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) ### Dimensions of the data are computed below # print np.array(data).shape # print np.sum(labels) ### Plot all original variables # hf.plot_features(features_list, data_all) ### Print top 5 extreme observations for "loan_advances" and "total_payments" # pprint.pprint(hf.return_sorted_values(data_dict, "loan_advances", 5)) # pprint.pprint(hf.return_sorted_values(data_dict, "total_payments", 5)) ### Plot Lasso selection # hf.lasso_selection(features, labels, features_list)
# Tabular copy of the dataset restricted to features_list, indexed by the
# dataset keys, plus one engineered column.
new_data = pd.DataFrame(my_dataset.values())[features_list]
new_data.index = my_dataset.keys()
new_data['new_total_stock'] = new_data['exercised_stock_options'] + new_data[
    'restricted_stock']

# Convert the frame back into a {key: {column: value}} dict.
new_dataset = {}
key = list(new_data.index)
# The column list does not change between rows; compute it once.
key_v = list(new_data.columns.values)
for j in range(len(key)):
    v = {}
    # One row lookup per record instead of one per cell.
    value_v = list(new_data.loc[key[j]])
    for i in range(len(key_v)):
        v[key_v[i]] = value_v[i]
    new_dataset[key[j]] = v

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Second feature set: includes the engineered 'new_total_stock' column and is
# read from new_dataset.
features_list_new = [
    'poi', 'bonus', 'exercised_stock_options', 'expenses', 'from_messages',
    'from_poi_to_this_person', 'from_this_person_to_poi', 'other',
    'restricted_stock', 'salary', 'shared_receipt_with_poi', 'to_messages',
    'new_total_stock'
]
data_new = featureFormat(new_dataset, features_list_new, sort_keys=True)
labels_new, features_new = targetFeatureSplit(data_new)

from sklearn import preprocessing
# The same scaler object is re-fit for each feature matrix; after the second
# call it holds the ranges of features_new.
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)
features_new = scaler.fit_transform(features_new)

from numpy import mean
from sklearn import cross_validation
from sklearn.metrics import accuracy_score, precision_score, recall_score
for k, v in data_dict[key].items(): if v == 'NaN': data_dict[key][k] = 0 ### Task 3: Create new feature(s) ### Store to my_dataset for easy export below. my_dataset = data_dict ### Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) from sklearn import preprocessing scaler = preprocessing.RobustScaler() data_scaled = scaler.fit_transform(data) labels, features = targetFeatureSplit(data_scaled) # for point in data: # salary = point[4] # bonus = point[2] # matplotlib.pyplot.scatter( salary, bonus ) # # matplotlib.pyplot.xlabel("salary") # matplotlib.pyplot.ylabel("bonus") # matplotlib.pyplot.show() ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf``, print pooled metrics and return the estimator.

    For every split produced by ``StratifiedShuffleSplit`` the classifier is
    re-fit on the training indices and its test predictions are pooled into
    one confusion matrix, from which accuracy, precision, recall, F1 and F2
    are printed.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        dataset: data container forwarded to ``featureFormat``.
        feature_list: feature names forwarded to ``featureFormat``; "poi"
            must be first because the first column is taken as the label.
        folds: accepted for interface compatibility; currently not passed to
            the splitter (see the note in the body).

    Returns:
        ``clf`` (as fitted on the last split) when the metrics could be
        computed, ``None`` when a metric had a zero denominator.
    """
    # extract the features specified in features_list
    data = featureFormat(dataset, feature_list, sort_keys=True)
    # split into labels and features (this line assumes that the first
    # feature in the array is the label, which is why "poi" must always
    # be first in the features list
    labels, features = targetFeatureSplit(data)

    # NOTE(review): ``folds`` is not forwarded, so the splitter runs with its
    # own default number of splits. Passing n_splits=folds would honour the
    # argument but multiplies the runtime -- confirm which is intended.
    cv = StratifiedShuffleSplit(random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Only the tally of the current split is abandoned.
                break

    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        # print clf
        print(
            PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
        return clf
    except ZeroDivisionError:
        # Catch only the zero-denominator case the message describes; other
        # errors must surface instead of being mislabelled.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
        return None
'from_this_person_to_poi'] / data_dict[i]['from_messages'] else: data_dict[i]['from_this_person_to_poi_ratio'] = 'NaN' features_list = [ 'poi', 'salary', 'deferral_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi', 'from_poi_to_this_person_ratio', 'from_this_person_to_poi_ratio' ] data_array = featureFormat(data_dict, features_list) poi, features = targetFeatureSplit(data_array) # Data Split for train and test features_train, features_test, labels_train, labels_test = train_test_split( features, poi, test_size=0.3, random_state=42) # Feature scaling scaler = MinMaxScaler() rescaled_features_train = scaler.fit_transform(features_train) rescaled_features_test = scaler.fit_transform(features_test) # Feature selection with SelectKBest from sklearn.feature_selection import SelectKBest
def my_test_classifier(clf, dataset, feature_list, folds=1000): from sklearn.cross_validation import StratifiedShuffleSplit PERF_FORMAT_STRING = "\ \tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\ Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}" RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\ \tFalse negatives: {:4d}\tTrue negatives: {:4d}" data = featureFormat(dataset, feature_list, sort_keys=True) labels, features = targetFeatureSplit(data) cv = StratifiedShuffleSplit(labels, folds, random_state=42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append(features[ii]) labels_train.append(labels[ii]) for jj in test_idx: features_test.append(features[jj]) labels_test.append(labels[jj]) ### fit the classifier using training set, and test on test set clf.fit(features_train, labels_train) predictions = clf.predict(features_test) for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 elif prediction == 1 and truth == 1: true_positives += 1 else: print "Warning: Found a predicted label not == 0 or 1." print "All predictions should take value 0 or 1." 
print "Evaluating performance for processed predictions:" break try: total_predictions = true_negatives + false_negatives + false_positives + true_positives accuracy = 1.0 * (true_positives + true_negatives) / total_predictions precision = 1.0 * true_positives / (true_positives + false_positives) recall = 1.0 * true_positives / (true_positives + false_negatives) f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives) f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall) results = (clf, PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5), RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives), precision, recall, accuracy, f1, f2) except: results = ( clf, "Got a divide by zero when trying out:", "Precision or recall may be undefined due to a lack of true positive predicitons.", 0, 0) return results
df['to_poi_rate'] = df['from_this_person_to_poi'] / df['from_messages'] df['from_poi_rate'] = df['from_poi_to_this_person'] / df['to_messages'] new_feat_list = ['from_messages_median_pubIndex', 'to_poi_median_pubIndex'] df = pd.concat([df, df_new], axis=1) df[new_feat_list] = df[new_feat_list].fillna( df.groupby("poi")[new_feat_list].transform("median")) features_list = (poi_label + financial_feat_list + email_feat_list + ['to_poi_rate', 'from_poi_rate'] + new_feat_list) print("Total number of features: ", len(features_list) - 1) ### Store to my_dataset for easy export below. my_dataset = df.to_dict(orient='index') data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) features_train, features_test, labels_train, labels_test = train_test_split( features, labels, test_size=0.3, random_state=42) ### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html # Provided to give you a starting point. Try a variety of classifiers. clf = AdaBoostClassifier(random_state=45) clf.fit(features_train, labels_train) feat_importance = [] for i in range(len(clf.feature_importances_)):
# mail = mail + 1 # print salary # print mail # peeps = enron_data.keys() # print len(peeps) # payments = 0 # for peep in peeps: # if enron_data[peep]['total_payments'] == 'NaN': # payments = payments + 1 # print payments # print 100 * float(payments) / float(len(peeps)) feature_list = ["poi", "total_payments"] data_array = featureFormat(enron_data, feature_list) label, features = targetFeatureSplit(data_array) print label print len(label) + 10 # i = 0 # for lab in label: # # print lab # if lab == 1.0: # print features[i] # i = i + 1 # nada = nada + 1 # totes = len(label) # percent_losers = 100 * float(nada) / float(totes) # # print percent_losers
'from_this_person_to_poi'] / data_dict[e]['from_messages'] else: data_dict[e]['from_this_person_to_poi_ratio'] = 'NaN' features_list = [ 'poi', 'salary', 'deferral_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi', 'from_poi_to_this_person_ratio', 'from_this_person_to_poi_ratio' ] data_array = featureFormat(data_dict, features_list) poi, features = targetFeatureSplit(data_array) ### split the data into train and test features_train, features_test, labels_train, labels_test = cross_validation.train_test_split( features, poi, test_size=0.3, random_state=42) ###Feature scaling before selection, necessary for SVM. scaler = MinMaxScaler() rescaled_features_train = scaler.fit_transform(features_train) rescaled_features_test = scaler.fit_transform(features_test) ### Univariate selection ### Using SelectKBest, I can select features according to the k highest scores. from sklearn.feature_selection import SelectKBest
""" import sys import pickle sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r")) ### list the features you want to look at--first item in the ### list will be the "target" feature features_list = ["bonus", "salary"] data = featureFormat(dictionary, features_list, remove_any_zeroes=True) target, features = targetFeatureSplit(data) ### training-testing split needed in regression, just like classification from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split( features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points. from sklearn import linear_model
def selectKBest_f1_scores(clf, dataset, n_kbest_features, folds = 1000): """ Verifica os scores do numero de features selecionadas. Responsavel por selecionar o score F1 de 2 ate n_kbest_features. Args: clf: classificador utilizado para a analise dataset: dados utilizados n_kbest_features: numero de maximo de features permitido. Returns: retorno1: Lista de valores K retorno2: Lista de Scores F1 """ graficoX = [] graficoY = [] for k in range(2, n_kbest_features): features_selected = select_best_features(k) features_selected.insert(0, "poi") data = featureFormat(dataset, features_selected, sort_keys = True) labels, features = targetFeatureSplit(data) cv = StratifiedShuffleSplit(labels, folds, random_state = 42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append( features[ii] ) labels_train.append( labels[ii] ) for jj in test_idx: features_test.append( features[jj] ) labels_test.append( labels[jj] ) clf.fit(features_train, labels_train) predictions = clf.predict(features_test) for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 elif prediction == 1 and truth == 1: true_positives += 1 else: print "Warning: Found a predicted label not == 0 or 1." print "All predictions should take value 0 or 1." print "Evaluating performance for processed predictions:" break try: f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives) graficoY.append(f1) graficoX.append(k) except: print "Got a divide by zero when trying out:", clf print "Precision or recall may be undefined due to a lack of true positive predicitons." return graficoX, graficoY
features_list += [ 'salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees' ] # email feature features_list += [ 'to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi' ] print len(features_list) ### Load the dictionary containing the dataset y, X = targetFeatureSplit(featureFormat(data_dict, features_list)) X = np.array(X) from sklearn import tree clf = tree.DecisionTreeClassifier(random_state=12) clf = clf.fit(X, y) print "clf.feature_importances_ : ", clf.feature_importances_ idx_feature_importances = np.argsort(clf.feature_importances_)[::-1] for i in range(10): idx = idx_feature_importances[i] print "importance ", i, " - ", features_list[ idx + 1], " - ", clf.feature_importances_[idx] new_features_list = ['poi'] for i in range(10):
from feature_format import featureFormat, targetFeatureSplit

## the dataset includes financial information and email counts (how many
## emails were sent to or received from POIs)
# Open the file in a context manager so the handle is closed once the data
# has been unpickled. The original "r" mode is kept on purpose.
with open("../final_project/final_project_dataset_modified.pkl", "r") as dataset_file:
    dictionary = pickle.load(dataset_file)

## create another dictionary holding only the POIs from the original dictionary
poi_dictionary = {}
for k, v in dictionary.iteritems():
    if v['poi'] == True:
        poi_dictionary[k] = v

## change this list to see the result with other values; the first item is
## what targetFeatureSplit returns as the target
features_list = ["bonus", "exercised_stock_options"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
poi_data = featureFormat(poi_dictionary, features_list, remove_any_zeroes=True)

target, features = targetFeatureSplit(data)
poi_target, poi_features = targetFeatureSplit(poi_data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
poi_color = "b"
all_color = "r"

### draw the scatterplot of every point in the "all" color
import matplotlib.pyplot as plt
for feature, targe in zip(features, target):
    plt.scatter(feature, targe, color=all_color)
data = featureFormat(data_dict, features_list) # Fit a scaler on the original data (So we don't get deceived by the "NaN" points) feature_1_scaler = fit_scaler_on_original_data(data_dict, feature_1) feature_2_scaler = fit_scaler_on_original_data(data_dict, feature_2) # Rescale the data using the appropriate scaler rescaled_data = data rescaled_data[:, 1] = feature_1_scaler.transform(rescaled_data[:, 1].reshape(1, -1)) rescaled_data[:, 2] = feature_2_scaler.transform(rescaled_data[:, 2].reshape(1, -1)) poi, finance_features = targetFeatureSplit(rescaled_data) kmeans_model = KMeans(n_clusters=2) pred = kmeans_model.fit_predict(rescaled_data) try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters_with_feature_scaling.pdf", f1_name=feature_1, f2_name=feature_2) except NameError: print "no predictions object named pred found, no clusters to plot"
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf``, print its scores and return the main metrics.

    The classifier is refit on each of ``folds`` stratified shuffle splits and
    the predictions on all test splits are pooled into one confusion matrix.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        dataset: Data handed straight to ``featureFormat``.
        feature_list: Feature names, label first (``targetFeatureSplit``
            separates the label from the remaining features).
        folds: Number of shuffle splits to evaluate.

    Returns:
        ``(accuracy, precision, recall)`` when every metric is defined.
        ``None`` (after printing a diagnostic) when a metric would divide by
        zero, so callers that unpack the result must handle that case.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                # Only the rest of this split is skipped.
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break

    try:
        total_predictions = (true_negatives + false_negatives +
                             false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (
            2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Catch only the undefined-metric case so unrelated errors are not
        # reported as a divide by zero.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
        return None

    ## the function was modified to return the results in addition to
    ## printing them
    print(clf)
    print(
        PERF_FORMAT_STRING.format(accuracy,
                                  precision,
                                  recall,
                                  f1,
                                  f2,
                                  display_precision=5))
    print(
        RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                     false_positives, false_negatives,
                                     true_negatives))
    print("")
    return accuracy, precision, recall
if msg_from_poi != "NaN" and to_msg != "NaN": my_dataset[person]['msg_from_poi_ratio'] = msg_from_poi/float(to_msg) else: my_dataset[person]['msg_from_poi_ratio'] = 0 msg_to_poi = my_dataset[person]['from_this_person_to_poi'] from_msg = my_dataset[person]['from_messages'] if msg_to_poi != "NaN" and from_msg != "NaN": my_dataset[person]['msg_to_poi_ratio'] = msg_to_poi/float(from_msg) else: my_dataset[person]['msg_to_poi_ratio'] = 0 new_features_list = features_list + ['msg_to_poi_ratio', 'msg_from_poi_ratio'] ## Extract features and labels from dataset for local testing data = featureFormat(my_dataset, new_features_list, sort_keys = True) labels, features = targetFeatureSplit(data) #Select the best features: #Removes all features whose variance is below 80% from sklearn.feature_selection import VarianceThreshold sel = VarianceThreshold(threshold=(.8 * (1 - .8))) features = sel.fit_transform(features) #Removes all but the k highest scoring features from sklearn.feature_selection import f_classif k = 7 selector = SelectKBest(f_classif, k=7) selector.fit_transform(features, labels) print("Best features:") scores = zip(new_features_list[1:],selector.scores_) sorted_scores = sorted(scores, key = lambda x: x[1], reverse=True)
def FeatureSelection(data_dict, features_list):
    """Select features with recursive feature elimination and cross-validation.

    A linear SVC is fitted on min-max scaled features, RFECV removes one
    feature per step, and each subset is scored with F1 over 100 stratified
    shuffle splits. A plot of score versus number of features is shown, and
    the selected and eliminated features are printed.

    Args:
        data_dict: Data handed to ``featureFormat``.
        features_list: Feature names with the label ('poi') first.

    Returns:
        ``(selected_features, F1_score)``: the list of selected feature names
        with 'poi' first, and the cross-validation F1 score of the optimal
        subset rounded to 3 decimals.
    """
    from sklearn.feature_selection import RFECV
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.preprocessing import MinMaxScaler
    import matplotlib.pyplot as plt

    # Convert dictionary to numpy array, converts NaN to 0.0
    data = featureFormat(data_dict, features_list,
                         sort_keys = True, remove_all_zeroes = False)
    # Separate into labels = 'poi' and features = rest of features_list
    labels, features = targetFeatureSplit(data)

    ### Create estimator
    # The linear kernel exposes per-feature coefficients, which RFECV ranks on.
    # class_weight is 'auto' because the classes are unbalanced.
    # NOTE(review): 'auto' is the spelling used by the scikit-learn version
    # this file targets; newer releases call it 'balanced' - confirm before
    # upgrading.
    svc = SVC(kernel='linear', class_weight='auto', random_state=42)

    ### Scale features
    # The SVC needs scaled features. Missing values are coded 0.0.
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    ### Select cross-validation method
    # StratifiedShuffleSplit keeps roughly the same share of POIs in each split
    sss = StratifiedShuffleSplit(labels, 100, test_size=0.3, random_state=42)

    ### Select evaluation metric
    # f1 = 2 * (precision * recall) / (precision + recall); POIs are a small
    # percentage of the cases, so accuracy alone would be misleading.
    metric = 'f1'

    # run the feature eliminator
    rfecv = RFECV(estimator=svc, cv=sss, scoring=metric, step=1)
    rfecv = rfecv.fit(features, labels)

    # view results
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score using F1 (precision&recall)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    # plt.savefig('featureSelection.png', transparent=True)
    plt.show()

    print("Optimal number of features is %d" % rfecv.n_features_)
    print(
        'Features selected by recursive feature elimination with cross validation:'
    )
    # grid_scores_[i] is the score for i + 1 features (see the plot's x axis
    # above), so the optimal subset sits at index n_features_ - 1. Indexing
    # with n_features_ itself reads the neighbouring score and raises
    # IndexError when every feature is kept.
    F1_score = round(rfecv.grid_scores_[rfecv.n_features_ - 1], 3)
    print('F1 score from optimal features: %r' % F1_score)

    selection = rfecv.get_support()
    selected_features = ['poi']
    rejected_features = []
    # features_list[0] is 'poi' (the label), so the mask lines up with the
    # names from index 1 onwards.
    for feature_name, keep in zip(features_list[1:], selection):
        if keep:
            selected_features.append(feature_name)
        else:
            rejected_features.append(feature_name)
    print(selected_features[1:])
    print('Features eliminated:')
    print(rejected_features)
    return selected_features, F1_score
You fill in the regression code where indicated: """ import sys import pickle sys.path.append("../tools/") from feature_format import featureFormat, targetFeatureSplit dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r")) ### list the features you want to look at--first item in the ### list will be the "target" feature features_list = ["bonus", "salary"] data = featureFormat(dictionary, features_list, remove_any_zeroes=True) target, features = targetFeatureSplit(data) ### training-testing split needed in regression, just like classification from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split( features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points. from sklearn.linear_model import LinearRegression reg = LinearRegression()
def test_classifier(clf, dataset, feature_list, folds=1000): data = featureFormat(dataset, feature_list, sort_keys=True) labels, features = targetFeatureSplit(data) cv = StratifiedShuffleSplit(labels, folds, random_state=42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 all_importance = [] #for holding feature importance from each fold for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append(features[ii]) labels_train.append(labels[ii]) for jj in test_idx: features_test.append(features[jj]) labels_test.append(labels[jj]) ### fit the classifier using training set, and test on test set clf.fit(features_train, labels_train) importance = clf.feature_importances_ all_importance.append(importance) predictions = clf.predict(features_test) for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 elif prediction == 1 and truth == 1: true_positives += 1 else: print "Warning: Found a predicted label not == 0 or 1." print "All predictions should take value 0 or 1." 
print "Evaluating performance for processed predictions:" break try: total_predictions = true_negatives + false_negatives + false_positives + true_positives avg_importance = np.mean(all_importance, axis=0) accuracy = 1.0 * (true_positives + true_negatives) / total_predictions precision = 1.0 * true_positives / (true_positives + false_positives) recall = 1.0 * true_positives / (true_positives + false_negatives) f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives) f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall) cm = [[true_negatives, false_positives], [false_negatives, true_positives]] print clf print "Feature importances", avg_importance print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5) print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives) print "" return cm except: print "Got a divide by zero when trying out:", clf print "Precision or recall may be undefined due to a lack of true positive predictions."
(this should be the quantity you want to predict) return targets and features as separate lists (sklearn can generally handle both lists and numpy arrays as input formats when training/predicting) """ target = [] features = [] for item in data: target.append( item[0] ) features.append( item[1:] ) return target, features data_dict = featureFormat(my_dataset, features_list, sort_keys = True) #print "\n data_dict:", data_dict labels, features = targetFeatureSplit(data_dict) #print "\n The Labels are : ", labels #print "\n The Labels are : ", features # scale features via min-max from sklearn import preprocessing scaler = preprocessing.MinMaxScaler() features = scaler.fit_transform(features) #==============================================================================
def test_stratified_shuffle_split(clf, dataset, feature_list, folds = 1000, scale_features = True): data = featureFormat(dataset, feature_list, sort_keys = True) labels, features = targetFeatureSplit(data) # Scale features if(scale_features): scaler = MinMaxScaler() features = scaler.fit_transform(features) cv = StratifiedShuffleSplit(labels, folds, random_state = 42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 for train_idx, test_idx in cv: features_train = [] features_test = [] labels_train = [] labels_test = [] for ii in train_idx: features_train.append( features[ii] ) labels_train.append( labels[ii] ) for jj in test_idx: features_test.append( features[jj] ) labels_test.append( labels[jj] ) ### fit the classifier using training set, and test on test set clf.fit(features_train, labels_train) predictions = clf.predict(features_test) for prediction, truth in zip(predictions, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1: false_negatives += 1 elif prediction == 1 and truth == 0: false_positives += 1 elif prediction == 1 and truth == 1: true_positives += 1 else: print "Warning: Found a predicted label not == 0 or 1." print "All predictions should take value 0 or 1." 
print "Evaluating performance for processed predictions:" break try: total_predictions = true_negatives + false_negatives + false_positives + true_positives accuracy = 1.0*(true_positives + true_negatives)/total_predictions precision = 1.0*true_positives/(true_positives+false_positives) recall = 1.0*true_positives/(true_positives+false_negatives) f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives) f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall) print 'Total predictions: '+str(total_predictions) print 'Accuracy: '+str(accuracy) print 'Precision: '+str(precision) print 'Recall: '+str(recall) print 'F1: '+str(f1) print 'F2: '+str(f2) print "" except: print "Got a divide by zero when trying out:", clf print "Precision or recall may be undefined due to a lack of true positive predicitons."