from sklearn.cross_validation import StratifiedShuffleSplit

def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])
        ### fit the classifier using the training set, and test on the test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        ### tally confusion-matrix counts across all folds
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        ### F2 weights recall twice as heavily as precision: (1 + 2^2)*P*R / (2^2*P + R)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print 'Feature List:', feature_list
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        return [feature_list, accuracy, precision, recall, f1, f2]
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf, feature_list
        return [feature_list, -1, -1, -1, -1, -1]
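### test_classifier relies on PERF_FORMAT_STRING and RESULTS_FORMAT_STRING,
### which appear to come from the course's tester.py and are not defined in
### this excerpt. A minimal sketch of compatible definitions, assuming only
### the argument order used in the .format() calls above:
PERF_FORMAT_STRING = "\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\tRecall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\tFalse negatives: {:4d}\tTrue negatives: {:4d}"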
import pandas as pd
from sklearn import tree

def one_feature_predict(features_list, my_dataset):
    ### build ['poi', feature] pairs so each feature is evaluated on its own
    pairs = []
    for i in features_list:
        if i != 'poi':
            pairs.append(['poi', i])
    mycolumns = ['feature_list', 'accuracy', 'precision', 'recall', 'f1', 'f2']
    resultdf = pd.DataFrame(columns=mycolumns)
    for item in pairs:
        data = featureFormat(my_dataset, item, sort_keys=True)
        labels, features = targetFeatureSplit(data)
        clf = tree.DecisionTreeClassifier(min_samples_split=4)
        clf.fit(features, labels)  # test_classifier refits per fold, so this initial fit is not strictly required
        resultdf.loc[len(resultdf)] = test_classifier(clf, my_dataset, item)
    return resultdf
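### featureFormat and targetFeatureSplit come from the course tools (imported
### further down). For readers without those tools, targetFeatureSplit behaves
### roughly like this sketch -- an assumption inferred from how it is called
### here, with the first column of each row treated as the target:
def target_feature_split_sketch(data):
    target = []
    features = []
    for row in data:
        target.append(row[0])     # the 'poi' label
        features.append(row[1:])  # the remaining feature values
    return target, features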
import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load(open("../../data/final_project_dataset_modified.pkl", "r"))

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color from "b" to "r"
### to differentiate training points from test points.
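### A minimal sketch of the requested regression, assuming ordinary least
### squares as in the mini-project (predicting bonus from salary):
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)
print "slope:", reg.coef_, "intercept:", reg.intercept_
print "r-squared (test):", reg.score(feature_test, target_test)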
import itertools
from sklearn.neighbors import KNeighborsClassifier

def topk_feature_predict(k_best_features, my_dataset, normalize_data=False):
    new_list = k_best_features.keys()
    combos = []
    for i in range(len(new_list)):
        if i != 0:  # individual features were already evaluated in step 1, so start with pairs
            combos.extend([sorted(l) for l in itertools.combinations(new_list, i + 1)])
            # use this instead to select only combinations of 4:
            #combos.extend([sorted(l) for l in itertools.combinations(new_list, 4)])
    ### prepend 'poi' to every combination
    for item in combos:
        item.insert(0, 'poi')
    mycolumns = ['feature_list', 'accuracy', 'precision', 'recall', 'f1', 'f2']
    resultdf2 = pd.DataFrame(columns=mycolumns)
    for item in combos:
        data = featureFormat(my_dataset, item, sort_keys=True)
        if normalize_data:
            df = pd.DataFrame(data, columns=item)
            for column in df.columns[1:]:
                df[column] = (df[column] - df[column].mean()) / df[column].std()
            labels = df['poi']
            features = df[item[1:]]  # all columns except 'poi'
        else:
            labels, features = targetFeatureSplit(data)
        # Tree with 5 best:
        #clf = tree.DecisionTreeClassifier(min_samples_split=4)
        # KNeighborsClassifier with 5 best:
        #clf = KNeighborsClassifier(algorithm='auto', metric='minkowski', metric_params=None, n_neighbors=5, p=2, weights='distance')
        #clf = KNeighborsClassifier(algorithm='auto', metric='minkowski', metric_params=None, n_neighbors=6, p=2, weights='distance')
        #clf = KNeighborsClassifier(algorithm='auto', metric='manhattan', metric_params=None, n_neighbors=6, p=2, weights='distance', leaf_size=30)  # best 1
        clf = KNeighborsClassifier(algorithm='auto', metric='minkowski', metric_params=None, n_neighbors=6, p=2, weights='distance', leaf_size=30)  # best 2
        # Logistic regression:
        #clf = LogisticRegression(C=1000, penalty='l1', random_state=42, tol=10**-10, class_weight='auto')
        #clf = LogisticRegression(C=1, penalty='l1', random_state=42, tol=10**-10, class_weight='auto')
        # Random Forest:
        #clf = RandomForestClassifier(n_estimators=10, min_samples_split=4, n_jobs=-1, max_features=0.5)
        # AdaBoost:
        #clf = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
        clf.fit(features, labels)
        ### only convert back to a dictionary if the data was turned into a
        ### DataFrame for normalization
        if normalize_data:
            new_dataset = df_to_dict(features_df=features, labels_df=labels)
        else:
            new_dataset = my_dataset
        resultdf2.loc[len(resultdf2)] = test_classifier(clf, new_dataset, item)
    return resultdf2
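### df_to_dict is called above but not defined in this excerpt. One possible
### implementation, consistent with how it is called (its body here is an
### assumption): rebuild the {name: {feature: value}} structure that
### featureFormat expects from the normalized DataFrame and label Series.
def df_to_dict(features_df, labels_df):
    new_dataset = {}
    for idx in features_df.index:
        entry = features_df.loc[idx].to_dict()
        entry['poi'] = labels_df.loc[idx]
        new_dataset[str(idx)] = entry
    return new_dataset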
### engineer a bonus-to-salary ratio feature; treat missing values as 0
for key in data_dict:
    if math.isnan(float(my_dataset[key]['bonus'])) or math.isnan(float(my_dataset[key]['salary'])):
        my_dataset[key]['bonus_salary_ratio'] = 0
    else:
        my_dataset[key]['bonus_salary_ratio'] = round(float(my_dataset[key]['bonus']) / float(my_dataset[key]['salary']), 2)
features_list.append('bonus_salary_ratio')

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, all_features, sort_keys=True)
labels, features = targetFeatureSplit(data)
k_best_features = SelectKBestFeatures(features, labels, 5, False)

# uncomment for Step 1:
#resultdf1 = one_feature_predict(features_list, my_dataset)
#print tabulate(resultdf1.sort(['recall', 'accuracy', 'precision'], ascending=[0, 0, 0]), headers='keys', tablefmt='psql', floatfmt=".4f")

# uncomment for Step 2:
#resultdf2 = topk_feature_predict(k_best_features, my_dataset, False)
#print tabulate(resultdf2.sort(['f1', 'recall', 'accuracy', 'precision'], ascending=[0, 0, 0, 0]), headers='keys', tablefmt='psql', floatfmt=".4f")
#sys.exit(0)
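### SelectKBestFeatures is called above but not defined in this excerpt. A
### minimal sketch of a compatible helper, assuming sklearn's SelectKBest with
### the ANOVA F-test, that the columns of `features` line up with
### all_features[1:], and that the fourth argument toggles printing -- all
### inferred from the call site rather than taken from the original source:
from sklearn.feature_selection import SelectKBest, f_classif

def SelectKBestFeatures(features, labels, k, print_scores=False):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(features, labels)
    scores = zip(all_features[1:], selector.scores_)
    scores.sort(key=lambda pair: pair[1], reverse=True)
    if print_scores:
        print scores
    return dict(scores[:k])  # {feature_name: score} for the k best features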
min_max.append(value["exercised_stock_options"]) print min(min_max), max(min_max) ### the input features we want to use ### can be any key in the person-level dictionary (salary, director_fees, etc.) feature_1 = "salary" feature_2 = "exercised_stock_options" # feature_3 = 'total_payments' poi = "poi" features_list = [poi, feature_1, feature_2] data = featureFormat(data_dict, features_list) poi, finance_features = targetFeatureSplit(data) ### in the "clustering with 3 features" part of the mini-project, ### you'll want to change this line to ### for f1, f2, _ in finance_features: ### (as it's currently written, line below assumes 2 features) for f1, f2, f3 in finance_features: plt.scatter(f1, f2) plt.show() from sklearn.cluster import KMeans features_list = ["poi", feature_1, feature_2] data2 = featureFormat(data_dict, features_list)