def test_classifier(clf, dataset, feature_list, folds=1000):
    from sklearn.model_selection import StratifiedShuffleSplit

    data = feature_format(dataset, feature_list, sort_keys=True)
    labels, features = target_feature_split(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using the training set, and test on the test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        # tally the confusion-matrix cells across all folds
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = (true_negatives + false_negatives
                             + false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives
                                     + false_positives + false_negatives)
        # F-beta with beta = 2: weights recall more heavily than precision
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                        display_precision=5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true "
              "positive predictions.")
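
# A minimal usage sketch for test_classifier. GaussianNB and this feature
# list are illustrative assumptions, not part of the project code; any
# estimator with fit/predict and any feature list whose first entry is the
# target ("poi") would work here.
def _example_test_classifier(dataset):
    from sklearn.naive_bayes import GaussianNB
    # fewer folds than the default of 1000 keeps the example quick
    test_classifier(GaussianNB(), dataset, ["poi", "salary", "bonus"],
                    folds=100)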
def perform_k_fold_and_grid_search(data):
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import StratifiedKFold
    from sklearn.decomposition import PCA
    from sklearn.model_selection import GridSearchCV

    # split feature and target data
    labels_data, features_data = target_feature_split(data)
    features_train, labels_train, features_test, labels_test = [], [], [], []

    # split features and labels into train and test
    # (each iteration overwrites the previous split, so only the last of the
    # three folds is actually used below)
    skf = StratifiedKFold(n_splits=3)
    for train_index, test_index in skf.split(features_data, labels_data):
        features_train = [features_data[index] for index in train_index]
        labels_train = [labels_data[index] for index in train_index]
        features_test = [features_data[index] for index in test_index]
        labels_test = [labels_data[index] for index in test_index]

    # perform principal component analysis and transform features into components
    pca = PCA(n_components=2)
    pca.fit(features_train)
    pca_train, pca_test = pca.transform(features_train), pca.transform(
        features_test)

    # dictionary of params for the SVM
    parameters = {
        'kernel': ('linear', 'rbf'),
        'C': [1, 10, 1000],
        'gamma': [10, 1000]
    }
    _svm_ = SVC()

    # grid search will find the best params
    svm_classifier = GridSearchCV(_svm_, parameters)

    # SVM classifier for classification
    # principal components are used in place of the raw features
    svm_classifier.fit(pca_train, labels_train)
    print("best params:", svm_classifier.best_params_)

    labels_prediction = svm_classifier.predict(pca_test)
    print("accuracy score: ",
          accuracy_score(labels_test, labels_prediction) * 100, "%")
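
# Design note: GridSearchCV cross-validates internally, so the manual
# StratifiedKFold loop above is only needed to hold out a final test fold. A
# sketch of the same search driven entirely by GridSearchCV (cv=3 mirrors the
# n_splits=3 above; this smaller parameter grid is an illustrative
# assumption):
def _example_grid_search_only(features_data, labels_data):
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10, 1000]}
    search = GridSearchCV(SVC(), parameters, cv=3)
    search.fit(features_data, labels_data)
    return search.best_params_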
def main():
    import numpy as np

    raw_data = feature_format(dictionary, features_list, remove_any_zeroes=True)
    target, features = target_feature_split(raw_data)
    feature_train, feature_test, target_train, target_test = train_test_split(
        features, target, test_size=0.3, random_state=42)

    ml = MachineLearningAlgorithms()
    ml.perform_linear_regression()

    # SVM with the raw features
    ml.classify_svm(feature_train, target_train, feature_test, target_test)

    pca_train_, pca_test_ = ml.principal_component_analysis(
        feature_train, feature_test)
    # SVM with principal components
    ml.classify_svm(pca_train_, target_train, pca_test_, target_test)

    ml.kmeans_cluster(feature_train)

    # k-fold train/test splitting and grid search
    ml.perform_k_fold_and_grid_search(raw_data)

    # feature scaling
    print("rescaled: {}".format(
        ml.feature_rescale(np.array([50.0, 99.0, 22.3, 88.0]))))

    ml.text_classification()


if __name__ == "__main__":
    main()
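
# feature_rescale is not shown in this excerpt; below is a minimal sketch of
# what such a min/max rescaler typically does (an assumption mirroring
# sklearn's MinMaxScaler: map the smallest value to 0 and the largest to 1).
def feature_rescale_sketch(values):
    import numpy as np
    values = np.asarray(values, dtype=float)
    lo, hi = values.min(), values.max()
    if hi == lo:  # avoid a divide by zero on constant input
        return np.zeros_like(values)
    return (values - lo) / (hi - lo)

# e.g. [50.0, 99.0, 22.3, 88.0] -> approximately [0.3611, 1.0, 0.0, 0.8566]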
import pickle
import sys
sys.path.append("../tools/")
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from feature_format import feature_format, target_feature_split

dictionary = pickle.load(
    open("../final_project/final_project_dataset_modified.pkl", "rb"))

# list the features you want to look at -- the first item in the list will be
# the "target" feature
features_list = [
    "bonus",  # target
    "long_term_incentive"  # feature -- use salary, long_term_incentive and other features to compare scores
]
"""
long_term_incentive has a stronger relationship with bonus than salary does;
we find this by comparing r-squared scores while using each feature as the
input and bonus as the target
"""

data = feature_format(dictionary, features_list, remove_any_zeroes=True)
target, features = target_feature_split(data)
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

# we are trying to predict the bonus from a single input feature
# feature --> salary, long_term_incentive or any other feature --> input
# target --> bonus --> output
reg = LinearRegression()
reg.fit(feature_train, target_train)
target_prediction = reg.predict(feature_test)
intercept_prediction = reg.intercept_
slope_prediction = reg.coef_
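
# The comment block above compares r-squared scores between candidate
# features; reg.score computes r-squared directly, so the comparison for the
# current feature choice is a single call (printing it here is illustrative):
print("r-squared on test data:", reg.score(feature_test, target_test))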
# can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = "total_payments"
poi = "poi"

# add the 3rd feature to features_list and compare the results
# after adding the 3rd feature, a total of 4 data points exchange their
# positions in the plot
features_list = [poi, feature_1, feature_2]

# splitting dictionary to list
# list containing poi (target), feature_1, feature_2
data = feature_format(data_dict, features_list)

# splitting list to further lists
# separate poi (target) from feature_1, feature_2
poi, finance_features = target_feature_split(data)

# feature scaling
# after scaling --> 0...1
# run k-means with and without scaling and compare the results
# some of the data points will be assigned to a different cluster after
# rescaling; in this case, we may not need scaling, but when we are using
# salary and from_messages as features then scaling is critical
# finance_features = MinMaxScaler().fit_transform(finance_features)

exercised_stock_options_values = []
salary_values = []
# change to f1, f2, f3 --> for 3 features
for f1, f2 in finance_features:
    # collect the non-zero values of each feature (loop body completed from
    # the two list initializations above)
    if f1 != 0:
        salary_values.append(f1)
    if f2 != 0:
        exercised_stock_options_values.append(f2)
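
# A sketch of the clustering step the comments above describe: fit k-means on
# the two finance features and read back one cluster label per person.
# n_clusters=2 is an assumption (a two-group split of the data points).
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=42)
cluster_labels = kmeans.fit_predict(finance_features)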
#!/usr/bin/python
"""
Starter code for the evaluation mini-project.

Start by copying your trained/tested POI identifier from that which you built
in the validation mini-project. This is the second step toward building your
POI identifier!

Start by loading/formatting the data...
"""

import pickle
import sys
sys.path.append("../tools/")
from feature_format import feature_format, target_feature_split

with open("../final_project/final_project_dataset.pkl", "rb") as f:
    data_dict = pickle.load(f)

# add more features to features_list!
features_list = ["poi", "salary"]

data = feature_format(data_dict, features_list,
                      sort_keys='../tools/python2_lesson13_keys.pkl')
labels, features = target_feature_split(data)

# your code goes here
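
# One possible continuation (an assumption, not the official solution): hold
# out a test set, fit a decision tree POI identifier, and report the
# evaluation metrics this mini-project is about.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score

features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("precision:", precision_score(labels_test, pred))
print("recall:", recall_score(labels_test, pred))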