def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                        display_precision=5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
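# Hedged usage sketch (not part of the original tester): how test_classifier
# is typically invoked. `my_dataset` and the feature list here are
# assumptions; substitute your own cleaned data dict and features, with the
# "poi" label first.
from sklearn.naive_bayes import GaussianNB

example_clf = GaussianNB()
# test_classifier(example_clf, my_dataset, ["poi", "salary", "bonus"], folds=1000)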
def evaluation(clf, features_list, folds=1000):
    """Calculate the precision, recall and f1 of a classifier, using k-fold."""
    # note: relies on the module-level `my_dataset` dict rather than a parameter
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    target, features = targetFeatureSplit(data)
    precision = []
    recall = []
    f1 = []
    cv = StratifiedShuffleSplit(target, folds, random_state=42)
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(target[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(target[jj])
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        precision.append(precision_score(labels_test, predictions))
        recall.append(recall_score(labels_test, predictions))
        f1.append(f1_score(labels_test, predictions))
    print "Precision: ", round(np.mean(precision), 4)
    print "Recall:    ", round(np.mean(recall), 4)
    print "f1:        ", round(np.mean(f1), 4)
def main():
    keys_path = os.path.join(os.path.dirname(tools.__file__),
                             'python2_lesson14_keys.pkl')
    data_dict = load_pickle(
        os.path.join(os.path.dirname(final_project.__file__),
                     'final_project_dataset.pkl'))

    # first element is our labels, any added elements are predictor
    # features. Keep this the same for the mini-project, but you'll
    # have a different feature list when you do the final project.
    features_list = ["poi", "salary"]

    data = featureFormat(data_dict, features_list, sort_keys=keys_path)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # it's all yours from here forward!
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    print('Accuracy', clf.score(features_test, labels_test))

    predictions = clf.predict(features_test)
    conf_matrix = confusion_matrix(labels_test, predictions, labels=[True, False])
    print('Confusion matrix')
    print(conf_matrix)

    precision = precision_score(labels_test, predictions)
    print('Precision', precision)
    recall = recall_score(labels_test, predictions)
    print('Recall', recall)
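# Worked mini-example of reading that matrix (assumed numbers, not this
# dataset's actual output): with labels=[True, False], rows are the actual
# class [POI, non-POI] and columns the predicted class in the same order:
#   [[ 4  2]    4 true positives, 2 false negatives
#    [ 3 21]]   3 false positives, 21 true negatives
# giving precision 4 / (4 + 3) ~ 0.57 and recall 4 / (4 + 2) ~ 0.67.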
def main():
    # load in the dict of dicts containing all the data
    # on each person in the dataset
    file_path = os.path.join(os.path.dirname(final_project.__file__),
                             'final_project_dataset.pkl')
    data_dict = load_pickle(file_path)

    # there's an outlier--remove it!
    del data_dict["TOTAL"]

    # the input features we want to use
    # can be any key in the person-level dictionary
    # (salary, director_fees, etc.)
    feature_1 = "salary"
    feature_2 = "exercised_stock_options"
    # feature_3 = "total_payments"
    poi = "poi"
    features_list = [poi, feature_1, feature_2]  # , feature_3]
    data = featureFormat(data_dict, features_list)
    poi, finance_features = targetFeatureSplit(data)

    scaler = MinMaxScaler()
    finance_features = scaler.fit_transform(finance_features)
    print(scaler.transform([[2e5, 1e6]]))

    # in the "clustering with 3 features" part of the mini-project,
    # you'll want to change this line to
    # for f1, f2, _ in finance_features:
    # (as it's currently written, the line below assumes 2 features)
    for f1, f2 in finance_features:
        plt.scatter(f1, f2)
    plt.xlabel(feature_1)
    plt.ylabel(feature_2)
    plt.show()

    # cluster here; create predictions of the cluster labels
    # for the data and store them to a list called pred
    clustering = KMeans(n_clusters=2)
    clustering.fit(X=finance_features)
    pred = clustering.predict(finance_features)

    # rename the "name" parameter when you change the number of features
    # so that the figure gets saved to a different file
    try:
        draw(pred, finance_features, poi, mark_poi=False, name="clusters3.pdf",
             f1_name=feature_1, f2_name=feature_2)
    except NameError:
        print("no predictions object named pred found, no clusters to plot")
def test_classifier(clf, dataset, feature_list, folds=1000):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2,
                                        display_precision=5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                           false_positives, false_negatives,
                                           true_negatives)
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."
def prepare_data(input_data, features_list):
    """Prepare the features and target data needed by the classifier."""
    data_format = featureFormat(input_data, features_list)
    targets, features = targetFeatureSplit(data_format)

    # split with 3-fold cross validation; each pass through the loop
    # overwrites the previous split, so the final fold is what gets returned
    from sklearn.cross_validation import KFold
    kf = KFold(len(targets), 3)
    for train_indices, test_indices in kf:
        # make training and testing sets
        features_train = [features[ii] for ii in train_indices]
        features_test = [features[ii] for ii in test_indices]
        target_train = [targets[ii] for ii in train_indices]
        target_test = [targets[ii] for ii in test_indices]

    return features_train, features_test, target_train, target_test
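# Hedged alternative sketch: sklearn.cross_validation was removed in newer
# scikit-learn releases, so the same last-fold split can be written against
# the current model_selection API. A sketch under that assumption, not part
# of the original project code.
from sklearn.model_selection import KFold
from tools.feature_format import featureFormat, targetFeatureSplit

def prepare_data_modern(input_data, features_list):
    data_format = featureFormat(input_data, features_list)
    targets, features = targetFeatureSplit(data_format)
    for train_idx, test_idx in KFold(n_splits=3).split(features):
        # like the original, only the final fold's split survives the loop
        features_train = [features[ii] for ii in train_idx]
        features_test = [features[ii] for ii in test_idx]
        target_train = [targets[ii] for ii in train_idx]
        target_test = [targets[ii] for ii in test_idx]
    return features_train, features_test, target_train, target_test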
def get_k_best_features(data, features_list, k=10):
    # Setup the label and features
    data = featureFormat(data, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # Apply SelectKBest
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_

    # pair up with feature name, ignore the first one, since
    # that is the 'poi' label
    unsorted_pairs = zip(features_list[1:], scores)

    # Sort based on score
    sorted_pairs = list(sorted(unsorted_pairs, key=lambda x: x[1], reverse=True))
    return sorted_pairs
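# Hedged usage sketch (assumes the course's `my_dataset` dict, with 'poi'
# first in the feature list). Note that SelectKBest scores every candidate
# feature regardless of k; k only controls which features transform() keeps,
# so the returned list ranks all of them.
# best = get_k_best_features(my_dataset,
#                            ['poi', 'salary', 'bonus', 'exercised_stock_options'],
#                            k=3)
# for name, score in best:
#     print(name, round(score, 2))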
def main():
    keys_path = os.path.join(os.path.dirname(tools.__file__),
                             'python2_lesson13_keys.pkl')
    data_dict = load_pickle(
        os.path.join(os.path.dirname(final_project.__file__),
                     'final_project_dataset.pkl'))

    # first element is our labels, any added elements are predictor
    # features. Keep this the same for the mini-project, but you'll
    # have a different feature list when you do the final project.
    features_list = ["poi", "salary"]

    data = featureFormat(data_dict, features_list, sort_keys=keys_path)
    labels, features = targetFeatureSplit(data)
    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # it's all yours from here forward!
    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    print(clf.score(features_test, labels_test))
You fill in the regression code where indicated:
"""

import sys
import pickle
from tools.feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load(
    open("../final_project/final_project_dataset_modified.pkl", "r"))

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "b"

### Your regression goes here! (a hedged sketch follows this snippet)
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.

### draw the scatterplot, with color-coded training and testing points
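# One hedged way to fill in the regression marked above (a sketch, not the
# graded answer): an ordinary least-squares fit named `reg`, as the plotting
# code expects, trained on the training split only.
# from sklearn.linear_model import LinearRegression
# reg = LinearRegression()
# reg.fit(feature_train, target_train)
# print "slope:", reg.coef_, "intercept:", reg.intercept_
# print "test r^2:", reg.score(feature_test, target_test)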
import sys
import pickle
sys.path.append("../tools/")
from tools.feature_format import featureFormat, targetFeatureSplit
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split

data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r"))

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print accuracy_score(pred, labels_test)

### it's all yours from here forward!
salary = [min(salary), 200000.0, max(salary)]
salary = numpy.array([[e] for e in salary])
salary_scaler = MinMaxScaler()
rescaled_salary = salary_scaler.fit_transform(salary)
print "Rescaled salary:", rescaled_salary

# the input features we want to use
# can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = "total_payments"
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

# this is the "clustering with 3 features" version of the mini-project:
# features_list holds three features, so each row unpacks as f1, f2, _
# (only the first two are plotted)
for f1, f2, _ in finance_features:
    plt.scatter(f1, f2)
plt.show()

# cluster here; create predictions of the cluster labels
# for the data and store them to a list called pred
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300)
pred = kmeans.fit_predict(finance_features)
# FEATURE REMOVAL
for feature in ['loan_advances', 'total_payments']:
    features_list.remove(feature)

# Task 3: Create new feature(s)
new_features_list = [
    'poi',
    'shared_receipt_with_poi',
    'expenses',
    'from_this_person_to_poi',
    'from_poi_to_this_person',
]
new_data = featureFormat(data_dict, new_features_list)
new_labels, new_features = targetFeatureSplit(new_data)

# Store to my_dataset for easy export below.
my_dataset = data_dict

# Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Task 4: Try a variety of classifiers
# Please name your classifier clf for easy export below.
# Note that if you want to do PCA or other multi-stage operations,
# you'll need to use Pipelines (a hedged sketch follows below). For more info:
# http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# 1. Decision Tree Classifier
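# Hedged sketch of the Pipeline idea referenced above (an illustration, not
# the project's chosen model): chaining scaling, PCA, and a decision tree so
# all stages are fit together. Component count and tree parameters are
# arbitrary picks for the example.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

example_pipe = Pipeline([
    ('scale', MinMaxScaler()),     # finance features span wildly different ranges
    ('pca', PCA(n_components=2)),  # compress correlated payment features
    ('tree', DecisionTreeClassifier(min_samples_split=4)),
])
# example_pipe.fit(features, labels) would train all three stages in order.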
You fill in the regression code where indicated:
"""

import sys
import pickle
sys.path.append("../tools/")
from tools.feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load(
    open("../final_project/final_project_dataset_modified.pkl", "r"))

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
features_list = ['poi', 'salary', 'bonus', 'expenses',
                 'exercised_stock_options']  # You will need to use more features

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
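# Hedged sketch of the Task 5 tuning step (an illustration, not the submitted
# tuning): GaussianNB has little to tune, so this assumes you swap in a
# decision tree and grid-search a small parameter grid; the values below are
# arbitrary picks for the example.
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases

param_grid = {'min_samples_split': [2, 4, 8],
              'max_depth': [None, 3, 5]}
search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid,
                      scoring='f1')
# search.fit(features, labels); clf = search.best_estimator_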
print( "data[0:3] -> " ) # features_list = ["bonus", "salary"] not features_list = ["bonus", "long_term_incentive"] print(data[0:3]) # [[ 600000. 365788.] # [ 1200000. 267102.] # [ 350000. 170941.] # print(data[0:2]) # first two rows. two columns # print(data[0:2,0]) # first two rows, column 1 of 2 only - zero based indexing # print(data[0:2,1]) # first two rows, column 2 of 2 only - zero based indexing # print(data[0:2,0:1]) # first two rows, column 1 of 1 only - zero based indexing # print(data[0:2,0:2]) # first two rows, two columns - zero based indexing - this is a good example # target, features = targetFeatureSplit( data ) target, features = feature_format.targetFeatureSplit(data) ### training-testing split needed in regression, just like classification # from sklearn.cross_validation import train_test_split from sklearn.model_selection import train_test_split feature_train, feature_test, target_train, target_test = train_test_split( features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points. reg = linear_model.LinearRegression()
### load in the dict of dicts containing all the data on each person in the dataset
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r"))
### there's an outlier--remove it!
data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = "total_payments"
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

options = []
salary = []
for k, v in data_dict.iteritems():
    if v['exercised_stock_options'] != 'NaN':
        options.append(v['exercised_stock_options'])
    if v['salary'] != 'NaN':
        salary.append(v['salary'])

print 'maximum options: {} minimum options: {}'.format(max(options), min(options))
print 'maximum salary: {} minimum salary: {}'.format(max(salary), min(salary))
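# Hedged arithmetic check of what MinMaxScaler computes from a min/max pair
# like the ones printed above: x_rescaled = (x - x_min) / (x_max - x_min).
# The bounds below are assumed illustration values, not necessarily the
# min/max this run prints.
x, x_min, x_max = 200000.0, 477.0, 1111258.0
print 'rescaled example salary: {:.4f}'.format((x - x_min) / (x_max - x_min))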
def main():
    dictionary_path = os.path.join(os.path.dirname(final_project.__file__),
                                   "final_project_dataset_modified.pkl")
    with io.open(dictionary_path, 'rb') as f:
        dictionary = pickle.load(f)

    ### list the features you want to look at--first item in the
    ### list will be the "target" feature
    features_list = ["bonus", "salary"]
    keys_path = os.path.join(os.path.dirname(tools.__file__),
                             'python2_lesson06_keys.pkl')
    data = featureFormat(dictionary, features_list,
                         remove_any_zeroes=True, sort_keys=keys_path)
    target, features = targetFeatureSplit(data)

    ### training-testing split needed in regression, just like classification
    feature_train, feature_test, target_train, target_test = train_test_split(
        features, target, test_size=0.5, random_state=42)
    train_color = "b"
    test_color = "r"

    ### Your regression goes here!
    ### Please name it reg, so that the plotting code below picks it up and
    ### plots it correctly. Don't forget to change the test_color above from "b" to
    ### "r" to differentiate training points from test points.
    reg = LinearRegression()
    reg.fit(feature_train, target_train)
    print('test score', reg.score(feature_test, target_test))
    print('train score', reg.score(feature_train, target_train))
    print('coef', reg.coef_, 'intercept', reg.intercept_)

    ### draw the scatterplot, with color-coded training and testing points
    for feature, target in zip(feature_test, target_test):
        plt.scatter(feature, target, color=test_color)
    for feature, target in zip(feature_train, target_train):
        plt.scatter(feature, target, color=train_color)

    ### labels for the legend
    plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
    plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")

    ### draw the regression line, once it's coded
    plt.plot(feature_test, reg.predict(feature_test))

    # refit on the test set to see how much its points swing the slope,
    # then draw the second line in yellow for comparison
    reg.fit(feature_test, target_test)
    plt.plot(feature_train, reg.predict(feature_train), color="y")
    print('coef', reg.coef_, 'intercept', reg.intercept_)

    plt.xlabel(features_list[1])
    plt.ylabel(features_list[0])
    plt.legend()
    plt.show()