def preprocess(self, dataset):
    """Transform *dataset* and store the resulting features and labels.

    The dataset is passed through ``transform_data``; the returned dataset
    and feature list are stored on ``self.dataset`` and
    ``self.features_list``. Features and labels are then extracted with
    ``featureFormat`` / ``targetFeatureSplit`` and stored on ``self.X`` and
    ``self.y``. When ``self.filter_outliers`` is truthy,
    ``self.remove_outliers()`` is called afterwards.

    Args:
        dataset: Raw dataset, forwarded unchanged to ``transform_data``.
    """
    self.dataset, self.features_list = transform_data(dataset)

    # TODO: the next 4 lines are for the Udacity dataset only; remove later.
    # All transformation should be done in transform_data.
    data = featureFormat(self.dataset, self.features_list, sort_keys=True)
    y, X = targetFeatureSplit(data)
    # Plain lists are converted to numpy arrays; anything else is stored as is.
    self.X = np.array(X) if isinstance(X, list) else X
    self.y = np.array(y) if isinstance(y, list) else y

    if self.filter_outliers:
        self.remove_outliers()
from tester import dump_classifier_and_data
from tester import test_classifier
from outliers_clf import remove_outliers
from eda import transform_data
from agent import *

# Load the project dataset. Pickle streams are binary, so the file is opened
# in "rb" mode rather than text mode.
# NOTE: pickle.load can execute arbitrary code; only load a trusted file.
with open("../final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Task 1: Select what features you'll use.
### Task 3: Create new feature(s)
# Here just for evaluation; these steps have been moved to class methods or
# functions. Exploratory data analysis and feature creation are available
# at eda.py.
my_dataset, features_list = transform_data(data_dict)
data = featureFormat(my_dataset, features_list, sort_keys=True)
y, X = targetFeatureSplit(data)

### Task 2: Remove outliers
# Here just for evaluation; these steps have been moved to class methods or
# functions. The outlier removal classifier and supporting methods are
# available at outliers_clf.py.
X, y, outliers = remove_outliers(X, y)

# Build the key list once instead of re-materialising the dict items for
# every outlier index.
# NOTE(review): names are looked up by position in data_dict's iteration
# order, while the rows were produced with sort_keys=True -- confirm that the
# indices returned by remove_outliers line up with this ordering.
record_names = list(data_dict)
print("Outliers: {}".format([record_names[idx] for idx in outliers]))

### Task 4: Try a variety of classifiers
### Task 5: Tune your classifier to achieve better than .3 precision and recall
# Showing a few options for evaluation purposes.
# More were tested at strategies.py; results are available at all_results.txt
# and selected_results.txt.
# Classes are available at agent.py.