Example #1
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from eda import transform_data


def preprocess(self, dataset):
    self.dataset, self.features_list = transform_data(dataset)
    # The next four lines are specific to the Udacity dataset and should be
    # removed later; all transformation belongs in transform_data.
    data = featureFormat(self.dataset, self.features_list, sort_keys=True)
    y, X = targetFeatureSplit(data)
    self.X = np.array(X) if isinstance(X, list) else X
    self.y = np.array(y) if isinstance(y, list) else y
    if self.filter_outliers:
        self.remove_outliers()
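
# For reference, a simplified sketch of what the course helpers above do, assuming
# the usual feature_format conventions: featureFormat flattens the dict into rows of
# feature values, and targetFeatureSplit peels off the first column ("poi") as the
# target. This is only an illustration, not the course implementation.
def target_feature_split_sketch(data):
    target, features = [], []
    for row in data:
        target.append(row[0])     # first column is the label
        features.append(row[1:])  # remaining columns are the features
    return target, features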
Example #2
import pickle

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from tester import test_classifier

from outliers_clf import remove_outliers
from eda import transform_data
from agent import *

with open("../final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Task 1: Select what features you'll use.
### Task 3: Create new feature(s)
# Shown here just for evaluation; these steps have been moved to class methods or functions.
# Exploratory data analysis and feature creation are available in eda.py.

my_dataset, features_list = transform_data(data_dict)
data = featureFormat(my_dataset, features_list, sort_keys=True)
y, X = targetFeatureSplit(data)
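
# Illustrative sketch only: the actual feature engineering lives in eda.transform_data.
# A feature commonly engineered for this dataset is the fraction of a person's sent
# messages that went to persons of interest; the field names below are real dataset
# keys, but the returned feature list is a stand-in, not the project's final selection.
def transform_data_sketch(data_dict):
    features_list = ["poi", "salary", "bonus", "fraction_to_poi"]
    for person in data_dict.values():
        sent_to_poi = person.get("from_this_person_to_poi", "NaN")
        sent_total = person.get("from_messages", "NaN")
        if "NaN" in (sent_to_poi, sent_total) or sent_total == 0:
            person["fraction_to_poi"] = 0.0
        else:
            person["fraction_to_poi"] = float(sent_to_poi) / sent_total
    return data_dict, features_list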

### Task 2: Remove outliers
# Shown here just for evaluation; these steps have been moved to class methods or functions.
# The outlier-removal classifier and supporting methods are available in outliers_clf.py.

X, y, outliers = remove_outliers(X, y)
print "Outliers: {}".format(map(lambda x:data_dict.items()[x][0], outliers))

### Task 4: Try a variety of classifiers
### Task 5: Tune your classifier to achieve better than .3 precision and recall
# Showing a few options for evaluation purposes.
# More were tested in strategies.py; results are available in all_results.txt and selected_results.txt.
# Classes are available in agent.py.
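
# One hedged sketch of what Tasks 4 and 5 look like in this script: grid-search a
# decision tree (an illustrative choice, not necessarily the strategy selected in
# strategies.py) and then evaluate and export with the tester helpers imported above.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_split": [2, 5, 10],
    "max_depth": [None, 3, 5],
}
cv = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid,
                      scoring="f1", cv=cv)
search.fit(X, y)
clf = search.best_estimator_

test_classifier(clf, my_dataset, features_list)           # course evaluation harness
dump_classifier_and_data(clf, my_dataset, features_list)  # export for grading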