コード例 #1
0
# Remove outliers: these three keys are passed to the project helper, which
# presumably deletes them from data_dict in place (its return value is unused
# here) -- confirm in enron_functions.
outlier_keys = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E']
enron_functions.remove_outliers(data_dict, outlier_keys)
# Report how many records remain. A parenthesised single-argument print
# produces the same output on Python 2 and is valid syntax on Python 3.
print(len(data_dict))
# Instantiate copies of the dataset and feature list for grading purposes.
# NOTE(review): if `copy` is the shallow copy.copy, the per-record values are
# still shared with data_dict, so later in-place changes to records in
# my_dataset would also be visible through data_dict -- confirm whether a
# deep copy is intended.
my_dataset = copy(data_dict)
my_feature_list = copy(features_list)

# Get the K best features.
num_features = 4 #2 for KN
best_features = enron_functions.get_k_best(my_dataset, my_feature_list, num_features)
# Rebuild the feature list with the target label first, followed by the
# selected feature names. list(...) is required because dict.keys() is a view
# on Python 3 and cannot be concatenated to a list; on Python 2 the result is
# unchanged.
my_feature_list = [target_label] + list(best_features.keys())

# Add two new features. Both helpers are called only for their side effects
# (return values are unused); presumably they add the feature to the records
# in my_dataset and register its name in my_feature_list -- confirm in
# enron_functions.
enron_functions.add_financial_sum(my_dataset, my_feature_list)
enron_functions.add_poi_interaction_fraction(my_dataset, my_feature_list)
#enron_functions.visualize(data_dict, 'total_stock_value', 'poi_interaction')
# Print the features. The first list entry is the target label, so it is
# excluded from both the count and the printed names.
print("{0} selected features: {1}\n".format(len(my_feature_list) - 1, my_feature_list[1:]))

# Extract the features named in my_feature_list from my_dataset.
data = featureFormat(my_dataset, my_feature_list)

# Split into labels and features. This line assumes that the first
# feature in the array is the label, which is why "poi" must always
# be first in the feature list.
labels, features = targetFeatureSplit(data)

# Scale features via min-max. Only the scaler object is created here;
# fitting it and transforming `features` are not part of these lines.
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
# Run K-best selection again on the current feature list and rebuild the list
# with the target label first. list(...) is required because dict.keys() is a
# view on Python 3 and cannot be concatenated to a list; on Python 2 the
# result is unchanged.
best_features = enron_functions.get_k_best(my_dataset, my_feature_list, num_features)
my_feature_list = [target_label] + list(best_features.keys())


# The bare string literal below is disabled code (a NaN -> 0 conversion for the
# selected features); as an expression statement it has no effect at runtime.
'''# Convert NAN to 0 in selected features
for name, item in my_dataset.items():
    for key in my_feature_list:
        if item[key] == 'NaN':
            my_dataset[name][key] = 0
            #print my_dataset[name]
 '''


# Add new features: in this pass only the POI-interaction fraction is added;
# the financial-sum feature is intentionally left disabled. The helper is
# called only for its side effects (return value unused); presumably it
# updates my_dataset and my_feature_list in place -- confirm in
# enron_functions.
#enron_functions.add_financial_sum(my_dataset, my_feature_list)
enron_functions.add_poi_interaction_fraction(my_dataset, my_feature_list) # Adding only this feature
#enron_functions.visualize(data_dict, 'total_stock_value', 'poi_interaction')
# Print the features. The first list entry is the target label, so it is
# excluded from both the count and the printed names.
print("{0} selected features: {1}\n".format(len(my_feature_list) - 1, my_feature_list[1:]))

# Extract the features named in my_feature_list from my_dataset.
data = featureFormat(my_dataset, my_feature_list)

# Split into labels and features. This line assumes that the first
# feature in the array is the label, which is why "poi" must always
# be first in the feature list.
labels, features = targetFeatureSplit(data)

# Scale features via min-max. Only the scaler object is created here;
# fitting it and transforming `features` are not part of these lines.
# NOTE(review): this import repeats an identical one earlier in the file.
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()