コード例 #1
0
def ensemble(data_set, features_list):
    """Build an unfitted bagging classifier and pickle it with its inputs.

    The number of estimators is read from ``sys.argv[1]``. Three pickle files
    are written to the current working directory: ``classifier_ada.pkl`` (the
    classifier), ``features_list.pkl`` and ``dataset.pkl``.

    Args:
        data_set: Dataset handed to ``featuresProcess`` and pickled to
            ``dataset.pkl``.
        features_list: Feature names handed to ``featuresProcess`` and pickled
            to ``features_list.pkl``.

    Raises:
        IndexError: If ``sys.argv[1]`` is missing.
        ValueError: If ``sys.argv[1]`` cannot be parsed as an integer.
    """
    from data_sorter import featuresProcess
    from data_sorter import split

    # NOTE(review): labels/features are never used below -- the classifier is
    # pickled without being fitted, presumably so a separate script can train
    # and evaluate it. The calls are kept in case the helpers validate input.
    data = featuresProcess(data_set, features_list)
    labels, features = split(data)

    from sklearn.ensemble import BaggingClassifier
    clf = BaggingClassifier(n_estimators=int(sys.argv[1]),
                            random_state=202,
                            bootstrap=True)

    # NOTE(review): the "classifier_ada.pkl" name is kept although this is a
    # bagging model; whatever loads the pickle presumably expects this path.
    outputs = (
        (clf, "classifier_ada.pkl"),
        (features_list, "features_list.pkl"),
        (data_set, "dataset.pkl"),
    )
    for obj, path in outputs:
        # Pickle data is a byte stream: write in binary mode, and let the
        # context manager close the handle even if dumping fails.
        with open(path, "wb") as out_file:
            pickle.dump(obj, out_file)
コード例 #2
0
def ensemble(data_set, features_list):
    """Build an unfitted AdaBoost classifier and pickle it with its inputs.

    The number of estimators is read from ``sys.argv[1]`` and the learning
    rate from ``sys.argv[2]``. Three pickle files are written to the current
    working directory: ``classifier_ada.pkl`` (the classifier),
    ``features_list.pkl`` and ``dataset.pkl``.

    Args:
        data_set: Dataset; printed, handed to ``featuresProcess`` and pickled
            to ``dataset.pkl``.
        features_list: Feature names handed to ``featuresProcess`` and pickled
            to ``features_list.pkl``.

    Raises:
        IndexError: If ``sys.argv[1]`` or ``sys.argv[2]`` is missing.
        ValueError: If either argument cannot be parsed as a number.
    """
    # Parenthesised single-argument form prints the same text under Python 2
    # and is also valid Python 3.
    print(data_set)

    from data_sorter import featuresProcess
    from data_sorter import split

    # NOTE(review): labels/features are never used below -- the classifier is
    # pickled without being fitted, presumably so a separate script can train
    # and evaluate it. The calls are kept in case the helpers validate input.
    data = featuresProcess(data_set, features_list)
    labels, features = split(data)

    from sklearn.ensemble import AdaBoostClassifier
    clf = AdaBoostClassifier(n_estimators=int(sys.argv[1]),
                             random_state=202,
                             learning_rate=float(sys.argv[2]),
                             algorithm="SAMME.R")

    outputs = (
        (clf, "classifier_ada.pkl"),
        (features_list, "features_list.pkl"),
        (data_set, "dataset.pkl"),
    )
    for obj, path in outputs:
        # Pickle data is a byte stream: write in binary mode, and let the
        # context manager close the handle even if dumping fails.
        with open(path, "wb") as out_file:
            pickle.dump(obj, out_file)
コード例 #3
0
email_to_poi = get_information("from_this_person_to_poi", "from_messages", data)

count = 0
for ii in data:
  data[ii]["email_from_poi"] = email_from_poi[count]
  data[ii]["email_to_poi"] = email_to_poi[count]
  count += 1




features_list = ["poi", "salary", "bonus", "email_from_poi", "email_to_poi",'deferral_payments', 'total_payments']

dataset = featuresProcess(data, features_list)

labels, features = split(dataset)
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=42)

time_start = time.time()

dtree = DecisionTreeClassifier()
dtree.fit(features_train, labels_train)
score = dtree.score(features_test, labels_test)
print "Accuracy ", score

print "Decesion tree took time : ", time.time() - time_start

feat_ranks = dtree.feature_importances_
indices = np.argsort(feat_ranks)[::-1]
for ii in range(5):
  print "{} feature {} ({})".format(ii+1, features_list[ii + 1], feat_ranks[indices[ii]])