-
Notifications
You must be signed in to change notification settings - Fork 0
/
poi_id.py
52 lines (44 loc) · 1.86 KB
/
poi_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/python
import pickle
from tester import dump_classifier_and_data
from tools.prepareData import prepare_data
from tools.customTransformers import ImputeToValue, LogTransform, MinMaxNA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# load data
# The dataset is opened in binary mode: a pickle is a byte stream, and
# pickle.load on Python 3 fails on a text-mode handle. Binary mode is
# equally valid on Python 2, so this works on both interpreters.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# dataset file that comes from a trusted source.
with open("./data/final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)
# preprocess
# prepare_data is a project helper; its result is handed unchanged to
# dump_classifier_and_data at the end of the script.
df = prepare_data(data_dict)

# Final classifier pipeline. Steps run in the listed order: the three custom
# transformers, univariate feature selection (15 best features), PCA down to
# 8 components, then an L1-penalised, class-balanced logistic regression.
steps_logit = [
    ('impute nans', ImputeToValue()),
    ('log transforming', LogTransform(True)),
    ('minmax scaling', MinMaxNA()),
    ('kbest', SelectKBest(k=15)),
    ('pca', PCA(n_components=8)),
    # solver is pinned to 'liblinear': the l1 penalty is not supported by
    # 'lbfgs', the default solver since scikit-learn 0.22. 'liblinear' was the
    # default in earlier releases, so results there are unchanged.
    ('classify', LogisticRegression(C=1, class_weight='balanced',
                                    penalty='l1', solver='liblinear')),
]
clf = Pipeline(steps_logit)
'''
'''
# Tested pipelines that were not chosen as the final classifier
'''
from sklearn.linear_model import SGDClassifier
steps_sgd = [('impute nans', ImputeToValue()),
('log transforming', LogTransform(True)),
('minmax scaling', MinMaxNA()),
('kbest', SelectKBest(k=16)),
('pca', PCA(n_components=15)),
('classify', SGDClassifier(alpha=0.01, class_weight='balanced', loss='log', penalty='l1', n_iter=6))]
clf = Pipeline(steps_sgd)
from sklearn.ensemble import AdaBoostClassifier
steps_adaboost = [('impute nans', ImputeToValue()),
('log transforming', LogTransform(True)),
('minmax scaling', MinMaxNA()),
('kbest', SelectKBest(k=3)),
('pca', PCA(n_components=None)),
('classify', AdaBoostClassifier(n_estimators=160))]
clf = Pipeline(steps_adaboost)
'''
# save the classifier pipeline and preprocessed data
# Delegates to the project's tester.dump_classifier_and_data helper. The
# pipeline is passed as constructed -- nothing in this script calls fit on it.
dump_classifier_and_data(clf, df)