#!/usr/bin/python import pickle from tester import dump_classifier_and_data from tools.prepareData import prepare_data from tools.customTransformers import ImputeToValue, LogTransform, MinMaxNA from sklearn.pipeline import Pipeline from sklearn.feature_selection import SelectKBest from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression # load data with open("./data/final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) # preprocess df = prepare_data(data_dict) steps_logit = [('impute nans', ImputeToValue()), ('log transforming', LogTransform(True)), ('minmax scaling', MinMaxNA()), ('kbest', SelectKBest(k=15)), ('pca', PCA(n_components=8)), ('classify', LogisticRegression(C=1, class_weight='balanced', penalty='l1'))] clf = Pipeline(steps_logit) ''' ''' # Tested pipelines that were npt chosen as the final classifier ''' from sklearn.linear_model import SGDClassifier
def main(): # load data with open("./data/final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) # preprocess df = prepare_data(data_dict) # split into training and test set cv = StratifiedShuffleSplit(df['poi'], 100) ''' For each classifier two pipelines were created. That was necessary because without knowing the number of features after applying SelectKBest the maximum of n_components for pca cannot be set. Thus in a first grid search the n_components parameter of the PCA is set to None to include all components, while the other parameters are optimized. In the second following grid search the parameters found in the first run were set and the n_components parameter optimized to this set of parameters. ''' # create classifiers and parameter grids sgd = {'classifier': SGDClassifier(n_jobs=4), 'parameters': {'kbest__k': np.arange(1, 20, 1).tolist(), 'pca__n_components': [None], 'classify__class_weight': ['balanced', None], 'classify__loss': ['log', 'hinge'], 'classify__penalty': ['l2', 'l1', 'elasticnet', 'none'], 'classify__alpha': [0.0001, 0.001, 0.01, 0.1]}} sgd_pca = {'classifier': SGDClassifier(n_jobs=4), 'parameters': {'kbest__k': [16], 'pca__n_components': [None] + np.arange(1, 16, 1).tolist(), 'classify__class_weight': ['balanced'], 'classify__loss': ['log'], 'classify__penalty': ['l1'], 'classify__alpha': [0.001, 0.01, 0.1]}} ada = {'classifier': AdaBoostClassifier(), 'parameters': {'kbest__k': np.arange(1, 20, 1).tolist(), 'pca__n_components': [None], 'classify__n_estimators': np.arange(20, 200, 20).tolist(), 'classify__base_estimator': [DecisionTreeClassifier(), RandomForestClassifier()]}} ada_pca = {'classifier': AdaBoostClassifier(), 'parameters': {'kbest__k': [3], 'pca__n_components': [None] + np.arange(1, 3, 1).tolist(), 'classify__n_estimators': [160, 180, 200], 'classify__base_estimator': [DecisionTreeClassifier()]}} logit = {'classifier': LogisticRegression(), 'parameters': {'kbest__k': np.arange(1, 20, 1).tolist(), 'pca__n_components': [None], 'classify__class_weight': ['balanced', None], 'classify__penalty': ['l2', 'l1'], 'classify__C': [0.001, 0.01, 0.1, 1, 10]}} logit_pca = {'classifier': LogisticRegression(), 'parameters': {'kbest__k': [15], 'pca__n_components': [None] + np.arange(1, 15, 1).tolist(), 'classify__class_weight': ['balanced'], 'classify__penalty': ['l1'], 'classify__C': [0.1, 1, 10]}} # list of classifiers to optimize classifiers = {'sgd_finalpca': sgd_pca, 'ada_finalpca': ada_pca, 'logit_finalpca': logit_pca} # split features and labels features = df.drop(['poi'], 1) labels = df['poi'] # apply a gridsearch for each classifier for c in classifiers: print(c) steps = [('impute nans', ImputeToValue()), ('log transforming', LogTransform(True)), ('minmax scaling', MinMaxNA()), ('kbest', SelectKBest()), ('pca', PCA()), ('classify', classifiers[c]['classifier'])] pipeline = Pipeline(steps) clf = GridSearchCV(pipeline, classifiers[c]['parameters'], scoring='precision', cv=cv, n_jobs=4) clf.fit(features, labels) print('best') print(clf.best_estimator_) print(clf.best_score_) clf_outfile = './data/clf_' + c + '.pkl' with open(clf_outfile, 'wb') as f: pickle.dump(clf, f)
#!/usr/bin/python import pickle from tester import dump_classifier_and_data from tools.prepareData import prepare_data from tools.customTransformers import ImputeToValue, LogTransform, MinMaxNA from sklearn.pipeline import Pipeline from sklearn.feature_selection import SelectKBest from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression # load data with open("./data/final_project_dataset.pkl", "r") as data_file: data_dict = pickle.load(data_file) # preprocess df = prepare_data(data_dict) steps_logit = [('impute nans', ImputeToValue()), ('log transforming', LogTransform(True)), ('minmax scaling', MinMaxNA()), ('kbest', SelectKBest(k=15)), ('pca', PCA(n_components=8)), ('classify', LogisticRegression(C=1, class_weight='balanced', penalty='l1'))] clf = Pipeline(steps_logit) ''' ''' # Tested pipelines that were npt chosen as the final classifier ''' from sklearn.linear_model import SGDClassifier steps_sgd = [('impute nans', ImputeToValue()),