""" Implementation and general notes: • The parameters min_samples_split and min_impurity_decrease were difficult to analyse. Seems like smaller values are preferred • The model often chose different hyperparemeters between runs. It seems inconsistent """ from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from utils import split_feats_targs, capture_features, capture_targets, export_results (train_features, train_targets) = split_feats_targs('train_2.csv') # pass training set with targets (val_features, val_targets) = split_feats_targs('val_2.csv') # pass validation set test_features = capture_features('test_no_label_2.csv', False) # pass test set without targets actual_targets = capture_targets('test_with_label_2.csv') # pass test set with targets """ Parameter options to tune: • splitting criterion: gini and entropy • maximum depth of the tree: 10 and no maximum • minimum number of samples to split an internal node: experiment with values of your choice • minimum impurity decrease: experiment with values of your choice • class weight: None and balanced """ print("Finding best hyperparameters for DT....") best_dt = GridSearchCV(DecisionTreeClassifier(), { 'criterion': ['gini', 'entropy'], 'max_depth': [10, None], 'min_samples_split': [2,3,5], 'min_impurity_decrease': [0.0, 1e-250, 1e-900], 'class_weight': [None, 'balanced']
from sklearn.neural_network import MLPClassifier
from utils import split_feats_targs, capture_features, capture_targets, export_results

(train_features, train_targets) = split_feats_targs('train_1.csv')  # pass training set with targets
test_features = capture_features('test_no_label_1.csv', False)  # pass test set without targets
actual_targets = capture_targets('test_with_label_1.csv')  # pass test set with targets

fitted_mlp = MLPClassifier(activation='logistic', solver='sgd').fit(train_features, train_targets)  # fits model with training set values
predicted_targets = list(fitted_mlp.predict(test_features))  # gets predictions from model and records them
export_results(actual_targets, predicted_targets, 'Base-MLP-DS1.csv')
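# A quick sanity check that could be added alongside export_results (a sketch, not
# part of the original script): score the predictions against the true test labels
# with scikit-learn's built-in metrics.
from sklearn.metrics import accuracy_score, f1_score

print("Base-MLP accuracy:", accuracy_score(actual_targets, predicted_targets))
print("Base-MLP macro F1:", f1_score(actual_targets, predicted_targets, average='macro'))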
INTENDED FOR DEMO

Running this file will run every single estimator for the given training and test set.
File inputs are required wherever <file_type> is seen.
"""
from utils import split_feats_targs, capture_features, capture_targets, export_results
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

""" Store the necessary training/test set values into variables """
(train_features, train_targets) = split_feats_targs('<demo_training_set_file_name>')
test_features = capture_features('<demo_test_set_file_name>', False)  # pass False if test set has no targets, otherwise pass True
actual_targets = capture_targets('<demo_test_set_w_targets_file_name>')

""" Run GNB model """
fitted_gnb = GaussianNB().fit(train_features, train_targets)  # fit model with training set values
predicted_targets = list(fitted_gnb.predict(test_features))  # get predictions from model and record them
export_results(actual_targets, predicted_targets, '<demo_output_file_name>')

""" Run PER model """
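# A sketch of how the PER block presumably continues, mirroring the GNB block above
# (not part of the original file; the placeholder output file name is hypothetical):
fitted_per = Perceptron().fit(train_features, train_targets)  # fit the Perceptron with default hyperparameters
predicted_targets = list(fitted_per.predict(test_features))  # record its test-set predictions
export_results(actual_targets, predicted_targets, '<demo_output_file_name>')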