def plotDifferentTrainingSetSingleTestSetNZ(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot ROC curves of RF models trained on single nz years (2012-2016),
    each evaluated against the same nz 2017 test subset."""
    print('plotDifferentTrainingSetSingleTestSetNZ')
    prefix = 'nz'
    # the common, unfiltered nz 2017 test set shared by all configurations
    options_testing = DatasetOptions({
        'dir_data': dirData,
        'data_prefix': prefix,
        'dataset': '2017',
        'options_filtering': None
    })
    names = []
    analyzers = []
    for training_year in [2012, 2013, 2014, 2015, 2016]:
        print(training_year)
        options_training = DatasetOptions({
            'dir_data': dirData,
            'data_prefix': prefix,
            'dataset': str(training_year),
            'options_filtering': None
        })
        options_rf = OptionsRF(dirModelsBase, options_training.getFilenameOptions(filteroptions=True))
        results_test = Results(dirResultsBase, options_training, options_rf, 'test', options_testing)
        names.append(str(training_year))
        analyzers.append(ResultsSingleConfigAnalyzer(results_test, 10))
    title_plot = 'classifier (rf): trained on subsets of nz 2012-2016, tested on subset of nz 2017'
    filename_plot = dirPlotsBase + 'rf_training_nz_years_20122016_testing_nz_year_2017.png'
    print('plot ROC curve...')
    results_analyzer.plotROCcurveMulitpleConfigs(analyzers, names, f_plot=filename_plot, titlePlot=title_plot)
def main(dict_dataset_options):
    """Run the final NZ preprocessing stages (feature encoding + fusion)
    for the dataset described by *dict_dataset_options*."""
    dataset_options = DatasetOptions(dict_dataset_options)
    preprocessor = PreprocessorNZ(dataset_options)
    print('grouping: ' + str(dataset_options.getGroupingName()))
    # Earlier pipeline stages, intentionally disabled here:
    # preproc.processDischargeFile();
    # preproc.processDiagnosisFile();
    # preproc.createFeatureSet();
    preprocessor.encodeFeatures()
    preprocessor.fuse()
def run_deep(flags_obj):
    """Run Wide-Deep training and eval loop.

    Args:
      flags_obj: An object containing parsed flag values.
    """
    data_prefix = 'nz'

    def fusion_options(dataset):
        # shared option template; only the dataset years differ
        return DatasetOptions({
            'dir_data': DIRPROJECT + 'data/',
            'data_prefix': data_prefix,
            'dataset': dataset,
            'encoding': 'embedding',
            'newfeatures': None,
            'featurereduction': {'method': 'FUSION'},
            'grouping': 'verylightgrouping'
        })

    dataset_options_train = fusion_options('20012011')
    dataset_options_eval = fusion_options('2013')
    # dataset_options_eval = None;

    if data_prefix == 'nz':
        feature_columns = FeatureColumnsNZFusion(dataset_options=dataset_options_train)
    elif data_prefix == 'patrec':
        feature_columns = FeatureColumnsPatrecFusion(dataset_options=dataset_options_train)
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': dataset_options_eval,
        'test': None
    }
    nn = NeuralNetModel('train', dict_dataset_options, feature_columns, flags_obj)
    nn.train()
def run_deep(flags_obj):
    """Run Wide-Deep training and eval loop.

    Args:
      flags_obj: An object containing parsed flag values.
    """
    data_prefix = 'nz'
    dataset_options_train = DatasetOptions({
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': data_prefix,
        'dataset': '20122016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    })
    # only nz data has autoencoder feature columns
    if data_prefix != 'nz':
        print('unknown data prefix..exit')
        sys.exit()
    feature_columns = FeatureColumnsAutoEncoderNZ(dataset_options=dataset_options_train)

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': None,  # no eval set for the autoencoder run
        'test': None
    }
    nn = AutoEncoderModel('train', dict_dataset_options, feature_columns, flags_obj)
    nn.train()
def plotOneTrainingSetDifferentTestSets(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot ROC curves of one RF model (patrec 2012-2015) evaluated on different
    test subsets of patrec 2016-2017, split by EntlassBereich.

    The original code repeated the test-set construction four times verbatim;
    this version builds the subsets in a single loop. `None` filtering means
    the full, unfiltered test set.
    """
    data_prefix = 'patrec'
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': data_prefix,
        'dataset': '20122015',
        'options_filtering': None
    }
    options_training = DatasetOptions(dict_options_dataset_training)
    # compare different subsets of data: EntlassBereich (only with RandomForest)
    options_rf = OptionsRF(dirModelsBase, options_training.getFilenameOptions(filteroptions=True))

    subsets = [
        ('All', None),
        ('Med', 'EntlassBereich_Med'),
        ('SaO', 'EntlassBereich_SaO'),
        ('Gyn', 'EntlassBereich_Gyn'),
    ]
    analyzer = []
    names = []
    for name, filtering in subsets:
        options_testing = DatasetOptions({
            'dir_data': dirData,
            'data_prefix': data_prefix,
            'dataset': '20162017',
            'options_filtering': filtering
        })
        results_test = Results(dirResultsBase, options_training, options_rf, 'test', options_testing)
        analyzer.append(ResultsSingleConfigAnalyzer(results_test, 10))
        names.append(name)

    title_plot = 'classifier (rf): trained on patrec 2012-2015, tested on subsets of patrec 2016-2017'
    filename_plot = dirPlotsBase + 'rf_training_all_testing_EntlassBereich.png'
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, f_plot=filename_plot, titlePlot=title_plot)
def plotSingleConfiguration(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot the ROC curve for one RF configuration:
    trained on patrec 2012-2015, tested on patrec 2016-2017."""
    options_training = DatasetOptions({
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015'
    })
    options_testing = DatasetOptions({
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017'
    })
    options_rf = OptionsRF(dirModelsBase, options_training.getFilenameOptions(filteroptions=True))
    results_test_runs = Results(dirResultsBase, options_training, options_rf, 'test', options_testing)
    single_config_analyzer = ResultsSingleConfigAnalyzer(results_test_runs, 10)
    results_analyzer.plotROCcurveSingleConfig(single_config_analyzer, 'rf')
def plotSGDClassifierPerformance(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot ROC curves of the batch-based SGD logistic regression,
    trained on single nz years 2012-2015 and tested on nz 2016."""
    def nz_options(dataset):
        # all configurations share these options; only the year differs
        return DatasetOptions({
            'dir_data': dirData,
            'data_prefix': 'nz',
            'dataset': dataset,
            'encoding': 'categorical',
            'newfeatures': {'names': constantsNZ.NEW_FEATURES},
            'featurereduction': None,
            'grouping': 'grouping'
        })

    options_dataset_testing = nz_options('2016')
    analyzer = []
    for year in [2012, 2013, 2014, 2015]:
        options_dataset_training = nz_options(str(year))
        options_sgd = OptionsSGD(dirModelsBase,
                                 options_dataset_training.getFilenameOptions(filteroptions=True),
                                 options_clf={'loss': 'log', 'penalty': 'l1'})
        results_year = Results(dirResultsBase, options_dataset_training, options_sgd, 'test',
                               options_dataset_testing)
        # single run per configuration for the SGD classifier
        analyzer.append(ResultsSingleConfigAnalyzer(results_year, 1))

    names = ['2012', '2013', '2014', '2015']
    title_plot = 'performance of batch-based logistic regression'
    filename_plot = dirPlotsBase + 'sgd_nz_performance_years_training20122015_test2016.png'
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, f_plot=filename_plot, titlePlot=title_plot)
def run_deep(flags_obj):
    """Run Wide-Deep training and eval loop.

    Args:
      flags_obj: An object containing parsed flag values.
    """
    dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
    # dirProject = "Z:\\projects\\PATREC"
    dirData = os.path.join(dirProject, 'data')
    data_prefix = 'patrec'
    dataset_options_train = DatasetOptions({
        'dir_data': dirData,
        'data_prefix': data_prefix,
        'dataset': '20122015',
        'grouping': 'verylightgrouping',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': None,
        'filtering': 'oncology',
        'balanced': False,
        'resample': True
    })
    if data_prefix == 'nz':
        feature_columns = FeatureColumnsNZ(dataset_options=dataset_options_train)
    elif data_prefix == 'patrec':
        feature_columns = FeatureColumnsPatrec(dataset_options=dataset_options_train)
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': None,  # no eval set configured for this run
        'test': None
    }
    nn = NeuralNetModel('train', dict_dataset_options, feature_columns, flags_obj)
    print(flags_obj.log_dir)
    nn.train()
def _write_df_in_chunks(df, filename, chunk_size=10000):
    """Write *df* to *filename* as CSV in row chunks of *chunk_size*.

    The first chunk creates the file (with header); the remaining chunks are
    appended without header, keeping the CSV writer's peak memory low.
    NOTE(review): `line_terminator` was renamed to `lineterminator` in
    pandas 1.5 and removed in 2.0 — kept as-is to match the pandas version
    this project appears to pin; verify before upgrading.
    """
    chunks = [df[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]
    chunks[0].to_csv(filename, index=False, line_terminator='\n')
    for chunk in chunks[1:]:
        chunk.to_csv(filename, index=False, line_terminator='\n', header=False, mode='a')


def encode(flags_obj):
    """Run the trained autoencoder over nz 2017 data and export the main and
    secondary diagnosis encodings to CSV files next to the dataset.

    Args:
      flags_obj: An object containing parsed flag values.

    Changes vs. the original: the duplicated chunked-CSV-write code is factored
    into `_write_df_in_chunks`; a dead first assignment of `num_encoded_dim`
    (immediately overwritten) and three unused getter results
    (demographic features, feature-set string, encoding scheme) were removed.
    """
    dict_data_training = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20012016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dataset_options_training = DatasetOptions(dict_data_training)
    dict_data_encoding = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '2017',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dataset_options_encoding = DatasetOptions(dict_data_encoding)

    feature_columns = FeatureColumnsAutoEncoderNZ(dataset_options=dataset_options_encoding)
    dict_dataset_options = {
        'train': dataset_options_training,
        'eval': None,
        'test': dataset_options_encoding
    }
    nn = AutoEncoderModel('test', dict_dataset_options, feature_columns, flags_obj)
    diag_encodings = nn.encode()
    print('diag_encodings --> main diag: ' + str(diag_encodings[0].shape))
    print('diag_encodings --> secondary diags: ' + str(diag_encodings[1].shape))
    main_diag_encodings = diag_encodings[0]
    sec_diag_encodings = diag_encodings[1]

    dataset_encoding = Dataset(dataset_options_encoding)
    df_encoding = dataset_encoding.getDf()
    print('df_encoding: ' + str(df_encoding.shape))

    dir_data = dataset_options_encoding.getDirData()
    dataset = dataset_options_encoding.getDatasetName()
    data_prefix = dataset_options_encoding.getDataPrefix()
    name_event_column = dataset_options_encoding.getEventColumnName()
    name_main_diag = dataset_options_encoding.getNameMainDiag()
    name_sec_diag = dataset_options_encoding.getNameSecDiag()

    # start both frames from the event column so rows stay aligned with df_encoding
    df_encoding_sec_diag = df_encoding[name_event_column].to_frame()
    df_encoding_main_diag = df_encoding[name_event_column].to_frame()
    # assumes main and secondary encodings share the same dimensionality — TODO confirm
    num_encoded_dim = sec_diag_encodings.shape[1]
    for k in range(0, num_encoded_dim):
        df_encoding_sec_diag[name_sec_diag + '_dim_' + str(k)] = sec_diag_encodings[:, k]
        df_encoding_main_diag[name_main_diag + '_dim_' + str(k)] = main_diag_encodings[:, k]
    print('df_encoding_main_diag: ' + str(df_encoding_main_diag.shape))
    print('df_encoding_sec_diag: ' + str(df_encoding_sec_diag.shape))

    filename_sec_diag_encoding = dir_data + 'data_' + data_prefix + '_' + dataset + '_' + name_sec_diag + '_' + str(
        num_encoded_dim) + 'dim.csv'
    filename_main_diag_encoding = dir_data + 'data_' + data_prefix + '_' + dataset + '_' + name_main_diag + '_' + str(
        num_encoded_dim) + 'dim.csv'
    _write_df_in_chunks(df_encoding_sec_diag, filename_sec_diag_encoding)
    _write_df_in_chunks(df_encoding_main_diag, filename_main_diag_encoding)
from preprocessing.Preprocessor import Preprocessor
import helpers.constants as constantsPATREC

# dirProject = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/';
dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
dirData = os.path.join(dirProject, 'data')

# Full preprocessing pipeline for the patrec 2012-2015 dataset:
# split raw columns, clean, group diagnoses, build + encode features, fuse.
dataset_options = DatasetOptions({
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    # 'subgroups': ['DK'],
    'grouping': 'verylightgrouping',
    'encoding': 'categorical',
    'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
    'featurereduction': None,
    'filtering': None
})
preprocessor = Preprocessor(dataset_options)
preprocessor.splitColumns()
preprocessor.clean()
preprocessor.group()
preprocessor.createFeatureSet()
preprocessor.encodeFeatures()
preprocessor.fuse()
def plotDiseasePerformances(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot ROC curves comparing readmission classifiers across disease cohorts.

    For each cohort (all patients, chronic lung, oncology, cardiovascular) the
    RF, LR and NN result sets are loaded (patrec 2012-2015 training /
    2016-2017 testing). Only one classifier's curves are plotted; change
    `selected` below to switch. The original code repeated the whole
    options/results/analyzer construction 12 times; this version loops.
    """
    dict_opt_lr = {'penalty': 'l1', 'C': 0.5}
    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50}
    dict_options_nn = {
        'hidden_units': [60, 40, 40, 20],
        'learningrate': 0.001,
        'dropout': 0.5,
        'batchnorm': True,
        'batch_size': 64,
        'training_epochs': 1000,
        'pretrained': None,
    }
    DIRPROJECT = '/home/thomas/projects/patrec'
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    base_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'grouping': 'verylightgrouping',
        'encoding': 'categorical',
        'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
        'featurereduction': None,
        'filtering': None,
        'balanced': False,
        'resample': False
    }
    base_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'encoding': 'categorical',
        'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
        'grouping': 'verylightgrouping',
        'featurereduction': None,
        'filtering': None,
        'balanced': False,
        'resample': False
    }

    # cohort filters; None == all patients
    filterings = [None, 'chronic_lung', 'oncology', 'cardiovascular']
    analyzers = {'rf': [], 'lr': [], 'nn': []}
    for filtering in filterings:
        dict_training = base_training.copy()
        dict_testing = base_testing.copy()
        dict_training['filtering'] = filtering
        dict_testing['filtering'] = filtering
        options_training = DatasetOptions(dict_training)
        options_testing = DatasetOptions(dict_testing)
        filename_options = options_training.getFilenameOptions(filteroptions=True)
        clf_options = {
            'rf': OptionsRF(dirModelsBase, filename_options, options_clf=dict_opt_rf),
            'lr': OptionsLogisticRegression(dirModelsBase, filename_options, options_clf=dict_opt_lr),
            # NN options take the DatasetOptions object and model dir, not the filename string
            'nn': OptionsNN(model_dir, options_training, options_clf=dict_options_nn),
        }
        for key, options_clf in clf_options.items():
            results = Results(dirResultsBase, options_training, options_clf, 'test', options_testing)
            analyzers[key].append(ResultsSingleConfigAnalyzer(results, 10))

    names = {
        'rf': ['RF - all', 'RF - chronic lung', 'RF - oncology', 'RF - cardiovascular'],
        'lr': ['LR - all', 'LR - chronic lung', 'LR - oncology', 'LR - cardiovascular'],
        'nn': ['NN - all', 'NN - chronic lung', 'NN - oncology', 'NN - cardiovascular'],
    }
    filenames = {
        'rf': os.path.join(dirPlotsBase, 'diseases_rf_classification_performance.png'),
        'lr': os.path.join(dirPlotsBase, 'diseases_lr_classification_performance.png'),
        'nn': os.path.join(dirPlotsBase, 'diseases_nn_classification_performance.png'),
    }
    selected = 'lr'  # the original script plotted the LR curves
    title_plot = ''
    results_analyzer.plotROCcurveMulitpleConfigs(analyzers[selected], names[selected],
                                                 titlePlot=title_plot, f_plot=filenames[selected])
def plotOEPerformances(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot ROC curves comparing classifiers across EntlassBereich subsets.

    For each subset (all, SaO, Med, Gyn) the NN, LR and RF options,
    classifiers and result sets are built (patrec 2012-2015 training /
    2016-2017 testing, embedding encoding). Only one classifier's curves are
    plotted; change `selected` below to switch. The original code repeated
    the whole construction per subset; this version loops, preserving the
    original construction order (all, SaO, Med, Gyn) and the original plot
    order (all, Med, SaO, Gyn).
    """
    dict_opt_lr = {'penalty': 'l1', 'C': 0.5}
    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50}
    dict_options_nn = {
        'hidden_units': [60, 40, 40, 20],
        'learningrate': 0.001,
        'dropout': 0.5,
        'batchnorm': True,
        'batch_size': 64,
        'training_epochs': 1000,
        'pretrained': None,
    }
    DIRPROJECT = '/home/thomas/projects/patrec'
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    def make_dataset_options(dataset, filtering):
        # shared option template; only dataset years and filtering differ
        return DatasetOptions({
            'dir_data': dirData,
            'data_prefix': 'patrec',
            'dataset': dataset,
            'grouping': 'verylightgrouping',
            'encoding': 'embedding',
            'newfeatures': None,
            'featurereduction': None,
            'filtering': filtering,
            'balanced': False,
            'resample': False
        })

    subsets = [
        ('all', None),
        ('SaO', 'EntlassBereich_SaO'),
        ('Med', 'EntlassBereich_Med'),
        ('Gyn', 'EntlassBereich_Gyn'),
    ]
    analyzers = {}
    for label, filtering in subsets:
        options_training = make_dataset_options('20122015', filtering)
        options_testing = make_dataset_options('20162017', filtering)
        filename_options = options_training.getFilenameOptions(filteroptions=True)
        options_nn = OptionsNN(model_dir, options_training, options_clf=dict_options_nn)
        options_lr = OptionsLogisticRegression(dirModelsBase, filename_options, options_clf=dict_opt_lr)
        options_rf = OptionsRF(dirModelsBase, filename_options, options_clf=dict_opt_rf)
        # classifiers were instantiated (but never used) in the original too;
        # kept in case construction has side effects (e.g. loading a model)
        ClassifierNN(options_nn)
        ClassifierLogisticRegression(options_lr)
        ClassifierRF(options_rf)
        analyzers[label] = {
            key: ResultsSingleConfigAnalyzer(
                Results(dirResultsBase, options_training, options_clf, 'test', options_testing), 10)
            for key, options_clf in (('nn', options_nn), ('lr', options_lr), ('rf', options_rf))
        }

    plot_order = ['all', 'Med', 'SaO', 'Gyn']
    names = {
        'nn': ['NN - all', 'NN - Med', 'NN - SaO', 'NN - Gyn'],
        'lr': ['LR - all', 'LR - Med', 'LR - SaO', 'LR - Gyn'],
        'rf': ['RF - all', 'RF - Med', 'RF - SaO', 'RF - Gyn'],
    }
    filenames = {
        'nn': os.path.join(dirPlotsBase, 'oes_nn_classification_performance.png'),
        'lr': os.path.join(dirPlotsBase, 'oes_lr_classification_performance.png'),
        'rf': os.path.join(dirPlotsBase, 'oes_rf_classification_performance.png'),
    }
    selected = 'nn'  # the original script plotted the NN curves
    analyzer = [analyzers[label][selected] for label in plot_order]
    title_plot = ''
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names[selected],
                                                 titlePlot=title_plot, f_plot=filenames[selected])
# Script: inspect saved autoencoder encodings from a model directory.
# Usage: <script> <dir_model> [<threshold_epoch>]
dir_model = sys.argv[1]
threshold_epoch = 0
if len(sys.argv) > 2:
    threshold_epoch = int(sys.argv[2])

# dataset options matching the model's training configuration (nz 2012-2016,
# fusion feature reduction, very light diagnosis grouping)
dict_data_train = {
    'dir_data': DIRPROJECT + 'data/',
    'data_prefix': 'nz',
    'dataset': '20122016',
    'encoding': 'embedding',
    'newfeatures': None,
    'featurereduction': {
        'method': 'FUSION'
    },
    'grouping': 'verylightgrouping'
}
dataset_options_train = DatasetOptions(dict_data_train)
diag_group_names = dataset_options_train.getDiagGroupNames()
indices_diag_codes = getDiagCodesIndices(diag_group_names)
main_groups = icd10_chapters.getMainGroups()
# one color per ICD-10 main group
num_colors = len(main_groups)
colors = plt.cm.rainbow(np.linspace(0, 1, num_colors))
num_diags = len(indices_diag_codes)

# iterate over all saved encoding snapshots, sorted by filename
filenames_encodings = glob.glob(dir_model + 'basic_encodings_*')
var_encodings = []
for l, f in enumerate(sorted(filenames_encodings)):
    print(f)
    # epoch number is the trailing '_<epoch>' of the filename stem
    epoch = int(f.split('/')[-1].split('.')[0].split('_')[-1])
    # NOTE(review): the loop body appears truncated at this point in the
    # visible source — the remainder of the per-file processing is not shown.
def _plot_embedding_2d(points_2d, colors, num_subcategories, title):
    """Scatter-plot 2-D projected diagnosis-code encodings by ICD chapter.

    Each chapter (labelled 'A'..'Z') owns a contiguous run of
    `num_subcategories` rows in `points_2d` and is drawn in its own color.

    Args:
      points_2d: array of shape (num_codes, 2) with projected coordinates.
      colors: sequence of matplotlib colors, one per chapter.
      num_subcategories: number of consecutive rows per chapter.
      title: figure title ('pca' / 't-sne').
    """
    plt.figure()
    for k, c in enumerate(colors):
        start = k * num_subcategories
        stop = start + num_subcategories
        plt.scatter(
            points_2d[start:stop, 0],
            points_2d[start:stop, 1],
            label=string.ascii_uppercase[k],
            alpha=0.5,
            s=100,
            c=c)
    plt.legend()
    plt.title(title)
    plt.draw()


def analyze(flags_obj):
    """Visualize the learned auto-encoder diagnosis encodings.

    Loads the NZ 2007-2016 auto-encoder, retrieves its basic encodings,
    projects them to 2-D with PCA and t-SNE, and shows one scatter plot per
    projection, colored by ICD-10 chapter.

    Args:
      flags_obj: An object containing parsed flag values.
    """
    dict_data_train = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20072016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_train = DatasetOptions(dict_data_train)
    dataset_options_eval = None
    if dict_data_train['data_prefix'] == 'nz':
        feature_columns = FeatureColumnsAutoEncoderNZ(
            dataset_options=dataset_options_train)
    else:
        print('unknown data prefix..exit')
        sys.exit()
    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': dataset_options_eval,
        'test': None
    }
    nn = AutoEncoderModel('analysis', dict_dataset_options, feature_columns,
                          flags_obj)
    basic_encodings = nn.analyze()

    # One color per ICD-10 chapter ('A'..'Z').
    num_colors = 26
    colors = plt.cm.rainbow(np.linspace(0, 1, num_colors))

    # 2-D projections of the learned encodings.
    pca = PCA(n_components=2)
    weights_2d_pca = pca.fit_transform(basic_encodings)
    tsne = TSNE(n_components=2)
    weights_2d_tsne = tsne.fit_transform(basic_encodings)

    # The grouping scheme determines how many diagnosis sub-codes fall into
    # each chapter (e.g. 'verylightgrouping' keeps 100 sub-codes per chapter).
    if dataset_options_train.getGroupingName() == 'verylightgrouping':
        num_subcategories = 100
    elif dataset_options_train.getGroupingName() == 'lightgrouping':
        num_subcategories = 10
    elif dataset_options_train.getGroupingName() == 'grouping':
        num_subcategories = 1
    else:
        print('grouping scheme is unknown...exit')
        sys.exit()

    # Same chapter-colored scatter for both projections.
    _plot_embedding_2d(weights_2d_pca, colors, num_subcategories, 'pca')
    _plot_embedding_2d(weights_2d_tsne, colors, num_subcategories, 't-sne')
    plt.show()
# --- Script setup: configure an SGD classifier for NZ data ---
# Project layout: <project>/data, <project>/results, <project>/classifiers.
dirProject = os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))) + '/'
dirData = dirProject + 'data/'
dirResultsBase = dirProject + 'results/'
dirModelsBase = dirProject + 'classifiers/'

# Training set: NZ 2012 with the additional engineered features.
dict_options_dataset_training = {
    'dir_data': dirData,
    'data_prefix': 'nz',
    'dataset': str(2012),
    'newfeatures': {
        'names': constantsNZ.NEW_FEATURES
    },
    'featurereduction': None
}
options_training = DatasetOptions(dict_options_dataset_training)

# SGD with logistic loss and L1 penalty, i.e. sparse logistic regression.
dict_opt_sgd = {
    'loss': 'log',
    'penalty': 'l1'
}
options_sgd = OptionsSGD(
    dirModelsBase,
    options_training.getFilenameOptions(filteroptions=True),
    options_clf=dict_opt_sgd)
clf_sgd = ClassifierSGD(options_sgd)

# Second dataset configuration (NZ 2016); this literal continues past the
# end of this chunk.
dict_options_dataset_training = {
    'dir_data': dirData,
    'data_prefix': 'nz',
    'dataset': '2016',
from utils.DatasetFilter import DatasetFilter
from utils.Dataset import Dataset
from utils.DatasetOptions import DatasetOptions
import helpers.constants as constants
import helpers.constantsNZ as constantsNZ

# --- Script: extract the 'chronic_lung' disease subset of PATREC 2012-2015 ---
dirProject = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'
dirData = dirProject + 'data/'
dirPlotsBase = dirProject + 'plots/feature_comparison_wiederkehrer_normal/'

# Categorical encoding of 2012-2015 with engineered features, restricted to
# the chronic-lung cohort.
# NOTE(review): other scripts pass this option as 'filtering' or
# 'options_filtering' — confirm 'filter_options' is the key DatasetOptions
# actually reads.
dict_options_analyzing = {
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    'grouping': 'verylightgrouping',
    'encoding': 'categorical',
    'newfeatures': {
        'names': constants.NEW_FEATURES
    },
    'featurereduction': None,
    'filter_options': 'chronic_lung'
}
options = DatasetOptions(dict_options_analyzing)
# NOTE(review): `dataset` is not used below — confirm whether constructing it
# has required side effects or whether it can be dropped.
dataset = Dataset(options)

# Apply the disease filter and write the filtered dataset.
datafilter = DatasetFilter(options)
datafilter.filterDataDisease()
def predict(flags_obj):
    """Evaluate a trained Wide-Deep model on the PATREC test years.

    Runs the network's prediction `num_runs` times on (re-)balanced test
    data, accumulates AUC / average precision per run, and writes all run
    results to the results file.

    Args:
      flags_obj: An object containing parsed flag values.
    """
    dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
    # dirProject = "Z:\\projects\\PATREC"
    dirResultsBase = os.path.join(dirProject, 'results/')
    dirData = os.path.join(dirProject, 'data')

    # Training configuration (PATREC 2012-2015, embedding features,
    # gynaecology discharge unit only) — used to locate the trained model.
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'grouping': 'verylightgrouping',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': None,
        'filtering': 'EntlassBereich_Gyn',
        'balanced': False,
        'resample': False
    }
    dataset_options_training = DatasetOptions(dict_options_dataset_training)

    # Test configuration: same cohort, years 2016-2017.
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'grouping': 'verylightgrouping',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': None,
        'filtering': 'EntlassBereich_Gyn',
        'balanced': False,
        'resample': False
    }
    dataset_options_testing = DatasetOptions(dict_options_dataset_testing)

    # Feature columns are built from the *testing* options.
    if dict_options_dataset_testing['data_prefix'] == 'nz':
        feature_columns = FeatureColumnsNZ(
            dataset_options=dataset_options_testing)
        # feature_columns_nz_fusion = FeatureColumnsNZFusion(dataset_options=dataset_options_testing);
        # feature_columns = feature_columns_nz_fusion;
    elif dict_options_dataset_testing['data_prefix'] == 'patrec':
        feature_columns = FeatureColumnsPatrec(
            dataset_options=dataset_options_testing)
        # feature_columns_patrec_fusion = FeatureColumnsPatrecFusion(dataset_options=dataset_options_testing);
        # feature_columns = feature_columns_patrec_fusion;
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_training,
        'eval': None,
        'test': dataset_options_testing
    }
    nn = NeuralNetModel('test', dict_dataset_options, feature_columns,
                        flags_obj)
    model_flags = nn.getFlags()

    # Model name is the last path component of model_dir, tolerating a
    # trailing '/'.
    if model_flags.model_dir.endswith('/'):
        trained_model = model_flags.model_dir.split('/')[-2]
    else:
        trained_model = model_flags.model_dir.split('/')[-1]
    # A 'warmstart*' model dir name marks a pretrained model.
    if trained_model.startswith('warmstart'):
        pretrained = 'pretrained'
    else:
        pretrained = None
    print('warmstart: ' + str(trained_model.startswith('warmstart')))
    print('hidden units: ' + str(model_flags.hidden_units))

    # Mirror the model flags into the classifier-options dict so the stored
    # results are tagged with the right hyper-parameters.
    dict_options_nn = {
        'hidden_units': model_flags.hidden_units,
        'learningrate': model_flags.learningrate,
        'dropout': model_flags.dropout,
        'batchnorm': model_flags.batchnorm,
        'batch_size': model_flags.batch_size,
        'training_epochs': model_flags.train_epochs,
        'pretrained': pretrained,
    }
    options_nn = OptionsNN(model_flags.model_dir, dataset_options_training,
                           options_clf=dict_options_nn)
    classifier_nn = ClassifierNN(options_nn)
    results_all_runs_test = Results(dirResultsBase, dataset_options_training,
                                    options_nn, 'test',
                                    dataset_options_testing)

    # Repeat prediction on (re-)balanced test data and collect per-run metrics.
    num_runs = 10
    test_auc = []
    test_avgprecision = []
    for k in range(0, num_runs):
        results = nn.predict()
        # Labels come from the balanced CSV the model predicted on; re-read
        # each run since balancing may resample.
        filename_data_testing = nn.getFilenameDatasetBalanced()
        df_testing_balanced = pd.read_csv(filename_data_testing)
        predictions = [p['probabilities'] for p in results]
        predictions = np.array(predictions)
        print('get labels...: ' + str(filename_data_testing))
        labels = df_testing_balanced[
            dataset_options_testing.getEarlyReadmissionFlagname()].values
        res = classifier_nn.setResults(predictions, labels)
        results_all_runs_test.addResultsSingleRun(res)
        auc = res.getAUC()
        avgprecision = res.getAvgPrecision()
        print('')
        print('AUC: ' + str(auc))
        print('avg precision: ' + str(avgprecision))
        print('')
        test_auc.append(auc)
        test_avgprecision.append(avgprecision)
    print('')
    print('mean test auc: ' + str(np.mean(np.array(test_auc))))
    print('mean test avg precision: ' +
          str(np.mean(np.array(test_avgprecision))))
    print('')
    results_all_runs_test.writeResultsToFileDataset()
# --- Build one DataFrame from the per-year NZ datasets ---
# Uses a single pd.concat at the end instead of DataFrame.append per year:
# .append was deprecated in pandas 1.4 and removed in 2.0, and appending in a
# loop re-copies the accumulated frame each iteration (quadratic).
df_all_years = pd.DataFrame()
frames = []
for year in years:
    print('year: ' + str(year))
    # Fused embedding features, very light diagnosis grouping, for this year.
    dict_options_dataset = {
        'dir_data': dirData,
        'data_prefix': 'nz',
        'dataset': str(year),
        'encoding': 'embedding',
        'grouping': 'verylightgrouping',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        }
    }
    options_dataset_year = DatasetOptions(dict_options_dataset)
    dataset_year = Dataset(options_dataset_year)
    # Optionally balance classes within each year before concatenating.
    if balanced:
        df_year = dataset_year.getBalancedSubSet()
    else:
        df_year = dataset_year.getDf()
    #df_year['main_diag'] = df_year['main_diag'].apply(convertDiagToInd)
    print(df_year.shape)
    frames.append(df_year)
if frames:
    df_all_years = pd.concat(frames)
print('df balanced all years: ' + str(df_all_years.shape))

# Naming metadata taken from the last year's options (identical for all years).
encoding = options_dataset_year.getEncodingScheme()
grouping = options_dataset_year.getGroupingName()
featureset = options_dataset_year.getFeatureSetStr()
def plotDifferentClassifiers(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Plot ROC curves of RF, L1 logistic regression and NN on PATREC.

    All classifiers are trained on PATREC 2012-2015 and tested on 2016-2017;
    the combined ROC plot is written below `dirPlotsBase`.

    Args:
      results_analyzer: analyzer exposing plotROCcurveMulitpleConfigs().
      dirData: base directory of the datasets.
      dirModelsBase: base directory of the trained classifiers.
      dirResultsBase: base directory of the stored prediction results.
    """
    # NOTE: the original defined an unused dict_opt_lr and a duplicate
    # dict_opt_rf here; both dead locals were removed.

    # Neural-network hyper-parameters matching the stored NN results.
    dict_options_nn = {
        'hidden_units': [60, 40, 40, 20],
        'learningrate': 0.001,
        'dropout': 0.5,
        'batchnorm': True,
        'batch_size': 64,
        'training_epochs': 1000,
        'pretrained': None,
    }
    DIRPROJECT = '/home/thomas/projects/patrec'
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    # Categorical feature configuration (used by RF and LR).
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'grouping': 'verylightgrouping',
        'encoding': 'categorical',
        'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
        'featurereduction': None,
        'filtering': None,
        'balanced': False,
        'resample': False
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'grouping': 'verylightgrouping',
        'encoding': 'categorical',
        'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
        'featurereduction': None,
        'filtering': None,
        'balanced': False,
        'resample': False
    }
    options_training = DatasetOptions(dict_options_dataset_training)
    options_testing = DatasetOptions(dict_options_dataset_testing)

    # Random forest: 500 trees, depth <= 50.
    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50}
    options_rf = OptionsRF(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_rf)
    results_test_rf = Results(dirResultsBase, options_training, options_rf,
                              'test', options_testing)

    # L2 logistic regression (loaded but not included in the plot below).
    dict_opt_lr_l2 = {'penalty': 'l2', 'C': 0.01}
    options_lr_l2 = OptionsLogisticRegression(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_lr_l2)
    results_test_lr_l2 = Results(dirResultsBase, options_training,
                                 options_lr_l2, 'test', options_testing)

    # L1 logistic regression (LASSO).
    dict_opt_lr_l1 = {'penalty': 'l1', 'C': 0.5}
    options_lr_l1 = OptionsLogisticRegression(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_lr_l1)
    results_test_lr_l1 = Results(dirResultsBase, options_training,
                                 options_lr_l1, 'test', options_testing)

    # Embedding feature configuration (used by the NN).
    dict_options_dataset_training_nn = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'grouping': 'verylightgrouping',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': None,
        'filtering': None,
        'balanced': False,
        'resample': False
    }
    dict_options_dataset_testing_nn = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'grouping': 'verylightgrouping',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': None,
        'filtering': None,
        'balanced': False,
        'resample': False
    }
    options_training = DatasetOptions(dict_options_dataset_training_nn)
    options_testing = DatasetOptions(dict_options_dataset_testing_nn)
    options_nn = OptionsNN(model_dir, options_training,
                           options_clf=dict_options_nn)
    results_test_nn = Results(dirResultsBase, options_training, options_nn,
                              'test', options_testing)

    # Average each configuration over 10 runs and plot all ROC curves together.
    analyzer_rf = ResultsSingleConfigAnalyzer(results_test_rf, 10)
    analyzer_lr_l1 = ResultsSingleConfigAnalyzer(results_test_lr_l1, 10)
    analyzer_nn = ResultsSingleConfigAnalyzer(results_test_nn, 10)
    analyzer = [analyzer_rf, analyzer_lr_l1, analyzer_nn]
    names = ['RF', 'Logistic Regression (l1)', 'Neural Network']
    title_plot = ''
    filename_plot = os.path.join(
        dirPlotsBase,
        'different_classifiers_train_patrec_20122015_test_patrec_20162017.png')
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names,
                                                 f_plot=filename_plot,
                                                 titlePlot=title_plot)
# --- Script setup: RF / LR classifier options for learned-feature plots ---
dirModelsBase = os.path.join(dirProject, 'classifiers/')
dirPlotsBase = os.path.join(dirProject, 'plots', 'learned_features')

# PATREC 2012-2015, categorical encoding with engineered features,
# restricted to the gynaecology discharge unit.
dict_options_dataset_training = {
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    'encoding': 'categorical',
    'newfeatures': {
        'names': constantsPATREC.NEW_FEATURES
    },
    'featurereduction': None,
    'grouping': 'verylightgrouping',
    'filtering': 'EntlassBereich_Gyn'
}
options_training = DatasetOptions(dict_options_dataset_training)

# Random forest: 500 trees, depth <= 50.
dict_opt_rf = {
    'n_estimators': 500,
    'max_depth': 50
}
options_rf = OptionsRF(
    dirModelsBase,
    options_training.getFilenameOptions(filteroptions=True),
    options_clf=dict_opt_rf)

# L1 logistic regression (LASSO), C=0.5; this call continues past the end
# of this chunk.
dict_opt_lr = {
    'penalty': 'l1',
    'C': 0.5
}
options_lr = OptionsLogisticRegression(
def plotNNPerformance(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    """Compare NN trainings (NZ, Basel, Basel pretrained on NZ) plus LASSO.

    Plots one ROC curve per configuration into a single figure below
    `dirPlotsBase`.

    Fix vs. original: `title_plot` was computed but never passed to the plot
    call; it is now forwarded as `titlePlot=` like in the sibling plotting
    functions.

    Args:
      results_analyzer: analyzer exposing plotROCcurveMulitpleConfigs().
      dirData: base directory of the datasets.
      dirModelsBase: base directory of the trained classifiers.
      dirResultsBase: base directory of the stored prediction results.
    """
    # --- NN trained and tested on NZ (2012-2016 -> 2017), pretrained ---
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'nz',
        'dataset': '20122016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'nz',
        'dataset': '2017',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    options_training_nn = DatasetOptions(dict_options_dataset_training)
    options_testing_nn = DatasetOptions(dict_options_dataset_testing)
    dict_options_nn = {
        'hidden_units': [60, 40, 20, 10, 10],
        'learningrate': 0.05,
        'dropout': 0.25,
        'batch_size': 640,
        'training_epochs': 250,
        'pretrained': 'pretrained'
    }
    options_nn_nz = OptionsNN(
        dirModelsBase,
        options_training_nn.getFilenameOptions(filteroptions=True),
        options_clf=dict_options_nn)
    results_nn_nz = Results(dirResultsBase, options_training_nn,
                            options_nn_nz, 'test', options_testing_nn)

    # --- NN trained and tested on Basel/PATREC (2012-2015 -> 2016-2017) ---
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    options_training_nn = DatasetOptions(dict_options_dataset_training)
    options_testing_nn = DatasetOptions(dict_options_dataset_testing)
    # Without pretraining ('pretrained' deliberately absent here).
    dict_options_nn = {
        'hidden_units': [20, 10, 10],
        'learningrate': 0.01,
        'dropout': 0.15,
        'batch_size': 80,
        'training_epochs': 500,
    }
    options_nn_patrec = OptionsNN(
        dirModelsBase,
        options_training_nn.getFilenameOptions(filteroptions=True),
        options_clf=dict_options_nn)
    results_nn_patrec = Results(dirResultsBase, options_training_nn,
                                options_nn_patrec, 'test', options_testing_nn)

    # --- Same PATREC NN, but warm-started from the NZ model ---
    dict_options_nn = {
        'hidden_units': [20, 10, 10],
        'learningrate': 0.01,
        'dropout': 0.25,
        'batch_size': 80,
        'training_epochs': 500,
        'pretrained': 'pretrained'
    }
    options_nn_patrec_pretrained = OptionsNN(
        dirModelsBase,
        options_training_nn.getFilenameOptions(filteroptions=True),
        options_clf=dict_options_nn)
    results_nn_patrec_pretrained = Results(dirResultsBase, options_training_nn,
                                           options_nn_patrec_pretrained,
                                           'test', options_testing_nn)

    # --- LASSO baseline on the PATREC 'DK' subgroup, categorical features ---
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'subgroups': ['DK'],
        'encoding': 'categorical',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'subgroups': ['DK'],
        'encoding': 'categorical',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dict_opt_lr = {'penalty': 'l1', 'C': 0.075}
    options_training = DatasetOptions(dict_options_dataset_training)
    options_testing = DatasetOptions(dict_options_dataset_testing)
    options_lr = OptionsLogisticRegression(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_lr)
    results_lr = Results(dirResultsBase, options_training, options_lr,
                         'test', options_testing)

    # NN configs have a single stored run; LASSO is averaged over 10 runs.
    analyzer_nn_nz = ResultsSingleConfigAnalyzer(results_nn_nz, 1)
    analyzer_nn_patrec = ResultsSingleConfigAnalyzer(results_nn_patrec, 1)
    analyzer_nn_patrec_pretrained = ResultsSingleConfigAnalyzer(
        results_nn_patrec_pretrained, 1)
    analyzer_lr = ResultsSingleConfigAnalyzer(results_lr, 10)
    analyzer = [analyzer_nn_nz, analyzer_nn_patrec,
                analyzer_nn_patrec_pretrained, analyzer_lr]
    names = ['NZ', 'Basel', 'Basel (pretrained NZ)', 'LASSO']
    title_plot = 'neural network performance: with and without pre-training'
    filename_plot = dirPlotsBase + 'nn_pretraining_nz_plus_lasso.png'
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names,
                                                 f_plot=filename_plot,
                                                 titlePlot=title_plot)
# --- Script: feature comparison (readmitted vs. normal patients) for the
# cardiovascular cohort of PATREC 2012-2015 ---
dirData = dirProject + 'data/';
dirPlotsBase = dirProject + 'plots/feature_comparison_wiederkehrer_normal/'

# Categorical encoding with engineered features, filtered to the
# cardiovascular cohort.
dict_options_analyzing = {
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    'encoding': 'categorical',
    'newfeatures': {'names': constants.NEW_FEATURES},
    'featurereduction': None,
    'grouping': 'verylightgrouping',
    'filtering': 'cardiovascular'
}
options = DatasetOptions(dict_options_analyzing);
# NOTE(review): `dataset` is not used below — confirm whether constructing it
# is needed for side effects.
dataset = Dataset(options);

# Plots go into a per-filter subdirectory when a filter is active.
if options.getOptionsFiltering() is not None:
    dirPlots = dirPlotsBase + options.getOptionsFiltering() + '/';
else:
    dirPlots = dirPlotsBase;
if not os.path.exists(dirPlots):
    os.makedirs(dirPlots);

analyzer = DataAnalyzer(options, dirPlots)
analyzer.doFeatureComparison()
# analyzer.checkWiederkehrer();
# avg_num_subgrp = analyzer.getAvgNumberSubgroup('DK')