Example no. 1
def plotDifferentTrainingSetSingleTestSetNZ(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    print('plotDifferentTrainingSetSingleTestSetNZ')
    data_prefix = 'nz'

    dict_options_dataset_testing = {
        'dir_data':                 dirData,
        'data_prefix':              data_prefix,
        'dataset':                  '2017',
        'options_filtering':        None
    }
    options_testing = DatasetOptions(dict_options_dataset_testing);

    years_training = [2012, 2013, 2014, 2015, 2016];
    names = [];
    analyzers = []
    for year in years_training:
        print(year)
        dict_options_dataset_training = {
            'dir_data':             dirData,
            'data_prefix':          data_prefix,
            'dataset':              str(year),
            'options_filtering':    None
        }
        options_training_year = DatasetOptions(dict_options_dataset_training);
        options_rf_year = OptionsRF(dirModelsBase, options_training_year.getFilenameOptions(filteroptions=True));
        results_test_year = Results(dirResultsBase, options_training_year, options_rf_year, 'test', options_testing);

        names.append(str(year))
        analyzers.append(ResultsSingleConfigAnalyzer(results_test_year, 10));

    title_plot = 'classifier (rf): trained on subsets of nz 2012-2016, tested on subset of nz 2017'
    filename_plot = dirPlotsBase + 'rf_training_nz_years_20122016_testing_nz_year_2017.png'
    print('plot ROC curve...')
    results_analyzer.plotROCcurveMulitpleConfigs(analyzers, names, f_plot=filename_plot, titlePlot=title_plot)
def main(dict_dataset_options):

    options = DatasetOptions(dict_dataset_options)
    preproc = PreprocessorNZ(options)

    print('grouping: ' + str(options.getGroupingName()))
    # preproc.processDischargeFile();
    # preproc.processDiagnosisFile();

    # preproc.createFeatureSet();
    preproc.encodeFeatures()
    preproc.fuse()
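main() above is only defined, never invoked in this excerpt; a minimal, hypothetical driver could look like the following sketch (the option values are illustrative assumptions modelled on the nz option dicts used in the other examples):
# hypothetical entry point for the preprocessing main() above; the option
# values are assumptions copied from the nz option dicts used elsewhere
if __name__ == '__main__':
    dict_dataset_options = {
        'dir_data': DIRPROJECT + 'data/',  # assumes DIRPROJECT is defined as in the other snippets
        'data_prefix': 'nz',
        'dataset': '2017',
        'grouping': 'verylightgrouping',
        'encoding': 'categorical',
        'newfeatures': None,
        'featurereduction': None
    }
    main(dict_dataset_options)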
def run_deep(flags_obj):
    """Run Wide-Deep training and eval loop.
    Args:
    flags_obj: An object containing parsed flag values.
    """
    dict_data_train = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20012011',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_train = DatasetOptions(dict_data_train)

    dict_data_eval = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '2013',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_eval = DatasetOptions(dict_data_eval)
    # dataset_options_eval = None;

    if dict_data_train['data_prefix'] == 'nz':
        feature_columns_nz_fusion = FeatureColumnsNZFusion(
            dataset_options=dataset_options_train)
        feature_columns = feature_columns_nz_fusion
    elif dict_data_train['data_prefix'] == 'patrec':
        feature_columns_patrec_fusion = FeatureColumnsPatrecFusion(
            dataset_options=dataset_options_train)
        feature_columns = feature_columns_patrec_fusion
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': dataset_options_eval,
        'test': None
    }

    nn = NeuralNetModel('train', dict_dataset_options, feature_columns,
                        flags_obj)
    nn.train()
def run_deep(flags_obj):
    """Run Wide-Deep training and eval loop.
    Args:
    flags_obj: An object containing parsed flag values.
    """
    dict_data_train = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20122016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_train = DatasetOptions(dict_data_train)
    dataset_options_eval = None

    if dict_data_train['data_prefix'] == 'nz':
        feature_columns = FeatureColumnsAutoEncoderNZ(
            dataset_options=dataset_options_train)
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': dataset_options_eval,
        'test': None
    }

    nn = AutoEncoderModel('train', dict_dataset_options, feature_columns,
                          flags_obj)
    nn.train()
Example no. 5
def plotOneTrainingSetDifferentTestSets(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    data_prefix = 'patrec'
    dict_options_dataset_training = {
        'dir_data':             dirData,
        'data_prefix':          data_prefix,
        'dataset':              '20122015',
        'options_filtering':    None
    }
    options_training = DatasetOptions(dict_options_dataset_training);
    # compare different subsets of data: EntlassBereich (only with RandomForest)
    options_rf = OptionsRF(dirModelsBase, options_training.getFilenameOptions(filteroptions=True));


    dict_options_dataset_testing = {
        'dir_data':             dirData,
        'data_prefix':          data_prefix,
        'dataset':              '20162017',
        'options_filtering':    None
    }
    options_testing_all = DatasetOptions(dict_options_dataset_testing);
    results_test_all = Results(dirResultsBase, options_training, options_rf, 'test', options_testing_all);


    dict_options_dataset_testing = {
        'dir_data':             dirData,
        'data_prefix':          data_prefix,
        'dataset':              '20162017',
        'options_filtering':    'EntlassBereich_Med'
    }
    options_testing_med = DatasetOptions(dict_options_dataset_testing);
    results_test_med = Results(dirResultsBase, options_training, options_rf, 'test', options_testing_med);

    dict_options_dataset_testing = {
        'dir_data':             dirData,
        'data_prefix':          data_prefix,
        'dataset':              '20162017',
        'options_filtering':    'EntlassBereich_SaO'
    }
    options_testing_sao = DatasetOptions(dict_options_dataset_testing);
    results_test_sao = Results(dirResultsBase, options_training, options_rf, 'test', options_testing_sao);

    dict_options_dataset_testing = {
        'dir_data':             dirData,
        'data_prefix':          data_prefix,
        'dataset':              '20162017',
        'options_filtering':    'EntlassBereich_Gyn'
    }
    options_testing_gyn = DatasetOptions(dict_options_dataset_testing);
    results_test_gyn = Results(dirResultsBase, options_training, options_rf, 'test', options_testing_gyn);

    analyzer_all = ResultsSingleConfigAnalyzer(results_test_all, 10);
    analyzer_med = ResultsSingleConfigAnalyzer(results_test_med, 10);
    analyzer_sao = ResultsSingleConfigAnalyzer(results_test_sao, 10);
    analyzer_gyn = ResultsSingleConfigAnalyzer(results_test_gyn, 10);
    analyzer = [analyzer_all, analyzer_med, analyzer_sao, analyzer_gyn];
    names = ['All', 'Med', 'SaO', 'Gyn']
    title_plot = 'classifier (rf): trained on patrec 2012-2015, tested on subsets of patrec 2016-2017'
    filename_plot = dirPlotsBase + 'rf_training_all_testing_EntlassBereich.png'
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, f_plot=filename_plot, titlePlot=title_plot)
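The four testing blocks above differ only in 'options_filtering'; a sketch of the same logic as a loop (assuming the constructors behave exactly as used above) avoids repeating the dict/Results/analyzer boilerplate:
# sketch: build the per-EntlassBereich analyzers in a loop; constructor
# signatures are assumed to match the usage in the function above
def buildEntlassBereichAnalyzers(dirData, dirResultsBase, options_training, options_rf):
    filters = [None, 'EntlassBereich_Med', 'EntlassBereich_SaO', 'EntlassBereich_Gyn']
    names = ['All', 'Med', 'SaO', 'Gyn']
    analyzers = []
    for filter_option in filters:
        dict_options_dataset_testing = {
            'dir_data':             dirData,
            'data_prefix':          'patrec',
            'dataset':              '20162017',
            'options_filtering':    filter_option
        }
        options_testing = DatasetOptions(dict_options_dataset_testing)
        results_test = Results(dirResultsBase, options_training, options_rf, 'test', options_testing)
        analyzers.append(ResultsSingleConfigAnalyzer(results_test, 10))
    return analyzers, names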
Example no. 6
def plotSingleConfiguration(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015'
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017'
    }

    options_training = DatasetOptions(dict_options_dataset_training);
    options_testing = DatasetOptions(dict_options_dataset_testing);
    options_rf = OptionsRF(dirModelsBase, options_training.getFilenameOptions(filteroptions=True));
    results_all_runs_test = Results(dirResultsBase, options_training, options_rf, 'test', options_testing);

    analyzer_single_config = ResultsSingleConfigAnalyzer(results_all_runs_test, 10);
    results_analyzer.plotROCcurveSingleConfig(analyzer_single_config, 'rf')
Example no. 7
def plotSGDClassifierPerformance(results_analyzer, dirData, dirModelsBase, dirResultsBase):

    dict_options_dataset_testing = {
        'dir_data':         dirData,
        'data_prefix':      'nz',
        'dataset':          '2016',
        'encoding':         'categorical',
        'newfeatures':      {'names': constantsNZ.NEW_FEATURES},
        'featurereduction': None,
        'grouping':         'grouping'
    }
    options_dataset_testing = DatasetOptions(dict_options_dataset_testing);

    analyzer = [];
    years = [2012, 2013, 2014, 2015];
    for year in years:
        dict_options_dataset_training = {
            'dir_data':         dirData,
            'data_prefix':      'nz',
            'dataset':          str(year),
            'encoding':         'categorical',
            'newfeatures':      {'names': constantsNZ.NEW_FEATURES},
            'featurereduction': None,
            'grouping':         'grouping'
        }
        options_dataset_training = DatasetOptions(dict_options_dataset_training);

        dict_opt_sgd = {'loss': 'log', 'penalty': 'l1'};
        options_sgd = OptionsSGD(dirModelsBase, options_dataset_training.getFilenameOptions(filteroptions=True),options_clf=dict_opt_sgd);
        results_year = Results(dirResultsBase, options_dataset_training, options_sgd, 'test', options_dataset_testing);
        analyzer_sgd_year = ResultsSingleConfigAnalyzer(results_year, 1);
        analyzer.append(analyzer_sgd_year);

    names = ['2012', '2013', '2014', '2015'];
    title_plot = 'performance of batch-based logistic regression'
    filename_plot = dirPlotsBase + 'sgd_nz_performance_years_training20122015_test2016.png'
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, f_plot=filename_plot, titlePlot=title_plot)
def run_deep(flags_obj):
    """Run Wide-Deep training and eval loop.
    Args:
    flags_obj: An object containing parsed flag values.
    """

    dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
    # dirProject = "Z:\\projects\\PATREC"
    dirData = os.path.join(dirProject, 'data')
    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'grouping': 'verylightgrouping',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': None,
        'filtering': 'oncology',
        'balanced': False,
        'resample': True
    }
    dataset_options_train = DatasetOptions(dict_options_dataset_training)

    dataset_options_eval = None

    if dict_options_dataset_training['data_prefix'] == 'nz':
        feature_columns_nz = FeatureColumnsNZ(
            dataset_options=dataset_options_train)
        feature_columns = feature_columns_nz
    elif dict_options_dataset_training['data_prefix'] == 'patrec':
        feature_columns_patrec = FeatureColumnsPatrec(
            dataset_options=dataset_options_train)
        feature_columns = feature_columns_patrec
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': dataset_options_eval,
        'test': None
    }

    nn = NeuralNetModel('train', dict_dataset_options, feature_columns,
                        flags_obj)
    print(flags_obj.log_dir)
    nn.train()
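None of these run_* functions show how flags_obj is constructed; the attribute names read here (flags_obj.log_dir) and in predict() further down (model_dir, hidden_units, learningrate, dropout, batchnorm, batch_size, train_epochs) suggest its shape. A hypothetical stand-in for local experiments, with purely illustrative values:
# hypothetical flags object; the attribute names mirror what the snippets
# read from flags_obj / model_flags, the values are assumptions only
from argparse import Namespace

flags_obj = Namespace(
    model_dir='/tmp/patients_model',
    log_dir='/tmp/patients_model',
    hidden_units=[60, 40, 40, 20],
    learningrate=0.001,
    dropout=0.5,
    batchnorm=True,
    batch_size=64,
    train_epochs=1000,
)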
def encode(flags_obj):
    """Run Wide-Deep training and eval loop.
    Args:
    flags_obj: An object containing parsed flag values.
    """
    dict_data_training = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20012016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_training = DatasetOptions(dict_data_training)

    dict_data_encoding = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '2017',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_encoding = DatasetOptions(dict_data_encoding)

    feature_columns = FeatureColumnsAutoEncoderNZ(
        dataset_options=dataset_options_encoding)

    dict_dataset_options = {
        'train': dataset_options_training,
        'eval': None,
        'test': dataset_options_encoding
    }

    nn = AutoEncoderModel('test', dict_dataset_options, feature_columns,
                          flags_obj)
    diag_encodings = nn.encode()
    print('diag_encodings --> main diag: ' + str(diag_encodings[0].shape))
    print('diag_encodings --> secondary diags: ' +
          str(diag_encodings[1].shape))

    main_diag_encodings = diag_encodings[0]
    sec_diag_encodings = diag_encodings[1]

    dataset_encoding = Dataset(dataset_options_encoding)
    df_encoding = dataset_encoding.getDf()
    print('df_encoding: ' + str(df_encoding.shape))
    num_encoded_dim = main_diag_encodings.shape[1]

    dir_data = dataset_options_encoding.getDirData()
    dataset = dataset_options_encoding.getDatasetName()
    data_prefix = dataset_options_encoding.getDataPrefix()
    demographic_featurename = dataset_options_encoding.getFilenameOptionDemographicFeatures()
    featureset_str = dataset_options_encoding.getFeatureSetStr()
    encoding = dataset_options_encoding.getEncodingScheme()
    name_event_column = dataset_options_encoding.getEventColumnName()

    name_main_diag = dataset_options_encoding.getNameMainDiag()
    name_sec_diag = dataset_options_encoding.getNameSecDiag()
    # both output frames start from the event id column; the encoded
    # dimensions are appended as extra columns in the loop below
    df_encoding_sec_diag = df_encoding[name_event_column].to_frame()
    df_encoding_main_diag = df_encoding[name_event_column].to_frame()

    num_encoded_dim = sec_diag_encodings.shape[1]
    for k in range(0, num_encoded_dim):
        new_col_secdiag = name_sec_diag + '_dim_' + str(k)
        df_encoding_sec_diag[new_col_secdiag] = sec_diag_encodings[:, k]

        new_col_maindiag = name_main_diag + '_dim_' + str(k)
        df_encoding_main_diag[new_col_maindiag] = main_diag_encodings[:, k]

    print('df_encoding_main_diag: ' + str(df_encoding_main_diag.shape))
    print('df_encoding_sec_diag: ' + str(df_encoding_sec_diag.shape))

    filename_sec_diag_encoding = dir_data + 'data_' + data_prefix + '_' + dataset + '_' + name_sec_diag + '_' + str(
        num_encoded_dim) + 'dim.csv'
    filename_main_diag_encoding = dir_data + 'data_' + data_prefix + '_' + dataset + '_' + name_main_diag + '_' + str(
        num_encoded_dim) + 'dim.csv'

    list_df = [
        df_encoding_sec_diag[i:i + 10000]
        for i in range(0, df_encoding_sec_diag.shape[0], 10000)
    ]
    list_df[0].to_csv(filename_sec_diag_encoding,
                      index=False,
                      line_terminator='\n')
    for l in list_df[1:]:
        l.to_csv(filename_sec_diag_encoding,
                 index=False,
                 line_terminator='\n',
                 header=False,
                 mode='a')

    list_df = [
        df_encoding_main_diag[i:i + 10000]
        for i in range(0, df_encoding_main_diag.shape[0], 10000)
    ]
    list_df[0].to_csv(filename_main_diag_encoding,
                      index=False,
                      line_terminator='\n')
    for l in list_df[1:]:
        l.to_csv(filename_main_diag_encoding,
                 index=False,
                 line_terminator='\n',
                 header=False,
                 mode='a')
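The two chunked-write blocks above are identical apart from the DataFrame and the target filename; the pattern (first chunk written with a header, remaining chunks appended without one) could be factored into a small helper, sketched here with plain pandas:
# sketch of the chunked CSV write used twice above; note that newer pandas
# versions spell the keyword lineterminator instead of line_terminator
def write_csv_in_chunks(df, filename, chunk_size=10000):
    chunks = [df[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]
    if not chunks:
        return
    chunks[0].to_csv(filename, index=False, line_terminator='\n')
    for chunk in chunks[1:]:
        chunk.to_csv(filename, index=False, line_terminator='\n', header=False, mode='a')
With such a helper, the two blocks above reduce to one call per encoding DataFrame.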
Example no. 10
from preprocessing.Preprocessor import Preprocessor

import helpers.constants as constantsPATREC

# dirProject = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/';
dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
dirData = os.path.join(dirProject, 'data')

dict_dataset_options = {
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    # 'subgroups':                ['DK'],
    'grouping': 'verylightgrouping',
    'encoding': 'categorical',
    'newfeatures': {
        'names': constantsPATREC.NEW_FEATURES
    },
    'featurereduction': None,
    'filtering': None
}

options = DatasetOptions(dict_dataset_options)
preproc = Preprocessor(options)
preproc.splitColumns()
preproc.clean()
preproc.group()
preproc.createFeatureSet()
preproc.encodeFeatures()
preproc.fuse()
Example no. 11
def plotDiseasePerformances(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    dict_opt_lr = {'penalty': 'l1', 'C': 0.5};
    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50};
    dict_options_nn = {
        'hidden_units': [60, 40, 40, 20],
        'learningrate': 0.001,
        'dropout': 0.5,
        'batchnorm': True,
        'batch_size': 64,
        'training_epochs': 1000,
        'pretrained': None,
    }
    DIRPROJECT = '/home/thomas/projects/patrec';
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    dict_options_dataset_training = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20122015',
        'grouping':         'verylightgrouping',
        'encoding':         'categorical',
        'newfeatures':      {'names': constantsPATREC.NEW_FEATURES},
        'featurereduction': None,
        'filtering':        None,
        'balanced':         False,
        'resample':         False
    }
    dict_options_dataset_testing = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20162017',
        'encoding':         'categorical',
        'newfeatures':      {'names': constantsPATREC.NEW_FEATURES},
        'grouping':         'verylightgrouping',
        'featurereduction': None,
        'filtering':        None,
        'balanced':         False,
        'resample':         False
    }

    dict_options_all_training = dict_options_dataset_training.copy();
    dict_options_all_testing = dict_options_dataset_testing.copy();
    options_all_training = DatasetOptions(dict_options_all_training);
    options_all_testing = DatasetOptions(dict_options_all_testing);
    options_all_lr = OptionsLogisticRegression(dirModelsBase,
                                               options_all_training.getFilenameOptions(filteroptions=True),
                                               options_clf=dict_opt_lr);
    options_all_rf = OptionsRF(dirModelsBase,
                               options_all_training.getFilenameOptions(filteroptions=True),
                               options_clf=dict_opt_rf);
    options_all_nn = OptionsNN(model_dir, options_all_training, options_clf=dict_options_nn);


    dict_options_lung_training = dict_options_dataset_training.copy();
    dict_options_lung_testing = dict_options_dataset_testing.copy();
    dict_options_lung_training['filtering'] = 'chronic_lung';
    dict_options_lung_testing['filtering'] = 'chronic_lung';
    options_lung_training = DatasetOptions(dict_options_lung_training);
    options_lung_testing = DatasetOptions(dict_options_lung_testing);
    options_lung_lr = OptionsLogisticRegression(dirModelsBase,
                                                options_lung_training.getFilenameOptions(filteroptions=True),
                                                options_clf=dict_opt_lr);
    options_lung_rf = OptionsRF(dirModelsBase,
                                options_lung_training.getFilenameOptions(filteroptions=True),
                                options_clf=dict_opt_rf);
    options_lung_nn = OptionsNN(model_dir, options_lung_training, options_clf=dict_options_nn);

    dict_options_oncology_training = dict_options_dataset_training.copy();
    dict_options_oncology_testing = dict_options_dataset_testing.copy();
    dict_options_oncology_training['filtering'] = 'oncology';
    dict_options_oncology_testing['filtering'] = 'oncology';
    options_oncology_training = DatasetOptions(dict_options_oncology_training);
    options_oncology_testing = DatasetOptions(dict_options_oncology_testing);
    options_oncology_lr = OptionsLogisticRegression(dirModelsBase,
                                                    options_oncology_training.getFilenameOptions(filteroptions=True),
                                                    options_clf=dict_opt_lr);
    options_oncology_rf = OptionsRF(dirModelsBase,
                                    options_oncology_training.getFilenameOptions(filteroptions=True),
                                    options_clf=dict_opt_rf);
    options_oncology_nn = OptionsNN(model_dir, options_oncology_training, options_clf=dict_options_nn);

    dict_options_cardio_training = dict_options_dataset_training.copy();
    dict_options_cardio_testing = dict_options_dataset_testing.copy();
    dict_options_cardio_training['filtering'] = 'cardiovascular';
    dict_options_cardio_testing['filtering'] = 'cardiovascular';
    options_cardio_training = DatasetOptions(dict_options_cardio_training);
    options_cardio_testing = DatasetOptions(dict_options_cardio_testing);
    options_cardio_lr = OptionsLogisticRegression(dirModelsBase,
                                                  options_cardio_training.getFilenameOptions(filteroptions=True),
                                                  options_clf=dict_opt_lr);
    options_cardio_rf = OptionsRF(dirModelsBase,
                                  options_cardio_training.getFilenameOptions(filteroptions=True),
                                  options_clf=dict_opt_rf);
    options_cardio_nn = OptionsNN(model_dir, options_cardio_training, options_clf=dict_options_nn);

    results_all_rf = Results(dirResultsBase, options_all_training, options_all_rf, 'test', options_all_testing);
    results_lung_rf = Results(dirResultsBase, options_lung_training, options_lung_rf, 'test', options_lung_testing);
    results_oncology_rf = Results(dirResultsBase, options_oncology_training, options_oncology_rf, 'test', options_oncology_testing);
    results_cardio_rf = Results(dirResultsBase, options_cardio_training, options_cardio_rf, 'test', options_cardio_testing);
    results_all_lr = Results(dirResultsBase, options_all_training, options_all_lr, 'test', options_all_testing);
    results_lung_lr = Results(dirResultsBase, options_lung_training, options_lung_lr, 'test', options_lung_testing);
    results_oncology_lr = Results(dirResultsBase, options_oncology_training, options_oncology_lr, 'test', options_oncology_testing);
    results_cardio_lr = Results(dirResultsBase, options_cardio_training, options_cardio_lr, 'test', options_cardio_testing);
    results_all_nn = Results(dirResultsBase, options_all_training, options_all_nn, 'test', options_all_testing);
    results_lung_nn = Results(dirResultsBase, options_lung_training, options_lung_nn, 'test', options_lung_testing);
    results_oncology_nn = Results(dirResultsBase, options_oncology_training, options_oncology_nn, 'test', options_oncology_testing);
    results_cardio_nn = Results(dirResultsBase, options_cardio_training, options_cardio_nn, 'test', options_cardio_testing);

    analyzer_all_rf = ResultsSingleConfigAnalyzer(results_all_rf, 10);
    analyzer_lung_rf = ResultsSingleConfigAnalyzer(results_lung_rf, 10);
    analyzer_oncology_rf = ResultsSingleConfigAnalyzer(results_oncology_rf, 10);
    analyzer_cardio_rf = ResultsSingleConfigAnalyzer(results_cardio_rf, 10);
    analyzer_all_lr = ResultsSingleConfigAnalyzer(results_all_lr, 10);
    analyzer_lung_lr = ResultsSingleConfigAnalyzer(results_lung_lr, 10);
    analyzer_oncology_lr = ResultsSingleConfigAnalyzer(results_oncology_lr, 10);
    analyzer_cardio_lr = ResultsSingleConfigAnalyzer(results_cardio_lr, 10);
    analyzer_all_nn = ResultsSingleConfigAnalyzer(results_all_nn, 10);
    analyzer_lung_nn = ResultsSingleConfigAnalyzer(results_lung_nn, 10);
    analyzer_oncology_nn = ResultsSingleConfigAnalyzer(results_oncology_nn,  10);
    analyzer_cardio_nn = ResultsSingleConfigAnalyzer(results_cardio_nn, 10);
    analyzer_rf = [analyzer_all_rf, analyzer_lung_rf, analyzer_oncology_rf, analyzer_cardio_rf];
    analyzer_lr = [analyzer_all_lr, analyzer_lung_lr, analyzer_oncology_lr, analyzer_cardio_lr];
    analyzer_nn = [analyzer_all_nn, analyzer_lung_nn, analyzer_oncology_nn, analyzer_cardio_nn]
    analyzer = analyzer_lr;

    names_rf = ['RF - all', 'RF - chronic lung', 'RF - oncology', 'RF - cardiovascular'];
    names_lr = ['LR - all', 'LR - chronic lung', 'LR - oncology', 'LR - cardiovascular'];
    names_nn = ['NN - all', 'NN - chronic lung', 'NN - oncology', 'NN - cardiovascular'];
    names = names_lr;

    title_plot = ''
    filename_plot_rf = os.path.join(dirPlotsBase, 'diseases_rf_classification_performance.png');
    filename_plot_lr = os.path.join(dirPlotsBase, 'diseases_lr_classification_performance.png');
    filename_plot_nn = os.path.join(dirPlotsBase, 'diseases_nn_classification_performance.png');
    filename_plot = filename_plot_lr;
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, titlePlot=title_plot, f_plot=filename_plot)
Example no. 12
def plotOEPerformances(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    dict_opt_lr = {'penalty': 'l1', 'C': 0.5};
    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50};
    dict_options_nn = {
        'hidden_units':     [60, 40, 40, 20],
        'learningrate':     0.001,
        'dropout':          0.5,
        'batchnorm':        True,
        'batch_size':       64,
        'training_epochs':  1000,
        'pretrained':       None,
    }
    DIRPROJECT = '/home/thomas/projects/patrec';
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    dict_options_dataset_training = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20122015',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        None,
        'balanced':         False,
        'resample':         False
    }
    dict_options_dataset_testing = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20162017',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        None,
        'balanced':         False,
        'resample':         False
    }
    dataset_options_training_all = DatasetOptions(dict_options_dataset_training);
    dataset_options_testing_all = DatasetOptions(dict_options_dataset_testing);
    options_all_nn = OptionsNN(model_dir, dataset_options_training_all, options_clf=dict_options_nn);
    options_all_lr = OptionsLogisticRegression(dirModelsBase,
                                                dataset_options_training_all.getFilenameOptions(filteroptions=True),
                                                options_clf=dict_opt_lr)
    options_all_rf = OptionsRF(dirModelsBase, dataset_options_training_all.getFilenameOptions(filteroptions=True),
                               options_clf=dict_opt_rf);
    classifier_nn_all = ClassifierNN(options_all_nn)
    classifier_lr_all = ClassifierLogisticRegression(options_all_lr)
    classifier_rf_all = ClassifierRF(options_all_rf)

    dict_options_dataset_training = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20122015',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        'EntlassBereich_SaO',
        'balanced':         False,
        'resample':         False
    }
    dict_options_dataset_testing = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20162017',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        'EntlassBereich_SaO',
        'balanced':         False,
        'resample':         False
    }
    dataset_options_training_SaO = DatasetOptions(dict_options_dataset_training);
    dataset_options_testing_SaO = DatasetOptions(dict_options_dataset_testing);
    options_SaO_nn = OptionsNN(model_dir, dataset_options_training_SaO, options_clf=dict_options_nn);
    options_SaO_lr = OptionsLogisticRegression(dirModelsBase,
                                               dataset_options_training_SaO.getFilenameOptions(filteroptions=True),
                                               options_clf=dict_opt_lr);
    options_SaO_rf = OptionsRF(dirModelsBase,
                               dataset_options_training_SaO.getFilenameOptions(filteroptions=True),
                               options_clf=dict_opt_rf)
    classifier_nn_SaO = ClassifierNN(options_SaO_nn);
    classifier_lr_SaO = ClassifierLogisticRegression(options_SaO_lr);
    classifier_rf_SaO = ClassifierRF(options_SaO_rf);

    dict_options_dataset_training = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20122015',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        'EntlassBereich_Med',
        'balanced':         False,
        'resample':         False
    }
    dict_options_dataset_testing = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20162017',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        'EntlassBereich_Med',
        'balanced':         False,
        'resample':         False
    }
    dataset_options_training_Med = DatasetOptions(dict_options_dataset_training);
    dataset_options_testing_Med = DatasetOptions(dict_options_dataset_testing);
    options_Med_nn = OptionsNN(model_dir, dataset_options_training_Med, options_clf=dict_options_nn);
    options_Med_lr = OptionsLogisticRegression(dirModelsBase,
                                               dataset_options_training_Med.getFilenameOptions(filteroptions=True),
                                               options_clf=dict_opt_lr);
    options_Med_rf = OptionsRF(dirModelsBase, dataset_options_training_Med.getFilenameOptions(filteroptions=True),
                               options_clf=dict_opt_rf)
    classifier_nn_Med = ClassifierNN(options_Med_nn)
    classifier_lr_Med = ClassifierLogisticRegression(options_Med_lr);
    classifier_rf_Med = ClassifierRF(options_Med_rf);

    dict_options_dataset_training = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20122015',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        'EntlassBereich_Gyn',
        'balanced':         False,
        'resample':         False
    }
    dict_options_dataset_testing = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20162017',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        'EntlassBereich_Gyn',
        'balanced':         False,
        'resample':         False
    }
    dataset_options_training_Gyn = DatasetOptions(dict_options_dataset_training);
    dataset_options_testing_Gyn = DatasetOptions(dict_options_dataset_testing);
    options_Gyn_nn = OptionsNN(model_dir, dataset_options_training_Gyn, options_clf=dict_options_nn);
    options_Gyn_lr = OptionsLogisticRegression(dirModelsBase,
                                               dataset_options_training_Gyn.getFilenameOptions(filteroptions=True),
                                               options_clf=dict_opt_lr);
    options_Gyn_rf = OptionsRF(dirModelsBase, dataset_options_training_Gyn.getFilenameOptions(filteroptions=True),
                               options_clf=dict_opt_rf);
    classifier_nn_Gyn = ClassifierNN(options_Gyn_nn)
    classifier_lr_Gyn = ClassifierLogisticRegression(options_Gyn_lr)
    classifier_rf_Gyn = ClassifierRF(options_Gyn_rf)

    results_all_nn = Results(dirResultsBase, dataset_options_training_all, options_all_nn, 'test', dataset_options_testing_all);
    results_SaO_nn = Results(dirResultsBase, dataset_options_training_SaO, options_SaO_nn, 'test', dataset_options_testing_SaO);
    results_Med_nn = Results(dirResultsBase, dataset_options_training_Med, options_Med_nn, 'test', dataset_options_testing_Med);
    results_Gyn_nn = Results(dirResultsBase, dataset_options_training_Gyn, options_Gyn_nn, 'test', dataset_options_testing_Gyn);
    results_all_lr = Results(dirResultsBase, dataset_options_training_all, options_all_lr, 'test', dataset_options_testing_all);
    results_SaO_lr = Results(dirResultsBase, dataset_options_training_SaO, options_SaO_lr, 'test', dataset_options_testing_SaO);
    results_Med_lr = Results(dirResultsBase, dataset_options_training_Med, options_Med_lr, 'test', dataset_options_testing_Med);
    results_Gyn_lr = Results(dirResultsBase, dataset_options_training_Gyn, options_Gyn_lr, 'test', dataset_options_testing_Gyn);
    results_all_rf = Results(dirResultsBase, dataset_options_training_all, options_all_rf, 'test', dataset_options_testing_all);
    results_SaO_rf = Results(dirResultsBase, dataset_options_training_SaO, options_SaO_rf, 'test', dataset_options_testing_SaO);
    results_Med_rf = Results(dirResultsBase, dataset_options_training_Med, options_Med_rf, 'test', dataset_options_testing_Med);
    results_Gyn_rf = Results(dirResultsBase, dataset_options_training_Gyn, options_Gyn_rf, 'test', dataset_options_testing_Gyn);

    analyzer_all_nn = ResultsSingleConfigAnalyzer(results_all_nn, 10);
    analyzer_SaO_nn = ResultsSingleConfigAnalyzer(results_SaO_nn, 10);
    analyzer_Med_nn = ResultsSingleConfigAnalyzer(results_Med_nn, 10);
    analyzer_Gyn_nn = ResultsSingleConfigAnalyzer(results_Gyn_nn, 10);
    analyzer_all_lr = ResultsSingleConfigAnalyzer(results_all_lr, 10);
    analyzer_SaO_lr = ResultsSingleConfigAnalyzer(results_SaO_lr, 10);
    analyzer_Med_lr = ResultsSingleConfigAnalyzer(results_Med_lr, 10);
    analyzer_Gyn_lr = ResultsSingleConfigAnalyzer(results_Gyn_lr, 10);
    analyzer_all_rf = ResultsSingleConfigAnalyzer(results_all_rf, 10);
    analyzer_SaO_rf = ResultsSingleConfigAnalyzer(results_SaO_rf, 10);
    analyzer_Med_rf = ResultsSingleConfigAnalyzer(results_Med_rf, 10);
    analyzer_Gyn_rf = ResultsSingleConfigAnalyzer(results_Gyn_rf, 10);

    analyzer_nn = [analyzer_all_nn, analyzer_Med_nn, analyzer_SaO_nn, analyzer_Gyn_nn];
    analyzer_lr = [analyzer_all_lr, analyzer_Med_lr, analyzer_SaO_lr, analyzer_Gyn_lr];
    analyzer_rf = [analyzer_all_rf, analyzer_Med_rf, analyzer_SaO_rf, analyzer_Gyn_rf];
    analyzer = analyzer_nn;
    names_nn = ['NN - all', 'NN - Med', 'NN - SaO', 'NN - Gyn']
    names_lr = ['LR - all', 'LR - Med', 'LR - SaO', 'LR - Gyn']
    names_rf = ['RF - all', 'RF - Med', 'RF - SaO', 'RF - Gyn']
    names = names_nn;
    title_plot = ''
    filename_plot_nn = os.path.join(dirPlotsBase, 'oes_nn_classification_performance.png')
    filename_plot_lr = os.path.join(dirPlotsBase, 'oes_lr_classification_performance.png')
    filename_plot_rf = os.path.join(dirPlotsBase, 'oes_rf_classification_performance.png')
    filename_plot = filename_plot_nn;
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, titlePlot=title_plot, f_plot=filename_plot)
    dir_model = sys.argv[1]
    threshold_epoch = 0
    if len(sys.argv) > 2:
        threshold_epoch = int(sys.argv[2])
    dict_data_train = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20122016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_train = DatasetOptions(dict_data_train)

    diag_group_names = dataset_options_train.getDiagGroupNames()
    indices_diag_codes = getDiagCodesIndices(diag_group_names)
    main_groups = icd10_chapters.getMainGroups()

    num_colors = len(main_groups)
    colors = plt.cm.rainbow(np.linspace(0, 1, num_colors))

    num_diags = len(indices_diag_codes)

    filenames_encodings = glob.glob(dir_model + 'basic_encodings_*')
    var_encodings = []
    for l, f in enumerate(sorted(filenames_encodings)):
        print(f)
        epoch = int(f.split('/')[-1].split('.')[0].split('_')[-1])
Example no. 14
def analyze(flags_obj):
    """Run Wide-Deep training and eval loop.
    Args:
    flags_obj: An object containing parsed flag values.
    """
    dict_data_train = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20072016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_train = DatasetOptions(dict_data_train)
    dataset_options_eval = None

    if dict_data_train['data_prefix'] == 'nz':
        feature_columns = FeatureColumnsAutoEncoderNZ(
            dataset_options=dataset_options_train)
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': dataset_options_eval,
        'test': None
    }

    nn = AutoEncoderModel('analysis', dict_dataset_options, feature_columns,
                          flags_obj)
    basic_encodings = nn.analyze()

    num_colors = 26  # one colour per chapter letter (A-Z) used as scatter label
    colors = plt.cm.rainbow(np.linspace(0, 1, num_colors))

    pca = PCA(n_components=2)
    weights_2d_pca = pca.fit_transform(basic_encodings)

    tsne = TSNE(n_components=2)
    weights_2d_tsne = tsne.fit_transform(basic_encodings)

    diag_group_names = dataset_options_train.getDiagGroupNames()
    num_diags = len(diag_group_names)

    if dataset_options_train.getGroupingName() == 'verylightgrouping':
        num_subcategories = 100
    elif dataset_options_train.getGroupingName() == 'lightgrouping':
        num_subcategories = 10
    elif dataset_options_train.getGroupingName() == 'grouping':
        num_subcategories = 1
    else:
        print('grouping scheme is unknown...exit')
        sys.exit()

    plt.figure()
    for k in range(0, num_colors):
        c = colors[k]
        plt.scatter(
            weights_2d_pca[k * num_subcategories:(k * num_subcategories +
                                                  num_subcategories), 0],
            weights_2d_pca[k * num_subcategories:(k * num_subcategories +
                                                  num_subcategories), 1],
            label=string.ascii_uppercase[k],
            alpha=0.5,
            s=100,
            c=c)
    plt.legend()
    plt.title('pca')
    plt.draw()

    plt.figure()
    for k in range(0, num_colors):
        c = colors[k]
        plt.scatter(
            weights_2d_tsne[k * num_subcategories:(k * num_subcategories +
                                                   num_subcategories), 0],
            weights_2d_tsne[k * num_subcategories:(k * num_subcategories +
                                                   num_subcategories), 1],
            label=string.ascii_uppercase[k],
            alpha=0.5,
            s=100,
            c=c)
    plt.legend()
    plt.title('t-sne')
    plt.draw()

    plt.show()
    dirProject = os.path.dirname(os.path.dirname(
        os.path.abspath(__file__))) + '/'
    dirData = dirProject + 'data/'
    dirResultsBase = dirProject + 'results/'
    dirModelsBase = dirProject + 'classifiers/'

    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'nz',
        'dataset': str(2012),
        'newfeatures': {
            'names': constantsNZ.NEW_FEATURES
        },
        'featurereduction': None
    }
    options_training = DatasetOptions(dict_options_dataset_training)

    dict_opt_sgd = {
        'loss': 'log',
        'penalty': 'l1'
    }
    options_sgd = OptionsSGD(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_sgd)
    clf_sgd = ClassifierSGD(options_sgd)

    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'nz',
        'dataset': '2016',
Example no. 16
from utils.DatasetFilter import DatasetFilter
from utils.Dataset import Dataset
from utils.DatasetOptions import DatasetOptions

import helpers.constants as constants
import helpers.constantsNZ as constantsNZ

dirProject = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'
dirData = dirProject + 'data/'
dirPlotsBase = dirProject + 'plots/feature_comparison_wiederkehrer_normal/'

dict_options_analyzing = {
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    'grouping': 'verylightgrouping',
    'encoding': 'categorical',
    'newfeatures': {
        'names': constants.NEW_FEATURES
    },
    'featurereduction': None,
    'filter_options': 'chronic_lung'
}

options = DatasetOptions(dict_options_analyzing)
dataset = Dataset(options)

datafilter = DatasetFilter(options)
datafilter.filterDataDisease()
def predict(flags_obj):
    """Run Wide-Deep training and eval loop.
    Args:
    flags_obj: An object containing parsed flag values.
    """
    dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
    # dirProject = "Z:\\projects\\PATREC"
    dirResultsBase = os.path.join(dirProject, 'results/')
    dirData = os.path.join(dirProject, 'data')

    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'grouping': 'verylightgrouping',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': None,
        'filtering': 'EntlassBereich_Gyn',
        'balanced': False,
        'resample': False
    }
    dataset_options_training = DatasetOptions(dict_options_dataset_training)

    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'grouping': 'verylightgrouping',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': None,
        'filtering': 'EntlassBereich_Gyn',
        'balanced': False,
        'resample': False
    }
    dataset_options_testing = DatasetOptions(dict_options_dataset_testing)

    if dict_options_dataset_testing['data_prefix'] == 'nz':
        feature_columns = FeatureColumnsNZ(
            dataset_options=dataset_options_testing)
        # feature_columns_nz_fusion = FeatureColumnsNZFusion(dataset_options=dataset_options_testing);
        # feature_columns = feature_columns_nz_fusion;
    elif dict_options_dataset_testing['data_prefix'] == 'patrec':
        feature_columns = FeatureColumnsPatrec(
            dataset_options=dataset_options_testing)
        # feature_columns_patrec_fusion = FeatureColumnsPatrecFusion(dataset_options=dataset_options_testing);
        # feature_columns = feature_columns_patrec_fusion;
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_training,
        'eval': None,
        'test': dataset_options_testing
    }

    nn = NeuralNetModel('test', dict_dataset_options, feature_columns,
                        flags_obj)
    model_flags = nn.getFlags()

    if model_flags.model_dir.endswith('/'):
        trained_model = model_flags.model_dir.split('/')[-2]
    else:
        trained_model = model_flags.model_dir.split('/')[-1]

    if trained_model.startswith('warmstart'):
        pretrained = 'pretrained'
    else:
        pretrained = None

    print('warmstart: ' + str(trained_model.startswith('warmstart')))
    print('hidden units: ' + str(model_flags.hidden_units))
    dict_options_nn = {
        'hidden_units': model_flags.hidden_units,
        'learningrate': model_flags.learningrate,
        'dropout': model_flags.dropout,
        'batchnorm': model_flags.batchnorm,
        'batch_size': model_flags.batch_size,
        'training_epochs': model_flags.train_epochs,
        'pretrained': pretrained,
    }

    options_nn = OptionsNN(model_flags.model_dir,
                           dataset_options_training,
                           options_clf=dict_options_nn)
    classifier_nn = ClassifierNN(options_nn)
    results_all_runs_test = Results(dirResultsBase, dataset_options_training,
                                    options_nn, 'test',
                                    dataset_options_testing)

    num_runs = 10
    test_auc = []
    test_avgprecision = []
    for k in range(0, num_runs):

        results = nn.predict()
        filename_data_testing = nn.getFilenameDatasetBalanced()
        df_testing_balanced = pd.read_csv(filename_data_testing)

        predictions = [p['probabilities'] for p in results]
        predictions = np.array(predictions)

        print('get labels...: ' + str(filename_data_testing))
        labels = df_testing_balanced[
            dataset_options_testing.getEarlyReadmissionFlagname()].values
        res = classifier_nn.setResults(predictions, labels)
        results_all_runs_test.addResultsSingleRun(res)

        auc = res.getAUC()
        avgprecision = res.getAvgPrecision()
        print('')
        print('AUC: ' + str(auc))
        print('avg precision: ' + str(avgprecision))
        print('')
        test_auc.append(auc)
        test_avgprecision.append(avgprecision)

    print('')
    print('mean test auc: ' + str(np.mean(np.array(test_auc))))
    print('mean test avg precision: ' +
          str(np.mean(np.array(test_avgprecision))))
    print('')
    results_all_runs_test.writeResultsToFileDataset()
    df_all_years = pd.DataFrame()
    for year in years:
        print('year: ' + str(year))
        dict_options_dataset = {
            'dir_data': dirData,
            'data_prefix': 'nz',
            'dataset': str(year),
            'encoding': 'embedding',
            'grouping': 'verylightgrouping',
            'newfeatures': None,
            'featurereduction': {
                'method': 'FUSION'
            }
        }

        options_dataset_year = DatasetOptions(dict_options_dataset)
        dataset_year = Dataset(options_dataset_year)
        if balanced:
            df_year = dataset_year.getBalancedSubSet()
        else:
            df_year = dataset_year.getDf()

        #df_year['main_diag'] = df_year['main_diag'].apply(convertDiagToInd)
        print(df_year.shape)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
        df_all_years = pd.concat([df_all_years, df_year])

    print('df balanced all years: ' + str(df_all_years.shape))

    encoding = options_dataset_year.getEncodingScheme()
    grouping = options_dataset_year.getGroupingName()
    featureset = options_dataset_year.getFeatureSetStr()
Example no. 19
def plotDifferentClassifiers(results_analyzer, dirData, dirModelsBase, dirResultsBase):
    dict_opt_lr = {'penalty': 'l1', 'C': 0.5};
    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50};
    dict_options_nn = {
        'hidden_units': [60, 40, 40, 20],
        'learningrate': 0.001,
        'dropout': 0.5,
        'batchnorm': True,
        'batch_size': 64,
        'training_epochs': 1000,
        'pretrained': None,
    }
    DIRPROJECT = '/home/thomas/projects/patrec';
    model_dir = os.path.join(DIRPROJECT, "patients_model")

    dict_options_dataset_training = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20122015',
        'grouping':         'verylightgrouping',
        'encoding':         'categorical',
        'newfeatures':      {'names': constantsPATREC.NEW_FEATURES},
        'featurereduction': None,
        'filtering':        None,
        'balanced':         False,
        'resample':         False
    }
    dict_options_dataset_testing = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20162017',
        'grouping':         'verylightgrouping',
        'encoding':         'categorical',
        'newfeatures':      {'names': constantsPATREC.NEW_FEATURES},
        'featurereduction': None,
        'filtering':        None,
        'balanced':         False,
        'resample':         False
    }

    options_training = DatasetOptions(dict_options_dataset_training);
    options_testing = DatasetOptions(dict_options_dataset_testing);

    dict_opt_rf = {'n_estimators': 500, 'max_depth': 50};
    options_rf = OptionsRF(dirModelsBase, options_training.getFilenameOptions(filteroptions=True), options_clf=dict_opt_rf);
    results_test_rf = Results(dirResultsBase, options_training, options_rf, 'test', options_testing);

    dict_opt_lr_l2 = {'penalty': 'l2', 'C': 0.01};
    options_lr_l2 = OptionsLogisticRegression(dirModelsBase, options_training.getFilenameOptions(filteroptions=True), options_clf=dict_opt_lr_l2);
    results_test_lr_l2 = Results(dirResultsBase, options_training, options_lr_l2, 'test', options_testing);

    dict_opt_lr_l1 = {'penalty': 'l1', 'C': 0.5};
    options_lr_l1 = OptionsLogisticRegression(dirModelsBase, options_training.getFilenameOptions(filteroptions=True), options_clf=dict_opt_lr_l1);
    results_test_lr_l1 = Results(dirResultsBase, options_training, options_lr_l1, 'test', options_testing);

    dict_options_dataset_training_nn = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20122015',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        None,
        'balanced':         False,
        'resample':         False
    }
    dict_options_dataset_testing_nn = {
        'dir_data':         dirData,
        'data_prefix':      'patrec',
        'dataset':          '20162017',
        'grouping':         'verylightgrouping',
        'encoding':         'embedding',
        'newfeatures':      None,
        'featurereduction': None,
        'filtering':        None,
        'balanced':         False,
        'resample':         False
    }
    options_training = DatasetOptions(dict_options_dataset_training_nn);
    options_testing = DatasetOptions(dict_options_dataset_testing_nn);
    options_nn = OptionsNN(model_dir, options_training, options_clf=dict_options_nn);
    results_test_nn = Results(dirResultsBase, options_training, options_nn, 'test', options_testing);

    analyzer_rf = ResultsSingleConfigAnalyzer(results_test_rf, 10);
    analyzer_lr_l1 = ResultsSingleConfigAnalyzer(results_test_lr_l1, 10);
    analyzer_nn = ResultsSingleConfigAnalyzer(results_test_nn, 10)
    analyzer = [analyzer_rf, analyzer_lr_l1, analyzer_nn];
    names = ['RF', 'Logistic Regression (l1)', 'Neural Network']
    title_plot = ''
    filename_plot = os.path.join(dirPlotsBase,
                                 'different_classifiers_train_patrec_20122015_test_patrec_20162017.png')
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, f_plot=filename_plot, titlePlot=title_plot)
    dirModelsBase = os.path.join(dirProject, 'classifiers/')
    dirPlotsBase = os.path.join(dirProject, 'plots', 'learned_features')

    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'encoding': 'categorical',
        'newfeatures': {
            'names': constantsPATREC.NEW_FEATURES
        },
        'featurereduction': None,
        'grouping': 'verylightgrouping',
        'filtering': 'EntlassBereich_Gyn'
    }
    options_training = DatasetOptions(dict_options_dataset_training)

    dict_opt_rf = {
        'n_estimators': 500,
        'max_depth': 50
    }
    options_rf = OptionsRF(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_rf)

    dict_opt_lr = {
        'penalty': 'l1',
        'C': 0.5
    }
    options_lr = OptionsLogisticRegression(
Example no. 21
def plotNNPerformance(results_analyzer, dirData, dirModelsBase, dirResultsBase):

    # compare different trainings of NNs
    dict_options_dataset_training = {
        'dir_data':             dirData,
        'data_prefix':          'nz',
        'dataset':              '20122016',
        'encoding':             'embedding',
        'newfeatures':          None,
        'featurereduction':     {'method': 'FUSION'},
        'grouping':             'verylightgrouping'
    }
    dict_options_dataset_testing = {
        'dir_data':             dirData,
        'data_prefix':          'nz',
        'dataset':              '2017',
        'encoding':             'embedding',
        'newfeatures':          None,
        'featurereduction':     {'method': 'FUSION'},
        'grouping':             'verylightgrouping'
    }
    options_training_nn = DatasetOptions(dict_options_dataset_training);
    options_testing_nn = DatasetOptions(dict_options_dataset_testing);

    dict_options_nn = {
        'hidden_units':     [60, 40, 20, 10, 10],
        'learningrate':     0.05,
        'dropout':          0.25,
        'batch_size':       640,
        'training_epochs':  250,
        'pretrained':       'pretrained'
    }
    options_nn_nz = OptionsNN(dirModelsBase, options_training_nn.getFilenameOptions(filteroptions=True), options_clf=dict_options_nn)
    results_nn_nz = Results(dirResultsBase, options_training_nn, options_nn_nz, 'test', options_testing_nn);

    dict_options_dataset_training = {
        'dir_data':             dirData,
        'data_prefix':          'patrec',
        'dataset':              '20122015',
        'encoding':             'embedding',
        'newfeatures':          None,
        'featurereduction':     {'method': 'FUSION'},
        'grouping':             'verylightgrouping'
    }
    dict_options_dataset_testing = {
        'dir_data':             dirData,
        'data_prefix':          'patrec',
        'dataset':              '20162017',
        'encoding':             'embedding',
        'newfeatures':          None,
        'featurereduction':     {'method': 'FUSION'},
        'grouping':             'verylightgrouping'
    }
    options_training_nn = DatasetOptions(dict_options_dataset_training);
    options_testing_nn = DatasetOptions(dict_options_dataset_testing);

    dict_options_nn = {
        'hidden_units':     [20, 10, 10],
        'learningrate':     0.01,
        'dropout':          0.15,
        'batch_size':       80,
        'training_epochs':  500,
    }
    options_nn_patrec = OptionsNN(dirModelsBase, options_training_nn.getFilenameOptions(filteroptions=True), options_clf=dict_options_nn)
    results_nn_patrec = Results(dirResultsBase, options_training_nn, options_nn_patrec, 'test', options_testing_nn);

    dict_options_nn = {
        'hidden_units':     [20, 10, 10],
        'learningrate':     0.01,
        'dropout':          0.25,
        'batch_size':       80,
        'training_epochs':  500,
        'pretrained':       'pretrained'
    }
    options_nn_patrec_pretrained = OptionsNN(dirModelsBase, options_training_nn.getFilenameOptions(filteroptions=True), options_clf=dict_options_nn)
    results_nn_patrec_pretrained = Results(dirResultsBase, options_training_nn, options_nn_patrec_pretrained, 'test', options_testing_nn);

    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20122015',
        'subgroups': ['DK'],
        'encoding': 'categorical',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'subgroups': ['DK'],
        'encoding': 'categorical',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    }
    dict_opt_lr = {'penalty': 'l1', 'C': 0.075};
    options_training = DatasetOptions(dict_options_dataset_training);
    options_testing = DatasetOptions(dict_options_dataset_testing);
    options_lr = OptionsLogisticRegression(dirModelsBase, options_training.getFilenameOptions(filteroptions=True), options_clf=dict_opt_lr);
    results_lr = Results(dirResultsBase, options_training, options_lr, 'test', options_testing);

    analyzer_nn_nz = ResultsSingleConfigAnalyzer(results_nn_nz, 1);
    analyzer_nn_patrec = ResultsSingleConfigAnalyzer(results_nn_patrec, 1);
    analyzer_nn_patrec_pretrained = ResultsSingleConfigAnalyzer(results_nn_patrec_pretrained, 1);
    analyzer_lr = ResultsSingleConfigAnalyzer(results_lr, 10);
    analyzer = [analyzer_nn_nz, analyzer_nn_patrec, analyzer_nn_patrec_pretrained, analyzer_lr];
    names = ['NZ', 'Basel', 'Basel (pretrained NZ)', 'LASSO']
    title_plot = 'neural network performance: with and without pre-training'
    filename_plot = dirPlotsBase + 'nn_pretraining_nz_plus_lasso.png'
    results_analyzer.plotROCcurveMulitpleConfigs(analyzer, names, f_plot=filename_plot, titlePlot=title_plot)
Example no. 22
dirData = dirProject + 'data/';
dirPlotsBase = dirProject + 'plots/feature_comparison_wiederkehrer_normal/'


dict_options_analyzing = {
    'dir_data':             dirData,
    'data_prefix':          'patrec',
    'dataset':              '20122015',
    'encoding':             'categorical',
    'newfeatures':          {'names': constants.NEW_FEATURES},
    'featurereduction':     None,
    'grouping':             'verylightgrouping',
    'filtering':            'cardiovascular'
}

options = DatasetOptions(dict_options_analyzing);
dataset = Dataset(options);

if options.getOptionsFiltering() is not None:
    dirPlots = dirPlotsBase + options.getOptionsFiltering() + '/';
else:
    dirPlots = dirPlotsBase;

if not os.path.exists(dirPlots):
    os.makedirs(dirPlots);

analyzer = DataAnalyzer(options, dirPlots)
analyzer.doFeatureComparison()
# analyzer.checkWiederkehrer();

# avg_num_subgrp = analyzer.getAvgNumberSubgroup('DK')