# Common imports for the examples below. DIRPROJECT (the data root path) and the
# project-specific classes (DatasetOptions, PreprocessorNZ, Dataset,
# FeatureColumnsAutoEncoderNZ, AutoEncoderModel) are provided by the surrounding
# project and are not shown in this listing.
import sys
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def main(dict_dataset_options):

    options = DatasetOptions(dict_dataset_options)
    preproc = PreprocessorNZ(options)

    print('grouping: ' + str(options.getGroupingName()))
    # preproc.processDischargeFile();
    # preproc.processDiagnosisFile();

    # preproc.createFeatureSet();
    preproc.encodeFeatures()
    preproc.fuse()
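
# Minimal invocation sketch for the example above (an assumption, not taken from
# the source): the option keys mirror the dict built in the `analyze` example
# below, but the exact keys DatasetOptions/PreprocessorNZ expect may differ.
if __name__ == '__main__':
    main({
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20072016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {'method': 'FUSION'},
        'grouping': 'verylightgrouping'
    })
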
Example #2
def analyze(flags_obj):
    """Analyze auto-encoder encodings and visualize them with PCA and t-SNE.

    Args:
        flags_obj: An object containing parsed flag values.
    """
    dict_data_train = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20072016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_train = DatasetOptions(dict_data_train)
    dataset_options_eval = None

    if dict_data_train['data_prefix'] == 'nz':
        feature_columns = FeatureColumnsAutoEncoderNZ(
            dataset_options=dataset_options_train)
    else:
        print('unknown data prefix..exit')
        sys.exit()

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': dataset_options_eval,
        'test': None
    }

    nn = AutoEncoderModel('analysis', dict_dataset_options, feature_columns,
                          flags_obj)
    basic_encodings = nn.analyze()
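
    # basic_encodings is assumed to be a 2-D array with one row per diagnosis code
    # (num_diags x embedding_dim); PCA and t-SNE below project it to two dimensions
    # so the codes can be scattered and colour-coded by their top-level letter.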

    num_colors = 26
    colors = plt.cm.rainbow(np.linspace(0, 1, num_colors))

    pca = PCA(n_components=2)
    weights_2d_pca = pca.fit_transform(basic_encodings)

    tsne = TSNE(n_components=2)
    weights_2d_tsne = tsne.fit_transform(basic_encodings)

    diag_group_names = dataset_options_train.getDiagGroupNames()
    num_diags = len(diag_group_names)

    if dataset_options_train.getGroupingName() == 'verylightgrouping':
        num_subcategories = 100
    elif dataset_options_train.getGroupingName() == 'lightgrouping':
        num_subcategories = 10
    elif dataset_options_train.getGroupingName() == 'grouping':
        num_subcategories = 1
    else:
        print('grouping scheme is unknown...exit')
        sys.exit()
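
    # Assumption about row ordering: with 'verylightgrouping' each of the 26
    # top-level letter categories expands into 100 sub-codes, so the plots below
    # slice the 2-D projections in contiguous blocks of num_subcategories rows,
    # one block (and one colour) per letter A-Z.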

    plt.figure()
    for k in range(0, num_colors):
        c = colors[k]
        plt.scatter(
            weights_2d_pca[k * num_subcategories:(k + 1) * num_subcategories, 0],
            weights_2d_pca[k * num_subcategories:(k + 1) * num_subcategories, 1],
            label=string.ascii_uppercase[k],
            alpha=0.5,
            s=100,
            c=c)
    plt.legend()
    plt.title('pca')
    plt.draw()

    plt.figure()
    for k in range(0, num_colors):
        c = colors[k]
        plt.scatter(
            weights_2d_tsne[k * num_subcategories:(k + 1) * num_subcategories, 0],
            weights_2d_tsne[k * num_subcategories:(k + 1) * num_subcategories, 1],
            label=string.ascii_uppercase[k],
            alpha=0.5,
            s=100,
            c=c)
    plt.legend()
    plt.title('t-sne')
    plt.draw()

    plt.show()

Example #3
def collectDataOverYears(years, dirData, balanced=False):
    # The opening of this example is truncated in the source listing; the header,
    # the function signature, and the 'dir_data' entry are reconstructed assumptions.
    df_all_years = pd.DataFrame()
    for year in years:
        dict_options_dataset = {
            'dir_data': dirData,
            'data_prefix': 'nz',
            'dataset': str(year),
            'encoding': 'embedding',
            'grouping': 'verylightgrouping',
            'newfeatures': None,
            'featurereduction': {
                'method': 'FUSION'
            }
        }

        options_dataset_year = DatasetOptions(dict_options_dataset)
        dataset_year = Dataset(options_dataset_year)
        if balanced:
            df_year = dataset_year.getBalancedSubSet()
        else:
            df_year = dataset_year.getDf()

        #df_year['main_diag'] = df_year['main_diag'].apply(convertDiagToInd)
        print(df_year.shape)
        df_all_years = pd.concat([df_all_years, df_year])  # accumulate per-year frames

    print('df balanced all years: ' + str(df_all_years.shape))

    encoding = options_dataset_year.getEncodingScheme()
    grouping = options_dataset_year.getGroupingName()
    featureset = options_dataset_year.getFeatureSetStr()
    filename_data_years = (dirData + 'data_nz_' + str(min(years)) + str(max(years)) +
                           '_' + featureset + '_' + encoding + '_' + grouping + '.csv')
    df_all_years.to_csv(filename_data_years, line_terminator='\n', index=False)
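
# Usage sketch (assumption, not taken from the source): reloading the combined
# CSV written above for a later experiment.
def load_combined_years(filename_data_years):
    # filename_data_years is the path produced by the example above
    return pd.read_csv(filename_data_years)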