def main(dict_dataset_options):
    """Run the currently enabled NZ preprocessing stages.

    Wraps ``dict_dataset_options`` in a ``DatasetOptions``, hands it to a
    ``PreprocessorNZ``, reports the configured grouping name on stdout and
    then runs feature encoding followed by fusion.

    Args:
        dict_dataset_options: Options passed unchanged to ``DatasetOptions``.
    """
    dataset_options = DatasetOptions(dict_dataset_options)
    preprocessor = PreprocessorNZ(dataset_options)

    grouping_name = dataset_options.getGroupingName()
    print('grouping: {!s}'.format(grouping_name))

    # Earlier pipeline stages, currently switched off:
    # preprocessor.processDischargeFile()
    # preprocessor.processDiagnosisFile()
    # preprocessor.createFeatureSet()
    preprocessor.encodeFeatures()
    preprocessor.fuse()
def _scatter_by_letter_group(points_2d, colors, group_size, title):
    """Open a new figure and scatter ``points_2d`` one letter group at a time.

    Rows ``k * group_size`` up to (excluding) ``(k + 1) * group_size`` form
    group ``k``; it is drawn in ``colors[k]`` and labelled with the k-th
    uppercase ASCII letter.

    Args:
        points_2d: 2-D array whose first two columns are the x/y coordinates.
        colors: Sequence of RGBA rows, one per letter group.
        group_size: Number of consecutive rows belonging to each group.
        title: Title of the figure.
    """
    plt.figure()
    for k, rgba in enumerate(colors):
        start = k * group_size
        stop = start + group_size
        plt.scatter(
            points_2d[start:stop, 0],
            points_2d[start:stop, 1],
            label=string.ascii_uppercase[k],
            alpha=0.5,
            s=100,
            # ``color=`` instead of ``c=``: a bare RGBA row given as ``c`` is
            # interpreted as four per-point values whenever the slice happens
            # to contain exactly four points (and triggers a warning
            # otherwise).
            color=tuple(rgba))
    plt.legend()
    plt.title(title)
    plt.draw()


def analyze(flags_obj):
    """Plot 2-D PCA and t-SNE projections of the auto-encoder encodings.

    Builds the training ``DatasetOptions`` for the NZ 2007-2016 data set,
    obtains the encodings from ``AutoEncoderModel.analyze()``, projects them
    to two dimensions with PCA and with t-SNE, and shows one scatter figure
    per projection, coloured by letter group (A-Z).

    Args:
        flags_obj: An object containing parsed flag values; it is forwarded
            to ``AutoEncoderModel``.

    Raises:
        SystemExit: With status 1 if the data prefix or the grouping scheme
            is not supported.
    """
    dict_data_train = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20072016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_train = DatasetOptions(dict_data_train)
    dataset_options_eval = None

    if dict_data_train['data_prefix'] == 'nz':
        feature_columns = FeatureColumnsAutoEncoderNZ(
            dataset_options=dataset_options_train)
    else:
        print('unknown data prefix..exit')
        sys.exit(1)

    # Number of consecutive encoding rows per letter group, by grouping
    # scheme. Checked before the model runs so that an unsupported grouping
    # fails immediately instead of after the expensive analysis.
    subcategories_by_grouping = {
        'verylightgrouping': 100,
        'lightgrouping': 10,
        'grouping': 1,
    }
    grouping_name = dataset_options_train.getGroupingName()
    if grouping_name not in subcategories_by_grouping:
        print('grouping scheme is unknown...exit')
        sys.exit(1)
    num_subcategories = subcategories_by_grouping[grouping_name]

    dict_dataset_options = {
        'train': dataset_options_train,
        'eval': dataset_options_eval,
        'test': None
    }
    nn = AutoEncoderModel('analysis', dict_dataset_options, feature_columns,
                          flags_obj)
    # NOTE(review): presumably one row per diagnosis code, ordered so that
    # each letter group occupies ``num_subcategories`` consecutive rows --
    # confirm against AutoEncoderModel.analyze().
    basic_encodings = nn.analyze()

    # One colour per uppercase letter label.
    num_colors = len(string.ascii_uppercase)
    colors = plt.cm.rainbow(np.linspace(0, 1, num_colors))

    weights_2d_pca = PCA(n_components=2).fit_transform(basic_encodings)
    weights_2d_tsne = TSNE(n_components=2).fit_transform(basic_encodings)

    _scatter_by_letter_group(weights_2d_pca, colors, num_subcategories, 'pca')
    _scatter_by_letter_group(weights_2d_tsne, colors, num_subcategories,
                             't-sne')
    plt.show()
'data_prefix': 'nz', 'dataset': str(year), 'encoding': 'embedding', 'grouping': 'verylightgrouping', 'newfeatures': None, 'featurereduction': { 'method': 'FUSION' } } options_dataset_year = DatasetOptions(dict_options_dataset) dataset_year = Dataset(options_dataset_year) if balanced: df_year = dataset_year.getBalancedSubSet() else: df_year = dataset_year.getDf() #df_year['main_diag'] = df_year['main_diag'].apply(convertDiagToInd) print(df_year.shape) df_all_years = df_all_years.append(df_year) print('df balanced all years: ' + str(df_all_years.shape)) encoding = options_dataset_year.getEncodingScheme() grouping = options_dataset_year.getGroupingName() featureset = options_dataset_year.getFeatureSetStr() filename_data_years = dirData + 'data_nz_' + str(min(years)) + str( max(years) ) + '_' + featureset + '_' + encoding + '_' + grouping + '.csv' df_all_years.to_csv(filename_data_years, line_terminator='\n', index=False)