# --- per-identifier experiment setup -----------------------------------------
# NOTE(review): this loop body relies on `identifiers`, `tf` (TensorFlow 1.x
# global-graph API) and `data_utils` being defined earlier in the file, and the
# names it assigns (train_seqs, seq_length, ...) are presumably consumed by
# training code further down -- confirm against the full file.
for identifier in identifiers:
    # reset tensorflow graph
    # (TF1.x: each identifier's run starts from a clean default graph)
    tf.reset_default_graph()
    print ("loading data...")
    # eICU task data: dicts of samples/labels keyed by 'train'/'vali'/'test';
    # sequences reshaped to (n_examples, 16 timesteps, 4 variables)
    samples, labels = data_utils.eICU_task()
    train_seqs = samples['train'].reshape(-1, 16, 4)
    vali_seqs = samples['vali'].reshape(-1, 16, 4)
    test_seqs = samples['test'].reshape(-1, 16, 4)
    train_targets = labels['train']
    vali_targets = labels['vali']
    test_targets = labels['test']
    # scale all three splits together (scaling scheme lives in
    # data_utils.scale_data -- presumably fit on train/vali; TODO confirm)
    train_seqs, vali_seqs, test_seqs = data_utils.scale_data(
        train_seqs, vali_seqs, test_seqs)
    print ("data loaded.")
    # training config
    lr = 0.1
    batch_size = 28
    num_epochs = 1005
    D_rounds = 1  # number of rounds of discriminator training
    G_rounds = 3  # number of rounds of generator training
    use_time = False  # use one latent dimension as time
    print(identifier)
    # sequence geometry is read off the reshaped training data
    seq_length = train_seqs.shape[1]               # 16 timesteps
    num_generated_features = train_seqs.shape[2]   # 4 variables
def model_memorisation(identifier, epoch, max_samples=2000, tstr=False):
    """
    Check whether a model has memorised its training data, via an MMD
    three-sample test between model samples, the test set and the train set.

    Args:
        identifier:  experiment identifier; special-cased for
                     'cristobal_eICU' and 'cristobal_MNIST' (externally
                     generated samples), otherwise samples are drawn from a
                     trained model via `model.sample_trained_model`.
        epoch:       training epoch whose samples/checkpoint to evaluate.
        max_samples: cap on the number of test/model samples compared
                     (non-tstr branches only).
        tstr:        if True, load pre-generated samples from a TSTR
                     experiment instead of sampling from the model.

    Returns:
        (pvalue, tstat, sigma) from `MMD_3_Sample_Test`.

    NOTE(review): despite the original docstring, the comparison is model vs
    *test* vs *train* -- the validation set is loaded but unused here.
    Relies on module-level `np`, `pickle`, `json`, `data_utils`, `model`,
    `mmd` and `MMD_3_Sample_Test` (imported elsewhere in the file).
    """
    if tstr:
        print('Loading data from TSTR experiment (not sampling from model)')
        # load pre-generated samples
        # NOTE(review): np.load(...).item() on a pickled dict requires
        # allow_pickle=True on numpy >= 1.16.3 -- assumes an older numpy.
        synth_data = np.load('./experiments/tstr/' + identifier + '_' +
                             str(epoch) + '.data.npy').item()
        model_samples = synth_data['samples']
        synth_labels = synth_data['labels']
        # load real data used in that experiment
        real_data = np.load('./experiments/data/' + identifier +
                            '.data.npy').item()
        real_samples = real_data['samples']
        train = real_samples['train']
        test = real_samples['test']
        n_samples = test.shape[0]
        # subsample model samples down to the test-set size (no max_samples
        # cap in this branch, unlike the branches below)
        if model_samples.shape[0] > n_samples:
            model_samples = np.random.permutation(model_samples)[:n_samples]
        print('Data loaded successfully!')
    else:
        if identifier == 'cristobal_eICU':
            # externally generated eICU samples (path redacted)
            model_samples = pickle.load(open('REDACTED', 'rb'))
            samples, labels = data_utils.eICU_task()
            train = samples['train'].reshape(-1, 16, 4)
            vali = samples['vali'].reshape(-1, 16, 4)
            test = samples['test'].reshape(-1, 16, 4)
            #train_targets = labels['train']
            #vali_targets = labels['vali']
            #test_targets = labels['test']
            train, vali, test = data_utils.scale_data(train, vali, test)
            n_samples = test.shape[0]
            # cap comparison size at max_samples
            # NOTE(review): indentation reconstructed -- the permutation of
            # `test` is taken to be inside the cap branch (only subsample when
            # over the cap); confirm against the original file.
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        elif identifier == 'cristobal_MNIST':
            the_dir = 'REDACTED'
            # pick a random one
            which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7'])
            model_samples, model_labels = pickle.load(
                open(
                    the_dir +
                    'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_'
                    + which + '_190.pk', 'rb'))
            # get test and train...
            # (generated with fixed seed...)
            mnist_resized_dim = 14
            samples, labels = data_utils.load_resized_mnist(mnist_resized_dim)
            proportions = [0.6, 0.2, 0.2]
            # random_seed=1 reproduces the exact split the synthetic samples
            # were generated against
            train, vali, test, labels_split = data_utils.split(
                samples, labels=labels, random_seed=1,
                proportions=proportions)
            # re-randomise the global RNG after the seeded split
            np.random.seed()
            train = train.reshape(-1, 14, 14)
            test = test.reshape(-1, 14, 14)
            vali = vali.reshape(-1, 14, 14)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        else:
            # generic path: sample from the trained model for this identifier
            settings = json.load(
                open('./experiments/settings/' + identifier + '.txt', 'r'))
            # get the test, train sets
            data = np.load('./experiments/data/' + identifier +
                           '.data.npy').item()
            train = data['samples']['train']
            test = data['samples']['test']
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            model_samples = model.sample_trained_model(settings, epoch,
                                                       n_samples)
    # kernel bandwidth heuristic computed over ALL data (train+test+model)
    all_samples = np.vstack([train, test, model_samples])
    heuristic_sigma = mmd.median_pairwise_distance(all_samples)
    print('heuristic sigma:', heuristic_sigma)
    # the function takes (X, Y, Z) as its first arguments; it tests whether
    # MMD(X, Y) <= MMD(X, Z), i.e. here whether the model samples are closer
    # to the test set than to (a subsample of) the train set
    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(
        model_samples, test, np.random.permutation(train)[:n_samples],
        sigma=heuristic_sigma, computeMMDs=False)
    #pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(model_samples, np.random.permutation(train)[:n_samples], test, sigma=heuristic_sigma, computeMMDs=False)
    # if pvalue < 0.05:
    #     print('At confidence level 0.05, we reject the null hypothesis that MMDXY <= MMDXZ, and conclude that the test data has a smaller MMD with the true data than the generated data')
    # else:
    #     print('We have failed to reject the null hypothesis that MMDXY <= MMDXZ, and cannot conclude that the test data has a smaller MMD with the true data than the generated data')
    return pvalue, tstat, sigma
def view_marginals_cristobal(rep=0, epoch=300, zoom=False):
    """
    View marginals of the synthetic data (compare to real data), from the
    data Cristobal generated.

    Produces two figures saved under ./experiments/eval/:
      - a (16 timesteps x 25 gradations) heat-map grid per variable,
        synthetic vs real, and
      - per-variable histograms overlaying real and synthetic values.

    Args:
        rep:   repetition index used in the synthetic-data file names.
        epoch: epoch used in the synthetic-data file names.
        zoom:  if True, restrict value ranges to hand-crafted
               physiologically-plausible windows.

    Returns:
        True on completion (plots are saved as a side effect).

    NOTE(review): relies on module-level `np`, `plt`, `paths`, `scale_data`
    and `view_marginals_raw`. `scale_data` is called unqualified here while
    other code in this file uses `data_utils.scale_data` -- confirm the
    import. The deprecated matplotlib kwargs below (`normed=`, 'box-forced',
    `grid(b=...)`) assume matplotlib < 3.1.
    """
    samples_path = paths.eICU_synthetic_dir + \
        'samples_eICU_cdgan_synthetic_dataset_r' + str(rep) + '_' + \
        str(epoch) + '.pk'
    # NOTE(review): .pk files loaded via np.load -- presumably pickled arrays;
    # requires allow_pickle-permissive (old) numpy. Confirm.
    samples = np.load(samples_path)
    labels_path = paths.eICU_synthetic_dir + \
        'labels_eICU_cdgan_synthetic_dataset_r' + str(rep) + '_' + \
        str(epoch) + '.pk'
    # NOTE(review): `labels` is never used below (and the name is later
    # shadowed by tick labels).
    labels = np.load(labels_path)
    real_path = paths.eICU_task_data
    raw_real_train = np.load(real_path).item()['X_train'].reshape(-1, 16, 4)
    real_test = np.load(real_path).item()['X_test'].reshape(-1, 16, 4)
    real_vali = np.load(real_path).item()['X_vali'].reshape(-1, 16, 4)
    # discard vali, test
    real, scaled_vali, scaled_test = scale_data(raw_real_train, real_vali,
                                                real_test)
    # NOTE(review): the scaled `real` is immediately overwritten with the raw
    # training data, so the scale_data result is unused -- appears deliberate
    # (the synthetic samples are un-scaled manually below instead), but verify.
    real = raw_real_train
    view_marginals_raw(raw_real_train, label='raw_real_train')
    view_marginals_raw(real, label='real_train')
    view_marginals_raw(samples, label='synthetic')
    variables = ['sao2', 'heartrate', 'respiration', 'systemicmean']
    # get the scaling factors
    # per-(timestep, variable) affine factors mapping [-1, 1] back to the raw
    # value range: raw = scaled * a + b, with a = (max-min)/2, b = (max+min)/2
    scaling_factors = {
        'a': np.zeros(shape=(16, 4)),
        'b': np.zeros(shape=(16, 4))
    }
    ranges = []
    for var in range(4):
        # running min/max across timesteps for this variable
        # NOTE(review): initialisers 100/0 assume raw values lie in [0, 100]ish;
        # values outside that would be missed -- confirm data range.
        var_min = 100
        var_max = 0
        for timestep in range(16):
            # min/max over train and vali (test excluded), matching how the
            # synthetic data was presumably scaled
            min_val = np.min([
                np.min(raw_real_train[:, timestep, var]),
                np.min(real_vali[:, timestep, var])
            ])
            max_val = np.max([
                np.max(raw_real_train[:, timestep, var]),
                np.max(real_vali[:, timestep, var])
            ])
            if min_val < var_min:
                var_min = min_val
            if max_val > var_max:
                var_max = max_val
            a = (max_val - min_val) / 2
            b = (max_val + min_val) / 2
            scaling_factors['a'][timestep, var] = a
            scaling_factors['b'][timestep, var] = b
        ranges.append([var_min, var_max])
    # now, scale the synthetic data manually
    samples_scaled = np.zeros_like(samples)
    for var in range(4):
        for timestep in range(16):
            samples_scaled[:, timestep, var] = samples[:, timestep, var] * \
                scaling_factors['a'][timestep, var] + \
                scaling_factors['b'][timestep, var]
    if zoom:
        # use modes, skip for now
        modes = False
        if modes:
            # (dead branch while modes is hard-coded False)
            # get rough region of interest, then zoom in on it afterwards!
            num_gradations = 5
            gradations = np.linspace(-1, 1, num_gradations)
            # for cutoff in the gradations, what fraction of samples (at a
            # given time point) fall into that cutoff bracket?
            lower = 0
            real_grid = np.zeros(shape=(16, num_gradations, 4))
            for (i, cutoff) in enumerate(gradations):
                # take the mean over samples
                real_frac = ((real > lower) & (real <= cutoff)).mean(axis=0)
                lower = cutoff
                real_grid[:, i, :] = real_frac
            time_averaged_grid = np.mean(real_grid, axis=0)
            # get the most populated part of the grid for each variable
            grid_modes = np.argmax(time_averaged_grid, axis=0)
            lower = 0
            ranges = []
            for i in grid_modes:
                # NOTE(review): if i == 0 this wraps to gradations[-1] and
                # yields an inverted range -- latent bug, currently unreachable.
                lower = gradations[i - 1]
                upper = gradations[i]
                ranges.append([lower, upper])
        else:
            # hand-crafted ranges (raw units, per variable in `variables`)
            ranges = [[88, 100], [30, 130], [7, 60], [35, 135]]
    num_gradations = 25
    # for cutoff in the gradations, what fraction of samples (at a given time
    # point) fall into that cutoff bracket?
    grid = np.zeros(shape=(16, num_gradations, 4))
    real_grid = np.zeros(shape=(16, num_gradations, 4))
    assert samples.shape[-1] == 4
    for var in range(4):
        # allow for a different range per variable (if zoom)
        low = ranges[var][0]
        high = ranges[var][1]
        gradations = np.linspace(low, high, num_gradations)
        for (i, cutoff) in enumerate(gradations):
            # take the mean over samples; `low` walks up to the previous
            # cutoff so each bracket is (previous, current]
            frac = ((samples_scaled[:, :, var] > low) &
                    (samples_scaled[:, :, var] <= cutoff)).mean(axis=0)
            real_frac = ((real[:, :, var] > low) &
                         (real[:, :, var] <= cutoff)).mean(axis=0)
            low = cutoff
            grid[:, i, var] = frac
            real_grid[:, i, var] = real_frac
    # now plot this as an image
    # left column = synthetic, right column = real; one row per variable
    fig, axarr = plt.subplots(nrows=4, ncols=2, sharey='row', sharex=True)
    axarr[0, 0].imshow(grid[:, :, 0].T, origin='lower', aspect=0.5,
                       cmap='magma_r')
    axarr[1, 0].imshow(grid[:, :, 1].T, origin='lower', aspect=0.5,
                       cmap='magma_r')
    axarr[2, 0].imshow(grid[:, :, 2].T, origin='lower', aspect=0.5,
                       cmap='magma_r')
    axarr[3, 0].imshow(grid[:, :, 3].T, origin='lower', aspect=0.5,
                       cmap='magma_r')
    axarr[0, 1].imshow(real_grid[:, :, 0].T, origin='lower', aspect=0.5,
                       cmap='magma_r')
    axarr[1, 1].imshow(real_grid[:, :, 1].T, origin='lower', aspect=0.5,
                       cmap='magma_r')
    axarr[2, 1].imshow(real_grid[:, :, 2].T, origin='lower', aspect=0.5,
                       cmap='magma_r')
    axarr[3, 1].imshow(real_grid[:, :, 3].T, origin='lower', aspect=0.5,
                       cmap='magma_r')
    axarr[0, 0].set_title("synthetic")
    axarr[0, 1].set_title("real")
    for var in range(4):
        low, high = ranges[var]
        # label every 4th gradation with its (rounded) raw value
        # NOTE(review): set_yticklabels is called before set_yticks -- on
        # modern matplotlib that pairing is fragile; works on the old version
        # this code targets.
        labels = np.linspace(low, high, num_gradations)[1::4]
        labels = np.round(labels, 0)
        axarr[var, 0].set_yticklabels(labels)
        axarr[var, 0].set_yticks(np.arange(num_gradations)[1::4])
        axarr[var, 0].set_ylabel(variables[var])
        for ax in axarr[var, :]:
            ax.yaxis.set_ticks_position('none')
            ax.xaxis.set_ticks_position('none')
            # 'box-forced' was removed in matplotlib 3.1
            ax.set_adjustable('box-forced')
            ax.spines['top'].set_visible(False)
            ax.spines['bottom'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.spines['left'].set_visible(False)
            # `b=` kwarg later renamed `visible=` in matplotlib
            ax.grid(b=True, color='black', alpha=0.2, linestyle='--')
    axarr[-1, 0].set_xticks(np.arange(16)[::2])
    axarr[-1, 1].set_xticks(np.arange(16)[::2])
    if zoom:
        plt.suptitle('(zoomed)')
    plt.tight_layout(pad=0.0, w_pad=-5.0, h_pad=0.1)
    plt.savefig("./experiments/eval/eICU_cristobal_marginals_r" + str(rep) +
                "_epoch" + str(epoch) + ".png")
    # now make the histograms
    # (real in black, synthetic overlaid; `normed=` was removed in
    # matplotlib 3.1 in favour of `density=`)
    fig, axarr = plt.subplots(nrows=1, ncols=4)
    axarr[0].set_ylabel("density")
    axarr[0].hist(real[:, :, 0].flatten(),
                  normed=True,
                  color='black',
                  alpha=0.8,
                  range=ranges[0],
                  bins=min(50, (ranges[0][1] - ranges[0][0])),
                  label='real')
    axarr[1].hist(real[:, :, 1].flatten(),
                  normed=True,
                  color='black',
                  alpha=0.8,
                  range=ranges[1],
                  bins=50)
    axarr[2].hist(real[:, :, 2].flatten(),
                  normed=True,
                  color='black',
                  alpha=0.8,
                  range=ranges[2],
                  bins=50)
    axarr[3].hist(real[:, :, 3].flatten(),
                  normed=True,
                  color='black',
                  alpha=0.8,
                  range=ranges[3],
                  bins=50)
    axarr[0].hist(samples_scaled[:, :, 0].flatten(),
                  normed=True,
                  alpha=0.6,
                  range=ranges[0],
                  bins=min(50, (ranges[0][1] - ranges[0][0])),
                  label='synthetic')
    axarr[0].legend()
    axarr[1].hist(samples_scaled[:, :, 1].flatten(),
                  normed=True,
                  alpha=0.6,
                  range=ranges[1],
                  bins=50)
    axarr[2].hist(samples_scaled[:, :, 2].flatten(),
                  normed=True,
                  alpha=0.6,
                  range=ranges[2],
                  bins=50)
    axarr[3].hist(samples_scaled[:, :, 3].flatten(),
                  normed=True,
                  alpha=0.6,
                  range=ranges[3],
                  bins=50)
    for (var, ax) in enumerate(axarr):
        ax.set_xlabel(variables[var])
        ax.yaxis.set_ticks_position('none')
        ax.xaxis.set_ticks_position('none')
        ax.spines['top'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.grid(b=True, color='black', alpha=0.2, linestyle='--')
    plt.gcf().subplots_adjust(bottom=0.2)
    fig.set_size_inches(10, 3)
    plt.savefig("./experiments/eval/eICU_cristobal_hist_r" + str(rep) +
                "_epoch" + str(epoch) + ".png")
    return True