# directory where the data will be saved
wd = './synthetic_eICU_datasets'
if not os.path.isdir(wd):
    os.mkdir(wd)

# one identifier per repetition of the experiment; range(0, 1) means a
# single run — widen the range to repeat it
identifiers = ['eICU_cdgan_synthetic_dataset_r' + str(i) for i in range(0, 1)]

for identifier in identifiers:
    # start every run from a clean TensorFlow graph
    tf.reset_default_graph()

    print("loading data...")
    samples, labels = data_utils.eICU_task()
    # each eICU sample is reshaped to 16 time steps x 4 variables
    train_seqs = samples['train'].reshape(-1, 16, 4)
    vali_seqs = samples['vali'].reshape(-1, 16, 4)
    test_seqs = samples['test'].reshape(-1, 16, 4)
    train_targets = labels['train']
    vali_targets = labels['vali']
    test_targets = labels['test']
    train_seqs, vali_seqs, test_seqs = data_utils.normalise_data(
        train_seqs, vali_seqs, test_seqs)
    print("data loaded.")

    # training config
    lr = 0.1
    batch_size = 28
    num_epochs = 1005
def model_memorisation(identifier, epoch, max_samples=2000, tstr=False):
    """
    Compare samples from a model against training and test data using a
    relative MMD three-sample test.

    Parameters
    ----------
    identifier : str
        Experiment identifier. The special values 'cristobal_eICU' and
        'cristobal_MNIST' load pre-generated samples from hard-coded
        (redacted) pickle paths; any other identifier loads settings/data
        from ./experiments/ and samples from the trained model.
    epoch : int
        Training epoch whose samples / checkpoint to evaluate.
    max_samples : int
        Upper bound on the number of test (and model) samples compared.
    tstr : bool
        If True, load pre-generated samples from the TSTR experiment
        instead of sampling from the model.

    Returns
    -------
    (pvalue, tstat, sigma) as produced by MMD_3_Sample_Test.
    """
    if tstr:
        print('Loading data from TSTR experiment (not sampling from model)')
        # load pre-generated samples from the TSTR experiment
        # NOTE(review): .item() implies these .npy files hold 0-d object
        # (dict) arrays — confirm; newer numpy also needs allow_pickle=True.
        synth_data = np.load('./experiments/tstr/' + identifier + '_' +
                             str(epoch) + '.data.npy').item()
        model_samples = synth_data['samples']
        # load the real data used in that experiment
        real_data = np.load('./experiments/data/' + identifier +
                            '.data.npy').item()
        real_samples = real_data['samples']
        train = real_samples['train']
        test = real_samples['test']
        n_samples = test.shape[0]
        if model_samples.shape[0] > n_samples:
            model_samples = np.random.permutation(model_samples)[:n_samples]
        print('Data loaded successfully!')
    else:
        if identifier == 'cristobal_eICU':
            # SECURITY: pickle.load executes arbitrary code — only ever
            # point this at trusted files.  'with' closes the handle
            # (the original leaked it).
            with open('REDACTED', 'rb') as f:
                model_samples = pickle.load(f)
            samples, labels = data_utils.eICU_task()
            train = samples['train'].reshape(-1, 16, 4)
            vali = samples['vali'].reshape(-1, 16, 4)
            test = samples['test'].reshape(-1, 16, 4)
            train, vali, test = data_utils.scale_data(train, vali, test)
            # cap the comparison at max_samples test sequences
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        elif identifier == 'cristobal_MNIST':
            the_dir = 'REDACTED'
            # pick a random one of the pre-generated sample sets
            which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7'])
            # SECURITY: pickle.load executes arbitrary code — trusted
            # files only.  'with' closes the handle (original leaked it).
            with open(
                    the_dir +
                    'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_'
                    + which + '_190.pk', 'rb') as f:
                model_samples, model_labels = pickle.load(f)
            # rebuild the test/train split used at generation time
            # (generated with fixed seed)
            mnist_resized_dim = 14
            samples, labels = data_utils.load_resized_mnist(mnist_resized_dim)
            proportions = [0.6, 0.2, 0.2]
            train, vali, test, labels_split = data_utils.split(
                samples, labels=labels, random_seed=1,
                proportions=proportions)
            # re-randomise the global RNG after the fixed-seed split
            np.random.seed()
            train = train.reshape(-1, 14, 14)
            test = test.reshape(-1, 14, 14)
            vali = vali.reshape(-1, 14, 14)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        else:
            # generic path: read experiment settings, load the saved data
            # split, and sample from the trained model at this epoch
            with open('./experiments/settings/' + identifier + '.txt',
                      'r') as f:
                settings = json.load(f)
            data = np.load('./experiments/data/' + identifier +
                           '.data.npy').item()
            train = data['samples']['train']
            test = data['samples']['test']
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            model_samples = model.sample_trained_model(
                settings, epoch, n_samples)
    # bandwidth heuristic: median pairwise distance over everything compared
    all_samples = np.vstack([train, test, model_samples])
    heuristic_sigma = mmd.median_pairwise_distance(all_samples)
    print('heuristic sigma:', heuristic_sigma)
    # MMD_3_Sample_Test(X, Y, Z) tests H0: MMD(X, Y) <= MMD(X, Z), i.e.
    # whether the model samples are closer (in MMD) to the test set than
    # to the training set; rejecting at pvalue < 0.05 suggests the test
    # data has a smaller MMD with the true data than the generated data.
    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(
        model_samples, test,
        np.random.permutation(train)[:n_samples],
        sigma=heuristic_sigma, computeMMDs=False)
    return pvalue, tstat, sigma