import tensorflow as tf  # TF1-style API (reset_default_graph)

import data_utils

# `identifiers` is assumed to be defined earlier in the script
for identifier in identifiers:

    # reset tensorflow graph
    tf.reset_default_graph()

    print("loading data...")

    samples, labels = data_utils.eICU_task()
    train_seqs = samples['train'].reshape(-1, 16, 4)
    vali_seqs = samples['vali'].reshape(-1, 16, 4)
    test_seqs = samples['test'].reshape(-1, 16, 4)
    train_targets = labels['train']
    vali_targets = labels['vali']
    test_targets = labels['test']
    train_seqs, vali_seqs, test_seqs = data_utils.scale_data(train_seqs, vali_seqs, test_seqs)

    print("data loaded.")

    # training config
    lr = 0.1
    batch_size = 28
    num_epochs = 1005
    D_rounds = 1    # number of rounds of discriminator training
    G_rounds = 3    # number of rounds of generator training
    use_time = False    # use one latent dimension as time

    print(identifier)

    seq_length = train_seqs.shape[1]
    num_generated_features = train_seqs.shape[2]
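
# D_rounds and G_rounds set the ratio of discriminator to generator updates.
# Below is a minimal, self-contained sketch of the alternating schedule they
# presumably drive; the stub counters stand in for the real TF update ops,
# which are not part of this fragment.

def sketch_alternating_training(num_batches, D_rounds, G_rounds):
    d_updates, g_updates = 0, 0
    for _ in range(num_batches):
        for _ in range(D_rounds):
            d_updates += 1  # stand-in for one discriminator gradient step
        for _ in range(G_rounds):
            g_updates += 1  # stand-in for one generator gradient step
    return d_updates, g_updates

# e.g. sketch_alternating_training(10, D_rounds=1, G_rounds=3) == (10, 30)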
Example 2
import json
import pickle

import numpy as np

import data_utils
import mmd
import model
from mmd import MMD_3_Sample_Test  # assumed home of the three-sample test


def model_memorisation(identifier, epoch, max_samples=2000, tstr=False):
    """
    Compare samples from a model against the training and test sets using MMD.
    """
    if tstr:
        print('Loading data from TSTR experiment (not sampling from model)')
        # load pre-generated samples
        # allow_pickle is required on NumPy >= 1.16.3 to load object arrays
        synth_data = np.load('./experiments/tstr/' + identifier + '_' +
                             str(epoch) + '.data.npy',
                             allow_pickle=True).item()
        model_samples = synth_data['samples']
        synth_labels = synth_data['labels']
        # load real data used in that experiment
        real_data = np.load('./experiments/data/' + identifier +
                            '.data.npy', allow_pickle=True).item()
        real_samples = real_data['samples']
        train = real_samples['train']
        test = real_samples['test']
        n_samples = test.shape[0]
        if model_samples.shape[0] > n_samples:
            model_samples = np.random.permutation(model_samples)[:n_samples]
        print('Data loaded successfully!')
    else:
        if identifier == 'cristobal_eICU':
            model_samples = pickle.load(open('REDACTED', 'rb'))
            samples, labels = data_utils.eICU_task()
            train = samples['train'].reshape(-1, 16, 4)
            vali = samples['vali'].reshape(-1, 16, 4)
            test = samples['test'].reshape(-1, 16, 4)
            #train_targets = labels['train']
            #vali_targets = labels['vali']
            #test_targets = labels['test']
            train, vali, test = data_utils.scale_data(train, vali, test)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        elif identifier == 'cristobal_MNIST':
            the_dir = 'REDACTED'
            # pick a random one
            which = np.random.choice(['NEW_OK_', '_r4', '_r5', '_r6', '_r7'])
            model_samples, model_labels = pickle.load(
                open(
                    the_dir +
                    'synth_mnist_minist_cdgan_1_2_100_multivar_14_nolr_rdim3_0_2_'
                    + which + '_190.pk', 'rb'))
            # get test and train...
            # (generated with fixed seed...)
            mnist_resized_dim = 14
            samples, labels = data_utils.load_resized_mnist(mnist_resized_dim)
            proportions = [0.6, 0.2, 0.2]
            train, vali, test, labels_split = data_utils.split(
                samples, labels=labels, random_seed=1, proportions=proportions)
            np.random.seed()  # re-seed from entropy after the fixed-seed split
            train = train.reshape(-1, 14, 14)
            test = test.reshape(-1, 14, 14)
            vali = vali.reshape(-1, 14, 14)
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            if model_samples.shape[0] > n_samples:
                model_samples = np.random.permutation(
                    model_samples)[:n_samples]
        else:
            settings = json.load(
                open('./experiments/settings/' + identifier + '.txt', 'r'))
            # get the test, train sets
            data = np.load('./experiments/data/' + identifier +
                           '.data.npy', allow_pickle=True).item()
            train = data['samples']['train']
            test = data['samples']['test']
            n_samples = test.shape[0]
            if n_samples > max_samples:
                n_samples = max_samples
                test = np.random.permutation(test)[:n_samples]
            model_samples = model.sample_trained_model(settings, epoch,
                                                       n_samples)
    all_samples = np.vstack([train, test, model_samples])
    heuristic_sigma = mmd.median_pairwise_distance(all_samples)
    print('heuristic sigma:', heuristic_sigma)
    pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(
        model_samples,
        test,
        np.random.permutation(train)[:n_samples],
        sigma=heuristic_sigma,
        computeMMDs=False)
    # Alternative argument order (train and test swapped):
    # pvalue, tstat, sigma, MMDXY, MMDXZ = MMD_3_Sample_Test(
    #     model_samples, np.random.permutation(train)[:n_samples], test,
    #     sigma=heuristic_sigma, computeMMDs=False)
    # The function takes (X, Y, Z) as its first arguments and tests the null
    # hypothesis that MMDXY <= MMDXZ. With the active call, MMDXY is the MMD
    # between model samples and test data, and MMDXZ the MMD between model
    # samples and training data.
    # if pvalue < 0.05:
    #     print('At confidence level 0.05, we reject the null hypothesis '
    #           'that MMDXY <= MMDXZ.')
    # else:
    #     print('We have failed to reject the null hypothesis that '
    #           'MMDXY <= MMDXZ.')
    return pvalue, tstat, sigma
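
# heuristic_sigma is the median pairwise distance between samples, a standard
# kernel-bandwidth heuristic. A minimal NumPy sketch of what
# mmd.median_pairwise_distance is assumed to compute follows;
# median_pairwise_distance_sketch is illustrative, not a repo function.

def median_pairwise_distance_sketch(X):
    """Median Euclidean distance between all pairs of (flattened) samples."""
    flat = X.reshape(X.shape[0], -1)  # flatten (seq_len, features) per sample
    # squared distances via ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * x.y
    sq_norms = (flat ** 2).sum(axis=1)
    sq_dists = sq_norms[:, None] + sq_norms[None, :] - 2 * flat @ flat.T
    return np.median(np.sqrt(np.maximum(sq_dists, 0)))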
Example 3
import numpy as np
import matplotlib.pyplot as plt

import paths
from data_utils import scale_data  # assumed import, matching the bare call below

# view_marginals_raw is assumed to be defined elsewhere in this module


def view_marginals_cristobal(rep=0, epoch=300, zoom=False):
    """
    View marginals of the synthetic data (compare to real data), from the data Cristobal generated.
    """
    samples_path = paths.eICU_synthetic_dir + 'samples_eICU_cdgan_synthetic_dataset_r' + str(
        rep) + '_' + str(epoch) + '.pk'
    # allow_pickle lets np.load fall back to pickle for these .pk files
    samples = np.load(samples_path, allow_pickle=True)
    labels_path = paths.eICU_synthetic_dir + 'labels_eICU_cdgan_synthetic_dataset_r' + str(
        rep) + '_' + str(epoch) + '.pk'
    labels = np.load(labels_path, allow_pickle=True)
    real_path = paths.eICU_task_data
    real_dict = np.load(real_path, allow_pickle=True).item()
    raw_real_train = real_dict['X_train'].reshape(-1, 16, 4)
    real_test = real_dict['X_test'].reshape(-1, 16, 4)
    real_vali = real_dict['X_vali'].reshape(-1, 16, 4)
    # discard vali, test: the scaled outputs are unused and the plots below
    # work in raw units (the synthetic samples are mapped back to raw units)
    real, scaled_vali, scaled_test = scale_data(raw_real_train, real_vali,
                                                real_test)
    real = raw_real_train
    view_marginals_raw(raw_real_train, label='raw_real_train')
    view_marginals_raw(real, label='real_train')
    view_marginals_raw(samples, label='synthetic')

    variables = ['sao2', 'heartrate', 'respiration', 'systemicmean']

    # get the scaling factors
    scaling_factors = {
        'a': np.zeros(shape=(16, 4)),
        'b': np.zeros(shape=(16, 4))
    }
    ranges = []
    for var in range(4):
        var_min = np.inf
        var_max = -np.inf
        for timestep in range(16):
            min_val = np.min([
                np.min(raw_real_train[:, timestep, var]),
                np.min(real_vali[:, timestep, var])
            ])
            max_val = np.max([
                np.max(raw_real_train[:, timestep, var]),
                np.max(real_vali[:, timestep, var])
            ])
            if min_val < var_min:
                var_min = min_val
            if max_val > var_max:
                var_max = max_val
            a = (max_val - min_val) / 2
            b = (max_val + min_val) / 2
            scaling_factors['a'][timestep, var] = a
            scaling_factors['b'][timestep, var] = b
        ranges.append([var_min, var_max])

    # now, scale the synthetic data manually (invert the [-1, 1] scaling)
    samples_scaled = np.zeros_like(samples)
    for var in range(4):
        for timestep in range(16):
            a = scaling_factors['a'][timestep, var]
            b = scaling_factors['b'][timestep, var]
            samples_scaled[:, timestep, var] = samples[:, timestep, var] * a + b
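
    # Note on the assumed scaling convention: with a = (max - min) / 2 and
    # b = (max + min) / 2 per (timestep, variable) cell, the forward transform
    # scaled = (x - b) / a maps [min, max] onto [-1, 1], so the loop above
    # inverts it via x = scaled * a + b (e.g. scaled = -1 recovers min).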

    if zoom:
        # use modes, skip for now
        modes = False
        if modes:
            # get rough region of interest, then zoom in on it afterwards!
            num_gradations = 5
            gradations = np.linspace(-1, 1, num_gradations)
            # for cutoff in the gradations, what fraction of samples (at a given time point) fall into that cutoff bracket?
            lower = 0
            real_grid = np.zeros(shape=(16, num_gradations, 4))
            for (i, cutoff) in enumerate(gradations):
                # take the mean over samples
                real_frac = ((real > lower) & (real <= cutoff)).mean(axis=0)
                lower = cutoff
                real_grid[:, i, :] = real_frac
            time_averaged_grid = np.mean(real_grid, axis=0)
            # get the most populated part of the grid for each variable
            grid_modes = np.argmax(time_averaged_grid, axis=0)
            lower = 0
            ranges = []
            for i in grid_modes:
                lower = gradations[i - 1]
                upper = gradations[i]
                ranges.append([lower, upper])
        else:
            # hand-crafted ranges: sao2, heartrate, respiration, systemicmean
            ranges = [[88, 100], [30, 130], [7, 60], [35, 135]]

    num_gradations = 25
    # for cutoff in the gradations, what fraction of samples (at a given time point) fall into that cutoff bracket?
    grid = np.zeros(shape=(16, num_gradations, 4))
    real_grid = np.zeros(shape=(16, num_gradations, 4))
    assert samples.shape[-1] == 4
    for var in range(4):
        # allow for a different range per variable (if zoom)
        low = ranges[var][0]
        high = ranges[var][1]
        gradations = np.linspace(low, high, num_gradations)
        for (i, cutoff) in enumerate(gradations):
            # take the mean over samples
            frac = ((samples_scaled[:, :, var] > low) &
                    (samples_scaled[:, :, var] <= cutoff)).mean(axis=0)
            real_frac = ((real[:, :, var] > low) &
                         (real[:, :, var] <= cutoff)).mean(axis=0)
            low = cutoff
            grid[:, i, var] = frac
            real_grid[:, i, var] = real_frac

    # now plot this as an image: synthetic marginals left, real marginals right
    fig, axarr = plt.subplots(nrows=4, ncols=2, sharey='row', sharex=True)
    for var in range(4):
        axarr[var, 0].imshow(grid[:, :, var].T,
                             origin='lower',
                             aspect=0.5,
                             cmap='magma_r')
        axarr[var, 1].imshow(real_grid[:, :, var].T,
                             origin='lower',
                             aspect=0.5,
                             cmap='magma_r')

    axarr[0, 0].set_title("synthetic")
    axarr[0, 1].set_title("real")
    for var in range(4):
        low, high = ranges[var]
        tick_positions = np.arange(num_gradations)[1::4]
        tick_labels = np.round(np.linspace(low, high, num_gradations)[1::4], 0)
        axarr[var, 0].set_yticks(tick_positions)
        axarr[var, 0].set_yticklabels(tick_labels)
        axarr[var, 0].set_ylabel(variables[var])
        for ax in axarr[var, :]:
            ax.yaxis.set_ticks_position('none')
            ax.xaxis.set_ticks_position('none')
            ax.set_adjustable('box')  # 'box-forced' was removed in Matplotlib 2.2
            for spine in ax.spines.values():
                spine.set_visible(False)
            ax.grid(True, color='black', alpha=0.2, linestyle='--')

    axarr[-1, 0].set_xticks(np.arange(16)[::2])
    axarr[-1, 1].set_xticks(np.arange(16)[::2])

    if zoom:
        plt.suptitle('(zoomed)')

    plt.tight_layout(pad=0.0, w_pad=-5.0, h_pad=0.1)
    plt.savefig("./experiments/eval/eICU_cristobal_marginals_r" + str(rep) +
                "_epoch" + str(epoch) + ".png")

    # now make the histograms: real (black) behind synthetic, one axis per variable
    fig, axarr = plt.subplots(nrows=1, ncols=4)
    axarr[0].set_ylabel("density")
    for var in range(4):
        # as in the original calls, only the first variable's bin count is
        # capped by its (integer) range width
        bins = min(50, int(ranges[var][1] - ranges[var][0])) if var == 0 else 50
        axarr[var].hist(real[:, :, var].flatten(),
                        density=True,  # 'normed' was removed in Matplotlib 3.1
                        color='black',
                        alpha=0.8,
                        range=ranges[var],
                        bins=bins,
                        label='real' if var == 0 else None)
        axarr[var].hist(samples_scaled[:, :, var].flatten(),
                        density=True,
                        alpha=0.6,
                        range=ranges[var],
                        bins=bins,
                        label='synthetic' if var == 0 else None)
    axarr[0].legend()
    for (var, ax) in enumerate(axarr):
        ax.set_xlabel(variables[var])
        ax.yaxis.set_ticks_position('none')
        ax.xaxis.set_ticks_position('none')
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.grid(True, color='black', alpha=0.2, linestyle='--')

    plt.gcf().subplots_adjust(bottom=0.2)
    fig.set_size_inches(10, 3)
    plt.savefig("./experiments/eval/eICU_cristobal_hist_r" + str(rep) +
                "_epoch" + str(epoch) + ".png")

    return True
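
# The nested cutoff loops above are essentially per-timestep histograms. A
# compact equivalent sketch using np.histogram (marginal_grid is illustrative,
# not a repo function; bin-edge inclusion differs slightly from the loops):

def marginal_grid(samples, low, high, num_gradations=25, var=0):
    """Fraction of samples per value bracket, per timestep, for one variable."""
    edges = np.linspace(low, high, num_gradations)
    grid = np.zeros((samples.shape[1], num_gradations))
    for t in range(samples.shape[1]):
        counts, _ = np.histogram(samples[:, t, var], bins=edges)
        # index 0 stays empty, mirroring the loops above where the first
        # bracket (low, low] contains nothing
        grid[t, 1:] = counts / samples.shape[0]
    return grid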