Code example #1
File: wae_gan.py Project: dandanpeng/master_thesis
def pvae(nsamples, model_weights, test_csv, out_csv):
    vae.load_weights(model_weights)

    df = pd.read_csv(test_csv)
    df_x = conversion.unpadded_tcrbs_to_onehot(df, params['max_cdr3_len'])
    cdr3 = np.array(df_x.iloc[:, 0].tolist())
    v_gene = np.array(df_x.iloc[:, 1].tolist())
    j_gene = np.array(df_x.iloc[:, 2].tolist())

    log_p_x = np.zeros((nsamples, len(df_x)))

    # Importance sampling: for each draw j, sample z ~ q(z|x) and reweight
    # the reconstruction probability by p(z) / q(z|x).
    for j in range(len(log_p_x)):
        assert (len(df_x) == len(log_p_x[j]))

        z_mean, z_log_var = encoder.predict([cdr3, v_gene, j_gene])
        z_sd = np.sqrt(np.exp(z_log_var))
        z_sample = stats.norm.rvs(z_mean, z_sd)
        aa_probs, v_gene_probs, j_gene_probs = decoder.predict(z_sample)

        aa_obs, v_gene_obs, j_gene_obs = common.cols_of_df(df_x)

        for i in range(len(df_x)):
            log_p_x_given_z = \
                logprob_of_obs_vect(aa_probs[i], aa_obs[i]) + \
                np.log(np.sum(v_gene_probs[i] * v_gene_obs[i])) + \
                np.log(np.sum(j_gene_probs[i] * j_gene_obs[i]))

            log_p_z = np.sum(stats.norm.logpdf(z_sample[i], 0, 1))
            log_q_z_given_x = np.sum(
                stats.norm.logpdf(z_sample[i], z_mean[i], z_sd[i]))
            log_imp_weight = log_p_z - log_q_z_given_x
            log_p_x[j][i] = log_p_x_given_z + log_imp_weight
        print(j / len(log_p_x))  # Progress fraction.
    # Log of the mean of the importance samples, computed in log space.
    avg = special.logsumexp(log_p_x, axis=0) - np.log(nsamples)
    pd.DataFrame({'log_p_x': avg}).to_csv(out_csv, index=False)
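The helper logprob_of_obs_vect used above is not shown in this snippet. A minimal sketch of one plausible form, assuming aa_probs[i] and aa_obs[i] are both (positions x alphabet) arrays with the observation one-hot encoded; this is an illustration, not the project's implementation:

import numpy as np

def logprob_of_obs_vect(probs, obs):
    # Probability assigned to the observed symbol at each position, summed
    # in log space to give log p(sequence | z).
    return np.sum(np.log(np.sum(probs * obs, axis=1)))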
Code example #2
def fit(train_file: str, validation_split: float, best_weights_fname: str, tensorboard_log_dir: str):
    """
    Fit the vae with warmup and early stopping.
    """
    train_csv = pd.read_csv(train_file)
    train_data = conversion.unpadded_tcrbs_to_onehot(train_csv, params['max_cdr3_len'], 'middle')
    cdr3 = np.array(train_data.iloc[:, 0].tolist())
    v_gene = np.array(train_data.iloc[:, 1].tolist())
    j_gene = np.array(train_data.iloc[:, 2].tolist())    

    best_val_loss = np.inf

    # We pretrain a given number of times and take the best run for the full train.
    for pretrain_idx in range(params['pretrains']):
        reinitialize_weights()
        # In our first fitting phase we don't apply EarlyStopping so that
        # we get the specified number of warmup epochs.
        # Apart from TensorBoard we register no other callbacks here (the
        # class-based tcr_vae version also appends its BetaSchedule warmup
        # callback at this point); if other callbacks appear we'll need to
        # change this.
        if tensorboard_log_dir:
            callbacks = [keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir + '_warmup_' + str(pretrain_idx))]
        else:
            callbacks = []
        history = svae.fit(
            x=[cdr3],  # y=X for a VAE.
            y=[cdr3, v_gene, j_gene],
            epochs=1 + params['warmup_period'],
            batch_size=params['batch_size'],
            validation_split=validation_split,
            callbacks=callbacks,
            verbose=2)
        new_val_loss = history.history['val_loss'][-1]
        if new_val_loss < best_val_loss:
            best_val_loss = new_val_loss
            svae.save_weights(best_weights_fname, overwrite=True)
    
    svae.load_weights(best_weights_fname)

    # Save weights only, matching the save_weights/load_weights calls above.
    checkpoint = ModelCheckpoint(
        best_weights_fname, save_best_only=True, save_weights_only=True, mode='min')
    early_stopping = EarlyStopping(
        monitor=params['stopping_monitor'], patience=params['patience'], mode='min')
    callbacks = [checkpoint, early_stopping]
    if tensorboard_log_dir:
        callbacks += [keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir)]
    svae_history = svae.fit(
        x=cdr3,  # y=X for a VAE.
        y=[cdr3, v_gene, j_gene],
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        validation_split=validation_split,
        callbacks=callbacks,
        verbose=2)
    return svae_history.history
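A hedged usage sketch of fit; the file names, split, and log directory below are placeholders, and params and the svae model must already exist at module level:

history = fit(
    train_file='train.csv',                # placeholder path
    validation_split=0.1,
    best_weights_fname='best_weights.h5',  # placeholder path
    tensorboard_log_dir='logs/run1')       # placeholder directory
print(min(history['val_loss']))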
Code example #3
def test_contiguous_match_counts_df():
    test = conversion.unpadded_tcrbs_to_onehot(
        common.read_data_csv('adaptive-filter-test.correct.csv'), 30)
    v_germline_tensor, j_germline_tensor = conversion.adaptive_aa_encoding_tensors(
        30)
    result = conversion.contiguous_match_counts_df(test, v_germline_tensor,
                                                   j_germline_tensor)

    assert np.array_equal(result[0], np.array([4., 6.]))
    assert np.array_equal(result[1], np.array([5., 5.]))
    assert np.array_equal(result[5], np.array([5., 0.]))
Code example #4
File: tcr_vae.py Project: tdw1221/vampire
def get_data(self, fname, data_chunk_size=0):
    """
    Get data in the correct format from fname. If data_chunk_size is
    nonzero, trim so the data length is a multiple of data_chunk_size.
    """
    df = pd.read_csv(fname, usecols=['amino_acid', 'v_gene', 'j_gene'])
    if data_chunk_size == 0:
        sub_df = df
    else:
        assert len(df) >= data_chunk_size
        n_to_take = len(df) - len(df) % data_chunk_size
        sub_df = df[:n_to_take]
    return conversion.unpadded_tcrbs_to_onehot(sub_df, self.params['max_cdr3_len'])
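The trimming rule in the docstring can be checked in isolation; the row count and chunk size below are made-up numbers for illustration:

# With 1050 rows and data_chunk_size=100, the last 50 rows are dropped.
n_rows, data_chunk_size = 1050, 100
n_to_take = n_rows - n_rows % data_chunk_size
assert n_to_take == 1000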
Code example #5
def pvae(evaluation_file, out_csv, nsamples=500):
    evaluation_csv = pd.read_csv(evaluation_file)
    evaluation_data = conversion.unpadded_tcrbs_to_onehot(evaluation_csv, params['max_cdr3_len'], 'middle')
    #cdr3 = np.array(evaluation_data.iloc[:, 0].tolist())
    
    log_p_x = np.zeros((nsamples, len(evaluation_data)))
    
    with click.progressbar(range(nsamples)) as bar:
        for i in bar:
            log_pvae_importance_sample(evaluation_data, log_p_x[i])
    
    avg = special.logsumexp(log_p_x, axis=0) - np.log(nsamples)
    pd.DataFrame({'log_p_x': avg}).to_csv(out_csv, index=False)
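log_pvae_importance_sample is called here and in example #6 but not shown. Below is a minimal sketch of what such a function could do, modeled on the inline computation in example #1; encoder, decoder, and logprob_of_obs_vect are assumed to exist at module level, and this is an illustration rather than the project's implementation:

import numpy as np
from scipy import stats

def log_pvae_importance_sample(df_x, out_row):
    # Fill out_row with one importance-weighted log p(x) sample per sequence.
    cdr3 = np.array(df_x.iloc[:, 0].tolist())
    v_gene = np.array(df_x.iloc[:, 1].tolist())
    j_gene = np.array(df_x.iloc[:, 2].tolist())
    z_mean, z_log_var = encoder.predict([cdr3, v_gene, j_gene])
    z_sd = np.sqrt(np.exp(z_log_var))
    z_sample = stats.norm.rvs(z_mean, z_sd)
    aa_probs, v_gene_probs, j_gene_probs = decoder.predict(z_sample)
    for i in range(len(df_x)):
        log_p_x_given_z = (
            logprob_of_obs_vect(aa_probs[i], cdr3[i])
            + np.log(np.sum(v_gene_probs[i] * v_gene[i]))
            + np.log(np.sum(j_gene_probs[i] * j_gene[i])))
        log_p_z = np.sum(stats.norm.logpdf(z_sample[i], 0, 1))
        log_q_z_given_x = np.sum(stats.norm.logpdf(z_sample[i], z_mean[i], z_sd[i]))
        out_row[i] = log_p_x_given_z + log_p_z - log_q_z_given_x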
Code example #6
File: tcr_vae.py Project: tdw1221/vampire
def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, model_weights, in_tcregex, out_csv):
    """
    Calculate Pvae for a TCR specified by a tcregex.

    A tcregex is specified as a string triple "v_gene,j_gene,cdr3_tcregex" where
    cdr3_tcregex uses regex symbols appropriate for amino acids.

    We keep on sampling sequences from the tcregex until the P_VAE converges.

    Note that the default number of importance samples is less than that for
    the usual pvae, because we're averaging out stochasticity anyhow.
    """
    v = TCRVAE.of_json_file(params_json)
    v.vae.load_weights(model_weights)

    # Accumulates the sequences and their P_VAEs across iters.
    generated_dfs = []
    # Accumulates the P_VAE means across iters.
    means = []

    for batch_i in range(max_iters):
        df_generated = tcregex.sample_tcregex(in_tcregex, batch_size)
        df_x = conversion.unpadded_tcrbs_to_onehot(df_generated, v.params['max_cdr3_len'])

        log_p_x = np.zeros((nsamples, len(df_x)))

        for i in range(nsamples):
            v.log_pvae_importance_sample(df_x, log_p_x[i])

        # Calculate log of mean of numbers given in log space.
        # This calculates the per-sequence log_p_x estimate.
        df_generated['log_p_x'] = special.logsumexp(log_p_x, axis=0) - np.log(nsamples)
        generated_dfs.append(df_generated)
        catted = pd.concat(generated_dfs)
        means.append(special.logsumexp(catted['log_p_x'], axis=0) - np.log(len(catted)))
        if len(means) > track_last:
            recent_sd = np.std(np.array(means[-track_last:]))
            click.echo("[Iter {}]\tmean: {:.6}\trecent SD: {:.5}\ttol: {}".format(batch_i, means[-1], recent_sd, tol))
            if recent_sd < tol:
                break
        else:
            click.echo("[Iter {}]\tmean: {:.6}".format(batch_i, means[-1]))

    click.echo("tcregex P_VAE estimate: {}".format(means[-1]))
    catted.to_csv(out_csv, index=False)
Code example #7
File: wae_gan.py Project: dandanpeng/master_thesis
def train(train_file, params):
    train_csv = pd.read_csv(train_file)
    train_data = conversion.unpadded_tcrbs_to_onehot(train_csv,
                                                     params['max_cdr3_len'],
                                                     'middle')
    cdr3 = np.array(train_data.iloc[:1000, 0].tolist())
    v_gene = np.array(train_data.iloc[:1000, 1].tolist())
    j_gene = np.array(train_data.iloc[:1000, 2].tolist())

    vae, discriminator, generator, encoder, decoder = create_model(params)

    past = datetime.now()
    for epoch in np.arange(1, params['epochs'] + 1):
        vae_loss = []
        discriminator_loss = []
        generator_loss = []
        pred_identity = []
        # Iterate over full batches of the arrays actually used for training
        # (cdr3/v_gene/j_gene are truncated to 1000 rows above).
        for batch in range(len(cdr3) // params['batch_size']):
            start = batch * params['batch_size']
            end = start + params['batch_size']
            cdr3_samples = cdr3[start:end]
            v_gene_samples = v_gene[start:end]
            j_gene_samples = j_gene[start:end]
            samples = [cdr3_samples, v_gene_samples, j_gene_samples]
            vae_history = vae.fit(samples,
                                  samples,
                                  epochs=1,
                                  batch_size=params['batch_size'],
                                  validation_split=0.0,
                                  verbose=0)
            vae_loss.append(vae_history.history['loss'])
            pred_identity.append(vae_history.history['cdr3_output_identity'])

            # Train Discriminator
            fake_latent = K.eval(sampling(encoder.predict(samples)))
            real_latent = np.random.normal(size=(params['batch_size'],
                                                 params['latent_dim']))

            d_real_history = discriminator.fit(real_latent,
                                               np.ones(
                                                   (params['batch_size'], 1)),
                                               epochs=1,
                                               batch_size=params['batch_size'],
                                               validation_split=0.0,
                                               verbose=0)
            d_fake_history = discriminator.fit(fake_latent,
                                               np.zeros(
                                                   (params['batch_size'], 1)),
                                               epochs=1,
                                               batch_size=params['batch_size'],
                                               validation_split=0.0,
                                               verbose=0)
            discriminator_loss.append(0.5 *
                                      np.add(d_real_history.history['loss'],
                                             d_fake_history.history['loss']))

            # Train Generator
            generator_history = generator.fit(samples,
                                              np.ones(
                                                  (params['batch_size'], 1)),
                                              epochs=1,
                                              batch_size=params['batch_size'],
                                              validation_split=0.0,
                                              verbose=0)

            generator_loss.append(generator_history.history['loss'])

        now = datetime.now()
        print("\nEpoch {}/{} - {:.1f}s".format(epoch, params['epochs'],
                                               (now - past).total_seconds()))
        print("VAE Loss: {}".format(np.mean(vae_loss)))
        print("Discriminator Loss: {}".format(np.mean(discriminator_loss)))
        print("Generator Loss: {}".format(np.mean(generator_loss)))
        print("Identity: {}".format(np.mean(pred_identity)))
Code example #8
def test_cdr3_length_of_onehots():
    data = common.read_data_csv('adaptive-filter-test.correct.csv')
    lengths = data['amino_acid'].apply(len).apply(float)
    onehots = conversion.unpadded_tcrbs_to_onehot(data, 30)
    assert lengths.equals(
        conversion.cdr3_length_of_onehots(onehots['amino_acid']))