def pvae(nsamples, model_weights, test_csv, out_csv):
    vae.load_model(model_weights)
    df = pd.read_csv(test_csv)
    df_x = conversion.unpadded_tcrbs_to_onehot(df, params['max_cdr3_len'])
    cdr3 = np.array(df_x.iloc[:, 0].tolist())
    v_gene = np.array(df_x.iloc[:, 1].tolist())
    j_gene = np.array(df_x.iloc[:, 2].tolist())
    # One row of importance-sampled log probabilities per sample, one column
    # per sequence.
    log_p_x = np.zeros((nsamples, len(df_x)))
    for j in range(len(log_p_x)):
        assert len(df_x) == len(log_p_x[j])
        # Draw z from the encoder's approximate posterior q(z | x).
        z_mean, z_log_var = encoder.predict([cdr3, v_gene, j_gene])
        z_sd = np.sqrt(np.exp(z_log_var))
        z_sample = stats.norm.rvs(z_mean, z_sd)
        aa_probs, v_gene_probs, j_gene_probs = decoder.predict(z_sample)
        aa_obs, v_gene_obs, j_gene_obs = common.cols_of_df(df_x)
        for i in range(len(df_x)):
            log_p_x_given_z = \
                logprob_of_obs_vect(aa_probs[i], aa_obs[i]) + \
                np.log(np.sum(v_gene_probs[i] * v_gene_obs[i])) + \
                np.log(np.sum(j_gene_probs[i] * j_gene_obs[i]))
            log_p_z = np.sum(stats.norm.logpdf(z_sample[i], 0, 1))
            log_q_z_given_x = np.sum(
                stats.norm.logpdf(z_sample[i], z_mean[i], z_sd[i]))
            # Importance weight: prior over proposal, in log space.
            log_imp_weight = log_p_z - log_q_z_given_x
            log_p_x[j][i] = log_p_x_given_z + log_imp_weight
        # Print the fraction of importance samples completed.
        print(j / len(log_p_x))
    # Average the importance samples in log space.
    avg = special.logsumexp(log_p_x, axis=0) - np.log(nsamples)
    pd.DataFrame({'log_p_x': avg}).to_csv(out_csv, index=False)
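
# A minimal, self-contained sketch of the importance-sampling identity that
# pvae() above relies on: log p(x) is estimated as
# logsumexp_j(log p(x | z_j) + log p(z_j) - log q(z_j | x)) - log(nsamples),
# with z_j drawn from a proposal q(z | x) (the encoder's posterior, in the
# real code). Everything below is illustrative: a toy 1-D Gaussian model with
# a known marginal, not the TCR VAE itself.
def _toy_importance_sampling_demo(nsamples=10000):
    import numpy as np
    from scipy import special, stats
    # Toy model: z ~ N(0, 1) and x | z ~ N(z, 1), so marginally x ~ N(0, 2).
    x = 1.3
    # An arbitrary Gaussian proposal standing in for q(z | x).
    q_mean, q_sd = 0.5, 1.0
    z = stats.norm.rvs(q_mean, q_sd, size=nsamples)
    log_p_x_given_z = stats.norm.logpdf(x, z, 1.0)
    log_imp_weight = stats.norm.logpdf(z, 0, 1) - stats.norm.logpdf(z, q_mean, q_sd)
    estimate = special.logsumexp(log_p_x_given_z + log_imp_weight) - np.log(nsamples)
    exact = stats.norm.logpdf(x, 0, np.sqrt(2.0))
    print("estimate: {:.4f}\texact: {:.4f}".format(estimate, exact))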
def fit(train_file: str, validation_split: float, best_weights_fname: str,
        tensorboard_log_dir: str):
    """
    Fit the vae with warmup and early stopping.
    """
    train_csv = pd.read_csv(train_file)
    train_data = conversion.unpadded_tcrbs_to_onehot(
        train_csv, params['max_cdr3_len'], 'middle')
    cdr3 = np.array(train_data.iloc[:, 0].tolist())
    v_gene = np.array(train_data.iloc[:, 1].tolist())
    j_gene = np.array(train_data.iloc[:, 2].tolist())
    best_val_loss = np.inf
    # We pretrain a given number of times and take the best run for the full
    # train.
    for pretrain_idx in range(params['pretrains']):
        reinitialize_weights()
        # In our first fitting phase we don't apply EarlyStopping, so that we
        # get the specified number of warmup epochs. Below we rely on the fact
        # that right now the only thing in the module-level `callbacks` list
        # is the BetaSchedule callback; if other callbacks appear we'll need
        # to change this.
        if tensorboard_log_dir:
            warmup_callbacks = [
                keras.callbacks.TensorBoard(
                    log_dir=tensorboard_log_dir + '_warmup_' + str(pretrain_idx))
            ]
        else:
            warmup_callbacks = []
        warmup_callbacks += callbacks
        history = svae.fit(
            x=[cdr3],
            # y=X for a VAE.
            y=[cdr3, v_gene, j_gene],
            epochs=1 + params['warmup_period'],
            batch_size=params['batch_size'],
            validation_split=validation_split,
            callbacks=warmup_callbacks,
            verbose=2)
        new_val_loss = history.history['val_loss'][-1]
        if new_val_loss < best_val_loss:
            best_val_loss = new_val_loss
            svae.save_weights(best_weights_fname, overwrite=True)
    svae.load_weights(best_weights_fname)
    checkpoint = ModelCheckpoint(best_weights_fname, save_best_only=True, mode='min')
    early_stopping = EarlyStopping(
        monitor=params['stopping_monitor'], patience=params['patience'], mode='min')
    fit_callbacks = [checkpoint, early_stopping]
    if tensorboard_log_dir:
        fit_callbacks += [keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir)]
    svae_history = svae.fit(
        x=[cdr3],
        # y=X for a VAE.
        y=[cdr3, v_gene, j_gene],
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        validation_split=validation_split,
        callbacks=fit_callbacks,
        verbose=2)
    return svae_history.history
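
# The warmup phase above relies on a BetaSchedule callback (held in the
# module-level `callbacks` list, per the comment in fit()) to anneal the
# weight on the KL term of the VAE loss. A minimal sketch of such a KL-warmup
# callback, assuming the loss reads its KL weight from a Keras backend
# variable named `beta`; the linear ramp and all names here are illustrative
# assumptions, not the repo's actual BetaSchedule definition.
import keras
import keras.backend as K

class LinearBetaSchedule(keras.callbacks.Callback):
    def __init__(self, beta, warmup_period):
        super().__init__()
        self.beta = beta  # A K.variable used as the KL weight in the loss.
        self.warmup_period = warmup_period

    def on_epoch_begin(self, epoch, logs=None):
        # Ramp beta linearly from 0 to 1 over the warmup period, then hold.
        K.set_value(self.beta, min(1.0, epoch / self.warmup_period))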
def test_contiguous_match_counts_df():
    test = conversion.unpadded_tcrbs_to_onehot(
        common.read_data_csv('adaptive-filter-test.correct.csv'), 30)
    v_germline_tensor, j_germline_tensor = \
        conversion.adaptive_aa_encoding_tensors(30)
    result = conversion.contiguous_match_counts_df(
        test, v_germline_tensor, j_germline_tensor)
    assert np.array_equal(result[0], np.array([4., 6.]))
    assert np.array_equal(result[1], np.array([5., 5.]))
    assert np.array_equal(result[5], np.array([5., 0.]))
def get_data(self, fname, data_chunk_size=0):
    """
    Get data in the correct format from fname. If data_chunk_size is nonzero,
    trim so the data length is a multiple of data_chunk_size.
    """
    df = pd.read_csv(fname, usecols=['amino_acid', 'v_gene', 'j_gene'])
    if data_chunk_size == 0:
        sub_df = df
    else:
        assert len(df) >= data_chunk_size
        # Keep the largest prefix whose length is a multiple of
        # data_chunk_size.
        n_to_take = len(df) - len(df) % data_chunk_size
        sub_df = df[:n_to_take]
    return conversion.unpadded_tcrbs_to_onehot(sub_df,
                                               self.params['max_cdr3_len'])
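
# A quick check of the trimming arithmetic in get_data() above:
# len(df) - len(df) % data_chunk_size is the largest multiple of
# data_chunk_size that fits in the data. Illustrative numbers only.
def _chunk_trim_demo():
    n_rows, data_chunk_size = 1037, 100
    n_to_take = n_rows - n_rows % data_chunk_size
    assert n_to_take == 1000
    assert n_to_take % data_chunk_size == 0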
def pvae(evaluation_file, out_csv, nsamples=500):
    evaluation_csv = pd.read_csv(evaluation_file)
    evaluation_data = conversion.unpadded_tcrbs_to_onehot(
        evaluation_csv, params['max_cdr3_len'], 'middle')
    log_p_x = np.zeros((nsamples, len(evaluation_data)))
    with click.progressbar(range(nsamples)) as bar:
        for i in bar:
            log_pvae_importance_sample(evaluation_data, log_p_x[i])
    avg = special.logsumexp(log_p_x, axis=0) - np.log(nsamples)
    pd.DataFrame({'log_p_x': avg}).to_csv(out_csv, index=False)
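
# Both pvae() implementations average per-sequence probabilities in log
# space: log(mean_i p_i) = logsumexp(log p_i) - log(n), which avoids
# underflow for the tiny probabilities involved. A tiny self-contained check
# of that identity on made-up numbers:
def _logsumexp_mean_demo():
    import numpy as np
    from scipy import special
    log_p = np.log([1e-10, 2e-10, 3e-10])
    by_identity = special.logsumexp(log_p) - np.log(len(log_p))
    directly = np.log(np.mean(np.exp(log_p)))
    assert np.isclose(by_identity, directly)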
def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json,
                 model_weights, in_tcregex, out_csv):
    """
    Calculate P_VAE for a TCR specified by a tcregex.

    A tcregex is specified as a string triple "v_gene,j_gene,cdr3_tcregex",
    where cdr3_tcregex uses regex symbols appropriate for amino acids. We keep
    sampling sequences from the tcregex until the P_VAE estimate converges.

    Note that the default number of importance samples is less than that for
    the usual pvae, because we're averaging out stochasticity anyhow.
    """
    v = TCRVAE.of_json_file(params_json)
    v.vae.load_weights(model_weights)
    # Accumulates the sequences and their P_VAEs across iterations.
    generated_dfs = []
    # Accumulates the P_VAE means across iterations.
    means = []
    for batch_i in range(max_iters):
        df_generated = tcregex.sample_tcregex(in_tcregex, batch_size)
        df_x = conversion.unpadded_tcrbs_to_onehot(df_generated,
                                                   v.params['max_cdr3_len'])
        log_p_x = np.zeros((nsamples, len(df_x)))
        for i in range(nsamples):
            v.log_pvae_importance_sample(df_x, log_p_x[i])
        # Calculate the log of the mean of numbers given in log space: this is
        # the per-sequence log_p_x estimate.
        df_generated['log_p_x'] = \
            special.logsumexp(log_p_x, axis=0) - np.log(nsamples)
        generated_dfs.append(df_generated)
        catted = pd.concat(generated_dfs)
        means.append(
            special.logsumexp(catted['log_p_x'], axis=0) - np.log(len(catted)))
        if len(means) > track_last:
            # Stop once the running means have settled down: the SD of the
            # last track_last means must fall below tol.
            recent_sd = np.std(np.array(means[-track_last:]))
            click.echo("[Iter {}]\tmean: {:.6}\trecent SD: {:.5}\ttol: {}".format(
                batch_i, means[-1], recent_sd, tol))
            if recent_sd < tol:
                break
        else:
            click.echo("[Iter {}]\tmean: {:.6}".format(batch_i, means[-1]))
    click.echo("tcregex P_VAE estimate: {}".format(means[-1]))
    catted.to_csv(out_csv, index=False)
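
# tcregex.sample_tcregex() above draws CDR3 sequences matching an amino-acid
# regex. As a rough illustration of the idea (not the repo's implementation),
# here is naive rejection sampling of CDR3-like strings against a
# hypothetical cdr3_tcregex; real tcregex sampling is presumably smarter than
# generate-and-filter.
def _naive_tcregex_rejection_demo(pattern=r'^CASS[LMV][A-Z]{4,8}EQYF$', n=5):
    import random
    import re
    aas = 'ACDEFGHIKLMNPQRSTVWY'
    matcher = re.compile(pattern)
    hits = []
    while len(hits) < n:
        # Random CDR3-like candidates with the usual conserved flanks.
        middle = ''.join(random.choice(aas) for _ in range(random.randint(5, 9)))
        candidate = 'CASS' + middle + 'EQYF'
        if matcher.match(candidate):
            hits.append(candidate)
    return hits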
def train(train_file, params):
    train_csv = pd.read_csv(train_file)
    train_data = conversion.unpadded_tcrbs_to_onehot(
        train_csv, params['max_cdr3_len'], 'middle')
    # Train on the first 1000 sequences only.
    cdr3 = np.array(train_data.iloc[:1000, 0].tolist())
    v_gene = np.array(train_data.iloc[:1000, 1].tolist())
    j_gene = np.array(train_data.iloc[:1000, 2].tolist())
    vae, discriminator, generator, encoder, decoder = create_model(params)
    past = datetime.now()
    for epoch in range(1, params['epochs'] + 1):
        vae_loss = []
        discriminator_loss = []
        generator_loss = []
        pred_identity = []
        # Iterate over full batches of the (subsampled) training data. Note
        # that we use len(cdr3), not len(train_data), since we only extracted
        # the first 1000 rows above.
        for batch in range(len(cdr3) // params['batch_size']):
            start = batch * params['batch_size']
            end = start + params['batch_size']
            samples = [cdr3[start:end], v_gene[start:end], j_gene[start:end]]
            # Reconstruction phase: one VAE step on this batch.
            vae_history = vae.fit(samples, samples, epochs=1,
                                  batch_size=params['batch_size'],
                                  validation_split=0.0, verbose=0)
            vae_loss.append(vae_history.history['loss'])
            pred_identity.append(vae_history.history['cdr3_output_identity'])
            # Train the discriminator: real latent draws come from the prior,
            # fake ones from the encoder.
            fake_latent = K.eval(sampling(encoder.predict(samples)))
            real_latent = np.random.normal(
                size=(params['batch_size'], params['latent_dim']))
            d_real_history = discriminator.fit(
                real_latent, np.ones((params['batch_size'], 1)), epochs=1,
                batch_size=params['batch_size'], validation_split=0.0, verbose=0)
            d_fake_history = discriminator.fit(
                fake_latent, np.zeros((params['batch_size'], 1)), epochs=1,
                batch_size=params['batch_size'], validation_split=0.0, verbose=0)
            discriminator_loss.append(0.5 * np.add(
                d_real_history.history['loss'], d_fake_history.history['loss']))
            # Train the generator (encoder) to fool the discriminator.
            generator_history = generator.fit(
                samples, np.ones((params['batch_size'], 1)), epochs=1,
                batch_size=params['batch_size'], validation_split=0.0, verbose=0)
            generator_loss.append(generator_history.history['loss'])
        now = datetime.now()
        print("\nEpoch {}/{} - {:.1f}s".format(
            epoch, params['epochs'], (now - past).total_seconds()))
        print("VAE Loss: {}".format(np.mean(vae_loss)))
        print("Discriminator Loss: {}".format(np.mean(discriminator_loss)))
        print("Generator Loss: {}".format(np.mean(generator_loss)))
        print("Identity: {}".format(np.mean(pred_identity)))
        # Reset the timer so the next epoch reports per-epoch time.
        past = now
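
# create_model() isn't shown in this file, but for generator.fit(samples, ones)
# above to push the encoder toward fooling the discriminator, the generator is
# presumably wired in the usual adversarial-autoencoder way: the encoder
# feeding a frozen discriminator. A hedged sketch of that wiring; the names,
# the compile settings, and feeding z_mean rather than a sampled z are all
# illustrative assumptions, not the repo's definition.
def _build_generator_sketch(encoder, discriminator, encoder_inputs):
    from keras.models import Model
    # Freeze the discriminator so generator training only updates the
    # encoder's weights.
    discriminator.trainable = False
    z_mean, z_log_var = encoder(encoder_inputs)
    validity = discriminator(z_mean)
    generator = Model(encoder_inputs, validity)
    generator.compile(optimizer='adam', loss='binary_crossentropy')
    return generator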
def test_cdr3_length_of_onehots():
    data = common.read_data_csv('adaptive-filter-test.correct.csv')
    lengths = data['amino_acid'].apply(len).apply(float)
    onehots = conversion.unpadded_tcrbs_to_onehot(data, 30)
    assert lengths.equals(
        conversion.cdr3_length_of_onehots(onehots['amino_acid']))