def train_plda(iv_file, train_list, val_list, preproc_file,
               epochs, ml_md, md_epochs, output_path, **kwargs):

    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    vcr_args = VCR.filter_args(**kwargs)
    vcr_train = VCR(iv_file, train_list, preproc, **vcr_args)
    x, class_ids = vcr_train.read()

    x_val = None
    class_ids_val = None
    if val_list is not None:
        vcr_val = VCR(iv_file, val_list, preproc, **vcr_args)
        x_val, class_ids_val = vcr_val.read()

    t1 = time.time()

    plda_args = F.filter_train_args(**kwargs)
    model = F.create_plda(**plda_args)
    elbos = model.fit(x, class_ids, x_val=x_val, class_ids_val=class_ids_val,
                      epochs=epochs, ml_md=ml_md, md_epochs=md_epochs)

    logging.info('Elapsed time: %.2f s.' % (time.time() - t1))

    model.save(output_path)

    elbo = np.vstack(elbos)
    num = np.arange(epochs)
    elbo = np.vstack((num, elbo)).T
    elbo_path = os.path.splitext(output_path)[0] + '.csv'
    np.savetxt(elbo_path, elbo, delimiter=',')
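# The CSV written by train_plda stacks the epoch index in column 0 and the
# ELBO curve(s) returned by model.fit() in the remaining columns.  The helper
# below is a minimal sketch of how that file could be inspected; it is an
# illustration added here, not part of the original script, and it assumes
# numpy as np and matplotlib.pyplot as plt are imported at module level.
def plot_elbo_csv(elbo_path, fig_path):
    elbo = np.loadtxt(elbo_path, delimiter=',')
    plt.plot(elbo[:, 0], elbo[:, 1:])   # epoch index vs. ELBO value(s)
    plt.xlabel('epoch')
    plt.ylabel('ELBO')
    plt.grid(True)
    plt.savefig(fig_path)
    plt.clf()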
def train_lda(iv_file, train_list, preproc_file,
              lda_dim, name, save_tlist, append_tlist, output_path, **kwargs):

    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    vcr_args = VCR.filter_args(**kwargs)
    vcr = VCR(iv_file, train_list, preproc, **vcr_args)
    x, class_ids = vcr.read()

    t1 = time.time()

    model = LDA(lda_dim=lda_dim, name=name)
    model.fit(x, class_ids)

    logging.info('Elapsed time: %.2f s.' % (time.time() - t1))

    x = model.predict(x)
    s_mat = SbSw()
    s_mat.fit(x, class_ids)
    logging.debug(s_mat.Sb[:4, :4])
    logging.debug(s_mat.Sw[:4, :4])

    if save_tlist:
        if append_tlist and preproc is not None:
            preproc.append(model)
            model = preproc
        else:
            model = TransformList(model)

    model.save(output_path)
def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs):

    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    vcr_args = VCR.filter_args(**kwargs)
    vcr_train = VCR(iv_file, train_list, preproc, **vcr_args)
    x, class_ids = vcr_train.read()

    t1 = time.time()

    model_args = GBE.filter_train_args(**kwargs)
    model = GBE(**model_args)
    model.fit(x, class_ids)

    logging.info('Elapsed time: %.2f s.' % (time.time() - t1))

    model.save(output_path)
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        fromfile_prefix_chars='@',
        description='Train LDA')

    parser.add_argument('--iv-file', dest='iv_file', required=True)
    parser.add_argument('--train-list', dest='train_list', required=True)
    parser.add_argument('--preproc-file', dest='preproc_file', default=None)

    VCR.add_argparse_args(parser)

    parser.add_argument('--output-path', dest='output_path', required=True)
    parser.add_argument('--lda-dim', dest='lda_dim', type=int, default=None)
    parser.add_argument('--no-save-tlist', dest='save_tlist',
                        default=True, action='store_false')
    parser.add_argument('--no-append-tlist', dest='append_tlist',
                        default=True, action='store_false')
    parser.add_argument('--name', dest='name', default='lda')
    # --verbose was missing from the original excerpt but is required by
    # config_logger(args.verbose) below; the levels used here are assumed.
    parser.add_argument('-v', '--verbose', dest='verbose', type=int,
                        default=1, choices=[0, 1, 2, 3])

    args = parser.parse_args()
    config_logger(args.verbose)
    del args.verbose

    # The final call was missing from the excerpt; it is reconstructed from
    # the argument set, which matches train_lda's signature.
    train_lda(**vars(args))
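# Example invocation of the LDA training entry point above.  The script name
# and file paths are hypothetical; the flags follow the parser definitions:
#
#   python train_lda.py --iv-file exp/ivectors.h5 --train-list lists/train.scp \
#       --lda-dim 150 --output-path exp/transforms/lda.h5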
def plot_vector_tsne(iv_file, v_list, preproc_file, output_path, save_embed,
                     output_dim, perplexity, exag, lr, num_iter, init_method,
                     rng_seed, verbose, pca_dim, max_classes, **kwargs):

    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    vr_args = VCR.filter_args(**kwargs)
    vcr = VCR(iv_file, v_list, preproc, **vr_args)
    x, class_ids = vcr.read()

    t1 = time.time()

    if pca_dim > 0:
        pca = PCA(pca_dim=pca_dim)
        pca.fit(x)
        x = pca.predict(x)

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    tsne_obj = lambda n: TSNE(n_components=n, perplexity=perplexity,
                              early_exaggeration=exag, learning_rate=lr,
                              n_iter=num_iter, init=init_method,
                              random_state=rng_seed, verbose=verbose)

    if max_classes > 0:
        index = class_ids < max_classes
        x = x[index]
        class_ids = class_ids[index]

    if output_dim > 3:
        tsne = tsne_obj(output_dim)
        y = tsne.fit_transform(x)
        if save_embed:
            h5_file = '%s/embed_%dd.h5' % (output_path, output_dim)
            hw = DWF.create(h5_file)
            hw.write(vcr.u2c.key, y)

    tsne = tsne_obj(2)
    y = tsne.fit_transform(x)
    if save_embed:
        h5_file = '%s/embed_2d.h5' % output_path
        hw = DWF.create(h5_file)
        hw.write(vcr.u2c.key, y)

    fig_file = '%s/tsne_2d.pdf' % output_path
    # plt.scatter(y[:,0], y[:,1], c=class_ids, marker='x')
    color_marker = [(c, m) for m in markers for c in colors]
    for c in np.unique(class_ids):
        idx = class_ids == c
        plt.scatter(y[idx, 0], y[idx, 1], c=color_marker[c][0],
                    marker=color_marker[c][1], label=vcr.class_names[c])
    plt.legend()
    plt.grid(True)
    plt.show()
    plt.savefig(fig_file)
    plt.clf()

    # if max_classes > 0:
    #     fig_file = '%s/tsne_2d_n%d.pdf' % (output_path, max_classes)
    #     index = class_ids < max_classes
    #     plt.scatter(y[index,0], y[index,1], c=class_ids[index], marker='x')
    #     plt.grid(True)
    #     plt.show()
    #     plt.savefig(fig_file)
    #     plt.clf()

    tsne = tsne_obj(3)
    y = tsne.fit_transform(x)
    if save_embed:
        h5_file = '%s/embed_3d.h5' % output_path
        hw = DWF.create(h5_file)
        hw.write(vcr.u2c.key, y)

    fig_file = '%s/tsne_3d.pdf' % output_path
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # ax.scatter(y[:,0], y[:,1], y[:,2], c=class_ids, marker='x')
    for c in np.unique(class_ids):
        idx = class_ids == c
        ax.scatter(y[idx, 0], y[idx, 1], y[idx, 2], c=color_marker[c][0],
                   marker=color_marker[c][1], label=vcr.class_names[c])
    plt.grid(True)
    plt.show()
    plt.savefig(fig_file)
    plt.clf()

    # if max_classes > 0:
    #     fig_file = '%s/tsne_3d_n%d.pdf' % (output_path, max_classes)
    #     index = class_ids < max_classes
    #     ax = fig.add_subplot(111, projection='3d')
    #     ax.scatter(y[index,0], y[index,1], y[index,2], c=class_ids[index], marker='x')
    #     plt.grid(True)
    #     plt.show()
    #     plt.savefig(fig_file)
    #     plt.clf()

    logging.info('Elapsed time: %.2f s.' % (time.time() - t1))
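# plot_vector_tsne indexes into module-level `colors` and `markers` lists that
# are not shown in this excerpt.  The fallback below is only a sketch of what
# they could look like (the actual values in the original module may differ);
# it only defines them if the module has not done so already, and defining
# them after the function is fine because they are resolved at call time.
try:
    colors, markers
except NameError:
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v']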
def train_be(iv_file, train_list, adapt_iv_file, adapt_list,
             lda_dim, plda_type, y_dim, z_dim,
             epochs, ml_md, md_epochs,
             w_mu, w_B, w_W, output_path, **kwargs):

    # Read data
    logging.info('loading data')
    vcr_args = VCR.filter_args(**kwargs)
    vcr_train = VCR(iv_file, train_list, None, **vcr_args)
    x, class_ids = vcr_train.read()

    # Train LDA
    logging.info('train LDA')
    t1 = time.time()
    lda = LDA(lda_dim=lda_dim, name='lda')
    lda.fit(x, class_ids)
    x_lda = lda.predict(x)
    logging.info('LDA elapsed time: %.2f s.' % (time.time() - t1))

    # Train centering and whitening
    logging.info('train length norm')
    t1 = time.time()
    lnorm = LNorm(name='lnorm')
    lnorm.fit(x_lda)
    x_ln = lnorm.predict(x_lda)
    logging.info('length norm elapsed time: %.2f s.' % (time.time() - t1))

    # Train PLDA
    logging.info('train PLDA')
    t1 = time.time()
    plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name='plda')
    elbo = plda.fit(x_ln, class_ids, epochs=epochs,
                    ml_md=ml_md, md_epochs=md_epochs)
    logging.info('PLDA elapsed time: %.2f s.' % (time.time() - t1))

    # Save models
    logging.info('saving models')
    preproc = TransformList(lda)
    preproc.append(lnorm)

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    preproc.save(output_path + '/lda_lnorm.h5')
    plda.save(output_path + '/plda.h5')

    num = np.arange(epochs)
    elbo = np.vstack((num, elbo)).T
    np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',')

    # Adaptation to in-domain data
    vcr = VCR(adapt_iv_file, adapt_list, None)
    x, class_ids = vcr.read()
    x_lda = lda.predict(x)

    lnorm.update_T = False
    lnorm.fit(x_lda)
    preproc = TransformList(lda)
    preproc.append(lnorm)
    preproc.save(output_path + '/lda_lnorm_adapt.h5')

    x_ln = lnorm.predict(x_lda)

    plda_adapt = plda.copy()
    elbo = plda.fit(x_ln, class_ids, epochs=epochs)
    plda_adapt.weighted_avg_model(plda, w_mu, w_B, w_W)
    plda_adapt.save(output_path + '/plda_adapt.h5')

    num = np.arange(epochs)
    elbo = np.vstack((num, elbo)).T
    np.savetxt(output_path + '/elbo_adapt.csv', elbo, delimiter=',')
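# train_be relies on plda_adapt.weighted_avg_model(plda, w_mu, w_B, w_W) to
# blend the out-of-domain PLDA (copied before adaptation) with the in-domain
# adapted one.  The general idea is a convex combination of the corresponding
# model parameters; the sketch below only illustrates that idea and is not
# the library's actual implementation or API.
def _convex_combination(param_out, param_in, w):
    # w = 0 keeps the out-of-domain parameter, w = 1 keeps the in-domain one;
    # w_mu, w_B and w_W would play this role for the mean, between-class and
    # within-class parameters respectively.
    return (1.0 - w) * param_out + w * param_in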
def train_pdda(iv_file, train_list, val_list,
               decoder_file, qy_file, qz_file,
               epochs, batch_size, preproc_file, output_path,
               num_samples_y, num_samples_z,
               px_form, qy_form, qz_form, min_kl, **kwargs):

    set_float_cpu('float32')

    vcr_args = VCR.filter_args(**kwargs)
    opt_args = KOF.filter_args(**kwargs)
    cb_args = KCF.filter_args(**kwargs)

    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    vcr_train = VCR(iv_file, train_list, preproc, **vcr_args)
    max_length = vcr_train.max_samples_per_class

    x_val = None
    sw_val = None
    if val_list is not None:
        vcr_val = VCR(iv_file, val_list, preproc, **vcr_args)
        max_length = max(max_length, vcr_val.max_samples_per_class)
        x_val, sw_val = vcr_val.read(return_3d=True, max_length=max_length)

    x, sw = vcr_train.read(return_3d=True, max_length=max_length)

    t1 = time.time()

    decoder = load_model_arch(decoder_file)
    qy = load_model_arch(qy_file)

    if qz_file is None:
        vae = TVAEY(qy, decoder, px_cond_form=px_form,
                    qy_form=qy_form, min_kl=min_kl)
        vae.build(num_samples=num_samples_y, max_seq_length=x.shape[1])
    else:
        qz = load_model_arch(qz_file)
        vae = TVAEYZ(qy, qz, decoder, px_cond_form=px_form,
                     qy_form=qy_form, qz_form=qz_form, min_kl=min_kl)
        vae.build(num_samples_y=num_samples_y, num_samples_z=num_samples_z,
                  max_seq_length=x.shape[1])
    logging.info('Build elapsed time: %.2f s.' % (time.time() - t1))

    # opt = create_optimizer(**opt_args)
    # cb = create_basic_callbacks(vae, output_path, **cb_args)
    # h = vae.fit(x, x_val=x_val,
    #             sample_weight_train=sw, sample_weight_val=sw_val,
    #             optimizer=opt, shuffle=True, epochs=100,
    #             batch_size=batch_size, callbacks=cb)

    # opt = create_optimizer(**opt_args)
    # cb = create_basic_callbacks(vae, output_path, **cb_args)
    # h = vae.fit_mdy(x, x_val=x_val,
    #                 sample_weight_train=sw, sample_weight_val=sw_val,
    #                 optimizer=opt, shuffle=True, epochs=200,
    #                 batch_size=batch_size, callbacks=cb)

    # y_mean, y_logvar, z_mean, z_logvar = vae.compute_qyz_x(
    #     x, batch_size=batch_size)
    # sw = np.expand_dims(sw, axis=-1)
    # m_y = np.mean(np.mean(y_mean, axis=0))
    # s2_y = np.sum(np.sum(np.exp(y_logvar)+y_mean**2, axis=0)/
    #               y_logvar.shape[0]-m_y**2)
    # m_z = np.mean(np.sum(np.sum(z_mean*sw, axis=1), axis=0)
    #               /np.sum(sw))
    # s2_z = np.sum(np.sum(np.sum((np.exp(z_logvar)+z_mean**2)*sw, axis=1), axis=0)
    #               /np.sum(sw)-m_z**2)
    # logging.info('m_y: %.2f, trace_y: %.2f, m_z: %.2f, trace_z: %.2f' %
    #              (m_y, s2_y, m_z, s2_z))

    cb = KCF.create_callbacks(vae, output_path, **cb_args)
    opt = KOF.create_optimizer(**opt_args)

    h = vae.fit(x, x_val=x_val,
                sample_weight_train=sw, sample_weight_val=sw_val,
                optimizer=opt, shuffle=True, epochs=epochs,
                batch_size=batch_size, callbacks=cb)

    if vae.x_chol is not None:
        x_chol = np.array(K.eval(vae.x_chol))
        logging.info(x_chol[:4, :4])

    logging.info('Train elapsed time: %.2f' % (time.time() - t1))

    vae.save(output_path + '/model')

    t1 = time.time()
    elbo = np.mean(vae.elbo(x, num_samples=1, batch_size=batch_size))
    logging.info('elbo: %.2f' % elbo)
    logging.info('Elbo elapsed time: %.2f' % (time.time() - t1))

    t1 = time.time()
    vae.build(num_samples_y=1, num_samples_z=1, max_seq_length=x.shape[1])
    vae.compile()

    qyz = vae.compute_qyz_x(x, batch_size=batch_size)
    if vae.qy_form == 'diag_normal':
        y_mean, y_logvar = qyz[:2]
        qz = qyz[2:]
    else:
        y_mean, y_logvar, y_chol = qyz[:3]
        qz = qyz[3:]
    if vae.qz_form == 'diag_normal':
        z_mean, z_logvar = qz[:2]
    else:
        z_mean, z_logvar, z_chol = qz[:3]

    sw = np.expand_dims(sw, axis=-1)
    m_y = np.mean(np.mean(y_mean, axis=0))
    s2_y = np.sum(np.sum(np.exp(y_logvar) + y_mean**2, axis=0) /
                  y_logvar.shape[0] - m_y**2)
    m_z = np.mean(np.sum(np.sum(z_mean*sw, axis=1), axis=0) / np.sum(sw))
    s2_z = np.sum(np.sum(np.sum((np.exp(z_logvar) + z_mean**2)*sw, axis=1), axis=0) /
                  np.sum(sw) - m_z**2)
    logging.info('m_y: %.2f, trace_y: %.2f, m_z: %.2f, trace_z: %.2f' %
                 (m_y, s2_y, m_z, s2_z))
    logging.info('Trace elapsed time: %.2f' % (time.time() - t1))

    t1 = time.time()
    vae.build(num_samples_y=1, num_samples_z=1, max_seq_length=2)
    vae.compile()

    x1 = x[:, 0, :]
    x2 = x[:, 1, :]

    # scores = vae.eval_llr_1vs1_elbo(x1, x2, num_samples=10)
    # tar = scores[np.eye(scores.shape[0], dtype=bool)]
    # non = scores[np.logical_not(np.eye(scores.shape[0], dtype=bool))]
    # logging.info('m_tar: %.2f s_tar: %.2f' % (np.mean(tar), np.std(tar)))
    # logging.info('m_non: %.2f s_non: %.2f' % (np.mean(non), np.std(non)))

    # scores = vae.eval_llr_1vs1_cand(x1, x2)
    # tar = scores[np.eye(scores.shape[0], dtype=bool)]
    # non = scores[np.logical_not(np.eye(scores.shape[0], dtype=bool))]
    # logging.info('m_tar: %.2f s_tar: %.2f' % (np.mean(tar), np.std(tar)))
    # logging.info('m_non: %.2f s_non: %.2f' % (np.mean(non), np.std(non)))

    scores = vae.eval_llr_1vs1_qscr(x1, x2)
    tar = scores[np.eye(scores.shape[0], dtype=bool)]
    non = scores[np.logical_not(np.eye(scores.shape[0], dtype=bool))]
    logging.info('m_tar: %.2f s_tar: %.2f' % (np.mean(tar), np.std(tar)))
    logging.info('m_non: %.2f s_non: %.2f' % (np.mean(non), np.std(non)))
    logging.info('Eval elapsed time: %.2f' % (time.time() - t1))
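# The evaluation block above only logs the means and standard deviations of
# the target (same-class, diagonal) and non-target scores produced by
# eval_llr_1vs1_qscr.  The helper below is a small illustrative addition, not
# part of the original script, that turns such tar/non arrays into a rough
# equal error rate with a plain threshold sweep (assumes numpy as np is
# imported at module level):
def eer_from_scores(tar, non):
    thr = np.sort(np.concatenate((tar, non)))
    p_miss = np.array([np.mean(tar < t) for t in thr])   # targets rejected
    p_fa = np.array([np.mean(non >= t) for t in thr])    # non-targets accepted
    i = np.argmin(np.abs(p_miss - p_fa))
    return 0.5 * (p_miss[i] + p_fa[i])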