        if n_iter % 500 == 0:
            stats_str = [('DIS_COSTS', 'Discriminator loss')]
            stats_log = ['%s: %.4f' % (v, np.mean(stats[k]))
                         for k, v in stats_str if len(stats[k]) > 0]
            stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
            logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

            # reset
            tic = time.time()
            n_words_proc = 0
            for k, _ in stats_str:
                del stats[k][:]

    # embeddings / discriminator evaluation
    to_log = OrderedDict({'n_epoch': n_epoch})
    evaluator.all_eval(to_log)
    evaluator.eval_dis(to_log)

    # JSON log / save best model / end of epoch
    logger.info("__log__:%s" % json.dumps(to_log))
    trainer.save_best(to_log, VALIDATION_METRIC)
    logger.info('End of epoch %i.\n\n' % n_epoch)

    # update the learning rate (stop if too small)
    trainer.update_lr(to_log, VALIDATION_METRIC)
    if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
        logger.info('Learning rate < 1e-6. BREAK.')
        break

"""
def main():
    VALIDATION_METRIC_SUP = 'precision_at_1-csls_knn_10'
    VALIDATION_METRIC_UNSUP = 'mean_cosine-csls_knn_10-S2T-10000'

    # main
    parser = argparse.ArgumentParser(description='Supervised training')
    parser.add_argument("--seed", type=int, default=-1, help="Initialization seed")
    parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
    parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
    parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
    parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
    parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
    parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)")
    # data
    parser.add_argument("--src_lang", type=str, default='en', help="Source language")
    parser.add_argument("--tgt_lang", type=str, default='es', help="Target language")
    parser.add_argument("--aux_lang", type=str, default='', help="Auxiliary language")
    parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension")
    parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)")
    # training refinement
    parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)")
    # dictionary creation parameters (for refinement)
    parser.add_argument("--dico_train", type=str, default="default", help="Path to training dictionary (default: use identical character strings)")
    parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary")
    parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)")
    parser.add_argument("--dico_build", type=str, default='S2T&T2S', help="S2T,T2S,S2T|T2S,S2T&T2S")
    parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation")
    parser.add_argument("--dico_max_rank", type=int, default=10000, help="Maximum dictionary words rank (0 to disable)")
    parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)")
    parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)")
    # reload pre-trained embeddings
    parser.add_argument("--src_emb", type=str, default='', help="Reload source embeddings")
    parser.add_argument("--tgt_emb", type=str, default='', help="Reload target embeddings")
    parser.add_argument("--aux_emb", type=str, default='', help="Reload auxiliary embeddings")
    parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training")
    parser.add_argument("--fitting_method", type=str, default="non_iterative", help="Method of fitting, one of [non_iterative, em, gauss_seidel, gradient_based]")

    # parse parameters
    params = parser.parse_args()

    # check parameters
    assert not params.cuda or torch.cuda.is_available()
    assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train)
    assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
    assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
    assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
    print(params.src_emb, params.tgt_emb, params.aux_emb)
    assert os.path.isfile(params.src_emb)
    assert os.path.isfile(params.tgt_emb)
    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
    assert params.export in ["", "txt", "pth"]

    # build logger / model / trainer / evaluator
    logger = initialize_exp(params)
    src_emb, tgt_emb, aux_emb, mapping, _ = build_model(params, False)
    trainer = Trainer(src_emb, tgt_emb, aux_emb, mapping, None, params)

    # load a training dictionary. if a dictionary path is not provided, use a default
    # one ("default") or create one based on identical character strings ("identical_char")
    trainer.load_training_dico(params.dico_train)

    # define the validation metric
    VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' else VALIDATION_METRIC_SUP
    logger.info("Validation metric: %s" % VALIDATION_METRIC)

    # apply the PCCA solution
    trainer.fit(fitting_method=params.fitting_method)

    # IMPORTANT: EVALUATOR SHOULD BE CREATED AFTER TRAINER HAS BEEN FITTED
    evaluator = Evaluator(trainer)

    # embeddings evaluation
    to_log = OrderedDict({})
    evaluator.all_eval(to_log)
    logger.info("__log__:%s" % json.dumps(to_log))
""" Learning loop for Procrustes Iterative Learning """ for n_iter in range(params.n_refinement + 1): logger.info('Starting iteration %i...' % n_iter) # build a dictionary from aligned embeddings (unless # it is the first iteration and we use the init one) if n_iter > 0 or not hasattr(trainer, 'dico'): trainer.build_dictionary() # apply the Procrustes solution trainer.procrustes() # embeddings evaluation to_log = OrderedDict({'n_iter': n_iter}) evaluator.all_eval(to_log) # JSON log / save best model / end of epoch logger.info("__log__:%s" % json.dumps(to_log)) trainer.save_best(to_log, VALIDATION_METRIC) logger.info('End of iteration %i.\n\n' % n_iter) # export embeddings if params.export: trainer.reload_best() trainer.export()
""" for n_iter in range(params.n_refinement + 1): logger.info('Starting iteration %i...' % n_iter) # build a dictionary from aligned embeddings (unless # it is the first iteration and we use the init one) if n_iter > 0 or not hasattr(trainer, 'dico'): trainer.build_dictionary() # apply the Procrustes solution trainer.procrustes() # embeddings evaluation to_log = OrderedDict({'n_iter': n_iter}) evaluator.all_eval(to_log, exclude=code) # JSON log / save best model / end of epoch logger.info("__log__:%s" % json.dumps(to_log)) trainer.save_best(to_log, params.val_metric, n_iter) logger.info('End of iteration %i.\n\n' % n_iter) #get rank of left-out code trainer.reload_best() desc_repr = trainer.tgt_emb.weight[trainer.tgt_dico.word2id[word]] code_sims = cos(trainer.mapping(trainer.src_emb.weight), desc_repr.unsqueeze(0)).data.cpu().numpy() print("getting similarity rank of code %s" % code) rank = len(code_sims) - np.where(np.argsort(code_sims) == trainer.src_dico.word2id[code])[0][0] code_sims_unaligned = cos(trainer.src_emb.weight, desc_repr.unsqueeze(0)).data.cpu().numpy() rank_u = len(code_sims) - np.where(np.argsort(code_sims_unaligned) == trainer.src_dico.word2id[code])[0][0]
    if n_iter > params.n_refinement - params.fine_tuning:
        support = False

    logger.info('Starting iteration %i...' % n_iter)

    # build a dictionary from aligned embeddings (unless
    # it is the first iteration and we use the init one)
    if n_iter > 0 or not hasattr(trainer, 'dico'):
        trainer.build_dictionary(support)

    # apply the Procrustes solution
    if params.generalized:
        trainer.generalized_procrustes(support, n_iter == 0)
    else:
        trainer.simple_procrustes()

    # embeddings evaluation
    to_log = OrderedDict({'n_iter': n_iter})
    biling_dict = True
    evaluator.all_eval(to_log, biling_dict)

    # JSON log / save best model / end of epoch
    logger.info("__log__:%s" % json.dumps(to_log))
    trainer.save_best(to_log, VALIDATION_METRIC.format(params.tgt_lang[-1]))
    logger.info('End of iteration %i.\n\n' % n_iter)

# export embeddings
if params.export:
    trainer.reload_best()
    trainer.export()
                '%s: %.4f' % (v, np.mean(stats[k]))
                for k, v in stats_str if len(stats[k]) > 0
            ]
            stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
            logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

            # reset
            tic = time.time()
            n_words_proc = 0
            for k, _ in stats_str:
                del stats[k][:]

    # embeddings / discriminator evaluation
    to_log = OrderedDict({'n_epoch': n_epoch})
    evaluator.all_eval(to_log, n_epoch)
    evaluator.eval_dis(to_log)

    # JSON log / save best model / end of epoch
    logger.info("__log__:%s" % json.dumps(to_log))
    trainer.save_best(to_log, VALIDATION_METRIC)
    logger.info('End of epoch %i.\n\n' % n_epoch)

    # update the learning rate (stop if too small)
    trainer.update_lr(to_log, VALIDATION_METRIC)
    if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
        logger.info('Learning rate < 1e-6. BREAK.')
        break

"""
Learning loop for Procrustes Iterative Refinement
"""
                '%s: %.4f' % (v, np.mean(stats[k]))
                for k, v in stats_str if len(stats[k]) > 0
            ]
            stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
            logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

            # reset
            tic = time.time()
            n_words_proc = 0
            for k, _ in stats_str:
                del stats[k][:]

    # embeddings / discriminator evaluation
    to_log = OrderedDict({'n_epoch': n_epoch})
    evaluator.all_eval(to_log, 0)
    evaluator.eval_dis(to_log)

    # JSON log / save best model / end of epoch
    logger.info("__log__:%s" % json.dumps(to_log))
    trainer.save_best(to_log, VALIDATION_METRIC)
    logger.info('End of epoch %i.\n\n' % n_epoch)

    # update the learning rate (stop if too small)
    trainer.update_lr(to_log, VALIDATION_METRIC)
    if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
        logger.info('Learning rate < 1e-6. BREAK.')
        break

"""
Learning loop for Procrustes Iterative Refinement
"""
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)

np.random.seed(params.seed)

# init generator parameters with artetxe's methods
if params.map_init == "second_order":
    m_init = extract_initial_mapping(params, src_emb, tgt_emb)
    trainer.set_mapping_weights(torch.from_numpy(m_init))

# if we initialize the generator from a supervised mapping, evaluate before training for sanity check
if not params.map_id_init:
    # embeddings / discriminator evaluation
    to_log = OrderedDict({'n_epoch': -1})
    evaluator.all_eval(to_log)
    evaluator.eval_dis(to_log)

"""
Learning loop for Adversarial Training
"""
if params.adversarial:

    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}
                '%s: %.4f' % (v, np.mean(stats[k]))
                for k, v in stats_str if len(stats[k]) > 0
            ]
            stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
            logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

            # reset
            tic = time.time()
            n_words_proc = 0
            for k, _ in stats_str:
                del stats[k][:]

    # embeddings / discriminator evaluation
    to_log = OrderedDict({'n_epoch': n_epoch})
    evaluator.all_eval(to_log, map='to_tgt')
    evaluator.eval_dis(to_log)

    # JSON log / save best model / end of epoch
    logger.info("__log__:%s" % json.dumps(to_log))
    trainer.save_best(to_log, VALIDATION_METRIC, map='to_tgt')
    logger.info('End of epoch %i.\n\n' % n_epoch)

    # update the learning rate (stop if too small)
    trainer.update_lr(to_log, VALIDATION_METRIC)
    if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
        logger.info('Learning rate < 1e-6. BREAK.')
        break

"""
Learning loop for Procrustes Iterative Refinement
"""
                '%s: %.4f' % (v, np.mean(stats[k]))
                for k, v in stats_str if len(stats[k]) > 0
            ]
            stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
            logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

            # reset
            tic = time.time()
            n_words_proc = 0
            for k, _ in stats_str:
                del stats[k][:]

    # embeddings / discriminator evaluation
    to_log = OrderedDict({'n_epoch': n_epoch})
    evaluator.all_eval(to_log, True)
    evaluator.eval_dis(to_log)

    # JSON log / save best model / end of epoch
    logger.info("__log__:%s" % json.dumps(to_log))
    trainer.save_best(to_log, VALIDATION_METRIC.format(params.tgt_lang[-1]))
    logger.info('End of epoch %i.\n\n' % n_epoch)

    # update the learning rate (stop if too small)
    trainer.update_lr(to_log, VALIDATION_METRIC.format(params.tgt_lang[-1]))
    if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
        logger.info('Learning rate < 1e-6. BREAK.')
        break

"""
def learning(params, src_data, tgt_data, options):
    VALIDATION_METRIC = 'mean_cosine-csls_knn_10-S2T-10000'
    logger = logging.getLogger('{}Log'.format(src_data.dataname))

    for i in range(10):
        # tic = time.time()
        if i == 0:
            options.initialize = True
        else:
            options.initialize = False

        logger.info("src_learning round {}".format(i + 1))
        src_data = RVSML_OT_Learning(src_data, options, params)
        logger.info("tgt_learning round {}".format(i + 1))
        tgt_data = RVSML_OT_Learning(tgt_data, options, params)

        # build model / trainer / evaluator
        src_emb, tgt_emb, mapping, discriminator = build_model(params, src_data, tgt_data, True)
        trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
        evaluator = Evaluator(trainer)

        """
        Learning loop for Adversarial Training
        """
        logger.info('----> ADVERSARIAL TRAINING <----\n\n')

        # training loop
        for n_epoch in range(params.n_epochs):

            logger.info('Starting adversarial training epoch %i...' % n_epoch)
            tic = time.time()
            n_words_proc = 0
            stats = {'DIS_COSTS': []}

            for n_iter in range(0, params.epoch_size, params.batch_size):

                # discriminator training
                for _ in range(params.dis_steps):
                    trainer.dis_step(stats)

                # mapping training (discriminator fooling)
                n_words_proc += trainer.mapping_step(stats)

                # log stats
                if n_iter % 500 == 0:
                    stats_str = [('DIS_COSTS', 'Discriminator loss')]
                    stats_log = ['%s: %.4f' % (v, np.mean(stats[k]))
                                 for k, v in stats_str if len(stats[k]) > 0]
                    stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
                    logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                    # reset
                    tic = time.time()
                    n_words_proc = 0
                    for k, _ in stats_str:
                        del stats[k][:]

            # embeddings / discriminator evaluation
            to_log = OrderedDict({'n_epoch': n_epoch})
            evaluator.all_eval(to_log)
            evaluator.eval_dis(to_log)

            # JSON log / save best model / end of epoch
            logger.info("__log__:%s" % json.dumps(to_log))
            trainer.save_best(to_log, VALIDATION_METRIC)
            logger.info('End of epoch %i.\n\n' % n_epoch)

            # update the learning rate (stop if too small)
            trainer.update_lr(to_log, VALIDATION_METRIC)
            if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
                logger.info('Learning rate < 1e-6. BREAK.')
                break

        """
        Learning loop for Procrustes Iterative Refinement
        """
        # if params.n_refinement > 0:
        # Get the best mapping according to VALIDATION_METRIC
        logger.info('----> ITERATIVE PROCRUSTES REFINEMENT <----\n\n')
        trainer.reload_best()

        # training loop
        for n_iter in range(params.n_refinement):

            logger.info('Starting refinement iteration %i...' % n_iter)

            # build a dictionary from aligned embeddings
            trainer.build_dictionary()

            # apply the Procrustes solution
            trainer.procrustes()

            # embeddings evaluation
            to_log = OrderedDict({'n_iter': n_iter})
            evaluator.all_eval(to_log)

            # JSON log / save best model / end of epoch
            logger.info("__log__:%s" % json.dumps(to_log))
            trainer.save_best(to_log, VALIDATION_METRIC)
            logger.info('End of refinement iteration %i.\n\n' % n_iter)

        src_data.trans_mat = torch.mm(src_data.trans_mat, trainer.mapping.weight.data)

    return src_data, tgt_data
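# The validation metric 'mean_cosine-csls_knn_10-S2T-10000' used above relies on Cross-Domain
# Similarity Local Scaling (CSLS). A minimal NumPy sketch of the score is given below, assuming
# row-normalized embeddings so that dot products equal cosine similarities; csls_scores is an
# illustrative helper, not this repository's evaluator code.
import numpy as np

def csls_scores(mapped_src, tgt, k=10):
    """CSLS(x, y) = 2*cos(x, y) - r_tgt(x) - r_src(y), where r_* is the mean cosine
    similarity to the k nearest neighbors in the other space."""
    sims = mapped_src @ tgt.T                            # (n_src, n_tgt) cosine similarities
    r_tgt = np.sort(sims, axis=1)[:, -k:].mean(axis=1)   # each source word's k-NN mean similarity
    r_src = np.sort(sims, axis=0)[-k:, :].mean(axis=0)   # each target word's k-NN mean similarity
    return 2 * sims - r_tgt[:, None] - r_src[None, :]

# usage sketch: best_translations = csls_scores(mapped_src, tgt).argmax(axis=1)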
                '%s: %.4f' % (v, np.mean(stats[k]))
                for k, v in stats_str if len(stats[k]) > 0
            ]
            stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
            logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

            # reset
            tic = time.time()
            n_words_proc = 0
            for k, _ in stats_str:
                del stats[k][:]

    # embeddings / discriminator evaluation
    to_log = OrderedDict({'n_epoch': n_epoch})
    evaluator.all_eval(to_log)  # AssertionError
    evaluator.eval_dis(to_log)

    # JSON log / save best model / end of epoch
    logger.info("__log__:%s" % json.dumps(to_log))
    trainer.save_best(to_log, VALIDATION_METRIC)
    logger.info('End of epoch %i.\n\n' % n_epoch)

    # update the learning rate (stop if too small)
    trainer.update_lr(to_log, VALIDATION_METRIC)
    if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
        logger.info('Learning rate < 1e-6. BREAK.')
        break

"""
Learning loop for Procrustes Iterative Refinement
"""