def main(params):
    """Run LM / MT training (or evaluation only) as configured by `params`."""
    # multi-GPU / multi-node setup must happen first
    init_distributed_mode(params)
    # experiment logger
    logger = initialize_exp(params)
    # SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # model(s): a single transformer, or an encoder / decoder pair
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # trainer (reloads potential checkpoints) and matching evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: run all evals, log the scores, and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # sampling probabilities over the training sets
    set_sampling_probs(data, params)

    # main training loop
    for _epoch in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:

            # causal LM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # masked LM steps (TLM when lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # denoising auto-encoder steps (same language on both sides)
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # supervised machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3,
                                params.lambda_bt, params.bt_sample_temperature)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch: checkpointing and bookkeeping
        if params.validation_metrics != '':
            trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Run training (or evaluation only) over the configured tasks."""
    # distributed setup, experiment logger, SLURM pre-emption handler
    init_distributed_mode(params)
    logger = initialize_exp(params)
    init_signal_handler()

    # CPU / CUDA
    if params.cpu:
        assert not params.multi_gpu
    else:
        assert torch.cuda.is_available()
    src.utils.CUDA = not params.cpu

    # environment / modules / trainer / evaluator
    env = build_env(params)
    modules = build_modules(env, params)
    trainer = Trainer(modules, env, params)
    evaluator = Evaluator(trainer)

    # evaluation-only mode: run all evals, log the scores, and stop
    if params.eval_only:
        scores = evaluator.run_all_evals()
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # main training loop
    for _epoch in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_equations = 0
        while trainer.n_equations < trainer.epoch_size:

            # visit the tasks in a fresh random order each pass
            order = np.random.permutation(len(params.tasks))
            for task_id in order:
                task = params.tasks[task_id]
                # either export generated data, or do a training step
                if params.export_data:
                    trainer.export_data(task)
                else:
                    trainer.enc_dec_step(task)
                trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals()

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch: checkpointing and bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Run masked-LM training (or evaluation only) with a single model."""
    # multi-GPU / multi-node setup must happen first
    init_distributed_mode(params)
    # experiment logger
    logger = initialize_exp(params)
    # SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # data and model
    data = load_data(params)
    model = build_model(params, data['dico'])

    # trainer (reloads potential checkpoints) and evaluator
    trainer = Trainer(model, data, params)
    evaluator = Evaluator(trainer, data, params)

    # evaluation-only mode: run all evals, log the scores, and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for name, value in scores.items():
            logger.info("%s -> %.6f" % (name, value))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # sampling probabilities over the training sets
    set_sampling_probs(data, params)

    # main training loop
    for _epoch in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            # one masked-LM training step per iteration
            trainer.mlm_step(params.lambda_mlm)
            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for name, value in scores.items():
            logger.info("%s -> %.6f" % (name, value))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch: checkpointing and bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Run pre-training / MT training (or evaluation only), with optional
    fp16 and multi-GPU wrapping of the model(s)."""
    # multi-GPU / multi-node setup must happen first
    init_distributed_mode(params)
    # experiment logger
    logger = initialize_exp(params)
    # SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # model(s): a single transformer, or an encoder / decoder pair
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed wrappers
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        if params.encoder_only:
            model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
        else:
            encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
            decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)

    # trainer (reloads potential checkpoints) and matching evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: run all evals, log + dump the scores, and stop
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        # also write the scores (plus the sampling temperature) to a JSON file
        with open(params.eval_output, 'w') as f:
            scores['temperature'] = params.softmax_temperature
            json.dump(dict(scores), f, indent=4)
        exit()

    # sampling probabilities over the training sets
    set_sampling_probs(data, params)

    # main training loop
    for _epoch in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:

            # causal LM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # masked LM steps (TLM when lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps (same language on both sides)
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)

            # supervised machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            # back-parallel steps
            for lang1, lang2 in shuf_order(params.bmt_steps, params):
                trainer.bmt_step(lang1, lang2, params.lambda_bmt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # NOTE: per-epoch evaluation is deliberately disabled here
        # (the run_all_evals call below is intentionally commented out)
        # scores = evaluator.run_all_evals(trainer)
        scores = {}

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch: checkpointing and bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """Distillation entry point: train a small (layer-cut) student model with
    guidance from a big teacher model.

    The big ("teacher") trainer only drives the student's training steps;
    evaluation and checkpointing are done on the small trainer exclusively.
    Requires `params.cut_layer` to be set, since the student is the cut model.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    # initialize the experiment
    logger = initialize_exp(params)
    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # BUGFIX: the original only built the small model when params.cut_layer was
    # set, but used it unconditionally below, crashing with a NameError
    # otherwise. Fail fast with a clear error instead.
    assert params.cut_layer, "params.cut_layer must be set: the student model is built from the cut layers"

    # build the big (teacher) model
    if params.encoder_only:
        big_model = build_model(params, data['dico'], cut=False)
    else:
        big_encoder, big_decoder = build_model(params, data['dico'], cut=False)

    # build the small (student) model from the cut layers
    if params.encoder_only:
        small_model = build_model(params, data['dico'], cut=True)
    else:
        small_encoder, small_decoder = build_model(params, data['dico'], cut=True)

    # build the big trainer, reload potential checkpoints
    # (the big trainer is only used for training, so it doesn't need an evaluator)
    if params.encoder_only:
        big_trainer = SingleTrainer(big_model, data, params)
    else:
        big_trainer = EncDecTrainer(big_encoder, big_decoder, data, params)

    # reset all objective coefficients before building the small trainer
    # (presumably so the student's steps run with unit weight — TODO confirm)
    params.lambda_mlm = "1"
    params.lambda_clm = "1"
    params.lambda_pc = "1"
    params.lambda_ae = "1"
    params.lambda_mt = "1"
    params.lambda_bt = "1"

    # build the small trainer, and use it for the evaluator
    if params.encoder_only:
        small_trainer = small_SingleTrainer(small_model, data, params)
        evaluator = SingleEvaluator(small_trainer, data, params)
    else:
        small_trainer = small_EncDecTrainer(small_encoder, small_decoder, data, params)
        evaluator = EncDecEvaluator(small_trainer, data, params)

    # evaluation only, on the small trainer
    if params.eval_only:
        scores = evaluator.run_all_evals(small_trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % small_trainer.epoch)

        small_trainer.n_sentences = 0
        while small_trainer.n_sentences < small_trainer.epoch_size:

            # CLM steps (teacher-guided)
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                small_trainer.clm_step(lang1, lang2, params.lambda_clm, big_trainer)

            # MLM steps (also includes TLM if lang2 is not None; teacher-guided)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                small_trainer.mlm_step(lang1, lang2, params.lambda_mlm, big_trainer)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                small_trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps (teacher-guided)
            for lang in shuf_order(params.ae_steps):
                small_trainer.mt_step(lang, lang, params.lambda_ae, big_trainer)

            # machine translation steps (teacher-guided)
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                small_trainer.mt_step(lang1, lang2, params.lambda_mt, big_trainer)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                small_trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            small_trainer.iter()

        logger.info("============ End of epoch %i ============" % small_trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(small_trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch: checkpointing and bookkeeping (small trainer only)
        small_trainer.save_best_model(scores)
        small_trainer.save_periodic()
        small_trainer.end_epoch(scores)