def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)
    hidden_size = 1024
    encoder = EncoderRNN.EncoderRNN(params.n_words, hidden_size).cuda()
    decoder = Attention_decoder.Attention_decoder(hidden_size,
                                                  params.n_words,
                                                  dropout_p=0.1).cuda()
    trainer = LSTM_Trainer(encoder, decoder, data, params)
    evaluator = LSTM_Evaluator(trainer, data, params)
    # set sampling probabilities for training
    set_sampling_probs(data, params)
    # language model training
    for count in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.try_lstm(lang1, lang2, params.lambda_mt)

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

    # save the output of softmax
    trainer.save_softmax_output(clm_temp, 'clm_temp')
    trainer.save_softmax_output(ml_temp, 'ml_temp')
    trainer.save_softmax_output(bt_temp, 'bt_temp')
Esempio n. 2
0
def seq2seq_main(params):
    '''
      Use different vocabulary/dictionary for src and tgt
    '''

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = seq2seq_load_data(params)

    # build model
    # 因為 language pair 會重新升冪排序 (zh-en) --> (en-zh)
    # 所以 en 變成 src , zh 變成 tgt
    encoder, decoder = build_seq2seq_model(
        params, data['tgt_dico'], data['src_dico'])

    # build trainer, reload potential checkpoints / build evaluator
    trainer = EncDecTrainer(encoder, decoder, data, params)
    evaluator = MyEncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 3
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed
    # if params.multi_gpu:
    #     logger.info("Using nn.parallel.DistributedDataParallel ...")
    #     if params.encoder_only:
    #         model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
    #     else:
    #         encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
    #         decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)
            
            # back-parallel steps
            for lang1, lang2 in shuf_order(params.bmt_steps, params):
                trainer.bmt_step(lang1, lang2, params.lambda_bmt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 4
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    params.lgs = lgs = params.lgs.split("-")
    if len(lgs) == 1:
        lgs.append(lgs[0])

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # Replace the original MLM steps
            for lang1, lang2 in shuf_order(params.mlm_steps, params):

                if params.do_meta_update:
                    trainer.meta_mlm_step(lang1)
                else:
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 5
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    # reload-model options are in here
    if params.encoder_only:
        model = build_model(params, data['dico'])

        if params.use_adapters:
            logger.info("Using adapters")
            for param in model.named_parameters():

                if param[0][:8] != "adapters":
                    param[1].requires_grad = False

            for param_name, param in model.embeddings.named_parameters():
                param.requires_grad = True
            for param_name, param in model.position_embeddings.named_parameters(
            ):
                param.requires_grad = True
            for param_name, param in model.pred_layer.named_parameters():
                param.requires_grad = True
            for param in model.layer_norm_emb.parameters():
                param.requires_grad = True
            for param in model.named_parameters():
                logger.info(param[0] + ' required grad = ' +
                            str(param[1].requires_grad))

    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
        logger.info("Number of trainable parameters (encoder): %i" % sum(
            [p.numel()
             for p in trainer.model.parameters() if p.requires_grad]))

    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)
        logger.info(
            "Number of trainable parameters (encoder): %i" %
            sum([p.numel() for p in encoder.parameters() if p.requires_grad]))
        logger.info(
            "Number of trainable parameters (decoder): %i" %
            sum([p.numel() for p in decoder.parameters() if p.requires_grad]))

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for epoch in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 6
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)
    
    # initialize the experiment
    logger = initialize_exp(params)

#     # initialize SLURM signal handler for time limit / pre-emption
#     init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

#     # float16
#     if params.fp16:
#         assert torch.backends.cudnn.enabled
#         if params.encoder_only:
#             model = network_to_half(model)
#         else:
#             encoder = network_to_half(encoder)
#             decoder = network_to_half(decoder)

#     # distributed
#     if params.multi_gpu:
#         logger.info("Using nn.parallel.DistributedDataParallel ...")
#         if params.fp16:
#             if params.encoder_only:
#                 model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
#             else:
#                 encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
#                 decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)
#         else:
#             if params.encoder_only:
#                 model = nn.parallel.DistributedDataParallel(model, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)
#             else:
#                 encoder = nn.parallel.DistributedDataParallel(encoder, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)
#                 decoder = nn.parallel.DistributedDataParallel(decoder, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

#     # evaluation
#     if params.eval_only:
#         scores = evaluator.run_all_evals(trainer)
#         for k, v in scores.items():
#             logger.info("%s -> %.6f" % (k, v))
#         logger.info("__log__:%s" % json.dumps(scores))
#         exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        trainer.n_images = 0

        while trainer.n_sentences < trainer.epoch_size or trainer.n_images < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            # shuf_order's result could be: ['fr', 'fr'] or ['en', 'fr'] or ['fr', 'en'] or ['en', 'en']
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)
                
            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)
                
            # Image-language pretraining steps 
            trainer.ipm_step("coco36", params.lambda_ipm)

            # CMLM steps steps
            for m1, m2 in shuf_order(params.cmlm_steps, params):
                trainer.cmlm_step(m1, m2, params.lambda_cmlm)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()


        logger.info("============ End of epoch %i ============" % trainer.epoch)
Esempio n. 7
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        if params.encoder_only:
            model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
        else:
            encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
            decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        logger.info('Evaluating and saving new result file')
        scores = evaluator.run_all_evals_match(trainer)
        for k, v in scores.items():
            if 'likelihood' in k:
                logger.info("%s -> %.6f" % (k, np.mean(v)))
            elif 'scores' in k:
                logger.info("%s -> %s" % (k, v.shape))
            else:
                logger.info("%s -> %.6f" % (k, v))

        np.savetxt(os.path.join(params.dump_path, 'best-fwd-prediction.txt'),scores['%s_%s_fwd_scores' % ('test', params.mass_steps[0])],fmt='%f')
        for match in params.match_files.split(','):
            np.savetxt(os.path.join(params.dump_path, 'best-match-prediction{}.txt'.format(match.split('.')[-1])),
                   scores['%s_%s_sentence_likelihood' % (match, params.mass_steps[0])], fmt='%f')
        labels = np.loadtxt(os.path.join(params.data_path, 'labels'))
        targets = np.loadtxt(os.path.join(params.data_path, 'suffix'))
        preds = scores['%s_%s_sentence_likelihood' % ('match', params.mass_steps[0])]
        results = pd.DataFrame({'label': labels, 'target': targets, 'pred': preds})
        results.to_pickle(os.path.join(params.dump_path, 'best-matching-prediction.pkl'))
        #logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)
    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)
            trainer.iter()
        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_epoch_evals_match(trainer)
        # print / JSON log
        for k, v in scores.items():
            if 'likelihood' in k:
                logger.info("%s -> %.6f" % (k, np.mean(v)))
            elif 'scores' in k:
                logger.info("%s -> %s" % (k, v.shape))
            else:
                logger.info("%s -> %.6f" % (k, v))
        #if params.is_master:
            #logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 8
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # load checkpoint
    if params.model_path != "":
        reloaded = torch.load(params.model_path)
        model_params = AttrDict(reloaded['params'])
        dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                          reloaded['dico_counts'])
        encoder = TransformerModel(model_params,
                                   dico,
                                   is_encoder=True,
                                   with_output=True).cuda().eval()
        decoder = TransformerModel(model_params,
                                   dico,
                                   is_encoder=False,
                                   with_output=True).cuda().eval()
        encoder = TransformerModel(model_params,
                                   dico,
                                   is_encoder=True,
                                   with_output=True).cuda().eval()
        decoder = TransformerModel(model_params,
                                   dico,
                                   is_encoder=False,
                                   with_output=True).cuda().eval()
        encoder.load_state_dict(reloaded['encoder'])
        decoder.load_state_dict(reloaded['decoder'])
        logger.info("Supported languages: %s" %
                    ", ".join(model_params.lang2id.keys()))
    else:
        # build model
        if params.encoder_only:
            model = build_model(params, data['dico'])
        else:
            encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 9
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    if params.other_seed > -1:
        # deterministic
        torch.manual_seed(params.other_seed)
        torch.cuda.manual_seed(params.other_seed)
        np.random.seed(params.other_seed)
        random.seed(params.other_seed)

    if params.iter_seed == -1:
        # non-deterministic
        params.iter_seed = None

    # load data
    data = load_data(params)
    writer = SummaryWriter(params.dump_path + "/" + params.exp_name + "_log")

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        sys.exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)
    _iter = 0

    # dump initial weights
    if params.save_initial:
        trainer.save_checkpoint('initial', include_optimizers=False)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                if params.only_vlm:
                    # with visual features
                    trainer.vlm_step(lang1, lang2, params.lambda_mlm, _iter)
                else:
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm, _iter)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            for lang1, lang2 in shuf_order(params.mmt_steps, params):
                trainer.mmt_step(lang1, lang2, params.lambda_mt)

            trainer.iter()
            _iter += 1

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            writer.add_scalar(k, v, _iter)
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 10
0
def main(params):
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)
    print(data)

    # build model
    if params.encoder_only:
        model = build_model(params)
    else:
        encoder, decoder = build_model(params)

    # build trainer, reload potential checkpoints / build evaluator

    trainer = XTrainer(model, data, params)
    evaluator = XEvaluator(trainer, data, params)
    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:
            # MLM steps (also includes TLM if lang2 is not None)

            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                if params.is_understanding:
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            for lang1, lang2 in shuf_order(params.text_steps, params):
                if params.is_ntg:
                    trainer.ntg_step(lang1, None, params.lambda_mlm)

            # cross-modal caption steps
            for lang1, lang2 in shuf_order(params.cross_modal_steps, params):
                if params.is_mt:
                    trainer.mt_ic_step(lang1, lang2, params.lambda_ic)
                else:
                    trainer.ic_step(lang1, lang2, params.lambda_ic)

                if params.is_freelb:
                    trainer.free_lb_ic_step(lang1, lang2, params.lambda_ic)

            for lang1, lang2 in shuf_order(params.mlm_steps, params, n=3):
                if params.is_generation:
                    trainer.bart_mlm_step(lang1, lang2, params.lambda_imlm)
                    trainer.bart_mass_step(lang1, lang2, params.lambda_imlm)

            for lang1, lang2 in shuf_order(params.cross_ae_steps, params):
                trainer.bart_img_step(lang1, lang2, params.lambda_ida)

            for lang1, lang2 in shuf_order(params.cross_rel_steps, params):
                if params.is_pretrain:
                    trainer.pretrain_rel_step(lang1, lang2)
                else:
                    if params.is_slide:
                        trainer.slide_step(lang1, lang2, params.lambda_t2i)
                    else:
                        # support multi languages
                        trainer.rel_step(lang1, lang2, params.lambda_t2i, params.lambda_i2t)

            # for lang1, lang2 in shuf_order(params.cross_mlm_steps, params):
            #     trainer.mlm_step(lang1, lang2, params.lambda_mlm)
            #
            # for lang1, lang2 in shuf_order(params.cross_mrm_steps, params):
            #     trainer.mrm_step(lang1, lang2, params.lambda_mrm)
            #
            # for lang1, lang2 in shuf_order(params.cross_mrfr_steps, params):
            #     trainer.mrfr_step(lang1, lang2, params.lambda_mrfr)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        evaluate_results = []
        import os
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))
            evaluate_results.append(json.dumps(scores))
            with open(os.path.join(params.dump_path, "epoch_{0}.eval_log".format(trainer.epoch)), 'w') as writer:
                for line in evaluate_results:
                    writer.write(line + '\n')

        # end of epoch
        trainer.save_best_model(scores)
        if trainer.epoch % params.save_every_epoch == 0 and params.is_master:
            trainer.save_model('model_pretrain_%i' % trainer.epoch)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 11
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    voc_path = "/home/zchen/XLM/data/processed/XLM_en_zh/50k/vocab"
    data = load_wikisum_data(voc_path, params.data_path, params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    elif params.WS:
        global_encoder, local_encoder, decoder = build_model(
            params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    trainer = WikisumTrainer(global_encoder, local_encoder, decoder, data,
                             params)
    evaluator = WikisumEvaluator(trainer, data, params, data_set="valid")

    # evaluation
    if params.eval_only:
        scores = evaluator.evaluate(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):
        trainer.epoch += 1

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        for i, batch in enumerate(trainer.dataloader):
            for lang1, lang2 in shuf_order(params.ws_steps, params):
                trainer.step(lang1, lang2, batch)

            trainer.iter()
            # if i > 1000:
            # break

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.evaluate(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 12
0
def main(params):
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    if params.build_nmt_domain_feature:
        import torch
        from src.curriculum import build_nmt_domain_feature
        dataset = data['para'][('de', 'en')]['train']
        batches, indices = dataset.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )

        features = build_nmt_domain_feature(data, params, batches, dataset)
        result = {'indices': indices, 'domain_feature': features}
        torch.save(result, params.build_output_path)
        return

    if params.build_nlm_domain_feature:
        import torch
        from src.curriculum import build_nlm_domain_feature
        dataset = data['para'][('de', 'en')]['train']
        batches, indices = dataset.get_iterator(
            shuffle=False,
            group_by_size=params.group_by_size,
            n_sentences=-1,
        )
        # dataset = data['mono_stream']['en']['train']
        # indices = dataset.get_iterator(
        #     shuffle=False
        # )

        features, domain_score, sents = build_nlm_domain_feature(
            data, params, batches, dataset)
        result = {
            'indices': indices,
            'domain_feature': features,
            'domain_score': domain_score,
            'sents': sents
        }
        if params.build_output_path.endswith('pth'):
            build_output_path = params.build_output_path
        else:
            build_output_path = f'{params.build_output_path}/final.pth'
        torch.save(result, build_output_path)
        return

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    elif params.dual_encoder:
        encoder1, encoder2 = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    elif params.domains:
        if params.curriculum_learning:
            trainer = CurriculumTrainer(encoder, decoder, data, params)
        elif params.dual_encoder:
            trainer = DualEncoderTrainer(encoder1, encoder2, data, params)
        else:
            trainer = MultiDomainTrainer(encoder, decoder, data, params)
        if params.local_adapt:
            evaluator = MetaMultiDomainEvaluator(trainer, data, params)
        else:
            if params.curriculum_learning:
                evaluator = EncDecEvaluator(trainer, data, params)
            elif params.dual_encoder:
                evaluator = DualEncoderEvaluator(trainer, data, params)
            else:
                evaluator = MultiDomainEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

            if not params.dual_encoder and params.domains and params.domain_ratio_update_freq > 0 and trainer.n_total_iter % params.domain_ratio_update_freq == 0 and not params.sampling_uniform:
                evaluator.update_language_sampler_multidomain()
                evaluator.update_dataset_ratio(trainer)

            if not params.dual_encoder and params.domain_reset_freq > 0 and trainer.n_total_iter % params.domain_reset_freq == 0:
                evaluator.reset_dataset_ratio(trainer)

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 13
0
def main(params):
    if params.adv:
        params.use_lang_emb = False
        print("Language embeddings are not used...\n \n \n \n")
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    # reload-model options are in here
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])
    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for epoch in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

    # evaluate perplexity
    scores = evaluator.run_all_evals(trainer)

    # print / JSON log
    for k, v in scores.items():
        logger.info("%s -> %.6f" % (k, v))
    if params.is_master:
        logger.info("__log__:%s" % json.dumps(scores))
Esempio n. 14
0
def clts_elmo_main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # cross lingual encoder

    # cross lingual  text summarization encoder, text summarization decoder
    elmo, ts_encoder, ts_decoder = build_clts_elmo_model(params, data['dico'])

    trainer = XLMCLTSEncDecTrainer(elmo, ts_encoder, ts_decoder, data, params)
    evaluator = XLMCLTSEncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 15
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    meta_params = copy.deepcopy(params).meta_params
    params.meta_params = "..."  # to long to be log
    logger = initialize_exp(params)
    params.meta_params = meta_params

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # todo : good params.n_words (We take the one from the first task have this parameter for the moment.)
    """
    But we think that if all the task data are based on the same vocabulary, all these parameters will be the same, 
    and therefore no problem if we choose one at random.
    """
    p = params.meta_params[data['key']]

    # build model
    if params.encoder_only:
        model = build_model(params=p, dico=data['dico'])
    else:
        encoder, decoder = build_model(params=p, dico=data['dico'])

    # todo : good pad_index and eos_index and ... (I'll take the one from the first task for the moment.)
    """
    But we think that if all the task data are based on the same vocabulary, all these parameters will be the same, 
    and therefore no problem if we choose one at random.
    """
    params.pad_index = p.pad_index
    params.eos_index = p.eos_index

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        if not params.meta_learning:
            for k, v in scores.items():
                logger.info("%s -> %.6f" % (k, v))
        else:
            for lgs in params.meta_params.keys():
                logger.info("============ task : %s " % lgs)
                for k, v in scores[lgs].items():
                    if k != "epoch":
                        logger.info("%s -> %.6f" % (k, v))
            logger.info("============ all")
            for k, v in scores.items():
                if not (k in (list(params.meta_params.keys()) + ['epoch'])):
                    logger.info("%s -> %.6f" % (k, v))

        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        if not params.meta_learning:
            trainer.n_sentences = 0
            while trainer.n_sentences < trainer.epoch_size:
                # CLM steps
                for lang1, lang2 in shuf_order(params.clm_steps, params):
                    trainer.clm_step(lang1, lang2, params.lambda_clm)

                # MLM steps (also includes TLM if lang2 is not None)
                for lang1, lang2 in shuf_order(params.mlm_steps, params):
                    trainer.mlm_step(lang1, lang2, params.lambda_mlm)

                # parallel classification steps
                for lang1, lang2 in shuf_order(params.pc_steps, params):
                    trainer.pc_step(lang1, lang2, params.lambda_pc)

                # denoising auto-encoder steps
                for lang in shuf_order(params.ae_steps):
                    trainer.mt_step(lang, lang, params.lambda_ae)

                # machine translation steps
                for lang1, lang2 in shuf_order(params.mt_steps, params):
                    trainer.mt_step(lang1, lang2, params.lambda_mt)

                # back-translation steps
                for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                    trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

                trainer.iter()
        else:
            # our
            trainer.n_sentences = {}
            """
            Here we build language lists for each of our meta-taks. Indeed, for two language lists l1 and l2, 
            the objective will be done with l1[i] and l2[i] respectively, this for each index i of the two lists. 
            """
            lang1_dic, lang2_dic, lang3_dic = {}, {}, {}
            """
            In the case of meta-learning, we have a (meta-)data dictionary for each (meta-)task, 
            so the keys are the languages conserved by the task. 
            """
            data_keys_dic = {}

            # equivalent to "for task in list of task" in the original algorithm,  except here we prepare all the tasks beforehand.
            for lgs in params.meta_params.keys():
                trainer.n_sentences[lgs] = 0

                # CLM
                try:
                    lang1_dic['clm_step']
                except KeyError:
                    lang1_dic['clm_step'], lang2_dic[
                        'clm_step'], data_keys_dic['clm_step'] = [], [], []
                for lang1, lang2 in shuf_order(
                        params.meta_params[lgs].clm_steps, params):
                    lang1_dic['clm_step'].append(lang1)
                    lang2_dic['clm_step'].append(lang2)
                    data_keys_dic['clm_step'].append(lgs)

                # MLM
                try:
                    lang1_dic['mlm_step']
                except KeyError:
                    lang1_dic['mlm_step'], lang2_dic[
                        'mlm_step'], data_keys_dic['mlm_step'] = [], [], []
                for lang1, lang2 in shuf_order(
                        params.meta_params[lgs].mlm_steps, params):
                    lang1_dic['mlm_step'].append(lang1)
                    lang2_dic['mlm_step'].append(lang2)
                    data_keys_dic['mlm_step'].append(lgs)

                # parallel classification
                try:
                    lang1_dic['pc_step']
                except KeyError:
                    lang1_dic['pc_step'], lang2_dic['pc_step'], data_keys_dic[
                        'pc_step'] = [], [], []
                for lang1, lang2 in shuf_order(
                        params.meta_params[lgs].pc_steps, params):
                    lang1_dic['pc_step'].append(lang1)
                    lang2_dic['pc_step'].append(lang2)
                    data_keys_dic['pc_step'].append(lgs)

                # denoising auto-encoder
                try:
                    lang1_dic['ae_step']
                except KeyError:
                    lang1_dic['ae_step'], data_keys_dic['ae_step'] = [], []
                for lang1 in shuf_order(params.meta_params[lgs].ae_steps):
                    lang1_dic['ae_step'].append(lang1)
                    data_keys_dic['ae_step'].append(lgs)

                # machine translation
                try:
                    lang1_dic['mt_step']
                except KeyError:
                    lang1_dic['mt_step'], lang2_dic['mt_step'], data_keys_dic[
                        'mt_step'] = [], [], []
                for lang1, lang2 in shuf_order(
                        params.meta_params[lgs].mt_steps, params):
                    lang1_dic['mt_step'].append(lang1)
                    lang2_dic['mt_step'].append(lang2)
                    data_keys_dic['mt_step'].append(lgs)

                # back-translation
                try:
                    lang1_dic['bt_step']
                except KeyError:
                    lang1_dic['bt_step'], lang2_dic['bt_step'], lang3_dic[
                        'bt_step'], data_keys_dic['bt_step'] = [], [], [], []
                for lang1, lang2, lang3 in shuf_order(
                        params.meta_params[lgs].bt_steps):
                    lang1_dic['bt_step'].append(lang1)
                    lang2_dic['bt_step'].append(lang2)
                    lang3_dic['bt_step'].append(lang3)
                    data_keys_dic['bt_step'].append(lgs)

            flag = True

            # equivalent to "while not done do" in the original algorithm
            while flag:

                # CLM steps
                #print("clm_step", flag)
                a = trainer.clm_step(lang1_dic['clm_step'],
                                     lang2_dic['clm_step'], params.lambda_clm,
                                     data_keys_dic['clm_step'])

                #print("mlm_step", flag)
                # MLM steps (also includes TLM if lang2 is not None)
                b = trainer.mlm_step(lang1_dic['mlm_step'],
                                     lang2_dic['mlm_step'], params.lambda_mlm,
                                     data_keys_dic['mlm_step'])

                # parallel classification steps
                c = trainer.pc_step(lang1_dic['pc_step'], lang2_dic['pc_step'],
                                    params.lambda_pc, data_keys_dic['pc_step'])

                if isinstance(trainer, EncDecTrainer):

                    # denoising auto-encoder steps
                    d = trainer.mt_step(lang1_dic['ae_step'],
                                        lang1_dic['ae_step'], params.lambda_ae,
                                        data_keys_dic['ae_step'])

                    # machine translation steps
                    e = trainer.mt_step(lang1_dic['mt_step'],
                                        lang2_dic['mt_step'], params.lambda_mt,
                                        data_keys_dic['mt_step'])

                    # back-translation steps
                    f = trainer.bt_step(lang1_dic['bt_step'],
                                        lang2_dic['bt_step'],
                                        lang3_dic['bt_step'], params.lambda_bt,
                                        data_keys_dic['bt_step'])

                    # do things better
                    if (not a) and (not b) and (not c) and (not d) and (
                            not e) and (not f):
                        flag = False  # End of epoch
                    else:
                        flag = True
                else:
                    # do things better
                    if (not a) and (not b) and (not c):
                        flag = False  # End of epoch
                    else:
                        flag = True

                trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        if not params.meta_learning:
            for k, v in scores.items():
                logger.info("%s -> %.6f" % (k, v))
        else:
            for lgs in params.meta_params.keys():
                logger.info("============ task : %s " % lgs)
                for k, v in scores[lgs].items():
                    if k != "epoch":
                        logger.info("%s -> %.6f" % (k, v))
            logger.info("============ all")
            for k, v in scores.items():
                if not (k in (list(params.meta_params.keys()) + ['epoch'])):
                    logger.info("%s -> %.6f" % (k, v))

        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

        # our
        logger.info("============ garbage collector collecting %d ..." %
                    gc.collect())
Esempio n. 16
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps (causal languge model)
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3,
                                params.lambda_bt, params.bt_sample_temperature)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        if params.validation_metrics != '':
            trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
Esempio n. 17
0
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build the big model
    if params.encoder_only:
        big_model = build_model(params, data['dico'], cut=False)
    else:
        # 修改处1
        big_encoder, big_decoder = build_model(params, data['dico'], cut=False)

    # if we cut some layers, must build a small model
    if params.cut_layer:
        if params.encoder_only:
            small_model = build_model(params, data['dico'], cut=True)
        else:
            # 修改处1
            small_encoder, small_decoder = build_model(params,
                                                       data['dico'],
                                                       cut=True)

    # build the big trainer, reload potential checkpoints
    # the big trainer is used to train, so need't a evaluator for it
    if params.encoder_only:
        big_trainer = SingleTrainer(big_model, data, params)
    else:
        big_trainer = EncDecTrainer(big_encoder, big_decoder, data, params)

    params.lambda_mlm = "1"
    params.lambda_clm = "1"
    params.lambda_pc = "1"
    params.lambda_ae = "1"
    params.lambda_mt = "1"
    params.lambda_bt = "1"

    # build the small model, and use it for evaluator
    if params.encoder_only:
        small_trainer = small_SingleTrainer(small_model, data, params)
        evaluator = SingleEvaluator(small_trainer, data, params)
    else:
        small_trainer = small_EncDecTrainer(small_encoder, small_decoder, data,
                                            params)
        evaluator = EncDecEvaluator(small_trainer, data, params)

    # evaluation only for the small trainer
    if params.eval_only:
        scores = evaluator.run_all_evals(small_trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for count in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    small_trainer.epoch)

        small_trainer.n_sentences = 0

        while small_trainer.n_sentences < small_trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                small_trainer.clm_step(lang1, lang2, params.lambda_clm,
                                       big_trainer)
            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                small_trainer.mlm_step(lang1, lang2, params.lambda_mlm,
                                       big_trainer)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                small_trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                small_trainer.mt_step(lang, lang, params.lambda_ae,
                                      big_trainer)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                small_trainer.mt_step(lang1, lang2, params.lambda_mt,
                                      big_trainer)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                small_trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            small_trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    small_trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(small_trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        small_trainer.save_best_model(scores)
        small_trainer.save_periodic()
        small_trainer.end_epoch(scores)