def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecGenerator(trainer, data, params)

    # evaluation
    if params.eval_only:
        evaluator.generate(trainer)
        exit()
def __init__(self, params):
    self.params = params

    # check parameters
    assert not params.cuda or torch.cuda.is_available()
    assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train)
    assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
    assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
    assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
    assert os.path.isfile(params.src_emb)
    assert os.path.isfile(params.tgt_emb)
    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
    assert params.export in ["", "txt", "pth"]

    # build self.logger / model / self.trainer / evaluator
    self.logger = initialize_exp(params)
    src_emb, tgt_emb, mapping, _ = build_supervised_model(params, False)
    self.trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
    evaluator = Evaluator(self.trainer)

    # load a training dictionary. if a dictionary path is not provided, use a default
    # one ("default") or create one based on identical character strings ("identical_char")
    self.trainer.load_training_dico(params.dico_train)
def __init__(self, args): self.args = args # check parameters if self.args.adversarial: assert 0 <= self.args.dis_dropout < 1 assert 0 <= self.args.dis_input_dropout < 1 assert 0 <= self.args.dis_smooth < 0.5 assert self.args.dis_lambda > 0 and self.args.dis_steps > 0 assert 0 < self.args.lr_shrink <= 1 assert self.args.model_path is not None self.dataset = None # build model / trainer / evaluator if not self.args.pred and not self.args.cal_sent_sim: self.logger = initialize_exp(self.args) if self.args.adversarial or self.args.cal_sent_sim: assert os.path.isfile(self.args.input_file) self.dataset, unique_id_to_feature, self.features = load(self.args.vocab_file, self.args.input_file, batch_size=self.args.batch_size, do_lower_case=self.args.do_lower_case, max_seq_length=self.args.max_seq_length, local_rank=self.args.local_rank, vocab_file1=self.args.vocab_file1) self.bert_model, self.mapping, self.discriminator, self.bert_model1 = build_model(self.args, True) if self.args.adversarial or self.args.pred: self.trainer = BertTrainer(self.bert_model, self.dataset, self.mapping, self.discriminator, self.args, bert_model1=self.bert_model1) if self.args.adversarial or self.args.cal_sent_sim: self.evaluator = BertEvaluator(self.bert_model, self.dataset, self.mapping, self.discriminator, self.args, self.features, bert_model1=self.bert_model1) if self.args.local_rank == -1 or self.args.no_cuda: self.device = torch.device("cuda" if torch.cuda.is_available() and not self.args.no_cuda else "cpu") else: self.device = torch.device("cuda", self.args.local_rank)
def main_worker(gpu, params):
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:23457',
                            world_size=torch.cuda.device_count(),
                            rank=gpu)
    torch.cuda.set_device(gpu)
    params.gpu = gpu

    # load model
    if params.mode == 'train':
        # reload pretrained model
        my_model = MyModel.reload(params.model_path, params)
    else:
        my_model = torch.load(params.model_path)

    # reload langs from pretrained model
    params.n_langs = my_model.pretrain_params['n_langs']
    params.id2lang = my_model.pretrain_params['id2lang']
    params.lang2id = my_model.pretrain_params['lang2id']

    if params.max_vocab > 1:
        my_model.dico.max_vocab(params.max_vocab)
    if params.min_count > 0:
        my_model.dico.min_count(params.min_count)

    params.bos_index = my_model.dico.bos_index  # 0
    params.eos_index = my_model.dico.eos_index  # 1
    params.pad_index = my_model.dico.pad_index  # 2
    params.unk_index = my_model.dico.unk_index  # 3

    # initialize the experiment
    logger = initialize_exp(params)

    task = MyTask(my_model, params)
    task.run()
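A minimal launcher sketch for the worker above, assuming the usual one-process-per-GPU pattern; the `get_parser` helper and the entry-point guard are assumptions, only `main_worker(gpu, params)` and the `world_size=torch.cuda.device_count()` convention come from the snippet.

import torch
import torch.multiprocessing as mp

if __name__ == '__main__':
    params = get_parser().parse_args()  # hypothetical parser, not shown above
    # spawn one process per visible GPU; each process receives its GPU index as `gpu`
    mp.spawn(main_worker, args=(params,), nprocs=torch.cuda.device_count())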
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    hidden_size = 1024
    encoder = EncoderRNN.EncoderRNN(params.n_words, hidden_size).cuda()
    decoder = Attention_decoder.Attention_decoder(hidden_size, params.n_words, dropout_p=0.1).cuda()
    trainer = LSTM_Trainer(encoder, decoder, data, params)
    evaluator = LSTM_Evaluator(trainer, data, params)

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for count in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0
        while trainer.n_sentences < trainer.epoch_size:
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.try_lstm(lang1, lang2, params.lambda_mt)

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

    # save the output of softmax
    trainer.save_softmax_output(clm_temp, 'clm_temp')
    trainer.save_softmax_output(ml_temp, 'ml_temp')
    trainer.save_softmax_output(bt_temp, 'bt_temp')
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()

    trainer = Translate(model_path=params.model_path,
                        tgt_lang=params.tgt_lang,
                        src_lang=params.src_lang,
                        dump_path=params.dump_path,
                        exp_name=params.exp_name,
                        exp_id=params.exp_id,
                        batch_size=params.batch_size)

    print(trainer.translate(["I eat something", "Good morning my friend"]))
    print(trainer.translate("Good morning my friend, I eat something"))
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    # parser = get_parser()
    # params = parser.parse_args()

    trainer = Translate(model_path=params.model_path,
                        tgt_lang=params.tgt_lang,
                        src_lang=params.src_lang,
                        dump_path=params.dump_path,
                        exp_name=params.exp_name,
                        exp_id=params.exp_id,
                        batch_size=params.batch_size)

    print(trainer.translate(params.text.split('[SEP]')))
def __init__(self, model_path, tgt_lang, src_lang, dump_path="./dumped/",
             exp_name="translate", exp_id="test", batch_size=32):

    # parse parameters
    parser = argparse.ArgumentParser(description="Translate sentences")

    # main parameters
    parser.add_argument("--dump_path", type=str, default=dump_path, help="Experiment dump path")
    parser.add_argument("--exp_name", type=str, default=exp_name, help="Experiment name")
    parser.add_argument("--exp_id", type=str, default=exp_id, help="Experiment ID")
    parser.add_argument("--batch_size", type=int, default=batch_size, help="Number of sentences per batch")

    # model / output paths
    parser.add_argument("--model_path", type=str, default=model_path, help="Model path")
    # parser.add_argument("--max_vocab", type=int, default=-1, help="Maximum vocabulary size (-1 to disable)")
    # parser.add_argument("--min_count", type=int, default=0, help="Minimum vocabulary count")

    # source language / target language
    parser.add_argument("--src_lang", type=str, default=src_lang, help="Source language")
    parser.add_argument("--tgt_lang", type=str, default=tgt_lang, help="Target language")

    params = parser.parse_args()
    assert params.src_lang != '' and params.tgt_lang != '' and params.src_lang != params.tgt_lang

    # initialize the experiment
    logger = initialize_exp(params)

    # no GPU available: load the model on CPU
    # reloaded = torch.load(params.model_path)
    reloaded = torch.load(params.model_path, map_location=torch.device('cpu'))
    model_params = AttrDict(reloaded['params'])
    self.supported_languages = model_params.lang2id.keys()
    logger.info("Supported languages: %s" % ", ".join(self.supported_languages))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    self.dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    # self.encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    self.encoder = TransformerModel(model_params, self.dico, is_encoder=True, with_output=True).eval()
    # self.decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    self.decoder = TransformerModel(model_params, self.dico, is_encoder=False, with_output=True).eval()
    self.encoder.load_state_dict(reloaded['encoder'])
    self.decoder.load_state_dict(reloaded['decoder'])

    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]
    self.model_params = model_params
    self.params = params
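A minimal usage sketch of this wrapper, mirroring the `main` functions above; the model path is hypothetical.

# build the wrapper once (loads the dictionary and the CPU encoder/decoder),
# then translate a batch of sentences
translator = Translate(model_path="dumped/translate/test/best-valid_bleu.pth",  # hypothetical path
                       src_lang="en", tgt_lang="fr")
print(translator.translate(["I eat something", "Good morning my friend"]))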
def main(params, params_pretrain, trainer_class): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # initialize SLURM signal handler for time limit / pre-emption init_signal_handler() # Model if params.pretrain: #if params.model_path : # params_pretrain.reload_model="%s,%s"%(params.model_path, params.model_path) for attr_name in ['n_gpu_per_node', 'multi_gpu', 'is_master']: setattr(params_pretrain, attr_name, getattr(params, attr_name)) pre_trainer, evaluator, _ = get_trainer_evaluator( params_pretrain, logger) else: pre_trainer, evaluator = None, None model = build_model(params, logger, pre_trainer=pre_trainer) # Data train_dataset, val_dataset = load_dataset(params, logger, model) # optimizers optimizers = model.get_optimizers(params) if not params.eval_only else [] # Trainer trainer = trainer_class(params_pretrain, params, model, optimizers, train_dataset, val_dataset, logger, pre_trainer, evaluator) if params.pretrain: assert id(trainer.model.embedder.model) == id( getattr(pre_trainer, params.reload_key)) # Run train/evaluation logger.info("") if not params.eval_only: trainer.train(get_loss, end_of_epoch) else: trainer.eval(get_loss, end_of_epoch)
def get_models(params):
    assert not params.cuda or torch.cuda.is_available()
    assert 0 <= params.dis_dropout < 1
    assert 0 <= params.dis_input_dropout < 1
    assert 0 <= params.dis_smooth < 0.5
    assert params.dis_lambda > 0 and params.dis_steps > 0
    assert 0 < params.lr_shrink <= 1
    assert os.path.isfile(params.src_emb)
    assert os.path.isfile(params.tgt_emb)
    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
    assert params.export in ["", "txt", "pth"]

    # build model / trainer / evaluator
    logger = initialize_exp(params)
    src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
    trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
    trainer.reload_best()
    evaluator = Evaluator(trainer)
    return evaluator, trainer
def inference(params):
    # check parameters
    assert params.exp_name
    check_all_data_params(params)
    check_mt_model_params(params)

    # initialize experiment / load data / build model
    logger = initialize_exp(params)
    data = load_data(params)
    encoder, decoder, discriminator, lm = build_mt_model(params, data)

    # initialize trainer / reload checkpoint / initialize evaluator
    trainer = TrainerMT(encoder, decoder, discriminator, lm, data, params)
    trainer.reload_best_model()
    trainer.test_sharing()  # check parameters sharing
    evaluator = EvaluatorMT(trainer, data, params)

    # evaluation mode
    evaluator.eval_inference()
    exit()
def run_model(params, runid):
    params.exp_name = params.src_lang + params.tgt_lang if params.exp_name is None else params.exp_name
    seed = np.random.randint(10000, 20000)
    params.seed = seed
    params.exp_id = str(runid)
    params.exp_path = ''

    # build model / trainer / evaluator
    logger = initialize_exp(params)
    src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
    trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
    evaluator = Evaluator(trainer)

    base_nn, base_csls = _adversarial(params, logger, trainer, evaluator)
    outputs = {
        "run": runid,
        "seed": seed,
        "base_nn": base_nn,
        "base_csls": base_csls
    }
    return logger, trainer, evaluator, outputs
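A small driver sketch for `run_model`, assuming several runs whose outputs are collected for later comparison; the number of runs is arbitrary.

# hypothetical multi-run driver: each run draws a fresh seed inside run_model
all_outputs = []
for runid in range(5):
    _, trainer, evaluator, outputs = run_model(params, runid)
    all_outputs.append(outputs)  # {"run", "seed", "base_nn", "base_csls"}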
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    _lang1, _lang2 = (params.langs[0], params.langs[1]) if params.langs[0] < params.langs[1] \
        else (params.langs[1], params.langs[0])
    dataset = data['para'][(_lang1, _lang2)]['test']

    print(params.n_words)
    print("ref_paths" + str(params.ref_paths))

    for i, ((x1, len1, id1, lenid1), (x2, len2, id2, lenid2)) in enumerate(
            dataset.get_iterator(shuffle=False, group_by_size=True, n_sentences=-1, tokens_per_batch=2000)):

        print('x2' + str(x2.size()))
        print("len2[None] - 1" + str(len2[None] - 1) + " " + str(len2[None]))
        print(str(len2[0]))
        print('len2' + str(len2))

        alen = torch.arange(len2.max(), dtype=torch.long, device=len2.device)
        # do not predict anything given the last target word
        pred_mask = alen[:, None] < len2[None] - 1
        print("pred_mask" + str(pred_mask))
        print(str(pred_mask.size()))

        y = x2[1:].masked_select(pred_mask[:-1])
        print("yyyy" + str(y))
        print(str(y.size()))
        assert len(y) == (len2 - 1).sum().item()
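A standalone illustration of the `pred_mask` construction used above, with assumed toy lengths, showing that the last position of every target sentence is excluded from prediction.

import torch

len2 = torch.tensor([4, 3])                        # assumed toy sentence lengths
alen = torch.arange(len2.max(), dtype=torch.long)  # [0, 1, 2, 3]
pred_mask = alen[:, None] < len2[None] - 1         # shape (max_len, batch)
# tensor([[ True,  True],
#         [ True,  True],
#         [ True, False],
#         [False, False]])
# exactly (len2 - 1).sum() == 5 positions are selected, matching the assert above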
def __init__(self, args):
    self.args = args

    # check parameters
    if not self.args.pred:
        # assert 0 < self.args.lr_shrink <= 1
        assert self.args.model_path is not None

    self.dataset = None

    # build model / trainer / evaluator
    if not (self.args.pred or self.args.eval):
        self.logger = initialize_exp(self.args)

    self.bert_model, self.bert_model1, self.mapping = build_model(self.args, True)

    if self.args.local_rank == -1 or self.args.no_cuda:
        self.device = torch.device("cuda" if torch.cuda.is_available() and not self.args.no_cuda else "cpu")
    else:
        self.device = torch.device("cuda", self.args.local_rank)

    self.transformer_types = [
        'self_attention', 'attention', 'linear_self_attention', 'nonlinear_self_attention'
    ]
def main(params): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # initialize SLURM signal handler for time limit / pre-emption init_signal_handler() # load data data = load_data(params) # build model if params.encoder_only: model = build_model(params, data['dico']) else: encoder, decoder = build_model(params, data['dico']) # float16 if params.fp16: assert torch.backends.cudnn.enabled if params.encoder_only: model = network_to_half(model) else: encoder = network_to_half(encoder) decoder = network_to_half(decoder) # distributed # if params.multi_gpu: # logger.info("Using nn.parallel.DistributedDataParallel ...") # if params.encoder_only: # model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True) # else: # encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True) # decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True) # build trainer, reload potential checkpoints / build evaluator if params.encoder_only: trainer = SingleTrainer(model, data, params) evaluator = SingleEvaluator(trainer, data, params) else: trainer = EncDecTrainer(encoder, decoder, data, params) evaluator = EncDecEvaluator(trainer, data, params) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer) for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # set sampling probabilities for training set_sampling_probs(data, params) # language model training for _ in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_sentences = 0 while trainer.n_sentences < trainer.epoch_size: # CLM steps for lang1, lang2 in shuf_order(params.clm_steps, params): trainer.clm_step(lang1, lang2, params.lambda_clm) # MLM steps (also includes TLM if lang2 is not None) for lang1, lang2 in shuf_order(params.mlm_steps, params): trainer.mlm_step(lang1, lang2, params.lambda_mlm) # parallel classification steps for lang1, lang2 in shuf_order(params.pc_steps, params): trainer.pc_step(lang1, lang2, params.lambda_pc) # denoising auto-encoder steps for lang in shuf_order(params.ae_steps): trainer.mt_step(lang, lang, params.lambda_ae) # mass prediction steps for lang in shuf_order(params.mass_steps): trainer.mass_step(lang, params.lambda_mass) # machine translation steps for lang1, lang2 in shuf_order(params.mt_steps, params): trainer.mt_step(lang1, lang2, params.lambda_mt) # back-translation steps for lang1, lang2, lang3 in shuf_order(params.bt_steps): trainer.bt_step(lang1, lang2, lang3, params.lambda_bt) # back-parallel steps for lang1, lang2 in shuf_order(params.bmt_steps, params): trainer.bmt_step(lang1, lang2, params.lambda_bmt) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate perplexity scores = evaluator.run_all_evals(trainer) # print / JSON log for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) if params.is_master: logger.info("__log__:%s" % json.dumps(scores)) # end of epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores)
def main(params): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # initialize SLURM signal handler for time limit / pre-emption init_signal_handler() # load data data = load_data(params) # build model if params.encoder_only: model = build_model(params, data['dico']) else: encoder, decoder = build_model(params, data['dico']) # build trainer, reload potential checkpoints / build evaluator if params.encoder_only: trainer = SingleTrainer(model, data, params) evaluator = SingleEvaluator(trainer, data, params) else: trainer = EncDecTrainer(encoder, decoder, data, params) evaluator = EncDecEvaluator(trainer, data, params) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer) for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # set sampling probabilities for training set_sampling_probs(data, params) params.lgs = lgs = params.lgs.split("-") if len(lgs) == 1: lgs.append(lgs[0]) # language model training for _ in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_sentences = 0 while trainer.n_sentences < trainer.epoch_size: # Replace the original MLM steps for lang1, lang2 in shuf_order(params.mlm_steps, params): if params.do_meta_update: trainer.meta_mlm_step(lang1) else: trainer.mlm_step(lang1, lang2, params.lambda_mlm) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate perplexity scores = evaluator.run_all_evals(trainer) # print / JSON log for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) if params.is_master: logger.info("__log__:%s" % json.dumps(scores)) # end of epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores)
def main(params): # initialize the experiment logger = initialize_exp(params) parser = get_parser() params = parser.parse_args() models_path = params.model_path.split(',') # generate parser / parse parameters models_reloaded = [] for model_path in models_path: models_reloaded.append(torch.load(model_path)) model_params = AttrDict(models_reloaded[0]['params']) logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys())) # update dictionary parameters for name in [ 'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index' ]: setattr(params, name, getattr(model_params, name)) # build dictionary / build encoder / build decoder / reload weights dico = Dictionary(models_reloaded[0]['dico_id2word'], models_reloaded[0]['dico_word2id'], models_reloaded[0]['dico_counts']) params.src_id = model_params.lang2id[params.src_lang] params.tgt_id = model_params.lang2id[params.tgt_lang] encoders = [] decoders = [] def package_module(modules): state_dict = OrderedDict() for k, v in modules.items(): if k.startswith('module.'): state_dict[k[7:]] = v else: state_dict[k] = v return state_dict for reloaded in models_reloaded: encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).to(params.device).eval() decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).to(params.device).eval() encoder.load_state_dict(package_module(reloaded['encoder'])) decoder.load_state_dict(package_module(reloaded['decoder'])) # float16 if params.fp16: assert torch.backends.cudnn.enabled encoder = network_to_half(encoder) decoder = network_to_half(decoder) encoders.append(encoder) decoders.append(decoder) #src_sent = ['Poly@@ gam@@ ie statt Demokratie .'] src_sent = [] for line in sys.stdin.readlines(): assert len(line.strip().split()) > 0 src_sent.append(line) f = io.open(params.output_path, 'w', encoding='utf-8') for i in range(0, len(src_sent), params.batch_size): # prepare batch word_ids = [ torch.LongTensor([dico.index(w) for w in s.strip().split()]) for s in src_sent[i:i + params.batch_size] ] lengths = torch.LongTensor([len(s) + 2 for s in word_ids]) batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index) batch[0] = params.eos_index for j, s in enumerate(word_ids): if lengths[j] > 2: # if sentence not empty batch[1:lengths[j] - 1, j].copy_(s) batch[lengths[j] - 1, j] = params.eos_index langs = batch.clone().fill_(params.src_id) # encode source batch and translate it encodeds = [] for encoder in encoders: encoded = encoder('fwd', x=batch.to(params.device), lengths=lengths.to(params.device), langs=langs.to(params.device), causal=False) encoded = encoded.transpose(0, 1) encodeds.append(encoded) assert encoded.size(0) == lengths.size(0) decoded, dec_lengths = generate_beam( decoders, encodeds, lengths.to(params.device), params.tgt_id, beam_size=params.beam, length_penalty=params.length_penalty, early_stopping=False, max_len=int(1.5 * lengths.max().item() + 10), params=params) # convert sentences to words for j in range(decoded.size(1)): # remove delimiters sent = decoded[:, j] delimiters = (sent == params.eos_index).nonzero().view(-1) assert len(delimiters) >= 1 and delimiters[0].item() == 0 sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]] # output translation source = src_sent[i + j].strip() target = " ".join([dico[sent[k].item()] for k in range(len(sent))]) sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target)) f.write(target + "\n") f.close()
def main(params): # check_data_params(params) check_model_params(params) # initialize the experiment logger = initialize_exp(params) # load data data = load_data(params) # check_vocab(data) # build model if params.encoder_only: model = build_model(params, data['source_dico']) else: encoder, decoder = build_model(params, data['source_dico'], data['target_dico']) # build trainer, reload potential checkpoints / build evaluator if params.encoder_only: trainer = SingleTrainer(model, data, params) evaluator = SingleEvaluator(trainer, data, params) else: trainer = EncDecTrainer(encoder, decoder, data, params) evaluator = EncDecEvaluator(trainer, data, params) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer) for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # language model training for _ in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_iter = 0 while trainer.n_iter < trainer.epoch_size: if params.cs_step: trainer.content_selection_step(params.lambda_cs) if params.sm_step: trainer.summarization_step(params.lambda_sm) if params.lm_step: trainer.clm_step(params.lambda_lm) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate perplexity scores = evaluator.run_all_evals(trainer) # print / JSON log for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) # end of epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch()
def main(params): # initialize the experiment logger = initialize_exp(params) # generate parser / parse parameters parser = get_parser() params = parser.parse_args() reloaded = torch.load(params.model_path) model_params = AttrDict(reloaded['params']) model_params.add_pred = "" logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys())) # update dictionary parameters for name in [ 'n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index' ]: setattr(params, name, getattr(model_params, name)) # build dictionary / build encoder / build decoder / reload weights src_dico = load_binarized(params.src_data) tgt_dico = load_binarized(params.tgt_data) encoder = TransformerModel(model_params, src_dico, is_encoder=True, with_output=False).cuda().eval() decoder = TransformerModel(model_params, tgt_dico, is_encoder=False, with_output=True).cuda().eval() if all([k.startswith('module.') for k in reloaded['encoder'].keys()]): reloaded['encoder'] = { k[len('module.'):]: v for k, v in reloaded['encoder'].items() } reloaded['decoder'] = { k[len('module.'):]: v for k, v in reloaded['decoder'].items() } encoder.load_state_dict(reloaded['encoder'], strict=False) decoder.load_state_dict(reloaded['decoder'], strict=False) params.src_id = model_params.lang2id[params.src_lang] params.tgt_id = model_params.lang2id[params.tgt_lang] # # float16 # # read sentences from stdin src_sent = [] input_f = open(params.input_path, 'r') for line in input_f: line = line.strip() assert len(line.strip().split()) > 0 src_sent.append(line) logger.info("Read %i sentences from stdin. Translating ..." % len(src_sent)) f = io.open(params.output_path, 'w', encoding='utf-8') for i in range(0, len(src_sent), params.batch_size): # prepare batch word_ids = [ torch.LongTensor([src_dico.index(w) for w in s.strip().split()]) for s in src_sent[i:i + params.batch_size] ] lengths = torch.LongTensor([len(s) + 2 for s in word_ids]) batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index) batch[0] = params.eos_index for j, s in enumerate(word_ids): if lengths[j] > 2: # if sentence not empty batch[1:lengths[j] - 1, j].copy_(s) batch[lengths[j] - 1, j] = params.eos_index langs = batch.clone().fill_(params.src_id) # encode source batch and translate it encoded = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False) encoded = [enc.transpose(0, 1) for enc in encoded] decoded, dec_lengths = decoder.generate( encoded, lengths.cuda(), params.tgt_id, max_len=int(1.5 * lengths.max().item() + 10)) # convert sentences to words for j in range(decoded.size(1)): # remove delimiters sent = decoded[:, j] delimiters = (sent == params.eos_index).nonzero().view(-1) assert len(delimiters) >= 1 and delimiters[0].item() == 0 sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]] # output translation source = src_sent[i + j].strip() target = " ".join( [tgt_dico[sent[k].item()] for k in range(len(sent))]) #sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target)) f.write(target + "\n") f.close()
def main():
    global args
    args = parser.parse_args()
    init_distributed_mode(args)
    fix_random_seeds(args.seed)
    logger, training_stats = initialize_exp(args, "epoch", "loss")

    # build data
    train_dataset = MultiCropDataset(
        args.data_path,
        args.size_crops,
        args.nmb_crops,
        args.min_scale_crops,
        args.max_scale_crops,
        return_index=True,
    )
    sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
        drop_last=True,
    )
    logger.info("Building data done with {} images loaded.".format(len(train_dataset)))

    # build model
    model = resnet_models.__dict__[args.arch](
        normalize=True,
        hidden_mlp=args.hidden_mlp,
        output_dim=args.feat_dim,
        nmb_prototypes=args.nmb_prototypes,
    )

    # synchronize batch norm layers
    if args.sync_bn == "pytorch":
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    elif args.sync_bn == "apex":
        process_group = None
        if args.world_size // 8 > 0:
            process_group = apex.parallel.create_syncbn_process_group(args.world_size // 8)
        model = apex.parallel.convert_syncbn_model(model, process_group=process_group)

    # copy model to GPU
    model = model.cuda()
    if args.rank == 0:
        logger.info(model)
    logger.info("Building model done.")

    # build optimizer
    # base_lr=4.8 wd=1e-6
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=args.base_lr,
        momentum=0.9,
        weight_decay=args.wd,
    )
    # Using Dist LARC Optimizer
    optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False)

    # LR Scheduling
    warmup_lr_schedule = np.linspace(args.start_warmup, args.base_lr, len(train_loader) * args.warmup_epochs)
    iters = np.arange(len(train_loader) * (args.epochs - args.warmup_epochs))
    cosine_lr_schedule = np.array([args.final_lr + 0.5 * (args.base_lr - args.final_lr) * (1 +
                                   math.cos(math.pi * t / (len(train_loader) * (args.epochs - args.warmup_epochs))))
                                   for t in iters])
    lr_schedule = np.concatenate((warmup_lr_schedule, cosine_lr_schedule))
    logger.info("Building optimizer done.")

    # wrap model
    model = nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.gpu_to_work_on],
        find_unused_parameters=True,
    )

    # optionally resume from a checkpoint
    to_restore = {"epoch": 0}
    restart_from_checkpoint(
        os.path.join(args.dump_path, "checkpoint.pth.tar"),
        run_variables=to_restore,
        state_dict=model,
        optimizer=optimizer,
    )
    start_epoch = to_restore["epoch"]

    # build the memory bank
    mb_path = os.path.join(args.dump_path, "mb" + str(args.rank) + ".pth")
    if os.path.isfile(mb_path):
        mb_ckp = torch.load(mb_path)
        local_memory_index = mb_ckp["local_memory_index"]
        local_memory_embeddings = mb_ckp["local_memory_embeddings"]
    else:
        local_memory_index, local_memory_embeddings = init_memory(train_loader, model)

    cudnn.benchmark = True

    for epoch in range(start_epoch, args.epochs):

        # train the network for one epoch
        logger.info("============ Starting epoch %i ... ============" % epoch)

        # set sampler
        train_loader.sampler.set_epoch(epoch)

        # train the network
        scores, local_memory_index, local_memory_embeddings = train(
            train_loader,
            model,
            optimizer,
            epoch,
            lr_schedule,
            local_memory_index,
            local_memory_embeddings,
        )
        training_stats.update(scores)

        # save checkpoints
        if args.rank == 0:
            save_dict = {
                "epoch": epoch + 1,
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            torch.save(
                save_dict,
                os.path.join(args.dump_path, "checkpoint.pth.tar"),
            )
            if epoch % args.checkpoint_freq == 0 or epoch == args.epochs - 1:
                shutil.copyfile(
                    os.path.join(args.dump_path, "checkpoint.pth.tar"),
                    os.path.join(args.dump_checkpoints, "ckp-" + str(epoch) + ".pth"),
                )
        torch.save({"local_memory_embeddings": local_memory_embeddings,
                    "local_memory_index": local_memory_index}, mb_path)
# parse parameters
params = parser.parse_args()

# check parameters
assert not params.cuda or torch.cuda.is_available()
assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train)
assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.export in ["", "txt", "pth"]

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# load a training dictionary. if a dictionary path is not provided, use a default
# one ("default") or create one based on identical character strings ("identical_char")
trainer.load_training_dico(params.dico_train)

"""
Learning loop for Procrustes Iterative Learning
"""
for n_iter in range(params.n_refinement + 1):

    logger.info('Starting iteration %i...' % n_iter)

    # build a dictionary from aligned embeddings (unless
def seq2seq_main(params):
    ''' Use different vocabulary/dictionary for src and tgt '''

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = seq2seq_load_data(params)

    # build model
    # the language pair is re-sorted in ascending order, e.g. (zh-en) --> (en-zh),
    # so en becomes src and zh becomes tgt
    encoder, decoder = build_seq2seq_model(params, data['tgt_dico'], data['src_dico'])

    # build trainer, reload potential checkpoints / build evaluator
    trainer = EncDecTrainer(encoder, decoder, data, params)
    evaluator = MyEncDecEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # denoising auto-encoder steps
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()

    # Set random seed. NB: Multi-GPU also needs torch.cuda.manual_seed_all(params.seed)
    torch.manual_seed(params.seed)

    assert (params.sample_temperature == 0) or (params.beam_size == 1), 'Cannot sample with beam search.'
    assert params.amp <= 1, f'params.amp == {params.amp} not yet supported.'

    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=False).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    if all([k.startswith('module.') for k in reloaded['encoder'].keys()]):
        reloaded['encoder'] = {k[len('module.'):]: v for k, v in reloaded['encoder'].items()}
    encoder.load_state_dict(reloaded['encoder'])
    if all([k.startswith('module.') for k in reloaded['decoder'].keys()]):
        reloaded['decoder'] = {k[len('module.'):]: v for k, v in reloaded['decoder'].items()}
    decoder.load_state_dict(reloaded['decoder'])

    if params.amp != 0:
        models = apex.amp.initialize([encoder, decoder], opt_level=('O%i' % params.amp))
        encoder, decoder = models

    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # read sentences from stdin
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." % len(src_sent))

    # f = io.open(params.output_path, 'w', encoding='utf-8')
    hypothesis = [[] for _ in range(params.beam_size)]

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it
        encoded = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False)
        encoded = encoded.transpose(0, 1)
        max_len = int(1.5 * lengths.max().item() + 10)
        if params.beam_size == 1:
            decoded, dec_lengths = decoder.generate(
                encoded, lengths.cuda(), params.tgt_id, max_len=max_len,
                sample_temperature=(None if params.sample_temperature == 0 else params.sample_temperature))
        else:
            decoded, dec_lengths, all_hyp_strs = decoder.generate_beam(
                encoded, lengths.cuda(), params.tgt_id,
                beam_size=params.beam_size,
                length_penalty=params.length_penalty,
                early_stopping=params.early_stopping,
                max_len=max_len,
                output_all_hyps=True)
        # hypothesis.extend(convert_to_text(decoded, dec_lengths, dico, params))

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation
            source = src_sent[i + j].strip().replace('<unk>', '<<unk>>')
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))]).replace('<unk>', '<<unk>>')
            if params.beam_size == 1:
                hypothesis[0].append(target)
            else:
                for hyp_rank in range(params.beam_size):
                    print(all_hyp_strs[j][hyp_rank if hyp_rank < len(all_hyp_strs[j]) else -1])
                    hypothesis[hyp_rank].append(all_hyp_strs[j][hyp_rank if hyp_rank < len(all_hyp_strs[j]) else -1])
            sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent),
                                                      source.replace('@@ ', ''), target.replace('@@ ', '')))
            # f.write(target + "\n")

    # f.close()

    # export sentences to reference and hypothesis files / restore BPE segmentation
    save_dir, split = params.output_path.rsplit('/', 1)
    for hyp_rank in range(len(hypothesis)):
        hyp_name = f'hyp.st={params.sample_temperature}.bs={params.beam_size}.lp={params.length_penalty}.es={params.early_stopping}.seed={params.seed if (len(hypothesis) == 1) else str(hyp_rank)}.{params.src_lang}-{params.tgt_lang}.{split}.txt'
        hyp_path = os.path.join(save_dir, hyp_name)
        with open(hyp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(hypothesis[hyp_rank]) + '\n')
        restore_segmentation(hyp_path)

        # evaluate BLEU score
        if params.ref_path:
            bleu = eval_moses_bleu(params.ref_path, hyp_path)
            logger.info("BLEU %s %s : %f" % (hyp_path, params.ref_path, bleu))
def main():
    global args
    args = parser.parse_args()
    init_distributed_mode(args)
    fix_random_seeds(args.seed)
    logger, training_stats = initialize_exp(args, "epoch", "loss")

    # build data
    train_dataset = MultiCropDataset(
        args.data_path,
        args.size_crops,
        args.nmb_crops,
        args.min_scale_crops,
        args.max_scale_crops,
        pil_blur=args.use_pil_blur,
    )
    sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=sampler,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
        drop_last=True,
    )
    logger.info("Building data done with {} images loaded.".format(len(train_dataset)))

    # build model
    model = resnet_models.__dict__[args.arch](
        normalize=True,
        hidden_mlp=args.hidden_mlp,
        output_dim=args.feat_dim,
        nmb_prototypes=args.nmb_prototypes,
    )

    # synchronize batch norm layers
    if args.sync_bn == "pytorch":
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    elif args.sync_bn == "apex":
        process_group = None
        if args.world_size // 8 > 0:
            process_group = apex.parallel.create_syncbn_process_group(args.world_size // 8)
        model = apex.parallel.convert_syncbn_model(model, process_group=process_group)

    # copy model to GPU
    model = model.cuda()
    if args.rank == 0:
        logger.info(model)
    logger.info("Building model done.")

    # build optimizer
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=args.base_lr,
        momentum=0.9,
        weight_decay=args.wd,
    )
    optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False)
    warmup_lr_schedule = np.linspace(args.start_warmup, args.base_lr, len(train_loader) * args.warmup_epochs)
    iters = np.arange(len(train_loader) * (args.epochs - args.warmup_epochs))
    cosine_lr_schedule = np.array([args.final_lr + 0.5 * (args.base_lr - args.final_lr) * (1 +
                                   math.cos(math.pi * t / (len(train_loader) * (args.epochs - args.warmup_epochs))))
                                   for t in iters])
    lr_schedule = np.concatenate((warmup_lr_schedule, cosine_lr_schedule))
    logger.info("Building optimizer done.")

    # init mixed precision
    if args.use_fp16:
        model, optimizer = apex.amp.initialize(model, optimizer, opt_level="O1")
        logger.info("Initializing mixed precision done.")

    # wrap model
    model = nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.gpu_to_work_on],
        find_unused_parameters=True,
    )

    # optionally resume from a checkpoint
    to_restore = {"epoch": 0}
    restart_from_checkpoint(
        os.path.join(args.dump_path, "checkpoint.pth.tar"),
        run_variables=to_restore,
        state_dict=model,
        optimizer=optimizer,
        amp=apex.amp,
    )
    start_epoch = to_restore["epoch"]

    # build the queue
    queue = None
    queue_path = os.path.join(args.dump_path, "queue" + str(args.rank) + ".pth")
    if os.path.isfile(queue_path):
        queue = torch.load(queue_path)["queue"]
    # the queue needs to be divisible by the batch size
    # args.queue_length -= args.queue_length % (args.batch_size * args.world_size)

    cudnn.benchmark = True

    ## initialize queue
    print('start initialize queue')
    queue = init_queue(train_loader, model, args)
    print('queue initialize finish')

    for epoch in range(start_epoch, args.epochs):

        # train the network for one epoch
        logger.info("============ Starting epoch %i ... ============" % epoch)

        # set sampler
        train_loader.sampler.set_epoch(epoch)

        # optionally starts a queue
        # queue shape : (Ncrops, Lqueue, feat) --> (NClass, NCrops, Lqueue, feat)
        # if queue is None:
        #     queue = torch.randn(1000, args.feat_dim).cuda()
        #     queue = nn.functional.normalize(queue, dim=1, p=2)

        # train the network
        scores, queue = train(train_loader, model, optimizer, epoch, lr_schedule, queue, args)
        training_stats.update(scores)

        # save checkpoints
        if args.rank == 0:
            save_dict = {
                "epoch": epoch + 1,
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            if args.use_fp16:
                save_dict["amp"] = apex.amp.state_dict()
            torch.save(
                save_dict,
                os.path.join(args.dump_path, "checkpoint.pth.tar"),
            )
            if epoch % args.checkpoint_freq == 0 or epoch == args.epochs - 1:
                shutil.copyfile(
                    os.path.join(args.dump_path, "checkpoint.pth.tar"),
                    os.path.join(args.dump_checkpoints, "ckp-" + str(epoch) + ".pth"),
                )
        if queue is not None:
            torch.save({"queue": queue}, queue_path)
# parse parameters
params = parser.parse_args()

# check parameters
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)

"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
def clts_elmo_main(params): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # initialize SLURM signal handler for time limit / pre-emption init_signal_handler() # load data data = load_data(params) # cross lingual encoder # cross lingual text summarization encoder, text summarization decoder elmo, ts_encoder, ts_decoder = build_clts_elmo_model(params, data['dico']) trainer = XLMCLTSEncDecTrainer(elmo, ts_encoder, ts_decoder, data, params) evaluator = XLMCLTSEncDecEvaluator(trainer, data, params) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer) for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # set sampling probabilities for training set_sampling_probs(data, params) # language model training for _ in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_sentences = 0 while trainer.n_sentences < trainer.epoch_size: # machine translation steps for lang1, lang2 in shuf_order(params.mt_steps, params): trainer.mt_step(lang1, lang2, params.lambda_mt) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate perplexity scores = evaluator.run_all_evals(trainer) # print / JSON log for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) if params.is_master: logger.info("__log__:%s" % json.dumps(scores)) # end of epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores)
def main(params): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # # initialize SLURM signal handler for time limit / pre-emption # init_signal_handler() # load data data = load_data(params) # build model if params.encoder_only: model = build_model(params, data['dico']) else: encoder, decoder = build_model(params, data['dico']) # # float16 # if params.fp16: # assert torch.backends.cudnn.enabled # if params.encoder_only: # model = network_to_half(model) # else: # encoder = network_to_half(encoder) # decoder = network_to_half(decoder) # # distributed # if params.multi_gpu: # logger.info("Using nn.parallel.DistributedDataParallel ...") # if params.fp16: # if params.encoder_only: # model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True) # else: # encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True) # decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True) # else: # if params.encoder_only: # model = nn.parallel.DistributedDataParallel(model, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True) # else: # encoder = nn.parallel.DistributedDataParallel(encoder, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True) # decoder = nn.parallel.DistributedDataParallel(decoder, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True) # build trainer, reload potential checkpoints / build evaluator if params.encoder_only: trainer = SingleTrainer(model, data, params) evaluator = SingleEvaluator(trainer, data, params) else: trainer = EncDecTrainer(encoder, decoder, data, params) evaluator = EncDecEvaluator(trainer, data, params) # # evaluation # if params.eval_only: # scores = evaluator.run_all_evals(trainer) # for k, v in scores.items(): # logger.info("%s -> %.6f" % (k, v)) # logger.info("__log__:%s" % json.dumps(scores)) # exit() # set sampling probabilities for training set_sampling_probs(data, params) # language model training for _ in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_sentences = 0 trainer.n_images = 0 while trainer.n_sentences < trainer.epoch_size or trainer.n_images < trainer.epoch_size: # CLM steps for lang1, lang2 in shuf_order(params.clm_steps, params): trainer.clm_step(lang1, lang2, params.lambda_clm) # MLM steps (also includes TLM if lang2 is not None) # shuf_order's result could be: ['fr', 'fr'] or ['en', 'fr'] or ['fr', 'en'] or ['en', 'en'] for lang1, lang2 in shuf_order(params.mlm_steps, params): trainer.mlm_step(lang1, lang2, params.lambda_mlm) # parallel classification steps for lang1, lang2 in shuf_order(params.pc_steps, params): trainer.pc_step(lang1, lang2, params.lambda_pc) # Image-language pretraining steps trainer.ipm_step("coco36", params.lambda_ipm) # CMLM steps steps for m1, m2 in shuf_order(params.cmlm_steps, params): trainer.cmlm_step(m1, m2, params.lambda_cmlm) # denoising auto-encoder steps for lang in shuf_order(params.ae_steps): trainer.mt_step(lang, lang, params.lambda_ae) # machine translation steps for lang1, lang2 in shuf_order(params.mt_steps, params): trainer.mt_step(lang1, lang2, params.lambda_mt) # back-translation steps for lang1, lang2, lang3 in shuf_order(params.bt_steps): trainer.bt_step(lang1, lang2, lang3, params.lambda_bt) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch)
def main(params): # check parameters assert params.exp_name check_all_data_params(params) check_mt_model_params(params) # initialize experiment / load data / build model logger = initialize_exp(params) data = load_data(params) encoder, decoder, discriminator, lm = build_mt_model(params, data) # initialize trainer / reload checkpoint / initialize evaluator trainer = TrainerMT(encoder, decoder, discriminator, lm, data, params) trainer.reload_checkpoint() trainer.test_sharing() # check parameters sharing evaluator = EvaluatorMT(trainer, data, params) # evaluation mode if params.eval_only: evaluator.run_all_evals(0) exit() # language model pretraining if params.lm_before > 0: logger.info("Pretraining language model for %i iterations ..." % params.lm_before) trainer.n_sentences = 0 for _ in range(params.lm_before): for lang in params.langs: trainer.lm_step(lang) trainer.iter() # define epoch size if params.epoch_size == -1: params.epoch_size = params.n_para assert params.epoch_size > 0 # start training for _ in range(trainer.epoch, params.max_epoch): logger.info( "====================== Starting epoch %i ... ======================" % trainer.epoch) trainer.n_sentences = 0 while trainer.n_sentences < params.epoch_size: # discriminator training for _ in range(params.n_dis): trainer.discriminator_step() # language model training if params.lambda_lm > 0: for _ in range(params.lm_after): for lang in params.langs: trainer.lm_step(lang) # MT training (parallel data) if params.lambda_xe_para > 0: for lang1, lang2 in params.para_directions: trainer.enc_dec_step(lang1, lang2, params.lambda_xe_para) # MT training (back-parallel data) if params.lambda_xe_back > 0: for lang1, lang2 in params.back_directions: trainer.enc_dec_step(lang1, lang2, params.lambda_xe_back, back=True) # autoencoder training (monolingual data) if params.lambda_xe_mono > 0: for lang in params.mono_directions: trainer.enc_dec_step(lang, lang, params.lambda_xe_mono) # AE - MT training (on the fly back-translation) if params.lambda_xe_otfd > 0 or params.lambda_xe_otfa > 0: # start on-the-fly batch generations if not getattr(params, 'started_otf_batch_gen', False): otf_iterator = trainer.otf_bt_gen_async() params.started_otf_batch_gen = True # update model parameters on subprocesses if trainer.n_iter % params.otf_sync_params_every == 0: trainer.otf_sync_params() # get training batch from CPU before_gen = time.time() batches = next(otf_iterator) trainer.gen_time += time.time() - before_gen # training for batch in batches: lang1, lang2, lang3 = batch['lang1'], batch[ 'lang2'], batch['lang3'] # 2-lang back-translation - autoencoding if lang1 != lang2 == lang3: trainer.otf_bt(batch, params.lambda_xe_otfa, params.otf_backprop_temperature) # 2-lang back-translation - parallel data elif lang1 == lang3 != lang2: trainer.otf_bt(batch, params.lambda_xe_otfd, params.otf_backprop_temperature) # 3-lang back-translation - parallel data elif lang1 != lang2 and lang2 != lang3 and lang1 != lang3: trainer.otf_bt(batch, params.lambda_xe_otfd, params.otf_backprop_temperature) trainer.iter() # end of epoch logger.info( "====================== End of epoch %i ======================" % trainer.epoch) # evaluate discriminator / perplexity / BLEU scores = evaluator.run_all_evals(trainer.epoch) # print / JSON log for k, v in scores.items(): logger.info('%s -> %.6f' % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) # save best / save periodic / end epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores) trainer.test_sharing()
def main(params): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # initialize SLURM signal handler for time limit / pre-emption init_signal_handler() # load data data = load_data(params) # build model # reload-model options are in here if params.encoder_only: model = build_model(params, data['dico']) if params.use_adapters: logger.info("Using adapters") for param in model.named_parameters(): if param[0][:8] != "adapters": param[1].requires_grad = False for param_name, param in model.embeddings.named_parameters(): param.requires_grad = True for param_name, param in model.position_embeddings.named_parameters( ): param.requires_grad = True for param_name, param in model.pred_layer.named_parameters(): param.requires_grad = True for param in model.layer_norm_emb.parameters(): param.requires_grad = True for param in model.named_parameters(): logger.info(param[0] + ' required grad = ' + str(param[1].requires_grad)) else: encoder, decoder = build_model(params, data['dico']) # build trainer, reload potential checkpoints / build evaluator if params.encoder_only: trainer = SingleTrainer(model, data, params) evaluator = SingleEvaluator(trainer, data, params) logger.info("Number of trainable parameters (encoder): %i" % sum( [p.numel() for p in trainer.model.parameters() if p.requires_grad])) else: trainer = EncDecTrainer(encoder, decoder, data, params) evaluator = EncDecEvaluator(trainer, data, params) logger.info( "Number of trainable parameters (encoder): %i" % sum([p.numel() for p in encoder.parameters() if p.requires_grad])) logger.info( "Number of trainable parameters (decoder): %i" % sum([p.numel() for p in decoder.parameters() if p.requires_grad])) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer) for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # set sampling probabilities for training set_sampling_probs(data, params) # language model training for epoch in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_sentences = 0 while trainer.n_sentences < trainer.epoch_size: # CLM steps for lang1, lang2 in shuf_order(params.clm_steps, params): trainer.clm_step(lang1, lang2, params.lambda_clm) # MLM steps (also includes TLM if lang2 is not None) for lang1, lang2 in shuf_order(params.mlm_steps, params): trainer.mlm_step(lang1, lang2, params.lambda_mlm) # parallel classification steps for lang1, lang2 in shuf_order(params.pc_steps, params): trainer.pc_step(lang1, lang2, params.lambda_pc) # denoising auto-encoder for lang in shuf_order(params.ae_steps): trainer.mt_step(lang, lang, params.lambda_ae) for lang1, lang2 in shuf_order(params.mt_steps, params): trainer.mt_step(lang1, lang2, params.lambda_mt) # back-translation for lang1, lang2, lang3 in shuf_order(params.bt_steps): trainer.bt_step(lang1, lang2, lang3, params.lambda_bt) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate perplexity scores = evaluator.run_all_evals(trainer) # print / JSON log for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) if params.is_master: logger.info("__log__:%s" % json.dumps(scores)) # end of epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores)
def main(params): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment logger = initialize_exp(params) # initialize SLURM signal handler for time limit / pre-emption init_signal_handler() # load data data = load_data(params) # build model if params.encoder_only: model = build_model(params, data['dico']) else: encoder, decoder = build_model(params, data['dico']) # build trainer, reload potential checkpoints / build evaluator if params.encoder_only: trainer = SingleTrainer(model, data, params) evaluator = SingleEvaluator(trainer, data, params) else: trainer = EncDecTrainer(encoder, decoder, data, params) evaluator = EncDecEvaluator(trainer, data, params) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer) for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # set sampling probabilities for training set_sampling_probs(data, params) # language model training for _ in range(params.max_epoch): logger.info("============ Starting epoch %i ... ============" % trainer.epoch) trainer.n_sentences = 0 while trainer.n_sentences < trainer.epoch_size: # CLM steps (causal languge model) for lang1, lang2 in shuf_order(params.clm_steps, params): trainer.clm_step(lang1, lang2, params.lambda_clm) # MLM steps (also includes TLM if lang2 is not None) for lang1, lang2 in shuf_order(params.mlm_steps, params): trainer.mlm_step(lang1, lang2, params.lambda_mlm) # denoising auto-encoder steps for lang in shuf_order(params.ae_steps): trainer.mt_step(lang, lang, params.lambda_ae) # machine translation steps for lang1, lang2 in shuf_order(params.mt_steps, params): trainer.mt_step(lang1, lang2, params.lambda_mt) # back-translation steps for lang1, lang2, lang3 in shuf_order(params.bt_steps): trainer.bt_step(lang1, lang2, lang3, params.lambda_bt, params.bt_sample_temperature) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate perplexity scores = evaluator.run_all_evals(trainer) # print / JSON log for k, v in scores.items(): logger.info("%s -> %.6f" % (k, v)) if params.is_master: logger.info("__log__:%s" % json.dumps(scores)) # end of epoch if params.validation_metrics != '': trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores)
def main(params): # initialize the multi-GPU / multi-node training init_distributed_mode(params) # initialize the experiment / load data logger = initialize_exp(params) # Seed torch.manual_seed(params.seed) torch.cuda.manual_seed_all(params.seed) # initialize SLURM signal handler for time limit / pre-emption if params.is_slurm_job: init_signal_handler() # data loaders / samplers populate_dataset(params) train_data_loader, train_sampler, _ = get_data_loader( img_size=params.img_size, crop_size=params.crop_size, shuffle=True, batch_size=params.batch_size, num_classes=params.num_classes, nb_workers=params.nb_workers, distributed_sampler=params.multi_gpu, dataset=params.dataset, data_path=params.train_path, transform=params.train_transform, split='valid' if params.debug_train else 'train', seed=params.seed) valid_data_loader, _, _ = get_data_loader(img_size=params.img_size, crop_size=params.crop_size, shuffle=False, batch_size=params.batch_size, num_classes=params.num_classes, nb_workers=params.nb_workers, distributed_sampler=False, dataset=params.dataset, transform='center', split='valid', seed=params.seed) # build model / cuda logger.info("Building %s model ..." % params.architecture) ftmodel = build_model(params) ftmodel.fc = nn.Sequential() ftmodel.eval().cuda() linearmodel = nn.Linear(EMBEDDING_SIZE[params.architecture], params.num_classes).cuda() if params.from_ckpt != "": ckpt = torch.load(params.from_ckpt) state_dict = { k.replace("module.", ""): v for k, v in ckpt['model'].items() } del state_dict["fc.weight"] if "fc.bias" in state_dict: del state_dict["fc.bias"] missing_keys, unexcepted_keys = ftmodel.load_state_dict(state_dict, strict=False) print("Missing keys: ", missing_keys) print("Unexcepted keys: ", unexcepted_keys) # distributed # TODO: check this https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main.py#L142 if params.multi_gpu: logger.info("Using nn.parallel.DistributedDataParallel ...") linearmodel = nn.parallel.DistributedDataParallel( linearmodel, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True) # build trainer / reload potential checkpoints / build evaluator trainer = Trainer(model=linearmodel, params=params, ftmodel=ftmodel) trainer.reload_checkpoint() evaluator = Evaluator(trainer, params) # evaluation if params.eval_only: scores = evaluator.run_all_evals(trainer, evals=['classif'], data_loader=valid_data_loader) for k, v in scores.items(): logger.info('%s -> %.6f' % (k, v)) logger.info("__log__:%s" % json.dumps(scores)) exit() # training for epoch in range(trainer.epoch, params.epochs): # update epoch / sampler / learning rate trainer.epoch = epoch logger.info("============ Starting epoch %i ... ============" % trainer.epoch) if params.multi_gpu: train_sampler.set_epoch(epoch) # update learning rate trainer.update_learning_rate() # train for i, (images, targets) in enumerate(train_data_loader): trainer.classif_step(images, targets) trainer.iter() logger.info("============ End of epoch %i ============" % trainer.epoch) # evaluate classification accuracy scores = evaluator.run_all_evals(trainer, evals=['classif'], data_loader=valid_data_loader) for name, val in trainer.get_scores().items(): scores[name] = val # print / JSON log for k, v in scores.items(): logger.info('%s -> %.6f' % (k, v)) if params.is_master: logger.info("__log__:%s" % json.dumps(scores)) # end of epoch trainer.save_best_model(scores) trainer.save_periodic() trainer.end_epoch(scores)