def run(self, task):
    """
    Run GLUE training / evaluation for a single task.

    Builds a classification/regression head on top of the pretrained
    embedder, fine-tunes for ``params.n_epochs`` epochs, and records the
    evaluation scores after each epoch.

    Args:
        task: GLUE task name (e.g. 'MNLI', 'STS-B'); must be a key of
            the module-level N_CLASSES mapping.
    """
    params = self.params

    # task parameters: output size and whether the task is classification
    # (every GLUE task except STS-B, which is a regression task)
    self.task = task
    params.out_features = N_CLASSES[task]
    self.is_classif = task != 'STS-B'

    # load data and verify it was binarized with the same dictionary
    # as the pretrained model
    self.data = self.load_data(task)
    if not self.data['dico'] == self._embedder.dico:
        raise Exception(("Dictionary in evaluation data (%i words) seems different than the one " +
                         "in the pretrained model (%i words). Please verify you used the same dictionary, " +
                         "and the same values for max_vocab and min_count.") % (len(self.data['dico']), len(self._embedder.dico)))

    # embedder: work on a deep copy so the pristine pretrained weights in
    # self._embedder can be reused for other tasks
    self.embedder = copy.deepcopy(self._embedder)
    self.embedder.cuda()

    # projection layer (task head): dropout + linear on top of the
    # embedder's output representation
    self.proj = nn.Sequential(*[
        nn.Dropout(params.dropout),
        nn.Linear(self.embedder.out_dim, params.out_features)
    ]).cuda()

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        self.embedder.model = network_to_half(self.embedder.model)
        self.proj = network_to_half(self.proj)

    # optimizer over the fine-tuned embedder layers and the task head
    self.optimizer = get_optimizer(
        list(self.embedder.get_parameters(params.finetune_layers)) +
        list(self.proj.parameters()),
        params.optimizer
    )
    if params.fp16:
        self.optimizer = FP16_Optimizer(self.optimizer, dynamic_loss_scale=True)

    # train and evaluate the model
    for epoch in range(params.n_epochs):

        # update epoch
        self.epoch = epoch

        # training
        logger.info("GLUE - %s - Training epoch %i ..." % (task, epoch))
        self.train()

        # evaluation (no gradients needed)
        logger.info("GLUE - %s - Evaluating epoch %i ..." % (task, epoch))
        with torch.no_grad():
            scores = self.eval()
            self.scores.update(scores)
def run(self):
    """
    Run XNLI training / evaluation.

    Builds a 3-way classification head (entailment / neutral /
    contradiction) on top of the pretrained embedder, fine-tunes for
    ``params.n_epochs`` epochs, and records evaluation scores after
    each epoch.
    """
    params = self.params

    # load data
    self.data = self.load_data()

    # Verify the evaluation data was binarized with the same dictionary
    # as the pretrained model. Fix: the previous bare
    # `assert len(self.data['dico']) == self._embedder.n_words` is
    # stripped under `python -O` and gives no diagnostic; use the same
    # explicit check and error message as the GLUE runner.
    if not self.data['dico'] == self._embedder.dico:
        raise Exception(("Dictionary in evaluation data (%i words) seems different than the one " +
                         "in the pretrained model (%i words). Please verify you used the same dictionary, " +
                         "and the same values for max_vocab and min_count.") % (len(self.data['dico']), len(self._embedder.dico)))

    # embedder: deep-copy so the pristine pretrained weights are preserved
    self.embedder = copy.deepcopy(self._embedder)
    self.embedder.cuda()

    # projection layer: dropout + linear to the 3 XNLI classes
    self.proj = nn.Sequential(*[
        nn.Dropout(params.dropout),
        nn.Linear(self.embedder.out_dim, 3)
    ]).cuda()

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        self.embedder.model = network_to_half(self.embedder.model)
        self.proj = network_to_half(self.proj)

    # optimizer over the fine-tuned embedder layers and the task head
    self.optimizer = get_optimizer(
        list(self.embedder.get_parameters(params.finetune_layers)) +
        list(self.proj.parameters()),
        params.optimizer
    )
    if params.fp16:
        self.optimizer = FP16_Optimizer(self.optimizer, dynamic_loss_scale=True)

    # train and evaluate the model
    for epoch in range(params.n_epochs):

        # update epoch
        self.epoch = epoch

        # training
        logger.info("XNLI - Training epoch %i ..." % epoch)
        self.train()

        # evaluation (no gradients needed)
        logger.info("XNLI - Evaluating epoch %i ..." % epoch)
        with torch.no_grad():
            scores = self.eval()
            self.scores.update(scores)
def main(params):
    """
    Main training entry point: pre-training (CLM/MLM/MASS) and/or
    machine-translation training (MT, denoising AE, back-translation),
    depending on which *_steps parameters are set.

    Args:
        params: experiment parameters (argparse namespace / AttrDict).
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model: a single encoder for LM-style pre-training, or an
    # encoder-decoder pair for seq2seq objectives
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed — apex DDP wrapping deliberately disabled here
    # (kept for reference; NOTE(review): confirm multi-GPU gradient
    # synchronization is handled elsewhere before running multi-GPU)
    # if params.multi_gpu:
    #     logger.info("Using nn.parallel.DistributedDataParallel ...")
    #     if params.encoder_only:
    #         model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
    #     else:
    #         encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
    #         decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: run all evals, log, and exit
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        # one epoch = epoch_size sentences, interleaving every enabled objective
        while trainer.n_sentences < trainer.epoch_size:

            # CLM steps
            for lang1, lang2 in shuf_order(params.clm_steps, params):
                trainer.clm_step(lang1, lang2, params.lambda_clm)

            # MLM steps (also includes TLM if lang2 is not None)
            for lang1, lang2 in shuf_order(params.mlm_steps, params):
                trainer.mlm_step(lang1, lang2, params.lambda_mlm)

            # parallel classification steps
            for lang1, lang2 in shuf_order(params.pc_steps, params):
                trainer.pc_step(lang1, lang2, params.lambda_pc)

            # denoising auto-encoder steps (monolingual: lang -> lang)
            for lang in shuf_order(params.ae_steps):
                trainer.mt_step(lang, lang, params.lambda_ae)

            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)

            # machine translation steps
            for lang1, lang2 in shuf_order(params.mt_steps, params):
                trainer.mt_step(lang1, lang2, params.lambda_mt)

            # back-translation steps
            for lang1, lang2, lang3 in shuf_order(params.bt_steps):
                trainer.bt_step(lang1, lang2, lang3, params.lambda_bt)

            # back-parallel steps
            for lang1, lang2 in shuf_order(params.bmt_steps, params):
                trainer.bmt_step(lang1, lang2, params.lambda_bmt)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log (JSON line only on the master process)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch: checkpointing and epoch bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
def main(params):
    """
    Ensemble translation script: load several checkpoints, read BPE-split
    sentences from stdin, translate with multi-model beam search, and
    write translations to ``params.output_path``.

    Args:
        params: initial experiment parameters; NOTE(review): they are
            immediately re-parsed from the command line below — confirm
            this double parse is intentional.
    """
    # initialize the experiment
    logger = initialize_exp(params)
    parser = get_parser()
    params = parser.parse_args()
    models_path = params.model_path.split(',')

    # generate parser / parse parameters: reload all checkpoints of the
    # ensemble; model hyper-parameters are taken from the first one
    models_reloaded = []
    for model_path in models_path:
        models_reloaded.append(torch.load(model_path))
    model_params = AttrDict(models_reloaded[0]['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters from the pretrained model
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(models_reloaded[0]['dico_id2word'], models_reloaded[0]['dico_word2id'], models_reloaded[0]['dico_counts'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    encoders = []
    decoders = []

    def package_module(modules):
        # strip the 'module.' prefix added by DataParallel wrappers so the
        # state dict loads into an unwrapped model
        state_dict = OrderedDict()
        for k, v in modules.items():
            if k.startswith('module.'):
                state_dict[k[7:]] = v
            else:
                state_dict[k] = v
        return state_dict

    for reloaded in models_reloaded:
        encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).to(params.device).eval()
        decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).to(params.device).eval()
        encoder.load_state_dict(package_module(reloaded['encoder']))
        decoder.load_state_dict(package_module(reloaded['decoder']))

        # float16
        if params.fp16:
            assert torch.backends.cudnn.enabled
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

        encoders.append(encoder)
        decoders.append(decoder)

    # read source sentences (one per line, already BPE-tokenized) from stdin
    # src_sent = ['Poly@@ gam@@ ie statt Demokratie .']
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)

    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch: (max_len, bs) LongTensor padded with pad_index,
        # each sentence wrapped in EOS delimiters (hence the +2 lengths)
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch with every ensemble member and translate it
        encodeds = []
        for encoder in encoders:
            encoded = encoder('fwd', x=batch.to(params.device), lengths=lengths.to(params.device), langs=langs.to(params.device), causal=False)
            # transpose to (bs, slen, dim) as expected by the beam search
            encoded = encoded.transpose(0, 1)
            encodeds.append(encoded)
            assert encoded.size(0) == lengths.size(0)

        decoded, dec_lengths = generate_beam(
            decoders, encodeds, lengths.to(params.device), params.tgt_id,
            beam_size=params.beam,
            length_penalty=params.length_penalty,
            early_stopping=False,
            max_len=int(1.5 * lengths.max().item() + 10),
            params=params)

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters: first EOS starts the sentence; a second
            # EOS (if any) terminates it
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation (progress to stderr, translation to file)
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
def main(params):
    """
    Single-model translation script: load one checkpoint, read BPE-split
    sentences from stdin, translate with greedy decoding, and write
    translations to ``params.output_path``.

    Args:
        params: initial experiment parameters; NOTE(review): they are
            immediately re-parsed from the command line below — confirm
            this double parse is intentional.
    """
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters from the pretrained model
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        encoder = network_to_half(encoder)
        decoder = network_to_half(decoder)

    # read sentences (one per line, already BPE-tokenized) from stdin
    src_sent = []
    for line in sys.stdin.readlines():
        assert len(line.strip().split()) > 0
        src_sent.append(line)
    logger.info("Read %i sentences from stdin. Translating ..." % len(src_sent))

    f = io.open(params.output_path, 'w', encoding='utf-8')

    for i in range(0, len(src_sent), params.batch_size):

        # prepare batch: (max_len, bs) LongTensor padded with pad_index,
        # each sentence wrapped in EOS delimiters (hence the +2 lengths)
        word_ids = [torch.LongTensor([dico.index(w) for w in s.strip().split()])
                    for s in src_sent[i:i + params.batch_size]]
        lengths = torch.LongTensor([len(s) + 2 for s in word_ids])
        batch = torch.LongTensor(lengths.max().item(), lengths.size(0)).fill_(params.pad_index)
        batch[0] = params.eos_index
        for j, s in enumerate(word_ids):
            if lengths[j] > 2:  # if sentence not empty
                batch[1:lengths[j] - 1, j].copy_(s)
            batch[lengths[j] - 1, j] = params.eos_index
        langs = batch.clone().fill_(params.src_id)

        # encode source batch and translate it (greedy decoding)
        encoded = encoder('fwd', x=batch.cuda(), lengths=lengths.cuda(), langs=langs.cuda(), causal=False)
        # transpose to (bs, slen, dim) as expected by the decoder
        encoded = encoded.transpose(0, 1)
        decoded, dec_lengths = decoder.generate(encoded, lengths.cuda(), params.tgt_id, max_len=int(1.5 * lengths.max().item() + 10))

        # convert sentences to words
        for j in range(decoded.size(1)):

            # remove delimiters: first EOS starts the sentence; a second
            # EOS (if any) terminates it
            sent = decoded[:, j]
            delimiters = (sent == params.eos_index).nonzero().view(-1)
            assert len(delimiters) >= 1 and delimiters[0].item() == 0
            sent = sent[1:] if len(delimiters) == 1 else sent[1:delimiters[1]]

            # output translation (progress to stderr, translation to file)
            source = src_sent[i + j].strip()
            target = " ".join([dico[sent[k].item()] for k in range(len(sent))])
            sys.stderr.write("%i / %i: %s -> %s\n" % (i + j, len(src_sent), source, target))
            f.write(target + "\n")

    f.close()
def main(params):
    """
    File-based translation script: load one checkpoint, read a binarized
    dataset from ``params.input``, translate it (greedy or beam search),
    dump the source text to ``<dump_path>/input.txt`` and the hypotheses
    to ``params.output_path``.

    Args:
        params: initial experiment parameters; NOTE(review): they are
            immediately re-parsed from the command line below — confirm
            this double parse is intentional.
    """
    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters from the pretrained model
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        encoder = network_to_half(encoder)
        decoder = network_to_half(decoder)

    # load and filter the evaluation dataset
    input_data = torch.load(params.input)
    eval_dataset = Dataset(input_data["sentences"], input_data["positions"], params)

    if params.subset_start is not None:
        assert params.subset_end
        eval_dataset.select_data(params.subset_start, params.subset_end)

    eval_dataset.remove_empty_sentences()
    eval_dataset.remove_long_sentences(params.max_len)

    n_batch = 0

    # BUG FIX: output_path was previously opened twice (a leaked
    # `io.open` handle shadowed by `with open(...) as out`), and the
    # trailing `out.close()` closed an already-closed file. Open each
    # output exactly once, via context managers.
    inp_path = os.path.join(params.dump_path, "input.txt")
    logger.info("logging to {}".format(inp_path))
    with io.open(params.output_path, "w", encoding="utf-8") as out, \
            io.open(inp_path, "w", encoding="utf-8") as inp_dump:
        for batch in eval_dataset.get_iterator(shuffle=False):
            n_batch += 1

            (x1, len1) = batch

            # dump source text for later inspection
            input_text = convert_to_text(x1, len1, input_data["dico"], params)
            inp_dump.write("\n".join(input_text))
            inp_dump.write("\n")

            langs1 = x1.clone().fill_(params.src_id)

            # cuda
            x1, len1, langs1 = to_cuda(x1, len1, langs1)

            # encode source sentence
            enc1 = encoder("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
            # transpose to (bs, slen, dim) as expected by the decoder
            enc1 = enc1.transpose(0, 1)

            # generate translation - translate / convert to text
            max_len = int(1.5 * len1.max().item() + 10)
            if params.beam_size == 1:
                generated, lengths = decoder.generate(enc1, len1, params.tgt_id, max_len=max_len)
            else:
                generated, lengths = decoder.generate_beam(
                    enc1, len1, params.tgt_id,
                    beam_size=params.beam_size,
                    length_penalty=params.length_penalty,
                    early_stopping=params.early_stopping,
                    max_len=max_len)

            hypotheses_batch = convert_to_text(generated, lengths, input_data["dico"], params)
            out.write("\n".join(hypotheses_batch))
            out.write("\n")

            if n_batch % 100 == 0:
                logger.info("{} batches processed".format(n_batch))
def run(self):
    """
    Run XNLI training / evaluation with an adversarial setup.

    Builds three heads/optimizers on top of the pretrained embedder:
    a 3-way XNLI classifier (`proj`, optimizer_e), a 2-way adversarial
    discriminator (`proj_adv`, optimizer_d), and a generator path over
    the embedder + extra encoder (optimizer_g). Fine-tunes for
    ``params.n_epochs`` epochs and records evaluation scores each epoch.
    """
    params = self.params

    # load data and verify it was binarized with the same dictionary
    # as the pretrained model
    self.data = self.load_data()
    if not self.data['dico'] == self._embedder.dico:
        raise Exception(("Dictionary in evaluation data (%i words) seems different than the one " +
                         "in the pretrained model (%i words). Please verify you used the same dictionary, " +
                         "and the same values for max_vocab and min_count.") % (len(self.data['dico']), len(self._embedder.dico)))

    # embedder: deep-copy so the pristine pretrained weights are preserved
    self.embedder = copy.deepcopy(self._embedder)
    self.embedder.cuda()
    # extra Transformer encoder on top of the embedder
    # (hard-coded emb_dim=1024 — presumably matches the pretrained
    # model's hidden size; TODO confirm)
    self.encoder = TransformerEncoder(emb_dim=1024).cuda()

    # projection layers: 3-way XNLI head and 2-way adversarial head
    self.proj = nn.Sequential(*[
        nn.Dropout(params.dropout),
        nn.Linear(self.embedder.out_dim, 3)
    ]).cuda()
    self.proj_adv = nn.Sequential(*[
        nn.Dropout(params.dropout),
        nn.Linear(self.embedder.out_dim, 2)
    ]).cuda()

    # alternative 2-layer MLP heads, kept for reference:
    # self.proj = nn.Sequential(*[
    #     nn.Dropout(params.dropout),
    #     nn.Linear(self.embedder.out_dim, int(self.embedder.out_dim / 2)),
    #     nn.ReLU(),
    #     nn.Dropout(params.dropout),
    #     nn.Linear(int(self.embedder.out_dim / 2), 3)
    # ]).cuda()
    #
    # self.proj_adv = nn.Sequential(*[
    #     nn.Dropout(params.dropout),
    #     nn.Linear(self.embedder.out_dim, int(self.embedder.out_dim / 2)),
    #     nn.ReLU(),
    #     nn.Dropout(params.dropout),
    #     nn.Linear(int(self.embedder.out_dim / 2), 2)
    # ]).cuda()

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        self.embedder.model = network_to_half(self.embedder.model)
        self.proj = network_to_half(self.proj)
        self.proj_adv = network_to_half(self.proj_adv)

    # optimizers: discriminator (d), XNLI head (e), and generator (g =
    # fine-tuned embedder layers + extra encoder)
    self.optimizer_d = get_optimizer(list(self.proj_adv.parameters()), params.optimizer)
    self.optimizer_e = get_optimizer(list(self.proj.parameters()), params.optimizer)
    self.optimizer_g = get_optimizer(
        list(self.embedder.get_parameters(params.finetune_layers)) +
        list(self.encoder.parameters()),
        # list(self.encoder.parameters()),
        params.optimizer)
    if params.fp16:
        self.optimizer_d = FP16_Optimizer(self.optimizer_d, dynamic_loss_scale=True)
        self.optimizer_e = FP16_Optimizer(self.optimizer_e, dynamic_loss_scale=True)
        self.optimizer_g = FP16_Optimizer(self.optimizer_g, dynamic_loss_scale=True)

    # train and evaluate the model
    for epoch in range(params.n_epochs):

        # update epoch
        self.epoch = epoch

        # training
        logger.info("XNLI - Training epoch %i ..." % epoch)
        self.train()

        # evaluation every epoch (the `epoch % 5` gating was disabled)
        # if(epoch % 5 == 0):
        logger.info("XNLI - Evaluating epoch %i ..." % epoch)
        with torch.no_grad():
            scores = self.eval()
            self.scores.update(scores)
def main(params):
    """
    MASS matching entry point: trains with MASS steps only, and in
    eval-only mode runs matching evaluation and dumps per-sentence
    likelihood predictions alongside labels/targets loaded from
    ``params.data_path``.

    Args:
        params: experiment parameters (argparse namespace / AttrDict).
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model: a single encoder, or an encoder-decoder pair
    if params.encoder_only:
        model = build_model(params, data['dico'])
    else:
        encoder, decoder = build_model(params, data['dico'])

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        if params.encoder_only:
            model = network_to_half(model)
        else:
            encoder = network_to_half(encoder)
            decoder = network_to_half(decoder)

    # distributed
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")
        if params.encoder_only:
            model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
        else:
            encoder = apex.parallel.DistributedDataParallel(encoder, delay_allreduce=True)
            decoder = apex.parallel.DistributedDataParallel(decoder, delay_allreduce=True)

    # build trainer, reload potential checkpoints / build evaluator
    if params.encoder_only:
        trainer = SingleTrainer(model, data, params)
        evaluator = SingleEvaluator(trainer, data, params)
    else:
        trainer = EncDecTrainer(encoder, decoder, data, params)
        evaluator = EncDecEvaluator(trainer, data, params)

    # evaluation-only mode: run matching evals, save predictions, and exit
    if params.eval_only:
        logger.info('Evaluating and saving new result file')
        scores = evaluator.run_all_evals_match(trainer)
        for k, v in scores.items():
            # likelihood entries are arrays (log mean); score entries are
            # matrices (log shape); everything else is a scalar
            if 'likelihood' in k:
                logger.info("%s -> %.6f" % (k, np.mean(v)))
            elif 'scores' in k:
                logger.info("%s -> %s" % (k, v.shape))
            else:
                logger.info("%s -> %.6f" % (k, v))
        # forward scores on the test split of the (single) MASS language
        np.savetxt(os.path.join(params.dump_path, 'best-fwd-prediction.txt'),
                   scores['%s_%s_fwd_scores' % ('test', params.mass_steps[0])],
                   fmt='%f')
        # per-match-file sentence likelihoods
        for match in params.match_files.split(','):
            np.savetxt(os.path.join(params.dump_path, 'best-match-prediction{}.txt'.format(match.split('.')[-1])),
                       scores['%s_%s_sentence_likelihood' % (match, params.mass_steps[0])],
                       fmt='%f')
        # assemble a DataFrame of labels / targets / predictions
        # (NOTE(review): reads the fixed 'match' key, not the per-file
        # keys above — confirm this is the intended aggregate)
        labels = np.loadtxt(os.path.join(params.data_path, 'labels'))
        targets = np.loadtxt(os.path.join(params.data_path, 'suffix'))
        preds = scores['%s_%s_sentence_likelihood' % ('match', params.mass_steps[0])]
        results = pd.DataFrame({'label': labels, 'target': targets, 'pred': preds})
        results.to_pickle(os.path.join(params.dump_path, 'best-matching-prediction.pkl'))
        # logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training (MASS objective only)
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:

            # mass prediction steps
            for lang in shuf_order(params.mass_steps):
                trainer.mass_step(lang, params.lambda_mass)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity / matching scores
        scores = evaluator.run_epoch_evals_match(trainer)

        # print / JSON log
        for k, v in scores.items():
            if 'likelihood' in k:
                logger.info("%s -> %.6f" % (k, np.mean(v)))
            elif 'scores' in k:
                logger.info("%s -> %s" % (k, v.shape))
            else:
                logger.info("%s -> %.6f" % (k, v))
        # if params.is_master:
        #     logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch: checkpointing and epoch bookkeeping
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)