def _compute_scores(self, src_filename, trg_filename):
    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=src_filename,
                        vocabulary=self.vocab_src,
                        is_train_dataset=False),
        TextLineDataset(data_path=trg_filename,
                        vocabulary=self.vocab_tgt,
                        is_train_dataset=False)
    )
    # Note: even with shuffle=False, bucketing may still reorder sentences;
    # numbering=True lets us map scores back to the original order.
    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=40,
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True,
                                  shuffle=False)
    valid_iter = valid_iterator.build_generator()
    score_result = dict()
    self.model.eval()
    with torch.no_grad():
        for batch in valid_iter:
            seq_numbers, seqs_x, seqs_y = batch
            x, y = prepare_data(seqs_x, seqs_y, cuda=True)

            y_inp = y[:, :-1].contiguous()
            y_label = y[:, 1:].contiguous()
            log_probs = self.model(x, y_inp)  # [batch_size, seq_len, vocab_size]

            batch_size, seq_len = y_label.shape
            log_probs = log_probs.view(-1, self.vocab_tgt.max_n_words)
            y_label = y_label.view(-1)
            # `reduce=False` is deprecated; `reduction='none'` keeps per-token losses.
            loss = F.nll_loss(log_probs, y_label,
                              reduction='none',
                              ignore_index=self.vocab_tgt.pad)  # lower is better
            loss = loss.view(batch_size, seq_len)
            loss = loss.sum(-1)

            y_label = y_label.view(batch_size, seq_len)
            valid_token = (y_label != self.vocab_tgt.pad).sum(-1)
            norm_loss = loss.double().div(valid_token.double())
            for seq_num, l, nl in zip(seq_numbers, loss, norm_loss):
                score_result.update({seq_num: (l.item(), nl.item())})

            # for i1, y_l in enumerate(y_label):
            #     score = 0
            #     for i2, y_index in enumerate(y_l):
            #         if y_index.item() == 0:
            #             break
            #         score += log_probs[i1][i2][y_index.item()].item()
            #     score_result.update({seq_numbers[i1]: score})
    return score_result
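
# A minimal, self-contained sketch of the masked scoring pattern used above:
# given token log-probabilities and padded labels, produce one summed and one
# length-normalized NLL per sentence. The names below are illustrative only,
# not part of the codebase.
import torch
import torch.nn.functional as F


def sentence_nll_scores(log_probs, labels, pad_id):
    """log_probs: [batch, seq_len, vocab]; labels: [batch, seq_len] (padded)."""
    batch_size, seq_len, vocab = log_probs.shape
    token_nll = F.nll_loss(log_probs.view(-1, vocab), labels.view(-1),
                           reduction='none', ignore_index=pad_id)
    token_nll = token_nll.view(batch_size, seq_len)  # PAD positions are zeroed
    summed = token_nll.sum(-1)                       # total NLL per sentence
    n_tokens = (labels != pad_id).sum(-1)            # non-pad token count
    return summed, summed.double() / n_tokens.clamp(min=1).double()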
def train2(flags):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """
    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu
    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']
    bt_configs = configs['bt_configs'] if 'bt_configs' in configs else None
    if bt_configs is not None:
        print("btconfigs ", bt_configs)
        if 'bt_attribute_data' not in bt_configs:
            Constants.USE_BT = False
            bt_configs = None
        else:
            Constants.USE_BT = True
            Constants.USE_BTTAG = bt_configs['use_bttag']
            Constants.USE_CONFIDENCE = bt_configs['use_confidence']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    # back-translation tag dataset
    if Constants.USE_BT:
        if Constants.USE_BTTAG:
            Constants.BTTAG = vocab_src.bttag
        train_bitext_dataset = ZipDataset(
            TextLineDataset(data_path=data_configs['train_data'][0],
                            vocabulary=vocab_src,
                            max_len=data_configs['max_len'][0],
                            is_train_dataset=True),
            TextLineDataset(data_path=data_configs['train_data'][1],
                            vocabulary=vocab_tgt,
                            max_len=data_configs['max_len'][1],
                            is_train_dataset=True),
            AttributeDataset(data_path=bt_configs['bt_attribute_data'],
                             is_train_dataset=True)
        )
    else:
        train_bitext_dataset = ZipDataset(
            TextLineDataset(data_path=data_configs['train_data'][0],
                            vocabulary=vocab_src,
                            max_len=data_configs['max_len'][0],
                            is_train_dataset=True),
            TextLineDataset(data_path=data_configs['train_data'][1],
                            vocabulary=vocab_tgt,
                            max_len=data_configs['max_len'][1],
                            is_train_dataset=True)
        )

    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=training_configs["batch_size"],
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=training_configs['buffer_size'],
                                     batching_func=training_configs['batching_key'],
                                     world_size=world_size,
                                     numbering=True,
                                     rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do the steps below one after another:
    # 1. build models & criterion
    # 2. move models & criterion to gpu if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)
    nmt_model = nmt_model.encoder

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin extracting sentence representations...')
    # Compute a representation for every training sentence:
    # mean pooling over the encoder hidden states.
    training_iter = training_iterator.build_generator()
    nmt_model.eval()

    all_seq_numbers = []
    encoder_filename = "/home/wangdq/encoder.mean.output"
    seq_numbers_filename = '/home/wangdq/seq_numbers.output'

    processed = 0
    with open(encoder_filename, 'w') as f_encoder, \
            open(seq_numbers_filename, 'w') as f_seq_numbers:
        for batch in training_iter:
            bt_attrib = None  # back-translation attribute data
            if Constants.USE_BT:
                # seq_numbers are numbered from 0
                seq_numbers, seqs_x, seqs_y, bt_attrib = batch
            else:
                seq_numbers, seqs_x, seqs_y = batch

            x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU,
                             bt_attrib=bt_attrib)
            try:
                with torch.no_grad():
                    encoder_hidden, mask = nmt_model(x)
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    continue  # skip this batch; encoder_hidden is undefined here
                else:
                    raise e

            valid_hidden = (mask == False).float().cuda()
            sum_encoder_hidden = (encoder_hidden * valid_hidden.unsqueeze(-1)).sum(dim=1)
            valid_tokens = (mask == False).sum(-1)
            mean_encoder_hidden = sum_encoder_hidden.float().div(valid_tokens.unsqueeze(1))

            all_seq_numbers.extend(seq_numbers)
            # if all_mean_encoder_hidden is None:
            #     all_mean_encoder_hidden = mean_encoder_hidden.cpu()
            # else:
            #     all_mean_encoder_hidden = torch.cat((all_mean_encoder_hidden, mean_encoder_hidden.cpu()), dim=0)
            mean_encoder_list = mean_encoder_hidden.cpu().numpy().tolist()
            content = [[str(i) for i in mean] for mean in mean_encoder_list]
            content = [' '.join(mean) + '\n' for mean in content]
            f_encoder.writelines(content)

            processed += len(seq_numbers)
            print(processed)

        content = ' '.join(str(i) for i in all_seq_numbers)
        f_seq_numbers.write(content)
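
# The mean pooling above divides a mask-weighted sum of encoder states by the
# number of real tokens. A standalone sketch of that step with illustrative
# names (`mask` is assumed True at padding positions, as in the loop above):
import torch


def masked_mean_pool(encoder_hidden, mask):
    """encoder_hidden: [batch, seq_len, dim]; mask: [batch, seq_len], True at PAD."""
    valid = (mask == False).float()                      # 1.0 at real tokens
    summed = (encoder_hidden * valid.unsqueeze(-1)).sum(dim=1)
    n_tokens = valid.sum(-1, keepdim=True).clamp(min=1)  # avoid division by zero
    return summed / n_tokens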
def test_data(flags):
    Constants.USE_GPU = flags.use_gpu
    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    training_configs = configs['training_configs']
    bt_configs = configs['bt_configs'] if 'bt_configs' in configs else None
    if bt_configs is not None:
        print("btconfigs ", bt_configs)
        if 'bt_attribute_data' not in bt_configs:
            Constants.USE_BT = False
            bt_configs = None
        else:
            Constants.USE_BT = True
            Constants.USE_BTTAG = bt_configs['use_bttag']
            Constants.USE_CONFIDENCE = bt_configs['use_confidence']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['valid_data'][0],
                        vocabulary=vocab_src,
                        is_train_dataset=False),
        TextLineDataset(data_path=data_configs['valid_data'][1],
                        vocabulary=vocab_tgt,
                        is_train_dataset=False)
    )

    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs['valid_batch_size'],
                                  use_bucket=True,
                                  buffer_size=100000,
                                  numbering=True,
                                  world_size=world_size,
                                  rank=rank,
                                  shuffle=False)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do the steps below one after another:
    # 1. build models & criterion
    # 2. move models & criterion to gpu if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)
    nmt_model = nmt_model.encoder

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    INFO('Begin extracting sentence representations...')
    nmt_model.eval()

    # Compute a representation for every test sentence:
    # mean pooling over the encoder hidden states.
    valid_iter = valid_iterator.build_generator()
    all_seq_numbers = []
    all_mean_encoder_hidden = None

    for batch in valid_iter:
        bt_attrib = None
        seq_numbers, seqs_x, seqs_y = batch

        x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU,
                         bt_attrib=bt_attrib)
        try:
            with torch.no_grad():
                encoder_hidden, mask = nmt_model(x)
        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch')
                continue  # skip this batch; encoder_hidden is undefined here
            else:
                raise e

        # extend only after a successful encoding so numbers and
        # representations stay aligned
        all_seq_numbers.extend(seq_numbers)

        valid_hidden = (mask == False).float().cuda()
        sum_encoder_hidden = (encoder_hidden * valid_hidden.unsqueeze(-1)).sum(dim=1)
        valid_tokens = (mask == False).sum(-1)
        mean_encoder_hidden = sum_encoder_hidden.float().div(valid_tokens.unsqueeze(1))

        if all_mean_encoder_hidden is None:
            all_mean_encoder_hidden = mean_encoder_hidden
        else:
            all_mean_encoder_hidden = torch.cat(
                (all_mean_encoder_hidden, mean_encoder_hidden), dim=0)

    return all_mean_encoder_hidden, all_seq_numbers
def _compute_scores(self, src_filename, trg_filename):
    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=src_filename,
                        vocabulary=self.vocab_src,
                        is_train_dataset=False,
                        max_len=100),
        TextLineDataset(data_path=trg_filename,
                        vocabulary=self.vocab_tgt,
                        is_train_dataset=False,
                        max_len=100))
    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=20,
                                  use_bucket=True,
                                  buffer_size=1000,
                                  numbering=True,
                                  shuffle=False)
    valid_iter = valid_iterator.build_generator()
    score_result = dict()
    self.model.eval()

    eidx = 0
    uidx = 0
    training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
                                 total=len(valid_iterator),
                                 unit="sents")
    with torch.no_grad():
        for batch in valid_iter:
            seq_numbers, seqs_x, seqs_y = batch
            x, y = prepare_data(seqs_x, seqs_y, cuda=True)

            y_inp = y[:, :-1].contiguous()
            y_label = y[:, 1:].contiguous()  # [batch_size, seq_len]
            log_probs = self.model(x, y_inp,
                                   log_probs=True)  # [batch_size, seq_len, vocab_size]

            batch_size, seq_len = y_label.shape
            log_probs = log_probs.view(-1, self.vocab_tgt.max_n_words)
            y_label = y_label.view(-1)
            # `reduce=False` is deprecated; `reduction='none'` keeps per-token losses.
            loss = F.nll_loss(log_probs, y_label,
                              reduction='none',
                              ignore_index=self.vocab_tgt.pad)
            loss = loss.view(batch_size, seq_len)
            loss = loss.sum(-1)

            y_label = y_label.view(batch_size, seq_len)
            valid_token = (y_label != self.vocab_tgt.pad).sum(-1)
            loss = loss.double().div(valid_token.double())
            for seq_num, l in zip(seq_numbers, loss):
                assert seq_num not in score_result
                score_result.update({seq_num: l.item()})

            training_progress_bar.update(batch_size)
            training_progress_bar.set_description(
                ' - (Epc {}, Upd {}) '.format(eidx, uidx))

            # for i1, y_l in enumerate(y_label):
            #     score = 0
            #     for i2, y_index in enumerate(y_l):
            #         if y_index.item() == 0:
            #             break
            #         score += log_probs[i1][i2][y_index.item()].item()
            #     valid_token = (y_label != self.vocab_tgt.pad).long().sum().item()
            #     score = -1 * score / valid_token
            #     score_result.update({seq_numbers[i1]: score})
    return score_result
def train(rank, device, args, counter, lock, attack_configs,
          discriminator_configs, src_vocab, trg_vocab, data_set,
          global_attacker, attacker_configs,
          optimizer=None, scheduler=None, saver=None):
    """
    Run one training process:
      1. train the env_discriminator;
      2. run the attacker actor-critic based on rewards from the trained
         env_discriminator;
      3. run training updates on the attacker actor-critic.

    :param rank: (int) the rank of the process (from multiprocess)
    :param device: the device of the process
    :param counter: python multiprocess variable
    :param lock: python multiprocess variable
    :param args: global args
    :param attack_configs: attack settings
    :param discriminator_configs: discriminator settings
    :param src_vocab:
    :param trg_vocab:
    :param data_set: (data_iterator object) provides batched data labels
    :param global_attacker: the model to sync from
    :param attacker_configs: local attacker settings
    :param optimizer: shared optimizer for the attacker; use a local one if None
    :param scheduler: shared scheduler for the attacker; use a local one if None
    :param saver: model saver
    :return:
    """
    trust_acc = acc_bound = discriminator_configs["acc_bound"]
    converged_bound = discriminator_configs["converged_bound"]
    patience = discriminator_configs["patience"]
    attacker_model_configs = attacker_configs["attacker_model_configs"]
    attacker_optimizer_configs = attacker_configs["attacker_optimizer_configs"]

    # this is for multi-processing; GlobalNames cannot be inherited directly
    GlobalNames.USE_GPU = args.use_gpu
    GlobalNames.SEED = attack_configs["seed"]
    torch.manual_seed(GlobalNames.SEED + rank)

    # initiate local saver and load checkpoint if possible
    local_saver = Saver(save_prefix="{0}.local".format(
        os.path.join(args.save_to, "train_env%d" % rank, "ACmodel")),
        num_max_keeping=attack_configs["num_kept_checkpoints"])

    attack_iterator = DataIterator(dataset=data_set,
                                   batch_size=attack_configs["batch_size"],
                                   use_bucket=True,
                                   buffer_size=attack_configs["buffer_size"],
                                   numbering=True)
    summary_writer = SummaryWriter(
        log_dir=os.path.join(args.save_to, "train_env%d" % rank))

    local_attacker = attacker.Attacker(src_vocab.max_n_words,
                                       **attacker_model_configs)

    # build optimizer for attacker
    if optimizer is None:
        optimizer = Optimizer(
            name=attacker_optimizer_configs["optimizer"],
            model=global_attacker,
            lr=attacker_optimizer_configs["learning_rate"],
            grad_clip=attacker_optimizer_configs["grad_clip"],
            optim_args=attacker_optimizer_configs["optimizer_params"])

    # build scheduler for optimizer if needed
    if attacker_optimizer_configs['schedule_method'] is not None:
        if attacker_optimizer_configs['schedule_method'] == "loss":
            scheduler = ReduceOnPlateauScheduler(
                optimizer=optimizer,
                **attacker_optimizer_configs["scheduler_configs"])
        elif attacker_optimizer_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(
                optimizer=optimizer,
                **attacker_optimizer_configs['scheduler_configs'])
        elif attacker_optimizer_configs["schedule_method"] == "rsqrt":
            scheduler = RsqrtScheduler(
                optimizer=optimizer,
                **attacker_optimizer_configs["scheduler_configs"])
        else:
            WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(
                attacker_optimizer_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None

    local_saver.load_latest(model=local_attacker,
                            optim=optimizer,
                            lr_scheduler=scheduler)

    attacker_iterator = attack_iterator.build_generator()
    env = Translate_Env(attack_configs=attack_configs,
                        discriminator_configs=discriminator_configs,
                        src_vocab=src_vocab,
                        trg_vocab=trg_vocab,
                        data_iterator=attacker_iterator,
                        save_to=args.save_to,
                        device=device)

    episode_count = 0
    episode_length = 0
    local_steps = 0  # optimization steps: for learning rate schedules
    patience_t = patience

    while True:  # infinite loop over the data set
        # we continue with a new iterator and refreshed environments
        # whenever the last iterator breaks with "StopIteration"
        attacker_iterator = attack_iterator.build_generator()
        env.reset_data_iter(attacker_iterator)
        padded_src = env.reset()
        padded_src = torch.from_numpy(padded_src)
        if device != "cpu":
            padded_src = padded_src.to(device)
        done = True
        discriminator_base_steps = local_steps

        while True:
            # check for update of discriminator
            # if env.acc_validation(local_attacker, use_gpu=True if env.device != "cpu" else False) < 0.55:
            if episode_count % attacker_configs["attacker_update_steps"] == 0:
                """
                stop criterion: whenever we update a discriminator, we check its
                accuracy. If the accuracy fails acc_bound, we reset the
                discriminator and retry until it reaches the bound, with
                patience; otherwise the training thread stops.
                """
                try:
                    discriminator_base_steps, trust_acc = env.update_discriminator(
                        local_attacker,
                        discriminator_base_steps,
                        min_update_steps=discriminator_configs["acc_valid_freq"],
                        max_update_steps=discriminator_configs["discriminator_update_steps"],
                        accuracy_bound=acc_bound,
                        summary_writer=summary_writer)
                except StopIteration:
                    INFO("finish one training epoch, reset data_iterator")
                    break

                discriminator_base_steps += 1  # a flag to label the discriminator updates

                if trust_acc < converged_bound:  # GAN target reached
                    patience_t -= 1
                    INFO("discriminator reached GAN convergence bound: %d times"
                         % patience_t)
                else:  # reset patience if discriminator is refreshed
                    patience_t = patience

                if saver and local_steps % attack_configs["save_freq"] == 0:
                    local_saver.save(global_step=local_steps,
                                     model=local_attacker,
                                     optim=optimizer,
                                     lr_scheduler=scheduler)

                if trust_acc < converged_bound:  # and patience_t == patience - 1:
                    # we only save the global params that reach acc_bound
                    torch.save(global_attacker.state_dict(),
                               os.path.join(args.save_to, "ACmodel.final"))
                    # saver.raw_save(model=global_attacker)

                if patience_t == 0:
                    WARN("maximum patience reached. Training thread should stop")
                    break

            local_attacker.train()  # switch back to training mode
            # for an initial (reset) attacker, sync from global parameters
            if done:
                INFO("sync from global model")
                local_attacker.load_state_dict(global_attacker.state_dict())
                # move the local attacker params back to device after updates
                local_attacker = local_attacker.to(device)

            values = []  # critic outputs, for training the critic
            log_probs = []
            rewards = []  # actual rewards
            entropies = []
            local_steps += 1

            # run a sequence of attack steps
            try:
                for i in range(args.action_roll_steps):
                    episode_length += 1
                    attack_out, critic_out = local_attacker(
                        padded_src,
                        padded_src[:, env.index - 1:env.index + 2])
                    logit_attack_out = torch.log(attack_out)
                    entropy = -(attack_out * logit_attack_out).sum(dim=-1).mean()
                    summary_writer.add_scalar("action_entropy",
                                              scalar_value=entropy,
                                              global_step=local_steps)
                    entropies.append(entropy)  # for the entropy loss

                    actions = attack_out.multinomial(num_samples=1).detach()
                    # only extract the log prob of the chosen action (avg over batch)
                    log_attack_out = logit_attack_out.gather(-1, actions).mean()
                    padded_src, reward, terminal_signal = env.step(actions.squeeze())
                    done = terminal_signal or episode_length > args.max_episode_lengths

                    with lock:
                        counter.value += 1

                    if done:
                        episode_length = 0
                        padded_src = env.reset()
                        padded_src = torch.from_numpy(padded_src)
                        if device != "cpu":
                            padded_src = padded_src.to(device)

                    values.append(critic_out.mean())  # list of torch scalars
                    log_probs.append(log_attack_out)  # list of torch scalars
                    rewards.append(reward)            # list of reward variables

                    if done:
                        episode_count += 1
                        break
            except StopIteration:
                INFO("finish one training epoch, reset data_iterator")
                break

            R = torch.zeros(1, 1)
            gae = torch.zeros(1, 1)
            if device != "cpu":
                R = R.to(device)
                gae = gae.to(device)
            if not done:
                # bootstrap the return with the critic's value estimate
                value = local_attacker.get_critic(
                    padded_src, padded_src[:, env.index - 1:env.index + 2])
                R = value.mean().detach()
            values.append(R)

            policy_loss = 0
            value_loss = 0
            # collect values for training
            # (value loss and policy loss must be clipped to stabilize training)
            for i in reversed(range(len(rewards))):
                R = attack_configs["gamma"] * R + rewards[i]
                advantage = R - values[i]
                value_loss = value_loss + 0.5 * advantage.pow(2)
                delta_t = rewards[i] + attack_configs["gamma"] * values[i + 1] - values[i]
                gae = gae * attack_configs["gamma"] * attack_configs["tau"] + delta_t
                policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                    attack_configs["entropy_coef"] * entropies[i]
            print("policy_loss", policy_loss)
            print("gae", gae)

            # update with optimizer
            optimizer.zero_grad()
            # we decay the loss according to the discriminator's accuracy,
            # as a trust-region constraint
            summary_writer.add_scalar("policy_loss",
                                      scalar_value=policy_loss * trust_acc,
                                      global_step=local_steps)
            summary_writer.add_scalar("value_loss",
                                      scalar_value=value_loss * trust_acc,
                                      global_step=local_steps)
            total_loss = trust_acc * policy_loss + \
                trust_acc * attack_configs["value_coef"] * value_loss
            total_loss.backward()

            if attacker_optimizer_configs["schedule_method"] is not None \
                    and attacker_optimizer_configs["schedule_method"] != "loss":
                scheduler.step(global_step=local_steps)

            # move the model params to CPU and
            # assign local gradients to the global model to update
            local_attacker.to("cpu").ensure_shared_grads(global_attacker)
            optimizer.step()
            print("bingo!")

        if patience_t == 0:
            INFO("Reached maximum discriminator patience, finishing")
            break
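
# The backward loop above is Generalized Advantage Estimation (GAE) with an
# entropy bonus folded into the policy loss. A minimal reference version of
# plain GAE, separated from the attacker specifics (illustrative names, not
# part of the codebase; `values` holds len(rewards) + 1 entries, the last
# being the bootstrap value, as in the loop above):
import torch


def compute_gae(rewards, values, gamma, tau):
    """Return per-step advantages A_t = sum_k (gamma*tau)^k * delta_{t+k}."""
    gae = torch.zeros(1)
    advantages = []
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * values[i + 1] - values[i]  # TD residual
        gae = gae * gamma * tau + delta
        advantages.insert(0, gae)
    return advantages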
def test_attack():
    """
    During the test phase, the attacker modifies inputs without constraints.
    :return:
    """
    timer = Timer()
    args = parser.parse_args()
    with open(args.config_path) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    attack_configs = configs["attack_configs"]
    attacker_configs = configs["attacker_configs"]
    attacker_model_configs = attacker_configs["attacker_model_configs"]

    # for modification
    GlobalNames.SEED = attack_configs["seed"]
    torch.manual_seed(GlobalNames.SEED)
    # the global variable USE_GPU is mainly used for environments
    GlobalNames.USE_GPU = args.use_gpu

    INFO("build vocabularies and data set")
    with open(attack_configs["victim_configs"], "r") as victim_f:
        victim_configs = yaml.load(victim_f, Loader=yaml.FullLoader)
    data_configs = victim_configs["data_configs"]
    src_vocab = Vocabulary(**data_configs["vocabularies"][0])
    trg_vocab = Vocabulary(**data_configs["vocabularies"][1])

    print("attack ", args.source_path)
    dataset = TextLineDataset(data_path=args.source_path, vocabulary=src_vocab)
    test_iterator = DataIterator(dataset=dataset,
                                 batch_size=args.batch_size,
                                 use_bucket=attack_configs["use_bucket"],
                                 buffer_size=attack_configs["buffer_size"],
                                 numbering=True)
    total_amount = len(test_iterator)
    test_iterator = test_iterator.build_generator()

    _, w2vocab = load_or_extract_near_vocab(
        config_path=attack_configs["victim_configs"],
        model_path=attack_configs["victim_model"],
        init_perturb_rate=attack_configs["init_perturb_rate"],
        save_to=os.path.join(args.save_to, "near_vocab"),
        save_to_full=os.path.join(args.save_to, "full_near_vocab"),
        top_reserve=12,
        emit_as_id=True)

    if attack_configs["pinyin_data"] != "" and not args.unk_ignore:
        # adopted for Chinese inputs
        INFO("collect pinyin data for gen_UNK, this would take a while")
        char2pyDict, py2charDict = collect_pinyin(
            pinyin_path=attack_configs["pinyin_data"],
            src_path=data_configs["train_data"][0])
    else:
        INFO("test without pinyin")
        char2pyDict, py2charDict = None, None

    INFO("build and reload attacker model parameters")
    global_attacker = attacker.Attacker(src_vocab.max_n_words,
                                        **attacker_model_configs)
    attacker_param = load_model_parameters(args.model_path)
    global_attacker.eval()
    global_attacker.load_state_dict(attacker_param)

    INFO("Build and reload translator...")
    nmt_model = build_model(n_src_vocab=src_vocab.max_n_words,
                            n_tgt_vocab=trg_vocab.max_n_words,
                            **victim_configs["model_configs"])
    nmt_model.eval()
    nmt_param = load_model_parameters(attack_configs["victim_model"])
    nmt_model.load_state_dict(nmt_param)

    if args.use_gpu:
        # collect available devices and distribute env on the available gpu
        global_attacker.cuda()
        nmt_model = nmt_model.cuda()

    result_indices = []     # to resume ordering
    origin_results = []     # original translations
    perturbed_seqs = []     # adversarial src
    perturbed_results = []  # adversarial translations
    overall_values = []     # attacker value estimate on the first step:
                            # indicates overall degradation

    # translate all sentences and collect all adversarial src
    with open(os.path.join(args.save_to, "perturbed_src"), "w") as perturbed_src, \
            open(os.path.join(args.save_to, "perturbed_trans"), "w") as perturbed_trans, \
            open(os.path.join(args.save_to, "origin_trans"), "w") as origin_trans:
        i = 0
        timer.tic()
        for batch in test_iterator:
            i += 1
            if i:
                print(i * args.batch_size, "/", total_amount, " finished")
            numbers, seqs_x = batch
            # print(seqs_x)
            batch_size = len(seqs_x)
            x = prepare_data(seqs_x=seqs_x, cuda=args.use_gpu)
            x_mask = x.detach().eq(PAD).long()
            cummulate_survive = calculate_cummulate_survive(
                max_len=x.shape[1],
                gamma=attack_configs["gamma"],
                surrogate_step_survival=0)
            # x_len = (1 - x_mask).sum(dim=-1).float()

            with torch.no_grad():
                word_ids = beam_search(nmt_model=nmt_model,
                                       beam_size=5,
                                       max_steps=150,
                                       src_seqs=x,
                                       alpha=-1.0)
            # in shape [batch_size, beam_size, max_len]
            word_ids = word_ids.cpu().numpy().tolist()

            # remove PAD and append each result with its index;
            # we only take the top-one final result from the beam
            for sent_t in word_ids:
                top_result = [trg_vocab.id2token(wid)
                              for wid in sent_t[0]
                              if wid not in [PAD, EOS]]
                origin_results.append(trg_vocab.tokenizer.detokenize(top_result))
            result_indices += numbers

            # calculate adversarial value functions for each src position
            attack_results = []
            critic_results = []
            with torch.no_grad():
                for t in range(1, x.shape[1] - 1):
                    attack_out, critic_out = global_attacker(
                        x, label=x[:, t - 1:t + 1])
                    attack_results.append(attack_out.argmax(dim=1).unsqueeze(dim=1))
                    # print(mask_len.shape, critic_out.shape)
                    critic_results.append(critic_out)
            attack_results = torch.cat(attack_results, dim=1)
            temp_mask = (1 - x_mask)[:, 1:x.shape[1] - 1]
            attack_results *= temp_mask
            critic_results = torch.cat(critic_results, dim=1) * \
                (1 - x_mask)[:, 1:x.shape[1] - 1].float()
            critic_results *= temp_mask.float()
            # critic_results = critic_results.cpu().numpy().tolist()
            # print(attack_results)
            # print(critic_results)

            # get adversarial samples for the src
            with torch.no_grad():
                perturbed_x_ids = x.clone().detach()
                batch_size, max_steps = x.shape
                for t in range(1, max_steps - 1):  # ignore BOS and EOS
                    inputs = x[:, t - 1:t + 1]
                    attack_out, critic_out = global_attacker(x=perturbed_x_ids,
                                                             label=inputs)
                    actions = attack_out.argmax(dim=-1)
                    if t == 1:
                        overall_values += (critic_out -
                                           cummulate_survive[-t - 2]).cpu().numpy().tolist()
                    # an action is masked if the corresponding value estimate is negative
                    actions *= (critic_out - cummulate_survive[-t - 2]).gt(0).squeeze().long()

                    target_of_step = []
                    for batch_index in range(batch_size):
                        word_id = inputs[batch_index][1]
                        # select the least similar candidate based on the victim embedding
                        target_word_id = w2vocab[word_id.item()][0]
                        # [np.random.choice(len(w2vocab[word_id.item()]), 1)[0]]
                        # alternative: select the nearest candidate based on the victim embedding
                        # origin_emb = global_attacker.src_embedding(word_id)
                        # candidates_emb = global_attacker.src_embedding(torch.tensor(w2vocab[word_id.item()]).cuda())
                        # nearest = candidates_emb.matmul(origin_emb)\
                        #     .div((candidates_emb*candidates_emb).sum(dim=-1))\
                        #     .argmax(dim=-1).item()
                        # target_word_id = w2vocab[word_id.item()][nearest]
                        if args.unk_ignore and target_word_id == UNK:
                            # undo this attack if UNK is set to be ignored
                            target_word_id = word_id.item()
                        target_of_step += [target_word_id]

                    # override the perturbed results with choices from candidates
                    perturbed_x_ids[:, t] *= (1 - actions)
                    adjustification_ = torch.tensor(target_of_step,
                                                    device=inputs.device)
                    if GlobalNames.USE_GPU:
                        adjustification_ = adjustification_.cuda()
                    perturbed_x_ids[:, t] += adjustification_ * actions

                # re-tokenize and validate UNK
                inputs = perturbed_x_ids.cpu().numpy().tolist()
                new_inputs = []
                for origin_indices, indices in zip(x.cpu().numpy().tolist(), inputs):
                    new_line_token = []  # for output files
                    # remove BOS, EOS, PAD, and detokenize to a sentence
                    for origin_word_id, word_id in zip(origin_indices, indices):
                        if word_id not in [BOS, EOS, PAD]:
                            if word_id == UNK and origin_word_id != UNK:
                                # validate UNK induced by the attack and append
                                new_line_token.append(
                                    gen_UNK(src_token=src_vocab.id2token(origin_word_id),
                                            vocab=src_vocab,
                                            char2pyDict=char2pyDict,
                                            py2charDict=py2charDict))
                            else:
                                new_line_token.append(src_vocab.id2token(word_id))
                    new_line_token = src_vocab.tokenizer.detokenize(new_line_token)
                    perturbed_seqs.append(new_line_token)
                    # tokenization must ignore the original <UNK>
                    if not hasattr(src_vocab.tokenizer, "bpe"):
                        new_line = new_line_token.strip().split()
                    else:
                        new_token = []
                        for w in new_line_token.strip().split():
                            if w != src_vocab.id2token(UNK):
                                new_token.append(src_vocab.tokenizer.bpe.segment_word(w))
                            else:
                                new_token.append([w])
                        new_line = sum(new_token, [])
                    new_line = [src_vocab.token2id(t) for t in new_line]
                    new_inputs.append(new_line)

                # override perturbed_x_ids
                perturbed_x_ids = prepare_data(seqs_x=new_inputs, cuda=args.use_gpu)

                # batch-translate the perturbed src
                word_ids = beam_search(nmt_model=nmt_model,
                                       beam_size=5,
                                       max_steps=150,
                                       src_seqs=perturbed_x_ids,
                                       alpha=-1.0)
            # in shape [batch_size, beam_size, max_len]
            word_ids = word_ids.cpu().numpy().tolist()

            # translate adversarial inputs
            for sent_t in word_ids:
                top_result = [trg_vocab.id2token(wid)
                              for wid in sent_t[0]
                              if wid not in [PAD, EOS]]
                perturbed_results.append(trg_vocab.tokenizer.detokenize(top_result))
        print(timer.toc(return_seconds=True), "sec")

        # resume the original ordering and write to files
        origin_order = np.argsort(result_indices).tolist()
        for line in [origin_results[ii] for ii in origin_order]:
            origin_trans.write(line + "\n")
        for line, value in [(perturbed_seqs[ii], overall_values[ii])
                            for ii in origin_order]:
            perturbed_src.write(line + "\n")  # + " " + str(value)
        for line in [perturbed_results[ii] for ii in origin_order]:
            perturbed_trans.write(line + "\n")
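
# Both translation passes above rely on the indices collected via
# `numbering=True` to undo bucket reordering: np.argsort over those indices
# yields the positions that restore corpus order. A tiny self-contained
# illustration with made-up data (not used by the pipeline):
import numpy as np


def _demo_resume_ordering():
    arrival_indices = [2, 0, 1]               # order the sentences arrived in
    outputs = ["sent-2", "sent-0", "sent-1"]  # outputs in arrival order
    origin_order = np.argsort(arrival_indices).tolist()
    assert [outputs[ii] for ii in origin_order] == ["sent-0", "sent-1", "sent-2"]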
def test_discriminator(config_path,
                       save_to,
                       model_name="Discriminator",
                       shuffle=True,
                       use_gpu=True):
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)

    attack_configs = configs["attack_configs"]
    discriminator_configs = configs["discriminator_configs"]
    discriminator_model_configs = discriminator_configs["discriminator_model_configs"]
    discriminator_optim_configs = discriminator_configs["discriminator_optimizer_configs"]

    victim_config_path = attack_configs["victim_configs"]
    victim_model_path = attack_configs["victim_model"]
    with open(victim_config_path.strip()) as v_f:
        print("open victim configs...%s" % victim_config_path)
        victim_configs = yaml.load(v_f, Loader=yaml.FullLoader)
    data_configs = victim_configs["data_configs"]

    # building inputs
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_trg = Vocabulary(**data_configs["vocabularies"][1])

    # parallel data binding
    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0]),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_trg,
                        max_len=data_configs['max_len'][1]),
        shuffle=shuffle)
    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs["valid_data"][0],
                        vocabulary=vocab_src,
                        max_len=data_configs["max_len"][0]),
        TextLineDataset(data_path=data_configs["valid_data"][1],
                        vocabulary=vocab_trg,
                        max_len=data_configs["max_len"][1]),
        shuffle=shuffle)

    train_batch_size = attack_configs["batch_size"]
    train_buffer_size = attack_configs["buffer_size"]
    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=train_batch_size,
        use_bucket=attack_configs['use_bucket'],
        buffer_size=train_buffer_size,
        batching_func=attack_configs['batching_key'])
    # valid_iterator is bucketed by length to accelerate decoding
    # (numbering marks the original order)
    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=attack_configs["batch_size"],
                                  use_bucket=True,
                                  buffer_size=50000,
                                  numbering=True)

    # initiate saver
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(os.path.join(save_to, model_name)),
        num_max_keeping=attack_configs['num_kept_checkpoints'])

    # building model
    model_D = TransDiscriminator(n_src_words=vocab_src.max_n_words,
                                 n_trg_words=vocab_trg.max_n_words,
                                 **discriminator_model_configs)
    if use_gpu:
        model_D = model_D.cuda()
        CURRENT_DEVICE = "cuda"
    else:
        CURRENT_DEVICE = "cpu"

    # load embeddings from trained NMT models
    load_embedding(model_D,
                   model_path=victim_model_path,
                   device=CURRENT_DEVICE)
    # TODO: reload parameters

    # classification would need label smoothing to trigger a negative
    # log-likelihood loss; plain CrossEntropyLoss is used here
    criterion = nn.CrossEntropyLoss()

    # building optimizer
    optim = Optimizer(
        name=discriminator_optim_configs["optimizer"],
        model=model_D,
        lr=discriminator_optim_configs["learning_rate"],
        grad_clip=discriminator_optim_configs["grad_clip"],
        optim_args=discriminator_optim_configs["optimizer_params"])

    # build scheduler for optimizer if needed
    if discriminator_optim_configs['schedule_method'] is not None:
        if discriminator_optim_configs['schedule_method'] == "loss":
            scheduler = ReduceOnPlateauScheduler(
                optimizer=optim,
                **discriminator_optim_configs["scheduler_configs"])
        elif discriminator_optim_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(
                optimizer=optim,
                **discriminator_optim_configs['scheduler_configs'])
        elif discriminator_optim_configs["schedule_method"] == "rsqrt":
            scheduler = RsqrtScheduler(
                optimizer=optim,
                **discriminator_optim_configs["scheduler_configs"])
        else:
            WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(
                discriminator_optim_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None

    # reload latest checkpoint
    checkpoint_saver.load_latest(model=model_D,
                                 optim=optim,
                                 lr_scheduler=scheduler,
                                 collections=model_collections)

    # prepare training
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    summary_writer = SummaryWriter(log_dir=save_to + "log")

    w2p, w2vocab = load_or_extract_near_vocab(
        config_path=victim_config_path,
        model_path=victim_model_path,
        init_perturb_rate=attack_configs["init_perturb_rate"],
        save_to=os.path.join(save_to, "near_vocab"),
        save_to_full=os.path.join(save_to, "full_near_vocab"),
        top_reserve=12)

    while True:  # infinite loop over training epochs
        training_iter = training_iterator.build_generator()
        for batch in training_iter:
            uidx += 1
            if discriminator_optim_configs["schedule_method"] is not None \
                    and discriminator_optim_configs["schedule_method"] != "loss":
                scheduler.step(global_step=uidx)

            # training session
            seqs_x, seqs_y = batch  # returned tensor type of the data
            optim.zero_grad()
            try:
                x, y, flags = prepare_D_data(w2p, w2vocab, victim_config_path,
                                             seqs_x, seqs_y, use_gpu=use_gpu)
                loss = compute_D_forward(model_D,
                                         criterion=criterion,
                                         seqs_x=x,
                                         seqs_y=y,
                                         gold_flags=flags)
                optim.step()
                print("loss:", loss)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    print("WARNING: out of memory, skipping batch")
                    oom_count += 1
                    optim.zero_grad()
                else:
                    raise e

            # check for validation and save the model
            if should_trigger_by_steps(
                    uidx, eidx,
                    every_n_step=discriminator_configs["acc_valid_freq"]):
                lrate = list(optim.get_lrate())[0]
                summary_writer.add_scalar("lrate",
                                          scalar_value=lrate,
                                          global_step=uidx)
                summary_writer.add_scalar("oom_count",
                                          scalar_value=oom_count,
                                          global_step=uidx)

            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=attack_configs["save_freq"]):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                checkpoint_saver.save(global_step=uidx,
                                      model=model_D,
                                      optim=optim,
                                      lr_scheduler=scheduler,
                                      collections=model_collections)

            if should_trigger_by_steps(
                    uidx, eidx,
                    every_n_step=discriminator_configs["acc_valid_freq"]):
                # validate the average loss over samples on the validation set
                n_sents = 0.
                sum_loss = 0.0
                valid_iter = valid_iterator.build_generator()
                for batch in valid_iter:
                    _, seqs_x, seqs_y = batch
                    n_sents += len(seqs_x)
                    x, y, flags = prepare_D_data(w2p, w2vocab,
                                                 victim_config_path,
                                                 seqs_x, seqs_y,
                                                 use_gpu=use_gpu)
                    loss = compute_D_forward(model_D, criterion, x, y,
                                             gold_flags=flags, eval=True)
                    if np.isnan(loss):
                        WARN("NaN detected!")
                    sum_loss += float(loss)
                eval_loss = float(sum_loss / n_sents)
                summary_writer.add_scalar("valid",
                                          scalar_value=eval_loss,
                                          global_step=uidx)

            if should_trigger_by_steps(
                    uidx, eidx,
                    every_n_step=discriminator_configs["acc_valid_freq"]):
                # validate the accuracy of the discriminator
                acc = acc_validation(uidx=uidx,
                                     discriminator_model=model_D,
                                     valid_iterator=valid_iterator,
                                     victim_configs=victim_config_path,
                                     w2p=w2p,
                                     w2vocab=w2vocab,
                                     batch_size=attack_configs["batch_size"],
                                     use_gpu=use_gpu)
                summary_writer.add_scalar("accuracy",
                                          scalar_value=acc,
                                          global_step=uidx)
        eidx += 1
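
# `acc_validation` above is defined elsewhere in the repo. Purely as an
# illustration, a per-batch accuracy check for the discriminator could look
# like the sketch below; it assumes model_D(x, y) returns [batch, 2] class
# logits and that `flags` holds 0/1 gold labels (both are assumptions about
# interfaces not shown in this file).
import torch


def batch_discriminator_accuracy(model_D, x, y, flags):
    model_D.eval()
    with torch.no_grad():
        logits = model_D(x, y)            # assumed shape: [batch, 2]
        preds = logits.argmax(dim=-1)     # predicted class per pair
        return (preds == flags.view(-1)).float().mean().item()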
def train(flags):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """
    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu
    world_size = 1
    rank = 0
    local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging;
    # else write the training log to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    train_bitext_dataset = TextLineDataset(
        data_path=data_configs['train_data'][0],
        vocabulary=vocab_src,
        max_len=data_configs['max_len'][0],
        is_train_dataset=True)
    valid_bitext_dataset = TextLineDataset(
        data_path=data_configs['valid_data'][0],
        vocabulary=vocab_src,
        is_train_dataset=False)

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)
    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True,
        shuffle=False,
        world_size=world_size,
        rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do the steps below one after another:
    # 1. build models & criterion
    # 2. move models & criterion to gpu if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(vocab_size=vocab_src.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            **model_configs)
    INFO(nmt_model)

    # Loss function
    critic = torch.nn.CrossEntropyLoss(ignore_index=Constants.PAD)
    INFO(critic)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optimizer = torch.optim.Adam(nmt_model.parameters(),
                                 lr=optimizer_configs['learning_rate'])
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================================================================== #
    # Prepare training
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop", [False, ])[-1]

    train_loss_meter = AverageMeter()
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    valid_loss = best_valid_loss = float('inf')

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:
        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        if rank == 0:
            training_progress_bar = tqdm(
                desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
                total=len(training_iterator),
                unit="sents")
        else:
            training_progress_bar = None

        for batch in training_iter:
            seqs_x = batch
            batch_size = len(seqs_x)
            cum_n_words = 0.0
            train_loss = 0.0
            try:
                # Prepare data
                grad_denom += batch_size
                x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU)

                nmt_model.train()
                critic.train()
                # was critic.zero_grad(): the criterion has no parameters,
                # so model gradients were never cleared
                optimizer.zero_grad()
                with torch.enable_grad():
                    # feed tokens 0..n-1 and predict tokens 1..n
                    # (slice the time dimension, not the batch dimension)
                    logits = nmt_model(x[:, :-1])
                    logits = logits.view(-1, vocab_src.max_n_words)
                    trg = x[:, 1:].contiguous().view(-1)
                    loss = critic(logits, trg)
                loss.backward()
                optimizer.step()

                valid_token = (trg != Constants.PAD).long().sum().item()
                cum_n_words += valid_token
                train_loss += loss.item() * valid_token
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # After each batch, several things are done:
            # - update the progress bar and meters
            # - reset grad_denom and per-batch accumulators, update uidx
            if training_progress_bar is not None:
                training_progress_bar.update(grad_denom)
                training_progress_bar.set_description(
                    ' - (Epc {}, Upd {}) '.format(eidx, uidx))
                postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                    train_loss / cum_n_words, valid_loss, best_valid_loss)
                training_progress_bar.set_postfix_str(postfix_str)

            # 4. update meters
            train_loss_meter.update(train_loss, cum_n_words)
            sent_per_sec_meter.update(grad_denom)
            tok_per_sec_meter.update(cum_n_words)

            # 5. reset accumulated variables, update uidx
            grad_denom = 0
            uidx += 1
            cum_n_words = 0.0
            train_loss = 0.0

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):
                if summary_writer is not None:
                    summary_writer.add_scalar("Speed(sents/sec)",
                                              scalar_value=sent_per_sec_meter.ave,
                                              global_step=uidx)
                    summary_writer.add_scalar("Speed(words/sec)",
                                              scalar_value=tok_per_sec_meter.ave,
                                              global_step=uidx)
                    summary_writer.add_scalar("train_loss",
                                              scalar_value=train_loss_meter.ave,
                                              global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)
                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(
                    global_step=uidx,
                    n_epoch=eidx,
                    every_n_step=training_configs['loss_valid_freq'],
                    min_step=training_configs['bleu_valid_warmup'],
                    debug=flags.debug):
                valid_iter = valid_iterator.build_generator()
                valid_loss = 0
                total_tokens = 0
                for batch in valid_iter:
                    seq_number, seqs_x = batch
                    x = prepare_data(seqs_x, seqs_y=None, cuda=Constants.USE_GPU)
                    nmt_model.eval()
                    critic.eval()
                    with torch.no_grad():
                        logits = nmt_model(x[:, :-1])
                        logits = logits.view(-1, vocab_src.max_n_words)
                        trg = x[:, 1:].contiguous().view(-1)
                        loss = critic(logits, trg)
                    # the criterion averages over non-PAD tokens, so
                    # loss * valid_token recovers the summed token loss
                    valid_token = (trg != Constants.PAD).long().sum().item()
                    total_tokens += valid_token
                    valid_loss += loss.item() * valid_token
                    # debug path (previously live): dump per-sentence
                    # normalized losses and stop
                    # per_tok = F.cross_entropy(logits, trg, reduction='none',
                    #                           ignore_index=vocab_src.pad)
                    # print(seq_number)
                    # print(per_tok.view(batch_size, -1).sum(-1))
                    # exit(0)
                valid_loss = valid_loss / total_tokens

                model_collections.add_to_collection("history_losses", valid_loss)
                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()
                best_valid_loss = min_history_loss

                if summary_writer is not None:
                    summary_writer.add_scalar("loss", valid_loss,
                                              global_step=uidx)
                    summary_writer.add_scalar("best_loss", min_history_loss,
                                              global_step=uidx)

                # If the model reaches a new best validation loss
                if valid_loss <= best_valid_loss:
                    bad_count = 0
                    if is_early_stop is False:
                        if rank == 0:
                            # 1. save the best model
                            torch.save(nmt_model.state_dict(),
                                       best_model_prefix + ".final")
                            # 2. record the several best models
                            best_model_saver.save(global_step=uidx,
                                                  model=nmt_model,
                                                  optimizer=optimizer,
                                                  collections=model_collections)
                else:
                    bad_count += 1
                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")
                        exit(0)

                if summary_writer is not None:
                    summary_writer.add_scalar("bad_count", bad_count, uidx)

                INFO("{0} Loss: {1:.2f} patience: {2}".format(
                    uidx, valid_loss, bad_count))

            # ================================================================================== #
            # # Saving checkpoints
            # if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=flags.debug):
            #     model_collections.add_to_collection("uidx", uidx)
            #     model_collections.add_to_collection("eidx", eidx)
            #     model_collections.add_to_collection("bad_count", bad_count)
            #
            #     if not is_early_stop:
            #         if rank == 0:
            #             checkpoint_saver.save(global_step=uidx,
            #                                   model=nmt_model,
            #                                   optim=optimizer,
            #                                   collections=model_collections)

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
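
# The language-model objective above feeds tokens 0..n-1 and predicts tokens
# 1..n, with PAD ignored by the criterion. A compact sketch of that
# shift-by-one pattern (illustrative names; `model` is a stand-in for any
# module mapping [batch, seq_len] token ids to per-position logits):
import torch
import torch.nn as nn


def lm_step(model, x, pad_id):
    """x: [batch, seq_len] token ids. Returns the mean cross-entropy per token."""
    criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
    logits = model(x[:, :-1])                # predict the next token
    return criterion(logits.reshape(-1, logits.size(-1)),
                     x[:, 1:].reshape(-1))   # targets shifted left by one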
def tune(flags):
    """
    flags:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """
    # ================================================================================== #
    # Initialization for training on different devices
    # - CPU/GPU
    # - Single/Distributed
    Constants.USE_GPU = flags.use_gpu

    if flags.multi_gpu:
        dist.distributed_init(flags.shared_dir)
        world_size = dist.get_world_size()
        rank = dist.get_rank()
        local_rank = dist.get_local_rank()
    else:
        world_size = 1
        rank = 0
        local_rank = 0

    if Constants.USE_GPU:
        torch.cuda.set_device(local_rank)
        Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank)
    else:
        Constants.CURRENT_DEVICE = "cpu"

    # If not root_rank, close logging;
    # else write the training log to file.
    if rank == 0:
        write_log_to_file(
            os.path.join(flags.log_path,
                         "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    else:
        close_logging()

    # ================================================================================== #
    # Parsing configuration files
    # - Load default settings
    # - Load pre-defined settings
    # - Load user-defined settings
    configs = prepare_configs(flags.config_path, flags.predefined_config)

    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    INFO(pretty_configs(configs))

    Constants.SEED = training_configs['seed']
    set_seed(Constants.SEED)

    timer = Timer()

    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()

    # Generate target dictionary
    vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0])
    vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1])

    Constants.EOS = vocab_src.eos
    Constants.PAD = vocab_src.pad
    Constants.BOS = vocab_src.bos

    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0],
                        is_train_dataset=True),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][1],
                        is_train_dataset=True))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=training_configs["batch_size"],
        use_bucket=training_configs['use_bucket'],
        buffer_size=training_configs['buffer_size'],
        batching_func=training_configs['batching_key'],
        world_size=world_size,
        rank=rank)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We would do the steps below one after another:
    # 1. build models & criterion
    # 2. move models & criterion to gpu if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 0. Initial
    lrate = optimizer_configs['learning_rate']
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(
            os.path.join(flags.saveto, flags.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_prefix = os.path.join(
        flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX)
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            padding_idx=vocab_src.pad,
                            vocab_src=vocab_src,
                            vocab_tgt=vocab_tgt,
                            **model_configs)
    INFO(nmt_model)

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'],
                          padding_idx=vocab_tgt.pad)
    INFO(critic)

    # 2. Move to GPU
    if Constants.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model,
                          flags.pretrain_path,
                          exclude_prefix=flags.pretrain_exclude_prefix,
                          device=Constants.CURRENT_DEVICE)

    # freeze parameters
    froze_params(nmt_model, flags.froze_config)

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 4. Build optimizer
    INFO('Building Optimizer...')
    if not flags.multi_gpu:
        optim = Optimizer(name=optimizer_configs['optimizer'],
                          model=nmt_model,
                          lr=lrate,
                          grad_clip=optimizer_configs['grad_clip'],
                          optim_args=optimizer_configs['optimizer_params'],
                          update_cycle=training_configs['update_cycle'])
    else:
        optim = dist.DistributedOptimizer(
            name=optimizer_configs['optimizer'],
            model=nmt_model,
            lr=lrate,
            grad_clip=optimizer_configs['grad_clip'],
            optim_args=optimizer_configs['optimizer_params'],
            device_id=local_rank)

    # 5. Build scheduler for optimizer if needed
    scheduler = build_scheduler(
        schedule_method=optimizer_configs['schedule_method'],
        optimizer=optim,
        scheduler_configs=optimizer_configs['scheduler_configs'])

    # 6. Build moving average
    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(
            moving_average_method=training_configs['moving_average_method'],
            named_params=nmt_model.named_parameters(),
            alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from the latest checkpoint
    if flags.reload:
        checkpoint_saver.load_latest(model=nmt_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma,
                                     device=Constants.CURRENT_DEVICE)

    # broadcast parameters and optimizer states
    if world_size > 1:
        INFO("Broadcasting model parameters...")
        dist.broadcast_parameters(params=nmt_model.state_dict())
        INFO("Broadcasting optimizer states...")
        dist.broadcast_optimizer_state(optimizer=optim.optim)
        INFO('Done.')

    # ================================================================================== #
    # Prepare training
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [1])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    is_early_stop = model_collections.get_collection("is_early_stop", [False, ])[-1]

    train_loss_meter = AverageMeter()
    sent_per_sec_meter = TimeMeter()
    tok_per_sec_meter = TimeMeter()

    update_cycle = training_configs['update_cycle']
    grad_denom = 0
    train_loss = 0.0
    cum_n_words = 0
    valid_loss = best_valid_loss = float('inf')

    if rank == 0:
        summary_writer = SummaryWriter(log_dir=flags.log_path)
    else:
        summary_writer = None

    sent_per_sec_meter.start()
    tok_per_sec_meter.start()

    INFO('Begin training...')

    while True:
        if summary_writer is not None:
            summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        if rank == 0:
            training_progress_bar = tqdm(
                desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
                total=len(training_iterator),
                unit="sents")
        else:
            training_progress_bar = None

        # INFO(Constants.USE_BT)
        for batch in training_iter:
            seqs_x, seqs_y = batch
            batch_size = len(seqs_x)
            cum_n_words += sum(len(s) for s in seqs_y)

            try:
                # Prepare data
                x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU)
                loss = compute_forward(
                    model=nmt_model,
                    critic=critic,
                    seqs_x=x,
                    seqs_y=y,
                    eval=False,
                    normalization=1.0,
                    norm_by_words=training_configs["norm_by_words"])
                update_cycle -= 1
                grad_denom += batch_size
                train_loss += loss
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                else:
                    raise e

            # When update_cycle becomes 0, it means the end of one big batch.
            # Several things will be done:
            # - update parameters
            # - reset update_cycle and grad_denom, update uidx
            # - learning rate scheduling
            # - update moving average
            if update_cycle == 0:
                # 0. reduce variables
                if world_size > 1:
                    grad_denom = dist.all_reduce_py(grad_denom)
                    train_loss = dist.all_reduce_py(train_loss)
                    cum_n_words = dist.all_reduce_py(cum_n_words)

                # 1. update parameters
                optim.step(denom=grad_denom)
                optim.zero_grad()

                if training_progress_bar is not None:
                    training_progress_bar.update(grad_denom)
                    training_progress_bar.set_description(
                        ' - (Epc {}, Upd {}) '.format(eidx, uidx))
                    postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format(
                        train_loss, valid_loss, best_valid_loss)
                    training_progress_bar.set_postfix_str(postfix_str)

                # 2. learning rate scheduling
                if scheduler is not None and optimizer_configs["schedule_method"] != "loss":
                    scheduler.step(global_step=uidx)

                # 3. update moving average
                if ma is not None and eidx >= training_configs['moving_average_start_epoch']:
                    ma.step()

                # 4. update meters
                train_loss_meter.update(train_loss, grad_denom)
                sent_per_sec_meter.update(grad_denom)
                tok_per_sec_meter.update(cum_n_words)

                # 5. reset accumulated variables, update uidx
                update_cycle = training_configs['update_cycle']
                grad_denom = 0
                uidx += 1
                cum_n_words = 0.0
                train_loss = 0.0
            else:
                continue

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(
                    uidx, eidx, every_n_step=training_configs['disp_freq']):
                lrate = list(optim.get_lrate())[0]
                if summary_writer is not None:
                    summary_writer.add_scalar("Speed(sents/sec)",
                                              scalar_value=sent_per_sec_meter.ave,
                                              global_step=uidx)
                    summary_writer.add_scalar("Speed(words/sec)",
                                              scalar_value=tok_per_sec_meter.ave,
                                              global_step=uidx)
                    summary_writer.add_scalar("train_loss",
                                              scalar_value=train_loss_meter.ave,
                                              global_step=uidx)
                    summary_writer.add_scalar("lrate",
                                              scalar_value=lrate,
                                              global_step=uidx)
                    summary_writer.add_scalar("oom_count",
                                              scalar_value=oom_count,
                                              global_step=uidx)
                # Reset Meters
                sent_per_sec_meter.reset()
                tok_per_sec_meter.reset()
                train_loss_meter.reset()

            # ================================================================================== #
            # Saving checkpoints
            # if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=flags.debug):
            #     model_collections.add_to_collection("uidx", uidx)
            #     model_collections.add_to_collection("eidx", eidx)
            #     model_collections.add_to_collection("bad_count", bad_count)
            #
            #     if not is_early_stop:
            #         if rank == 0:
            #             checkpoint_saver.save(global_step=uidx,
            #                                   model=nmt_model,
            #                                   optim=optim,
            #                                   lr_scheduler=scheduler,
            #                                   collections=model_collections,
            #                                   ma=ma)
            torch.save(nmt_model.state_dict(), best_model_prefix + ".final")

        if training_progress_bar is not None:
            training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
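
# `update_cycle` above implements gradient accumulation: gradients from
# several small batches are summed and a single optimizer step is taken,
# normalized by the accumulated batch size (`grad_denom`). A stripped-down
# version of the same control flow (illustrative names; `compute_loss` is an
# assumed helper returning a scalar loss; here the normalization is applied
# to the gradients directly, which the codebase's Optimizer.step(denom=...)
# presumably handles internally):
def accumulated_training(batches, model, optimizer, update_cycle=4):
    cycle, denom = update_cycle, 0
    optimizer.zero_grad()
    for batch in batches:
        loss = model.compute_loss(batch)  # assumed helper: scalar loss
        loss.backward()                   # gradients accumulate across batches
        cycle -= 1
        denom += len(batch)
        if cycle == 0:
            for p in model.parameters():  # normalize the summed gradients
                if p.grad is not None:
                    p.grad.div_(denom)
            optimizer.step()
            optimizer.zero_grad()
            cycle, denom = update_cycle, 0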
def translate(FLAGS): GlobalNames.USE_GPU = FLAGS.use_gpu if FLAGS.multi_gpu: if hvd is None or distributed is None: ERROR("Distributed training is disabled. Please check the installation of Horovod.") hvd.init() world_size = hvd.size() rank = hvd.rank() if GlobalNames.USE_GPU: torch.cuda.set_device(hvd.local_rank()) else: world_size = 1 rank = 0 if rank != 0: close_logging() config_path = os.path.abspath(FLAGS.config_path) with open(config_path.strip()) as f: configs = yaml.load(f, Loader=yaml.FullLoader) data_configs = configs['data_configs'] model_configs = configs['model_configs'] timer = Timer() # ================================================================================== # # Load Data INFO('Loading data...') timer.tic() # Generate target dictionary vocab_src = Vocabulary(**data_configs["vocabularies"][0]) vocab_tgt = Vocabulary(**data_configs["vocabularies"][1]) valid_dataset = TextLineDataset(data_path=FLAGS.source_path, vocabulary=vocab_src) valid_iterator = DataIterator(dataset=valid_dataset, batch_size=FLAGS.batch_size, use_bucket=True, buffer_size=100000, numbering=True, world_size=world_size, rank=rank ) INFO('Done. Elapsed time {0}'.format(timer.toc())) # ================================================================================== # # Build Model & Sampler & Validation INFO('Building model...') timer.tic() nmt_model = build_model(n_src_vocab=vocab_src.max_n_words, n_tgt_vocab=vocab_tgt.max_n_words, **model_configs) nmt_model.eval() INFO('Done. Elapsed time {0}'.format(timer.toc())) INFO('Reloading model parameters...') timer.tic() params = load_model_parameters(FLAGS.model_path, map_location="cpu") nmt_model.load_state_dict(params, strict=False) if GlobalNames.USE_GPU: nmt_model.cuda() INFO('Done. Elapsed time {0}'.format(timer.toc())) INFO('Begin...') result_numbers = [] result = [] n_words = 0 timer.tic() if rank == 0: infer_progress_bar = tqdm(total=len(valid_iterator), desc=' - (Infer) ', unit="sents") else: infer_progress_bar = None valid_iter = valid_iterator.build_generator() for batch in valid_iter: numbers, seqs_x = batch batch_size_t = len(seqs_x) x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU) with torch.no_grad(): word_ids = beam_search(nmt_model=nmt_model, beam_size=FLAGS.beam_size, max_steps=FLAGS.max_steps, src_seqs=x, alpha=FLAGS.alpha) word_ids = word_ids.cpu().numpy().tolist() # Append result for sent_t in word_ids: sent_t = [[wid for wid in line if wid != PAD] for line in sent_t] result.append(sent_t) n_words += len(sent_t[0]) result_numbers += numbers if rank == 0: infer_progress_bar.update(batch_size_t * world_size) if rank == 0: infer_progress_bar.close() if FLAGS.multi_gpu: n_words = sum(distributed.all_gather(n_words)) INFO('Done.
Speed: {0:.2f} words/sec'.format(n_words / (timer.toc(return_seconds=True)))) if FLAGS.multi_gpu: result_gathered = distributed.all_gather_with_shared_fs(result) result = [] for lines in itertools.zip_longest(*result_gathered, fillvalue=None): for line in lines: if line is not None: result.append(line) result_numbers_gathered = distributed.all_gather_with_shared_fs(result_numbers) result_numbers = [] for numbers in itertools.zip_longest(*result_numbers_gathered, fillvalue=None): for num in numbers: if num is not None: result_numbers.append(num) if rank == 0: translation = [] for sent in result: samples = [] for trans in sent: sample = [] for w in trans: if w == vocab_tgt.EOS: break sample.append(vocab_tgt.id2token(w)) samples.append(vocab_tgt.tokenizer.detokenize(sample)) translation.append(samples) # resume the ordering origin_order = np.argsort(result_numbers).tolist() translation = [translation[ii] for ii in origin_order] keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(FLAGS.beam_size, FLAGS.keep_n) outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)] with batch_open(outputs, 'w') as handles: for trans in translation: for i in range(keep_n): if i < len(trans): handles[i].write('%s\n' % trans[i]) else: handles[i].write('%s\n' % 'eos')
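# ================================================================================== #
# Illustrative sketch of the numbering=True / np.argsort trick used at the end of
# translate above: bucketed decoding permutes sentences, and argsort over the
# emitted line numbers restores corpus order. Toy data only.
import numpy as np

hyps = ["third", "first", "second"]  # hypotheses in (bucketed) decode order
numbers = [2, 0, 1]                  # original line numbers from the iterator
origin_order = np.argsort(numbers).tolist()
restored = [hyps[ii] for ii in origin_order]
assert restored == ["first", "second", "third"]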
def train(FLAGS): """ FLAGS: saveto: str reload: store_true config_path: str pretrain_path: str, default="" model_name: str log_path: str """ # ================================================================================== # # Initialization for training on different devices # - CPU/GPU # - Single/Distributed GlobalNames.USE_GPU = FLAGS.use_gpu if FLAGS.multi_gpu: if hvd is None or distributed is None: ERROR("Distributed training is disabled. Please check the installation of Horovod.") hvd.init() world_size = hvd.size() rank = hvd.rank() local_rank = hvd.local_rank() else: world_size = 1 rank = 0 local_rank = 0 if GlobalNames.USE_GPU: torch.cuda.set_device(local_rank) CURRENT_DEVICE = "cuda:{0}".format(local_rank) else: CURRENT_DEVICE = "cpu" # If not root_rank, close logging if rank != 0: close_logging() # write log of training to file. if rank == 0: write_log_to_file(os.path.join(FLAGS.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S"))) # ================================================================================== # # Parsing configuration files config_path = os.path.abspath(FLAGS.config_path) with open(config_path.strip()) as f: configs = yaml.load(f, Loader=yaml.FullLoader) INFO(pretty_configs(configs)) # Add default configs configs = default_baseline_configs(configs) data_configs = configs['data_configs'] model_configs = configs['model_configs'] optimizer_configs = configs['optimizer_configs'] training_configs = configs['training_configs'] GlobalNames.SEED = training_configs['seed'] set_seed(GlobalNames.SEED) timer = Timer() # ================================================================================== # # Load Data INFO('Loading data...') timer.tic() # Generate target dictionary vocab_src = Vocabulary(**data_configs["vocabularies"][0]) vocab_tgt = Vocabulary(**data_configs["vocabularies"][1]) actual_buffer_size = training_configs["buffer_size"] * max(1, training_configs["update_cycle"]) train_bitext_dataset = ZipDataset( TextLineDataset(data_path=data_configs['train_data'][0], vocabulary=vocab_src, max_len=data_configs['max_len'][0], ), TextLineDataset(data_path=data_configs['train_data'][1], vocabulary=vocab_tgt, max_len=data_configs['max_len'][1], ) ) valid_bitext_dataset = ZipDataset( TextLineDataset(data_path=data_configs['valid_data'][0], vocabulary=vocab_src, ), TextLineDataset(data_path=data_configs['valid_data'][1], vocabulary=vocab_tgt, ) ) training_iterator = DataIterator(dataset=train_bitext_dataset, batch_size=training_configs["batch_size"], use_bucket=training_configs['use_bucket'], buffer_size=actual_buffer_size, batching_func=training_configs['batching_key'], world_size=world_size, rank=rank) valid_iterator = DataIterator(dataset=valid_bitext_dataset, batch_size=training_configs['valid_batch_size'], use_bucket=True, buffer_size=100000, numbering=True, world_size=world_size, rank=rank) bleu_scorer = SacreBLEUScorer(reference_path=data_configs["bleu_valid_reference"], num_refs=data_configs["num_refs"], lang_pair=data_configs["lang_pair"], sacrebleu_args=training_configs["bleu_valid_configs"]['sacrebleu_args'], postprocess=training_configs["bleu_valid_configs"]['postprocess'] ) INFO('Done. Elapsed time {0}'.format(timer.toc())) lrate = optimizer_configs['learning_rate'] is_early_stop = False # ================================ Begin ======================================== # # Build Model & Optimizer # We would do steps below one after another # 1. build models & criterion # 2. move models & criterion to gpu if needed # 3. load pre-trained model if needed # 4. build optimizer # 5.
build learning rate scheduler if needed # 6. load checkpoints if needed # 0. Initial model_collections = Collections() best_model_prefix = os.path.join(FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX) checkpoint_saver = Saver(save_prefix="{0}.ckpt".format(os.path.join(FLAGS.saveto, FLAGS.model_name)), num_max_keeping=training_configs['num_kept_checkpoints'] ) best_model_saver = Saver(save_prefix=best_model_prefix, num_max_keeping=training_configs['num_kept_best_model']) INFO('Building model...') timer.tic() nmt_model = build_model(n_src_vocab=vocab_src.max_n_words, n_tgt_vocab=vocab_tgt.max_n_words, **model_configs) INFO(nmt_model) critic = NMTCriterion(label_smoothing=model_configs['label_smoothing']) INFO(critic) INFO('Done. Elapsed time {0}'.format(timer.toc())) # 2. Move to GPU if GlobalNames.USE_GPU: nmt_model = nmt_model.cuda() critic = critic.cuda() # 3. Load pretrained model if needed load_pretrained_model(nmt_model, FLAGS.pretrain_path, exclude_prefix=None, device=CURRENT_DEVICE) # 4. Build optimizer INFO('Building Optimizer...') optim = Optimizer(name=optimizer_configs['optimizer'], model=nmt_model, lr=lrate, grad_clip=optimizer_configs['grad_clip'], optim_args=optimizer_configs['optimizer_params'], distributed=True if world_size > 1 else False, update_cycle=training_configs['update_cycle'] ) # 5. Build scheduler for optimizer if needed if optimizer_configs['schedule_method'] is not None: if optimizer_configs['schedule_method'] == "loss": scheduler = ReduceOnPlateauScheduler(optimizer=optim, **optimizer_configs["scheduler_configs"] ) elif optimizer_configs['schedule_method'] == "noam": scheduler = NoamScheduler(optimizer=optim, **optimizer_configs['scheduler_configs']) else: WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(optimizer_configs['schedule_method'])) scheduler = None else: scheduler = None # 6. build moving average if training_configs['moving_average_method'] is not None: ma = MovingAverage(moving_average_method=training_configs['moving_average_method'], named_params=nmt_model.named_parameters(), alpha=training_configs['moving_average_alpha']) else: ma = None INFO('Done. 
Elapsed time {0}'.format(timer.toc())) # Reload from latest checkpoint if FLAGS.reload: checkpoint_saver.load_latest(model=nmt_model, optim=optim, lr_scheduler=scheduler, collections=model_collections, ma=ma) # broadcast parameters and optimizer states if world_size > 1: hvd.broadcast_parameters(params=nmt_model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer=optim.optim, root_rank=0) # ================================================================================== # # Prepare training eidx = model_collections.get_collection("eidx", [0])[-1] uidx = model_collections.get_collection("uidx", [1])[-1] bad_count = model_collections.get_collection("bad_count", [0])[-1] oom_count = model_collections.get_collection("oom_count", [0])[-1] cum_n_samples = 0 cum_n_words = 0 best_valid_loss = 1.0 * 1e10 # Max Float update_cycle = training_configs['update_cycle'] grad_denom = 0 if rank == 0: summary_writer = SummaryWriter(log_dir=FLAGS.log_path) else: summary_writer = None # Timer for computing speed timer_for_speed = Timer() timer_for_speed.tic() INFO('Begin training...') while True: if summary_writer is not None: summary_writer.add_scalar("Epoch", (eidx + 1), uidx) # Build iterator and progress bar training_iter = training_iterator.build_generator() if rank == 0: training_progress_bar = tqdm(desc=' - (Epoch %d) ' % eidx, total=len(training_iterator), unit="sents" ) else: training_progress_bar = None for batch in training_iter: seqs_x, seqs_y = batch batch_size = len(seqs_x) cum_n_samples += batch_size cum_n_words += sum(len(s) for s in seqs_y) try: # Prepare data x, y = prepare_data(seqs_x, seqs_y, cuda=GlobalNames.USE_GPU) loss = compute_forward(model=nmt_model, critic=critic, seqs_x=x, seqs_y=y, eval=False, normalization=1.0, norm_by_words=training_configs["norm_by_words"]) update_cycle -= 1 grad_denom += batch_size except RuntimeError as e: if 'out of memory' in str(e): print('| WARNING: ran out of memory, skipping batch') oom_count += 1 else: raise e # When update_cycle becomes 0, it means end of one batch. 
Several things will be done: # - update parameters # - reset update_cycle and grad_denom # - update uidx # - update moving average if update_cycle == 0: if world_size > 1: grad_denom = distributed.all_reduce(grad_denom) optim.step(denom=grad_denom) optim.zero_grad() if training_progress_bar is not None: training_progress_bar.update(grad_denom) update_cycle = training_configs['update_cycle'] grad_denom = 0 uidx += 1 if scheduler is None: pass elif optimizer_configs["schedule_method"] == "loss": scheduler.step(metric=best_valid_loss) else: scheduler.step(global_step=uidx) if ma is not None and eidx >= training_configs['moving_average_start_epoch']: ma.step() else: continue # ================================================================================== # # Display some information if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['disp_freq']): if world_size > 1: cum_n_words = sum(distributed.all_gather(cum_n_words)) cum_n_samples = sum(distributed.all_gather(cum_n_samples)) # words per second and sents per second words_per_sec = cum_n_words / (timer.toc(return_seconds=True)) sents_per_sec = cum_n_samples / (timer.toc(return_seconds=True)) lrate = list(optim.get_lrate())[0] if summary_writer is not None: summary_writer.add_scalar("Speed(words/sec)", scalar_value=words_per_sec, global_step=uidx) summary_writer.add_scalar("Speed(sents/sec)", scalar_value=sents_per_sec, global_step=uidx) summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx) summary_writer.add_scalar("oom_count", scalar_value=oom_count, global_step=uidx) # Reset timer timer.tic() cum_n_words = 0 cum_n_samples = 0 # ================================================================================== # # Loss Validation & Learning rate annealing if should_trigger_by_steps(global_step=uidx, n_epoch=eidx, every_n_step=training_configs['loss_valid_freq'], debug=FLAGS.debug): valid_loss = loss_validation(model=nmt_model, critic=critic, valid_iterator=valid_iterator, rank=rank, world_size=world_size ) model_collections.add_to_collection("history_losses", valid_loss) min_history_loss = np.array(model_collections.get_collection("history_losses")).min() best_valid_loss = min_history_loss if summary_writer is not None: summary_writer.add_scalar("loss", valid_loss, global_step=uidx) summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx) # ================================================================================== # # BLEU Validation & Early Stop if should_trigger_by_steps(global_step=uidx, n_epoch=eidx, every_n_step=training_configs['bleu_valid_freq'], min_step=training_configs['bleu_valid_warmup'], debug=FLAGS.debug): valid_bleu = bleu_validation(uidx=uidx, valid_iterator=valid_iterator, batch_size=training_configs["bleu_valid_batch_size"], model=nmt_model, bleu_scorer=bleu_scorer, vocab_tgt=vocab_tgt, valid_dir=FLAGS.valid_path, max_steps=training_configs["bleu_valid_configs"]["max_steps"], beam_size=training_configs["bleu_valid_configs"]["beam_size"], alpha=training_configs["bleu_valid_configs"]["alpha"], world_size=world_size, rank=rank, ) model_collections.add_to_collection(key="history_bleus", value=valid_bleu) best_valid_bleu = float(np.array(model_collections.get_collection("history_bleus")).max()) if summary_writer is not None: summary_writer.add_scalar("bleu", valid_bleu, uidx) summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx) # If model get new best valid bleu score if valid_bleu >= best_valid_bleu: bad_count = 0 if is_early_stop is False: if rank ==
0: # 1. save the best model torch.save(nmt_model.state_dict(), best_model_prefix + ".final") # 2. record the several best models best_model_saver.save(global_step=uidx, model=nmt_model, ma=ma) else: bad_count += 1 # At least one epoch should be traversed if bad_count >= training_configs['early_stop_patience'] and eidx > 0: is_early_stop = True WARN("Early Stop!") if summary_writer is not None: summary_writer.add_scalar("bad_count", bad_count, uidx) INFO("{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4}".format( uidx, valid_loss, valid_bleu, lrate, bad_count )) # ================================================================================== # # Saving checkpoints if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=FLAGS.debug): model_collections.add_to_collection("uidx", uidx) model_collections.add_to_collection("eidx", eidx) model_collections.add_to_collection("bad_count", bad_count) if not is_early_stop: if rank == 0: checkpoint_saver.save(global_step=uidx, model=nmt_model, optim=optim, lr_scheduler=scheduler, collections=model_collections, ma=ma) if training_progress_bar is not None: training_progress_bar.close() eidx += 1 if eidx > training_configs["max_epochs"]: break
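# ================================================================================== #
# Illustrative sketch of the bad_count / patience logic above: bad_count is the
# number of consecutive BLEU validations since the last new best, and training
# stops once it reaches early_stop_patience. Hypothetical helper.
def should_early_stop(history_bleus, patience):
    best = max(history_bleus)
    bad_count = 0
    for bleu in reversed(history_bleus):  # walk back from the latest validation
        if bleu >= best:
            break
        bad_count += 1
    return bad_count >= patience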
def train(FLAGS): """ FLAGS: saveto: str reload: store_true config_path: str pretrain_path: str, default="" model_name: str log_path: str """ # write log of training to file. write_log_to_file(os.path.join(FLAGS.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S"))) GlobalNames.USE_GPU = FLAGS.use_gpu if GlobalNames.USE_GPU: CURRENT_DEVICE = "cuda:0" else: CURRENT_DEVICE = "cpu" config_path = os.path.abspath(FLAGS.config_path) with open(config_path.strip()) as f: configs = yaml.load(f, Loader=yaml.FullLoader) INFO(pretty_configs(configs)) # Add default configs configs = default_configs(configs) data_configs = configs['data_configs'] model_configs = configs['model_configs'] optimizer_configs = configs['optimizer_configs'] training_configs = configs['training_configs'] GlobalNames.SEED = training_configs['seed'] set_seed(GlobalNames.SEED) best_model_prefix = os.path.join(FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX) timer = Timer() # ================================================================================== # # Load Data INFO('Loading data...') timer.tic() # Generate target dictionary vocab_src = Vocabulary(**data_configs["vocabularies"][0]) vocab_tgt = Vocabulary(**data_configs["vocabularies"][1]) train_batch_size = training_configs["batch_size"] * max(1, training_configs["update_cycle"]) train_buffer_size = training_configs["buffer_size"] * max(1, training_configs["update_cycle"]) train_bitext_dataset = ZipDataset( TextLineDataset(data_path=data_configs['train_data'][0], vocabulary=vocab_src, max_len=data_configs['max_len'][0], ), TextLineDataset(data_path=data_configs['train_data'][1], vocabulary=vocab_tgt, max_len=data_configs['max_len'][1], ), shuffle=training_configs['shuffle'] ) valid_bitext_dataset = ZipDataset( TextLineDataset(data_path=data_configs['valid_data'][0], vocabulary=vocab_src, ), TextLineDataset(data_path=data_configs['valid_data'][1], vocabulary=vocab_tgt, ) ) training_iterator = DataIterator(dataset=train_bitext_dataset, batch_size=train_batch_size, use_bucket=training_configs['use_bucket'], buffer_size=train_buffer_size, batching_func=training_configs['batching_key']) valid_iterator = DataIterator(dataset=valid_bitext_dataset, batch_size=training_configs['valid_batch_size'], use_bucket=True, buffer_size=100000, numbering=True) bleu_scorer = SacreBLEUScorer(reference_path=data_configs["bleu_valid_reference"], num_refs=data_configs["num_refs"], lang_pair=data_configs["lang_pair"], sacrebleu_args=training_configs["bleu_valid_configs"]['sacrebleu_args'], postprocess=training_configs["bleu_valid_configs"]['postprocess'] ) INFO('Done. Elapsed time {0}'.format(timer.toc())) lrate = optimizer_configs['learning_rate'] is_early_stop = False # ================================ Begin ======================================== # # Build Model & Optimizer # We would do steps below one after another # 1. build models & criterion # 2. move models & criterion to gpu if needed # 3. load pre-trained model if needed # 4. build optimizer # 5. build learning rate scheduler if needed # 6. load checkpoints if needed # 0. Initial model_collections = Collections() checkpoint_saver = Saver(save_prefix="{0}.ckpt".format(os.path.join(FLAGS.saveto, FLAGS.model_name)), num_max_keeping=training_configs['num_kept_checkpoints'] ) best_model_saver = Saver(save_prefix=best_model_prefix, num_max_keeping=training_configs['num_kept_best_model']) # 1.
Build Model & Criterion INFO('Building model...') timer.tic() nmt_model = build_model(n_src_vocab=vocab_src.max_n_words, n_tgt_vocab=vocab_tgt.max_n_words, **model_configs) INFO(nmt_model) critic = NMTCriterion(label_smoothing=model_configs['label_smoothing']) INFO(critic) INFO('Done. Elapsed time {0}'.format(timer.toc())) # 2. Move to GPU if GlobalNames.USE_GPU: nmt_model = nmt_model.cuda() critic = critic.cuda() # 3. Load pretrained model if needed load_pretrained_model(nmt_model, FLAGS.pretrain_path, exclude_prefix=None, device=CURRENT_DEVICE) # 4. Build optimizer INFO('Building Optimizer...') optim = Optimizer(name=optimizer_configs['optimizer'], model=nmt_model, lr=lrate, grad_clip=optimizer_configs['grad_clip'], optim_args=optimizer_configs['optimizer_params'] ) # 5. Build scheduler for optimizer if needed if optimizer_configs['schedule_method'] is not None: if optimizer_configs['schedule_method'] == "loss": scheduler = ReduceOnPlateauScheduler(optimizer=optim, **optimizer_configs["scheduler_configs"] ) elif optimizer_configs['schedule_method'] == "noam": scheduler = NoamScheduler(optimizer=optim, **optimizer_configs['scheduler_configs']) else: WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(optimizer_configs['schedule_method'])) scheduler = None else: scheduler = None # 6. build EMA if training_configs['ema_decay'] > 0.0: ema = ExponentialMovingAverage(named_params=nmt_model.named_parameters(), decay=training_configs['ema_decay']) else: ema = None INFO('Done. Elapsed time {0}'.format(timer.toc())) # Reload from latest checkpoint if FLAGS.reload: checkpoint_saver.load_latest(model=nmt_model, optim=optim, lr_scheduler=scheduler, collections=model_collections) # ================================================================================== # # Prepare training eidx = model_collections.get_collection("eidx", [0])[-1] uidx = model_collections.get_collection("uidx", [0])[-1] bad_count = model_collections.get_collection("bad_count", [0])[-1] summary_writer = SummaryWriter(log_dir=FLAGS.log_path) cum_samples = 0 cum_words = 0 best_valid_loss = 1.0 * 1e10 # Max Float saving_files = [] # Timer for computing speed timer_for_speed = Timer() timer_for_speed.tic() INFO('Begin training...') while True: summary_writer.add_scalar("Epoch", (eidx + 1), uidx) # Build iterator and progress bar training_iter = training_iterator.build_generator() training_progress_bar = tqdm(desc=' - (Epoch %d) ' % eidx, total=len(training_iterator), unit="sents" ) for batch in training_iter: uidx += 1 if scheduler is None: pass elif optimizer_configs["schedule_method"] == "loss": scheduler.step(metric=best_valid_loss) else: scheduler.step(global_step=uidx) seqs_x, seqs_y = batch n_samples_t = len(seqs_x) n_words_t = sum(len(s) for s in seqs_y) cum_samples += n_samples_t cum_words += n_words_t training_progress_bar.update(n_samples_t) optim.zero_grad() # Prepare data for seqs_x_t, seqs_y_t in split_shard(seqs_x, seqs_y, split_size=training_configs['update_cycle']): x, y = prepare_data(seqs_x_t, seqs_y_t, cuda=GlobalNames.USE_GPU) loss = compute_forward(model=nmt_model, critic=critic, seqs_x=x, seqs_y=y, eval=False, normalization=n_samples_t, norm_by_words=training_configs["norm_by_words"]) optim.step() if ema is not None: ema.step() # ================================================================================== # # Display some information if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['disp_freq']): # words per second and sents per second words_per_sec = cum_words / 
(timer.toc(return_seconds=True)) sents_per_sec = cum_samples / (timer.toc(return_seconds=True)) lrate = list(optim.get_lrate())[0] summary_writer.add_scalar("Speed(words/sec)", scalar_value=words_per_sec, global_step=uidx) summary_writer.add_scalar("Speed(sents/sec)", scalar_value=sents_per_sec, global_step=uidx) summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx) # Reset timer timer.tic() cum_words = 0 cum_samples = 0 # ================================================================================== # # Saving checkpoints if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'], debug=FLAGS.debug): model_collections.add_to_collection("uidx", uidx) model_collections.add_to_collection("eidx", eidx) model_collections.add_to_collection("bad_count", bad_count) if not is_early_stop: checkpoint_saver.save(global_step=uidx, model=nmt_model, optim=optim, lr_scheduler=scheduler, collections=model_collections, ema=ema) # ================================================================================== # # Loss Validation & Learning rate annealing if should_trigger_by_steps(global_step=uidx, n_epoch=eidx, every_n_step=training_configs['loss_valid_freq'], debug=FLAGS.debug): if ema is not None: origin_state_dict = deepcopy(nmt_model.state_dict()) nmt_model.load_state_dict(ema.state_dict(), strict=False) valid_loss = loss_validation(model=nmt_model, critic=critic, valid_iterator=valid_iterator, ) model_collections.add_to_collection("history_losses", valid_loss) min_history_loss = np.array(model_collections.get_collection("history_losses")).min() summary_writer.add_scalar("loss", valid_loss, global_step=uidx) summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx) best_valid_loss = min_history_loss if ema is not None: nmt_model.load_state_dict(origin_state_dict) del origin_state_dict # ================================================================================== # # BLEU Validation & Early Stop if should_trigger_by_steps(global_step=uidx, n_epoch=eidx, every_n_step=training_configs['bleu_valid_freq'], min_step=training_configs['bleu_valid_warmup'], debug=FLAGS.debug): if ema is not None: origin_state_dict = deepcopy(nmt_model.state_dict()) nmt_model.load_state_dict(ema.state_dict(), strict=False) valid_bleu = bleu_validation(uidx=uidx, valid_iterator=valid_iterator, batch_size=training_configs["bleu_valid_batch_size"], model=nmt_model, bleu_scorer=bleu_scorer, vocab_tgt=vocab_tgt, valid_dir=FLAGS.valid_path, max_steps=training_configs["bleu_valid_configs"]["max_steps"], beam_size=training_configs["bleu_valid_configs"]["beam_size"], alpha=training_configs["bleu_valid_configs"]["alpha"] ) model_collections.add_to_collection(key="history_bleus", value=valid_bleu) best_valid_bleu = float(np.array(model_collections.get_collection("history_bleus")).max()) summary_writer.add_scalar("bleu", valid_bleu, uidx) summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx) # If model get new best valid bleu score if valid_bleu >= best_valid_bleu: bad_count = 0 if is_early_stop is False: # 1. save the best model torch.save(nmt_model.state_dict(), best_model_prefix + ".final") # 2.
record the several best models best_model_saver.save(global_step=uidx, model=nmt_model) else: bad_count += 1 # At least one epoch should be traversed if bad_count >= training_configs['early_stop_patience'] and eidx > 0: is_early_stop = True WARN("Early Stop!") summary_writer.add_scalar("bad_count", bad_count, uidx) if ema is not None: nmt_model.load_state_dict(origin_state_dict) del origin_state_dict INFO("{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4}".format( uidx, valid_loss, valid_bleu, lrate, bad_count )) training_progress_bar.close() eidx += 1 if eidx > training_configs["max_epochs"]: break
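# ================================================================================== #
# Illustrative sketch of the EMA swap used for validation above: cache the raw
# weights, load the averaged ones, evaluate, then restore. ema.state_dict() is
# assumed to return the averaged parameters, as in the loop above.
from copy import deepcopy

import torch


def validate_with_ema(model, ema, validate_fn):
    origin_state_dict = deepcopy(model.state_dict())
    model.load_state_dict(ema.state_dict(), strict=False)  # averaged weights in
    with torch.no_grad():
        score = validate_fn(model)
    model.load_state_dict(origin_state_dict)  # raw training weights back
    return score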
def train(flags): """ flags: saveto: str reload: store_true config_path: str pretrain_path: str, default="" model_name: str log_path: str """ # ================================================================================== # # Initialization for training on different devices # - CPU/GPU # - Single/Distributed Constants.USE_GPU = flags.use_gpu if flags.multi_gpu: dist.distributed_init(flags.shared_dir) world_size = dist.get_world_size() rank = dist.get_rank() local_rank = dist.get_local_rank() else: world_size = 1 rank = 0 local_rank = 0 if Constants.USE_GPU: torch.cuda.set_device(local_rank) Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank) else: Constants.CURRENT_DEVICE = "cpu" # If not root_rank, close logging # else write log of training to file. if rank == 0: write_log_to_file( os.path.join(flags.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S"))) else: close_logging() # ================================================================================== # # Parsing configuration files # - Load default settings # - Load pre-defined settings # - Load user-defined settings configs = prepare_configs(flags.config_path, flags.predefined_config) data_configs = configs['data_configs'] model_configs = configs['model_configs'] optimizer_configs = configs['optimizer_configs'] training_configs = configs['training_configs'] INFO(pretty_configs(configs)) # use odc if training_configs['use_odc'] is True: ave_best_k = check_odc_config(training_configs) else: ave_best_k = 0 Constants.SEED = training_configs['seed'] set_seed(Constants.SEED) timer = Timer() # ================================================================================== # # Load Data INFO('Loading data...') timer.tic() # Generate target dictionary vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0]) vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1]) Constants.EOS = vocab_src.eos Constants.PAD = vocab_src.pad Constants.BOS = vocab_src.bos train_bitext_dataset = ZipDataset( TextLineDataset(data_path=data_configs['train_data'][0], vocabulary=vocab_src, max_len=data_configs['max_len'][0], is_train_dataset=True), TextLineDataset(data_path=data_configs['train_data'][1], vocabulary=vocab_tgt, max_len=data_configs['max_len'][1], is_train_dataset=True)) valid_bitext_dataset = ZipDataset( TextLineDataset( data_path=data_configs['valid_data'][0], vocabulary=vocab_src, is_train_dataset=False, ), TextLineDataset(data_path=data_configs['valid_data'][1], vocabulary=vocab_tgt, is_train_dataset=False)) training_iterator = DataIterator( dataset=train_bitext_dataset, batch_size=training_configs["batch_size"], use_bucket=training_configs['use_bucket'], buffer_size=training_configs['buffer_size'], batching_func=training_configs['batching_key'], world_size=world_size, rank=rank) valid_iterator = DataIterator( dataset=valid_bitext_dataset, batch_size=training_configs['valid_batch_size'], use_bucket=True, buffer_size=100000, numbering=True, world_size=world_size, rank=rank) bleu_scorer = SacreBLEUScorer( reference_path=data_configs["bleu_valid_reference"], num_refs=data_configs["num_refs"], lang_pair=data_configs["lang_pair"], sacrebleu_args=training_configs["bleu_valid_configs"] ['sacrebleu_args'], postprocess=training_configs["bleu_valid_configs"]['postprocess']) INFO('Done. Elapsed time {0}'.format(timer.toc())) # ================================ Begin ======================================== # # Build Model & Optimizer # We would do steps below on after another # 1. build models & criterion # 2. 
move models & criterion to gpu if needed # 3. load pre-trained model if needed # 4. build optimizer # 5. build learning rate scheduler if needed # 6. load checkpoints if needed # 0. Initial lrate = optimizer_configs['learning_rate'] model_collections = Collections() checkpoint_saver = Saver( save_prefix="{0}.ckpt".format( os.path.join(flags.saveto, flags.model_name)), num_max_keeping=training_configs['num_kept_checkpoints']) best_model_prefix = os.path.join( flags.saveto, flags.model_name + Constants.MY_BEST_MODEL_SUFFIX) best_k_saver = BestKSaver( save_prefix="{0}.best_k_ckpt".format( os.path.join(flags.saveto, flags.model_name)), num_max_keeping=training_configs['num_kept_best_k_checkpoints']) # 1. Build Model & Criterion INFO('Building model...') timer.tic() nmt_model = build_model(n_src_vocab=vocab_src.max_n_words, n_tgt_vocab=vocab_tgt.max_n_words, padding_idx=vocab_src.pad, vocab_src=vocab_src, **model_configs) INFO(nmt_model) # build teacher model teacher_model, teacher_model_path = get_teacher_model( training_configs, model_configs, vocab_src, vocab_tgt, flags) # build critic critic = CombinationCriterion(model_configs['loss_configs'], padding_idx=vocab_tgt.pad, teacher=teacher_model) # INFO(critic) critic.INFO() # 2. Move to GPU if Constants.USE_GPU: nmt_model = nmt_model.cuda() critic = critic.cuda() # 3. Load pretrained model if needed load_pretrained_model(nmt_model, flags.pretrain_path, exclude_prefix=None, device=Constants.CURRENT_DEVICE) INFO('Done. Elapsed time {0}'.format(timer.toc())) # 4. Build optimizer INFO('Building Optimizer...') if not flags.multi_gpu: optim = Optimizer(name=optimizer_configs['optimizer'], model=nmt_model, lr=lrate, grad_clip=optimizer_configs['grad_clip'], optim_args=optimizer_configs['optimizer_params'], update_cycle=training_configs['update_cycle']) else: optim = dist.DistributedOptimizer( name=optimizer_configs['optimizer'], model=nmt_model, lr=lrate, grad_clip=optimizer_configs['grad_clip'], optim_args=optimizer_configs['optimizer_params'], device_id=local_rank) # 5. Build scheduler for optimizer if needed scheduler = build_scheduler( schedule_method=optimizer_configs['schedule_method'], optimizer=optim, scheduler_configs=optimizer_configs['scheduler_configs']) # 6. build moving average ma = build_ma(training_configs, nmt_model.named_parameters()) INFO('Done. 
Elapsed time {0}'.format(timer.toc())) # Reload from latest checkpoint if flags.reload: checkpoint_saver.load_latest(model=nmt_model, optim=optim, lr_scheduler=scheduler, collections=model_collections, ma=ma, device=Constants.CURRENT_DEVICE) # broadcast parameters and optimizer states if world_size > 1: INFO("Broadcasting model parameters...") dist.broadcast_parameters(params=nmt_model.state_dict()) INFO("Broadcasting optimizer states...") dist.broadcast_optimizer_state(optimizer=optim.optim) INFO('Done.') # ================================================================================== # # Prepare training eidx = model_collections.get_collection("eidx", [0])[-1] uidx = model_collections.get_collection("uidx", [1])[-1] bad_count = model_collections.get_collection("bad_count", [0])[-1] oom_count = model_collections.get_collection("oom_count", [0])[-1] is_early_stop = model_collections.get_collection("is_early_stop", [ False, ])[-1] teacher_patience = model_collections.get_collection( "teacher_patience", [training_configs['teacher_patience']])[-1] train_loss_meter = AverageMeter() train_loss_dict_meter = AverageMeterDict(critic.get_critic_name()) sent_per_sec_meter = TimeMeter() tok_per_sec_meter = TimeMeter() update_cycle = training_configs['update_cycle'] grad_denom = 0 train_loss = 0.0 cum_n_words = 0 train_loss_dict = dict() valid_loss = best_valid_loss = float('inf') if rank == 0: summary_writer = SummaryWriter(log_dir=flags.log_path) else: summary_writer = None sent_per_sec_meter.start() tok_per_sec_meter.start() INFO('Begin training...') while True: if summary_writer is not None: summary_writer.add_scalar("Epoch", (eidx + 1), uidx) # Build iterator and progress bar training_iter = training_iterator.build_generator() if rank == 0: training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format( eidx, uidx), total=len(training_iterator), unit="sents") else: training_progress_bar = None for batch in training_iter: seqs_x, seqs_y = batch batch_size = len(seqs_x) cum_n_words += sum(len(s) for s in seqs_y) try: # Prepare data x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU) loss, loss_dict = compute_forward( model=nmt_model, critic=critic, seqs_x=x, seqs_y=y, eval=False, normalization=1.0, norm_by_words=training_configs["norm_by_words"]) update_cycle -= 1 grad_denom += batch_size train_loss += loss train_loss_dict = add_dict_value(train_loss_dict, loss_dict) except RuntimeError as e: if 'out of memory' in str(e): print('| WARNING: ran out of memory, skipping batch') oom_count += 1 else: raise e # When update_cycle becomes 0, it means end of one batch. Several things will be done: # - update parameters # - reset update_cycle and grad_denom, update uidx # - learning rate scheduling # - update moving average if update_cycle == 0: # 0. reduce variables if world_size > 1: grad_denom = dist.all_reduce_py(grad_denom) train_loss = dist.all_reduce_py(train_loss) train_loss_dict = dist.all_reduce_py(train_loss_dict) cum_n_words = dist.all_reduce_py(cum_n_words) # 1. update parameters optim.step(denom=grad_denom) optim.zero_grad() if training_progress_bar is not None: training_progress_bar.update(grad_denom) training_progress_bar.set_description( ' - (Epc {}, Upd {}) '.format(eidx, uidx)) postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format( train_loss, valid_loss, best_valid_loss) for critic_name, loss_value in train_loss_dict.items(): postfix_str += (critic_name + ': {:.2f}, ').format(loss_value) training_progress_bar.set_postfix_str(postfix_str) # 2. 
learning rate scheduling if scheduler is not None and optimizer_configs[ "schedule_method"] != "loss": scheduler.step(global_step=uidx) # 3. update moving average if ma is not None and eidx >= training_configs[ 'moving_average_start_epoch']: ma.step() # 4. update meters train_loss_meter.update(train_loss, grad_denom) train_loss_dict_meter.update(train_loss_dict, grad_denom) sent_per_sec_meter.update(grad_denom) tok_per_sec_meter.update(cum_n_words) # 5. reset accumulated variables, update uidx update_cycle = training_configs['update_cycle'] grad_denom = 0 uidx += 1 cum_n_words = 0.0 train_loss = 0.0 train_loss_dict = dict() else: continue # ================================================================================== # # Display some information if should_trigger_by_steps( uidx, eidx, every_n_step=training_configs['disp_freq']): lrate = list(optim.get_lrate())[0] if summary_writer is not None: summary_writer.add_scalar( "Speed(sents/sec)", scalar_value=sent_per_sec_meter.ave, global_step=uidx) summary_writer.add_scalar( "Speed(words/sec)", scalar_value=tok_per_sec_meter.ave, global_step=uidx) summary_writer.add_scalar( "train_loss", scalar_value=train_loss_meter.ave, global_step=uidx) # add loss for every critic if flags.display_loss_detail: combination_loss = train_loss_dict_meter.value for key, value in combination_loss.items(): summary_writer.add_scalar(key, scalar_value=value, global_step=uidx) summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx) summary_writer.add_scalar("oom_count", scalar_value=oom_count, global_step=uidx) # Reset Meters sent_per_sec_meter.reset() tok_per_sec_meter.reset() train_loss_meter.reset() train_loss_dict_meter.reset() # ================================================================================== # # Loss Validation & Learning rate annealing if should_trigger_by_steps( global_step=uidx, n_epoch=eidx, every_n_step=training_configs['loss_valid_freq'], debug=flags.debug): with cache_parameters(nmt_model): valid_loss, valid_loss_dict = loss_evaluation( model=nmt_model, critic=critic, valid_iterator=valid_iterator, rank=rank, world_size=world_size) if scheduler is not None and optimizer_configs[ "schedule_method"] == "loss": scheduler.step(metric=valid_loss) model_collections.add_to_collection("history_losses", valid_loss) min_history_loss = np.array( model_collections.get_collection("history_losses")).min() best_valid_loss = min_history_loss if summary_writer is not None: summary_writer.add_scalar("loss", valid_loss, global_step=uidx) summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx) # ================================================================================== # # BLEU Validation & Early Stop if should_trigger_by_steps( global_step=uidx, n_epoch=eidx, every_n_step=training_configs['bleu_valid_freq'], min_step=training_configs['bleu_valid_warmup'], debug=flags.debug): with cache_parameters(nmt_model): valid_bleu = bleu_evaluation( uidx=uidx, valid_iterator=valid_iterator, batch_size=training_configs["bleu_valid_batch_size"], model=nmt_model, bleu_scorer=bleu_scorer, vocab_src=vocab_src, vocab_tgt=vocab_tgt, valid_dir=flags.valid_path, max_steps=training_configs["bleu_valid_configs"] ["max_steps"], beam_size=training_configs["bleu_valid_configs"] ["beam_size"], alpha=training_configs["bleu_valid_configs"]["alpha"], world_size=world_size, rank=rank, ) model_collections.add_to_collection(key="history_bleus", value=valid_bleu) best_valid_bleu = float( np.array(model_collections.get_collection( 
"history_bleus")).max()) if summary_writer is not None: summary_writer.add_scalar("bleu", valid_bleu, uidx) summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx) # If model get new best valid bleu score if valid_bleu >= best_valid_bleu: bad_count = 0 if is_early_stop is False: if rank == 0: # 1. save the best model torch.save(nmt_model.state_dict(), best_model_prefix + ".final") else: bad_count += 1 # At least one epoch should be traversed if bad_count >= training_configs[ 'early_stop_patience'] and eidx > 0: is_early_stop = True WARN("Early Stop!") exit(0) if rank == 0: best_k_saver.save(global_step=uidx, metric=valid_bleu, model=nmt_model, optim=optim, lr_scheduler=scheduler, collections=model_collections, ma=ma) # ODC if training_configs['use_odc'] is True: if valid_bleu >= best_valid_bleu: pass # choose method to generate teachers from checkpoints # - best # - ave_k_best # - ma if training_configs['teacher_choice'] == 'ma': teacher_params = ma.export_ma_params() elif training_configs['teacher_choice'] == 'best': teacher_params = nmt_model.state_dict() elif "ave_best" in training_configs['teacher_choice']: if best_k_saver.num_saved >= ave_best_k: teacher_params = average_checkpoints( best_k_saver.get_all_ckpt_path() [-ave_best_k:]) else: teacher_params = nmt_model.state_dict() else: raise ValueError( "can not support teacher choice %s" % training_configs['teacher_choice']) torch.save(teacher_params, teacher_model_path) del teacher_params teacher_patience = 0 critic.set_use_KD(False) else: teacher_patience += 1 if teacher_patience >= training_configs[ 'teacher_refresh_warmup']: teacher_params = torch.load( teacher_model_path, map_location=Constants.CURRENT_DEVICE) teacher_model.load_state_dict(teacher_params, strict=False) del teacher_params critic.reset_teacher(teacher_model) critic.set_use_KD(True) if summary_writer is not None: summary_writer.add_scalar("bad_count", bad_count, uidx) info_str = "{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4} ".format( uidx, valid_loss, valid_bleu, lrate, bad_count) for key, value in valid_loss_dict.items(): info_str += (key + ': {0:.2f} '.format(value)) INFO(info_str) # ================================================================================== # # Saving checkpoints if should_trigger_by_steps( uidx, eidx, every_n_step=training_configs['save_freq'], debug=flags.debug): model_collections.add_to_collection("uidx", uidx) model_collections.add_to_collection("eidx", eidx) model_collections.add_to_collection("bad_count", bad_count) model_collections.add_to_collection("teacher_patience", teacher_patience) if not is_early_stop: if rank == 0: checkpoint_saver.save(global_step=uidx, model=nmt_model, optim=optim, lr_scheduler=scheduler, collections=model_collections, ma=ma) if training_progress_bar is not None: training_progress_bar.close() eidx += 1 if eidx > training_configs["max_epochs"]: break
def train(config_path, model_path, model_type, src_filename, trg_filename): """ flags: saveto: str reload: store_true config_path: str pretrain_path: str, default="" model_name: str log_path: str """ # ================================================================================== # # Initialization for training on different devices # - CPU/GPU # - Single/Distributed Constants.USE_GPU = True print(config_path) print(model_path) print(model_type) world_size = 1 rank = 0 local_rank = 0 if Constants.USE_GPU: torch.cuda.set_device(local_rank) Constants.CURRENT_DEVICE = "cuda:{0}".format(local_rank) else: Constants.CURRENT_DEVICE = "cpu" # ================================================================================== # # Parsing configuration files # - Load default settings # - Load pre-defined settings # - Load user-defined settings configs = prepare_configs(config_path) data_configs = configs['data_configs'] model_configs = configs['model_configs'] training_configs = configs['training_configs'] INFO(pretty_configs(configs)) Constants.SEED = training_configs['seed'] set_seed(Constants.SEED) timer = Timer() # ================================================================================== # # Load Data INFO('Loading data...') timer.tic() # Generate target dictionary vocab_src = Vocabulary.build_from_file(**data_configs['vocabularies'][0]) vocab_tgt = Vocabulary.build_from_file(**data_configs['vocabularies'][1]) Constants.EOS = vocab_src.eos Constants.PAD = vocab_src.pad Constants.BOS = vocab_src.bos valid_bitext_dataset = ZipDataset( TextLineDataset( data_path=src_filename, vocabulary=vocab_src, max_len=100, is_train_dataset=False, ), TextLineDataset( data_path=trg_filename, vocabulary=vocab_tgt, is_train_dataset=False, max_len=100, )) valid_iterator = DataIterator(dataset=valid_bitext_dataset, batch_size=20, use_bucket=training_configs['use_bucket'], buffer_size=training_configs['buffer_size'], numbering=True, world_size=world_size, rank=rank) INFO('Done. Elapsed time {0}'.format(timer.toc())) # ================================ Begin ======================================== # # Build Model & Optimizer # We would do steps below on after another # 1. build models & criterion # 2. move models & criterion to gpu if needed # 3. load pre-trained model if needed # 4. build optimizer # 5. build learning rate scheduler if needed # 6. load checkpoints if needed # 0. Initial # 1. Build Model & Criterion INFO('Building model...') timer.tic() nmt_model = build_model(n_src_vocab=vocab_src.max_n_words, n_tgt_vocab=vocab_tgt.max_n_words, padding_idx=vocab_src.pad, vocab_src=vocab_src, **model_configs) INFO(nmt_model) # 2. Move to GPU if Constants.USE_GPU: nmt_model = nmt_model.cuda() # 3. Load pretrained model if needed load_pretrained_model(nmt_model, model_path, device=Constants.CURRENT_DEVICE) INFO('Done. 
Elapsed time {0}'.format(timer.toc())) # ================================================================================== # # Prepare training sent_per_sec_meter = TimeMeter() tok_per_sec_meter = TimeMeter() grad_denom = 0 train_loss = 0.0 cum_n_words = 0 valid_loss = best_valid_loss = float('inf') sent_per_sec_meter.start() tok_per_sec_meter.start() INFO('Begin training...') eidx = 0 uidx = 0 score_result = dict() # Build iterator and progress bar training_iter = valid_iterator.build_generator() training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format( eidx, uidx), total=len(valid_iterator), unit="sents") for batch in training_iter: seqs_numbers, seqs_x, seqs_y = batch batch_size = len(seqs_x) cum_n_words += sum(len(s) for s in seqs_y) try: # Prepare data x, y = prepare_data(seqs_x, seqs_y, cuda=Constants.USE_GPU) y_inp = y[:, :-1].contiguous() y_label = y[:, 1:].contiguous() # [batch_size, seq_len] log_probs = nmt_model( x, y_inp, log_probs=True) # [batch_size, seq_len, vocab_size] _, seq_len = y_label.shape log_probs = log_probs.view(-1, vocab_tgt.max_n_words) y_label = y_label.view(-1) loss = F.nll_loss(log_probs, y_label, reduce=False, ignore_index=vocab_tgt.pad) loss = loss.view(batch_size, seq_len) loss = loss.sum(-1) y_label = y_label.view(batch_size, seq_len) valid_token = (y_label != vocab_tgt.pad).sum(-1) loss = loss.double().div(valid_token.double()) for seq_num, l in zip(seqs_numbers, loss): assert seq_num not in score_result score_result.update({seq_num: l.item()}) uidx += 1 grad_denom += batch_size except RuntimeError as e: if 'out of memory' in str(e): print('| WARNING: ran out of memory, skipping batch') else: raise e if training_progress_bar is not None: training_progress_bar.update(batch_size) training_progress_bar.set_description( ' - (Epc {}, Upd {}) '.format(eidx, uidx)) postfix_str = 'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f}), '.format( train_loss, valid_loss, best_valid_loss) training_progress_bar.set_postfix_str(postfix_str) training_progress_bar.close() return score_result
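# ================================================================================== #
# Stand-alone sketch of the per-sentence scoring loop above: sum the token-level
# negative log-likelihood over each sentence, then normalize by its non-pad token
# count. reduction='none' is the modern spelling of the deprecated reduce=False.
import torch
import torch.nn.functional as F


def sentence_nll(log_probs, y_label, pad_idx):
    # log_probs: [batch, seq_len, vocab]; y_label: [batch, seq_len]
    batch, seq_len, vocab = log_probs.shape
    tok_loss = F.nll_loss(log_probs.view(-1, vocab), y_label.view(-1),
                          reduction='none', ignore_index=pad_idx)
    sent_loss = tok_loss.view(batch, seq_len).sum(-1)
    n_tokens = (y_label != pad_idx).sum(-1)
    return sent_loss.double() / n_tokens.double()  # length-normalized NLL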
def train_thread(rank, device, args, reinforce_configs, annunciator_configs, src_vocab, trg_vocab, data_set, global_SACAgent, global_replay_buffer, global_step_counter, global_step_lock, local_agent_configs): """ build training thread for a local SACAgent on gpu device provide parameter soft-updates for the global_models. """ GlobalNames.USE_GPU = args.use_gpu GlobalNames.SEED = reinforce_configs["seed"] torch.manual_seed(GlobalNames.SEED + rank) # build local SACAgent rephraser_model_configs = local_agent_configs["rephraser_model_configs"] rephraser_optimizer_configs = local_agent_configs["rephraser_optimizer_configs"] with open(reinforce_configs["victim_configs"], "r") as victim_f: victim_configs = yaml.load(victim_f, Loader=yaml.FullLoader) _, _, limit_dist = load_or_extract_near_vocab( config_path=reinforce_configs["victim_configs"], model_path=reinforce_configs["victim_model"], init_perturb_rate=reinforce_configs["init_perturb_rate"], save_to=os.path.join(args.save_to, "near_vocab"), save_to_full=os.path.join(args.save_to, "full_near_vocab"), top_reserve=12, emit_as_id=True) local_agent = rephraser.SACAgent( device=device, d_word_vec=victim_configs["model_configs"]["d_word_vec"], d_model=rephraser_model_configs["d_model"], limit_dist=limit_dist, dropout=rephraser_model_configs["dropout"], learnable_temperature=rephraser_model_configs["learnable_temperature"], init_temperature=rephraser_model_configs["init_temperature"], rephraser_optimizer_configs=rephraser_optimizer_configs, rank=rank, save_to=os.path.join(args.save_to), num_kept_checkpoints=reinforce_configs["num_kept_checkpoints"] ) local_summary_writer = SummaryWriter( log_dir=os.path.join(args.save_to, "train_env%d" % rank)) # # this is the secondary buffer # local_replay_buffer = ReplayBuffer( # max_sen_len=global_replay_buffer.max_sen_len, # state_dim=victim_configs["model_configs"]["d_word_vec"], # action_dim=victim_configs["model_configs"]["d_word_vec"], # capacity=max(reinforce_configs["replay_buffer_capacity"]/20, 10000) # ) # build environment (include annunciator update settings) reinforce_iterator = DataIterator( dataset=data_set, batch_size=reinforce_configs["batch_size"], use_bucket=True, buffer_size=reinforce_configs["buffer_size"], numbering=True) local_data_iterator = reinforce_iterator.build_generator() local_env = Translate_Env( reinforce_configs=reinforce_configs, annunciator_configs=annunciator_configs, src_vocab=src_vocab, trg_vocab=trg_vocab, data_iterator=local_data_iterator, save_to=args.save_to, device=device) episode_count = 0 # a batch of sentences as an episode & learning episodes local_step = global_step_counter.value # initiate local agent update steps patience_t = annunciator_configs["patience"] trust_acc = 0.5 while True: # infinite loop of data set iterator (epoch) # we will continue with a new iterator with refreshed environments # whenever the last iterator breaks with "StopIteration" rephraser_iterator = reinforce_iterator.build_generator() local_env.reset_data_iter(rephraser_iterator) try: while True: # loop training episodes # the environment will be initiated by an actor as self learning local_env.reset() states, _, _, _ = local_env.get_state() # x_emb = local_env.reset(local_agent.actor) # x_emb = torch.from_numpy(x_emb) # if device != "cpu": # x_emb = x_emb.to(device) annunciator_base_step = local_step episode_length = 0 # the rollout steps episode_rewards = np.array([0.] 
* states.shape[0]) done = True # whether the episode is finished & should reset episode with new batch while episode_length <= args.max_episode_lengths: # loop TD learning rollouts # check for the environment updates using current **global** agent if episode_count % local_agent_configs["rephraser_update_steps"] == 0: """ stop criterion: until mse reaches the bound within patience. otherwise the training thread stops """ try: # update environment INFO("Update environment") # sync from the global agent to avoid unnecessary locking. local_agent.sync_from(global_SACAgent) annunciator_base_step, trust_acc = local_env.update_annunciator( local_agent, annunciator_base_step, min_update_steps=annunciator_configs["valid_freq"], max_update_steps=annunciator_configs["annunciator_update_steps"], accuracy_bound=annunciator_configs["acc_bound"], overall_update_weight=1-trust_acc, summary_writer=local_summary_writer) # global_SACAgent.to("cpu").train() # switch back to training mode local_agent = local_agent.to(device).train() except StopIteration: INFO("finish one training epoch, reset data_iterator") break annunciator_base_step += 1 # a flag to label the scorer updates if trust_acc < annunciator_configs["d_converged_bound"]: # GAN target reached, scorer has reached its limit. patience_t -= 1 INFO("scorer reached GAN convergence bound: %d times" % patience_t) else: # reset patience if scorer is refreshed patience_t = annunciator_configs["patience"] if patience_t == 0 or episode_count > args.max_episodes: WARN("maximum patience & training step reached. Thread stop") break local_agent = local_agent.to(device).train() # switch the agent to training mode if done: # can't create SARSA with only one step, start a new episode INFO("sync from global agent") local_agent.sync_from(global_SACAgent) local_agent = local_agent.to(device).train() local_env.reset() done = False states, masks, rephrase_positions, _ = local_env.get_state() roll_out_rewards = np.array([0] * states.shape[0]) # n-step TD learning accumulation for i in range(args.action_roll_steps): # sample actions for buffer collection (start with fully random) if global_replay_buffer.size() < local_agent_configs["rephraser_learning_batch"]: # fast random actions in the **action_space** as initial exploration actions = local_env.limit_dist * np.random.sample([states.shape[0], states.shape[2]]) actions = torch.tensor(actions,dtype=torch.float32, device=device) else: # explore and exploit local_agent = local_agent.train(False) # set to validation mode .detach().cpu().numpy() label_emb = rephraser.slice_by_indices(states, rephrase_positions, device=device) actions, _ = local_agent.actor.sample_normal( states, x_pad_indicator=1.-masks, label_emb=label_emb, reparamization=False) # able to update agent with current buffer if global_replay_buffer.size() >= local_agent_configs["rephraser_learning_batch"]: local_agent = local_agent.train() local_agent.update_local_net( local_agent_configs, global_replay_buffer, target_critic=global_SACAgent.critic.to(device), update_step=local_step, discount_factor=reinforce_configs["gamma"], summary_writer=local_summary_writer, update_trust_region=trust_acc ) global_SACAgent.critic.to("cpu") if local_step % local_agent_configs["soft_update_freq"] == 0: with global_step_lock: # update global model INFO("update global model by local model on: %s"%device) global_step_counter.value += 1 local_agent.soft_update_target_net(global_SACAgent, reinforce_configs["tau"]) if global_step_counter.value % reinforce_configs["save_freq"] == 0: 
INFO("save global model %d by device: %s"%(global_step_counter.value, device)) with global_SACAgent.soft_update_lock: global_SACAgent.save_model(global_step_counter.value) # save final global model (as object) if trust_acc < annunciator_configs["d_converged_bound"]: global_SACAgent.save_model(save_to_final=args.save_to) local_step += 1 INFO("finish local update by %2f"%trust_acc) _, rewards, _ = local_env.step(actions) new_states, _, new_rephrase_positions, survival_signals = local_env.get_state() roll_out_rewards = [roll_out_rewards[t]+rewards[t] for t in range(states.shape[0])] episode_rewards = [episode_rewards[t]+rewards[t] for t in range(states.shape[0])] survival_signals_cpu = survival_signals.cpu().numpy().tolist() survival_and_no_max = np.array([0 if episode_length == local_env.sent_len[t] else survival_signals_cpu[t][0] for t in range(states.shape[0])]) INFO("step %d of episode:%d, r:%f, device:%s"%(i, episode_count, np.array(episode_rewards).mean(), actions.device)) # add the tensor variables to the replay buffer, # learn by rollout rewards or episode reward accumulated # update the overall signals for rollout if 1 in survival_signals: done = False else: done = True if local_env.index == states.shape[1]-1: # note that the environment index will increase beyond the shape done = True if done: # break from roll-out break # print(states.shape, masks.shape, actions.shape, rephrase_positions.shape, torch.tensor(roll_out_rewards).shape, # survival_signals.shape, torch.tensor(survival_and_no_max).shape) INFO("write buffer begin") #cleanse unecessary experience to save time invalid_sample = [] if local_env.index>min(local_env.sent_len): for i in range(states.shape[0]): # for every record if local_env.index > local_env.sent_len[i]: invalid_sample.append(i) global_replay_buffer.add( states=states, masks=masks, actions=actions, rephrase_positions=rephrase_positions, rewards=torch.tensor(rewards, dtype=torch.float32).unsqueeze(dim=-1), survival_signals=survival_signals, survive_and_no_max=torch.tensor(survival_and_no_max, dtype=torch.float32).unsqueeze(dim=-1), invalid_sample=invalid_sample ) INFO("write buffer end, size=%d"%global_replay_buffer.size()) episode_length += 1 states = new_states rephrase_positions = new_rephrase_positions if done: # break from current episodes episode_count += 1 break if episode_count > args.max_episodes: return except StopIteration: INFO("finish one training epoch, reset data_iterator") continue
def ensemble_translate(FLAGS):
    GlobalNames.USE_GPU = FLAGS.use_gpu
    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    timer = Timer()
    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()
    # Generate target dictionary
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])
    valid_dataset = TextLineDataset(data_path=FLAGS.source_path, vocabulary=vocab_src)
    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True, buffer_size=100000,
                                  numbering=True)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))
    # ================================================================================== #
    # Build Model & Sampler & Validation
    INFO('Building model...')
    timer.tic()
    nmt_models = []
    model_path = FLAGS.model_path
    for ii in range(len(model_path)):
        nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                                n_tgt_vocab=vocab_tgt.max_n_words,
                                **model_configs)
        nmt_model.eval()
        INFO('Done. Elapsed time {0}'.format(timer.toc()))
        INFO('Reloading model parameters...')
        timer.tic()
        params = load_model_parameters(model_path[ii], map_location="cpu")
        nmt_model.load_state_dict(params)
        if GlobalNames.USE_GPU:
            nmt_model.cuda()
        nmt_models.append(nmt_model)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))
    INFO('Begin...')
    result_numbers = []
    result = []
    n_words = 0
    timer.tic()
    infer_progress_bar = tqdm(total=len(valid_iterator),
                              desc=' - (Infer) ',
                              unit="sents")
    valid_iter = valid_iterator.build_generator()
    for batch in valid_iter:
        numbers, seqs_x = batch
        batch_size_t = len(seqs_x)
        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)
        with torch.no_grad():
            word_ids = ensemble_beam_search(nmt_models=nmt_models,
                                            beam_size=FLAGS.beam_size,
                                            max_steps=FLAGS.max_steps,
                                            src_seqs=x, alpha=FLAGS.alpha)
        word_ids = word_ids.cpu().numpy().tolist()
        # record the batch numbering so the bucketed outputs can be restored to input order
        result_numbers += numbers
        # Append result
        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)
            n_words += len(sent_t[0])
        infer_progress_bar.update(batch_size_t)
    infer_progress_bar.close()
    INFO('Done. Speed: {0:.2f} words/sec'.format(n_words / (timer.toc(return_seconds=True))))
    translation = []
    for sent in result:
        samples = []
        for trans in sent:
            sample = []
            for w in trans:
                if w == vocab_tgt.EOS:
                    break
                sample.append(vocab_tgt.id2token(w))
            samples.append(vocab_tgt.tokenizer.detokenize(sample))
        translation.append(samples)
    # restore the original ordering
    origin_order = np.argsort(result_numbers).tolist()
    translation = [translation[ii] for ii in origin_order]
    keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(FLAGS.beam_size, FLAGS.keep_n)
    outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]
    with batch_open(outputs, 'w') as handles:
        for trans in translation:
            for i in range(keep_n):
                if i < len(trans):
                    handles[i].write('%s\n' % trans[i])
                else:
                    handles[i].write('%s\n' % 'eos')
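# NOTE: `ensemble_beam_search` is defined elsewhere in this repo. As a minimal sketch of
# the core idea only: assuming each member model yields next-token log-probabilities of
# shape [batch, vocab] at a decoding step, the ensemble can average the distributions in
# probability space before expanding the beam. The helper name and signature below are
# hypothetical, not the repo's API.
import math

import torch


def ensemble_next_token_log_probs(log_probs_per_model):
    """Average K per-model distributions: log((1/K) * sum_k exp(log_p_k))."""
    stacked = torch.stack(log_probs_per_model, dim=0)   # [K, batch, vocab]
    return torch.logsumexp(stacked, dim=0) - math.log(stacked.size(0))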
def valid_thread(device, args, reinforce_configs, annunciator_configs, src_vocab,
                 trg_vocab, data_set, global_SACAgent, global_replay_buffer,
                 global_step_counter, global_step_lock, local_agent_configs):
    """
    Build a validation thread that tests the global SACAgent and updates the buffer.
    """
    GlobalNames.USE_GPU = args.use_gpu
    GlobalNames.SEED = reinforce_configs["seed"]
    torch.manual_seed(GlobalNames.SEED)
    # build local agent and sync from the global model to avoid deadlock
    rephraser_model_configs = local_agent_configs["rephraser_model_configs"]
    rephraser_optimizer_configs = local_agent_configs["rephraser_optimizer_configs"]
    with open(reinforce_configs["victim_configs"], "r") as victim_f:
        victim_configs = yaml.load(victim_f, Loader=yaml.FullLoader)
    _, _, limit_dist = load_or_extract_near_vocab(
        config_path=reinforce_configs["victim_configs"],
        model_path=reinforce_configs["victim_model"],
        init_perturb_rate=reinforce_configs["init_perturb_rate"],
        save_to=os.path.join(args.save_to, "near_vocab"),
        save_to_full=os.path.join(args.save_to, "full_near_vocab"),
        top_reserve=12,
        emit_as_id=True)
    local_agent = rephraser.SACAgent(
        device=device,
        d_word_vec=victim_configs["model_configs"]["d_word_vec"],
        d_model=rephraser_model_configs["d_model"],
        limit_dist=limit_dist,
        dropout=rephraser_model_configs["dropout"],
        learnable_temperature=rephraser_model_configs["learnable_temperature"],
        init_temperature=rephraser_model_configs["init_temperature"],
        rephraser_optimizer_configs=rephraser_optimizer_configs,
        rank="dev",
        save_to=os.path.join(args.save_to),
        num_kept_checkpoints=reinforce_configs["num_kept_checkpoints"])
    local_summary_writer = SummaryWriter(log_dir=os.path.join(args.save_to, "valid_env"))
    # build an independent dev env (annunciator)
    reinforce_iterator = DataIterator(dataset=data_set,
                                      batch_size=reinforce_configs["batch_size"],
                                      use_bucket=True,
                                      buffer_size=reinforce_configs["buffer_size"],
                                      numbering=True)
    local_data_iterator = reinforce_iterator.build_generator()
    local_env = Translate_Env(reinforce_configs=reinforce_configs,
                              annunciator_configs=annunciator_configs,
                              src_vocab=src_vocab, trg_vocab=trg_vocab,
                              data_iterator=local_data_iterator,
                              save_to=args.save_to, device=device)
    local_step = 0
    episode_count = 0  # determines the global sync
    update_signal = False  # whether to update the environment annunciator
    while True:  # infinite loop over the dataset iterator
        rephraser_iterator = reinforce_iterator.build_generator()
        local_env.reset_data_iter(rephraser_iterator)
        try:
            annunciator_base_step = local_step
            while True:  # infinite loop over the episodes
                # check for the environment annunciator updates
                with global_step_lock:
                    if global_step_counter.value % (local_agent_configs["rephraser_update_steps"]) == 0:
                        update_signal = True
                if update_signal:
                    try:
                        INFO("update valid env annunciator")
                        update_signal = False
                        annunciator_base_step, trust_acc = local_env.update_annunciator(
                            local_agent, annunciator_base_step,
                            min_update_steps=annunciator_configs["valid_freq"],
                            max_update_steps=annunciator_configs["annunciator_update_steps"],
                            accuracy_bound=annunciator_configs["acc_bound"],
                            summary_writer=local_summary_writer)
                        # global_SACAgent.to("cpu").train()
                        local_agent = local_agent.to(device).train(False)
                    except StopIteration:
                        INFO("finish one training epoch, reset data_iterator")
                        break
                # prepare a new episode
                local_agent.sync_from(global_SACAgent)
                local_agent = local_agent.to(device).train(False)
                local_env.reset()
                states, masks, rephrase_positions, _ = local_env.get_state()
                episode_rewards = np.array([0.] * states.shape[0])
                episode_length = 0
                done = False
                while not done:
                    states, masks, rephrase_positions, _ = local_env.get_state()
                    label_embs = rephraser.slice_by_indices(states, rephrase_positions, device=device)
                    # directly use the expectation as validation
                    mu, _ = local_agent.actor.forward(states,
                                                      x_pad_indicator=1. - masks,
                                                      label_emb=label_embs)
                    actions = mu * local_agent.actor.action_range
                    _, rewards, _ = local_env.step(actions)
                    episode_rewards = [episode_rewards[t] + rewards[t] for t in range(states.shape[0])]
                    new_states, _, _, survival_signals = local_env.get_state()
                    survival_signals_cpu = survival_signals.cpu().numpy().tolist()
                    survival_and_no_max = np.array([
                        0 if episode_length == local_env.sent_len[t] else survival_signals_cpu[t][0]
                        for t in range(states.shape[0])])
                    INFO("step %d survive and no max:%d, %s" % (
                        episode_length, survival_and_no_max.sum(), actions.device))
                    if 1 in survival_signals:
                        done = False
                    else:
                        episode_count += 1
                        done = True
                    if local_env.index == states.shape[1] - 1:
                        # note that the environment index will increase beyond the shape
                        done = True
                    if done:  # skip the final step
                        break
                    INFO("write buffer begin")
                    invalid_sample = []
                    if local_env.index > min(local_env.sent_len):
                        for i in range(states.shape[0]):  # for every record
                            if local_env.index > local_env.sent_len[i]:
                                invalid_sample.append(i)
                    # print(states.shape, masks.shape, actions.shape, rephrase_positions.shape,
                    #       survival_signals.shape, torch.tensor(survival_and_no_max).shape)
                    global_replay_buffer.add(
                        states=states, masks=masks, actions=actions,
                        rephrase_positions=rephrase_positions,
                        rewards=torch.tensor(rewards, dtype=torch.float32).unsqueeze(dim=-1),
                        survival_signals=survival_signals,
                        survive_and_no_max=torch.tensor(survival_and_no_max, dtype=torch.float32).unsqueeze(dim=-1),
                        invalid_sample=invalid_sample)
                    INFO("write buffer end, size=%d" % global_replay_buffer.size())
                    local_step += 1
                    episode_length += 1
                INFO("mean episode rewards on batch:%f" % np.array(episode_rewards).mean())
                local_summary_writer.add_scalar("valid_rewards",
                                                scalar_value=np.array(episode_rewards).mean(),
                                                global_step=local_step)
        except StopIteration:
            INFO("finish one validation epoch, reset data_iterator")
            continue
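# NOTE: the validation loop above takes the deterministic mean action
# (mu * action_range), whereas the training thread samples stochastically via
# `actor.sample_normal(...)`. A minimal sketch of the squashed-Gaussian sampling that
# `sample_normal` is assumed to implement; the function name and arguments here are
# illustrative, not the repo's API.
import torch


def squashed_gaussian_action(mu, log_std, action_range, deterministic=False):
    """Mean action for evaluation; tanh-squashed reparameterized sample for exploration."""
    if deterministic:
        return mu * action_range                 # what the validation loop above does
    std = log_std.exp()
    pre_tanh = mu + torch.randn_like(mu) * std   # reparameterization trick; only needed
                                                 # when gradients must flow through the sample
    return torch.tanh(pre_tanh) * action_range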
def train(FLAGS):
    """
    FLAGS:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """
    # write log of training to file.
    write_log_to_file(os.path.join(FLAGS.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S")))
    GlobalNames.USE_GPU = FLAGS.use_gpu
    if GlobalNames.USE_GPU:
        CURRENT_DEVICE = "cuda:0"
    else:
        CURRENT_DEVICE = "cpu"
    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    INFO(pretty_configs(configs))
    # Add default configs
    configs = default_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']
    GlobalNames.SEED = training_configs['seed']
    set_seed(GlobalNames.SEED)
    best_model_prefix = os.path.join(FLAGS.saveto,
                                     FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)
    timer = Timer()
    # ================================================================================== #
    # Load Data
    INFO('Loading data...')
    timer.tic()
    # Generate target dictionary
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][0])
    train_batch_size = training_configs["batch_size"] * max(1, training_configs["update_cycle"])
    train_buffer_size = training_configs["buffer_size"] * max(1, training_configs["update_cycle"])
    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_tgt,
                        max_len=data_configs['max_len'][0]),
        shuffle=training_configs['shuffle'])
    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['valid_data'][0],
                        vocabulary=vocab_tgt))
    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=train_batch_size,
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=train_buffer_size,
                                     batching_func=training_configs['batching_key'])
    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs['valid_batch_size'],
                                  use_bucket=True, buffer_size=100000,
                                  numbering=True)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))
    lrate = optimizer_configs['learning_rate']
    is_early_stop = False
    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We will do the steps below one after another:
    # 1. build models & criterion
    # 2. move models & criterion to gpu if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(os.path.join(FLAGS.saveto, FLAGS.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_saver = Saver(save_prefix=best_model_prefix,
                             num_max_keeping=training_configs['num_kept_best_model'])
    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    lm_model = build_model(n_tgt_vocab=vocab_tgt.max_n_words, **model_configs)
    INFO(lm_model)
    params_total = sum([p.numel() for n, p in lm_model.named_parameters()])
    params_wo_embedding = sum([p.numel() for n, p in lm_model.named_parameters()
                               if n.find('embedding') == -1])
    INFO('Total parameters: {}'.format(params_total))
    INFO('Total parameters (excluding word embeddings): {}'.format(params_wo_embedding))
    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])
    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))
    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        lm_model = lm_model.cuda()
        critic = critic.cuda()
    # 3. Load pretrained model if needed
    lm_model.init_parameters(FLAGS.pretrain_path, device=CURRENT_DEVICE)
    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=lm_model, lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params'])
    # 5. Build scheduler for optimizer if needed
    if optimizer_configs['schedule_method'] is not None:
        if optimizer_configs['schedule_method'] == "loss":
            scheduler = ReduceOnPlateauScheduler(optimizer=optim,
                                                 **optimizer_configs["scheduler_configs"])
        elif optimizer_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(optimizer=optim,
                                      **optimizer_configs['scheduler_configs'])
        else:
            WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(
                optimizer_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None
    # 6. build moving average
    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(moving_average_method=training_configs['moving_average_method'],
                           named_params=lm_model.named_parameters(),
                           alpha=training_configs['moving_average_alpha'])
    else:
        ma = None
    INFO('Done. Elapsed time {0}'.format(timer.toc()))
    # Reload from latest checkpoint
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=lm_model, optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections, ma=ma)
    # ================================================================================== #
    # Prepare training
    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [0])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]
    summary_writer = SummaryWriter(log_dir=FLAGS.log_path)
    cum_samples = 0
    cum_words = 0
    valid_loss = best_valid_loss = float('inf')  # Max Float
    saving_files = []
    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()
    INFO('Begin training...')
    while True:
        summary_writer.add_scalar("Epoch", (eidx + 1), uidx)
        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        training_progress_bar = tqdm(desc=' - (Epc {}, Upd {}) '.format(eidx, uidx),
                                     total=len(training_iterator),
                                     unit="sents")
        for batch in training_iter:
            uidx += 1
            if optimizer_configs["schedule_method"] is not None \
                    and optimizer_configs["schedule_method"] != "loss":
                scheduler.step(global_step=uidx)
            seqs_y = batch
            n_samples_t = len(seqs_y)
            n_words_t = sum(len(s) for s in seqs_y)
            cum_samples += n_samples_t
            cum_words += n_words_t
            train_loss = 0.
            optim.zero_grad()
            try:
                # Prepare data
                for (seqs_y_t, ) in split_shard(seqs_y, split_size=training_configs['update_cycle']):
                    y = prepare_data(seqs_y_t, cuda=GlobalNames.USE_GPU)
                    loss = compute_forward(model=lm_model,
                                           critic=critic,
                                           # seqs_x=x,
                                           seqs_y=y,
                                           eval=False,
                                           normalization=n_samples_t,
                                           norm_by_words=training_configs["norm_by_words"])
                    train_loss += (loss / y.size(1)) if not training_configs["norm_by_words"] else loss
                optim.step()
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                    optim.zero_grad()
                else:
                    raise e
            if ma is not None and eidx >= training_configs['moving_average_start_epoch']:
                ma.step()
            training_progress_bar.update(n_samples_t)
            training_progress_bar.set_description(' - (Epc {}, Upd {}) '.format(eidx, uidx))
            training_progress_bar.set_postfix_str(
                'TrainLoss: {:.2f}, ValidLoss(best): {:.2f} ({:.2f})'.format(
                    train_loss, valid_loss, best_valid_loss))
            summary_writer.add_scalar("train_loss", scalar_value=train_loss, global_step=uidx)
            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['disp_freq']):
                # words per second and sents per second
                words_per_sec = cum_words / (timer.toc(return_seconds=True))
                sents_per_sec = cum_samples / (timer.toc(return_seconds=True))
                lrate = list(optim.get_lrate())[0]
                summary_writer.add_scalar("Speed(words/sec)", scalar_value=words_per_sec,
                                          global_step=uidx)
                summary_writer.add_scalar("Speed(sents/sec)", scalar_value=sents_per_sec,
                                          global_step=uidx)
                summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx)
                summary_writer.add_scalar("oom_count", scalar_value=oom_count, global_step=uidx)
                # Reset timer
                timer.tic()
                cum_words = 0
                cum_samples = 0
            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(uidx, eidx, every_n_step=training_configs['save_freq'],
                                       debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)
                if not is_early_stop:
                    checkpoint_saver.save(global_step=uidx, model=lm_model,
                                          optim=optim, lr_scheduler=scheduler,
                                          collections=model_collections, ma=ma)
            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['loss_valid_freq'],
                                       debug=FLAGS.debug):
                if ma is not None:
                    origin_state_dict = deepcopy(lm_model.state_dict())
                    lm_model.load_state_dict(ma.export_ma_params(), strict=False)
                valid_loss = loss_validation(model=lm_model, critic=critic,
                                             valid_iterator=valid_iterator,
                                             norm_by_words=training_configs["norm_by_words"])
                model_collections.add_to_collection("history_losses", valid_loss)
                min_history_loss = np.array(model_collections.get_collection("history_losses")).min()
                summary_writer.add_scalar("loss", valid_loss, global_step=uidx)
                summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx)
                if ma is not None:
                    lm_model.load_state_dict(origin_state_dict)
                    del origin_state_dict
                if optimizer_configs["schedule_method"] == "loss":
                    scheduler.step(metric=best_valid_loss)
                # If the model gets a new best valid loss
                if valid_loss < best_valid_loss:
                    bad_count = 0
                    if is_early_stop is False:
                        # 1. save the best model
                        torch.save(lm_model.state_dict(), best_model_prefix + ".final")
                        # 2. record the several best models
                        best_model_saver.save(global_step=uidx, model=lm_model)
                else:
                    bad_count += 1
                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")
                best_valid_loss = min_history_loss
                summary_writer.add_scalar("bad_count", bad_count, uidx)
                INFO("{0} Loss: {1:.2f} lrate: {2:.6f} patience: {3}".format(
                    uidx, valid_loss, lrate, bad_count))
        training_progress_bar.close()
        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
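# NOTE: a minimal sketch of the exponential moving average assumed behind the
# `MovingAverage` helper used above: shadow parameters track the training weights and
# are swapped in for validation (cf. `ma.export_ma_params()` / restoring the original
# state dict). `ema_step` and `shadow` are illustrative names, not the repo's API.
import torch


def ema_step(model: torch.nn.Module, shadow: dict, alpha: float) -> None:
    """shadow <- alpha * shadow + (1 - alpha) * param, for every named parameter."""
    with torch.no_grad():
        for name, param in model.named_parameters():
            if name not in shadow:
                shadow[name] = param.detach().clone()
            else:
                shadow[name].mul_(alpha).add_(param.detach(), alpha=1.0 - alpha)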
def valid(rank, device, args, attack_configs, discriminator_configs,
          src_vocab, trg_vocab, data_set, global_attacker, attacker_configs, counter):
    """
    Test thread (runs the network results): we simply run the attacker on batches of
    sequences in the environment using the current global attacker (without the
    discriminator, since the discriminator is not general).
    :param rank: rank of the thread (might be multi-processing)
    :param device: running device for the ac network thread
    :param args: global args
    :param attack_configs: initiate env
    :param discriminator_configs: initiate env
    :param src_vocab: initiate env
    :param trg_vocab: initiate env
    :param data_set: provides data set for iterator
    :param global_attacker: global attacker models
    :param attacker_configs: initiate local attacker configs
    :param counter: multiprocessing counter
    :return:
    """
    # this is for multi-processing: GlobalNames cannot be directly inherited
    GlobalNames.USE_GPU = args.use_gpu
    GlobalNames.SEED = attack_configs["seed"]
    torch.manual_seed(GlobalNames.SEED + rank)
    attacker_model_configs = attacker_configs["attacker_model_configs"]
    valid_iterator = DataIterator(dataset=data_set,
                                  batch_size=attack_configs["batch_size"],
                                  use_bucket=attack_configs["use_bucket"],
                                  buffer_size=attack_configs["buffer_size"],
                                  numbering=True)
    valid_iterator = valid_iterator.build_generator()
    env = Translate_Env(attack_configs=attack_configs,
                        discriminator_configs=discriminator_configs,
                        src_vocab=src_vocab, trg_vocab=trg_vocab,
                        data_iterator=valid_iterator,
                        save_to=args.save_to, device=device)
    INFO("finish building validation env")
    # need a directory for saving and loading
    summary_writer = SummaryWriter(log_dir=os.path.join(args.save_to, "dev_env"))
    local_attacker = attacker.Attacker(src_vocab.max_n_words, **attacker_model_configs)
    if device != "cpu":
        local_attacker = local_attacker.to(device)
    local_attacker.eval()

    def trans_from_vocab(vocab, ids):
        """
        transcribe from vocabulary
        :param vocab: A Vocabulary object
        :param ids: 2D list of ids, in shape [sent, tokens]
        :return: 2D list of tokens, with all special tokens removed (detokenized)
        """
        result = []
        for sent_ids in ids:
            result += [vocab.ids2sent([i for i in sent_ids if i not in [PAD, EOS, BOS]])]
        return result

    episode_count = 0
    with open(os.path.join(args.save_to, "dev_env/src_enhanced"), "w") as src_f, \
            open(os.path.join(args.save_to, "dev_env/src_pert"), "w") as src_pert, \
            open(os.path.join(args.save_to, "dev_env/trg_enhanced"), "w") as trg_f, \
            open(os.path.join(args.save_to, "dev_env/trans_origin"), "w") as trans_origin, \
            open(os.path.join(args.save_to, "dev_env/trans_pert"), "w") as trans_pert:
        while True:
            padded_src = env.reset()
            # sync with the current attacker model
            local_attacker.load_state_dict(global_attacker.state_dict())
            episode_count += 1
            perturbed_x_ids = env.padded_src.clone().detach()
            # print(perturbed_x_ids)
            mask = perturbed_x_ids.detach().eq(PAD).long()
            # print(mask)
            with torch.no_grad():
                batch_size, max_steps = padded_src.shape
                for t in range(1, max_steps - 1):  # ignore BOS and EOS
                    inputs = env.padded_src[:, t - 1:t + 2]
                    attack_out = local_attacker.get_attack(x=perturbed_x_ids, label=inputs)
                    actions = attack_out.argmax(dim=-1)
                    actions_entropy = -(attack_out * torch.log(attack_out)).sum(dim=-1).mean()
                    summary_writer.add_scalar("action_entropy",
                                              scalar_value=actions_entropy.item(),
                                              global_step=episode_count)
                    target_of_step = []
                    for batch_index in range(batch_size):
                        word_id = inputs[batch_index][1]
                        # random choice from candidates
                        target_word_id = env.w2vocab[word_id.item()][
                            np.random.choice(len(env.w2vocab[word_id.item()]), 1)[0]]
                        target_of_step += [target_word_id]
                    # override the perturbed results with a random choice from candidates
                    perturbed_x_ids[:, t] *= (1 - actions)
                    adjustification_ = torch.tensor(target_of_step, device=inputs.device)
                    if GlobalNames.USE_GPU:
                        adjustification_ = adjustification_.to(device)
                    perturbed_x_ids[:, t] += adjustification_ * actions
                # apply mask on the results
                perturbed_x_ids *= (1 - mask)
            # translate sequences and calculate degraded BLEU scores on batches
            perturbed_result = env.translate(perturbed_x_ids)
            trg_y = trans_from_vocab(trg_vocab, env.seqs_y)
            trans_y_p = trans_from_vocab(trg_vocab, perturbed_result)
            trans_y = trans_from_vocab(trg_vocab, env.origin_result)
            # print("golden:", trg_y)
            # print("origin_results:", trans_y)
            # print("perturbed_results:", trans_y_p)
            # calculate final BLEU degradation:
            perturbed_bleu = []
            for i, sent in enumerate(env.seqs_y):
                # sentence is still surviving
                perturbed_bleu.append(bleu.sentence_bleu(references=[sent],
                                                         hypothesis=perturbed_result[i],
                                                         emulate_multibleu=True))
            print("origin_bleu:", env.origin_bleu)
            print("perturbed_bleu: ", perturbed_bleu)
            bleu_degrade = (sum(env.origin_bleu) - sum(perturbed_bleu)) / len(perturbed_bleu)
            summary_writer.add_scalar("bleu_degradation",
                                      scalar_value=bleu_degrade,
                                      global_step=episode_count)
            # edit BLEU
            edit_bleu = []
            padded_src = padded_src.tolist()
            perturbed_x_ids = perturbed_x_ids.cpu().numpy().tolist()
            trans_x = trans_from_vocab(src_vocab, padded_src)
            trans_x_p = trans_from_vocab(src_vocab, perturbed_x_ids)
            # print(trans_x)
            # print(trans_x_p)
            for i in range(len(padded_src)):
                src = [label for label in padded_src[i] if label != PAD]
                perturbed_src = [label for label in perturbed_x_ids[i] if label != PAD]
                edit_bleu += [bleu.sentence_bleu(references=[src],
                                                 hypothesis=perturbed_src,
                                                 emulate_multibleu=True)]
            # print("edit_bleu: ", edit_bleu)
            summary_writer.add_scalar("edit_bleu",
                                      scalar_value=sum(edit_bleu) / len(edit_bleu),
                                      global_step=episode_count)
            # output enhanced results to log files
            for i in range(len(perturbed_bleu)):
                if perturbed_bleu[i] > env.origin_bleu[i]:
                    src_f.write(trans_x[i] + "\n")
                    src_pert.write(trans_x_p[i] + "\n")
                    trg_f.write(trg_y[i] + "\n")
                    trans_origin.write(trans_y[i] + "\n")
                    trans_pert.write(trans_y_p[i] + "\n")
            time.sleep(5)
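# NOTE: the per-position update above keeps or replaces token ids with integer masks:
# kept = ids * (1 - a), replacement = candidate * a, summed elementwise. A tiny
# self-contained demo of that arithmetic (toy ids, not repo data):
import torch


def _mask_substitution_demo():
    ids = torch.tensor([11, 12, 13, 14])          # current token ids at one column
    candidates = torch.tensor([91, 92, 93, 94])   # substitution candidates
    actions = torch.tensor([0, 1, 0, 1])          # 1 = replace this position
    perturbed = ids * (1 - actions) + candidates * actions
    assert perturbed.tolist() == [11, 92, 13, 94]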
def test_attack(config_path, save_to, model_name="attacker", shuffle=True, use_gpu=True):
    """
    attack
    :param config_path: (string) attack configs
    :param save_to: (string) saving directories
    :param model_name: (string) for saving names
    :param shuffle: (boolean) for batch scheme, shuffle data set
    :param use_gpu: (boolean) on gpu or not
    :return: attacked sequences
    """
    # initiate
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    attack_configs = configs["attack_configs"]
    attacker_model_configs = configs["attacker_model_configs"]
    attacker_optim_configs = configs["attacker_optimizer_configs"]
    training_configs = configs["training_configs"]
    victim_config_path = attack_configs["victim_configs"]
    victim_model_path = attack_configs["victim_model"]
    with open(victim_config_path.strip()) as v_f:
        print("open victim configs...%s" % victim_config_path)
        victim_configs = yaml.load(v_f, Loader=yaml.FullLoader)
    data_configs = victim_configs["data_configs"]
    # building inputs
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_trg = Vocabulary(**data_configs["vocabularies"][1])
    # parallel data binding
    train_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs['train_data'][0],
                        vocabulary=vocab_src,
                        max_len=data_configs['max_len'][0]),
        TextLineDataset(data_path=data_configs['train_data'][1],
                        vocabulary=vocab_trg,
                        max_len=data_configs['max_len'][1]),
        shuffle=shuffle)
    valid_bitext_dataset = ZipDataset(
        TextLineDataset(data_path=data_configs["valid_data"][0],
                        vocabulary=vocab_src,
                        max_len=data_configs["max_len"][0]),
        TextLineDataset(data_path=data_configs["valid_data"][1],
                        vocabulary=vocab_trg,
                        max_len=data_configs["max_len"][1]),
        shuffle=shuffle)
    train_batch_size = training_configs["batch_size"]
    train_buffer_size = training_configs["buffer_size"]
    training_iterator = DataIterator(dataset=train_bitext_dataset,
                                     batch_size=train_batch_size,
                                     use_bucket=training_configs['use_bucket'],
                                     buffer_size=train_buffer_size,
                                     batching_func=training_configs['batching_key'])
    # valid_iterator is bucketed by length to accelerate decoding (numbering marks the order)
    valid_iterator = DataIterator(dataset=valid_bitext_dataset,
                                  batch_size=training_configs["valid_batch_size"],
                                  use_bucket=True, buffer_size=50000, numbering=True)
    # initiate saver
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(os.path.join(save_to, model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    w2p, w2vocab = load_or_extract_near_vocab(
        config_path=victim_config_path,
        model_path=victim_model_path,
        init_perturb_rate=attack_configs["init_perturb_rate"],
        save_to=os.path.join(save_to, "near_vocab"),
        save_to_full=os.path.join(save_to, "full_near_vocab"),
        top_reserve=12,
        emit_as_id=True)
    # build attacker
    # attacker = Attacker(n_words=vocab_src.max_n_words,
    #                     **attacker_model_configs)
    # if use_gpu:
    #     attacker = attacker.cuda()
    #     CURRENT_DEVICE = "cuda"
    # else:
    #     CURRENT_DEVICE = "cpu"
    # load embedding from trained NMT models
    # load_embedding(attacker, model_path=victim_model_path, device=CURRENT_DEVICE)
    # attacker.eval()
    # for i in range(6):
    train_iter = training_iterator.build_generator()
    batch = next(train_iter)
    print(batch[1][3])
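# NOTE: `load_or_extract_near_vocab` above is assumed to map each word id to nearby
# word ids in the victim model's embedding space. A minimal sketch under that
# assumption (cosine similarity + top-k over an embedding matrix); the helper below is
# illustrative, not the repo's implementation, and a real vocabulary would need to be
# processed in chunks rather than as a dense [V, V] similarity matrix.
import torch


def near_vocab_sketch(embedding: torch.Tensor, top_k: int = 12) -> dict:
    """Return {word_id: [top_k nearest word_ids]} by cosine similarity."""
    normed = torch.nn.functional.normalize(embedding, dim=-1)  # [V, d]
    sim = normed @ normed.t()                                  # [V, V]
    # take k+1 neighbors, then drop column 0 (each row's best match is itself)
    _, idx = sim.topk(top_k + 1, dim=-1)
    return {w: idx[w, 1:].tolist() for w in range(embedding.size(0))}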
def interactive_FBS(FLAGS):
    patience = FLAGS.try_times
    GlobalNames.USE_GPU = FLAGS.use_gpu
    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f, Loader=yaml.FullLoader)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    timer = Timer()
    # =================================================================================== #
    # Load data
    INFO('Loading data...')
    timer.tic()
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])
    valid_dataset = TextLineDataset(data_path=FLAGS.source_path, vocabulary=vocab_src)
    valid_iterator = DataIterator(dataset=valid_dataset,
                                  batch_size=FLAGS.batch_size,
                                  use_bucket=True, buffer_size=100000,
                                  numbering=True)
    valid_ref = []
    with open(FLAGS.ref_path) as f:
        for sent in f:
            valid_ref.append(vocab_tgt.sent2ids(sent))
    INFO('Done. Elapsed time {0}'.format(timer.toc()))
    # =================================================================================== #
    # Build Model & Sampler & Validation
    INFO('Building model...')
    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])
    INFO(critic)
    # Move to GPU
    if GlobalNames.USE_GPU:
        critic = critic.cuda()
    timer.tic()
    fw_nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                               n_tgt_vocab=vocab_tgt.max_n_words,
                               **model_configs)
    # bw_nmt_model = None
    bw_nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                               n_tgt_vocab=vocab_tgt.max_n_words,
                               **model_configs)
    fw_nmt_model.eval()
    bw_nmt_model.eval()
    INFO('Done. Elapsed time {0}'.format(timer.toc()))
    INFO('Reloading model parameters...')
    timer.tic()
    fw_params = load_model_parameters(FLAGS.fw_model_path, map_location="cpu")
    bw_params = load_model_parameters(FLAGS.bw_model_path, map_location="cpu")
    fw_nmt_model.load_state_dict(fw_params)
    bw_nmt_model.load_state_dict(bw_params)
    if GlobalNames.USE_GPU:
        fw_nmt_model.cuda()
        bw_nmt_model.cuda()
    INFO('Done. Elapsed time {0}'.format(timer.toc()))
    INFO('Begin...')
    timer.tic()
    result_numbers = []
    result = []
    n_words = 0
    imt_numbers = []
    imt_result = []
    imt_n_words = 0
    imt_constrains = [[] for ii in range(FLAGS.imt_step)]
    infer_progress_bar = tqdm(total=len(valid_iterator), desc=' - (Infer)', unit='sents')
    valid_iter = valid_iterator.build_generator()
    for batch in valid_iter:
        batch_result = []
        batch_numbers = []
        numbers, seqs_x = batch
        batch_size_t = len(seqs_x)
        x = prepare_data(seqs_x=seqs_x, cuda=GlobalNames.USE_GPU)
        with torch.no_grad():
            word_ids = beam_search(nmt_model=fw_nmt_model, beam_size=FLAGS.beam_size,
                                   max_steps=FLAGS.max_steps, src_seqs=x, alpha=FLAGS.alpha)
        word_ids = word_ids.cpu().numpy().tolist()
        for sent_t in word_ids:
            sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
            result.append(sent_t)
            batch_result.append(sent_t[0])
            n_words += len(sent_t[0])
        result_numbers += numbers
        imt_numbers += numbers
        batch_numbers += numbers
        batch_ref = [valid_ref[ii] for ii in batch_numbers]
        last_sents = copy.deepcopy(batch_result)
        constrains = [[[] for ii in range(patience)] for jj in range(batch_size_t)]
        positions = [[[] for ii in range(patience)] for jj in range(batch_size_t)]
        for idx in range(FLAGS.imt_step):
            cons, pos = sample_constrains(last_sents, batch_ref, patience)
            for ii in range(batch_size_t):
                for jj in range(patience):
                    constrains[ii][jj].append(cons[ii][jj])
                    positions[ii][jj].append(pos[ii][jj])
            # print(positions)
            imt_constrains[idx].append([vocab_tgt.ids2sent(c) for c in cons])
            bidirection = False
            if FLAGS.bidirection:
                bidirection = True
            with torch.no_grad():
                constrained_word_ids, positions = fixwords_beam_search(
                    fw_nmt_model=fw_nmt_model, bw_nmt_model=bw_nmt_model,
                    beam_size=FLAGS.beam_size, max_steps=FLAGS.max_steps,
                    src_seqs=x, alpha=FLAGS.alpha,
                    constrains=constrains, positions=positions,
                    last_sentences=last_sents, imt_step=idx + 1,
                    bidirection=bidirection)
            constrained_word_ids = constrained_word_ids.cpu().numpy().tolist()
            last_sents = []
            for i, sent_t in enumerate(constrained_word_ids):
                sent_t = [[wid for wid in line if wid != PAD] for line in sent_t]
                if idx == FLAGS.imt_step - 1:
                    imt_result.append(copy.deepcopy(sent_t))
                    imt_n_words += len(sent_t[0])
                samples = []
                for trans in sent_t:
                    sample = []
                    for w in trans:
                        if w == vocab_tgt.EOS:
                            break
                        sample.append(w)
                    samples.append(sample)
                sent_t = []
                for ii in range(len(samples)):
                    if ii % FLAGS.beam_size == 0:
                        sent_t.append(samples[ii])
                BLEU = []
                for sample in sent_t:
                    bleu, _ = bleuScore(sample, batch_ref[i])
                    BLEU.append(bleu)
                # print("BLEU: ", BLEU)
                order = np.argsort(BLEU).tolist()
                order = order[::-1]
                # print("order: ", order)
                sent_t = [sent_t[ii] for ii in order]
                last_sents.append(sent_t[0])
            if FLAGS.online_learning and idx == FLAGS.imt_step - 1:
                seqs_y = []
                for sent in last_sents:
                    sent = [BOS] + sent
                    seqs_y.append(sent)
                compute_forward(fw_nmt_model, critic, x, torch.Tensor(seqs_y).long().cuda())
                seqs_y = [sent[::-1] for sent in seqs_y]
                for ii in range(len(seqs_y)):
                    seqs_y[ii][0] = BOS
                    seqs_y[ii][-1] = EOS
                compute_forward(bw_nmt_model, critic, x, torch.Tensor(seqs_y).long().cuda())
        infer_progress_bar.update(batch_size_t)
    infer_progress_bar.close()
    INFO('Done. Speed: {0:.2f} words/sec'.format(n_words / (timer.toc(return_seconds=True))))
    translation = []
    for sent in result:
        samples = []
        for trans in sent:
            sample = []
            for w in trans:
                if w == vocab_tgt.EOS:
                    break
                sample.append(vocab_tgt.id2token(w))
            samples.append(vocab_tgt.tokenizer.detokenize(sample))
        translation.append(samples)
    origin_order = np.argsort(result_numbers).tolist()
    translation = [translation[ii] for ii in origin_order]
    keep_n = FLAGS.beam_size if FLAGS.keep_n <= 0 else min(FLAGS.beam_size, FLAGS.keep_n)
    outputs = ['%s.%d' % (FLAGS.saveto, i) for i in range(keep_n)]
    with batch_open(outputs, 'w') as handles:
        for trans in translation:
            for i in range(keep_n):
                if i < len(trans):
                    handles[i].write('%s\n' % trans[i])
                else:
                    handles[i].write('%s\n' % 'eos')
    imt_translation = []
    for sent in imt_result:
        samples = []
        for trans in sent:
            sample = []
            for w in trans:
                if w == vocab_tgt.EOS:
                    break
                sample.append(w)
            samples.append(sample)
        imt_translation.append(samples)
    origin_order = np.argsort(imt_numbers).tolist()
    imt_translation = [imt_translation[ii] for ii in origin_order]
    for idx in range(FLAGS.imt_step):
        imt_constrains[idx] = [' '.join(imt_constrains[idx][ii]) + '\n' for ii in origin_order]
        with open('%s.cons%d' % (FLAGS.saveto, idx), 'w') as f:
            f.writelines(imt_constrains[idx])
    bleu_translation = []
    for idx, sent in enumerate(imt_translation):
        samples = []
        for ii in range(len(sent)):
            if ii % FLAGS.beam_size == 0:
                samples.append(sent[ii])
        BLEU = []
        for sample in samples:
            bleu, _ = bleuScore(sample, valid_ref[idx])
            BLEU.append(bleu)
        # print("BLEU: ", BLEU)
        order = np.argsort(BLEU).tolist()
        order = order[::-1]
        # print("order: ", order)
        samples = [vocab_tgt.ids2sent(samples[ii]) for ii in order]
        bleu_translation.append(samples)
    # keep_n = FLAGS.beam_size * patience if FLAGS.keep_n <= 0 else min(FLAGS.beam_size * patience, FLAGS.keep_n)
    keep_n = patience
    outputs = ['%s.imt%d' % (FLAGS.saveto, i) for i in range(keep_n)]
    with batch_open(outputs, 'w') as handles:
        for trans in bleu_translation:
            for i in range(keep_n):
                if i < len(trans):
                    handles[i].write('%s\n' % trans[i])
                else:
                    handles[i].write('%s\n' % 'eos')
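# NOTE: the output blocks above rely on the `numbering=True` iterator to undo bucket
# shuffling: np.argsort over the recorded sentence numbers yields the permutation that
# restores input order. A tiny self-contained demo (toy data, not repo output):
import numpy as np


def _restore_order_demo():
    numbers = [2, 0, 3, 1]                        # order in which sentences were decoded
    decoded = ["s2", "s0", "s3", "s1"]            # outputs in decoding order
    origin_order = np.argsort(numbers).tolist()   # [1, 3, 0, 2]
    restored = [decoded[ii] for ii in origin_order]
    assert restored == ["s0", "s1", "s2", "s3"]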