def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Starting seq2seq_attention in %s mode...', (args.mode))

    args.model_path = os.path.join(args.model_path, args.exp_name)
    if not os.path.exists(args.model_path):
        if args.mode == "train":
            os.makedirs(args.model_path)
        else:
            raise Exception(
                "Logdir %s doesn't exist. Run in train mode to create it." % (args.model_path))

    # Load the source-side vocabulary.
    src_vocab = utils.Vocab(args.src_vocab_path, args.src_vocab_size)
    # Load the target-side vocabulary.
    tgt_vocab = utils.Vocab(args.tgt_vocab_path, args.tgt_vocab_size)
    # Batch the dataset; the batcher also feeds batches through a concurrent input queue.
    batcher = Batcher(args.data_path, src_vocab, tgt_vocab, args)

    if args.model == "vanilla":
        model_class = VanillaSeq2seqModel
    elif args.model == "sep_dec":
        model_class = SeparateDecoderModel
    elif args.model == "shd_dec":
        model_class = SharedDecoderModel

    tf.set_random_seed(111)

    if args.mode == 'train':
        model = model_class(args, src_vocab, tgt_vocab)
        setup_training(model, batcher)
    elif args.mode == 'eval':
        model = model_class(args, src_vocab, tgt_vocab)
        run_eval(model, batcher, args.ckpt_id)
    elif args.mode == "decode":
        args.batch_size = args.beam_size
        args.arg_max_dec_steps = 1
        args.kp_max_dec_steps = 1
        model = model_class(args, src_vocab, tgt_vocab)
        decoder = BeamSearchDecoder(model, batcher, src_vocab, tgt_vocab, args.ckpt_id)
        decoder.decode()
    else:
        raise ValueError("The 'mode' flag must be one of train/eval/decode")
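# The driver above reads everything from a module-level `args` namespace. A minimal
# argparse sketch that could supply the main attributes it touches is shown below;
# the default values are illustrative assumptions, not the project's actual settings.
import argparse

def build_arg_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'eval', 'decode'], default='train')
    parser.add_argument('--model', choices=['vanilla', 'sep_dec', 'shd_dec'], default='vanilla')
    parser.add_argument('--exp_name', default='exp_1')
    parser.add_argument('--model_path', default='log')
    parser.add_argument('--data_path', default='data/train.bin')
    parser.add_argument('--src_vocab_path', default='data/src_vocab')
    parser.add_argument('--tgt_vocab_path', default='data/tgt_vocab')
    parser.add_argument('--src_vocab_size', type=int, default=50000)
    parser.add_argument('--tgt_vocab_size', type=int, default=50000)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--beam_size', type=int, default=4)
    parser.add_argument('--ckpt_id', default=None)
    return parser

# args = build_arg_parser().parse_args()
# main()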
class Train:
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, moving_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': moving_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.do_coverage else config.lr
        self.optimizer = Adagrad(
            params, lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            # Training switches to the coverage objective at a certain epoch, which
            # requires a fresh optimizer state; only restore the saved optimizer state
            # while coverage is still disabled.
            if not config.do_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, context_v, coverage = \
            get_encoder_variables(batch, use_cuda)
        # dec_lens_var: lengths of the decoder target sequences in this batch.
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_decoder_variables(batch, use_cuda)

        self.optimizer.zero_grad()

        if 0 in enc_lens:
            print('=================')
            print(enc_batch.shape)
            print(enc_lens)
            print(enc_batch)
            print('=================')

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        d_hc = self.model.reduce_state(encoder_hidden)  # initial decoder (h, c)

        step_losses = []
        # for step in tqdm.tqdm(range(min(max_dec_len, config.max_dec_steps))):
        for step in range(min(max_dec_len, config.max_dec_steps)):
            d_inp = dec_batch[:, step]  # Teacher forcing
            final_dist, d_hc, context_v, attn_dist, p_gen, next_coverage = self.model.decoder(
                d_inp, d_hc, encoder_outputs, encoder_feature, enc_padding_mask,
                context_v, extra_zeros, enc_batch_extend_vocab, coverage, step)
            target = target_batch[:, step]
            # Gather the predicted probability of each step's target id.
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.do_coverage:
                # Coverage loss: overlap between the current attention distribution and
                # the accumulated attention over the encoder, as in the original paper.
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, step]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, moving_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        pbar = tqdm.tqdm(total=n_iters)
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            moving_avg_loss = calc_moving_avg_loss(loss, moving_avg_loss,
                                                   self.summary_writer, iter)
            iter += 1
            pbar.update(1)

            if iter % 100 == 0:
                self.summary_writer.flush()

            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (
                    iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(moving_avg_loss, iter)
        pbar.close()
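# Minimal usage sketch for the trainer above. config.max_iterations is a hypothetical
# setting standing in for the project's real iteration budget; an optional checkpoint
# path can be passed as model_file_path to resume training.
if __name__ == '__main__':
    trainer = Train()
    trainer.trainIters(config.max_iterations, model_file_path=None)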
def main():
    utils.print_config(args)
    if 'train' not in args.mode:
        args.keep_rate = 1.0
    args.use_pretrain = True if args.use_pretrain == 'True' else False
    args.use_aux_task = True if args.use_aux_task == 'True' else False

    if args.mode == 'lm_train':
        args.model = 'lm'
        args.data_path = "./data/wikitext/wikitext-103/processed_wiki_train.bin"
        args.use_pretrain = False

    args.model_path = os.path.join(args.model_path, args.exp_name).format(
        args.model)  # model_path default="data/log/{}"
    if not os.path.exists(args.model_path):
        if 'train' not in args.mode:
            print(args.model_path)
            raise ValueError
        os.makedirs(args.model_path)
    with open(os.path.join(args.model_path, 'config.json'), 'w', encoding='utf8') as f:
        json.dump(vars(args), f)
    print("Default models path: {}".format(args.model_path))

    print('code start/ {} mode / {} models'.format(args.mode, args.model))
    utils.assign_specific_gpu(args.gpu_nums)

    vocab = utils.Vocab()

    vardicts = utils.get_pretrain_weights(
        args.true_pretrain_ckpt_path
    ) if args.use_pretrain and args.mode == 'train' else None

    if args.mode == 'decode':
        if args.model == 'mmi_bidi':
            args.beam_size = args.mmi_bsize
        args.batch_size = args.beam_size

    modelhps = deepcopy(args)
    if modelhps.mode == 'decode':
        modelhps.max_dec_len = 1

    if args.model == 'vanilla':
        model = BaseModel(vocab, modelhps)
    elif args.model == 'mmi_bidi':
        if args.mode == 'decode':
            bw_graph = tf.Graph()
            with bw_graph.as_default():
                bw_model = BaseModel(vocab, args)
                bw_sess = tf.Session(graph=bw_graph, config=utils.gpu_config())
                with bw_sess.as_default():
                    with bw_graph.as_default():
                        bidi_ckpt_path = utils.load_ckpt(bw_model.hps, bw_model.saver, bw_sess)

            fw_graph = tf.Graph()
            with fw_graph.as_default():
                modelhps.model_path = modelhps.model_path.replace('mmi_bidi', 'vanilla')
                modelhps.model = 'vanilla'
                fw_model = BaseModel(vocab, modelhps)
                fw_sess = tf.Session(graph=fw_graph)
                with fw_sess.as_default():
                    with fw_graph.as_default():
                        ckpt_path = utils.load_ckpt(fw_model.hps, fw_model.saver, fw_sess)
        else:
            model = BaseModel(vocab, modelhps)
    elif args.model == 'lm':
        model = LMModel(vocab, modelhps)
    elif args.model == 'embmin':
        model = DiverEmbMin(vocab, modelhps)
    else:
        raise ValueError
    print('models load end')

    if args.mode in ['train', 'lm_train']:
        train(model, vocab, vardicts)
    elif args.mode == 'decode':
        import time
        if args.model == 'mmi_bidi':
            batcher = Batcher(
                vocab, bw_model.hps.data_path.replace('train_', 'test_'), args)
            decoder = BeamsearchDecoder(fw_model, batcher, vocab,
                                        fw_sess=fw_sess,
                                        bw_model=bw_model,
                                        bw_sess=bw_sess,
                                        bidi_ckpt_path=bidi_ckpt_path)
        else:
            batcher = Batcher(vocab, model.hps.data_path.replace('train_', 'test_'), args)
            decoder = BeamsearchDecoder(model, batcher, vocab)
        decoder.decode()
    elif args.mode == 'eval':
        pass
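# Why two graphs above: in 'mmi_bidi' decode mode the forward (vanilla) model proposes
# beam candidates and the backward model rescores them, in the spirit of bidirectional
# MMI reranking. The sketch below only illustrates that scoring idea; the callables
# fw_log_prob / bw_log_prob and the weight mmi_lambda are assumptions, not the actual
# BeamsearchDecoder API.
def mmi_bidi_rerank(candidates, fw_log_prob, bw_log_prob, mmi_lambda=0.5):
    """Return candidates sorted by log p_fw(y|x) + mmi_lambda * log p_bw(x|y)."""
    scored = [(fw_log_prob(y) + mmi_lambda * bw_log_prob(y), y) for y in candidates]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [y for _, y in scored]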
class Evaluate(object):
    """Computes the loss on the eval data."""

    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)

    def eval_one_batch(self, batch):
        """Like train_one_batch, but without back propagation."""
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, context_v, coverage = \
            get_encoder_variables(batch, use_cuda)
        # dec_lens_var: lengths of the decoder target sequences in this batch.
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_decoder_variables(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        d_hc = self.model.reduce_state(encoder_hidden)  # initial decoder (h, c)

        step_losses = []
        for step in range(min(max_dec_len, config.max_dec_steps)):
            d_inp = dec_batch[:, step]  # Teacher forcing
            final_dist, d_hc, context_v, attn_dist, p_gen, next_coverage = self.model.decoder(
                d_inp, d_hc, encoder_outputs, encoder_feature, enc_padding_mask,
                context_v, extra_zeros, enc_batch_extend_vocab, coverage, step)
            target = target_batch[:, step]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.do_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, step]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.item()

    def run_eval(self):
        moving_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)
            moving_avg_loss = calc_moving_avg_loss(loss, moving_avg_loss,
                                                   self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (
                    iter, print_interval, time.time() - start, moving_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()
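# Minimal usage sketch for the evaluator above, assuming the checkpoint path (a file
# written by Train.save_model) is passed as the first command-line argument.
import sys

if __name__ == '__main__':
    evaluator = Evaluate(sys.argv[1])
    evaluator.run_eval()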
def train(model, vocab, pretrain_vardicts=None):
    train_data_loader = Batcher(vocab, model.hps.data_path, args)
    valid_data_loader = Batcher(vocab, model.hps.data_path.replace('train_', 'dev_'), args)
    if model.hps.mode == 'lm_train':
        valid_data_loader = Batcher(
            vocab, model.hps.data_path.replace('train_', 'valid_'), args)

    with tf.Session(config=utils.gpu_config()) as sess:
        train_logdir, dev_logdir = os.path.join(args.model_path, 'logdir/train'), os.path.join(
            args.model_path, 'logdir/dev')
        train_savedir = os.path.join(args.model_path, 'train/')
        print("[*] Train save directory is: {}".format(train_savedir))
        if not os.path.exists(train_logdir):
            os.makedirs(train_logdir)
        if not os.path.exists(dev_logdir):
            os.makedirs(dev_logdir)
        if not os.path.exists(train_savedir):
            os.makedirs(train_savedir)

        summary_writer1 = tf.summary.FileWriter(train_logdir, sess.graph)
        summary_writer2 = tf.summary.FileWriter(dev_logdir, sess.graph)

        # Initialize with pretrained variables where available.
        if model.hps.use_pretrain:
            assign_ops, uninitialized_varlist = utils.assign_pretrain_weights(pretrain_vardicts)
            sess.run(assign_ops)
            sess.run(tf.initialize_variables(uninitialized_varlist))
        else:
            sess.run(tf.global_variables_initializer())

        posterior = [0 for _ in range(model.hps.matrix_num)]
        prior = [0 for _ in range(model.hps.matrix_num)]
        step = 0
        while True:  # 6978 samples per epoch
            beg_time = time()
            batch = train_data_loader.next_batch()
            sample_per_epoch = 857899 if 'lm' in model.hps.mode else 6978
            if model.hps.mode == 'lm_train':
                res = model.run_step(batch, sess, is_train=True)
            else:
                res = model.run_step(
                    batch, sess, is_train=True,
                    freeze_layer=(model.hps.use_pretrain and
                                  step < sample_per_epoch / model.hps.batch_size))
            loss, summaries, step = res['loss'], res['summaries'], res['global_step']

            if model.hps.model == 'posterior':
                gumbel = res['posterior']
                gumbel_prior = res['prior']
                selected = np.argsort(-gumbel)
                selected_poste = [int(el[0]) for el in selected]
                selected_prior = [int(el[0]) for el in np.argsort(-gumbel_prior)]
                posterior = [el1 + el2 for el1, el2 in zip(posterior, selected_poste)]
                prior = [el1 + el2 for el1, el2 in zip(prior, selected_prior)]
                print("prior: {} posterior: {}".format(prior, posterior))
            elif model.hps.model == 'embmin':
                dist = res['selected_emb_idx']
                for tmp in dist:
                    prior[tmp] += 1
                print(prior)

            end_time = time()
            print("{} epoch, {} step, {}sec, {} loss".format(
                int(step * model.hps.batch_size / sample_per_epoch), step,
                round(end_time - beg_time, 3), round(loss, 3)))
            summary_writer1.add_summary(summaries, step)

            if step % 5 == 0:
                dev_batch = valid_data_loader.next_batch()
                res = model.run_step(dev_batch, sess, is_train=False)
                loss, summaries, step = res['loss'], res['summaries'], res['global_step']
                assert step % 5 == 0
                print("[VALID] {} loss".format(round(loss, 3)))
                summary_writer2.add_summary(summaries, step)

            if step == 10 or step % 2000 == 0:
                model.saver.save(sess, train_savedir, global_step=step)
            if int(step * model.hps.batch_size / sample_per_epoch) > model.hps.max_epoch:
                model.saver.save(sess, train_savedir, global_step=step)
                print("training end")
                break
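# The pretrained-weight path above relies on utils.assign_pretrain_weights, which is not
# shown here. The sketch below is only an assumption about its contract: it would return
# assign ops for variables found in the checkpoint dict, plus the list of variables that
# still need normal initialization.
import tensorflow as tf

def assign_pretrain_weights_sketch(pretrain_vardicts):
    assign_ops, uninitialized = [], []
    for var in tf.global_variables():
        name = var.name.split(':')[0]  # strip the ':0' suffix from the variable name
        if name in pretrain_vardicts and pretrain_vardicts[name].shape == tuple(var.shape.as_list()):
            assign_ops.append(tf.assign(var, pretrain_vardicts[name]))
        else:
            uninitialized.append(var)
    return assign_ops, uninitialized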