def train(args):
    if not exists(args.path):
        os.makedirs(args.path)

    # make net
    agent, agent_vocab, abstractor, net_args = configure_net(
        args.abs_dir, args.ext_dir, args.cuda)

    # configure training setting
    assert args.stop > 0
    train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch,
        args.gamma, args.reward, args.stop, 'rouge-1'
    )
    train_batcher, val_batcher = build_batchers(args.batch)
    # TODO different reward
    reward_fn = compute_rouge_l
    stop_reward_fn = compute_rouge_n(n=1)

    # save abstractor binary
    if args.abs_dir is not None:
        abs_ckpt = {}
        abs_ckpt['state_dict'] = load_best_ckpt(args.abs_dir)
        abs_vocab = pkl.load(open(join(args.abs_dir, 'vocab.pkl'), 'rb'))
        abs_dir = join(args.path, 'abstractor')
        os.makedirs(join(abs_dir, 'ckpt'))
        with open(join(abs_dir, 'meta.json'), 'w') as f:
            json.dump(net_args['abstractor'], f, indent=4)
        torch.save(abs_ckpt, join(abs_dir, 'ckpt/ckpt-0-0'))
        with open(join(abs_dir, 'vocab.pkl'), 'wb') as f:
            pkl.dump(abs_vocab, f)

    # save configuration
    meta = {}
    meta['net'] = 'rnn-ext_abs_rl'
    meta['net_args'] = net_args
    meta['train_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)
    with open(join(args.path, 'agent_vocab.pkl'), 'wb') as f:
        pkl.dump(agent_vocab, f)

    # prepare trainer
    grad_fn = get_grad_fn(agent, args.clip)
    optimizer = optim.Adam(agent.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'max', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    pipeline = A2CPipeline(meta['net'], agent, abstractor,
                           train_batcher, val_batcher,
                           optimizer, grad_fn,
                           reward_fn, args.gamma,
                           stop_reward_fn, args.stop)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler,
                           val_mode='score')

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
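# A minimal sketch of the CLI block that usually accompanies a train() entry point
# like the one above. The attribute names mirror exactly what train() reads
# (args.path, args.abs_dir, args.ext_dir, args.cuda, args.lr, args.clip, args.decay,
# args.batch, args.gamma, args.reward, args.stop, args.ckpt_freq, args.patience,
# args.lr_p); the flag defaults are illustrative assumptions, not the project's
# actual values.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='RL training of the extractor-abstractor agent (sketch)')
    parser.add_argument('--path', required=True,
                        help='directory for checkpoints and meta.json')
    parser.add_argument('--abs_dir', default=None,
                        help='pretrained abstractor directory')
    parser.add_argument('--ext_dir', required=True,
                        help='pretrained extractor directory')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--clip', type=float, default=2.0)
    parser.add_argument('--decay', type=float, default=0.5)
    parser.add_argument('--batch', type=int, default=32)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--reward', default='rouge-l')
    parser.add_argument('--stop', type=float, default=1.0)
    parser.add_argument('--ckpt_freq', type=int, default=3000)
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--lr_p', type=int, default=0)
    train(parser.parse_args())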
def main(args):
    assert args.net_type in ['ff', 'rnn']
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(args.net_type, word2id,
                                                args.cuda, args.debug)

    # make net
    net, net_args = configure_net(args.net_type, len(word2id), args.emb_dim,
                                  args.conv_hidden, args.lstm_hidden,
                                  args.lstm_layer, args.bi)
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        args.net_type, 'adam', args.lr, args.clip, args.decay, args.batch
    )

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'ml_{}_extractor'.format(args.net_type)
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
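# get_basic_grad_fn and basic_validate are referenced throughout these trainers but
# not defined in this section. As an illustration of the gradient-clipping closure
# such a helper typically provides (an assumption about this codebase, written with
# only standard PyTorch calls; the name carries a _sketch suffix to avoid implying
# it is the project's implementation):
from torch.nn.utils import clip_grad_norm_


def get_basic_grad_fn_sketch(net, clip_grad):
    """Return a closure that clips gradients in place and reports the pre-clip norm."""
    def grad_fn():
        params = [p for p in net.parameters() if p.requires_grad]
        grad_norm = clip_grad_norm_(params, clip_grad)
        return {'grad_norm': float(grad_norm)}
    return grad_fn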
def main(args):
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(word2id, args.cuda, args.debug)

    # make net
    # sanity-check the vocabulary indices before building the model
    print('vocab size:', len(word2id))
    ids = [i for _, i in word2id.items()]
    print(max(ids))
    print(min(ids))
    net, net_args = configure_net(len(word2id), args.emb_dim, args.n_hidden,
                                  args.bi, args.n_layer, args.load_from)

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    if args.cuda:
        net = net.cuda()
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.AdamW(net.parameters(), **train_params['optimizer'][1])
    # optimizer = optim.Adagrad(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    trainer.train()
def train(args):
    if not exists(args.path):
        os.makedirs(args.path)

    # make net
    if args.docgraph or args.paragraph:
        agent, agent_vocab, abstractor, net_args = configure_net_graph(
            args.abs_dir, args.ext_dir, args.cuda,
            args.docgraph, args.paragraph)
    else:
        agent, agent_vocab, abstractor, net_args = configure_net(
            args.abs_dir, args.ext_dir, args.cuda, True, False, args.rl_dir)
    if args.bert_stride > 0:
        assert args.bert_stride == agent._bert_stride

    # configure training setting
    assert args.stop > 0
    train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch,
        args.gamma, args.reward, args.stop, 'rouge-1')

    if args.docgraph or args.paragraph:
        if args.bert:
            train_batcher, val_batcher = build_batchers_graph_bert(
                args.batch, args.key, args.adj_type,
                args.max_bert_word, args.docgraph, args.paragraph)
        else:
            train_batcher, val_batcher = build_batchers_graph(
                args.batch, args.key, args.adj_type,
                args.gold_key, args.docgraph, args.paragraph)
    elif args.bert:
        train_batcher, val_batcher = build_batchers_bert(
            args.batch, args.bert_sent, args.bert_stride, args.max_bert_word)
    else:
        train_batcher, val_batcher = build_batchers(args.batch)

    # TODO different reward
    if args.reward == 'rouge-l':
        reward_fn = compute_rouge_l
    elif args.reward == 'rouge-1':
        reward_fn = compute_rouge_n(n=1)
    elif args.reward == 'rouge-2':
        reward_fn = compute_rouge_n(n=2)
    elif args.reward == 'rouge-l-s':
        reward_fn = compute_rouge_l_summ
    else:
        raise ValueError('unsupported reward: {}'.format(args.reward))
    stop_reward_fn = compute_rouge_n(n=1)

    # save abstractor binary
    if args.abs_dir is not None:
        abs_ckpt = {}
        abs_ckpt['state_dict'] = load_best_ckpt(args.abs_dir, reverse=True)
        abs_vocab = pkl.load(open(join(args.abs_dir, 'vocab.pkl'), 'rb'))
        abs_dir = join(args.path, 'abstractor')
        os.makedirs(join(abs_dir, 'ckpt'))
        with open(join(abs_dir, 'meta.json'), 'w') as f:
            json.dump(net_args['abstractor'], f, indent=4)
        torch.save(abs_ckpt, join(abs_dir, 'ckpt/ckpt-0-0'))
        with open(join(abs_dir, 'vocab.pkl'), 'wb') as f:
            pkl.dump(abs_vocab, f)

    # save configuration
    meta = {}
    meta['net'] = 'rnn-ext_abs_rl'
    meta['net_args'] = net_args
    meta['train_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)
    with open(join(args.path, 'agent_vocab.pkl'), 'wb') as f:
        pkl.dump(agent_vocab, f)

    # prepare trainer
    grad_fn = get_grad_fn(agent, args.clip)
    optimizer = optim.Adam(agent.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'max', verbose=True,
                                  factor=args.decay, min_lr=1e-5,
                                  patience=args.lr_p)

    entity = bool(args.docgraph or args.paragraph)
    pipeline = SCPipeline(meta['net'], agent, abstractor,
                          train_batcher, val_batcher,
                          optimizer, grad_fn, reward_fn,
                          entity, args.bert)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler,
                           val_mode='score')

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
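# In the reward selection above, compute_rouge_n(n=1) is called with only n and the
# result is later used as reward_fn(output, reference), so the helper is evidently
# curried / partially applicable. A minimal sketch of that pattern with a simple,
# self-contained n-gram F1 scorer; the names below are illustrative and not the
# project's actual ROUGE implementation.
from collections import Counter
from functools import partial


def _ngrams(tokens, n):
    """Multiset of n-grams of a token list."""
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def _rouge_n_f1(output, reference, n=1):
    """Plain ROUGE-N F1 over two token lists (illustrative scorer)."""
    out_ng, ref_ng = _ngrams(output, n), _ngrams(reference, n)
    overlap = sum((out_ng & ref_ng).values())
    if overlap == 0:
        return 0.0
    precision = overlap / max(sum(out_ng.values()), 1)
    recall = overlap / max(sum(ref_ng.values()), 1)
    return 2 * precision * recall / (precision + recall)


def make_rouge_n_reward(n):
    """Mimic the compute_rouge_n(n=...) factory pattern via partial application."""
    return partial(_rouge_n_f1, n=n)


# usage: reward_fn = make_rouge_n_reward(2); score = reward_fn(decoded_tokens, reference_tokens)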
def main(args):
    # create data batcher, vocabulary
    # batcher
    word2id = pkl.load(open(join(args.abs_dir, 'vocab.pkl'), 'rb'))

    # reward func
    reward_func = None
    reward_weight = 0.

    # make net
    net, net_args = configure_net(args.abs_dir)
    bert = net._bert
    train_batcher, val_batcher = build_batchers(word2id, args.cuda, args.debug)

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    if args.cuda:
        net = net.cuda()
    val_fn = rl_validate(net, reward_func=reward_func,
                         reward_coef=reward_weight,
                         _bleu=args.bleu, f1=args.f1)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.AdamW(net.parameters(), **train_params['optimizer'][1])
    # optimizer = optim.Adagrad(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'max', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    pipeline = AbsSelfCriticalPipeline(meta['net'], net,
                                       train_batcher, val_batcher, args.batch,
                                       val_fn, optimizer, grad_fn,
                                       weights=[args.r1, args.r2, args.rl],
                                       _bleu=args.bleu, f1=args.f1)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler,
                           val_mode='score')

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
def main(args):
    # create data batcher, vocabulary
    # batcher
    with open(join(args.data_path, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize, args.max_target_sent)  # word-to-id vocabulary
    train_batcher, val_batcher = build_batchers(word2id, args.cuda, args.debug)

    # make net
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, args.w2v)  # pretrained embedding matrix
        net, net_args = configure_net(len(word2id), args.emb_dim,
                                      args.n_hidden, args.bi, args.n_layer,
                                      args.sampling_teaching_force,
                                      args.self_attn, args.hi_encoder,
                                      embedding)
    else:
        print("please provide a pretrained w2v embedding (args.w2v)")
        return

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)

    # keep the embedding tensor out of the JSON-serializable meta data
    net_args_backup = net_args.copy()
    del net_args_backup["embedding"]

    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args_backup
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
def main(args):
    abstractor = get_abstractor(args.abs_dir, args.beam_search, args.cuda)
    for split in ('train', 'val'):
        decode(args, split)

    embedding = abstractor._net._decoder._embedding
    word2id = abstractor._word2id
    id2words = {i: w for w, i in word2id.items()}

    elmo = None
    if args.elmo:
        elmo = get_elmo(
            dropout=args.elmo_dropout,
            vocab_to_cache=[id2words[i] for i in range(len(id2words))],
            cuda=args.cuda,
            projection_dim=args.elmo_projection)
        args.emb_dim = elmo.get_output_dim()

    meta = {
        'net': '{}_discriminator'.format('cnn'),
        'net_args': {
            'vocab_size': len(abstractor._word2id),
            'emb_dim': embedding.embedding_dim,
            'kernel_num': args.kernel_num,
            'kernel_sizes': args.kernel_sizes,
            'class_num': 2,
            'dropout': args.dropout,
            'max_norm': args.max_norm,
            'static': args.static,
        },
        'training_params': {
            'optimizer': ('adam', {'lr': args.lr}),
            'batch_size': args.batch,
            'clip_grad_norm': args.clip,
            'lr_decay': args.decay,
        }
    }

    net = ConvNet(**meta['net_args'])
    if elmo:
        meta['net_args']['elmo'] = {
            'dropout': args.elmo_dropout,
            'projection': args.elmo_projection,
        }
        net.set_elmo_embedding(elmo)
    else:
        net.set_embedding(embedding.weight)

    train_batcher, val_batcher = build_batchers(args, word2id)

    def criterion(logit, target):
        # per-example loss; reduction='none' replaces the deprecated reduce=False
        return F.cross_entropy(logit, target, reduction='none')

    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, net.parameters()), lr=args.lr)
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline('discriminator', net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
def train(args):
    assert args.encoder in ['BiLSTM', 'DeepLSTM', 'Transformer']
    assert args.decoder in ['SL', 'PN']
    assert args.emb_type in ['W2V', 'BERT']

    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(args.decoder, args.emb_type,
                                                word2id, args.cuda, args.debug)

    # make model
    model, model_args = configure_net(args.encoder, args.decoder,
                                      args.emb_type, len(word2id),
                                      args.emb_dim, args.conv_hidden,
                                      args.encoder_hidden, args.encoder_layer)
    if args.emb_type == 'W2V':
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        w2v_path = './CNNDM/word2vec/word2vec.128d.226k.bin'
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, w2v_path)
        model.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        args.decoder, 'adam', args.lr, args.clip, args.decay, args.batch
    )

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['model_args'] = model_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(model, criterion, args.decoder)
    grad_fn = get_basic_grad_fn(model, args.clip)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=2e-5,
                                  patience=args.lr_p)

    if args.cuda:
        model = model.cuda()
    pipeline = BasicPipeline(model, args.decoder,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    # for name, para in net.named_parameters():
    #     if para.requires_grad:
    #         print(name)

    print('Start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
def main(args):
    # create data batcher, vocabulary
    # batcher
    word2id = pkl.load(open(join(args.abs_dir, 'vocab.pkl'), 'rb'))

    # reward func
    if args.reward_model_dir is not None:
        assert args.reward_data_dir is not None
        reward_func = cloze_reward(args.reward_model_dir, args.cloze_device)
        reward_weight = args.reward_weight
    else:
        reward_func = None
        reward_weight = 0.

    # make net
    if args.docgraph or args.paragraph:
        net, net_args = configure_net_graph(args.abs_dir,
                                            args.docgraph, args.paragraph)
    else:
        net, net_args = configure_net(args.abs_dir)
    bert = net._bert
    if bert:
        print('model uses BERT')
        import logging
        print('disabling transformers tokenization warnings')
        logging.getLogger('transformers.tokenization_utils').setLevel(
            logging.ERROR)
        logging.getLogger('transformers.tokenization_utils').disabled = True

    if args.docgraph or args.paragraph:
        if bert:
            tokenizer = net._bert_model._tokenizer
            train_batcher, val_batcher = build_batchers_graph_bert(
                tokenizer, args.cuda, args.debug, args.key,
                net._adj_type, args.docgraph, args.reward_data_dir)
        else:
            train_batcher, val_batcher = build_batchers_graph(
                word2id, args.cuda, args.debug, args.key,
                net._adj_type, args.docgraph, args.reward_data_dir)
    else:
        if bert:
            tokenizer = net._bert_model._tokenizer
            train_batcher, val_batcher = build_batchers_bert(
                tokenizer, args.cuda, args.debug)
        else:
            train_batcher, val_batcher = build_batchers(
                word2id, args.cuda, args.debug)

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    local_coh_fun = None

    # prepare trainer
    if args.cuda:
        net = net.cuda()
    multigpu = False
    val_fn = rl_validate(net, reward_func=reward_func,
                         reward_coef=reward_weight, bert=bert)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.AdamW(net.parameters(), **train_params['optimizer'][1])
    # optimizer = optim.Adagrad(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'max', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    print('rouge weights:', [args.r1, args.r2, args.rl])
    pipeline = AbsSelfCriticalPipeline(
        meta['net'], net, train_batcher, val_batcher, args.batch,
        val_fn, optimizer, grad_fn, reward_func, reward_weight,
        local_coh_fun, 0.,
        accumulate_g_step=args.accumulate_g_step,
        weights=[args.r1, args.r2, args.rl],
        bert=bert, multigpu=multigpu, ml_loss=args.ml_loss)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler,
                           val_mode='score')

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
def main(args):
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(word2id, args.cuda, args.debug)

    # make net
    net, net_args = configure_net(len(word2id), args.emb_dim,
                                  args.n_hidden, args.bi, args.n_layer)
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, oov = make_embedding(
            {i: w for w, i in word2id.items()}, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)

    # # Print model's state_dict
    # print("Model's state_dict:")
    # for param_tensor in net.state_dict():
    #     print(param_tensor, "\t", net.state_dict()[param_tensor].size())
    #
    # # Print optimizer's state_dict
    # print("Optimizer's state_dict:")
    # for var_name in optimizer.state_dict():
    #     print(var_name, "\t", optimizer.state_dict()[var_name])

    # # IMPORT PRETRAINED MODEL PARAMETERS
    # net.load_state_dict(torch.load(
    #     'pretrained_eng_model/abstractor/ckpt/ckpt-0-0')['state_dict'])
    # net.eval()  # do I need that or not?
    # copy net
    # from copy import deepcopy
    # net_copy = deepcopy(net)
    # net_copy.load_state_dict(torch.load('pretrained_eng_model/abstractor/ckpt/ckpt-0-0',
    #                                     map_location='cpu')['state_dict'])
    # for key in net_copy.state_dict():
    #     print('key: ', key)
    #     param = net_copy.state_dict()[key]
    #     print('param.shape: ', param.shape)
    #     print('param.requires_grad: ', param.requires_grad)
    #     print('param.shape, param.requires_grad: ', param.shape, param.requires_grad)
    #     print('isinstance(param, nn.Module) ', isinstance(param, nn.Module))
    #     print('isinstance(param, nn.Parameter) ', isinstance(param, nn.Parameter))
    #     print('isinstance(param, torch.Tensor): ', isinstance(param, torch.Tensor))
    #     print('=====')

    # save current state dict
    model_dict = net.state_dict()

    # snapshot a few parameter values so we can verify below
    # that the state dict was loaded correctly
    p1 = net._embedding.weight[0][0].detach().cpu().numpy()
    p2 = net._enc_lstm.weight_hh_l0[0][0].detach().cpu().numpy()
    p3 = net._attn_wm.data[0][0].detach().cpu().numpy()
    # print(p1)
    # print(p2)
    # print(p3)

    # load dict from pretrained net
    ABS_DIR = os.environ['ABS']
    print(ABS_DIR)
    # uncomment for gpu
    # pretrained_dict = torch.load(ABS_DIR)['state_dict']
    pretrained_dict = torch.load(ABS_DIR)['state_dict']

    # skip embedding weights
    pretrained_dict = {
        k: v for k, v in pretrained_dict.items() if k != '_embedding.weight'
    }
    # overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    print('Model will be trained on device:')
    print(model_dict['_embedding.weight'].device)
    # load the new state dict
    net.load_state_dict(model_dict)

    # check if the update was correct
    pn1 = net._embedding.weight[0][0].detach().cpu().numpy()
    pn2 = net._enc_lstm.weight_hh_l0[0][0].detach().cpu().numpy()
    pn3 = net._attn_wm.data[0][0].detach().cpu().numpy()
    # print(pn1)
    # print(pn2)
    # print(pn3)
    assert p1 == pn1  # embedding layer has to be the same
    assert p2 != pn2
    assert p3 != pn3
    print('Embedding layer has not been overwritten')

    # report which parameters will be updated
    for name, param in net.named_parameters():
        # param.requires_grad = True
        print(name, param.requires_grad)

    trainer.train()
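# The block above loads a pretrained abstractor checkpoint into the current net while
# deliberately keeping the freshly initialized embedding. A hedged, reusable sketch of
# the same partial-state-dict pattern; the helper name and default skip list are
# illustrative, not part of this codebase, and the {'state_dict': ...} checkpoint
# layout matches the one used in the functions above.
import torch


def load_pretrained_except(net, ckpt_path, skip_keys=('_embedding.weight',),
                           map_location='cpu'):
    """Copy all parameters from a checkpoint into net, except the listed keys."""
    state = net.state_dict()
    pretrained = torch.load(ckpt_path, map_location=map_location)['state_dict']
    pretrained = {k: v for k, v in pretrained.items() if k not in skip_keys}
    state.update(pretrained)       # keep skipped entries as currently initialized
    net.load_state_dict(state)
    return net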
def main(args):
    assert args.net_type in ['ff', 'rnn']
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    id2words = {i: w for w, i in word2id.items()}

    elmo = None
    if args.elmo:
        elmo = get_elmo(
            dropout=args.elmo_dropout,
            vocab_to_cache=[id2words[i] for i in range(len(id2words))],
            cuda=args.cuda,
            projection_dim=args.elmo_projection)
        args.emb_dim = elmo.get_output_dim()

    train_batcher, val_batcher = build_batchers(args.net_type, word2id,
                                                args.cuda, args.debug)

    # make net
    net, net_args = configure_net(args.net_type, len(word2id), args.emb_dim,
                                  args.conv_hidden, args.lstm_hidden,
                                  args.lstm_layer, args.bi)
    if elmo:
        net_args['elmo'] = {
            'dropout': args.elmo_dropout,
            'projection': args.elmo_projection,
        }
        net.set_elmo_embedding(elmo)
    elif args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding(id2words, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        args.net_type, 'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {
        'net': 'ml_{}_extractor'.format(args.net_type),
        'net_args': net_args,
        'training_params': train_params
    }
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  net.parameters()),
                           **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
def main(args):
    # create data batcher, vocabulary
    # batcher
    word2id = pkl.load(open(join(args.abs_dir, 'vocab.pkl'), 'rb'))
    train_batcher, val_batcher = build_batchers(word2id, args.cuda, args.debug)

    # make net
    net, net_args = configure_net(args.abs_dir)

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch
    )

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    if args.coherence or args.all_local:
        print('use coref score')
        word_dict, model, sess = graph_init(model="cnndm")
        coh_func = (model, sess, word_dict)
        print('finished loading coref model')
    else:
        coh_func = None

    assert not (args.anaphora and args.apposition)
    if args.anaphora:
        print('use anaphora')
        local_coh_fun = getAnaphoraReward
    elif args.apposition:
        print('use apposition')
        local_coh_fun = getAppositionReward
    elif args.all_local:
        local_coh_fun = getLocalReward
    else:
        local_coh_fun = None

    # prepare trainer
    if args.cuda:
        net = net.cuda()
    val_fn = rl_validate(net, coherence_func=coh_func, coh_coef=args.coh_coef,
                         local_coh_func=local_coh_fun,
                         local_coh_coef=args.local_coh_coef)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    # optimizer = optim.Adagrad(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'max', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    pipeline = AbsSelfCriticalPipeline(meta['net'], net,
                                       train_batcher, val_batcher, args.batch,
                                       val_fn, optimizer, grad_fn,
                                       coh_func, args.coh_coef,
                                       local_coh_fun, args.local_coh_coef)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler,
                           val_mode='score')

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
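# Several trainers above pair ReduceLROnPlateau(mode='max') with val_mode='score':
# the learning rate is lowered only when the validation *score* (e.g. ROUGE) stops
# improving, whereas the loss-based trainers use mode='min'. A self-contained
# demonstration of that interaction using plain PyTorch (a toy model, not the
# project's BasicTrainer loop):
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

for step, val_score in enumerate([0.30, 0.31, 0.31, 0.31, 0.31]):
    scheduler.step(val_score)                      # pass the metric being maximized
    print(step, optimizer.param_groups[0]['lr'])   # LR halves after `patience` flat steps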