def main(args):
    assert args.net_type in ['ff', 'rnn']
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(args.net_type, word2id,
                                                args.cuda, args.debug)

    # make net
    net, net_args = configure_net(args.net_type,
                                  len(word2id), args.emb_dim, args.conv_hidden,
                                  args.lstm_hidden, args.lstm_layer, args.bi)
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        args.net_type, 'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'ml_{}_extractor'.format(args.net_type)
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
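# --- Hypothetical CLI driver (not part of the original script) ---------------
# A minimal argparse sketch showing how main(args) above could be invoked.
# The flag names mirror the attributes the function reads; every default value
# here is an assumption, not the authors' setting.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='train the ML extractor')
    parser.add_argument('--path', required=True, help='experiment directory')
    parser.add_argument('--net_type', default='rnn', choices=['ff', 'rnn'])
    parser.add_argument('--vsize', type=int, default=30000)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--w2v', help='path to pretrained word2vec vectors')
    parser.add_argument('--conv_hidden', type=int, default=100)
    parser.add_argument('--lstm_hidden', type=int, default=256)
    parser.add_argument('--lstm_layer', type=int, default=1)
    parser.add_argument('--bi', action='store_true')
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--decay', type=float, default=0.5)
    parser.add_argument('--lr_p', type=int, default=0, help='LR-plateau patience')
    parser.add_argument('--clip', type=float, default=2.0)
    parser.add_argument('--batch', type=int, default=32)
    parser.add_argument('--ckpt_freq', type=int, default=3000)
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--cuda', action='store_true')
    main(parser.parse_args())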
def main(args):
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(word2id,
                                                args.cuda, args.debug)

    # make net
    # quick sanity check of the vocabulary and its id range
    print('vocab size:', len(word2id))
    ids = [i for _, i in word2id.items()]
    print(max(ids))
    print(min(ids))
    net, net_args = configure_net(len(word2id), args.emb_dim,
                                  args.n_hidden, args.bi, args.n_layer,
                                  args.load_from)

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    if args.cuda:
        net = net.cuda()
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.AdamW(net.parameters(), **train_params['optimizer'][1])
    # optimizer = optim.Adagrad(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
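# --- Sketch (assumption): a plausible shape for configure_training's output --
# None of the scripts here show configure_training itself; they only unpack
# train_params['optimizer'][1] as optimizer kwargs and hand `criterion` to the
# pipeline. The reconstruction below mirrors the 'training_params' block that
# the discriminator script writes into meta.json; the PAD id of 0 and the exact
# keys are assumptions, not the original implementation.
import torch.nn.functional as F

def configure_training_sketch(opt, lr, clip_grad, lr_decay, batch_size):
    assert opt == 'adam'
    train_params = {
        'optimizer': (opt, {'lr': lr}),   # unpacked as optim.Adam*(..., **kwargs)
        'clip_grad_norm': clip_grad,
        'batch_size': batch_size,
        'lr_decay': lr_decay,
    }

    def criterion(logits, targets):
        # per-token cross entropy; padding (assumed id 0) is ignored
        return F.cross_entropy(logits, targets,
                               ignore_index=0, reduction='none')

    return criterion, train_params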
def main(args):
    # create data batcher, vocabulary
    # batcher
    with open(join(args.data_path, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize, args.max_target_sent)  # word-to-id vocabulary
    train_batcher, val_batcher = build_batchers(word2id,
                                                args.cuda, args.debug)

    # make net
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, args.w2v)  # pretrained embedding matrix
        net, net_args = configure_net(len(word2id), args.emb_dim,
                                      args.n_hidden, args.bi, args.n_layer,
                                      args.sampling_teaching_force,
                                      args.self_attn, args.hi_encoder,
                                      embedding)
    else:
        print("please provide pretrain_w2v")
        return

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)

    # the embedding tensor is not JSON-serializable, so drop it from the saved meta
    net_args_backup = net_args.copy()
    del net_args_backup["embedding"]

    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args_backup
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
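# --- Sketch (assumption): one way make_embedding could be implemented --------
# The scripts only require make_embedding(id2word, w2v_path) to return a
# (vocab_size x emb_dim) float tensor plus the ids missing from the pretrained
# vectors (one script unpacks it as `embedding, oov = ...`). The gensim-based
# loader here is an illustration, not the authors' code.
import torch
from gensim.models import KeyedVectors

def make_embedding_sketch(id2word, w2v_path):
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    embedding = torch.zeros(len(id2word), w2v.vector_size)
    oov = []
    for i, word in id2word.items():
        if word in w2v:
            embedding[i] = torch.tensor(w2v[word])
        else:
            embedding[i].normal_(0, 0.1)  # unseen words keep a trainable random init
            oov.append(i)
    return embedding, oov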
def main(args):
    abstractor = get_abstractor(args.abs_dir, args.beam_search, args.cuda)

    for split in ('train', 'val'):
        decode(args, split)

    embedding = abstractor._net._decoder._embedding
    word2id = abstractor._word2id
    id2words = {i: w for w, i in word2id.items()}

    elmo = None
    if args.elmo:
        elmo = get_elmo(
            dropout=args.elmo_dropout,
            vocab_to_cache=[id2words[i] for i in range(len(id2words))],
            cuda=args.cuda,
            projection_dim=args.elmo_projection)
        args.emb_dim = elmo.get_output_dim()

    meta = {
        'net': '{}_discriminator'.format('cnn'),
        'net_args': {
            'vocab_size': len(abstractor._word2id),
            'emb_dim': embedding.embedding_dim,
            'kernel_num': args.kernel_num,
            'kernel_sizes': args.kernel_sizes,
            'class_num': 2,
            'dropout': args.dropout,
            'max_norm': args.max_norm,
            'static': args.static,
        },
        'training_params': {
            'optimizer': ('adam', {'lr': args.lr}),
            'batch_size': args.batch,
            'clip_grad_norm': args.clip,
            'lr_decay': args.decay,
        }
    }

    net = ConvNet(**meta['net_args'])
    if elmo:
        meta['net_args']['elmo'] = {
            'dropout': args.elmo_dropout,
            'projection': args.elmo_projection,
        }
        net.set_elmo_embedding(elmo)
    else:
        net.set_embedding(embedding.weight)

    train_batcher, val_batcher = build_batchers(args, word2id)

    def criterion(logit, target):
        # per-example loss; reduction='none' replaces the deprecated reduce=False
        return F.cross_entropy(logit, target, reduction='none')

    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, net.parameters()), lr=args.lr)
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline('discriminator', net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
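# --- Sketch (assumption): a plausible basic_validate -------------------------
# Every script wraps the model and criterion as val_fn = basic_validate(net,
# criterion) and lets the pipeline call it on the validation batcher. A loop
# that averages the per-example loss with gradients disabled fits that usage;
# the (fw_args, targets) batch layout is an assumption about the batcher.
import torch

def basic_validate_sketch(net, criterion):
    def val_fn(val_batches):
        net.eval()
        total_loss, n = 0.0, 0
        with torch.no_grad():
            for fw_args, targets in val_batches:
                loss = criterion(net(*fw_args), targets)
                total_loss += loss.sum().item()
                n += loss.numel()
        net.train()
        return {'loss': total_loss / n}
    return val_fn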
def train(args):
    assert args.encoder in ['BiLSTM', 'DeepLSTM', 'Transformer']
    assert args.decoder in ['SL', 'PN']
    assert args.emb_type in ['W2V', 'BERT']

    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(args.decoder, args.emb_type,
                                                word2id, args.cuda, args.debug)

    # make model
    model, model_args = configure_net(args.encoder, args.decoder, args.emb_type,
                                      len(word2id), args.emb_dim,
                                      args.conv_hidden, args.encoder_hidden,
                                      args.encoder_layer)

    if args.emb_type == 'W2V':
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        w2v_path = './CNNDM/word2vec/word2vec.128d.226k.bin'
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, w2v_path)
        model.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        args.decoder, 'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['model_args'] = model_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(model, criterion, args.decoder)
    grad_fn = get_basic_grad_fn(model, args.clip)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=2e-5,
                                  patience=args.lr_p)

    if args.cuda:
        model = model.cuda()
    pipeline = BasicPipeline(model, args.decoder,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    # for name, para in model.named_parameters():
    #     if para.requires_grad:
    #         print(name)

    print('Start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
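# --- Sketch (assumption): what get_basic_grad_fn plausibly does --------------
# Each trainer builds grad_fn = get_basic_grad_fn(model, clip) and hands it to
# the pipeline. A closure that clips the global gradient norm in place and
# reports it for logging satisfies that contract; the warning threshold is an
# arbitrary illustrative choice.
import torch

def get_basic_grad_fn_sketch(model, clip_grad, max_grad=1e2):
    def grad_fn():
        grad_norm = torch.nn.utils.clip_grad_norm_(
            [p for p in model.parameters() if p.requires_grad], clip_grad)
        grad_norm = float(grad_norm)
        if grad_norm >= max_grad:
            print('WARNING: exploding gradient {:.2f}'.format(grad_norm))
        return {'grad_norm': grad_norm}
    return grad_fn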
def main(args):
    # create data batcher, vocabulary
    # batcher
    if args.bert:
        import logging
        logging.basicConfig(level=logging.ERROR)

    if not args.bert:
        with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
            wc = pkl.load(f)
        word2id = make_vocab(wc, args.vsize)

    if not args.gat:
        if args.bert:
            train_batcher, val_batcher, word2id = build_batchers_bert(
                args.cuda, args.debug, args.bertmodel)
        else:
            train_batcher, val_batcher = build_batchers(
                word2id, args.cuda, args.debug)
    else:
        if args.bert:
            train_batcher, val_batcher, word2id = build_batchers_gat_bert(
                args.cuda, args.debug, args.gold_key, args.adj_type,
                args.mask_type, args.topic_flow_model,
                num_worker=args.num_worker, bert_model=args.bertmodel)
        else:
            train_batcher, val_batcher = build_batchers_gat(
                word2id, args.cuda, args.debug, args.gold_key, args.adj_type,
                args.mask_type, args.topic_flow_model,
                num_worker=args.num_worker)

    # make net
    if args.gat:
        _args = {}
        _args['rtoks'] = 1
        _args['graph_hsz'] = args.n_hidden
        _args['blockdrop'] = 0.1
        _args['sparse'] = False
        _args['graph_model'] = 'transformer'
        _args['adj_type'] = args.adj_type

        net, net_args = configure_net_gat(
            len(word2id), args.emb_dim, args.n_hidden, args.bi, args.n_layer,
            args.load_from, gat_args=_args, adj_type=args.adj_type,
            mask_type=args.mask_type, feed_gold=False,
            graph_layer_num=args.graph_layer, feature=args.feat,
            subgraph=args.topic_flow_model,
            hierarchical_attn=args.topic_flow_model,
            bert=args.bert, bert_length=args.max_art)
    else:
        net, net_args = configure_net(len(word2id), args.emb_dim,
                                      args.n_hidden, args.bi, args.n_layer,
                                      args.load_from, args.bert, args.max_art)

    if args.w2v:
        assert not args.bert
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding(
            {i: w for w, i in word2id.items()}, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    if 'soft' in args.mask_type and args.gat:
        criterion, train_params = configure_training_multitask(
            'adam', args.lr, args.clip, args.decay, args.batch,
            args.mask_type, args.bert)
    else:
        criterion, train_params = configure_training(
            'adam', args.lr, args.clip, args.decay, args.batch, args.bert)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    if args.cuda:
        net = net.cuda()

    if 'soft' in args.mask_type and args.gat:
        val_fn = multitask_validate(net, criterion)
    else:
        val_fn = basic_validate(net, criterion)

    grad_fn = get_basic_grad_fn(net, args.clip)
    print(net._embedding.weight.requires_grad)
    optimizer = optim.AdamW(net.parameters(), **train_params['optimizer'][1])
    # optimizer = optim.Adagrad(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if 'soft' in args.mask_type and args.gat:
        pipeline = MultiTaskPipeline(meta['net'], net,
                                     train_batcher, val_batcher, args.batch,
                                     val_fn, criterion, optimizer, grad_fn)
        trainer = MultiTaskTrainer(pipeline, args.path,
                                   args.ckpt_freq, args.patience, scheduler)
    else:
        pipeline = BasicPipeline(meta['net'], net,
                                 train_batcher, val_batcher, args.batch,
                                 val_fn, criterion, optimizer, grad_fn)
        trainer = BasicTrainer(pipeline, args.path,
                               args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()
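# --- Sketch (assumption): the loop BasicTrainer/MultiTaskTrainer presumably run
# Every script constructs the trainer from (pipeline, save_dir, ckpt_freq,
# patience, scheduler). A checkpoint-every-N-steps loop that steps the
# ReduceLROnPlateau scheduler on validation loss and stops after `patience`
# non-improving checkpoints matches those arguments. The pipeline methods
# (train_step, validate, checkpoint) are hypothetical names, not the real API.
def trainer_loop_sketch(pipeline, save_dir, ckpt_freq, patience, scheduler):
    best_val, n_bad, step = None, 0, 0
    while True:
        pipeline.train_step()                 # hypothetical: one training batch
        step += 1
        if step % ckpt_freq == 0:
            val_loss = pipeline.validate()    # hypothetical: full validation pass
            scheduler.step(val_loss)          # decay LR when val loss plateaus
            pipeline.checkpoint(save_dir, step, val_loss)  # hypothetical helper
            if best_val is None or val_loss < best_val:
                best_val, n_bad = val_loss, 0
            else:
                n_bad += 1
            if n_bad >= patience:             # early stopping
                break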
def main(args):
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    train_batcher, val_batcher = build_batchers(word2id,
                                                args.cuda, args.debug)

    # make net
    net, net_args = configure_net(len(word2id), args.emb_dim,
                                  args.n_hidden, args.bi, args.n_layer)
    if args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, oov = make_embedding(
            {i: w for w, i in word2id.items()}, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {}
    meta['net'] = 'base_abstractor'
    meta['net_args'] = net_args
    meta['training_params'] = train_params
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(net.parameters(), **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)

    # # Print model's state_dict
    # print("Model's state_dict:")
    # for param_tensor in net.state_dict():
    #     print(param_tensor, "\t", net.state_dict()[param_tensor].size())
    #
    # # Print optimizer's state_dict
    # print("Optimizer's state_dict:")
    # for var_name in optimizer.state_dict():
    #     print(var_name, "\t", optimizer.state_dict()[var_name])

    # # IMPORT PRETRAINED MODEL PARAMETERS
    # net.load_state_dict(torch.load(
    #     'pretrained_eng_model/abstractor/ckpt/ckpt-0-0')['state_dict'])
    # net.eval()  # do I need that or not?
    # copy net
    # from copy import deepcopy
    # net_copy = deepcopy(net)
    # net_copy.load_state_dict(torch.load(
    #     'pretrained_eng_model/abstractor/ckpt/ckpt-0-0',
    #     map_location='cpu')['state_dict'])
    # for key in net_copy.state_dict():
    #     print('key: ', key)
    #     param = net_copy.state_dict()[key]
    #     print('param.shape: ', param.shape)
    #     print('param.requires_grad: ', param.requires_grad)
    #     print('isinstance(param, nn.Module) ', isinstance(param, nn.Module))
    #     print('isinstance(param, nn.Parameter) ', isinstance(param, nn.Parameter))
    #     print('isinstance(param, torch.Tensor): ', isinstance(param, torch.Tensor))
    #     print('=====')

    # save current state dict
    model_dict = net.state_dict()

    # keep a few parameters to verify later that the dict was loaded successfully
    p1 = net._embedding.weight[0][0].detach().cpu().numpy()
    p2 = net._enc_lstm.weight_hh_l0[0][0].detach().cpu().numpy()
    p3 = net._attn_wm.data[0][0].detach().cpu().numpy()
    # print(p1)
    # print(p2)
    # print(p3)

    # load dict from pretrained net
    # (add map_location='cpu' to torch.load when running without a GPU)
    ABS_DIR = os.environ['ABS']
    print(ABS_DIR)
    pretrained_dict = torch.load(ABS_DIR)['state_dict']

    # skip embedding weights
    pretrained_dict = {
        k: v for k, v in pretrained_dict.items() if k != '_embedding.weight'
    }

    # overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    print('Model will be trained on device:')
    print(model_dict['_embedding.weight'].device)

    # load the new state dict
    net.load_state_dict(model_dict)

    # check that the update was correct
    pn1 = net._embedding.weight[0][0].detach().cpu().numpy()
    pn2 = net._enc_lstm.weight_hh_l0[0][0].detach().cpu().numpy()
    pn3 = net._attn_wm.data[0][0].detach().cpu().numpy()
    # print(pn1)
    # print(pn2)
    # print(pn3)
    assert p1 == pn1  # embedding layer has to be the same
    assert p2 != pn2
    assert p3 != pn3
    print('Embedding layer has not been overwritten')

    # set updating of the parameters
    for name, param in net.named_parameters():
        # param.requires_grad = True
        print(name, param.requires_grad)

    trainer.train()
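# --- Sketch (assumption): selectively freezing the copied parameters ---------
# The loop above only prints requires_grad. If the intent were to keep the
# encoder weights copied from the pretrained checkpoint fixed while fine-tuning
# the rest, the flag could be toggled per parameter like this; the '_enc_lstm'
# prefix comes from the parameter names used above, the rest is illustrative.
def freeze_pretrained_encoder(model):
    for name, param in model.named_parameters():
        param.requires_grad = not name.startswith('_enc_lstm')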
def main(args):
    assert args.net_type in ['ff', 'rnn']
    # create data batcher, vocabulary
    # batcher
    with open(join(DATA_DIR, 'vocab_cnt.pkl'), 'rb') as f:
        wc = pkl.load(f)
    word2id = make_vocab(wc, args.vsize)
    id2words = {i: w for w, i in word2id.items()}

    elmo = None
    if args.elmo:
        elmo = get_elmo(
            dropout=args.elmo_dropout,
            vocab_to_cache=[id2words[i] for i in range(len(id2words))],
            cuda=args.cuda,
            projection_dim=args.elmo_projection)
        args.emb_dim = elmo.get_output_dim()

    train_batcher, val_batcher = build_batchers(args.net_type, word2id,
                                                args.cuda, args.debug)

    # make net
    net, net_args = configure_net(args.net_type,
                                  len(word2id), args.emb_dim, args.conv_hidden,
                                  args.lstm_hidden, args.lstm_layer, args.bi)
    if elmo:
        net_args['elmo'] = {
            'dropout': args.elmo_dropout,
            'projection': args.elmo_projection,
        }
        net.set_elmo_embedding(elmo)
    elif args.w2v:
        # NOTE: the pretrained embedding having the same dimension
        #       as args.emb_dim should already be trained
        embedding, _ = make_embedding(id2words, args.w2v)
        net.set_embedding(embedding)

    # configure training setting
    criterion, train_params = configure_training(
        args.net_type, 'adam', args.lr, args.clip, args.decay, args.batch)

    # save experiment setting
    if not exists(args.path):
        os.makedirs(args.path)
    with open(join(args.path, 'vocab.pkl'), 'wb') as f:
        pkl.dump(word2id, f, pkl.HIGHEST_PROTOCOL)
    meta = {
        'net': 'ml_{}_extractor'.format(args.net_type),
        'net_args': net_args,
        'training_params': train_params
    }
    with open(join(args.path, 'meta.json'), 'w') as f:
        json.dump(meta, f, indent=4)

    # prepare trainer
    val_fn = basic_validate(net, criterion)
    grad_fn = get_basic_grad_fn(net, args.clip)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
                           **train_params['optimizer'][1])
    scheduler = ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                  factor=args.decay, min_lr=0,
                                  patience=args.lr_p)

    if args.cuda:
        net = net.cuda()
    pipeline = BasicPipeline(meta['net'], net,
                             train_batcher, val_batcher, args.batch, val_fn,
                             criterion, optimizer, grad_fn)
    trainer = BasicTrainer(pipeline, args.path,
                           args.ckpt_freq, args.patience, scheduler)

    print('start training with the following hyper-parameters:')
    print(meta)
    trainer.train()