def main(opt, logger):
    logger.info('My PID is {0}'.format(os.getpid()))
    logger.info('PyTorch version: {0}'.format(str(torch.__version__)))
    logger.info(opt)

    if torch.cuda.is_available() and not opt.gpus:
        logger.info("WARNING: You have a CUDA device, so you should probably run with -gpus 0")

    if opt.seed > 0:
        torch.manual_seed(opt.seed)
    if opt.gpus:
        if opt.cuda_seed > 0:
            torch.cuda.manual_seed(opt.cuda_seed)
        # cuda.set_device(opt.gpus[0])
        logger.info('My seed is {0}'.format(torch.initial_seed()))
        logger.info('My cuda seed is {0}'.format(torch.cuda.initial_seed()))

    ###### ==================== Loading Dataset ==================== ######
    data = torch.load(opt.data)
    vocabularies = data['dict']
    if isinstance(vocabularies['src'], str):
        assert vocabularies['src'] == opt.pretrained
        options = {'transf': True, 'separate': False, 'tgt': False}
        vocabularies['src'] = Vocab.from_opt(pretrained=opt.pretrained, opt=options)
    train_data, valid_data = data['train'], data['valid']

    ### ===== load pre-trained vocabulary ===== ###
    if opt.pre_trained_vocab:
        if not opt.pretrained:
            opt.pre_trained_src_emb = vocabularies['pre-trained']['src']
        opt.pre_trained_tgt_emb = vocabularies['pre-trained']['tgt']

    ### ===== wrap datasets ===== ###
    attn_mask_file = '' if not opt.defined_slf_attn_mask else opt.defined_slf_attn_mask + '.train.npy'
    pad_id = vocabularies['src'].lookup('<|endoftext|>') if opt.pretrained.count('gpt2') else Constants.PAD
    trainData = DialogueDataset(train_data, opt.batch_size, copy=opt.copy,
                                attn_mask_file=attn_mask_file,
                                opt_cuda=opt.gpus, pad=pad_id)
    validData = DialogueDataset(valid_data, opt.eval_batch_size, copy=opt.copy,
                                attn_mask_file=attn_mask_file,
                                opt_cuda=opt.gpus, pad=pad_id)

    opt.src_vocab_size, opt.tgt_vocab_size = vocabularies['src'].size, vocabularies['tgt'].size

    logger.info(' * vocabulary size. source = %d; target = %d' % (opt.src_vocab_size, opt.tgt_vocab_size))
    logger.info(' * number of training batches. %d' % len(trainData))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    ##### =================== Prepare Model =================== #####
    separate = -1
    device = torch.device('cuda:' + str(opt.gpus[0]) if len(opt.gpus) else 'cpu')
    checkpoint = torch.load(opt.checkpoint) if opt.checkpoint else None
    model, parameters_cnt = build_dialogue_model(opt, device, separate=separate, checkpoint=checkpoint)
    logger.info(' * Number of parameters to learn = %d' % parameters_cnt)

    ##### ==================== Prepare Optimizer ==================== #####
    optimizer = Optimizer.from_opt(model, opt)

    ##### ==================== Prepare Loss ==================== #####
    weight = torch.ones(opt.tgt_vocab_size)
    weight[Constants.PAD] = 0
    loss = NLLLoss(opt, weight=weight, size_average=False)
    if opt.gpus:
        cuda.set_device(opt.gpus[0])
        loss.cuda()

    ##### ==================== Prepare Translator ==================== #####
    forward_translator = DialogueTranslator(opt, vocabularies['tgt'], data['valid']['tokens'],
                                            vocabularies['src'])
    backward_translator = DialogueTranslator(opt, vocabularies['src'], data['valid']['tokens'],
                                             vocabularies['tgt'], reverse=True)

    # torch.save(opt, opt.save_model + '-opt.pt')
    # import ipdb; ipdb.set_trace()

    ##### ==================== Training ==================== #####
    trainer = DialogueSupervisedTrainer(model, loss, optimizer, forward_translator, backward_translator,
                                        logger, opt, trainData, validData)
    trainer.train(device)
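
# Why weight[Constants.PAD] = 0 in the "Prepare Loss" step above: the per-class
# weight vector handed to the NLL criterion zeroes out the padding index, so
# padded target positions contribute nothing to the summed loss. The function
# below is an illustration only (it is not the repo's NLLLoss wrapper; the
# vocabulary size, PAD index and tensors are made up) showing the same idea
# with plain PyTorch.
def _pad_masked_nll_demo(pad_idx=0, vocab=5):
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    weight = torch.ones(vocab)
    weight[pad_idx] = 0                                       # PAD class gets zero weight
    criterion = nn.NLLLoss(weight=weight, reduction='sum')
    log_probs = F.log_softmax(torch.randn(3, vocab), dim=-1)  # 3 target positions
    gold = torch.tensor([2, 4, pad_idx])                      # last position is padding
    return criterion(log_probs, gold)                         # only the two real tokens count
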
def main(opt):
    logging.info('My PID is {0}'.format(os.getpid()))
    logging.info('PyTorch version: {0}'.format(str(torch.__version__)))
    logging.info(opt)

    if torch.cuda.is_available() and not opt.gpus:
        logging.info("WARNING: You have a CUDA device, so you should probably run with -gpus 0")

    if opt.seed > 0:
        torch.manual_seed(opt.seed)
    if opt.gpus:
        if opt.cuda_seed > 0:
            torch.cuda.manual_seed(opt.cuda_seed)
        cuda.set_device(opt.gpus[0])
        logging.info('My seed is {0}'.format(torch.initial_seed()))
        logging.info('My cuda seed is {0}'.format(torch.cuda.initial_seed()))

    ###### ==================== Loading Options ==================== ######
    if opt.checkpoint:
        checkpoint = torch.load(opt.checkpoint)

    ###### ==================== Loading Dataset ==================== ######
    opt.sparse = True if opt.sparse else False
    # logger.info('Loading sequential data ......')
    # sequences = torch.load(opt.sequence_data)
    # seq_vocabularies = sequences['dict']
    # logger.info('Loading structural data ......')
    # graphs = torch.load(opt.graph_data)
    # graph_vocabularies = graphs['dict']

    ### ===== load pre-trained vocabulary ===== ###
    logging.info('Loading sequential data ......')
    sequences = torch.load(opt.sequence_data)
    seq_vocabularies = sequences['dict']

    logging.info('Loading pre-trained vocabulary ......')
    if opt.pre_trained_vocab:
        if not opt.pretrained:
            opt.pre_trained_src_emb = seq_vocabularies['pre-trained']['src']
        opt.pre_trained_tgt_emb = seq_vocabularies['pre-trained']['tgt']
        if opt.answer:
            opt.pre_trained_ans_emb = seq_vocabularies['pre-trained']['src']

    ### ===== wrap datasets ===== ###
    logging.info('Loading Dataset objects ......')
    trainData = torch.load(opt.train_dataset)
    validData = torch.load(opt.valid_dataset)
    trainData.batchSize = validData.batchSize = opt.batch_size
    trainData.numBatches = math.ceil(len(trainData.src) / trainData.batchSize)
    validData.numBatches = math.ceil(len(validData.src) / validData.batchSize)

    logging.info('Preparing vocabularies ......')
    opt.src_vocab_size = seq_vocabularies['src'].size
    opt.tgt_vocab_size = seq_vocabularies['tgt'].size
    opt.feat_vocab = [fv.size for fv in seq_vocabularies['feature']] if opt.feature else None

    logging.info('Loading structural data ......')
    graphs = torch.load(opt.graph_data)
    graph_vocabularies = graphs['dict']
    del graphs

    opt.edge_vocab_size = graph_vocabularies['edge']['in'].size
    opt.node_feat_vocab = [fv.size for fv in graph_vocabularies['feature'][1:-1]] if opt.node_feature else None

    logging.info(' * vocabulary size. source = %d; target = %d' % (opt.src_vocab_size, opt.tgt_vocab_size))
    logging.info(' * number of training batches. %d' % len(trainData))
    logging.info(' * maximum batch size. %d' % opt.batch_size)

    ##### =================== Prepare Model =================== #####
    device = torch.device('cuda' if opt.gpus else 'cpu')
    trainData.device = validData.device = device
    checkpoint = checkpoint if opt.checkpoint else None
    model, parameters_cnt = build_model(opt, device, checkpoint=checkpoint)
    del checkpoint
    logging.info(' * Number of parameters to learn = %d' % parameters_cnt)

    ##### ==================== Prepare Optimizer ==================== #####
    optimizer = Optimizer.from_opt(model, opt)

    ##### ==================== Prepare Loss ==================== #####
    weight = torch.ones(opt.tgt_vocab_size)
    weight[Constants.PAD] = 0
    loss = NLLLoss(opt, weight, size_average=False)
    if opt.gpus:
        loss.cuda()

    ##### ==================== Prepare Translator ==================== #####
    translator = Translator(opt, seq_vocabularies['tgt'], sequences['valid']['tokens'],
                            seq_vocabularies['src'])

    ##### ==================== Training ==================== #####
    trainer = SupervisedTrainer(model, loss, optimizer, translator, opt,
                                trainData, validData, seq_vocabularies['src'],
                                graph_vocabularies['feature'])
    del model
    del trainData
    del validData
    del seq_vocabularies['src']
    del graph_vocabularies['feature']
    trainer.train(device)
def main(opt, logger):
    logger.info('My PID is {0}'.format(os.getpid()))
    logger.info('PyTorch version: {0}'.format(str(torch.__version__)))
    logger.info(opt)

    if torch.cuda.is_available() and not opt.gpus:
        logger.info("WARNING: You have a CUDA device, so you should probably run with -gpus 0")

    if opt.seed > 0:
        torch.manual_seed(opt.seed)
    if opt.gpus:
        if opt.cuda_seed > 0:
            torch.cuda.manual_seed(opt.cuda_seed)
        # cuda.set_device(opt.gpus[0])
        logger.info('My seed is {0}'.format(torch.initial_seed()))
        logger.info('My cuda seed is {0}'.format(torch.cuda.initial_seed()))

    ###### ==================== Loading Dataset ==================== ######
    data = torch.load(opt.data)
    vocabularies = data['dict']
    if isinstance(vocabularies['src'], str):
        assert vocabularies['src'] == opt.pretrained
        sep = True if opt.answer == 'sep' else False
        options = {'transf': opt.answer != 'enc', 'separate': sep, 'tgt': False}
        vocabularies['src'] = Vocab.from_opt(pretrained=opt.pretrained, opt=options)
    train_data, valid_data = data['train'], data['valid']

    ### ===== load pre-trained vocabulary ===== ###
    if opt.pre_trained_vocab:
        if not opt.pretrained:
            opt.pre_trained_src_emb = vocabularies['pre-trained']['src']
        opt.pre_trained_tgt_emb = vocabularies['pre-trained']['tgt']
        if opt.answer == 'enc':
            opt.pre_trained_ans_emb = vocabularies['pre-trained']['ans']

    ### ===== wrap datasets ===== ###
    attn_mask_file = '' if not opt.defined_slf_attn_mask else opt.defined_slf_attn_mask + '.train.npy'
    pad_id = vocabularies['src'].lookup('<|endoftext|>') if opt.pretrained.count('gpt2') else Constants.PAD
    trainData = Dataset(train_data, opt.batch_size, copy=opt.copy,
                        answer=opt.answer == 'enc', ans_feature=opt.ans_feature,
                        feature=opt.feature, attn_mask_file=attn_mask_file,
                        opt_cuda=opt.gpus, pad=pad_id)
    validData = Dataset(valid_data, opt.eval_batch_size, copy=opt.copy,
                        answer=opt.answer == 'enc', ans_feature=opt.ans_feature,
                        feature=opt.feature, attn_mask_file=attn_mask_file,
                        opt_cuda=opt.gpus, pad=pad_id)

    opt.src_vocab_size = vocabularies['src'].size
    opt.tgt_vocab_size = vocabularies['tgt'].size
    opt.feat_vocab = [fv.size for fv in vocabularies['feature']] if opt.feature else None
    opt.ans_feat_vocab = [fv.size for fv in vocabularies['ans_feature']] if opt.ans_feature else None

    logger.info(' * vocabulary size. source = %d; target = %d' % (opt.src_vocab_size, opt.tgt_vocab_size))
    logger.info(' * number of training batches. %d' % len(trainData))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    ##### =================== Prepare Model =================== #####
    separate = vocabularies['src'].lookup(Constants.SEP_WORD) if opt.answer == 'sep' else -1
    device = torch.device('cuda:' + str(opt.gpus[0]) if len(opt.gpus) else 'cpu')
    checkpoint = torch.load(opt.checkpoint) if opt.checkpoint else None

    if opt.rl:
        rl_device = [torch.device('cuda:' + str(gpu)) for gpu in opt.rl_gpu]
        rl_device = {k: v for k, v in zip(opt.rl, rl_device)}
        opt.rl_device = rl_device
        discriminator = load_rl_model(opt, device, rl_device)

    model, parameters_cnt = build_model(opt, device, separate=separate, checkpoint=checkpoint)
    logger.info(' * Number of parameters to learn = %d' % parameters_cnt)

    ##### ==================== Prepare Optimizer ==================== #####
    optimizer = Optimizer.from_opt(model, opt)

    ##### ==================== Prepare Loss ==================== #####
    weight = torch.ones(opt.tgt_vocab_size)
    weight[Constants.PAD] = 0
    loss = NLLLoss(opt, weight=weight, size_average=False)
    if opt.gpus:
        cuda.set_device(opt.gpus[0])
        loss.cuda()

    ##### ==================== Prepare Translator ==================== #####
    translator = Translator(opt, vocabularies['tgt'], data['valid']['tokens'], vocabularies['src'])

    ##### ==================== Training ==================== #####
    if opt.rl:
        trainer = RLTrainer(model, discriminator, loss, optimizer, translator, logger,
                            opt, trainData, validData, vocabularies['src'], vocabularies['tgt'])
    else:
        trainer = SupervisedTrainer(model, loss, optimizer, translator, logger,
                                    opt, trainData, validData, vocabularies['src'])
    trainer.train(device)
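
# The two main() variants above that take a `logger` argument only rely on the
# standard library logging API. The helper below is a minimal sketch of how
# such a logger could be built (it is not part of the original repo; the file
# name and format string are placeholder choices).
def _build_logger_demo(log_path='train.log'):
    import logging
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(log_path)
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return logger
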
def main(opt):
    tokenizer = BertTokenizer.from_pretrained(opt.pre_model)

    ###========== Load Data ==========###
    train_data = filter_data(opt.train_src, opt.train_tgt, tokenizer)
    valid_data = filter_data(opt.valid_src, opt.valid_tgt, tokenizer)

    ###========== Get Index ==========###
    options = {'transf': False, 'separate': False, 'tgt': False}
    src_vocab = Vocab.from_opt(pretrained=opt.pre_model, opt=options)
    options = {'lower': False, 'mode': 'size', 'size': 1000, 'frequency': 1,
               'transf': False, 'separate': False, 'tgt': False}
    tgt_vocab = Vocab.from_opt(corpus=train_data['tgt'], opt=options)

    train_src_idx = [src_vocab.convertToIdx(sent) for sent in train_data['src']]
    valid_src_idx = [src_vocab.convertToIdx(sent) for sent in valid_data['src']]
    train_tgt_idx = [tgt_vocab.convertToIdx(sent) for sent in train_data['tgt']]
    valid_tgt_idx = [tgt_vocab.convertToIdx(sent) for sent in valid_data['tgt']]

    ###========== Get Data ==========###
    train_data = Dataset({'src': train_src_idx, 'tgt': train_tgt_idx, 'feature': [train_data['idx']]},
                         opt.batch_size, feature=True, opt_cuda=opt.gpus)
    valid_data = Dataset({'src': valid_src_idx, 'tgt': valid_tgt_idx, 'feature': [valid_data['idx']]},
                         opt.batch_size, feature=True, opt_cuda=opt.gpus)
    opt.tgt_vocab_size = tgt_vocab.size

    ###========== Prepare Model ==========###
    device = torch.device('cuda')
    encoder = BertModel.from_pretrained(opt.pre_model)
    classifier = nn.Sequential(
        nn.Linear(768 // opt.maxout_pool_size, opt.tgt_vocab_size),  # TODO: fix this magic number later (hidden size of the model)
        nn.Softmax(dim=1)
    )
    model = NERTagger(encoder, classifier, device).to(device)

    for _, para in model.classifier.named_parameters():
        if para.dim() == 1:
            para.data.normal_(0, math.sqrt(6 / (1 + para.size(0))))
        else:
            nn.init.xavier_normal_(para, gain=math.sqrt(3))
    if len(opt.gpus) > 1:
        model = nn.DataParallel(model, device_ids=opt.gpus)

    ###========== Prepare for training ==========###
    opt.optim = 'adam'
    opt.decay_method = ''
    opt.learning_rate = 3e-5
    opt.learning_rate_decay = 1
    opt.decay_steps = 10000000
    opt.start_decay_steps = 10000000000
    opt.max_grad_norm = 5
    opt.max_weight_value = 20
    opt.decay_bad_cnt = 5
    optimizer = Optimizer.from_opt(model, opt)

    weight = torch.ones(opt.tgt_vocab_size)
    weight[0] = 0  # TODO: fix this magic number later (PAD)
    loss = NLLLoss(opt, weight, size_average=False)
    if opt.gpus:
        loss.cuda()

    ###========== Training ==========###
    best_val = 0

    def eval_model(M, D, L):
        M.eval()
        all_loss, all_accu, all_words = 0, 0, 0
        for i in tqdm(range(len(D)), mininterval=2, desc=' - (Validation) ', leave=False):
            B = D[i]
            s, t, sid = B['src'][0], B['tgt'], B['feat'][0][0]
            t = t.transpose(0, 1)
            P = M(s, sid)
            lv, G = L.cal_loss_ner(P, t)
            all_loss += lv.item()
            all_words += P.size(0)
            P = P.max(1)[1]
            n_correct = P.eq(G.view(-1))
            n_correct = n_correct.sum().item()
            all_accu += n_correct
        return all_loss / all_words, all_accu / all_words

    def save_model(M, score, best_val, opt):
        if score > best_val:
            model_to_save = M.module.encoder if hasattr(M, 'module') else M.encoder  # Only save the model itself
            output_model_file = os.path.join(opt.output_dir, "pytorch_model_" + str(round(score * 100, 2)) + ".bin")
            torch.save(model_to_save.state_dict(), output_model_file)
        print('validation', score)

    for _ in range(opt.num_train_epochs):
        train_data.shuffle()
        model.train()
        batch_order = torch.randperm(len(train_data))
        loss_print, words_cnt, accuracy = 0, 0, 0

        for idx in tqdm(range(len(train_data)), mininterval=2, desc=' - (Training) ', leave=False):
            batch_idx = batch_order[idx]
            batch = train_data[batch_idx]
            src, tgt, src_idx = batch['src'][0], batch['tgt'], batch['feat'][0][0]
            tgt = tgt.transpose(0, 1)

            out = model(src, src_idx)
            loss_val, gold = loss.cal_loss_ner(out, tgt)
            if len(opt.gpus) > 1:
                loss_val = loss_val.mean()  # mean() to average on multi-gpu.
            if math.isnan(loss_val.item()) or loss_val.item() > 1e20:
                print('catch NaN')
                import ipdb
                ipdb.set_trace()
            loss_val.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_print += loss_val.item()
            words_cnt += out.size(0)
            pred = out.max(1)[1]
            n_correct = pred.eq(gold.view(-1))
            n_correct = n_correct.sum().item()
            accuracy += n_correct

            if idx % 1000 == 0:
                loss_print /= words_cnt
                accuracy /= words_cnt
                print('loss', loss_print)
                print('accuracy', accuracy)
                # reset the running statistics for the next logging window
                loss_print, words_cnt, accuracy = 0, 0, 0

            if idx % 2000 == 0:
                loss_val, accuracy_val = eval_model(model, valid_data, loss)
                save_model(model, accuracy_val, best_val, opt)
                if accuracy_val > best_val:
                    best_val = accuracy_val
                    model_to_save = model.module.encoder if hasattr(model, 'module') else model.encoder  # Only save the model itself
                    output_model_file = os.path.join(opt.output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                # switch back to training mode after the periodic evaluation
                model.train()
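
# The training loop above saves only the BERT encoder's state_dict to
# opt.output_dir/pytorch_model.bin. The helper below is a sketch (not part of
# the original script) of how those weights could be reloaded later; it assumes
# the same BertModel class, torch and os imports already used above, and the
# pre_model / output_dir values are whatever the training run used.
def load_saved_encoder(pre_model, output_dir, device='cpu'):
    encoder = BertModel.from_pretrained(pre_model)                         # rebuild the architecture
    state = torch.load(os.path.join(output_dir, "pytorch_model.bin"),
                       map_location=device)                                # load the saved weights
    encoder.load_state_dict(state)
    encoder.eval()                                                         # inference mode
    return encoder
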