def validate(self, valid_dataset, device, epoch=0):
    """Validate the model on ``valid_dataset``.

    Args:
        valid_dataset: list of (x, labels) examples consumed by
            ``get_minibatches_WDP``.
        device: torch device string ("cpu" or "cuda") tensors are moved to.
        epoch (int): epoch index, forwarded to the report manager.

    Returns:
        :obj:`Statistics`: accumulated validation loss statistics.
    """
    # Set model in validating mode.
    self.model.eval()
    stats = Statistics()
    with torch.no_grad():
        mini_batches = get_minibatches_WDP(valid_dataset, self.args.batch_size,
                                           self.args.max_seq_length)
        logger.info('Number of minibatches: %s' %
                    (len(valid_dataset) // self.args.batch_size))
        for step, batch in enumerate(mini_batches):
            x, labels = batch
            # BUG FIX: ``torch.cuda.Tensor`` does not exist (AttributeError);
            # build a tensor and move it to the requested device instead,
            # which also honours the previously-ignored ``device`` argument.
            x = torch.Tensor(x).to(device)
            labels = torch.Tensor(labels).to(device)
            logits = self.model(x)  # , mask
            loss = self.loss(logits, labels)
            # loss = (loss * mask.float()).sum()
            batch_stats = Statistics(float(loss.cpu().item()), len(labels))
            stats.update(batch_stats)
        self._report_step(0, epoch, valid_stats=stats)
        return stats
def multi_main(args):
    """Spawn one training process per GPU and wait for all of them."""
    init_logger()
    nb_gpu = args.world_size
    mp = torch.multiprocessing.get_context('spawn')

    # A watcher thread forwards exceptions raised in the children.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Launch one daemon process per GPU.
    procs = []
    for device_id in range(nb_gpu):
        proc = mp.Process(target=run,
                          args=(args, device_id, error_queue),
                          daemon=True)
        proc.start()
        logger.info(" Starting process pid: %d " % proc.pid)
        error_handler.add_child(proc.pid)
        procs.append(proc)

    # Block until every worker has finished.
    for proc in procs:
        proc.join()
def validate(args, device_id, pt, epoch):
    """Load a checkpoint and run validation on the 'valid' split.

    Args:
        args: argparse namespace; flags listed in ``model_flags`` are
            overridden from the checkpoint's stored options.
        device_id: GPU ordinal, forwarded to ``build_trainer``.
        pt (str): checkpoint path; falls back to ``args.test_from`` if empty.
        epoch: epoch index forwarded to the trainer's report step.

    Returns:
        float: validation cross-entropy (``stats.xent()``).
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Override selected CLI flags with the values stored in the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)
    config = BertConfig.from_json_file(args.bert_config_name)
    model = Summarizer(args, device, load_pretrained_bert=False,
                       bert_config=config)
    model.load_cp(checkpoint)
    model.eval()
    valid_dataset = torch.load(args.bert_data_path + 'valid.data')
    trainer = build_trainer(args, device_id, model, None)
    # BUG FIX: Trainer.validate's signature is (valid_dataset, device,
    # epoch=0); the old call ``trainer.validate(valid_dataset, epoch)``
    # passed the epoch number in the ``device`` slot.
    stats = trainer.validate(valid_dataset, device, epoch)
    return stats.xent()
def validate(self, valid_dataset, device, epoch=0): """ Validate model. valid_iter: validate data iterator Returns: :obj:`nmt.Statistics`: validation loss statistics """ # Set model in validating mode. self.model.eval() stats = Statistics() with torch.no_grad(): mini_batches = get_minibatches(valid_dataset, self.args.batch_size, self.args.max_seq_length) logger.info('Number of minibatches: %s' % (len(valid_dataset) // self.args.batch_size)) for step, batch in enumerate(mini_batches): src, labels, segs, clss = batch[0], batch[1], batch[2], batch[ 3] if torch.cuda.is_available(): src = torch.cuda.LongTensor(src).to( device) # .reshape(-1, self.args.max_seq_length) labels = torch.cuda.LongTensor(labels).to( device) # .reshape(1, -1) segs = torch.cuda.LongTensor(segs).to( device) # .reshape(1, -1) clss = [(cls + [-1] * (max([len(i) for i in clss]) - len(cls))) for cls in clss] clss = torch.cuda.LongTensor(clss).to(device) mask = torch.cuda.ByteTensor((1 - (src == 0))).to(device) mask_cls = torch.cuda.ByteTensor((1 - (clss == -1))) else: src = torch.LongTensor(src).to( device) # .reshape(-1, self.args.max_seq_length) labels = torch.LongTensor(labels).to( device) # .reshape(1, -1) segs = torch.LongTensor(segs).to(device) # .reshape(1, -1) clss = [(cls + [-1] * (max([len(i) for i in clss]) - len(cls))) for cls in clss] clss = torch.LongTensor(clss).to(device) mask = torch.ByteTensor((1 - (src == 0))).to(device) mask_cls = torch.ByteTensor(( 1 - (clss == -1))) # torch.ByteTensor(mask_cls).to(device) logits = self.model(src, segs, clss, mask, mask_cls) # , mask loss = self.loss(logits, labels) # loss = (loss * mask.float()).sum() batch_stats = Statistics(float(loss.cpu().item()), len(labels)) stats.update(batch_stats) self._report_step(0, epoch, valid_stats=stats) return stats
def test(self, model, test_dataset, device):
    """Evaluate classification accuracy of ``model`` on ``test_dataset``.

    Args:
        model: the (already trained) model to evaluate.
        test_dataset: examples consumed by ``get_minibatches_WDP``.
        device: torch device string ("cpu" or "cuda").

    Returns:
        float: overall accuracy over all evaluated examples.
    """
    model.eval()
    mini_batches = get_minibatches_WDP(test_dataset, self.args.batch_size,
                                       self.args.max_seq_length)
    logger.info('Number of minibatches: %s' %
                (len(test_dataset) // self.args.batch_size))
    with torch.no_grad():
        n_correct = 0.
        n_total = 0.
        target_all = None
        output_all = None
        full_pred = []
        full_label_ids = []
        for step, batch in enumerate(mini_batches):
            x, labels = batch
            # BUG FIX: ``torch.cuda.Tensor`` does not exist; .to(device)
            # already places the tensor on the GPU when device == "cuda".
            x = torch.Tensor(x).to(device)
            labels = torch.Tensor(labels).to(device)
            # BUG FIX: use the ``model`` argument that was put in eval mode
            # above (callers pass trainer.model) instead of self.model.
            logits = model(x)  # , mask
            # loss = self.loss(logits, labels)
            n_correct += (torch.argmax(logits, -1) == labels).sum().item()
            n_total += len(logits)
            full_pred.extend(torch.argmax(logits, -1).tolist())
            full_label_ids.extend(labels.tolist())
            # Accumulate everything for the final classification report.
            if target_all is None:
                target_all = labels
                output_all = logits
            else:
                target_all = torch.cat((target_all, labels), dim=0)
                output_all = torch.cat((output_all, logits), dim=0)
        acc = n_correct / n_total
        pred_res = metrics.classification_report(
            target_all.cpu(),
            torch.argmax(output_all, -1).cpu(),
            target_names=['NEG', 'NEU', 'POS'])
        logger.info(
            'Prediction results for test dataset: \n{}'.format(pred_res))
        # self._report_step(0, step, valid_stats=stats)
        return acc
def wait_and_validate(args, device_id):
    """Validate checkpoints: either sweep all saved ones, or poll for new ones.

    NOTE(review): both branches call ``test(args, device_id, cp, step)`` with
    four arguments, but the module-level ``test`` visible in this file takes
    only (args, device_id, pt) — confirm the intended signature.
    """
    timestep = 0
    if (args.test_all):
        # Evaluate every saved checkpoint, oldest first, with early stopping
        # once no improvement has been seen for 10 checkpoints.
        cp_files = sorted(
            glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
        cp_files.sort(key=os.path.getmtime)
        xent_lst = []
        for i, cp in enumerate(cp_files):
            # Checkpoint step is encoded in the filename: model_step_<N>.pt
            step = int(cp.split('.')[-2].split('_')[-1])
            xent = validate(args, device_id, cp, step)
            xent_lst.append((xent, cp))
            # Index of the best (lowest-xent) checkpoint seen so far.
            max_step = xent_lst.index(min(xent_lst))
            if (i - max_step > 10):
                break
        # Keep the 3 best checkpoints and run the test step on each.
        xent_lst = sorted(xent_lst, key=lambda x: x[0])[:3]
        logger.info('PPL %s' % str(xent_lst))
        for xent, cp in xent_lst:
            step = int(cp.split('.')[-2].split('_')[-1])
            test(args, device_id, cp, step)
    else:
        # Poll the model directory forever, validating each new checkpoint as
        # soon as it appears (and is non-empty).
        while (True):
            cp_files = sorted(
                glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if cp_files:
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if (not os.path.getsize(cp) > 0):
                    # File exists but is still being written; retry shortly.
                    time.sleep(60)
                    continue
                if (time_of_cp > timestep):
                    timestep = time_of_cp
                    step = int(cp.split('.')[-2].split('_')[-1])
                    validate(args, device_id, cp, step)
                    test(args, device_id, cp, step)
            # Re-scan: if nothing newer has shown up, sleep before polling again.
            cp_files = sorted(
                glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if (time_of_cp > timestep):
                    continue
            else:
                time.sleep(300)
def _save(self, model_name, epoch, acc):
    """Serialize model weights, args, and optimizer state to a checkpoint.

    The filename encodes the model name, epoch, and accuracy; an existing
    file with the same name is never overwritten.

    Returns:
        tuple: (checkpoint dict, path the checkpoint was written to).
    """
    checkpoint = {
        'model': self.model.state_dict(),
        'opt': self.args,
        'optim': self.optim,
    }
    checkpoint_path = os.path.join(
        self.args.model_path,
        'model_{}_epoch_{}_acc_{:.4f}.pt'.format(model_name, epoch, acc))
    logger.info("Saving checkpoint %s" % checkpoint_path)
    # Skip the write when a checkpoint with this exact name already exists.
    if not os.path.exists(checkpoint_path):
        torch.save(checkpoint, checkpoint_path)
    return checkpoint, checkpoint_path
def build_trainer(args, device_id, model, optim):
    """Construct a :class:`Trainer` wired to a tensorboard report manager.

    Args:
        args: user options (usually from argument parsing).
        device_id: GPU ordinal (currently unused; single-process path).
        model: the model to train (may be None when only validating).
        optim: optimizer used during training (may be None).

    Returns:
        Trainer: the assembled trainer instance.
    """
    grad_accum_count = args.accum_count
    # Multi-GPU path is disabled here: the original read args.world_size and
    # then forced both values to 0 (the commented-out gpu_rank logic below
    # mirrors the upstream code).
    # if device_id >= 0:
    #     gpu_rank = int(args.gpu_ranks)
    gpu_rank = 0
    n_gpu = 0
    print('gpu_rank %d' % gpu_rank)

    writer = SummaryWriter(args.model_path, comment="Unmt")
    report_manager = ReportMgr(args.report_every, start_time=-1,
                               tensorboard_writer=writer)
    trainer = Trainer(args, model, optim, grad_accum_count, n_gpu, gpu_rank,
                      report_manager)
    if model:
        logger.info('* number of parameters: %d' % _tally_parameters(model))
    return trainer
def output(self, step, num_steps, learning_rate, start):
    """Write out statistics to stdout.

    Args:
        step (int): current step
        num_steps (int): total number of steps (appended when > 0)
        learning_rate (float): current learning rate
        start (float): start time of the step
    """
    elapsed = self.elapsed_time()
    step_fmt = "%2d" % step
    if num_steps > 0:
        step_fmt = "%s/%5d" % (step_fmt, num_steps)
    logger.info(
        ("Step %s; xent: %4.2f; " + "lr: %7.7f; %3.0f docs/s; %6.0f sec") %
        (step_fmt, self.xent(), learning_rate,
         self.n_docs / (elapsed + 1e-5), time.time() - start))
    sys.stdout.flush()
def test(args, device_id, pt):
    """Load a checkpoint and report accuracy on the test and valid splits.

    Args:
        args: argparse namespace; flags in ``model_flags`` are overridden
            from the checkpoint's stored options.
        device_id: GPU ordinal, forwarded to ``build_trainer``.
        pt (str): checkpoint path; falls back to ``args.best_model`` if empty.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.best_model
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore selected flags from the checkpoint's saved options.
    opt = vars(checkpoint['opt'])
    for key in opt.keys():
        if key in model_flags:
            setattr(args, key, opt[key])
    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args, device, load_pretrained_bert=False,
                       bert_config=config)
    model.load_cp(checkpoint)
    model.eval()
    # Evaluate the same model on both splits, rebuilding the trainer each time.
    for split_file, split_name in (('test.data', 'Test'), ('valid.data', 'Valid')):
        logger.info(split_name + " dataset......")
        dataset = torch.load(args.bert_data_path + split_file)
        trainer = build_trainer(args, device_id, model, None)
        trainer.test(model, dataset, device)
def orig():
    # NOTE(review): dead reference code (extractive-summary selection with
    # trigram blocking + ROUGE reporting). It references names that are not
    # defined in this scope (cal_lead, cal_oracle, model, self, step,
    # test_dataset) and uses ``sent_scores`` before any assignment, so calling
    # it would raise NameError. Kept verbatim for reference only.
    # Set model in validating mode.
    def _get_ngrams(n, text):
        # Collect all n-grams of the token list ``text`` as a set of tuples.
        ngram_set = set()
        text_length = len(text)
        max_index_ngram_start = text_length - n
        for i in range(max_index_ngram_start + 1):
            ngram_set.add(tuple(text[i:i + n]))
        return ngram_set

    def _block_tri(c, p):
        # True when candidate sentence ``c`` shares any trigram with an
        # already-selected sentence in ``p`` (trigram blocking).
        tri_c = _get_ngrams(3, c.split())
        for s in p:
            tri_s = _get_ngrams(3, s.split())
            if len(tri_c.intersection(tri_s)) > 0:
                return True
        return False

    if not cal_lead and not cal_oracle:
        model.eval()
    stats = Statistics()
    can_path = '%s_step%d.candidate' % (self.args.result_path, step)
    gold_path = '%s_step%d.gold' % (self.args.result_path, step)
    with open(can_path, 'w') as save_pred:
        with open(gold_path, 'w') as save_gold:
            with torch.no_grad():
                target_all = []
                output_all = []
                # n_correct, n_total = 0., 0.
                mini_batches = get_minibatches(test_dataset,
                                               self.args.batch_size,
                                               self.args.max_seq_length)
                for i, batch in enumerate(mini_batches):
                    src = batch.src
                    labels = batch.labels
                    segs = batch.segs
                    clss = batch.clss
                    mask = batch.mask
                    mask_cls = batch.mask_cls
                    gold = []
                    pred = []
                    if (cal_lead):
                        # Lead baseline: pick the first sentences in order.
                        selected_ids = [list(range(batch.clss.size(1)))
                                        ] * batch.batch_size
                    elif (cal_oracle):
                        # Oracle: pick exactly the gold-labelled sentences.
                        selected_ids = [[
                            j for j in range(batch.clss.size(1))
                            if labels[i][j] == 1
                        ] for i in range(batch.batch_size)]
                    else:
                        logits = model(src, segs, clss, mask, mask_cls)
                        loss = self.loss(
                            logits, labels
                        )  # loss = self.loss(sent_scores, labels.float())
                        # loss = (loss * mask.float()).sum()
                        # n_correct += (torch.argmax(logits, -1) == labels).sum().item()
                        # n_total += len(logits)
                        if target_all is None:
                            target_all = labels
                            output_all = logits
                        else:
                            target_all = torch.cat((target_all, labels), dim=0)
                            output_all = torch.cat((output_all, logits), dim=0)
                        batch_stats = Statistics(float(loss.cpu().item()),
                                                 len(labels))
                        stats.update(batch_stats)
                        # NOTE(review): sent_scores is undefined at this point.
                        sent_scores = sent_scores + mask.float()
                        sent_scores = sent_scores.cpu().data.numpy()
                        selected_ids = np.argsort(-sent_scores, 1)
                    # selected_ids = np.sort(selected_ids,1)
                    for i, idx in enumerate(selected_ids):
                        _pred = []
                        if len(batch.src_str[i]) == 0:
                            continue
                        for j in selected_ids[i][:len(batch.src_str[i])]:
                            if j >= len(batch.src_str[i]):
                                continue
                            candidate = batch.src_str[i][j].strip()
                            if self.args.block_trigram:
                                if not _block_tri(candidate, _pred):
                                    _pred.append(candidate)
                            else:
                                _pred.append(candidate)
                            # Stop at 3 sentences unless oracle/recall mode.
                            if (not cal_oracle) and (
                                    not self.args.recall_eval
                            ) and len(_pred) == 3:
                                break
                        _pred = '<q>'.join(_pred)
                        if self.args.recall_eval:
                            # Truncate prediction to the gold summary length.
                            _pred = ' '.join(
                                _pred.split()[:len(batch.tgt_str[i].split())])
                        pred.append(_pred)
                        gold.append(batch.tgt_str[i])
                    for i in range(len(gold)):
                        save_gold.write(gold[i].strip() + '\n')
                    for i in range(len(pred)):
                        save_pred.write(pred[i].strip() + '\n')
                pred_res = metrics.classification_report(
                    target_all.cpu(),
                    torch.argmax(output_all, -1).cpu(),
                    target_names=['NEG', 'NEU', 'POS'])
                logger.info(
                    'Prediction results for test dataset: \n{}'.format(
                        pred_res))
    if step != -1 and self.args.report_rouge:
        rouges = test_rouge(self.args.temp_dir, can_path, gold_path)
        logger.info('Rouges at step %d \n%s' %
                    (step, rouge_results_to_str(rouges)))
    self._report_step(0, step, valid_stats=stats)
    return stats
def test(self, model, test_dataloader, device, cal_lead=False,
         cal_oracle=False):
    """Evaluate classification accuracy on ``test_dataloader``.

    Args:
        model: the (already trained) model; put in eval mode here.
        test_dataloader: examples consumed by ``get_minibatches``.
        device: torch device string ("cpu" or "cuda").
        cal_lead, cal_oracle: accepted for interface compatibility; unused.

    Returns:
        float: overall accuracy over all evaluated examples.
    """
    model.eval()
    stats = Statistics()
    batch_num = len(test_dataloader)
    # logger.info('Number of minibatches: %s' % batch_num)
    # BUG FIX: the original passed the undefined name ``test_dataset`` to
    # get_minibatches; the parameter is ``test_dataloader``.
    mini_batches = get_minibatches(test_dataloader, self.args.batch_size,
                                   self.args.max_seq_length, shuffle=False)
    logger.info('Number of minibatches: %s' % len(test_dataloader))
    with torch.no_grad():
        n_correct = 0.
        n_total = 0.
        target_all = None
        output_all = None
        full_pred = []
        full_label_ids = []
        for step, batch in enumerate(mini_batches):
            src, labels, segs, clss = batch[0], batch[1], batch[2], batch[3]
            if torch.cuda.is_available():
                src = torch.cuda.LongTensor(src).to(device)
                labels = torch.cuda.LongTensor(labels).to(device)
                segs = torch.cuda.LongTensor(segs).to(device)
                # Right-pad clss rows with -1 to the longest row in the batch.
                clss = [(cls + [-1] * (max([len(i) for i in clss]) - len(cls)))
                        for cls in clss]
                clss = torch.cuda.LongTensor(clss).to(device)
                mask = torch.cuda.ByteTensor((1 - (src == 0))).to(device)
                mask_cls = torch.cuda.ByteTensor((1 - (clss == -1)))
            else:
                src = torch.LongTensor(src).to(device)
                labels = torch.LongTensor(labels).to(device)
                segs = torch.LongTensor(segs).to(device)
                clss = [(cls + [-1] * (max([len(i) for i in clss]) - len(cls)))
                        for cls in clss]
                clss = torch.LongTensor(clss).to(device)
                mask = torch.ByteTensor((1 - (src == 0))).to(device)
                mask_cls = torch.ByteTensor((1 - (clss == -1)))
            logits = self.model(src, segs, clss, mask, mask_cls)  # , mask
            # loss = self.loss(logits, labels)
            n_correct += (torch.argmax(logits, -1) == labels).sum().item()
            n_total += len(logits)
            full_pred.extend(torch.argmax(logits, -1).tolist())
            full_label_ids.extend(labels.tolist())
            # Accumulate everything for the final classification report.
            if target_all is None:
                target_all = labels
                output_all = logits
            else:
                target_all = torch.cat((target_all, labels), dim=0)
                output_all = torch.cat((output_all, logits), dim=0)
        acc = n_correct / n_total
        pred_res = metrics.classification_report(
            target_all.cpu(),
            torch.argmax(output_all, -1).cpu(),
            target_names=['NEG', 'NEU', 'POS'])
        logger.info('Prediction results: \n{}'.format(pred_res))
        # Entity-level majority vote over the per-example predictions.
        predict_vote(full_pred, full_label_ids, test_dataloader)
        # self._report_step(0, step, valid_stats=stats)
        return acc


def orig():
    # NOTE(review): dead reference code, duplicated elsewhere in this file.
    # It references free names (cal_lead, cal_oracle, model, self, step,
    # test_dataset) and uses ``sent_scores`` before assignment, so calling it
    # would raise NameError. Kept verbatim for reference only.
    # Set model in validating mode.
    def _get_ngrams(n, text):
        # All n-grams of the token list ``text`` as a set of tuples.
        ngram_set = set()
        text_length = len(text)
        max_index_ngram_start = text_length - n
        for i in range(max_index_ngram_start + 1):
            ngram_set.add(tuple(text[i:i + n]))
        return ngram_set

    def _block_tri(c, p):
        # Trigram blocking: True if candidate ``c`` repeats a trigram of ``p``.
        tri_c = _get_ngrams(3, c.split())
        for s in p:
            tri_s = _get_ngrams(3, s.split())
            if len(tri_c.intersection(tri_s)) > 0:
                return True
        return False

    if not cal_lead and not cal_oracle:
        model.eval()
    stats = Statistics()
    can_path = '%s_step%d.candidate' % (self.args.result_path, step)
    gold_path = '%s_step%d.gold' % (self.args.result_path, step)
    with open(can_path, 'w') as save_pred:
        with open(gold_path, 'w') as save_gold:
            with torch.no_grad():
                target_all = []
                output_all = []
                # n_correct, n_total = 0., 0.
                mini_batches = get_minibatches(test_dataset,
                                               self.args.batch_size,
                                               self.args.max_seq_length)
                for i, batch in enumerate(mini_batches):
                    src = batch.src
                    labels = batch.labels
                    segs = batch.segs
                    clss = batch.clss
                    mask = batch.mask
                    mask_cls = batch.mask_cls
                    gold = []
                    pred = []
                    if (cal_lead):
                        selected_ids = [list(range(batch.clss.size(1)))
                                        ] * batch.batch_size
                    elif (cal_oracle):
                        selected_ids = [[
                            j for j in range(batch.clss.size(1))
                            if labels[i][j] == 1
                        ] for i in range(batch.batch_size)]
                    else:
                        logits = model(src, segs, clss, mask, mask_cls)
                        loss = self.loss(
                            logits, labels
                        )  # loss = self.loss(sent_scores, labels.float())
                        # loss = (loss * mask.float()).sum()
                        # n_correct += (torch.argmax(logits, -1) == labels).sum().item()
                        # n_total += len(logits)
                        if target_all is None:
                            target_all = labels
                            output_all = logits
                        else:
                            target_all = torch.cat((target_all, labels), dim=0)
                            output_all = torch.cat((output_all, logits), dim=0)
                        batch_stats = Statistics(float(loss.cpu().item()),
                                                 len(labels))
                        stats.update(batch_stats)
                        # NOTE(review): sent_scores is undefined at this point.
                        sent_scores = sent_scores + mask.float()
                        sent_scores = sent_scores.cpu().data.numpy()
                        selected_ids = np.argsort(-sent_scores, 1)
                    # selected_ids = np.sort(selected_ids,1)
                    for i, idx in enumerate(selected_ids):
                        _pred = []
                        if len(batch.src_str[i]) == 0:
                            continue
                        for j in selected_ids[i][:len(batch.src_str[i])]:
                            if j >= len(batch.src_str[i]):
                                continue
                            candidate = batch.src_str[i][j].strip()
                            if self.args.block_trigram:
                                if not _block_tri(candidate, _pred):
                                    _pred.append(candidate)
                            else:
                                _pred.append(candidate)
                            if (not cal_oracle) and (
                                    not self.args.recall_eval
                            ) and len(_pred) == 3:
                                break
                        _pred = '<q>'.join(_pred)
                        if self.args.recall_eval:
                            _pred = ' '.join(
                                _pred.split()[:len(batch.tgt_str[i].split())])
                        pred.append(_pred)
                        gold.append(batch.tgt_str[i])
                    for i in range(len(gold)):
                        save_gold.write(gold[i].strip() + '\n')
                    for i in range(len(pred)):
                        save_pred.write(pred[i].strip() + '\n')
                pred_res = metrics.classification_report(
                    target_all.cpu(),
                    torch.argmax(output_all, -1).cpu(),
                    target_names=['NEG', 'NEU', 'POS'])
                logger.info(
                    'Prediction results for test dataset: \n{}'.format(
                        pred_res))
    if step != -1 and self.args.report_rouge:
        rouges = test_rouge(self.args.temp_dir, can_path, gold_path)
        logger.info('Rouges at step %d \n%s' %
                    (step, rouge_results_to_str(rouges)))
    self._report_step(0, step, valid_stats=stats)
    return stats
def log(self, *args, **kwargs):
    """Proxy logging calls straight through to ``logger.info``."""
    logger.info(*args, **kwargs)
def train(args, device_id):
    """Top-level training entry point.

    Seeds RNGs, loads the training data, then either trains the WDP
    (dimension-reduced) decoder pipeline or the BERT Summarizer, optionally
    running a final evaluation pass.

    Args:
        args: argparse namespace with data paths, flags, and hyperparameters.
        device_id: GPU ordinal (>= 0 enables the CUDA seeding path).
    """
    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    # Seed every RNG for reproducibility.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    if device_id >= 0:
        # torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        torch.backends.cudnn.deterministic = True

    train_dataset = torch.load(args.bert_data_path + 'train.data')
    if args.do_use_second_dataset:
        train_dataset += torch.load(args.second_dataset_path + 'train.data')
    logger.info('Loading training dataset from %s, number of examples: %d' %
                (args.bert_data_path, len(train_dataset)))

    if args.do_WDP:
        # BUG FIX: DimReducer is needed below (Decoder construction and the
        # do_test branch) even when the cached .npy files exist, so build it
        # once, unconditionally, instead of only on a cache miss.
        DimReducer = reduceDim.DimReducer(args, device)

        # BUG FIX: np.save appends '.npy' but np.load does not, so the load
        # paths must include the extension; the second exists() check was
        # also missing the bert_data_path prefix.
        if os.path.exists(args.bert_data_path + 'train_512dim.data.npy') and \
                os.path.exists(args.bert_data_path + 'train_512dim.labels.npy'):
            all_document = np.load(args.bert_data_path + 'train_512dim.data.npy')
            all_labels = np.load(args.bert_data_path + 'train_512dim.labels.npy')
        else:
            # Build (and cache) the reduced-dimension training matrix.
            all_document = None
            all_labels = None
            for document in tqdm(train_dataset, desc="Loading dataset",
                                 unit="lines"):
                if all_document is None:
                    all_document = DimReducer(
                        document['src'])  # .reshape(args.max_seq_length, -1)
                    all_labels = np.array([document['labels']])
                else:
                    all_document = np.append(all_document,
                                             DimReducer(document['src']),
                                             axis=0)
                    all_labels = np.append(all_labels, document['labels'])
            assert all_labels.shape[0] == all_document.shape[0]
            np.save(args.bert_data_path + 'train_512dim.data', all_document)
            np.save(args.bert_data_path + 'train_512dim.labels', all_labels)

        test_dataset = torch.load(args.bert_data_path + 'valid.data')
        logger.info('Loading valid dataset from %s, number of examples: %d' %
                    (args.bert_data_path, len(test_dataset)))
        if os.path.exists(args.bert_data_path + 'valid_512dim.data.npy') and \
                os.path.exists(args.bert_data_path + 'valid_512dim.labels.npy'):
            test_document = np.load(args.bert_data_path + 'valid_512dim.data.npy')
            test_labels = np.load(args.bert_data_path + 'valid_512dim.labels.npy')
        else:
            # Build (and cache) the reduced-dimension validation matrix.
            test_document = None
            test_labels = None
            for document in tqdm(test_dataset, desc="Loading dataset",
                                 unit="lines"):
                if test_document is None:
                    test_document = DimReducer(
                        document['src'])  # .reshape(args.max_seq_length, -1)
                    test_labels = np.array([document['labels']])
                else:
                    test_document = np.append(test_document,
                                              DimReducer(document['src']),
                                              axis=0)
                    test_labels = np.append(test_labels, document['labels'])
            assert test_labels.shape[0] == test_document.shape[0]
            np.save(args.bert_data_path + 'valid_512dim.data', test_document)
            np.save(args.bert_data_path + 'valid_512dim.labels', test_labels)

        model = reduceDim.Decoder(args, device, DimReducer.hidden_size,
                                  DimReducer.bert_vocab_size)
        _params = filter(lambda p: p.requires_grad, model.parameters())
        optim = optimization.BertAdam(_params, lr=args.lr,
                                      weight_decay=args.l2reg)
        logger.info(model)
        trainer = trainerWDP.build_trainer(args, device_id, model, optim)
        trainer.train([all_document, all_labels], device,
                      [test_document, test_labels])
        if args.do_test:
            model = trainer.model
            model.eval()
            test_dataset = torch.load(args.bert_data_path + 'valid.data')
            logger.info(
                'Loading valid dataset from %s, number of examples: %d' %
                (args.bert_data_path, len(test_dataset)))
            valid_document = []
            valid_labels = []
            # NOTE(review): this iterates train_dataset, not test_dataset, to
            # build the evaluation matrix — preserved from the original;
            # confirm intent.
            for document in train_dataset:
                valid_document.append(DimReducer(document['src']))
                valid_labels.append(document['labels'])
            trainer.test(model, [valid_document, valid_labels], device)
    else:
        model = Summarizer(args, device, load_pretrained_bert=True)
        _params = filter(lambda p: p.requires_grad, model.parameters())
        optim = optimization.BertAdam(_params, lr=args.lr,
                                      weight_decay=args.l2reg)
        logger.info(model)
        trainer = build_trainer(args, device_id, model, optim)
        trainer.train(train_dataset, device)
        if args.do_test:
            model = trainer.model
            model.eval()
            test_dataset = torch.load(args.bert_data_path + 'valid.data')
            logger.info(
                'Loading valid dataset from %s, number of examples: %d' %
                (args.bert_data_path, len(test_dataset)))
            trainer = build_trainer(args, device_id, model, None)
            trainer.test(model, test_dataset, device)
def train(self, train_dataset, device):  # , valid_iter_fct=None, valid_steps=-1)
    """Main training loop over ``train_dataset``.

    Iterates ``args.train_epochs`` epochs of minibatches, optimizing after
    every batch, periodically evaluating on the eval dataloader(s) and
    checkpointing whenever the best accuracy improves.

    Args:
        train_dataset: list of examples consumed by ``get_minibatches``.
        device: torch device string ("cpu" or "cuda").
    """
    true_batchs = []
    accum = 0
    normalization = 0
    total_stats = Statistics()
    report_stats = Statistics()
    self._start_report_manager(start_time=total_stats.start_time)

    if self.args.do_eval:
        test_dataset = torch.load(self.args.bert_data_path + 'test.data')
        logger.info('Loading test dataset from %s, number of examples: %d' %
                    (self.args.bert_data_path, len(test_dataset)))
        test_dataloader = DataLoader(dataset=test_dataset,
                                     batch_size=self.args.batch_size,
                                     shuffle=False)
        if self.args.do_use_second_dataset:
            test_dataset2 = torch.load(self.args.second_dataset_path +
                                       'test.data')
            test_dataloader2 = DataLoader(dataset=test_dataset2,
                                          batch_size=self.args.batch_size,
                                          shuffle=False)

    # BUG FIX: batch_num was referenced in the checkpoint condition below but
    # never defined (NameError).
    batch_num = len(train_dataset) // self.args.batch_size

    for epoch in range(self.args.train_epochs):
        n_correct, n_total = 0., 0.
        reduce_counter = 0
        loss_total = 0
        logger.info('Getting minibatches')
        mini_batches = get_minibatches(train_dataset, self.args.batch_size,
                                       self.args.max_seq_length)
        logger.info('Number of minibatches: %s' % batch_num)
        logger.info('Start training...')
        for step, batch in enumerate(mini_batches):
            self.optim.zero_grad()
            src, labels, segs, clss = batch[0], batch[1], batch[2], batch[3]
            if torch.cuda.is_available():
                src = torch.cuda.LongTensor(src).to(device)
                labels = torch.cuda.LongTensor(labels).to(device)
                segs = torch.cuda.LongTensor(segs).to(device)
                # Right-pad clss rows with -1 to the longest row in the batch.
                clss = [(cls + [-1] * (max([len(i) for i in clss]) - len(cls)))
                        for cls in clss]
                clss = torch.cuda.LongTensor(clss).to(device)
                mask = torch.cuda.ByteTensor((1 - (src == 0))).to(device)
                mask_cls = torch.cuda.ByteTensor((1 - (clss == -1)))
            else:
                src = torch.LongTensor(src).to(device)
                labels = torch.LongTensor(labels).to(device)
                segs = torch.LongTensor(segs).to(device)
                clss = [(cls + [-1] * (max([len(i) for i in clss]) - len(cls)))
                        for cls in clss]
                clss = torch.LongTensor(clss).to(device)
                mask = torch.ByteTensor((1 - (src == 0))).to(device)
                mask_cls = torch.ByteTensor((1 - (clss == -1)))

            logits = self.model(src, segs, clss, mask, mask_cls)  # , mask
            loss = self.loss(logits, labels)
            n_correct += (torch.argmax(logits, -1) == labels).sum().item()
            n_total += len(logits)
            loss_total += loss.item() * len(logits)
            loss.backward()

            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

            batch_stats = Statistics(float(loss.cpu().item()), normalization)
            total_stats.update(batch_stats)
            report_stats.update(batch_stats)
            logger.info('step-{}, loss:{:.4f}, acc:{:.4f}'.format(
                step, loss_total / n_total, n_correct / n_total))

            # Periodic evaluation + best-checkpoint saving.
            # NOTE(review): test_dataloader only exists when args.do_eval is
            # set — preserved from the original; confirm do_eval is required.
            if step % self.check_steps == 0 or step == batch_num:
                valid_acc_2 = 0
                valid_acc = self.test(self.model, test_dataloader, device)
                if self.args.do_use_second_dataset:
                    valid_acc_2 = self.test(self.model, test_dataloader2,
                                            device)
                if valid_acc > self.best_acc or valid_acc_2 > self.best_acc:
                    self.best_acc = valid_acc
                    self._save(
                        str(self.args.model_name) + str(self.args.lr) +
                        'valid', epoch, self.best_acc)

        # In case of multi-step gradient accumulation, update after the epoch.
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

    if self.args.do_eval:
        # Final evaluation; best-effort (original passed the raw dataset here).
        try:
            self.test(self.model, test_dataset, device)
        except Exception as e:
            logger.error(e)
def train(self, train_dataset, device, test_dataset):  # , valid_iter_fct=None, valid_steps=-1)
    """Training loop for the WDP (dimension-reduced) model.

    Args:
        train_dataset: [documents, labels] pair consumed by
            ``get_minibatches_WDP``.
        device: torch device string ("cpu" or "cuda").
        test_dataset: held-out [documents, labels] pair used for periodic
            and per-epoch evaluation/checkpointing.
    """
    normalization = 0
    total_stats = Statistics()
    report_stats = Statistics()
    self._start_report_manager(start_time=total_stats.start_time)

    for epoch in range(self.args.train_epochs):
        n_correct, n_total = 0., 0.
        reduce_counter = 0
        loss_total = 0
        logger.info('Getting minibatches')
        mini_batches = get_minibatches_WDP(train_dataset,
                                           self.args.batch_size)
        logger.info('Number of minibatches: %s' %
                    (len(train_dataset[0]) // self.args.batch_size))
        logger.info('Start training...')
        for step, batch in enumerate(mini_batches):
            x, labels = batch
            # BUG FIX: ``torch.cuda.Tensor`` does not exist (AttributeError);
            # .to(device) already moves the tensor to the GPU when
            # device == "cuda".
            x = torch.Tensor(x).to(device)
            labels = torch.Tensor(labels).to(device)

            self.optim.zero_grad()
            logits = self.model(x)  # , mask
            loss = self.loss(logits, labels)
            n_correct += (torch.argmax(logits, -1) == labels).sum().item()
            n_total += len(logits)
            loss_total += loss.item() * len(logits)
            loss.backward()

            # Multi GPU gradient gather
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()

            batch_stats = Statistics(float(loss.cpu().item()), normalization)
            total_stats.update(batch_stats)
            report_stats.update(batch_stats)
            logger.info('step-{}, loss:{:.4f}, acc:{:.4f}'.format(
                step, loss_total / n_total, n_correct / n_total))

            # Periodic evaluation + best-checkpoint saving.
            if step != 0 and step % self.check_steps == 0:
                valid_acc = self.test(self.model, test_dataset, device)
                if valid_acc > self.best_acc:
                    self.best_acc = valid_acc
                    self._save(
                        str(self.args.model_name) + str(self.args.lr) +
                        'valid', epoch, self.best_acc)

        # Unconditional per-epoch checkpoint with the epoch's validation acc.
        valid_acc = self.test(self.model, test_dataset, device)
        self._save(str(self.args.model_name) + str(self.args.lr), epoch,
                   valid_acc)

        # In case of multi-step gradient accumulation, update after the epoch.
        if self.grad_accum_count > 1:
            if self.n_gpu > 1:
                grads = [
                    p.grad.data for p in self.model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                distributed.all_reduce_and_rescale_tensors(grads, float(1))
            self.optim.step()
def predict_vote(pred_labels, label_ids, test_dataloader):
    """Aggregate per-example predictions into per-entity majority votes.

    Consecutive examples sharing the same entity (first element of
    'src_txt') are grouped into one document; each document's label is
    decided by vote over its examples, then accuracy and a classification
    report are logged.
    """
    act_pred_label = {}  # id:{'entity': '', 'emotion': 0/1/-1, 'predictions': []}
    prev_entity = ""
    # "pred: %s, act: %s" % (pred_labels[i] - 1, label_ids[i] - 1)
    doc_id = 0
    for i, test_dataset in enumerate(test_dataloader):
        # pred_id = int(i / 3)
        doc = test_dataset['src_txt']
        entity = doc[0]
        polarity = test_dataset['labels']
        # Sanity check: dataloader order must match the prediction order.
        assert int(polarity) == (label_ids[i])
        # print(polarity_str, label_ids[pred_id] - 1)
        if prev_entity == "" or entity != prev_entity:
            # New entity starts a new document group.
            doc_id += 1
            prev_entity = entity
            act_pred_label[doc_id] = {}
            act_pred_label[doc_id]['entity'] = entity
            act_pred_label[doc_id]['emotion'] = int(polarity)  # actual label;
            act_pred_label[doc_id]['predictions'] = [int(pred_labels[i])
                                                     ]  # predict label
        else:  # if entity == prev_entity:
            act_pred_label[doc_id]['predictions'].append(int(pred_labels[i]))
    # print(act_pred_label)
    acc = 0.
    total = len(act_pred_label)
    act_labels_all = []
    pred_labels_all = []
    for idx in act_pred_label.keys():
        each_pred = act_pred_label[idx]
        predic_labels = each_pred['predictions']
        act_label = each_pred['emotion']
        # num[c] = how many examples of this document predicted class c.
        num = []
        num.append(predic_labels.count(0))
        num.append(predic_labels.count(1))
        num.append(predic_labels.count(2))
        # 0:0, 1:1, 2:2
        # Tie-break rules (checked in this order, before the plain argmax):
        # NEG==NEU tie -> NEG; NEU==POS tie -> POS; NEG==POS tie -> NEU.
        if num[0] == num[1] and num[0] != 0 and num[0] > num[2]:
            pred = 0
        elif num[1] == num[2] and num[1] != 0 and num[1] > num[0]:
            pred = 2
        elif num[0] == num[2] and num[0] != 0 and num[0] > num[1]:
            pred = 1
        else:
            pred = num.index(max(num))
        # if num[0]== num[1] or num[1] == num[2] or num[0] == num[2]:
        #     print(pred, num)
        if pred == act_label:
            acc += 1
        act_labels_all.append(act_label)
        pred_labels_all.append(pred)
    # print(acc / total)
    # vote_f1 = metrics.f1_score(act_labels_all, pred_labels_all, labels=[0, 1, 2], average=None)
    # vote_recall = metrics.recall_score(act_labels_all, pred_labels_all, labels=[0, 1, 2], average=None)
    # logger.info('>> vote_acc: {:.4f}, vote_recall:{} vote_f1: {}'.format(acc / total, vote_recall, vote_f1))
    pred_result = metrics.classification_report(
        act_labels_all, pred_labels_all, target_names=['NEG', 'NEU', "POS"])
    logger.info('>> vote_acc: {:.4f}'.format(acc / total))
    logger.info('>> Prediction voted results: \n {}'.format(pred_result))