def train(self, train_iter_fct, train_steps):
    logger.info('Start training...')
    step = self.optim._step + 1
    # Batches collected here are the ones actually fed to training.
    true_batchs = []
    # Number of batches accumulated so far.
    accum = 0
    train_iter = train_iter_fct()
    while step <= train_steps:
        reduce_counter = 0
        for i, batch in enumerate(train_iter):
            true_batchs.append(batch)
            accum += 1
            # Once grad_accum_count batches have been collected, run a
            # single training step over all of them.
            if accum == self.grad_accum_count:
                reduce_counter += 1
                loss = self._gradient_accumulation(true_batchs)
                if step % 2 == 0:
                    print('step:', step, 'loss:', loss.cpu().detach().numpy())
                true_batchs = []
                accum = 0
                if step % self.save_checkpoint_steps == 0:
                    self._save(step)
                step += 1
        # The iterator is exhausted; start a new pass over the data.
        train_iter = train_iter_fct()
    if step > train_steps:
        self._save(step)
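# For reference, the accumulate-then-step pattern behind
# self._gradient_accumulation can be sketched in plain PyTorch. This is a
# minimal sketch under assumed names (model, optimizer, criterion, batches);
# it is not the project's actual implementation.
def _gradient_accumulation_sketch(model, optimizer, criterion, batches):
    model.zero_grad()
    total_loss = 0.0
    for inputs, targets in batches:  # the accumulated true_batchs
        # Scale each loss so the summed gradients average over the group.
        loss = criterion(model(inputs), targets) / len(batches)
        loss.backward()  # gradients accumulate across backward() calls
        total_loss += loss.item()
    optimizer.step()  # one parameter update for the whole group
    return total_loss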
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    if pt != "":
        test_from = pt
    else:
        test_from = args.test_from
    logger.info("Loading checkpoint from %s" % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint["opt"])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args, device, load_pretrained_bert=False,
                       bert_config=config)
    model.load_cp(checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args, "valid",
                                                     shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)
    trainer = build_trainer(args, device_id, model, None)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
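# Aside: map_location=lambda storage, loc: storage loads every tensor onto
# CPU regardless of the device it was saved from. PyTorch also accepts the
# equivalent, more readable spelling:
#
#     checkpoint = torch.load(test_from, map_location="cpu")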
def multi_main(args):
    """Spawns 1 process per GPU."""
    init_logger()

    nb_gpu = args.world_size
    mp = torch.multiprocessing.get_context("spawn")

    # Create a thread to listen for errors in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing.
    procs = []
    for i in range(nb_gpu):
        device_id = i
        procs.append(mp.Process(target=run,
                                args=(args, device_id, error_queue),
                                daemon=True))
        procs[i].start()
        logger.info(" Starting process pid: %d " % procs[i].pid)
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()
def test_ext(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = ExtSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test',
                                                    shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    trainer = build_trainer(args, device_id, model, None)
    trainer.test(test_iter, step)
def build_trainer(args, model, optim):
    trainer = Trainer(args, model, optim, args.accum_count)
    if model:
        n_params = utils.tally_parameters(model)
        logger.info('* number of parameters: %d' % n_params)
    return trainer
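# utils.tally_parameters is not shown in this listing; a typical
# implementation (a sketch, not necessarily the project's version) simply
# sums the sizes of all parameter tensors:
def tally_parameters(model):
    # Total number of scalar parameters in a torch.nn.Module.
    return sum(p.numel() for p in model.parameters())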
def test_rouge(temp_dir, cand, ref):
    candidates = [line.strip() for line in open(cand, encoding="utf-8")]
    references = [line.strip() for line in open(ref, encoding="utf-8")]
    assert len(candidates) == len(references)
    cnt = len(candidates)

    current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}".format(current_time))
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)
        os.mkdir(tmp_dir + "/candidate")
        os.mkdir(tmp_dir + "/reference")
    try:
        for i in range(cnt):
            if len(references[i]) < 1:
                continue
            with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w",
                      encoding="utf-8") as f:
                f.write(candidates[i])
            with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w",
                      encoding="utf-8") as f:
                f.write(references[i])

        # Drop pairs where either side is empty and replace the "<q>"
        # sentence separators with spaces.
        candidates_, references_ = [], []
        for c, r in zip(candidates, references):
            if len(c) * len(r) != 0:
                candidates_.append(" ".join(c.split("<q>")))
                references_.append(" ".join(r.split("<q>")))

        # Calculate the ROUGE score.
        logger.info("| Calculating rouge score on {} candidate-reference "
                    "pairs ...".format(len(candidates_)))
        rouge = Rouge()
        rouge_result_dict = rouge.get_scores(candidates_, references_,
                                             avg=True)
    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)
    return rouge_result_dict
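# Hypothetical usage of test_rouge, assuming candidate/gold files written by
# Trainer.test (one summary per line, sentences joined by "<q>"); the file
# names below are made up. With the `rouge` PyPI package, get_scores(...,
# avg=True) returns a dict keyed by 'rouge-1', 'rouge-2' and 'rouge-l':
#
#     rouges = test_rouge('/tmp', 'results_step50000.candidate',
#                         'results_step50000.gold')
#     print(rouges['rouge-1']['f'], rouges['rouge-2']['f'],
#           rouges['rouge-l']['f'])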
def _save(self, step):
    model_state_dict = self.model.state_dict()
    checkpoint = {
        'model': model_state_dict,
        'opt': self.args,
        'optim': self.optim,
    }
    checkpoint_path = os.path.join(self.args.model_path,
                                   'model_step_%d.pt' % step)
    logger.info("Saving checkpoint %s" % checkpoint_path)
    if not os.path.exists(checkpoint_path):
        torch.save(checkpoint, checkpoint_path)
    return checkpoint, checkpoint_path
def wait_and_validate(args, device_id):
    timestep = 0
    if args.test_all:
        cp_files = sorted(
            glob.glob(os.path.join(args.model_path, "model_step_*.pt")))
        cp_files.sort(key=os.path.getmtime)
        xent_lst = []
        for i, cp in enumerate(cp_files):
            step = int(cp.split(".")[-2].split("_")[-1])
            xent = validate(args, device_id, cp, step)
            xent_lst.append((xent, cp))
            # min() compares (xent, path) tuples, so max_step is the index
            # of the best checkpoint so far; stop early once no checkpoint
            # in the last 10 has improved on it.
            max_step = xent_lst.index(min(xent_lst))
            if i - max_step > 10:
                break
        # Keep the three checkpoints with the lowest validation xent.
        xent_lst = sorted(xent_lst, key=lambda x: x[0])[:3]
        logger.info("PPL %s" % str(xent_lst))
        for xent, cp in xent_lst:
            step = int(cp.split(".")[-2].split("_")[-1])
            test(args, device_id, cp, step)
    else:
        while True:
            cp_files = sorted(
                glob.glob(os.path.join(args.model_path, "model_step_*.pt")))
            cp_files.sort(key=os.path.getmtime)
            if cp_files:
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                # An empty file means the checkpoint is still being written.
                if not os.path.getsize(cp) > 0:
                    time.sleep(60)
                    continue
                if time_of_cp > timestep:
                    timestep = time_of_cp
                    step = int(cp.split(".")[-2].split("_")[-1])
                    validate(args, device_id, cp, step)
                    test(args, device_id, cp, step)

            # If an even newer checkpoint appeared meanwhile, process it
            # immediately; otherwise wait before polling again.
            cp_files = sorted(
                glob.glob(os.path.join(args.model_path, "model_step_*.pt")))
            cp_files.sort(key=os.path.getmtime)
            if cp_files:
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if time_of_cp > timestep:
                    continue
            time.sleep(300)
def _save(self, step):
    real_model = self.model
    model_state_dict = real_model.state_dict()
    checkpoint = {
        "model": model_state_dict,
        "opt": self.args,
        "optim": self.optim,
    }
    checkpoint_path = os.path.join(self.args.model_path,
                                   "model_step_%d.pt" % step)
    logger.info("Saving checkpoint %s" % checkpoint_path)
    if not os.path.exists(checkpoint_path):
        torch.save(checkpoint, checkpoint_path)
    return checkpoint, checkpoint_path
def output(self, step, num_steps, learning_rate, start):
    """Write out statistics to stdout.

    Args:
        step (int): current step
        num_steps (int): total number of steps
        learning_rate (float): current learning rate
        start (float): start time of the step
    """
    t = self.elapsed_time()
    step_fmt = "%2d" % step
    if num_steps > 0:
        step_fmt = "%s/%5d" % (step_fmt, num_steps)
    logger.info(
        "Step %s; xent: %4.2f; lr: %7.7f; %3.0f docs/s; %6.0f sec"
        % (step_fmt,
           self.xent(),
           learning_rate,
           self.n_docs / (t + 1e-5),
           time.time() - start))
    sys.stdout.flush()
def build_trainer(args, device_id, model, optim):
    """
    Simplify `Trainer` creation based on user options.

    Args:
        args (:obj:`Namespace`): user options (usually from argument parsing)
        device_id (int): GPU device id, or -1 for CPU
        model (:obj:`onmt.models.NMTModel`): the model to train
        optim (:obj:`onmt.utils.Optimizer`): optimizer used during training
    """
    grad_accum_count = args.accum_count
    n_gpu = args.world_size
    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        gpu_rank = 0
        n_gpu = 0
    print('gpu_rank %d' % gpu_rank)

    tensorboard_log_dir = args.model_path
    writer = SummaryWriter(tensorboard_log_dir, comment="Unmt")
    report_manager = ReportMgr(args.report_every, start_time=-1,
                               tensorboard_writer=writer)

    trainer = Trainer(args, model, optim, grad_accum_count, n_gpu,
                      gpu_rank, report_manager)

    if model:
        n_params = _tally_parameters(model)
        logger.info('* number of parameters: %d' % n_params)
    return trainer
def test_text_ext(args):
    logger.info('Loading checkpoint from %s' % args.test_from)
    checkpoint = torch.load(args.test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    device_id = 0 if device == "cuda" else -1

    model = ExtSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.load_text(args, args.text_src, args.text_tgt,
                                      device)
    trainer = build_trainer(args, device_id, model, None)
    trainer.test(test_iter, -1)
def _save(self, step):
    real_model = self.model
    model_state_dict = real_model.state_dict()
    checkpoint = {
        'model': model_state_dict,
        'opt': self.args,
        'optim': self.optim,
    }
    checkpoint_path = os.path.join(self.args.model_path,
                                   'model_step_%d.pt' % step)
    logger.info("Saving checkpoint %s" % checkpoint_path)
    if not os.path.exists(checkpoint_path):
        torch.save(checkpoint, checkpoint_path)
    return checkpoint, checkpoint_path
def test(args, test_from, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args, device, load_pretrained_bert=False,
                       bert_config=config)
    model.load_cp(checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test',
                                                    shuffle=False),
                                       args.batch_size, device,
                                       shuffle=False, is_test=True)
    trainer = build_trainer(args, model, None)
    trainer.test(test_iter, step)
def train(args, device_id):
    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    def train_iter_fct():
        # Returns a fresh (shuffled) training iterator for each pass.
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'train',
                                                   shuffle=True),
                                      args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = Summarizer(args, device, load_pretrained_bert=True)
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
        model.load_cp(checkpoint)
        optim = model_builder_LAI.build_optim(args, model, checkpoint)
    else:
        optim = model_builder_LAI.build_optim(args, model, None)
    logger.info('model load success............')
    logger.info(model)

    trainer = build_trainer(args, model, optim)
    trainer.train(train_iter_fct, args.train_steps)
def process(params):
    temp_dir, data = params
    candidates, references, pool_id = data
    cnt = len(candidates)

    current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    tmp_dir = os.path.join(temp_dir,
                           "rouge-tmp-{}-{}".format(current_time, pool_id))
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)
        os.mkdir(tmp_dir + "/candidate")
        os.mkdir(tmp_dir + "/reference")
    try:
        for i in range(cnt):
            if len(references[i]) < 1:
                continue
            with open(tmp_dir + "/candidate/cand.{}.txt".format(i), "w",
                      encoding="utf-8") as f:
                f.write(candidates[i])
            with open(tmp_dir + "/reference/ref.{}.txt".format(i), "w",
                      encoding="utf-8") as f:
                f.write(references[i])

        r = pyrouge.Rouge155(temp_dir=temp_dir)
        r.model_dir = tmp_dir + "/reference/"
        r.system_dir = tmp_dir + "/candidate/"
        r.model_filename_pattern = "ref.#ID#.txt"
        r.system_filename_pattern = r"cand.(\d+).txt"
        rouge_results = r.convert_and_evaluate()
        logger.info(rouge_results)
        results_dict = r.output_to_dict(rouge_results)
    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)
    return results_dict
def _format_to_bert(params):
    json_file, args, save_file = params
    if os.path.exists(save_file):
        logger.info('Ignore %s' % save_file)
        return

    bert = BertData(args)
    logger.info('Processing %s' % json_file)
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        source, tgt = d['src'], d['tgt']
        # Pick the oracle sentences (3 per document) to use as labels.
        if args.oracle_mode == 'greedy':
            oracle_ids = greedy_selection(source, tgt, 3)
        elif args.oracle_mode == 'combination':
            oracle_ids = combination_selection(source, tgt, 3)
        b_data = bert.preprocess(source, tgt, oracle_ids)
        if b_data is None:
            continue
        indexed_tokens, labels, segments_ids, cls_ids, src_txt, tgt_txt = b_data
        b_data_dict = {
            'src': indexed_tokens,
            'labels': labels,
            'segs': segments_ids,
            'clss': cls_ids,
            'src_txt': src_txt,
            'tgt_txt': tgt_txt,
        }
        datasets.append(b_data_dict)
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    datasets = []
    gc.collect()
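# A saved shard can be inspected directly; the file name below is
# hypothetical:
#
#     data = torch.load('cnndm.train.0.bert.pt')
#     print(sorted(data[0].keys()))
#     # -> ['clss', 'labels', 'segs', 'src', 'src_txt', 'tgt_txt']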
def test(self, test_iter, step, cal_lead=False, cal_oracle=False):
    """Run the model on test data.

    Args:
        test_iter: test data iterator
    Returns:
        :obj:`nmt.Statistics`: validation loss statistics
    """

    def _get_ngrams(n, text):
        ngram_set = set()
        text_length = len(text)
        max_index_ngram_start = text_length - n
        for i in range(max_index_ngram_start + 1):
            ngram_set.add(tuple(text[i:i + n]))
        return ngram_set

    def _block_tri(c, p):
        # True if candidate sentence c shares a trigram with any
        # already-selected sentence in p.
        tri_c = _get_ngrams(3, c.split())
        for s in p:
            tri_s = _get_ngrams(3, s.split())
            if len(tri_c.intersection(tri_s)) > 0:
                return True
        return False

    # Set model in validating mode.
    if not cal_lead and not cal_oracle:
        self.model.eval()
    stats = Statistics()

    can_path = '%s_step%d.candidate' % (self.args.result_path, step)
    gold_path = '%s_step%d.gold' % (self.args.result_path, step)
    with open(can_path, 'w') as save_pred:
        with open(gold_path, 'w') as save_gold:
            with torch.no_grad():
                for batch in test_iter:
                    src = batch.src
                    segs = batch.segs
                    clss = batch.clss
                    mask = batch.mask_src
                    mask_cls = batch.mask_cls

                    gold = []
                    pred = []

                    if cal_lead:
                        selected_ids = [list(range(batch.clss.size(1)))
                                        ] * batch.batch_size
                    elif cal_oracle:
                        labels = batch.src_sent_labels
                        selected_ids = [[j for j in range(batch.clss.size(1))
                                         if labels[i][j] == 1]
                                        for i in range(batch.batch_size)]
                    else:
                        sent_scores, mask = self.model(src, segs, clss,
                                                       mask, mask_cls)
                        if hasattr(batch, 'src_sent_labels'):
                            # Compute the loss while sent_scores is still a
                            # tensor (before the numpy conversion below).
                            labels = batch.src_sent_labels
                            loss = self.loss(sent_scores, labels.float())
                            loss = (loss * mask.float()).sum()
                            batch_stats = Statistics(
                                float(loss.cpu().data.numpy()), len(labels))
                            stats.update(batch_stats)
                        sent_scores = sent_scores + mask.float()
                        sent_scores = sent_scores.cpu().data.numpy()
                        selected_ids = np.argsort(-sent_scores, 1)

                    for i, idx in enumerate(selected_ids):
                        _pred = []
                        if len(batch.src_str[i]) == 0:
                            continue
                        for j in selected_ids[i][:len(batch.src_str[i])]:
                            if j >= len(batch.src_str[i]):
                                continue
                            candidate = batch.src_str[i][j].strip()
                            if self.args.block_trigram:
                                if not _block_tri(candidate, _pred):
                                    _pred.append(candidate)
                            else:
                                _pred.append(candidate)

                            if ((not cal_oracle)
                                    and (not self.args.recall_eval)
                                    and len(_pred) == 3):
                                break

                        _pred = '<q>'.join(_pred)
                        if self.args.recall_eval:
                            _pred = ' '.join(
                                _pred.split()[:len(batch.tgt_str[i].split())])

                        pred.append(_pred)
                        gold.append(batch.tgt_str[i])

                    for i in range(len(gold)):
                        save_gold.write(gold[i].strip() + '\n')
                    for i in range(len(pred)):
                        save_pred.write(pred[i].strip() + '\n')

    if step != -1 and self.args.report_rouge:
        rouges = test_rouge(self.args.temp_dir, can_path, gold_path)
        logger.info('Rouges at step %d \n%s'
                    % (step, rouge_results_to_str(rouges)))
    self._report_step(0, step, valid_stats=stats)
    return stats
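# Standalone illustration of the trigram-blocking rule above, assuming
# _get_ngrams/_block_tri were lifted to module level (the sentences are
# made up):
#
#     selected = ['the cat sat on the mat']
#     _block_tri('on the mat the dog slept', selected)
#     # -> True: shares the trigram "on the mat", so it is skipped
#     _block_tri('a completely different sentence', selected)
#     # -> False: no trigram overlap, so it is kept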
def train(self, train_iter_fct, train_steps, valid_iter_fct=None,
          valid_steps=-1):
    """
    The main training loop: iterates over the training data
    (i.e. `train_iter_fct`) and optionally runs validation
    (i.e. iterating over `valid_iter_fct`).

    Args:
        train_iter_fct (function): a function that returns the train
            iterator, e.g. train_iter_fct = lambda: generator(*args, **kwargs)
        train_steps (int): total number of training steps
        valid_iter_fct (function): same as train_iter_fct, for valid data
        valid_steps (int): run validation every `valid_steps` steps

    Returns:
        total training statistics
    """
    logger.info('Start training...')
    step = self.optim._step + 1
    true_batchs = []
    accum = 0
    normalization = 0
    train_iter = train_iter_fct()

    total_stats = Statistics()
    report_stats = Statistics()
    self._start_report_manager(start_time=total_stats.start_time)

    while step <= train_steps:
        reduce_counter = 0
        for i, batch in enumerate(train_iter):
            # In multi-GPU training each rank only processes its own
            # round-robin slice of the batches.
            if self.n_gpu == 0 or (i % self.n_gpu == self.gpu_rank):
                true_batchs.append(batch)
                normalization += batch.batch_size
                accum += 1
                if accum == self.grad_accum_count:
                    reduce_counter += 1
                    if self.n_gpu > 1:
                        normalization = sum(
                            distributed.all_gather_list(normalization))

                    self._gradient_accumulation(true_batchs, normalization,
                                                total_stats, report_stats)

                    report_stats = self._maybe_report_training(
                        step, train_steps, self.optim.learning_rate,
                        report_stats)

                    true_batchs = []
                    accum = 0
                    normalization = 0
                    if (step % self.save_checkpoint_steps == 0
                            and self.gpu_rank == 0):
                        self._save(step)
                    step += 1
                    if step > train_steps:
                        break
        train_iter = train_iter_fct()
    return total_stats
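# The `i % self.n_gpu == self.gpu_rank` guard shards batches round-robin
# across ranks; a tiny standalone illustration with hypothetical values:
#
#     n_gpu = 2
#     batches = list(range(8))
#     for gpu_rank in range(n_gpu):
#         print(gpu_rank, [b for i, b in enumerate(batches)
#                          if i % n_gpu == gpu_rank])
#     # -> 0 [0, 2, 4, 6]
#     #    1 [1, 3, 5, 7]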
def _lazy_dataset_loader(pt_file, corpus_type):
    dataset = torch.load(pt_file)
    logger.info('Loading %s dataset from %s, number of examples: %d'
                % (corpus_type, pt_file, len(dataset)))
    return dataset
def log(self, *args, **kwargs):
    logger.info(*args, **kwargs)
def _lazy_dataset_loader(pt_file, corpus_type):
    dataset = torch.load(pt_file)
    logger.info('Loading {} dataset from {}, number of examples: {}'.format(
        corpus_type, pt_file, len(dataset)))
    return dataset