def test_text_abs(args, device_id, pt, step):
    """Load an abstractive checkpoint and decode the test split at `step`.

    `pt` overrides `args.test_from` when non-empty; model flags saved in the
    checkpoint are copied back onto `args` before the model is rebuilt.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # An explicit checkpoint path takes precedence over the configured one.
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining options the checkpoint was trained with.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(
        args, load_dataset(args, 'test', shuffle=False),
        args.test_batch_size, device, shuffle=False, is_test=True)
    logger.info(dir(test_iter))
    tokenizer = BertTokenizer.from_pretrained(args.model_path, do_lower_case=True)
    # Special decoding symbols mapped through the tokenizer vocabulary.
    symbols = {
        'BOS': tokenizer.vocab['[unused1]'],
        'EOS': tokenizer.vocab['[unused2]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused3]'],
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def _format_to_bert(params):
    """Convert one tokenized JSON shard into BERT-ready examples and save it.

    params: 4-tuple (corpus_type, json_file, args, save_file) — packed for use
    as a multiprocessing worker. Writes a list of dicts via torch.save.
    """
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    # Skip shards already produced by a previous run.
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return
    bert = BertData(args)
    logger.info('Processing %s' % json_file)
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        # Each entry is itself a JSON-encoded string; decode it again.
        d = json.loads(d)
        source, tgt, alignment = d['src'], d['tgt'], d['alignment']
        # temp code
        tgt = [tgt]
        # temp code end
        # Oracle extractive labels: greedily pick up to 3 source sentences.
        sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
        b_data = bert.preprocess(source, tgt, alignment, sent_labels, \
            use_bert_basic_tokenizer=args.use_bert_basic_tokenizer, \
            is_test=is_test)
        # b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer)
        if (b_data is None):
            continue
        src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt, alignment = b_data
        b_data_dict = {
            "src": src_subtoken_idxs,
            "tgt": tgt_subtoken_idxs,
            "src_sent_labels": sent_labels,
            "segs": segments_ids,
            'clss': cls_ids,
            'src_txt': src_txt,
            "tgt_txt": tgt_txt,
            "alignment": alignment
        }
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    # Drop the reference and collect to keep worker memory bounded.
    datasets = []
    gc.collect()
def train_abs_multi(args):
    """Launch one abstractive-training worker process per GPU and wait for all."""
    init_logger()
    ctx = torch.multiprocessing.get_context("spawn")
    # Child processes report failures through this queue to the handler.
    error_queue = ctx.SimpleQueue()
    error_handler = ErrorHandler(error_queue)
    workers = []
    for device_id in range(args.world_size):
        proc = ctx.Process(target=run, args=(args, device_id, error_queue), daemon=True)
        workers.append(proc)
        proc.start()
        logger.info(" Starting process pid: %d " % proc.pid)
        error_handler.add_child(proc.pid)
    for worker in workers:
        worker.join()
def load_models_abs(args, device_id, pt, step):
    """Load an AbsSummarizer checkpoint and return a ready-to-use predictor."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Explicit checkpoint path wins over the configured default.
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Re-apply model-defining flags saved in the checkpoint.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]'],
    }
    return build_predictor(args, tokenizer, symbols, model, logger)
def validate(args, device_id, pt, step):
    """Validate an extractive checkpoint on the dev split; return its xent."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Re-apply model-defining flags saved in the checkpoint.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    model = ExtSummarizer(args, device, checkpoint)
    model.eval()
    valid_iter = data_loader.Dataloader(
        args, load_dataset(args, 'dev', shuffle=False),
        args.batch_size, device, shuffle=False, is_test=False)
    trainer = build_trainer(args, device_id, model, None)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def inference_save_by_id(self, loader, cache_dir, model_file):
    """Run inference over `loader`, saving per-example outputs by id.

    Optionally restores model weights from `model_file` (resolved under
    self.savepath); recreates the temp/temp_gold output directories first.
    """
    datalist = CommentDataset(self.args, loader, cache_dir, self.dataname, self.device, self.tokenizer)
    # Single batch containing the whole dataset.
    data_iter = Iterator(datalist, batch_size=len(datalist), device=self.device, shuffle=False)  # 1651
    if model_file:
        print('Model is loaded from ', model_file)
        if logger:
            logger.info('Model is from {}'.format(model_file))
        model_path = os.path.join(self.savepath, model_file)
        # strict=False: tolerate partial key mismatches in the state dict.
        self.model.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage)['model'],
            strict=False)
    else:
        print('Not loading pretrained model...')
        if logger:
            logger.info('Not loading pretrained model ...')
    try:
        print('[Warning] Going to delete the original temp file at ', self.args.savepath)
        shutil.rmtree(self.args.savepath + '/temp')
    except OSError:
        # Directory did not exist (or could not be removed) — was a bare
        # `except:`; narrowed so unrelated errors are no longer swallowed.
        print('Creating new temp folder')
    os.makedirs(os.path.join(self.args.savepath, 'temp'), exist_ok=True)
    os.makedirs(os.path.join(self.args.savepath, 'temp_gold'), exist_ok=True)
    self.predictor.translate(data_iter, model_file, cal_rouge=False, save=False, save_by_id=True)
def test_text_ext(args):
    """Score raw text input with an extractive checkpoint and return stats."""
    logger.info('Loading checkpoint from %s' % args.test_from)
    checkpoint = torch.load(args.test_from, map_location=lambda storage, loc: storage)
    # Re-apply model-defining flags saved in the checkpoint.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    device_id = 0 if device == "cuda" else -1
    logger.info('Coming here:1')
    model = ExtSummarizer(args, device, checkpoint)
    model.eval()
    logger.info('Coming here:2')
    logger.info('args: %s' % args)
    test_iter = data_loader.load_text(args, args.text_src, args.text_tgt, device)
    trainer = build_trainer(args, device_id, model, None)
    stats = trainer.test(test_iter, -1)
    logger.info('Coming here:3')
    return stats
def fix_missing_period(args):
    """Repair missing sentence-final periods in every .story file of raw_path,
    writing fixed copies into save_path; verifies input/output file counts."""
    input_dir = os.path.abspath(args.raw_path)
    output_dir = os.path.abspath(args.save_path)
    os.makedirs(output_dir, exist_ok=True)
    logger.info("Fixing missing period in %s and saving in %s..." % (input_dir, output_dir))
    for name in os.listdir(input_dir):
        if name.endswith('story'):
            _fix_missing_period(os.path.join(input_dir, name),
                                os.path.join(output_dir, name))
    # Check that the tokenized stories directory contains the same number of files as the original directory
    num_inputs = len(os.listdir(input_dir))
    num_outputs = len(os.listdir(output_dir))
    if num_inputs != num_outputs:
        raise Exception(
            "The output directory %s contains %i files, but it should contain the same number as %s (which has %i files). Was there an error during processing?" % (
                output_dir, num_outputs, input_dir, num_inputs))
    logger.info("Successfully finished fixing missing period %s to %s.\n" % (input_dir, output_dir))
def output(self, step, num_steps, learning_rate, start):
    """Log one progress line (step, xent, lr, throughput, elapsed time).

    Args:
        step (int): current step
        num_steps (int): total steps; appended as "step/total" when positive
        learning_rate (float): current learning rate
        start (float): wall-clock start time of the step
    """
    elapsed = self.elapsed_time()
    if num_steps > 0:
        step_label = "%s/%5d" % ("%2d" % step, num_steps)
    else:
        step_label = "%2d" % step
    # +1e-5 guards against division by zero on a just-started timer.
    docs_per_sec = self.n_docs / (elapsed + 1e-5)
    logger.info(
        "Step %s; xent: %4.2f; lr: %7.7f; %3.0f docs/s; %6.0f sec"
        % (step_label, self.xent(), learning_rate, docs_per_sec, time.time() - start)
    )
    sys.stdout.flush()
def str_format_to_bert_test(source, args, save_file):
    """Preprocess a raw `source` string into a one-example BERT test shard.

    NOTE(review): `tgt` is read on its first tokenization line before it is
    ever assigned, so this function raises UnboundLocalError as written.
    The intended target text (empty summary? a missing parameter?) needs to
    be confirmed with the author before this can be fixed.
    """
    bert = BertData(args)
    logger.info('Processing %s' % source)
    # BUG: `tgt` is an unbound local here — see docstring.
    tgt = [word_tokenize(t) for t in sent_tokenize(tgt)]
    source = [word_tokenize(t) for t in sent_tokenize(source)]
    # Oracle extractive labels: greedily pick up to 3 source sentences.
    sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
    if (args.lower):
        source = [' '.join(s).lower().split() for s in source]
        tgt = [' '.join(s).lower().split() for s in tgt]
    b_data = bert.preprocess(
        source, tgt, sent_labels,
        use_bert_basic_tokenizer=args.use_bert_basic_tokenizer, is_test=True)
    if (b_data is None):
        return
    src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt = b_data
    b_data_dict = {
        "src": src_subtoken_idxs,
        "tgt": tgt_subtoken_idxs,
        "src_sent_labels": sent_labels,
        "segs": segments_ids,
        'clss': cls_ids,
        'src_txt': src_txt,
        "tgt_txt": tgt_txt
    }
    # NOTE(review): these two assignments are dead — nothing reads them
    # before the function returns.
    sent_labels = [0 for i in range(len(sent_labels))]
    tgt = []
    datasets = [b_data_dict]
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
def validate(args, device_id, pt, step):
    """Validate an abstractive checkpoint on the valid split; return its xent."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Re-apply model-defining flags saved in the checkpoint.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()
    valid_iter = data_loader.Dataloader(
        args, load_dataset(args, 'valid', shuffle=False),
        args.batch_size, device, shuffle=False, is_test=False,
        tokenizer=tokenizer)
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)
    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def test_abs(args, device_id, pt, step):
    """Decode the test split with an abstractive checkpoint.

    `pt` overrides `args.test_from` when non-empty; model flags saved in
    the checkpoint are copied back onto `args` before rebuilding the model.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining options the checkpoint was trained with.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # Reuse the tokenizer BertData builds from args rather than a fixed model.
    tokenizer = BertData(args).tokenizer
    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    # tokenizer = None
    # if args.pretrained_model_type in ['bert-base-uncased', 'bert-base-multilingual-uncased']:
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_type, do_lower_case=True, cache_dir=args.temp_dir)
    #
    # if not tokenizer:
    #     raise NotImplementedError("tokenizer")
    # tokenizer = add_to_vocab(tokenizer, ['[unused0]', '[unused1]', '[PAD]', '[unused2]'])
    # Special decoding symbols resolved through the tokenizer.
    symbols = {'BOS': tokenizer.convert_tokens_to_ids('[unused0]'),
               'EOS': tokenizer.convert_tokens_to_ids('[unused1]'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]'),
               'EOQ': tokenizer.convert_tokens_to_ids('[unused2]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def train(args, device_id):
    """Train the extractive classifier on `device_id`, saving loss history.

    Seeds Python/torch RNGs for reproducibility, optionally resumes from
    `args.train_from`, then runs the trainer and pickles losses/doc counts.
    """
    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    # Seed all RNGs once for determinism (the original repeated this block
    # verbatim after the CUDA setup; the duplicate was redundant and removed).
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    def train_iter_fct():
        # Fresh dataloader per epoch/pass over the training shards.
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device, shuffle=True, is_test=False)

    model = Summarizer(args, device, load_pretrained_bert=True)
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from, map_location=lambda storage, loc: storage)
        # Restore model-defining flags from the checkpoint's saved options.
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
        model.load_cp(checkpoint)
        optim = model_builder.build_optim(args, model, checkpoint)
    else:
        optim = model_builder.build_optim(args, model, None)
    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    losses, n_docs = trainer.train(train_iter_fct, args.train_steps)
    save_pickle(losses, 'losses_classifier')
    save_pickle(n_docs, 'docs_classifier')
def validate(args, device_id, pt, step):
    """Validate an abstractive checkpoint (BART or multilingual BERT vocab)
    on the valid split and return its cross-entropy."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining options the checkpoint was trained with.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)
    if args.bart:
        # NOTE(review): hardcoded user-local path — works only on the
        # original author's machine; should come from configuration.
        tokenizer = AutoTokenizer.from_pretrained('/home/ybai/downloads/bart', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False)
        # BART vocab: madeupword*/"<pad>" stand in for BOS/EOS/EOQ/PAD.
        symbols = {'BOS': tokenizer.encoder['madeupword0000'],
                   'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': tokenizer.encoder['<pad>'],
                   'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=True)
        symbols = {'BOS': tokenizer.vocab['[unused1]'],
                   'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'],
                   'EOQ': tokenizer.vocab['[unused3]']}
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)
    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def test_text_abs(args, device_id, pt, step):
    """Decode the test split with a Chinese-BERT abstractive checkpoint."""
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    # Explicit checkpoint path wins over the configured default.
    test_from = pt if pt != "" else args.test_from
    logger.info("Loading checkpoint from %s" % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Re-apply model-defining flags saved in the checkpoint.
    saved_opt = vars(checkpoint["opt"])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(
        args,
        load_dataset(args, "test", shuffle=False),
        args.test_batch_size,
        device,
        shuffle=False,
        is_test=True,
    )
    tokenizer = BertTokenizer.from_pretrained(
        "bert-base-chinese", do_lower_case=True, cache_dir=args.temp_dir
    )
    # Special decoding symbols mapped through the tokenizer vocabulary.
    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
        "EOQ": tokenizer.vocab["[unused2]"],
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def build_trainer(args, device_id, model, optim):
    """Build a `Trainer` for the extractive model.

    Args:
        args: user options (argparse Namespace); supplies accum_count,
            world_size, gpu_ranks, model_path, report_every.
        device_id (int): GPU index, or a negative value for CPU.
        model: the model to train (may be None; parameter count is only
            logged when a model is given).
        optim: optimizer used during training.

    Returns:
        Trainer: configured with gradient accumulation, GPU rank and a
        tensorboard-backed report manager.
    """
    grad_accum_count = args.accum_count
    n_gpu = args.world_size
    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        # CPU run: no GPUs participate.
        gpu_rank = 0
        n_gpu = 0
    print('gpu_rank %d' % gpu_rank)
    # Tensorboard writer lives alongside the model checkpoints.
    tensorboard_log_dir = args.model_path
    writer = SummaryWriter(tensorboard_log_dir, comment="Unmt")
    report_manager = ReportMgr(args.report_every, start_time=-1, tensorboard_writer=writer)
    # Build the trainer itself.
    trainer = Trainer(args, model, optim, grad_accum_count, n_gpu, gpu_rank, report_manager)
    if (model):
        # Log total parameter count for the record.
        n_params = _tally_parameters(model)
        logger.info('* number of parameters: %d' % n_params)
    return trainer
def test_text_abs(args, device_id, pt, step):
    """Decode the test split with an abstractive checkpoint, choosing
    between a BART vocabulary and multilingual BERT (with extra Chinese
    special tokens) based on `args.bart`."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining options the checkpoint was trained with.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # for chinese tokenization
    add_token_list = ['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]']
    if args.bart:
        tokenizer = AutoTokenizer.from_pretrained('bart-base', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False)
        # tokenizer = AutoTokenizer.from_pretrained('/home/ybai/downloads/bart', do_lower_case=True,
        #                                           cache_dir=args.temp_dir, local_files_only=False)
        # BART vocab: madeupword*/"<pad>" stand in for BOS/EOS/EOQ/PAD.
        symbols = {'BOS': tokenizer.encoder['madeupword0000'],
                   'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': tokenizer.encoder['<pad>'],
                   'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False,
                                                  additional_special_tokens=add_token_list)
        symbols = {'BOS': tokenizer.vocab['[unused1]'],
                   'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'],
                   'EOQ': tokenizer.vocab['[unused3]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text_abs(args, device_id, pt, step):
    """Decode the test split with a KoBERT-vocabulary abstractive checkpoint."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Explicit checkpoint path wins over the configured default.
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Re-apply model-defining flags saved in the checkpoint.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    model = AbsSummarizer(args, device, checkpoint)
    model.eval()
    test_iter = data_loader.Dataloader(
        args, load_dataset(args, 'test', shuffle=False),
        args.test_batch_size, device, shuffle=False, is_test=True)
    vocab = get_kobert_vocab(cachedir=args.temp_dir)
    # KoBERT has no dedicated EOQ token; EOS doubles for it.
    symbols = {
        'BOS': vocab.token_to_idx['[BOS]'],
        'EOS': vocab.token_to_idx['[EOS]'],
        'PAD': vocab.token_to_idx['[PAD]'],
        'EOQ': vocab.token_to_idx['[EOS]'],
    }
    predictor = build_predictor(args, vocab, symbols, model, logger)
    predictor.translate(test_iter, step)
def _format_to_bert(params):
    """Convert one tokenized JSON shard into BERT-ready examples and save it.

    params: 4-tuple (corpus_type, json_file, args, save_file) — packed for
    use as a multiprocessing worker.
    """
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    # Skip shards already produced by a previous run.
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return
    bert = BertData(args)
    logger.info('Processing %s' % json_file)
    # print("PATH is : {}".format(json_file))
    # base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    # if corpus_type == 'train':
    #     json_file = os.path.join(base_dir, 'merged_stories_tokenized', json_file)
    # elif corpus_type == 'valid':
    #     json_file = os.path.join(base_dir, 'merged_stories_tokenized_val', json_file)
    # elif corpus_type == 'test':
    #     json_file = os.path.join(base_dir, 'merged_stories_tokenized_test', json_file)
    # else:
    #     print("Not in in dataset")
    #     sys.exit()
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        source, tgt = d['src'], d['tgt']
        # Oracle extractive labels: greedily pick up to 3 source sentences.
        sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
        b_data = bert.preprocess(source, tgt, sent_labels,
                                 use_bert_basic_tokenizer=args.use_bert_basic_tokenizer,
                                 is_test=is_test)
        # b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer)
        if (b_data is None):
            continue
        src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt = b_data
        b_data_dict = {"src": src_subtoken_idxs, "tgt": tgt_subtoken_idxs,
                       "src_sent_labels": sent_labels, "segs": segments_ids,
                       'clss': cls_ids, 'src_txt': src_txt, "tgt_txt": tgt_txt}
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    # Drop the reference and collect to keep worker memory bounded.
    datasets = []
    gc.collect()
def getTranslator():
    """Build a CPU trainer wrapping the checkpointed summarizer.

    Relies on a module-level `args` namespace for all configuration.
    """
    device = "cpu"
    logger.info('Loading checkpoint from %s' % args.test_from)
    checkpoint = torch.load(args.test_from, map_location=lambda storage, loc: storage)
    # Re-apply model-defining flags saved in the checkpoint.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args, device, load_pretrained_bert=False, bert_config=config)
    model.load_cp(checkpoint)
    model.eval()
    # device_id -1 signals CPU to build_trainer.
    return build_trainer(args, -1, model, None)
def train(args, device_id):
    """Train the extractive summarizer on `device_id`.

    Seeds RNGs for reproducibility, optionally resumes from
    `args.train_from`, then runs the trainer for `args.train_steps`.
    """
    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    logger.info("Device ID %d" % device_id)
    logger.info("Device %s" % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)
    # NOTE(review): the three seeding calls below duplicate the ones above
    # verbatim; harmless but redundant.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        # Fresh dataloader per pass over the training shards.
        return data_loader.Dataloader(
            args,
            load_dataset(args, "train", shuffle=True),
            args.batch_size,
            device,
            shuffle=True,
            is_test=False,
        )

    model = Summarizer(args, device, load_pretrained_bert=True)
    if args.train_from != "":
        logger.info("Loading checkpoint from %s" % args.train_from)
        checkpoint = torch.load(
            args.train_from, map_location=lambda storage, loc: storage
        )
        # Restore model-defining flags from the checkpoint's saved options.
        opt = vars(checkpoint["opt"])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
        model.load_cp(checkpoint)
        optim = model_builder.build_optim(args, model, checkpoint)
    else:
        optim = model_builder.build_optim(args, model, None)
    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    trainer.train(train_iter_fct, args.train_steps)
def validate(args, device_id, pt, step):
    """Validate a checkpoint on the valid split under a comet test context;
    return the validation cross-entropy."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Explicit checkpoint path wins over the configured default.
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Re-apply model-defining flags saved in the checkpoint.
    saved_opt = vars(checkpoint['opt'])
    for key, value in saved_opt.items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    config = BertConfig.from_json_file(args.bert_config_path)
    model = Summarizer(args, device, load_pretrained_bert=False, bert_config=config)
    model.load_cp(checkpoint)
    model.eval()
    valid_iter = data_loader.Dataloader(
        args, load_dataset(args, 'valid', shuffle=False),
        args.batch_size, device, shuffle=False, is_test=False)
    trainer = build_trainer(args, device_id, model, None)
    # comet_experiment.log_parameters(config)
    with comet_experiment.test():
        stats = trainer.validate(valid_iter, step)
        return stats.xent()
def _format_to_bert(params):
    """Convert one JSON shard with positive and negative targets into
    BERT-ready pos/neg pair examples and save them.

    params: 4-tuple (corpus_type, json_file, args, save_file) — packed for
    use as a multiprocessing worker. Each job supplies 'src', 'tgt' (positive
    summary) and 'n_tgt' (negative summary).
    """
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    # Skip shards already produced by a previous run.
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return
    bert = BertData(args)
    logger.info('Processing %s' % json_file)
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        source, tgt, n_tgt = d['src'], d['tgt'], d['n_tgt']
        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
            n_tgt = [' '.join(s).lower().split() for s in n_tgt]
        b_data = bert.preprocess(
            source, tgt, n_tgt,
            use_bert_basic_tokenizer=args.use_bert_basic_tokenizer,
            is_test=is_test)
        if (b_data is None):
            continue
        # p_* fields describe the positive (gold) pair, n_* the negative one.
        p_pair, n_pair, p_segments_ids, n_segments_ids, p_summ_mask, n_summ_mask, src_txt, tgt_txt, n_tgt_text = b_data
        b_data_dict = {
            "pos": p_pair,
            "neg": n_pair,
            "p_summ_mask": p_summ_mask,
            'n_summ_mask': n_summ_mask,
            "p_segs": p_segments_ids,
            "n_segs": n_segments_ids,
            'src_txt': src_txt,
            "tgt_txt": tgt_txt,
            "n_tgt_text": n_tgt_text
        }
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    # Drop the reference and collect to keep worker memory bounded.
    datasets = []
    gc.collect()
def test_detector(self, loader, run_gen=False, portion='all'):
    """Evaluate the detector on `loader`, first with the stage-1 best model
    (no generated responses), then — when args.test_adv is set — with the
    stage-2 adversarially-trained model plus generated responses.

    FIX: the adversarial `trainer.testing` call previously passed
    `write_type=="a"` (an equality comparison on an undefined name, raising
    NameError at runtime) instead of the keyword argument `write_type="a"`.
    """
    test = TreeDataset(self.args, loader, self.dataname, self.device, self.tokenizer)
    #test = CommentDataset(self.args, loader, self.dataname, self.device, self.tokenizer)
    # One batch for the whole set unless it exceeds the configured batch size.
    data_iter = Iterator(test, train=False, device=self.device,
                         batch_size=len(test) if len(test) < self.bs else self.bs,
                         sort_key=lambda x: len(x.src), sort_within_batch=False)
    # Define trainer (eval-mode loss).
    train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, train=False)
    trainer = build_trainer(self.args, self.model, self.optim, train_loss)

    logger.info('Test on best model (stage-1)')
    best_model = os.path.join(self.args.savepath, 'best_model.pt')
    if os.path.exists(best_model):
        try:
            self.model.load_state_dict(
                torch.load(best_model, map_location=lambda storage, loc: storage)['model'])
        except Exception:
            # Fall back to a lenient load when checkpoint keys drifted.
            self.model.load_state_dict(
                torch.load(best_model, map_location=lambda storage, loc: storage)['model'],
                strict=False)
            logger.info('[Warning] The keys in state dict do not strictly match')
    test_stat = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=False,
                                info="Without Generated Response >>", write_type="w")
    test_stat.write_results(os.path.join(self.args.savepath, 'result_test.csv'),
                            'test-' + portion, self.args.label_num)

    if self.args.test_adv:
        logger.info('Test on adversarially-trained model (stage-2)')
        best_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
        if os.path.exists(best_model):
            try:
                self.model.load_state_dict(
                    torch.load(best_model, map_location=lambda storage, loc: storage)['model'])
            except Exception:
                self.model.load_state_dict(
                    torch.load(best_model, map_location=lambda storage, loc: storage)['model'],
                    strict=False)
                logger.info('[Warning] The keys in state dict do not strictly match')
        # Append ("a") to the results written by the stage-1 pass above.
        test_stat = trainer.testing(data_iter, tokenizer=self.tokenizer, gen_flag=True,
                                    info="\nWith Generated Response from {} >>".format(best_model.split("/")[-1]),
                                    write_type="a")
        predictor = build_predictor(self.args, self.model, self.tokenizer, self.symbols, logger)
        predictor.translate(data_iter, 'best', have_gold=False)
def validate_abs(args, device_id):
    """Validate abstractive checkpoints, logging xent to tensorboard.

    With args.test_all: validate every saved checkpoint (oldest first),
    early-stopping 10 checkpoints after the best, then decode all of them.
    Otherwise: poll the model directory forever, validating and decoding
    each new checkpoint as it appears.
    """
    timestep = 0
    tensorboard_writer = SummaryWriter(args.model_path + '/valid_test', comment="Unmt")
    if (args.test_all):
        cp_files = sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
        cp_files.sort(key=os.path.getmtime)
        xent_lst = []
        for i, cp in enumerate(cp_files):
            logger.info('validate: %s' % cp)
            # Step number is encoded in the checkpoint filename.
            step = int(cp.split('.')[-2].split('_')[-1])
            if (args.test_start_from != -1 and step < args.test_start_from):
                # Sentinel xent so skipped checkpoints sort last.
                xent_lst.append((1e6, cp))
                continue
            xent = validate(args, device_id, cp, step)
            tensorboard_writer.add_scalar('valid/xent', xent, step)
            tensorboard_writer.flush()
            xent_lst.append((xent, cp))
            # Early stop: no improvement for 10 checkpoints past the best.
            max_step = xent_lst.index(min(xent_lst))
            if (i - max_step > 10):
                break
        xent_lst = sorted(xent_lst, key=lambda x: x[0])
        logger.info('PPL %s' % str(xent_lst))
        for xent, cp in xent_lst:
            step = int(cp.split('.')[-2].split('_')[-1])
            logger.info('test: %s' % cp)
            test_abs(args, device_id, cp, step)
    else:
        # Watch mode: keep polling for fresh checkpoints.
        while (True):
            cp_files = sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                # Empty file: checkpoint still being written; retry shortly.
                if (not os.path.getsize(cp) > 0):
                    time.sleep(60)
                    continue
                if (time_of_cp > timestep):
                    timestep = time_of_cp
                    step = int(cp.split('.')[-2].split('_')[-1])
                    xent = validate(args, device_id, cp, step)
                    tensorboard_writer.add_scalar('valid/xent', xent, step)
                    tensorboard_writer.flush()
                    test_abs(args, device_id, cp, step)
            # Re-check: if an even newer checkpoint appeared meanwhile,
            # loop immediately; otherwise sleep before polling again.
            cp_files = sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if (time_of_cp > timestep):
                    continue
            else:
                time.sleep(300)
def validate_ext(args, device_id):
    """Validate extractive checkpoints.

    With args.test_all: validate every saved checkpoint (oldest first),
    early-stopping 10 checkpoints after the best, then decode the top-3 on
    both dev and test. Otherwise: poll the model directory forever,
    validating and decoding each new checkpoint as it appears.
    """
    timestep = 0
    if (args.test_all):
        cp_files = sorted(
            glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
        cp_files.sort(key=os.path.getmtime)
        xent_lst = []
        for i, cp in enumerate(cp_files):
            # Step number is encoded in the checkpoint filename.
            step = int(cp.split('.')[-2].split('_')[-1])
            xent = validate(args, device_id, cp, step)
            xent_lst.append((xent, cp))
            # Early stop: no improvement for 10 checkpoints past the best.
            max_step = xent_lst.index(min(xent_lst))
            if (i - max_step > 10):
                break
        # Keep only the 3 lowest-xent checkpoints.
        xent_lst = sorted(xent_lst, key=lambda x: x[0])[:3]
        logger.info('PPL %s' % str(xent_lst))
        logger.info(
            'Decoding and Computing ROUGE for top-3 models for DEV Set: ')
        for xent, cp in xent_lst:
            step = int(cp.split('.')[-2].split('_')[-1])
            val_ext(args, device_id, cp, step)
        logger.info(
            'Decoding and Computing ROUGE for top-3 models for TEST Set: ')
        for xent, cp in xent_lst:
            step = int(cp.split('.')[-2].split('_')[-1])
            test_ext(args, device_id, cp, step)
    else:
        # Watch mode: keep polling for fresh checkpoints.
        while (True):
            cp_files = sorted(
                glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                # Empty file: checkpoint still being written; retry shortly.
                if (not os.path.getsize(cp) > 0):
                    time.sleep(60)
                    continue
                if (time_of_cp > timestep):
                    timestep = time_of_cp
                    step = int(cp.split('.')[-2].split('_')[-1])
                    validate(args, device_id, cp, step)
                    test_ext(args, device_id, cp, step)
            # Re-check: if an even newer checkpoint appeared meanwhile,
            # loop immediately; otherwise sleep before polling again.
            cp_files = sorted(
                glob.glob(os.path.join(args.model_path, 'model_step_*.pt')))
            cp_files.sort(key=os.path.getmtime)
            if (cp_files):
                cp = cp_files[-1]
                time_of_cp = os.path.getmtime(cp)
                if (time_of_cp > timestep):
                    continue
            else:
                time.sleep(300)
def build_trainer(args, device_id, model, symbols, vocab_size, optim):
    """Build a `Trainer` for the abstractive model.

    Args:
        args: user options (argparse Namespace); supplies visible_gpus,
            label_smoothing, max_generator_batches, accum_count, world_size,
            gpu_ranks, model_path.
        device_id (int): GPU index, or a negative value for CPU.
        model: the model to train; must expose `.generator`.
        symbols (dict): special-token ids used by the loss computation.
        vocab_size (int): size of the output vocabulary.
        optim: optimizer used during training.

    Returns:
        Trainer: configured with train/valid loss computes, generator
        sharding and gradient accumulation.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Label smoothing only applies to the training loss.
    train_loss = build_loss_compute(model.generator, symbols, vocab_size, device,
                                    train=True, label_smoothing=args.label_smoothing)
    valid_loss = build_loss_compute(model.generator, symbols, vocab_size,
                                    train=False, device=device)
    # Generator output is sharded to bound memory during loss computation.
    shard_size = args.max_generator_batches
    grad_accum_count = args.accum_count
    n_gpu = args.world_size
    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        # CPU run: no GPUs participate.
        gpu_rank = 0
        n_gpu = 0
    tensorboard_log_dir = args.model_path
    # writer = SummaryWriter(tensorboard_log_dir, comment="Unmt")
    # report_manager = ReportMgr(args.report_every, start_time=-1, tensorboard_writer=writer)
    trainer = Trainer(args, model, train_loss, valid_loss, optim, shard_size,
                      grad_accum_count, n_gpu, gpu_rank)
    # Log parameter breakdown for the record.
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    return trainer
def train(args, device_id):
    """Set up a training run: seeds RNGs, optionally resumes from
    `args.train_from`, builds model/optimizer/trainer.

    NOTE(review): the final `trainer.train(...)` call is commented out, so
    as written this function prepares everything but never actually trains.
    """
    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        # Restore model-defining flags from the checkpoint's saved options.
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
    else:
        checkpoint = None
    # Re-seed after optional checkpoint load.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        # return data_loader.AbstractiveDataloader(load_dataset('train', True), symbols, FLAGS.batch_size, device, True)
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device, shuffle=True, is_test=False)

    model = Summarizer(args, device, checkpoint)
    # optim = model_builder.build_optim(args, model.reg, checkpoint)
    optim = model_builder.build_optim(args, model, checkpoint)
    # optim = BertAdam()
    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    #
    trainer.train(train_iter_fct, args.train_steps)
def _preprocess(self, params):
    """Convert one JSON shard into pretraining examples and save it.

    params: 4-tuple (corpus_type, json_file, args, save_file) — packed for
    use as a multiprocessing worker.
    """
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    # Skip shards already produced by a previous run.
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return
    bert = PretrainData(args)
    logger.info('Processing %s' % json_file)
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        source, tgt = d['src'], d['tgt']
        # DDDDDDDDDDDELETE code
        #if is_test:
        #    if len(d['src']) < 5:
        #        continue
        # DDDDDDDDDDDELETE code end
        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
        b_data = bert.preprocess(source, tgt, is_test=is_test)
        if (b_data is None):
            continue
        src_subtoken_idxs, tgt_subtoken_idxs, segments_ids, src_txt, tgt_txt = b_data
        b_data_dict = {"src": src_subtoken_idxs,
                       "tgt": tgt_subtoken_idxs,
                       "segs": segments_ids,
                       "example_id": d['example_id'],
                       "src_txt": src_txt,
                       "tgt_txt": tgt_txt}
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    # Drop the reference and collect to keep worker memory bounded.
    datasets = []
    gc.collect()
def format_to_lines(args):
    """Split raw *.json files into train/valid/test shards per the mapping
    files, formatting each in parallel and writing JSON shards of at most
    `args.shard_size` examples.

    FIX: the "not in mapping file" log call passed the filename as a lazy
    %-format argument without a placeholder in the message, which makes the
    logging module raise an internal formatting error and drop the filename;
    the message now contains the %s placeholder.
    """
    # Read the id->split mapping; dict used as a membership set.
    corpus_mapping = {}
    for corpus_type in ['valid', 'test', 'train']:
        temp = []
        for line in open(pjoin(args.map_path, 'mapping_' + corpus_type + '.txt')):
            temp.append(line.strip())
        corpus_mapping[corpus_type] = {key.strip(): 1 for key in temp}
    logger.info("txt read finished")
    train_files, valid_files, test_files = [], [], []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        real_name = f.split('/')[-1].split('.')[0]
        if (real_name in corpus_mapping['valid']):
            valid_files.append(f)
        elif (real_name in corpus_mapping['test']):
            test_files.append(f)
        elif (real_name in corpus_mapping['train']):
            train_files.append(f)
        else:
            # Unmapped files default into the training split.
            logger.info("not in mapping file %s", f)
            train_files.append(f)
    logger.info("data split over")
    corpora = {'train': train_files, 'valid': valid_files, 'test': test_files}
    for corpus_type in ['train', 'valid', 'test']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        # Stream formatted examples from the workers, flushing a shard file
        # whenever shard_size is exceeded.
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset))
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
        pool.close()
        pool.join()
        # Flush the final partial shard, if any.
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                # save.write('\n'.join(dataset))
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []