def validate(args, device_id, pt, step, tokenizer):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'dev', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)
    symbols = {'BOS': tokenizer.convert_tokens_to_ids('<s>'),
               'EOS': tokenizer.convert_tokens_to_ids('</s>'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]')}
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
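# --- Added usage sketch (not part of the original source; a minimal sketch
# assuming PreSumm-style checkpoint files named `model_step_<N>.pt`) ---
# `validate` above is typically swept over all saved checkpoints, keeping the
# one with the lowest validation cross-entropy:
#
#     import glob, os
#     cp_files = sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt')),
#                       key=os.path.getmtime)
#     xents = {}
#     for cp in cp_files:
#         step = int(cp.split('.')[-2].split('_')[-1])
#         xents[step] = validate(args, device_id, cp, step, tokenizer)
#     best_step = min(xents, key=xents.get)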
def test_text_abs(args):
    logger.info('Loading checkpoint from %s' % args.test_from)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"

    checkpoint = torch.load(args.test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)
    logger.info('Loading args inside test_text_abs %s' % args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.load_text(args, args.text_src, args.text_tgt, device)
    logger.info('test_iter is %s' % test_iter)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}
    logger.info('symbols is %s' % symbols)

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, -1)
def test_text_abs(args, device_id, pt, step, tokenizer):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    symbols = {'BOS': tokenizer.convert_tokens_to_ids('<s>'),
               'EOS': tokenizer.convert_tokens_to_ids('</s>'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]')}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
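# --- Added usage note (assumption, not from the source) ---
# This variant expects the caller to supply the tokenizer, whose vocabulary
# must contain '<s>', '</s>' and '[PAD]'. A hypothetical call:
#
#     test_text_abs(args, device_id, 'model_step_20000.pt', 20000, tokenizer)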
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True,
                                       tokenizer=tokenizer)
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    # Register the '[unusedN]' markers as special tokens so that Chinese
    # tokenization keeps them intact instead of splitting the word 'unused'.
    add_token_list = ['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]']
    if args.bart:
        tokenizer = AutoTokenizer.from_pretrained('bart-base', do_lower_case=True,
                                                  cache_dir=args.temp_dir,
                                                  local_files_only=False)
        symbols = {'BOS': tokenizer.encoder['madeupword0000'],
                   'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': 0,
                   'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased',
                                                  do_lower_case=True,
                                                  cache_dir=args.temp_dir,
                                                  local_files_only=False,
                                                  additional_special_tokens=add_token_list)
        symbols = {'BOS': tokenizer.vocab['[unused1]'],
                   'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'],
                   'EOQ': tokenizer.vocab['[unused3]']}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # if pt != '':
    if not args.test_from:
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)
    tokenizer = BertTokenizer.from_pretrained(
        '../ETRI_koBERT/003_bert_eojeol_pytorch/vocab.txt',
        do_lower_case=False, cache_dir=args.temp_dir)
    if not args.share_emb:
        tokenizer = add_tokens(tokenizer)
    symbols = {'BOS': tokenizer.vocab['<S>'],
               'EOS': tokenizer.vocab['<T>'],
               'PAD': tokenizer.vocab['[PAD]']}
    # Alternative symbol sets used in other variants:
    # symbols = {'BOS': tokenizer.vocab['[BOS]'], 'EOS': tokenizer.vocab['[EOS]'],
    #            'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[EOQ]']}
    # symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
    #            'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    # Build the PhoBERT fastBPE tokenizer
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="/content/PhoBERT_base_transformers/bpe.codes",
                        required=False, type=str, help='path to fastBPE BPE')
    args1, unknown = parser.parse_known_args()
    bpe = fastBPE(args1)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("/content/PhoBERT_base_transformers/dict.txt")
    tokenizer = bpe
    symbols = {'BOS': vocab.indices['[unused0]'],
               'EOS': vocab.indices['[unused1]'],
               'PAD': vocab.indices['[PAD]'],
               'EOQ': vocab.indices['[unused2]']}

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    if pt != "":
        test_from = pt
    else:
        test_from = args.test_from
    logger.info("Loading checkpoint from %s" % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint["opt"])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(
        args,
        load_dataset(args, "valid", shuffle=False),
        args.batch_size,
        device,
        shuffle=False,
        is_test=False,
    )
    tokenizer = BertTokenizer.from_pretrained("chinese_roberta_wwm_ext_pytorch",
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        "BOS": tokenizer.vocab["[unused1]"],
        "EOS": tokenizer.vocab["[unused2]"],
        "PAD": tokenizer.vocab["[PAD]"],
        "EOQ": tokenizer.vocab["[unused3]"],
    }
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def test_abs(args, device_id, pt, step):
    """
    Implements the testing process (meta / non-meta).

    Arguments:
        device_id (int) : the GPU id to be used
        pt (str)        : checkpoint path
        step (int)      : checkpoint step

    Process:
        - load checkpoint
        - prepare dataloader class
        - prepare model class
        - prepare predictor
        - predictor.translate()
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d', device_id)
    logger.info('Device %s', device)

    # Load checkpoint
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])

    # Prepare dataloader
    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    # Prepare model
    if args.meta_mode:
        model = MTLAbsSummarizer(args, device, checkpoint)
    else:
        model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    # Prepare predictor
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)  # this step can take a long time
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive,
                                          map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device,
                          train=True, label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct, args.train_steps)
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if not args.test_from:
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained(
        '../ETRI_koBERT/003_bert_eojeol_pytorch/vocab.txt',
        do_lower_case=False, cache_dir=args.temp_dir)
    if not args.share_emb:
        tokenizer = add_tokens(tokenizer)
    symbols = {'BOS': tokenizer.vocab['<S>'],
               'EOS': tokenizer.vocab['<T>'],
               'PAD': tokenizer.vocab['[PAD]']}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    if pt != "":
        test_from = pt
    else:
        test_from = args.test_from
    logger.info("Loading checkpoint from %s" % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint["opt"])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(
        args,
        load_dataset(args, "test", shuffle=False),
        args.test_batch_size,
        device,
        shuffle=False,
        is_test=True,
    )
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese", do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        "BOS": tokenizer.vocab["[unused0]"],
        "EOS": tokenizer.vocab["[unused1]"],
        "PAD": tokenizer.vocab["[PAD]"],
        "EOQ": tokenizer.vocab["[unused2]"],
    }

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False,
                                        tokenizer=tokenizer)
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    tokenizer = BertData(args).tokenizer
    # Alternative (disabled): build the tokenizer directly from a pretrained model
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
    #                                           cache_dir=args.temp_dir)
    # tokenizer = None
    # if args.pretrained_model_type in ['bert-base-uncased', 'bert-base-multilingual-uncased']:
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_type,
    #                                               do_lower_case=True,
    #                                               cache_dir=args.temp_dir)
    # if not tokenizer:
    #     raise NotImplementedError("tokenizer")
    # tokenizer = add_to_vocab(tokenizer, ['[unused0]', '[unused1]', '[PAD]', '[unused2]'])

    symbols = {'BOS': tokenizer.convert_tokens_to_ids('[unused0]'),
               'EOS': tokenizer.convert_tokens_to_ids('[unused1]'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]'),
               'EOQ': tokenizer.convert_tokens_to_ids('[unused2]')}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
args.large = False
args.temp_dir = 'temp'
args.finetune_bert = False
args.encoder = 'bert'
args.max_pos = 256
args.dec_layers = 6
args.share_emb = False
args.dec_hidden_size = 768
args.dec_heads = 8
args.dec_ff_size = 2048
args.dec_dropout = 0.2
args.use_bert_emb = False

bert_data = BertData(args.model_path, True, 510, 128)
BertSumAbs = AbsSummarizer(args, DEVICE, checkpoint)
BertSumAbs.eval()

data = pd.read_json(DATASET_PATH, encoding='utf-8', lines=True, chunksize=CHUNK_SIZE)
for el in tqdm.tqdm(data, total=450000 // CHUNK_SIZE):
    with open('vectors.npy', 'ab') as fvecs, \
         open('text.jsonl', 'a', encoding='utf-8') as ft:
        for j in range(CHUNK_SIZE):  # fixed typo: was CHINK_SIZE
            text = el.iloc[j]["text"].lower().replace('\xa0', ' ').replace('\n', ' ').strip()
            title = el.iloc[j]["title"].lower()
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    logger.info("Device ID %d" % device_id)
    logger.info("Device %s" % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != "":
        logger.info("Loading checkpoint from %s" % args.train_from)
        checkpoint = torch.load(
            args.train_from, map_location=lambda storage, loc: storage
        )
        opt = vars(checkpoint["opt"])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != "":
        logger.info("Loading bert from extractive model %s" % args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive, map_location=lambda storage, loc: storage
        )
        bert_from_extractive = bert_from_extractive["model"]
    else:
        bert_from_extractive = None

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(
            args,
            load_dataset(args, "train", shuffle=True),
            args.batch_size,
            device,
            shuffle=True,
            is_test=False,
        )

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained(
        "chinese_roberta_wwm_ext_pytorch/", do_lower_case=True, cache_dir=args.temp_dir
    )
    symbols = {
        "BOS": tokenizer.vocab["[unused1]"],
        "EOS": tokenizer.vocab["[unused2]"],
        "PAD": tokenizer.vocab["[PAD]"],
        "EOQ": tokenizer.vocab["[unused3]"],
    }

    train_loss = abs_loss(
        model.generator,
        symbols,
        model.vocab_size,
        device,
        train=True,
        label_smoothing=args.label_smoothing,
    )

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct, args.train_steps)
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    """Copy/paste and tweak the pre-trained weights provided by the creators
    of BertAbs for the internal architecture."""
    # Instantiate the authors' model with the pre-trained weights
    config = BertAbsConfig(
        temp_dir=".",
        finetune_bert=False,
        large=False,
        share_emb=True,
        use_bert_emb=False,
        encoder="bert",
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
        enc_heads=8,
        enc_ff_size=512,
        enc_dropout=0.2,
        dec_layers=6,
        dec_hidden_size=768,
        dec_heads=8,
        dec_ff_size=2048,
        dec_dropout=0.2,
    )
    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
    original.eval()

    new_model = BertAbsSummarizer(config, torch.device("cpu"))
    new_model.eval()

    # -------------------
    # Convert the weights
    # -------------------
    logging.info("convert the model")
    new_model.bert.load_state_dict(original.bert.state_dict())
    new_model.decoder.load_state_dict(original.decoder.state_dict())
    new_model.generator.load_state_dict(original.generator.state_dict())

    # ------------------------------------
    # Make sure the outputs are identical
    # ------------------------------------
    logging.info("Make sure that the models' outputs are identical")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # prepare the model inputs
    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)

    # failsafe to make sure the weight reset does not affect the loaded weights
    assert torch.max(
        torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0

    # forward pass
    src = encoder_input_ids
    tgt = decoder_input_ids
    segs = token_type_ids = None
    clss = None
    mask_src = encoder_attention_mask = None
    mask_tgt = decoder_attention_mask = None
    mask_cls = None

    # The original model does not apply the generator layer immediately but rather
    # in the beam search (where it combines softmax + linear layer). Since we
    # already apply the softmax in our generation process we only apply the linear
    # layer here. We make sure that the outputs of the full stack are identical.
    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt,
                                     mask_cls)[0]
    output_original_generator = original.generator(output_original_model)

    output_converted_model = new_model(encoder_input_ids, decoder_input_ids,
                                       token_type_ids, encoder_attention_mask,
                                       decoder_attention_mask)[0]
    output_converted_generator = new_model.generator(output_converted_model)

    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_model - output_original_model)).item()
    print("Maximum absolute difference between weights: {:.2f}".format(
        maximum_absolute_difference))
    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_generator - output_original_generator)).item()
    print("Maximum absolute difference between weights: {:.2f}".format(
        maximum_absolute_difference))

    are_identical = torch.allclose(output_converted_model, output_original_model,
                                   atol=1e-3)
    if are_identical:
        logging.info("all weights are equal up to 1e-3")
    else:
        raise ValueError(
            "the weights are different. The new model is likely different from the "
            "original one.")

    # The model has been saved with torch.save(model) and this is bound to the exact
    # directory structure. We save the state_dict instead.
    logging.info("saving the model's state dictionary")
    torch.save(
        new_model.state_dict(),
        "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin",
    )
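# --- Added usage sketch (assumption; the original script presumably wires
# these two paths through argparse) ---
# Note that `dump_path` is not actually used above: the state dict is saved
# under a hardcoded filename. A hypothetical direct call:
#
#     convert_bertabs_checkpoints(
#         "bertabs_cnndm_final.pt",  # hypothetical path to the authors' checkpoint
#         "converted/",              # hypothetical dump directory
#     )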
                    const=True, default=True)
args = parser.parse_args()
args.gpu_ranks = [int(i) for i in range(len(args.visible_gpus.split(',')))]
args.world_size = len(args.gpu_ranks)
os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

device = "cpu" if args.visible_gpus == '-1' else "cuda"
device_id = 0 if device == "cuda" else -1

checkpoint = torch.load(args.test_from, map_location=lambda storage, loc: storage)
opt = vars(checkpoint['opt'])
model = AbsSummarizer(args, device, checkpoint)
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                          cache_dir=args.temp_dir)


def getrespond(request):
    context = {}
    if request.POST:
        print(request.POST["Language"])
        # print(request.POST)
        context["input"] = request.POST["input_block"]
        if request.POST["Language"] != "English":
            context[
def train_abs(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive,
                                          map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive,
                          symbols=symbols)
    if args.sep_optim:
        optim_enc = model_builder.build_optim_enc(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_enc, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device,
                                      shuffle=True, is_test=False,
                                      tokenizer=tokenizer)

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device,
                          train=True, label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct, args.train_steps)
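# --- Added sketch (assumption): `get_symbol_and_tokenizer` is not defined in
# this section. Judging from how its return values are used above, a minimal
# sketch for the 'bert' encoder, mirroring the symbol choices of the other
# variants here, could be:
#
#     def get_symbol_and_tokenizer(encoder, temp_dir):
#         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
#                                                   do_lower_case=True,
#                                                   cache_dir=temp_dir)
#         symbols = {'BOS': tokenizer.vocab['[unused0]'],
#                    'EOS': tokenizer.vocab['[unused1]'],
#                    'PAD': tokenizer.vocab['[PAD]'],
#                    'EOQ': tokenizer.vocab['[unused2]']}
#         return symbols, tokenizer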
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive,
                                          map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device,
                                      shuffle=True, is_test=False)

    def valid_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=True),
                                      args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]
    logger.info(model)
    print("model.vocab_size" + str(model.vocab_size))

    # Build the PhoBERT fastBPE tokenizer
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="/content/PhoBERT_base_transformers/bpe.codes",
                        required=False, type=str, help='path to fastBPE BPE')
    args1, unknown = parser.parse_known_args()
    bpe = fastBPE(args1)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("/content/PhoBERT_base_transformers/dict.txt")
    tokenizer = bpe
    symbols = {'BOS': vocab.indices['[unused0]'],
               'EOS': vocab.indices['[unused1]'],
               'PAD': vocab.indices['[PAD]'],
               'EOQ': vocab.indices['[unused2]']}

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device,
                          train=True, label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)
    trainer.train(train_iter_fct=train_iter_fct, train_steps=args.train_steps,
                  valid_iter_fct=valid_iter_fct)
    return get_clf_report(embeds, markup, url2record, best_dist)


if __name__ == "__main__":
    from models.model_builder import AbsSummarizer

    checkpoint = torch.load(sys.argv[1], map_location=lambda storage, loc: storage)
    embed_mode = sys.argv[2]

    # Hack: the lambda serves as a bare namespace object to hang attributes on;
    # it is never actually called (so the undefined `b` is never evaluated).
    args = lambda a: b
    args.model_path = '/data/alolbuhtijarov/rubert_cased_L-12_H-768_A-12_pt'
    args.large = False
    args.temp_dir = 'temp'
    args.finetune_bert = False
    args.encoder = 'bert'
    args.max_pos = 256
    args.dec_layers = 6
    args.share_emb = False
    args.dec_hidden_size = 768
    args.dec_heads = 8
    args.dec_ff_size = 2048
    args.dec_dropout = 0.2
    args.use_bert_emb = False

    bert_data = BertData(args.model_path, True, 510, 128)
    model = AbsSummarizer(args, 'cpu', checkpoint)
    model.eval()
    eval_clustering(lambda text: doc2vec(text, model, mode=embed_mode))
def validate(args, device_id, pt, step):
    '''
    Implements the validation process (meta / non-meta).

    Arguments:
        device_id (int) : the GPU id to be used
        pt (str)        : checkpoint path
        step (int)      : checkpoint step

    Process:
        - load checkpoint
        - prepare dataloader class
        - prepare model class
        - prepare loss func, which returns the loss class
        - prepare trainer
        - trainer.validate()

    Meta vs Normal:
        - MetaDataloader vs Dataloader
        - load_meta_dataset vs load_dataset
        - MTLAbsSummarizer vs AbsSummarizer
        - build_MTLtrainer vs build_trainer
    '''
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    # Fix random seed to control the experiment
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    # Load checkpoint and args
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])  # which is self.args
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])

    # Prepare dataloader
    if args.meta_mode:
        def valid_iter_fct():
            return data_loader.MetaDataloader(args,
                                              load_meta_dataset(args, 'valid', shuffle=True),
                                              args.batch_size, device,
                                              shuffle=True, is_test=False)
    else:
        valid_iter = data_loader.Dataloader(args,
                                            load_dataset(args, 'valid', shuffle=False),
                                            args.batch_size, device,
                                            shuffle=False, is_test=False)

    # Prepare model
    if args.meta_mode:
        model = MTLAbsSummarizer(args, device, checkpoint)
    else:
        model = AbsSummarizer(args, device, checkpoint)
    # model.eval()

    # Prepare the optimizer for the inner loop.
    # The optimizer for each task is separate.
    if args.meta_mode:
        optims_inner = []
        for i in range(args.num_task):
            if args.sep_optim:
                optim_bert_inner = model_builder.build_optim_bert_inner(
                    args, model, checkpoint, 'maml')
                optim_dec_inner = model_builder.build_optim_dec_inner(
                    args, model, checkpoint, 'maml')
                optims_inner.append([optim_bert_inner, optim_dec_inner])
            else:
                # fixed: was `self.optims_inner.append(...)`, but there is no
                # `self` inside this function
                optims_inner.append([
                    model_builder.build_optim_inner(args, model, checkpoint, 'maml')
                ])

    # Prepare the optimizer (not actually used, but carries the step information)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    # Prepare the tokenizer and special symbols
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}

    # Prepare loss computation
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, device,
                          train=False)

    # Prepare trainer and perform validation
    if args.meta_mode:
        trainer = build_MTLtrainer(args, device_id, model, optim, optims_inner,
                                   valid_loss)
        stats = trainer.validate(valid_iter_fct, step)
    else:
        trainer = build_trainer(args, device_id, model, None, valid_loss)
        stats = trainer.validate(valid_iter, step)
    return stats.xent()
def train_abs_single(args, device_id):
    """Implements the training process (meta / non-meta).

    Args:
        device_id (int) : the GPU id to be used
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d', device_id)
    logger.info('Device %s', device)

    # Fix random seed to control the experiment
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:  # if using a GPU
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    # Load checkpoint and args
    if args.train_from != '':
        logger.info('Loading checkpoint from %s', args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])  # which is self.args
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    # Load extractive model as initial parameters (proposed by PreSumm)
    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s', args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive,
                                          map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    # Prepare dataloader
    if args.meta_mode:
        def meta_train_iter_fct():
            return data_loader.MetaDataloader(args,
                                              load_meta_dataset(args, 'train', shuffle=True),
                                              args.batch_size, device,
                                              shuffle=True, is_test=False)
    else:
        def train_iter_fct():
            return data_loader.Dataloader(args,
                                          load_dataset(args, 'train', shuffle=True),
                                          args.batch_size, device,
                                          shuffle=True, is_test=False)

    # Prepare model
    if args.meta_mode:
        model = MTLAbsSummarizer(args, device, checkpoint, bert_from_extractive)
    else:
        model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)

    # Prepare the optimizer for the inner loop.
    # The optimizer for each task is separate.
    if args.meta_mode:
        optims_inner = []
        for _ in range(args.num_task):
            if args.sep_optim:
                optim_bert_inner = model_builder.build_optim_bert_inner(
                    args, model, checkpoint, 'maml')
                optim_dec_inner = model_builder.build_optim_dec_inner(
                    args, model, checkpoint, 'maml')
                optims_inner.append([optim_bert_inner, optim_dec_inner])
            else:
                optims_inner.append([
                    model_builder.build_optim_inner(args, model, checkpoint, 'maml')
                ])

    # Prepare the optimizer for the outer loop
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optims = [optim_bert, optim_dec]
    else:
        optims = [model_builder.build_optim(args, model, checkpoint)]

    # Prepare tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],  # id = 1
               'EOS': tokenizer.vocab['[unused1]'],  # id = 2
               'EOQ': tokenizer.vocab['[unused2]'],  # id = 3
               'PAD': tokenizer.vocab['[PAD]']}      # id = 0

    # Self check: special word ids
    special_words = [w for w in tokenizer.vocab.keys() if "[" in w]
    special_word_ids = [tokenizer.convert_tokens_to_ids(w) for w in special_words]

    # Prepare loss computation
    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device,
                          train=True, label_smoothing=args.label_smoothing)

    # Prepare trainer and perform training
    if args.meta_mode:
        trainer = build_MTLtrainer(args, device_id, model, optims, optims_inner,
                                   train_loss)
        trainer.train(meta_train_iter_fct)
    else:
        trainer = build_trainer(args, device_id, model, optims, train_loss)
        trainer.train(train_iter_fct, args.train_steps)
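# --- Added usage note (assumption) ---
# In PreSumm-style repos this function is dispatched by a `train_abs` wrapper
# (multi-GPU training spawns one process per device). A direct single-GPU call:
#
#     train_abs_single(args, device_id=0)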