Example #1
from os.path import dirname, join

from transformers import EncoderDecoderConfig, EncoderDecoderModel


def get_from_pretrained(path):
    conf_path = join(dirname(path), "config.json")
    conf = EncoderDecoderConfig.from_pretrained(conf_path)
    model = EncoderDecoderModel.from_pretrained(path, config=conf)
    return model
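# Hedged usage sketch for the helper above; the checkpoint path is hypothetical
# and assumes a config.json saved alongside the weights file.
model = get_from_pretrained("./checkpoints/step-500/pytorch_model.bin")
model.eval()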
Example #2
    show_answer(tokenizer, train_input_encodings, train_sentbounds,
                train_sentlabels, 0)
    show_answer(tokenizer, train_input_encodings, train_sentbounds,
                train_sentlabels, -1)
    print("Length of Train Set: {}".format(len(train_contents)))
    print("Done Dataset Processing")
    """The dataset is now ready for training"""
    train_dataset = GenerationDataset(train_input_encodings, train_sentbounds,
                                      train_sentlabels)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)

    if args.modelckpt != "":
        print("Load from modelckpt: {}".format(args.modelckpt))
        model_encdec = EncoderDecoderModel.from_pretrained(args.modelckpt)
    else:
        print("Build from pretrained model: {}".format(modelbase))
        model_encdec = EncoderDecoderModel.from_encoder_decoder_pretrained(
            modelbase, modelbase)  # bert2bert, chinese variant
    # Create sentence scoring model
    for param in model_encdec.parameters():  # freeze everything
        param.requires_grad = False
    model_encoder = model_encdec.encoder
    model_sent_score = torch.nn.Sequential(torch.nn.Linear(
        768 * 2, 2), torch.nn.LogSoftmax(
            dim=1))  # Pooler output + Sentence encoding via Mean-over-Position
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model_encoder.to(device)
    model_encoder.eval()
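    # Hedged sketch (not in the original code) of how the frozen encoder and the
    # scoring head above might be combined for one sentence span; `batch`,
    # `sent_start` and `sent_end` are hypothetical placeholders.
    with torch.no_grad():
        enc_out = model_encoder(input_ids=batch["input_ids"].to(device),
                                attention_mask=batch["attention_mask"].to(device))
    pooled = enc_out.pooler_output  # (batch, 768) pooler output
    sent_enc = enc_out.last_hidden_state[:, sent_start:sent_end, :].mean(dim=1)
    log_probs = model_sent_score(torch.cat([pooled, sent_enc], dim=1))  # (batch, 2)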
Example #3
 def test_real_bert_model_from_pretrained(self):
     model = EncoderDecoderModel.from_encoder_decoder_pretrained(
         "bert-base-uncased", "bert-base-uncased")
     self.assertIsNotNone(model)
Example #4
    "<bos>", "<eos>", "<persona>", "<speaker1>", "<speaker2>", "<pad>"
]

ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<speaker1>', '<speaker2>', '<persona>']
}

tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)

encoder_decoder_config = EncoderDecoderConfig.from_pretrained(
    './models/checkpoint-1200')
model = EncoderDecoderModel.from_pretrained('./models/checkpoint-1200',
                                            config=encoder_decoder_config)
model.get_encoder().resize_token_embeddings(len(tokenizer))
model.get_decoder().resize_token_embeddings(len(tokenizer))
print(type(model.get_encoder()), type(model.get_decoder()))
# model = SimpleEncoderDecoder(tokenizer)
# model = load()
# model.to('cpu')

# create ids of encoded input vectors
input_ids = tokenizer("I want to buy a car", return_tensors="pt").input_ids

# create BOS token
decoder_input_ids = tokenizer("<bos>",
                              add_special_tokens=False,
                              return_tensors="pt").input_ids
Example #5
def encoder_decoder_example():
	from transformers import EncoderDecoderConfig, EncoderDecoderModel
	from transformers import BertConfig, GPT2Config
	from transformers import BertTokenizer, GPT2Tokenizer
	import torch

	pretrained_model_name = 'bert-base-uncased'
	#pretrained_model_name = 'gpt2'

	if 'bert' in pretrained_model_name:
		# Initialize a BERT bert-base-uncased style configuration.
		config_encoder, config_decoder = BertConfig(), BertConfig()
	elif 'gpt2' in pretrained_model_name:
		config_encoder, config_decoder = GPT2Config(), GPT2Config()
	else:
		print('Invalid model, {}.'.format(pretrained_model_name))
		return

	config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

	if 'bert' in pretrained_model_name:
		# Initialize a Bert2Bert model from the bert-base-uncased style configurations.
		model = EncoderDecoderModel(config=config)
		#model = EncoderDecoderModel.from_encoder_decoder_pretrained(pretrained_model_name, pretrained_model_name)  # Initialize Bert2Bert from pre-trained checkpoints.
		tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
	elif 'gpt2' in pretrained_model_name:
		model = EncoderDecoderModel(config=config)
		tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)

	#print('Configuration of the encoder & decoder:\n{}.\n{}.'.format(model.config.encoder, model.config.decoder))
	#print('Encoder type = {}, decoder type = {}.'.format(type(model.encoder), type(model.decoder)))

	if False:
		# Access the model configuration.
		config_encoder = model.config.encoder
		config_decoder  = model.config.decoder

		# Set decoder config to causal LM.
		config_decoder.is_decoder = True
		config_decoder.add_cross_attention = True

	#--------------------
	input_ids = torch.tensor(tokenizer.encode('Hello, my dog is cute', add_special_tokens=True)).unsqueeze(0)  # Batch size 1.

	if False:
		# Forward.
		outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

		# Train.
		outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
		loss, logits = outputs.loss, outputs.logits

		# Save the model, including its configuration.
		model.save_pretrained('my-model')

		#--------------------
		# Load model and config from pretrained folder.
		encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
		model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)

	#--------------------
	# Generate.
	#	REF [site] >>
	#		https://huggingface.co/transformers/internal/generation_utils.html
	#		https://huggingface.co/blog/how-to-generate
	generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
	#generated = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, num_return_sequences=5, do_sample=True, top_k=0, temperature=0.7, early_stopping=True, decoder_start_token_id=model.config.decoder.pad_token_id)
	print('Generated = {}.'.format(tokenizer.decode(generated[0], skip_special_tokens=True)))
Example #6
def generate_predictions(args):

    model_dir = os.path.join(args.model_root_dir, args.run_id,
                             args.translation_model_name)
    print(f"model dir: {model_dir}")
    val_data_path = os.path.join(args.data_out_dir, args.val_dataset_name)
    print(
        f"using model from {get_last_checkpoint(model_dir)} and test data from {val_data_path} to generate predictions"
    )

    dataset_properties = json.load(
        open(os.path.join(model_dir, "dataset_properties.json")))
    special_tokens = dataset_properties["special_tokens"]
    target_vocab = dataset_properties["target_vocab"]

    source_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    source_tokenizer.add_special_tokens(
        {"additional_special_tokens": special_tokens})
    target_tokenizer = PreTrainedArsenalTokenizer(target_vocab=target_vocab)

    bert2arsenal = EncoderDecoderModel.from_pretrained(
        get_last_checkpoint(model_dir))

    val_data = datasets.load_from_disk(val_data_path)

    runid, _, checkpoint = get_last_checkpoint((model_dir)).split("/")[-3:]
    outfile = open(
        os.path.join(
            Path(model_dir).parent, f"predictions_{runid}_{checkpoint}.txt"),
        "w")

    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    bert2arsenal.to(torch_device)

    batch_size = args.batch_size
    num_batches = int(val_data.num_rows / batch_size)

    type_forcing_vocab = target_tokenizer.id2vocab if args.type_forcing else None

    for i in tqdm(range(num_batches)):

        batch_range = range(i * batch_size, (i + 1) * batch_size)
        batch = val_data.select(list(batch_range))

        batch_ids = torch.tensor(batch["input_ids"], device=torch_device)
        batch_masks = torch.tensor(batch["attention_mask"],
                                   device=torch_device)

        # Build the kwargs for generate() as a dict so that the type-forcing
        # vocab argument is only added when requested: with an unpatched
        # transformers version, passing any type-forcing argument (even when
        # disabled) would raise an unrecognized-argument error.
        generate_args = {
            "input_ids": batch_ids,
            "attention_mask": batch_masks,
            "decoder_start_token_id": target_tokenizer.cls_token_id,
            "num_beams": args.num_beams,
            "num_return_sequences": args.num_outputs,
            "no_repeat_ngram_size": 0
        }
        if args.type_forcing:
            generate_args["type_forcing_vocab"] = type_forcing_vocab

        outputs = bert2arsenal.generate(**generate_args)

        # apparently batch instances and return sequences per instance are stacked along a single dimension
        for j in range(batch_size):
            input = [t for t in batch["input_ids"][j] if t != 0]
            true_seq = [t for t in batch['labels'][j] if t != -100]
            outfile.write(f"{input}\t{true_seq}")
            for k in range(j * args.num_outputs, (j + 1) * args.num_outputs):
                pred_seq = [t for t in outputs[k].tolist() if t != 0]
                outfile.write(f"\t{pred_seq}")
            outfile.write("\n")
        outfile.flush()
    outfile.close()
Example #7
#!/usr/bin/env python3

from transformers import EncoderDecoderModel, BertTokenizer

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-uncased', 'bert-base-uncased')

tok = BertTokenizer.from_pretrained('bert-base-uncased')
input_ids = tok.encode('Hi it is me.', return_tensors='pt')

output = model.generate(input_ids, bos_token_id=tok.pad_token_id)

print(tok.decode(output[0], skip_special_tokens=True))
import ipdb

ipdb.set_trace()
pass
    print("Show some examples: ")
    show_answer(tokenizer, test_input_encodings, 0)
    # show_answer(tokenizer, test_input_encodings, 100)
    # show_answer(tokenizer, test_input_encodings, 2000)
    show_answer(tokenizer, test_input_encodings, -1)
    print("Length of Infer Set: {}".format(len(test_contents)))
    print("Done Dataset Processing")

    test_dataset = GenerationDataset(test_input_encodings)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)

    """Multiple Instance Inference"""
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = EncoderDecoderModel.from_pretrained(args.modelckpt,
                                                output_attentions=True,
                                                output_hidden_states=True)
    model.to(device)
    model.eval()
    with torch.no_grad():
        fd = open(args.outfile, "w", encoding="utf-8")
        start = time.time()
        for step, batch_in in enumerate(test_loader):
            input_ids = batch_in["input_ids"].to(device)
            attention_mask = batch_in["attention_mask"].to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                     decoder_start_token_id=tokenizer.cls_token_id, eos_token_id=tokenizer.sep_token_id,
                                     num_beams=5, num_return_sequences=1,
                                     min_length=3, max_length=15)
            output_strs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
Example #9
from general_utils import *

from transformers import RobertaTokenizer, EncoderDecoderModel, AutoTokenizer
import pickle

with open('./config.yaml') as f:
    configs = yaml.load(f, Loader=yaml.SafeLoader)

# Get the checkpoints from gcp
os.makedirs(configs['output_dir'] + '/pretrained/', exist_ok=True)
os.system('gsutil -m cp -r "{}/*" "{}"'.format(
    configs['gcp_pretrained_path'], configs['output_dir'] + '/pretrained/'))

test_data = get_data_batch(path='./data/test_tokenized/*', test=True)

model = EncoderDecoderModel.from_pretrained(configs['output_dir'] +
                                            '/pretrained/')
model.to("cuda")

batch_size = configs['batch_size'] * 2  # change to 64 for full evaluation


# map data correctly
def generate_summary(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    inputs = tokenizer(batch["original"],
                       padding="max_length",
                       truncation=True,
                       max_length=256,
                       return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")
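    # The body of generate_summary() is truncated above. A hedged sketch of the
    # usual continuation (generate, decode, attach predictions); the "pred"
    # column name is an assumption.
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    batch["pred"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch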
Example #10
    def test_encoder_decoder_save_load_from_encoder_decoder_from_pt(self):
        config = self.get_encoder_decoder_config_small()

        # create two random BERT models for bert2bert & initialize weights (+cross_attention weights)
        encoder_pt = BertModel(config.encoder).to(torch_device).eval()
        decoder_pt = BertLMHeadModel(config.decoder).to(torch_device).eval()

        encoder_decoder_pt = EncoderDecoderModel(
            encoder=encoder_pt, decoder=decoder_pt).to(torch_device).eval()

        input_ids = ids_tensor([13, 5], encoder_pt.config.vocab_size)
        decoder_input_ids = ids_tensor([13, 1], decoder_pt.config.vocab_size)

        pt_input_ids = torch.tensor(input_ids.numpy(),
                                    device=torch_device,
                                    dtype=torch.long)
        pt_decoder_input_ids = torch.tensor(decoder_input_ids.numpy(),
                                            device=torch_device,
                                            dtype=torch.long)

        logits_pt = encoder_decoder_pt(
            input_ids=pt_input_ids,
            decoder_input_ids=pt_decoder_input_ids).logits

        # PyTorch => TensorFlow
        with tempfile.TemporaryDirectory(
        ) as tmp_dirname_1, tempfile.TemporaryDirectory() as tmp_dirname_2:
            encoder_decoder_pt.encoder.save_pretrained(tmp_dirname_1)
            encoder_decoder_pt.decoder.save_pretrained(tmp_dirname_2)
            encoder_decoder_tf = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
                tmp_dirname_1,
                tmp_dirname_2,
                encoder_from_pt=True,
                decoder_from_pt=True)

        logits_tf = encoder_decoder_tf(
            input_ids=input_ids, decoder_input_ids=decoder_input_ids).logits

        max_diff = np.max(
            np.abs(logits_pt.detach().cpu().numpy() - logits_tf.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=3)

        # Make sure `from_pretrained` following `save_pretrained` work and give the same result
        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_decoder_tf.save_pretrained(tmp_dirname)
            encoder_decoder_tf = TFEncoderDecoderModel.from_pretrained(
                tmp_dirname)

            logits_tf_2 = encoder_decoder_tf(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids).logits

            max_diff = np.max(np.abs(logits_tf_2.numpy() - logits_tf.numpy()))
            self.assertAlmostEqual(max_diff, 0.0, places=3)

        # TensorFlow => PyTorch
        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_decoder_tf.save_pretrained(tmp_dirname)
            encoder_decoder_pt = EncoderDecoderModel.from_pretrained(
                tmp_dirname, from_tf=True)

        max_diff = np.max(
            np.abs(logits_pt.detach().cpu().numpy() - logits_tf.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=3)
Example #11
def main(args):
    print(args)
    check_args(args)

    if USE_GPU:
        float_dtype = torch.cuda.FloatTensor
        long_dtype = torch.cuda.LongTensor
    else:
        float_dtype = torch.FloatTensor
        long_dtype = torch.LongTensor

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased-itokens")

    # add_tokens(tokenizer)

    vocab, train_loader, val_loader = build_loaders(args, tokenizer)
    model_kwargs = {}

    encoder_decoder_config = EncoderDecoderConfig.from_pretrained(
        "bert-base-uncased-itokens")
    model = EncoderDecoderModel.from_pretrained("bert-base-uncased-itokens",
                                                config=encoder_decoder_config)

    # modify_network(model, tokenizer)
    # model, model_kwargs = build_model(args, vocab)
    # model.type(float_dtype)
    model.cuda()
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    obj_discriminator, d_obj_kwargs = build_obj_discriminator(args, vocab)
    img_discriminator, d_img_kwargs = build_img_discriminator(args, vocab)
    gan_g_loss, gan_d_loss = get_gan_losses(args.gan_loss_type)

    if obj_discriminator is not None:
        obj_discriminator.type(float_dtype)
        obj_discriminator.train()
        print(obj_discriminator)
        optimizer_d_obj = torch.optim.Adam(obj_discriminator.parameters(),
                                           lr=args.learning_rate)

    if img_discriminator is not None:
        img_discriminator.type(float_dtype)
        img_discriminator.train()
        print(img_discriminator)
        optimizer_d_img = torch.optim.Adam(img_discriminator.parameters(),
                                           lr=args.learning_rate)

    restore_path = None
    if args.restore_from_checkpoint:
        restore_path = '%s_with_model.pt' % args.checkpoint_name
        restore_path = os.path.join(args.output_dir, restore_path)
    if restore_path is not None and os.path.isfile(restore_path):
        print('Restoring from checkpoint:')
        print(restore_path)
        checkpoint = torch.load(restore_path)
        model.load_state_dict(checkpoint['model_state'])
        optimizer.load_state_dict(checkpoint['optim_state'])

        if obj_discriminator is not None:
            obj_discriminator.load_state_dict(checkpoint['d_obj_state'])
            optimizer_d_obj.load_state_dict(checkpoint['d_obj_optim_state'])

        if img_discriminator is not None:
            img_discriminator.load_state_dict(checkpoint['d_img_state'])
            optimizer_d_img.load_state_dict(checkpoint['d_img_optim_state'])

        t = checkpoint['counters']['t']
        if 0 <= args.eval_mode_after <= t:
            model.eval()
        else:
            model.train()
        epoch = checkpoint['counters']['epoch']
    else:
        t, epoch = 0, 0
        checkpoint = {
            'args': args.__dict__,
            'vocab': vocab,
            'model_kwargs': model_kwargs,
            'd_obj_kwargs': d_obj_kwargs,
            'd_img_kwargs': d_img_kwargs,
            'losses_ts': [],
            'losses': defaultdict(list),
            'd_losses': defaultdict(list),
            'checkpoint_ts': [],
            'train_batch_data': [],
            'train_samples': [],
            'train_iou': [],
            'val_batch_data': [],
            'val_samples': [],
            'val_losses': defaultdict(list),
            'val_iou': [],
            'norm_d': [],
            'norm_g': [],
            'counters': {
                't': None,
                'epoch': None,
            },
            'model_state': None,
            'model_best_state': None,
            'optim_state': None,
            'd_obj_state': None,
            'd_obj_best_state': None,
            'd_obj_optim_state': None,
            'd_img_state': None,
            'd_img_best_state': None,
            'd_img_optim_state': None,
            'best_t': [],
        }

    while True:
        if t >= args.num_iterations:
            break
        epoch += 1
        print('Starting epoch %d' % epoch)

        for batch in train_loader:
            print(batch)
            exit()
            if t == args.eval_mode_after:
                print('switching to eval mode')
                model.eval()
                optimizer = optim.Adam(model.parameters(),
                                       lr=args.learning_rate)
            t += 1
            if USE_GPU:
                for k in batch.keys():
                    batch[k] = batch[k].cuda().long()
            masks = None

            with timeit('forward', args.timing):
                output = model(**batch)
            # with timeit('loss', args.timing):
            #   # Skip the pixel loss if using GT boxes
            #   skip_pixel_loss = False
            #   total_loss, losses = calculate_model_losses(
            #                           args, skip_pixel_loss, model, imgs, imgs_pred)

            # if img_discriminator is not None:
            #   scores_fake = img_discriminator(imgs_pred)
            #   weight = args.discriminator_loss_weight * args.d_img_weight
            #   total_loss = add_loss(total_loss, gan_g_loss(scores_fake), losses,
            #                         'g_gan_img_loss', weight)

            losses = {}
            total_loss = output["loss"]
            losses['total_loss'] = total_loss.item()
            if not math.isfinite(losses['total_loss']):
                print('WARNING: Got loss = NaN, not backpropping')
                continue

            optimizer.zero_grad()
            with timeit('backward', args.timing):
                total_loss.backward()
            optimizer.step()
            total_loss_d = None
            ac_loss_real = None
            ac_loss_fake = None
            d_losses = {}

            # if img_discriminator is not None:
            #   d_img_losses = LossManager()
            #   imgs_fake = imgs_pred.detach()
            #   scores_fake = img_discriminator(imgs_fake)
            #   scores_real = img_discriminator(imgs)

            #   d_img_gan_loss = gan_d_loss(scores_real, scores_fake)
            #   d_img_losses.add_loss(d_img_gan_loss, 'd_img_gan_loss')

            #   optimizer_d_img.zero_grad()
            #   d_img_losses.total_loss.backward()
            #   optimizer_d_img.step()

            if t % args.print_every == 0:
                print('t = %d / %d' % (t, args.num_iterations))
                for name, val in losses.items():
                    print(' G [%s]: %.4f' % (name, val))
                    checkpoint['losses'][name].append(val)
                checkpoint['losses_ts'].append(t)

                # if img_discriminator is not None:
                #   for name, val in d_img_losses.items():
                #     print(' D_img [%s]: %.4f' % (name, val))
                #     checkpoint['d_losses'][name].append(val)

            if t % args.checkpoint_every == 0:
                print('checking on train')
                train_results = check_model(args, t, train_loader, model)
                t_losses = train_results[0]

                print('checking on val')
                val_results = check_model(args, t, val_loader, model)
                val_losses = val_results[0]

                for k, v in val_losses.items():
                    checkpoint['val_losses'][k].append(v)

                checkpoint['model_state'] = model.state_dict()

                if obj_discriminator is not None:
                    checkpoint['d_obj_state'] = obj_discriminator.state_dict()
                    checkpoint[
                        'd_obj_optim_state'] = optimizer_d_obj.state_dict()

                if img_discriminator is not None:
                    checkpoint['d_img_state'] = img_discriminator.state_dict()
                    checkpoint[
                        'd_img_optim_state'] = optimizer_d_img.state_dict()

                checkpoint['optim_state'] = optimizer.state_dict()
                checkpoint['counters']['t'] = t
                checkpoint['counters']['epoch'] = epoch
                checkpoint_path = os.path.join(
                    args.output_dir, '%s_with_model.pt' % args.checkpoint_name)
                print('Saving checkpoint to ', checkpoint_path)
                torch.save(checkpoint, checkpoint_path)

                # Save another checkpoint without any model or optim state
                checkpoint_path = os.path.join(
                    args.output_dir, '%s_no_model.pt' % args.checkpoint_name)
                key_blacklist = [
                    'model_state', 'optim_state', 'model_best_state',
                    'd_obj_state', 'd_obj_optim_state', 'd_obj_best_state',
                    'd_img_state', 'd_img_optim_state', 'd_img_best_state'
                ]
                small_checkpoint = {}
                for k, v in checkpoint.items():
                    if k not in key_blacklist:
                        small_checkpoint[k] = v
                torch.save(small_checkpoint, checkpoint_path)
Example #12
with open('./config.yaml') as f:
    configs = yaml.load(f, Loader=yaml.SafeLoader)

train_data_batch = get_data_batch(path='./data/train_tokenized/*',
                                  batch_size=configs['batch_size'])
val_data_batch = get_data_batch(path='./data/val_tokenized/*',
                                batch_size=configs['batch_size'])

if configs['load_pretrained']:
    os.makedirs(configs['output_dir'] + '/pretrained/', exist_ok=True)
    os.system('gsutil -m cp -r "{}/*" "{}"'.format(
        configs['gcp_pretrained_path'],
        configs['output_dir'] + '/pretrained/'))
    try:
        roberta_shared = EncoderDecoderModel.from_pretrained(
            configs['output_dir'] + '/pretrained/', tie_encoder_decoder=True)
    except Exception:
        print(
            'Warning: no pretrained model found at the provided path. '
            'Initializing new model weights.'
        )
        roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "vinai/phobert-base",
            "vinai/phobert-base",
            tie_encoder_decoder=True)

else:
    roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "vinai/phobert-base", "vinai/phobert-base", tie_encoder_decoder=True)

# set special tokens
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id
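# Hedged sketch (not in the original snippet): the remaining special-token and
# sizing fields that a shared encoder-decoder config typically needs before
# training; the values mirror the tokenizer referenced above.
roberta_shared.config.eos_token_id = tokenizer.eos_token_id
roberta_shared.config.pad_token_id = tokenizer.pad_token_id
roberta_shared.config.vocab_size = roberta_shared.config.encoder.vocab_size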
Example #13
from transformers import BertTokenizer, EncoderDecoderModel
import os
import streamlit as st

st.header('Rangkuman Cerpen')
st.text('powered by BERT')

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

tokenizer = BertTokenizer.from_pretrained(
    "cahya/bert2bert-indonesian-summarization")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model = EncoderDecoderModel.from_pretrained(
    "cahya/bert2bert-indonesian-summarization")

#
ARTICLE_TO_SUMMARIZE = st.text_area(
    "Masukkan cerpen yang ingin diringkas (max 512 token)")

# generate summary
input_ids = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors='pt')
summary_ids = model.generate(input_ids,
                             min_length=20,
                             max_length=80,
                             num_beams=10,
                             repetition_penalty=2.5,
                             length_penalty=1.0,
                             early_stopping=True,
                             no_repeat_ngram_size=2,
                             use_cache=True)
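# Hedged continuation (not part of the original, truncated snippet): decode the
# generated ids and show the summary in the Streamlit app.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
st.write(summary)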
Example #14
            # # decode the output
            # top_k_top_p_pred_decode_str = tokenizer.batch_decode(top_k_top_p, skip_special_tokens=True)
            # batch["top_k_top_p_pred_decode_str"] = top_k_top_p_pred_decode_str
            # print("top_k_top_p_pred_decode_str: ", top_k_top_p_pred_decode_str)
            # label str for rouge
            label_str = [
                " ".join(map(str, label_id)) for label_id in labels.input_ids
            ]
            batch["label_id_str"] = label_str
            label_decode_str = tokenizer.batch_decode(labels.input_ids,
                                                      skip_special_tokens=True)
            print("label_decode_str: ", label_decode_str)
            return batch

        tokenizer = BertTokenizer.from_pretrained(DEFAULT_MODEL_NAME)
        model = EncoderDecoderModel.from_pretrained("ckpt/checkpoint-2800")
        # model.to("cuda")

        lcsts = LCSTS(args.training_path,
                      args.val_path,
                      args.test_path,
                      output_path=args.preprocess_output_path)
        test_dataset = load_dataset('csv', data_files=[lcsts.test_merged_csv
                                                       ])['train']

        pred_str_keys = [
            "greedy_pred_str", "beam_output_pred_str",
            "beam_output_ngram_pred_str", "top_k_only_ngram_pred_str",
            "top_p_only_ngram_pred_str", "top_k_top_p_ngram_pred_str"
        ]
        results = test_dataset.map(generate_summary,
Example #15
import torch
from transformers import BertTokenizer, EncoderDecoderModel

input_str = '1999 chevillon nuit saints georges villages france'

encoder_max_length = 128
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ed_model = EncoderDecoderModel.from_pretrained('./checkpoint-500')
ed_model.to(device)

inputs = tokenizer(input_str,
                   padding='max_length',
                   truncation=True,
                   max_length=encoder_max_length,
                   return_tensors='pt')
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)
outputs = ed_model.generate(input_ids, attention_mask=attention_mask)
output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print(f'NAME\n{input_str}')
print()
print(f'DESCRIPTION\n{output_str}')
Example #16
import transformers
from transformers import EncoderDecoderModel, AutoTokenizer
from tokenizers import Tokenizer
import torch
import sys
from datasets import load_metric

metric = load_metric('sacrebleu')

chk_dir = sys.argv[1]
chk_num = sys.argv[2]
num_beams = int(sys.argv[3])
code_tok = False if sys.argv[4] == 'false' else True
print_bool = False if sys.argv[5] == 'false' else True

model = EncoderDecoderModel.from_pretrained('./{}/checkpoint-{}/'.format(
    chk_dir, chk_num))
code_tokenizer = Tokenizer.from_file(
    'code_tokenizer.json') if code_tok else AutoTokenizer.from_pretrained(
        'bert-base-uncased')
text_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
#print(text_tokenizer.convert_tokens_to_ids())
pad_token_id = 1 if code_tok else 0
bos_token_id = 2 if code_tok else 101
eos_token_id = 3 if code_tok else 102

f = open('tok-eval.tsv', 'r')
for i, line in enumerate(f):
    if i == 0:
        continue
    if i > 1000:
        break
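    # Hedged sketch of the per-line evaluation (the loop body is truncated in
    # the original); assumes tab-separated <source, reference> pairs and that
    # the target side uses the plain BERT text tokenizer.
    src, ref = line.rstrip('\n').split('\t')[:2]
    input_ids = text_tokenizer(src, return_tensors='pt').input_ids
    out = model.generate(input_ids,
                         num_beams=num_beams,
                         decoder_start_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         pad_token_id=pad_token_id)
    pred = text_tokenizer.decode(out[0], skip_special_tokens=True)
    metric.add(prediction=pred, reference=[ref])
    if print_bool:
        print(pred)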
Example #17
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT loaded")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
            'snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained(
            'snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT loaded")
        print("====================================")

        # BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base loaded")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base loaded")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            'xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
            'xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[BERT] XLM-Roberta-kor loaded")
        print("====================================")

        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
            "kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained(
            "kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base loaded")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 loaded")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
            "kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained(
            "kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base loaded")
        print("====================================")

        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
        print("====================================")
Example #18
    r for r in tqdm.tqdm(
        ria_reader_with_date_approx(
            '/home/aobuhtijarov/datasets/ria/ria.shuffled.val.json'))
])
ria_records.extend([
    r for r in tqdm.tqdm(
        ria_reader_with_date_approx(
            '/home/aobuhtijarov/datasets/ria/ria.shuffled.test.json'))
])

lenta_records = [
    r for r in lenta_records
    if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']
]

model = EncoderDecoderModel.from_pretrained(clust_model)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path,
                                          do_lower_case=False,
                                          do_basic_tokenize=False)
setattr(tokenizer, 'max_tokens_text', 250)
model.cuda()

text_to_vector_func = get_text_to_vector_func('bert-FirstCLS', model,
                                              tokenizer)

lenta_embeds = get_embeds_for_records(lenta_records, text_to_vector_func)
ria_embeds = get_embeds_for_records(ria_records, text_to_vector_func)


def f(start, total, n_jobs):
    print(start, total, n_jobs)
Example #19
    def __init__(self, model_name, device):
        self.device = device
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        self.model = EncoderDecoderModel.from_pretrained(model_name)

        self.model.to(device)
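    # Hedged sketch (not part of the original class): a generate helper showing
    # how the tokenizer and model fields above would typically be used.
    def summarize(self, text, max_length=64):
        inputs = self.tokenizer(text, return_tensors="pt",
                                truncation=True).to(self.device)
        output_ids = self.model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            decoder_start_token_id=self.tokenizer.cls_token_id,
            max_length=max_length)
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)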
Example #20
NL2CST_PORT = get_env('NL2CST_PORT', 8080)
NUM_BEAMS = int(get_env("NUM_BEAMS"))
NUM_OUTPUTS = int(get_env("NUM_OUTPUTS"))
TYPE_FORCING = int(get_env("TYPE_FORCING"))
BATCH_SIZE = int(get_env("BATCH_SIZE"))
CLEAN_INPUT = int(get_env("CLEAN_INPUT"))

app = Flask(__name__)

dataset_properties = json.load(
    open(os.path.join(MODEL_ROOT, "dataset_properties.json")))
target_vocab = dataset_properties["target_vocab"]
special_tokens = dataset_properties["special_tokens"]
max_input_length = dataset_properties["encoder_max_len"]

bert2arsenal = EncoderDecoderModel.from_pretrained(
    get_last_checkpoint(MODEL_ROOT))

tokenizer_path = os.path.join(MODEL_ROOT, "source_tokenizer")

# Try to use saved source tokenizer from file to prevent any downloads.
# Our older trained models didn't save the source tokenizer to disk, so use
# the download method as a fallback to remain compatible with older models.
if os.path.exists(tokenizer_path):
    source_tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)
else:
    print(f"no existing source tokenizer found, downloading...")
    source_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    source_tokenizer.add_special_tokens(
        {"additional_special_tokens": special_tokens})
target_tokenizer = PreTrainedArsenalTokenizer(target_vocab=target_vocab)
type_forcing_vocab = target_tokenizer.id2vocab if TYPE_FORCING else None
Example #21
def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took."""
    fout = Path(out_file).open("w", encoding="utf-8")
    model_name = str(model_name)
    if "encoder" in model_name and "decoder" in model_name:
        model = EncoderDecoderModel.from_pretrained(model_name).to(device)
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    if fp16:
        model = model.half()

    if "encoder" in model_name and "decoder" in model_name:
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  config=model.config)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}"
                )  # if this is wrong, check config.model_type.

    decoder_start_token_id = None  # default to config
    if isinstance(model.config, EncoderDecoderConfig):
        decoder_start_token_id = model.config.decoder.pad_token_id

    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        # prefix = prefix or getattr(model.config, "prefix", "") or ""
        prefix = ""
    for examples_chunk in tqdm(list(chunks(examples, batch_size))):
        examples_chunk = [prefix + text for text in examples_chunk]
        batch = tokenizer(examples_chunk,
                          return_tensors="pt",
                          truncation=True,
                          padding="longest").to(device)
        summaries = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            decoder_start_token_id=decoder_start_token_id,
            **generate_kwargs,
        )
        dec = tokenizer.batch_decode(summaries,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=False)
        for hypothesis in dec:
            fout.write(hypothesis + "\n")
            fout.flush()
    fout.close()
    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    return dict(n_obs=n_obs,
                runtime=runtime,
                seconds_per_sample=round(runtime / n_obs, 4))
Example #22
    for arg in vars(args):
        print("{}: {}".format(arg, getattr(args, arg)))

    return args


if __name__ == "__main__":
    print("Start Cross Attention Distribution Visualization", flush=True)
    args = args_parse()

    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    tokenizer.add_special_tokens({"additional_special_tokens": ["[unused1]"]})
    model = EncoderDecoderModel.from_pretrained(args.modelname,
                                                output_attentions=True)
    model.encoder.config.output_attentions = True
    model.decoder.config.output_attentions = True
    model.to(device)
    model.eval()

    train_contents, train_querys, = get_dataset(args.contentfile,
                                                args.queryfile)

    print("Get Zeroth Sample", flush=True)
    print("Content: {}".format(train_contents[0]), flush=True)
    print("Question (Query): {}".format(train_querys[0]), flush=True)

    data_iterator = JustInTime_InOrder_Iterator(train_contents,
                                                train_querys,
                                                batch_size=args.batch_size)
Example #23
from transformers import EncoderDecoderModel, BertTokenizer
import torch
from tqdm import tqdm
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = EncoderDecoderModel.from_pretrained("checkpoint-100000")

with open("src_train.txt", 'r') as f, open("result.txt", 'w') as f2:
    for line in tqdm(f):
        input_ids = torch.tensor(tokenizer.encode(line)).unsqueeze(0)
        generated = model.generate(
            input_ids,
            decoder_start_token_id=model.config.decoder.pad_token_id)
        f2.write(str(tokenizer.decode(generated[0])) + "\n")
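# Hedged alternative (not in the original snippet): batching the inputs instead
# of translating line by line; the batch size and padding settings are
# illustrative assumptions.
with open("src_train.txt", 'r') as f:
    lines = [l.strip() for l in f]
for i in tqdm(range(0, len(lines), 32)):
    enc = tokenizer(lines[i:i + 32], return_tensors="pt", padding=True)
    generated = model.generate(
        input_ids=enc.input_ids,
        attention_mask=enc.attention_mask,
        decoder_start_token_id=model.config.decoder.pad_token_id)
    for line_out in tokenizer.batch_decode(generated, skip_special_tokens=True):
        print(line_out)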
Example #24
def train_gen_title(run_name: str,
                    config_file: str,
                    train_file: str,
                    train_fraq: float,
                    output_model_path: str,
                    from_pretrained: str = None,
                    checkpoint: str = None):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))

    init_wandb(run_name, config)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                              do_lower_case=False,
                                              do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    full_dataset = LentaRiaDataset(train_file, tokenizer, max_tokens_text,
                                   max_tokens_title)

    print("Initializing model...")
    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            enc_model_path, dec_model_path)

    train_size = int(train_fraq * len(full_dataset))

    train_dataset, val_dataset = \
            torch.utils.data.random_split(full_dataset, [train_size, len(full_dataset) - train_size])

    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Val dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps,
                                    max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=1,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example #25
from transformers import BertTokenizerFast
from transformers import EncoderDecoderModel
from datasets import load_metric, load_from_disk

bert2bert = EncoderDecoderModel.from_pretrained("./checkpoint-20").to("cuda")
#bert2bert = BertTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
bert2bert.half()
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

cnndm = load_from_disk("dataset/gigaword")
test_data = cnndm['test']
rouge = load_metric("rouge")


def generate_summary(batch):
    # tokenize and truncate the inputs (max_length=32 here)
    inputs = tokenizer(batch["document"],
                       padding="max_length",
                       truncation=True,
                       max_length=32,
                       return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = bert2bert.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred_summary"] = output_str

    return batch
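# Hedged sketch of the evaluation step this snippet builds toward; the batch
# size and the "summary" reference column are assumptions.
results = test_data.map(generate_summary, batched=True, batch_size=16)
rouge_output = rouge.compute(predictions=results["pred_summary"],
                             references=results["summary"],
                             rouge_types=["rouge2"])["rouge2"].mid
print(rouge_output)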

Example #26
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)
    model_created = False

    if args.checkpoint != None:
        model_created = True
        if args.bart:
            config = BartConfig.from_json_file(args.checkpoint +
                                               "/config.json")
            model = BartForConditionalGeneration.from_pretrained(
                args.checkpoint + "/pytorch_model.bin", config=config)
        else:
            config = EncoderDecoderConfig.from_json_file(args.checkpoint +
                                                         "/config.json")
            model = EncoderDecoderModel.from_pretrained(args.checkpoint +
                                                        "/pytorch_model.bin",
                                                        config=config)

    if args.language == 'fr':
        if args.bart:
            model_name = "WikinewsSum/bart-large-multi-fr-wiki-news"
            #config = BartConfig.from_pretrained(model_name)
            tokenizer = BartTokenizer.from_pretrained(model_name)
            if not model_created:
                model = BartForConditionalGeneration.from_pretrained(
                    model_name)
                model_created = True
        else:
            model_name = "camembert-base"
            #config = CamembertConfig.from_pretrained(model_name)
            tokenizer = CamembertTokenizer.from_pretrained(model_name)
Example #27
 def get_encoderdecoder_model(self):
     return EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
Example #28
def train_style_gen_title(
        run_name: str,
        config_file: str,
        train_file: str,
        dataset_type: str,
        output_model_path: str,
        from_pretrained: str = None,
        checkpoint: str = None
):
    logging.set_verbosity_info()
    config = json.loads(jsonnet_evaluate_file(config_file))
    init_wandb(run_name, config)

    agency_list = config['agency_list']
    print('Agency list:', agency_list)

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]

    print("Initializing model...")

    if from_pretrained:
        model = EncoderDecoderModel.from_pretrained(from_pretrained)
    else:
        enc_model_path = config["enc_model_path"]
        dec_model_path = config["dec_model_path"]
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(enc_model_path, dec_model_path)

    print("Fetching data...")
    if dataset_type == 'tg':
        all_records = [r for r in tqdm.tqdm(tg_reader(train_file))]
    elif dataset_type == 'lenta-ria':
        lenta_records = [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.train.csv')))]
        lenta_records.extend(
            [r for r in tqdm.tqdm(lenta_reader(os.path.join(train_file, 'lenta/lenta-ru-news.val.csv')))]
        )

        ria_records = [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.train.json')))]
        ria_records.extend(
            [r for r in tqdm.tqdm(ria_reader(os.path.join(train_file, 'ria/ria.shuffled.val.json')))]
        )

        random.shuffle(ria_records)

        all_records = [r for r in lenta_records if r['date'][:4] in ['2010', '2011', '2012', '2013', '2014']] + \
            ria_records[:220000]

        random.shuffle(all_records)

    print("Building datasets...")

    agency_to_special_token_id = {a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)}

    full_dataset = AgencyTitleDatasetGeneration(
        all_records, tokenizer, 
        filter_agencies=list(agency_to_special_token_id.keys()), agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    train_size = int(0.93 * len(full_dataset))
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset,
                                                               [train_size, len(full_dataset) - train_size])

    print(f"Train dataset length = {len(train_dataset)}\nVal dataset length = {len(val_dataset)}")
    wandb.summary.update({
        'Train dataset size': len(train_dataset),
        'Test dataset size': len(val_dataset),
    })

    print("Training model...")
    batch_size = config["batch_size"]
    eval_steps = config["eval_steps"]
    save_steps = config["save_steps"]
    logging_steps = config["logging_steps"]
    enc_lr = config["enc_lr"]
    dec_lr = config["dec_lr"]
    warmup_steps = config["num_warmup_steps"]
    max_steps = config["max_steps"]
    gradient_accumulation_steps = config["gradient_accumulation_steps"]

    opt = get_separate_lr_optimizer(model, enc_lr, dec_lr, warmup_steps, max_steps)

    training_args = TrainingArguments(
        output_dir=output_model_path,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        evaluation_strategy='steps',
        do_train=True,
        do_eval=True,
        overwrite_output_dir=False,
        logging_steps=logging_steps,
        save_steps=save_steps,
        eval_steps=eval_steps,
        save_total_limit=2,
        max_steps=max_steps,
        report_to='wandb',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=opt,
    )

    trainer.train(checkpoint)
    model.save_pretrained(output_model_path)
Example #29
    def create_and_check_encoder_decoder_shared_weights(
            self, config, input_ids, attention_mask, encoder_hidden_states,
            decoder_config, decoder_input_ids, decoder_attention_mask, labels,
            **kwargs):
        torch.manual_seed(0)
        encoder_model, decoder_model = self.get_encoder_decoder_model(
            config, decoder_config)
        model = EncoderDecoderModel(encoder=encoder_model,
                                    decoder=decoder_model)
        model.to(torch_device)
        model.eval()
        # load state dict copies weights but does not tie them
        decoder_state_dict = model.decoder._modules[
            model.decoder.base_model_prefix].state_dict()
        model.encoder.load_state_dict(decoder_state_dict, strict=False)

        torch.manual_seed(0)
        tied_encoder_model, tied_decoder_model = self.get_encoder_decoder_model(
            config, decoder_config)
        config = EncoderDecoderConfig.from_encoder_decoder_configs(
            tied_encoder_model.config,
            tied_decoder_model.config,
            tie_encoder_decoder=True)
        tied_model = EncoderDecoderModel(encoder=tied_encoder_model,
                                         decoder=tied_decoder_model,
                                         config=config)
        tied_model.to(torch_device)
        tied_model.eval()

        model_result = model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        tied_model_result = tied_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        # check that the tied model has fewer parameters
        self.assertLess(sum(p.numel() for p in tied_model.parameters()),
                        sum(p.numel() for p in model.parameters()))
        random_slice_idx = ids_tensor((1, ), model_result[0].shape[-1]).item()

        # check that outputs are equal
        self.assertTrue(
            torch.allclose(model_result[0][0, :, random_slice_idx],
                           tied_model_result[0][0, :, random_slice_idx],
                           atol=1e-4))

        # check that outputs after saving and loading are equal
        with tempfile.TemporaryDirectory() as tmpdirname:
            tied_model.save_pretrained(tmpdirname)
            tied_model = EncoderDecoderModel.from_pretrained(tmpdirname)
            tied_model.to(torch_device)
            tied_model.eval()

            # check that the tied model has fewer parameters
            self.assertLess(sum(p.numel() for p in tied_model.parameters()),
                            sum(p.numel() for p in model.parameters()))
            random_slice_idx = ids_tensor((1, ),
                                          model_result[0].shape[-1]).item()

            tied_model_result = tied_model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
            )

            # check that outputs are equal
            self.assertTrue(
                torch.allclose(model_result[0][0, :, random_slice_idx],
                               tied_model_result[0][0, :, random_slice_idx],
                               atol=1e-4))
Example #30
def evaluate_style_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    gen_model_file: str,
    discr_model_file: str,
    test_file: str,
    test_sample_rate: float,
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)

    batch_size = config["batch_size"]

    print("Loading model...")
    model = EncoderDecoderModel.from_pretrained(gen_model_file)
    model.eval()
    model.cuda()

    agency_list = config['agency_list']
    discriminator = AutoModelForSequenceClassification.from_pretrained(discr_model_file, num_labels=len(agency_list)).cuda()
    
    print("Fetching TG data...")
    test_records = [r for r in tqdm.tqdm(tg_reader(test_file)) 
        if random.random() <= test_sample_rate]
    
    print("Building datasets...")
    
    
    agency_to_special_token_id = {
        a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
    }

    agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))}

    test_dataset = AgencyTitleDatasetGeneration(
        test_records, tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    print('Dataset size:', len(test_dataset))

    y_pred = []
    y_true = []

    for i in tqdm.trange(0, len(test_dataset), batch_size):
        data = test_dataset[i]
        for k in tuple(data.keys()):
            if k not in ('input_ids', 'attention_mask'):
                del data[k]
            else:
                data[k] = data[k].unsqueeze(0)

        for j in range(i + 1, min(i + batch_size, len(test_dataset))):
            for k in data.keys():
                data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

        y_true.extend([ agency_to_target[test_dataset.get_strings(j)['agency']]
            for j in range(i, min(i + batch_size, len(test_dataset)))])

        data['input_ids'] = data['input_ids'].cuda()
        data['attention_mask'] = data['attention_mask'].cuda()

        output_ids = model.generate(
            **data,
            decoder_start_token_id=model.config.decoder.pad_token_id,
            min_length=7,
            max_length=20,
            num_beams=6
        )

        preds = [
            tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True) for x in output_ids
        ]

        for title in preds:
            inp = tokenizer(title, 
                add_special_tokens=True, max_length=max_tokens_title,
                padding='max_length', truncation=True
            )

            logits = discriminator(input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0), 
                                   attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0))[0]
            y_pred.append(torch.argmax(logits).item())

    wandb.summary.update({
        'D-Style': classification_report(y_true, y_pred, output_dict=True)
    })