def load_data_to(self, ctxs: Dict[object, BiEncoderPassage], date):
    """Fill `ctxs` with BiEncoderPassage entries for scans matching `date`.

    `date` is parsed as "%b-%d-%Y"; files are pre-filtered by a "_<year>_"
    token in their path, then individual scans by `date` in their key.
    """
    year_tag = "_" + str(datetime.strptime(date, "%b-%d-%Y").year) + "_"
    tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")
    print(f"Creating bi-encoder dict for {date}...")
    for file_path in tqdm(self.file_paths):
        if year_tag not in file_path:
            continue
        with open(file_path, 'rb') as f:
            # Collect one OCR-text generator per scan whose key matches the date.
            generators = [
                self.ocr_text_iter(v)
                for k, v in ijson.kvitems(f, '')
                if date in k
            ]
            if not generators:
                continue
            for gen in generators:
                for title, passage, object_id in gen:
                    clean_title = normalize_passage(title).lower()
                    trimmed = take_max_model_paragraphs(passage, tokenizer)
                    ctxs[object_id] = BiEncoderPassage(
                        normalize_passage(trimmed), clean_title)
def __init__(
    self,
    model_name_or_path,
    tokenizer_name,
    model_cache_dir,
    max_length,
    with_title,
    wandb_project,
    wandb_run_name,
    **kwargs,
):
    """Load a BART sequence-ordering model and its fast tokenizer.

    The tokenizer falls back to the model checkpoint when no separate
    tokenizer name is provided; both downloads honor `model_cache_dir`.
    """
    super().__init__(max_length, with_title, wandb_project, wandb_run_name)
    # Prefer an explicit tokenizer checkpoint; otherwise reuse the model's.
    tokenizer_source = tokenizer_name if tokenizer_name else model_name_or_path
    self.tokenizer = BartTokenizerFast.from_pretrained(
        tokenizer_source,
        cache_dir=model_cache_dir,
    )
    self.model = BartForSequenceOrderingWithMultiPointer.from_pretrained(
        model_name_or_path,
        cache_dir=model_cache_dir,
    )
def __init__(self):
    """Wrap the pretrained bart-large-cnn fast tokenizer.

    Truncates one token on each side and declares token id 0 as the
    starting special token and id 2 as the ending special token.
    """
    # Zero-arg super() is equivalent to super(BartTokenizerWithMapping, self)
    # inside this class body.
    super().__init__(
        huggingface_tokenizer=BartTokenizerFast.from_pretrained(
            'facebook/bart-large-cnn'),
        truncate_left=1,
        truncate_right=1,
        starting_tokens_ids=[0],
        ending_tokens_ids=[2],
    )
def __init__(self):
    """Load bart-large-cnn in half precision on `device`, ready for inference."""
    model = BartForConditionalGeneration.from_pretrained(
        "facebook/bart-large-cnn"
    )
    model.half()      # fp16 weights
    model.to(device)  # `device` is defined at module level
    model.eval()      # disable dropout etc. for inference
    self.model = model
    self.tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large-cnn")
def short_cnn_bart_encoding(data):
    """Tokenize each article's 'content' (max 128 tokens) and mutate in place.

    Replaces article['content'] with input ids and attaches the matching
    attention mask and token type ids. Returns the same list.
    """
    tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-cnn')
    encodings = tokenizer(
        [article['content'] for article in data],
        truncation=True,
        max_length=128,
        padding=True,
        return_attention_mask=True,
        return_token_type_ids=True,
    )
    # Hoist the per-field lookups out of the loop.
    input_ids = encodings.data['input_ids']
    attention_masks = encodings.data['attention_mask']
    token_types = encodings.data['token_type_ids']
    for idx, article in enumerate(tqdm(data)):
        article['content'] = input_ids[idx]
        article['attention_mask'] = attention_masks[idx]
        article['token_type_ids'] = token_types[idx]
    return data
def load_data_to(self, ctxs: Dict[object, BiEncoderPassage]):
    """Fill `ctxs` with BiEncoderPassage entries from all scan files.

    Optionally restricts scans to those whose key contains self.month_str,
    and/or to a deterministic random subset of self.n_random_papers papers.
    Title/passage normalization is applied only when self.normalize is set.
    """
    tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")

    random_papers = None
    if self.n_random_papers:
        print("Random newspaper subset...")
        # First pass: enumerate every scan key to discover the paper names.
        scan_names = []
        for file_path in tqdm(self.file_paths):
            with open(file_path, 'rb') as f:
                for k, v in ijson.kvitems(f, ''):
                    scan_names.append(k)
        papers = sorted({self.get_paper_name(scan) for scan in scan_names})
        print(f"{len(papers)} total papers...")
        random.seed(789)  # fixed seed keeps the subset reproducible
        random_papers = random.sample(papers, self.n_random_papers)
        print(f"Selected random papers: {random_papers}")

    print("Creating bi-encoder dict...")
    for file_path in tqdm(self.file_paths):
        with open(file_path, 'rb') as f:
            generators = []
            for k, v in ijson.kvitems(f, ''):
                # Guard clauses replace the original nested-if filter tree.
                if self.month_str and self.month_str not in k:
                    continue
                if self.n_random_papers and \
                        self.get_paper_name(k) not in random_papers:
                    continue
                generators.append(self.ocr_text_iter(v))
            if not generators:
                continue
            for gen in generators:
                for title, passage, object_id in gen:
                    if self.normalize:
                        title = normalize_passage(title)
                        title = title.lower()
                        passage = take_max_model_paragraphs(passage, tokenizer)
                        passage = normalize_passage(passage)
                    ctxs[object_id] = BiEncoderPassage(passage, title)
def __init__(self, data_dir: str, batch_size=8, pre_trained='', with_answers=False):
    """Data module wrapping a T5 or BART tokenizer with graph special tokens.

    Args:
        data_dir: directory holding the dataset files.
        batch_size: per-dataloader batch size.
        pre_trained: which tokenizer family to load, 't5' or 'bart'.
        with_answers: whether examples carry answers (stored, not used here).

    Raises:
        ValueError: if `pre_trained` is neither 't5' nor 'bart'.
    """
    super().__init__()
    self.batch_size = batch_size
    self.data_dir = data_dir
    self.with_answers = with_answers
    special_tokens = ['<A>', '<H>', '<R>', '<T>']
    if pre_trained == 't5':
        self.tokenizer = T5TokenizerFast.from_pretrained(
            't5-base',
            extra_ids=0,
            additional_special_tokens=special_tokens)
    elif pre_trained == 'bart':
        # NOTE(review): `extra_ids` is a T5-specific argument; BART tokenizers
        # do not define it — confirm passing it here is intentional.
        self.tokenizer = BartTokenizerFast.from_pretrained(
            'facebook/bart-base',
            extra_ids=0,
            additional_special_tokens=special_tokens)
    else:
        # ValueError is more precise than bare Exception for a bad argument
        # and remains backward-compatible with callers catching Exception.
        raise ValueError(
            f'Unknown pre-trained model {pre_trained}, choose t5 or bart.')
def default_tokenizer_fast(self):
    """Return a fast BART tokenizer loaded from facebook/bart-large."""
    checkpoint = "facebook/bart-large"
    return BartTokenizerFast.from_pretrained(checkpoint)
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from transformers import BartTokenizerFast
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from collections import defaultdict
from transformers import BartModel, BartConfig

# Shared tokenizer used throughout this module.
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-cnn')

# Moral-foundation label names; order must match the layout of target vectors
# passed to get_target_moral_names.
moral_foundations = [
    'AuthorityVice', 'AuthorityVirtue', 'FairnessVice', 'FairnessVirtue',
    'HarmVice', 'HarmVirtue', 'IngroupVice', 'IngroupVirtue', 'PurityVice',
    'PurityVirtue'
]


def get_target_moral_names(targets):
    """Return the moral-foundation names whose corresponding flag is truthy.

    `targets` is an indexable of flags aligned with `moral_foundations`.
    """
    # zip replaces the original index-based loop; duplicate
    # `import pytorch_lightning as pl` was removed from the header above.
    return [name for name, flag in zip(moral_foundations, targets) if flag]
def __init__(self,
             lr=0.001,
             discriminator=None,
             bart_decoder=True,
             freeze_encoder=True,
             freeze_decoder=True,
             contextual_injection=True,
             n_contextual_linear=2,
             moral_vec_size=10,
             use_content_loss=False,
             content_loss_type='cosine',
             feed_moral_tokens_to='encoder',
             use_moral_loss=False,
             content_loss_weighting=1,
             moral_loss_weighting=1):
    """Build a generation model around a frozen pretrained bart-large-cnn.

    Reuses the pretrained encoder, decoder, and shared embeddings, and adds
    `n_contextual_linear` linear layers that mix encoder features with a
    moral feature vector of size `moral_vec_size`.
    Note: `bart_decoder` is accepted but not read in this constructor.
    """
    super().__init__()
    assert n_contextual_linear >= 1
    self.lr = lr
    self.contextual_injection = contextual_injection
    self.feed_moral_tokens_to = feed_moral_tokens_to
    self.use_moral_loss = use_moral_loss
    self.use_content_loss = use_content_loss
    self.content_loss_type = content_loss_type
    self.content_loss_weighting = content_loss_weighting
    self.moral_loss_weighting = moral_loss_weighting
    self.loss_history = []
    # NOTE(review): starts at 10 rather than 0 — confirm the offset is intended.
    self.training_epoch_count = 10
    self.use_original_morals = False
    self.tokenizer = BartTokenizerFast.from_pretrained(
        'facebook/bart-large-cnn')
    self.bart_scorer = BartScorer()
    # Load pretrained model
    # self.pretrained = BartModel.from_pretrained('facebook/bart-large-cnn')
    print('Loading pretrained bart-large-cnn...')
    self.pretrained = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn').to(device)
    print('Pretrained bart-large-cnn loaded')
    # print(self.pretrained)
    # sys.exit()
    # Reuse the pretrained encoder and its shared token-embedding table.
    self.encoder = self.pretrained.model.encoder
    self.embedding = self.pretrained.model.shared
    if freeze_encoder:
        for param in self.encoder.parameters():
            param.requires_grad = False
    self.n_vocab = self.embedding.num_embeddings
    # Encoder hidden size, read from the layer-norm applied to embeddings.
    self.n_encoder_features = self.encoder.layernorm_embedding.normalized_shape[
        0]
    # Linear layers to combine encodings and moral features: the first layer
    # takes encoder features concatenated with the moral vector; any further
    # layers map embedding_dim -> embedding_dim.
    self.linears = nn.ModuleList([
        nn.Linear(self.n_encoder_features + moral_vec_size,
                  self.embedding.embedding_dim)
    ])
    for i in range(n_contextual_linear - 1):
        self.linears.append(
            nn.Linear(self.embedding.embedding_dim,
                      self.embedding.embedding_dim))
    # Decoder (optionally frozen, mirroring the encoder handling above)
    self.decoder = self.pretrained.model.decoder
    if freeze_decoder:
        for param in self.decoder.parameters():
            param.requires_grad = False
# Fragment of a constructor whose header is not visible in this chunk.
# Language-model head shared with the pretrained BART model.
self.lm_head = self.pretrained.lm_head
# Discriminator is used frozen: none of its parameters are trained.
self.discriminator = discriminator
for param in self.discriminator.parameters():
    param.requires_grad = False
# NOTE(review): hard-coded vocab size — confirm it matches the tokenizer.
self.vocab_size = 50264
# Linear map from one-hot token distributions to 1024-dim embeddings,
# initialized from lookup tables built by the discriminator.
self.onehot_embeddings = nn.Linear(self.vocab_size, 1024, bias=False)
self.onehot_embeddings.weight = nn.Parameter(
    self.discriminator.build_lookups())
# Freeze the lookup weights (flag set on both the module and the tensor).
self.onehot_embeddings.requires_grad = False
self.onehot_embeddings.weight.requires_grad = False
''' load data '''
# Read SQuAD v2.0 train/dev splits into parallel context/question/answer lists.
train_contexts, train_questions, train_answers = read_squad(
    'data/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('data/dev-v2.0.json')

''' generate answer end indices '''
# add_end_idx augments the answer records in place using their contexts.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

''' tokenizers and models '''
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')
model = BartForQuestionAnswering.from_pretrained('facebook/bart-base')

''' tokenize '''
# Context/question pairs are truncated and padded to a common batch length.
train_encodings = tokenizer(train_contexts,
                            train_questions,
                            truncation=True,
                            padding=True)
val_encodings = tokenizer(val_contexts,
                          val_questions,
                          truncation=True,
                          padding=True)

''' last step preparing model inputs '''
def __init__(self, hparams):
    """Abstractive summarizer supporting three model configurations:
    a Longformer encoder-decoder, a paired encoder/decoder checkpoint,
    or a plain seq2seq checkpoint — chosen from `hparams`.
    """
    super(AbstractiveSummarizer, self).__init__()
    self.hparams = hparams

    # Collapse a single-element dataset list to its sole entry.
    if len(self.hparams.dataset) <= 1:
        self.hparams.dataset = self.hparams.dataset[0]

    if "longformer-encdec" in self.hparams.model_name_or_path.lower():
        self.model = LongformerEncoderDecoderForConditionalGeneration.from_pretrained(
            self.hparams.model_name_or_path, gradient_checkpointing=True
        )
        self.tokenizer = BartTokenizerFast.from_pretrained(
            self.hparams.model_name_or_path, add_prefix_space=True
        )
    else:
        if self.hparams.decoder_model_name_or_path:
            self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                self.hparams.model_name_or_path,
                (
                    self.hparams.decoder_model_name_or_path
                    if self.hparams.decoder_model_name_or_path
                    else self.hparams.model_name_or_path
                ),
                gradient_checkpointing=self.hparams.gradient_checkpointing,
                tie_encoder_decoder=self.hparams.tie_encoder_decoder,
            )
        else:
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                self.hparams.model_name_or_path,
                gradient_checkpointing=self.hparams.gradient_checkpointing,
            )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.hparams.model_name_or_path, use_fast=True
        )

    # Sentence-boundary marker token used when computing ROUGE.
    self.rouge_sentence_split_token = "<q>"
    self.tokenizer.add_tokens(self.rouge_sentence_split_token)
    self.rouge_sentence_split_token_id = self.tokenizer.convert_tokens_to_ids(
        self.rouge_sentence_split_token
    )

    # bo = beginning of
    # eo = ending of
    # seq = sequence (not using 's' because 's' stands for sentence in other places)
    # Use `bos_token` for boseq if `bos_token` is set, otherwise use "[unused0]"
    # Use `pad_token` for eoseq if `pad_token` is set, otherwise use "[unused1]"
    do_seq_special_add = False
    if self.tokenizer.bos_token:
        self.target_boseq_token = self.tokenizer.bos_token
    else:
        self.target_boseq_token = "[unused0]"
        do_seq_special_add = True
    if self.tokenizer.pad_token:
        self.target_eoseq_token = self.tokenizer.pad_token
    else:
        self.target_eoseq_token = "[unused1]"
        do_seq_special_add = True

    # Convert `target_boseq_token` and `target_eoseq_token` to IDs
    self.target_boseq_token_id = self.tokenizer.convert_tokens_to_ids(
        self.target_boseq_token
    )
    self.target_eoseq_token_id = self.tokenizer.convert_tokens_to_ids(
        self.target_eoseq_token
    )

    # If the `*oseq` tokens are not already "special" then add them as special
    # tokens so that they are ignored when decoding.
    if do_seq_special_add:
        special_tokens_dict = {
            "additional_special_tokens": [
                self.target_boseq_token,
                self.target_eoseq_token,
            ]
        }
        self.tokenizer.add_special_tokens(special_tokens_dict)

    # Loss: label smoothing when requested, otherwise plain cross-entropy;
    # both ignore padding positions.
    if self.hparams.label_smoothing > 0:
        self.loss_func = LabelSmoothingLoss(
            self.hparams.label_smoothing,
            self.tokenizer.vocab_size,
            ignore_index=self.tokenizer.pad_token_id,
        )
    else:
        self.loss_func = nn.CrossEntropyLoss(
            ignore_index=self.tokenizer.pad_token_id
        )

    self.train_dataloader_object = None  # not created yet
    self.rouge_metrics = None
    self.rouge_scorer = None

    # Per-split cache-file paths for tokenized features.
    self.dataset = {}
    self.tokenized_data_file_paths = {}
    for split in ["train", "validation", "test"]:
        features_cache_file = os.path.join(
            self.hparams.cache_file_path, (split + "_tokenized")
        )
        self.tokenized_data_file_paths[split] = features_cache_file

    # Longformer batches need per-batch attention-window padding via a
    # collate-function modifier; other models use the plain collate function.
    if "longformer" in self.hparams.model_name_or_path:
        longformer_modifier_ = partial(
            longformer_modifier,
            tokenizer=self.tokenizer,
            attention_window=self.model.config.attention_window,
        )
        self.collate_fn = partial(
            self.abs_collate_fn, modifier=longformer_modifier_
        )
    else:
        self.collate_fn = self.abs_collate_fn
def get_rust_tokenizer(self, **kwargs):
    """Load a BartTokenizerFast from the temp dir, applying the stored
    special-tokens map on top of any caller-supplied kwargs."""
    merged_kwargs = dict(kwargs)
    merged_kwargs.update(self.special_tokens_map)
    return BartTokenizerFast.from_pretrained(self.tmpdirname, **merged_kwargs)
from transformers import BartForConditionalGeneration as BCD, BartTokenizerFast as BTF import dataset import sys batch_size = int(sys.argv[1]) vit = timm.create_model('vit_base_patch32_384', pretrained=True, num_classes=0) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print(device) bart = BCD.from_pretrained('facebook/bart-base') tokenizer = BTF.from_pretrained('facebook/bart-base') transform = transforms.Compose([ transforms.ToPILImage(), transforms.Resize((384, 384)), transforms.ToTensor(), transforms.Lambda(dataset.make_img_rgb), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) trainset = dataset.NarrativesDataset(root='./data/images/', file='./data/dataset.jsonl', transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,