def __init__(self, args, t5_type='t5-base'):
    """
    R1 = Raw 1
    Training: R1 + R2 + R3 -> M3

    :param args: namespace providing ``lr``, ``epochs``, ``warmup_steps`` and
        ``gpu_id`` attributes (gpu_id == -1 means CPU).
    :param t5_type: HF checkpoint name used for the model and both tokenizers.
    """
    super().__init__()
    # getattr(x, "name") with no default is identical to plain attribute
    # access and raises the same AttributeError — use x.name directly.
    self.lr = args.lr
    self.epochs = args.epochs
    self.warmup_steps = args.warmup_steps
    self.gpu_id = args.gpu_id
    self.transformer = T5_Cond_Gen_Wrapper.from_pretrained(t5_type)
    self.tokenizer = T5TokenizerFast.from_pretrained(t5_type)
    self.EM_accuracy = CategoricalAccuracy()
    self.to('cpu' if self.gpu_id == -1 else f"cuda:{self.gpu_id}")
    # Separate tokenizer instance for the decoder side so its padding_side
    # can differ from the encoder-side tokenizer.
    self.decoder_tokenizer = T5TokenizerFast.from_pretrained(t5_type)
    self.decoder_tokenizer.padding_side = 'left'  # necessary since initial decoding sequences could have different length
    self.validation_scores = []
    # Convenience aliases into the wrapped transformer's sub-modules.
    self.encoder = self.transformer.encoder
    self.decoder = self.transformer.decoder
    self.lm_head = self.transformer.lm_head
def test_RumorPadaDataset():
    """Smoke test: build the rumor PADA processor/dataset and print samples."""
    split = "train"
    processor = RumorPadaDataProcessor(
        ["charliehebdo", "ferguson", "germanwings-crash", "ottawashooting"],
        "sydneysiege", DATA_DIR, EXP_DIR)
    print()
    split_data = processor.data[split]
    print(split_data.keys())
    print(len(list(split_data.values())[0]))
    for key, values in split_data.items():
        # NOTE(review): the int branch still subscripts values[0] and would
        # fail on a real int — preserved as-is from the original.
        if type(values) is not int:
            print(key, values[0])
        else:
            print(key, values[0], len(values[0]))
    dataset = RumorPadaDataset(split, processor,
                               T5TokenizerFast.from_pretrained("t5-base"),
                               64, 0.5, 0.5)
    print(len(dataset))
    # Dump the first nine examples.
    for idx, example in enumerate(dataset):
        for key, value in example.items():
            print(key, value)
        if idx == 8:
            break
    print()
    loader = DataLoader(dataset, 8)
    for batch in loader:
        for key, value in batch.items():
            print(key, value)
        break
def test_AbsaSeq2SeqPadaDataProcessor():
    """Smoke test: inspect dev data and tokenized-length percentiles."""
    processor = AbsaSeq2SeqPadaDataProcessor(
        ["device", "laptops", "rest", "service"], "service", DATA_DIR, EXP_DIR)
    print()
    dev_data = processor.data["dev"]
    print(dev_data.keys())
    print(len(list(dev_data.values())[0]))
    for key, values in dev_data.items():
        if type(values) is not int:
            print(key, values[-80])
        else:
            print(key, values[-80], len(values[-80]))
    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    percent = 99.8
    for split in AbsaSeq2SeqPadaDataProcessor.ALL_SPLITS:
        split_data = processor.data[split]
        print(split, len(split_data["example_id"]))
        print(split, max(split_data["input_tokens_len"]))
        print()
        input_lens = tokenizer(split_data["input_tokens"],
                               is_split_into_words=True,
                               return_length=True)["length"]
        print(split, max(input_lens))
        print(split, percentile(input_lens, percent))
        print()
        label_lens = tokenizer(split_data["output_labels_tokens"],
                               is_split_into_words=True,
                               return_length=True)["length"]
        print(split, max(label_lens))
        print(split, percentile(label_lens, percent))
        break  # only the first split is inspected
def test_AbsaSeq2SeqPadaDataset():
    """Smoke test: build the ABSA seq2seq PADA dataset and print samples."""
    split = "train"
    processor = AbsaSeq2SeqPadaDataProcessor(
        ["device", "laptops", "rest"], "service", DATA_DIR, EXP_DIR)
    print()
    split_data = processor.data[split]
    print(split_data.keys())
    print(len(list(split_data.values())[0]))
    for key, values in split_data.items():
        if type(values) is not int:
            print(key, values[0])
        else:
            print(key, values[0], len(values[0]))
    dataset = AbsaSeq2SeqPadaDataset(
        split, processor, T5TokenizerFast.from_pretrained("t5-base"),
        64, 0.5, 0.5)
    print(len(dataset))
    # Dump the first nine examples.
    for idx, example in enumerate(dataset):
        for key, value in example.items():
            print(key, value)
        if idx == 8:
            break
    print()
    loader = DataLoader(dataset, 8)
    for batch in loader:
        for key, value in batch.items():
            print(key, value)
        break
def _setup_model_and_tokenizer(self):
    """Instantiate the T5 model plus a fast or slow tokenizer from config."""
    name = self.config.model_name
    self.model = T5ForConditionalGeneration.from_pretrained(name)
    # Pick the tokenizer class once instead of duplicating the call.
    tokenizer_cls = T5TokenizerFast if self.config.fast_tokenizer else T5Tokenizer
    self.tokenizer = tokenizer_cls.from_pretrained(name)
def from_pretrained(self, model_name="t5-base"):
    """
    Download model and tokenizer from the HF hub.

    :param model_name: T5 checkpoint identifier (e.g. "t5-base").
    :return: None; sets ``self.tokenizer`` and ``self.model``.
    """
    # f"{model_name}" was a no-op f-string around an existing str —
    # pass the argument directly.
    self.tokenizer = T5Tokenizer.from_pretrained(model_name)
    self.model = T5ForConditionalGeneration.from_pretrained(
        model_name, return_dict=True)
def __init__(self, model_name, ckpt_path, num_generations=10):
    """Load a T5 generation model from a checkpoint.

    :param model_name: HF checkpoint name for architecture + tokenizer.
    :param ckpt_path: path to the fine-tuned weights to restore.
    :param num_generations: number of sequences to sample per input.
    """
    device, gpu_ids = util.get_available_devices(assert_cuda=True)
    # Bug fix: logging.info(device, gpu_ids) treated gpu_ids as a %-format
    # argument for the message string `device`, which has no placeholders,
    # producing a logging formatting error. Use lazy %-style args instead.
    logging.info("device=%s gpu_ids=%s", device, gpu_ids)
    self.device = device
    self.num_generations = num_generations
    self.model = T5ForConditionalGeneration.from_pretrained(model_name)
    self.tokenizer = T5TokenizerFast.from_pretrained(model_name)
    load_ckpt(ckpt_path, self.model, map_location=device)
    self.model.to(device)
def __init__(self,
             # model_args
             t5_model_name: str,
             eval_metrics: List[str],
             # model_generate_args
             beam_size: int,
             repetition_penalty: float,
             length_penalty: float,
             num_beam_groups: int,
             diversity_penalty: float,
             skip_special_tokens: bool,
             clean_up_tokenization_spaces: bool,
             # model_optimizer_args
             weight_decay: float,
             learning_rate: float,
             adam_epsilon: float,
             # trainer_args
             train_batch_size: int,
             eval_batch_size: int,
             gradient_accumulation_steps: int,
             n_gpu: int,
             num_train_epochs: int,
             warmup_steps: int,
             output_dir: str,
             # dataset_args
             dataset_obj: Any,
             # NOTE(review): 'procesor' typo kept — it is part of the public signature.
             data_procesor_obj: Any,
             src_domains: List[str],
             trg_domain: str,
             data_dir: str,
             experiment_dir: str,
             max_seq_len: int,
             dataset_specific_kwargs: Namespace = None,
             num_labels: int = 2):
    """T5-based text classifier module.

    All constructor arguments are captured by ``save_hyperparameters()`` and
    read back through ``self.hparams`` below, so the parameter names double
    as hyperparameter keys.
    """
    super().__init__()
    self.save_hyperparameters()
    self.tokenizer = T5TokenizerFast.from_pretrained(self.hparams.t5_model_name)
    self.data_processor, self.datasets = self._init_datasets()
    # The num_labels argument is overridden by the label count actually
    # discovered by the data processor.
    self.hparams.num_labels = len(self.data_processor.labels_dict)
    self.loss_fn = CrossEntropyLoss(ignore_index=T5TextClassifier.LOSS_IGNORE_ID)
    self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.t5_model_name)
    # CNN classification head on top of the T5 hidden states.
    self.classifier = CnnClassifier(num_labels=self.hparams.num_labels,
                                    hidden_size=self.model.config.hidden_size,
                                    max_seq_length=self.hparams.max_seq_len)
    self.eval_metric_scorer = T5TextClassifier._init_eval_metric_scorer(self.hparams.eval_metrics)
    self.eval_predictions = dict()
def __init__(self, args, t5_type='t5-base'):
    """
    R1 = Raw 1
    Training: R1 + R2 + R3 -> M3

    :param args: namespace providing ``lr``, ``epochs``, ``warmup_steps`` and
        ``gpu_id`` attributes (gpu_id == -1 means CPU).
    :param t5_type: HF checkpoint name for the model and tokenizer.
    """
    super().__init__()
    # getattr(x, "name") with no default is identical to plain attribute
    # access and raises the same AttributeError — use x.name directly.
    self.lr = args.lr
    self.epochs = args.epochs
    self.warmup_steps = args.warmup_steps
    self.gpu_id = args.gpu_id
    self.transformer = T5ForConditionalGeneration.from_pretrained(t5_type)
    self.tokenizer = T5TokenizerFast.from_pretrained(t5_type)
    self.EM_accuracy = CategoricalAccuracy()
    self.to('cpu' if self.gpu_id == -1 else f"cuda:{self.gpu_id}")
def test_RumorPadaDataProcessor():
    """Smoke test: inspect dev data and tokenized-length percentiles."""
    processor = RumorPadaDataProcessor(
        ["charliehebdo", "ferguson", "germanwings-crash", "ottawashooting"],
        "sydneysiege", DATA_DIR, EXP_DIR)
    print()
    dev_data = processor.data["dev"]
    print(dev_data.keys())
    print(len(list(dev_data.values())[0]))
    for key, values in dev_data.items():
        if type(values) is not int:
            print(key, values[-80])
        else:
            print(key, values[-80], len(values[-80]))
    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    percent = 99.8
    for split in RumorPadaDataProcessor.ALL_SPLITS:
        print(split, len(processor.data[split]["example_id"]))
        print()
        lens = tokenizer(processor.data[split]["input_str"],
                         is_split_into_words=False,
                         max_length=128,
                         return_length=True)["length"]
        print(split, max(lens))
        print(split, percentile(lens, percent))
        break  # only the first split is inspected
def load_model(self, model_dir: str = "outputs", use_gpu: bool = False):
    """
    loads a checkpoint for inferencing/prediction
    Args:
        model_dir (str, optional): path to model directory. Defaults to "outputs".
        use_gpu (bool, optional): if True, model uses gpu for
            inferencing/prediction. Defaults to False.
    Raises:
        RuntimeError: if use_gpu is True but no CUDA device is available.
    """
    # f"{model_dir}" was a no-op f-string around an existing str.
    self.model = T5ForConditionalGeneration.from_pretrained(model_dir)
    self.tokenizer = T5Tokenizer.from_pretrained(model_dir)
    if use_gpu:
        # Guard clause: fail fast before touching the device.
        # RuntimeError (an Exception subclass) keeps existing
        # `except Exception` callers working.
        if not torch.cuda.is_available():
            raise RuntimeError(
                "exception ---> no gpu found. set use_gpu=False, to use CPU"
            )
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")
    self.model = self.model.to(self.device)
class QuestionGenerator:
    """Class loads pipeline for generating questions from text"""
    # NOTE: both objects are loaded at import time and shared as
    # class-level singletons by all callers of generate().
    model = T5ForConditionalGeneration.from_pretrained(
        "ThomasSimonini/t5-end2end-question-generation")
    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    tokenizer.sep_token = '<sep>'
    tokenizer.add_tokens(['<sep>'])

    @staticmethod
    def generate(text: str):
        """
        generates questions for given text
        :param text: sentence or paragraph for question generation
        :return: list of questions
        :raises Exception: if the input text is shorter than 50 characters.
        """
        # Bug fix: the original wrapped everything in
        # `try: ... except Exception as ex: raise ex`, which is a no-op that
        # only truncates the traceback origin — removed entirely.
        if len(text) < 50:
            raise Exception("input too small")
        generator_args = {
            'temperature': 1.02,
            'num_beams': 1,
            'max_length': 70
        }
        text = "generate questions: " + text + " </s>"
        input_ids = QuestionGenerator.tokenizer.encode(text, return_tensors="pt")
        res = QuestionGenerator.model.generate(input_ids, **generator_args)
        output = QuestionGenerator.tokenizer.batch_decode(
            res, skip_special_tokens=True)
        output = output[0].split("<sep>")
        # Drop a trailing fragment that is empty or not a complete question.
        if len(output[-1]) == 0 or output[-1][-1] != "?":
            output.pop()
        # Normalize whitespace and deduplicate.
        output = [" ".join(i.split()) for i in output]
        return list(set(output))
def __init__(self, data_dir: str, batch_size=8, pre_trained='', with_answers=False):
    """Data module selecting a T5 or BART fast tokenizer.

    :param data_dir: root directory of the dataset files.
    :param batch_size: loader batch size.
    :param pre_trained: backbone choice, 't5' or 'bart'.
    :param with_answers: stored flag; whether examples carry answers.
    :raises ValueError: when ``pre_trained`` is neither 't5' nor 'bart'.
    """
    super().__init__()
    self.batch_size = batch_size
    self.data_dir = data_dir
    self.with_answers = with_answers
    if pre_trained == 't5':
        self.tokenizer = T5TokenizerFast.from_pretrained(
            't5-base',
            extra_ids=0,
            additional_special_tokens=['<A>', '<H>', '<R>', '<T>'])
    elif pre_trained == 'bart':
        # NOTE(review): extra_ids is a T5-specific kwarg; confirm the BART
        # fast tokenizer accepts/ignores it as intended.
        self.tokenizer = BartTokenizerFast.from_pretrained(
            'facebook/bart-base',
            extra_ids=0,
            additional_special_tokens=['<A>', '<H>', '<R>', '<T>'])
    else:
        # ValueError is the idiomatic signal for a bad argument value;
        # it is an Exception subclass, so existing broad handlers still match.
        raise ValueError(
            f'Unknown pre-trained model {pre_trained}, choose t5 or bart.')
def test_AbsaSeq2SeqDataset():
    """Smoke test: dev split of the ABSA seq2seq dataset and its loader."""
    processor = AbsaSeq2SeqDataProcessor(["device", "laptops", "rest"],
                                         "service", DATA_DIR)
    print()
    dev_data = processor.data["dev"]
    print(dev_data.keys())
    print(len(list(dev_data.values())[0]))
    for key, values in dev_data.items():
        if type(values) is not int:
            print(key, values[0])
        else:
            print(key, values[0], len(values[0]))
    dataset = AbsaSeq2SeqDataset("dev", processor,
                                 T5TokenizerFast.from_pretrained("t5-base"), 64)
    print(len(dataset))
    # Print one example, then one batch.
    for example in dataset:
        for key, value in example.items():
            print(key, value)
        break
    loader = DataLoader(dataset, 4)
    for batch in loader:
        for key, value in batch.items():
            print(key, value)
        break
def test_RumorDataset():
    """Smoke test: dev split of the rumor dataset and its loader."""
    processor = RumorDataProcessor(
        ["charliehebdo", "ferguson", "germanwings-crash", "ottawashooting"],
        "sydneysiege", DATA_DIR)
    print()
    dev_data = processor.data["dev"]
    print(dev_data.keys())
    print(len(list(dev_data.values())[0]))
    for key, values in dev_data.items():
        if type(values) is not int:
            print(key, values[0])
        else:
            print(key, values[0], len(values[0]))
    dataset = RumorDataset("dev", processor,
                           T5TokenizerFast.from_pretrained("t5-base"), 64)
    print(len(dataset))
    # Print one example, then one batch.
    for example in dataset:
        for key, value in example.items():
            print(key, value)
        break
    loader = DataLoader(dataset, 4)
    for batch in loader:
        for key, value in batch.items():
            print(key, value)
        break
def __init__(self, split='train', raw_dataset=None, rank=-1, topk=-1,
             verbose=True, args=None, mode='train'):
    """NLVR dataset backed by precomputed 36-box feature H5 files.

    :param split: split name; also keys source_to_h5 ('train'/'valid'/'test').
    :param raw_dataset: container whose ``.data`` list is consumed here.
    :param rank: distributed rank (-1 = single process).
    :param topk: when > 0, keep only the first ``topk`` examples.
    :param verbose: print loading diagnostics.
    :param args: namespace providing backbone, n_boxes, use_vision, do_lower_case.
    :param mode: stored as-is; presumably 'train'/'val' — confirm with caller.
    """
    super().__init__()

    self.raw_dataset = raw_dataset
    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.split = split
    if self.verbose:
        print('Data source: ', self.split)

    data = self.raw_dataset.data
    if topk > 0:
        data = data[:topk]
        if self.verbose:
            print(f"Use only {topk} data")

    self.n_gpus = torch.cuda.device_count()
    self.rank = rank
    self.data = data

    if self.verbose:
        # if 'sent' not in self.data_out:
        #     print("# all images:", len(self.data))
        # else:
        print("# all sentences:", len(self.data))

    self.n_boxes = args.n_boxes

    # Tokenizer follows the backbone name: VL-T5 when vision inputs are used,
    # plain T5 otherwise, BART as the remaining option.
    if 't5' in self.args.backbone:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        # BART lacks T5's sentinel tokens: register 100 text and 100 visual
        # <extra_id_*> tokens, highest index first.
        additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
            [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
        special_tokens_dict = {
            'additional_special_tokens': additional_special_tokens
        }
        num_added_toks = self.tokenizer.add_special_tokens(
            special_tokens_dict)

    # Per-split H5 files holding 36 object features per image.
    self.source_to_h5 = {
        'train': nlvr_feature_dir.joinpath(f'train_obj36.h5'),
        'valid': nlvr_feature_dir.joinpath(f'valid_obj36.h5'),
        'test': nlvr_feature_dir.joinpath(f'test_obj36.h5'),
    }
def t5_base_tokenizer_fast(self):
    """Fast tokenizer for the `t5-base` checkpoint."""
    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    return tokenizer
# Wrap every encoded sequence in the trained tokenizer's <s> ... </s>
# special tokens via a BERT-style post-processor, and cap length at 512.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

# NOTE(review): max_position_embeddings / type_vocab_size are BERT-style
# options, while T5 uses relative attention — confirm this config is
# intentional for T5ForConditionalGeneration.
config = T5Config(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Rebind `tokenizer` to the HF-transformers fast wrapper loaded from disk;
# the raw `tokenizers` object configured above is no longer referenced by
# this name from here on.
tokenizer = T5TokenizerFast.from_pretrained(tokenizer_dir, max_len=512)
model = T5ForConditionalGeneration(config=config)
model.num_parameters()

# Line-by-line text datasets for training and validation.
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=f"{data_dir}/train_texts.txt",
    block_size=128,
)
test_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=f"{data_dir}/valid_texts.txt",
    block_size=128,
)
def __init__(self, split='train,valid', raw_dataset=None, rank=-1, topk=-1,
             verbose=True, args=None, mode='train'):
    """GQA dataset merging one or more comma-separated JSON sources.

    :param split: comma-separated source names, each mapping to GQA/<name>.json.
    :param raw_dataset: stored as-is for callers.
    :param rank: distributed rank (-1 = single process).
    :param topk: when > 0, keep only the first ``topk`` examples.
    :param verbose: print loading diagnostics.
    :param args: namespace providing backbone, n_boxes, use_vision, do_lower_case.
    :param mode: stored as-is; presumably 'train'/'val' — confirm with caller.
    """
    super().__init__()
    self.raw_dataset = raw_dataset
    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.sources = split.split(',')
    if self.verbose:
        print('Data sources: ', self.sources)

    if 't5' in self.args.backbone:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        # BART lacks T5's sentinel tokens: register 100 text and 100 visual
        # <extra_id_*> tokens, highest index first.
        additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
            [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
        special_tokens_dict = {
            'additional_special_tokens': additional_special_tokens
        }
        num_added_toks = self.tokenizer.add_special_tokens(
            special_tokens_dict)

    # Merge all sources into one list, tagging each record with its source
    # and remembering which source each image id came from.
    self.img_ids_to_source = {}
    data_info_dicts = []
    for source in self.sources:
        data_info_path = dataset_dir.joinpath(f'GQA/{source}.json')
        with open(data_info_path) as f:
            _data_info_dicts = json.load(f)
            # source_img_ids.append([d['img_id'] for d in _data_info_dicts])
            for _d in _data_info_dicts:
                self.img_ids_to_source[_d['img_id']] = source
                _d['source'] = source
            data_info_dicts.extend(_data_info_dicts)
        if self.verbose:
            print(f"Loaded {len(_data_info_dicts)} data from", source)

    data = data_info_dicts

    self.n_gpus = torch.cuda.device_count()
    self.rank = rank

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    self.data = data
    if self.verbose:
        print("# all sentences:", len(self.data))

    self.n_boxes = args.n_boxes
    # All splits except testdev share the VG-derived feature file.
    self.source_to_featname = {
        'train': 'others', 'valid': 'others',
        'submit': 'others', 'testdev': 'testdev'
    }
    self.featname_to_h5 = {
        'others': vg_dir.joinpath('features/vg_gqa_obj36.h5'),
        'testdev': gqa_dir.joinpath('features/gqa_testdev_obj36.h5'),
    }
import pandas as pd import torch from torch.utils.data import Dataset, DataLoader import pytorch_lightning as pl from transformers import (Adafactor, T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer) MODEL_NAME = "t5-base" tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME) class TranslationDataset(Dataset): def __init__( self, data: pd.DataFrame, tokenizer: T5Tokenizer, text_max_token_len: int = 110, translation_max_token_len: int = 100, ): self.tokenizer = tokenizer self.data = data self.text_max_token_len = text_max_token_len self.translation_max_token_len = translation_max_token_len def __len__(self): return len(self.data) def __getitem__(self, index: int): #Read line of DataFrame
data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer if model_args.tokenizer_name: tokenizer = T5TokenizerFast.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) elif model_args.model_name_or_path: tokenizer = T5TokenizerFast.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.config_name: config = T5Config.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir,
def __init__(self, split='train', raw_dataset=None, rank=-1, topk=-1,
             verbose=True, args=None, mode='train'):
    """Multimodal En->De translation dataset (Multi30k-style raw text files).

    :param split: split name; selects raw/<split>.en/.de and image_splits/<split>.txt.
    :param raw_dataset: stored as-is for callers.
    :param rank: distributed rank (-1 = single process).
    :param topk: when > 0, keep only the first ``topk`` examples.
    :param verbose: print loading diagnostics.
    :param args: namespace providing tokenizer/backbone/vision options.
    :param mode: stored as-is; presumably 'train'/'val' — confirm with caller.
    """
    super().__init__()

    self.raw_dataset = raw_dataset
    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.source = split
    if self.verbose:
        print('Data source: ', self.source)

    # Fall back to the backbone name when no tokenizer is specified.
    if self.args.tokenizer is None:
        self.args.tokenizer = self.args.backbone

    if 't5' in self.args.tokenizer:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.tokenizer:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        if args.use_vis_order_embedding:
            # Register 100 text and 100 visual <extra_id_*> sentinel tokens
            # (highest index first) that BART lacks.
            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {
                'additional_special_tokens': additional_special_tokens
            }
            num_added_toks = self.tokenizer.add_special_tokens(
                special_tokens_dict)

    if self.args.oscar_tags:
        # Load VG Classes
        vg_classes = []
        with open(vg_dir.joinpath('objects_vocab.txt')) as f:
            for obj in f.readlines():
                vg_classes.append(obj.split(',')[0].lower().strip())
        self.vg_classes = vg_classes

    # Parallel raw text plus the image id associated with each sentence pair.
    with open(wmt_data_dir.joinpath(f'raw/{self.source}.en')) as f:
        source_text_list = f.readlines()
    with open(wmt_data_dir.joinpath(f'raw/{self.source}.de')) as f:
        target_text_list = f.readlines()
    with open(
            wmt_data_dir.joinpath(f'image_splits/{self.source}.txt')) as f:
        image_ids = f.readlines()
    assert len(source_text_list) == len(target_text_list)
    assert len(source_text_list) == len(image_ids)

    # One example per aligned (en, de, image) triple.
    data = []
    for source_text, target_text, image_id in zip(source_text_list, target_text_list, image_ids):
        datum = {
            'img_id': image_id.strip().split('.')[0],
            'source_text': source_text.strip(),
            'target_text': target_text.strip()
        }
        data.append(datum)

    if self.verbose:
        print(f"Loaded {len(data)} data from", split)

    self.n_gpus = torch.cuda.device_count()
    self.rank = rank

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    self.data = data
    if self.verbose:
        print("# all sentences:", len(self.data))

    # Flickr30k-derived box-feature H5 files per evaluation split.
    self.source_to_h5 = {
        'train': flickr30k_feature_dir.joinpath('trainval_boxes36.h5'),
        'val': flickr30k_feature_dir.joinpath('trainval_boxes36.h5'),
        'test_2016_flickr': flickr30k_feature_dir.joinpath('trainval_boxes36.h5'),
        'test_2017_flickr': flickr30k_feature_dir.joinpath('test2017_boxes36.h5'),
        'test_2018_flickr': flickr30k_feature_dir.joinpath('test2018_boxes36.h5'),
    }
def __init__(self, split='karpathy_train', raw_dataset=None, rank=-1, topk=-1,
             verbose=True, args=None, mode='train'):
    """COCO captioning dataset over the Karpathy split.

    :param split: e.g. 'karpathy_train'; the suffix after '_' selects the split.
    :param raw_dataset: stored as-is for callers.
    :param rank: distributed rank (-1 = single process).
    :param topk: when > 0, keep only the first ``topk`` examples.
    :param verbose: print loading diagnostics.
    :param args: namespace providing tokenizer/backbone/vision options.
    :param mode: stored as-is; presumably 'train'/'val' — confirm with caller.
    """
    super().__init__()

    self.raw_dataset = raw_dataset
    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.source = split
    if self.verbose:
        print('Data source: ', self.source)

    # Fall back to the backbone name when no tokenizer is specified.
    if self.args.tokenizer is None:
        self.args.tokenizer = self.args.backbone

    if 't5' in self.args.tokenizer:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.tokenizer:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        # Register 100 text and 100 visual <extra_id_*> sentinel tokens
        # (highest index first) that BART lacks.
        additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
            [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
        special_tokens_dict = {
            'additional_special_tokens': additional_special_tokens
        }
        num_added_toks = self.tokenizer.add_special_tokens(
            special_tokens_dict)

    if self.args.oscar_tags:
        # Load VG Classes
        vg_classes = []
        with open(vg_dir.joinpath('objects_vocab.txt')) as f:
            for obj in f.readlines():
                vg_classes.append(obj.split(',')[0].lower().strip())
        self.vg_classes = vg_classes

    data_info_path = dataset_dir.joinpath('COCO/dataset_coco.json')
    with open(data_info_path) as f:
        karpathy_data = json.load(f)

    # Karpathy's 'restval' images are folded into train.
    split_rename = {
        'train': 'train',
        'restval': 'train',
        'val': 'val',
        'test': 'test'
    }

    n_images = 0
    data = []
    for datum in karpathy_data['images']:
        re_split = split_rename[datum['split']]
        # Keep only images whose renamed split matches the requested one,
        # e.g. 'karpathy_train' -> 'train'.
        if re_split != self.source.split('_')[-1]:
            continue
        if re_split == 'train':
            # Training: one example per reference sentence.
            for d in datum['sentences']:
                img_id = datum['filename'].split('.')[0]
                new_datum = {
                    'img_id': img_id,
                    'sent': d['raw'].strip(),
                    'targets': [d['raw'].strip() for d in datum['sentences']],
                    'is_train': True,
                }
                data.append(new_datum)
        else:
            # Evaluation: one example per image, keeping all references.
            img_id = datum['filename'].split('.')[0]
            new_datum = {
                'img_id': img_id,
                # 'sent': d['raw'],
                'targets': [d['raw'].strip() for d in datum['sentences']],
                'is_train': False,
            }
            data.append(new_datum)
        n_images += 1

    if self.verbose:
        print(f"{self.source} has {n_images} images")
        print(f"Loaded {len(data)} data from", split)

    self.n_gpus = torch.cuda.device_count()
    self.rank = rank

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    self.data = data
    if self.verbose:
        print("# all sentences:", len(self.data))

    self.source_to_h5 = {}
    if self.args.max_n_boxes == 36:
        self.source_to_h5.update({
            'train2014': coco_dir.joinpath('features').joinpath('train2014_obj36.h5'),
            'val2014': coco_dir.joinpath('features').joinpath('val2014_obj36.h5'),
        })
def __init__(self, split='train', rank=-1, topk=-1, verbose=True, args=None,
             is_train=True):
    """VCR dataset: expands raw JSONL records into per-task examples in parallel.

    :param split: split name; selects VCR/<split>.jsonl and keys source_to_h5.
    :param rank: distributed rank (-1 = single process).
    :param topk: when > 0, truncate both the raw records and the expanded data.
    :param verbose: print loading diagnostics and task distribution.
    :param args: namespace providing backbone/losses/vision options.
    :param is_train: gates the verbose task-distribution report.
    """
    self.topk = topk
    self.verbose = verbose
    self.args = args

    # Loading datasets to data
    self.source = split
    if self.verbose:
        print('Data sources: ', self.source)

    if 't5' in self.args.backbone:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        if args.use_vis_order_embedding:
            # Register 100 text and 100 visual <extra_id_*> sentinel tokens
            # (highest index first) that BART lacks.
            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {'additional_special_tokens': additional_special_tokens}
            num_added_toks = self.tokenizer.add_special_tokens(special_tokens_dict)

    self.losses = args.losses.split(',')

    data_info_path = dataset_dir.joinpath(f'VCR/{self.source}.jsonl')
    with open(data_info_path) as f:
        data_info_dicts = [json.loads(s) for s in f]
        if self.topk > 0:
            data_info_dicts = data_info_dicts[:self.topk]

    # Each raw record carries the backbone/losses so the worker function can
    # build the right examples.
    for datum in data_info_dicts:
        datum['backbone'] = self.args.backbone
        datum['losses'] = self.losses

    # Expand raw records into task examples across 8 worker processes.
    with Pool(8) as pool:
        if self.verbose:
            data = [datum for _data in tqdm(
                pool.imap(get_datum, data_info_dicts),
                total=len(data_info_dicts), ncols=100) for datum in _data]
        else:
            data = [datum for _data in pool.imap(
                get_datum, data_info_dicts) for datum in _data]

    if self.verbose:
        print(f"Loaded {len(data)} data from", self.source)

    self.n_gpus = torch.cuda.device_count()
    self.rank = rank

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    self.data = data

    # Report the per-task example distribution for text backbones.
    if self.verbose and is_train and ('t5' in self.args.backbone
                                      or 'bart' in self.args.backbone):
        from collections import Counter
        task_counter = Counter()
        for datum in data:
            try:
                task_counter.update([datum['task']])
            except KeyError:
                print(datum)
                exit()
        print(task_counter)
        for k, v in task_counter.items():
            print(k, f'{v/len(data)*100:.1f}%')

    if self.verbose:
        print("# examples:", len(data))

    # Detector boxes and ground-truth (GT) boxes per split.
    self.source_to_h5 = {
        'train': vcr_feature_dir.joinpath(f'train_boxes36.h5'),
        'val': vcr_feature_dir.joinpath(f'val_boxes36.h5'),
        'test': vcr_feature_dir.joinpath(f'test_boxes36.h5'),
        'train_GT': vcr_feature_dir.joinpath(f'train_boxes_GT.h5'),
        'val_GT': vcr_feature_dir.joinpath(f'val_boxes_GT.h5'),
        'test_GT': vcr_feature_dir.joinpath(f'test_boxes_GT.h5'),
    }
    self.n_boxes = args.n_boxes
from torch.utils.data import Dataset, DataLoader from transformers import T5TokenizerFast import torch import yaml with open('config.yml', 'r') as f: config = yaml.safe_load(f) TOKENIZER = T5TokenizerFast.from_pretrained(config['model']['model_name'], do_lower_case=True) class ShapingDataset(Dataset): def __init__(self, texts, summaries): super().__init__() self.texts = texts self.summaries = summaries self.tokenizer = TOKENIZER self.summary_length = config['model']['summary_length'] self.token_length = config['model']['token_length'] def __len__(self): return len(self.texts) def __getitem__(self, item): texts = str(self.texts[item]) summaries = str(self.summaries[item]) texts_enconding = self.tokenizer( texts, padding='max_length',
def __init__(self, split='train', raw_dataset=None, rank=-1, topk=-1,
             verbose=True, args=None, mode='train'):
    """RefCOCOg (umd) referring-expression dataset.

    :param split: split name passed to REFER.getRefIds.
    :param raw_dataset: stored as-is for callers.
    :param rank: distributed rank (-1 = single process).
    :param topk: when > 0, keep only the first ``topk`` examples.
    :param verbose: print loading diagnostics.
    :param args: namespace providing backbone/n_boxes/RefCOCO_GT options.
    :param mode: stored as-is; presumably 'train'/'val' — confirm with caller.
    """
    super().__init__()

    self.raw_dataset = raw_dataset
    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.split = split
    self.sources = split.split(',')
    if self.verbose:
        print('Data sources: ', self.sources)

    if 't5' in self.args.backbone:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(args.backbone)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        # Register 100 text and 100 visual <extra_id_*> sentinel tokens
        # (highest index first) that BART lacks.
        additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
            [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
        special_tokens_dict = {
            'additional_special_tokens': additional_special_tokens
        }
        num_added_toks = self.tokenizer.add_special_tokens(
            special_tokens_dict)

    # mattnet_maskrcnn_detections_path = refcoco_dir.joinpath(
    #     'detections/refcocog_umd/res101_coco_minus_refer_notime_dets.json')
    # with open(mattnet_maskrcnn_detections_path) as f:
    #     mattnet_maskrcnn_detections = json.load(f)

    # One example per (referring sentence, referred box) pair.
    data = []
    self.refer = REFER('refcocog', 'umd',
                       img_dir=coco_img_dir,
                       ref_dir=refcoco_dir,
                       verbose=verbose)
    ref_ids = self.refer.getRefIds(split=split)
    for ref_id in ref_ids:
        ref = self.refer.Refs[ref_id]
        image_id = ref["image_id"]
        ref_id = ref["ref_id"]
        refBox = self.refer.getRefBox(ref_id)
        for sent, sent_id in zip(ref["sentences"], ref["sent_ids"]):
            caption = sent["raw"]
            data.append({
                "caption": caption,
                "sent_id": sent_id,
                "image_id": image_id,
                "refBox": refBox,
                "ref_id": ref_id,
            })

    self.n_gpus = torch.cuda.device_count()
    self.rank = rank

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    self.data = data
    if self.verbose:
        print("# all sentences:", len(self.data))

    self.n_boxes = args.n_boxes
    # Ground-truth boxes for train; GT or MAttNet-detected boxes for
    # val/test depending on args.RefCOCO_GT.
    self.source_to_h5 = {
        'train': refcocog_feature_dir.joinpath(f'train_boxes_GT.h5')
    }
    if self.args.RefCOCO_GT:
        self.source_to_h5['val'] = refcocog_feature_dir.joinpath(
            f'val_boxes_GT.h5')
        self.source_to_h5['test'] = refcocog_feature_dir.joinpath(
            f'test_boxes_GT.h5')
    else:
        self.source_to_h5['val'] = refcocog_feature_dir.joinpath(
            f'val_boxes_mattnet.h5')
        self.source_to_h5['test'] = refcocog_feature_dir.joinpath(
            f'test_boxes_mattnet.h5')
def __init__(self, split='vg', rank=-1, topk=-1, verbose=True, args=None,
             is_train=True):
    """Multi-source pretraining dataset (LXMERT-style JSON sources).

    :param split: comma-separated image sources, each mapping to lxmert/<name>.json.
    :param rank: distributed rank (-1 = single process).
    :param topk: when > 0, truncate the image-level data before expansion.
    :param verbose: print loading diagnostics and task distribution.
    :param args: namespace with losses/backbone/itm_cocoonly/caption_only etc.
    :param is_train: forwarded into each datum and gates the task report.
    """
    self.topk = topk
    self.verbose = verbose
    self.args = args

    # Loading datasets to data
    self.sources = split.split(',')
    if self.verbose:
        print('Data sources: ', self.sources)

    # Answer Table from LXMERT (Could be removed)
    self.answer_table = AnswerTable()
    if self.verbose:
        print("Load an answer table of size %d." % (len(self.answer_table.ans2id_map())))

    self.img_ids_to_source = {}

    losses = args.losses.split(',')

    # Merge all sources, tagging each record with the flags the parallel
    # worker (get_datum) needs to build examples.
    data = []
    for img_source in self.sources:
        data_info_path = dataset_dir.joinpath(f'lxmert/{img_source}.json')
        with open(data_info_path) as f:
            _data = json.load(f)
            if self.verbose:
                print(f"Loaded {len(_data)} data from", img_source)
            # source_img_ids.append([d['img_id'] for d in _data])
            for datum in _data:
                self.img_ids_to_source[datum['img_id']] = img_source
                # datum['img_source'] = img_source
                datum['args'] = args
                datum['is_train'] = is_train
                datum['caption_only'] = args.caption_only

                datum['lm'] = 'lm' in losses
                datum['qa'] = 'qa' in losses
                datum['ground_caption'] = 'ground_caption' in losses
                datum['refer'] = 'refer' in losses
                datum['itm'] = 'itm' in losses
                datum['caption'] = 'caption' in losses

                datum['backbone'] = self.args.backbone

            data.extend(_data)

    # Modify the answers
    # Remap each answer through the answer table; drop answers that are not
    # in the used vocabulary.
    if 'qa' in args.losses:
        for datum in data:
            labelf = datum['labelf']
            for _qa_source, labels in labelf.items():
                for label in labels:
                    for ans in list(label.keys()):
                        new_ans = self.answer_table.convert_ans(ans)
                        if self.answer_table.used(new_ans):
                            if ans != new_ans:
                                label[new_ans] = label.pop(ans)
                        else:
                            label.pop(ans)

    if self.verbose:
        print("# images:", len(data))

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    if 'qa' in args.losses:
        self.evaluator = QAEvaluator(data)

    # Expand image-level records into per-task examples across 8 workers.
    with Pool(8) as pool:
        if self.verbose:
            data = [datum for _data in tqdm(
                pool.imap(get_datum, data),
                total=len(data), ncols=100,
                desc="Creating pretrainig data examples") for datum in _data]
        else:
            data = [datum for _data in pool.imap(
                get_datum, data) for datum in _data]

    # Image-text-matching negatives are drawn only from caption examples.
    if self.args.itm_cocoonly:
        caption_sources = ['mscoco']
    else:
        caption_sources = ['mscoco', 'vg']
    self.data_captions = [datum for datum in data if datum['text_source'] in caption_sources]
    self.n_data_captions = len(self.data_captions)
    if self.verbose:
        print('# itm data:', self.n_data_captions)

    self.data = data
    self.n_data = len(self.data)

    # Report the per-task example distribution.
    if self.verbose and is_train:
        from collections import Counter
        task_counter = Counter()
        for datum in data:
            try:
                task_counter.update([datum['task']])
            except KeyError:
                print(datum)
                exit()
        print(task_counter)
        for k, v in task_counter.items():
            print(k, f'{v/len(data)*100:.1f}%')

    if self.verbose:
        print("# examples:", len(data))

    self.source_to_h5 = {
        'mscoco_resplit_train_train2014': coco_dir.joinpath('features').joinpath('train2014_obj36.h5'),
        'mscoco_resplit_train_val2014': coco_dir.joinpath('features').joinpath('val2014_obj36.h5'),
        'mscoco_resplit_val': coco_dir.joinpath('features').joinpath('resplit_val_obj36.h5'),
        'vgnococo': vg_dir.joinpath('features').joinpath('vg_gqa_obj36.h5'),
    }
    self.n_boxes = args.n_boxes

    if 't5' in self.args.backbone:
        if self.args.use_vision:
            # self.tokenizer = VLT5Tokenizer.from_pretrained(
            #     args.backbone, do_lower_case=args.do_lower_case)
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone, do_lower_case=args.do_lower_case)
        else:
            # self.tokenizer = T5Tokenizer.from_pretrained(
            #     args.backbone, do_lower_case=args.do_lower_case)
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone, do_lower_case=args.do_lower_case)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(args.backbone)

        # Register 100 text and 100 visual <extra_id_*> sentinel tokens
        # (highest index first) that BART lacks.
        additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
            [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
        special_tokens_dict = {'additional_special_tokens': additional_special_tokens}
        self.tokenizer.add_special_tokens(special_tokens_dict)
def __init__(self, split='train', raw_dataset=None, rank=-1, topk=-1, verbose=True, args=None, mode='train'):
    """VQA fine-tuning dataset.

    Loads the json annotation sources named in ``split`` (comma-separated),
    records which feature file each image id comes from, and configures the
    backbone-appropriate tokenizer.

    Args:
        split: comma-separated list of VQA data sources.
        raw_dataset: companion raw-dataset object (kept on the instance).
        rank: distributed rank (stored; not used for sharding here).
        topk: if > 0, keep only the first ``topk`` entries.
        verbose: print loading statistics.
        args: experiment config namespace.
        mode: 'train' / eval mode flag (stored for use by other methods).
    """
    super().__init__()

    self.raw_dataset = raw_dataset
    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.sources = split.split(',')
    if self.verbose:
        print('Data sources: ', self.sources)

    if 't5' in self.args.backbone:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        if args.use_vis_order_embedding:
            # BART lacks T5's <extra_id_*> sentinels; register them (plus
            # visual sentinels) so both backbones share the same specials.
            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {'additional_special_tokens': additional_special_tokens}
            # FIX: dropped the unused `num_added_toks` local.
            self.tokenizer.add_special_tokens(special_tokens_dict)

    self.answer_normalizer = VQAEvaluator()

    self.img_ids_to_source = {}
    data_info_dicts = []
    for source in self.sources:
        data_info_path = dataset_dir.joinpath(f'vqa/{source}.json')
        with open(data_info_path) as f:
            _data_info_dicts = json.load(f)
            for _d in _data_info_dicts:
                # Route each image id to the feature file it lives in.
                if 'vg_qa_full' == source:
                    self.img_ids_to_source[_d['img_id']] = 'vg'
                elif 'train2014' in _d['img_id']:
                    self.img_ids_to_source[_d['img_id']] = 'train2014'
                elif 'val2014' in _d['img_id']:
                    self.img_ids_to_source[_d['img_id']] = 'val2014'
                else:
                    self.img_ids_to_source[_d['img_id']] = source
                _d['source'] = source

            data_info_dicts.extend(_data_info_dicts)
        if self.verbose:
            print(f"Loaded {len(_data_info_dicts)} data from", source)

    data = data_info_dicts

    self.n_gpus = torch.cuda.device_count()

    self.rank = rank

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    self.data = data

    if self.verbose:
        print("# all sentences:", len(self.data))

    self.n_boxes = args.n_boxes

    # Map each split / image-source name to its 36-box feature h5 file.
    # FIX: removed pointless f-string prefixes from constant filenames.
    self.source_to_h5 = {
        'train': coco_feature_dir.joinpath('train2014_obj36.h5'),
        'minival': coco_feature_dir.joinpath('val2014_obj36.h5'),
        'nominival': coco_feature_dir.joinpath('val2014_obj36.h5'),
        'test': coco_feature_dir.joinpath('test2015_obj36.h5'),

        'vg': dataset_dir.joinpath('VG/features').joinpath('vg_gqa_obj36.h5'),

        'train2014': coco_feature_dir.joinpath('train2014_obj36.h5'),
        'val2014': coco_feature_dir.joinpath('val2014_obj36.h5'),
    }
def __init__(self, split='train', raw_dataset=None, rank=-1, topk=-1, verbose=True, args=None, mode='train'):
    """VCR dataset.

    Loads the jsonl annotation sources named in ``split`` (comma-separated)
    and configures the backbone-appropriate tokenizer.

    Args:
        split: comma-separated list of VCR data sources.
        raw_dataset: accepted for signature parity with the sibling
            datasets. NOTE(review): it is never stored here — confirm
            whether that is intentional.
        rank: distributed rank (stored; not used for sharding here).
        topk: if > 0, keep only the first ``topk`` entries.
        verbose: print loading statistics.
        args: experiment config namespace.
        mode: 'train' / eval mode flag (stored for use by other methods).
    """
    super().__init__()

    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.mode = mode

    # Loading datasets to data
    self.split = split
    self.sources = split.split(',')
    if self.verbose:
        print('Data sources: ', self.sources)

    if 't5' in self.args.backbone:
        if self.args.use_vision:
            self.tokenizer = VLT5TokenizerFast.from_pretrained(
                args.backbone,
                max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
        else:
            self.tokenizer = T5TokenizerFast.from_pretrained(
                args.backbone,
                max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)
    elif 'bart' in self.args.backbone:
        self.tokenizer = BartTokenizer.from_pretrained(
            args.backbone,
            # max_length=self.args.max_text_length,
            do_lower_case=self.args.do_lower_case)

        # BART lacks T5's <extra_id_*> sentinels; register them (plus
        # visual sentinels) so both backbones share the same specials.
        additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
            [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
        special_tokens_dict = {
            'additional_special_tokens': additional_special_tokens
        }
        # FIX: dropped the unused `num_added_toks` local.
        self.tokenizer.add_special_tokens(special_tokens_dict)

    self.img_ids_to_source = {}
    data_info_dicts = []
    for source in self.sources:
        data_info_path = dataset_dir.joinpath(f'VCR/{source}.jsonl')
        with open(data_info_path) as f:
            # jsonl: one json object per line.
            _data_info_dicts = [json.loads(s) for s in f]
            for _d in _data_info_dicts:
                self.img_ids_to_source[_d['img_id']] = source
                _d['source'] = source

            data_info_dicts.extend(_data_info_dicts)
        if self.verbose:
            print(f"Loaded {len(_data_info_dicts)} data from", source)

    data = data_info_dicts

    self.rank = rank

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    self.data = data

    if self.verbose:
        print("# all sentences:", len(self.data))

    self.n_boxes = args.n_boxes

    # Map each split to its feature h5 file; *_GT variants hold
    # ground-truth-box features.
    # FIX: removed pointless f-string prefixes from constant filenames.
    self.source_to_h5 = {
        'train': vcr_feature_dir.joinpath('train_boxes36.h5'),
        'val': vcr_feature_dir.joinpath('val_boxes36.h5'),
        'test': vcr_feature_dir.joinpath('test_boxes36.h5'),

        'train_GT': vcr_feature_dir.joinpath('train_boxes_GT.h5'),
        'val_GT': vcr_feature_dir.joinpath('val_boxes_GT.h5'),
        'test_GT': vcr_feature_dir.joinpath('test_boxes_GT.h5'),
    }
def get_tokenizer(self, opt):
    """Load the T5 fast tokenizer named by ``opt['t5_model_arch']``.

    FIX: the previous call passed ``truncation=True`` to
    ``from_pretrained``, but truncation is an encode-time argument of
    ``tokenizer.__call__`` / ``encode``, not a load-time option — it was
    silently ignored. It is dropped here; pass ``truncation=True`` when
    actually tokenizing text instead.
    """
    return T5TokenizerFast.from_pretrained(opt['t5_model_arch'])