Example #1
 def t5_base_tokenizer_fast(self):
     return T5TokenizerFast.from_pretrained("t5-base")
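A minimal usage sketch (not part of the fixture above), assuming the same "t5-base" checkpoint: the fast tokenizer it returns encodes text to input ids and decodes them back.

    from transformers import T5TokenizerFast

    tokenizer = T5TokenizerFast.from_pretrained("t5-base")
    ids = tokenizer("translate English to German: Hello, world!").input_ids
    # Decoding reproduces the text followed by the </s> end-of-sequence token.
    print(tokenizer.decode(ids))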
Example #2
 def get_tokenizer(self, opt):
     return T5TokenizerFast.from_pretrained(opt['t5_model_arch'],
                                            truncation=True)
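For context, truncation is normally applied when the tokenizer is called rather than at load time; a hedged sketch, with "t5-base" standing in for opt['t5_model_arch']:

    from transformers import T5TokenizerFast

    tokenizer = T5TokenizerFast.from_pretrained("t5-base")  # placeholder checkpoint
    # Truncation takes effect per call, together with max_length.
    encoded = tokenizer("a long passage " * 50, truncation=True, max_length=16)
    print(len(encoded.input_ids))  # at most 16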
Example #3
    def __init__(self,
                 split='train',
                 raw_dataset=None,
                 rank=-1,
                 topk=-1,
                 verbose=True,
                 args=None,
                 mode='train'):
        super().__init__()

        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Load the annotation files for each source into data
        self.split = split
        self.sources = split.split(',')
        if self.verbose:
            print('Data sources: ', self.sources)

        if 't5' in self.args.backbone:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)

        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                    [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {
                'additional_special_tokens': additional_special_tokens
            }
            num_added_toks = self.tokenizer.add_special_tokens(
                special_tokens_dict)

        self.img_ids_to_source = {}
        data_info_dicts = []
        for source in self.sources:
            data_info_path = dataset_dir.joinpath(f'VCR/{source}.jsonl')
            with open(data_info_path) as f:
                _data_info_dicts = [json.loads(s) for s in f]
                for _d in _data_info_dicts:
                    self.img_ids_to_source[_d['img_id']] = source
                    _d['source'] = source

                data_info_dicts.extend(_data_info_dicts)
            if self.verbose:
                print(f"Loaded {len(_data_info_dicts)} data from", source)

        data = data_info_dicts

        self.rank = rank

        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        self.data = data

        if self.verbose:
            print("# all sentences:", len(self.data))

        self.n_boxes = args.n_boxes

        self.source_to_h5 = {
            'train': vcr_feature_dir.joinpath('train_boxes36.h5'),
            'val': vcr_feature_dir.joinpath('val_boxes36.h5'),
            'test': vcr_feature_dir.joinpath('test_boxes36.h5'),
            'train_GT': vcr_feature_dir.joinpath('train_boxes_GT.h5'),
            'val_GT': vcr_feature_dir.joinpath('val_boxes_GT.h5'),
            'test_GT': vcr_feature_dir.joinpath('test_boxes_GT.h5'),
        }
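One step this excerpt does not show: once add_special_tokens has extended the BART vocabulary with the <vis_extra_id_*> tokens, the matching model's embedding matrix must be resized to cover them. A minimal sketch assuming a stock facebook/bart-base checkpoint (an assumption; the actual model setup lives elsewhere in the repository):

    from transformers import BartTokenizer, BartForConditionalGeneration

    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
    tokenizer.add_special_tokens({
        'additional_special_tokens': [f'<vis_extra_id_{i}>' for i in range(99, -1, -1)]
    })

    model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
    # Grow the embedding table so the newly added special tokens have rows.
    model.resize_token_embeddings(len(tokenizer))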
Example #4
    def __init__(self,
                 hparams: argparse.Namespace,
                 num_labels=None,
                 mode="base",
                 config=None,
                 tokenizer=None,
                 model=None,
                 **config_kwargs):
        """Initialize a model, tokenizer and config."""
        super().__init__()
        # TODO: move to self.save_hyperparameters()
        # self.save_hyperparameters()
        # can also expand arguments into trainer signature for easier reading

        self.save_hyperparameters(hparams)
        self.step_count = 0
        self.output_dir = Path(self.hparams.output_dir)
        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        if config is None:
            self.config = AutoConfig.from_pretrained(
                self.hparams.config_name if self.hparams.config_name else
                self.hparams.model_name_or_path,
                **({
                    "num_labels": num_labels
                } if num_labels is not None else {}),
                cache_dir=cache_dir,
                **config_kwargs,
            )
            if self.hparams.tokenizer_name == "t5":
                self.config.vocab_size = hparams.vocab_size
                self.config.decoder_start_token_id = 1
                self.config.eos_token_id = 2
                self.config.pad_token_id = 3
            print("pretrained", self.config)
        else:
            self.config: PretrainedConfig = config

        extra_model_params = ("encoder_layerdrop", "decoder_layerdrop",
                              "dropout", "attention_dropout")
        for p in extra_model_params:
            if getattr(self.hparams, p, None):
                assert hasattr(
                    self.config,
                    p), f"model config doesn't have a `{p}` attribute"
                setattr(self.config, p, getattr(self.hparams, p))

        if tokenizer is None:
            if self.hparams.tokenizer_name and self.hparams.tokenizer_name == "t5" and self.hparams.vocab_file:
                from transformers import T5TokenizerFast, T5Tokenizer
                print(self.hparams.vocab_file)
                self.tokenizer = T5TokenizerFast(self.hparams.vocab_file)
                print("custom tokenizer", self.tokenizer)
            elif self.hparams.tokenizer_name and self.hparams.tokenizer_name == "pegasus" and self.hparams.vocab_file:
                from transformers import PegasusTokenizerFast, PegasusTokenizer
                print(self.hparams.vocab_file)
                self.tokenizer = PegasusTokenizerFast(self.hparams.vocab_file)
                print("custom tokenizer", self.tokenizer)
            else:
                self.tokenizer = AutoTokenizer.from_pretrained(
                    self.hparams.tokenizer_name if self.hparams.tokenizer_name
                    else self.hparams.model_name_or_path,
                    cache_dir=cache_dir,
                )
        else:
            self.tokenizer: PreTrainedTokenizer = tokenizer
        self.model_type = MODEL_MODES[mode]
        if model is None:
            # self.model = self.model_type.from_pretrained(
            #     self.hparams.model_name_or_path,
            #     from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            #     config=self.config,
            #     cache_dir=cache_dir,
            # )
            print(self.config)
            self.model = self.model_type.from_config(
                # self.hparams.model_name_or_path,
                # from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
                config=self.config,
                # cache_dir=cache_dir,
            )
            print(self.model)
        else:
            self.model = model
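When tokenizer_name == "t5" with a custom vocab_file, the code above hard-codes decoder_start_token_id, eos_token_id, and pad_token_id on the config; those values only make sense if they agree with the ids the custom SentencePiece vocabulary actually assigns. A hedged sanity-check sketch ("spiece.model" is a placeholder path, not taken from the original script):

    from transformers import T5TokenizerFast

    # Build the tokenizer directly from the SentencePiece file, as the module does.
    tokenizer = T5TokenizerFast("spiece.model")  # placeholder for --vocab_file
    # These should match the eos_token_id / pad_token_id hard-coded on the config.
    print(tokenizer.eos_token_id, tokenizer.pad_token_id)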
Example #5
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension,
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    if model_args.tokenizer_name:
        tokenizer = T5TokenizerFast.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = T5TokenizerFast.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.config_name:
        config = T5Config.from_pretrained(model_args.config_name,
                                          cache_dir=model_args.cache_dir,
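A side note on the tokenizer calls above: use_fast is an AutoTokenizer.from_pretrained argument; when T5TokenizerFast.from_pretrained is invoked directly, the fast implementation is returned regardless, so the flag is redundant there. An equivalent Auto-based sketch ("t5-base" is a placeholder checkpoint):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-base", use_fast=True)
    print(type(tokenizer).__name__)  # T5TokenizerFast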
Example #6
    def __init__(self,
                 split='train',
                 raw_dataset=None,
                 rank=-1,
                 topk=-1,
                 verbose=True,
                 args=None,
                 mode='train'):
        super().__init__()

        self.raw_dataset = raw_dataset
        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.mode = mode

        # Load the raw dataset split into data
        self.split = split
        if self.verbose:
            print('Data source: ', self.split)

        data = self.raw_dataset.data

        if topk > 0:
            data = data[:topk]
            if self.verbose:
                print(f"Use only {topk} data")

        self.n_gpus = torch.cuda.device_count()

        self.rank = rank

        self.data = data

        if self.verbose:
            # if 'sent' not in self.data_out:
            #     print("# all images:", len(self.data))
            # else:
            print("# all sentences:", len(self.data))

        self.n_boxes = args.n_boxes

        if 't5' in self.args.backbone:
            if self.args.use_vision:
                self.tokenizer = VLT5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)
            else:
                self.tokenizer = T5TokenizerFast.from_pretrained(
                    args.backbone,
                    # max_length=self.args.max_text_length,
                    do_lower_case=self.args.do_lower_case)

        elif 'bart' in self.args.backbone:
            self.tokenizer = BartTokenizer.from_pretrained(
                args.backbone,
                # max_length=self.args.max_text_length,
                do_lower_case=self.args.do_lower_case)

            additional_special_tokens = [f'<extra_id_{i}>' for i in range(100-1, -1, -1)] + \
                    [f'<vis_extra_id_{i}>' for i in range(100-1, -1, -1)]
            special_tokens_dict = {
                'additional_special_tokens': additional_special_tokens
            }
            num_added_toks = self.tokenizer.add_special_tokens(
                special_tokens_dict)

        self.source_to_h5 = {
            'train': nlvr_feature_dir.joinpath('train_obj36.h5'),
            'valid': nlvr_feature_dir.joinpath('valid_obj36.h5'),
            'test': nlvr_feature_dir.joinpath('test_obj36.h5'),
        }