Example #1
def run(self):
        """
        CORPUS/xxx.txt, DATA/xxx.vocab, DATA/xxx.model -> DATA/xxx.txt
        """
        vocab_file = os.path.join(ACE_ROOT,
                                  '%s.vocab' % self.config.model_prefix)
        model_file = os.path.join(ACE_ROOT,
                                  '%s.model' % self.config.model_prefix)

        self.__create_text()
        if not os.path.isfile(vocab_file) or not os.path.isfile(model_file):
            self.__create_vocab()

        tokenizer = AlbertTokenizer(vocab_file=vocab_file,
                                    model_file=model_file,
                                    do_lower_case=self.config.do_lower_case,
                                    remove_space=self.config.remove_space,
                                    keep_accents=self.config.keep_accents,
                                    bos_token=self.config.bos_token,
                                    eos_token=self.config.eos_token,
                                    unk_token=self.config.unk_token,
                                    sep_token=self.config.sep_token,
                                    pad_token=self.config.pad_token,
                                    cls_token=self.config.cls_token,
                                    mask_token=self.config.mask_token)
        for text_file in tqdm(FileUtil.file_list(self.config.corpus_dir),
                              desc='create pretraining data files'):
            if text_file.endswith('.txt'):
                data_file = os.path.join(self.config.data_dir,
                                         os.path.basename(text_file))
                with open(text_file, 'r') as f, open(data_file, 'w') as fw:
                    for line in f.read().splitlines():
                        tokens = tokenizer.tokenize(line)
                        fw.write(' '.join(tokens) + '\n')
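For context, a minimal sketch of the per-line CORPUS -> DATA transform performed above, using the stock Hugging Face AlbertTokenizer (an assumption; the class constructed above also takes a custom model_file argument):

# Sketch only: demonstrates tokenize-then-space-join, as in the loop above.
from transformers import AlbertTokenizer

tok = AlbertTokenizer.from_pretrained("albert-base-v2")
line = "I was born in 92000, and this is false."
print(" ".join(tok.tokenize(line)))
# -> SentencePiece output, pieces such as "▁i ▁was ▁born ▁in ▁9 2000 , ..."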
Example #2
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param kwargs:
        :return: Tokenizer
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            if "albert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "RobertaTokenizer"
            elif "distilbert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLNetTokenizer"
            else:
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                    f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                    f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                    f"XLNetTokenizer.")
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "RobertaTokenizer":
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DistilBertTokenizer":
            ret = DistilBertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "BertTokenizer":
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                 keep_accents=True,
                                                 **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
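A hypothetical call site for this load() (assuming it lives on a FARM-style Tokenizer facade class):

# Sketch only; `Tokenizer` is the enclosing class of load().
tokenizer = Tokenizer.load("bert-base-uncased")  # class inferred from the name
tokenizer = Tokenizer.load("my-local-checkpoint",  # hypothetical path; the name gives no hint,
                           tokenizer_class="BertTokenizer")  # so the class is set explicitly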
Example #3
def test2():
    args = Args()
    ner_data_processor = TransformerNerDataProcessor()
    conll_2003 = Path(
        __file__).resolve().parent.parent.parent / "test_data/conll-2003"
    ner_data_processor.set_data_dir(conll_2003)
    labels, label2idx = ner_data_processor.get_labels(default='roberta')
    # train_examples = roberta_ner_data_processor.get_train_examples()
    train_examples = ner_data_processor.get_test_examples()
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    # tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    features = transformer_convert_data_to_features(args,
                                                    train_examples[:5],
                                                    label2idx,
                                                    tokenizer,
                                                    max_seq_len=10)

    model = AlbertNerModel.from_pretrained("albert-base-v2",
                                           num_labels=len(label2idx))

    for idx, each_batch in enumerate(
            ner_data_loader(features, batch_size=5, task='test', auto=True)):
        original_mask = each_batch[1].numpy()
        print(original_mask, original_mask.shape)
        inputs = batch_to_model_inputs(each_batch)
        with torch.no_grad():
            logits, flatted_logits, loss = model(**inputs)
        logits = logits.numpy()
        print(logits)
        print(logits.shape)
        break
Example #4
 def __init__(self, args, device='cpu'):
     print(args.bert_model)
     self.tokenizer = AlbertTokenizer.from_pretrained(args.bert_model)
     self.data_dir = args.data_dir
     file_list = get_json_file_list(args.data_dir)
     self.data = []
     #max_article_len = 0
     for file_name in file_list:
         with open(file_name, 'r') as f:
             data = json.loads(f.read())
         data['high'] = 0
         if ('high' in file_name):
             data['high'] = 1
         self.data.append(data)
         #max_article_len = max(max_article_len, len(nltk.word_tokenize(data['article'])))
     self.data_objs = []
     high_cnt = 0
     middle_cnt = 0
     for sample in self.data:
         high_cnt += sample['high']
         middle_cnt += (1 - sample['high'])
         self.data_objs += self._create_sample(sample)
         #break
     print('high school sample:', high_cnt)
     print('middle school sample:', middle_cnt)
     for i in range(len(self.data_objs)):
         self.data_objs[i].convert_tokens_to_ids(self.tokenizer)
         #break
     torch.save(self.data_objs, args.save_name)
Example #5
    def test_full_tokenizer(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [48, 25, 21, 1289])

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [
            "▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this",
            "▁is", "▁fal", "s", "é", "."
        ])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                "▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and",
                "▁this", "▁is", "▁fal", "s", "<unk>", "."
            ],
        )
Example #6
    def __init__(self, data_path, tokenizer, transforms, args):
        self.args = args
        self.data_dir = os.path.dirname(data_path)
        self.data = [json.loads(l) for l in open(data_path)]

        self.max_seq_len = args.max_seq_len  # 512
        self.max_seq_len -= args.num_image_embeds  # 512 - #img_embeds

        self.seq_len = args.seq_len
        self.transforms = transforms

        self.total_len = self.seq_len + self.args.num_image_embeds + 3
        self._tril_matrix = torch.tril(
            torch.ones((self.total_len, self.total_len), dtype=torch.long))

        self.tokenizer = tokenizer  # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased').tokenize

        if args.bert_model == "albert-base-v2":
            self.albert_tokenizer = AlbertTokenizer.from_pretrained(
                args.bert_model)
            self.vocab_stoi = self.albert_tokenizer.get_vocab()  # <unk>, <pad>
            self.vocab_len = len(self.vocab_stoi)  # 30000

        elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
            self.BertTokenizer = AutoTokenizer.from_pretrained(args.bert_model)
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 28996

        elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
            self.BertTokenizer = AutoTokenizer.from_pretrained(args.bert_model)
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 30522

        elif args.bert_model == "bert-small-scratch":
            self.BertTokenizer = BertTokenizer.from_pretrained(
                "google/bert_uncased_L-4_H-512_A-8")
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 30522

        elif args.bert_model == "bert-base-scratch":
            self.BertTokenizer = BertTokenizer.from_pretrained(
                "bert-base-uncased")
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 30522

        # elif args.bert_model == "load_pretrained_model":
        #     self.BertTokenizer = BertTokenizer.from_pretrained(args.init_model)
        #     self.vocab_stoi = self.BertTokenizer.vocab
        #     self.vocab_len = len(self.vocab_stoi)  # 30522

        else:  # BERT-base, small, tiny
            self.BertTokenizer = BertTokenizer.from_pretrained(args.bert_model)
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 30522
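The vocab access differs by design: SentencePiece-based tokenizers such as ALBERT expose the vocabulary through get_vocab(), while WordPiece-based BERT tokenizers carry a vocab mapping directly. A minimal sketch of both access paths (model names as in the branches above):

from transformers import AlbertTokenizer, BertTokenizer

albert = AlbertTokenizer.from_pretrained("albert-base-v2")
bert = BertTokenizer.from_pretrained("bert-base-uncased")
print(len(albert.get_vocab()))  # 30000
print(len(bert.vocab))          # 30522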
Example #7
def test(
    args
):  # Load a trained model that you have fine-tuned (evaluation runs on GPU)
    processor = data_utils.AscProcessor()
    label_list = processor.get_labels()
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    eval_examples = processor.get_test_examples(args.data_dir)
    eval_features = data_utils.convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, "asc")

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask,
                              all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    model = torch.load(os.path.join(args.output_dir, "model.pt"))
    model.cuda()
    model.eval()

    full_logits = []
    full_label_ids = []
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.cuda() for t in batch)
        input_ids, segment_ids, input_mask, label_ids = batch

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.cpu().numpy()

        full_logits.extend(logits.tolist())
        full_label_ids.extend(label_ids.tolist())

    output_eval_json = os.path.join(args.output_dir, "predictions.json")
    with open(output_eval_json, "w") as fw:
        json.dump({"logits": full_logits, "label_ids": full_label_ids}, fw)
Example #8
    def __init__(self, data_path, tokenizer, transforms, args, is_train=True):
        self.args = args
        self.data_dir = os.path.dirname(data_path)
        self.data = [json.loads(line) for line in open(data_path)]

        self.num_image_embeds = args.num_image_embeds
        self.seq_len = args.seq_len
        self.transforms = transforms

        self.is_train = is_train

        self.tokenizer = tokenizer

        if args.bert_model == "albert-base-v2":
            self.albert_tokenizer = AlbertTokenizer.from_pretrained(
                args.bert_model)
            self.vocab_stoi = self.albert_tokenizer.get_vocab()  # <unk>, <pad>
            self.vocab_len = len(self.vocab_stoi)  # 30000

        elif args.bert_model == "emilyalsentzer/Bio_ClinicalBERT":
            self.BertTokenizer = AutoTokenizer.from_pretrained(args.bert_model)
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 28996

        elif args.bert_model == "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12":
            self.BertTokenizer = AutoTokenizer.from_pretrained(args.bert_model)
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 30522

        elif args.bert_model == "bert-small-scratch":
            self.BertTokenizer = BertTokenizer.from_pretrained(
                "google/bert_uncased_L-4_H-512_A-8")
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 30522

        elif args.bert_model == "bert-base-scratch":
            self.BertTokenizer = BertTokenizer.from_pretrained(
                "bert-base-uncased")
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 30522

        else:  # BERT-base, small, tiny
            self.BertTokenizer = BertTokenizer.from_pretrained(args.bert_model)
            self.vocab_stoi = self.BertTokenizer.vocab
            self.vocab_len = len(self.vocab_stoi)  # 30522
Example #9
    def test_sequence_builders(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
            tokenizer.sep_token_id
        ]
Example #10
def main(dataset_dir):
    try:
        cfg = Box.from_yaml(filename=dataset_dir / 'config.yaml')
    except FileNotFoundError:
        print(f'[!] {dataset_dir} must be a directory containing config.yaml')
        exit(1)
    print(f'[-] Datasets will be saved to {dataset_dir}\n')

    output_files = ['train.pkl', 'dev.pkl']
    if any([(dataset_dir / p).exists() for p in output_files]):
        print('[!] Directory already contains saved dataset')
        exit(2)

    data_dir = Path(cfg.data_dir)
    data = {
        'train': load_data(data_dir / 'train-v2.0.json'),
        'dev': load_data(data_dir / 'dev-v2.0.json'),
    }
    print()

    tokenizer = AlbertTokenizer.from_pretrained(**cfg.tokenizer)
    data = {k: tokenize(k, v, tokenizer) for k, v in data.items()}

    create_dataset(data, dataset_dir)
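Because from_pretrained(**cfg.tokenizer) unpacks the config section directly, the tokenizer block of config.yaml presumably holds from_pretrained keyword arguments. A hypothetical shape (keys are illustrative, not taken from the real config):

from transformers import AlbertTokenizer

cfg_tokenizer = {
    "pretrained_model_name_or_path": "albert-base-v2",
    "do_lower_case": True,  # illustrative; the actual config may differ
}
tokenizer = AlbertTokenizer.from_pretrained(**cfg_tokenizer)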
Example #11
    def test_full_tokenizer(self):
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test'])

        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [48, 25, 21, 1289])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [
            u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and',
            u'▁this', u'▁is', u'▁fal', u's', u'é', u'.'
        ])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, [
            '▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this',
            '▁is', '▁fal', 's', '<unk>', '.'
        ])
Example #12
 def get_tokenizer(self, **kwargs):
     return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
Example #13
    def setUp(self):
        super(AlbertTokenizationTest, self).setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)
Example #14
    # ------------------------------------------------#
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # ------------------------------------------------#
    experiment = 'conceptnet'
    # ------------------------------------------------#
    # print('load_data', args.train_file_name)
    # train_data = load_data(experiment, args.train_file_name, type='json')

    print('load_data', args.trial_file_name)
    devlp_data = load_data(experiment, args.trial_file_name, type='json')

    print('load_vocab', args.bert_vocab_dir)
    tokenizer = AlbertTokenizer.from_pretrained(args.bert_vocab_dir)
    # ------------------------------------------------#

    # ------------------------------------------------#
    # print('make dataloader ...')
    # if args.mission == 'train':
    #     train_dataloader = make_dataloader(
    #         experiment, train_data, tokenizer, batch_size=args.batch_size,
    #         drop_last=False, max_seq_length=64)  # 52 + 3

    #     print('train_data %d ' % len(train_data))

    devlp_dataloader = make_dataloader(experiment,
                                       devlp_data,
                                       tokenizer,
                                       batch_size=args.batch_size,
                                       drop_last=False,
                                       max_seq_length=64)  # 52 + 3, as in the commented train call
Example #15
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param kwargs:
        :return: Tokenizer
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            if "albert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "RobertaTokenizer"
            elif 'codebert' in pretrained_model_name_or_path.lower():
                if "mlm" in pretrained_model_name_or_path.lower():
                    raise NotImplementedError(
                        "MLM part of codebert is currently not supported in FARM"
                    )
                else:
                    tokenizer_class = "RobertaTokenizer"
            elif "camembert" in pretrained_model_name_or_path.lower(
            ) or "umberto" in pretrained_model_name_or_path:
                tokenizer_class = "CamembertTokenizer"
            elif "distilbert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLNetTokenizer"
            elif "electra" in pretrained_model_name_or_path.lower():
                tokenizer_class = "ElectraTokenizer"
            elif "word2vec" in pretrained_model_name_or_path.lower() or \
                    "glove" in pretrained_model_name_or_path.lower() or \
                    "fasttext" in pretrained_model_name_or_path.lower():
                tokenizer_class = "EmbeddingTokenizer"
            elif "minilm" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            else:
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                    f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                    f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, "
                    f"XLNetTokenizer, ElectraTokenizer, EmbeddingTokenizer, or CamembertTokenizer.")
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "RobertaTokenizer":
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DistilBertTokenizer":
            ret = DistilBertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "BertTokenizer":
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                 keep_accents=True,
                                                 **kwargs)
        elif tokenizer_class == "ElectraTokenizer":
            ret = ElectraTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            ret = CamembertTokenizer._from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
Example #16
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             use_fast=False,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
            Only DistilBERT, BERT, Electra and DPR fast tokenizers are supported.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            if "albert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "RobertaTokenizer"
            elif 'codebert' in pretrained_model_name_or_path.lower():
                if "mlm" in pretrained_model_name_or_path.lower():
                    raise NotImplementedError(
                        "MLM part of codebert is currently not supported in FARM"
                    )
                else:
                    tokenizer_class = "RobertaTokenizer"
            elif "camembert" in pretrained_model_name_or_path.lower(
            ) or "umberto" in pretrained_model_name_or_path:
                tokenizer_class = "CamembertTokenizer"
            elif "distilbert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLNetTokenizer"
            elif "electra" in pretrained_model_name_or_path.lower():
                tokenizer_class = "ElectraTokenizer"
            elif "word2vec" in pretrained_model_name_or_path.lower() or \
                    "glove" in pretrained_model_name_or_path.lower() or \
                    "fasttext" in pretrained_model_name_or_path.lower():
                tokenizer_class = "EmbeddingTokenizer"
            elif "minilm" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "dpr-question_encoder" in pretrained_model_name_or_path.lower(
            ):
                tokenizer_class = "DPRQuestionEncoderTokenizer"
            elif "dpr-ctx_encoder" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DPRContextEncoderTokenizer"
            else:
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                    f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                    f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, "
                    f"XLNetTokenizer, ElectraTokenizer, EmbeddingTokenizer, CamembertTokenizer, "
                    f"DPRQuestionEncoderTokenizer, or DPRContextEncoderTokenizer.")
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            if use_fast:
                logger.error(
                    'AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.'
                )
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            if use_fast:
                logger.error(
                    'XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.'
                )
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                logger.error(
                    'RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.'
                )
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            if use_fast:
                logger.error(
                    'XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.'
                )
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            if use_fast:
                logger.error(
                    'CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.'
                )
                ret = CamembertTokenizer._from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer._from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRQuestionEncoderTokenizer" or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRContextEncoderTokenizer" or tokenizer_class == "DPRContextEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
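A hypothetical call site showing the use_fast switch (again assuming the enclosing FARM-style Tokenizer facade):

# Sketch only; `Tokenizer` is the enclosing class of load().
slow = Tokenizer.load("distilbert-base-uncased")                 # DistilBertTokenizer
fast = Tokenizer.load("distilbert-base-uncased", use_fast=True)  # DistilBertTokenizerFast
alb = Tokenizer.load("albert-base-v2", use_fast=True)            # logs an error, falls back to AlbertTokenizer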
Example #17
    def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        model config or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
            Only DistilBERT, BERT, Electra and DPR fast tokenizers are supported.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            if use_fast:
                logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
                ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            if use_fast:
                logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
                ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
                ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            if use_fast:
                logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
                ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:  # because it also might be fast tokekenizer we use "in"
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
                ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "CamembertTokenizer":
            if use_fast:
                logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
                ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRQuestionEncoderTokenizer" or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DPRContextEncoderTokenizer" or tokenizer_class == "DPRContextEncoderTokenizerFast":
            if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
                ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret