Code example #1
    def test_subword_regularization_tokenizer(self):
        # Subword regularization is only available for the slow tokenizer.
        tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB,
                                        keep_accents=True,
                                        sp_model_kwargs={
                                            "enable_sampling": True,
                                            "alpha": 0.1,
                                            "nbest_size": -1
                                        })

        # Subword regularization augments training data with subword sampling.
        # This has a random component. We test if the tokenizer generates different
        # results when subword regularization is enabled.
        tokens_list = []
        for _ in range(5):
            tokens_list.append(
                tokenizer.tokenize(
                    "This is a test for subword regularization."))

        # all unordered pairs of tokenizations from tokens_list
        combinations = itertools.combinations(tokens_list, 2)

        all_equal = True
        for combination in combinations:
            if combination[0] != combination[1]:
                all_equal = False

        self.assertFalse(all_equal)
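For contrast, a minimal counter-check (a sketch reusing the same SAMPLE_VOCAB model): without sp_model_kwargs the slow tokenizer is deterministic, so repeated calls must agree.

    def test_deterministic_tokenizer(self):
        # sketch: no sampling kwargs, so tokenize() always returns the
        # single best segmentation
        tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True)
        expected = tokenizer.tokenize("This is a test for subword regularization.")
        for _ in range(5):
            self.assertEqual(
                tokenizer.tokenize("This is a test for subword regularization."),
                expected)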
Code example #2
    def __init__(self, max_seq_len: int, model_name: str, model_path: Optional[str] = '', device: Optional[str] = 'cpu'):
        """
        Parameters
        ----------
        max_seq_len : int
            Maximum sequence length used when tokenizing.
        model_name : str
            The name of the model; should be one of 'word2vec', 'xlm-roberta-base', 'xlm-roberta-large', 'vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news', 'jordan-m-young/buzz-article-gpt-2'
        model_path : str, optional
            The path to the w2v file / finetuned Transformer model path. Required for w2v.
        device : str, optional
            Device string passed to torch.device (default 'cpu').
        """
        super().__init__()

        assert model_name in ['word2vec', 'xlm-roberta-base', 'xlm-roberta-large', 'vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news', 'jordan-m-young/buzz-article-gpt-2']
        self.max_seq_len = max_seq_len
        self.model_name = model_name
        self.device = torch.device(device)
        if model_path == '':
            model_path = model_name  # TODO check if the 
        print('TextEmbedder: Loading model {} ({})'.format(model_name, model_path))
        if model_name == 'word2vec':
            assert os.path.isfile(model_path)
            self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', model_max_length=self.max_seq_len+1)
            self._load_weibo_w2v(model_path)
            self.embed_dim = 300
        elif model_name in ['vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news', 'jordan-m-young/buzz-article-gpt-2']:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=self.max_seq_len)
            self.model = AutoModel.from_pretrained(model_path, return_dict=True).to(self.device)  # T5 for news doesn't have 'add_pooling_layer' option
            self.embed_dim = 768
        else:
            assert model_path in ['xlm-roberta-base', 'xlm-roberta-large'] or os.path.isdir(model_path)
            self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name, model_max_length=self.max_seq_len)
            self.model = XLMRobertaModel.from_pretrained(model_path, return_dict=True, add_pooling_layer=False).to(self.device)
            self.embed_dim = 768
        print('TextEmbedder: Finished loading model {}'.format(model_name))
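A hypothetical usage sketch (TextEmbedder as defined above; the argument values are placeholders):

    embedder = TextEmbedder(max_seq_len=128, model_name='xlm-roberta-base')
    # model_path defaults to '', so the constructor falls back to loading
    # the 'xlm-roberta-base' hub checkpoint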
Code example #3
 def __init__(self, language_code="aze-eng"):
     self.language_code = language_code
     self.translation_data = {
         split: f"data/{self.language_code}/ted-{split}.orig.{self.language_code}"
         for split in ("train", "dev", "test")
     }
     self.translation_tokenization = {
         split: {"hrl": None, "lrl": None}
         for split in ("train", "dev", "test")
     }
     # note: do_lowercase_and_remove_accent is an XLMTokenizer option and has
     # no effect on XLMRobertaTokenizer
     self.tokenizer = XLMRobertaTokenizer.from_pretrained(
         'xlm-roberta-base', do_lowercase_and_remove_accent=True)
     self.train_loaders = None
     self.valid_loaders = None
     self.test_loaders = None
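For the default language_code, the comprehensions above resolve to paths such as the following (a sketch of the resulting values, not additional behavior):

     # translation_data["train"] == "data/aze-eng/ted-train.orig.aze-eng"
     # translation_data["dev"]   == "data/aze-eng/ted-dev.orig.aze-eng"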
Code example #4
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    #TOK_NAME = "bert-base-multilingual-cased"
    #tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # make csv file with predicted answer
    # please keep the directory and column format below as given

    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/roberta-submission13.csv', index=False)
Code example #5
    def __init__(self, choose_model):
        """Initialize Text Encoder.

        Args:
            choose_model (str): Only XLM-R possible for now.
        """
        self.model_max_length = 128
        if choose_model.lower() == "XLM-R".lower():
            self.tokenizer = XLMRobertaTokenizer.from_pretrained(
                'xlm-roberta-base', model_max_length=self.model_max_length)
            self.model = XLMRobertaModel.from_pretrained(
                'xlm-roberta-base', output_hidden_states=True)
            self.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            self.model.to(self.device, non_blocking=True)

            self.PAD_TOKEN = "<pad>"
            self.BOS_TOKEN = "<s>"
            self.EOS_TOKEN = "</s>"
            self.UNK_TOKEN = "<unk>"

            self.add_special_token = True
            self.pad_to_max_length = True

            self.target_embedding_matrix = []
            self.proj_embedding_source_target = []
            self.src_word2ind = {}
            self.trg_word2ind = {}
            self.src_ind2word = {}
            self.trg_ind2word = {}
            self.norm_trg_embedding_matrix = []

        else:
            raise ValueError("No valid model was chosen!")
Code example #6
File: bert_crf.py Project: smutuvi/NER_BERT_CRF
def generate_training_data(config, bert_tokenizer=None):
    # default arguments are evaluated once at import time, so load the
    # tokenizer lazily instead of in the signature
    if bert_tokenizer is None:
        bert_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
    training_data, validation_data = config.data_dir + config.training_data, config.data_dir + config.val_data
    train_sentences, train_labels, label_set = corpus_reader(training_data,
                                                             delim=' ')
    label_set.append('X')
    tag2idx = {t: i for i, t in enumerate(label_set)}
    #print('Training datas: ', len(train_sentences))
    train_dataset = NER_Dataset(tag2idx,
                                train_sentences,
                                train_labels,
                                tokenizer_path=bert_tokenizer)
    # save the tag2indx dictionary. Will be used while prediction
    with open(config.apr_dir + 'tag2idx.pkl', 'wb') as f:
        pickle.dump(tag2idx, f, pickle.HIGHEST_PROTOCOL)
    dev_sentences, dev_labels, _ = corpus_reader(validation_data, delim=' ')
    dev_dataset = NER_Dataset(tag2idx,
                              dev_sentences,
                              dev_labels,
                              tokenizer_path=bert_tokenizer)

    #print(len(train_dataset))
    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=pad)
    eval_iter = data.DataLoader(dataset=dev_dataset,
                                batch_size=config.batch_size,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=pad)
    return train_iter, eval_iter, tag2idx
Code example #7
    def __init__(self,
                 model_dir_or_name: str,
                 layers: str = '-1',
                 pooled_cls: bool = False):
        super().__init__()

        self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
        self.encoder = RobertaModel.from_pretrained(model_dir_or_name)
        # check that the requested layers are valid for this encoder depth
        encoder_layer_number = len(self.encoder.encoder.layer)

        if isinstance(layers, list):
            self.layers = [int(l) for l in layers]
        elif isinstance(layers, str):
            self.layers = list(map(int, layers.split(',')))
        else:
            raise TypeError("`layers` only supports str or list[int]")

        for layer in self.layers:
            if layer < 0:
                assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                    f"a RoBERTa model with {encoder_layer_number} layers."
            else:
                assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                    f"a RoBERTa model with {encoder_layer_number} layers."

        self._cls_index = self.tokenzier.encoder['<s>']
        self._sep_index = self.tokenzier.encoder['</s>']
        # needed when generating word pieces
        self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']
        self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
        self.pooled_cls = pooled_cls
Code example #8
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)

    special_tokens_dict = {'additional_special_tokens': ["#", "@", '₩', '^']}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/submission.csv', index=False)
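As a sanity check, a minimal sketch (reusing the tokenizer and model objects above) of the invariant that resize_token_embeddings restores once special tokens have been added:

    # add_special_tokens() grew the vocab by num_added_toks, so the input
    # embedding matrix must be resized to match len(tokenizer)
    assert model.get_input_embeddings().num_embeddings == len(tokenizer)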
Code example #9
File: load_data.py Project: LizLian/CLIR
 def load_data(self):
     """
     load data from file
     """
     # encode input sentences with XLM-RoBERTa; load the tokenizer once
     # rather than once per input line
     tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
     with open(self.path, encoding="utf-8") as infile:
         for line in infile:
             tokens = word_tokenize(line)
             sentence = line
             token_len = []
             subwords = []
             for token in tokens:
                 pieces = tokenizer.tokenize(token)
                 token_len.append(len(pieces))
                 subwords.extend(pieces)
             input_ids = tokenizer.encode(subwords,
                                          add_special_tokens=True,
                                          truncation=True,
                                          max_length=self.max_length)
             pad_num = self.max_length - len(input_ids)
             # pad with the tokenizer's pad id (1 for XLM-R); id 0 is <s>
             input_ids = input_ids + [tokenizer.pad_token_id] * pad_num
             instance = Instance(sentence=sentence,
                                 tokens=tokens,
                                 token_len=token_len,
                                 subwords=subwords,
                                 input_ids=input_ids)
             self.data.append(instance)
     return self.data
Code example #10
File: pipeline.py Project: sputnikav/trankit
    def _setup_config(self, lang):
        torch.cuda.empty_cache()
        # decide whether to run on GPU or CPU
        if self._gpu and torch.cuda.is_available():
            self._use_gpu = True
            master_config.device = torch.device('cuda')
            self._tokbatchsize = 6
            self._tagbatchsize = 24
        else:
            self._use_gpu = False
            master_config.device = torch.device('cpu')
            self._tokbatchsize = 2
            self._tagbatchsize = 12

        if self._cache_dir is None:
            master_config._cache_dir = 'cache/trankit'
        else:
            master_config._cache_dir = self._cache_dir

        if not os.path.exists(master_config._cache_dir):
            os.makedirs(master_config._cache_dir, exist_ok=True)

        master_config.wordpiece_splitter = XLMRobertaTokenizer.from_pretrained(
            'xlm-roberta-base',
            cache_dir=os.path.join(master_config._cache_dir, 'xlmr'))
        self._config = master_config
        self._config.max_input_length = tbname2max_input_length.get(
            lang2treebank[lang], 400)  # this is for tokenizer only
Code example #11
File: xlmr.py Project: davletov-aa/siamese-wsd
    def __init__(self, config: XLMRobertaConfig, args, data_processor):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.args = args
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(args.model_name)
        input_size = config.hidden_size

        if args.pool_type in {'mmm', 'mmf'}:
            input_size *= 3
        elif args.pool_type in {'mm', 'mf'}:
            input_size *= 2

        if args.target_embeddings == 'concat':
            input_size *= 2
        elif args.target_embeddings.startswith('comb_c'):
            input_size *= 3
        elif args.target_embeddings.startswith('comb_'):
            input_size *= 2
        elif args.target_embeddings.startswith('dist_'):
            input_size = len(
                args.target_embeddings.replace('dist_', '').replace('n',
                                                                    '')) // 2

        print('Classification head input size:', input_size)
        if self.args.loss == 'mse_loss':
            self.syn_mse_clf = RobertaClassificationHead(
                config, 1, input_size, self.args)
        elif self.args.loss == 'crossentropy_loss':
            self.syn_clf = RobertaClassificationHead(config, 2, input_size,
                                                     self.args)
        self.data_processor = data_processor
        self.init_weights()
Code example #12
def build_model():
    
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    model = TranslationModel()

    return model, tokenizer
Code example #13
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    #   TOK_NAME = "bert-base-multilingual-cased"
    #TOK_NAME = args.pretrained_model
    #tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    # load my model
    model_module = getattr(import_module("transformers"),
                           args.model_type + "ForSequenceClassification")
    model = model_module.from_pretrained(args.model_dir)
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # make csv file with predicted answer
    # please keep the directory and column format below as given

    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv(args.out_path, index=False)
Code example #14
    def __init__(self, config: XLMRobertaConfig, local_config: dict,
                 data_processor):
        super().__init__(config)
        syns = sorted(local_config['syns'])
        self.num_clfs = len(syns) + 1 if local_config['train_pos'] else len(
            syns)
        self.clfs_weights = torch.nn.parameter.Parameter(torch.ones(
            self.num_clfs, dtype=torch.float32),
                                                         requires_grad=True)
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.local_config = local_config
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            local_config['model_name'])
        self.clf2ncls = [2 for clf in syns]
        assert local_config['target_embeddings'] in ['concat', 'none']
        if local_config['target_embeddings'] == 'concat':
            self.syns = nn.Linear(config.hidden_size * 2, len(syns) * 2)
            self.pos_clf = nn.Linear(config.hidden_size * 2,
                                     self.local_config['pos_ncls'])
        else:
            self.syns = nn.Linear(config.hidden_size, len(syns) * 2)
            self.pos_clf = nn.Linear(config.hidden_size,
                                     self.local_config['pos_ncls'])
        print(self.clfs_weights)

        self.data_processor = data_processor

        self.TARGET_START = '•'
        self.TARGET_END = '⁄'

        self.init_weights()
Code example #15
    def get_model_tokenizer(model_path, do_lower_case, seed=42):
        if model_path.startswith('bert'):
            tokenizer = BertTokenizer.from_pretrained(
                model_path, do_lower_case=do_lower_case)
            model = TFBertModel.from_pretrained(model_path,
                                                output_hidden_states=True,
                                                output_attentions=False)
        elif model_path.startswith('roberta'):
            tokenizer = RobertaTokenizer.from_pretrained(
                model_path, do_lower_case=do_lower_case, add_prefix_space=True)
            model = TFRobertaModel.from_pretrained(model_path,
                                                   output_hidden_states=True,
                                                   output_attentions=False)
        elif model_path.startswith('jplu/tf-xlm-roberta'):
            tokenizer = XLMRobertaTokenizer.from_pretrained(
                model_path, do_lower_case=do_lower_case)
            model = TFXLMRobertaModel.from_pretrained(
                model_path, output_hidden_states=True, output_attentions=False)
        elif model_path.startswith('random-bert'):
            tokenizer = BertTokenizer.from_pretrained("bert-base-cased",
                                                      do_lower_case=True)
            config = BertConfig(seed=seed,
                                output_hidden_states=True,
                                output_attentions=False)
            model = TFBertModel(config)
        else:
            raise ValueError(
                f"Unknown Transformer name: {model_path}. "
                f"Please select one of the supported models: {constants.SUPPORTED_MODELS}"
            )

        return model, tokenizer
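A hypothetical call site (the model id below is a placeholder matching one of the supported prefixes):

    model, tokenizer = get_model_tokenizer('jplu/tf-xlm-roberta-base',
                                           do_lower_case=False)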
Code example #16
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    TOK_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)

    # load my model
    p = Path('.').resolve()  # /opt/ml
    model_dir = p / args.model_dir  # model dir.
    model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # make csv file with predicted answer
    # please keep the directory and column format below as given

    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv(
        './prediction/submission.csv',
        index=False,
    )
Code example #17
 def __init__(self, args=''):
     self.args = args
     self.train_path = os.path.join(cantemist_path, "train-set/")
     self.test_path = os.path.join(cantemist_path, "test-set/")
     self.background_path = os.path.join(cantemist_path, "background-set/")
     self.dev_path = os.path.join(cantemist_path, "dev-set1/")
     self.data_list = []
     self.tokenizer = XLMRobertaTokenizer.from_pretrained(
         self.args.model_name_or_path)
Code example #18
File: OntologyExtractor.py Project: amkatyshev/OE
 def load_model(self, model: str):
     if torch.cuda.is_available():
         self.device = torch.device("cuda")
         print('There are %d GPU(s) available.' % torch.cuda.device_count())
         print('We will use the GPU:', torch.cuda.get_device_name(0))
     else:
         print('No GPU available, using the CPU instead.')
         self.device = torch.device("cpu")
     self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
Code example #19
File: bert_NER.py Project: smutuvi/NER_BERT_CRF
def generate_test_data(config, tag2idx, bert_tokenizer=None):
    # as above, avoid loading the tokenizer in a default argument, which
    # would run (and trigger a download) at import time
    if bert_tokenizer is None:
        bert_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
    test_data = config.data_dir+config.test_data
    test_sentences, test_labels, _ = corpus_reader(test_data, delim=' ')
    test_dataset = NER_Dataset(tag2idx, test_sentences, test_labels, tokenizer_path=bert_tokenizer)
    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=config.batch_size,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=pad)
    return test_iter
Code example #20
 def load_tokenizer(self):
     if self.model in ["xlm-roberta-base"]:
         return XLMRobertaTokenizer.from_pretrained(self.model,
                                                    strip_accents=False)
     elif self.model in ['bert-base-multilingual-cased']:
         return BertTokenizer.from_pretrained(self.model,
                                              strip_accents=False)
     elif self.model in ['facebook/mbart-large-cc25']:
         return MBartTokenizer.from_pretrained('facebook/mbart-large-cc25',
                                               strip_accents=False)
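     # note: model names outside the three branches above fall through,
     # so this method implicitly returns None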
Code example #21
def make_pretrained_transformer_and_tokenizer(transformer_name: str):
    if 'distilgpt2' in transformer_name:
        print("DistilGPT2!")
        model = GPT2Model.from_pretrained('distilgpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
    else:
        print(f"Loading {transformer_name}!")
        model = XLMRobertaModel.from_pretrained(transformer_name)
        tokenizer = XLMRobertaTokenizer.from_pretrained(transformer_name)

    return model, tokenizer
Code example #22
    def test_tokenization_xlm_roberta(self):
        # Given
        self.base_tokenizer = XLMRobertaTokenizer.from_pretrained(
            'xlm-roberta-large-finetuned-conll03-english',
            do_lower_case=False,
            cache_dir=self.test_dir)
        self.rust_tokenizer = PyXLMRobertaTokenizer(get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['xlm-roberta-large-finetuned-conll03-english']),
                                                    do_lower_case=False)

        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        # Note: the original sentence piece tokenizer strips trailing spaces
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a.strip() for example in self.examples],
            max_len=256,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            if rust.token_ids != baseline['input_ids']:
                # tolerate a permutation of the same ids; any other
                # difference is a tokenization mismatch
                if len(rust.token_ids) != len(
                        baseline['input_ids']) or Counter(
                            rust.token_ids) != Counter(baseline['input_ids']):
                    raise AssertionError(
                        f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                        f'Sentence a: {self.examples[idx].text_a} \n'
                        f'Sentence b: {self.examples[idx].text_b} \n'
                        f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                        f'Rust: {rust.token_ids} \n'
                        f'Python {baseline["input_ids"]}')
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
Code example #23
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith(
            "bert-"
    ) or 'rubert' in tokenizer_name or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith(
            "roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
Code example #24
File: bert_NER.py Project: smutuvi/NER_BERT_CRF
def load_model(config):
    with open(config.apr_dir + 'tag2idx.pkl', 'rb') as f:
        tag2idx = pickle.load(f)
    unique_labels = list(tag2idx.keys())
    model = XLMRobertaForTokenClassification.from_pretrained(config.bert_model, num_labels=len(tag2idx))
    checkpoint = torch.load(config.apr_dir + config.model_name, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    global bert_tokenizer
    bert_tokenizer = XLMRobertaTokenizer.from_pretrained(config.bert_model)
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    return model, bert_tokenizer, unique_labels, tag2idx
Code example #25
    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
        # download the model or load the model path
        model_path = download_model('xlmr.ned',
                                    cache_dir,
                                    process_func=_unzip_process_func,
                                    verbose=verbose)
        self.classes = ['0', '1']

        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            model_path, num_labels=len(self.classes))

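        # XLM-R position ids start at padding_idx + 1, so two rows of the
        # position-embedding table never hold real tokens; hence the "- 2"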
        self.max_length = self.model.roberta.embeddings.position_embeddings.num_embeddings - 2
Code example #26
    def __init__(self,
                 model_dir_or_name: str,
                 layers: str = '-1',
                 pooled_cls: bool = False):
        super().__init__()

        self.tokenzier = XLMRobertaTokenizer.from_pretrained(model_dir_or_name)
        self.encoder = XLMRobertaModel.from_pretrained(model_dir_or_name)

        # XLMRobertaTokenizer is sentencepiece-based and has no `.encoder`
        # dict; look up the special-token ids through the public API instead
        self._cls_index = self.tokenzier.convert_tokens_to_ids('<s>')
        self._sep_index = self.tokenzier.convert_tokens_to_ids('</s>')
        self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids('<pad>')
        self._wordpiece_unknown_index = self.tokenzier.convert_tokens_to_ids('<unk>')
        self.pooled_cls = pooled_cls
Code example #27
File: main_xlmr.py Project: jjaacckkyy63/MTQE
    def __init__(self, dataset, data_path, done=True):

        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            'xlm-roberta-base')
        if done:
            with open(data_path, 'rb') as f:
                self.data = pickle.load(f)
        else:
            self.data = self.parse_data(opt.paths[dataset])
            with open(data_path, 'wb') as f:
                pickle.dump(self.data, f)

        self.all_data, self.score = self.data
        self.vocab = self.tokenizer.get_vocab()
Code example #28
File: XLMRoBERTa.py Project: zeta1999/KoSentenceBERT
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}):
        super(XLMRoBERTa, self).__init__()
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.do_lower_case = do_lower_case

        if self.do_lower_case is not None:
            tokenizer_args['do_lower_case'] = do_lower_case

        self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path, **model_args)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)

        if max_seq_length > self.tokenizer.max_len_single_sentence:
            logging.warning("XLM-RoBERTa only allows a max_seq_length of %d",
                            self.tokenizer.max_len_single_sentence)
            max_seq_length = self.tokenizer.max_len_single_sentence
        self.max_seq_length = max_seq_length
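        # for the stock xlm-roberta checkpoints this cap is model_max_length
        # (512) minus the two special tokens <s> and </s>, i.e. 510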
Code example #29
File: _sentiment.py Project: aasem/urduhack
def load_models(layers_weights: str = LAYERS_WEIGHTS_PATH, pre_trained: str = "jplu/tf-xlm-roberta-large"):
    """
    Downloads and load a pretrained language model as well as layer weights for classifier

    Args:
        layers_weights (str): Path to the layers weight file
        pre_trained (str): Name or path to the pretrained language model
    Returns:
        Tuple of (classifier model, tokenizer)
    """

    tokenizer_ = XLMRobertaTokenizer.from_pretrained(pre_trained)
    lang_model = TFXLMRobertaModel.from_pretrained(pre_trained)
    if Path(layers_weights).exists():
        model_ = compile_model(weights=layers_weights, lang_model=lang_model)
    else:
        # without this, model_ would be unbound on the next line
        raise FileNotFoundError(f"Layer weights not found at {layers_weights}")
    return model_, tokenizer_
Code example #30
 def __init__(self, config):
     model_name = config.get("model_name", None)
     model_path = config.get("model_path", None)
     device = config.get("device", 0)  # default on gpu 0
     self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
     # the default entailment id is 2 (contradiction is 0, neutral is 1)
     self.contradiction_id = 0
     self.entailment_id = 2
     self.model = XLMRobertaForSequenceClassification.from_pretrained(
         model_path)
     self.model.eval()
     self.model.half()
     self.device = torch.device(
         "cpu" if device < 0 else "cuda:{}".format(device))
     if self.device.type == "cuda":
         self.model = self.model.to(self.device)