def test_subword_regularization_tokenizer(self):
    """Check that subword regularization makes tokenization non-deterministic.

    Subword regularization is only available for the slow tokenizer. It
    augments training data with subword sampling, which has a random
    component, so repeated tokenizations of the same text should differ.
    """
    tokenizer = XLMRobertaTokenizer(
        SAMPLE_VOCAB,
        keep_accents=True,
        sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1},
    )

    # Tokenize the same sentence several times.
    tokens_list = [
        tokenizer.tokenize("This is a test for subword regularization.")
        for _ in range(5)
    ]

    # At least one pair of runs must disagree.
    all_equal = all(
        first == second
        for first, second in itertools.combinations(tokens_list, 2)
    )
    self.assertFalse(all_equal)
def __init__(self, max_seq_len: int, model_name: str, model_path: Optional[str] = '', device: Optional[str] = 'cpu'):
    """Load a tokenizer and embedding model for the requested backbone.

    Parameters
    ----------
    max_seq_len : int
        Maximum number of tokens per input sequence.
    model_name : str
        The name of the model, should be one of 'word2vec',
        'xlm-roberta-base', 'xlm-roberta-large', 'vinai/bertweet-base',
        'mrm8488/t5-base-finetuned-summarize-news',
        'jordan-m-young/buzz-article-gpt-2'.
    model_path : str, optional
        The path to the w2v file / finetuned Transformer model path.
        Required for w2v; when empty, falls back to `model_name`
        (i.e. load from the model hub).
    device : str, optional
        Torch device string, e.g. 'cpu' or 'cuda'.
    """
    super(TextEmbedder, self).__init__()

    # FIX: the original elif list contained 'jordan-m-young/buzz-article-gpt-2'
    # twice; the duplicate has been removed.
    supported = ['word2vec', 'xlm-roberta-base', 'xlm-roberta-large',
                 'vinai/bertweet-base', 'mrm8488/t5-base-finetuned-summarize-news',
                 'jordan-m-young/buzz-article-gpt-2']
    assert model_name in supported

    self.max_seq_len = max_seq_len
    self.model_name = model_name
    self.device = torch.device(device)

    if model_path == '':
        model_path = model_name

    print('TextEmbedder: Loading model {} ({})'.format(model_name, model_path))
    if model_name == 'word2vec':
        assert os.path.isfile(model_path)
        # w2v has no subword tokenizer of its own; reuse XLM-R's.
        # +1 presumably accounts for a special token — TODO confirm.
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            'xlm-roberta-base', model_max_length=self.max_seq_len + 1)
        self._load_weibo_w2v(model_path)
        self.embed_dim = 300
    elif model_name in ['vinai/bertweet-base',
                        'mrm8488/t5-base-finetuned-summarize-news',
                        'jordan-m-young/buzz-article-gpt-2']:
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, model_max_length=self.max_seq_len)
        # T5 for news doesn't have 'add_pooling_layer' option
        self.model = AutoModel.from_pretrained(
            model_path, return_dict=True).to(self.device)
        self.embed_dim = 768
    else:
        assert model_path in ['xlm-roberta-base', 'xlm-roberta-large'] or os.path.isdir(model_path)
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            model_name, model_max_length=self.max_seq_len)
        self.model = XLMRobertaModel.from_pretrained(
            model_path, return_dict=True, add_pooling_layer=False).to(self.device)
        self.embed_dim = 768
    print('TextEmbedder: Finished loading model {}'.format(model_name))
def __init__(self, language_code="aze-eng"):
    """Set up file paths, tokenization caches and loaders for a language pair.

    Args:
        language_code: dataset directory name, e.g. "aze-eng".
    """
    self.language_code = language_code

    # One raw-text file per split: data/<code>/ted-<split>.orig.<code>
    self.translation_data = {
        split: "data/" + language_code + "/ted-" + split + ".orig." + language_code
        for split in ("train", "dev", "test")
    }

    # Tokenization caches, filled in later (hrl = high-, lrl = low-resource language).
    self.translation_tokenization = {
        split: {"hrl": None, "lrl": None}
        for split in ("train", "dev", "test")
    }

    self.tokenizer = XLMRobertaTokenizer.from_pretrained(
        'xlm-roberta-base', do_lowercase_and_remove_accent=True)

    self.train_loaders = None
    self.valid_loaders = None
    self.test_loaders = None
def main(args):
    """Run inference on any dataset shaped like the given tsv file and write
    the predictions to a csv.

    Args:
        args: namespace with `model_dir` (fine-tuned checkpoint directory).
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

    # load my model
    # FIX: removed the no-op `model.parameters` attribute access and the
    # unused `MODEL_NAME` local from the original.
    model = XLMRobertaForSequenceClassification.from_pretrained(args.model_dir)
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    # make csv file with predicted answer
    # keep this output directory and column layout unchanged
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/roberta-submission13.csv', index=False)
def __init__(self, choose_model):
    """Initialize Text Encoder.

    Args:
        choose_model (str): Only XLM-R possible for now.

    Raises:
        AssertionError: if `choose_model` is not a supported model name.
    """
    self.model_max_length = 128
    if choose_model.lower() == "xlm-r":
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(
            'xlm-roberta-base', model_max_length=self.model_max_length)
        self.model = XLMRobertaModel.from_pretrained(
            'xlm-roberta-base', output_hidden_states=True)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device, non_blocking=True)
        # XLM-R special token surface forms.
        self.PAD_TOKEN = "<pad>"
        self.BOS_TOKEN = "<s>"
        self.EOS_TOKEN = "</s>"
        self.UNK_TOKEN = "<unk>"
        self.add_special_token = True
        self.pad_to_max_length = True
        # Cross-lingual embedding state, presumably filled in later by
        # alignment code — TODO confirm against callers.
        self.target_embedding_matrix = []
        self.proj_embedding_source_target = []
        self.src_word2ind = {}
        self.trg_word2ind = {}
        self.src_ind2word = {}
        self.trg_ind2word = {}
        self.norm_trg_embedding_matrix = []
    else:
        # FIX: the original used `assert False, print(...)`, which prints as a
        # side effect and attaches None as the assertion message. The message
        # now travels with the AssertionError itself.
        assert False, "No correct model was chosen!!"
def generate_training_data(config, bert_tokenizer=None):
    """Build train/eval DataLoaders and the tag->index mapping for NER training.

    Parameters
    ----------
    config : object
        Must expose data_dir, training_data, val_data, apr_dir, batch_size.
    bert_tokenizer : tokenizer, optional
        FIX: the original instantiated XLMRobertaTokenizer in the default
        argument, which downloads/loads the model at import time. It is now
        loaded lazily only when no tokenizer is supplied (same model as before).

    Returns
    -------
    (train_iter, eval_iter, tag2idx)
    """
    if bert_tokenizer is None:
        bert_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

    training_data = config.data_dir + config.training_data
    validation_data = config.data_dir + config.val_data

    train_sentences, train_labels, label_set = corpus_reader(training_data, delim=' ')
    label_set.append('X')
    tag2idx = {t: i for i, t in enumerate(label_set)}
    train_dataset = NER_Dataset(tag2idx, train_sentences, train_labels,
                                tokenizer_path=bert_tokenizer)

    # Save the tag2idx dictionary. Will be used while prediction.
    with open(config.apr_dir + 'tag2idx.pkl', 'wb') as f:
        pickle.dump(tag2idx, f, pickle.HIGHEST_PROTOCOL)

    dev_sentences, dev_labels, _ = corpus_reader(validation_data, delim=' ')
    dev_dataset = NER_Dataset(tag2idx, dev_sentences, dev_labels,
                              tokenizer_path=bert_tokenizer)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=True, num_workers=4, collate_fn=pad)
    eval_iter = data.DataLoader(dataset=dev_dataset,
                                batch_size=config.batch_size,
                                shuffle=False, num_workers=1, collate_fn=pad)
    return train_iter, eval_iter, tag2idx
def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool = False):
    """Load a RoBERTa tokenizer/encoder pair and record which layers to expose.

    Args:
        model_dir_or_name: checkpoint name or local directory.
        layers: comma-separated layer indices (or a list of ints); negative
            indices count from the top.
        pooled_cls: whether to use the pooled CLS representation.

    NOTE: the attribute name `tokenzier` (sic) is kept unchanged for
    compatibility with existing callers.
    """
    super().__init__()
    self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = RobertaModel.from_pretrained(model_dir_or_name)

    # Validate the requested layer indices against the encoder depth.
    n_layers = len(self.encoder.encoder.layer)
    if isinstance(layers, list):
        self.layers = [int(x) for x in layers]
    elif isinstance(layers, str):
        self.layers = [int(x) for x in layers.split(',')]
    else:
        raise TypeError("`layers` only supports str or list[int]")
    for layer in self.layers:
        magnitude = -layer if layer < 0 else layer
        assert magnitude <= n_layers, f"The layer index:{layer} is out of scope for " \
            f"a RoBERTa model with {n_layers} layers."

    # Cache special-token ids; needed when generating word pieces.
    self._cls_index = self.tokenzier.encoder['<s>']
    self._sep_index = self.tokenzier.encoder['</s>']
    self._wordpiece_pad_index = self.tokenzier.encoder['<pad>']
    self._wordpiece_unknown_index = self.tokenzier.encoder['<unk>']
    self.pooled_cls = pooled_cls
def main(args):
    """Run inference on data shaped like the given dataset tsv and write a csv."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Tokenizer plus the extra marker tokens used at training time.
    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
    num_added_toks = tokenizer.add_special_tokens(
        {'additional_special_tokens': ["#", "@", '₩', '^']})

    # Restore the fine-tuned classifier and grow its embedding table to match
    # the enlarged vocabulary.
    model = XLMRobertaForSequenceClassification.from_pretrained(args.model_dir)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Build the test dataset.
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # Predict and persist.
    pred_answer = inference(model, test_dataset, device)
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/submission.csv', index=False)
def load_data(self):
    """Load data from `self.path`, building one Instance per input line.

    Returns
    -------
    list
        The accumulated `self.data` list of Instance objects.
    """
    # FIX: the original re-created the XLM-R tokenizer inside the per-line
    # loop; it is loop-invariant, so it is now built once up front.
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    with open(self.path, encoding="utf-8") as infile:
        for line in infile:
            tokens = word_tokenize(line)
            sentence = line

            # Track how many subword pieces each word token produced.
            token_len = []
            subwords = []
            for token in tokens:
                pieces = tokenizer.tokenize(token)
                token_len.append(len(pieces))
                subwords.extend(pieces)

            input_ids = tokenizer.encode(subwords,
                                         add_special_tokens=True,
                                         truncation=True,
                                         max_length=self.max_length)
            # Right-pad with literal 0 up to max_length.
            # NOTE(review): verify 0 actually matches this tokenizer's pad id.
            pad_num = self.max_length - len(input_ids)
            input_ids = input_ids + [0] * pad_num

            instance = Instance(sentence=sentence,
                                tokens=tokens,
                                token_len=token_len,
                                subwords=subwords,
                                input_ids=input_ids)
            self.data.append(instance)
    return self.data
def _setup_config(self, lang):
    """Configure the global master_config (device, cache dir, tokenizer) for `lang`."""
    torch.cuda.empty_cache()

    # Choose device and batch sizes; GPU runs get larger batches.
    self._use_gpu = bool(self._gpu and torch.cuda.is_available())
    if self._use_gpu:
        master_config.device = torch.device('cuda')
        self._tokbatchsize = 6
        self._tagbatchsize = 24
    else:
        master_config.device = torch.device('cpu')
        self._tokbatchsize = 2
        self._tagbatchsize = 12

    # Resolve (and create if missing) the cache directory.
    master_config._cache_dir = ('cache/trankit' if self._cache_dir is None
                                else self._cache_dir)
    if not os.path.exists(master_config._cache_dir):
        os.makedirs(master_config._cache_dir, exist_ok=True)

    master_config.wordpiece_splitter = XLMRobertaTokenizer.from_pretrained(
        'xlm-roberta-base',
        cache_dir=os.path.join(master_config._cache_dir, 'xlmr'))

    self._config = master_config
    # this is for tokenizer only
    self._config.max_input_length = tbname2max_input_length.get(
        lang2treebank[lang], 400)
def __init__(self, config: XLMRobertaConfig, args, data_processor):
    """Build the encoder plus a classification head whose input width depends
    on the pooling mode and target-embedding combination strategy.
    """
    super().__init__(config)
    self.roberta = RobertaModel(config)
    self.args = args
    self.tokenizer = XLMRobertaTokenizer.from_pretrained(args.model_name)

    # Pooling modes that stack several vectors widen the classifier input.
    pool_multiplier = {'mmm': 3, 'mmf': 3, 'mm': 2, 'mf': 2}.get(args.pool_type, 1)
    input_size = config.hidden_size * pool_multiplier

    target_mode = args.target_embeddings
    if target_mode == 'concat':
        input_size *= 2
    elif target_mode.startswith('comb_c'):
        input_size *= 3
    elif target_mode.startswith('comb_'):
        input_size *= 2
    elif target_mode.startswith('dist_'):
        # Width is derived from the encoded distance spec itself.
        input_size = len(target_mode.replace('dist_', '').replace('n', '')) // 2
    print('Classification head input size:', input_size)

    # Regression head for MSE, 2-way head for cross-entropy.
    if self.args.loss == 'mse_loss':
        self.syn_mse_clf = RobertaClassificationHead(config, 1, input_size, self.args)
    elif self.args.loss == 'crossentropy_loss':
        self.syn_clf = RobertaClassificationHead(config, 2, input_size, self.args)

    self.data_processor = data_processor
    self.init_weights()
def build_model():
    """Construct the translation model and its XLM-R tokenizer.

    Returns:
        (model, tokenizer) tuple.
    """
    xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    translation_model = TranslationModel()
    return translation_model, xlmr_tokenizer
def main(args):
    """Run inference on data shaped like the given dataset tsv and write a csv.

    Args:
        args: namespace with `model_type` (transformers class prefix),
            `model_dir` (checkpoint directory) and `out_path` (csv target).
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    # load my model: resolve e.g. "XLMRoberta" -> XLMRobertaForSequenceClassification
    model_module = getattr(import_module("transformers"),
                           args.model_type + "ForSequenceClassification")
    model = model_module.from_pretrained(args.model_dir)
    # FIX: removed the original's no-op `model.parameters` attribute access.
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    # make csv file with predicted answer
    # keep this output directory and column layout unchanged
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv(args.out_path, index=False)
def __init__(self, config: XLMRobertaConfig, local_config: dict, data_processor):
    """Multi-head classifier over RoBERTa: one 2-way head per synset plus an
    optional POS head, with learnable per-head weights.
    """
    super().__init__(config)

    syns = sorted(local_config['syns'])
    # One extra head when POS tagging is trained jointly.
    self.num_clfs = len(syns) + (1 if local_config['train_pos'] else 0)
    self.clfs_weights = torch.nn.parameter.Parameter(
        torch.ones(self.num_clfs, dtype=torch.float32), requires_grad=True)

    self.roberta = RobertaModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.local_config = local_config
    self.tokenizer = XLMRobertaTokenizer.from_pretrained(local_config['model_name'])
    self.clf2ncls = [2] * len(syns)

    # Concatenated target embeddings double the classifier input width.
    assert local_config['target_embeddings'] in ['concat', 'none']
    head_width = config.hidden_size * (2 if local_config['target_embeddings'] == 'concat' else 1)
    self.syns = nn.Linear(head_width, len(syns) * 2)
    self.pos_clf = nn.Linear(head_width, self.local_config['pos_ncls'])

    print(self.clfs_weights)
    self.data_processor = data_processor
    # Markers wrapped around the target span in the input text.
    self.TARGET_START = '•'
    self.TARGET_END = '⁄'
    self.init_weights()
def get_model_tokenizer(model_path, do_lower_case, seed=42):
    """Load a (TF model, tokenizer) pair for the given checkpoint family.

    Args:
        model_path: checkpoint name; its prefix selects the model family.
        do_lower_case: lowercasing flag forwarded to the tokenizer.
        seed: seed baked into the config of the randomly-initialized BERT.

    Returns:
        (model, tokenizer) tuple.

    Raises:
        ValueError: for an unrecognized model name.
    """
    if model_path.startswith('bert'):
        tokenizer = BertTokenizer.from_pretrained(model_path,
                                                  do_lower_case=do_lower_case)
        model = TFBertModel.from_pretrained(model_path,
                                            output_hidden_states=True,
                                            output_attentions=False)
        return model, tokenizer

    if model_path.startswith('roberta'):
        tokenizer = RobertaTokenizer.from_pretrained(model_path,
                                                     do_lower_case=do_lower_case,
                                                     add_prefix_space=True)
        model = TFRobertaModel.from_pretrained(model_path,
                                               output_hidden_states=True,
                                               output_attentions=False)
        return model, tokenizer

    if model_path.startswith('jplu/tf-xlm-roberta'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(model_path,
                                                        do_lower_case=do_lower_case)
        model = TFXLMRobertaModel.from_pretrained(model_path,
                                                  output_hidden_states=True,
                                                  output_attentions=False)
        return model, tokenizer

    if model_path.startswith('random-bert'):
        # Untrained BERT: standard tokenizer, freshly-initialized weights.
        tokenizer = BertTokenizer.from_pretrained("bert-base-cased",
                                                  do_lower_case=True)
        config = BertConfig(seed=seed,
                            output_hidden_states=True,
                            output_attentions=False)
        model = TFBertModel(config)
        return model, tokenizer

    raise ValueError(
        f"Unknown Transformer name: {model_path}. "
        f"Please select one of the supported models: {constants.SUPPORTED_MODELS}")
def main(args):
    """Run inference on data shaped like the given dataset tsv and write a csv.

    Args:
        args: namespace with `model_dir` (checkpoint directory relative to /opt/ml).
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)

    # load my model
    p = Path('.').resolve()  # /opt/ml
    model_dir = p / args.model_dir
    model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.resize_token_embeddings(len(tokenizer))
    # FIX: removed the original's no-op `model.parameters` attribute access.
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    # make csv file with predicted answer
    # keep this output directory and column layout unchanged
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv(
        './prediction/submission.csv',
        index=False,
    )
def __init__(self, args=''):
    """Record the CANTEMIST split directories and load the configured tokenizer.

    Args:
        args: namespace exposing `model_name_or_path` for the tokenizer.
    """
    self.args = args

    # Directory layout of the CANTEMIST corpus, one folder per split.
    self.train_path = os.path.join(cantemist_path, "train-set/")
    self.test_path = os.path.join(cantemist_path, "test-set/")
    self.background_path = os.path.join(cantemist_path, "background-set/")
    self.dev_path = os.path.join(cantemist_path, "dev-set1/")

    self.data_list = []
    self.tokenizer = XLMRobertaTokenizer.from_pretrained(
        self.args.model_name_or_path)
def load_model(self, model: str):
    """Pick a device (GPU when available) and load the XLM-R base tokenizer.

    Args:
        model: model identifier; currently unused by this method.
    """
    if torch.cuda.is_available():
        self.device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        self.device = torch.device("cpu")

    self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
def generate_test_data(config, tag2idx, bert_tokenizer=None):
    """Build the test DataLoader for NER evaluation.

    Parameters
    ----------
    config : object
        Must expose data_dir, test_data and batch_size.
    tag2idx : dict
        Tag-to-index mapping produced during training.
    bert_tokenizer : tokenizer, optional
        FIX: the original instantiated XLMRobertaTokenizer in the default
        argument, downloading/loading the model at import time. It is now
        loaded lazily only when no tokenizer is supplied (same model as before).

    Returns
    -------
    torch.utils.data.DataLoader
    """
    if bert_tokenizer is None:
        bert_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

    test_data = config.data_dir + config.test_data
    test_sentences, test_labels, _ = corpus_reader(test_data, delim=' ')
    test_dataset = NER_Dataset(tag2idx, test_sentences, test_labels,
                               tokenizer_path=bert_tokenizer)
    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=config.batch_size,
                                shuffle=False, num_workers=1, collate_fn=pad)
    return test_iter
def load_tokenizer(self):
    """Return the tokenizer matching `self.model`.

    NOTE(review): any model name outside the three handled here falls
    through and implicitly returns None (original behavior, preserved).
    """
    name = self.model
    if name == "xlm-roberta-base":
        return XLMRobertaTokenizer.from_pretrained(name, strip_accents=False)
    if name == 'bert-base-multilingual-cased':
        return BertTokenizer.from_pretrained(name, strip_accents=False)
    if name == 'facebook/mbart-large-cc25':
        return MBartTokenizer.from_pretrained('facebook/mbart-large-cc25',
                                              strip_accents=False)
def make_pretrained_transformer_and_tokenizer(transformer_name: str):
    """Load a pretrained (model, tokenizer) pair.

    DistilGPT2 is special-cased; every other name is treated as an
    XLM-RoBERTa checkpoint.
    """
    if 'distilgpt2' in transformer_name:
        print("DistilGPT2!")
        net = GPT2Model.from_pretrained('distilgpt2')
        tok = GPT2Tokenizer.from_pretrained('distilgpt2')
    else:
        print(f"Loading {transformer_name}!")
        net = XLMRobertaModel.from_pretrained(transformer_name)
        tok = XLMRobertaTokenizer.from_pretrained(transformer_name)
    return net, tok
def test_tokenization_xlm_roberta(self):
    """Compare the Rust XLM-R tokenizer against the Python baseline.

    The tokenizers are considered to agree when the token ids match exactly
    or are a permutation of each other (identical multisets).
    """
    # Given
    self.base_tokenizer = XLMRobertaTokenizer.from_pretrained(
        'xlm-roberta-large-finetuned-conll03-english',
        do_lower_case=False,
        cache_dir=self.test_dir)
    self.rust_tokenizer = PyXLMRobertaTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
                       ['xlm-roberta-large-finetuned-conll03-english']),
        do_lower_case=False)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(
            self.base_tokenizer.encode_plus(example.text_a,
                                            add_special_tokens=True,
                                            return_overflowing_tokens=True,
                                            return_special_tokens_mask=True,
                                            max_length=128))

    # When
    # Note: the original sentence piece tokenizer strips trailing spaces
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a.strip() for example in self.examples],
        max_len=256,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        # FIX: the original raised a byte-identical AssertionError from two
        # separate branches (length mismatch / Counter mismatch). Since a
        # length mismatch implies a Counter mismatch, both collapse into one
        # condition with a single raise.
        if (rust.token_ids != baseline['input_ids']
                and Counter(rust.token_ids) != Counter(baseline['input_ids'])):
            raise AssertionError(
                f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n '
                f'Sentence a: {self.examples[idx].text_a} \n'
                f'Sentence b: {self.examples[idx].text_b} \n'
                f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
                f'Rust: {rust.token_ids} \n'
                f'Python {baseline["input_ids"]}')
        assert (rust.special_tokens_mask == baseline['special_tokens_mask'])
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith(
            "bert-") or 'rubert' in tokenizer_name or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    # NOTE(review): an unrecognized tokenizer_name leaves `tokenizer` unbound
    # and raises NameError below — consider an explicit ValueError.

    # BUG FIX: the third prefix was misspelled "transo-xl-" in the original and
    # therefore never matched, so Transformer-XL tokenizers were never given
    # their special tokens.
    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess"
    # reorganization; we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added tokens
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word,
                                     input_module_tokenizer_name(tokenizer_name))
def load_model(config):
    """Restore a fine-tuned XLM-R token-classification model from a checkpoint.

    Parameters
    ----------
    config : object
        Must expose apr_dir, bert_model and model_name.

    Returns
    -------
    (model, bert_tokenizer, unique_labels, tag2idx)
        Also rebinds the module-level `bert_tokenizer` global, as before.
    """
    # FIX: the pickle file was opened without ever being closed in the
    # original; a `with` block now guarantees closure.
    with open(config.apr_dir + 'tag2idx.pkl', 'rb') as f:
        tag2idx = pickle.load(f)
    unique_labels = list(tag2idx.keys())

    model = XLMRobertaForTokenClassification.from_pretrained(
        config.bert_model, num_labels=len(tag2idx))
    checkpoint = torch.load(config.apr_dir + config.model_name,
                            map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])

    global bert_tokenizer
    bert_tokenizer = XLMRobertaTokenizer.from_pretrained(config.bert_model)

    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    return model, bert_tokenizer, unique_labels, tag2idx
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Download (if needed) and load the XLM-R binary sequence classifier.

    Args:
        cache_dir: where downloaded model files are stored.
        verbose: forwarded to the downloader.
    """
    from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

    # Resolve the model path, downloading and unzipping on first use.
    model_path = download_model('xlmr.ned', cache_dir,
                                process_func=_unzip_process_func,
                                verbose=verbose)
    self.classes = ['0', '1']

    self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
    self.model = XLMRobertaForSequenceClassification.from_pretrained(
        model_path, num_labels=len(self.classes))
    # Usable input length: size of the position-embedding table minus 2
    # (presumably reserved position slots — TODO confirm).
    self.max_length = self.model.roberta.embeddings.position_embeddings.num_embeddings - 2
def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool = False):
    """Load an XLM-R tokenizer/encoder pair and cache its special-token ids.

    NOTE: the attribute name `tokenzier` (sic) is kept unchanged for
    compatibility with existing callers.
    """
    super().__init__()
    self.tokenzier = XLMRobertaTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = XLMRobertaModel.from_pretrained(model_dir_or_name)

    # Special-token ids looked up once from the tokenizer's encoder table.
    token_to_id = self.tokenzier.encoder
    self._cls_index = token_to_id['<s>']
    self._sep_index = token_to_id['</s>']
    self._wordpiece_pad_index = token_to_id['<pad>']
    self._wordpiece_unknown_index = token_to_id['<unk>']
    self.pooled_cls = pooled_cls
def __init__(self, dataset, data_path, done=True):
    """Load (or parse and cache) the dataset pickle and expose the vocab.

    Args:
        dataset: key into `opt.paths` for the raw corpus.
        data_path: location of the pickled, pre-parsed data.
        done: when True, load the pickle; otherwise parse and cache it.
    """
    self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

    if done:
        # Pre-parsed data already exists on disk.
        with open(data_path, 'rb') as f:
            self.data = pickle.load(f)
    else:
        # Parse from the raw corpus, then cache for next time.
        self.data = self.parse_data(opt.paths[dataset])
        with open(data_path, 'wb') as f:
            pickle.dump(self.data, f)

    self.all_data, self.score = self.data
    self.vocab = self.tokenizer.get_vocab()
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: Optional[bool] = None,
             model_args: Optional[Dict] = None,
             tokenizer_args: Optional[Dict] = None):
    """Wrap an XLM-RoBERTa encoder for sentence embedding.

    Parameters
    ----------
    model_name_or_path : str
        Checkpoint name or local path.
    max_seq_length : int
        Requested maximum sequence length; clamped to the tokenizer limit.
    do_lower_case : bool, optional
        When given, forwarded to the tokenizer.
    model_args, tokenizer_args : dict, optional
        Extra kwargs for model/tokenizer construction.

    FIXES vs. the original:
    * `model_args`/`tokenizer_args` defaulted to shared mutable `{}` objects
      that this body mutates (the `do_lower_case` entry leaked across
      instances); they now default to None and are created per call.
    * the warning concatenated a str with an int
      (`"... of " + max_len_single_sentence`), which raises TypeError; it now
      uses lazy %-formatting.
    """
    super(XLMRoBERTa, self).__init__()
    if model_args is None:
        model_args = {}
    if tokenizer_args is None:
        tokenizer_args = {}

    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case
    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case

    self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path,
                                                       **model_args)
    self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name_or_path,
                                                         **tokenizer_args)

    if max_seq_length > self.tokenizer.max_len_single_sentence:
        logging.warning("XLM-RoBERTa only allows a max_seq_length of %s",
                        self.tokenizer.max_len_single_sentence)
        max_seq_length = self.tokenizer.max_len_single_sentence
    self.max_seq_length = max_seq_length
def load_models(layers_weights: str = LAYERS_WEIGHTS_PATH,
                pre_trained: str = "jplu/tf-xlm-roberta-large"):
    """Download and load a pretrained language model plus classifier layer weights.

    Args:
        layers_weights (str): Path to the layers weight file.
        pre_trained (str): Name or path to the pretrained language model.

    Returns:
        (model, tokenizer) when `layers_weights` exists on disk; otherwise the
        function falls through and implicitly returns None.
    """
    tokenizer_ = XLMRobertaTokenizer.from_pretrained(pre_trained)
    lang_model = TFXLMRobertaModel.from_pretrained(pre_trained)
    # Only assemble the full classifier when the weight file is present.
    if Path(layers_weights).exists():
        model_ = compile_model(weights=layers_weights, lang_model=lang_model)
        return model_, tokenizer_
def __init__(self, config):
    """Load an XLM-R NLI classifier for entailment scoring.

    Parameters
    ----------
    config : mapping
        Expected keys: 'model_path' (checkpoint directory) and 'device'
        (int GPU index; negative values select the CPU, default 0).

    Note: the original also read a `model_name` entry into a local that was
    never used; that dead lookup has been removed.
    """
    model_path = config.get("model_path", None)
    device = config.get("device", 0)  # default on gpu 0

    self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
    # the default entailment id is 2 (contradiction is 0, neutral is 1)
    self.contradiction_id = 0
    self.entailment_id = 2

    self.model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
    self.model.eval()
    self.model.half()

    self.device = torch.device(
        "cpu" if device < 0 else "cuda:{}".format(device))
    if self.device.type == "cuda":
        self.model = self.model.to(self.device)