def post_processor(self, tokenizer):
    return TemplateProcessing(
        seq_a=["$0", "</s>"],
        seq_b=["$1", "</s>"],
        special_tokens=[
            ("</s>", tokenizer.get_vocab()["</s>"]),
        ],
    )

def load_tokenizer(tokenizer_path):
    tokenizer = Tokenizer.from_file(tokenizer_path)
    tokenizer.post_processor = TemplateProcessing(
        single='[CLS] $A [SEP]',
        pair='[CLS] $A [SEP] $B:1 [SEP]:1',
        special_tokens=[('[CLS]', 1), ('[SEP]', 2)],
    )
    return tokenizer

def post_processor(self, tokenizer):
    eos = self.original_tokenizer.eos_token
    return TemplateProcessing(
        seq_a=["$0", eos],
        seq_b=["$1", eos],
        special_tokens=[
            (eos, tokenizer.get_vocab()[eos]),
        ],
    )

def tokenize(dt, df):
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    # print(df.head())
    # print(df.query_text.head())
    # print(df.query_text.to_list())
    # exit(0)

    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    # print(df.query_text.to_list())
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))
    # bert_tokenizer.save_model(directory=data_dir, name='tokenizer')

    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'], as_index=False,
                    sort=False)['query_text'].agg(
                        ' '.join).drop(columns='separate_column')
    # df = df.groupby('session_id').agg({'query_text': ' '.join}).reset_index()
    df.query_text.to_csv(corpus_file, header=False, index=False)

    with open(token_file) as token_f:
        jdata = json.load(token_f)
    with open(vocab_file, "w") as fd:
        for k in jdata['model']['vocab'].keys():
            print(k, file=fd)

def test_instantiate(self):
    bert = self.get_bert()
    assert bert is not None
    assert isinstance(bert, PostProcessor)
    assert isinstance(bert, TemplateProcessing)
    assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing)

    # It is absolutely legal to have tokens with spaces in the name:
    processor = TemplateProcessing(
        single=["[ C L S ]", "Token with space"],
        special_tokens=[("[ C L S ]", 0), ("Token with space", 1)],
    )
    # Sequence identifiers must be well formed:
    with pytest.raises(Exception, match="Cannot build Piece"):
        processor = TemplateProcessing(single="[CLS] $$ [SEP]")
    with pytest.raises(Exception, match="Cannot build Piece"):
        processor = TemplateProcessing(single="[CLS] $A: [SEP]")
    # Special tokens must be provided when used in template:
    with pytest.raises(Exception, match="Missing SpecialToken\(s\) with id\(s\)"):
        processor = TemplateProcessing(single=["[CLS]"])

def __init__(
    self,
    replacement: str = "▁",
    add_prefix_space: bool = True,
    unk_token: Union[str, AddedToken] = "<unk>",
    eos_token: Union[str, AddedToken] = "</s>",
    pad_token: Union[str, AddedToken] = "<pad>",
):
    self.special_tokens = {
        "pad": {"id": 0, "token": pad_token},
        "eos": {"id": 1, "token": eos_token},
        "unk": {"id": 2, "token": unk_token},
    }

    self.special_tokens_list = [None] * len(self.special_tokens)
    for token_dict in self.special_tokens.values():
        self.special_tokens_list[token_dict["id"]] = token_dict["token"]

    tokenizer = Tokenizer(Unigram())

    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Nmt(),
            normalizers.NFKC(),
            normalizers.Replace(Regex(" {2,}"), " "),
            normalizers.Lowercase(),
        ]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [
            pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
            pre_tokenizers.Digits(individual_digits=True),
            pre_tokenizers.Punctuation(),
        ]
    )
    tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
    tokenizer.post_processor = TemplateProcessing(
        single=f"$A {self.special_tokens['eos']['token']}",
        special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
    )

    parameters = {
        "model": "SentencePieceUnigram",
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)

def _prepare_pipeline(self):
    self.tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    self.tokenizer.pre_tokenizer = Whitespace()
    self.tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    self.tokenizer.enable_padding(
        pad_id=self.__class__.SPECIAL_TOKENS.index("[PAD]"),
        pad_token="[PAD]")

def wordpiece_tokenize(line):
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return output.ids

def configure_tokenizers(self, padding, truncation, max_length, lower):
    # Settings
    pad_length = None
    if padding in {True, "longest"}:
        pass
    elif padding in {"max_length"}:
        pad_length = max_length
    elif padding in {False, "do_not_pad"}:
        pass
    else:
        raise ValueError("Unknown padding type")

    # SRC tokenizer
    tok_normalizers = [NFD(), Strip()]
    if lower:
        tok_normalizers += [Lowercase()]

    self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
    self.tokenizer.add_special_tokens(self.special_tokens)
    self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [WhitespaceSplit()])
    self.tokenizer.normalizer = normalizers.Sequence(
        tok_normalizers)  # StripAccents requires NFD
    self.tokenizer.decoder = tok_decoder()

    # Define template (needed for the sos/eos tokens)
    basic_template = TemplateProcessing(
        single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
        pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
        special_tokens=[
            (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
            (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
        ],
    )
    self.tokenizer.post_processor = basic_template

    if padding:
        self.tokenizer.enable_padding(
            pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
            pad_token=self.PAD_WORD,
            length=pad_length)
    if truncation:
        self.tokenizer.enable_truncation(max_length, stride=0, strategy='longest_first')

def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer

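# A minimal usage sketch for train_tokenizer above; the sample sentences and the
# serialization path are illustrative placeholders, not taken from the original source.
sample_sentences = [
    "the quick brown fox jumps over the lazy dog",
    "the lazy dog sleeps all day",
]
tok = train_tokenizer(sample_sentences, serialize_path="wordpiece_tokenizer.json", vocab_size=100)
encoded = tok.encode("the quick brown fox")
# The post-processor wraps the sequence as [CLS] ... [SEP]; on such a tiny corpus
# WordPiece may split words into sub-word units.
print(encoded.tokens)
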
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)

    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer

def __init__(
    self,
    batch_size: int = 1,
    val_batch_size: int = None,
    dataset=None,
    languages=None,
    tokenizer: Tokenizer = None,
    device='cpu',
):
    super(WMT20DataModule, self).__init__()
    self.batch_size = batch_size
    self.val_batch_size = val_batch_size if val_batch_size is not None else batch_size

    if dataset is None:
        raise ValueError(f"dataset is required for {self}")
    self.dataset = dataset

    if languages is None:
        raise ValueError(f"languages is required for {self}")
    self.languages = languages

    self.tokenizer = tokenizer
    pad_token = "[PAD]"
    self.tokenizer.enable_padding(pad_id=tokenizer.token_to_id(pad_token),
                                  pad_token=pad_token)
    translate_postprocessor = TemplateProcessing(
        single="[TRANSLATE] $0 [SEP]",
        special_tokens=[("[TRANSLATE]", tokenizer.token_to_id('[TRANSLATE]')),
                        ("[SEP]", tokenizer.token_to_id('[SEP]'))],
    )
    tokenizer.post_processor = translate_postprocessor

    self.device = device
    return

parser.add_argument(
    "--dataset",
    type=str,
    default="/data/nv419/VQG_DATA/processed/iq_dataset.hdf5")
parser.add_argument(
    "--val_dataset",
    type=str,
    default="/data/nv419/VQG_DATA/processed/iq_val_dataset.hdf5")

args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
args.device = device

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

data_loader = get_loader(os.path.join(os.getcwd(), args.dataset),
                         tokenizer,
                         args.batch_size,
                         shuffle=True,
                         num_workers=8)
val_data_loader = get_loader(os.path.join(os.getcwd(), args.val_dataset),
                             tokenizer,
                             args.batch_size,
                             shuffle=False,
                             num_workers=8)

trainVQG = TrainVQG(args, tokenizer)  # .to(device)

def get_bert(self):
    return TemplateProcessing(
        seq_a=["[CLS]", "$0", "[SEP]"],
        seq_b=["$1", "[SEP]"],
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )

from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from tokenizers.processors import TemplateProcessing

t = Tokenizer(WordLevel(unk_token="[UNK]"))
t.pre_tokenizer = Whitespace()
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"])
t.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    # pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 2),
        ("[SEP]", 3),
    ])

files = ['tok-train-shuf-tgt.tsv']
t.train(files, trainer)
t.save("code_tokenizer.json")

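# A minimal sketch of reloading and using the tokenizer saved above; the input
# string is an illustrative placeholder, not taken from the original source.
reloaded = Tokenizer.from_file("code_tokenizer.json")
encoded = reloaded.encode("some whitespace separated tokens")
# WordLevel maps words not seen during training to [UNK]; the template adds [CLS] and [SEP].
print(encoded.tokens)
print(encoded.ids)
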
def test_quicktour(self, doc_wiki_tokenizer):
    def print(*args, **kwargs):
        pass

    try:
        # START reload_tokenizer
        tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
        # END reload_tokenizer
    except Exception:
        tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)

    # START encode
    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    # END encode
    # START print_tokens
    print(output.tokens)
    # ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    # END print_tokens
    assert output.tokens == [
        "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?",
    ]
    # START print_ids
    print(output.ids)
    # [27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35]
    # END print_ids
    assert output.ids == [
        27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35
    ]
    # START print_offsets
    print(output.offsets[9])
    # (26, 27)
    # END print_offsets
    assert output.offsets[9] == (26, 27)
    # START use_offsets
    sentence = "Hello, y'all! How are you 😁 ?"
    sentence[26:27]
    # "😁"
    # END use_offsets
    assert sentence[26:27] == "😁"
    # START check_sep
    tokenizer.token_to_id("[SEP]")
    # 2
    # END check_sep
    assert tokenizer.token_to_id("[SEP]") == 2
    # START init_template_processing
    from tokenizers.processors import TemplateProcessing

    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    # END init_template_processing
    # START print_special_tokens
    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    print(output.tokens)
    # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    # END print_special_tokens
    assert output.tokens == [
        "[CLS]", "Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?", "[SEP]",
    ]
    # START print_special_tokens_pair
    output = tokenizer.encode("Hello, y'all!", "How are you 😁 ?")
    print(output.tokens)
    # ["[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]"]
    # END print_special_tokens_pair
    assert output.tokens == [
        "[CLS]", "Hello", ",", "y", "'", "all", "!", "[SEP]", "How", "are", "you", "[UNK]", "?", "[SEP]",
    ]
    # START print_type_ids
    print(output.type_ids)
    # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    # END print_type_ids
    assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    # START encode_batch
    output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
    # END encode_batch
    # START encode_batch_pair
    output = tokenizer.encode_batch(
        [["Hello, y'all!", "How are you 😁 ?"],
         ["Hello to you too!", "I'm fine, thank you!"]])
    # END encode_batch_pair
    # START enable_padding
    tokenizer.enable_padding(pad_id=3, pad_token="[PAD]")
    # END enable_padding
    # START print_batch_tokens
    output = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
    print(output[1].tokens)
    # ["[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"]
    # END print_batch_tokens
    assert output[1].tokens == [
        "[CLS]", "How", "are", "you", "[UNK]", "?", "[SEP]", "[PAD]"
    ]
    # START print_attention_mask
    print(output[1].attention_mask)
    # [1, 1, 1, 1, 1, 1, 1, 0]
    # END print_attention_mask
    assert output[1].attention_mask == [1, 1, 1, 1, 1, 1, 1, 0]

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt'
paths = [str(x) for x in Path(".").glob('data/feature_sequence/*.txt')]

tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()
# trainer = trainers.BpeTrainer(
trainer = trainers.WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(trainer, [uid_task_id_sequence_path])

tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)
# tokenizer.save_model("tmp")
tokenizer.model.save('data/bert_and_tokenizer', 'uid_task_id')

# tokenizer = ByteLevelBPETokenizer(
#     "./tmp/vocab.json",
#     "./tmp/merges.txt",
# )

# vocabulary size of the task ids
task_id_vocab_size = 6033
config = BertConfig(

def get_roberta(self):
    return TemplateProcessing(
        seq_a="<s> $0 </s>",
        seq_b="</s> $0 </s>",
        special_tokens=[("<s>", 0), ("</s>", 1)],
    )

def get_bert(self):
    return TemplateProcessing(
        single=["[CLS]", "$0", "[SEP]"],
        pair=["[CLS]", "$A", "[SEP]", "$B:1", "[SEP]:1"],
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )

def __getitem__(self, item):
    """
    Args:
        item: int, idx
    Returns:
        tokens: tokens of query + context, [seq_len]
        token_type_ids: token type ids, 0 for query, 1 for context, [seq_len]
        start_labels: start labels of NER in tokens, [seq_len]
        end_labels: end labels of NER in tokens, [seq_len]
        label_mask: label mask, 1 for counting into loss, 0 for ignoring. [seq_len]
        match_labels: match labels, [seq_len, seq_len]
        sample_idx: sample id
        label_idx: label id
    """
    cls_tok = "[CLS]"
    sep_tok = "[SEP]"
    if 'roberta' in self.args.bert_config_dir:
        cls_tok = "<s>"
        sep_tok = "</s>"

    # begin{get the label2idx dictionary}
    label2idx = {}
    label2idx_list = self.args.label2idx_list
    for labidx in label2idx_list:
        lab, idx = labidx
        label2idx[lab] = int(idx)
    # print('label2idx: ', label2idx)
    # end{get the label2idx dictionary}

    # begin{get the morph2idx dictionary}
    morph2idx = {}
    morph2idx_list = self.args.morph2idx_list
    for morphidx in morph2idx_list:
        morph, idx = morphidx
        morph2idx[morph] = int(idx)
    # end{get the morph2idx dictionary}

    data = self.all_data[item]
    tokenizer = self.tokenzier  # AutoTokenizer(self.args.bert_config_dir)

    qas_id = data.get("qas_id", "0.0")
    sample_idx, label_idx = qas_id.split(".")
    sample_idx = torch.LongTensor([int(sample_idx)])
    label_idx = torch.LongTensor([int(label_idx)])

    query = data["query"]
    context = data["context"].strip()
    if '\u200b' in context:
        context = context.replace('\u200b', '')
    elif '\ufeff' in context:
        context = context.replace('\ufeff', '')
    elif ' ' in context:
        context = context.replace(' ', ' ')
    span_position_label = data["span_position_label"]
    # context = "Japan -DOCSTART- began the defence of their Asian Cup on Friday ."

    start_positions = []
    end_positions = []
    for seidx, label in span_position_label.items():
        sidx, eidx = seidx.split(';')
        start_positions.append(int(sidx))
        end_positions.append(int(eidx))

    # add space offsets
    words = context.split()
    # convert the span position into the character index; a space is also a position.
    pos_start_positions = start_positions
    pos_end_positions = end_positions

    pos_span_idxs = []
    for sidx, eidx in zip(pos_start_positions, pos_end_positions):
        pos_span_idxs.append((sidx, eidx))

    # all spans (sidx, eidx)
    all_span_idxs = enumerate_spans(context.split(), offset=0, max_span_width=self.args.max_span_len)

    # get the span-length of each span
    # begin{compute the span weight}
    all_span_weights = []
    for span_idx in all_span_idxs:
        weight = self.args.neg_span_weight
        if span_idx in pos_span_idxs:
            weight = 1.0
        all_span_weights.append(weight)
    # end{compute the span weight}

    all_span_lens = []
    for idxs in all_span_idxs:
        sid, eid = idxs
        slen = eid - sid + 1
        all_span_lens.append(slen)

    morph_idxs = self.case_feature_tokenLevel(morph2idx, all_span_idxs, words, self.args.max_span_len)

    if 'roberta' in self.args.bert_config_dir:
        tokenizer.post_processor = TemplateProcessing(
            single="<s> $A </s>",
            pair="<s> $A </s> $B:1 </s>:1",
            special_tokens=[
                ("<s>", tokenizer.token_to_id("<s>")),
                ("</s>", tokenizer.token_to_id("</s>")),
            ],
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        p1 = tokenizer.token_to_id("<s>")
        p2 = tokenizer.token_to_id("</s>")
        print("p1", p1)
        print("p2", p2)

    query_context_tokens = tokenizer.encode(context, add_special_tokens=True)
    tokens = query_context_tokens.ids  # subword index
    type_ids = query_context_tokens.type_ids  # split of the two sentences at the subword level: 0 for the first sent, 1 for the second sent
    offsets = query_context_tokens.offsets  # each subword's character-level start and end index
    # print("current sent: ", context)
    all_span_idxs_ltoken, all_span_word, all_span_idxs_new_label = self.convert2tokenIdx(
        words, tokens, type_ids, offsets, all_span_idxs, span_position_label)
    pos_span_idxs_ltoken, pos_span_word, pos_span_idxs_new_label = self.convert2tokenIdx(
        words, tokens, type_ids, offsets, pos_span_idxs, span_position_label)
    span_label_ltoken = []
    for seidx_str, label in all_span_idxs_new_label.items():
        span_label_ltoken.append(label2idx[label])

    '''
    an example of tokens, type_ids, and offsets values.
    inputs:
        query = "you are beautiful ."
        context = 'i love you .'
    outputs:
        tokens: [101, 2017, 2024, 3376, 1012, 102, 1045, 2293, 2017, 1012, 102]
        type_ids: [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
        offsets: [(0, 0), (0, 3), (4, 7), (8, 17), (18, 19), (0, 0), (0, 1), (2, 6), (7, 10), (11, 12), (0, 0)]
        query_context_tokens.tokens: ['[CLS]', 'you', 'are', 'beautiful', '.', '[SEP]', 'i', 'love', 'you', '.', '[SEP]']
        query_context_tokens.words: [None, 0, 1, 2, 3, None, 0, 1, 2, 3, None]
    '''

    # the max-end-index should not exceed the max-length.
    # all_span_idxs_ltoken
    # return tokens, type_ids, all_span_idxs_ltoken, pos_span_mask_ltoken

    # truncate
    tokens = tokens[:self.max_length]
    type_ids = type_ids[:self.max_length]
    all_span_idxs_ltoken = all_span_idxs_ltoken[:self.max_num_span]
    # pos_span_mask_ltoken = pos_span_mask_ltoken[:self.max_num_span]
    span_label_ltoken = span_label_ltoken[:self.max_num_span]
    all_span_lens = all_span_lens[:self.max_num_span]
    morph_idxs = morph_idxs[:self.max_num_span]
    all_span_weights = all_span_weights[:self.max_num_span]

    # make sure the last token is [SEP]
    sep_token = tokenizer.token_to_id(sep_tok)
    if tokens[-1] != sep_token:
        assert len(tokens) == self.max_length
        tokens = tokens[:-1] + [sep_token]

    # padding to the max length.
    import numpy as np

    real_span_mask_ltoken = np.ones_like(span_label_ltoken)
    if self.pad_to_maxlen:
        tokens = self.pad(tokens, 0)
        type_ids = self.pad(type_ids, 1)
        all_span_idxs_ltoken = self.pad(all_span_idxs_ltoken,
                                        value=(0, 0),
                                        max_length=self.max_num_span)
        # pos_span_mask_ltoken = self.pad(pos_span_mask_ltoken, value=0, max_length=self.max_num_span)
        real_span_mask_ltoken = self.pad(real_span_mask_ltoken,
                                         value=0,
                                         max_length=self.max_num_span)
        span_label_ltoken = self.pad(span_label_ltoken,
                                     value=0,
                                     max_length=self.max_num_span)
        all_span_lens = self.pad(all_span_lens,
                                 value=0,
                                 max_length=self.max_num_span)
        morph_idxs = self.pad(morph_idxs,
                              value=0,
                              max_length=self.max_num_span)
        all_span_weights = self.pad(all_span_weights,
                                    value=0,
                                    max_length=self.max_num_span)

    tokens = torch.LongTensor(tokens)
    type_ids = torch.LongTensor(type_ids)  # used to split the first and second sentence.
    all_span_idxs_ltoken = torch.LongTensor(all_span_idxs_ltoken)
    # pos_span_mask_ltoken = torch.LongTensor(pos_span_mask_ltoken)
    real_span_mask_ltoken = torch.LongTensor(real_span_mask_ltoken)
    span_label_ltoken = torch.LongTensor(span_label_ltoken)
    all_span_lens = torch.LongTensor(all_span_lens)
    morph_idxs = torch.LongTensor(morph_idxs)
    # print("all_span_weights: ", all_span_weights)
    all_span_weights = torch.Tensor(all_span_weights)

    min_idx = np.max(np.array(all_span_idxs_ltoken))

    return [
        tokens,
        type_ids,  # used to split the first and second sentence.
        all_span_idxs_ltoken,
        morph_idxs,
        span_label_ltoken,
        all_span_lens,
        all_span_weights,
        # pos_span_mask_ltoken,
        real_span_mask_ltoken,
        words,
        all_span_word,
        all_span_idxs,
    ]

def get_roberta(self):
    return TemplateProcessing(
        single="<s> $0 </s>",
        pair="<s> $A </s> </s> $B </s>",
        special_tokens=[("<s>", 0), ("</s>", 1)],
    )