# tokenization.py (forked from cvlab-columbia/globetrotter)
"""
Two different tokenizations. The two of them BPE encodings.
First, code adapted from fairseq/models/roberta/hub_interface.py
Used to make it easier to reuse the dictionary from their xlm model.
This one contains a very big dictionary, so take care and reduce size of embeddings or model size explodes
Second, create our own dictionary just from the text from our datasets, using huggingface tokenization
"""

import json
import os
import shutil
import typing
from argparse import Namespace

import numpy as np
import torch
import torch.nn as nn
from fairseq.data import encoders, Dictionary
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm

special_tokens = ['<pad>', '<txt>', '<img>', '<mask>', '<sep>']

class FairseqTokenizerBPE(nn.Module):
    def __init__(self, tokenizer_path):
        super().__init__()
        self.dict = Dictionary.load(os.path.join(tokenizer_path, 'dict.txt'))
        # <sep> and <pad> already exist in the dictionary
        self.index_special_tokens = {tok: self.dict.add_symbol(tok) for tok in special_tokens}
        args = Namespace(
            bpe='sentencepiece',
            sample_break_mode='complete',
            sentencepiece_vocab=os.path.join(tokenizer_path, 'sentencepiece.bpe.model')
        )
        self.bpe = encoders.build_bpe(args)
        # This is useful for determining the device
        self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float))
        self.info = 'fairseq'

    @property
    def device(self):
        return self._float_tensor.device

    def encode(self, sentence: str):
        """
        BPE-encode a sentence (or multiple sentences).
        We simplify the original code, and do not add sentence and sequence separators.
        The BPE encoding follows GPT-2.
        Note that special tokens like <pad>, <sep>, etc. have to be encoded separately.
        Otherwise self.bpe.encode does not understand them as atomic tokens.
        """
        bpe_sentence = self.bpe.encode(sentence)
        tokens = self.dict.encode_line(bpe_sentence, append_eos=False, add_if_not_exist=False)
        return tokens.long(), bpe_sentence.split(' ')

    def decode(self, tokens: torch.LongTensor):
        assert tokens.dim() == 1
        tokens = tokens.cpu().numpy()  # move to CPU first so this also works for CUDA tensors
        if tokens[0] == self.dict.bos():
            tokens = tokens[1:]  # remove <s>. We do not use it anyway
        eos_mask = (tokens == self.dict.eos())
        doc_mask = eos_mask[1:] & eos_mask[:-1]
        sentences = np.split(tokens, doc_mask.nonzero()[0] + 1)
        sentences = [self.bpe.decode(self.dict.string(s)) for s in sentences]
        if len(sentences) == 1:
            return sentences[0]
        return sentences

    def id_to_token(self, id):
        raise NotImplementedError

    def __len__(self):
        return len(self.dict)

    # This is simply for PyCharm to find the correct reference to the methods of the class
    def __call__(self, *input, **kwargs) -> typing.Any:
        return super().__call__(*input, **kwargs)
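
# Minimal usage sketch for FairseqTokenizerBPE. The path below is a placeholder for
# a directory containing the XLM 'dict.txt' and 'sentencepiece.bpe.model' files.
# Special tokens are looked up through index_special_tokens instead of being passed
# through encode(), which would not treat them as atomic:
#
#     tokenizer = FairseqTokenizerBPE(tokenizer_path='/path/to/xlm')  # hypothetical path
#     token_ids, pieces = tokenizer.encode('a photo of a dog')
#     pad_id = tokenizer.index_special_tokens['<pad>']
#     text = tokenizer.decode(token_ids)
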
class HuggingfaceTokenizerBPE(nn.Module):
    def __init__(self, text_files, dataset_info_path='', config_data=None):
        super().__init__()
        # The default vocab size in the BERT model is 30522. If we want a number larger
        # than that, we will also have to change the BERT configuration.
        vocab_size = 30000
        self.info = f'hug{vocab_size}'
        with open(f'config/data/{config_data}.json') as json_file:
            tokenizer_from = json.load(json_file)['tokenizer_from']
        config_name = config_data if tokenizer_from == "" else tokenizer_from
        print(os.path.join(dataset_info_path, f'tokenizer_{config_name}_{vocab_size}-vocab.json'))
        # The loading is only properly implemented starting from version 0.8. However, it
        # makes the system use a lot of CPU for no reason (it is much slower). Maybe it
        # will be fixed in the future.
        if not os.path.isfile(os.path.join(dataset_info_path, f'tokenizer_{config_name}_{vocab_size}-vocab.json')):
            text_files = text_files()  # text_files is a callable that returns the list of paths
            self.tokenizer = ByteLevelBPETokenizer()
            # Join into a single file. This should NOT be necessary, but training does not
            # work properly with a lot of files.
            with open('/tmp/text_files.txt', 'wb') as outfile:
                for filename in tqdm(text_files, desc='Joining all files into one for tokenization'):
                    with open(filename, 'rb') as readfile:
                        shutil.copyfileobj(readfile, outfile)
            text_files = '/tmp/text_files.txt'
            self.tokenizer.train(text_files, vocab_size=vocab_size, special_tokens=special_tokens)
            self.tokenizer.save(dataset_info_path, f'tokenizer_{config_name}_{vocab_size}')
        # No "else": always load from disk, for consistency
        vocab_file = os.path.join(dataset_info_path, f'tokenizer_{config_name}_{vocab_size}-vocab.json')
        merges_file = os.path.join(dataset_info_path, f'tokenizer_{config_name}_{vocab_size}-merges.txt')
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file, merges_file=merges_file)
        self.tokenizer.add_special_tokens(special_tokens)
        self.index_special_tokens = {tok: self.tokenizer.encode(tok).ids[0] for tok in special_tokens}
        # This is useful for determining the device (the `device` property relies on it)
        self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float))

    @property
    def device(self):
        return self._float_tensor.device

    def encode(self, sentence: str):
        output = self.tokenizer.encode(sentence)
        token_ids = output.ids
        tokens = output.tokens
        return torch.tensor(token_ids), tokens

    def decode(self, tokens: torch.LongTensor):
        assert tokens.dim() == 1
        tokens = list(tokens.cpu().numpy())
        sentences = self.tokenizer.decode(tokens)
        return sentences

    def id_to_token(self, token_id):
        if isinstance(token_id, torch.Tensor):
            token_id = int(token_id)  # the tokenizers library expects a plain Python int
        return self.tokenizer.id_to_token(token_id)

    def token_to_id(self, token):
        assert isinstance(token, str)
        return self.tokenizer.token_to_id(token)

    def __len__(self):
        return self.tokenizer.get_vocab_size()

    # This is simply for PyCharm to find the correct reference to the methods of the class
    def __call__(self, *input, **kwargs) -> typing.Any:
        return super().__call__(*input, **kwargs)

def create_tokenizer(tokenizer_type, list_txt_files=None, tokenizer_path=None, dataset_info_path=None,
                     config_data=None):
    if tokenizer_type == 'fairseq':
        tokenizer = FairseqTokenizerBPE(tokenizer_path=tokenizer_path)
    else:
        tokenizer = HuggingfaceTokenizerBPE(text_files=list_txt_files, dataset_info_path=dataset_info_path,
                                            config_data=config_data)
    return tokenizer
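
# Minimal smoke-test sketch for the huggingface path. All names below (the config
# name, corpus paths, and dataset_info_path) are placeholders: it assumes that
# config/data/example.json exists with a "tokenizer_from" field and that the corpus
# files exist. Note that list_txt_files must be a callable returning the file list,
# since __init__ only calls it when the tokenizer has to be trained from scratch.
if __name__ == '__main__':
    tokenizer = create_tokenizer(
        tokenizer_type='huggingface',
        list_txt_files=lambda: ['/tmp/corpus_en.txt', '/tmp/corpus_es.txt'],  # hypothetical corpora
        dataset_info_path='/tmp/dataset_info',  # hypothetical output directory
        config_data='example',  # hypothetical config name
    )
    ids, pieces = tokenizer.encode('a photo of a dog')
    print(pieces)                 # BPE pieces as strings
    print(tokenizer.decode(ids))  # round-trips back to (roughly) the input sentence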