def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.

    :param tokenizer_location: JSON file containing a saved tokenizer.
    :return: a tokenizer with padding enabled.
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location)
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train")
        utterances = [
            special_tokens["sep_token"].join(dialogue["dialog"])
            for dialogue in dataset_train
        ]

        trainer = WordPieceTrainer(
            vocab_size=2048,
            special_tokens=list(token_utils.special_tokens.values()))
        custom_tokenizer = Tokenizer(
            WordPiece(unk_token=special_tokens["unk_token"]))
        custom_tokenizer.normalizer = normalizers.Sequence(
            [NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer)
        custom_tokenizer.enable_padding()

        # Save the trained tokenizer to disk.
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")
        return custom_tokenizer
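# Usage sketch (not part of the original snippet): load the tokenizer saved
# above and encode a small batch. The file path matches the save location in
# get_daily_dialog_tokenizer; the example utterances are made up.
tokenizer = get_daily_dialog_tokenizer("./daily_dialog/tokenizer.json")
batch = tokenizer.encode_batch([
    "hi , how are you doing today ?",
    "not bad , thanks !",
])
# Because padding is enabled, both encodings are padded to the longest
# sequence in the batch.
assert len(batch[0].ids) == len(batch[1].ids)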
def get_recurrent_tokenizer(vocab, max_context_tokens, unk_token, pad_token, device="cpu"):
    """
    Return a tokenizer to be used with recurrent-based models
    """
    question_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    question_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    question_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    question_tokenizer.enable_padding(direction="right",
                                      pad_id=vocab[pad_token],
                                      pad_type_id=1,
                                      pad_token=pad_token)

    context_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    context_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    context_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    context_tokenizer.enable_padding(
        direction="right",
        pad_id=vocab[pad_token],
        pad_type_id=1,
        pad_token=pad_token,
    )
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(question_tokenizer,
                                   context_tokenizer,
                                   device=device)
def wordpiece_tokenize(line):
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return output.ids
def test_post_process(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)
    tokenizer.enable_padding(length=4)

    encoding = tokenizer.encode("my name is john")
    pair_encoding = tokenizer.encode("pair")

    # Can post process a single encoding
    output = tokenizer.post_process(encoding)
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

    # Can post process a pair of encodings
    output = tokenizer.post_process(encoding, pair_encoding)
    assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
def create_tokenizer(sentence_list):
    # Dump the sentences to a temporary text file for training.
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)

    trainer = WordPieceTrainer(
        vocab_size=3000,
        special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    # Note: this argument order matches older tokenizers releases; recent
    # versions expect tokenizer.train([filename], trainer=trainer).
    tokenizer.train(trainer, [filename])
    os.remove(filename)
    return tokenizer
def test_padding(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # By default it does nothing when encoding single sequence
    tokenizer.enable_padding()
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name"]

    # Can pad to the longest in a batch
    output = tokenizer.encode_batch(["my name", "my name is john"])
    assert all([len(encoding) == 4 for encoding in output])

    # Can pad to the specified max length otherwise
    tokenizer.enable_padding(max_length=4)
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["my", "name", "pair", "[PAD]"]
def __init__(
    self,
    inputs: List[str],
    targets: List[Tensor],
    key: Tensor,
    tokenizer: Tokenizer,
    max_len: int = 200,
    batch_size: int = 32,
) -> None:
    self.inputs = inputs
    self.targets = np.array(onp.array(targets))

    tokenizer.enable_truncation(max_len)
    tokenizer.enable_padding()
    self.tokenizer = tokenizer

    self.max_len = max_len
    self.batch_size = batch_size
    self.key = key
    self.len = len(self.inputs)
    self.num_batches = math.ceil(self.len / batch_size)
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name), pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer
        # is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file],
                                len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:
                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels in zip(example_batch, labels_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                labels = labels.nonzero()[1].tolist()
                                out_f.write(json.dumps([example.ids, labels]) + '\n')

                            example_batch = []
                            labels_batch = []

                # Write out whatever is left in the last smaller batch.
                example_batch = tokenizer.encode_batch(example_batch)
                labels_batch = mlb.transform(labels_batch)

                for example, labels in zip(example_batch, labels_batch):
                    # Convert sparse arrays to python lists for json dumping.
                    labels = labels.nonzero()[1].tolist()
                    out_f.write(json.dumps([example.ids, labels]) + '\n')
class SentencePieceBPETokenizer:
    """Custom SentencePiece tokenizer"""

    unk_token = '<unk>'
    pad_token = '<pad>'

    def __init__(self,
                 vocab: Dict[str, int] = None,
                 merges: List[Tuple[str, str]] = None,
                 dropout: float = None,
                 max_length: Optional[int] = 64) -> None:
        """Constructor

        Args:
            vocab (Dict[str, int]): A dictionary of string keys and their ids.
            merges (List[Tuple[str, str]]): A list of pairs of tokens.
            dropout (float): BPE dropout
            max_length (int, optional): The max length at which to truncate.
                Defaults to `64`.
        """
        self.tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
        self.tokenizer.normalizer = BertNormalizer()  # noqa
        self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
        self.tokenizer.decoder = decoders.Metaspace()  # noqa
        self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])

        self.tokenizer.enable_padding(pad_token=self.pad_token)
        self.tokenizer.enable_truncation(max_length)

    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        instance.tokenizer.model.dropout = None
        return instance

    @property
    def vocab_size(self):
        return len(self.tokenizer.get_vocab())

    def serialize(self):
        return self.tokenizer.to_str()

    @classmethod
    def deserialize(cls, s: str) -> 'SentencePieceBPETokenizer':
        tokenizer = cls()
        tokenizer.tokenizer = Tokenizer.from_str(s)
        return tokenizer

    def encode(self, text: str) -> Dict[str, Any]:
        encoding = self.tokenizer.encode(text)
        outputs = {
            'ids': torch.tensor(encoding.ids),
            'mask': torch.tensor(encoding.attention_mask),
            'spans': encoding.offsets,
        }
        return outputs

    def encode_batch(self, batch: List[str]):
        encodings = self.tokenizer.encode_batch(batch)
        outputs = {
            'ids': torch.tensor([e.ids for e in encodings]),
            'mask': torch.tensor([e.attention_mask for e in encodings]),
            'spans': [e.offsets for e in encodings],
        }
        return outputs
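# Usage sketch (illustrative, not from the original class): train the custom
# tokenizer on a tiny in-memory corpus, then encode a batch. The corpus and
# hyperparameters below are made-up assumptions.
corpus = ["the cat sat on the mat", "the dog chased the cat"]
sp_tok = SentencePieceBPETokenizer.train(corpus,
                                         vocab_size=100,
                                         min_frequency=1,
                                         dropout=None,
                                         max_length=16)
batch = sp_tok.encode_batch(["the cat", "the dog chased the cat"])
# enable_padding / enable_truncation in __init__ make the batch rectangular,
# so the ids and mask stack into (batch, seq_len) tensors.
print(batch['ids'].shape, batch['mask'].shape)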
class LitTokenizer:
    def __init__(self, padding=False, truncation=False, max_length=None, lower=False, lang=None):
        super().__init__()
        self.UNK_WORD = '[UNK]'
        self.PAD_WORD = '[PAD]'
        self.MASK_WORD = '[MASK]'
        self.SOS_WORD = '[SOS]'
        self.EOS_WORD = '[EOS]'
        self.special_tokens = [
            self.UNK_WORD, self.PAD_WORD, self.MASK_WORD, self.SOS_WORD,
            self.EOS_WORD
        ]

        # Define tokenizer
        self.tokenizer = None
        self.configure_tokenizers(padding, truncation, max_length, lower)

        # Other
        self.lang = lang

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def configure_tokenizers(self, padding, truncation, max_length, lower):
        # Settings
        pad_length = None
        if padding in {True, "longest"}:
            pass
        elif padding in {"max_length"}:
            pad_length = max_length
        elif padding in {False, "do_not_pad"}:
            pass
        else:
            raise ValueError("Unknown padding type")

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(
            tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
            ],
        )
        self.tokenizer.post_processor = basic_template

        if padding:
            self.tokenizer.enable_padding(
                pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
                pad_token=self.PAD_WORD,
                length=pad_length)
        if truncation:
            self.tokenizer.enable_truncation(max_length,
                                             stride=0,
                                             strategy='longest_first')

    def load_vocab(self, vocab, merges):
        vocab, merges = tok_model.read_file(vocab, merges)
        self.tokenizer.model = tok_model(vocab, merges)

    def train_vocab(self, files, vocab_size=32000, min_frequency=3):
        # Train trainer
        trainer = tok_trainer(vocab_size=vocab_size,
                              min_frequency=min_frequency)
        self.tokenizer.train(files, trainer)

    def save_vocab(self, output_dir, prefix):
        self.tokenizer.model.save(output_dir, prefix)

    def pad(self, examples, keys=None):
        pad_idx = self.special_tokens.index(self.PAD_WORD)

        # Keys to modify
        if not keys:
            keys = list(examples[0].keys())

        d = {}
        for k in keys:
            # Collect same-type items (list of IDs, list of masks, ...)
            d[k] = [x[k] for x in examples]

            # Get max length (value to pad)
            max_length = max([x.shape[-1] for x in d[k]])

            # Apply padding
            for i, x in enumerate(examples):
                unpadded_t = x[k]
                if k == "ids":
                    tmp = torch.full((max_length,),
                                     fill_value=pad_idx,
                                     device=unpadded_t.device)  # All padding
                elif k == "attention_mask":
                    tmp = torch.full((max_length,),
                                     fill_value=0,
                                     device=unpadded_t.device)  # No attention mask
                else:
                    raise TypeError("Unknown key")
                tmp[:unpadded_t.shape[-1]] = unpadded_t
                d[k][i] = tmp
        return d

    def encode(self, x):
        return self.tokenizer.encode(x)

    def decode(self, x):
        if isinstance(x, torch.Tensor):
            assert len(x.shape) == 2
            x = x.detach().cpu().numpy()
        return [self.tokenizer.decode(x_i) for x_i in x]
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordLevel

VOCAB_FILE = "data/tx1_vocab.txt"

with open(VOCAB_FILE, "r") as f:
    words = list(set(f.read().strip().split("\n")))

vocab = {}
for i, word in enumerate(["<pad>", "<unk>"] + words):
    vocab[word] = i

tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
tokenizer.enable_padding(pad_token="<pad>")
tokenizer.pre_tokenizer = Whitespace()
tokenizer.save("data/tokenizer-LakhNES-tx1.json")
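# Round-trip sketch (illustrative): reload the saved word-level tokenizer and
# encode a batch. The event strings below are placeholders, not real TX1 data;
# out-of-vocabulary words map to "<unk>".
tokenizer = Tokenizer.from_file("data/tokenizer-LakhNES-tx1.json")
encodings = tokenizer.encode_batch(["WT_1 NO_45 WT_2", "NO_60"])
# The padding settings are serialized with the tokenizer, so the shorter
# sequence is padded with "<pad>" (id 0) to the length of the longer one.
assert len(encodings[0].ids) == len(encodings[1].ids)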
def preprocess_data(args):
    label_counter = Counter([])
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        mlb_full = mlb_full.fit(label_counter.keys())
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if args.labels_in:
        labels = json.load(open(args.labels_in))
        mlb = mlb.fit([labels])
    else:
        mlb = mlb.fit([[pair for pair in label_counter]])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:
        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()
        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for
        # max_input_chars_per_word being 20000 instead of 100. This tokenizer
        # is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:
            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                if args.labels_in:
                    n_labels = len(json.load(open(args.labels_in)))
                else:
                    n_labels = len(label_counter)
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    START_POS = int(args.window_start) / 100

                    for doc_idx, (example, labels) in enumerate(g):
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            example_batch = tokenizer.encode_batch(example_batch)
                            labels_batch = mlb.transform(labels_batch)

                            for example, labels, doc_idx in zip(
                                    example_batch, labels_batch, doc_idx_batch):
                                # Convert sparse arrays to python lists for json dumping.
                                labels = labels.nonzero()[1].tolist()

                                if args.all_blocks or args.n_blocks > 0:
                                    # Write the first block plus any overflowing blocks,
                                    # up to n_blocks if a limit is set.
                                    blocks = [example.ids] + [
                                        blk.ids for blk in example.overflowing
                                    ]
                                    for b, block in enumerate(blocks, 2):
                                        if b > args.n_blocks and args.n_blocks > 0:
                                            break
                                        out_f.write(
                                            json.dumps([block, labels, doc_idx]) + '\n')
                                else:
                                    window = get_window(example, START_POS)
                                    assert len(window) == 512
                                    assert all([type(y) is int for y in window])
                                    out_f.write(json.dumps([window, labels]) + '\n')

                            # Reset the batch buffers.
                            example_batch = []
                            labels_batch = []
                            doc_idx_batch = []

                # Write out whatever is left in the last smaller batch.
                example_batch = tokenizer.encode_batch(example_batch)
                labels_batch = mlb.transform(labels_batch)

                for example, labels, doc_idx in zip(
                        example_batch, labels_batch, doc_idx_batch):
                    # Convert sparse arrays to python lists for json dumping.
                    labels = labels.nonzero()[1].tolist()

                    if args.all_blocks or args.n_blocks > 0:
                        blocks = [example.ids] + [
                            blk.ids for blk in example.overflowing
                        ]
                        for b, block in enumerate(blocks, 2):
                            if b > args.n_blocks and args.n_blocks > 0:
                                break
                            out_f.write(
                                json.dumps([block, labels, doc_idx]) + '\n')
                    else:
                        out_f.write(
                            json.dumps([get_window(example, START_POS), labels]) + '\n')