def configure(self):
    # fail early if the optional tokenizers dependency is missing
    if isinstance(Tokenizer, UnsupportedPackage):
        Tokenizer.raise_error(self.__provider__)
    self.testing_file = self.get_value_from_config('testing_file')
    self.vocab_file = self.get_value_from_config('vocab_file')
    self.merges_file = self.get_value_from_config('merges_file')
    self.max_seq_length = int(self.get_value_from_config('max_seq_length'))
    # build a byte-level BPE tokenizer from the configured vocab/merges files
    self.tokenizer = Tokenizer(BPE.from_file(str(self.vocab_file), str(self.merges_file)))
    self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    self.tokenizer.decoder = decoders.ByteLevel()
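# A minimal, self-contained sketch of the same tokenizer setup, assuming the
# `tokenizers` package is installed; 'vocab.json' and 'merges.txt' are
# placeholder paths for a byte-level BPE vocab/merges pair, not files from
# the original converter:
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE.from_file('vocab.json', 'merges.txt'))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()

encoding = tokenizer.encode("Hello world")
print(encoding.ids)                    # byte-level BPE token ids
print(tokenizer.decode(encoding.ids))  # round-trips back to the input text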
def test_instantiate(self, roberta_files):
    assert isinstance(BPE(), Model)
    assert isinstance(BPE(), BPE)

    vocab = {"a": 0, "b": 1, "ab": 2}
    merges = [("a", "b")]
    assert isinstance(BPE(vocab, merges), Model)
    assert isinstance(BPE.from_file(roberta_files["vocab"], roberta_files["merges"]), BPE)
    with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
        BPE(vocab=vocab)
    with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
        BPE(merges=merges)
    assert isinstance(
        pickle.loads(pickle.dumps(BPE(vocab, merges))),
        BPE,
    )

    # Deprecated calls in 0.9
    with pytest.deprecated_call():
        assert isinstance(BPE(roberta_files["vocab"], roberta_files["merges"]), Model)

    with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
        BPE(vocab=roberta_files["vocab"])
    with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
        BPE(merges=roberta_files["merges"])
    with pytest.deprecated_call():
        assert isinstance(
            pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))),
            BPE,
        )
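# A standalone sketch of what the in-memory constructor exercised above
# accepts: a tiny vocab/merges pair is enough to build a working model.
# This is illustrative and assumes the `tokenizers` package; it is not part
# of the test suite.
from tokenizers import Tokenizer
from tokenizers.models import BPE

vocab = {"a": 0, "b": 1, "ab": 2}
merges = [("a", "b")]
tok = Tokenizer(BPE(vocab, merges))
print(tok.encode("ab").tokens)  # ['ab'] -- the learned merge is applied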
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
tokenizer.train(trainer, [f"{proc_path}/names.txt"])
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(proc_path)
tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json', f'{proc_path}/merges.txt')

with open(f"{proc_path}/vocab.json", "r") as f:
    bpe_vocab = json.load(f)
bpe_vocab_idx = {v: k for k, v in bpe_vocab.items()}

# map single characters to ids shifted by one, reserving 0 for padding
char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
print(f"Char map size: {len(char_map)}\n")

MAX_LEN_OF_WORD = max(len(w) for w in bpe_vocab)
print(f"Max length of word: {MAX_LEN_OF_WORD}\n")

if ZERO_PAD:
    word_map = {
        k: [char_map[c] for c in k] + [0] * (MAX_LEN_OF_WORD - len(k))
        for k in bpe_vocab  # completion assumed: the source snippet is truncated here
    }
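# Illustrative sketch (assumed, not from the original script): with a toy
# vocab, the zero-padded word_map built above looks like this. Names mirror
# the code, with a toy_ prefix.
toy_vocab = {"a": 0, "b": 1, "ab": 2}
toy_char_map = {k: v + 1 for k, v in toy_vocab.items() if len(k) == 1}  # {'a': 1, 'b': 2}
toy_max_len = max(len(w) for w in toy_vocab)                            # 2
toy_word_map = {
    k: [toy_char_map[c] for c in k] + [0] * (toy_max_len - len(k))
    for k in toy_vocab
}
print(toy_word_map)  # {'a': [1, 0], 'b': [2, 0], 'ab': [1, 2]}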
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for the model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check the number of inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError('The demo expects a model with a single input, while {} were provided'.format(
            len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError('The demo expects a model with a single output, while {} were provided'.format(
            len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({input_tensor: PartialShape([Dimension(1), Dimension(args.max_seq_len)])})

    if args.dynamic_shape:
        model.reshape({input_tensor: PartialShape([Dimension(1), Dimension(0, args.max_seq_len)])})

    # load the model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:
        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:
        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop over user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by the network at once
        max_length = args.max_seq_len

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                pad_len = max_length - cur_input_len
                model_input = np.concatenate((input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {input_tensor: model_input}

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                model_input.shape[1], 1 / (t_end - t_start), t_end - t_start))

            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits, eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores, args.top_k)
            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores, args.top_p)

            # get the next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1], 1, p=probs[0], replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)
            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num), eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]
        log.info("{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".format(
            t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
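# The demo above calls helpers (softmax, get_top_k_logits, process_logits,
# stop_criteria) defined elsewhere in the sample. Below is a minimal NumPy
# sketch of the two sampling helpers, written as an assumption about their
# behavior rather than the demo's actual code:
import numpy as np

def softmax(logits):
    # numerically stable softmax over the last axis
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)

def get_top_k_logits(scores, top_k):
    # keep the top_k largest logits per row and mask the rest with -inf,
    # so that softmax assigns them zero probability
    threshold = np.sort(scores, axis=-1)[..., -top_k, None]
    return np.where(scores < threshold, -float('inf'), scores)

# e.g. sampling from the 5 most likely tokens of a (1, vocab_size) logits row:
# probs = softmax(get_top_k_logits(next_token_scores, 5))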
import fire
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, Strip


def train(dataset_path, output_dir='data/tokenizer/', vocab_size=30_000, min_frequency=3):
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([Lowercase(), Strip()])

    files = [dataset_path]
    tokenizer.train(trainer, files)

    # save the trained model files, then reload them with the unk token set
    files = tokenizer.model.save(output_dir)
    tokenizer.model = BPE.from_file(*files, unk_token='[UNK]')

    tokenizer.save(f'{output_dir}/tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)
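# Assumed follow-up usage (not in the original script): the serialized
# tokenizer.json written by train() can be reloaded in one call and used
# directly for encoding; the path matches the default output_dir above.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file('data/tokenizer/tokenizer.json')
encoding = tokenizer.encode('hello tokenizer world')
print(encoding.tokens)  # lowercased, whitespace-split BPE pieces
print(encoding.ids)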