def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((DataTrainingArguments, CustomOthersArguments))

    (data_args, custom_args) = parser.parse_args_into_dataclasses()

    train_files = sorted(glob.glob(f'{data_args.train_dir}/*.{data_args.ext}'))
    validation_files = sorted(glob.glob(f'{data_args.eval_dir}/*.{data_args.ext}'))

    additional_special_tokens = ADDITIONAL_SPECIAL_TOKENS

    pre_tokenizer_func = PRE_TOKENIZERS_MAP.get(custom_args.pre_tokenizer_type, None)
    if pre_tokenizer_func is None:
        raise NotImplementedError(
            f'Unsupported pre_tokenizer_type: {custom_args.pre_tokenizer_type}')
    elif custom_args.pre_tokenizer_type == 'sefr_cut':
        raise ValueError('sefr_cut is slow; use fake_sefr_cut with sefr_cut_pre_tokenizer instead')

    if not os.path.exists(custom_args.output_file) or custom_args.overwrite_output_file:
        trainer = WordLevelTrainer(pre_tokenize_func=pre_tokenizer_func,
                                   vocab_size=custom_args.vocab_size,
                                   vocab_min_freq=custom_args.vocab_min_freq,
                                   input_files=train_files,
                                   additional_special_tokens=additional_special_tokens)
        trainer.count_parallel()
        trainer.save_vocab(custom_args.output_file)
    if custom_args.pre_tokenizer_type == 'fake_sefr_cut':
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            FakeSefrCustomTokenizer(PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token']))
    else:
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            CustomPreTokenizer(pre_tokenizer_func))
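    # load the word-level vocab saved to output_file and attach the custom pre-tokenizer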
    tokenizer = Tokenizer(models.WordLevel.from_file(custom_args.output_file, unk_token='<unk>'))
    tokenizer.pre_tokenizer = custom_pre_tokenizer

    if custom_args.debug:
        print('Tokenize the following texts.')
        texts = ['<s>โรนัลโดเขาได้เล่นกับทีม</s>', 'โปรตุเกสมีโรนัลโด',
                 'โรนัลโดเขาได้เล่นกับทีม\nโปรตุเกสมีโรนัลโด']
        ids = [e.ids for e in tokenizer.encode_batch(texts)]
        decoded_texts = tokenizer.decode_batch(ids)
        decoded_texts = [text.replace(' ', '') for text in decoded_texts]
        for text, i, decoded_text in zip(texts, ids, decoded_texts):
            print('Text: ', text, '>>', 'Tokenized: ', i, '>>', 'Decoded: ', decoded_text)
        with open(validation_files[0], 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    encoded = tokenizer.encode(line)
                    decoded = tokenizer.decode(encoded.ids).replace(' ', '')
                    print('Text: ', line, '>>', encoded.ids, '>>', decoded)
Example #2
    def test_decode(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can decode single sequences
        output = tokenizer.decode([0, 1, 2, 3])
        assert output == "my name is john"

        # Can decode batch
        output = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
        assert output == ["my name is john", "pair"]
Example #3
class HuggingFaceWordLevelTokenizer(TokenizerBase):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers

        self.tokenizer = Tokenizer(
            models.WordLevel(unk_token=self.unknown_token))
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        if self.lower:
            self.tokenizer.normalizer = normalizers.Lowercase()

    def fit(self, *, texts=None, text_batch_iter=None, max_tokens=None):
        from tokenizers import trainers

        trainer = trainers.WordLevelTrainer(vocab_size=self.max_vocab_size,
                                            special_tokens=list(
                                                self.special_tokens))
        self.tokenizer.train_from_iterator(text_batch_iter, trainer=trainer)
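        # cache the trained vocab in both directions (token->id and id->token)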
        self.token_to_id = self.tokenizer.get_vocab()
        self.id_to_token = {
            token_id: token
            for token, token_id in self.token_to_id.items()
        }

    def encode(self, texts):
        id_seqs = self.tokenizer.encode_batch(texts)
        id_seqs = [id_seq.ids for id_seq in id_seqs]
        return self._post_process(
            id_seqs,
            pad_id=self.token_to_id[self.pad_token]
            if self.pad_token else None,
            sos_id=self.token_to_id[self.sos_token]
            if self.sos_token else None,
            eos_id=self.token_to_id[self.eos_token]
            if self.eos_token else None,
        )

    def decode(self, id_seqs):
        return self.tokenizer.decode_batch(id_seqs)
Example #4
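# This snippet is from a benchmark comparing a Rust-backed `tokenizers` tokenizer
# (tok_r) with a Python "transformer" tokenizer (tok_p); encoded_r / encoded_p and
# the timings time_r / time_p are produced earlier in the full script.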
print(f"Transformer tokenizer took: {time_p} sec")

print(f"SpeedUp Ratio: {time_p / time_r}")

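# compare the token ids produced by both tokenizers and count mismatches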
ids_r = [sentence.ids for sentence in encoded_r]
diff_ids = 0
for i in range(0, len(encoded_r)):
    if encoded_r[i].ids != encoded_p[i]:
        diff_ids += 1
        if args.debug:
            print(encoded_r[i].ids)
            print(encoded_p[i])
            print(encoded_r[i].tokens)
            print(tok_p.tokenize(text[i]))
            print(text[i])
            print("")
print(f"Ids differences: {diff_ids}")

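# decode both id sequences back to text and count differences between the two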
decoded_r = tok_r.decode_batch(ids_r, skip_special_tokens=False)
decoded_p = [tok_p.decode(en) for en in encoded_p]
diff_decoded = 0
for i in range(0, len(text)):
    if decoded_r[i] != decoded_p[i]:
        diff_decoded += 1
        if args.debug:
            print(f"Original:  {text[i]}")
            print(f"Rust:      {decoded_r[i]}")
            print(f"Python:    {decoded_p[i]}")
            print("")
print(f"Decoding differences: {diff_decoded}")
Example #5
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    if len(ie_net.input_info) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(ie_net.outputs)))
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]

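        # the last vocab entry is used as the end-of-sequence token and for padding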
        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            # pad the rest of the request
            pad_len = max_length - cur_input_len
            model_input = np.concatenate(
                (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }

            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(max_length, 1 / (t_end - t_start), t_end - t_start))

            outputs = res[output_names]
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)"
            .format(t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
Example #6
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, got {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check number inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError(
            'The demo expects model with single input, while provided {}'.
            format(len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError(
            'The demo expects model with single output, while provided {}'.
            format(len(model.outputs)))
    input_tensor = model.inputs[0].any_name

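    # reshape the model input to a fixed [1, max_seq_len] shape, or to a dynamic
    # sequence length bounded by max_seq_len when dynamic shapes are requested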
    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic
            or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(args.max_seq_len)])
        })

    if args.dynamic_shape:
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(0, args.max_seq_len)])
        })

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop on user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by network at once
        max_length = args.max_seq_len

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                pad_len = max_length - cur_input_len
                model_input = np.concatenate(
                    (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(model_input.shape[1], 1 / (t_end - t_start),
                        t_end - t_start))

            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".
            format(t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))