Example 1 (OpenVINO Inference Engine API, IECore)
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Inference Engine')
    log.info('\tbuild: {}'.format(get_version()))
    ie = IECore()

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info('Reading model {}'.format(args.model))
    ie_net = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    if len(ie_net.input_info) != 1:
        raise RuntimeError(
            'The demo expects a model with a single input, but {} were provided'.
            format(len(ie_net.input_info)))
    if len(ie_net.outputs) != 1:
        raise RuntimeError(
            'The demo expects a model with a single output, but {} were provided'.
            format(len(ie_net.outputs)))
    input_names = next(iter(ie_net.input_info))
    output_names = next(iter(ie_net.outputs))

    # load model to the device
    ie_net_exec = ie.load_network(network=ie_net, device_name=args.device)
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop over the user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by the network at once
        max_length = ie_net.input_info[input_names].input_data.shape[1]

        # EOS is assumed to be the last vocabulary entry (GPT-2's <|endoftext|>)
        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            # pad the rest of the request up to the fixed input length with EOS tokens
            pad_len = max_length - cur_input_len
            model_input = np.concatenate(
                (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for IE
            inputs = {
                input_names: model_input,
            }

            # infer by IE
            t_start = time.perf_counter()
            res = ie_net_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(max_length, 1 / (t_end - t_start), t_end - t_start))

            outputs = res[output_names]
            # logits at the last real (non-padded) position predict the next token
            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests of length {} were processed in {:0.2f}sec ({:0.2}sec per request)"
            .format(t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))
Example 2 (OpenVINO Runtime API, Core)
def main():
    args = build_argparser().parse_args()

    # load vocabulary file for model
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # create tokenizer
    tokenizer = Tokenizer(BPE.from_file(str(args.vocab), str(args.merges)))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    log.info('OpenVINO Runtime')
    log.info('\tbuild: {}'.format(get_version()))
    core = Core()

    # read IR
    log.info('Reading model {}'.format(args.model))
    model = core.read_model(args.model)

    # check the number of inputs and outputs
    if len(model.inputs) != 1:
        raise RuntimeError(
            'The demo expects a model with a single input, but {} were provided'.
            format(len(model.inputs)))
    if len(model.outputs) != 1:
        raise RuntimeError(
            'The demo expects a model with a single output, but {} were provided'.
            format(len(model.outputs)))
    input_tensor = model.inputs[0].any_name

    # without dynamic shapes, fix the input to [1, max_seq_len] unless the
    # model already has exactly that static shape
    if not args.dynamic_shape and (
            model.inputs[0].partial_shape.is_dynamic
            or model.inputs[0].shape[1] != args.max_seq_len):
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(args.max_seq_len)])
        })

    # in dynamic-shape mode, let the sequence length vary from 0 up to max_seq_len
    if args.dynamic_shape:
        model.reshape({
            input_tensor:
            PartialShape([Dimension(1),
                          Dimension(0, args.max_seq_len)])
        })

    # load model to the device
    compiled_model = core.compile_model(model, args.device)
    output_tensor = compiled_model.outputs[0]
    infer_request = compiled_model.create_infer_request()
    log.info('The model {} is loaded to {}'.format(args.model, args.device))

    if args.input:

        def prompts():
            for prompt in args.input:
                log.info("Input prompt: {}".format(prompt))
                yield prompt
    else:

        def prompts():
            while True:
                yield input('Type input prompt (empty string to exit):')

    # loop over the user's or prepared prompts
    for prompt in prompts():
        if not prompt.strip():
            break

        # encode input
        tokens = tokenizer.encode_batch([prompt])[0].ids
        input_ids = np.array([tokens], dtype=np.int32)

        # maximum number of tokens that can be processed by the network at once
        max_length = args.max_seq_len

        eos_token_id = len(vocab) - 1

        cur_input_len = input_ids.shape[-1]

        # maximum number of tokens that will be generated
        max_sample_token_num = args.max_sample_token_num + cur_input_len

        t0 = time.perf_counter()
        t_count = 0

        while True:
            model_input = input_ids
            if not args.dynamic_shape:
                # pad the rest of the request
                pad_len = max_length - cur_input_len
                model_input = np.concatenate(
                    (input_ids, [[eos_token_id] * pad_len]), axis=-1)

            # create numpy inputs for OpenVINO runtime
            inputs = {
                input_tensor: model_input,
            }

            # infer by OpenVINO runtime
            t_start = time.perf_counter()
            outputs = infer_request.infer(inputs)[output_tensor]
            t_end = time.perf_counter()
            t_count += 1
            log.info(
                "Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)"
                .format(model_input.shape[1], 1 / (t_end - t_start),
                        t_end - t_start))

            next_token_logits = outputs[:, cur_input_len - 1, :]

            # pre-process distribution
            next_token_scores = process_logits(input_ids, next_token_logits,
                                               eos_token_id)
            if args.top_k > 0:
                next_token_scores = get_top_k_logits(next_token_scores,
                                                     args.top_k)

            if args.top_p < 1.0:
                next_token_scores = get_top_p_logits(next_token_scores,
                                                     args.top_p)

            # get next token id
            probs = softmax(next_token_scores)
            next_tokens = np.random.choice(probs.shape[-1],
                                           1,
                                           p=probs[0],
                                           replace=True)

            # update info for the next step
            input_ids = np.concatenate((input_ids, [next_tokens]), axis=-1)

            cur_input_len = input_ids.shape[-1]

            if stop_criteria(input_ids, min(max_length, max_sample_token_num),
                             eos_token_id):
                break

        t1 = time.perf_counter()

        text = tokenizer.decode_batch(input_ids)[0]

        log.info(
            "{} requests were processed in {:0.2f}sec ({:0.2}sec per request)".
            format(t_count, t1 - t0, (t1 - t0) / t_count))

        # print result
        log.info("GENERATED SEQUENCE: {}".format(text))