def main():
    args = build_argparser().parse_args()

    paragraphs = get_paragraphs(args.input)

    preprocessing_start_time = perf_counter()
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    context = '\n'.join(paragraphs)
    visualizer = Visualizer(context, args.colors)
    # encode context into token ids list
    c_tokens = text_to_tokens(context.lower(), vocab)
    total_latency = (perf_counter() - preprocessing_start_time) * 1e3

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams,
                                        args.num_threads)
        model_adapter = OpenvinoAdapter(
            create_core(),
            args.model,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests)
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    config = {
        'vocab': vocab,
        'input_names': args.input_names,
        'output_names': args.output_names,
        'max_answer_token_num': args.max_answer_token_num,
        'squad_ver': args.model_squad_ver
    }
    model = BertQuestionAnswering(model_adapter, config)
    if args.reshape:
        # round (context length + max question length) up to the closest multiple of 64;
        # reshape only if the result is smaller than the current network's sequence length
        new_length = min(
            model.max_length,
            int(
                np.ceil(
                    (len(c_tokens[0]) + args.max_question_token_num) / 64) *
                64))
        if new_length < model.max_length:
            try:
                model.reshape(new_length)
            except RuntimeError:
                log.error(
                    "Failed to reshape the network, please retry the demo without '-r' option"
                )
                sys.exit(-1)
        else:
            log.debug(
                "\tSkipping network reshaping,"
                " as (context length + max question length) exceeds the current (input) network sequence length"
            )
    model.log_layers_info()

    pipeline = AsyncPipeline(model)

    if args.questions:

        def questions():
            for question in args.questions:
                log.info("\n\tQuestion: {}".format(question))
                yield question
    else:

        def questions():
            while True:
                yield input('\n\tType a question (empty string to exit): ')

    for question in questions():
        if not question.strip():
            break

        answers = []
        next_window_id = 0
        next_window_id_to_show = 0
        start_time = perf_counter()
        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)
        source = ContextSource(q_tokens_id, c_tokens, model.max_length)
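        # Overlap inference and result collection: context windows are submitted
        # to the async pipeline while finished windows are consumed in submission order.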

        while True:
            if pipeline.callback_exceptions:
                raise pipeline.callback_exceptions[0]
            results = pipeline.get_result(next_window_id_to_show)
            if results:
                next_window_id_to_show += 1
                update_answers_list(answers, results[0])
                continue

            if pipeline.is_ready():
                if source.is_over():
                    break
                pipeline.submit_data(source.get_data(), next_window_id, None)
                next_window_id += 1
            else:
                pipeline.await_any()

        pipeline.await_all()
        for window_id in range(next_window_id_to_show, next_window_id):
            results = pipeline.get_result(window_id)
            while results is None:
                results = pipeline.get_result(window_id)
            update_answers_list(answers, results[0])

        visualizer.show_answers(answers)
        total_latency += (perf_counter() - start_time) * 1e3

    log.info("Metrics report:")
    log.info("\tLatency: {:.1f} ms".format(total_latency))
Example #2
def main():
    args = build_argparser().parse_args()

    paragraphs = get_paragraphs(args.input)

    preprocessing_start_time = perf_counter()
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(args.vocab, len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    context = '\n'.join(paragraphs)
    sentences = re.split(sentence_splitter, context)
    preprocessed_sentences = [text_to_tokens(sentence, vocab) for sentence in sentences]
    # +2 accounts for the [CLS] and [SEP] special tokens added around each sentence
    max_sentence_length = max(len(tokens) + 2 for tokens, _ in preprocessed_sentences)
    preprocessing_total_time = (perf_counter() - preprocessing_start_time) * 1e3
    source = tuple(zip(sentences, preprocessed_sentences))

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams, args.num_threads)
        model_adapter = OpenvinoAdapter(create_core(), args.model, device=args.device, plugin_config=plugin_config,
                                        max_num_requests=args.num_infer_requests, model_parameters={'input_layouts': args.layout})
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    enable_padding = not args.dynamic_shape
    model = BertNamedEntityRecognition(model_adapter, {'vocab': vocab, 'input_names': args.input_names, 'enable_padding': enable_padding})
    if max_sentence_length > model.max_length:
        model.reshape(max_sentence_length if enable_padding else (1, max_sentence_length))
    model.log_layers_info()

    pipeline = AsyncPipeline(model)

    next_sentence_id = 0
    next_sentence_id_to_show = 0
    start_time = perf_counter()
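    # Feed sentences to the async pipeline whenever a request is free and print
    # finished results in submission order as soon as they become available.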

    while True:
        if pipeline.callback_exceptions:
            raise pipeline.callback_exceptions[0]
        results = pipeline.get_result(next_sentence_id_to_show)
        if results:
            (score, filtered_labels_id), meta = results
            next_sentence_id_to_show += 1
            print_raw_results(score, filtered_labels_id, meta)
            continue

        if pipeline.is_ready():
            if next_sentence_id == len(source):
                break
            sentence, (c_tokens_id, c_token_s_e) = source[next_sentence_id]
            pipeline.submit_data(c_tokens_id, next_sentence_id, {'sentence': sentence, 'c_token_s_e': c_token_s_e})
            next_sentence_id += 1
        else:
            pipeline.await_any()

    pipeline.await_all()
    if pipeline.callback_exceptions:
        raise pipeline.callback_exceptions[0]
    for sentence_id in range(next_sentence_id_to_show, next_sentence_id):
        results = pipeline.get_result(sentence_id)
        (score, filtered_labels_id), meta = results
        print_raw_results(score, filtered_labels_id, meta)

    total_latency = (perf_counter() - start_time) * 1e3 + preprocessing_total_time
    log.info("Metrics report:")
    log.info("\tLatency: {:.1f} ms".format(total_latency))
Example #3
def main():
    args = build_argparser().parse_args()

    paragraphs = get_paragraphs(args.input)

    vocab_start_time = perf_counter()
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))
    visualizer = Visualizer(args.colors)
    total_latency = (perf_counter() - vocab_start_time) * 1e3

    ie = create_core()
    plugin_config = get_user_config(args.device, args.num_streams,
                                    args.num_threads)
    model_emb_adapter = OpenvinoAdapter(
        ie,
        args.model_emb,
        device=args.device,
        plugin_config=plugin_config,
        max_num_requests=args.num_infer_requests)
    model_emb = BertEmbedding(model_emb_adapter, {
        'vocab': vocab,
        'input_names': args.input_names_emb
    })
    model_emb.log_layers_info()

    # reshape BertEmbedding model to infer short questions and long contexts
    max_len_context = 384
    max_len_question = 32
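    # The first pass reshapes the model to the question length and loads it as a
    # synchronous network for question embeddings; the second pass reshapes it to
    # the context length and wraps it in an async pipeline for context embeddings.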

    for new_length in [max_len_question, max_len_context]:
        model_emb.reshape(new_length)
        if new_length == max_len_question:
            emb_exec_net = ie.load_network(model_emb_adapter.net, args.device)
        else:
            emb_pipeline = AsyncPipeline(model_emb)

    if args.model_qa:
        model_qa_adapter = OpenvinoAdapter(
            ie,
            args.model_qa,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests)
        config = {
            'vocab': vocab,
            'input_names': args.input_names_qa,
            'output_names': args.output_names_qa,
            'max_answer_token_num': args.max_answer_token_num,
            'squad_ver': args.model_qa_squad_ver
        }
        model_qa = BertQuestionAnswering(model_qa_adapter, config)
        model_qa.log_layers_info()
        qa_pipeline = AsyncPipeline(model_qa)

    log.info("\t\tStage 1    (Calc embeddings for the context)")
    contexts_all = []
    start_time = perf_counter()

    # get context as string and then encode it into token id list
    # calculate number of tokens for context in each request.
    # reserve 3 positions for special tokens [CLS] q_tokens [SEP] c_tokens [SEP]
    if args.model_qa:
        # so the context can be passed to model_qa together with the question
        c_window_len = model_qa.max_length - (max_len_question + 3)
    else:
        # so the context can be passed to model_emb without a question
        c_window_len = max_len_context - 2

    def calc_question_embedding(tokens_id):
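        # truncate the question to the question-sized input (minus the two special
        # tokens) and run it through the synchronously loaded embedding network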
        num = min(max_len_question - 2, len(tokens_id))
        inputs, _ = model_emb.preprocess((tokens_id[:num], max_len_question))
        raw_result = emb_exec_net.infer(inputs)
        return model_emb.postprocess(raw_result, None)

    source = ContextSource(paragraphs, vocab, c_window_len)
    next_window_id = 0
    next_window_id_to_show = 0
    contexts_all = []
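    # Stage 1 loop: embed every context window asynchronously and attach the
    # resulting embedding to its window metadata.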

    while True:
        if emb_pipeline.callback_exceptions:
            raise emb_pipeline.callback_exceptions[0]
        results = emb_pipeline.get_result(next_window_id_to_show)
        if results:
            embedding, meta = results
            meta['c_data'].emb = embedding
            contexts_all.append(meta['c_data'])
            next_window_id_to_show += 1
            continue

        if emb_pipeline.is_ready():
            if source.is_over():
                break
            c_data = source.get_data()
            num = min(max_len_context - 2, len(c_data.c_tokens_id))
            emb_pipeline.submit_data(
                (c_data.c_tokens_id[:num], max_len_context), next_window_id,
                {'c_data': c_data})
            next_window_id += 1
        else:
            emb_pipeline.await_any()

    emb_pipeline.await_all()
    for window_id in range(next_window_id_to_show, next_window_id):
        results = emb_pipeline.get_result(window_id)
        while results is None:
            results = emb_pipeline.get_result(window_id)
        embedding, meta = results
        meta['c_data'].emb = embedding
        contexts_all.append(meta['c_data'])
        next_window_id_to_show += 1

    total_latency += (perf_counter() - start_time) * 1e3
    context_embeddings_time = total_latency

    if args.questions:

        def questions():
            for question in args.questions:
                log.info("\n\tQuestion: {}".format(question))
                yield question
    else:

        def questions():
            while True:
                yield input('\n\tType a question (empty string to exit): ')

    for question in questions():
        if not question.strip():
            break

        start_time = perf_counter()
        log.info(
            "\t\tStage 2    (Calc question embedding and compare with {} context embeddings)"
            .format(len(contexts_all)))
        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)
        q_emb = calc_question_embedding(q_tokens_id)
        distances = [(np.linalg.norm(context.emb - q_emb, 2), context)
                     for context in contexts_all]
        distances.sort(key=lambda x: x[0])
        keep_num = min(args.best_n, len(distances))
        distances_filtered = distances[:keep_num]

        log.info(
            "The closest {} contexts to question filtered from {} context embeddings:"
            .format(keep_num, len(distances)))
        visualizer.show_closest_contexts(distances_filtered)

        if args.model_qa:
            answers = []
            next_context_id = 0
            next_context_id_to_show = 0

            while True:
                if qa_pipeline.callback_exceptions:
                    raise qa_pipeline.callback_exceptions[0]
                results = qa_pipeline.get_result(next_context_id_to_show)
                if results:
                    next_context_id_to_show += 1
                    output, meta = results
                    update_answers_list(answers, output, meta['c_data'])
                    continue

                if qa_pipeline.is_ready():
                    if next_context_id == len(distances_filtered):
                        break
                    _, c_data = distances_filtered[next_context_id]
                    qa_pipeline.submit_data((c_data, q_tokens_id),
                                            next_context_id,
                                            {'c_data': c_data})
                    next_context_id += 1
                else:
                    qa_pipeline.await_any()

            qa_pipeline.await_all()
            for context_id in range(next_context_id_to_show, next_context_id):
                results = qa_pipeline.get_result(context_id)
                while results is None:
                    results = qa_pipeline.get_result(context_id)
                output, meta = results
                update_answers_list(answers, output, meta['c_data'])

            log.info(
                "\t\tStage 3    (Show top 3 answers from {} closest contexts of Stage 1)"
                .format(len(answers)))
            answers = sorted(answers, key=lambda x: -x[0])[:3]
            visualizer.show_answers(answers)

        total_latency += (perf_counter() - start_time) * 1e3

    log.info("Metrics report:")
    log.info("\tContext embeddings latency (stage 1): {:.1f} ms".format(
        context_embeddings_time))
    log.info("\tLatency (all stages): {:.1f} ms".format(total_latency))