Example #1
def main():
    args = build_argparser().parse_args()

    paragraphs = get_paragraphs(args.input)

    preprocessing_start_time = perf_counter()
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(args.vocab, len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    context = '\n'.join(paragraphs)
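    # split the context into sentences and tokenize each one; the +2 leaves room for the special tokens added to every sequence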
    sentences = re.split(sentence_splitter, context)
    preprocessed_sentences = [text_to_tokens(sentence, vocab) for sentence in sentences]
    max_sentence_length = max([len(tokens) + 2 for tokens, _ in preprocessed_sentences])
    preprocessing_total_time = (perf_counter() - preprocessing_start_time) * 1e3
    source = tuple(zip(sentences, preprocessed_sentences))

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams, args.num_threads)
        model_adapter = OpenvinoAdapter(create_core(), args.model, device=args.device, plugin_config=plugin_config,
                                        max_num_requests=args.num_infer_requests, model_parameters={'input_layouts': args.layout})
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    enable_padding = not args.dynamic_shape
    model = BertNamedEntityRecognition(model_adapter, {'vocab': vocab, 'input_names': args.input_names, 'enable_padding': enable_padding})
    if max_sentence_length > model.max_length:
        model.reshape(max_sentence_length if enable_padding else (1, max_sentence_length))
    model.log_layers_info()

    pipeline = AsyncPipeline(model)

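    # submit sentences to the pipeline asynchronously and print the results in submission order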
    next_sentence_id = 0
    next_sentence_id_to_show = 0
    start_time = perf_counter()

    while True:
        if pipeline.callback_exceptions:
            raise pipeline.callback_exceptions[0]
        results = pipeline.get_result(next_sentence_id_to_show)
        if results:
            (score, filtered_labels_id), meta = results
            next_sentence_id_to_show += 1
            print_raw_results(score, filtered_labels_id, meta)
            continue

        if pipeline.is_ready():
            if next_sentence_id == len(source):
                break
            sentence, (c_tokens_id, c_token_s_e) = source[next_sentence_id]
            pipeline.submit_data(c_tokens_id, next_sentence_id, {'sentence': sentence, 'c_token_s_e': c_token_s_e})
            next_sentence_id += 1
        else:
            pipeline.await_any()

    pipeline.await_all()
    if pipeline.callback_exceptions:
        raise pipeline.callback_exceptions[0]
    for sentence_id in range(next_sentence_id_to_show, next_sentence_id):
        results = pipeline.get_result(sentence_id)
        (score, filtered_labels_id), meta = results
        print_raw_results(score, filtered_labels_id, meta)

    total_latency = (perf_counter() - start_time) * 1e3 + preprocessing_total_time
    log.info("Metrics report:")
    log.info("\tLatency: {:.1f} ms".format(total_latency))
Example #2
def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    if args.colors:
        COLOR_RED = "\033[91m"
        COLOR_RESET = "\033[0m"
    else:
        COLOR_RED = ""
        COLOR_RESET = ""

    # load vocabulary file for model
    log.info("Loading vocab file:\t{}".format(args.vocab))
    vocab = load_vocab_file(args.vocab)
    log.info("{} tokens loaded".format(len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    paragraphs = get_paragraphs(args.input)
    context = '\n'.join(paragraphs)
    log.info("Size: {} chars".format(len(context)))
    log.info("Context: " + COLOR_RED + context + COLOR_RESET)
    # encode context into token ids list
    c_tokens_id, c_tokens_se = text_to_tokens(context.lower(), vocab)

    log.info("Initializing Inference Engine")
    ie = IECore()
    version = ie.get_versions(args.device)[args.device]
    version_str = "{}.{}.{}".format(version.major, version.minor, version.build_number)
    log.info("Plugin version is {}".format(version_str))

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
    ie_encoder = ie.read_network(model=model_xml, weights=model_bin)

    if args.reshape:
        # reshape the sequence length to the context + maximum question length (in tokens)
        first_input_layer = next(iter(ie_encoder.input_info))
        c = ie_encoder.input_info[first_input_layer].input_data.shape[1]
        # find the closest multiple of 64; if it is smaller than the current network's sequence length, use that
        seq = min(c, int(np.ceil((len(c_tokens_id) + args.max_question_token_num) / 64) * 64))
        if seq < c:
            new_shapes = {}
            for input_name, input_info in ie_encoder.input_info.items():
                n, c = input_info.input_data.shape
                new_shapes[input_name] = [n, seq]
                log.info("Reshaped input {} from {} to the {}".format(
                    input_name, input_info.input_data.shape, new_shapes[input_name]))
            log.info("Attempting to reshape the network to the modified inputs...")
            try:
                ie_encoder.reshape(new_shapes)
                log.info("Successful!")
            except RuntimeError:
                log.error("Failed to reshape the network, please retry the demo without '-r' option")
                sys.exit(-1)
        else:
            log.info("Skipping network reshaping,"
                     " as (context length + max question length) exceeds the current (input) network sequence length")

    # check input and output names
    input_names = [i.strip() for i in args.input_names.split(',')]
    output_names = [o.strip() for o in args.output_names.split(',')]
    if ie_encoder.input_info.keys() != set(input_names) or ie_encoder.outputs.keys() != set(output_names):
        log.error("Input or Output names do not match")
        log.error("    The demo expects input->output names: {}->{}. "
                  "Please use the --input_names and --output_names to specify the right names "
                  "(see actual values below)".format(input_names, output_names))
        log.error("    Actual network input->output names: {}->{}".format(list(ie_encoder.input_info.keys()),
                                                                          list(ie_encoder.outputs.keys())))
        raise Exception("Unexpected network input or output names")

    # load model to the device
    log.info("Loading model to the {}".format(args.device))
    ie_encoder_exec = ie.load_network(network=ie_encoder, device_name=args.device)

    if args.questions:
        def questions():
            for question in args.questions:
                log.info("Question: {}".format(question))
                yield question
    else:
        def questions():
            while True:
                yield input('Type question (empty string to exit):')

    # loop on user's or prepared questions
    for question in questions():
        if not question.strip():
            break

        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_encoder.input_info[input_names[0]].input_data.shape[1]

        # calculate number of tokens for context in each inference request.
        # reserve 3 positions for special tokens
        # [CLS] q_tokens [SEP] c_tokens [SEP]
        c_wnd_len = max_length - (len(q_tokens_id) + 3)

        # number of tokens between two neighbouring context windows
        # a stride of half the window length means adjacent windows overlap by half
        c_stride = c_wnd_len // 2

        t0 = time.perf_counter()
        t_count = 0

        # array of answers from each window
        answers = []

        # init a window to iterate over context
        c_s, c_e = 0, min(c_wnd_len, len(c_tokens_id))

        # iterate while context window is not empty
        while c_e > c_s:
            # form the request
            tok_cls = vocab['[CLS]']
            tok_sep = vocab['[SEP]']
            input_ids = [tok_cls] + q_tokens_id + [tok_sep] + c_tokens_id[c_s:c_e] + [tok_sep]
            token_type_ids = [0] + [0] * len(q_tokens_id) + [0] + [1] * (c_e - c_s) + [0]
            attention_mask = [1] * len(input_ids)

            # pad the rest of the request
            pad_len = max_length - len(input_ids)
            input_ids += [0] * pad_len
            token_type_ids += [0] * pad_len
            attention_mask += [0] * pad_len

            # create numpy inputs for IE
            inputs = {
                input_names[0]: np.array([input_ids], dtype=np.int32),
                input_names[1]: np.array([attention_mask], dtype=np.int32),
                input_names[2]: np.array([token_type_ids], dtype=np.int32),
            }
            if len(input_names) > 3:
                inputs[input_names[3]] = np.arange(len(input_ids), dtype=np.int32)[None, :]

            t_start = time.perf_counter()
            # infer by IE
            res = ie_encoder_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                max_length,
                1 / (t_end - t_start),
                t_end - t_start
            ))

            # get start-end scores for context
            def get_score(name):
                out = np.exp(res[name].reshape((max_length,)))
                return out / out.sum(axis=-1)

            score_s = get_score(output_names[0])
            score_e = get_score(output_names[1])

            # get 'no-answer' score (not valid if model has been fine-tuned on squad1.x)
            if args.model_squad_ver.split('.')[0] == '1':
                score_na = 0
            else:
                score_na = score_s[0] * score_e[0]

            # find product of all start-end combinations to find the best one
            c_s_idx = len(q_tokens_id) + 2  # index of first context token in tensor
            c_e_idx = max_length - (1 + pad_len)  # index of last+1 context token in tensor
            score_mat = np.matmul(
                score_s[c_s_idx:c_e_idx].reshape((c_e - c_s, 1)),
                score_e[c_s_idx:c_e_idx].reshape((1, c_e - c_s))
            )
            # reset candidates with end before start
            score_mat = np.triu(score_mat)
            # reset long candidates (>max_answer_token_num)
            score_mat = np.tril(score_mat, args.max_answer_token_num - 1)
            # find the best start-end pair
            max_s, max_e = divmod(score_mat.flatten().argmax(), score_mat.shape[1])
            max_score = score_mat[max_s, max_e] * (1 - score_na)

            # convert to context text start-end index
            max_s = c_tokens_se[c_s + max_s][0]
            max_e = c_tokens_se[c_s + max_e][1]

            # check that answers list does not have duplicates (because of context windows overlapping)
            same = [i for i, a in enumerate(answers) if a[1] == max_s and a[2] == max_e]
            if same:
                assert len(same) == 1
                # update existing answer record
                a = answers[same[0]]
                answers[same[0]] = (max(max_score, a[0]), max_s, max_e)
            else:
                # add new record
                answers.append((max_score, max_s, max_e))

            # check that context window reached the end
            if c_e == len(c_tokens_id):
                break

            # move to next window position
            c_s = min(c_s + c_stride, len(c_tokens_id))
            c_e = min(c_s + c_wnd_len, len(c_tokens_id))

        t1 = time.perf_counter()
        log.info("The performance below is reported only for reference purposes, "
                 "please use the benchmark_app tool (part of the OpenVINO samples) for any actual measurements.")
        log.info("{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)".format(
            t_count,
            max_length,
            t1 - t0,
            (t1 - t0) / t_count
        ))

        # print top 3 results
        answers = sorted(answers, key=lambda x: -x[0])
        for score, s, e in answers[:3]:
            log.info("---answer: {:0.2f} {}".format(score, context[s:e]))
            c_s, c_e = find_sentence_range(context, s, e)
            log.info("   " + context[c_s:s] + COLOR_RED + context[s:e] + COLOR_RESET + context[e:c_e])
Example #3
def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    # load vocabulary file for model
    log.info("Loading vocab file:\t{}".format(args.vocab))
    vocab = load_vocab_file(args.vocab)
    log.info("{} tokens loaded".format(len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    paragraphs = get_paragraphs(args.input)
    context = '\n'.join(paragraphs)
    log.info("Size: {} chars".format(len(context)))
    sentences = re.split(sentence_splitter, context)
    preprocessed_sentences = [text_to_tokens(sentence, vocab) for sentence in sentences]
    max_sent_length = max([len(tokens) + 2 for tokens, _ in preprocessed_sentences])

    log.info("Initializing Inference Engine")
    ie = IECore()
    version = ie.get_versions(args.device)[args.device]
    version_str = "{}.{}.{}".format(version.major, version.minor, version.build_number)
    log.info("Plugin version is {}".format(version_str))

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
    ie_encoder = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    input_names = [i.strip() for i in args.input_names.split(',')]
    if ie_encoder.input_info.keys() != set(input_names):
        log.error("Input names do not match")
        log.error("    The demo expects input names: {}. "
                  "Please use the --input_names to specify the right names "
                  "(see actual values below)".format(input_names))
        log.error("    Actual network input names: {}".format(list(ie_encoder.input_info.keys())))
        raise Exception("Unexpected network input names")
    if len(ie_encoder.outputs) != 1:
        log.error('The demo expects a model with a single output, but {} outputs were provided'.format(len(ie_encoder.outputs)))
        raise Exception('Unexpected number of outputs')
    output_names = list(ie_encoder.outputs)
    max_length = ie_encoder.input_info[input_names[0]].input_data.shape[1]
    if max_sent_length > max_length:
        input_shapes = {
            input_names[0]: [1, max_sent_length],
            input_names[1]: [1, max_sent_length],
            input_names[2]: [1, max_sent_length]
        }
        ie_encoder.reshape(input_shapes)
        max_length = max_sent_length
    # load model to the device
    log.info("Loading model to the {}".format(args.device))
    ie_encoder_exec = ie.load_network(network=ie_encoder, device_name=args.device)
    # maximum number of tokens that can be processed by network at once
    t0 = time.perf_counter()
    t_count = 0

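    # softmax over the per-token tag scores of the single model output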
    def get_score(name):
        out = np.exp(res[name][0])
        return out / out.sum(axis=-1, keepdims=True)

    for sentence, (c_tokens_id, c_token_s_e) in zip(sentences, preprocessed_sentences):
        # form the request
        tok_cls = vocab['[CLS]']
        tok_sep = vocab['[SEP]']
        input_ids = [tok_cls] + c_tokens_id + [tok_sep]
        token_type_ids = [0] * len(input_ids)
        attention_mask = [1] * len(input_ids)

        # pad the rest of the request
        pad_len = max_length - len(input_ids)
        input_ids += [0] * pad_len
        token_type_ids += [0] * pad_len
        attention_mask += [0] * pad_len

        # create numpy inputs for IE
        inputs = {
            input_names[0]: np.array([input_ids], dtype=np.int32),
            input_names[1]: np.array([attention_mask], dtype=np.int32),
            input_names[2]: np.array([token_type_ids], dtype=np.int32),
        }
        if len(input_names) > 3:
            inputs[input_names[3]] = np.arange(len(input_ids), dtype=np.int32)[None, :]

        t_start = time.perf_counter()
        # infer by IE
        res = ie_encoder_exec.infer(inputs=inputs)
        t_end = time.perf_counter()
        t_count += 1
        log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
            max_length,
            1 / (t_end - t_start),
            t_end - t_start
        ))


        score = get_score(output_names[0])
        labels_idx = score.argmax(-1)
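        # keep positions with a non-zero predicted tag, skipping the leading [CLS] token and padded positions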
        filtered_labels_idx = [
            (idx, label_idx)
            for idx, label_idx in enumerate(labels_idx)
            if label_idx != 0 and 0 < idx < max_length - pad_len
        ]

        if not filtered_labels_idx:
            continue

        log.info('Sentence: \n\t{}'.format(sentence))
        visualized = set()
        for idx, label_idx in filtered_labels_idx:
            word_s, word_e = c_token_s_e[idx - 1]
            if (word_s, word_e) in visualized:
                continue
            visualized.add((word_s, word_e))
            word = sentence[word_s:word_e]
            log.info('\n\tWord: {}\n\tConfidence: {}\n\tTag: {}'.format(word, score[idx][label_idx], label_to_tag[label_idx]))

    t1 = time.perf_counter()
    log.info("The performance below is reported only for reference purposes, "
            "please use the benchmark_app tool (part of the OpenVINO samples) for any actual measurements.")
    log.info("{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)".format(
        t_count,
        max_length,
        t1 - t0,
        (t1 - t0) / t_count
    ))
Example #4
def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    log.info("Creating Inference Engine")
    ie = IECore()

    #read model to calculate embedding
    model_xml_emb = args.model_emb
    model_bin_emb = model_xml_emb.with_suffix(".bin")

    log.info("Loading embedding network files:\n\t{}\n\t{}".format(model_xml_emb, model_bin_emb))
    ie_encoder_emb = ie.read_network(model=model_xml_emb, weights=model_bin_emb)
    input_names_model_emb = list(ie_encoder_emb.input_info.keys())
    input_names_emb = args.input_names_emb.split(',')
    log.info("Expected embedding input names: {}".format(input_names_emb))
    log.info("Network embedding input names: {}".format(input_names_model_emb))
    # check input names
    if set(input_names_model_emb) != set(input_names_emb):
        log.error("Unexpected embedding network input names")
        raise Exception("Unexpected embedding network input names")

    # check outputs
    output_names_model_emb = list(ie_encoder_emb.outputs.keys())
    if len(output_names_model_emb) > 1:
        log.error("Expected only a single output in the embedding network, but {} outputs were detected".format(len(output_names_model_emb)))
        raise Exception("Unexpected number of embedding network outputs")


    #reshape embedding model to infer short questions and long contexts
    ie_encoder_exec_emb_dict = {}
    max_length_c = 384
    max_length_q = 32

    for length in [max_length_q, max_length_c]:
        new_shapes = {}
        for i, input_info in ie_encoder_emb.input_info.items():
            new_shapes[i] = [1, length]
            log.info("Reshaped input {} from {} to the {}".format(
                i,
                input_info.input_data.shape,
                new_shapes[i]))
        log.info("Attempting to reshape the context embedding network to the modified inputs...")

        try:
            ie_encoder_emb.reshape(new_shapes)
            log.info("Successful!")
        except RuntimeError:
            log.error("Failed to reshape the embedding network")
            raise

        # Loading model to the plugin
        log.info("Loading model to the plugin")
        ie_encoder_exec_emb_dict[length] = ie.load_network(network=ie_encoder_emb, device_name=args.device)

    # Read model for final exact qa
    if args.model_qa:
        model_xml = args.model_qa
        model_bin = model_xml.with_suffix(".bin")
        log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))

        ie_encoder_qa = ie.read_network(model=model_xml, weights=model_bin)
        ie_encoder_qa.batch_size = 1

        input_names_qa = args.input_names_qa.split(',')
        output_names_qa = args.output_names_qa.split(',')
        log.info("Expected input->output names: {}->{}".format(input_names_qa, output_names_qa))

        #check input and output names
        input_names_model_qa = list(ie_encoder_qa.input_info.keys())
        output_names_model_qa = list(ie_encoder_qa.outputs.keys())
        log.info("Network input->output names: {}->{}".format(input_names_model_qa, output_names_model_qa))
        if set(input_names_model_qa) != set(input_names_qa) or set(output_names_model_qa) != set(output_names_qa):
            log.error("Unexpected network input or output names")
            raise Exception("Unexpected network input or output names")

        # Loading model to the plugin
        log.info("Loading model to the plugin")
        ie_encoder_qa_exec = ie.load_network(network=ie_encoder_qa, device_name=args.device)

        max_length_qc = ie_encoder_qa.input_info[input_names_qa[0]].input_data.shape[1]

    #load vocabulary file for all models
    log.info("Loading vocab file:\t{}".format(args.vocab))
    vocab = load_vocab_file(args.vocab)
    log.info("{} tokens loaded".format(len(vocab)))

    #define function to infer embedding
    def calc_emb(tokens_id, max_length):
        num = min(max_length - 2, len(tokens_id))

        # forms the request
        pad_len = max_length - num - 2
        tok_cls = [vocab['[CLS]']]
        tok_sep = [vocab['[SEP]']]
        tok_pad = [vocab['[PAD]']]

        dtype = np.int32
        inputs = {
            input_names_emb[0]: np.array([tok_cls + tokens_id[:num] + tok_sep + tok_pad * pad_len], dtype=dtype),
            input_names_emb[1]: np.array([[1]     + [1] * num       + [1]     + [0]     * pad_len], dtype=dtype),
            input_names_emb[2]: np.array([[0]     + [0] * num       + [0]     + tok_pad * pad_len], dtype=dtype),
            input_names_emb[3]: np.arange(max_length, dtype=dtype)[None, :]
        }

        # calc embedding
        ie_encoder_exec_emb = ie_encoder_exec_emb_dict[max_length]

        t_start = time.perf_counter()
        res = ie_encoder_exec_emb.infer(inputs=inputs)
        t_end = time.perf_counter()
        log.info("embedding calculated for sequence of length {} with {:0.2f} requests/sec ({:0.2} sec per request)".format(
            max_length,
            1 / (t_end - t_start),
            t_end - t_start
        ))


        res = res[output_names_model_emb[0]]
        return res.squeeze(0)

    #small class to store context as text and tokens and its embedding vector
    class ContextData:
        def __init__(self, context, c_tokens_id, c_tokens_se):
            self.context = context
            self.c_tokens_id = c_tokens_id
            self.c_tokens_se = c_tokens_se
            self.c_emb = calc_emb(self.c_tokens_id, max_length_c)

    paragraphs = get_paragraphs(args.input)
    contexts_all = []

    log.info("Indexing {} paragraphs...".format(len(paragraphs)))
    for par in paragraphs:
        c_tokens_id, c_tokens_se = text_to_tokens(par.lower(), vocab)
        if not c_tokens_id:
            continue

        # get context as string and then encode it into token id list
        # calculate number of tokens for context in each request.
        # reserve 3 positions for special tokens
        # [CLS] q_tokens [SEP] c_tokens [SEP]
        if args.model_qa:
            # so the context can be passed to model_qa together with the question
            c_wnd_len = max_length_qc - (max_length_q + 3)
        else:
            # so the context can be passed to model_emb without a question
            c_wnd_len = max_length_c - 2

        # number of tokens between two neighbouring context windows
        # a stride of half the window length means adjacent windows overlap by half
        c_stride = c_wnd_len // 2

        # init scan window
        c_s, c_e = 0, min(c_wnd_len, len(c_tokens_id))

        # iterate while context window is not empty
        while c_e > c_s:
            contexts_all.append(ContextData(par, c_tokens_id[c_s:c_e], c_tokens_se[c_s:c_e]))

            # check that the context window reached the end
            if c_e == len(c_tokens_id):
                break

            # move to next window position
            c_s, c_e = c_s + c_stride, c_e + c_stride

            shift_left = max(0, c_e - len(c_tokens_id))
            c_s, c_e = c_s - shift_left, c_e - shift_left
            assert c_s >= 0, "the window start can become negative only when the window is longer than the context, which cannot happen here"

    if args.questions:
        def questions():
            for question in args.questions:
                log.info("Question: {}".format(question))
                yield question
    else:
        def questions():
            while True:
                yield input('Type question (empty string to exit):')

    # loop on user's or prepared questions
    for question in questions():
        if not question.strip():
            break

        log.info("---Stage 1---Calc question embedding and compare with {} context embeddings".format(len(contexts_all)))
        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)

        q_emb = calc_emb(q_tokens_id, max_length_q)
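        # rank the indexed context windows by the L2 distance between their embeddings and the question embedding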
        distances = [(np.linalg.norm(c.c_emb - q_emb, 2), c) for c in contexts_all]
        distances.sort(key=lambda x: x[0])
        keep_num = min(args.best_n, len(distances))
        distances_filtered = distances[:keep_num]

        #print short list
        print("The closest contexts to question:")
        for i, (dist, c_data) in enumerate(distances_filtered):
            print("#{}: embedding distance {} for context '{}'".format(i + 1, dist, c_data.context))

        #run model_qa if available to find exact answer to question in filtered in contexts
        if args.model_qa:

            log.info("---Stage 2---Looking for exact answers in {} contexts filtered in from {}".format(keep_num, len(distances)))
            # array of answers from each context_data
            answers = []

            for dist, c_data in distances_filtered:
                #forms the request
                tok_cls = [vocab['[CLS]']]
                tok_sep = [vocab['[SEP]']]
                tok_pad = [vocab['[PAD]']]
                req_len = len(q_tokens_id) + len(c_data.c_tokens_id) + 3
                pad_len = max_length_qc - req_len
                assert pad_len >= 0

                input_ids = tok_cls + q_tokens_id + tok_sep + c_data.c_tokens_id + tok_sep + tok_pad*pad_len
                token_type_ids = [0] * (len(q_tokens_id)+2) + [1] * (len(c_data.c_tokens_id)+1) + tok_pad * pad_len
                attention_mask = [1] * req_len + [0] * pad_len

                #create numpy inputs for IE
                inputs = {
                    input_names_qa[0]: np.array([input_ids], dtype=np.int32),
                    input_names_qa[1]: np.array([attention_mask], dtype=np.int32),
                    input_names_qa[2]: np.array([token_type_ids], dtype=np.int32),
                }
                if len(input_names_qa) > 3:
                    inputs[input_names_qa[3]] = np.arange(max_length_qc, dtype=np.int32)[None, :]

                #infer by IE
                t_start = time.perf_counter()
                res = ie_encoder_qa_exec.infer(inputs=inputs)
                t_end = time.perf_counter()
                log.info(
                    "Exact answer calculated for sequence of length {} with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                        max_length_qc,
                        1 / (t_end - t_start),
                        t_end - t_start
                    ))

                #get start-end scores for context
                def get_score(name):
                    out = np.exp(res[name].reshape((max_length_qc, )))
                    return out / out.sum(axis=-1)
                score_s = get_score(output_names_qa[0])
                score_e = get_score(output_names_qa[1])

                # find product of all start-end combinations to find the best one
                c_s_idx = len(q_tokens_id) + 2 # index of first context token in tensor
                c_e_idx = max_length_qc-(1+pad_len) # index of last+1 context token in tensor
                score_mat = np.matmul(
                    score_s[c_s_idx:c_e_idx].reshape((len(c_data.c_tokens_id), 1)),
                    score_e[c_s_idx:c_e_idx].reshape((1, len(c_data.c_tokens_id)))
                )
                # reset candidates with end before start
                score_mat = np.triu(score_mat)
                # reset long candidates (>max_answer_token_num)
                score_mat = np.tril(score_mat, args.max_answer_token_num - 1)
                # find the best start-end pair
                max_s, max_e = divmod(score_mat.flatten().argmax(), score_mat.shape[1])
                max_score = score_mat[max_s, max_e]

                # convert to context text start-end index
                max_s = c_data.c_tokens_se[max_s][0]
                max_e = c_data.c_tokens_se[max_e][1]

                # check that answers list does not have answer yet
                # it could be because of context windows overlapping
                same = [i for i, a in enumerate(answers) if a[1] == max_s and a[2]==max_e and a[3] is c_data.context]
                if same:
                    assert len(same) == 1
                    # update the existing answer record
                    a = answers[same[0]]
                    answers[same[0]] = (max(max_score, a[0]), max_s, max_e, c_data.context)
                else:
                    #add new record
                    answers.append((max_score, max_s, max_e, c_data.context))

            def mark(txt):
                return "\033[91m" + txt + "\033[0m" if args.colors else "*" + txt + "*"

            #print top 3 results
            answers.sort(key=lambda x: -x[0])
            log.info("---Stage 3---Find best 3 answers from {} results of Stage 1".format(len(answers)))
            for score, s, e, context in answers[:3]:
                print("Answer (score: {:0.2f}): {}".format(score, mark(context[s:e])))
                print(context[:s] + mark(context[s:e]) + context[e:])
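
The window scheduling used above (advance by half a window, then shift the last window left so it still ends at the final context token) can be exercised on its own; a minimal sketch with a hypothetical token count, not taken from the demo:

n_tokens = 10              # hypothetical context length in tokens
c_wnd_len = 4              # window length
c_stride = c_wnd_len // 2  # neighbouring windows overlap by half

windows = []
c_s, c_e = 0, min(c_wnd_len, n_tokens)
while c_e > c_s:
    windows.append((c_s, c_e))
    if c_e == n_tokens:    # the current window already reaches the end
        break
    c_s, c_e = c_s + c_stride, c_e + c_stride
    shift_left = max(0, c_e - n_tokens)
    c_s, c_e = c_s - shift_left, c_e - shift_left
print(windows)             # -> [(0, 4), (2, 6), (4, 8), (6, 10)]
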
Example #5
def main():
    args = build_argparser().parse_args()

    paragraphs = get_paragraphs(args.input)

    preprocessing_start_time = perf_counter()
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    context = '\n'.join(paragraphs)
    visualizer = Visualizer(context, args.colors)
    # encode context into token ids list
    c_tokens = text_to_tokens(context.lower(), vocab)
    total_latency = (perf_counter() - preprocessing_start_time) * 1e3

    if args.adapter == 'openvino':
        plugin_config = get_user_config(args.device, args.num_streams,
                                        args.num_threads)
        model_adapter = OpenvinoAdapter(
            create_core(),
            args.model,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests)
    elif args.adapter == 'ovms':
        model_adapter = OVMSAdapter(args.model)

    config = {
        'vocab': vocab,
        'input_names': args.input_names,
        'output_names': args.output_names,
        'max_answer_token_num': args.max_answer_token_num,
        'squad_ver': args.model_squad_ver
    }
    model = BertQuestionAnswering(model_adapter, config)
    if args.reshape:
        # find the closest multiple of 64; if it is smaller than the current network's sequence length, reshape to it
        new_length = min(
            model.max_length,
            int(
                np.ceil(
                    (len(c_tokens[0]) + args.max_question_token_num) / 64) *
                64))
        if new_length < model.max_length:
            try:
                model.reshape(new_length)
            except RuntimeError:
                log.error(
                    "Failed to reshape the network, please retry the demo without '-r' option"
                )
                sys.exit(-1)
        else:
            log.debug(
                "\tSkipping network reshaping,"
                " as (context length + max question length) exceeds the current (input) network sequence length"
            )
    model.log_layers_info()

    pipeline = AsyncPipeline(model)

    if args.questions:

        def questions():
            for question in args.questions:
                log.info("\n\tQuestion: {}".format(question))
                yield question
    else:

        def questions():
            while True:
                yield input('\n\tType a question (empty string to exit): ')

    for question in questions():
        if not question.strip():
            break

        answers = []
        next_window_id = 0
        next_window_id_to_show = 0
        start_time = perf_counter()
        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)
        source = ContextSource(q_tokens_id, c_tokens, model.max_length)

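        # keep the pipeline busy with new context windows while collecting finished results in submission order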
        while True:
            if pipeline.callback_exceptions:
                raise pipeline.callback_exceptions[0]
            results = pipeline.get_result(next_window_id_to_show)
            if results:
                next_window_id_to_show += 1
                update_answers_list(answers, results[0])
                continue

            if pipeline.is_ready():
                if source.is_over():
                    break
                pipeline.submit_data(source.get_data(), next_window_id, None)
                next_window_id += 1
            else:
                pipeline.await_any()

        pipeline.await_all()
        for window_id in range(next_window_id_to_show, next_window_id):
            results = pipeline.get_result(window_id)
            while results is None:
                results = pipeline.get_result(window_id)
            update_answers_list(answers, results[0])

        visualizer.show_answers(answers)
        total_latency += (perf_counter() - start_time) * 1e3

    log.info("Metrics report:")
    log.info("\tLatency: {:.1f} ms".format(total_latency))
Example #6
def setup(url):
    global vocab
    global ie_encoder
    global input_names
    global output_names
    global model
    global c_tokens_id
    global ie_encoder_exec
    global args
    global c_tokens_se
    global context
    global COLOR_RED
    global COLOR_RESET



    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    if args.colors:
        COLOR_RED = "\033[91m"
        COLOR_RESET = "\033[0m"
    else:
        COLOR_RED = ""
        COLOR_RESET = ""

    # load vocabulary file for model
    log.info("Loading vocab file:\t{}".format(args.vocab))

    vocab = load_vocab_file(args.vocab)
    log.info("{} tokens loaded".format(len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    p = url
    paragraphs = get_paragraphs([p])
    context = '\n'.join(paragraphs)
    log.info("Size: {} chars".format(len(context)))
    log.info("Context: " + COLOR_RED + context + COLOR_RESET)
    # encode context into token ids list
    c_tokens_id, c_tokens_se = text_to_tokens(context.lower(), vocab)

    log.info("Initializing Inference Engine")
    ie = IECore()
    version = ie.get_versions(args.device)[args.device]
    version_str = "{}.{}.{}".format(version.major, version.minor, version.build_number)
    log.info("Plugin version is {}".format(version_str))

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))

    ie_encoder = ie.read_network(model=model_xml, weights=model_bin)

    if args.reshape:
        # reshape the sequence length to the context + maximum question length (in tokens)
        first_input_layer = next(iter(ie_encoder.inputs))
        c = ie_encoder.inputs[first_input_layer].shape[1]
        # find the closest multiple of 64; if it is smaller than the current network's sequence length, use that
        seq = min(c, int(np.ceil((len(c_tokens_id) + args.max_question_token_num) / 64) * 64))
        if seq < c:
            input_info = list(ie_encoder.inputs)
            new_shapes = dict([])
            for i in input_info:
                n, c = ie_encoder.inputs[i].shape
                new_shapes[i] = [n, seq]
                log.info("Reshaped input {} from {} to the {}".format(i, ie_encoder.inputs[i].shape, new_shapes[i]))
            log.info("Attempting to reshape the network to the modified inputs...")
            try:
                ie_encoder.reshape(new_shapes)
                log.info("Successful!")
            except RuntimeError:
                log.error("Failed to reshape the network, please retry the demo without '-r' option")
                sys.exit(-1)
        else:
            log.info("Skipping network reshaping,"
                     " as (context length + max question length) exceeds the current (input) network sequence length")

    # check input and output names
    input_names = [i.strip() for i in args.input_names.split(',')]
    output_names = [o.strip() for o in args.output_names.split(',')]
    if ie_encoder.inputs.keys() != set(input_names) or ie_encoder.outputs.keys() != set(output_names):
        log.error("Input or Output names do not match")
        log.error("    The demo expects input->output names: {}->{}. "
                  "Please use the --input_names and --output_names to specify the right names "
                  "(see actual values below)".format(input_names, output_names))
        log.error("    Actual network input->output names: {}->{}".format(list(ie_encoder.inputs.keys()),
                                                                          list(ie_encoder.outputs.keys())))
        log.error("    Actual network input->output values: {}->{}".format(list(ie_encoder.inputs.values()),
                                                                          list(ie_encoder.outputs.values())))
        raise Exception("Unexpected network input or output names")


    # load model to the device
    log.info("Loading model to the {}".format(args.device))
    ie_encoder_exec = ie.load_network(network=ie_encoder, device_name=args.device)
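
The '-r' reshape branch above picks the new sequence length by rounding the expected request size up to the next multiple of 64 and reshaping only when that is shorter than the network's current sequence length; a minimal sketch with hypothetical sizes, not taken from the demo:

import numpy as np

current_seq_len = 384                          # hypothetical network sequence length
context_tokens, max_question_tokens = 100, 64  # hypothetical request size in tokens

seq = min(current_seq_len,
          int(np.ceil((context_tokens + max_question_tokens) / 64) * 64))
print(seq)  # -> 192, so the network would be reshaped; a rounded size >= 384 would skip the reshape
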
Example #7
def main():
    args = build_argparser().parse_args()

    paragraphs = get_paragraphs(args.input)

    vocab_start_time = perf_counter()
    vocab = load_vocab_file(args.vocab)
    log.debug("Loaded vocab file from {}, get {} tokens".format(
        args.vocab, len(vocab)))
    visualizer = Visualizer(args.colors)
    total_latency = (perf_counter() - vocab_start_time) * 1e3

    ie = create_core()
    plugin_config = get_user_config(args.device, args.num_streams,
                                    args.num_threads)
    model_emb_adapter = OpenvinoAdapter(
        ie,
        args.model_emb,
        device=args.device,
        plugin_config=plugin_config,
        max_num_requests=args.num_infer_requests)
    model_emb = BertEmbedding(model_emb_adapter, {
        'vocab': vocab,
        'input_names': args.input_names_emb
    })
    model_emb.log_layers_info()

    # reshape BertEmbedding model to infer short questions and long contexts
    max_len_context = 384
    max_len_question = 32

    for new_length in [max_len_question, max_len_context]:
        model_emb.reshape(new_length)
        if new_length == max_len_question:
            emb_exec_net = ie.load_network(model_emb_adapter.net, args.device)
        else:
            emb_pipeline = AsyncPipeline(model_emb)

    if args.model_qa:
        model_qa_adapter = OpenvinoAdapter(
            ie,
            args.model_qa,
            device=args.device,
            plugin_config=plugin_config,
            max_num_requests=args.num_infer_requests)
        config = {
            'vocab': vocab,
            'input_names': args.input_names_qa,
            'output_names': args.output_names_qa,
            'max_answer_token_num': args.max_answer_token_num,
            'squad_ver': args.model_qa_squad_ver
        }
        model_qa = BertQuestionAnswering(model_qa_adapter, config)
        model_qa.log_layers_info()
        qa_pipeline = AsyncPipeline(model_qa)

    log.info("\t\tStage 1    (Calc embeddings for the context)")
    contexts_all = []
    start_time = perf_counter()

    # get context as string and then encode it into token id list
    # calculate number of tokens for context in each request.
    # reserve 3 positions for special tokens [CLS] q_tokens [SEP] c_tokens [SEP]
    if args.model_qa:
        # to make context be able to pass model_qa together with question
        c_window_len = model_qa.max_length - (max_len_question + 3)
    else:
        # to make context be able to pass model_emb without question
        c_window_len = max_len_context - 2

    def calc_question_embedding(tokens_id):
        num = min(max_len_question - 2, len(tokens_id))
        inputs, _ = model_emb.preprocess((tokens_id[:num], max_len_question))
        raw_result = emb_exec_net.infer(inputs)
        return model_emb.postprocess(raw_result, None)

    source = ContextSource(paragraphs, vocab, c_window_len)
    next_window_id = 0
    next_window_id_to_show = 0
    contexts_all = []

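    # stage 1 loop: submit context windows for embedding and collect the finished embeddings in submission order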
    while True:
        if emb_pipeline.callback_exceptions:
            raise emb_pipeline.callback_exceptions[0]
        results = emb_pipeline.get_result(next_window_id_to_show)
        if results:
            embedding, meta = results
            meta['c_data'].emb = embedding
            contexts_all.append(meta['c_data'])
            next_window_id_to_show += 1
            continue

        if emb_pipeline.is_ready():
            if source.is_over():
                break
            c_data = source.get_data()
            num = min(max_len_context - 2, len(c_data.c_tokens_id))
            emb_pipeline.submit_data(
                (c_data.c_tokens_id[:num], max_len_context), next_window_id,
                {'c_data': c_data})
            next_window_id += 1
        else:
            emb_pipeline.await_any()

    emb_pipeline.await_all()
    for window_id in range(next_window_id_to_show, next_window_id):
        results = emb_pipeline.get_result(window_id)
        while results is None:
            results = emb_pipeline.get_result(window_id)
        embedding, meta = results
        meta['c_data'].emb = embedding
        contexts_all.append(meta['c_data'])
        next_window_id_to_show += 1

    total_latency += (perf_counter() - start_time) * 1e3
    context_embeddings_time = total_latency

    if args.questions:

        def questions():
            for question in args.questions:
                log.info("\n\tQuestion: {}".format(question))
                yield question
    else:

        def questions():
            while True:
                yield input('\n\tType a question (empty string to exit): ')

    for question in questions():
        if not question.strip():
            break

        start_time = perf_counter()
        log.info(
            "\t\tStage 2    (Calc question embedding and compare with {} context embeddings)"
            .format(len(contexts_all)))
        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)
        q_emb = calc_question_embedding(q_tokens_id)
        distances = [(np.linalg.norm(context.emb - q_emb, 2), context)
                     for context in contexts_all]
        distances.sort(key=lambda x: x[0])
        keep_num = min(args.best_n, len(distances))
        distances_filtered = distances[:keep_num]

        log.info(
            "The closest {} contexts to question filtered from {} context embeddings:"
            .format(keep_num, len(distances)))
        visualizer.show_closest_contexts(distances_filtered)

        if args.model_qa:
            answers = []
            next_context_id = 0
            next_context_id_to_show = 0

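            # stage 2 loop: run the QA model asynchronously over the best-matching contexts and collect their answers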
            while True:
                if qa_pipeline.callback_exceptions:
                    raise qa_pipeline.callback_exceptions[0]
                results = qa_pipeline.get_result(next_context_id_to_show)
                if results:
                    next_context_id_to_show += 1
                    output, meta = results
                    update_answers_list(answers, output, meta['c_data'])
                    continue

                if qa_pipeline.is_ready():
                    if next_context_id == len(distances_filtered):
                        break
                    _, c_data = distances_filtered[next_context_id]
                    qa_pipeline.submit_data((c_data, q_tokens_id),
                                            next_context_id,
                                            {'c_data': c_data})
                    next_context_id += 1
                else:
                    qa_pipeline.await_any()

            qa_pipeline.await_all()
            for context_id in range(next_context_id_to_show, next_context_id):
                results = qa_pipeline.get_result(context_id)
                while results is None:
                    results = qa_pipeline.get_result(context_id)
                output, meta = results
                update_answers_list(answers, output, meta['c_data'])

            log.info(
                "\t\tStage 3    (Show top 3 answers from {} closest contexts of Stage 1)"
                .format(len(answers)))
            answers = sorted(answers, key=lambda x: -x[0])[:3]
            visualizer.show_answers(answers)

        total_latency += (perf_counter() - start_time) * 1e3

    log.info("Metrics report:")
    log.info("\tContext embeddings latency (stage 1): {:.1f} ms".format(
        context_embeddings_time))
    log.info("\tLatency (all stages): {:.1f} ms".format(total_latency))
Example #8
if args.colors:
    COLOR_RED = "\033[91m"
    COLOR_RESET = "\033[0m"
else:
    COLOR_RED = ""
    COLOR_RESET = ""

# load vocabulary file for model
log.info("Loading vocab file:\t{}".format(args.vocab))

vocab = load_vocab_file(args.vocab)
log.info("{} tokens loaded".format(len(vocab)))

# get context as a string (as we might need its length for the sequence reshape)
p = url
paragraphs = get_paragraphs([p])
context = '\n'.join(paragraphs)
log.info("Size: {} chars".format(len(context)))
log.info("Context: " + COLOR_RED + context + COLOR_RESET)
# encode context into token ids list
c_tokens_id, c_tokens_se = text_to_tokens(context.lower(), vocab)

log.info("Initializing Inference Engine")
ie = IECore()
version = ie.get_versions(args.device)[args.device]
version_str = "{}.{}.{}".format(version.major, version.minor,
                                version.build_number)
log.info("Plugin version is {}".format(version_str))

# read IR
model_xml = args.model