def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    # load vocabulary file for model
    log.info("Loading vocab file:\t{}".format(args.vocab))
    vocab = load_vocab_file(args.vocab)
    log.info("{} tokens loaded".format(len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    paragraphs = get_paragraphs(args.input)
    context = '\n'.join(paragraphs)
    log.info("Size: {} chars".format(len(context)))
    sentences = re.split(sentence_splitter, context)
    preprocessed_sentences = [text_to_tokens(sentence, vocab) for sentence in sentences]
    max_sent_length = max([len(tokens) + 2 for tokens, _ in preprocessed_sentences])

    log.info("Initializing Inference Engine")
    ie = IECore()
    version = ie.get_versions(args.device)[args.device]
    version_str = "{}.{}.{}".format(version.major, version.minor, version.build_number)
    log.info("Plugin version is {}".format(version_str))

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
    ie_encoder = ie.read_network(model=model_xml, weights=model_bin)

    # check input and output names
    input_names = [i.strip() for i in args.input_names.split(',')]
    if ie_encoder.input_info.keys() != set(input_names):
        log.error("Input names do not match")
        log.error("    The demo expects input names: {}. "
                  "Please use the --input_names to specify the right names "
                  "(see actual values below)".format(input_names))
        log.error("    Actual network input names: {}".format(list(ie_encoder.input_info.keys())))
        raise Exception("Unexpected network input names")
    if len(ie_encoder.outputs) != 1:
        log.error('Demo expects model with single output, while provided {}'.format(len(ie_encoder.outputs)))
        raise Exception('Unexpected number of outputs')
    output_names = list(ie_encoder.outputs)

    # maximum number of tokens that can be processed by the network at once
    max_length = ie_encoder.input_info[input_names[0]].input_data.shape[1]
    if max_sent_length > max_length:
        input_shapes = {
            input_names[0]: [1, max_sent_length],
            input_names[1]: [1, max_sent_length],
            input_names[2]: [1, max_sent_length]
        }
        ie_encoder.reshape(input_shapes)
        max_length = max_sent_length

    # load model to the device
    log.info("Loading model to the {}".format(args.device))
    ie_encoder_exec = ie.load_network(network=ie_encoder, device_name=args.device)

    t0 = time.perf_counter()
    t_count = 0

    def get_score(name):
        out = np.exp(res[name][0])
        return out / out.sum(axis=-1, keepdims=True)

    for sentence, (c_tokens_id, c_token_s_e) in zip(sentences, preprocessed_sentences):
        # form the request
        tok_cls = vocab['[CLS]']
        tok_sep = vocab['[SEP]']
        input_ids = [tok_cls] + c_tokens_id + [tok_sep]
        token_type_ids = [0] * len(input_ids)
        attention_mask = [1] * len(input_ids)

        # pad the rest of the request
        pad_len = max_length - len(input_ids)
        input_ids += [0] * pad_len
        token_type_ids += [0] * pad_len
        attention_mask += [0] * pad_len

        # create numpy inputs for IE
        inputs = {
            input_names[0]: np.array([input_ids], dtype=np.int32),
            input_names[1]: np.array([attention_mask], dtype=np.int32),
            input_names[2]: np.array([token_type_ids], dtype=np.int32),
        }
        if len(input_names) > 3:
            inputs[input_names[3]] = np.arange(len(input_ids), dtype=np.int32)[None, :]

        t_start = time.perf_counter()
        # infer by IE
        res = ie_encoder_exec.infer(inputs=inputs)
        t_end = time.perf_counter()
        t_count += 1
        log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
            max_length, 1 / (t_end - t_start), t_end - t_start
        ))

        score = get_score(output_names[0])
        labels_idx = score.argmax(-1)
        filtered_labels_idx = [
            (idx, label_idx) for idx, label_idx in enumerate(labels_idx)
            if label_idx != 0 and 0 < idx < max_length - pad_len
        ]

        if not filtered_labels_idx:
            continue

        log.info('Sentence: \n\t{}'.format(sentence))
        visualized = set()
        for idx, label_idx in filtered_labels_idx:
            word_s, word_e = c_token_s_e[idx - 1]
            if (word_s, word_e) in visualized:
                continue
            visualized.add((word_s, word_e))
            word = sentence[word_s:word_e]
            log.info('\n\tWord: {}\n\tConfidence: {}\n\tTag: {}'.format(word, score[idx][label_idx], label_to_tag[label_idx]))

    t1 = time.perf_counter()
    log.info("The performance below is reported only for reference purposes, "
             "please use the benchmark_app tool (part of the OpenVINO samples) for any actual measurements.")
    log.info("{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)".format(
        t_count, max_length, t1 - t0, (t1 - t0) / t_count
    ))
def encode_txt(txt):
    if do_lower_case:
        txt = txt.lower()
    return tokens_bert.text_to_tokens(txt, vocab)
def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    if args.colors:
        COLOR_RED = "\033[91m"
        COLOR_RESET = "\033[0m"
    else:
        COLOR_RED = ""
        COLOR_RESET = ""

    # load vocabulary file for model
    log.info("Loading vocab file:\t{}".format(args.vocab))
    vocab = load_vocab_file(args.vocab)
    log.info("{} tokens loaded".format(len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    paragraphs = get_paragraphs(args.input)
    context = '\n'.join(paragraphs)
    log.info("Size: {} chars".format(len(context)))
    log.info("Context: " + COLOR_RED + context + COLOR_RESET)
    # encode context into token ids list
    c_tokens_id, c_tokens_se = text_to_tokens(context.lower(), vocab)

    log.info("Initializing Inference Engine")
    ie = IECore()
    version = ie.get_versions(args.device)[args.device]
    version_str = "{}.{}.{}".format(version.major, version.minor, version.build_number)
    log.info("Plugin version is {}".format(version_str))

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
    ie_encoder = ie.read_network(model=model_xml, weights=model_bin)

    if args.reshape:
        # reshape the sequence length to the context + maximum question length (in tokens)
        first_input_layer = next(iter(ie_encoder.input_info))
        c = ie_encoder.input_info[first_input_layer].input_data.shape[1]
        # find the closest multiple of 64; if it is smaller than the current network's sequence length, use that
        seq = min(c, int(np.ceil((len(c_tokens_id) + args.max_question_token_num) / 64) * 64))
        if seq < c:
            new_shapes = {}
            for input_name, input_info in ie_encoder.input_info.items():
                n, c = input_info.input_data.shape
                new_shapes[input_name] = [n, seq]
                log.info("Reshaped input {} from {} to the {}".format(
                    input_name, input_info.input_data.shape, new_shapes[input_name]))
            log.info("Attempting to reshape the network to the modified inputs...")
            try:
                ie_encoder.reshape(new_shapes)
                log.info("Successful!")
            except RuntimeError:
                log.error("Failed to reshape the network, please retry the demo without '-r' option")
                sys.exit(-1)
        else:
            log.info("Skipping network reshaping,"
                     " as (context length + max question length) exceeds the current (input) network sequence length")

    # check input and output names
    input_names = [i.strip() for i in args.input_names.split(',')]
    output_names = [o.strip() for o in args.output_names.split(',')]
    if ie_encoder.input_info.keys() != set(input_names) or ie_encoder.outputs.keys() != set(output_names):
        log.error("Input or Output names do not match")
        log.error("    The demo expects input->output names: {}->{}. "
                  "Please use the --input_names and --output_names to specify the right names "
                  "(see actual values below)".format(input_names, output_names))
        log.error("    Actual network input->output names: {}->{}".format(list(ie_encoder.input_info.keys()),
                                                                           list(ie_encoder.outputs.keys())))
        raise Exception("Unexpected network input or output names")

    # load model to the device
    log.info("Loading model to the {}".format(args.device))
    ie_encoder_exec = ie.load_network(network=ie_encoder, device_name=args.device)

    if args.questions:
        def questions():
            for question in args.questions:
                log.info("Question: {}".format(question))
                yield question
    else:
        def questions():
            while True:
                yield input('Type question (empty string to exit):')

    # loop on user's or prepared questions
    for question in questions():
        if not question.strip():
            break

        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_encoder.input_info[input_names[0]].input_data.shape[1]

        # calculate number of tokens for context in each inference request.
        # reserve 3 positions for special tokens
        # [CLS] q_tokens [SEP] c_tokens [SEP]
        c_wnd_len = max_length - (len(q_tokens_id) + 3)

        # token num between two neighbour context windows
        # 1/2 means that context windows are overlapped by half
        c_stride = c_wnd_len // 2

        t0 = time.perf_counter()
        t_count = 0

        # array of answers from each window
        answers = []

        # init a window to iterate over context
        c_s, c_e = 0, min(c_wnd_len, len(c_tokens_id))

        # iterate while context window is not empty
        while c_e > c_s:
            # form the request
            tok_cls = vocab['[CLS]']
            tok_sep = vocab['[SEP]']
            input_ids = [tok_cls] + q_tokens_id + [tok_sep] + c_tokens_id[c_s:c_e] + [tok_sep]
            token_type_ids = [0] + [0] * len(q_tokens_id) + [0] + [1] * (c_e - c_s) + [0]
            attention_mask = [1] * len(input_ids)

            # pad the rest of the request
            pad_len = max_length - len(input_ids)
            input_ids += [0] * pad_len
            token_type_ids += [0] * pad_len
            attention_mask += [0] * pad_len

            # create numpy inputs for IE
            inputs = {
                input_names[0]: np.array([input_ids], dtype=np.int32),
                input_names[1]: np.array([attention_mask], dtype=np.int32),
                input_names[2]: np.array([token_type_ids], dtype=np.int32),
            }
            if len(input_names) > 3:
                inputs[input_names[3]] = np.arange(len(input_ids), dtype=np.int32)[None, :]

            t_start = time.perf_counter()
            # infer by IE
            res = ie_encoder_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                max_length, 1 / (t_end - t_start), t_end - t_start
            ))

            # get start-end scores for context
            def get_score(name):
                out = np.exp(res[name].reshape((max_length,)))
                return out / out.sum(axis=-1)

            score_s = get_score(output_names[0])
            score_e = get_score(output_names[1])

            # get 'no-answer' score (not valid if model has been fine-tuned on squad1.x)
            if args.model_squad_ver.split('.')[0] == '1':
                score_na = 0
            else:
                score_na = score_s[0] * score_e[0]

            # find product of all start-end combinations to find the best one
            c_s_idx = len(q_tokens_id) + 2  # index of first context token in tensor
            c_e_idx = max_length - (1 + pad_len)  # index of last+1 context token in tensor
            score_mat = np.matmul(
                score_s[c_s_idx:c_e_idx].reshape((c_e - c_s, 1)),
                score_e[c_s_idx:c_e_idx].reshape((1, c_e - c_s))
            )
            # reset candidates with end before start
            score_mat = np.triu(score_mat)
            # reset long candidates (>max_answer_token_num)
            score_mat = np.tril(score_mat, args.max_answer_token_num - 1)
            # find the best start-end pair
            max_s, max_e = divmod(score_mat.flatten().argmax(), score_mat.shape[1])
            max_score = score_mat[max_s, max_e] * (1 - score_na)

            # convert to context text start-end index
            max_s = c_tokens_se[c_s + max_s][0]
            max_e = c_tokens_se[c_s + max_e][1]

            # check that answers list does not have duplicates (because of context windows overlapping)
            same = [i for i, a in enumerate(answers) if a[1] == max_s and a[2] == max_e]
            if same:
                assert len(same) == 1
                # update existing answer record
                a = answers[same[0]]
                answers[same[0]] = (max(max_score, a[0]), max_s, max_e)
            else:
                # add new record
                answers.append((max_score, max_s, max_e))

            # check that context window reached the end
            if c_e == len(c_tokens_id):
                break

            # move to next window position
            c_s = min(c_s + c_stride, len(c_tokens_id))
            c_e = min(c_s + c_wnd_len, len(c_tokens_id))

        t1 = time.perf_counter()
        log.info("The performance below is reported only for reference purposes, "
                 "please use the benchmark_app tool (part of the OpenVINO samples) for any actual measurements.")
        log.info("{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)".format(
            t_count, max_length, t1 - t0, (t1 - t0) / t_count
        ))

        # print top 3 results
        answers = sorted(answers, key=lambda x: -x[0])
        for score, s, e in answers[:3]:
            log.info("---answer: {:0.2f} {}".format(score, context[s:e]))
            c_s, c_e = find_sentence_range(context, s, e)
            log.info("   " + context[c_s:s] + COLOR_RED + context[s:e] + COLOR_RESET + context[e:c_e])
def main():
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    log.info("Creating Inference Engine")
    ie = IECore()

    # read model to calculate embedding
    model_xml_emb = args.model_emb
    model_bin_emb = model_xml_emb.with_suffix(".bin")

    log.info("Loading embedding network files:\n\t{}\n\t{}".format(model_xml_emb, model_bin_emb))
    ie_encoder_emb = ie.read_network(model=model_xml_emb, weights=model_bin_emb)
    input_names_model_emb = list(ie_encoder_emb.input_info.keys())
    input_names_emb = args.input_names_emb.split(',')
    log.info("Expected embedding input names: {}".format(input_names_emb))
    log.info("Network embedding input names: {}".format(input_names_model_emb))
    # check input names
    if set(input_names_model_emb) != set(input_names_emb):
        log.error("Unexpected embedding network input names")
        raise Exception("Unexpected embedding network input names")

    # check outputs
    output_names_model_emb = list(ie_encoder_emb.outputs.keys())
    if len(output_names_model_emb) > 1:
        log.error("Expected only a single output in the embedding network, but {} outputs detected".format(
            len(output_names_model_emb)))
        raise Exception("Unexpected number of embedding network outputs")

    # reshape embedding model to infer short questions and long contexts
    ie_encoder_exec_emb_dict = {}
    max_length_c = 384
    max_length_q = 32

    for length in [max_length_q, max_length_c]:
        new_shapes = {}
        for i, input_info in ie_encoder_emb.input_info.items():
            new_shapes[i] = [1, length]
            log.info("Reshaped input {} from {} to the {}".format(
                i, input_info.input_data.shape, new_shapes[i]))
        log.info("Attempting to reshape the context embedding network to the modified inputs...")
        try:
            ie_encoder_emb.reshape(new_shapes)
            log.info("Successful!")
        except RuntimeError:
            log.error("Failed to reshape the embedding network")
            raise

        # Loading model to the plugin
        log.info("Loading model to the plugin")
        ie_encoder_exec_emb_dict[length] = ie.load_network(network=ie_encoder_emb, device_name=args.device)

    # Read model for final exact qa
    if args.model_qa:
        model_xml = args.model_qa
        model_bin = model_xml.with_suffix(".bin")
        log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
        ie_encoder_qa = ie.read_network(model=model_xml, weights=model_bin)
        ie_encoder_qa.batch_size = 1

        input_names_qa = args.input_names_qa.split(',')
        output_names_qa = args.output_names_qa.split(',')
        log.info("Expected input->output names: {}->{}".format(input_names_qa, output_names_qa))

        # check input and output names
        input_names_model_qa = list(ie_encoder_qa.input_info.keys())
        output_names_model_qa = list(ie_encoder_qa.outputs.keys())
        log.info("Network input->output names: {}->{}".format(input_names_model_qa, output_names_model_qa))
        if set(input_names_model_qa) != set(input_names_qa) or set(output_names_model_qa) != set(output_names_qa):
            log.error("Unexpected network input or output names")
            raise Exception("Unexpected network input or output names")

        # Loading model to the plugin
        log.info("Loading model to the plugin")
        ie_encoder_qa_exec = ie.load_network(network=ie_encoder_qa, device_name=args.device)
        max_length_qc = ie_encoder_qa.input_info[input_names_qa[0]].input_data.shape[1]

    # load vocabulary file for all models
    log.info("Loading vocab file:\t{}".format(args.vocab))
    vocab = load_vocab_file(args.vocab)
    log.info("{} tokens loaded".format(len(vocab)))

    # define function to infer embedding
    def calc_emb(tokens_id, max_length):
        num = min(max_length - 2, len(tokens_id))

        # form the request
        pad_len = max_length - num - 2
        tok_cls = [vocab['[CLS]']]
        tok_sep = [vocab['[SEP]']]
        tok_pad = [vocab['[PAD]']]

        dtype = np.int32
        inputs = {
            input_names_emb[0]: np.array([tok_cls + tokens_id[:num] + tok_sep + tok_pad * pad_len], dtype=dtype),
            input_names_emb[1]: np.array([[1] + [1] * num + [1] + [0] * pad_len], dtype=dtype),
            input_names_emb[2]: np.array([[0] + [0] * num + [0] + [0] * pad_len], dtype=dtype),
            input_names_emb[3]: np.arange(max_length, dtype=dtype)[None, :]
        }

        # calc embedding
        ie_encoder_exec_emb = ie_encoder_exec_emb_dict[max_length]

        t_start = time.perf_counter()
        res = ie_encoder_exec_emb.infer(inputs=inputs)
        t_end = time.perf_counter()
        log.info("embedding calculated for sequence of length {} with {:0.2f} requests/sec ({:0.2} sec per request)".format(
            max_length, 1 / (t_end - t_start), t_end - t_start
        ))

        res = res[output_names_model_emb[0]]
        return res.squeeze(0)

    # small class to store context as text and tokens and its embedding vector
    class ContextData:
        def __init__(self, context, c_tokens_id, c_tokens_se):
            self.context = context
            self.c_tokens_id = c_tokens_id
            self.c_tokens_se = c_tokens_se
            self.c_emb = calc_emb(self.c_tokens_id, max_length_c)

    paragraphs = get_paragraphs(args.input)
    contexts_all = []

    log.info("Indexing {} paragraphs...".format(len(paragraphs)))
    for par in paragraphs:
        # get context as string and then encode it into token id list
        c_tokens_id, c_tokens_se = text_to_tokens(par.lower(), vocab)
        if not c_tokens_id:
            continue

        # calculate number of tokens for context in each request.
        # reserve 3 positions for special tokens
        # [CLS] q_tokens [SEP] c_tokens [SEP]
        if args.model_qa:
            # so that the context fits into model_qa together with the question
            c_wnd_len = max_length_qc - (max_length_q + 3)
        else:
            # so that the context fits into model_emb without a question
            c_wnd_len = max_length_c - 2

        # token num between 2 neighbour context windows
        # 1/2 means that context windows are overlapped by half
        c_stride = c_wnd_len // 2

        # init scan window
        c_s, c_e = 0, min(c_wnd_len, len(c_tokens_id))

        # iterate while context window is not empty
        while c_e > c_s:
            contexts_all.append(ContextData(par, c_tokens_id[c_s:c_e], c_tokens_se[c_s:c_e]))

            # check that context window reached the end
            if c_e == len(c_tokens_id):
                break

            # move to next window position
            c_s, c_e = c_s + c_stride, c_e + c_stride

            shift_left = max(0, c_e - len(c_tokens_id))
            c_s, c_e = c_s - shift_left, c_e - shift_left
            assert c_s >= 0, "start can go negative only when the window is larger than the token list, which cannot happen here"

    if args.questions:
        def questions():
            for question in args.questions:
                log.info("Question: {}".format(question))
                yield question
    else:
        def questions():
            while True:
                yield input('Type question (empty string to exit):')

    # loop on user's or prepared questions
    for question in questions():
        if not question.strip():
            break

        log.info("---Stage 1---Calc question embedding and compare with {} context embeddings".format(len(contexts_all)))
        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)
        q_emb = calc_emb(q_tokens_id, max_length_q)
        distances = [(np.linalg.norm(c.c_emb - q_emb, 2), c) for c in contexts_all]
        distances.sort(key=lambda x: x[0])
        keep_num = min(args.best_n, len(distances))
        distances_filtered = distances[:keep_num]

        # print short list
        print("The closest contexts to question:")
        for i, (dist, c_data) in enumerate(distances_filtered):
            print("#{}: embedding distance {} for context '{}'".format(i + 1, dist, c_data.context))

        # run model_qa if available to find the exact answer to the question in the filtered-in contexts
        if args.model_qa:
            log.info("---Stage 2---Looking for exact answers in {} contexts filtered in from {}".format(keep_num, len(distances)))
            # array of answers from each context_data
            answers = []

            for dist, c_data in distances_filtered:
                # form the request
                tok_cls = [vocab['[CLS]']]
                tok_sep = [vocab['[SEP]']]
                tok_pad = [vocab['[PAD]']]

                req_len = len(q_tokens_id) + len(c_data.c_tokens_id) + 3
                pad_len = max_length_qc - req_len
                assert pad_len >= 0

                input_ids = tok_cls + q_tokens_id + tok_sep + c_data.c_tokens_id + tok_sep + tok_pad * pad_len
                token_type_ids = [0] * (len(q_tokens_id) + 2) + [1] * (len(c_data.c_tokens_id) + 1) + [0] * pad_len
                attention_mask = [1] * req_len + [0] * pad_len

                # create numpy inputs for IE
                inputs = {
                    input_names_qa[0]: np.array([input_ids], dtype=np.int32),
                    input_names_qa[1]: np.array([attention_mask], dtype=np.int32),
                    input_names_qa[2]: np.array([token_type_ids], dtype=np.int32),
                }
                if len(input_names_qa) > 3:
                    inputs[input_names_qa[3]] = np.arange(max_length_qc, dtype=np.int32)[None, :]

                # infer by IE
                t_start = time.perf_counter()
                res = ie_encoder_qa_exec.infer(inputs=inputs)
                t_end = time.perf_counter()
                log.info(
                    "Exact answer calculated for sequence of length {} with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                        max_length_qc, 1 / (t_end - t_start), t_end - t_start
                    ))

                # get start-end scores for context
                def get_score(name):
                    out = np.exp(res[name].reshape((max_length_qc, )))
                    return out / out.sum(axis=-1)

                score_s = get_score(output_names_qa[0])
                score_e = get_score(output_names_qa[1])

                # find product of all start-end combinations to find the best one
                c_s_idx = len(q_tokens_id) + 2  # index of first context token in tensor
                c_e_idx = max_length_qc - (1 + pad_len)  # index of last+1 context token in tensor
                score_mat = np.matmul(
                    score_s[c_s_idx:c_e_idx].reshape((len(c_data.c_tokens_id), 1)),
                    score_e[c_s_idx:c_e_idx].reshape((1, len(c_data.c_tokens_id)))
                )
                # reset candidates with end before start
                score_mat = np.triu(score_mat)
                # reset long candidates (>max_answer_token_num)
                score_mat = np.tril(score_mat, args.max_answer_token_num - 1)
                # find the best start-end pair
                max_s, max_e = divmod(score_mat.flatten().argmax(), score_mat.shape[1])
                max_score = score_mat[max_s, max_e]

                # convert to context text start-end index
                max_s = c_data.c_tokens_se[max_s][0]
                max_e = c_data.c_tokens_se[max_e][1]

                # check that the answers list does not already contain this answer
                # (it could, because of context windows overlapping)
                same = [i for i, a in enumerate(answers)
                        if a[1] == max_s and a[2] == max_e and a[3] is c_data.context]
                if same:
                    assert len(same) == 1
                    # update existing answer record
                    a = answers[same[0]]
                    answers[same[0]] = (max(max_score, a[0]), max_s, max_e, c_data.context)
                else:
                    # add new record
                    answers.append((max_score, max_s, max_e, c_data.context))

            def mark(txt):
                return "\033[91m" + txt + "\033[0m" if args.colors else "*" + txt + "*"

            # print top 3 results
            answers.sort(key=lambda x: -x[0])
            log.info("---Stage 3---Find best 3 answers from {} results of Stage 1".format(len(answers)))
            for score, s, e, context in answers[:3]:
                print("Answer (score: {:0.2f}): {}".format(score, mark(context[s:e])))
                print(context[:s] + mark(context[s:e]) + context[e:])
def setup(url):
    global vocab
    global ie_encoder
    global input_names
    global output_names
    global model
    global c_tokens_id
    global ie_encoder_exec
    global args
    global c_tokens_se
    global context
    global COLOR_RED
    global COLOR_RESET

    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    args = build_argparser().parse_args()

    if args.colors:
        COLOR_RED = "\033[91m"
        COLOR_RESET = "\033[0m"
    else:
        COLOR_RED = ""
        COLOR_RESET = ""

    # load vocabulary file for model
    log.info("Loading vocab file:\t{}".format(args.vocab))
    vocab = load_vocab_file(args.vocab)
    log.info("{} tokens loaded".format(len(vocab)))

    # get context as a string (as we might need its length for the sequence reshape)
    p = url
    paragraphs = get_paragraphs([p])
    context = '\n'.join(paragraphs)
    log.info("Size: {} chars".format(len(context)))
    log.info("Context: " + COLOR_RED + context + COLOR_RESET)
    # encode context into token ids list
    c_tokens_id, c_tokens_se = text_to_tokens(context.lower(), vocab)

    log.info("Initializing Inference Engine")
    ie = IECore()
    version = ie.get_versions(args.device)[args.device]
    version_str = "{}.{}.{}".format(version.major, version.minor, version.build_number)
    log.info("Plugin version is {}".format(version_str))

    # read IR
    model_xml = args.model
    model_bin = model_xml.with_suffix(".bin")
    log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
    ie_encoder = ie.read_network(model=model_xml, weights=model_bin)

    if args.reshape:
        # reshape the sequence length to the context + maximum question length (in tokens)
        first_input_layer = next(iter(ie_encoder.inputs))
        c = ie_encoder.inputs[first_input_layer].shape[1]
        # find the closest multiple of 64; if it is smaller than the current network's sequence length, use that
        seq = min(c, int(np.ceil((len(c_tokens_id) + args.max_question_token_num) / 64) * 64))
        if seq < c:
            input_info = list(ie_encoder.inputs)
            new_shapes = {}
            for i in input_info:
                n, c = ie_encoder.inputs[i].shape
                new_shapes[i] = [n, seq]
                log.info("Reshaped input {} from {} to the {}".format(i, ie_encoder.inputs[i].shape, new_shapes[i]))
            log.info("Attempting to reshape the network to the modified inputs...")
            try:
                ie_encoder.reshape(new_shapes)
                log.info("Successful!")
            except RuntimeError:
                log.error("Failed to reshape the network, please retry the demo without '-r' option")
                sys.exit(-1)
        else:
            log.info("Skipping network reshaping,"
                     " as (context length + max question length) exceeds the current (input) network sequence length")

    # check input and output names
    input_names = [i.strip() for i in args.input_names.split(',')]
    output_names = [o.strip() for o in args.output_names.split(',')]
    if ie_encoder.inputs.keys() != set(input_names) or ie_encoder.outputs.keys() != set(output_names):
        log.error("Input or Output names do not match")
        log.error("    The demo expects input->output names: {}->{}. "
                  "Please use the --input_names and --output_names to specify the right names "
                  "(see actual values below)".format(input_names, output_names))
        log.error("    Actual network input->output names: {}->{}".format(list(ie_encoder.inputs.keys()),
                                                                           list(ie_encoder.outputs.keys())))
        log.error("    Actual network input->output values: {}->{}".format(list(ie_encoder.inputs.values()),
                                                                            list(ie_encoder.outputs.values())))
        raise Exception("Unexpected network input or output names")

    # load model to the device
    log.info("Loading model to the {}".format(args.device))
    ie_encoder_exec = ie.load_network(network=ie_encoder, device_name=args.device)
def update_output_div(n_clicks, input_value):
    # loop on user's or prepared questions
    for question in [input_value]:
        if not question.strip():
            break

        q_tokens_id, _ = text_to_tokens(question.lower(), vocab)

        # maximum number of tokens that can be processed by network at once
        max_length = ie_encoder.inputs[input_names[0]].shape[1]

        # calculate number of tokens for context in each inference request.
        # reserve 3 positions for special tokens
        # [CLS] q_tokens [SEP] c_tokens [SEP]
        c_wnd_len = max_length - (len(q_tokens_id) + 3)

        # token num between two neighbour context windows
        # 1/2 means that context windows are overlapped by half
        c_stride = c_wnd_len // 2

        t0 = time.perf_counter()
        t_count = 0

        # array of answers from each window
        answers = []

        # init a window to iterate over context
        c_s, c_e = 0, min(c_wnd_len, len(c_tokens_id))

        # iterate while context window is not empty
        while c_e > c_s:
            # form the request
            tok_cls = vocab['[CLS]']
            tok_sep = vocab['[SEP]']
            input_ids = [tok_cls] + q_tokens_id + [tok_sep] + c_tokens_id[c_s:c_e] + [tok_sep]
            token_type_ids = [0] + [0] * len(q_tokens_id) + [0] + [1] * (c_e - c_s) + [0]
            attention_mask = [1] * len(input_ids)

            # pad the rest of the request
            pad_len = max_length - len(input_ids)
            input_ids += [0] * pad_len
            token_type_ids += [0] * pad_len
            attention_mask += [0] * pad_len

            # create numpy inputs for IE
            inputs = {
                input_names[0]: np.array([input_ids], dtype=np.int32),
                input_names[1]: np.array([attention_mask], dtype=np.int32),
                input_names[2]: np.array([token_type_ids], dtype=np.int32),
            }
            if len(input_names) > 3:
                inputs[input_names[3]] = np.arange(len(input_ids), dtype=np.int32)[None, :]

            t_start = time.perf_counter()
            # infer by IE
            res = ie_encoder_exec.infer(inputs=inputs)
            t_end = time.perf_counter()
            t_count += 1
            log.info("Sequence of length {} is processed with {:0.2f} requests/sec ({:0.2} sec per request)".format(
                max_length, 1 / (t_end - t_start), t_end - t_start))

            # get start-end scores for context
            def get_score(name):
                out = np.exp(res[name].reshape((max_length, )))
                return out / out.sum(axis=-1)

            score_s = get_score(output_names[0])
            score_e = get_score(output_names[1])

            # get 'no-answer' score (not valid if model has been fine-tuned on squad1.x)
            if args.model_squad_ver.split('.')[0] == '1':
                score_na = 0
            else:
                score_na = score_s[0] * score_e[0]

            # find product of all start-end combinations to find the best one
            c_s_idx = len(q_tokens_id) + 2  # index of first context token in tensor
            c_e_idx = max_length - (1 + pad_len)  # index of last+1 context token in tensor
            score_mat = np.matmul(
                score_s[c_s_idx:c_e_idx].reshape((c_e - c_s, 1)),
                score_e[c_s_idx:c_e_idx].reshape((1, c_e - c_s)))
            # reset candidates with end before start
            score_mat = np.triu(score_mat)
            # reset long candidates (>max_answer_token_num)
            score_mat = np.tril(score_mat, args.max_answer_token_num - 1)
            # find the best start-end pair
            max_s, max_e = divmod(score_mat.flatten().argmax(), score_mat.shape[1])
            max_score = score_mat[max_s, max_e] * (1 - score_na)

            # convert to context text start-end index
            max_s = c_tokens_se[c_s + max_s][0]
            max_e = c_tokens_se[c_s + max_e][1]

            # check that answers list does not have duplicates (because of context windows overlapping)
            same = [i for i, a in enumerate(answers) if a[1] == max_s and a[2] == max_e]
            if same:
                assert len(same) == 1
                # update existing answer record
                a = answers[same[0]]
                answers[same[0]] = (max(max_score, a[0]), max_s, max_e)
            else:
                # add new record
                answers.append((max_score, max_s, max_e))

            # check that context window reached the end
            if c_e == len(c_tokens_id):
                break

            # move to next window position
            c_s = min(c_s + c_stride, len(c_tokens_id))
            c_e = min(c_s + c_wnd_len, len(c_tokens_id))

        t1 = time.perf_counter()
        log.info("The performance below is reported only for reference purposes, "
                 "please use the benchmark_app tool (part of the OpenVINO samples) for any actual measurements.")
        log.info("{} requests of {} length were processed in {:0.2f}sec ({:0.2}sec per request)".format(
            t_count, max_length, t1 - t0, (t1 - t0) / t_count))

        # print top 3 results
        answers = sorted(answers, key=lambda x: -x[0])
        for score, s, e in answers[:3]:
            resp = "Answer: {}".format(context[s:e]) + "\n"
            # cont = context[c_s:s] + COLOR_RED + context[s:e] + COLOR_RESET + context[e:c_e]
            cont = "Can I answer any other Questions? "
            log.info("---answer: {:0.2f} {}".format(score, context[s:e]))
            c_s, c_e = find_sentence_range(context, s, e)
            log.info("   " + context[c_s:s] + COLOR_RED + context[s:e] + COLOR_RESET + context[e:c_e])
            return resp
COLOR_RESET = "" # load vocabulary file for model log.info("Loading vocab file:\t{}".format(args.vocab)) vocab = load_vocab_file(args.vocab) log.info("{} tokens loaded".format(len(vocab))) # get context as a string (as we might need it's length for the sequence reshape) p = url paragraphs = get_paragraphs([p]) context = '\n'.join(paragraphs) log.info("Size: {} chars".format(len(context))) log.info("Context: " + COLOR_RED + context + COLOR_RESET) # encode context into token ids list c_tokens_id, c_tokens_se = text_to_tokens(context.lower(), vocab) log.info("Initializing Inference Engine") ie = IECore() version = ie.get_versions(args.device)[args.device] version_str = "{}.{}.{}".format(version.major, version.minor, version.build_number) log.info("Plugin version is {}".format(version_str)) # read IR model_xml = args.model model_bin = model_xml.with_suffix(".bin") log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin)) ie_encoder = ie.read_network(model=model_xml, weights=model_bin)