def run_client():
    """
    Run SQuAD inference against TRTIS for the examples in FLAGS.predict_file
    and write the predictions to FLAGS.output_dir.
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    eval_examples = read_squad_examples(
        input_file=FLAGS.predict_file,
        is_training=False,
        version_2_with_negative=FLAGS.version_2_with_negative)

    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.trtis_server_url
    verbose = True
    model_name = FLAGS.trtis_model_name
    model_version = FLAGS.trtis_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    channel = grpc.insecure_channel(url)
    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)
    prof_request = grpc_service_pb2.server__status__pb2.model__config__pb2.ModelConfig()
    prof_response = stub.Profile(prof_request)

    status_ctx = ServerStatusContext(url, protocol, model_name=model_name,
                                     verbose=verbose)
    model_config_pb2.ModelConfig()
    status_result = status_ctx.get_server_status()

    outstanding = {}
    max_outstanding = 20

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait):
        if len(outstanding) == 0:
            return

        ready_id = ctx.get_ready_async_request(do_wait)
        if ready_id is None:
            return

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id, False)
        stop = time.time()

        if result is None:
            raise ValueError("Context returned null for async id marked as done")

        outResult = outstanding.pop(ready_id)
        time_list.append(stop - outResult.start_time)

        batch_count = len(outResult.inputs[label_id_key])
        for i in range(batch_count):
            unique_id = int(outResult.inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")
    all_results_start = time.time()

    for inputs_dict in batch(eval_features, batch_size):
        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start = time.time()
        async_id = ctx.async_run(inputs_dict, outputs_dict,
                                 batch_size=present_batch_size)

        outstanding[async_id] = PendingResult(async_id=async_id,
                                              start_time=start,
                                              inputs=inputs_dict)

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        process_outstanding(len(outstanding) >= max_outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            len(outstanding)))

    # Now process all outstanding requests
    while len(outstanding) > 0:
        process_outstanding(True)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Individual Time Runs - Ignoring first two iterations")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f sec for Sentences processed = %d" %
          (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    time_list.sort()
    avg = np.mean(time_list)
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])

    print("-----------------------------")
    print("Summary Statistics")
    print("Batch size =", FLAGS.predict_batch_size)
    print("Sequence Length =", FLAGS.max_seq_length)
    print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
    print("Latency Confidence Level 99 (ms) =", cf_99 * 1000)
    print("Latency Confidence Level 100 (ms) =", cf_100 * 1000)
    print("Latency Average (ms) =", avg * 1000)
    print("-----------------------------")

    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
    output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")

    write_predictions(eval_examples, eval_features, all_results,
                      FLAGS.n_best_size, FLAGS.max_answer_length,
                      FLAGS.do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file)
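
# --- Minimal sketch of helpers the TRTIS client above assumes but does not show. ---
# RawResult matches the namedtuple defined in BERT's run_squad.py; the other names
# (label_id_key, PendingResult, batch) and the input names/dtypes are assumptions
# about how the client stacks SQuAD features into TRTIS input dictionaries.
import collections
import numpy as np

label_id_key = "unique_ids"  # assumed name of the input carrying the feature ids
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])
PendingResult = collections.namedtuple("PendingResult",
                                       ["async_id", "start_time", "inputs"])

def batch(features, n=1):
    """Yield dicts mapping input name -> list of per-feature numpy arrays, n at a time."""
    for start in range(0, len(features), n):
        chunk = features[start:start + n]
        yield {
            label_id_key: [np.array([f.unique_id], dtype=np.int32) for f in chunk],
            "input_ids": [np.array(f.input_ids, dtype=np.int32) for f in chunk],
            "input_mask": [np.array(f.input_mask, dtype=np.int32) for f in chunk],
            "segment_ids": [np.array(f.segment_ids, dtype=np.int32) for f in chunk],
        }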
        image_idx = (image_idx + 1) % len(image_data)
        if image_idx == 0:
            last_request = True

        # Send request
        if not FLAGS.async_set:
            results.append(
                ctx.run({input_name: input_batch},
                        {output_name: (InferContext.ResultFormat.CLASS, FLAGS.classes)},
                        FLAGS.batch_size))
            result_filenames.append(input_filenames)
        else:
            ctx.async_run(
                partial(completion_callback, input_filenames, user_data),
                {input_name: input_batch},
                {output_name: (InferContext.ResultFormat.CLASS, FLAGS.classes)},
                FLAGS.batch_size)
            sent_count += 1

    # For async, retrieve results according to the send order
    if FLAGS.async_set:
        processed_count = 0
        while processed_count < sent_count:
            (request_id, input_filenames) = user_data.completed_requests.get()
            results.append(ctx.get_async_run_results(request_id))
            result_filenames.append(input_filenames)
            processed_count += 1

    for idx in range(len(results)):
        print("Request {}, batch size {}".format(idx, FLAGS.batch_size))
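
# --- Sketch of the async bookkeeping the image-client fragment above relies on. ---
# The fragment sits inside the client's request-sending loop (the loop header is not
# part of the excerpt). UserData and completion_callback are assumptions, shaped to
# match the (request_id, input_filenames) tuples the retrieval loop expects; the
# trailing (infer_ctx, request_id) arguments are assumed to be supplied by
# InferContext.async_run when a request completes.
import queue
from functools import partial

class UserData:
    def __init__(self):
        # Completed requests, drained by the retrieval loop above.
        self.completed_requests = queue.Queue()

def completion_callback(input_filenames, user_data, infer_ctx, request_id):
    # Bound via partial(completion_callback, input_filenames, user_data), so only
    # the context and request id need to be passed in by the library.
    user_data.completed_requests.put((request_id, input_filenames))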
def run_client():
    """
    Run SQuAD inference against TRTIS for a hard-coded context/question pair
    and report latency and throughput statistics.
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # -------------------------------------------------------------
    # Creation of examples here
    # -------------------------------------------------------------
    paragraph = """The koala (Phascolarctos cinereus, or, inaccurately, koala bear[a]) is an
    arboreal herbivorous marsupial native to Australia. It is the only extant representative of
    the family Phascolarctidae and its closest living relatives are the wombats, which comprise
    the family Vombatidae. The koala is found in coastal areas of the mainland's eastern and
    southern regions, inhabiting Queensland, New South Wales, Victoria, and South Australia.
    It is easily recognisable by its stout, tailless body and large head with round, fluffy ears
    and large, spoon-shaped nose. The koala has a body length of 60–85 cm (24–33 in) and weighs
    4–15 kg (9–33 lb). Fur colour ranges from silver grey to chocolate brown. Koalas from the
    northern populations are typically smaller and lighter in colour than their counterparts
    further south. These populations possibly are separate subspecies, but this is disputed.
    """

    question_text = "Who is Koala?"

    examples = []
    example = SquadExample(
        qas_id=1,
        question_text=question_text,
        doc_tokens=convert_doc_tokens(paragraph_text=paragraph))
    for _ in range(30):
        examples.append(example)

    # Switching from predict_file read to api-read
    # eval_examples = read_squad_examples(
    #     input_file=FLAGS.predict_file, is_training=False,
    #     version_2_with_negative=FLAGS.version_2_with_negative)
    eval_examples = examples

    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.trtis_server_url
    verbose = True
    model_name = FLAGS.trtis_model_name
    model_version = FLAGS.trtis_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    channel = grpc.insecure_channel(url)
    stub = grpc_service_pb2_grpc.GRPCServiceStub(channel)
    prof_request = grpc_service_pb2.server__status__pb2.model__config__pb2.ModelConfig()
    prof_response = stub.Profile(prof_request)

    status_ctx = ServerStatusContext(url, protocol, model_name=model_name,
                                     verbose=verbose)
    model_config_pb2.ModelConfig()
    status_result = status_ctx.get_server_status()

    outstanding = {}
    max_outstanding = 20

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait):
        if len(outstanding) == 0:
            return

        ready_id = ctx.get_ready_async_request(do_wait)
        if ready_id is None:
            return

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id, False)
        stop = time.time()

        if result is None:
            raise ValueError("Context returned null for async id marked as done")

        outResult = outstanding.pop(ready_id)
        time_list.append(stop - outResult.start_time)

        batch_count = len(outResult.inputs[label_id_key])
        for i in range(batch_count):
            unique_id = int(outResult.inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")
    all_results_start = time.time()

    for inputs_dict in batch(eval_features, batch_size):
        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start = time.time()
        async_id = ctx.async_run(inputs_dict, outputs_dict,
                                 batch_size=present_batch_size)

        outstanding[async_id] = PendingResult(async_id=async_id,
                                              start_time=start,
                                              inputs=inputs_dict)

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        process_outstanding(len(outstanding) >= max_outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            len(outstanding)))

    # Now process all outstanding requests
    while len(outstanding) > 0:
        process_outstanding(True)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Individual Time Runs - Ignoring first two iterations")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f sec for Sentences processed = %d" %
          (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    time_list.sort()
    avg = np.mean(time_list)
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])

    print("-----------------------------")
    print("Summary Statistics")
    print("Batch size =", FLAGS.predict_batch_size)
    print("Sequence Length =", FLAGS.max_seq_length)
    print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
    print("Latency Confidence Level 99 (ms) =", cf_99 * 1000)
    print("Latency Confidence Level 100 (ms) =", cf_100 * 1000)
    print("Latency Average (ms) =", avg * 1000)
    print("-----------------------------")
def main(_):
    """
    Run SQuAD inference against Triton, either over the dataset in
    FLAGS.predict_file or over a single FLAGS.context / FLAGS.question pair.
    """
    # Lazy compilation causes memory fragmentation for BERT, leading to OOM.
    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Get the Data
    if FLAGS.predict_file:
        eval_examples = read_squad_examples(
            input_file=FLAGS.predict_file,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative)
    elif FLAGS.question and FLAGS.context:
        input_data = [{
            "paragraphs": [{
                "context": FLAGS.context,
                "qas": [{
                    "id": 0,
                    "question": FLAGS.question
                }]
            }]
        }]
        eval_examples = read_squad_examples(
            input_file=None,
            is_training=False,
            version_2_with_negative=FLAGS.version_2_with_negative,
            input_data=input_data)
    else:
        raise ValueError(
            "Either predict_file or question and context must be defined")

    # Get Eval Features = Preprocessing
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)

    convert_examples_to_features(examples=eval_examples[0:],
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=False,
                                 output_fn=append_feature)

    protocol_str = 'grpc'  # http or grpc
    url = FLAGS.triton_server_url
    verbose = True
    model_name = FLAGS.triton_model_name
    model_version = FLAGS.triton_model_version
    batch_size = FLAGS.predict_batch_size

    protocol = ProtocolType.from_str(protocol_str)

    ctx = InferContext(url, protocol, model_name, model_version, verbose)

    status_ctx = ServerStatusContext(url, protocol, model_name=model_name,
                                     verbose=verbose)
    model_config_pb2.ModelConfig()
    status_result = status_ctx.get_server_status()

    user_data = UserData()

    max_outstanding = 20
    # Number of outstanding requests
    outstanding = 0

    sent_prog = tqdm.tqdm(desc="Send Requests", total=len(eval_features))
    recv_prog = tqdm.tqdm(desc="Recv Requests", total=len(eval_features))

    def process_outstanding(do_wait, outstanding):
        if outstanding == 0 or not do_wait:
            return outstanding

        # Wait for deferred items from callback functions
        (infer_ctx, ready_id, idx, start_time,
         inputs) = user_data._completed_requests.get()

        if ready_id is None:
            return outstanding

        # If we are here, we got an id
        result = ctx.get_async_run_results(ready_id)
        stop = time.time()

        if result is None:
            raise ValueError("Context returned null for async id marked as done")

        outstanding -= 1
        time_list.append(stop - start_time)

        batch_count = len(inputs[label_id_key])
        for i in range(batch_count):
            unique_id = int(inputs[label_id_key][i][0])
            start_logits = [float(x) for x in result["start_logits"][i].flat]
            end_logits = [float(x) for x in result["end_logits"][i].flat]
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

        recv_prog.update(n=batch_count)

        return outstanding

    all_results = []
    time_list = []

    print("Starting Sending Requests....\n")
    all_results_start = time.time()

    idx = 0
    for inputs_dict in batch(eval_features, batch_size):
        present_batch_size = len(inputs_dict[label_id_key])

        outputs_dict = {
            'start_logits': InferContext.ResultFormat.RAW,
            'end_logits': InferContext.ResultFormat.RAW
        }

        start_time = time.time()
        ctx.async_run(partial(completion_callback, user_data, idx, start_time,
                              inputs_dict),
                      inputs_dict, outputs_dict, batch_size=present_batch_size)
        outstanding += 1
        idx += 1

        sent_prog.update(n=present_batch_size)

        # Try to process at least one response per request
        outstanding = process_outstanding(outstanding >= max_outstanding,
                                          outstanding)

    tqdm.tqdm.write(
        "All Requests Sent! Waiting for responses. Outstanding: {}.\n".format(
            outstanding))

    # Now process all outstanding requests
    while outstanding > 0:
        outstanding = process_outstanding(True, outstanding)

    all_results_end = time.time()
    all_results_total = (all_results_end - all_results_start) * 1000.0

    print("-----------------------------")
    print("Total Time: {} ms".format(all_results_total))
    print("-----------------------------")

    print("-----------------------------")
    print("Total Inference Time = %0.2f sec for Sentences processed = %d" %
          (sum(time_list), len(eval_features)))
    print("Throughput Average (sentences/sec) = %0.2f" %
          (len(eval_features) / all_results_total * 1000.0))
    print("-----------------------------")

    if FLAGS.output_dir and FLAGS.predict_file:
        # When inferencing on a dataset, get inference statistics and write
        # results to a json file
        time_list.sort()

        avg = np.mean(time_list)
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])

        print("-----------------------------")
        print("Summary Statistics")
        print("Batch size =", FLAGS.predict_batch_size)
        print("Sequence Length =", FLAGS.max_seq_length)
        print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
        print("Latency Confidence Level 99 (ms) =", cf_99 * 1000)
        print("Latency Confidence Level 100 (ms) =", cf_100 * 1000)
        print("Latency Average (ms) =", avg * 1000)
        print("-----------------------------")

        output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
        output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")

        write_predictions(eval_examples, eval_features, all_results,
                          FLAGS.n_best_size, FLAGS.max_answer_length,
                          FLAGS.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          FLAGS.version_2_with_negative, FLAGS.verbose_logging)
    else:
        # When inferencing on a single example, write the best answer to stdout
        all_predictions, all_nbest_json, scores_diff_json = get_predictions(
            eval_examples, eval_features, all_results, FLAGS.n_best_size,
            FLAGS.max_answer_length, FLAGS.do_lower_case,
            FLAGS.version_2_with_negative, FLAGS.verbose_logging)
        print("Context is: %s \n\nQuestion is: %s \n\nPredicted Answer is: %s" %
              (FLAGS.context, FLAGS.question, all_predictions[0]))
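
# --- Sketch of the UserData / completion_callback pair main() assumes. ---
# The callback is registered via partial(completion_callback, user_data, idx,
# start_time, inputs_dict); the trailing (infer_ctx, request_id) arguments are
# assumed to be supplied by InferContext.async_run when the request completes.
# The tuple layout matches what process_outstanding() unpacks above.
import queue

class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()

def completion_callback(user_data, idx, start_time, inputs, infer_ctx, request_id):
    # Defer the completed request to the main thread; process_outstanding() pops
    # these tuples as (infer_ctx, ready_id, idx, start_time, inputs).
    user_data._completed_requests.put((infer_ctx, request_id, idx, start_time, inputs))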