def inference(features):
    global h_output

    print("\nRunning Inference...")
    eval_start_time = time.time()

    # Copy inputs to the device
    cuda.memcpy_htod_async(d_inputs[0], features["input_ids"], stream)
    cuda.memcpy_htod_async(d_inputs[1], features["segment_ids"], stream)
    cuda.memcpy_htod_async(d_inputs[2], features["input_mask"], stream)
    # Run inference
    context.execute_async_v2(
        bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
        stream_handle=stream.handle)
    # Transfer predictions back from the GPU
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream
    stream.synchronize()

    h_output = h_output.transpose((1, 0, 2, 3, 4))
    eval_time_elapsed = time.time() - eval_start_time

    print("------------------------")
    print("Running inference in {:.3f} Sentences/Sec".format(1.0 / eval_time_elapsed))
    print("------------------------")

    for index, batch in enumerate(h_output):
        # Data post-processing
        start_logits = batch[:, 0]
        end_logits = batch[:, 1]
        prediction, nbest_json, scores_diff_json = dp.get_predictions(
            doc_tokens, features, start_logits, end_logits,
            args.n_best_size, args.max_answer_length)
        print("Processing output {:} in batch".format(index))
        print("Answer: '{}'".format(prediction))
        print("With probability: {:.3f}%".format(nbest_json[0]['probability'] * 100.0))
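# ---------------------------------------------------------------------------
# Hedged setup sketch (not part of the original sample): inference() above
# relies on module-level globals -- `context`, `stream`, `d_inputs`,
# `d_output`, and `h_output`, plus `dp`, `args`, and `doc_tokens`. One minimal
# way to allocate them with TensorRT and pycuda is sketched below; the engine
# path, the binding order (inputs at 0-2, output at 3), and static binding
# shapes are all assumptions.
import numpy as np
import pycuda.autoinit  # noqa: F401 -- initializes the CUDA context
import pycuda.driver as cuda
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
with open("bert.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
stream = cuda.Stream()

# Three int32 input bindings followed by one float32 output binding (assumed).
d_inputs = [cuda.mem_alloc(trt.volume(engine.get_binding_shape(i))
                           * np.dtype(np.int32).itemsize) for i in range(3)]
h_output = cuda.pagelocked_empty(tuple(engine.get_binding_shape(3)),
                                 dtype=np.float32)
d_output = cuda.mem_alloc(h_output.nbytes)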
def predict(self, inputs: Tuple[Text, List[Text]]) -> List[Any]:
    import helpers.data_processing as dp
    import numpy as np

    def question_features(tokens, question):
        # Extract features from the paragraph and question
        return dp.convert_example_to_features(
            tokens, question, self._trtModel.tokenizer,
            self._trtModel.max_seq_length, self._trtModel.doc_stride,
            self._trtModel.max_query_length)

    features = []
    doc_tokens = dp.convert_doc_tokens(inputs[0])
    ques_list = inputs[1]
    batch_size = len(ques_list)
    if batch_size < 16:
        # Pad the input batch to the fixed batch size of 16 the engine expects.
        pad = [ques_list[0]] * (16 - batch_size)
        ques_list.extend(pad)

    for question_text in ques_list:
        features.append(question_features(doc_tokens, question_text)[0])

    input_ids_batch = np.dstack([feature.input_ids for feature in features]).squeeze()
    segment_ids_batch = np.dstack([feature.segment_ids for feature in features]).squeeze()
    input_mask_batch = np.dstack([feature.input_mask for feature in features]).squeeze()
    inputs = {
        "input_ids": input_ids_batch,
        "input_mask": input_mask_batch,
        "segment_ids": segment_ids_batch
    }
    output = self._trtModel.infer_context.infer(inputs)
    # The logits tensor packs start/end along the third axis.
    start_logits = output['cls_squad_logits'][:, :, 0, :, :]
    end_logits = output['cls_squad_logits'][:, :, 1, :, :]
    networkOutputs = [
        self._NetworkOutput(
            start_logits=start_logits[i, :],
            end_logits=end_logits[i, :],
            feature_index=0) for i in range(self._batch_size)
    ]
    predictions = []
    for feature, networkOutput in zip(features, networkOutputs):
        prediction, _, _ = dp.get_predictions(
            doc_tokens, [feature], [networkOutput],
            self._trtModel.n_best_size, self._trtModel.max_answer_length)
        predictions.append(prediction)

    return ["[Q]: " + ques + " [A]: " + prediction
            for ques, prediction in zip(ques_list, predictions)]
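# Hedged usage sketch for predict() above; the paragraph and questions are
# illustrative, and `model` is an assumed instance of the class that defines
# predict(). Note that predict() pads ques_list in place to the engine batch
# size, so the original question count is recorded before the call.
paragraph = ("TensorRT is a high performance deep learning inference "
             "platform that delivers low latency and high throughput.")
questions = ["What is TensorRT?", "What does TensorRT deliver?"]
n = len(questions)  # record before predict() pads the list in place
for line in model.predict((paragraph, questions))[:n]:
    print(line)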
def inference(features, tokens):
    global h_output

    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
        "NetworkOutput",
        ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Copy inputs, replicating each feature across the batch dimension
        input_ids_batch = np.dstack([feature.input_ids] * args.batch_size).squeeze()
        segment_ids_batch = np.dstack([feature.segment_ids] * args.batch_size).squeeze()
        input_mask_batch = np.dstack([feature.input_mask] * args.batch_size).squeeze()

        input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
        segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))
        input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel()))

        eval_start_time = time.time()
        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

        # Run inference, skipping the bindings owned by other optimization profiles
        context.execute_async_v2(
            bindings=[0 for i in range(binding_idx_offset)]
            + [int(d_inp) for d_inp in d_inputs] + [int(d_output)],
            stream_handle=stream.handle)
        # Synchronize the stream
        stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Transfer predictions back from GPU
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

        # Only retrieve and post-process the first batch
        batch = h_output[0]
        networkOutputs.append(_NetworkOutput(
            start_logits=np.array(batch.squeeze()[:, 0]),
            end_logits=np.array(batch.squeeze()[:, 1]),
            feature_index=feature_index))

    eval_time_elapsed /= len(features)

    # args.n_best_size: total number of n-best predictions to generate in the
    # nbest_predictions.json output file. args.max_answer_length: the maximum
    # length of an answer that can be generated; needed because the start and
    # end predictions are not conditioned on one another.
    prediction, nbest_json, scores_diff_json = dp.get_predictions(
        tokens, features, networkOutputs, args.n_best_size, args.max_answer_length)

    return eval_time_elapsed, prediction, nbest_json
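# Hedged sketch (an assumption, not from the sample): the `binding_idx_offset`
# used above is the binding offset of the selected optimization profile. When
# an engine has several profiles, each profile owns its own copy of the
# bindings, so the offset is typically derived from the profile index like
# this, reusing the `engine` object from the setup sketch earlier:
selected_profile = 0  # illustrative choice
num_bindings_per_profile = engine.num_bindings // engine.num_optimization_profiles
binding_idx_offset = selected_profile * num_bindings_per_profile
context.active_optimization_profile = selected_profile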
def inference(features, tokens):
    global h_output

    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
        "NetworkOutput",
        ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Copy inputs
        input_ids = cuda.register_host_memory(
            np.ascontiguousarray(feature.input_ids.ravel()))
        segment_ids = cuda.register_host_memory(
            np.ascontiguousarray(feature.segment_ids.ravel()))
        input_mask = cuda.register_host_memory(
            np.ascontiguousarray(feature.input_mask.ravel()))

        eval_start_time = time.time()
        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

        # Run inference
        context.execute_async_v2(
            bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
            stream_handle=stream.handle)
        # Synchronize the stream
        stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Transfer predictions back from GPU
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

        for index, batch in enumerate(h_output):
            # Data post-processing
            networkOutputs.append(_NetworkOutput(
                start_logits=np.array(batch.squeeze()[:, 0]),
                end_logits=np.array(batch.squeeze()[:, 1]),
                feature_index=feature_index))

    eval_time_elapsed /= len(features)

    prediction, nbest_json, scores_diff_json = dp.get_predictions(
        tokens, features, networkOutputs, args.n_best_size, args.max_answer_length)

    return eval_time_elapsed, prediction, nbest_json
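# Hedged end-to-end usage sketch for inference() above. The `dp` helper calls
# mirror the signatures used elsewhere in this file; the paragraph, question,
# tokenizer, and sequence-length settings are illustrative assumptions.
paragraph = "TensorRT is a platform for high-performance deep learning inference."
question = "What is TensorRT?"
doc_tokens = dp.convert_doc_tokens(paragraph)
# Positional args: tokens, question, tokenizer, max_seq_length, doc_stride,
# max_query_length (matching the call in predict() above).
features = dp.convert_example_to_features(doc_tokens, question, tokenizer, 384, 128, 64)
eval_time, prediction, nbest_json = inference(features, doc_tokens)
print("Answer: '{}' ({:.1f} ms/feature)".format(prediction, 1000.0 * eval_time))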
def inference(features, tokens):
    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
        "NetworkOutput",
        ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Copy inputs
        input_ids = np.ascontiguousarray(feature.input_ids.ravel())
        segment_ids = np.ascontiguousarray(feature.segment_ids.ravel())
        input_mask = np.ascontiguousarray(feature.input_mask.ravel())

        eval_start_time = time.time()
        # Run inference
        h_output = bert.run(input_ids, segment_ids, input_mask)
        eval_time_elapsed += (time.time() - eval_start_time)

        # Data post-processing
        if len(h_output.shape) == 1:
            # Flat output: the first half holds the start logits, the second
            # half the end logits.
            S = int(h_output.shape[0] / 2)
            networkOutputs.append(_NetworkOutput(
                start_logits=np.array(h_output[0:S]),
                end_logits=np.array(h_output[S:S * 2]),
                feature_index=feature_index))
        else:
            for index, batch in enumerate(h_output):
                networkOutputs.append(_NetworkOutput(
                    start_logits=np.array(batch.squeeze()[:, 0]),
                    end_logits=np.array(batch.squeeze()[:, 1]),
                    feature_index=feature_index))

    eval_time_elapsed /= len(features)

    # args.n_best_size: total number of n-best predictions to generate in the
    # nbest_predictions.json output file. args.max_answer_length: the maximum
    # length of an answer that can be generated; needed because the start and
    # end predictions are not conditioned on one another.
    prediction, nbest_json, scores_diff_json = dp.get_predictions(
        tokens, features, networkOutputs, args.n_best_size, args.max_answer_length)

    return eval_time_elapsed, prediction, nbest_json
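# Hedged sketch of the `bert` runner used above (an assumption): any object
# with a run(input_ids, segment_ids, input_mask) method that returns the raw
# logits array would work. One way to provide it is a thin wrapper over
# Polygraphy's TrtRunner; the input/output tensor names are assumptions.
from polygraphy.backend.trt import EngineFromBytes, TrtRunner

class BertRunner:
    def __init__(self, engine_path):
        with open(engine_path, "rb") as f:
            self._runner = TrtRunner(EngineFromBytes(f.read()))
        self._runner.activate()

    def run(self, input_ids, segment_ids, input_mask):
        outputs = self._runner.infer(feed_dict={
            "input_ids": input_ids,
            "segment_ids": segment_ids,
            "input_mask": input_mask,
        })
        # Single output tensor; the name is an assumption.
        return outputs["cls_squad_logits"]

bert = BertRunner("bert.engine")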
def transformation():
    """Do an inference on a single batch of data.

    This predictor takes a JSON request containing a short paragraph and a
    question, runs BERT QA inference, and returns the prediction as plain
    text (one answer per response).
    """
    # Only JSON requests are supported.
    if flask.request.content_type != 'application/json':
        return flask.Response(
            response='This predictor only supports json data. We have a request of type '
            + flask.request.content_type,
            status=415,
            mimetype='text/plain')

    print("Getting request.")
    json_data = flask.request.get_json(force=True)
    short_paragraph_text = json_data["short_paragraph_text"]
    question_text = json_data["question_text"]

    print("Got request, starting prediction.")
    # Do prediction
    h_output, doc_tokens, features, sentences_sec = ScoringService.predict(
        short_paragraph_text, question_text)
    print("Finished prediction.")

    result = ""
    for index, batch in enumerate(h_output):
        start_logits = batch[:, 0]
        end_logits = batch[:, 1]

        # The total number of n-best predictions to generate in the
        # nbest_predictions.json output file
        n_best_size = 20
        # The maximum length of an answer that can be generated. This is needed
        # because the start and end predictions are not conditioned on one another
        max_answer_length = 30

        (prediction, nbest_json, scores_diff_json) = dp.get_predictions(
            doc_tokens, features, start_logits, end_logits,
            n_best_size, max_answer_length)

        result += "Answer: '{}' with prob: {:.3f}% at {:.3f} Sentences/Sec.".format(
            prediction, nbest_json[0]['probability'] * 100.0, sentences_sec)

    return flask.Response(response=result, status=200, mimetype='text/plain')
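# Hedged client-side sketch for the Flask endpoint above; the route
# (/invocations) and host/port follow the usual SageMaker-style serving
# convention and are assumptions.
import requests

payload = {
    "short_paragraph_text": "TensorRT is a platform for high-performance "
                            "deep learning inference.",
    "question_text": "What is TensorRT?",
}
resp = requests.post("http://localhost:8080/invocations", json=payload)
print(resp.status_code, resp.text)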
def inference(features, tokens):
    global h_output

    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
        "NetworkOutput",
        ["start_logits", "end_logits", "feature_index"])
    networkOutputs = []

    eval_time_elapsed = 0
    for feature_index, feature in enumerate(features):
        # Copy inputs. The variable-length engine takes the unpadded tokens
        # plus cumulative sequence lengths, so strip the padding first.
        S = np.sum(feature.input_mask)
        input_ids = feature.input_ids[0:S]
        segment_ids = feature.segment_ids[0:S]
        cu_seq_lens = np.array([0, S], dtype=np.int32)

        if context.get_binding_shape(0)[0] != S:
            context.set_binding_shape(0, (S, ))
        if context.get_binding_shape(1)[0] != S:
            context.set_binding_shape(1, (S, ))
        if context.get_binding_shape(2)[0] != 2:
            context.set_binding_shape(2, (2, ))
        if context.get_binding_shape(3)[0] != S:
            context.set_binding_shape(3, (S, ))

        h_input_ids = cuda.register_host_memory(
            np.ascontiguousarray(input_ids.ravel()))
        h_segment_ids = cuda.register_host_memory(
            np.ascontiguousarray(segment_ids.ravel()))
        h_cu_seq_lens = cuda.register_host_memory(
            np.ascontiguousarray(cu_seq_lens.ravel()))

        eval_start_time = time.time()
        cuda.memcpy_htod_async(d_inputs[0], h_input_ids, stream)
        cuda.memcpy_htod_async(d_inputs[1], h_segment_ids, stream)
        cuda.memcpy_htod_async(d_inputs[2], h_cu_seq_lens, stream)

        # Run inference
        context.execute_async_v2(
            bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
            stream_handle=stream.handle)
        # Synchronize the stream
        stream.synchronize()
        eval_time_elapsed += (time.time() - eval_start_time)

        # Transfer predictions back from GPU
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

        # Only retrieve and post-process the first batch: the flat output
        # holds S start logits followed by S end logits.
        networkOutputs.append(_NetworkOutput(
            start_logits=np.array(h_output[0:S]),
            end_logits=np.array(h_output[S:S * 2]),
            feature_index=feature_index))

    eval_time_elapsed /= len(features)

    # args.n_best_size: total number of n-best predictions to generate in the
    # nbest_predictions.json output file. args.max_answer_length: the maximum
    # length of an answer that can be generated; needed because the start and
    # end predictions are not conditioned on one another.
    prediction, nbest_json, scores_diff_json = dp.get_predictions(
        tokens, features, networkOutputs, args.n_best_size, args.max_answer_length)

    return eval_time_elapsed, prediction, nbest_json
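# Hedged note on the variable-length path above: with a single sequence the
# cumulative-sequence-lengths tensor is just [0, S]. For a packed batch of
# several unpadded sequences it would be the running sum of their lengths,
# e.g. (illustrative values):
seq_lens = [38, 51, 44]  # unpadded token counts per sequence
cu_seq_lens = np.cumsum([0] + seq_lens).astype(np.int32)  # -> [0, 38, 89, 133]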