Example No. 1
        def inference(features):
            global h_output
            print("\nRunning Inference...")
            eval_start_time = time.time()

            # Copy inputs
            cuda.memcpy_htod_async(d_inputs[0], features["input_ids"], stream)
            cuda.memcpy_htod_async(d_inputs[1], features["segment_ids"], stream)
            cuda.memcpy_htod_async(d_inputs[2], features["input_mask"], stream)
            # Run inference
            context.execute_async_v2(
                bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
                stream_handle=stream.handle)
            # Transfer predictions back from GPU
            cuda.memcpy_dtoh_async(h_output, d_output, stream)
            # Synchronize the stream
            stream.synchronize()

            h_output = h_output.transpose((1,0,2,3,4))
            eval_time_elapsed = time.time() - eval_start_time

            print("------------------------")
            print("Running inference in {:.3f} Sentences/Sec".format(1.0/eval_time_elapsed))
            print("------------------------")

            for index, batch in enumerate(h_output):
                # Data Post-processing
                start_logits = batch[:, 0]
                end_logits = batch[:, 1]

                prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features,
                        start_logits, end_logits, args.n_best_size, args.max_answer_length)

                print("Processing output {:} in batch".format(index))
                print("Answer: '{}'".format(prediction))
                print("With probability: {:.3f}%".format(nbest_json[0]['probability'] * 100.0))
Example No. 2
    def predict(self, inputs: Tuple[Text, List[Text]]) -> List[Any]:
        import helpers.data_processing as dp
        from polygraphy.backend.trt import TrtRunner
        import numpy as np
        import collections
        import time

        def question_features(tokens, question):
            # Extract features from the paragraph and question
            return dp.convert_example_to_features(
                tokens, question, self._trtModel.tokenizer,
                self._trtModel.max_seq_length, self._trtModel.doc_stride,
                self._trtModel.max_query_length)

        features = []
        doc_tokens = dp.convert_doc_tokens(inputs[0])
        ques_list = inputs[1]

        batch_size = len(ques_list)
        if batch_size < 16:
            # Pad the batch by repeating the first question so it matches the
            # fixed batch size (16) that the engine expects.
            pad = [ques_list[0]] * (16 - batch_size)
            ques_list.extend(pad)

        for question_text in ques_list:
            features.append(question_features(doc_tokens, question_text)[0])

        input_ids_batch = np.dstack(
            [feature.input_ids for feature in features]).squeeze()
        segment_ids_batch = np.dstack(
            [feature.segment_ids for feature in features]).squeeze()
        input_mask_batch = np.dstack(
            [feature.input_mask for feature in features]).squeeze()

        inputs = {
            "input_ids": input_ids_batch,
            "input_mask": input_mask_batch,
            "segment_ids": segment_ids_batch
        }
        output = self._trtModel.infer_context.infer(inputs)

        start_logits = output['cls_squad_logits'][:, :, 0, :, :]
        end_logits = output['cls_squad_logits'][:, :, 1, :, :]
        networkOutputs = [
            self._NetworkOutput(start_logits=start_logits[i, :],
                                end_logits=end_logits[i, :],
                                feature_index=0)
            for i in range(self._batch_size)
        ]
        predictions = []
        for feature, networkOutput in zip(features, networkOutputs):
            prediction, _, _ = dp.get_predictions(
                doc_tokens, [feature], [networkOutput],
                self._trtModel.n_best_size, self._trtModel.max_answer_length)
            predictions.append(prediction)

        return [
            "[Q]: " + ques + "     [A]:" + prediction
            for ques, prediction in zip(ques_list, predictions)
        ]
Example No. 3
        def inference(features, tokens):
            global h_output

            _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
                    "NetworkOutput",
                    ["start_logits", "end_logits", "feature_index"])
            networkOutputs = []

            eval_time_elapsed = 0
            for feature_index, feature in enumerate(features):
                # Copy inputs
                input_ids_batch = np.dstack([feature.input_ids] * args.batch_size).squeeze()
                segment_ids_batch = np.dstack([feature.segment_ids] * args.batch_size).squeeze()
                input_mask_batch = np.dstack([feature.input_mask] * args.batch_size).squeeze()

                input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
                segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))
                input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel()))

                eval_start_time = time.time()
                cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
                cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
                cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

                # Run inference
                context.execute_async_v2(
                    bindings=[0 for i in range(binding_idx_offset)]
                             + [int(d_inp) for d_inp in d_inputs]
                             + [int(d_output)],
                    stream_handle=stream.handle)
                # Synchronize the stream
                stream.synchronize()
                eval_time_elapsed += (time.time() - eval_start_time)

                # Transfer predictions back from GPU
                cuda.memcpy_dtoh_async(h_output, d_output, stream)
                stream.synchronize()

                # Only retrieve and post-process the first batch
                batch = h_output[0]
                networkOutputs.append(_NetworkOutput(
                    start_logits = np.array(batch.squeeze()[:, 0]),
                    end_logits = np.array(batch.squeeze()[:, 1]),
                    feature_index = feature_index
                    ))

            eval_time_elapsed /= len(features)

            # Total number of n-best predictions to generate in the nbest_predictions.json output file
            n_best_size = 20

            # The maximum length of an answer that can be generated. This is needed
            # because the start and end predictions are not conditioned on one another
            max_answer_length = 30

            prediction, nbest_json, scores_diff_json = dp.get_predictions(tokens, features,
                    networkOutputs, args.n_best_size, args.max_answer_length)

            return eval_time_elapsed, prediction, nbest_json
Example No. 4
        def inference(features, tokens):
            global h_output

            _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
                "NetworkOutput",
                ["start_logits", "end_logits", "feature_index"])
            networkOutputs = []

            eval_time_elapsed = 0
            for feature_index, feature in enumerate(features):
                # Copy inputs
                input_ids = cuda.register_host_memory(
                    np.ascontiguousarray(feature.input_ids.ravel()))
                segment_ids = cuda.register_host_memory(
                    np.ascontiguousarray(feature.segment_ids.ravel()))
                input_mask = cuda.register_host_memory(
                    np.ascontiguousarray(feature.input_mask.ravel()))

                eval_start_time = time.time()
                cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
                cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
                cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)

                # Run inference
                context.execute_async_v2(
                    bindings=[int(d_inp)
                              for d_inp in d_inputs] + [int(d_output)],
                    stream_handle=stream.handle)
                # Synchronize the stream
                stream.synchronize()
                eval_time_elapsed += (time.time() - eval_start_time)

                # Transfer predictions back from GPU
                cuda.memcpy_dtoh_async(h_output, d_output, stream)
                stream.synchronize()

                for index, batch in enumerate(h_output):
                    # Data Post-processing
                    networkOutputs.append(
                        _NetworkOutput(
                            start_logits=np.array(batch.squeeze()[:, 0]),
                            end_logits=np.array(batch.squeeze()[:, 1]),
                            feature_index=feature_index))

            eval_time_elapsed /= len(features)

            prediction, nbest_json, scores_diff_json = dp.get_predictions(
                tokens, features, networkOutputs, args.n_best_size,
                args.max_answer_length)

            return eval_time_elapsed, prediction, nbest_json
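
Examples No. 3 and No. 4 both take pre-built `features`. Using the `data_processing` helpers already imported in Example No. 2, a call could be prepared roughly as below, assuming the engine buffers and `args` from the surrounding script are already set up (e.g. as in the sketch after Example No. 1); the tokenizer construction and the length parameters are assumptions:

import helpers.data_processing as dp
import helpers.tokenization as tokenization  # assumption: the demo's BERT tokenization helper

# Assumed tokenizer and limits; adjust to match the engine that was built.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
max_seq_length, doc_stride, max_query_length = 384, 128, 64

paragraph_text = "TensorRT is NVIDIA's SDK for high-performance deep learning inference."
question_text = "What is TensorRT?"

doc_tokens = dp.convert_doc_tokens(paragraph_text)
features = dp.convert_example_to_features(
    doc_tokens, question_text, tokenizer,
    max_seq_length, doc_stride, max_query_length)

eval_time_elapsed, prediction, nbest_json = inference(features, doc_tokens)
print("Answer: '{}' ({:.3f} s per feature)".format(prediction, eval_time_elapsed))
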
Example No. 5
    def inference(features, tokens):

        _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
            "NetworkOutput", ["start_logits", "end_logits", "feature_index"])
        networkOutputs = []

        eval_time_elapsed = 0
        for feature_index, feature in enumerate(features):
            # Copy inputs
            input_ids = np.ascontiguousarray(feature.input_ids.ravel())
            segment_ids = np.ascontiguousarray(feature.segment_ids.ravel())
            input_mask = np.ascontiguousarray(feature.input_mask.ravel())

            eval_start_time = time.time()

            # Run inference
            h_output = bert.run(input_ids, segment_ids, input_mask)
            eval_time_elapsed += (time.time() - eval_start_time)

            # Data Post-processing
            if len(h_output.shape) == 1:
                S = int(h_output.shape[0] / 2)
                networkOutputs.append(
                    _NetworkOutput(start_logits=np.array(h_output[0:S]),
                                   end_logits=np.array(h_output[S:S * 2]),
                                   feature_index=feature_index))
            else:
                for index, batch in enumerate(h_output):
                    networkOutputs.append(
                        _NetworkOutput(
                            start_logits=np.array(batch.squeeze()[:, 0]),
                            end_logits=np.array(batch.squeeze()[:, 1]),
                            feature_index=feature_index))

        eval_time_elapsed /= len(features)

        # Total number of n-best predictions to generate in the nbest_predictions.json output file
        n_best_size = 20

        # The maximum length of an answer that can be generated. This is needed
        # because the start and end predictions are not conditioned on one another
        max_answer_length = 30

        prediction, nbest_json, scores_diff_json = dp.get_predictions(
            tokens, features, networkOutputs, args.n_best_size,
            args.max_answer_length)

        return eval_time_elapsed, prediction, nbest_json
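
Example No. 5 hides the device transfers behind `bert.run(...)`. A hypothetical wrapper exposing that interface, assembled from the same PyCUDA/TensorRT pattern used in Examples No. 3 and No. 4, might look like this; the class and every name in it are illustrative, not the example's actual implementation:

import numpy as np
import pycuda.driver as cuda

class BertRunner:
    """Hypothetical wrapper: run(input_ids, segment_ids, input_mask) -> logits array."""

    def __init__(self, context, d_inputs, d_output, h_output, stream):
        # context is a TensorRT IExecutionContext; the buffers are allocated as in
        # the setup sketch after Example No. 1.
        self.context, self.stream = context, stream
        self.d_inputs, self.d_output, self.h_output = d_inputs, d_output, h_output

    def run(self, input_ids, segment_ids, input_mask):
        # Copy the three int32 inputs to the device, execute, and copy the logits back.
        for d_inp, h_inp in zip(self.d_inputs, (input_ids, segment_ids, input_mask)):
            cuda.memcpy_htod_async(d_inp, h_inp, self.stream)
        self.context.execute_async_v2(
            bindings=[int(d) for d in self.d_inputs] + [int(self.d_output)],
            stream_handle=self.stream.handle)
        cuda.memcpy_dtoh_async(self.h_output, self.d_output, self.stream)
        self.stream.synchronize()
        return np.array(self.h_output)
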
Example No. 6
def transformation():
    """Do an inference on a single request. The request arrives as JSON with the
    paragraph and question, the BERT QA prediction is run on it, and the answer is
    returned as plain text (one formatted answer per output line).
    """

    # Validate that the request carries JSON
    if flask.request.content_type != 'application/json':
        return flask.Response(
            response='This predictor only supports JSON data; received content type '
            + flask.request.content_type,
            status=415,
            mimetype='text/plain')
    print("Getting request.")
    json_data = flask.request.get_json(force=True)
    short_paragraph_text = json_data["short_paragraph_text"]
    question_text = json_data["question_text"]

    print("Got request, starting prediction.")
    # Do prediction
    h_output, doc_tokens, features, sentences_sec = ScoringService.predict(
        short_paragraph_text, question_text)
    print("Finished prediction.")
    result = ""
    for index, batch in enumerate(h_output):
        start_logits = batch[:, 0]
        end_logits = batch[:, 1]

        # The total number of n-best predictions to generate in the nbest_predictions.json output file
        n_best_size = 20

        # The maximum length of an answer that can be generated. This is needed
        #  because the start and end predictions are not conditioned on one another
        max_answer_length = 30


        (prediction, nbest_json, scores_diff_json) = \
            dp.get_predictions(doc_tokens, features, start_logits, end_logits, n_best_size, max_answer_length)

        result += "Answer: '{}'".format(
            prediction) + " with prob: {:.3f}% ".format(
                nbest_json[0]['probability'] *
                100.0) + "at {:.3f} Sentences/Sec.".format(sentences_sec)

    return flask.Response(response=result, status=200, mimetype='text/plain')
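
The handler above expects a JSON body with `short_paragraph_text` and `question_text` and returns plain text. A request against it could look like the sketch below; the route and port are assumptions (SageMaker-style containers usually mount the predictor at `/invocations`):

import requests

payload = {
    "short_paragraph_text": "TensorRT is NVIDIA's SDK for high-performance deep learning inference.",
    "question_text": "What is TensorRT?",
}
# The URL is an assumed mount point for the Flask app shown above.
resp = requests.post("http://localhost:8080/invocations", json=payload)
print(resp.status_code, resp.text)  # e.g. 200 Answer: '...' with prob: ...% at ... Sentences/Sec.
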
Example No. 7
        def inference(features, tokens):
            global h_output

            _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name
                "NetworkOutput",
                ["start_logits", "end_logits", "feature_index"])
            networkOutputs = []

            eval_time_elapsed = 0
            for feature_index, feature in enumerate(features):
                # Copy inputs
                B = 1
                S = np.sum(feature.input_mask)
                input_ids = feature.input_ids[0:S]
                segment_ids = feature.segment_ids[0:S]
                cu_seq_lens = np.array([0, S], dtype=np.int32)

                if context.get_binding_shape(0)[0] != S:
                    context.set_binding_shape(0, (S, ))
                if context.get_binding_shape(1)[0] != S:
                    context.set_binding_shape(1, (S, ))
                if context.get_binding_shape(2)[0] != 2:
                    context.set_binding_shape(2, (2, ))
                if context.get_binding_shape(3)[0] != S:
                    context.set_binding_shape(3, (S, ))

                h_input_ids = cuda.register_host_memory(
                    np.ascontiguousarray(input_ids.ravel()))
                h_segment_ids = cuda.register_host_memory(
                    np.ascontiguousarray(segment_ids.ravel()))
                h_cu_seq_lens = cuda.register_host_memory(
                    np.ascontiguousarray(cu_seq_lens.ravel()))

                eval_start_time = time.time()
                cuda.memcpy_htod_async(d_inputs[0], h_input_ids, stream)
                cuda.memcpy_htod_async(d_inputs[1], h_segment_ids, stream)
                cuda.memcpy_htod_async(d_inputs[2], h_cu_seq_lens, stream)

                # Run inference
                context.execute_async_v2(
                    bindings=[int(d_inp)
                              for d_inp in d_inputs] + [int(d_output)],
                    stream_handle=stream.handle)
                # Synchronize the stream
                stream.synchronize()
                eval_time_elapsed += (time.time() - eval_start_time)

                # Transfer predictions back from GPU
                cuda.memcpy_dtoh_async(h_output, d_output, stream)
                stream.synchronize()

                # Unpack the start/end logits for this single sequence
                # (h_output is a flat array of length 2 * S)
                networkOutputs.append(
                    _NetworkOutput(start_logits=np.array(h_output[0:S]),
                                   end_logits=np.array(h_output[S:S * 2]),
                                   feature_index=feature_index))

            eval_time_elapsed /= len(features)

            # Total number of n-best predictions to generate in the nbest_predictions.json output file
            n_best_size = 20

            # The maximum length of an answer that can be generated. This is needed
            # because the start and end predictions are not conditioned on one another
            max_answer_length = 30

            prediction, nbest_json, scores_diff_json = dp.get_predictions(
                tokens, features, networkOutputs, args.n_best_size,
                args.max_answer_length)

            return eval_time_elapsed, prediction, nbest_json
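
Example No. 7 drives a variable-sequence-length engine, so the binding shapes are set per request and the device buffers must be allocated once for the largest sequence the profile allows. A minimal allocation sketch, assuming only the three input buffers the example copies into and an assumed maximum length (the engine may define further bindings, e.g. a dummy max-sequence-length input, that are not shown here):

import numpy as np
import pycuda.autoinit  # noqa: F401 -- initializes a CUDA context
import pycuda.driver as cuda

max_seq_length = 384  # assumed upper bound matching the engine's optimization profile
int32_size = np.dtype(np.int32).itemsize

d_inputs = [
    cuda.mem_alloc(max_seq_length * int32_size),  # input_ids, shape (S,)
    cuda.mem_alloc(max_seq_length * int32_size),  # segment_ids, shape (S,)
    cuda.mem_alloc(2 * int32_size),               # cu_seq_lens, shape (2,)
]

# Start and end logits come back as one flat float32 array of length 2 * S.
h_output = cuda.pagelocked_empty(2 * max_seq_length, dtype=np.float32)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()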