Example #1
def quantize_statically(model,
                        inputs,
                        data_loader,
                        linear_only=False,
                        module_swap=False):
    log_feature_usage("export.quantize.statically")
    if (hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder)
            and linear_only):
        log_accelerator_feature_usage("quantize.statically")
        qconfig = QConfig(
            activation=HistogramObserver.with_args(reduce_range=False),
            weight=default_weight_observer,
        )
        qconfig_dict = {"": None}
        if module_swap:
            layers = model.encoder.encoder.transformer.layers.layers
            layers_str = "encoder.encoder.transformer.layers.layers"
        else:
            layers = model.encoder.encoder.transformer.layers
            layers_str = "encoder.encoder.transformer.layers"

        # skip first layer
        for layer_idx in range(1, len(layers)):
            qconfig_dict[
                layers_str +
                ".{}.attention.input_projection".format(layer_idx)] = qconfig
            qconfig_dict[
                layers_str +
                ".{}.attention.output_projection".format(layer_idx)] = qconfig
            for mlp_idx, m in enumerate(layers[layer_idx].residual_mlp.mlp):
                # Only quantize the first linear layer; otherwise there are accuracy issues
                if type(m) == torch.nn.Linear and mlp_idx < 1:
                    qconfig_dict[layers_str + ".{}.residual_mlp.mlp.{}".format(
                        layer_idx, mlp_idx)] = qconfig
        trace = model.graph_mode_quantize(inputs,
                                          data_loader,
                                          qconfig_dict=qconfig_dict,
                                          force_quantize=True)
    else:
        trace = model.graph_mode_quantize(inputs, data_loader)

    return trace
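
A hedged usage sketch for the helper above (the variable names and the calibration source are illustrative, not taken from the listing): an export path would pass the model, example inputs, and a calibration data loader, enabling linear-only quantization and the module swap only for the NNPI-style RoBERTa path.

    # Illustrative only: assumes `model` has a RoBERTaEncoder under model.encoder
    # and that `batch` / `self.data` come from the surrounding PyText task.
    inputs = model.onnx_trace_input(batch)
    data_loader = self.data.batches(Stage.TRAIN, load_early=False)
    trace = quantize_statically(
        model,
        inputs,
        data_loader,
        linear_only=True,   # restrict static quantization to the linear projections
        module_swap=True,   # layers live under encoder.encoder.transformer.layers.layers
    )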
Example #2
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # TODO(T88310041) Remove These
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(False)
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control
        accel = AccelerateOptions(export_config.accelerate)
        print(f"Using accelerate options: {accel.__dict__}")

        # what hosts can this model run on
        # by default, pytext works on CPU and CUDA (because it implements set_device)
        model_host = ["cpu", "cuda"]
        if accel.use_cuda:
            # CUDA FP16 models only work on CUDA
            model_host = ["cuda"]
        if accel.use_nnpi:
            model_host = ["nnpi"]
        if hasattr(model, "set_host"):
            model.set_host(model_host)

        # what is the type of this model
        # pytext models are nlp models
        model_type = ["nlp"]

        instance_paths_p = any(
            True for _ in find_module_instances(model, RoBERTaEncoder, []))
        if instance_paths_p:
            model_type.append("transformer")

        instance_paths_p = any(
            True for _ in find_module_instances(model, BiLSTM, []))
        if instance_paths_p:
            model_type.append("BiLSTM")

        if hasattr(model, "set_model"):
            model.set_type(model_type)

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        model = rewrite_nnpi_modules(model, accel)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        # Default to dynamic
        if accel.use_fx_quantize:
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = quantize_fx(
                model,
                inputs,
                data_loader,
                accel.use_nnpi_fx_dynamic_quantize
                or accel.use_cpu_fx_dynamic_quantize,
                accel.use_nnpi_fx_static_selectively_quantize
                or accel.use_cpu_fx_static_selectively_quantize,
            )
        elif (quantize or accel.use_nnpi_quantize) and hasattr(
                model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize legacy nnpi_q or $platform:$option syntax
            quantize_linear_only = accel.use_nnpi_quantize
            module_swap = accel.use_nnpi
            trace = quantize_statically(model, inputs, data_loader,
                                        quantize_linear_only, module_swap)
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()
            if accel.use_cuda and (accel.use_cuda_half_ft
                                   or accel.use_cuda_dq):
                log_accelerator_feature_usage(
                    "build.CUDA.half.faster_transformers")
                # We need a separate path for GPU-only tracing, as we can't just trace a CPU model
                # and invoke .cuda().half(),
                # as we don't have equivalent CPU implementations of these operators.
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                if accel.use_cuda_dq:
                    model = swap_modules(model, MODULE_TO_REWRITER["cuda-dq"])
                else:
                    model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
                model.eval()
                model.half().cuda()
                # obtain new inputs with cuda/fp16 enabled.
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                trace = model.trace(inputs)
                print("Traced (faster_transformers)!")
                # should be unnecessary.
                trace.cuda().half()
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                results = trace(*inputs)
                assert results
            else:
                trace = model.trace(inputs)
                print("Traced!")
                if accel.use_cuda and accel.use_cuda_half:
                    log_accelerator_feature_usage("build.CUDA.half")
                    # convert trace to half precision
                    trace.cuda().half()

                    #### trace test: demonstrate that it is usable
                    precision.FP16_ENABLED = True
                    cuda.CUDA_ENABLED = True
                    unused_raw_batch, batch = next(
                        iter(self.data.batches(Stage.TRAIN, load_early=True)))
                    inputs = model.onnx_trace_input(batch)
                    assert trace(*inputs)
                    #### end of trace test
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if accel.use_nnpi and not accel.use_nnpi_split:
            print("Lowering using to_glow")
            trace = lower_modules_to_accelerator(
                model,
                trace,
                export_config,
                accel.use_nnpi_throughput_optimized,
                accel.use_nnpi_throughput_maximized,
                accel.use_nnpi_gelu_clip,
            )
        if accel.use_nnpi_split:
            print("Lowering split model to Glow")
            trace = lower_split_model_to_accelerator(model, trace,
                                                     export_config)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
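
For context, a minimal driver sketch (the ExportConfig constructor arguments and the `task` handle are assumptions; the listing above only shows these fields being read from export_config):

    # Illustrative only: field names mirror what torchscript_export reads above.
    export_config = ExportConfig(
        accelerate=["nnpi"],                 # parsed into AccelerateOptions
        torchscript_quantize=False,
        seq_padding_control=[0, 32, 256],    # example padding buckets
        batch_padding_control=[0, 8, 32],
    )
    trace = task.torchscript_export(
        model,
        export_path="/tmp/model.pt1",
        export_config=export_config,
    )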
Example #3
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize

        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control
        inference_interface = export_config.inference_interface

        # introduce a single nnpi:quantize that obviates need for torchscript quantize on NNPI
        use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
        use_cuda_half = "cuda:half" in accelerate
        use_nnpi_quantize = "nnpi:quantize" in accelerate

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        if use_nnpi:
            model = swap_modules_for_accelerator(model)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        if (quantize or use_nnpi_quantize) and hasattr(model,
                                                       "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize legacy nnpi_q or $platform:$option syntax
            quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize"
                                                         in accelerate)
            module_swap = use_nnpi
            trace = quantize_statically(model, inputs, data_loader,
                                        quantize_linear_only, module_swap)
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()

            trace = model.trace(inputs)
            print("traced!")

            if use_cuda_half:
                log_accelerator_feature_usage("build.CUDA.half")
                # convert trace to half precision
                trace.cuda().half()

                #### trace test: demonstrate that it is usable
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
                #### end of trace test

        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        if inference_interface is not None:
            if hasattr(trace, "inference_interface"):
                trace.inference_interface(inference_interface)
            else:
                print(
                    "inference_interface not supported by model. Ignoring inference_interface"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if use_nnpi:
            print("lowering using to_glow")
            trace = lower_modules_to_accelerator(model, trace, export_config)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
Example #4
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # TODO(T88310041) Remove These
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(False)
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize

        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control

        # introduce a single nnpi:quantize that obviates need for torchscript quantize on NNPI
        use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
        use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
        use_cuda_half = "cuda:half" in accelerate
        use_cuda_half_faster_transformers = "cuda:half:ft" in accelerate

        use_nnpi_quantize = "nnpi:quantize" in accelerate
        use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
        use_nnpi_fx_static_selectively_quantize = (
            "nnpi:fx_static_selectively_quantize" in accelerate
        )
        use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
        use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
        use_cpu_fx_static_selectively_quantize = (
            "cpu:fx_static_selectively_quantize" in accelerate
        )
        use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
        use_fx_quantize = (
            use_nnpi_fx_static_quantize
            or use_nnpi_fx_static_selectively_quantize
            or use_nnpi_fx_dynamic_quantize
            or use_cpu_fx_static_quantize
            or use_cpu_fx_static_selectively_quantize
            or use_cpu_fx_dynamic_quantize
        )

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        if use_nnpi or use_fx_quantize:
            model = swap_modules(model, MODULE_TO_REWRITER["nnpi"])

        if "nnpi:split" in accelerate:
            model = split_model_for_accelerator(model)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True))
        )
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        # Default to dynamic
        if use_fx_quantize:
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = quantize_fx(
                model,
                inputs,
                data_loader,
                use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
                use_nnpi_fx_static_selectively_quantize
                or use_cpu_fx_static_selectively_quantize,
            )
        elif (quantize or use_nnpi_quantize) and hasattr(model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize legacy nnpi_q or $platform:$option syntax
            quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize" in accelerate)
            module_swap = use_nnpi
            trace = quantize_statically(
                model, inputs, data_loader, quantize_linear_only, module_swap
            )
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()
            if use_cuda_half_faster_transformers:
                log_accelerator_feature_usage("build.CUDA.half.faster_transformers")
                # We need a separate path for GPU-only tracing, as we can't just trace a CPU model
                # and invoke .cuda().half(),
                # as we don't have equivalent CPU implementations of these operators.
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True

                model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
                model.eval()
                model.half().cuda()
                # obtain new inputs with cuda/fp16 enabled.
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                trace = model.trace(inputs)
                print("traced (faster_transformers)!")
                # should be unnecessary.
                trace.cuda().half()
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
            else:
                trace = model.trace(inputs)
                print("traced!")
                if use_cuda_half:
                    log_accelerator_feature_usage("build.CUDA.half")
                    # convert trace to half precision
                    trace.cuda().half()

                    #### trace test: demonstrate that it is usable
                    precision.FP16_ENABLED = True
                    cuda.CUDA_ENABLED = True
                    unused_raw_batch, batch = next(
                        iter(self.data.batches(Stage.TRAIN, load_early=True))
                    )
                    inputs = model.onnx_trace_input(batch)
                    assert trace(*inputs)
                    #### end of trace test
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length", seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length", batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if use_nnpi:
            print("lowering using to_glow")
            trace = lower_modules_to_accelerator(
                model, trace, export_config, use_nnpi_throughput_optimized
            )
        if "split" in accelerate:
            print("lowering split model to glow")
            trace = lower_split_model_to_accelerator(model, trace, export_config)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
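
Example #4 differs from the earlier variants mainly in the set of accelerate option strings it parses. As a quick reference derived from the flags above (the ExportConfig call is illustrative):

    # Accelerate option strings recognized in Example #4:
    #   "nnpi", "nnpi:quantize", "nnpi:split", "nnpi:throughput_optimized"
    #   "nnpi:fx_static_quantize", "nnpi:fx_static_selectively_quantize", "nnpi:fx_dynamic_quantize"
    #   "cpu:fx_static_quantize", "cpu:fx_static_selectively_quantize", "cpu:fx_dynamic_quantize"
    #   "cuda:half", "cuda:half:ft"
    export_config = ExportConfig(accelerate=["nnpi:fx_static_quantize"])  # illustrative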