Example #1
 def load_cached_embeddings(self, cache_path: str) -> None:
     """
     Load cached embeddings from file
     """
     with PathManager.open(cache_path, "rb") as f:
         self.embed_vocab, self.stoi, self.embedding_vectors = torch.load(f)
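A minimal counterpart sketch, not part of the original example: it writes a cache in the same 3-tuple layout the loader above unpacks (the method name is hypothetical).

 def save_cached_embeddings(self, cache_path: str) -> None:
     """
     Persist embeddings in the format load_cached_embeddings expects
     """
     with PathManager.open(cache_path, "wb") as f:
         torch.save((self.embed_vocab, self.stoi, self.embedding_vectors), f)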
Example #2
    def sensitivity_analysis(self, trainer, state, eval_data, metric_reporter,
                             train_config):
        """
        Analyze the sensitivity of each weight tensor to the metric.
        Prune the weight tensors one by one and evaluate the metric after the
        corresponding weight tensor is pruned.
        Args:
            trainer (Trainer): the trainer used to run evaluation
            state (TrainingState): the state of the current training
            eval_data (BatchIterator): batch iterator of evaluation data
            metric_reporter (MetricReporter): compute metric based on training
            output and report results to console, file, etc.
            train_config (PyTextConfig): training config

        Returns:
            analysis_result: a dict mapping each pruned parameter name to the
            change in the evaluation metric relative to the unpruned baseline.
        """
        print("Analyzed_sparsity: {}".format(self.analyzed_sparsity))
        print("Evaluation metric_reporter: {}".format(
            type(metric_reporter).__name__))
        output_path = (
            os.path.dirname(train_config.task.metric_reporter.output_path) +
            "/sensitivity_analysis_sparsifier.ckp")

        # param_dict maps each sparsifiable parameter name to its weight tensor
        self.param_dict = self.get_sparsifiable_params(state.model)

        # set model to evaluation mode
        state.stage = Stage.EVAL
        state.model.eval(Stage.EVAL)

        metric_dict = {}
        all_param_list = [None] + list(self.param_dict.keys())
        print("All prunable parameters", all_param_list)

        # print the sensitivity results for each weight
        print("#" * 40)
        print("save the analysis result to: ", output_path)
        print("Pruning Sensitivity Test: param / shape / eval metric")

        # iterate through all_param_list to test pruning sensitivity
        for param_name in all_param_list:
            print("=" * 40)
            print("Testing {}".format(param_name))
            state.model.load_state_dict(self.loaded_model["model_state"])

            current_metric, prunable_param_shape = self.layer_wise_analysis(
                param_name, self.param_dict, trainer, state, eval_data,
                metric_reporter)
            if param_name is None:
                baseline_metric = current_metric
            metric_dict[param_name] = current_metric - baseline_metric
        print("#" * 40)

        # remove baseline metric from the analysis results
        if None in metric_dict:
            del metric_dict[None]
        # write the test result into the checkpoint
        if state.rank == 0:
            with PathManager.open(output_path, "w") as fp:
                json.dump(metric_dict, fp)

        return metric_dict
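Because the per-parameter deltas are also dumped as JSON, a hedged post-processing sketch (the checkpoint path is a placeholder):

# hypothetical: rank parameters by how much pruning them degrades the metric
with PathManager.open("/path/to/sensitivity_analysis_sparsifier.ckp", "r") as fp:
    metric_dict = json.load(fp)
for name, delta in sorted(metric_dict.items(), key=lambda kv: kv[1]):
    print("{}: metric delta vs. baseline = {:.4f}".format(name, delta))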
Example #3
 def save(self, state, save_path):
     with PathManager.open(save_path, "wb") as f:
         torch.save(state, f)
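A hedged counterpart for reading the state back, assuming the same PathManager and torch imports (this method is not part of the original):

 def load(self, load_path):
     # inverse of save: restore the object written by torch.save
     with PathManager.open(load_path, "rb") as f:
         return torch.load(f, map_location="cpu")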
Example #4
 def __init__(self, *args, **kwargs):
     self._file = PathManager.open(*args, **kwargs)
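A wrapper like this usually forwards the file protocol to the underlying handle; a minimal sketch (these extra methods are assumptions, not part of the original):

 def __enter__(self):
     return self._file

 def __exit__(self, exc_type, exc_value, traceback):
     self._file.close()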
Example #5
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize

        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control

        # introduce a single nnpi:quantize that obviates the need for torchscript quantize on NNPI
        use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
        use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
        use_cuda_half = "cuda:half" in accelerate
        use_nnpi_quantize = "nnpi:quantize" in accelerate
        use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
        use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
        use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
        use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
        use_fx_quantize = (use_nnpi_fx_static_quantize
                           or use_nnpi_fx_dynamic_quantize
                           or use_cpu_fx_static_quantize
                           or use_cpu_fx_dynamic_quantize)

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        if use_nnpi or use_fx_quantize:
            model = swap_modules_for_accelerator(model)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        # Default to dynamic
        if use_fx_quantize:
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = quantize_fx(
                model,
                inputs,
                data_loader,
                use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
            )
        elif (quantize or use_nnpi_quantize) and hasattr(
                model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize legacy nnpi_q or $platform:$option syntax
            quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize"
                                                         in accelerate)
            module_swap = use_nnpi
            trace = quantize_statically(model, inputs, data_loader,
                                        quantize_linear_only, module_swap)
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()

            trace = model.trace(inputs)
            print("traced!")

            if use_cuda_half:
                log_accelerator_feature_usage("build.CUDA.half")
                # convert trace to half precision
                trace.cuda().half()

                #### trace test: demonstrate that it is usable
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
                #### end of trace test

        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if use_nnpi:
            print("lowering using to_glow")
            trace = lower_modules_to_accelerator(
                model, trace, export_config, use_nnpi_throughput_optimized)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
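A hedged call-site sketch (the task object, export path, and padding values are placeholders; the ExportConfig field names mirror what torchscript_export reads above, while passing them as constructor keyword arguments is an assumption):

export_config = ExportConfig(
    torchscript_quantize=False,
    accelerate=["cuda:half"],
    seq_padding_control=[0, 32, 256],
    batch_padding_control=[0, 1, 8],
)
trace = task.torchscript_export(
    model, export_path="/tmp/model.pt1", export_config=export_config
)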
Example #6
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # TODO(T88310041) Remove These
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(False)
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize

        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control

        # introduce a single nnpi:quantize that obviates the need for torchscript quantize on NNPI
        use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
        use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
        use_nnpi_gelu_clip = "nnpi:gelu_clip" in accelerate
        use_cuda_half = "cuda:half" in accelerate
        use_cuda_half_faster_transformers = "cuda:half:ft" in accelerate

        use_nnpi_quantize = "nnpi:quantize" in accelerate
        use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
        use_nnpi_fx_static_selectively_quantize = (
            "nnpi:fx_static_selectively_quantize" in accelerate
        )
        use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
        use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
        use_cpu_fx_static_selectively_quantize = (
            "cpu:fx_static_selectively_quantize" in accelerate
        )
        use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
        use_fx_quantize = (
            use_nnpi_fx_static_quantize
            or use_nnpi_fx_static_selectively_quantize
            or use_nnpi_fx_dynamic_quantize
            or use_cpu_fx_static_quantize
            or use_cpu_fx_static_selectively_quantize
            or use_cpu_fx_dynamic_quantize
        )

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        if use_nnpi or use_fx_quantize:
            model = swap_modules(model, MODULE_TO_REWRITER["nnpi"])

        if "nnpi:split" in accelerate:
            model = split_model_for_accelerator(model)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True))
        )
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        # Default to dynamic
        if use_fx_quantize:
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = quantize_fx(
                model,
                inputs,
                data_loader,
                use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
                use_nnpi_fx_static_selectively_quantize
                or use_cpu_fx_static_selectively_quantize,
            )
        elif (quantize or use_nnpi_quantize) and hasattr(model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize legacy nnpi_q or $platform:$option syntax
            quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize" in accelerate)
            module_swap = use_nnpi
            trace = quantize_statically(
                model, inputs, data_loader, quantize_linear_only, module_swap
            )
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()
            if use_cuda_half_faster_transformers:
                log_accelerator_feature_usage("build.CUDA.half.faster_transformers")
                # We need a separate path for GPU-only tracing: we can't just trace a CPU model
                # and invoke .cuda().half(), because we don't have equivalent CPU
                # implementations of these operators.
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True

                model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
                model.eval()
                model.half().cuda()
                # obtain new inputs with cuda/fp16 enabled.
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                trace = model.trace(inputs)
                print("traced (faster_transformers)!")
                # should be unnecessary.
                trace.cuda().half()
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                results = trace(*inputs)
                assert results
                print(results)
            else:
                trace = model.trace(inputs)
                print("traced!")
                if use_cuda_half:
                    log_accelerator_feature_usage("build.CUDA.half")
                    # convert trace to half precision
                    trace.cuda().half()

                    #### trace test: demonstrate that it is usable
                    precision.FP16_ENABLED = True
                    cuda.CUDA_ENABLED = True
                    unused_raw_batch, batch = next(
                        iter(self.data.batches(Stage.TRAIN, load_early=True))
                    )
                    inputs = model.onnx_trace_input(batch)
                    assert trace(*inputs)
                    #### end of trace test
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length", seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length", batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if use_nnpi:
            print("lowering using to_glow")
            trace = lower_modules_to_accelerator(
                model,
                trace,
                export_config,
                use_nnpi_throughput_optimized,
                use_nnpi_gelu_clip,
            )
        if "split" in accelerate:
            print("lowering split model to glow")
            trace = lower_split_model_to_accelerator(model, trace, export_config)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
Example #7
 def from_vocab_filename(cls, vocab_filename: str) -> "ScriptBPE":
     with PathManager.open(vocab_filename) as vocab_file:
         return cls(cls.load_vocab(vocab_file))
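A hedged usage sketch (the vocabulary file path is a placeholder):

# build the BPE helper directly from a vocabulary file
bpe = ScriptBPE.from_vocab_filename("/path/to/bpe_vocab.txt")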
Example #8
    def torchscript_export(self,
                           model,
                           export_path=None,
                           sort_input=False,
                           sort_key=1,
                           **kwargs):
        # unpack export kwargs
        quantize = kwargs.get("quantize", False)
        accelerate = kwargs.get("accelerate", [])
        seq_padding_control = kwargs.get("seq_padding_control")
        batch_padding_control = kwargs.get("batch_padding_control")
        inference_interface = kwargs.get("inference_interface")

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)
        if "half" in accelerate:
            model.half()
        if quantize and hasattr(model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = model.graph_mode_quantize(inputs, data_loader)
        else:
            if quantize:
                model.quantize()
            trace = model.trace(inputs)
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring padding_control"
                )
        if inference_interface is not None:
            if hasattr(trace, "inference_interface"):
                trace.inference_interface(inference_interface)
            else:
                print(
                    "inference_interface not supported by model. Ignoring inference_interface"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if "nnpi" in accelerate:
            trace._c = torch._C._freeze_module(
                trace._c,
                preservedAttrs=[
                    "make_prediction", "make_batch", "set_padding_control"
                ],
            )
        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
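A hedged call-site sketch for this kwargs-based variant (the task object, export path, and values are placeholders; the keyword names are exactly those unpacked above):

trace = task.torchscript_export(
    model,
    export_path="/tmp/model.pt1",
    quantize=False,
    accelerate=["nnpi"],
    seq_padding_control=[0, 32, 256],
    batch_padding_control=[0, 1, 8],
)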
Example #9
    def torchscript_export(self,
                           model,
                           export_path=None,
                           export_config=None):  # noqa
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize
        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control
        inference_interface = export_config.inference_interface

        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        model(*inputs)
        if quantize:
            model.quantize()
        if accelerate is not None and "half" in accelerate:
            model.half()
        if self.trace_both_encoders:
            trace = jit.trace(model, inputs)
        else:
            trace = jit.trace(model.encoder1, (inputs[0], ))
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace,
                                         self.trace_both_encoders)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring padding_control"
                )
        if inference_interface is not None:
            if hasattr(trace, "inference_interface"):
                trace.inference_interface(inference_interface)
            else:
                print(
                    "inference_interface not supported by model. Ignoring inference_interface"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if accelerate is not None and "nnpi" in accelerate:
            trace._c = torch._C._freeze_module(
                trace._c,
                preservedAttrs=[
                    "make_prediction", "make_batch", "set_padding_control"
                ],
            )
        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
Example #10
    def __init__(
        self,
        config: Config,
        right_dim: int,
        left_dim: int,
        to_dim: int,
        export_type=ExportType.NONE,
    ) -> None:
        super().__init__(config)

        self.mlp_for_right = MLPDecoderTwoTower.get_mlp(
            right_dim,
            0,
            config.right_hidden_dims,
            config.layer_norm,
            config.dropout,
            config.activation,
            export_embedding=True,
        )
        self.mlp_for_left = MLPDecoderTwoTower.get_mlp(
            left_dim,
            0,
            config.left_hidden_dims,
            config.layer_norm,
            config.dropout,
            config.activation,
            export_embedding=True,
        )

        if config.dense_tower_dims is None:
            from_dim = config.right_hidden_dims[-1] + config.left_hidden_dims[
                -1]
            self.concat_dense = False
        else:
            self.mlp_for_dense = MLPDecoderTwoTower.get_mlp(
                from_dim=config.dense_tower_dims[0],
                to_dim=config.dense_tower_dims[-1],
                hidden_dims=config.dense_tower_dims[1:-1],
                layer_norm=config.layer_norm,
                dropout=config.dropout,
                activation=config.activation,
            )
            from_dim = (config.right_hidden_dims[-1] +
                        config.left_hidden_dims[-1] +
                        config.dense_tower_dims[-1])
            self.concat_dense = True

        self.mlp = MLPDecoderTwoTower.get_mlp(
            from_dim,
            to_dim,
            config.hidden_dims,
            config.layer_norm,
            config.dropout,
            config.activation,
        )

        # load model
        if config.load_model_path:
            with PathManager.open(config.load_model_path, "rb") as f:
                model = torch.load(f,
                                   map_location=lambda s, l:
                                   default_restore_location(s, "cpu"))
            mlp_state = {
                k.replace("decoder.", ""): v
                for k, v in model["model_state"].items()
                if k.startswith("decoder.mlp")
                or k.startswith("decoder.mlp_for_right")
                or k.startswith("decoder.mlp_for_left")
                or k.startswith("decoder.mlp_for_dense")
            }
            if mlp_state["mlp.0.weight"].shape[1] != from_dim:
                mlp_state = {
                    k: v
                    for k, v in mlp_state.items() if not k.startswith("mlp.")
                }
                print("top mlp weights not loaded")
            self.load_state_dict(mlp_state, strict=config.load_strict)
            print("loaded mlp state")

        self.out_dim = to_dim
        self.export_type = export_type
        log_class_usage(__class__)
Example #11
def reader_raw(file_path, vocab):
    with PathManager.open(file_path, "r") as r:
        for line in r:
            yield vocab[line.strip()]
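A minimal usage sketch (the vocab mapping and file path are hypothetical):

vocab = {"hello": 0, "world": 1}
for token_id in reader_raw("/path/to/tokens.txt", vocab):
    print(token_id)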
Example #12
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
        # TODO(T88310041) Remove These
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(False)
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control
        accel = AccelerateOptions(export_config.accelerate)
        print(f"Using accelerate options: {accel.__dict__}")

        # what hosts can this model run on
        # by default, pytext works on CPU and CUDA (because it implements set_device)
        model_host = ["cpu", "cuda"]
        if accel.use_cuda:
            # CUDA FP16 models only work on CUDA
            model_host = ["cuda"]
        if accel.use_nnpi:
            model_host = ["nnpi"]
        if hasattr(model, "set_host"):
            model.set_host(model_host)

        # what is the type of this model
        # pytext models are nlp models
        model_type = ["nlp"]

        instance_paths_p = any(
            True for _ in find_module_instances(model, RoBERTaEncoder, []))
        if instance_paths_p:
            model_type.append("transformer")

        instance_paths_p = any(
            True for _ in find_module_instances(model, BiLSTM, []))
        if instance_paths_p:
            model_type.append("BiLSTM")

        if hasattr(model, "set_model"):
            model.set_type(model_type)

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        model = rewrite_nnpi_modules(model, accel)

        # Trace needs eval mode, to disable dropout etc
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        # Default to dynamic
        if accel.use_fx_quantize:
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = quantize_fx(
                model,
                inputs,
                data_loader,
                accel.use_nnpi_fx_dynamic_quantize
                or accel.use_cpu_fx_dynamic_quantize,
                accel.use_nnpi_fx_static_selectively_quantize
                or accel.use_cpu_fx_static_selectively_quantize,
            )
        elif (quantize or accel.use_nnpi_quantize) and hasattr(
                model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize legacy nnpi_q or $platform:$option syntax
            quantize_linear_only = accel.use_nnpi_quantize
            module_swap = accel.use_nnpi
            trace = quantize_statically(model, inputs, data_loader,
                                        quantize_linear_only, module_swap)
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()
            if accel.use_cuda and (accel.use_cuda_half_ft
                                   or accel.use_cuda_dq):
                log_accelerator_feature_usage(
                    "build.CUDA.half.faster_transformers")
                # We need a separate path for GPU-only tracing: we can't just trace a CPU model
                # and invoke .cuda().half(), because we don't have equivalent CPU
                # implementations of these operators.
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                if accel.use_cuda_dq:
                    model = swap_modules(model, MODULE_TO_REWRITER["cuda-dq"])
                else:
                    model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
                model.eval()
                model.half().cuda()
                # obtain new inputs with cuda/fp16 enabled.
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                trace = model.trace(inputs)
                print("Traced (faster_transformers)!")
                # should be unnecessary.
                trace.cuda().half()
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                results = trace(*inputs)
                assert results
            else:
                trace = model.trace(inputs)
                print("Traced!")
                if accel.use_cuda and accel.use_cuda_half:
                    log_accelerator_feature_usage("build.CUDA.half")
                    # convert trace to half precision
                    trace.cuda().half()

                    #### trace test: demonstrate that it is usable
                    precision.FP16_ENABLED = True
                    cuda.CUDA_ENABLED = True
                    unused_raw_batch, batch = next(
                        iter(self.data.batches(Stage.TRAIN, load_early=True)))
                    inputs = model.onnx_trace_input(batch)
                    assert trace(*inputs)
                    #### end of trace test
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if accel.use_nnpi and not accel.use_nnpi_split:
            print("Lowering using to_glow")
            trace = lower_modules_to_accelerator(
                model,
                trace,
                export_config,
                accel.use_nnpi_throughput_optimized,
                accel.use_nnpi_throughput_maximized,
                accel.use_nnpi_gelu_clip,
            )
        if accel.use_nnpi_split:
            print("Lowering split model to Glow")
            trace = lower_split_model_to_accelerator(model, trace,
                                                     export_config)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
Example #13
def get_test_sample():
    with PathManager.open(RAW_TEST_PATH, "r") as f:
        data = json.load(f)
    return data
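For example (assuming RAW_TEST_PATH points at a JSON file):

sample = get_test_sample()
print("loaded test sample of type:", type(sample).__name__)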
Example #14
    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)

        # map to the real model_path
        config.model_path = (resources.roberta.RESOURCE_MAP[config.model_path]
                             if config.model_path
                             in resources.roberta.RESOURCE_MAP else
                             config.model_path)
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        # sharing compression across all layers

        # create a compress layer when using linear multihead attention
        if config.use_linformer_encoder:
            compress_layer = nn.Linear(
                config.max_seq_len - 2,
                (config.max_seq_len - 2) // config.linformer_compressed_ratio,
            )

        self.use_selfie_encoder = config.use_selfie_encoder

        if config.use_linformer_encoder:
            if config.linformer_quantize:
                layers = [
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=QuantizedMultiheadLinearAttention(
                            embed_dim=config.embedding_dim,
                            num_heads=config.num_attention_heads,
                            compress_layer=compress_layer,
                        ),
                    ) for _ in range(config.num_encoder_layers)
                ]
            else:
                layers = [
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=MultiheadLinearAttention(
                            embed_dim=config.embedding_dim,
                            num_heads=config.num_attention_heads,
                            compress_layer=compress_layer,
                        ),
                    ) for _ in range(config.num_encoder_layers)
                ]
        else:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        scaling=config.scaling,
                    ),
                    normalize_before=config.normalize_before,
                ) for _ in range(config.num_encoder_layers)
            ]

        self.encoder = (SentenceEncoder(transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=layers,
            max_seq_len=config.max_seq_len,
            normalize_before=config.normalize_before,
        )) if not self.use_selfie_encoder else PostEncoder(
            transformer=SELFIETransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )))
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(f,
                                           map_location=lambda s, l:
                                           default_restore_location(s, "cpu"))
            # In case the model has previously been loaded in PyText and finetuned,
            # then we don't need to do the special state dict translation. Load
            # it directly
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)

        if config.use_bias_finetuning:
            for (n, p) in self.encoder.named_parameters():
                # "encoder.transformer.layers.0.attention.input_projection.weight" -> false
                # "encoder.transformer.layers.0.attention.input_projection.bias" -> true
                if n.split(".")[-1] != "bias":
                    p.requires_grad_(False)

        self._prune_transformer_layers_and_heads(config)

        self.export_encoder = config.export_encoder
        self.variable_size_embedding = config.variable_size_embedding
        self.use_linformer_encoder = config.use_linformer_encoder
        log_class_usage(__class__)
Example #15
    def torchscript_export(self,
                           model,
                           export_path=None,
                           export_config=None):  # noqa
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize
        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control

        if (accelerate is not None) and (accelerate != []):
            raise RuntimeError(
                "old-style task.py does not support export for NNPI accelerators"
            )

        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        model(*inputs)
        if quantize:
            model.quantize()
        if self.trace_both_encoders:
            trace = jit.trace(model, inputs)
        else:
            trace = jit.trace(model.encoder1, (inputs[0], ))
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace,
                                         self.trace_both_encoders)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring padding_control"
                )
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
Example #16
def print_sample(file_name):
    with PathManager.open(file_name, "r") as given_file:
        for _i in range(SAMPLE_PRINT_COUNT):
            line = next(given_file).strip()
            print(line)
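A hedged, slightly more defensive variant that tolerates files shorter than SAMPLE_PRINT_COUNT lines instead of raising StopIteration:

def print_sample_safe(file_name):
    # same idea, but stop early when the file runs out of lines
    with PathManager.open(file_name, "r") as given_file:
        for _i, line in zip(range(SAMPLE_PRINT_COUNT), given_file):
            print(line.strip())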
Example #17
    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwarg) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)

        # map to the real model_path
        config.model_path = (resources.roberta.RESOURCE_MAP[config.model_path]
                             if config.model_path
                             in resources.roberta.RESOURCE_MAP else
                             config.model_path)
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."
        # sharing compression across all layers

        # create a compress layer when using linear multihead attention
        if config.use_linformer_encoder:
            compress_layer = nn.Linear(
                config.max_seq_len - 2,
                (config.max_seq_len - 2) // config.linformer_compressed_ratio,
            )

        self.use_selfie_encoder = config.use_selfie_encoder

        layers = [
            TransformerLayer(
                embedding_dim=config.embedding_dim,
                attention=MultiheadLinearAttention(
                    embed_dim=config.embedding_dim,
                    num_heads=config.num_attention_heads,
                    compress_layer=compress_layer,
                ) if config.use_linformer_encoder else MultiheadSelfAttention(
                    embed_dim=config.embedding_dim,
                    num_heads=config.num_attention_heads),
            ) for _ in range(config.num_encoder_layers)
        ]

        self.encoder = (SentenceEncoder(transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=layers,
            max_seq_len=config.max_seq_len,
        )) if not self.use_selfie_encoder else PostEncoder(
            transformer=SELFIETransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )))
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(f,
                                           map_location=lambda s, l:
                                           default_restore_location(s, "cpu"))
            # In case the model has previously been loaded in PyText and finetuned,
            # then we don't need to do the special state dict translation. Load
            # it directly
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)

        self.representation_dim = self._embedding().weight.size(-1)
        self.export_encoder = config.export_encoder
        self.variable_size_embedding = config.variable_size_embedding
        log_class_usage(__class__)
Example #18
    def __init__(
        self, config: Config, output_encoded_layers: bool, *args, **kwargs
    ) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # Load config
        config_file = os.path.join(config.bert_cpt_dir, "config.json")
        local_config_path = PathManager.get_local_path(config_file)
        bert_config = BertConfig.from_json_file(local_config_path)
        print("Bert model config {}".format(bert_config))
        # Instantiate model.
        model = BertModel(bert_config)
        weights_path = os.path.join(config.bert_cpt_dir, "pytorch_model.bin")
        # load pre-trained weights if weights_path exists
        if config.load_weights and PathManager.isfile(weights_path):
            with PathManager.open(weights_path, "rb") as fd:
                state_dict = torch.load(fd)

            missing_keys: List[str] = []
            unexpected_keys: List[str] = []
            error_msgs: List[str] = []
            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, "_metadata", None)
            for key in list(state_dict.keys()):
                new_key = None
                if key.endswith("LayerNorm.gamma"):  # compatibility with v0.5 models
                    new_key = key.replace("LayerNorm.gamma", "LayerNorm.weight")
                if key.endswith("LayerNorm.beta"):  # compatibility with v0.5 models
                    new_key = key.replace("LayerNorm.beta", "LayerNorm.bias")
                if new_key is not None:
                    state_dict[new_key] = state_dict.pop(key)

            if metadata is not None:
                state_dict._metadata = metadata

            def load(module, prefix=""):
                local_metadata = (
                    {} if metadata is None else metadata.get(prefix[:-1], {})
                )
                module._load_from_state_dict(
                    state_dict,
                    prefix,
                    local_metadata,
                    True,
                    missing_keys,
                    unexpected_keys,
                    error_msgs,
                )
                for name, child in module._modules.items():
                    if child is not None:
                        load(child, prefix + name + ".")

            load(model, prefix="" if hasattr(model, "bert") else "bert.")
            if len(missing_keys) > 0:
                print(
                    "Weights of {} not initialized from pretrained model: {}".format(
                        model.__class__.__name__, missing_keys
                    )
                )
            if len(unexpected_keys) > 0:
                print(
                    "Weights from pretrained model not used in {}: {}".format(
                        model.__class__.__name__, unexpected_keys
                    )
                )

        self.bert = model
        log_class_usage(__class__)