Example #1
    def __init__(self, config: Config, output_encoded_layers: bool,
                 **kwargs) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)

        # map to the real model_path
        config.model_path = (resources.roberta.RESOURCE_MAP[config.model_path]
                             if config.model_path
                             in resources.roberta.RESOURCE_MAP else
                             config.model_path)
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."

        # create a compression layer if using linear multihead attention;
        # the compression is shared across all layers
        if config.use_linformer_encoder:
            compress_layer = nn.Linear(
                config.max_seq_len - 2,
                (config.max_seq_len - 2) // config.linformer_compressed_ratio,
            )

        self.use_selfie_encoder = config.use_selfie_encoder

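        # build one TransformerLayer per encoder layer, choosing the attention variant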
        if config.use_linformer_encoder:
            if config.linformer_quantize:
                layers = [
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=QuantizedMultiheadLinearAttention(
                            embed_dim=config.embedding_dim,
                            num_heads=config.num_attention_heads,
                            compress_layer=compress_layer,
                        ),
                    ) for _ in range(config.num_encoder_layers)
                ]
            else:
                layers = [
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=MultiheadLinearAttention(
                            embed_dim=config.embedding_dim,
                            num_heads=config.num_attention_heads,
                            compress_layer=compress_layer,
                        ),
                    ) for _ in range(config.num_encoder_layers)
                ]
        else:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                    ),
                ) for _ in range(config.num_encoder_layers)
            ]

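        # wrap the layers: SELFIE models get a PostEncoder, everything else a SentenceEncoder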
        self.encoder = (SentenceEncoder(transformer=Transformer(
            vocab_size=config.vocab_size,
            embedding_dim=config.embedding_dim,
            layers=layers,
            max_seq_len=config.max_seq_len,
        )) if not self.use_selfie_encoder else PostEncoder(
            transformer=SELFIETransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )))
        self.apply(init_params)
        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(f,
                                           map_location=lambda s, l:
                                           default_restore_location(s, "cpu"))
            # If the model has previously been loaded in PyText and fine-tuned,
            # we don't need the special state dict translation; load it
            # directly.
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            else:
                self.load_state_dict(roberta_state)

        self.export_encoder = config.export_encoder
        self.variable_size_embedding = config.variable_size_embedding
        self.use_linformer_encoder = config.use_linformer_encoder
        log_class_usage(__class__)
Example #2
    def __init__(  # noqa C901
        self,
        config: Config,
        output_encoded_layers: bool,
        token_embedding: nn.Embedding = None,
        **kwargs,
    ) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)

        # map to the real model_path
        config.model_path = (
            resources.roberta.RESOURCE_MAP[config.model_path]
            if config.model_path in resources.roberta.RESOURCE_MAP
            else config.model_path
        )
        # assert config.pretrained_encoder.load_path, "Load path cannot be empty."

        # create a compression layer if using linear multihead attention;
        # the compression is shared across all layers
        if config.use_linformer_encoder:
            compress_layer = nn.Linear(
                config.max_seq_len - 2,
                (config.max_seq_len - 2) // config.linformer_compressed_ratio,
            )

        self.use_selfie_encoder = config.use_selfie_encoder
        self.skip_token_embed = config.skip_token_embed

        if config.use_linformer_encoder:
            if config.linformer_quantize:
                layers = [
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=QuantizedMultiheadLinearAttention(
                            embed_dim=config.embedding_dim,
                            num_heads=config.num_attention_heads,
                            compress_layer=compress_layer,
                        ),
                    )
                    for _ in range(config.num_encoder_layers)
                ]
            else:
                layers = [
                    TransformerLayer(
                        embedding_dim=config.embedding_dim,
                        attention=MultiheadLinearAttention(
                            embed_dim=config.embedding_dim,
                            num_heads=config.num_attention_heads,
                            compress_layer=compress_layer,
                        ),
                    )
                    for _ in range(config.num_encoder_layers)
                ]
        else:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadSelfAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        scaling=config.scaling,
                    ),
                    normalize_before=config.normalize_before,
                )
                for _ in range(config.num_encoder_layers)
            ]
        if not config.skip_token_embed:
            self.encoder = (
                SentenceEncoder(
                    transformer=Transformer(
                        vocab_size=config.vocab_size,
                        embedding_dim=config.embedding_dim,
                        layers=layers,
                        max_seq_len=config.max_seq_len,
                        normalize_before=config.normalize_before,
                        token_embedding=token_embedding,
                    )
                )
                if not self.use_selfie_encoder
                else PostEncoder(
                    transformer=SELFIETransformer(
                        vocab_size=config.vocab_size,
                        embedding_dim=config.embedding_dim,
                        layers=layers,
                        max_seq_len=config.max_seq_len,
                    )
                )
            )
        else:
            self.encoder = PassthroughEncoder(
                transformer=PassthroughTransformer(
                    vocab_size=config.vocab_size,
                    embedding_dim=config.embedding_dim,
                    layers=layers,
                    max_seq_len=config.max_seq_len,
                    normalize_before=config.normalize_before,
                )
            )
        self.apply(init_params)

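        # pruning runs either before or after loading, depending on config.prune_before_load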
        if config.prune_before_load:
            self._prune_transformer_layers_and_heads(config)

        if config.model_path:
            with PathManager.open(config.model_path, "rb") as f:
                roberta_state = torch.load(
                    f, map_location=lambda s, l: default_restore_location(s, "cpu")
                )
            # If the model has previously been loaded in PyText and fine-tuned,
            # we don't need the special state dict translation; load it
            # directly.
            if not config.is_finetuned:
                self.encoder.load_roberta_state_dict(roberta_state["model"])
            elif config.load_partial_model is not None:
                roberta_state = {
                    k.replace(config.load_partial_model + ".", ""): v
                    for k, v in roberta_state["model_state"].items()
                    if k.startswith(config.load_partial_model)
                }
                self.load_state_dict(roberta_state)
            else:
                self.load_state_dict(roberta_state)

        if config.use_bias_finetuning:
            for (n, p) in self.encoder.named_parameters():
                # "encoder.transformer.layers.0.attention.input_projection.weight" -> false
                # "encoder.transformer.layers.0.attention.input_projection.bias" -> true
                if n.split(".")[-1] != "bias":
                    p.requires_grad_(False)

        if not config.prune_before_load:
            self._prune_transformer_layers_and_heads(config)

        self.export_encoder = config.export_encoder
        self.variable_size_embedding = config.variable_size_embedding
        self.use_linformer_encoder = config.use_linformer_encoder
        log_class_usage(__class__)
Example #3
    def __init__(
        self, config: Config, output_encoded_layers: bool, *args, **kwargs
    ) -> None:
        super().__init__(config, output_encoded_layers=output_encoded_layers)
        # Load config
        config_file = os.path.join(config.bert_cpt_dir, "config.json")
        local_config_path = PathManager.get_local_path(config_file)
        bert_config = BertConfig.from_json_file(local_config_path)
        print("Bert model config {}".format(bert_config))
        # Instantiate model.
        model = BertModel(bert_config)
        weights_path = os.path.join(config.bert_cpt_dir, "pytorch_model.bin")
        # load pre-trained weights if weights_path exists
        if config.load_weights and PathManager.isfile(weights_path):
            with PathManager.open(weights_path, "rb") as fd:
                state_dict = torch.load(fd)

            missing_keys: List[str] = []
            unexpected_keys: List[str] = []
            error_msgs: List[str] = []
            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, "_metadata", None)
            state_dict = state_dict.copy()
            for key in list(state_dict.keys()):
                new_key = None
                if key.endswith("LayerNorm.gamma"):  # compatibility with v0.5 models
                    new_key = key.replace("LayerNorm.gamma", "LayerNorm.weight")
                if key.endswith("LayerNorm.beta"):  # compatibility with v0.5 models
                    new_key = key.replace("LayerNorm.beta", "LayerNorm.bias")
                if new_key is not None:
                    state_dict[new_key] = state_dict.pop(key)

            if metadata is not None:
                state_dict._metadata = metadata

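            # recursively run _load_from_state_dict on every submodule,
            # collecting missing/unexpected keys and error messages as it goes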
            def load(module, prefix=""):
                local_metadata = (
                    {} if metadata is None else metadata.get(prefix[:-1], {})
                )
                module._load_from_state_dict(
                    state_dict,
                    prefix,
                    local_metadata,
                    True,
                    missing_keys,
                    unexpected_keys,
                    error_msgs,
                )
                for name, child in module._modules.items():
                    if child is not None:
                        load(child, prefix + name + ".")

            load(model, prefix="" if hasattr(model, "bert") else "bert.")
            if len(missing_keys) > 0:
                print(
                    "Weights of {} not initialized from pretrained model: {}".format(
                        model.__class__.__name__, missing_keys
                    )
                )
            if len(unexpected_keys) > 0:
                print(
                    "Weights from pretrained model not used in {}: {}".format(
                        model.__class__.__name__, unexpected_keys
                    )
                )

        self.bert = model
        log_class_usage(__class__)
Example #4
    def torchscript_export(
        self,
        model,
        export_path=None,
        sort_input=False,
        sort_key=1,
        export_config=None,
    ):
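        """Trace the model to TorchScript, applying quantization and accelerator
        lowering as requested by export_config, and optionally save the trace
        to export_path."""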
        # TODO(T88310041) Remove These
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(False)
        # unpack export config
        if export_config is None:
            export_config = ExportConfig()

        quantize = export_config.torchscript_quantize

        accelerate = export_config.accelerate
        seq_padding_control = export_config.seq_padding_control
        batch_padding_control = export_config.batch_padding_control

        # a single nnpi:quantize option obviates the need for torchscript quantize on NNPI
        use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
        use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
        use_nnpi_gelu_clip = "nnpi:gelu_clip" in accelerate
        use_cuda_half = "cuda:half" in accelerate
        use_cuda_half_faster_transformers = "cuda:half:ft" in accelerate

        use_nnpi_quantize = "nnpi:quantize" in accelerate
        use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
        use_nnpi_fx_static_selectively_quantize = (
            "nnpi:fx_static_selectively_quantize" in accelerate)
        use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
        use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
        use_cpu_fx_static_selectively_quantize = (
            "cpu:fx_static_selectively_quantize" in accelerate)
        use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
        use_fx_quantize = (use_nnpi_fx_static_quantize
                           or use_nnpi_fx_static_selectively_quantize
                           or use_nnpi_fx_dynamic_quantize
                           or use_cpu_fx_static_quantize
                           or use_cpu_fx_static_selectively_quantize
                           or use_cpu_fx_dynamic_quantize)

        # which hosts this model can run on;
        # by default, PyText works on CPU and CUDA (because it implements set_device)
        model_host = ["cpu", "cuda"]

        if use_cuda_half or use_cuda_half_faster_transformers:
            # CUDA FP16 models only work on CUDA
            model_host = ["cuda"]

        if (use_nnpi or use_nnpi_quantize or use_nnpi_gelu_clip
                or use_nnpi_throughput_optimized):
            model_host = ["nnpi"]

        if hasattr(model, "set_host"):
            model.set_host(model_host)

        # what type of model this is;
        # PyText models are NLP models
        model_type = ["nlp"]

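        # tag the model as a transformer and/or BiLSTM based on the modules it contains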
        instance_paths_p = any(
            True for _ in find_module_instances(model, RoBERTaEncoder, []))
        if instance_paths_p:
            model_type.append("transformer")

        instance_paths_p = any(
            True for _ in find_module_instances(model, BiLSTM, []))
        if instance_paths_p:
            model_type.append("BiLSTM")

        if hasattr(model, "set_model"):
            model.set_type(model_type)

        # Make sure to put the model on CPU and disable CUDA before exporting to
        # ONNX to disable any data_parallel pieces
        cuda.CUDA_ENABLED = False
        model.cpu()
        optimizer = self.trainer.optimizer
        optimizer.pre_export(model)

        if use_nnpi or use_fx_quantize:
            model = swap_modules(model, MODULE_TO_REWRITER["nnpi"])

        if "nnpi:split" in accelerate:
            model = split_model_for_accelerator(model)

        # Tracing needs eval mode to disable dropout, etc.
        model.eval()
        model.prepare_for_onnx_export_()

        unused_raw_batch, batch = next(
            iter(self.data.batches(Stage.TRAIN, load_early=True)))
        inputs = model.onnx_trace_input(batch)
        # call model forward to set correct device types
        if sort_input:
            _, sorted_indices = sort(inputs[sort_key], descending=True)
            inputs = [i.index_select(0, sorted_indices) for i in inputs]
        model(*inputs)

        # Default to dynamic
        if use_fx_quantize:
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            trace = quantize_fx(
                model,
                inputs,
                data_loader,
                use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
                use_nnpi_fx_static_selectively_quantize
                or use_cpu_fx_static_selectively_quantize,
            )
        elif (quantize or use_nnpi_quantize) and hasattr(
                model, "graph_mode_quantize"):
            data_loader = self.data.batches(Stage.TRAIN, load_early=False)
            print("Quantizing the model ...")
            # recognize the legacy nnpi_quantize or the $platform:$option syntax
            quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize"
                                                         in accelerate)
            module_swap = use_nnpi
            trace = quantize_statically(model, inputs, data_loader,
                                        quantize_linear_only, module_swap)
        else:
            if quantize:
                log_feature_usage("quantize.dynamically.CPU")
                model.quantize()
            if use_cuda_half_faster_transformers:
                log_accelerator_feature_usage(
                    "build.CUDA.half.faster_transformers")
                # We need a separate path for GPU-only tracing, as we can't just trace a CPU model
                # and invoke .cuda().half(),
                # as we don't have equivalent CPU implementations of these operators.
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True

                model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
                model.eval()
                model.half().cuda()
                # obtain new inputs with cuda/fp16 enabled.
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                trace = model.trace(inputs)
                print("traced (faster_transformers)!")
                # should be unnecessary.
                trace.cuda().half()
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True)))
                inputs = model.onnx_trace_input(batch)
                results = trace(*inputs)
                assert results
                print(results)
            else:
                trace = model.trace(inputs)
                print("traced!")
                if use_cuda_half:
                    log_accelerator_feature_usage("build.CUDA.half")
                    # convert trace to half precision
                    trace.cuda().half()

                    #### trace test: demonstrate that it is usable
                    precision.FP16_ENABLED = True
                    cuda.CUDA_ENABLED = True
                    unused_raw_batch, batch = next(
                        iter(self.data.batches(Stage.TRAIN, load_early=True)))
                    inputs = model.onnx_trace_input(batch)
                    assert trace(*inputs)
                    #### end of trace test
        if hasattr(model, "torchscriptify"):
            trace = model.torchscriptify(self.data.tensorizers, trace)
        if hasattr(trace, "validate"):
            trace.validate(export_config)
        if seq_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("sequence_length",
                                          seq_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring seq_padding_control"
                )
        if batch_padding_control is not None:
            if hasattr(trace, "set_padding_control"):
                trace.set_padding_control("batch_length",
                                          batch_padding_control)
            else:
                print(
                    "Padding_control not supported by model. Ignoring batch_padding_control"
                )
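        # invoke _pack on any submodule that implements it before saving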
        trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)
        if use_nnpi:
            print("lowering using to_glow")
            trace = lower_modules_to_accelerator(
                model,
                trace,
                export_config,
                use_nnpi_throughput_optimized,
                use_nnpi_gelu_clip,
            )
        if "split" in accelerate:
            print("lowering split model to glow")
            trace = lower_split_model_to_accelerator(model, trace,
                                                     export_config)

        if export_path is not None:
            print(f"Saving torchscript model to: {export_path}")
            with PathManager.open(export_path, "wb") as f:
                torch.jit.save(trace, f)
        return trace
Example #5
def get_test_sample():
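    """Load and return the raw test sample JSON from RAW_TEST_PATH."""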
    with PathManager.open(RAW_TEST_PATH, "r") as f:
        data = json.load(f)
    return data
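
All five examples route file access through PathManager, which hides whether a
path is a plain local file or a registered URI scheme behind one
open/isfile/get_local_path interface. Below is a minimal, self-contained sketch
of that pattern, assuming a PyText install (PathManager here is pytext's
iopath-backed wrapper, and SAMPLE_PATH is a hypothetical local file used purely
for illustration):

import json

from pytext.utils.file_io import PathManager

SAMPLE_PATH = "/tmp/sample.json"  # hypothetical path, for illustration only

# write a small JSON payload through PathManager
with PathManager.open(SAMPLE_PATH, "w") as f:
    json.dump({"text": "hello"}, f)

# read it back, mirroring get_test_sample() in Example #5
with PathManager.open(SAMPLE_PATH, "r") as f:
    data = json.load(f)

assert data["text"] == "hello"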