def load_cached_embeddings(self, cache_path: str) -> None:
    """Load cached embeddings from file."""
    with PathManager.open(cache_path, "rb") as f:
        self.embed_vocab, self.stoi, self.embedding_vectors = torch.load(f)
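# Hedged sketch (not part of the library): producing a cache file that
# load_cached_embeddings above can read. The loader simply torch.load()s a
# 3-tuple of (embed_vocab, stoi, embedding_vectors), so the writer mirrors
# that. Paths, vocabulary, and the 300-d random vectors are illustrative;
# plain open() is used here instead of PathManager to keep it self-contained.
import torch

embed_vocab = ["<unk>", "hello", "world"]                # illustrative vocabulary
stoi = {tok: i for i, tok in enumerate(embed_vocab)}     # string-to-index map
embedding_vectors = torch.randn(len(embed_vocab), 300)   # illustrative 300-d vectors

with open("/tmp/embeddings.cache", "wb") as f:
    torch.save((embed_vocab, stoi, embedding_vectors), f)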
def sensitivity_analysis(
    self, trainer, state, eval_data, metric_reporter, train_config
):
    """
    Analyze the sensitivity of each weight tensor to the metric.
    Prune the weight tensors one by one and evaluate the metric with the
    corresponding weight tensor pruned.
    Args:
        trainer (Trainer): trainer used to evaluate the pruned model
        state (TrainingState): the state of the current training
        eval_data (BatchIterator): batch iterator of evaluation data
        metric_reporter (MetricReporter): computes metrics based on the
            training output and reports results to console, file, etc.
        train_config (PyTextConfig): training config

    Returns:
        metric_dict: dict mapping each prunable parameter to the change in
            the evaluation metric when that parameter is pruned.
    """
    print("Analyzed_sparsity: {}".format(self.analyzed_sparsity))
    print("Evaluation metric_reporter: {}".format(type(metric_reporter).__name__))
    output_path = (
        os.path.dirname(train_config.task.metric_reporter.output_path)
        + "/sensitivity_analysis_sparsifier.ckp"
    )

    # param_dict: the dict maps weight tensor to the parameter name
    self.param_dict = self.get_sparsifiable_params(state.model)

    # set model to evaluation mode
    state.stage = Stage.EVAL
    state.model.eval(Stage.EVAL)

    metric_dict = {}
    all_param_list = [None] + list(self.param_dict.keys())
    print("All prunable parameters", all_param_list)

    # print the sensitivity results for each weight
    print("#" * 40)
    print("save the analysis result to: ", output_path)
    print("Pruning Sensitivity Test: param / shape / eval metric")
    # iterate through all_param_list to test pruning sensitivity
    for param_name in all_param_list:
        print("=" * 40)
        print("Testing {}".format(param_name))
        state.model.load_state_dict(self.loaded_model["model_state"])

        current_metric, prunable_param_shape = self.layer_wise_analysis(
            param_name, self.param_dict, trainer, state, eval_data, metric_reporter
        )
        if param_name is None:
            baseline_metric = current_metric
        metric_dict[param_name] = current_metric - baseline_metric

    print("#" * 40)
    # remove baseline metric from the analysis results
    if None in metric_dict:
        del metric_dict[None]
    # write the test result into the checkpoint
    if state.rank == 0:
        with PathManager.open(output_path, "w") as fp:
            json.dump(metric_dict, fp)

    return metric_dict
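# Hedged example (not part of PyText): consuming the JSON checkpoint that
# sensitivity_analysis writes. The file maps each prunable parameter name to
# the metric change relative to the unpruned baseline; the sort direction used
# here to rank "most sensitive" parameters first is an assumption.
import json

with open("sensitivity_analysis_sparsifier.ckp") as fp:
    metric_dict = json.load(fp)

# Parameters whose pruning degrades the metric the most appear first.
for name, delta in sorted(metric_dict.items(), key=lambda kv: kv[1]):
    print(f"{name}: metric change vs. baseline = {delta:.4f}")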
def save(self, state, save_path):
    with PathManager.open(save_path, "wb") as f:
        torch.save(state, f)
def __init__(self, *args, **kwargs):
    self._file = PathManager.open(*args, **kwargs)
def torchscript_export(
    self,
    model,
    export_path=None,
    sort_input=False,
    sort_key=1,
    export_config=None,
):
    # unpack export config
    if export_config is None:
        export_config = ExportConfig()
    quantize = export_config.torchscript_quantize
    accelerate = export_config.accelerate
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control

    # introduce a single nnpi:quantize that obviates the need for torchscript quantize on NNPI
    use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
    use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
    use_cuda_half = "cuda:half" in accelerate

    use_nnpi_quantize = "nnpi:quantize" in accelerate
    use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
    use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
    use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
    use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
    use_fx_quantize = (
        use_nnpi_fx_static_quantize
        or use_nnpi_fx_dynamic_quantize
        or use_cpu_fx_static_quantize
        or use_cpu_fx_dynamic_quantize
    )

    # Make sure to put the model on CPU and disable CUDA before exporting to
    # ONNX to disable any data_parallel pieces
    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    if use_nnpi or use_fx_quantize:
        model = swap_modules_for_accelerator(model)

    # Trace needs eval mode, to disable dropout etc.
    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    # call model forward to set correct device types
    if sort_input:
        _, sorted_indices = sort(inputs[sort_key], descending=True)
        inputs = [i.index_select(0, sorted_indices) for i in inputs]
    model(*inputs)

    # Default to dynamic
    if use_fx_quantize:
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        trace = quantize_fx(
            model,
            inputs,
            data_loader,
            use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
        )
    elif (quantize or use_nnpi_quantize) and hasattr(model, "graph_mode_quantize"):
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        print("Quantizing the model ...")
        # recognize legacy nnpi_q or $platform:$option syntax
        quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize" in accelerate)
        module_swap = use_nnpi
        trace = quantize_statically(
            model, inputs, data_loader, quantize_linear_only, module_swap
        )
    else:
        if quantize:
            log_feature_usage("quantize.dynamically.CPU")
            model.quantize()
        trace = model.trace(inputs)
        print("traced!")
        if use_cuda_half:
            log_accelerator_feature_usage("build.CUDA.half")
            # convert trace to half precision
            trace.cuda().half()

            #### trace test: demonstrate that it is usable
            precision.FP16_ENABLED = True
            cuda.CUDA_ENABLED = True
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            assert trace(*inputs)
            #### end of trace test

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(self.data.tensorizers, trace)
    if hasattr(trace, "validate"):
        trace.validate(export_config)

    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring seq_padding_control"
            )
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring batch_padding_control"
            )

    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)

    if use_nnpi:
        print("lowering using to_glow")
        trace = lower_modules_to_accelerator(
            model, trace, export_config, use_nnpi_throughput_optimized
        )

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace
def torchscript_export(
    self,
    model,
    export_path=None,
    sort_input=False,
    sort_key=1,
    export_config=None,
):
    # TODO(T88310041) Remove These
    torch._C._jit_set_profiling_executor(True)
    torch._C._jit_set_profiling_mode(False)

    # unpack export config
    if export_config is None:
        export_config = ExportConfig()
    quantize = export_config.torchscript_quantize
    accelerate = export_config.accelerate
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control

    # introduce a single nnpi:quantize that obviates the need for torchscript quantize on NNPI
    use_nnpi = ("nnpi" in accelerate) or ("nnpi:quantize" in accelerate)
    use_nnpi_throughput_optimized = "nnpi:throughput_optimized" in accelerate
    use_nnpi_gelu_clip = "nnpi:gelu_clip" in accelerate
    use_cuda_half = "cuda:half" in accelerate
    use_cuda_half_faster_transformers = "cuda:half:ft" in accelerate

    use_nnpi_quantize = "nnpi:quantize" in accelerate
    use_nnpi_fx_static_quantize = "nnpi:fx_static_quantize" in accelerate
    use_nnpi_fx_static_selectively_quantize = (
        "nnpi:fx_static_selectively_quantize" in accelerate
    )
    use_nnpi_fx_dynamic_quantize = "nnpi:fx_dynamic_quantize" in accelerate
    use_cpu_fx_static_quantize = "cpu:fx_static_quantize" in accelerate
    use_cpu_fx_static_selectively_quantize = (
        "cpu:fx_static_selectively_quantize" in accelerate
    )
    use_cpu_fx_dynamic_quantize = "cpu:fx_dynamic_quantize" in accelerate
    use_fx_quantize = (
        use_nnpi_fx_static_quantize
        or use_nnpi_fx_static_selectively_quantize
        or use_nnpi_fx_dynamic_quantize
        or use_cpu_fx_static_quantize
        or use_cpu_fx_static_selectively_quantize
        or use_cpu_fx_dynamic_quantize
    )

    # Make sure to put the model on CPU and disable CUDA before exporting to
    # ONNX to disable any data_parallel pieces
    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    if use_nnpi or use_fx_quantize:
        model = swap_modules(model, MODULE_TO_REWRITER["nnpi"])
    if "nnpi:split" in accelerate:
        model = split_model_for_accelerator(model)

    # Trace needs eval mode, to disable dropout etc.
    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    # call model forward to set correct device types
    if sort_input:
        _, sorted_indices = sort(inputs[sort_key], descending=True)
        inputs = [i.index_select(0, sorted_indices) for i in inputs]
    model(*inputs)

    # Default to dynamic
    if use_fx_quantize:
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        trace = quantize_fx(
            model,
            inputs,
            data_loader,
            use_nnpi_fx_dynamic_quantize or use_cpu_fx_dynamic_quantize,
            use_nnpi_fx_static_selectively_quantize
            or use_cpu_fx_static_selectively_quantize,
        )
    elif (quantize or use_nnpi_quantize) and hasattr(model, "graph_mode_quantize"):
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        print("Quantizing the model ...")
        # recognize legacy nnpi_q or $platform:$option syntax
        quantize_linear_only = use_nnpi_quantize or ("nnpi_quantize" in accelerate)
        module_swap = use_nnpi
        trace = quantize_statically(
            model, inputs, data_loader, quantize_linear_only, module_swap
        )
    else:
        if quantize:
            log_feature_usage("quantize.dynamically.CPU")
            model.quantize()
        if use_cuda_half_faster_transformers:
            log_accelerator_feature_usage("build.CUDA.half.faster_transformers")
            # We need a separate path for GPU-only tracing, as we can't just
            # trace a CPU model and invoke .cuda().half(), since we don't have
            # equivalent CPU implementations of these operators.
            precision.FP16_ENABLED = True
            cuda.CUDA_ENABLED = True

            model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
            model.eval()
            model.half().cuda()
            # obtain new inputs with cuda/fp16 enabled.
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            trace = model.trace(inputs)
            print("traced (faster_transformers)!")
            # should be unnecessary.
            trace.cuda().half()
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            results = trace(*inputs)
            assert results
            print(results)
        else:
            trace = model.trace(inputs)
            print("traced!")
            if use_cuda_half:
                log_accelerator_feature_usage("build.CUDA.half")
                # convert trace to half precision
                trace.cuda().half()

                #### trace test: demonstrate that it is usable
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
                #### end of trace test

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(self.data.tensorizers, trace)
    if hasattr(trace, "validate"):
        trace.validate(export_config)

    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring seq_padding_control"
            )
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring batch_padding_control"
            )

    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)

    if use_nnpi:
        print("lowering using to_glow")
        trace = lower_modules_to_accelerator(
            model,
            trace,
            export_config,
            use_nnpi_throughput_optimized,
            use_nnpi_gelu_clip,
        )
    if "split" in accelerate:
        print("lowering split model to glow")
        trace = lower_split_model_to_accelerator(model, trace, export_config)

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace
@classmethod
def from_vocab_filename(cls, vocab_filename: str) -> "ScriptBPE":
    with PathManager.open(vocab_filename) as vocab_file:
        return cls(cls.load_vocab(vocab_file))
def torchscript_export(
    self, model, export_path=None, sort_input=False, sort_key=1, **kwargs
):
    # unpack export kwargs
    quantize = kwargs.get("quantize", False)
    accelerate = kwargs.get("accelerate", [])
    seq_padding_control = kwargs.get("seq_padding_control")
    batch_padding_control = kwargs.get("batch_padding_control")
    inference_interface = kwargs.get("inference_interface")

    # Make sure to put the model on CPU and disable CUDA before exporting to
    # ONNX to disable any data_parallel pieces
    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    # Trace needs eval mode, to disable dropout etc.
    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    # call model forward to set correct device types
    if sort_input:
        _, sorted_indices = sort(inputs[sort_key], descending=True)
        inputs = [i.index_select(0, sorted_indices) for i in inputs]
    model(*inputs)

    if "half" in accelerate:
        model.half()

    if quantize and hasattr(model, "graph_mode_quantize"):
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        trace = model.graph_mode_quantize(inputs, data_loader)
    else:
        if quantize:
            model.quantize()
        trace = model.trace(inputs)

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(self.data.tensorizers, trace)

    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print("Padding_control not supported by model. Ignoring padding_control")
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print("Padding_control not supported by model. Ignoring padding_control")
    if inference_interface is not None:
        if hasattr(trace, "inference_interface"):
            trace.inference_interface(inference_interface)
        else:
            print(
                "inference_interface not supported by model. Ignoring inference_interface"
            )

    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)

    if "nnpi" in accelerate:
        trace._c = torch._C._freeze_module(
            trace._c,
            preservedAttrs=["make_prediction", "make_batch", "set_padding_control"],
        )

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace
def torchscript_export(self, model, export_path=None, export_config=None):  # noqa
    # unpack export config
    if export_config is None:
        export_config = ExportConfig()
    quantize = export_config.torchscript_quantize
    accelerate = export_config.accelerate
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control
    inference_interface = export_config.inference_interface

    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    model(*inputs)

    if quantize:
        model.quantize()
    if accelerate is not None and "half" in accelerate:
        model.half()

    if self.trace_both_encoders:
        trace = jit.trace(model, inputs)
    else:
        trace = jit.trace(model.encoder1, (inputs[0],))

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(
            self.data.tensorizers, trace, self.trace_both_encoders
        )

    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print("Padding_control not supported by model. Ignoring padding_control")
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print("Padding_control not supported by model. Ignoring padding_control")
    if inference_interface is not None:
        if hasattr(trace, "inference_interface"):
            trace.inference_interface(inference_interface)
        else:
            print(
                "inference_interface not supported by model. Ignoring inference_interface"
            )

    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)

    if accelerate is not None and "nnpi" in accelerate:
        trace._c = torch._C._freeze_module(
            trace._c,
            preservedAttrs=["make_prediction", "make_batch", "set_padding_control"],
        )

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace
def __init__(
    self,
    config: Config,
    right_dim: int,
    left_dim: int,
    to_dim: int,
    export_type=ExportType.NONE,
) -> None:
    super().__init__(config)

    self.mlp_for_right = MLPDecoderTwoTower.get_mlp(
        right_dim,
        0,
        config.right_hidden_dims,
        config.layer_norm,
        config.dropout,
        config.activation,
        export_embedding=True,
    )
    self.mlp_for_left = MLPDecoderTwoTower.get_mlp(
        left_dim,
        0,
        config.left_hidden_dims,
        config.layer_norm,
        config.dropout,
        config.activation,
        export_embedding=True,
    )

    if config.dense_tower_dims is None:
        from_dim = config.right_hidden_dims[-1] + config.left_hidden_dims[-1]
        self.concat_dense = False
    else:
        self.mlp_for_dense = MLPDecoderTwoTower.get_mlp(
            from_dim=config.dense_tower_dims[0],
            to_dim=config.dense_tower_dims[-1],
            hidden_dims=config.dense_tower_dims[1:-1],
            layer_norm=config.layer_norm,
            dropout=config.dropout,
            activation=config.activation,
        )
        from_dim = (
            config.right_hidden_dims[-1]
            + config.left_hidden_dims[-1]
            + config.dense_tower_dims[-1]
        )
        self.concat_dense = True

    self.mlp = MLPDecoderTwoTower.get_mlp(
        from_dim,
        to_dim,
        config.hidden_dims,
        config.layer_norm,
        config.dropout,
        config.activation,
    )

    # load model
    if config.load_model_path:
        with PathManager.open(config.load_model_path, "rb") as f:
            model = torch.load(
                f, map_location=lambda s, l: default_restore_location(s, "cpu")
            )
        mlp_state = {
            k.replace("decoder.", ""): v
            for k, v in model["model_state"].items()
            if k.startswith("decoder.mlp")
            or k.startswith("decoder.mlp_for_right")
            or k.startswith("decoder.mlp_for_left")
            or k.startswith("decoder.mlp_for_dense")
        }
        if mlp_state["mlp.0.weight"].shape[1] != from_dim:
            mlp_state = {
                k: v for k, v in mlp_state.items() if not k.startswith("mlp.")
            }
            print("top mlp weights not loaded")
        self.load_state_dict(mlp_state, strict=config.load_strict)
        print("loaded mlp state")

    self.out_dim = to_dim
    self.export_type = export_type
    log_class_usage(__class__)
def reader_raw(file_path, vocab):
    with PathManager.open(file_path, "r") as r:
        for line in r:
            yield vocab[line.strip()]
def torchscript_export(
    self,
    model,
    export_path=None,
    sort_input=False,
    sort_key=1,
    export_config=None,
):
    # TODO(T88310041) Remove These
    torch._C._jit_set_profiling_executor(True)
    torch._C._jit_set_profiling_mode(False)

    # unpack export config
    if export_config is None:
        export_config = ExportConfig()
    quantize = export_config.torchscript_quantize
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control

    accel = AccelerateOptions(export_config.accelerate)
    print(f"Using accelerate options: {accel.__dict__}")

    # what hosts can this model run on
    # by default, pytext works on CPU and CUDA (because it implements set_device)
    model_host = ["cpu", "cuda"]
    if accel.use_cuda:
        # CUDA FP16 models only work on CUDA
        model_host = ["cuda"]
    if accel.use_nnpi:
        model_host = ["nnpi"]
    if hasattr(model, "set_host"):
        model.set_host(model_host)

    # what is the type of this model
    # pytext models are nlp models
    model_type = ["nlp"]
    instance_paths_p = any(
        True for _ in find_module_instances(model, RoBERTaEncoder, [])
    )
    if instance_paths_p:
        model_type.append("transformer")
    instance_paths_p = any(True for _ in find_module_instances(model, BiLSTM, []))
    if instance_paths_p:
        model_type.append("BiLSTM")
    if hasattr(model, "set_model"):
        model.set_type(model_type)

    # Make sure to put the model on CPU and disable CUDA before exporting to
    # ONNX to disable any data_parallel pieces
    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    model = rewrite_nnpi_modules(model, accel)

    # Trace needs eval mode, to disable dropout etc.
    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    # call model forward to set correct device types
    if sort_input:
        _, sorted_indices = sort(inputs[sort_key], descending=True)
        inputs = [i.index_select(0, sorted_indices) for i in inputs]
    model(*inputs)

    # Default to dynamic
    if accel.use_fx_quantize:
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        trace = quantize_fx(
            model,
            inputs,
            data_loader,
            accel.use_nnpi_fx_dynamic_quantize or accel.use_cpu_fx_dynamic_quantize,
            accel.use_nnpi_fx_static_selectively_quantize
            or accel.use_cpu_fx_static_selectively_quantize,
        )
    elif (quantize or accel.use_nnpi_quantize) and hasattr(
        model, "graph_mode_quantize"
    ):
        data_loader = self.data.batches(Stage.TRAIN, load_early=False)
        print("Quantizing the model ...")
        # recognize legacy nnpi_q or $platform:$option syntax
        quantize_linear_only = accel.use_nnpi_quantize
        module_swap = accel.use_nnpi
        trace = quantize_statically(
            model, inputs, data_loader, quantize_linear_only, module_swap
        )
    else:
        if quantize:
            log_feature_usage("quantize.dynamically.CPU")
            model.quantize()
        if accel.use_cuda and (accel.use_cuda_half_ft or accel.use_cuda_dq):
            log_accelerator_feature_usage("build.CUDA.half.faster_transformers")
            # We need a separate path for GPU-only tracing, as we can't just
            # trace a CPU model and invoke .cuda().half(), since we don't have
            # equivalent CPU implementations of these operators.
            precision.FP16_ENABLED = True
            cuda.CUDA_ENABLED = True

            if accel.use_cuda_dq:
                model = swap_modules(model, MODULE_TO_REWRITER["cuda-dq"])
            else:
                model = swap_modules(model, MODULE_TO_REWRITER["cuda"])
            model.eval()
            model.half().cuda()
            # obtain new inputs with cuda/fp16 enabled.
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            trace = model.trace(inputs)
            print("Traced (faster_transformers)!")
            # should be unnecessary.
            trace.cuda().half()
            unused_raw_batch, batch = next(
                iter(self.data.batches(Stage.TRAIN, load_early=True))
            )
            inputs = model.onnx_trace_input(batch)
            results = trace(*inputs)
            assert results
        else:
            trace = model.trace(inputs)
            print("Traced!")
            if accel.use_cuda and accel.use_cuda_half:
                log_accelerator_feature_usage("build.CUDA.half")
                # convert trace to half precision
                trace.cuda().half()

                #### trace test: demonstrate that it is usable
                precision.FP16_ENABLED = True
                cuda.CUDA_ENABLED = True
                unused_raw_batch, batch = next(
                    iter(self.data.batches(Stage.TRAIN, load_early=True))
                )
                inputs = model.onnx_trace_input(batch)
                assert trace(*inputs)
                #### end of trace test

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(self.data.tensorizers, trace)
    if hasattr(trace, "validate"):
        trace.validate(export_config)

    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring seq_padding_control"
            )
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print(
                "Padding_control not supported by model. Ignoring batch_padding_control"
            )

    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)

    if accel.use_nnpi and not accel.use_nnpi_split:
        print("Lowering using to_glow")
        trace = lower_modules_to_accelerator(
            model,
            trace,
            export_config,
            accel.use_nnpi_throughput_optimized,
            accel.use_nnpi_throughput_maximized,
            accel.use_nnpi_gelu_clip,
        )
    if accel.use_nnpi_split:
        print("Lowering split model to Glow")
        trace = lower_split_model_to_accelerator(model, trace, export_config)

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace
def get_test_sample():
    with PathManager.open(RAW_TEST_PATH, "r") as f:
        data = json.load(f)
    return data
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    super().__init__(config, output_encoded_layers=output_encoded_layers)

    # map to the real model_path
    config.model_path = (
        resources.roberta.RESOURCE_MAP[config.model_path]
        if config.model_path in resources.roberta.RESOURCE_MAP
        else config.model_path
    )
    # assert config.pretrained_encoder.load_path, "Load path cannot be empty."

    # share the compress layer across all layers:
    # create it only when linear multihead attention (Linformer) is used
    if config.use_linformer_encoder:
        compress_layer = nn.Linear(
            config.max_seq_len - 2,
            (config.max_seq_len - 2) // config.linformer_compressed_ratio,
        )

    self.use_selfie_encoder = config.use_selfie_encoder

    if config.use_linformer_encoder:
        if config.linformer_quantize:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=QuantizedMultiheadLinearAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        compress_layer=compress_layer,
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ]
        else:
            layers = [
                TransformerLayer(
                    embedding_dim=config.embedding_dim,
                    attention=MultiheadLinearAttention(
                        embed_dim=config.embedding_dim,
                        num_heads=config.num_attention_heads,
                        compress_layer=compress_layer,
                    ),
                )
                for _ in range(config.num_encoder_layers)
            ]
    else:
        layers = [
            TransformerLayer(
                embedding_dim=config.embedding_dim,
                attention=MultiheadSelfAttention(
                    embed_dim=config.embedding_dim,
                    num_heads=config.num_attention_heads,
                    scaling=config.scaling,
                ),
                normalize_before=config.normalize_before,
            )
            for _ in range(config.num_encoder_layers)
        ]

    self.encoder = (
        SentenceEncoder(
            transformer=Transformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
                normalize_before=config.normalize_before,
            )
        )
        if not self.use_selfie_encoder
        else PostEncoder(
            transformer=SELFIETransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )
        )
    )
    self.apply(init_params)

    if config.model_path:
        with PathManager.open(config.model_path, "rb") as f:
            roberta_state = torch.load(
                f, map_location=lambda s, l: default_restore_location(s, "cpu")
            )
        # In case the model has previously been loaded in PyText and finetuned,
        # we don't need to do the special state-dict translation; load it directly.
        if not config.is_finetuned:
            self.encoder.load_roberta_state_dict(roberta_state["model"])
        else:
            self.load_state_dict(roberta_state)

    if config.use_bias_finetuning:
        for (n, p) in self.encoder.named_parameters():
            # "encoder.transformer.layers.0.attention.input_projection.weight" -> False
            # "encoder.transformer.layers.0.attention.input_projection.bias"   -> True
            if n.split(".")[-1] != "bias":
                p.requires_grad_(False)

    self._prune_transformer_layers_and_heads(config)

    self.export_encoder = config.export_encoder
    self.variable_size_embedding = config.variable_size_embedding
    self.use_linformer_encoder = config.use_linformer_encoder
    log_class_usage(__class__)
def torchscript_export(self, model, export_path=None, export_config=None):  # noqa
    # unpack export config
    if export_config is None:
        export_config = ExportConfig()
    quantize = export_config.torchscript_quantize
    accelerate = export_config.accelerate
    seq_padding_control = export_config.seq_padding_control
    batch_padding_control = export_config.batch_padding_control

    if (accelerate is not None) and (accelerate != []):
        raise RuntimeError(
            "old-style task.py does not support export for NNPI accelerators"
        )

    cuda.CUDA_ENABLED = False
    model.cpu()
    optimizer = self.trainer.optimizer
    optimizer.pre_export(model)

    model.eval()
    model.prepare_for_onnx_export_()

    unused_raw_batch, batch = next(
        iter(self.data.batches(Stage.TRAIN, load_early=True))
    )
    inputs = model.onnx_trace_input(batch)
    model(*inputs)

    if quantize:
        model.quantize()

    if self.trace_both_encoders:
        trace = jit.trace(model, inputs)
    else:
        trace = jit.trace(model.encoder1, (inputs[0],))

    if hasattr(model, "torchscriptify"):
        trace = model.torchscriptify(
            self.data.tensorizers, trace, self.trace_both_encoders
        )

    if seq_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("sequence_length", seq_padding_control)
        else:
            print("Padding_control not supported by model. Ignoring padding_control")
    if batch_padding_control is not None:
        if hasattr(trace, "set_padding_control"):
            trace.set_padding_control("batch_length", batch_padding_control)
        else:
            print("Padding_control not supported by model. Ignoring padding_control")

    trace.apply(lambda s: s._pack() if s._c._has_method("_pack") else None)

    if export_path is not None:
        print(f"Saving torchscript model to: {export_path}")
        with PathManager.open(export_path, "wb") as f:
            torch.jit.save(trace, f)
    return trace
def print_sample(file_name):
    with PathManager.open(file_name, "r") as given_file:
        for _i in range(SAMPLE_PRINT_COUNT):
            line = next(given_file).strip()
            print(line)
def __init__(self, config: Config, output_encoded_layers: bool, **kwarg) -> None:
    super().__init__(config, output_encoded_layers=output_encoded_layers)

    # map to the real model_path
    config.model_path = (
        resources.roberta.RESOURCE_MAP[config.model_path]
        if config.model_path in resources.roberta.RESOURCE_MAP
        else config.model_path
    )
    # assert config.pretrained_encoder.load_path, "Load path cannot be empty."

    # share the compress layer across all layers:
    # create it only when linear multihead attention (Linformer) is used
    if config.use_linformer_encoder:
        compress_layer = nn.Linear(
            config.max_seq_len - 2,
            (config.max_seq_len - 2) // config.linformer_compressed_ratio,
        )

    self.use_selfie_encoder = config.use_selfie_encoder

    layers = [
        TransformerLayer(
            embedding_dim=config.embedding_dim,
            attention=MultiheadLinearAttention(
                embed_dim=config.embedding_dim,
                num_heads=config.num_attention_heads,
                compress_layer=compress_layer,
            )
            if config.use_linformer_encoder
            else MultiheadSelfAttention(
                embed_dim=config.embedding_dim,
                num_heads=config.num_attention_heads,
            ),
        )
        for _ in range(config.num_encoder_layers)
    ]

    self.encoder = (
        SentenceEncoder(
            transformer=Transformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )
        )
        if not self.use_selfie_encoder
        else PostEncoder(
            transformer=SELFIETransformer(
                vocab_size=config.vocab_size,
                embedding_dim=config.embedding_dim,
                layers=layers,
                max_seq_len=config.max_seq_len,
            )
        )
    )
    self.apply(init_params)

    if config.model_path:
        with PathManager.open(config.model_path, "rb") as f:
            roberta_state = torch.load(
                f, map_location=lambda s, l: default_restore_location(s, "cpu")
            )
        # In case the model has previously been loaded in PyText and finetuned,
        # we don't need to do the special state-dict translation; load it directly.
        if not config.is_finetuned:
            self.encoder.load_roberta_state_dict(roberta_state["model"])
        else:
            self.load_state_dict(roberta_state)

    self.representation_dim = self._embedding().weight.size(-1)
    self.export_encoder = config.export_encoder
    self.variable_size_embedding = config.variable_size_embedding
    log_class_usage(__class__)
def __init__(
    self, config: Config, output_encoded_layers: bool, *args, **kwargs
) -> None:
    super().__init__(config, output_encoded_layers=output_encoded_layers)

    # Load config
    config_file = os.path.join(config.bert_cpt_dir, "config.json")
    local_config_path = PathManager.get_local_path(config_file)
    bert_config = BertConfig.from_json_file(local_config_path)
    print("Bert model config {}".format(bert_config))
    # Instantiate model.
    model = BertModel(bert_config)
    weights_path = os.path.join(config.bert_cpt_dir, "pytorch_model.bin")
    # load pre-trained weights if weights_path exists
    if config.load_weights and PathManager.isfile(weights_path):
        with PathManager.open(weights_path, "rb") as fd:
            state_dict = torch.load(fd)

        missing_keys: List[str] = []
        unexpected_keys: List[str] = []
        error_msgs: List[str] = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, "_metadata", None)

        for key in list(state_dict.keys()):
            new_key = None
            if key.endswith("LayerNorm.gamma"):  # compatibility with v0.5 models
                new_key = key.replace("LayerNorm.gamma", "LayerNorm.weight")
            if key.endswith("LayerNorm.beta"):  # compatibility with v0.5 models
                new_key = key.replace("LayerNorm.beta", "LayerNorm.bias")
            if new_key is not None:
                state_dict[new_key] = state_dict.pop(key)

        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=""):
            local_metadata = (
                {} if metadata is None else metadata.get(prefix[:-1], {})
            )
            module._load_from_state_dict(
                state_dict,
                prefix,
                local_metadata,
                True,
                missing_keys,
                unexpected_keys,
                error_msgs,
            )
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + ".")

        load(model, prefix="" if hasattr(model, "bert") else "bert.")
        if len(missing_keys) > 0:
            print(
                "Weights of {} not initialized from pretrained model: {}".format(
                    model.__class__.__name__, missing_keys
                )
            )
        if len(unexpected_keys) > 0:
            print(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys
                )
            )

    self.bert = model
    log_class_usage(__class__)