def lower_modules_to_accelerator(model: nn.Module, trace, export_options: ExportConfig):
    import torch_glow

    if hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder):
        backend = "NNPI"
        (
            submod_modelpath,
            compilation_spec_dict,
            inputs_function,
        ) = accelerator.get_modules(model, backend)[0]
        submod_tracepath = accelerator.model2trace_path(submod_modelpath)

        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend(backend)
        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)
        compilation_group_settings = compilation_group.get_settings()
        compilation_group_settings.set_convert_to_fp16(True)
        for k, v in compilation_spec_dict.items():
            compilation_group.get_settings().backend_specific_opts_insert(k, v)

        if inputs_function is not None:
            input_sets = inputs_function(
                model, trace, export_options, None, submod_modelpath
            )
            compilation_group.set_input_sets(input_sets)

        trace = torch_glow.to_glow_selective(
            trace,
            {submod_tracepath: spec},
            inplace=False,
        )
        return trace
    else:
        return trace
def devices_to_use_test_helper(self, input, num_replications):
    model = SimpleModule()
    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    # Init with total number of devices.
    torch_glow.setGlowBackendNumDevices(6)

    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    input_spec = torch_glow.InputSpec()
    input_spec.set_same_as(input)
    compilation_group.input_sets_append([input_spec])
    compilation_group_settings = compilation_group.get_settings()
    compilation_group_settings.set_num_devices_to_use(3)
    compilation_group_settings.set_replication_count(num_replications)

    traced_mod = torch.jit.trace(model, input)
    lowered_model = torch_glow.to_glow(traced_mod, {"forward": spec})

    g = lowered_model(input)
    t = model(input)
    self.assertEqual(type(g), type(t))
    self.assertEqual(len(g), len(t))
    for (gi, ti) in zip(g, t):
        self.assertTrue(torch.allclose(gi, ti))
def lower_modules_to_accelerator(model: nn.Module, trace, export_options: ExportConfig): import torch_glow if hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder): backend = "NNPI" submod_modelpath, compilation_spec_dict = accelerator.get_modules( model, backend)[0] submod_tracepath = accelerator.model2trace_path(submod_modelpath) spec = torch_glow.CompilationSpec() spec.get_settings().set_glow_backend(backend) compilation_group = torch_glow.CompilationGroup() spec.compilation_groups_append(compilation_group) compilation_group_settings = compilation_group.get_settings() compilation_group_settings.set_convert_to_fp16(True) for k, v in compilation_spec_dict.items(): compilation_group.get_settings().backend_specific_opts_insert(k, v) # Todod: @input decorator dose not work properly, fixing it later # input_sets = inputs.input_process(model, export_options, None, submod_tracepath) input_sets = accelerator_transformerLayers_inputs( model, trace, export_options, None, submod_tracepath) compilation_group.set_input_sets(input_sets) trace = torch_glow.to_glow_selective( trace, {submod_tracepath: spec}, inplace=False, ) return trace else: return trace
def test_save_preprocessed_module(self):
    with torch.no_grad():
        x = torch.randn([1, 4, 4, 4], dtype=torch.float32)
        model = Bar()
        model.eval()
        model = torch.jit.trace(model, x)

        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend("Interpreter")

        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)
        compilation_group.input_sets_append(
            torch_glow.input_specs_from_tensors([x]))

        torch_glow.disableFusionPass()
        torch_glow.enable_convert_to_fp16()
        glow_mod = torch_glow.to_glow(model, spec)

        reloaded = utils.save_and_reload_model(glow_mod)

        wrappername = "__loweredModule__"
        attrname = "__processed_module"
        wp = getattr(reloaded._c, wrappername)
        pp = getattr(wp, attrname)
        pt_model = torch.jit._recursive.wrap_cpp_module(pp)
        graph = pt_model.graph_for(x)

        found = False
        for node in graph.nodes():
            if node.kind() == "quantized::conv2d":
                found = True
        assert found
def test_to_glow_selective_already_scripted(self):
    a = torch.zeros(4) + 8
    b = torch.zeros(4) + 7
    torch_res = model(a, b)

    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    a_spec = torch_glow.InputSpec()
    a_spec.set_same_as(a)
    b_spec = torch_glow.InputSpec()
    b_spec.set_same_as(b)
    compilation_group.input_sets_append([a_spec, b_spec])

    with torch.no_grad():
        traced_model = torch.jit.trace(model, (a, b))

    glow_mod = torch_glow.to_glow_selective(
        traced_model,
        {"foo.bar": spec, "qux": spec},
        inplace=False,
    )
    glow_res = glow_mod(a, b)
    assert torch.allclose(torch_res, glow_res)
def test_to_glow_tuple_output(self):
    input = torch.randn(4)
    model = Foo()

    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    input_spec = torch_glow.InputSpec()
    input_spec.set_same_as(input)
    compilation_group.input_sets_append([input_spec])

    scripted_mod = torch.jit.script(model)
    lowered_model = torch_glow.to_glow(scripted_mod, {"forward": spec})

    # Run Glow model
    (gx, gy) = lowered_model(input)
    # Run reference model
    (tx, ty) = model(input)

    assert torch.allclose(tx, gx)
    assert torch.allclose(ty, gy)
def tuple_test_helper(self, ModType):
    input = torch.randn(4)
    model = ModType()

    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    input_spec = torch_glow.InputSpec()
    input_spec.set_same_as(input)
    compilation_group.input_sets_append([input_spec])

    scripted_mod = torch.jit.script(model)
    lowered_model = torch_glow.to_glow(scripted_mod, {"forward": spec})

    # Run Glow model
    g = lowered_model(input)
    # Run reference model
    t = model(input)

    self.assertEqual(type(g), type(t))
    self.assertEqual(len(g), len(t))
    for (gi, ti) in zip(g, t):
        self.assertTrue(torch.allclose(gi, ti))
def test_to_glow_selective(self):
    a = torch.zeros(4) + 8
    b = torch.zeros(4) + 7
    torch_res = model(a, b)

    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    a_spec = torch_glow.InputSpec()
    a_spec.set_same_as(a)
    b_spec = torch_glow.InputSpec()
    b_spec.set_same_as(b)
    compilation_group.input_sets_append([a_spec, b_spec])

    glow_mod = torch_glow.to_glow_selective(
        model,
        {"foo.bar": (spec, (a, b)), "qux": (spec, (a, b))},
    )
    glow_mod = torch.jit.trace(glow_mod, (a, b))
    glow_res = glow_mod(a, b)
    assert torch.allclose(torch_res, glow_res)
def lower_modules_to_accelerator(
    model: nn.Module, trace, export_options: ExportConfig, throughput_optimize=False
):
    # Raise error if accelerator could not be imported
    if not accelerator_lowering_supported:
        raise RuntimeError("Accelerator Lowering not supported!")

    import torch_glow

    log_accelerator_feature_usage("build.NNPI")
    if (
        (hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder))
        or (
            hasattr(model, "representation")
            and isinstance(model.representation, AcceleratorBiLSTM)
        )
        or (
            hasattr(model, "lower_module")
            # Internal CNN LM module to add accelerator support.
            and type(model.lower_module).__qualname__ == "CNNLowerModule"
        )
    ):
        backend = "NNPI"
        (
            submod_modelpath,
            compilation_spec_dict,
            inputs_function,
        ) = accelerator.get_modules(model, backend)[0]
        submod_tracepath = accelerator.model2trace_path(submod_modelpath)

        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend(backend)
        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)
        compilation_group_settings = compilation_group.get_settings()
        compilation_group_settings.set_convert_to_fp16(True)

        # Override the options for throughput-optimized case
        if throughput_optimize:
            compilation_spec_dict["NNPI_IceCores"] = "4"
            compilation_spec_dict["NNPINumParallelChunks"] = "4"
            compilation_group_settings.set_replication_count(3)

        for k, v in compilation_spec_dict.items():
            compilation_group.get_settings().backend_specific_opts_insert(k, v)

        if inputs_function is not None:
            input_sets = inputs_function(
                model, trace, export_options, None, submod_modelpath
            )
        else:
            raise RuntimeError(
                "inputs_function needs to be specified in accelerator decorator"
            )
        compilation_group.set_input_sets(input_sets)

        trace = torch_glow.to_glow_selective(
            trace,
            {submod_tracepath: spec},
            inplace=False,
        )
        return trace
    else:
        return trace
def test_to_glow_multiple_groups_and_input_sets(self):
    x1 = torch.randn(1, 4)
    y1 = torch.randn(2, 4)
    x2 = torch.randn(1, 2)
    y2 = torch.randn(5, 2)
    x3 = torch.randn(7)
    y3 = torch.randn(3, 7)

    mod = Foo()
    scripted_mod = torch.jit.script(mod)

    x1_y1_set = torch_glow.input_specs_from_tensors([x1, y1])
    x2_y2_set = torch_glow.input_specs_from_tensors([x2, y2])
    x3_y3_set = torch_glow.input_specs_from_tensors([x3, y3])

    # Create two CompilationGroups: the first contains two input sets
    # and the second contains the third input set.
    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    compilation_group_1 = torch_glow.CompilationGroup()
    compilation_group_2 = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group_1)
    spec.compilation_groups_append(compilation_group_2)
    compilation_group_1.input_sets_append(x1_y1_set)
    compilation_group_1.input_sets_append(x2_y2_set)
    compilation_group_2.input_sets_append(x3_y3_set)

    lowered_module = torch_glow.to_glow(scripted_mod, spec)

    torch_res1 = mod(x1, y1)
    torch_res2 = mod(x2, y2)
    torch_res3 = mod(x3, y3)

    glow_res1 = lowered_module(x1, y1)
    glow_res2 = lowered_module(x2, y2)
    glow_res3 = lowered_module(x3, y3)

    assert torch.allclose(torch_res1, glow_res1)
    assert torch.allclose(torch_res2, glow_res2)
    assert torch.allclose(torch_res3, glow_res3)
def get_compilation_spec(inputs):
    """helper function to get the compilation spec of the submodule"""
    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")

    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    compilation_group.input_sets_append(torch_glow.input_specs_from_tensors(inputs))
    return spec
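A minimal usage sketch for the helper above, purely illustrative: torch.nn.Identity stands in for a real submodule, and only torch_glow calls that already appear in these examples are used.

import torch
import torch_glow

# Hypothetical example: lower an identity module with get_compilation_spec.
x = torch.randn(4)
mod = torch.jit.script(torch.nn.Identity())
spec = get_compilation_spec([x])
lowered = torch_glow.to_glow(mod, {"forward": spec})
assert torch.allclose(lowered(x), x)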
def generate_glow_spec(module, backend, *inputs):
    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend(backend)

    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)

    input_specs = []
    for input in inputs:
        input_spec = torch_glow.InputSpec()
        input_spec.set_same_as(input)
        input_specs.append(input_spec)
    compilation_group.input_sets_append(input_specs)
    return spec
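For illustration, a sketch of how generate_glow_spec might be paired with torch.jit.trace before lowering; SimpleModule is an assumed stand-in (as in the device test helper above) taking a single tensor.

# Hypothetical pairing of generate_glow_spec with to_glow.
model = SimpleModule()
x = torch.randn(4)
spec = generate_glow_spec(model, "Interpreter", x)
traced = torch.jit.trace(model, x)
lowered = torch_glow.to_glow(traced, {"forward": spec})
out = lowered(x)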
def build_compiliation_spec(self):
    compilation_spec = torch_glow.CompilationSpec()

    compilation_spec_settings = compilation_spec.get_settings()
    compilation_spec_settings.set_glow_backend("CPU")
    compilation_spec_settings.set_enable_fuser(True)

    fuser_settings = compilation_spec.get_fuser_settings()
    fuser_settings.set_min_fusion_group_size(3)
    fuser_settings.set_max_fusion_merge_size(4)
    fuser_settings.set_fusion_start_index(5)
    fuser_settings.set_fusion_end_index(6)
    fuser_settings.op_blacklist_append("aten::mean")
    fuser_settings.op_blacklist_append("aten::dropout")

    compilation_group = torch_glow.CompilationGroup()

    input1_spec = torch_glow.input_spec_from_tensor(torch.randn(2, 3, 224, 224))
    input2_spec = torch_glow.input_spec_from_tensor(
        torch.randn(3, 2).to(torch.float16)
    )
    compilation_group.input_sets_append([input1_spec, input2_spec])
    compilation_group.input_sets_append(
        torch_glow.input_specs_from_tensors(
            [torch.randn(1, 3, 224, 224), torch.randn(4, 1)]
        )
    )

    compilation_group_settings = compilation_group.get_settings()
    compilation_group_settings.set_convert_to_fp16(True)
    compilation_group_settings.set_num_devices_to_use(50)
    compilation_group_settings.set_replication_count(52)
    compilation_group_settings.backend_specific_opts_insert("apple", "orange")
    compilation_spec.compilation_groups_append(compilation_group)

    default_compilation_group_settings = (
        compilation_spec.get_default_compilation_group_settings()
    )
    default_compilation_group_settings.set_convert_to_fp16(False)
    default_compilation_group_settings.set_num_devices_to_use(89)
    default_compilation_group_settings.set_replication_count(90)
    default_compilation_group_settings.backend_specific_opts_insert(
        "hello", "goodbye"
    )

    return compilation_spec
def run_to_glow(m, x):
    """Trace the model m with input x and call to_glow"""
    traced_m = torch.jit.trace(m, (x))

    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    input_spec = torch_glow.InputSpec()
    input_spec.set_same_as(x)
    compilation_group.input_sets_append([input_spec])

    lowered_module = torch_glow.to_glow(traced_m, spec)
    return lowered_module
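A sketch of exercising run_to_glow end-to-end, assuming any traceable module (torch.nn.ReLU here) and comparing against the eager result:

# Hypothetical smoke test for run_to_glow.
m = torch.nn.ReLU()
x = torch.randn(2, 3)
lowered = run_to_glow(m, x)
assert torch.allclose(lowered(x), m(x))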
def lower_modules_to_accelerator(model, trace, seq_padding_control, batch_padding_control): import torch_glow if hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder): backend = "NNPI" submod_modelpath, compilation_spec_dict = accelerator.get_modules( model, backend)[0] submod_tracepath = accelerator.model2trace_path(submod_modelpath) embedding_dim = model.encoder.encoder.transformer.token_embedding.embedding_dim spec = torch_glow.CompilationSpec() spec.get_settings().set_glow_backend(backend) compilation_group = torch_glow.CompilationGroup() spec.compilation_groups_append(compilation_group) compilation_group_settings = compilation_group.get_settings() compilation_group_settings.set_convert_to_fp16(True) for k, v in compilation_spec_dict.items(): compilation_group.get_settings().backend_specific_opts_insert(k, v) for seq_len in seq_padding_control: if seq_len <= 0: continue for batch_size in batch_padding_control: if batch_size <= 0: continue input1 = torch.randn([seq_len, batch_size, embedding_dim], dtype=torch.float32) input2 = torch.randn([batch_size, seq_len]).bool() input_specs = torch_glow.input_specs_from_tensors( [input1, input2]) compilation_group.input_sets_append(input_specs) trace = torch_glow.to_glow_selective( trace, {submod_tracepath: spec}, inplace=False, ) return trace else: return trace
def test_serialization(self):
    with torch.no_grad():
        x = torch.randn([1, 4, 4, 4], dtype=torch.float32)
        y = torch.randn([1, 4, 4, 4], dtype=torch.float32)
        model = Bar()
        model = torch.jit.trace(model, (x, y))

        spec = torch_glow.CompilationSpec()
        spec_settings = spec.get_settings()
        spec_settings.set_glow_backend("NNPI")
        # Enable serialization in this spec.
        spec_settings.set_enable_serialize(True)

        compilation_group = torch_glow.CompilationGroup()
        compilation_group_settings = compilation_group.get_settings()
        compilation_group_settings.set_replication_count(1)
        compilation_group_settings.backend_specific_opts_insert(
            "NNPI_IceCores", "1")
        compilation_group.input_sets_append(
            torch_glow.input_specs_from_tensors([x, y]))
        spec.compilation_groups_append(compilation_group)

        torch_glow.disableFusionPass()
        torch_glow.enable_convert_to_fp16()

        # Enable global serialization, then compile (serialize) the model
        # and save it.
        torch_glow.enable_dump_serialized_model()
        glow_mod = torch_glow.to_glow(model, spec)
        res1 = glow_mod(x, y)
        torch.jit.save(glow_mod, "/tmp/serialize_to_glow.pt")

        # Enable global deserialization and disable serialization, then
        # load (deserialize) the model into loaded_glow_mod.
        torch_glow.enable_deserialize()
        torch_glow.disable_dump_serialized_model()
        loaded_glow_mod = torch.jit.load("/tmp/serialize_to_glow.pt")
        res2 = loaded_glow_mod(x, y)

        assert torch.allclose(res1, res2, 1e-5, 1e-5)
def run_model(m, input, randomize):
    torch_glow.disableFusionPass()
    traced_m = torch.jit.trace(m, input)

    if randomize:
        torch_glow.enable_randomize_constants()
    else:
        torch_glow.disable_randomize_constants()

    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    input_spec = torch_glow.InputSpec()
    input_spec.set_same_as(input)
    compilation_group.input_sets_append([input_spec])

    glow_m = torch_glow.to_glow(traced_m, {"forward": spec})
    return glow_m(input)
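As an illustration, run_model could be invoked with the flag flipped to compare compilations with and without constant randomization; a Linear layer is assumed here because randomization is only meaningful when the graph owns constants (the exact effect of randomized constants is backend-defined).

# Hypothetical driver for run_model.
with torch.no_grad():
    m = torch.nn.Linear(4, 2)
    x = torch.randn(1, 4)
    out_plain = run_model(m, x, randomize=False)
    out_rand = run_model(m, x, randomize=True)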
def lower_modules_to_accelerator(model, trace, seq_padding_control, batch_padding_control): import torch_glow if hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder): embedding_dim = model.encoder.encoder.transformer.token_embedding.embedding_dim spec = torch_glow.CompilationSpec() spec.get_settings().set_glow_backend("NNPI") compilation_group = torch_glow.CompilationGroup() spec.compilation_groups_append(compilation_group) compilation_group_settings = compilation_group.get_settings() compilation_group_settings.set_convert_to_fp16(True) compilation_group.get_settings().backend_specific_opts_insert( "NNPI_IceCores", "12") compilation_group.get_settings().backend_specific_opts_insert( "NNPINumParallelChunks", "12") for seq_len in seq_padding_control: if seq_len <= 0: continue for batch_size in batch_padding_control: if batch_size <= 0: continue input1 = torch.randn([seq_len, batch_size, embedding_dim], dtype=torch.float32) input2 = torch.randn([batch_size, seq_len]).bool() input_specs = torch_glow.input_specs_from_tensors( [input1, input2]) compilation_group.input_sets_append(input_specs) trace = torch_glow.to_glow_selective( trace, {"model.encoder.encoder.transformer.layers": spec}, inplace=False, ) return trace else: return trace
def lower_and_write_to_onnx_helper(self, ModType, onnx_prefix):
    x = torch.randn(1, 3, 8, 8)
    model = create_model(x, ModType)

    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("Interpreter")
    compilation_group = torch_glow.CompilationGroup()
    spec.compilation_groups_append(compilation_group)
    input_spec = torch_glow.InputSpec()
    input_spec.set_same_as(x)
    compilation_group.input_sets_append([input_spec])

    scripted_mod = torch.jit.trace(model, x)
    torch_glow.enable_write_to_onnx()
    torch_glow.set_onnx_file_name_prefix(onnx_prefix)
    torch_glow.enable_write_without_randomize()
    lowered_model = torch_glow.to_glow(scripted_mod, {"forward": spec})

    # Run Glow model
    g = lowered_model(x)
    # Run reference model
    t = model(x)

    self.assertEqual(type(g), type(t))
    self.assertEqual(len(g), len(t))
    for (gi, ti) in zip(g, t):
        self.assertTrue(torch.allclose(gi, ti))

    assert os.path.exists(onnx_prefix + ".onnxtxt")
    onnx_files = glob.glob(onnx_prefix + "*.onnx*")
    for f in onnx_files:
        os.remove(f)
def infer_nnpi(model, device, data_type, input_size, output_size, batch_size, args):
    import torch_glow

    # Detailed structure for spec can be found at https://fburl.com/diffusion/79q4efud
    # Create compilation spec
    spec = torch_glow.CompilationSpec()
    spec.get_settings().set_glow_backend("NNPI")

    # Create compilation group and update settings.
    # A compilation group contains compilation-specific information such as
    # fp16 settings, enableRemoveMutation, or anything else that changes
    # the compiled Glow graph.
    compilation_group = torch_glow.CompilationGroup()
    compilation_group_settings = compilation_group.get_settings()
    compilation_group_settings.set_convert_to_fp16(True)
    compilation_group_settings.set_replication_count(1)
    compilation_group_settings.backend_specific_opts_insert(
        "NNPI_IceCores", "1")

    data = torch.randn(batch_size, input_size)

    # Create input spec and add it into the compilation group.
    # This is used for shape inference when lowering the model to Glow.
    data_spec = torch_glow.InputSpec()
    data_spec.set_same_as(data)
    compilation_group.input_sets_append([data_spec])
    spec.compilation_groups_append(compilation_group)

    traced_model = torch.jit.trace(model, (data))
    lowered_model = torch_glow.to_glow(traced_model, spec)

    start_time = time.time()
    for i in range(args.steps + args.warmups):
        lowered_model(data)
        if i < args.warmups:
            start_time = time.time()

    return time.time() - start_time
def lower_modules_to_accelerator(
    model: nn.Module, trace, export_options: ExportConfig, throughput_optimize=False
):
    # Raise error if accelerator could not be imported
    if not accelerator_lowering_supported:
        raise RuntimeError("Accelerator Lowering not supported!")

    import torch_glow

    log_accelerator_feature_usage("build.NNPI")
    if (
        (hasattr(model, "encoder") and isinstance(model.encoder, RoBERTaEncoder))
        or (
            hasattr(model, "representation")
            and isinstance(model.representation, AcceleratorBiLSTM)
        )
        or (
            hasattr(model, "lower_module")
            # Internal CNN LM module to add accelerator support.
            and type(model.lower_module).__qualname__ == "CNNLowerModule"
        )
    ):
        backend = "NNPI"
        backend_qualifier = ""

        if throughput_optimize:
            backend_qualifier = ":throughput_optimized"

        modules_to_lower = accelerator.get_modules(model, backend + backend_qualifier)
        if len(modules_to_lower) < 1:
            raise RuntimeError("Need at least one module to lower to accelerator")
        elif len(modules_to_lower) > 1:
            print(f"Warning. Received {len(modules_to_lower)} modules to lower.")
            print("Warning. Only lowering first module.")

        (
            submod_modelpath,
            compilation_spec_dict,
            inputs_function,
        ) = modules_to_lower[0]
        submod_tracepath = accelerator.model2trace_path(submod_modelpath)

        spec = torch_glow.CompilationSpec()
        spec.get_settings().set_glow_backend(backend)
        compilation_group = torch_glow.CompilationGroup()
        spec.compilation_groups_append(compilation_group)
        compilation_group_settings = compilation_group.get_settings()

        # Set values from dict that are not set via backend-specific opts
        compilation_group_settings.set_convert_to_fp16(
            compilation_spec_dict.pop("glow:ConvertToFP16", "true") in ["true", "True"]
        )
        compilation_group_settings.set_replication_count(
            int(compilation_spec_dict.pop("glow:ReplicationCount", "1"))
        )

        for k, v in compilation_spec_dict.items():
            compilation_group.get_settings().backend_specific_opts_insert(k, v)

        if inputs_function is not None:
            input_sets = inputs_function(
                model, trace, export_options, None, submod_modelpath
            )
        else:
            raise RuntimeError(
                "inputs_function needs to be specified in accelerator decorator"
            )
        compilation_group.set_input_sets(input_sets)

        trace = torch_glow.to_glow_selective(
            trace,
            {submod_tracepath: spec},
            inplace=False,
        )
        return trace
    else:
        return trace