def main():
    model = efficientnetv2_s()

    # option 1
    for name, para in model.named_parameters():
        # freeze all weights except the head
        if "head" not in name:
            para.requires_grad_(False)
        else:
            print("training {}".format(name))

    complexity = model.complexity(224, 224, 3)
    table = PrettyTable()
    table.field_names = ["params", "freeze-params", "train-params", "FLOPs", "acts"]
    table.add_row([complexity["params"],
                   complexity["freeze"],
                   complexity["params"] - complexity["freeze"],
                   complexity["flops"],
                   complexity["acts"]])
    print(table)

    # option 2
    tensor = (torch.rand(1, 3, 224, 224),)
    flops = FlopCountAnalysis(model, tensor)
    print(flops.total())
    print(parameter_count_table(model))
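If only the fvcore path (option 2) is needed, the same analysis object can also be rendered as a per-module table with fvcore's flop_count_table instead of a hand-built PrettyTable. A minimal sketch, assuming `model` is the efficientnetv2_s() instance from the snippet above:

from fvcore.nn import FlopCountAnalysis, flop_count_table

# hypothetical follow-up to option 2 above; input shape matches the original
flops = FlopCountAnalysis(model, (torch.rand(1, 3, 224, 224),))
print(flop_count_table(flops, max_depth=2))  # per-module parameter and FLOP breakdown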
def test_flop_count_empty(self) -> None:
    model = nn.ReLU()
    inputs = (torch.randn((1, 10)),)
    table = flop_count_table(FlopCountAnalysis(model, inputs))
    self.assertGreater(len(table), 0)

    out = flop_count_str(FlopCountAnalysis(model, inputs))
    self.assertGreater(len(out), 0)
def profile_fvcore(model, input_size=(3, 224, 224), batch_size=1, detailed=False, force_cpu=False):
    if force_cpu:
        model = model.to('cpu')
    device, dtype = next(model.parameters()).device, next(model.parameters()).dtype
    example_input = torch.ones((batch_size,) + input_size, device=device, dtype=dtype)
    fca = FlopCountAnalysis(model, example_input)
    aca = ActivationCountAnalysis(model, example_input)
    if detailed:
        fcs = flop_count_str(fca)
        print(fcs)
    return fca.total(), aca.total()
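A hypothetical call of the helper above, assuming a torchvision ResNet-50 is available; the returned values are raw totals from fvcore (multiply-adds for FLOPs), not a formatted report:

import torch
import torchvision

# hypothetical usage of profile_fvcore(); torchvision.models.resnet50 is an assumption
model = torchvision.models.resnet50().eval()
total_flops, total_acts = profile_fvcore(model, input_size=(3, 224, 224), batch_size=1)
print(f"FLOPs: {total_flops / 1e9:.2f} G, activations: {total_acts / 1e6:.2f} M")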
def test_flop_count_table(self) -> None:
    model = TestNet()
    inputs = (torch.randn((1, 10)),)

    table = flop_count_table(FlopCountAnalysis(model, inputs))

    self.assertFalse(" a1 " in table)  # Wrapper skipping successful
    self.assertFalse("a1.b1.c1.d1.bias" in table)  # Didn't go to depth 4
    self.assertTrue("a1.b1.c1.d1" in table)  # Did go to depth 3
    self.assertTrue(" a1.b1 " in table)  # Didn't skip different stats
    self.assertTrue("a2.b1.c1.weight" in table)  # Weights included
    self.assertTrue("(10, 10)" in table)  # Shapes included
    self.assertTrue(" a2.b1 " in table)  # Didn't skip through mod with >1 child
    self.assertFalse("#activations" in table)  # No activations
    self.assertTrue(" 0.33K" in table)  # Pretty stats, correct indentation
    self.assertFalse("  0.33K" in table)  # Correct indentation
    self.assertTrue("#parameters or shape" in table)  # Correct header

    # Expected:
    # | module             | #parameters or shape    | #flops   |
    # |:-------------------|:------------------------|:---------|
    # | model              | 0.33K                   | 0.3K     |
    # |  a1.b1             |  0.11K                  |  100     |
    # |   a1.b1.c1         |   0.11K                 |   N/A    |
    # |    a1.b1.c1.d1     |    0.11K                |    100   |
    # |  a2.b1             |  0.22K                  |  0.2K    |
    # |   a2.b1.c1         |   0.11K                 |   100    |
    # |    a2.b1.c1.weight |    (10, 10)             |          |
    # |    a2.b1.c1.bias   |    (10,)                |          |
    # |   a2.b1.c2         |   0.11K                 |   100    |
    # |    a2.b1.c2.weight |    (10, 10)             |          |
    # |    a2.b1.c2.bias   |    (10,)                |          |

    # Test activations and no parameter shapes
    table = flop_count_table(
        flops=FlopCountAnalysis(model, inputs),
        activations=ActivationCountAnalysis(model, inputs),
        show_param_shapes=False,
    )

    self.assertTrue("#activations" in table)  # Activation header
    self.assertTrue(" 20" in table)  # Activation value with correct indent
    self.assertFalse("#parameters or shape" in table)  # Correct header
    self.assertTrue("#parameters" in table)  # Correct header
    self.assertFalse("a2.b1.c1.weight" in table)  # Weights not included
    self.assertFalse("(10, 10)" in table)  # Shapes not included
    self.assertFalse("a2.b1.c1.d2" in table)  # Skipped empty
def test_detr_fbnet_export(self):
    runner = create_runner("d2go.projects.detr.runner.DETRRunner")
    cfg = runner.get_default_cfg()
    cfg.MODEL.DEVICE = "cpu"
    # DETR
    self._set_detr_cfg(cfg, 3, 3, 50, 256)
    # backbone
    cfg.MODEL.BACKBONE.NAME = "FBNetV2C4Backbone"
    cfg.MODEL.FBNET_V2.ARCH = "FBNetV3_A_dsmask_C5"
    cfg.MODEL.FBNET_V2.WIDTH_DIVISOR = 8
    cfg.MODEL.FBNET_V2.OUT_FEATURES = ["trunk4"]
    # build model
    model = runner.build_model(cfg).eval()
    model = model.detr
    print(model)
    scripted_model = torch.jit.script(model)
    self._assert_model_output(model, scripted_model)
    # print flops
    table = flop_count_table(FlopCountAnalysis(model, ([torch.rand(3, 224, 320)],)))
    print(table)
def main():
    # Self-Attention
    a1 = Attention(dim=512, num_heads=1)
    a1.proj = torch.nn.Identity()  # remove Wo

    # Multi-Head Attention
    a2 = Attention(dim=512, num_heads=8)

    # [batch_size, num_tokens, total_embed_dim]
    t = (torch.rand(32, 1024, 512),)

    flops1 = FlopCountAnalysis(a1, t)
    print("Self-Attention FLOPs:", flops1.total())

    flops2 = FlopCountAnalysis(a2, t)
    print("Multi-Head Attention FLOPs:", flops2.total())
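As a sanity check on the counted totals, the analytical multiply-accumulate cost of these shapes can be written out directly. A minimal sketch, assuming a ViT-style Attention block (fused QKV projection, scaled dot-product, output projection Wo) and ignoring softmax and bias adds, which fvcore does not count for matmuls:

# Analytical MAC estimate for the shapes above (an assumption about the
# Attention module's internals, not fvcore's own output).
B, N, D = 32, 1024, 512          # batch, tokens, embed dim
qkv = 3 * N * D * D              # Q/K/V projections
scores = N * N * D               # Q @ K^T, summed over all heads
weighted = N * N * D             # attn @ V
proj = N * D * D                 # output projection Wo

self_attn = B * (qkv + scores + weighted)          # a1: Wo replaced by Identity
multi_head = B * (qkv + scores + weighted + proj)  # a2: includes Wo
print(self_attn, multi_head)

Splitting D across 8 heads does not change the matmul cost, so the two counts should differ essentially by the Wo projection removed from a1.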
def test_flop_count_str(self) -> None:
    """
    Tests calculating model flops and outputting them in model print format.
    """
    model = TestNet()
    inputs = (torch.randn((1, 10)),)

    model_str = flop_count_str(FlopCountAnalysis(model, inputs))

    self.assertTrue("N/A indicates a possibly missing statistic" in model_str)
    self.assertTrue("n_params: 0.11K, n_flops: 100" in model_str)
    self.assertTrue("ReLU()" in model_str)  # Suppress trivial statistics
    self.assertTrue("n_params: 0.11K, n_flops: N/A" in model_str)  # Uncalled stats
    self.assertTrue("[[1, 10]]" in model_str)  # Input sizes

    # Expected:
    # "Input sizes (torch.Tensor only): [[1, 10]]\n"
    # "N/A indicates a possibly missing statistic due to how the "
    # "module was called. Missing values are still included in the "
    # "parent's total.\n"
    # "TestNet(\n"
    # "  n_params: 0.33K, n_flops: 0.3K\n"
    # "  (a1): A1(\n"
    # "    n_params: 0.11K, n_flops: 100\n"
    # "    (b1): A1B1(\n"
    # "      n_params: 0.11K, n_flops: 100\n"
    # "      (c1): A1B1C1(\n"
    # "        n_params: 0.11K, n_flops: N/A\n"
    # "        (d1): Linear(\n"
    # "          in_features=10, out_features=10, bias=True\n"
    # "          n_params: 0.11K, n_flops: 100\n"
    # "        )\n"
    # "        (d2): ReLU()\n"
    # "      )\n"
    # "    )\n"
    # "  )\n"
    # "  (a2): A2(\n"
    # "    n_params: 0.22K, n_flops: 0.2K\n"
    # "    (b1): A2B1(\n"
    # "      n_params: 0.22K, n_flops: 0.2K\n"
    # "      (c1): Linear(\n"
    # "        in_features=10, out_features=10, bias=True\n"
    # "        n_params: 0.11K, n_flops: 100\n"
    # "      )\n"
    # "      (c2): Linear(\n"
    # "        in_features=10, out_features=10, bias=True\n"
    # "        n_params: 0.11K, n_flops: 100\n"
    # "      )\n"
    # "    )\n"
    # "  )\n"
    # ")"

    # Test with activations
    model_str = flop_count_str(
        FlopCountAnalysis(model, inputs),
        activations=ActivationCountAnalysis(model, inputs),
    )
    self.assertTrue("n_params: 0.33K, n_flops: 0.3K, n_acts: 30" in model_str)
    self.assertTrue("n_params: 0.11K, n_flops: N/A, n_acts: N/A" in model_str)
    # ... tail of an MPViT factory function (the constructor's signature is not shown in the source)
    model = MPViT(
        img_size=224,
        num_stages=4,
        num_path=[2, 3, 3, 3],
        num_layers=[1, 3, 8, 3],
        embed_dims=[128, 224, 368, 480],
        mlp_ratios=[4, 4, 4, 4],
        num_heads=[8, 8, 8, 8],
        **kwargs,
    )
    model.default_cfg = _cfg_mpvit()
    return model


if __name__ == "__main__":
    model = mpvit_xsmall()
    model.eval()

    inputs = torch.randn(1, 3, 224, 224)
    model(inputs)

    from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis

    flops = FlopCountAnalysis(model, inputs)
    param = sum(p.numel() for p in model.parameters() if p.requires_grad)
    acts = ActivationCountAnalysis(model, inputs)

    print(f"total flops : {flops.total()}")
    print(f"total activations: {acts.total()}")
    print(f"number of parameter: {param}")
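Beyond the grand totals printed above, the same analysis objects can be broken down per module or per operator. A minimal sketch, assuming the `flops` and `acts` objects from the snippet above:

# hypothetical follow-up to the MPViT snippet above
print(flops.by_operator())             # e.g. Counter({'conv': ..., 'linear': ...})
print(flops.by_module_and_operator())  # per-module, per-operator breakdown
print(acts.by_module())                # activation counts per module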
def transformer(
    save,
    load,
    half,
    bt2fairseq,
):
    xlarge = False
    large = False
    DEFAULT_PADDING_IDX = 1
    avg_sequence_length = 128
    max_sequence_length = 256
    batch_size = 64

    class FairseqEncoder(torch.nn.Module):
        def __init__(
            self,
            embed_dim,
            attention_heads,
            ffn_embed_dim,
            num_layers,
            embedding_layer,  # torch.nn.Embedding. Must have a padding_idx field
            dropout=0,
            normalize_before=False,
            torch_encoder=None,  # torch encoder that you can map weights from
            activation="relu",
        ):
            super().__init__()

            cfg = FairseqTransformerConfig()
            cfg.encoder.embed_dim = embed_dim
            cfg.encoder.attention_heads = attention_heads
            cfg.encoder.ffn_embed_dim = ffn_embed_dim
            cfg.dropout = dropout
            cfg.encoder.normalize_before = normalize_before
            cfg.encoder.layers = num_layers
            # make embedding behavior same as other encoders
            cfg.no_token_positional_embeddings = True
            cfg.no_scale_embedding = True
            cfg.activation_fn = activation

            dictionary = {}  # TODO: verify what this is

            self.encoder = FairseqTransformerEncoder(
                cfg, dictionary, embedding_layer, return_fc=False
            )

            if torch_encoder is not None:
                for src_layer, dst_layer in zip(
                    torch_encoder.layers, self.encoder.layers
                ):
                    w_q, w_k, w_v = src_layer.self_attn.in_proj_weight.chunk(3, dim=0)
                    b_q, b_k, b_v = src_layer.self_attn.in_proj_bias.chunk(3, dim=0)

                    dst_layer.self_attn.q_proj.weight = torch.nn.Parameter(w_q)
                    dst_layer.self_attn.q_proj.bias = torch.nn.Parameter(b_q)
                    dst_layer.self_attn.k_proj.weight = torch.nn.Parameter(w_k)
                    dst_layer.self_attn.k_proj.bias = torch.nn.Parameter(b_k)
                    dst_layer.self_attn.v_proj.weight = torch.nn.Parameter(w_v)
                    dst_layer.self_attn.v_proj.bias = torch.nn.Parameter(b_v)

                    dst_layer.self_attn.out_proj.weight = (
                        src_layer.self_attn.out_proj.weight
                    )
                    dst_layer.self_attn.out_proj.bias = (
                        src_layer.self_attn.out_proj.bias
                    )

                    dst_layer.fc1.weight = src_layer.linear1.weight
                    dst_layer.fc1.bias = src_layer.linear1.bias

                    # fairseq may use fusedlayernorm from nvidia apex - diff properties
                    dst_layer.self_attn_layer_norm.load_state_dict(
                        src_layer.norm1.state_dict()
                    )

                    dst_layer.fc2.weight = src_layer.linear2.weight
                    dst_layer.fc2.bias = src_layer.linear2.bias

                    dst_layer.final_layer_norm.load_state_dict(
                        src_layer.norm2.state_dict()
                    )

            # self.encoder = self.encoder.eval().cuda().half()

        def forward(self, tokens, src_lengths=None):
            return self.encoder(
                tokens,
                src_lengths=src_lengths,
                return_all_hiddens=False,
                token_embeddings=None,
            )

    def get_layers_embedding_dim_num_heads_for_configuration(xlarge, large):
        if xlarge:
            # XLM-R extra large (no BERT-XL exists)
            L = 24      # Layers
            D = 2560    # Embedding Dim
            H = 32      # Number of Heads
            FD = 10240  # Feed-forward network dim
            V = 30000   # Vocab Size
        elif large:
            # BERT-large
            L = 24
            D = 1024
            H = 16
            FD = 4096
            V = 30000
        else:
            # BERT-base
            L = 12
            D = 768
            H = 12
            FD = 3072
            V = 30000

        return (L, D, H, FD, V)

    # Better transformer
    class PTTransformer(torch.nn.Module):
        def __init__(self, transformer, embedding):
            super().__init__()
            self.transformer = transformer
            self.embedding = embedding
            self.padding_idx = DEFAULT_PADDING_IDX

        def forward(self, x):
            padding_mask = None
            if not x.is_nested:
                padding_mask = x.eq(self.padding_idx)
            x = self.embedding(x)
            return self.transformer(x, src_key_padding_mask=padding_mask)

    def make_transformer():
        return (
            PTTransformer(
                torch.nn.TransformerEncoder(
                    torch.nn.TransformerEncoderLayer(
                        d_model=D,
                        nhead=H,
                        dim_feedforward=FD,
                        batch_first=True,
                        activation="relu",
                    ),
                    num_layers=L,
                    enable_nested_tensor=False,
                ),
                embedding_layer,
            )
            .eval()
            .cuda()
        )

    def copy_weights(layers_fairseq, layers_bt):
        for src_layer, dst_layer in zip(layers_fairseq, layers_bt):
            w_q = src_layer.self_attn.q_proj.weight
            b_q = src_layer.self_attn.q_proj.bias
            w_k = src_layer.self_attn.k_proj.weight
            b_k = src_layer.self_attn.k_proj.bias
            w_v = src_layer.self_attn.v_proj.weight
            b_v = src_layer.self_attn.v_proj.bias
            dst_layer.self_attn.in_proj_weight = torch.nn.Parameter(
                torch.cat((w_q, w_k, w_v), dim=0)
            )
            dst_layer.self_attn.in_proj_bias = torch.nn.Parameter(
                torch.cat((b_q, b_k, b_v), dim=0)
            )

            dst_layer.self_attn.out_proj.weight = src_layer.self_attn.out_proj.weight
            dst_layer.self_attn.out_proj.bias = src_layer.self_attn.out_proj.bias

            dst_layer.linear1.weight = src_layer.fc1.weight
            dst_layer.linear1.bias = src_layer.fc1.bias
            dst_layer.linear2.weight = src_layer.fc2.weight
            dst_layer.linear2.bias = src_layer.fc2.bias

            dst_layer.norm1.weight = src_layer.self_attn_layer_norm.weight
            dst_layer.norm1.bias = src_layer.self_attn_layer_norm.bias
            dst_layer.norm2.weight = src_layer.final_layer_norm.weight
            dst_layer.norm2.bias = src_layer.final_layer_norm.bias

    (L, D, H, FD, V) = get_layers_embedding_dim_num_heads_for_configuration(
        xlarge, large
    )
    embedding_layer = torch.nn.Embedding(V, D, DEFAULT_PADDING_IDX)

    # True means BT as source and fairseq is target, False means the other way
    # mode1 = False
    if bt2fairseq:
        # BT as source and fairseq as target: copy BT's weights to fairseq
        transformer = make_transformer()
        fairseq_transformer = (
            FairseqEncoder(
                D,
                H,
                FD,
                L,
                embedding_layer,
                dropout=0,
                normalize_before=False,
                torch_encoder=transformer.transformer,
                activation="relu",
            )
            .eval()
            .cuda()
        )
        if half:
            transformer.half()
            fairseq_transformer.half()
    if not bt2fairseq:
        # the other way around: fairseq is the source and BT is the target,
        # so copy fairseq's weights to BT
        transformer = make_transformer()
        fairseq_transformer = (
            FairseqEncoder(
                D,
                H,
                FD,
                L,
                embedding_layer,
                dropout=0,
                normalize_before=False,
                torch_encoder=None,
                activation="relu",
            )
            .eval()
            .cuda()
        )
        # For the test where we need to load an existing ckpt: it is verified that,
        # after loading the ckpt, the fairseq_transformer (BT kernel) results match BT's.
        if half:
            transformer.half()
            fairseq_transformer.half()
        if save:
            torch.save(fairseq_transformer.state_dict(), "./fairseq.pt")
            sys.exit(0)
        if load:
            fairseq_transformer.load_state_dict(torch.load("./fairseq.pt"))
        # copy
        copy_weights(fairseq_transformer.encoder.layers, transformer.transformer.layers)

    device = "cuda"
    lengths = (avg_sequence_length,) * batch_size
    tokens = torch.full(
        (batch_size, max_sequence_length),
        DEFAULT_PADDING_IDX,
        device=device,
        dtype=torch.long,
    )
    for i in range(batch_size):
        tokens[i, : lengths[i]] = torch.randint(
            DEFAULT_PADDING_IDX + 1,
            V - 1,
            size=(lengths[i],),
            device=device,
            dtype=torch.long,
        )
    # mask
    if half:
        lengths_tensor = torch.Tensor(lengths).cuda().half()
    else:
        lengths_tensor = torch.Tensor(lengths).cuda()

    with torch.inference_mode():
        fs_output = fairseq_transformer(tokens, lengths_tensor)["encoder_out"][0]
        fs_output = fs_output.transpose(0, 1)
    with torch.inference_mode():
        t_output = transformer(tokens)

    test_lst = [
        # (name, output, relative tolerance, absolute tolerance)
        ("FS", fs_output, 1e-4, 9e-3),
    ]
    numerical_test(lengths, t_output, test_lst)

    iters = 100
    t = benchmark_torch_function(iters, transformer, tokens)

    def bert_flops(B, T, D, L):
        mlp = 2 * (B * T * D * 4 * D) + 2 * (B * T * D * 4 * D)
        qkv = 3 * 2 * B * T * D * D
        attn = 2 * B * D * T * T + 2 * B * D * T * T + 2 * B * T * D * D
        return L * (mlp + qkv + attn)

    flops = bert_flops(batch_size, avg_sequence_length, D, L)
    flops_e = (
        FlopCountAnalysis(transformer, (tokens[:, :avg_sequence_length])).total() * 2
    )

    with torch.inference_mode():
        bt = benchmark_torch_function(iters, transformer, tokens)
        fst = benchmark_torch_function(
            iters, fairseq_transformer, tokens, lengths_tensor
        )

    def metrics(tt, baseline=None):
        if baseline:
            return metrics(tt) + f", Speedup: {baseline / tt:.2f}x"
        return f"{tt * 1.0e3:.2f} ms/iter, {flops_e / tt / 1.0e12:.2f} TFLOP/s"

    results = [
        f"Seed: {seed}",
        f"Padded tokens: {(1 - sum(lengths) / (tokens.numel())) * 100:.2f}%",
        f"Batch shape: {tokens.shape}",
        f"Analytical flops per batch: {flops / batch_size / 1e9:.2f} GFLOPS",
        f"Empirical flops per batch: {flops_e / batch_size / 1e9:.2f} GFLOPS",
        f"B: {batch_size}",
        f"T: {avg_sequence_length}",
        f"TMax: {max_sequence_length}",
        f"Eager Time: {metrics(t)}",
        f"BetterTransformer: {metrics(bt, t)}",
        f"FST: {metrics(fst, t)}",
    ]
    print("===========Speedup Results")
    print("; ".join(results))
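The helpers benchmark_torch_function and numerical_test, as well as seed, come from elsewhere in the original script and are not shown here. As a rough guide to how the timings feed the metrics above, here is a minimal sketch of a CUDA-event timer with a signature matching the call sites; this is an assumption, not the script's actual helper:

import torch

def benchmark_torch_function(iters, f, *args, **kwargs):
    # Hypothetical timing helper; returns seconds per iteration, which is what
    # the metrics() formatting above expects.
    f(*args, **kwargs)  # warm-up
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        f(*args, **kwargs)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) * 1.0e-3 / iters  # elapsed_time() is in ms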
def flop_count_analysis(
    self, inputs: Union[torch.Tensor, Tuple[torch.Tensor, ...]]
) -> FlopCountAnalysis:
    return FlopCountAnalysis(self, inputs)
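A hypothetical usage of a convenience method like the one above, assuming it is defined on an nn.Module subclass; the MyModel name below is a placeholder:

import torch
import torch.nn as nn
from fvcore.nn import FlopCountAnalysis, flop_count_table

class MyModel(nn.Module):  # placeholder module exposing flop_count_analysis()
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(16, 4)

    def forward(self, x):
        return self.fc(x)

    def flop_count_analysis(self, inputs) -> FlopCountAnalysis:
        return FlopCountAnalysis(self, inputs)

model = MyModel().eval()
flops = model.flop_count_analysis(torch.randn(1, 16))
print(flops.total())            # raw count (fvcore tallies one multiply-add as one FLOP)
print(flop_count_table(flops))  # per-module breakdown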