import torch
from prettytable import PrettyTable
from fvcore.nn import FlopCountAnalysis, parameter_count_table

# efficientnetv2_s is assumed to come from the surrounding repo's model definition


def main():
    model = efficientnetv2_s()

    # option1
    for name, para in model.named_parameters():
        # freeze all weights except the head
        if "head" not in name:
            para.requires_grad_(False)
        else:
            print("training {}".format(name))

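    # option1 relies on a model-specific complexity() helper; the dict keys used
    # below ("params", "freeze", "flops", "acts") are assumed from this snippet.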
    complexity = model.complexity(224, 224, 3)
    table = PrettyTable()
    table.field_names = ["params", "freeze-params", "train-params", "FLOPs", "acts"]
    table.add_row([complexity["params"],
                   complexity["freeze"],
                   complexity["params"] - complexity["freeze"],
                   complexity["flops"],
                   complexity["acts"]])
    print(table)

    # option2
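    # option2 uses fvcore directly: FlopCountAnalysis traces the model with the
    # given input tuple, and .total() returns the aggregate FLOP count.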
    tensor = (torch.rand(1, 3, 224, 224),)
    flops = FlopCountAnalysis(model, tensor)
    print(flops.total())

    print(parameter_count_table(model))
Example #2
    def test_flop_count_empty(self) -> None:
        model = nn.ReLU()
        inputs = (torch.randn((1, 10)), )
        table = flop_count_table(FlopCountAnalysis(model, inputs))
        self.assertGreater(len(table), 0)

        out = flop_count_str(FlopCountAnalysis(model, inputs))
        self.assertGreater(len(out), 0)
def profile_fvcore(model, input_size=(3, 224, 224), batch_size=1, detailed=False, force_cpu=False):
    if force_cpu:
        model = model.to('cpu')
    device, dtype = next(model.parameters()).device, next(model.parameters()).dtype
    example_input = torch.ones((batch_size,) + input_size, device=device, dtype=dtype)
    fca = FlopCountAnalysis(model, example_input)
    aca = ActivationCountAnalysis(model, example_input)
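    # flop_count_str renders a per-module breakdown in model-print format;
    # .total() below returns the aggregated count for the whole model.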
    if detailed:
        fcs = flop_count_str(fca)
        print(fcs)
    return fca.total(), aca.total()
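# Usage sketch (model_factory is a hypothetical placeholder for any nn.Module constructor):
#   model = model_factory().eval()
#   total_flops, total_acts = profile_fvcore(model, detailed=True)
#   print(f"GFLOPs: {total_flops / 1e9:.2f}, MActs: {total_acts / 1e6:.2f}")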
    def test_flop_count_table(self) -> None:

        model = TestNet()
        inputs = (torch.randn((1, 10)), )

        table = flop_count_table(FlopCountAnalysis(model, inputs))

        self.assertFalse(" a1 " in table)  # Wrapper skipping successful
        self.assertFalse("a1.b1.c1.d1.bias" in table)  # Didn't go to depth 4
        self.assertTrue("a1.b1.c1.d1" in table)  # Did go to depth 3
        self.assertTrue(" a1.b1 " in table)  # Didn't skip different stats
        self.assertTrue("a2.b1.c1.weight" in table)  # Weights incuded
        self.assertTrue("(10, 10)" in table)  # Shapes included
        self.assertTrue(" a2.b1 "
                        in table)  # Didn't skip through mod with >1 child
        self.assertFalse("#activations" in table)  # No activations
        self.assertTrue(" 0.33K" in table)  # Pretty stats, correct indentation
        self.assertFalse("  0.33K" in table)  # Correct indentation
        self.assertTrue("#parameters or shape" in table)  # Correct header

        # Expected:
        # | module             | #parameters or shape   | #flops   |
        # |:-------------------|:-----------------------|:---------|
        # | model              | 0.33K                  | 0.3K     |
        # |  a1.b1             |  0.11K                 |  100     |
        # |   a1.b1.c1         |   0.11K                |   N/A    |
        # |    a1.b1.c1.d1     |    0.11K               |    100   |
        # |  a2.b1             |  0.22K                 |  0.2K    |
        # |   a2.b1.c1         |   0.11K                |   100    |
        # |    a2.b1.c1.weight |    (10, 10)            |          |
        # |    a2.b1.c1.bias   |    (10,)               |          |
        # |   a2.b1.c2         |   0.11K                |   100    |
        # |    a2.b1.c2.weight |    (10, 10)            |          |
        # |    a2.b1.c2.bias   |    (10,)               |          |

        # Test activations and no parameter shapes
        table = flop_count_table(
            flops=FlopCountAnalysis(model, inputs),
            activations=ActivationCountAnalysis(model, inputs),
            show_param_shapes=False,
        )

        self.assertTrue("#activations" in table)  # Activation header
        self.assertTrue("  20"
                        in table)  # Activation value with correct indent
        self.assertFalse("#parameters or shape" in table)  # Correct header
        self.assertTrue("#parameters")  # Correct header
        self.assertFalse("a2.b1.c1.weight" in table)  # Weights not included
        self.assertFalse("(10, 10)" in table)  # Shapes not included
        self.assertFalse("a2.b1.c1.d2" in table)  # Skipped empty
    def test_detr_fbnet_export(self):
        runner = create_runner("d2go.projects.detr.runner.DETRRunner")
        cfg = runner.get_default_cfg()
        cfg.MODEL.DEVICE = "cpu"
        # DETR
        self._set_detr_cfg(cfg, 3, 3, 50, 256)
        # backbone
        cfg.MODEL.BACKBONE.NAME = "FBNetV2C4Backbone"
        cfg.MODEL.FBNET_V2.ARCH = "FBNetV3_A_dsmask_C5"
        cfg.MODEL.FBNET_V2.WIDTH_DIVISOR = 8
        cfg.MODEL.FBNET_V2.OUT_FEATURES = ["trunk4"]
        # build model
        model = runner.build_model(cfg).eval()
        model = model.detr
        print(model)
        scripted_model = torch.jit.script(model)
        self._assert_model_output(model, scripted_model)
        # print flops
        table = flop_count_table(
            FlopCountAnalysis(model, ([torch.rand(3, 224, 320)], )))
        print(table)
def main():
    # Self-Attention
    a1 = Attention(dim=512, num_heads=1)
    a1.proj = torch.nn.Identity()  # remove Wo

    # Multi-Head Attention
    a2 = Attention(dim=512, num_heads=8)

    # [batch_size, num_tokens, total_embed_dim]
    t = (torch.rand(32, 1024, 512), )

    flops1 = FlopCountAnalysis(a1, t)
    print("Self-Attention FLOPs:", flops1.total())

    flops2 = FlopCountAnalysis(a2, t)
    print("Multi-Head Attention FLOPs:", flops2.total())
    def test_flop_count_str(self) -> None:
        """
        Tests calculating model flops and outputting them in model print format.
        """

        model = TestNet()
        inputs = (torch.randn((1, 10)), )
        model_str = flop_count_str(FlopCountAnalysis(model, inputs))

        self.assertTrue(
            "N/A indicates a possibly missing statistic" in model_str)
        self.assertTrue("n_params: 0.11K, n_flops: 100" in model_str)
        self.assertTrue("ReLU()" in model_str)  # Suppress trivial statistics
        self.assertTrue("n_params: 0.11K, n_flops: N/A"
                        in model_str)  # Uncalled stats
        self.assertTrue("[[1, 10]]")  # Input sizes

        # Expected:

        # "Input sizes (torch.Tensor only): [[1, 10]]\n"
        # "N/A indicates a possibly missing statistic due to how the "
        # "module was called. Missing values are still included in the "
        # "parent's total.\n"
        # "TestNet(\n"
        # "  n_params: 0.33K, n_flops: 0.3K\n"
        # "  (a1): A1(\n"
        # "    n_params: 0.11K, n_flops: 100\n"
        # "    (b1): A1B1(\n"
        # "      n_params: 0.11K, n_flops: 100\n"
        # "      (c1): A1B1C1(\n"
        # "        n_params: 0.11K, n_flops: N/A\n"
        # "        (d1): Linear(\n"
        # "          in_features=10, out_features=10, bias=True\n"
        # "          n_params: 0.11K, n_flops: 100\n"
        # "        )\n"
        # "        (d2): ReLU()\n"
        # "      )\n"
        # "    )\n"
        # "  )\n"
        # "  (a2): A2(\n"
        # "    n_params: 0.22K, n_flops: 0.2K\n"
        # "    (b1): A2B1(\n"
        # "      n_params: 0.22K, n_flops: 0.2K\n"
        # "      (c1): Linear(\n"
        # "        in_features=10, out_features=10, bias=True\n"
        # "        n_params: 0.11K, n_flops: 100\n"
        # "      )\n"
        # "      (c2): Linear(\n"
        # "        in_features=10, out_features=10, bias=True\n"
        # "        n_params: 0.11K, n_flops: 100\n"
        # "      )\n"
        # "    )\n"
        # "  )\n"
        # ")"

        # Test with activations
        model_str = flop_count_str(
            FlopCountAnalysis(model, inputs),
            activations=ActivationCountAnalysis(model, inputs),
        )

        self.assertTrue(
            "n_params: 0.33K, n_flops: 0.3K, n_acts: 30" in model_str)
        self.assertTrue(
            "n_params: 0.11K, n_flops: N/A, n_acts: N/A" in model_str)
Example #8
    model = MPViT(
        img_size=224,
        num_stages=4,
        num_path=[2, 3, 3, 3],
        num_layers=[1, 3, 8, 3],
        embed_dims=[128, 224, 368, 480],
        mlp_ratios=[4, 4, 4, 4],
        num_heads=[8, 8, 8, 8],
        **kwargs,
    )
    model.default_cfg = _cfg_mpvit()
    return model


if __name__ == "__main__":
    model = mpvit_xsmall()
    model.eval()
    inputs = torch.randn(1, 3, 224, 224)
    model(inputs)

    from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis

    flops = FlopCountAnalysis(model, inputs)
    param = sum(p.numel() for p in model.parameters() if p.requires_grad)
    acts = ActivationCountAnalysis(model, inputs)
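    # Both analyses are evaluated when .total() is called; per-module numbers
    # are also available via .by_module() if a breakdown is needed.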

    print(f"total flops : {flops.total()}")
    print(f"total activations: {acts.total()}")
    print(f"number of parameter: {param}")
Example #9
def transformer(
    save,
    load,
    half,
    bt2fairseq,
):
    xlarge = False
    large = False
    DEFAULT_PADDING_IDX = 1
    avg_sequence_length = 128
    max_sequence_length = 256
    batch_size = 64

    class FairseqEncoder(torch.nn.Module):
        def __init__(
            self,
            embed_dim,
            attention_heads,
            ffn_embed_dim,
            num_layers,
            embedding_layer,  # torch.nn.Embedding. Must have a padding_idx field
            dropout=0,
            normalize_before=False,
            torch_encoder=None,  # torch encoder that you can map weights from
            activation="relu",
        ):
            super().__init__()

            cfg = FairseqTransformerConfig()
            cfg.encoder.embed_dim = embed_dim
            cfg.encoder.attention_heads = attention_heads
            cfg.encoder.ffn_embed_dim = ffn_embed_dim
            cfg.dropout = dropout
            cfg.encoder.normalize_before = normalize_before
            cfg.encoder.layers = num_layers
            # make embedding behavior same as other encoders
            cfg.no_token_positional_embeddings = True
            cfg.no_scale_embedding = True
            cfg.activation_fn = activation
            dictionary = {}  # TODO: verify what this is

            self.encoder = FairseqTransformerEncoder(
                cfg, dictionary, embedding_layer, return_fc=False
            )

            if torch_encoder is not None:
                for src_layer, dst_layer in zip(
                    torch_encoder.layers, self.encoder.layers
                ):
                    w_q, w_k, w_v = src_layer.self_attn.in_proj_weight.chunk(3, dim=0)
                    b_q, b_k, b_v = src_layer.self_attn.in_proj_bias.chunk(3, dim=0)

                    dst_layer.self_attn.q_proj.weight = torch.nn.Parameter(w_q)
                    dst_layer.self_attn.q_proj.bias = torch.nn.Parameter(b_q)
                    dst_layer.self_attn.k_proj.weight = torch.nn.Parameter(w_k)
                    dst_layer.self_attn.k_proj.bias = torch.nn.Parameter(b_k)
                    dst_layer.self_attn.v_proj.weight = torch.nn.Parameter(w_v)
                    dst_layer.self_attn.v_proj.bias = torch.nn.Parameter(b_v)

                    dst_layer.self_attn.out_proj.weight = (
                        src_layer.self_attn.out_proj.weight
                    )
                    dst_layer.self_attn.out_proj.bias = (
                        src_layer.self_attn.out_proj.bias
                    )

                    dst_layer.fc1.weight = src_layer.linear1.weight
                    dst_layer.fc1.bias = src_layer.linear1.bias

                    # fairseq may use fusedlayernorm from nvidia apex - diff properties
                    dst_layer.self_attn_layer_norm.load_state_dict(
                        src_layer.norm1.state_dict()
                    )

                    dst_layer.fc2.weight = src_layer.linear2.weight
                    dst_layer.fc2.bias = src_layer.linear2.bias

                    dst_layer.final_layer_norm.load_state_dict(
                        src_layer.norm2.state_dict()
                    )

            # self.encoder = self.encoder.eval().cuda().half()

        def forward(self, tokens, src_lengths=None):
            return self.encoder(
                tokens,
                src_lengths=src_lengths,
                return_all_hiddens=False,
                token_embeddings=None,
            )

    def get_layers_embedding_dim_num_heads_for_configuration(xlarge, large):
        if xlarge:
            # XLM-R extra large (no BERT-XL exists)
            L = 24  # Layers
            D = 2560  # Embedding Dim
            H = 32  # Number of Heads
            FD = 10240  # Feed-forward network dim
            V = 30000  # Vocab Size
        elif large:
            # BERT-large
            L = 24
            D = 1024
            H = 16
            FD = 4096
            V = 30000
        else:
            # BERT-base
            L = 12
            D = 768
            H = 12
            FD = 3072
            V = 30000

        return (L, D, H, FD, V)

    # Better transformer
    class PTTransformer(torch.nn.Module):
        def __init__(self, transformer, embedding):
            super().__init__()
            self.transformer = transformer
            self.embedding = embedding
            self.padding_idx = DEFAULT_PADDING_IDX

        def forward(self, x):
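            # Nested tensors carry per-sequence lengths, so an explicit key-padding
            # mask is only built for regular (dense) padded inputs.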
            padding_mask = None
            if not x.is_nested:
                padding_mask = x.eq(self.padding_idx)
            x = self.embedding(x)
            return self.transformer(x, src_key_padding_mask=padding_mask)

    def make_transformer():
        return (
            PTTransformer(
                torch.nn.TransformerEncoder(
                    torch.nn.TransformerEncoderLayer(
                        d_model=D,
                        nhead=H,
                        dim_feedforward=FD,
                        batch_first=True,
                        activation="relu",
                    ),
                    num_layers=L,
                    enable_nested_tensor=False,
                ),
                embedding_layer,
            )
            .eval()
            .cuda()
        )

    def copy_weights(layers_fairseq, layers_bt):
        for src_layer, dst_layer in zip(layers_fairseq, layers_bt):
            w_q = src_layer.self_attn.q_proj.weight
            b_q = src_layer.self_attn.q_proj.bias
            w_k = src_layer.self_attn.k_proj.weight
            b_k = src_layer.self_attn.k_proj.bias
            w_v = src_layer.self_attn.v_proj.weight
            b_v = src_layer.self_attn.v_proj.bias
            dst_layer.self_attn.in_proj_weight = torch.nn.Parameter(
                torch.cat((w_q, w_k, w_v), dim=0)
            )
            dst_layer.self_attn.in_proj_bias = torch.nn.Parameter(
                torch.cat((b_q, b_k, b_v), dim=0)
            )

            dst_layer.self_attn.out_proj.weight = src_layer.self_attn.out_proj.weight
            dst_layer.self_attn.out_proj.bias = src_layer.self_attn.out_proj.bias

            dst_layer.linear1.weight = src_layer.fc1.weight
            dst_layer.linear1.bias = src_layer.fc1.bias
            dst_layer.linear2.weight = src_layer.fc2.weight
            dst_layer.linear2.bias = src_layer.fc2.bias

            dst_layer.norm1.weight = src_layer.self_attn_layer_norm.weight
            dst_layer.norm1.bias = src_layer.self_attn_layer_norm.bias
            dst_layer.norm2.weight = src_layer.final_layer_norm.weight
            dst_layer.norm2.bias = src_layer.final_layer_norm.bias

    (L, D, H, FD, V) = get_layers_embedding_dim_num_heads_for_configuration(
        xlarge, large
    )
    embedding_layer = torch.nn.Embedding(V, D, DEFAULT_PADDING_IDX)
    # True means BT is the source and fairseq is the target; False means the reverse
    # mode1 = False
    if bt2fairseq:
        # BT is the source and fairseq is the target: copy BT's weights into fairseq
        transformer = make_transformer()
        fairseq_transformer = (
            FairseqEncoder(
                D,
                H,
                FD,
                L,
                embedding_layer,
                dropout=0,
                normalize_before=False,
                torch_encoder=transformer.transformer,
                activation="relu",
            )
            .eval()
            .cuda()
        )
        if half:
            transformer.half()
            fairseq_transformer.half()
    if not bt2fairseq:
        # the other way around: fairseq is the source and BT is the target; copy fairseq's weights into BT
        transformer = make_transformer()
        fairseq_transformer = (
            FairseqEncoder(
                D,
                H,
                FD,
                L,
                embedding_layer,
                dropout=0,
                normalize_before=False,
                torch_encoder=None,
                activation="relu",
            )
            .eval()
            .cuda()
        )
        # for the test where we need to load an existing ckpt. After loading the ckpt,
        # the outputs of fairseq_transformer (BT kernel) are verified to match BT's.
        if half:
            transformer.half()
            fairseq_transformer.half()
        if save:
            torch.save(fairseq_transformer.state_dict(), "./fairseq.pt")
            sys.exit(0)
        if load:
            fairseq_transformer.load_state_dict(torch.load("./fairseq.pt"))
        # copy
        copy_weights(fairseq_transformer.encoder.layers, transformer.transformer.layers)

    device = "cuda"
    lengths = (avg_sequence_length,) * batch_size
    tokens = torch.full(
        (batch_size, max_sequence_length),
        DEFAULT_PADDING_IDX,
        device=device,
        dtype=torch.long,
    )
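    # Overwrite the first lengths[i] positions of each row with random token ids;
    # the remaining positions keep DEFAULT_PADDING_IDX and act as padding.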
    for i in range(batch_size):
        tokens[i, : lengths[i]] = torch.randint(
            DEFAULT_PADDING_IDX + 1,
            V - 1,
            size=(lengths[i],),
            device=device,
            dtype=torch.long,
        )
    # sequence lengths (passed to the fairseq encoder as src_lengths)
    if half:
        lengths_tensor = torch.Tensor(lengths).cuda().half()
    else:
        lengths_tensor = torch.Tensor(lengths).cuda()

    with torch.inference_mode():
        fs_output = fairseq_transformer(tokens, lengths_tensor)["encoder_out"][0]
        fs_output = fs_output.transpose(0, 1)
    with torch.inference_mode():
        t_output = transformer(tokens)
    test_lst = [
        # (name, output, relative tolerance, absolute tolerance)
        ("FS", fs_output, 1e-4, 9e-3),
    ]
    numerical_test(lengths, t_output, test_lst)

    iters = 100
    t = benchmark_torch_function(iters, transformer, tokens)

    def bert_flops(B, T, D, L):
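        # Rough per-layer estimate (counting multiply and add separately):
        # two FFN matmuls (D->4D and 4D->D), the Q/K/V projections,
        # the QK^T and attention-value matmuls, and the output projection.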
        mlp = 2 * (B * T * D * 4 * D) + 2 * (B * T * D * 4 * D)
        qkv = 3 * 2 * B * T * D * D
        attn = 2 * B * D * T * T + 2 * B * D * T * T + 2 * B * T * D * D
        return L * (mlp + qkv + attn)

    flops = bert_flops(batch_size, avg_sequence_length, D, L)
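    # Empirical count from fvcore; the x2 below assumes FlopCountAnalysis reports
    # fused multiply-adds, so doubling makes it comparable to the analytical count.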
    flops_e = (
        FlopCountAnalysis(transformer, (tokens[:, :avg_sequence_length])).total() * 2
    )
    with torch.inference_mode():
        bt = benchmark_torch_function(iters, transformer, tokens)
        fst = benchmark_torch_function(
            iters, fairseq_transformer, tokens, lengths_tensor
        )

        def metrics(tt, baseline=None):
            if baseline:
                return metrics(tt) + f", Speedup: {baseline / tt:.2f}x"
            return f"{tt * 1.0e3:.2f} ms/iter, {flops_e / tt / 1.0e12:.2f} TFLOP/s"

        results = [
            f"Seed: {seed}",
            f"Padded tokens: {(1-sum(lengths)/(tokens.numel()))*100:.2f}%",
            f"Batch shape: {tokens.shape}",
            f"Analytical flops per batch: {flops/ batch_size / 1e9:.2f} GFLOPS",
            f"Empirical flops per batch: {flops_e/ batch_size / 1e9:.2f} GFLOPS",
            f"B: {batch_size}",
            f"T: {avg_sequence_length}",
            f"TMax: {max_sequence_length}",
            f"Eager Time: {metrics(t)}",
            f"BetterTransformer: {metrics(bt, t)}",
            f"FST: {metrics(fst, t)}",
        ]
        print("===========Speedup Results")
        print("; ".join(results))
Example #10
    def flop_count_analysis(
        self, inputs: Union[torch.Tensor, Tuple[torch.Tensor, ...]]
    ) -> FlopCountAnalysis:
        return FlopCountAnalysis(self, inputs)
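    # Usage sketch (assumes this method lives on an nn.Module mixin):
    #   analysis = model.flop_count_analysis(torch.randn(1, 3, 224, 224))
    #   print(analysis.total(), analysis.by_module())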