Example #1
    def _profile_structure(self, model, x, use_cuda=False, alexnet_ops=[]):
        with torchprof.Profile(model, use_cuda=use_cuda) as prof:
            model(x)

        traces, event_lists_dict = prof.raw()

        for layer_idx, trace in enumerate(traces):
            (path, leaf, _) = trace
            self.assertEqual((path, leaf), self.alexnet_traces[layer_idx])
            event_lists = event_lists_dict[path]
            if leaf:
                # model(x) called once, each layer should have one event_list
                self.assertEqual(len(event_lists), 1)
                event_names = tuple(e.name for e in event_lists[0])
                # profiler returned order is not deterministic
                try:
                    self.assertTrue(
                        all(event_name in event_names
                            for event_name in alexnet_ops[layer_idx]),
                        f"Layer {layer_idx} received {event_names}, expected {alexnet_ops[layer_idx]}",
                    )
                except IndexError:
                    self.fail(f"Layer {layer_idx} received {event_names}")
            else:
                # non-leaf nodes should not have event_list values
                self.assertEqual(len(event_lists), 0)

        pretty = prof.display()
        pretty_full = prof.display(show_events=True)
        self.assertIsInstance(pretty, str)
        self.assertIsInstance(pretty_full, str)
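
The assertions above lean on torchprof's public surface: Profile.raw() returns a list of (path, leaf, module) traces together with a dict of autograd event lists keyed by path, and display() renders a string table. A minimal sketch of that round trip outside the test harness (assuming only torch, torchvision, and torchprof are installed):

import torch
import torchvision
import torchprof

model = torchvision.models.alexnet(pretrained=False)
x = torch.rand([1, 3, 224, 224])
with torchprof.Profile(model) as prof:
    model(x)

traces, event_lists_dict = prof.raw()
for path, leaf, _module in traces:
    # only leaf modules accumulate profiler events
    if leaf:
        print(path, len(event_lists_dict[path]))
print(prof.display(show_events=False))
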
Example #2
def compute_speed(model, input_size, device, iteration):
    torch.cuda.set_device(device)
    torch.backends.cudnn.benchmark = True

    model.eval()
    model = model.cuda()

    input = torch.randn(*input_size, device=device)

    # warm-up passes; no_grad avoids autograd bookkeeping during warm-up
    with torch.no_grad():
        torch.cuda.synchronize()
        for _ in range(50):
            model(input)
            torch.cuda.synchronize()

    logger.info('=========Speed Testing=========')
    time_spent = []
    for _ in range(iteration):
        torch.cuda.synchronize()
        t_start = time.perf_counter()
        with torch.no_grad():
            model(input)
        torch.cuda.synchronize()
        time_spent.append(time.perf_counter() - t_start)
    torch.cuda.synchronize()
    elapsed_time = np.sum(time_spent)
    with torchprof.Profile(model, use_cuda=True) as prof:
        model(input)
    print(prof.display(show_events=False))
    logger.info('Elapsed time: [%.2f s / %d iter]' % (elapsed_time, iteration))
    logger.info('Speed: %.2f ms / iter    FPS: %.2f' %
                (elapsed_time / iteration * 1000, iteration / elapsed_time))
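
A call like the following exercises compute_speed; the model choice is illustrative, and logger, np, time, and torchprof are assumed to be imported at module scope:

import torchvision
model = torchvision.models.resnet18()
compute_speed(model, (1, 3, 224, 224), device=0, iteration=100)
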
Example #3
def profile(model, inp_data, want_op_file=False, cuda_=False):
    df1 = pf_flop(model, inputs=(inp_data, ))
    with pf_time.Profile(model, use_cuda=cuda_) as prof:
        model(inp_data)
    df2 = prof.display()
    for i1 in df1.index:
        # use .loc to avoid pandas chained-assignment warnings
        df1.loc[i1, "Layer_Name"] = df2["Layer_Name"][i1]
    #   print(df1)
    #   print(df2)
    #   mynn={"Layer Name":[],"FLOPs":[],"Self CPU total":[], "CPU Total":[], "GPU Total":[],"Input Features":[], "Output Features":[], "Dict Size of Emb":[], "Emb Vector Size":[], "Norm Size":[]}
    #   for i1 in df1.index:
    #     mynn["Layer Name"].append(str(df2["Layer Name"][i1]))
    #     mynn["Self CPU total"].append(str(df2["Self CPU total"][i1]))
    #     mynn["CPU Total"].append(str(df2["CPU total"][i1]))
    #     mynn["GPU Total"].append(str(df2["GPU total"][i1]))
    #     mynn["Input Features"].append(str(df1["Input Features"][i1]))
    #     mynn["Output Features"].append(str(df1["Output Features"][i1]))
    #     mynn["Dict Size of Emb"].append(str(df1["Dict Size of Emb"][i1]))
    #     mynn["Emb Vector Size"].append(str(df1["Emb Vector Size"][i1]))
    #     mynn["Norm Size"].append(str(df1["Norm Size"][i1]))

    #   df=DataFrame(mynn, columns= ["Layer Name","FLOPs","Self CPU total","CPU Total","GPU Total","Input Features","Output Features","Dict Size of Emb","Emb Vector Size","Norm Size"])
    del df2["Layer_Name"]
    df = pd.concat([df1, df2], axis=1).reindex(df1.index)
    if want_op_file:
        df.to_csv(r'output_file.csv', index=None, header=True)
    else:
        print(df)
Example #4
    def one_block_latency(self, n_iter=100):
        """
        :return: inner one block
        """
        with torch.no_grad():
            count = 1
            for idx, data in enumerate(self.test_loader):
                if count > n_iter:
                    break

                images, labels = data
                # self.logger.info("outer shape: {}".format(images.shape))

                # infer
                with torchprof.Profile(self.model, use_cuda=True) as prof:
                    self.model(images.cuda(self.device))

                count += 1
                if count % 10 == 0:
                    self.logger.info("{} times estimation".format(count))

            latency = pd.Series(data=self.model.blocks[0].latency_list[15],
                                name="one_block_latency")

        return latency
Example #5
    def _generate_summary(self, input_tensors: List[torch.Tensor]) -> None:
        """
        Creates a list of input torch tensors and registers forward pass hooks to the model,
        passes the inputs through the model, and collects model information such num of parameters
        and intermediate tensor size.
        :param input_tensors: A list of tensors which are fed into the torch model.
        """
        def print_summary() -> None:
            logging.info(
                "-------------------------------------------------------------------------------"
            )
            line_new = "{:>20} {:>25} {:>15} {:>15}".format(
                "Layer (type)", "Output Shape", "Param #", "Device")
            logging.info(line_new)
            logging.info(
                "==============================================================================="
            )
            total_output = 0.0
            for layer in self.summary:
                line_new = "{:>20} {:>25} {:>15} {:>15}".format(
                    layer, str(self.summary[layer].output_shape),
                    "{0:,}".format(self.summary[layer].n_params),
                    str(self.summary[layer].device))
                total_output += self.summary[layer].output_memory_megabytes
                logging.info(line_new)

            # Assume 4 bytes per number (float32 on CUDA), i.e. no mixed-precision training and no in-place operations
            input_sizes = self._get_sizes_from_list(input_tensors)
            total_input_size = self.compute_tensor_memory_megabytes(
                input_sizes)
            total_output_size = 2. * total_output  # x2 for gradients

            logging.info(
                "==============================================================================="
            )
            logging.info("Total params: {0:,}".format(self.n_params))
            logging.info("Trainable params: {0:,}".format(
                self.n_trainable_params))
            logging.info("Input mem size (MB) (w/o mixed-precision): %0.2f" %
                         total_input_size)
            logging.info(
                "Forward/backward pass mem size (MB) (w/o mixed-precision): %0.2f"
                % total_output_size)
            logging.info(
                "-------------------------------------------------------------------------------"
            )

        # Register the forward-pass hooks, profile the model, and restore its state
        self.model.apply(self._register_hook)
        with torchprof.Profile(self.model, use_cuda=self.use_gpu) as prof:
            forward_preserve_state(self.model, input_tensors)  # type: ignore

        # Log the model summary: tensor shapes, num of parameters, memory requirement, and forward pass time
        logging.info(self.model)
        logging.info('\n' + prof.display(show_events=False))
        print_summary()

        # Remove the hooks via handles
        for h in self.hooks:
            h.remove()
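
The memory figures above rest on the 4-bytes-per-float assumption noted in the code; a self-contained sketch of that arithmetic (a hypothetical helper, not the class method itself):

def tensor_memory_megabytes(shape, bytes_per_element=4):
    # float32: 4 bytes per element, no mixed precision
    n = 1
    for d in shape:
        n *= d
    return n * bytes_per_element / (1024 ** 2)

print("%.2f MB" % tensor_memory_megabytes((1, 3, 224, 224)))  # ~0.57 MB
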
Example #6
def get_layer_profile(model, dataset, batch_size):
    input_size = (batch_size, *get_input_size(dataset))

    # the input must live on the same device as the model for CUDA profiling
    x = torch.randn(input_size, requires_grad=True).cuda()
    with torchprof.Profile(model, use_cuda=True) as prof:
        y = model(x)
        #y.backward()

    print(prof.display(show_events=False))
Example #7
def torchprof_test():
    import torchprof
    model = torchvision.models.alexnet(pretrained=False).cuda()
    x = torch.rand([64, 3, 224, 224]).cuda()

    # `profile_memory` was added in PyTorch 1.6, this will output a runtime warning if unsupported.
    with torchprof.Profile(model, use_cuda=True, profile_memory=True) as prof:
        model(x)

    # equivalent to `print(prof)` and `print(prof.display())`
    print(prof.display(show_events=False))
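
On a machine without a GPU, the same snippet works as a CPU-only sketch by dropping the CUDA-specific pieces (use_cuda defaults to False):

model = torchvision.models.alexnet(pretrained=False)
x = torch.rand([1, 3, 224, 224])
with torchprof.Profile(model) as prof:
    model(x)
print(prof)  # equivalent to print(prof.display())
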
Example #8
def profile_fp(hps: HyperParams) -> None:
    import torchprof
    start_time = time.time()
    device = torch.device("cuda:0")
    obs_config = obs_config_from(hps)
    env = envs.CodeCraftVecEnv(
        hps.num_envs,
        hps.num_self_play,
        hps.objective,
        hps.action_delay,
        randomize=hps.task_randomize,
        use_action_masks=hps.use_action_masks,
        obs_config=obs_config,
        symmetric=hps.symmetric_map,
        hardness=hps.task_hardness,
        mix_mp=hps.mix_mp,
        build_variety_bonus=hps.build_variety_bonus,
        win_bonus=hps.win_bonus,
        attac=hps.attac,
        protec=hps.protec,
        max_army_size_score=hps.max_army_size_score,
        max_enemy_army_size_score=hps.max_enemy_army_size_score,
        rule_rng_fraction=hps.rule_rng_fraction,
        rule_rng_amount=hps.rule_rng_amount,
        rule_cost_rng=hps.rule_cost_rng,
        scripted_opponents=[
            ("destroyer", hps.num_vs_destroyer),
            ("replicator", hps.num_vs_replicator),
            ("aggressive_replicator", hps.num_vs_aggro_replicator),
        ],
        max_game_length=None
        if hps.max_game_length == 0 else hps.max_game_length,
        stagger_offset=hps.rank / hps.parallelism,
        mothership_damage_scale=hps.mothership_damage_scale)
    policy = TransformerPolicy8(hps, obs_config).to(device)
    obs, action_masks, privileged_obs = env.reset()

    with torchprof.Profile(policy, use_cuda=True) as prof:
        for _ in range(0, hps.seq_rosteps):
            obs_tensor = torch.tensor(obs).to(device)
            privileged_obs_tensor = torch.tensor(privileged_obs).to(device)
            action_masks_tensor = torch.tensor(action_masks).to(device)
            actions, logprobs, entropy, values, probs = \
                policy.evaluate(obs_tensor, action_masks_tensor, privileged_obs_tensor)
            actions = actions.cpu().numpy()
            obs, _, _, _, action_masks, privileged_obs = env.step(
                actions, action_masks=action_masks)
    elapsed = time.time() - start_time
    print(
        f"Collected {hps.seq_rosteps * hps.num_envs} frames in {int(elapsed)}s ({int(hps.seq_rosteps * hps.num_envs / elapsed)}fps)"
    )
    print(prof.display(show_events=False))
Example #9
    def test_cpu_profile_structure(self):
        model = torchvision.models.alexnet(pretrained=False)
        x = torch.rand([1, 3, 224, 224])

        paths = [("AlexNet", "features", "3"), ("AlexNet", "avgpool")]

        with torchprof.Profile(model, paths=paths) as prof:
            model(x)

        # print(prof)
        traces, event_dict = prof.raw()
        self.assertEqual(len(event_dict.keys()), 2)
        self.assertEqual(list(event_dict.keys()), paths)
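
Once a run is restricted with paths, the raw event lists reduce to scalar timings; each FunctionEvent exposes cpu_time_total in microseconds. A short sketch under the same setup:

for path, event_lists in event_dict.items():
    for event_list in event_lists:
        total_us = sum(e.cpu_time_total for e in event_list)
        print(path, "%.3f ms" % (total_us / 1000.0))
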
Example #10
def Model_init():
    # Model
    print('==> Building model..')
    #net = VGG('VGG16')
    net = VGG_ghost('VGG16')
    #net = VGG_ghost_2('VGG16')
    #net = VGG_ghost_v2('VGG16_Ghost_bottle')
    #net = VGG_ghost_v2_2('VGG16_Ghost_bottle')
    #net = VGG_ghost_v2_3('VGG16_Ghost_bottle')
    #net = VGG_ghost_v3('VGG16_Ghost_bottle')
    #net = VGG_ghost_v4('VGG16_Ghost_bottle')
    # FLOPs (optional):
    '''
    input = torch.randn(1, 3, 32, 32)
    flops, params = profile(net, inputs=(input,))
    flops, params = clever_format([flops, params], "%.3f")
    print('flops is {}'.format(flops))
    print('params is {}'.format(params))
    '''

    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True
    # Here to see torchprof
    x = torch.rand([1, 3, 32, 32]).cuda()
    # `profile_memory` was added in PyTorch 1.6, this will output a runtime warning if unsupported.
    with torchprof.Profile(net, use_cuda=True, profile_memory=True) as prof:
        net(x)
    # equivalent to `print(prof)` and `print(prof.display())`
    #print(prof.display(show_events=False))
    print(prof.display(show_events=True))
    # Done

    criterion = nn.CrossEntropyLoss()
    return net
Example #11
def evaluate_autograd_profiler(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + "/MM") if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
        
        paths = [
            ("BertForSequenceClassification", "bert", "encoder", "layer", "1"),
            ("BertForSequenceClassification", "bert", "encoder", "layer", "1", "attention"),
            ("BertForSequenceClassification", "bert", "encoder", "layer", "1", "intermediate", "dense"),
            ("BertForSequenceClassification", "bert", "encoder", "layer", "1", "output", "dense"),
        ]
        
        for i, batch in enumerate(tqdm(eval_dataloader, desc="Evaluating")):
            model.eval()
            if i >= args.n_trials:
                break
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "masked_bert", "xlnet", "albert"] else None
                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                # with profiler.profile(record_shapes=True) as prof:
                #     with profiler.record_function("model_inference"):
                # torch.cuda.synchronize()
                with torchprof.Profile(model, use_cuda=False, paths=paths) as prof:
                    outputs = model(**inputs)
        # print(prof.display(show_events=False))
        prof_str, prof_stats = prof.display(show_events=False)
        return prof_str, prof_stats
Example #12
    def one_block_latency(self, n_iter=100):
        """
        :return: inner one block: pd.DataFrame
        """
        self.model.blocks[0].reset_latency_list()
        with torch.no_grad():
            count = 1
            for idx, data in enumerate(self.test_loader):
                if count > n_iter:
                    break

                images, labels = data
                # self.logger.info("outer shape: {}".format(images.shape))

                # infer
                # prof itself is unused, but profiling forces execution so the timing is not deferred
                with torchprof.Profile(self.model, use_cuda=True) as prof:
                    self.model(images.cuda(self.device))

                count += 1
                if count % 10 == 0:
                    self.logger.info("{} times estimation".format(count))
            latency_df = self.model.blocks[0].latency_df
        return latency_df[latency_df.columns[-1]].rename("one_block_latency")
Example #13
    def various_latency(self, n_iter=70):
        """
            inner total, outer total, ops of one block, the block
        :return: list of latency and average of latency
        """
        latency_avg = None
        outside_total_time = []
        torchprof_block_time = []
        with torch.no_grad():
            count = 1
            l_sum = 0
            for idx, data in enumerate(self.test_loader):
                if count > n_iter:
                    break

                images, labels = data
                # self.logger.info("outer shape: {}".format(images.shape))

                # open the binary gate
                # self.model.reset_binary_gates()
                # self.model.unused_modules_off()

                # time
                # start = time.time()
                # self.model(images.cuda(self.device))
                # outside_total_time.append((time.time() - start))

                # autograd
                # with torch.autograd.profiler.profile(use_cuda=True) as prof:
                #     self.p_model(images)
                # self.logger.info("autograd: {}".format(prof.self_cpu_time_total))

                # torchprof
                with torchprof.Profile(self.model, use_cuda=True) as prof:
                    start = time.time()
                    self.model(images.cuda(self.device))
                    outside_total_time.append((time.time() - start))
                torchprof_time = sum(
                    get_time(prof, target="blocks", show_events=False))
                # self.logger.info("time: {}".format(self.model.latency_list))
                # self.logger.info("\n{}".format(self.model.blocks[0].latency_df))
                # self.logger.info("torchprof: {}".format(torchprof_time))
                torchprof_block_time.append(torchprof_time)

                # get latency
                # latency = sum(get_time(prof, target="blocks", show_events=False))
                # l_sum += latency
                # latency_list.append(latency)
                # self.logger.info("{n} times - latency: {latency}, avg: {avg}".format(
                #     pid=os.getpid(), n=count, latency=latency, avg=l_sum / count
                # ))

                count += 1
                if count % 10 == 0:
                    self.logger.info("{} times estimation".format(count))
            # for block in self.model.blocks:
            #     self.logger.info("{}".format(block.latency_df))
            torchprof_df = pd.DataFrame(data=torchprof_block_time,
                                        columns=["torchprof_block"])
            outside_df = pd.DataFrame(
                data=self.model.unit_transform(outside_total_time),
                columns=["outside_total"])
            combined_df = pd.concat(
                [
                    self.model.latency_df.rename(columns={0: "inside_total"}),
                    outside_df, self.model.blocks[0].latency_df, torchprof_df
                ],
                axis=1)  # .rename(columns={0: "inside_total", 1: "total"})
            from util.outlier import cut_outlier
            cut_df = cut_outlier(combined_df, min_border=0.25, max_border=0.75)
            self.logger.info("\n{}".format(combined_df))
            self.logger.info("\ntime: \n{} \nafter cutting outliers: \n{}".format(
                combined_df.describe(), cut_df.describe()))

        return combined_df, cut_df
Example #14
def train(local_rank, args):
    rank = args.nr * args.gpus + local_rank
    setup(rank, args.world_size)
    transform = transforms.Compose([
        torchvision.transforms.Resize(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    batch_size = args.batchsize
    train_dataset = torchvision.datasets.CIFAR10('../datasets/',
                                                 transform=transform,
                                                 download=True)
    sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=batch_size,
                                              num_workers=2,
                                              sampler=sampler)

    model = models.resnet18()
    model.eval()
    torch.cuda.set_device(local_rank)
    model.cuda()
    print("GPU initialization")
    dummy_input = torch.randn(1, 3, 224, 224, dtype=torch.float).to(local_rank)
    for _ in range(10):
        _ = model(dummy_input)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
    training_run_data = pd.DataFrame(
        columns=['batch', 'batch_size', 'gpu_number', 'time (ms)', 'throughput'])
    prof_file = open("../results/resnet18_mem_profiling.txt", "w")
    for epoch in range(0, 10):
        for i, data in enumerate(trainloader, 0):
            starter, ender = torch.cuda.Event(
                enable_timing=True), torch.cuda.Event(enable_timing=True)
            starter.record()
            inputs, labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            with torch.no_grad():
                with torchprof.Profile(model,
                                       use_cuda=True,
                                       profile_memory=True) as prof:
                    outputs = model(inputs)
            ender.record()
            if rank == 0:
                torch.cuda.synchronize()
                timer = starter.elapsed_time(ender)
                training_run_data = training_run_data.append(
                    {
                        'batch': i,
                        'batch_size': batch_size,
                        'gpu_number': args.gpus * args.nodes,
                        'time (ms)': timer / (batch_size * args.gpus),
                        'throughput': 1000 * (batch_size * args.gpus) / timer
                    },
                    ignore_index=True)
                training_run_data.to_csv(args.output, index=False)
                print("Batch: %d  Time per Image: %.2f ms Throughput:%.2f" %
                      (i, timer / (batch_size * args.gpus), 1000 *
                       (batch_size * args.gpus) / timer))
                if i % 20 == 19:
                    prof_file.write(prof.display(show_events=False))
    prof_file.close()
    cleanup()
Example #15
def train(local_rank, args):
    rank = args.nr * args.gpus + local_rank
    setup(rank, args.world_size)
    transform = transforms.Compose([
        torchvision.transforms.Resize(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    batch_size = args.batchsize
    train_dataset = torchvision.datasets.CIFAR10('../datasets/',
                                                 transform=transform,
                                                 download=True)
    sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=args.world_size, rank=rank)
    trainloader = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=batch_size,
                                              num_workers=2,
                                              sampler=sampler)

    model = models.resnet18()
    torch.cuda.set_device(local_rank)
    model.cuda()
    print("GPU initialization")
    dummy_input = torch.randn(1, 3, 224, 224, dtype=torch.float).to(local_rank)
    for _ in range(10):
        _ = model(dummy_input)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    training_run_data = pd.DataFrame(
        columns=['epoch', 'batch', 'loss', 'batch_size', 'gpu_number',
                 'time (ms)', 'throughput'])
    prof_file = open("../results/resnet18_mem_profiling.txt", "w")
    for epoch in range(args.epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        print("Epoch %d" % epoch)
        sampler.set_epoch(epoch)
        for i, data in enumerate(trainloader, 0):
            starter, ender = torch.cuda.Event(
                enable_timing=True), torch.cuda.Event(enable_timing=True)
            starter.record()
            inputs, labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()

            optimizer.zero_grad()
            with torchprof.Profile(model, use_cuda=True,
                                   profile_memory=True) as prof:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            with torch.autograd.profiler.profile(
                    use_kineto=True, use_cuda=True,
                    profile_memory=True) as backprof:
                loss.backward()
            grads_conv1 = model.module.conv1.weight.grad
            optimizer.step()
            ender.record()
            # print statistics
            if rank == 0:
                torch.cuda.synchronize()
                timer = starter.elapsed_time(ender)
                training_run_data = training_run_data.append(
                    {
                        'epoch': epoch,
                        'batch': i,
                        'loss': loss.item(),
                        'batch_size': batch_size,
                        'gpu_number': args.gpus * args.nodes,
                        'time (ms)': timer / (batch_size * args.gpus),
                        'throughput': 1000 * (batch_size * args.gpus) / timer
                    },
                    ignore_index=True)
                training_run_data.to_csv(
                    "../results/resnet18_training_stats_GPU_%.0f_batchsize_%.0f.csv"
                    % (args.gpus * args.nodes, batch_size),
                    index=False)
                print(
                    "[Epoch %d] Batch: %d Loss: %.3f Time per Image: %.2f ms Throughput: %.2f"
                    % (epoch, i, loss.item(), timer /
                       (batch_size * args.gpus), 1000 *
                       (batch_size * args.gpus) / timer))
                if i % 20 == 19:
                    prof_file.write(prof.display(show_events=False))
                    prof_file.write(backprof.table(row_limit=100000))
                running_loss += loss.item()
                if i % 2000 == 1999:  # print every 2000 mini-batches
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 2000))
                    running_loss = 0.0
    prof_file.close()
    cleanup()
Example #16
    os.mkdir(checkpoint_dir)

writer = SummaryWriter(log_dir=experiment_dir)


if __name__ == "__main__":

    # net = ResNet18()
    # net = VGG('VGG19')
    net = CNN().cuda()  # model must be on the GPU: summary() and the profiled call below use CUDA tensors
    # net = CNN_3()
    # net = CNN_4()
    # net = My_CNN()
    print(net)
    summary(net, (3, 32, 32))
    with torchprof.Profile(net, use_cuda=True) as prof:
        net(torch.rand([1, 3, 32, 32]).cuda())
    
    print(prof.display(show_events=False))

    with torch.cuda.device(0):
        macs, params = get_model_complexity_info(net, (3, 32, 32), as_strings=True,
                                           print_per_layer_stat=True, verbose=True)
        print('{:<30}  {:<8}'.format('Computational complexity: ', macs))
        print('{:<30}  {:<8}'.format('Number of parameters: ', params))


    net = CNN()
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    optimizer = get_optimizer(optimizer_type, net, LR)
Example #17
    def predict(self, eval_tuple: DataTuple, dump=None):
        """
        Predict the answers to questions in a data split.

        :param eval_tuple: The data tuple to be evaluated.
        :param dump: The path of saved file to dump results.
        :return: A dict of question_id to answer.
        """
        self.model.eval()
        dset, loader, evaluator = eval_tuple
        quesid2ans = {}
        import time
        from tqdm import tqdm
        import torchprof
        # import torch.autograd.profiler as profiler

        start = time.time()
        print('model set up, starting warming up prediction...')
        count = 0
        batches = 0
        # with torch.no_grad(), profiler.profile(record_shapes=True) as prof:
        with torch.no_grad():
            for i, datum_tuple in tqdm(enumerate(loader)):
                ques_id, img_paths, sent = datum_tuple[:3]  # Avoid seeing ground truth
                img_tensor, im_scales, im_infos = [], [], []
                for img_path in img_paths:
                    im, im_scale, im_info = self._image_transform(img_path)
                    # im, im_scale, im_info = img_item
                    img_tensor.append(im)
                    im_scales.append(im_scale)
                    im_infos.append(im_info)
                current_img_list = to_image_list(img_tensor, size_divisible=32)
                # print('current_img_list.device', current_img_list.tensors.size())
                current_img_list = current_img_list.to("cuda")
                output = self.model.detection_model(current_img_list)

                # get bbox and features
                feat_list, info_list = self._process_feature_extraction(
                    output,
                    im_scales,
                    im_infos,
                    self.args.feature_name,
                    self.args.confidence_threshold,
                )
                feats = torch.stack(feat_list)
                boxes = torch.stack(info_list)
                # feats, boxes = feats.cuda(), boxes.cuda()
                logit = self.model(feats, boxes, sent)
                score, label = logit.max(1)
                batches += 1
                if batches >= 2:
                    break
        batches = 0
        count = 0
        print('model warmed up, starting predicting...')
        with torch.no_grad(), torchprof.Profile(self.model,
                                                use_cuda=True) as prof:
            for i, datum_tuple in tqdm(enumerate(loader)):
                ques_id, img_paths, sent = datum_tuple[:3]  # Avoid seeing ground truth
                img_tensor, im_scales, im_infos = [], [], []
                for img_path in img_paths:
                    im, im_scale, im_info = self._image_transform(img_path)
                    # im, im_scale, im_info = img_item
                    img_tensor.append(im)
                    im_scales.append(im_scale)
                    im_infos.append(im_info)
                current_img_list = to_image_list(img_tensor, size_divisible=32)
                # print('current_img_list.device', current_img_list.tensors.size())
                current_img_list = current_img_list.to("cuda")
                output = self.model.detection_model(current_img_list)

                # get bbox and features
                feat_list, info_list = self._process_feature_extraction(
                    output,
                    im_scales,
                    im_infos,
                    self.args.feature_name,
                    self.args.confidence_threshold,
                )
                feats = torch.stack(feat_list)
                boxes = torch.stack(info_list)
                # feats, boxes = feats.cuda(), boxes.cuda()
                logit = self.model(feats, boxes, sent)
                score, label = logit.max(1)
                batches += 1
                for qid, l in zip(ques_id, label.cpu().numpy()):
                    ans = dset.label2ans[l]
                    quesid2ans[qid.item()] = ans
                    count += 1
        print(prof.display(show_events=False))
        end = time.time()
        traces, event_lists_dict = prof.raw()
        import pickle
        with open(self.args.profile_save or 'profile.pk', 'wb') as f:
            pickle.dump(event_lists_dict, f)
        print('prediction finished!', end - start, batches, count)
        if dump is not None:
            evaluator.dump_result(quesid2ans, dump)
        return quesid2ans