def _profile_structure(self, model, x, use_cuda=False, alexnet_ops=None):
    """Profile one forward pass and check the trace layout against AlexNet.

    Runs ``model(x)`` under ``torchprof.Profile`` and asserts that

    * every trace ``(path, leaf)`` equals ``self.alexnet_traces`` at the same
      position,
    * every leaf layer recorded exactly one event list that contains all of
      the expected op names for that layer,
    * every non-leaf layer recorded no event list,
    * ``prof.display()`` returns a string with and without ``show_events``.

    :param model: module to profile.
    :param x: input passed to ``model``.
    :param use_cuda: forwarded to ``torchprof.Profile``.
    :param alexnet_ops: per-layer sequences of expected event names, indexed
        by trace position. When omitted there are no expectations and the
        first leaf layer fails the test, as with the former ``[]`` default.
    """
    # A ``None`` sentinel avoids sharing one mutable default list across calls.
    expected_ops = [] if alexnet_ops is None else alexnet_ops

    with torchprof.Profile(model, use_cuda=use_cuda) as prof:
        model(x)

    traces, event_lists_dict = prof.raw()
    for layer_idx, trace in enumerate(traces):
        (path, leaf, _) = trace
        self.assertEqual((path, leaf), self.alexnet_traces[layer_idx])
        event_lists = event_lists_dict[path]

        if not leaf:
            # non leaf nodes should not have event_list values
            self.assertEqual(len(event_lists), 0)
            continue

        # model(x) called once, each layer should have one event_list
        self.assertEqual(len(event_lists), 1)
        event_names = tuple(e.name for e in event_lists[0])

        # Only the lookup can raise IndexError, so only the lookup is guarded.
        try:
            layer_ops = expected_ops[layer_idx]
        except IndexError:
            self.fail(f"Layer {layer_idx} received {event_names}")

        # profiler returned order is not deterministic, so test membership only
        self.assertTrue(
            all(event_name in event_names for event_name in layer_ops),
            f"Layer {layer_idx} received {event_names}, old {layer_ops}",
        )

    pretty = prof.display()
    pretty_full = prof.display(show_events=True)
    self.assertIsInstance(pretty, str)
    self.assertIsInstance(pretty_full, str)
def compute_speed(model, input_size, device, iteration):
    """Benchmark forward-pass speed of ``model`` on a CUDA device.

    The model is switched to eval mode and moved to the GPU, warmed up with
    50 forward passes, then timed for ``iteration`` passes with a CUDA
    synchronisation on both sides of every pass. One extra pass is run under
    ``torchprof.Profile`` and its per-layer table is printed.

    All forward passes (warm-up, timed and profiled) run under
    ``torch.no_grad()`` so that the printed profile describes the same
    configuration that is being timed and no autograd state is kept.

    :param model: module to benchmark.
    :param input_size: iterable of dimensions, unpacked into ``torch.randn``.
    :param device: CUDA device made current and used for the random input.
    :param iteration: number of timed forward passes.
    """
    torch.cuda.set_device(device)
    torch.backends.cudnn.benchmark = True

    model.eval()
    model = model.cuda()
    # Named ``dummy_input`` so the builtin ``input`` is not shadowed.
    dummy_input = torch.randn(*input_size, device=device)

    with torch.no_grad():
        # Warm-up passes.
        torch.cuda.synchronize()
        for _ in range(50):
            model(dummy_input)
        torch.cuda.synchronize()

        logger.info('=========Speed Testing=========')
        time_spent = []
        for _ in range(iteration):
            # Synchronise before and after so each sample covers exactly one pass.
            torch.cuda.synchronize()
            t_start = time.perf_counter()
            model(dummy_input)
            torch.cuda.synchronize()
            time_spent.append(time.perf_counter() - t_start)
        torch.cuda.synchronize()
        elapsed_time = np.sum(time_spent)

        with torchprof.Profile(model, use_cuda=True) as prof:
            model(dummy_input)

    print(prof.display(show_events=False))
    logger.info('Elapsed time: [%.2f s / %d iter]', elapsed_time, iteration)
    logger.info('Speed Time: %.2f ms / iter FPS: %.2f',
                elapsed_time / iteration * 1000, iteration / elapsed_time)
def profile(model, inp_data, want_op_file=False, cuda_=False):
    """Combine the per-layer FLOP table and timing table of ``model``.

    ``pf_flop`` produces the first table and ``pf_time.Profile(...).display()``
    the second; both are used as pandas DataFrames here (NOTE(review): their
    exact schema is defined outside this block; both are expected to carry a
    ``"Layer_Name"`` column). The layer names of the timing table overwrite
    those of the FLOP table, the duplicate column is dropped from the timing
    table, and the two tables are concatenated column-wise on the FLOP
    table's index.

    :param model: module to analyse.
    :param inp_data: input passed to the model and to ``pf_flop``.
    :param want_op_file: when truthy, write the result to
        ``output_file.csv``; otherwise print it.
    :param cuda_: forwarded to ``pf_time.Profile`` as ``use_cuda``.
    """
    df1 = pf_flop(model, inputs=(inp_data, ))
    with pf_time.Profile(model, use_cuda=cuda_) as prof:
        model(inp_data)
    df2 = prof.display()

    # A single ``.loc`` assignment writes into df1 itself. The former chained
    # form ``df1[col][i] = ...`` may write into a temporary copy instead.
    for i1 in df1.index:
        df1.loc[i1, "Layer_Name"] = df2.loc[i1, "Layer_Name"]

    # Drop the now-duplicated column before joining the two tables.
    del df2["Layer_Name"]
    df = pd.concat([df1, df2], axis=1).reindex(df1.index)

    if want_op_file:
        df.to_csv(r'output_file.csv', index=None, header=True)
    else:
        print(df)
def one_block_latency(self, n_iter=100):
    """
    Run profiled inference over the test loader and return one block's latency.

    :param n_iter: maximum number of batches pushed through the model
    :return: pd.Series named "one_block_latency", built from entry 15 of the
        first block's ``latency_list`` (NOTE(review): the meaning of index 15
        is defined by the block implementation, not visible here)
    """
    with torch.no_grad():
        for batch_no, batch in enumerate(self.test_loader, start=1):
            if batch_no > n_iter:
                break
            images, _ = batch

            # infer inside the torchprof context; the profile itself is unused
            with torchprof.Profile(self.model, use_cuda=True):
                self.model(images.cuda(self.device))

            # the reported counter runs one ahead of the finished batch count
            progress = batch_no + 1
            if progress % 10 == 0:
                self.logger.info("{} times estimation".format(progress))

    first_block = self.model.blocks[0]
    return pd.Series(data=first_block.latency_list[15],
                     name="one_block_latency")
def _generate_summary(self, input_tensors: List[torch.Tensor]) -> None:
    """
    Registers forward pass hooks on the model, passes the inputs through the model under a torchprof
    profiler, and logs model information such as the number of parameters and intermediate tensor sizes.
    The hooks are removed again afterwards, also when the forward pass or the logging raises.

    :param input_tensors: A list of tensors which are fed into the torch model.
    """
    dashed_rule = "-------------------------------------------------------------------------------"
    double_rule = "==============================================================================="
    row_format = "{:>20} {:>25} {:>15} {:>15}"

    def print_summary() -> None:
        """Logs one row per entry of self.summary, followed by parameter and memory totals."""
        logging.info(dashed_rule)
        logging.info(row_format.format("Layer (type)", "Output Shape", "Param #", "Device"))
        logging.info(double_rule)

        total_output = 0.0
        for layer in self.summary:
            layer_info = self.summary[layer]
            logging.info(row_format.format(layer,
                                           str(layer_info.output_shape),
                                           "{0:,}".format(layer_info.n_params),
                                           str(layer_info.device)))
            total_output += layer_info.output_memory_megabytes

        # Assume 4 bytes/number (float on cuda) - Without mixed precision training and inplace operations
        input_sizes = self._get_sizes_from_list(input_tensors)
        total_input_size = self.compute_tensor_memory_megabytes(input_sizes)
        total_output_size = 2. * total_output  # x2 for gradients

        logging.info(double_rule)
        logging.info("Total params: {0:,}".format(self.n_params))
        logging.info("Trainable params: {0:,}".format(self.n_trainable_params))
        logging.info("Input mem size (MB)(Wout mixed-precision): %0.2f" % total_input_size)
        logging.info("Forward/backward pass mem size (MB)(Wout mixed-precision): %0.2f" % total_output_size)
        logging.info(dashed_rule)

    try:
        # Register the forward-pass hooks, profile the model, and restore its state
        self.model.apply(self._register_hook)
        with torchprof.Profile(self.model, use_cuda=self.use_gpu) as prof:
            forward_preserve_state(self.model, input_tensors)  # type: ignore

        # Log the model summary: tensor shapes, num of parameters, memory requirement, and forward pass time
        logging.info(self.model)
        logging.info('\n' + prof.display(show_events=False))
        print_summary()
    finally:
        # Remove the hooks via handles, so a failure above cannot leave them attached to the model
        for h in self.hooks:
            h.remove()
def get_layer_profile(model, dataset, batch_size):
    """Print the per-layer torchprof table for one forward pass.

    The model is fed a random ``torch.randn`` batch of shape
    ``(batch_size, *get_input_size(dataset))`` created with
    ``requires_grad=True``. Only the forward pass is run.
    """
    sample_dims = get_input_size(dataset)
    random_batch = torch.randn((batch_size, *sample_dims), requires_grad=True)

    with torchprof.Profile(model, use_cuda=True) as layer_prof:
        model(random_batch)

    report = layer_prof.display(show_events=False)
    print(report)
def torchprof_test():
    """Profile one AlexNet forward pass on the GPU and print the layer table."""
    import torchprof

    alexnet = torchvision.models.alexnet(pretrained=False)
    alexnet = alexnet.cuda()
    images = torch.rand([64, 3, 224, 224]).cuda()

    # `profile_memory` was added in PyTorch 1.6, this will output a runtime
    # warning if unsupported.
    profiler = torchprof.Profile(alexnet, use_cuda=True, profile_memory=True)
    with profiler as prof:
        alexnet(images)

    # equivalent to `print(prof)` and `print(prof.display())`
    layer_table = prof.display(show_events=False)
    print(layer_table)
def profile_fp(hps: HyperParams) -> None:
    """Profile the policy's forward pass during a short rollout on ``cuda:0``.

    Builds a ``CodeCraftVecEnv`` and a ``TransformerPolicy8`` from ``hps``,
    steps the environment ``hps.seq_rosteps`` times under
    ``torchprof.Profile``, then prints the frame throughput and the per-layer
    profile. The reported time is measured from the start of the function, so
    it includes environment and policy construction.
    """
    import torchprof

    start_time = time.time()
    device = torch.device("cuda:0")
    obs_config = obs_config_from(hps)

    scripted_opponents = [
        ("destroyer", hps.num_vs_destroyer),
        ("replicator", hps.num_vs_replicator),
        ("aggressive_replicator", hps.num_vs_aggro_replicator),
    ]
    # A configured length of 0 means "no limit" for the environment.
    max_game_length = hps.max_game_length
    if max_game_length == 0:
        max_game_length = None

    env_options = dict(
        randomize=hps.task_randomize,
        use_action_masks=hps.use_action_masks,
        obs_config=obs_config,
        symmetric=hps.symmetric_map,
        hardness=hps.task_hardness,
        mix_mp=hps.mix_mp,
        build_variety_bonus=hps.build_variety_bonus,
        win_bonus=hps.win_bonus,
        attac=hps.attac,
        protec=hps.protec,
        max_army_size_score=hps.max_army_size_score,
        max_enemy_army_size_score=hps.max_enemy_army_size_score,
        rule_rng_fraction=hps.rule_rng_fraction,
        rule_rng_amount=hps.rule_rng_amount,
        rule_cost_rng=hps.rule_cost_rng,
        scripted_opponents=scripted_opponents,
        max_game_length=max_game_length,
        stagger_offset=hps.rank / hps.parallelism,
        mothership_damage_scale=hps.mothership_damage_scale,
    )
    env = envs.CodeCraftVecEnv(
        hps.num_envs,
        hps.num_self_play,
        hps.objective,
        hps.action_delay,
        **env_options,
    )
    policy = TransformerPolicy8(hps, obs_config).to(device)

    obs, action_masks, privileged_obs = env.reset()
    with torchprof.Profile(policy, use_cuda=True) as prof:
        for _ in range(hps.seq_rosteps):
            obs_tensor = torch.tensor(obs).to(device)
            privileged_obs_tensor = torch.tensor(privileged_obs).to(device)
            action_masks_tensor = torch.tensor(action_masks).to(device)

            evaluation = policy.evaluate(
                obs_tensor, action_masks_tensor, privileged_obs_tensor)
            actions, logprobs, entropy, values, probs = evaluation
            actions = actions.cpu().numpy()

            step_result = env.step(actions, action_masks=action_masks)
            obs, _, _, _, action_masks, privileged_obs = step_result

    elapsed = time.time() - start_time
    frames = hps.seq_rosteps * hps.num_envs
    print(
        f"Collected {frames} frames in {int(elapsed)}s ({int(frames / elapsed)}fps)"
    )
    print(prof.display(show_events=False))
def test_cpu_profile_structure(self):
    """Profiling restricted to two paths reports exactly those paths, in order."""
    alexnet = torchvision.models.alexnet(pretrained=False)
    sample = torch.rand([1, 3, 224, 224])
    selected_paths = [
        ("AlexNet", "features", "3"),
        ("AlexNet", "avgpool"),
    ]

    with torchprof.Profile(alexnet, paths=selected_paths) as prof:
        alexnet(sample)

    _, events_by_path = prof.raw()
    recorded_paths = list(events_by_path.keys())
    self.assertEqual(len(recorded_paths), 2)
    self.assertEqual(recorded_paths, selected_paths)
def Model_init():
    """Build the ghost VGG16 network, profile one forward pass and return it.

    The network is moved to the module-level ``device``; when ``device`` is
    the string ``'cuda'`` it is additionally wrapped in
    ``torch.nn.DataParallel`` and cuDNN benchmarking is switched on. One
    random ``1x3x32x32`` batch is then pushed through the network under
    ``torchprof.Profile`` and the table, including events, is printed.

    The profiling input is created on ``device`` and CUDA profiling is only
    requested for CUDA devices, so the function also works when ``device``
    is a CPU device.

    :return: the constructed (and possibly DataParallel-wrapped) network.
    """
    print('==> Building model..')
    # Alternatives that were tried here: VGG('VGG16'), VGG_ghost_2('VGG16'),
    # VGG_ghost_v2 / v2_2 / v2_3 / v3 / v4 ('VGG16_Ghost_bottle').
    net = VGG_ghost('VGG16')

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # Profile one forward pass with torchprof.
    on_gpu = torch.device(device).type == 'cuda'
    x = torch.rand([1, 3, 32, 32]).to(device)
    # `profile_memory` was added in PyTorch 1.6, this will output a runtime
    # warning if unsupported.
    with torchprof.Profile(net, use_cuda=on_gpu, profile_memory=True) as prof:
        net(x)
    print(prof.display(show_events=True))

    return net
def evaluate_autograd_profiler(args, model, tokenizer, prefix=""):
    """Profile selected BERT sub-modules on evaluation batches.

    For every evaluation task (two for MNLI, otherwise one) up to
    ``args.n_trials`` batches are run through ``model`` on CPU-profiled
    ``torchprof.Profile`` restricted to encoder layer 1 and three of its
    sub-modules. The result of ``prof.display(show_events=False)`` for the
    last profiled batch is returned.

    NOTE(review): ``prof.display`` is unpacked into two values here, so this
    relies on a torchprof variant that returns a pair. Confirm against the
    installed version.

    :param args: run configuration; ``args.eval_batch_size`` is overwritten.
    :param model: model to profile; switched to eval mode.
    :param tokenizer: forwarded to ``load_and_cache_examples``.
    :param prefix: unused; kept for signature compatibility.
    :return: ``(prof_str, prof_stats)`` of the last profiled batch.
    :raises RuntimeError: if no batch was profiled at all (empty data or
        ``args.n_trials <= 0``); previously this surfaced as an
        ``UnboundLocalError`` at the ``return`` statement.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + "/MM") if args.task_name == "mnli" else (args.output_dir,)

    # Module paths to profile; identical for every task and batch.
    paths = [
        ("BertForSequenceClassification", "bert", "encoder", "layer", "1"),
        ("BertForSequenceClassification", "bert", "encoder", "layer", "1", "attention"),
        ("BertForSequenceClassification", "bert", "encoder", "layer", "1", "intermediate", "dense"),
        ("BertForSequenceClassification", "bert", "encoder", "layer", "1", "output", "dense"),
    ]

    model.eval()
    last_profile = None
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        for i, batch in enumerate(tqdm(eval_dataloader, desc="Evaluating")):
            if i >= args.n_trials:
                break
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "masked_bert", "xlnet", "albert"] else None
                    )
                with torchprof.Profile(model, use_cuda=False, paths=paths) as prof:
                    model(**inputs)
                last_profile = prof.display(show_events=False)

    if last_profile is None:
        raise RuntimeError("evaluate_autograd_profiler: no batch was profiled")
    prof_str, prof_stats = last_profile
    return prof_str, prof_stats
def one_block_latency(self, n_iter=100):
    """
    Reset the first block's latency records, run profiled inference over the
    test loader and return that block's last latency column.

    :param n_iter: maximum number of batches pushed through the model
    :return: the last column of the first block's ``latency_df``, renamed to
        "one_block_latency"
    """
    first_block = self.model.blocks[0]
    first_block.reset_latency_list()

    with torch.no_grad():
        for batch_no, batch in enumerate(self.test_loader, start=1):
            if batch_no > n_iter:
                break
            images, _ = batch

            # infer
            # torchprof is not used but prevent lazy operation of time module
            with torchprof.Profile(self.model, use_cuda=True):
                self.model(images.cuda(self.device))

            # the reported counter runs one ahead of the finished batch count
            progress = batch_no + 1
            if progress % 10 == 0:
                self.logger.info("{} times estimation".format(progress))

    block_latency = self.model.blocks[0].latency_df
    last_column = block_latency.columns[-1]
    return block_latency[last_column].rename("one_block_latency")
def various_latency(self, n_iter=70):
    """
    Collect several latency measurements over the test loader and combine them.

    For each batch the forward pass is run under torchprof; the wall-clock
    time around the call and the summed torchprof time for target "blocks"
    are recorded. These are joined column-wise with the model's own
    ``latency_df`` (column 0 renamed to "inside_total") and the first block's
    ``latency_df``.

    :param n_iter: maximum number of batches pushed through the model
    :return: (combined_df, cut_df): the combined table and the same table
        after ``cut_outlier`` with borders 0.25 and 0.75
    """
    wall_clock_times = []
    profiler_block_times = []

    with torch.no_grad():
        for batch_no, batch in enumerate(self.test_loader, start=1):
            if batch_no > n_iter:
                break
            images, _ = batch

            # torchprof
            with torchprof.Profile(self.model, use_cuda=True) as prof:
                tic = time.time()
                self.model(images.cuda(self.device))
                wall_clock_times.append(time.time() - tic)
            block_times = get_time(prof, target="blocks", show_events=False)
            profiler_block_times.append(sum(block_times))

            # the reported counter runs one ahead of the finished batch count
            progress = batch_no + 1
            if progress % 10 == 0:
                self.logger.info("{} times estimation".format(progress))

    torchprof_df = pd.DataFrame(data=profiler_block_times,
                                columns=["torchprof_block"])
    outside_df = pd.DataFrame(
        data=self.model.unit_transform(wall_clock_times),
        columns=["outside_total"])
    inside_df = self.model.latency_df.rename(columns={0: "inside_total"})
    frames = [inside_df, outside_df, self.model.blocks[0].latency_df,
              torchprof_df]
    combined_df = pd.concat(frames, axis=1)

    from util.outlier import cut_outlier
    cut_df = cut_outlier(combined_df, min_border=0.25, max_border=0.75)

    self.logger.info("\n{}".format(combined_df))
    self.logger.info("\ntime: \n{} \nafter cut oulier: \n{}".format(
        combined_df.describe(), cut_df.describe()))
    return combined_df, cut_df
def train(local_rank, args):
    """Benchmark distributed ResNet-18 inference on CIFAR-10 with torchprof.

    Despite its name this entry point runs forward passes only. It sets up
    the process group, builds a distributed CIFAR-10 loader (images resized
    to 224), warms the model up, wraps it in ``DistributedDataParallel`` and
    runs 10 passes over the loader under ``torch.no_grad()`` with
    ``torchprof.Profile``. Rank 0 rewrites per-batch statistics to
    ``args.output`` after every batch; every 20th batch the torchprof table
    is appended to ``../results/resnet18_mem_profiling.txt``.

    Changes from the previous version: the profiling file is closed on exit,
    ``cleanup()`` runs even when an error occurs, the statistics are
    collected without ``DataFrame.append`` (removed in pandas 2.0), and the
    ``epoch`` column is now filled in. The never-populated ``time`` column
    is kept so the CSV header stays the same.

    :param local_rank: GPU index on this node; also selects the CUDA device.
    :param args: run configuration (``nr``, ``gpus``, ``nodes``,
        ``world_size``, ``batchsize``, ``output``).
    """
    rank = args.nr * args.gpus + local_rank
    setup(rank, args.world_size)
    try:
        transform = transforms.Compose([
            torchvision.transforms.Resize(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
        batch_size = args.batchsize
        train_dataset = torchvision.datasets.CIFAR10('../datasets/',
                                                     transform=transform,
                                                     download=True)
        sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=args.world_size, rank=rank)
        trainloader = torch.utils.data.DataLoader(train_dataset,
                                                  batch_size=batch_size,
                                                  num_workers=2,
                                                  sampler=sampler)

        model = models.resnet18()
        model.eval()
        torch.cuda.set_device(local_rank)
        model.cuda()

        print("GPU initialization")
        dummy_input = torch.randn(1, 3, 224, 224,
                                  dtype=torch.float).to(local_rank)
        for _ in range(10):
            _ = model(dummy_input)

        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank])

        # Same header as the former append-based frame produced.
        stat_columns = ['epoch', 'batch', 'batch_size', 'gpu_number', 'time',
                        'time (ms)', 'throughput']
        records = []
        images_per_step = batch_size * args.gpus

        with open("../results/resnet18_mem_profiling.txt", "w") as prof_file:
            for epoch in range(0, 10):
                for i, data in enumerate(trainloader, 0):
                    starter = torch.cuda.Event(enable_timing=True)
                    ender = torch.cuda.Event(enable_timing=True)
                    starter.record()

                    inputs, labels = data
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                    with torch.no_grad():
                        with torchprof.Profile(model, use_cuda=True,
                                               profile_memory=True) as prof:
                            model(inputs)
                    ender.record()

                    if rank == 0:
                        # Both events must have completed before reading them.
                        torch.cuda.synchronize()
                        timer = starter.elapsed_time(ender)
                        ms_per_image = timer / images_per_step
                        throughput = 1000 * images_per_step / timer
                        records.append({
                            'epoch': epoch,
                            'batch': i,
                            'batch_size': batch_size,
                            'gpu_number': args.gpus * args.nodes,
                            'time (ms)': ms_per_image,
                            'throughput': throughput,
                        })
                        # Rewritten every batch so partial runs leave a CSV.
                        pd.DataFrame(records, columns=stat_columns).to_csv(
                            args.output, index=False)
                        print("Batch: %d Time per Image: %.2f ms Throughput:%.2f"
                              % (i, ms_per_image, throughput))

                    if i % 20 == 19:
                        prof_file.write(prof.display(show_events=False))
    finally:
        cleanup()
def train(local_rank, args):
    """Train ResNet-18 on CIFAR-10 with DDP while profiling both passes.

    Sets up the process group, builds a distributed CIFAR-10 loader (images
    resized to 224), warms the model up, wraps it in
    ``DistributedDataParallel`` and trains with SGD for ``args.epochs``
    epochs. The forward pass runs under ``torchprof.Profile`` and the
    backward pass under ``torch.autograd.profiler.profile``. Rank 0 rewrites
    per-batch statistics to a CSV under ``../results/`` after every batch;
    every 20th batch both profiles are appended to
    ``../results/resnet18_mem_profiling.txt``.

    Changes from the previous version: the profiling file is closed on exit,
    ``cleanup()`` runs even when an error occurs, the statistics are
    collected without ``DataFrame.append`` (removed in pandas 2.0), the loss
    is read from the GPU once per step, and an unused gradient lookup is
    gone. The never-populated ``time`` column is kept so the CSV header
    stays the same.

    :param local_rank: GPU index on this node; also selects the CUDA device.
    :param args: run configuration (``nr``, ``gpus``, ``nodes``,
        ``world_size``, ``batchsize``, ``epochs``).
    """
    rank = args.nr * args.gpus + local_rank
    setup(rank, args.world_size)
    try:
        transform = transforms.Compose([
            torchvision.transforms.Resize(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
        batch_size = args.batchsize
        train_dataset = torchvision.datasets.CIFAR10('../datasets/',
                                                     transform=transform,
                                                     download=True)
        sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=args.world_size, rank=rank)
        trainloader = torch.utils.data.DataLoader(train_dataset,
                                                  batch_size=batch_size,
                                                  num_workers=2,
                                                  sampler=sampler)

        model = models.resnet18()
        torch.cuda.set_device(local_rank)
        model.cuda()

        print("GPU initialization")
        dummy_input = torch.randn(1, 3, 224, 224,
                                  dtype=torch.float).to(local_rank)
        for _ in range(10):
            _ = model(dummy_input)

        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank])
        criterion = nn.CrossEntropyLoss().cuda()
        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

        # Same header as the former append-based frame produced.
        stat_columns = ['epoch', 'batch', 'batch_size', 'gpu_number', 'time',
                        'loss', 'time (ms)', 'throughput']
        records = []
        images_per_step = batch_size * args.gpus
        stats_path = (
            "../results/resnet18_training_stats_GPU_%.0f_batchsize_%.0f.csv"
            % (args.gpus * args.nodes, batch_size))

        with open("../results/resnet18_mem_profiling.txt", "w") as prof_file:
            for epoch in range(args.epochs):  # loop over the dataset multiple times
                running_loss = 0.0
                print("Epoch %d" % epoch)
                sampler.set_epoch(epoch)

                for i, data in enumerate(trainloader, 0):
                    starter = torch.cuda.Event(enable_timing=True)
                    ender = torch.cuda.Event(enable_timing=True)
                    starter.record()

                    inputs, labels = data
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                    optimizer.zero_grad()
                    with torchprof.Profile(model, use_cuda=True,
                                           profile_memory=True) as prof:
                        outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    with torch.autograd.profiler.profile(
                            use_kineto=True, use_cuda=True,
                            profile_memory=True) as backprof:
                        loss.backward()
                    optimizer.step()
                    ender.record()

                    # Read after ``ender`` so the host sync stays outside the
                    # timed window; one read serves every use below.
                    loss_value = loss.item()

                    # print statistics
                    if rank == 0:
                        # Both events must have completed before reading them.
                        torch.cuda.synchronize()
                        timer = starter.elapsed_time(ender)
                        ms_per_image = timer / images_per_step
                        throughput = 1000 * images_per_step / timer
                        records.append({
                            'epoch': epoch,
                            'batch': i,
                            'loss': loss_value,
                            'batch_size': batch_size,
                            'gpu_number': args.gpus * args.nodes,
                            'time (ms)': ms_per_image,
                            'throughput': throughput,
                        })
                        # Rewritten every batch so partial runs leave a CSV.
                        pd.DataFrame(records, columns=stat_columns).to_csv(
                            stats_path, index=False)
                        print(
                            "[Epoch %d] Batch: %d Loss: %.3f Time per Image: %.2f msi Throughput:%.2f"
                            % (epoch, i, loss_value, ms_per_image, throughput))

                    if i % 20 == 19:
                        prof_file.write(prof.display(show_events=False))
                        prof_file.write(backprof.table(row_limit=100000))

                    running_loss += loss_value
                    if i % 2000 == 1999:  # print every 2000 mini-batches
                        print('[%d, %5d] loss: %.3f' %
                              (epoch + 1, i + 1, running_loss / 2000))
                        running_loss = 0.0
    finally:
        cleanup()
# NOTE(review): this fragment starts mid-script; `checkpoint_dir` and
# `experiment_dir` are defined above the visible part of the file.
os.mkdir(checkpoint_dir)
# Summary writer that logs into the experiment directory.
writer = SummaryWriter(log_dir=experiment_dir)
if __name__ == "__main__":
    # Choose the network to inspect; alternatives are left commented out.
    # net = ResNet18()
    # net = VGG('VGG19')
    net = CNN()
    # net = CNN_3()
    # net = CNN_4()
    # net = My_CNN()
    print(net)
    # Layer-by-layer summary for a 3x32x32 input.
    summary(net,(3,32,32))
    # Profile a single forward pass on a random CUDA batch with torchprof.
    # NOTE(review): `net` is not moved to the GPU in this fragment although
    # the input is -- confirm CNN() / summary() place it on the right device.
    with torchprof.Profile(net, use_cuda = True) as prof:
        net(torch.rand([1, 3, 32, 32]).cuda())
    print(prof.display(show_events=False))
    # Report computational complexity and parameter count with GPU 0 current.
    with torch.cuda.device(0):
        macs, params = get_model_complexity_info(net, (3, 32, 32), as_strings=True, print_per_layer_stat=True, verbose=True)
        print('{:<30} {:<8}'.format('Computational complexity: ', macs))
        print('{:<30} {:<8}'.format('Number of parameters: ', params))
    # Start again from a freshly constructed network after the inspection.
    net = CNN()
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    optimizer = get_optimizer(optimizer_type, net, LR)
def predict(self, eval_tuple: DataTuple, dump=None):
    """
    Predict the answers to questions in a data split while profiling the model.

    Two warm-up batches are run first, then the whole loader is processed
    under torchprof. The profile table is printed and its raw event lists are
    pickled to ``args.profile_save`` (or 'profile.pk' when that is falsy).

    :param eval_tuple: The data tuple to be evaluated.
    :param dump: The path of saved file to dump results.
    :return: A dict of question_id to answer.
    """
    self.model.eval()
    dset, loader, evaluator = eval_tuple
    quesid2ans = {}

    import pickle
    import time

    import torchprof
    from tqdm import tqdm

    def answer_batch(datum_tuple):
        """Run detection and the QA model on one batch; return (question ids, labels)."""
        ques_id, img_paths, sent = datum_tuple[:3]  # Avoid seeing ground truth
        images, scales, infos = [], [], []
        for path in img_paths:
            im, im_scale, im_info = self._image_transform(path)
            images.append(im)
            scales.append(im_scale)
            infos.append(im_info)
        image_list = to_image_list(images, size_divisible=32)
        image_list = image_list.to("cuda")
        detections = self.model.detection_model(image_list)
        # get bbox and features
        feat_list, info_list = self._process_feature_extraction(
            detections,
            scales,
            infos,
            self.args.feature_name,
            self.args.confidence_threshold,
        )
        feats = torch.stack(feat_list)
        boxes = torch.stack(info_list)
        logit = self.model(feats, boxes, sent)
        _, label = logit.max(1)
        return ques_id, label

    start = time.time()
    print('model set up, starting warming up prediction...')
    warmup_batches = 0
    with torch.no_grad():
        for _, datum_tuple in tqdm(enumerate(loader)):
            answer_batch(datum_tuple)
            warmup_batches += 1
            if warmup_batches >= 2:
                break

    batches = 0
    count = 0
    print('model warmed up, starting predicting...')
    with torch.no_grad(), torchprof.Profile(self.model, use_cuda=True) as prof:
        for _, datum_tuple in tqdm(enumerate(loader)):
            ques_id, label = answer_batch(datum_tuple)
            batches += 1
            for qid, label_idx in zip(ques_id, label.cpu().numpy()):
                quesid2ans[qid.item()] = dset.label2ans[label_idx]
                count += 1

    print(prof.display(show_events=False))
    # The elapsed time below covers warm-up, prediction and the table printout.
    end = time.time()

    _, event_lists_dict = prof.raw()
    # NOTE(review): this reads a module-level `args`, whereas the rest of the
    # method uses `self.args` -- confirm which one is intended.
    with open(args.profile_save or 'profile.pk', 'wb') as f:
        pickle.dump(event_lists_dict, f)

    print('prediction finished!', end - start, batches, count)
    if dump is not None:
        evaluator.dump_result(quesid2ans, dump)
    return quesid2ans