def evaluate_two_stage(model, test_batch, args):
    avg_loss = metric.AverageMeter('avg_loss', ':.4e')
    single_time = metric.AverageMeter('Time', ':6.3f')
    progress = metric.ProgressMeter(len(test_batch), avg_loss, single_time,
                                    prefix="Evaluation: ")
    model.eval()

    label_list = []
    psnr_list = []
    logit_list = []
    counter = 0
    for k, (images, labels) in enumerate(test_batch):
        images = images.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        counter += 1
        label = labels if args.label else None
        channel = (images.size()[1] // args.c - 1) * args.c
        # input_image = images[:, 0:channel]
        # target_image = images[:, channel:]
        input_image = images.detach()
        target_image = images.detach()

        with autocast():
            reconstructed_image, loss, logit = model.forward(input_image,
                                                             gt=target_image,
                                                             label=label,
                                                             train=False)
        loss = loss['pixel_loss'].view(loss['pixel_loss'].shape[0], -1).mean(1)
        assert len(loss) == len(
            label), "During inference, loss sample number must match label sample number."
        for i in range(len(loss)):
            psnr_list.append(psnr(loss[i].item()))
            logit_list.append(logit[i].item())
            label_list.append(label[i].item())
            avg_loss.update(loss[i].item(), 1)

    psnr_score_total_list = np.asarray(psnr_score_list(psnr_list))
    label_list = np.asarray(label_list)
    logit_list = np.asarray(logit_list)
    assert psnr_score_total_list.size == label_list.size, "INFERENCE LENGTH MUST MATCH LABEL LENGTH."

    # final_score = 0.8 * logit_list + 0.2 * (1 - psnr_score_total_list)
    final_score = logit_list
    accuracy = roc_auc_score(y_true=label_list, y_score=final_score)
    # plot_AUC(psnr_score_total_list, np.expand_dims(1 - label_list, 0))
    print("EVALUATE FRAME NUMBER: ", psnr_score_total_list.size)
    return accuracy, avg_loss.avg
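# evaluate_two_stage() relies on two helpers, `psnr` and `psnr_score_list`, defined
# elsewhere in this repo. Below is a minimal sketch of what they are *assumed* to do
# (MSE -> PSNR for images normalized to [0, 1], then min-max normalization of the PSNR
# list to [0, 1] scores); the actual definitions in the codebase may differ.
import math


def psnr_sketch(mse, peak=1.0):
    """Assumed PSNR from a per-sample mean squared error (higher = more normal)."""
    return 10.0 * math.log10(peak ** 2 / max(mse, 1e-10))


def psnr_score_list_sketch(psnr_values):
    """Assumed min-max normalization of PSNR values to [0, 1] regularity scores."""
    lo, hi = min(psnr_values), max(psnr_values)
    return [(p - lo) / (hi - lo + 1e-8) for p in psnr_values]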
def validate(val_loader, model, args, streams=None):
    """validate function"""
    batch_time = metric.AverageMeter('Time', ':6.3f')
    avg_ce_loss = metric.AverageMeter('ce_loss', ':.4e')

    # record the top1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader), batch_time, avg_ce_loss,
                                    *top1_all, avg_top1, avg_top5,
                                    prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
                target = target.cuda(args.gpu, non_blocking=True)

            # compute outputs and losses
            if args.is_amp:
                with amp.autocast():
                    ensemble_output, outputs, ce_loss = model(images,
                                                              target=target,
                                                              mode='val')
            else:
                ensemble_output, outputs, ce_loss = model(images,
                                                          target=target,
                                                          mode='val')

            # measure accuracy and record loss
            batch_size_now = images.size(0)
            for j in range(args.loop_factor):
                acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
                top1_all[j].update(acc1[0].item(), batch_size_now)

            # simply average outputs of small networks
            avg_acc1, avg_acc5 = metric.accuracy(ensemble_output, target, topk=(1, 5))
            avg_top1.update(avg_acc1[0].item(), batch_size_now)
            avg_top5.update(avg_acc5[0].item(), batch_size_now)
            avg_ce_loss.update(ce_loss.mean().item(), batch_size_now)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.print(i)

        acc_all = []
        acc_all.append(avg_top1.avg)
        acc_all.append(avg_top5.avg)
        acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
        for j in range(args.loop_factor):
            acc_all.append(top1_all[j].avg)
            acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
        print(acc_info)

    # torch.cuda.empty_cache()
    return acc_all
def train(train_loader, model, optimizer, scheduler, epoch, args,
          streams=None, scaler=None):
    """training function"""
    batch_time = metric.AverageMeter('Time', ':6.3f')
    data_time = metric.AverageMeter('Data', ':6.3f')
    avg_ce_loss = metric.AverageMeter('ce_loss', ':.4e')
    avg_cot_loss = metric.AverageMeter('cot_loss', ':.4e')

    # record the top1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        # ce_losses_l.append(metric.AverageMeter('{}_CE_Loss'.format(i), ':.4e'))
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    # if args.dataset == 'imagenet':
    #     avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')

    # show all
    total_iters = len(train_loader)
    progress = metric.ProgressMeter(total_iters, batch_time, data_time,
                                    avg_ce_loss, avg_cot_loss, *top1_all,
                                    avg_top1,
                                    prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    end = time.time()
    # prefetch data
    prefetcher = prefetch.data_prefetcher(train_loader)
    images, target = prefetcher.next()
    i = 0

    """Another way to load the data
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)
    """
    optimizer.zero_grad()
    while images is not None:
        # measure data loading time
        data_time.update(time.time() - end)

        # adjust the lr first
        scheduler(optimizer, i, epoch)
        i += 1

        # compute outputs and losses
        if args.is_amp:
            # Runs the forward pass with autocasting.
            with amp.autocast():
                ensemble_output, outputs, ce_loss, cot_loss = model(
                    images, target=target, mode='train', epoch=epoch, streams=streams)
        else:
            ensemble_output, outputs, ce_loss, cot_loss = model(
                images, target=target, mode='train', epoch=epoch, streams=streams)

        # measure accuracy and record loss
        batch_size_now = images.size(0)
        # note the indices i and j; keep them distinct to avoid conflicts
        for j in range(args.loop_factor):
            acc1 = metric.accuracy(outputs[j, ...], target, topk=(1,))
            top1_all[j].update(acc1[0].item(), batch_size_now)

        # simply average outputs of small networks
        avg_acc1 = metric.accuracy(ensemble_output, target, topk=(1,))
        avg_top1.update(avg_acc1[0].item(), batch_size_now)
        # avg_top5.update(avg_acc1[0].item(), batch_size_now)
        avg_ce_loss.update(ce_loss.mean().item(), batch_size_now)
        avg_cot_loss.update(cot_loss.mean().item(), batch_size_now)

        # compute gradient and do SGD step
        total_loss = (ce_loss + cot_loss) / args.iters_to_accumulate

        if args.is_amp:
            # Scales loss. Calls backward() on scaled loss to create scaled gradients.
            # Backward passes under autocast are not recommended.
            # Backward ops run in the same dtype autocast chose for corresponding forward ops.
            scaler.scale(total_loss).backward()
            if i % args.iters_to_accumulate == 0 or i == total_iters:
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
                # otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)
                # Updates the scale for next iteration.
                scaler.update()
                optimizer.zero_grad()
        else:
            total_loss.backward()
            if i % args.iters_to_accumulate == 0 or i == total_iters:
                optimizer.step()
                optimizer.zero_grad()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if not args.multiprocessing_distributed or (args.rank % args.ngpus_per_node == 0):
            if i % (args.print_freq * args.iters_to_accumulate) == 0:
                progress.print(i)

        images, target = prefetcher.next()
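# A minimal, hypothetical driver for the train() loop above, showing the pieces the
# caller is expected to provide: a torch.cuda.amp.GradScaler when args.is_amp is set,
# and args.iters_to_accumulate controlling gradient accumulation. The helper name
# `build_model_and_optimizer` and the args fields used here are placeholders, not
# functions or flags confirmed by this repo.
from torch.cuda.amp import GradScaler


def run_training_sketch(train_loader, args):
    model, optimizer, scheduler = build_model_and_optimizer(args)  # hypothetical helper
    scaler = GradScaler() if args.is_amp else None
    for epoch in range(args.start_epoch, args.epochs):
        # gradients accumulate over args.iters_to_accumulate mini-batches,
        # so the effective batch size is batch_size * iters_to_accumulate
        train(train_loader, model, optimizer, scheduler, epoch, args,
              streams=None, scaler=scaler)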
def evaluate_two_stage_object(model, test_batch, args):
    avg_loss = metric.AverageMeter('avg_loss', ':.4e')
    single_time = metric.AverageMeter('Time', ':6.3f')
    progress = metric.ProgressMeter(len(test_batch), avg_loss, single_time,
                                    prefix="Evaluation: ")
    model.eval()

    label_list = []
    psnr_list = []
    logit_list = []
    ct = 0
    counter = 0
    for k, (images, labels, bboxes) in enumerate(test_batch):
        images = images.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        bboxes = [x.cuda(non_blocking=True) for x in bboxes]
        a = time.time()
        counter += 1
        patches, patch_labels, bbox_num = get_object_images(
            images, labels, bboxes, args)  # [K,C,H,W] [K] [B]
        if patches is None:
            for i in range(len(labels)):
                label_list.append(labels[i].item())
                psnr_list.append(100.0)
        else:
            del images
            batch_size_now = len(bbox_num)
            ct += patches.size()[0]
            label = labels if args.label else None
            channel = (patches.size()[1] // args.c - 1) * args.c
            input_image = patches[:, 0:channel]
            target_image = patches[:, channel:]

            with autocast():
                reconstructed_image, loss, logit = model.forward(
                    input_image, gt=target_image, label=label, train=False)
            loss = loss['pixel_loss'].view(loss['pixel_loss'].shape[0], -1).mean(1)
            assert len(loss) == len(
                label), "During inference, loss sample number must match label sample number."

            start_ = 0
            for i, num_ in enumerate(bbox_num):  # per sample in batch
                logit_per_sample = torch.max(
                    logit[start_:start_ + num_]).item() if num_ > 0 else 0
                loss_per_sample = torch.max(
                    loss[start_:start_ + num_]).item() if num_ > 0 else 0
                psnr_list.append(psnr(loss_per_sample))  # TODO: Max or Mean
                logit_list.append(logit_per_sample)
                label_list.append(labels[i].item())
                avg_loss.update(loss_per_sample, batch_size_now)
                start_ += num_
            assert start_ == logit.size()[0], "patch num and bbox_num doesn't match"

        if args.evaluate_time:
            single_time.update((time.time() - a) * 1000)
            progress.print(counter)
            # print("Single batch time cost {}ms, loss {}".format(1000*(time.time()-a), loss.mean().item()))

    psnr_score_total_list = np.asarray(psnr_score_list(psnr_list))
    label_list = np.asarray(label_list)
    logit_list = np.asarray(logit_list)
    assert psnr_score_total_list.size == label_list.size and \
        psnr_score_total_list.size == logit_list.size, "INFERENCE LENGTH MUST MATCH LABEL LENGTH."

    final_score = 0.1 * logit_list + 0.9 * (1 - psnr_score_total_list)
    # final_score = logit_list
    accuracy = roc_auc_score(y_true=label_list, y_score=final_score)
    # accuracy1 = roc_auc_score(y_true=label_list, y_score=1-psnr_score_total_list)
    # plot_AUC(psnr_score_total_list, np.expand_dims(1 - label_list, 0))
    print("EVAL FRAME & BOX NUMBER & ACC : ",
          psnr_score_total_list.size, ct, accuracy * 100)
    return accuracy, avg_loss.avg
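# evaluate_two_stage_object() folds per-patch scores back to per-frame scores by taking
# the maximum over each frame's boxes, using bbox_num to segment the flat patch axis.
# A small self-contained illustration of that aggregation step (the function name here
# is illustrative, not part of the repo):
def aggregate_patch_scores_sketch(patch_scores, bbox_num):
    """patch_scores: tensor [K]; bbox_num: per-frame box counts summing to K."""
    frame_scores = []
    start = 0
    for num in bbox_num:
        # frames without any detected object fall back to a score of 0
        frame_scores.append(patch_scores[start:start + num].max().item() if num > 0 else 0.0)
        start += num
    return frame_scores

# e.g. aggregate_patch_scores_sketch(torch.tensor([0.1, 0.9, 0.3]), [2, 0, 1]) -> [0.9, 0.0, 0.3]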
def train_D(train_batch, model, D, optimizer, optimizer_D, epoch, args):
    batch_time = metric.AverageMeter('Time', ':6.3f')
    data_time = metric.AverageMeter('Data', ':6.3f')
    avg_loss = metric.AverageMeter('avg_loss', ':.4e')
    avg_loss_D = metric.AverageMeter('avg_loss_D', ':.4e')
    progress = metric.ProgressMeter(len(train_batch), batch_time, data_time,
                                    avg_loss, avg_loss_D,
                                    prefix="Epoch: [{}]".format(epoch))
    model.train()
    D.train()
    end = time.time()

    if args.object_detection:
        prefetcher = prefetch.data_prefetcher_trible(train_batch)
        images, labels, bboxes = prefetcher.next()
    else:
        prefetcher = prefetch.data_prefetcher(train_batch)
        images, labels = prefetcher.next()
        bboxes = None

    optimizer.zero_grad()
    optimizer_D.zero_grad()
    counter = -1
    while images is not None:
        data_time.update(time.time() - end)
        counter += 1

        # whether to split frames into object patches
        if args.object_detection:  # 5 - 10 ms
            patches, labels, bbox_num = get_object_images(
                images, labels, bboxes, args)  # [K,C,H,W] [K] [B]
            del images
            batch_size_now = len(bbox_num)
        else:
            patches = images
            batch_size_now = images.size()[0]

        if patches is None:  # prevent empty input
            if args.object_detection:
                images, labels, bboxes = prefetcher.next()
            else:
                images, labels = prefetcher.next()
                bboxes = None
            continue

        label = labels if args.label else None
        assert label.sum() == 0, "training label must equal to zero"
        channel = (patches.size()[1] // args.c - 1) * args.c
        input_image = patches[:, 0:channel]
        target_image = patches[:, channel:]
        if args.visualize_input:
            _ = visualize_single(target_image)

        optimizer.zero_grad()
        optimizer_D.zero_grad()

        # G loss
        with autocast():
            reconstructed_image, loss = model.forward(input_image,
                                                      gt=target_image,
                                                      label=label,
                                                      train=True)
            # loss = sum(loss.values())
            loss_bak = loss['pixel_loss'].view(loss['pixel_loss'].shape[0], -1).mean(1)
            loss = loss['pixel_loss'].mean()

        # WGAN
        weight_clipping_limit = 0.01
        for p in D.parameters():
            p.data.clamp_(-weight_clipping_limit, weight_clipping_limit)

        # loss_bak: [n]; reconstructed_image: [n,3,h,w]; target_image: [n,3,h,w]; input_image: [n,6,h,w]
        b, c, h, w = target_image.size()
        in_label = torch.zeros([b]).cuda()
        in_label_fake = torch.ones([b]).cuda()
        in_image = reconstructed_image - target_image

        # optimize G
        if epoch >= 1:
            # G & D loss
            with autocast():
                loss_D_fake, _ = D(in_image, in_label_fake, train=True)  # adversarial, inverse target
                loss1 = loss_D_fake * 0.2  # TODO: coef
            args.scaler.scale(loss + loss1).backward()
            if args.gradient_clip:
                args.scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            args.scaler.step(optimizer)
            args.scaler.update()
            avg_loss.update(loss.mean().item() + loss1.item(), batch_size_now)
        else:
            args.scaler.scale(loss).backward()
            if args.gradient_clip:
                args.scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
            args.scaler.step(optimizer)
            args.scaler.update()
            avg_loss.update(loss.mean().item(), batch_size_now)

        # optimize D
        optimizer_D.zero_grad()
        # label threshold
        t = 7e-3 - 2e-3 * (epoch // 15)
        in_label[torch.where(loss_bak > t)] = 1  # TODO: adjust T
        if counter % 100 == 0:
            print("ANOMALY RATIO: ", sum(in_label) / in_label.size()[0])
        in_image = in_image.detach()
        with autocast():
            loss_D, logit = D(in_image, in_label, train=True)
        args.scaler_D.scale(loss_D).backward()
        args.scaler_D.step(optimizer_D)
        args.scaler_D.update()
        avg_loss_D.update(loss_D.item(), batch_size_now)

        # ACC
        logit[logit > 0.5] = 1
        logit[logit <= 0.5] = 0
        acc = torch.true_divide(sum(torch.eq(logit - in_label, 0)), len(logit))
        if counter % 100 == 0:
            print("ACC is: ", acc)

        batch_time.update(time.time() - end)
        end = time.time()

        if args.rank % args.ngpus_per_node == 0:
            if counter % args.print_freq == 0:
                progress.print(counter)
                if args.visualize:
                    _ = visualize(reconstructed_image, target_image)

        if args.object_detection:
            images, labels, bboxes = prefetcher.next()
        else:
            images, labels = prefetcher.next()
            bboxes = None

    print("Training sample number of epoch {} is: {}".format(
        epoch, counter * int(args.batch_size)))
    return avg_loss.avg
def train(train_batch, model, optimizer, epoch, args):
    batch_time = metric.AverageMeter('Time', ':6.3f')
    data_time = metric.AverageMeter('Data', ':6.3f')
    avg_loss = metric.AverageMeter('avg_loss', ':.4e')
    # show all
    progress = metric.ProgressMeter(len(train_batch), batch_time, data_time,
                                    avg_loss,
                                    prefix="Epoch: [{}]".format(epoch))
    model.train()
    end = time.time()

    if args.object_detection:
        prefetcher = prefetch.data_prefetcher_trible(train_batch)
        images, labels, bboxes = prefetcher.next()
    else:
        prefetcher = prefetch.data_prefetcher(train_batch)
        images, labels = prefetcher.next()
        bboxes = None

    optimizer.zero_grad()
    counter = -1
    while images is not None:
        data_time.update(time.time() - end)
        counter += 1

        # whether to split frames into object patches
        if args.object_detection:  # 5 - 10 ms
            patches, labels, bbox_num = get_object_images(
                images, labels, bboxes, args)  # [K,C,H,W] [K] [B]
            del images
            batch_size_now = len(bbox_num)
        else:
            patches = images
            batch_size_now = images.size()[0]

        if patches is None:  # prevent empty input
            if args.object_detection:
                images, labels, bboxes = prefetcher.next()
            else:
                images, labels = prefetcher.next()
                bboxes = None
            continue

        label = labels if args.label else None
        # assert label.sum() == 0, "training label must equal to zero"
        channel = (patches.size()[1] // args.c - 1) * args.c
        # input_image = patches[:, 0:channel]
        # target_image = patches[:, channel:]
        input_image = patches.detach()
        target_image = patches.detach()
        if args.visualize_input:
            _ = visualize_single(target_image)

        optimizer.zero_grad()
        with autocast():
            if 'Classifier' in args.arch:
                reconstructed_image, loss, _ = model.forward(input_image,
                                                             gt=target_image,
                                                             label=label,
                                                             train=True)
                loss = loss['pixel_loss'].mean() + loss['classifier_loss'].mean()
            else:
                reconstructed_image, loss = model.forward(input_image,
                                                          gt=target_image,
                                                          label=label,
                                                          train=True)
                loss = loss['pixel_loss'].mean()

        args.scaler.scale(loss).backward()
        if args.gradient_clip:
            args.scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
        args.scaler.step(optimizer)
        args.scaler.update()
        avg_loss.update(loss.mean().item(), batch_size_now)

        batch_time.update(time.time() - end)
        end = time.time()

        if args.rank % args.ngpus_per_node == 0:
            if counter % args.print_freq == 0:
                progress.print(counter)
                if args.visualize:
                    _ = visualize(reconstructed_image, target_image)

        if args.object_detection:
            images, labels, bboxes = prefetcher.next()
        else:
            images, labels = prefetcher.next()
            bboxes = None

    print("Training sample number of epoch {} is: {}".format(
        epoch, counter * int(args.batch_size)))
    return avg_loss.avg
def multigpu_test_2gpus(args):
    """
    This is a simple program for validating the idea of running
    multiple models in parallel on multiple GPUs.
    """
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {
                k.replace("module.models", "models"): v
                for k, v in old_dict.items()
            }
            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(args.resume))

    # accelerate the training
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)

    # record the top1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader), *top1_all, avg_top1,
                                    avg_top5, prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    # move model to the gpu(s)
    if args.is_test_on_multigpus:
        print("INFO:PyTorch: multi GPUs test")
        cuda_models = []
        for idx in range(args.split_factor):
            cuda_models.append(model.models[idx].cuda(idx))
    else:
        print("INFO:PyTorch: single GPU test")
        model = model.cuda(0)

    with torch.no_grad():
        # record time and number of samples
        prefetcher = data_prefetcher_2gpus(val_loader, ngpus=args.split_factor)
        images_gpu0, target, images_gpu1 = prefetcher.next()
        i = 0
        n_count = 0.0
        start_time = time.time()

        while images_gpu0 is not None:
            i += 1
            # for i, (images, target) in enumerate(val_loader):
            # compute outputs and losses
            if args.is_test_on_multigpus:
                if args.is_amp:
                    with amp.autocast():
                        output_gpu0 = cuda_models[0](images_gpu0)
                    with amp.autocast():
                        output_gpu1 = cuda_models[1](images_gpu1)
                else:
                    output_gpu0 = cuda_models[0](images_gpu0)
                    output_gpu1 = cuda_models[1](images_gpu1)

                if _GEO_TEST:
                    if i == 1:
                        print("using geometry mean")
                    output_gpu0 = F.softmax(output_gpu0, dim=-1)
                    output_gpu1 = F.softmax(output_gpu1, dim=-1)
                    ensemble_output = torch.sqrt(output_gpu0 * output_gpu1.cuda(0))
                else:
                    outputs = torch.stack([output_gpu0, output_gpu1.cuda(0)])
                    ensemble_output = torch.mean(outputs, dim=0)
            else:
                # compute outputs and losses
                if args.is_amp:
                    with amp.autocast():
                        ensemble_output, outputs, ce_loss = model(images_gpu0,
                                                                  target=target,
                                                                  mode='val')
                else:
                    ensemble_output, outputs, ce_loss = model(images_gpu0,
                                                              target=target,
                                                              mode='val')

            # measure accuracy and record loss
            """
            target = target.cpu()
            ensemble_output = ensemble_output.cpu().float()
            outputs = outputs.cpu().float()
            """
            batch_size_now = images_gpu0.size(0)
            """
            for j in range(args.loop_factor):
                acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
                top1_all[j].update(acc1[0].item(), batch_size_now)
            """
            # simply average outputs of small networks
            avg_acc1, avg_acc5 = metric.accuracy(ensemble_output, target, topk=(1, 5))
            avg_top1.update(avg_acc1[0].item(), batch_size_now)
            avg_top5.update(avg_acc5[0].item(), batch_size_now)

            images_gpu0, target, images_gpu1 = prefetcher.next()
            n_count += batch_size_now
            """
            if i % args.print_freq == 0:
                progress.print(i)
            """

    time_cnt = time.time() - start_time
    # print accuracy info
    acc_all = []
    acc_all.append(avg_top1.avg)
    acc_all.append(avg_top5.avg)
    acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
    """
    mean_acc = 0.0
    for j in range(args.loop_factor):
        acc_all.append(top1_all[j].avg)
        acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
        mean_acc += top1_all[j].avg
    acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)
    """
    print(acc_info)

    print("multiple GPUs ({})".format(args.is_test_on_multigpus))
    print("The tested architecture is {} with split_factor {}".format(
        args.arch, args.split_factor))
    print("The number of the samples is {}".format(n_count))
    print("The total testing time is {} second".format(time_cnt))
    print("The average test time is {}ms per image".format(1000 * time_cnt / n_count))
    torch.cuda.empty_cache()
    sys.exit(0)
def multistreams_test(args):
    """
    This is a simple program for validating the idea of running
    multiple models in parallel on a single GPU via multiple CUDA streams.
    """
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {
                k.replace("module.models", "models"): v
                for k, v in old_dict.items()
            }
            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(args.resume))

    # accelerate the training
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)

    # record the top1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader), *top1_all, avg_top1,
                                    avg_top5, prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    # move models to the gpu, one stream per small network
    cuda_models = []
    cuda_streams = []
    for idx in range(args.split_factor):
        cuda_streams.append(torch.cuda.Stream())
        cuda_models.append(model.models[idx].cuda(0))
    torch.cuda.synchronize()

    # record time and number of samples
    n_count = 0.0
    start_time = time.time()

    with torch.no_grad():
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(0, non_blocking=True)
            target = target.cuda(0, non_blocking=True)

            collect_outputs = []
            if args.is_amp:
                with torch.cuda.stream(cuda_streams[0]):
                    with amp.autocast():
                        output_0 = cuda_models[0](images)
                with torch.cuda.stream(cuda_streams[1]):
                    with amp.autocast():
                        output_1 = cuda_models[1](images)
            else:
                for idx in range(args.split_factor):
                    with torch.cuda.stream(cuda_streams[idx]):
                        collect_outputs.append(cuda_models[idx](images))
            torch.cuda.synchronize()

            if args.is_amp:
                collect_outputs.extend([output_0, output_1])

            # output is fp16
            outputs = torch.stack(collect_outputs, dim=0)
            ensemble_output = torch.mean(outputs, dim=0)

            # measure accuracy and record loss
            batch_size_now = images.size(0)
            n_count += batch_size_now
            for j in range(args.loop_factor):
                acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
                top1_all[j].update(acc1[0].item(), batch_size_now)

            # simply average outputs of small networks
            avg_acc1, avg_acc5 = metric.accuracy(ensemble_output, target, topk=(1, 5))
            avg_top1.update(avg_acc1[0].item(), batch_size_now)
            avg_top5.update(avg_acc5[0].item(), batch_size_now)

            # if i >= 200:
            #     break
            if i % args.print_freq == 0:
                progress.print(i)

    time_cnt = time.time() - start_time
    # print accuracy info
    acc_all = []
    acc_all.append(avg_top1.avg)
    acc_all.append(avg_top5.avg)
    acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
    mean_acc = 0.0
    for j in range(args.loop_factor):
        acc_all.append(top1_all[j].avg)
        acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
        mean_acc += top1_all[j].avg
    acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)
    print(acc_info)

    print("The tested architecture is {} with split_factor {}".format(
        args.arch, args.split_factor))
    print("The number of the samples is {}".format(n_count))
    print("The total testing time is {} second".format(time_cnt))
    print("The average test time is {}ms per image".format(1000 * time_cnt / n_count))
    torch.cuda.empty_cache()
    sys.exit(0)
def multigpu_test(args):
    """
    This is a simple program for validating the idea of running
    multiple models in parallel on multiple GPUs.
    """
    model = splitnet.SplitNet(args,
                              norm_layer=norm.norm(args.norm_mode),
                              criterion=None)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("INFO:PyTorch: => loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_dict = checkpoint['state_dict']
            # the original ckpt was saved as an nn.parallel.DistributedDataParallel() object
            old_dict = {
                k.replace("module.models", "models"): v
                for k, v in old_dict.items()
            }
            model.load_state_dict(old_dict)
            print("INFO:PyTorch: => loaded checkpoint"
                  " '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("INFO:PyTorch: => no checkpoint found at '{}'".format(args.resume))

    # accelerate the training
    torch.backends.cudnn.benchmark = True

    val_loader = factory.get_data_loader(args.data,
                                         batch_size=args.eval_batch_size,
                                         crop_size=args.crop_size,
                                         dataset=args.dataset,
                                         split="val",
                                         num_workers=args.workers)

    # record the top1 accuracy of each small network
    top1_all = []
    for i in range(args.loop_factor):
        top1_all.append(metric.AverageMeter('{}_Acc@1'.format(i), ':6.2f'))
    avg_top1 = metric.AverageMeter('Avg_Acc@1', ':6.2f')
    avg_top5 = metric.AverageMeter('Avg_Acc@5', ':6.2f')
    progress = metric.ProgressMeter(len(val_loader), *top1_all, avg_top1,
                                    avg_top5, prefix='Test: ')

    # switch to evaluate mode
    model.eval()
    n_count = 0.0

    # move the small networks to their gpus
    cuda_models = []
    for idx in range(args.split_factor):
        cuda_models.append(model.models[idx].cuda(idx))

    start_time = time.time()
    for i, (images, target) in enumerate(val_loader):
        cuda_images = []
        cuda_outputs = []
        collect_outputs = []
        target = target.cuda(0, non_blocking=True)
        for idx in range(args.split_factor):
            cuda_images.append(images.cuda(idx, non_blocking=True))

        if args.is_amp:
            with amp.autocast():
                for idx in range(args.split_factor):
                    cuda_outputs.append(cuda_models[idx](cuda_images[idx]))
        else:
            for idx in range(args.split_factor):
                cuda_outputs.append(cuda_models[idx](cuda_images[idx]))

        for idx in range(args.split_factor):
            # use the first gpu as host gpu
            collect_outputs.append(cuda_outputs[idx].cuda(0))

        if _GEO_TEST:
            if i == 1:
                print("using geometry mean")
            cmul = 1.0
            for j in range(args.split_factor):
                cmul = cmul * F.softmax(cuda_outputs[j].cuda(0), dim=-1)
            # ensemble_output = torch.pow(cmul, 1.0 / args.split_factor)
            ensemble_output = torch.sqrt(cmul)
        else:
            outputs = torch.stack(collect_outputs, dim=0)
            ensemble_output = torch.mean(outputs, dim=0)

        batch_size_now = images.size(0)
        """
        for j in range(args.loop_factor):
            acc1, acc5 = metric.accuracy(outputs[j, ...], target, topk=(1, 5))
            top1_all[j].update(acc1[0].item(), batch_size_now)
        """
        # simply average outputs of small networks
        avg_acc1, avg_acc5 = metric.accuracy(ensemble_output, target, topk=(1, 5))
        avg_top1.update(avg_acc1[0].item(), batch_size_now)
        avg_top5.update(avg_acc5[0].item(), batch_size_now)
        n_count += batch_size_now
        """
        if i % args.print_freq == 0:
            progress.print(i)
        """

    time_cnt = time.time() - start_time
    # print accuracy info
    acc_all = []
    acc_all.append(avg_top1.avg)
    acc_all.append(avg_top5.avg)
    acc_info = '* Acc@1 {:.3f} Acc@5 {:.3f}'.format(acc_all[0], acc_all[1])
    """
    mean_acc = 0.0
    for j in range(args.loop_factor):
        acc_all.append(top1_all[j].avg)
        acc_info += '\t {}_acc@1 {:.3f}'.format(j, top1_all[j].avg)
        mean_acc += top1_all[j].avg
    acc_info += "\t avg_acc {:.3f}".format(mean_acc / args.split_factor)
    """
    print(acc_info)

    print("multiple GPUs ({})".format(args.is_test_on_multigpus))
    print("The tested architecture is {} with split_factor {}".format(
        args.arch, args.split_factor))
    print("The number of the samples is {}".format(n_count))
    print("The total testing time is {} second".format(time_cnt))
    print("The average test time is {}ms per image".format(1000 * time_cnt / n_count))
    torch.cuda.empty_cache()
    sys.exit(0)