def get(self):
    """Write the result of ``test().f()`` with ``"_blog"`` appended."""
    # Build the full response text first, then hand it to the handler's writer.
    response_text = test().f() + "_blog"
    self.write(response_text)
def main_worker(gpu, ngpus_per_node, args):
    """Entry point of one distributed worker process.

    Computes the global rank, joins the NCCL process group, builds the
    communication groups and then runs either the server role (global rank 0)
    or a client role (every other rank).

    Args:
        gpu: Index of the GPU used by this process on its node; also stored
            in ``args.gpu``.
        ngpus_per_node: Number of GPUs per node, used to derive the global
            rank from ``args.rank``.
        args: Run configuration. Reads ``rank``, ``init_method``,
            ``world_size``, ``epochs``, ``bs`` and ``lr``; ``args.gpu`` and
            ``args.device`` are overwritten here.
    """
    print("gpu:", gpu)
    args.gpu = gpu
    # Per the original author's note, the first server has only three GPUs,
    # so global ranks on every later server are shifted down by one.
    newrank = args.rank * ngpus_per_node + gpu
    if args.rank != 0:
        newrank -= 1

    # Join the process group. The original note says TCP is used for the
    # rendezvous; the actual method is whatever ``args.init_method`` holds.
    print("begin init")
    dist.init_process_group(init_method=args.init_method,
                            backend="nccl",
                            world_size=args.world_size,
                            rank=newrank)
    print("end init")

    # Rank 0 acts as the server. Per the original note, broadcast is used to
    # emulate send/recv, so the server needs one two-member group per client.
    # Every rank must create the same groups in the same order.
    group = [dist.new_group([0, i]) for i in range(1, args.world_size)]
    allgroup = dist.new_group(list(range(args.world_size)))

    if newrank == 0:
        _run_server(gpu, args, group, allgroup)
    else:
        _run_client(gpu, newrank, args, group, allgroup)


def _select_device(args):
    """Return ``cuda:<args.gpu>`` if CUDA is usable and a GPU was chosen, else CPU."""
    if torch.cuda.is_available() and args.gpu != -1:
        return torch.device('cuda:{}'.format(args.gpu))
    return torch.device('cpu')


def _run_server(gpu, args, group, allgroup):
    """Run the server role: join every averaging round, then evaluate on MNIST test data."""
    print("使用{}号服务器的第{}块GPU作为server".format(args.rank, gpu))
    # Per the original note: during training the server only aggregates and
    # redistributes parameters and does not take part in any computation.
    args.device = _select_device(args)
    net = CNNMnist().to(args.device)
    w_avg = copy.deepcopy(net.state_dict())

    # One averaging call per parameter per epoch, mirroring the clients so
    # both sides issue the same sequence of calls.
    last_round = args.epochs - 1
    for round_idx in range(args.epochs):
        for key in w_avg:
            averaged = average_gradients(w_avg[key].to(args.device), group,
                                         allgroup)
            # As in the original, only the final round's result is stored
            # back into the state dict; earlier results are discarded.
            if round_idx == last_round:
                w_avg[key] = averaged
    torch.save(w_avg, 'w_wag')
    net.load_state_dict(w_avg)

    # Load the test data.
    trans_mnist = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, ))
    ])
    dataset_test = datasets.MNIST('data/',
                                  train=False,
                                  download=True,
                                  transform=trans_mnist)
    test_set = torch.utils.data.DataLoader(dataset_test, batch_size=args.bs)
    test_accuracy, test_loss = test(net, test_set, args)
    print("Testing accuracy: {:.2f}".format(test_accuracy))
    print("Testing loss: {:.2f}".format(test_loss))


def _run_client(gpu, newrank, args, group, allgroup, local_iters=3):
    """Run a client role: train locally, then average parameters once per epoch."""
    print("使用{}号服务器的第{}块GPU作为第{}个client".format(args.rank, gpu, newrank))
    args.device = _select_device(args)
    print("begin train...")
    net = CNNMnist().to(args.device)
    print(net)
    # NOTE(review): torch.load unpickles the file; only load trusted shards.
    client_data = torch.load(
        "data/distributed/data_of_client{}".format(newrank))
    bsz = 64
    train_set = torch.utils.data.DataLoader(client_data, batch_size=bsz)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.lr, momentum=0.5)
    num_batches = ceil(len(train_set.dataset) / float(bsz))

    start = time.time()
    for epoch in range(args.epochs):
        for local_iter in range(local_iters):
            epoch_loss = 0.0
            for inputs, target in train_set:
                # Tensors are used directly: the ``Variable`` wrapper is a
                # deprecated no-op on the PyTorch versions that provide
                # ``loss.item()``.
                inputs = inputs.to(args.device)
                target = target.to(args.device)
                optimizer.zero_grad()
                output = net(inputs)
                loss = F.nll_loss(output, target)
                epoch_loss += loss.item()
                loss.backward()
                optimizer.step()
            # Report the loss of the last local pass only.
            if local_iter == local_iters - 1:
                print('Rank ', dist.get_rank(), ', epoch ', epoch, ': ',
                      epoch_loss / num_batches)

        # Federated step: replace every parameter with the value returned by
        # average_gradients, then load the result back into the model.
        w_avg = copy.deepcopy(net.state_dict())
        for k in w_avg:
            print("k:", k)
            w_avg[k] = average_gradients(w_avg[k].to(args.device), group,
                                         allgroup)
        net.load_state_dict(w_avg)
    end = time.time()

    print(" training time:{}".format((end - start)))
    train_accuracy, train_loss = test(net, train_set, args)
    print("Training accuracy: {:.2f}".format(train_accuracy))
    print("Training loss: {:.2f}".format(train_loss))
device = torch.device('cpu') elif torch.cuda.is_available() and args.use_gpu: device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') else: device = torch.device('cpu') model.to(device) logger.info('Running with device %s', device) transform = torchvision.transforms.Compose( [torchvision.transforms.ToTensor()]) if args.test: test_dataset = XrayImageFolder(args.test_data, transform=transform) test_dataloader = DataLoader(test_dataset, batch_size=cfg.BATCH_SIZE) test(model, test_dataloader, device) elif args.inference: img = cv2.imread(args.img_path) try: img = transform(img).unsqueeze(0) except TypeError: logger.exception('Possible incorrect image path or image type') start_time = datetime.now() with torch.no_grad(): img = img.to(device) output = model.inference(img) label = output.data.argmax() prob = output.detach()[0][label].item() * 100
def get(self):
    """Write the result of ``test().f()`` with ``"_admin"`` appended."""
    model = test()
    # Compose the full response before writing so a single string is emitted.
    body = "{}{}".format(model.f(), "_admin") if False else model.f() + "_admin"
    self.write(body)
print('run')
# Epoch numbers run from opt.begin_epoch through opt.n_epochs inclusive.
for i in range(opt.begin_epoch, opt.n_epochs + 1):
    if not opt.no_train:
        train_epoch(i, train_loader, model, criterion, optimizer, opt,
                    train_logger, train_batch_logger)

    # Nothing else to do for this epoch when validation is disabled.
    if opt.no_val:
        continue

    validation_loss = val_epoch(i, val_loader, model, criterion, opt,
                                val_logger)
    # The scheduler is stepped with the validation loss only when both
    # training and validation ran this epoch.
    if not opt.no_train:
        scheduler.step(validation_loss)

if opt.test:
    # Per-frame preprocessing for the test set: rescale, corner-crop,
    # convert to tensor, then apply the configured normalisation.
    spatial_transform = Compose([
        Scale(int(opt.sample_size / opt.scale_in_test)),
        CornerCrop(opt.sample_size, opt.crop_position_in_test),
        ToTensor(opt.norm_value),
        norm_method,
    ])
    temporal_transform = LoopPadding(opt.sample_duration)
    target_transform = VideoID()

    test_data = get_test_set(opt, spatial_transform, temporal_transform,
                             target_transform)
    loader_options = {
        'batch_size': opt.batch_size,
        'shuffle': False,
        'num_workers': opt.n_threads,
        'pin_memory': True,
    }
    test_loader = torch.utils.data.DataLoader(test_data, **loader_options)
    test.test(test_loader, model, opt, test_data.class_names)