def run(gpu, config): cudnn.benchmark = True if config['distribute']: rank = config['rank'] * config['last_node_gpus'] + gpu print("world_size: {}, rank: {}".format(config['world_size'], rank)) dist.init_process_group(backend=config['backend'], init_method=config['ip'], world_size=config['world_size'], rank=rank) assert cudnn.enabled, "Amp requires cudnn backend to be enabled." torch.cuda.set_device(gpu) # create model model = AlexNet(10) # define loss function criterion = nn.CrossEntropyLoss() # define optimizer strategy optimizer = torch.optim.SGD(model.parameters(), config['lr'], momentum=config['momentum'], weight_decay=config['weight_decay']) # convert pytorch to apex model. apexparallel = ApexDistributeModel(model, criterion, optimizer, config, gpu) apexparallel.convert() apexparallel.lars() # load data data_path = '~/datasets/cifar10/train' train_set = LoadClassifyDataSets(data_path, 227) train_sampler = None if config['distribute']: train_sampler = distributed.DistributedSampler(train_set) train_loader = DataLoader(train_set, config['batch_size'], shuffle=(train_sampler is None), num_workers=config['num_workers'], pin_memory=True, sampler=train_sampler, collate_fn=collate_fn) for epo in range(config['epoch']): if config['distribute']: train_sampler.set_epoch(epo) # train for per epoch apexparallel.train(epo, train_loader)
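`run(gpu, config)` expects to be started once per local GPU, with `gpu` as the process's local device index. Below is a minimal launcher sketch using `torch.multiprocessing.spawn`; every value in the `config` dictionary (rendezvous address, node rank, GPU counts, and hyper-parameters) is a placeholder, and the key names simply mirror what `run()` reads above.

import torch.multiprocessing as mp

if __name__ == '__main__':
    # placeholder configuration; adjust the address, ranks and GPU counts to your cluster
    config = {
        'distribute': True,
        'backend': 'nccl',
        'ip': 'tcp://192.168.1.100:23456',  # rendezvous address of node 0
        'rank': 0,                          # index of this node
        'last_node_gpus': 2,                # GPUs per node, used to offset the global rank
        'world_size': 4,                    # total processes = nodes * GPUs per node
        'lr': 0.01,
        'momentum': 0.9,
        'weight_decay': 1e-4,
        'batch_size': 128,
        'num_workers': 4,
        'epoch': 100,
    }
    # spawn one training process per local GPU; each receives its local index as `gpu`
    mp.spawn(run, nprocs=config['last_node_gpus'], args=(config,))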
def run(gpu, config): cudnn.benchmark = True if config['distribute']: rank = config['rank'] * config['last_node_gpus'] + gpu print("world_size: {}, rank: {}".format(config['world_size'], rank)) dist.init_process_group(backend=config['backend'], init_method=config['ip'], world_size=config['world_size'], rank=rank) assert cudnn.enabled, "Amp requires cudnn backend to be enabled." # create model model = AlexNet(10) if config['sync_bn']: # synchronization batch normal model = apex.parallel.convert_syncbn_model(model) torch.cuda.set_device(gpu) model = model.cuda(gpu) # define loss function criterion = nn.CrossEntropyLoss().cuda(gpu) # define optimizer strategy optimizer = torch.optim.SGD(model.parameters(), config['lr'], momentum=config['momentum'], weight_decay=config['weight_decay']) # initialization apex model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O0') if config['distribute']: # model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu]) model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True) # load data data_path = '~/datasets/cifar10/train' train_set = LoadClassifyDataSets(data_path, 227) train_sampler = None if config['distribute']: train_sampler = distributed.DistributedSampler(train_set) train_loader = DataLoader(train_set, config['batch_size'], shuffle=(train_sampler is None), num_workers=config['num_workers'], pin_memory=True, sampler=train_sampler, collate_fn=collate_fn) for epo in range(config['epoch']): if config['distribute']: train_sampler.set_epoch(epo) # train for per epoch train(train_loader, model, criterion, optimizer, epo, gpu)
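The `train()` helper called at the end is not shown in this snippet. A hedged sketch of what it could look like follows; the one Apex-specific detail is wrapping `backward()` in `apex.amp.scale_loss`, so the loss scaling configured by `apex.amp.initialize` is applied (with `opt_level='O0'` this is effectively a no-op, but the same code works for 'O1'/'O2').

import apex

def train(train_loader, model, criterion, optimizer, epoch, gpu):
    model.train()
    for step, (inputs, target) in enumerate(train_loader):
        inputs = inputs.cuda(gpu, non_blocking=True)
        target = target.cuda(gpu, non_blocking=True)
        output = model(inputs)
        loss = criterion(output, target)
        optimizer.zero_grad()
        # let Amp scale the loss so fp16 gradients do not underflow
        with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        optimizer.step()
        if step % 10 == 0:
            print("epoch {}, step {}, loss {:.4f}".format(epoch, step, loss.item()))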
def main(is_distributed, rank, ip):
    world_size = 1
    if is_distributed:
        world_size = 2
        torch.distributed.init_process_group(backend='nccl', init_method=ip,
                                             world_size=world_size, rank=rank)
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
    print("Connected")

    # set hyper-parameters
    batch_size = 128
    lr = 0.01  # based on a batch size of 256
    momentum = 0.9
    weight_decay = 0.0001
    epochs = 100
    # linearly rescale the learning rate with the number of workers
    lr = lr * world_size

    # create model
    model = AlexNet(10)
    model = model.cuda()
    if is_distributed:
        # wrap the model for distributed training
        model = nn.parallel.DistributedDataParallel(model)
    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()
    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum,
                                weight_decay=weight_decay)

    # load train data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if is_distributed:
        train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set, batch_size, shuffle=(train_sampler is None),
                              num_workers=4, pin_memory=True, sampler=train_sampler,
                              collate_fn=collate_fn)

    for epoch in range(epochs):
        # make the distributed sampler shuffle differently every epoch
        if is_distributed:
            train_sampler.set_epoch(epoch)
        model.train()
        train_iter = iter(train_loader)
        inputs, target = next(train_iter)
        step = 0
        print("Epoch is {}".format(epoch))
        while inputs is not None:
            step += 1
            print("Step is {}".format(step))
            if not is_distributed:
                inputs = inputs.cuda()
            time_model_1 = time.time()
            output = model(inputs)
            time_model_2 = time.time()
            print("model time: {}".format(time_model_2 - time_model_1))
            time_loss_1 = time.time()
            loss = criterion(output, target.cuda())
            time_loss_2 = time.time()
            print("loss time: {}".format(time_loss_2 - time_loss_1))
            optimizer.zero_grad()
            time_back_1 = time.time()
            loss.backward()
            time_back_2 = time.time()
            print("back time: {}".format(time_back_2 - time_back_1))
            optimizer.step()
            if step % 10 == 0:
                print("loss is: {}".format(loss.item()))
            inputs, target = next(train_iter, (None, None))
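A hypothetical command-line entry point for this two-process setup is sketched below; the flag names, script name, and rendezvous address are illustrative and not part of the original code.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--distributed', action='store_true')
    parser.add_argument('--rank', type=int, default=0)
    parser.add_argument('--ip', type=str, default='tcp://192.168.1.100:23456')
    args = parser.parse_args()
    # e.g. on node 0: python train.py --distributed --rank 0 --ip tcp://<node0-ip>:23456
    #      on node 1: python train.py --distributed --rank 1 --ip tcp://<node0-ip>:23456
    main(args.distributed, args.rank, args.ip)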
def weight_init(m):
    if isinstance(m, nn.Conv2d):
        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
        m.weight.data.normal_(0, math.sqrt(2. / n))
    elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1)
        m.bias.data.zero_()


trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                        transform=transform_train)
trainloader = DataLoader(trainset, batch_size=100, shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True,
                                       transform=transform_test)
testloader = DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

n_output = 10
net = AlexNet(10)

# use the GPU if one is available
if use_cuda:
    # move parameters and buffers to the GPU
    net.cuda()
    # run data-parallel across all visible GPUs
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    # let cudnn pick the fastest convolution algorithms (slight speed-up)
    cudnn.benchmark = True

# define the loss and the optimizer
criterion = nn.CrossEntropyLoss()  # cross-entropy loss
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)  # stochastic gradient descent
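The block above only builds the data loaders, model, loss, and optimizer. Below is a minimal single-node training-loop sketch; it assumes the names defined above (`use_cuda`, `net`, `trainloader`, `criterion`, `optimizer`) and also applies the `weight_init` function, which the original snippet defines but never calls. The epoch count is illustrative.

net.apply(weight_init)  # initialize the Conv2d / BatchNorm2d layers as defined above
for epoch in range(10):  # illustrative number of epochs
    net.train()
    running_loss = 0.0
    for inputs, targets in trainloader:
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print("epoch {}: mean loss {:.4f}".format(epoch, running_loss / len(trainloader)))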
# read the test images
img = None
for i in range(1, 6):
    im = readimg('test/violent/' + str(i))
    if img is None:
        img = [im]
    else:
        img = np.append(img, [im], axis=0)

x = tf.placeholder(tf.float32, [5, IMAGE_SIZE, IMAGE_SIZE, 20])

# build the model
model = AlexNet(x, NUM_CLASSES)
score = model.fc8
with tf.name_scope('result') as scope:
    result = tf.argmax(tf.nn.softmax(score), 1)

saver = tf.train.Saver()
with tf.Session() as sess:
    # note the order: initialize all variables first, then restore the weights from the checkpoint
    sess.run(tf.global_variables_initializer())
    # restore saved weights
    ckpt = tf.train.get_checkpoint_state('check')
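    # The snippet ends right after looking up the checkpoint state. A possible
    # continuation inside the same tf.Session() block (a sketch: it assumes the
    # 'check' directory holds a checkpoint written by this same AlexNet graph)
    # would restore the weights and run inference:
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
    # classify the five stacked test samples
    pred = sess.run(result, feed_dict={x: img})
    print(pred)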
def main(is_distributed, sync_bn, rank):
    world_size = 1
    if is_distributed:
        world_size = 2
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='tcp://172.16.117.110:1234',
                                             world_size=world_size, rank=rank)
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    # set hyper-parameters
    batch_size = 30
    lr = 0.01  # based on a batch size of 256
    momentum = 0.9
    weight_decay = 0.0001
    epochs = 100
    # linearly rescale the learning rate with the number of workers
    lr = lr * world_size

    # create model
    model = AlexNet(10)
    # leverage Apex to synchronize batch normalization across GPUs
    # if sync_bn:
    #     model = apex.parallel.convert_syncbn_model(model)
    model = model.cuda()
    # define loss function
    criterion = nn.CrossEntropyLoss().cuda()
    # define optimizer strategy
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum,
                                weight_decay=weight_decay)
    # initialize Amp
    # model, optimizer = apex.amp.initialize(model, optimizer, opt_level='O0')
    if is_distributed:
        # wrap the model for distributed training
        # model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)
        model = nn.parallel.DistributedDataParallel(model)

    # load train data
    data_path = '~/datasets/cifar10/train'
    train_set = LoadClassifyDataSets(data_path, 227)
    train_sampler = None
    if is_distributed:
        train_sampler = distributed.DistributedSampler(train_set, world_size, rank=rank)
        # train_sampler = distributed.DistributedSampler(train_set)
    train_loader = DataLoader(train_set, batch_size, shuffle=(train_sampler is None),
                              num_workers=4, pin_memory=True, sampler=train_sampler,
                              collate_fn=collate_fn)

    for epoch in range(epochs):
        # make the distributed sampler shuffle differently every epoch
        if is_distributed:
            train_sampler.set_epoch(epoch)
        model.train()
        train_iter = iter(train_loader)
        inputs, target = next(train_iter)
        step = 0
        print("Epoch is {}".format(epoch))
        while inputs is not None:
            step += 1
            # step-by-step trace prints, useful for locating where a distributed run stalls
            print("test0")
            temp = inputs.cuda()
            print("test01")
            output = model(temp)
            print("test1")
            loss = criterion(output, target.cuda())
            print("test2")
            optimizer.zero_grad()
            print("test3")
            loss.backward()
            print("test4")
            # with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
            #     scaled_loss.backward()
            optimizer.step()
            print("test5")
            if step % 10 == 0:
                print("loss is : ", loss.item())
            inputs, target = next(train_iter, (None, None))