def main(config, cuda, gpu):
    # Configuration
    CONFIG = Dict(yaml.load(open(config), Loader=yaml.FullLoader))

    # CUDA check
    cuda = cuda and torch.cuda.is_available()
    gpu_ids = None
    if cuda:
        gpu_ids = [int(string) for string in gpu.split(',')]
        current_device = torch.cuda.current_device()
        print('Running on', torch.cuda.get_device_name(current_device), gpu_ids)

    # Dataset
    dataset = CocoStuff10k(
        root=CONFIG.ROOT,
        split='train',
        image_size=513,
        crop_size=CONFIG.IMAGE.SIZE.TRAIN,
        scale=True,
        flip=True,
    )

    # DataLoader
    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=CONFIG.BATCH_SIZE,
        num_workers=CONFIG.NUM_WORKERS,
        shuffle=True,
    )
    loader_iter = iter(loader)

    # Model
    model = DeepLabV2_ResNet101_MSC(n_classes=CONFIG.N_CLASSES)
    state_dict = torch.load(CONFIG.INIT_MODEL)
    model.load_state_dict(state_dict, strict=False)  # Skip "aspp" layer
    model = nn.DataParallel(model, device_ids=gpu_ids)
    if cuda:
        model.cuda()

    # Optimizer
    optimizer = {
        'sgd': torch.optim.SGD(
            # cf lr_mult and decay_mult in train.prototxt
            params=[
                {
                    'params': get_lr_params(model.module, key='1x'),
                    'lr': CONFIG.LR,
                    'weight_decay': CONFIG.WEIGHT_DECAY,
                },
                {
                    'params': get_lr_params(model.module, key='10x'),
                    'lr': 10 * CONFIG.LR,
                    'weight_decay': CONFIG.WEIGHT_DECAY,
                },
                {
                    'params': get_lr_params(model.module, key='20x'),
                    'lr': 20 * CONFIG.LR,
                    'weight_decay': 0.0,
                },
            ],
            momentum=CONFIG.MOMENTUM,
        ),
    }.get(CONFIG.OPTIMIZER)

    # Loss definition
    criterion = CrossEntropyLoss2d(ignore_index=CONFIG.IGNORE_LABEL)
    if cuda:
        criterion.cuda()

    # TensorBoard Logger
    writer = SummaryWriter(CONFIG.LOG_DIR)
    loss_meter = MovingAverageValueMeter(20)

    model.train()
    model.module.scale.freeze_bn()

    for iteration in tqdm(
            range(1, CONFIG.ITER_MAX + 1),
            total=CONFIG.ITER_MAX,
            leave=False,
            dynamic_ncols=True,
    ):
        # Set a learning rate
        poly_lr_scheduler(
            optimizer=optimizer,
            init_lr=CONFIG.LR,
            iter=iteration - 1,
            lr_decay_iter=CONFIG.LR_DECAY,
            max_iter=CONFIG.ITER_MAX,
            power=CONFIG.POLY_POWER,
        )

        # Clear gradients (ready to accumulate)
        optimizer.zero_grad()

        iter_loss = 0
        for i in range(1, CONFIG.ITER_SIZE + 1):
            data, target = next(loader_iter)

            # Image
            data = data.cuda() if cuda else data
            data = Variable(data)

            # Propagate forward
            outputs = model(data)

            # Loss
            loss = 0
            for output in outputs:
                # Resize target for {100%, 75%, 50%, Max} outputs
                target_ = resize_target(target, output.size(2))
                target_ = target_.cuda() if cuda else target_
                target_ = Variable(target_)
                # Compute crossentropy loss
                loss += criterion(output, target_)

            # Backpropagate (just compute gradients wrt the loss)
            loss /= float(CONFIG.ITER_SIZE)
            loss.backward()
            iter_loss += loss.item()

            # Reload dataloader
            if ((iteration - 1) * CONFIG.ITER_SIZE + i) % len(loader) == 0:
                loader_iter = iter(loader)

        loss_meter.add(iter_loss)

        # Update weights with accumulated gradients
        optimizer.step()

        # TensorBoard
        if iteration % CONFIG.ITER_TF == 0:
            writer.add_scalar('train_loss', loss_meter.value()[0], iteration)
            for i, o in enumerate(optimizer.param_groups):
                writer.add_scalar('train_lr_group{}'.format(i), o['lr'], iteration)
            if iteration % 1000 != 0:
                continue
            for name, param in model.named_parameters():
                name = name.replace('.', '/')
                writer.add_histogram(name, param, iteration, bins="auto")
                if param.requires_grad:
                    writer.add_histogram(name + '/grad', param.grad, iteration, bins="auto")

        # Save a model
        if iteration % CONFIG.ITER_SNAP == 0:
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.SAVE_DIR, 'checkpoint_{}.pth'.format(iteration)),
            )

        # Save a model
        if iteration % 100 == 0:
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.SAVE_DIR, 'checkpoint_current.pth'),
            )

    torch.save(
        model.module.state_dict(),
        osp.join(CONFIG.SAVE_DIR, 'checkpoint_final.pth'),
    )
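# The `poly_lr_scheduler` helper called above is not shown in this snippet.
# A minimal sketch of the usual "poly" policy, assuming the optimizer was built
# with its param groups in the (1x, 10x, 20x) order used above; the name and
# body here are illustrative, not the repository's actual implementation.
def poly_lr_scheduler_sketch(optimizer, init_lr, iter, lr_decay_iter, max_iter, power):
    if iter % lr_decay_iter != 0 or iter > max_iter:
        return
    new_lr = init_lr * (1 - iter / max_iter) ** power
    # Scale each group by the same multiplier it was created with (1x, 10x, 20x)
    for group, mult in zip(optimizer.param_groups, (1, 10, 20)):
        group['lr'] = mult * new_lr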
class Generator(nn.Module): def __init__(self, hyperparameters, log_dir, gpu_id): super(Generator, self).__init__() self.hyp = hyperparameters print(hyperparameters) self.gpu_id = gpu_id self.noise_dim = self.hyp['noise_dim'] self.vis_noise = torch.randn(1, self.hyp['noise_dim']).cuda( self.gpu_id).requires_grad_(False) self.g_loss_meter = MovingAverageValueMeter(5) self.log_dir = log_dir # Architecture: self.lab0 = nn.Linear(1, self.hyp['p1'], bias=False) self.fc0 = nn.Linear(self.noise_dim, self.hyp['p2'], bias=False) self.nonlin0 = nn.Sequential(*[ nn.BatchNorm2d(self.hyp['p1'] + self.hyp['p2']), nn.LeakyReLU(self.hyp['lrelu_g']) ] if self.hyp['bg0'] else [ nn.LeakyReLU(self.hyp['lrelu_g']), ]) self.conv1 = nn.ConvTranspose2d(self.hyp['p1'] + self.hyp['p2'], self.hyp['p3'], (1, 55), bias=True) self.nonlin1 = nn.Sequential(*[ nn.BatchNorm2d(self.hyp['p3']), nn.LeakyReLU(self.hyp['lrelu_g']) ] if self.hyp['bg1'] else [ nn.LeakyReLU(self.hyp['lrelu_g']), ]) self.conv2 = nn.ConvTranspose2d(self.hyp['p3'], 1, (55, 1), bias=True) self.sigmoid = nn.Tanh() self.cuda(self.gpu_id) opt_param_list = [{ 'params': [ param for name, param in self.named_parameters() if 'lab0' not in name ] }, { 'params': self.lab0.parameters(), 'lr': 1 * self.hyp['lr_g'] }] self.optimizer = torch.optim.Adam(opt_param_list, lr=self.hyp['lr_g'], betas=(self.hyp['b1_g'], self.hyp['b2_g']), weight_decay=self.hyp['wd_g']) # rand init for m in self.modules(): if isinstance(m, nn.ConvTranspose2d): torch.nn.init.kaiming_normal_(m.weight, a=self.hyp['lrelu_g'], nonlinearity='leaky_relu') if not m.bias is None: torch.nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): torch.nn.init.kaiming_normal_(m.weight) if not m.bias is None: torch.nn.init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): torch.nn.init.constant_(m.weight, 1) torch.nn.init.constant_(m.bias, 0) def forward(self, z, labels): x = z.view(-1, self.noise_dim) labels = labels.view(-1, 1).float() * 2 - 1 x = torch.cat([ self.fc0(x).view(-1, self.hyp['p2'], 1, 1), self.lab0(labels).view(-1, self.hyp['p1'], 1, 1) ], 1) x = self.nonlin0(x) x = self.conv1(x) x = self.nonlin1(x) x = self.conv2(x) x = (x + torch.transpose(x, -1, -2)) / 2 x = self.sigmoid(x) return x def train_step(self, netd): self.zero_grad() self.fake_labels = torch.randint(0, 2, (self.hyp['batch_size'], )).type( torch.float).cuda(self.gpu_id) self.noise = torch.randn(self.hyp['batch_size'], self.hyp['noise_dim']).cuda(self.gpu_id) self.g = self(self.noise, self.fake_labels) self.g_cost = -netd(self.g, self.fake_labels).mean() self.g_cost.backward() self.optimizer.step() self.g_loss_meter.add(self.g_cost.detach().cpu()) def generate_fake_images(self, num_images): self.eval() labels = (torch.randint(0, 2, (num_images, ))).type(torch.long).cuda( self.gpu_id) noise = torch.randn(num_images, self.hyp['noise_dim']).cuda( self.gpu_id).requires_grad_(False) images = self(noise, labels).detach() self.train() return (images, labels) def visualize_gen_images(self, global_step): """ Saves sample of generated images to a eps and png file. Note that the noise input of the generator for visualizing is the same during training. 
:param global_step: :return: """ self.eval() noise = torch.cat([self.vis_noise, self.vis_noise], 0) labels = (torch.from_numpy(np.array([0, 1]))).type(torch.long).view( -1, 1).cuda(self.gpu_id).requires_grad_(False) samples = self(noise, labels) i = str(global_step) os.makedirs(os.path.join(self.log_dir, 'vis_imgs'), exist_ok=True) filename = os.path.join(self.log_dir, 'vis_imgs', 'gen_img_it_' + i) b, chs, h, w = samples.shape imgs = samples.view(b, h, w).detach().cpu().data.numpy() np.save(filename + '.npy', imgs) labels = labels.view(b).detach().cpu().data.numpy() fig = plt.figure() for i in range(b): plt.subplot(1, 2, i + 1) plt.imshow(imgs[i], cmap='jet', interpolation='nearest', vmin=-1, vmax=1) plt.title('Sex: ' + ['Female', 'Male'][labels[i]]) plt.axis('off') plt.savefig(filename + '.eps') plt.savefig(filename + '.png') plt.close() self.train()
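# Why the Generator's forward() averages its output with its own transpose:
# that step makes the (55, 55) map symmetric before the final Tanh. A small
# self-contained check of the same operation (the tensor here is a stand-in
# for the conv2 output, not data from this project):
import torch

x = torch.randn(2, 1, 55, 55)                           # stand-in pre-activation maps
x_sym = (x + torch.transpose(x, -1, -2)) / 2            # same op as in forward()
assert torch.allclose(x_sym, x_sym.transpose(-1, -2))   # output is symmetric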
def main(config, cuda): device = torch.device("cuda" if cuda and torch.cuda.is_available() else "cpu") if cuda: current_device = torch.cuda.current_device() print("Running on", torch.cuda.get_device_name(current_device)) else: print("Running on CPU") # Configuration CONFIG = Dict(yaml.load(open(config))) # Dataset dataset = CocoStuff10k( root=CONFIG.ROOT, split="train", image_size=513, crop_size=CONFIG.IMAGE.SIZE.TRAIN, scale=True, flip=True, ) # DataLoader loader = torch.utils.data.DataLoader( dataset=dataset, batch_size=CONFIG.BATCH_SIZE, num_workers=CONFIG.NUM_WORKERS, shuffle=True, ) loader_iter = iter(loader) # Model model = DeepLabV2_ResNet101_MSC(n_classes=CONFIG.N_CLASSES) state_dict = torch.load(CONFIG.INIT_MODEL) model.load_state_dict(state_dict, strict=False) # Skip "aspp" layer model = nn.DataParallel(model) model.to(device) # Optimizer optimizer = { "sgd": torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[ { "params": get_lr_params(model.module, key="1x"), "lr": CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_lr_params(model.module, key="10x"), "lr": 10 * CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_lr_params(model.module, key="20x"), "lr": 20 * CONFIG.LR, "weight_decay": 0.0, }, ], momentum=CONFIG.MOMENTUM, ) }.get(CONFIG.OPTIMIZER) # Loss definition criterion = CrossEntropyLoss2d(ignore_index=CONFIG.IGNORE_LABEL) criterion.to(device) # TensorBoard Logger writer = SummaryWriter(CONFIG.LOG_DIR) loss_meter = MovingAverageValueMeter(20) model.train() model.module.scale.freeze_bn() for iteration in tqdm( range(1, CONFIG.ITER_MAX + 1), total=CONFIG.ITER_MAX, leave=False, dynamic_ncols=True, ): # Set a learning rate poly_lr_scheduler( optimizer=optimizer, init_lr=CONFIG.LR, iter=iteration - 1, lr_decay_iter=CONFIG.LR_DECAY, max_iter=CONFIG.ITER_MAX, power=CONFIG.POLY_POWER, ) # Clear gradients (ready to accumulate) optimizer.zero_grad() iter_loss = 0 for i in range(1, CONFIG.ITER_SIZE + 1): try: data, target = next(loader_iter) except: loader_iter = iter(loader) data, target = next(loader_iter) # Image data = data.to(device) # Propagate forward outputs = model(data) # Loss loss = 0 for output in outputs: # Resize target for {100%, 75%, 50%, Max} outputs target_ = resize_target(target, output.size(2)) target_ = target_.to(device) # Compute crossentropy loss loss += criterion(output, target_) # Backpropagate (just compute gradients wrt the loss) loss /= float(CONFIG.ITER_SIZE) loss.backward() iter_loss += float(loss) loss_meter.add(iter_loss) # Update weights with accumulated gradients optimizer.step() # TensorBoard if iteration % CONFIG.ITER_TF == 0: writer.add_scalar("train_loss", loss_meter.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): writer.add_scalar("train_lr_group{}".format(i), o["lr"], iteration) # for name, param in model.named_parameters(): # name = name.replace('.', '/') # writer.add_histogram(name, param, iteration, bins="auto") # if param.requires_grad: # writer.add_histogram(name + '/grad', param.grad, iteration, bins="auto") # Save a model if iteration % CONFIG.ITER_SNAP == 0: torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, "checkpoint_{}.pth".format(iteration)), ) # Save a model if iteration % 100 == 0: torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, "checkpoint_current.pth"), ) torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, "checkpoint_final.pth") )
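# The bare `except:` around next(loader_iter) above also swallows unrelated
# errors (e.g. a crashed DataLoader worker). A narrower sketch of the same
# "restart the iterator when the epoch ends" pattern, only catching exhaustion:
def next_batch(loader, loader_iter):
    try:
        batch = next(loader_iter)
    except StopIteration:          # only catch exhaustion, re-raise real errors
        loader_iter = iter(loader)
        batch = next(loader_iter)
    return batch, loader_iter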
def generateTB(self, period):
    self.writer = SummaryWriter(self.savepath + '/runs')
    self.loss_meter = MovingAverageValueMeter(20)
    self.tb = period
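# MovingAverageValueMeter (from torchnet.meter) keeps a sliding window of the
# last `windowsize` values added to it; value()[0] is the mean over that window,
# which is what the logging code in these scripts reads. Minimal usage example:
from torchnet.meter import MovingAverageValueMeter

meter = MovingAverageValueMeter(windowsize=3)
for v in (1.0, 2.0, 3.0, 4.0):
    meter.add(v)
window_mean = meter.value()[0]   # mean of the last 3 values added -> 3.0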
train_file = "train_loss.txt"
test_file = "test_loss.txt"
save_file = "loss_curve.pdf"

train_f = open(train_file)
train_d = train_f.readlines()
train_f.close()

valid_f = open(test_file)
valid_d = valid_f.readlines()
valid_f.close()

# Smooth the training curve with a 500-sample moving average
train_iter = []
train_loss = []
i = 0
ma_loss = MovingAverageValueMeter(windowsize=500)
for s in train_d:
    i = i + 1
    t = s.strip().split(' ')
    t_iter = int(t[0])
    ma_loss.add(float(t[1]))
    if i % 500 == 0:
        train_iter.append(t_iter)
        train_loss.append(ma_loss.value()[0])

valid_iter = []
valid_loss = []
i = 0
for s in valid_d:
    i = i + 1
    if i >= 0:
def train(self): torch.cuda.empty_cache() ###################### # Save / Load model ###################### if self.opt.continue_train: try: self.continue_from_latest_checkpoint() except CyganException as e: self.logger.error(e) self.opt.continue_train = False self.reset_save() else: self.reset_save() self.add_file_logger() ###################### # Dataset ###################### if self.opt.model == 'base': dataset = SteelyDataset(self.opt.genreA, self.opt.genreB, self.opt.phase, use_mix=False) else: dataset = SteelyDataset(self.opt.genreA, self.opt.genreB, self.opt.phase, use_mix=True) dataset_size = len(dataset) iter_num = int(dataset_size / self.opt.batch_size) self.logger.info( f'Dataset loaded, genreA: {self.opt.genreA}, genreB: {self.opt.genreB}, total size: {dataset_size}.' ) ###################### # Initiate ###################### lambda_A = 10.0 # weight for cycle loss (A -> B -> A^) lambda_B = 10.0 # weight for cycle loss (B -> A -> B^) lambda_identity = 0.5 criterionGAN = GANLoss(gan_mode='lsgan') criterionCycle = nn.L1Loss() criterionIdt = nn.L1Loss() GLoss_meter = MovingAverageValueMeter(self.opt.plot_every) DLoss_meter = MovingAverageValueMeter(self.opt.plot_every) CycleLoss_meter = MovingAverageValueMeter(self.opt.plot_every) # loss meters losses = {} scores = {} losses_dict = {'loss_G': [], 'loss_D': [], 'loss_C': [], 'epoch': []} ###################### # Start Training ###################### for epoch in range(self.opt.start_epoch, self.opt.max_epoch): loader = DataLoader(dataset, batch_size=self.opt.batch_size, shuffle=True, num_workers=self.opt.num_threads, drop_last=True) epoch_start_time = time.time() for i, data in enumerate(loader): real_A = torch.unsqueeze(data[:, 0, :, :], 1).to(self.device, dtype=torch.float) real_B = torch.unsqueeze(data[:, 1, :, :], 1).to(self.device, dtype=torch.float) gaussian_noise = torch.abs( torch.normal(mean=torch.zeros(self.opt.data_shape), std=self.opt.gaussian_std)).to( self.device, dtype=torch.float) if self.opt.model == 'base': ###################### # Generator ###################### fake_B = self.generator_A2B(real_A) # X -> Y' fake_A = self.generator_B2A(real_B) # Y -> X' fake_B_copy = copy.copy(fake_B) fake_A_copy = copy.copy(fake_A) DB_fake = self.discriminator_B( fake_B + gaussian_noise) # netD_x provide feedback to netG_x DA_fake = self.discriminator_A(fake_A + gaussian_noise) loss_G_A2B = criterionGAN(DB_fake, True) loss_G_B2A = criterionGAN(DA_fake, True) # cycle_consistence cycle_A = self.generator_B2A(fake_B) # Y' -> X^ cycle_B = self.generator_A2B(fake_A) # Y -> X' -> Y^ loss_cycle_A2B = criterionCycle(cycle_A, real_A) * lambda_A loss_cycle_B2A = criterionCycle(cycle_B, real_B) * lambda_B # identity loss if lambda_identity > 0: # netG_x should be identity if real_y is fed: ||netG_x(real_y) - real_y|| idt_A = self.generator_A2B(real_B) idt_B = self.generator_B2A(real_A) loss_idt_A = criterionIdt( idt_A, real_B) * lambda_A * lambda_identity loss_idt_B = criterionIdt( idt_B, real_A) * lambda_A * lambda_identity else: loss_idt_A = 0. loss_idt_B = 0. 
loss_idt = loss_idt_A + loss_idt_B self.GA2B_optimizer.zero_grad( ) # set g_x and g_y gradients to zero loss_A2B = loss_G_A2B + loss_cycle_A2B + loss_idt_A loss_A2B.backward(retain_graph=True) self.GA2B_optimizer.step() self.GB2A_optimizer.zero_grad( ) # set g_x and g_y gradients to zero loss_B2A = loss_G_B2A + loss_cycle_B2A + loss_idt_B loss_B2A.backward(retain_graph=True) self.GB2A_optimizer.step() cycle_loss = loss_cycle_A2B + loss_cycle_B2A CycleLoss_meter.add(cycle_loss.item()) loss_G = loss_G_A2B + loss_G_B2A + loss_idt GLoss_meter.add(loss_G.item()) ###################### # Sample ###################### fake_A_sample, fake_B_sample = (None, None) if self.opt.use_image_pool: [fake_A_sample, fake_B_sample] = self.pool([fake_A_copy, fake_B_copy]) ###################### # Discriminator ###################### # loss_real DA_real = self.discriminator_A(real_A + gaussian_noise) DB_real = self.discriminator_B(real_B + gaussian_noise) loss_DA_real = criterionGAN(DA_real, True) loss_DB_real = criterionGAN(DB_real, True) # loss fake if self.opt.use_image_pool: DA_fake_sample = self.discriminator_A(fake_A_sample + gaussian_noise) DB_fake_sample = self.discriminator_B(fake_B_sample + gaussian_noise) loss_DA_fake = criterionGAN(DA_fake_sample, False) loss_DB_fake = criterionGAN(DB_fake_sample, False) else: loss_DA_fake = criterionGAN(DA_fake, False) loss_DB_fake = criterionGAN(DB_fake, False) # loss and backward self.DA_optimizer.zero_grad() loss_DA = (loss_DA_real + loss_DA_fake) * 0.5 loss_DA.backward() self.DA_optimizer.step() self.DB_optimizer.zero_grad() loss_DB = (loss_DB_real + loss_DB_fake) * 0.5 loss_DB.backward() self.DB_optimizer.step() loss_D = loss_DA + loss_DB DLoss_meter.add(loss_D.item()) else: real_mixed = torch.unsqueeze(data[:, 2, :, :], 1).to(self.device, dtype=torch.float) ###################### # Generator ###################### fake_B = self.generator_A2B(real_A) # X -> Y' fake_A = self.generator_B2A(real_B) # Y -> X' fake_B_copy = fake_B.detach().clone() fake_A_copy = fake_A.detach().clone() DB_fake = self.discriminator_B( fake_B + gaussian_noise) # netD_x provide feedback to netG_x DA_fake = self.discriminator_A(fake_A + gaussian_noise) loss_G_A2B = criterionGAN(DB_fake, True) loss_G_B2A = criterionGAN(DA_fake, True) # cycle_consistence cycle_A = self.generator_B2A(fake_B) # Y' -> X^ cycle_B = self.generator_A2B(fake_A) # Y -> X' -> Y^ loss_cycle_A2B = criterionCycle(cycle_A, real_A) * lambda_A loss_cycle_B2A = criterionCycle(cycle_B, real_B) * lambda_B # identity loss if lambda_identity > 0: # netG_x should be identity if real_y is fed: ||netG_x(real_y) - real_y|| idt_A = self.generator_A2B(real_B) idt_B = self.generator_B2A(real_A) loss_idt_A = criterionIdt( idt_A, real_B) * lambda_A * lambda_identity loss_idt_B = criterionIdt( idt_B, real_A) * lambda_A * lambda_identity else: loss_idt_A = 0. loss_idt_B = 0. 
loss_idt = loss_idt_A + loss_idt_B self.GA2B_optimizer.zero_grad( ) # set g_x and g_y gradients to zero loss_A2B = loss_G_A2B + loss_cycle_A2B + loss_idt_A loss_A2B.backward(retain_graph=True) self.GA2B_optimizer.step() self.GB2A_optimizer.zero_grad( ) # set g_x and g_y gradients to zero loss_B2A = loss_G_B2A + loss_cycle_B2A + loss_idt_B loss_B2A.backward(retain_graph=True) self.GB2A_optimizer.step() cycle_loss = loss_cycle_A2B + loss_cycle_B2A CycleLoss_meter.add(cycle_loss.item()) loss_G = loss_G_A2B + loss_G_B2A + loss_idt GLoss_meter.add(loss_G.item()) ###################### # Sample ###################### fake_A_sample, fake_B_sample = (None, None) if self.opt.use_image_pool: [fake_A_sample, fake_B_sample] = self.pool([fake_A_copy, fake_B_copy]) ###################### # Discriminator ###################### # loss_real DA_real = self.discriminator_A(real_A + gaussian_noise) DB_real = self.discriminator_B(real_B + gaussian_noise) DA_real_all = self.discriminator_A_all(real_mixed + gaussian_noise) DB_real_all = self.discriminator_B_all(real_mixed + gaussian_noise) loss_DA_real = criterionGAN(DA_real, True) loss_DB_real = criterionGAN(DB_real, True) loss_DA_all_real = criterionGAN(DA_real_all, True) loss_DB_all_real = criterionGAN(DB_real_all, True) # loss fake if self.opt.use_image_pool: DA_fake_sample = self.discriminator_A(fake_A_sample + gaussian_noise) DB_fake_sample = self.discriminator_B(fake_B_sample + gaussian_noise) DA_fake_sample_all = self.discriminator_A_all( fake_A_sample + gaussian_noise) DB_fake_sample_all = self.discriminator_B_all( fake_B_sample + gaussian_noise) loss_DA_all_fake = criterionGAN( DA_fake_sample_all, False) loss_DB_all_fake = criterionGAN( DB_fake_sample_all, False) loss_DA_fake = criterionGAN(DA_fake_sample, False) loss_DB_fake = criterionGAN(DB_fake_sample, False) else: DA_fake_all = self.discriminator_A_all(fake_A_copy + gaussian_noise) DB_fake_all = self.discriminator_B_all(fake_B_copy + gaussian_noise) loss_DA_all_fake = criterionGAN(DA_fake_all, False) loss_DB_all_fake = criterionGAN(DB_fake_all, False) loss_DA_fake = criterionGAN(DA_fake, False) loss_DB_fake = criterionGAN(DB_fake, False) # loss and backward self.DA_optimizer.zero_grad() loss_DA = (loss_DA_real + loss_DA_fake) * 0.5 loss_DA.backward() self.DA_optimizer.step() self.DB_optimizer.zero_grad() loss_DB = (loss_DB_real + loss_DB_fake) * 0.5 loss_DB.backward() self.DB_optimizer.step() self.DA_all_optimizer.zero_grad() loss_DA_all = (loss_DA_all_real + loss_DA_all_fake) * 0.5 loss_DA_all.backward() self.DA_all_optimizer.step() self.DB_all_optimizer.zero_grad() loss_DB_all = (loss_DB_all_real + loss_DB_all_fake) * 0.5 loss_DB_all.backward() self.DB_all_optimizer.step() loss_D = loss_DA + loss_DB + loss_DB_all + loss_DA_all DLoss_meter.add(loss_D.item()) ###################### # Snapshot ###################### if i % self.opt.plot_every == 0: file_name = self.opt.name + '_snap_%03d_%05d.png' % ( epoch, i, ) # test_path = os.path.join(self.opt.checkpoint_path, file_name) # tv.utils.save_image(fake_B, test_path, normalize=True) # self.logger.info(f'Snapshot {file_name} saved.') losses['loss_C'] = float(CycleLoss_meter.value()[0]) losses['loss_G'] = float(GLoss_meter.value()[0]) losses['loss_D'] = float(DLoss_meter.value()[0]) self.logger.info(str(losses)) self.logger.info('Epoch {} progress: {:.2%}\n'.format( epoch, i / iter_num)) # save model if epoch % self.opt.save_every == 0 or epoch == self.opt.max_epoch - 1: self.save_model(epoch) ###################### # lr_scheduler 
###################### self.GA2B_scheduler.step(epoch) self.GB2A_scheduler.step(epoch) self.DA_scheduler.step(epoch) self.DB_scheduler.step(epoch) if self.opt.model != 'base': self.DA_all_scheduler.step(epoch) self.DB_all_scheduler.step(epoch) epoch_time = int(time.time() - epoch_start_time) ###################### # Logging ###################### self.logger.info( f'Epoch {epoch} finished, cost time {epoch_time}\n') self.logger.info(str(losses) + '\n\n') ###################### # Loss_Dict ###################### losses_dict['loss_C'].append(losses['loss_C']) losses_dict['loss_G'].append(losses['loss_G']) losses_dict['loss_D'].append(losses['loss_D']) losses_dict['epoch'].append(epoch) with open(self.opt.loss_save_path, 'w') as f: json.dump(losses_dict, f)
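# GANLoss(gan_mode='lsgan') is defined elsewhere in this project. A minimal
# sketch of what an LSGAN criterion of this shape typically does (MSE against
# a target tensor of ones or zeros), for reference only, not the project's class:
import torch
import torch.nn as nn

class GANLossSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.loss = nn.MSELoss()

    def forward(self, prediction, target_is_real):
        # Build an all-ones or all-zeros target matching the discriminator output
        target = torch.ones_like(prediction) if target_is_real else torch.zeros_like(prediction)
        return self.loss(prediction, target)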
def train(**kwargs):
    opt._parse(kwargs)

    image_folder_path = 'DataSets/images/'
    csv_file_path = 'DataSets/labels.csv'
    dataset = DataSets(csv_file_path, image_folder_path)

    # 80/20 train/validation split over shuffled indices
    data_size = len(dataset)
    indices = list(range(data_size))
    split = int(np.floor(data_size * 0.2))
    np.random.seed(42)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_sampler = torch.utils.data.SubsetRandomSampler(train_indices)
    valid_sampler = torch.utils.data.SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=1, sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(dataset, batch_size=1, sampler=valid_sampler)
    print('load data')

    avg_loss = AverageValueMeter()
    ma20_loss = MovingAverageValueMeter(windowsize=20)

    faster_rcnn = FasterRCNNVGG16()
    print('model construct completed')

    start_epoch = 0
    best_map = -100
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    optimizer = optim.SGD(trainer.faster_rcnn.parameters(), lr=opt.lr, momentum=0.9)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

    if opt.load_path:
        print('load pretrained model from %s' % opt.load_path)
        checkpoint = torch.load(opt.load_path)
        start_epoch = checkpoint['epoch']
        best_map = checkpoint['best_map']
        trainer.faster_rcnn.load_state_dict(checkpoint['model_state'])
        optimizer.load_state_dict(checkpoint['optimizer_state'])
        print("> Loaded checkpoint '{}' (epoch {})".format(opt.load_path, start_epoch))

    # trainer.vis.text(dataset.db.label_names, win='labels')

    # set tensor-board for visualization
    writer = SummaryWriter('runs/' + opt.log_root)

    for epoch in range(start_epoch, opt.epoch):
        trainer.train(mode=True)  # must be set like this during training
        for ii, (img, _, _, bbox_, label_, scale, _) in enumerate(train_loader):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()

            optimizer.zero_grad()
            loss = trainer.forward(img, bbox, label, scale)
            loss.total_loss.backward()
            optimizer.step()

            # print(loss)
            # print(loss.total_loss)
            loss_value = loss.total_loss.cpu().data.numpy()
            avg_loss.add(float(loss_value))
            ma20_loss.add(float(loss_value))
            print('[epoch:{}/{}] [batch:{}/{}] [sample_loss:{:.4f}] [avg_loss:{:.4f}] [ma20_loss:{:.4f}]'
                  .format(epoch, opt.epoch, ii + 1, len(train_loader),
                          loss.total_loss.data, avg_loss.value()[0], ma20_loss.value()[0]))

            if (ii + 1) % opt.plot_every == 0:
                niter = epoch * len(train_loader) + ii
                writer.add_scalar('Train/Loss', ma20_loss.value()[0], niter)

        eval_result = eval(val_loader, faster_rcnn, test_num=opt.test_num)
        print(eval_result['map'])
        if eval_result['map'] > best_map:
            best_map = eval_result['map']
            state = {
                "epoch": epoch + 1,
                "best_map": best_map,
                "model_state": trainer.faster_rcnn.state_dict(),
                "optimizer_state": optimizer.state_dict()
            }
            torch.save(state, opt.model_para)

        scheduler.step()

    state = {
        "epoch": epoch + 1,
        "best_map": best_map,
        "model_state": trainer.faster_rcnn.state_dict(),
        "optimizer_state": optimizer.state_dict()
    }
    torch.save(state, 'last_epoch.pkl')
    writer.close()
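# The 80/20 split above relies on index shuffling plus SubsetRandomSampler.
# The same pattern in isolation, with a toy dataset in place of DataSets:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, SubsetRandomSampler

toy_dataset = TensorDataset(torch.arange(10).float())
toy_indices = list(range(len(toy_dataset)))
toy_split = int(np.floor(len(toy_dataset) * 0.2))
np.random.seed(42)
np.random.shuffle(toy_indices)
toy_train_idx, toy_val_idx = toy_indices[toy_split:], toy_indices[:toy_split]

toy_train_loader = DataLoader(toy_dataset, batch_size=2, sampler=SubsetRandomSampler(toy_train_idx))
toy_val_loader = DataLoader(toy_dataset, batch_size=2, sampler=SubsetRandomSampler(toy_val_idx))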
def __init__(self, hyperparameters, gpu_id):
    super(Discriminator, self).__init__()
    self.hyp = hyperparameters
    self.gpu_id = gpu_id
    self.w_loss_meter = MovingAverageValueMeter(5)
    self.d_loss_meter = MovingAverageValueMeter(5)
    self.r_loss_meter = MovingAverageValueMeter(5)
    self.f_loss_meter = MovingAverageValueMeter(5)
    self.gp_loss_meter = MovingAverageValueMeter(5)

    # Architecture
    self.lab0 = nn.ConvTranspose2d(1, self.hyp['q1'], (1, 55), bias=False)
    self.conv0 = nn.Conv2d(1, self.hyp['q2'], (55, 1), bias=False)
    self.nonlin0 = nn.Sequential(*[
        nn.BatchNorm2d(self.hyp['q1'] + self.hyp['q2']),
        nn.LeakyReLU(self.hyp['lrelu_d'])
    ] if self.hyp['bd0'] else [
        nn.LeakyReLU(self.hyp['lrelu_d']),
    ])
    self.conv1 = nn.Conv2d(self.hyp['q1'] + self.hyp['q2'],
                           self.hyp['q3'], (1, 55), bias=False)
    self.nonlin1 = nn.Sequential(*[
        nn.BatchNorm2d(self.hyp['q3']),
        nn.LeakyReLU(self.hyp['lrelu_d'])
    ] if self.hyp['bd1'] else [
        nn.LeakyReLU(self.hyp['lrelu_d']),
    ])
    self.fc = nn.Linear(self.hyp['q3'], 1, bias=False)
    self.cuda(self.gpu_id)

    # The label branch (lab0) gets its own parameter group
    opt_param_list = [{
        'params': [
            param for name, param in self.named_parameters()
            if 'lab0' not in name
        ]
    }, {
        'params': self.lab0.parameters(),
        'lr': 1 * self.hyp['lr_d']
    }]
    self.optimizer = torch.optim.Adam(opt_param_list,
                                      lr=self.hyp['lr_d'],
                                      betas=(self.hyp['b1_d'], self.hyp['b2_d']),
                                      weight_decay=self.hyp['wd_d'])

    # rand init
    for m in self.modules():
        if isinstance(m, nn.ConvTranspose2d):
            torch.nn.init.kaiming_normal_(m.weight,
                                          a=self.hyp['lrelu_d'],
                                          nonlinearity='leaky_relu')
            if m.bias is not None:
                torch.nn.init.constant_(m.bias, 0)
        if isinstance(m, nn.Conv2d):
            torch.nn.init.kaiming_normal_(m.weight,
                                          a=self.hyp['lrelu_d'],
                                          nonlinearity='leaky_relu')
            if m.bias is not None:
                torch.nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            torch.nn.init.constant_(m.weight, 1)
            torch.nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Linear):
            torch.nn.init.kaiming_normal_(m.weight)
            if m.bias is not None:
                torch.nn.init.constant_(m.bias, 0)
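# The w_loss / gp_loss meters above suggest a WGAN-GP objective, but the
# penalty term itself is not part of this snippet. A standard gradient-penalty
# sketch for a critic `netd(x, labels)` -- an illustration, not this class's code:
import torch

def gradient_penalty_sketch(netd, real, fake, labels, lambda_gp=10.0):
    # Random interpolation point between real and fake samples (per example)
    alpha = torch.rand(real.size(0), 1, 1, 1, device=real.device)
    interp = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    d_out = netd(interp, labels)
    grads = torch.autograd.grad(outputs=d_out, inputs=interp,
                                grad_outputs=torch.ones_like(d_out),
                                create_graph=True, retain_graph=True)[0]
    # Penalize deviation of the gradient norm from 1
    return lambda_gp * ((grads.view(grads.size(0), -1).norm(2, dim=1) - 1) ** 2).mean()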
save_dir, 'models', modelName + '_epoch-' + str(resume_epoch - 1) + '.pth'), map_location=lambda storage, loc: storage) ) # Load all tensors onto the CPU if gpu_id >= 0: torch.cuda.set_device(device=gpu_id) net.cuda() if resume_epoch != nEpochs: # Logging into Tensorboard log_dir = os.path.join( save_dir, 'models', datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname()) writer = SummaryWriter(log_dir=log_dir) loss_meter = MovingAverageValueMeter(20) # Use the following optimizer optimizer = optim.SGD(net.parameters(), lr=p['lr'], momentum=p['momentum'], weight_decay=p['wd']) p['optimizer'] = str(optimizer) composed_transforms_tr = transforms.Compose([ tr.RandomSized(512), tr.RandomRotate(15), tr.RandomHorizontalFlip(), tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), tr.ToTensor() ])
def main(config, cuda, excludeval, embedding, continue_from, nolog, inputmix, imagedataset, experimentid, nshot, ishot): frame = inspect.currentframe() args, _, _, values = inspect.getargvalues(frame) #print(values) #in case you want to save to the location of script you're running datadir = os.path.join( '/home/SharedData/omkar/zscoseg/yash_manas/data/datasets', imagedataset) if not nolog: #name the savedir, might add logs/ before the datetime for clarity if experimentid is None: savedir = time.strftime('%Y%m%d%H%M%S') else: savedir = experimentid #the full savepath is then: savepath = os.path.join('logs', imagedataset, savedir) #in case the folder has not been created yet / except already exists error: try: os.makedirs(savepath) print("Log dir:", savepath) except: pass if continue_from is None: #now join the path in save_screenshot: shutil.copytree('./libs/', savepath + '/libs') shutil.copy2(osp.abspath(inspect.stack()[0][1]), savepath) shutil.copy2(config, savepath) args_dict = {} for a in args: args_dict[a] = values[a] with open(savepath + '/args.json', 'w') as fp: json.dump(args_dict, fp) cuda = cuda and torch.cuda.is_available() device = torch.device("cuda" if cuda else "cpu") if cuda: current_device = torch.cuda.current_device() print("Running on", torch.cuda.get_device_name(current_device)) else: print("Running on CPU") # Configuration CONFIG = Dict(yaml.load(open(config), Loader=yaml.FullLoader)) visibility_mask = {} if excludeval: seen_classes = np.load(datadir + '/split/seen_cls.npy') else: seen_classes = np.asarray(np.concatenate([ np.load(datadir + '/split/seen_cls.npy'), np.load(datadir + '/split/val_cls.npy') ]), dtype=int) novel_classes = np.load(datadir + '/split/novel_cls.npy') seen_novel_classes = np.concatenate([seen_classes, novel_classes]) seen_map = np.array([-1] * 256) for i, n in enumerate(list(seen_classes)): seen_map[n] = i visibility_mask[0] = seen_map.copy() for i, n in enumerate(list(novel_classes)): visibility_mask[i + 1] = seen_map.copy() visibility_mask[i + 1][n] = seen_classes.shape[0] + i if excludeval: train = np.load(datadir + '/split/train_list.npy')[:-CONFIG.VAL_SIZE] else: train = np.load(datadir + '/split/train_list.npy') novelset = [] seenset = [] if inputmix == 'novel' or inputmix == 'both': inverse_dict = pickle.load( open(datadir + '/split/inverse_dict_train.pkl', 'rb')) for icls, key in enumerate(novel_classes): if (inverse_dict[key].size > 0): for v in inverse_dict[key][ishot * 20:ishot * 20 + nshot]: novelset.append((v, icls)) #print((v, icls)) if inputmix == 'both': seenset = [] inverse_dict = pickle.load( open(datadir + '/split/inverse_dict_train.pkl', 'rb')) for icls, key in enumerate(seen_classes): if (inverse_dict[key].size > 0): for v in inverse_dict[key][ishot * 20:ishot * 20 + nshot]: seenset.append(v) if inputmix == 'seen': seenset = range(train.shape[0]) sampler = RandomImageSampler(seenset, novelset) if inputmix == 'novel': visible_classes = seen_novel_classes if nshot is not None: nshot = str(nshot) + 'n' elif inputmix == 'seen': visible_classes = seen_classes if nshot is not None: nshot = str(nshot) + 's' elif inputmix == 'both': visible_classes = seen_novel_classes if nshot is not None: nshot = str(nshot) + 'b' print("Visible classes:", visible_classes.size, " \nClasses are: ", visible_classes, "\nTrain Images:", train.shape[0]) #a Dataset 10k or 164k dataset = get_dataset(CONFIG.DATASET)(train=train, test=None, root=CONFIG.ROOT, split=CONFIG.SPLIT.TRAIN, base_size=513, crop_size=CONFIG.IMAGE.SIZE.TRAIN, 
mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), warp=CONFIG.WARP_IMAGE, scale=(0.5, 1.5), flip=True, visibility_mask=visibility_mask) # DataLoader loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=CONFIG.BATCH_SIZE.TRAIN, num_workers=CONFIG.NUM_WORKERS, sampler=sampler) if embedding == 'word2vec': class_emb = pickle.load( open(datadir + '/word_vectors/word2vec.pkl', "rb")) elif embedding == 'fasttext': class_emb = pickle.load( open(datadir + '/word_vectors/fasttext.pkl', "rb")) elif embedding == 'fastnvec': class_emb = np.concatenate([ pickle.load(open(datadir + '/word_vectors/fasttext.pkl', "rb")), pickle.load(open(datadir + '/word_vectors/word2vec.pkl', "rb")) ], axis=1) else: print("invalid emb ", embedding) sys.exit() print((class_emb.shape)) class_emb = F.normalize(torch.tensor(class_emb), p=2, dim=1).cuda() loader_iter = iter(loader) DeepLab = DeepLabV2_ResNet101_MSC #import ipdb; ipdb.set_trace() state_dict = torch.load(CONFIG.INIT_MODEL) # Model load model = DeepLab(class_emb.shape[1], class_emb[visible_classes]) if continue_from is not None and continue_from > 0: print("Loading checkpoint: {}".format(continue_from)) #import ipdb; ipdb.set_trace() model = nn.DataParallel(model) state_file = osp.join(savepath, "checkpoint_{}.pth".format(continue_from)) if osp.isfile(state_file + '.tar'): state_dict = torch.load(state_file + '.tar') model.load_state_dict(state_dict['state_dict'], strict=True) elif osp.isfile(state_file): state_dict = torch.load(state_file) model.load_state_dict(state_dict, strict=True) else: print("Checkpoint {} not found".format(continue_from)) sys.exit() else: model.load_state_dict( state_dict, strict=False ) # make strict=True to debug if checkpoint is loaded correctly or not if performance is low model = nn.DataParallel(model) model.to(device) # Optimizer optimizer = { "sgd": torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[{ "params": get_params(model.module, key="1x"), "lr": CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.LR, "weight_decay": 0.0, }], momentum=CONFIG.MOMENTUM, ), "adam": torch.optim.Adam( # cf lr_mult and decay_mult in train.prototxt params=[{ "params": get_params(model.module, key="1x"), "lr": CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.LR, "weight_decay": 0.0, }]) # Add any other optimizer }.get(CONFIG.OPTIMIZER) if 'optimizer' in state_dict: optimizer.load_state_dict(state_dict['optimizer']) print("Learning rate:", CONFIG.LR) # Loss definition criterion = nn.CrossEntropyLoss(ignore_index=-1) criterion.to(device) if not nolog: # TensorBoard Logger if continue_from is not None: writer = SummaryWriter( savepath + '/runs/fs_{}_{}_{}'.format(continue_from, nshot, ishot)) else: writer = SummaryWriter(savepath + '/runs') loss_meter = MovingAverageValueMeter(20) model.train() model.module.scale.freeze_bn() pbar = tqdm( range(1, CONFIG.ITER_MAX + 1), total=CONFIG.ITER_MAX, leave=False, dynamic_ncols=True, ) for iteration in pbar: # Set a learning rate poly_lr_scheduler( optimizer=optimizer, init_lr=CONFIG.LR, iter=iteration - 1, lr_decay_iter=CONFIG.LR_DECAY, max_iter=CONFIG.ITER_MAX, power=CONFIG.POLY_POWER, ) # Clear gradients 
(ready to accumulate) optimizer.zero_grad() iter_loss = 0 for i in range(1, CONFIG.ITER_SIZE + 1): try: data, target = next(loader_iter) except: loader_iter = iter(loader) data, target = next(loader_iter) # Image data = data.to(device) # Propagate forward outputs = model(data) # Loss loss = 0 for output in outputs: # Resize target for {100%, 75%, 50%, Max} outputs target_ = resize_target(target, output.size(2)) target_ = torch.tensor(target_).to(device) loss += criterion.forward(output, target_) # Backpropagate (just compute gradients wrt the loss) #print(loss) loss /= float(CONFIG.ITER_SIZE) loss.backward() iter_loss += float(loss) del data, target, outputs #print(iter_loss) pbar.set_postfix(loss="%.3f" % iter_loss) # Update weights with accumulated gradients optimizer.step() if not nolog: loss_meter.add(iter_loss) # TensorBoard if iteration % CONFIG.ITER_TB == 0: writer.add_scalar("train_loss", loss_meter.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): writer.add_scalar("train_lr_group{}".format(i), o["lr"], iteration) if False: # This produces a large log file for name, param in model.named_parameters(): name = name.replace(".", "/") writer.add_histogram(name, param, iteration, bins="auto") if param.requires_grad: writer.add_histogram(name + "/grad", param.grad, iteration, bins="auto") # Save a model if continue_from is not None: if iteration in CONFIG.ITER_SAVE: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), }, osp.join( savepath, "checkpoint_{}_{}_{}_{}.pth.tar".format( continue_from, nshot, ishot, iteration)), ) # Save a model (short term) [unnecessary for fewshot] if False and iteration % 100 == 0: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), }, osp.join( savepath, "checkpoint_{}_{}_{}_current.pth.tar".format( continue_from, nshot, ishot)), ) print( osp.join( savepath, "checkpoint_{}_{}_{}_current.pth.tar".format( continue_from, nshot, ishot))) else: if iteration % CONFIG.ITER_SAVE == 0: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, osp.join(savepath, "checkpoint_{}.pth.tar".format(iteration)), ) # Save a model (short term) if iteration % 100 == 0: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, osp.join(savepath, "checkpoint_current.pth.tar"), ) torch.cuda.empty_cache() if not nolog: if continue_from is not None: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), }, osp.join( savepath, "checkpoint_{}_{}_{}_{}.pth.tar".format( continue_from, nshot, ishot, CONFIG.ITER_MAX))) else: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, osp.join(savepath, "checkpoint_{}.pth.tar".format(CONFIG.ITER_MAX)))
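# The checkpoints written above are dicts with 'iteration', 'state_dict' and
# (for the long-term snapshots) 'optimizer'. A minimal resume sketch matching
# that layout; the model, optimizer, and path are assumed to exist already:
import torch

def resume_sketch(model, optimizer, checkpoint_file):
    ckpt = torch.load(checkpoint_file, map_location='cpu')
    model.load_state_dict(ckpt['state_dict'])
    if 'optimizer' in ckpt and optimizer is not None:
        optimizer.load_state_dict(ckpt['optimizer'])
    return ckpt.get('iteration', 0)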
def main(): parser = argparse.ArgumentParser(description="Train the SharpNet network") parser.add_argument('-c', '--configFile', required=True, help='Path to config yaml file', metavar='path/to/config') args = parser.parse_args() CONFIG_FILE_PATH = args.configFile with open(CONFIG_FILE_PATH) as fd: config_yaml = oyaml.load( fd) # Returns an ordered dict. Used for printing config = AttrDict(config_yaml) print( colored( 'Config being used for training:\n{}\n\n'.format( oyaml.dump(config_yaml)), 'green')) os.environ['CUDA_VISIBLE_DEVICES'] = config.train.cuda_device cuda = False if config.train.nocuda else True resnet50_url = 'https://download.pytorch.org/models/resnet50-19c8e357.pth' cuda = cuda and torch.cuda.is_available() device = torch.device("cuda" if cuda else "cpu") if cuda: current_device = torch.cuda.current_device() print("Running on " + torch.cuda.get_device_name(current_device)) else: print("Running on CPU") now = datetime.datetime.now() date_str = now.strftime("%d-%m-%Y_%H-%M") t = [] torch.manual_seed(329) bias = True if config.train.bias else False # build model model = SharpNet(ResBlock, [3, 4, 6, 3], [2, 2, 2, 2, 2], use_normals=True if config.train.normals else False, use_depth=True if config.train.depth else False, use_boundary=True if config.train.boundary else False, bias_decoder=bias) model_dict = model.state_dict() # Load pretrained weights resnet_path = 'models/resnet50-19c8e357.pth' if not os.path.exists(resnet_path): command = 'wget ' + resnet50_url + ' && mkdir models/ && mv resnet50-19c8e357.pth models/' os.system(command) resnet50_dict = torch.load(resnet_path) resnet_dict = { k.replace('.', '_img.', 1): v for k, v in resnet50_dict.items() if k.replace('.', '_img.', 1) in model_dict } # load weights up to pool print('Loading checkpoint from {}'.format(config.train.pretrained_model)) if config.train.pretrained_model is not None: model_path = config.train.pretrained_model tmp_dict = torch.load(model_path) if config.train.depth: pretrained_dict = { k: v for k, v in tmp_dict.items() if k in model_dict } else: pretrained_dict = { k: v for k, v in tmp_dict.items() if (k in model_dict and not k.startswith('depth_decoder')) } else: pretrained_dict = resnet_dict try: model_dict.update(pretrained_dict) model.load_state_dict(model_dict) print('Successfully loaded pretrained ResNet weights') except: print('Could not load the pretrained model weights') sys.exit(0) model = nn.DataParallel(model) model.to(device) model.zero_grad() model.train() freeze_decoders = config.train.decoder_freeze.split(',') freeze_model_decoders(model.module, freeze_decoders) if config.train.dataset != 'NYU': sharpnet_loss = SharpNetLoss( lamb=0.5, mu=1.0, use_depth=True if config.train.depth else False, use_boundary=True if config.train.boundary else False, use_normals=True if config.train.normals else False, use_geo_consensus=True if config.train.geo_consensus else False) else: sharpnet_loss = SharpNetLoss( lamb=0.5, mu=1.0, use_depth=True if config.train.depth else False, use_boundary=False, use_normals=False, use_geo_consensus=True if config.train.geo_consensus else False) if config.train.optimizer == 'SGD': optimizer = SGD(params=get_params(model), lr=float(config.train.learning_rate), weight_decay=float(config.train.decay), momentum=0.9) elif config.train.optimizer == 'Adam': optimizer = Adam(params=get_params(model), lr=float(config.train.learning_rate), weight_decay=float(config.train.decay)) else: print( 'Could not configure the optimizer, please select --optimizer Adam or SGD' ) sys.exit(0) 
# TensorBoard Logger train_loss_meter = MovingAverageValueMeter(20) val_loss_meter = MovingAverageValueMeter(3) depth_loss_meter = MovingAverageValueMeter( 3) if config.train.depth else None normals_loss_meter = MovingAverageValueMeter( 3) if config.train.normals and config.train.dataset != 'NYU' else None grad_loss_meter = MovingAverageValueMeter( 3) if config.train.depth else None boundary_loss_meter = MovingAverageValueMeter( 3) if config.train.boundary and config.train.dataset != 'NYU' else None consensus_loss_meter = MovingAverageValueMeter( 3) if config.train.geo_consensus else None exp_name = config.train.experiment_name if config.train.experiment_name is not None else '' print('Experiment Name: {}'.format(exp_name)) log_dir = os.path.join('logs', 'Joint', str(exp_name) + '_' + date_str) cp_dir = os.path.join('logs', 'Joint', str(exp_name) + '_' + date_str) print('Checkpoint Directory: {}'.format(cp_dir)) train_writer = SummaryWriter(os.path.join(log_dir, 'train')) val_writer = SummaryWriter(os.path.join(log_dir, 'val')) if not os.path.exists(cp_dir): os.makedirs(cp_dir) if not os.path.exists(log_dir): os.makedirs(os.path.join(log_dir, 'train')) os.makedirs(os.path.join(log_dir, 'val')) train_dataloader, val_dataloader = get_trainval_splits( config) # SHREK: Added Modification to pass in config. # Either pass in path to config file, or real yaml and pass in the dict of config file. # Config file need only contain the paths to datasets train and val. # For val, we'd like to pass real images dataset. for epoch in range(config.train.max_epoch): if config.train.optimizer == 'SGD': adjust_learning_rate(float(config.train.learning_rate), config.train.lr_mode, float(config.train.gradient_step), config.train.max_epoch, optimizer, epoch) train_epoch(train_dataloader, val_dataloader, model, sharpnet_loss, optimizer, config.train.start_epoch + epoch, train_writer, val_writer, train_loss_meter, val_loss_meter, depth_loss_meter, grad_loss_meter, normals_loss_meter, date_str=date_str, model_save_path=cp_dir, config=config, boundary_loss_meter=boundary_loss_meter, consensus_loss_meter=consensus_loss_meter) # Save a model if epoch % 1 == 0 and epoch > int(0.9 * config.train.max_epoch): torch.save( model.state_dict(), os.path.join( cp_dir, 'checkpoint_{}_final.pth'.format(config.train.start_epoch + epoch)), ) elif epoch % 1 == 0: torch.save( model.state_dict(), os.path.join( cp_dir, 'checkpoint_{}_final.pth'.format(config.train.start_epoch + epoch)), ) torch.save( model.state_dict(), os.path.join( cp_dir, 'checkpoint_{}_final.pth'.format(config.train.start_epoch + config.train.max_epoch)), ) return None
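# adjust_learning_rate() is not shown in this snippet. A sketch of a typical
# epoch-based schedule consistent with the arguments passed above
# (base lr, mode, step, max epochs, optimizer, epoch); this is an assumption,
# not SharpNet's actual helper:
def adjust_learning_rate_sketch(base_lr, mode, step, max_epoch, optimizer, epoch):
    if mode == 'poly':
        lr = base_lr * (1 - epoch / max_epoch) ** 0.9
    elif mode == 'step':
        lr = base_lr * (0.1 ** (epoch // step))
    else:
        raise ValueError('unknown lr mode: {}'.format(mode))
    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr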
def train(self): torch.cuda.empty_cache() ###################### # Save / Load model ###################### if self.opt.continue_train: try: self.continue_from_latest_checkpoint() except Exception as e: self.logger.error(e) self.opt.continue_train = False self.reset_save() else: self.reset_save() dataset = UnitRiffDataset(self.opt.dataset_name, self.opt.instr_type) dataset_size = len(dataset) self.logger.info( f'Dataset {self.opt.dataset_name} loaded, size {dataset_size}') ###################### # Initiate ###################### criterionGAN = nn.BCEWithLogitsLoss() GLoss_meter = MovingAverageValueMeter(self.opt.plot_every) DLoss_meter = MovingAverageValueMeter(self.opt.plot_every) losses = {} ###################### # Start Training ###################### for epoch in range(self.opt.start_epoch, self.opt.max_epoch): loader = DataLoader(dataset, batch_size=self.opt.batch_size, shuffle=True, num_workers=self.opt.num_threads, drop_last=False) epoch_start_time = time.time() for i, data in enumerate(loader): batch_size = data.size(0) # print(batch_size) real_label = torch.ones(size=[batch_size, 1], device=self.device) fake_label = torch.zeros(size=[batch_size, 1], device=self.device) seed = np.array([ generate_random_seed(1, self.opt.instr_type, pattern=self.opt.chord_type) for _ in range(batch_size) ]) # print(seed.shape) noise = torch.randn(batch_size, self.opt.seed_size, device=self.device) seed = torch.from_numpy(seed).to(device=self.device, dtype=torch.float) fake_data = self.generator(noise, seed, batch_size) D_fake = self.discriminator(fake_data, batch_size) real_data = torch.unsqueeze(data, 1).to(device=self.device, dtype=torch.float) D_real = self.discriminator(real_data, batch_size) # print(D_fake.shape) ###################### # Generator ###################### self.G_optimizer.zero_grad() loss_G = criterionGAN(D_fake, real_label) loss_G.backward(retain_graph=True) self.G_optimizer.step() self.G_optimizer.zero_grad() loss_G = criterionGAN(D_fake, real_label) loss_G.backward(retain_graph=True) self.G_optimizer.step() GLoss_meter.add(loss_G.item()) ###################### # Discriminator ###################### self.D_optimizer.zero_grad() loss_D_real = criterionGAN(D_real, real_label) loss_D_fake = criterionGAN(D_fake, fake_label) loss_D = 0.5 * loss_D_real + 0.5 * loss_D_fake loss_D.backward() self.D_optimizer.step() DLoss_meter.add(loss_D.item()) if epoch % self.opt.save_every == 0 or epoch == self.opt.max_epoch - 1: self.save_model(epoch) losses['loss_G'] = float(GLoss_meter.value()[0]) losses['loss_D'] = float(DLoss_meter.value()[0]) self.G_scheduler.step(epoch) self.D_scheduler.step(epoch) epoch_time = int(time.time() - epoch_start_time) self.logger.info( f'Epoch {epoch} finished, cost time {epoch_time}\n') self.logger.info(str(losses) + '\n\n')
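# criterionGAN above is nn.BCEWithLogitsLoss applied to raw discriminator
# logits against all-ones (real) or all-zeros (fake) targets. The same
# computation in isolation, with random logits standing in for D outputs:
import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()
logits = torch.randn(4, 1)                  # discriminator outputs (no sigmoid)
loss_real = criterion(logits, torch.ones(4, 1))
loss_fake = criterion(logits, torch.zeros(4, 1))
loss_d = 0.5 * loss_real + 0.5 * loss_fake  # same weighting as above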
def train(self): torch.cuda.empty_cache() if self.model == 'base': dataset = SteelyDataset(self.genreA, self.genreB, 'train', use_mix=False) dataset_size = len(dataset) else: dataset = SteelyDataset(self.genreA, self.genreB, 'train', use_mix=True) dataset_size = len(dataset) if self.continue_train: self.continue_from_latest_checkpoint() else: self.empty_checkpoints() self.create_save_dirs() iter_num = int(dataset_size / self.batch_size) print(f'loaded {dataset_size} images for training') # optimizers = [optimizer_g, optimizer_d] lambda_A = 10.0 # weight for cycle loss (A -> B -> A^) lambda_B = 10.0 # weight for cycle loss (B -> A -> B^) L1_lambda = 10.0 lambda_identity = 0.5 # it's a MSELoss() when initialized, only calculate later during iteration # criterionGAN = nn.MSELoss().to(device) criterionGAN = GANLoss(gan_mode='vanilla') # cycle loss criterionCycle = nn.L1Loss() # identical loss criterionIdt = nn.L1Loss() GLoss_meter = MovingAverageValueMeter(self.plot_every) DLoss_meter = MovingAverageValueMeter(self.plot_every) # score_DA_real_B = MovingAverageValueMeter(self.plot_every) # score_DA_fake_B = MovingAverageValueMeter(self.plot_every) # loss meters losses = {} scores = {} for epoch in range(self.start_epoch, self.max_epoch): loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=1, drop_last=True) epoch_start_time = time.time() for i, data in enumerate(loader): real_A = torch.unsqueeze(data[:, 0, :, :], 1).to(self.device, dtype=torch.float) real_B = torch.unsqueeze(data[:, 1, :, :], 1).to(self.device, dtype=torch.float) gaussian_noise = torch.abs( torch.normal(mean=torch.zeros(self.data_shape), std=1)).to(self.device, dtype=torch.float) if self.model == 'base': self.GA2B_optimizer.zero_grad( ) # set g_x and g_y gradients to zero fake_B = self.generator_A2B(real_A) # X -> Y' fake_B_copy = copy.copy(fake_B.detach()) DB_fake = self.discriminator_B( fake_B + gaussian_noise) #netD_x provide feedback to netG_x loss_G_A2B = criterionGAN(DB_fake, True) # cycle_consistence cycle_A = self.generator_B2A(fake_B) # Y' -> X^ # Forward cycle loss x^ = || G_y(G_x(real_x)) || loss_cycle_A2B = criterionCycle(cycle_A, real_A) * lambda_A # identity loss if lambda_identity > 0: # netG_x should be identity if real_y is fed: ||netG_x(real_y) - real_y|| idt_A = self.generator_A2B(real_B) loss_idt_A = criterionIdt( idt_A, real_B) * lambda_A * lambda_identity else: loss_idt_A = 0. loss_A = loss_G_A2B + 5. * loss_cycle_A2B loss_A.backward(retain_graph=True) self.GA2B_optimizer.step() ###################### # B -> A' -> B^ cycle ###################### self.GB2A_optimizer.zero_grad( ) # set g_x and g_y gradients to zero fake_A = self.generator_B2A(real_B) # Y -> X' fake_A_copy = copy.copy(fake_A.detach()) DA_fake = self.discriminator_A(fake_A + gaussian_noise) loss_G_B2A = criterionGAN(DA_fake, True) # print(f'loss_G_Y = {round(float(loss_G_Y), 3)}') cycle_B = self.generator_A2B(fake_A) # Y -> X' -> Y^ # Forward cycle loss y^ = || G_x(G_y(real_y)) || loss_cycle_B2A = criterionCycle(cycle_B, real_B) * lambda_B # identity loss if lambda_identity > 0: # netG_y should be identiy if real_x is fed: ||netG_y(real_x) - real_x|| idt_B = self.generator_B2A(real_A) loss_idt_B = criterionIdt( idt_B, real_A) * lambda_A * lambda_identity else: loss_idt_B = 0. loss_B = loss_G_B2A + 5. 
* loss_cycle_B2A loss_B.backward(retain_graph=True) self.GB2A_optimizer.step() ###################### # sample ###################### if self.use_image_poll: [fake_A_sample, fake_B_sample] = self.pool([fake_A_copy, fake_B_copy]) ###################### # netD_A ###################### # loss_real DA_real = self.discriminator_A(real_A + gaussian_noise) loss_DA_real = criterionGAN(DA_real, True) score_DA_real_B.add(float(DA_real.data.mean())) # loss fake if self.use_image_poll: DA_fake_sample = self.discriminator_A(fake_A_sample + gaussian_noise) loss_DA_fake = criterionGAN(DA_fake_sample, False) score_DA_fake_B.add(float(DA_fake_sample.data.mean())) else: loss_DA_fake = criterionGAN(DA_fake, False) score_DA_fake_B.add(float(DA_fake.data.mean())) # loss and backward self.DA_optimizer.zero_grad() loss_DA = (loss_DA_real + loss_DA_fake) * 0.5 loss_DA.backward() self.DA_optimizer.step() ###################### # netD_B ###################### # loss_real DB_real = self.discriminator_B(real_B + gaussian_noise) loss_DB_real = criterionGAN(DB_real, True) # loss_fake if self.use_image_poll: DB_fake_sample = self.discriminator_B(fake_B_sample + gaussian_noise) loss_DB_fake = criterionGAN(DB_fake_sample, False) else: loss_DB_fake = criterionGAN(DB_fake, False) # loss and backward self.DB_optimizer.zero_grad() loss_DB = (loss_DB_real + loss_DB_fake) * 0.5 loss_DB.backward() self.DB_optimizer.step() else: real_mixed = torch.unsqueeze(data[:, 2, :, :], 1).to(self.device, dtype=torch.float) ###################### # A -> B' -> A^ cycle ###################### self.GA2B_optimizer.zero_grad( ) # set g_x and g_y gradients to zero fake_B = self.generator_A2B(real_A) # X -> Y' fake_B_copy = copy.copy(fake_B.detach()) DB_fake = self.discriminator_B( fake_B + gaussian_noise) # netD_x provide feedback to netG_x ''' to_binary ''' loss_G_A2B = criterionGAN(DB_fake, True) # cycle_consistence cycle_A = self.generator_B2A(fake_B) # Y' -> X^ # Forward cycle loss x^ = || G_y(G_x(real_x)) || loss_cycle_A2B = criterionCycle(cycle_A, real_A) * lambda_A # identity loss if lambda_identity > 0: # netG_x should be identity if real_y is fed: ||netG_x(real_y) - real_y|| idt_A = self.generator_A2B(real_B) loss_idt_A = criterionIdt( idt_A, real_B) * lambda_A * lambda_identity else: loss_idt_A = 0. loss_A = loss_G_A2B + 5. * loss_cycle_A2B loss_A.backward(retain_graph=True) self.GA2B_optimizer.step() loss_A_meter.add(loss_A.item()) ###################### # B -> A' -> B^ cycle ###################### self.GB2A_optimizer.zero_grad( ) # set g_x and g_y gradients to zero fake_A = self.generator_B2A(real_B) # Y -> X' fake_A_copy = copy.copy(fake_A.detach()) DA_fake = self.discriminator_A(fake_A + gaussian_noise) loss_G_B2A = criterionGAN(DA_fake, True) # print(f'loss_G_Y = {round(float(loss_G_Y), 3)}') cycle_B = self.generator_A2B(fake_A) # Y -> X' -> Y^ # Forward cycle loss y^ = || G_x(G_y(real_y)) || loss_cycle_B2A = criterionCycle(cycle_B, real_B) * lambda_B # identity loss if lambda_identity > 0: # netG_y should be identiy if real_x is fed: ||netG_y(real_x) - real_x|| idt_B = self.generator_B2A(real_A) loss_idt_B = criterionIdt( idt_B, real_A) * lambda_A * lambda_identity else: loss_idt_B = 0. loss_B = loss_G_B2A + 5. 
* loss_cycle_B2A loss_B.backward(retain_graph=True) self.GB2A_optimizer.step() loss_B_meter.add(loss_B.item()) ###################### # sample ###################### if self.use_image_poll: [fake_A_sample, fake_B_sample] = self.pool([fake_A_copy, fake_B_copy]) ###################### # netD_A $ netD_A_all ###################### # loss_real DA_real = self.discriminator_A(real_A + gaussian_noise) loss_DA_real = criterionGAN(DA_real, True) # score_DA_real_B.add(float(DA_real.data.mean())) DA_real_all = self.discriminator_A_all(real_mixed + gaussian_noise) loss_DA_all_real = criterionGAN(DA_real_all, True) # loss fake if self.use_image_poll: DA_fake_sample = self.discriminator_A(fake_A_sample + gaussian_noise) loss_DA_fake = criterionGAN(DA_fake_sample, False) # score_DA_fake_B.add(float(DA_fake_sample.data.mean())) DA_fake_sample_all = self.discriminator_A_all( fake_A_sample + gaussian_noise) loss_DA_all_fake = criterionGAN( DA_fake_sample_all, False) else: loss_DA_fake = criterionGAN(DA_fake, False) # score_DA_fake_B.add(float(DA_fake.data.mean())) DA_fake_all = self.discriminator_A_all(fake_A_copy + gaussian_noise) loss_DA_all_fake = criterionGAN(DA_fake_all, False) # loss and backward self.DA_optimizer.zero_grad() loss_DA = (loss_DA_real + loss_DA_fake) * 0.5 loss_DA.backward() self.DA_optimizer.step() self.DA_all_optimizer.zero_grad() loss_DA_all = (loss_DA_all_real + loss_DA_all_fake) * 0.5 loss_DA_all.backward() self.DA_all_optimizer.step() ###################### # netD_A_all ###################### self.DA_all_optimizer.zero_grad() # loss_real DA_real_all = self.discriminator_A_all(real_mixed + gaussian_noise) loss_DA_all_real = criterionGAN(DA_real_all, True) # loss fake DA_fake_sample_all = self.discriminator_A_all( fake_A_sample + gaussian_noise) loss_DA_all_fake = criterionGAN(DA_fake_sample_all, False) loss_DA_all = (loss_DA_all_real + loss_DA_all_fake) * 0.5 loss_DA_all.backward() self.DA_all_optimizer.step() ###################### # netD_B & netD_B_all ###################### # loss_real DB_real = self.discriminator_B(real_B + gaussian_noise) loss_DB_real = criterionGAN(DB_real, True) DB_real_all = self.discriminator_B_all(real_mixed + gaussian_noise) loss_DB_all_real = criterionGAN(DB_real_all, True) # loss_fake if self.use_image_poll: DB_fake_sample = self.discriminator_B(fake_B_sample + gaussian_noise) loss_DB_fake = criterionGAN(DB_fake_sample, False) DB_fake_sample_all = self.discriminator_B_all( fake_B_sample + gaussian_noise) loss_DB_all_fake = criterionGAN( DB_fake_sample_all, False) else: loss_DB_fake = criterionGAN(DB_fake, False) DB_fake_all = self.discriminator_B_all(fake_B_copy + gaussian_noise) loss_DB_all_fake = criterionGAN(DB_fake_all, False) # loss and backward self.DB_optimizer.zero_grad() loss_DB = (loss_DB_real + loss_DB_fake) * 0.5 + ( loss_DB_all_real + loss_DB_all_fake) * 0.5 loss_DB.backward() self.DB_optimizer.step() self.DB_all_optimizer.zero_grad() loss_DB_all = (loss_DB_all_real + loss_DB_all_fake) * 0.5 loss_DB_all.backward() self.DB_all_optimizer.step() ###################### # netD_all ###################### # loss_D_all = loss_DB_all + loss_DA_all # loss_D_all.backward(retain_graph=True) ''' ###################### # netD_B_all ###################### self.DB_all_optimizer.zero_grad() # loss_real DB_real_all = self.discriminator_B_all(real_mixed + gaussian_noise) loss_DB_all_real = criterionGAN(DB_real_all, True) # loss fake DB_fake_sample_all = self.discriminator_B_all(fake_B_sample + gaussian_noise) loss_DB_all_fake = 
                    criterionGAN(DB_fake_sample_all, False)
                    loss_DB_all = (loss_DB_all_real + loss_DB_all_fake) * 0.5
                    loss_DB_all.backward()
                    self.DB_all_optimizer.step()
                    '''

                # save snapshot
                if i % self.plot_every == 0:
                    file_name = self.name + '_snap_%03d_%05d.png' % (epoch, i)
                    test_path = os.path.join(self.checkpoint_path, file_name)
                    tv.utils.save_image(fake_B, test_path, normalize=True)
                    print(f'{file_name} saved.')

                    losses['loss_A'] = loss_A_meter.value()[0]
                    losses['loss_B'] = loss_B_meter.value()[0]
                    scores['score_DA_real_B'] = score_DA_real_B.value()[0]
                    scores['score_DA_fake_B'] = score_DA_fake_B.value()[0]
                    print(losses)
                    print(scores)
                    print('Epoch {} progress: {:.2%}\n'.format(epoch, i / iter_num))

            # save model
            if epoch % self.save_every == 0 or epoch == self.max_epoch - 1:
                self.save_model(epoch)
                print('model saved.')

            self.GA2B_scheduler.step(epoch)
            self.GB2A_scheduler.step(epoch)
            self.DA_scheduler.step(epoch)
            self.DB_scheduler.step(epoch)
            if self.model != 'base':
                self.DA_all_scheduler.step(epoch)
                self.DB_all_scheduler.step(epoch)

            epoch_time = int(time.time() - epoch_start_time)
            print_options(self.opt,
                          epoch_log=True,
                          epoch=epoch,
                          time=epoch_time,
                          losses=losses,
                          scores=scores)
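# The branches above draw the discriminator inputs from `self.pool` whenever
# `use_image_poll` is set.  Below is a minimal sketch of such a history pool
# in the spirit of CycleGAN's ImagePool; the class name `SamplePool`, the
# `query` method and the 50-element default are assumptions of this sketch,
# not this project's actual implementation.
import random

import torch


class SamplePool:
    """Keep a buffer of previously generated samples and, with probability
    0.5, hand the discriminator an old sample instead of the newest one."""

    def __init__(self, pool_size=50):
        self.pool_size = pool_size
        self.samples = []

    def query(self, batch):
        if self.pool_size == 0:
            return batch
        out = []
        for sample in batch:
            sample = torch.unsqueeze(sample.detach(), 0)
            if len(self.samples) < self.pool_size:
                self.samples.append(sample)
                out.append(sample)
            elif random.uniform(0, 1) > 0.5:
                idx = random.randint(0, self.pool_size - 1)
                out.append(self.samples[idx].clone())
                self.samples[idx] = sample
            else:
                out.append(sample)
        return torch.cat(out, 0)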
def __init__(self, hyperparameters, log_dir, gpu_id): super(Generator, self).__init__() self.hyp = hyperparameters print(hyperparameters) self.gpu_id = gpu_id self.noise_dim = self.hyp['noise_dim'] self.vis_noise = torch.randn(1, self.hyp['noise_dim']).cuda( self.gpu_id).requires_grad_(False) self.g_loss_meter = MovingAverageValueMeter(5) self.log_dir = log_dir # Architecture: self.lab0 = nn.Linear(1, self.hyp['p1'], bias=False) self.fc0 = nn.Linear(self.noise_dim, self.hyp['p2'], bias=False) self.nonlin0 = nn.Sequential(*[ nn.BatchNorm2d(self.hyp['p1'] + self.hyp['p2']), nn.LeakyReLU(self.hyp['lrelu_g']) ] if self.hyp['bg0'] else [ nn.LeakyReLU(self.hyp['lrelu_g']), ]) self.conv1 = nn.ConvTranspose2d(self.hyp['p1'] + self.hyp['p2'], self.hyp['p3'], (1, 55), bias=True) self.nonlin1 = nn.Sequential(*[ nn.BatchNorm2d(self.hyp['p3']), nn.LeakyReLU(self.hyp['lrelu_g']) ] if self.hyp['bg1'] else [ nn.LeakyReLU(self.hyp['lrelu_g']), ]) self.conv2 = nn.ConvTranspose2d(self.hyp['p3'], 1, (55, 1), bias=True) self.sigmoid = nn.Tanh() self.cuda(self.gpu_id) opt_param_list = [{ 'params': [ param for name, param in self.named_parameters() if 'lab0' not in name ] }, { 'params': self.lab0.parameters(), 'lr': 1 * self.hyp['lr_g'] }] self.optimizer = torch.optim.Adam(opt_param_list, lr=self.hyp['lr_g'], betas=(self.hyp['b1_g'], self.hyp['b2_g']), weight_decay=self.hyp['wd_g']) # rand init for m in self.modules(): if isinstance(m, nn.ConvTranspose2d): torch.nn.init.kaiming_normal_(m.weight, a=self.hyp['lrelu_g'], nonlinearity='leaky_relu') if not m.bias is None: torch.nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): torch.nn.init.kaiming_normal_(m.weight) if not m.bias is None: torch.nn.init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): torch.nn.init.constant_(m.weight, 1) torch.nn.init.constant_(m.bias, 0)
def main(): global global_iter_idx args = parse_args() if args.verbose: default_handler = logging.StreamHandler(sys.stdout) logger.addHandler(default_handler) logger.setLevel(logging.DEBUG) if args.log_file is not None: logfile_handler = logging.FileHandler(args.log_file) logger_formatter = logging.Formatter( '%(name)s - %(levelname)s - %(message)s') logfile_handler.setFormatter(logger_formatter) logger.addHandler(logfile_handler) logger.setLevel(logging.DEBUG) train_data_source = davis.DavisDataset(base_dir=os.path.join( root_dir, 'dataset', 'DAVIS'), image_size=args.image_dims, year=2016, phase='train', transform=davis.ToTensor()) train_triplet_sampler = davis.TripletSampler(dataset=train_data_source, num_triplets=args.batch_size, randomize=True) train_data_loader = DataLoader(dataset=train_data_source, batch_sampler=train_triplet_sampler) val_data_source = davis.DavisDataset(base_dir=os.path.join( root_dir, 'dataset', 'DAVIS'), image_size=args.image_dims, year=2016, phase='val', transform=davis.ToTensor()) val_triplet_sampler = davis.TripletSampler( dataset=val_data_source, num_triplets=args.num_val_batches, randomize=True) val_data_loader = DataLoader(dataset=val_data_source, batch_sampler=val_triplet_sampler) model = network.BFVOSNet(embedding_vector_dims=args.embedding_vector_dims) train_loss_fn = loss.MinTripletLoss(alpha=args.alpha) val_loss_fn = loss.validation_loss if has_cuda: model = model.cuda() train_loss_fn = train_loss_fn.cuda() train_loss_fn.to(device) logger.debug("Model and loss function moved to CUDA") start_epoch = 0 if args.checkpoint_path is not None: epoch_substr = args.checkpoint_path.split('epoch_')[1] start_epoch = int(epoch_substr.split('_')[0]) batch_substr = epoch_substr.split('_')[1].split('batch_') if len(batch_substr) > 1: global_iter_idx = int(batch_substr[1].split('_')[0]) if has_cuda: if args.checkpoint_path is not None: # Load pre-trained weights for entire model model.load_state_dict( torch.load( args.checkpoint_path, map_location=lambda storage, loc: storage.cuda(gpu_id))) logger.info("Loaded checkpoint from {}".format( args.checkpoint_path)) else: # Load pre-trained weights for feature extraction head model.load_state_dict(torch.load( deeplab_resnet_pre_trained_path, map_location=lambda storage, loc: storage.cuda(gpu_id)), strict=False) logger.info("Loaded DeepLab ResNet from {}".format( deeplab_resnet_pre_trained_path)) else: if args.checkpoint_path is not None: model.load_state_dict(torch.load(args.checkpoint_path)) logger.info("Loaded checkpoint from {}".format( args.checkpoint_path)) else: model.load_state_dict(torch.load(deeplab_resnet_pre_trained_path), strict=False) logger.info("Loaded DeepLab ResNet from {}".format( deeplab_resnet_pre_trained_path)) # Load to appropriate device and set to training mode but freeze feature extraction layer model.to(device).train() model.freeze_feature_extraction() # Initialize optimizer to train only the unfrozen layers optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.learning_rate, momentum=args.momentum) # Initialize meter and writer train_loss_meter = MovingAverageValueMeter(20) val_loss_meter = AverageValueMeter() summary_writer = SummaryWriter(tensorboard_save_dir) # Train for epoch in tqdm(range(start_epoch, args.num_epochs)): logger.info("Epoch {}/{}".format(epoch + 1, args.num_epochs)) train(epoch, train_data_loader, val_data_loader, model, train_loss_fn, val_loss_fn, optimizer, train_loss_meter, val_loss_meter, summary_writer, args.log_interval, 
              args.checkpoint_interval, args.val_interval,
              args.num_val_batches)

    # Save final model after all epochs
    model.eval().cpu()
    save_model_filename = "epoch_{}_{}.model".format(
        args.num_epochs,
        str(time.time()).replace(" ", "_").replace(".", "_"))
    save_model_path = os.path.join(model_dir, save_model_filename)
    torch.save(model.state_dict(), save_model_path)
    logger.info("Model saved to {}".format(save_model_filename))

    training_config_save_path = os.path.join(
        config_save_dir, save_model_filename.replace('.model', '.json'))
    training_config = vars(args)
    # Record the device actually used (was `str(torch.device)`, which only
    # stringifies the class itself)
    training_config['device'] = str(device)
    with open(training_config_save_path, 'w') as f:
        json.dump(training_config, f)
    logger.info(
        "Training config saved to {}".format(training_config_save_path))
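# The resume logic above recovers the starting epoch (and optionally the
# global iteration index) by slicing the checkpoint filename on 'epoch_' and
# 'batch_'.  The sketch below expresses the same idea with a regular
# expression; the helper name `parse_checkpoint_name` is illustrative only.
import re


def parse_checkpoint_name(path):
    """Return (epoch, global_iter) from names such as
    'epoch_12_batch_3400_foo.model'; global_iter is None when absent."""
    epoch_match = re.search(r"epoch_(\d+)", path)
    batch_match = re.search(r"batch_(\d+)", path)
    epoch = int(epoch_match.group(1)) if epoch_match else 0
    global_iter = int(batch_match.group(1)) if batch_match else None
    return epoch, global_iter


# parse_checkpoint_name("epoch_12_batch_3400_1537.model") -> (12, 3400)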
class Discriminator(nn.Module): def __init__(self, hyperparameters, gpu_id): super(Discriminator, self).__init__() self.hyp = hyperparameters self.gpu_id = gpu_id self.w_loss_meter = MovingAverageValueMeter(5) self.d_loss_meter = MovingAverageValueMeter(5) self.r_loss_meter = MovingAverageValueMeter(5) self.f_loss_meter = MovingAverageValueMeter(5) self.gp_loss_meter = MovingAverageValueMeter(5) # Architecture self.lab0 = nn.ConvTranspose2d(1, self.hyp['q1'], (1, 55), bias=False) self.conv0 = nn.Conv2d(1, self.hyp['q2'], (55, 1), bias=False) self.nonlin0 = nn.Sequential(*[ nn.BatchNorm2d(self.hyp['q1'] + self.hyp['q2']), nn.LeakyReLU(self.hyp['lrelu_d']) ] if self.hyp['bd0'] else [ nn.LeakyReLU(self.hyp['lrelu_d']), ]) self.conv1 = nn.Conv2d(self.hyp['q1'] + self.hyp['q2'], self.hyp['q3'], (1, 55), bias=False) self.nonlin1 = nn.Sequential(*[ nn.BatchNorm2d(self.hyp['q3']), nn.LeakyReLU(self.hyp['lrelu_d']) ] if self.hyp['bd1'] else [ nn.LeakyReLU(self.hyp['lrelu_d']), ]) self.fc = nn.Linear(self.hyp['q3'], 1, bias=False) self.cuda(self.gpu_id) opt_param_list = [{ 'params': [ param for name, param in self.named_parameters() if 'lab0' not in name ] }, { 'params': self.lab0.parameters(), 'lr': 1 * self.hyp['lr_d'] }] self.optimizer = torch.optim.Adam(opt_param_list, lr=self.hyp['lr_d'], betas=(self.hyp['b1_d'], self.hyp['b2_d']), weight_decay=self.hyp['wd_d']) # rand init for m in self.modules(): if isinstance(m, nn.ConvTranspose2d): torch.nn.init.kaiming_normal_(m.weight, a=self.hyp['lrelu_d'], nonlinearity='leaky_relu') if not m.bias is None: torch.nn.init.constant_(m.bias, 0) if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight, a=self.hyp['lrelu_d'], nonlinearity='leaky_relu') if not m.bias is None: torch.nn.init.constant_(m.bias, 0) elif isinstance(m, nn.BatchNorm2d): torch.nn.init.constant_(m.weight, 1) torch.nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Linear): torch.nn.init.kaiming_normal_(m.weight) if not m.bias is None: torch.nn.init.constant_(m.bias, 0) def forward(self, x, l): x = self.conv0(x) l = self.lab0(l.float().view(-1, 1, 1, 1)) * 2 - 1 x = torch.cat([x, l], 1) x = self.nonlin0(x) x = self.conv1(x) x = self.nonlin1(x) x = x.view(-1, self.hyp['q3']) x = self.fc(x) return x def train_step(self, inputs, netg): """ One training step. :param inputs: :param netg: :return: """ real_data, real_labels = inputs real_data = real_data.cuda(self.gpu_id) real_labels = real_labels.cuda(self.gpu_id) self.zero_grad() self.d_real = self(real_data, real_labels).mean() # train with fake noise = torch.randn(real_data.shape[0], self.hyp['noise_dim']).cuda(self.gpu_id) fake = netg(noise, real_labels).data self.d_fake = self(fake, real_labels).mean() self.d_cost = self.d_fake - self.d_real # train with gradient penalty if not self.hyp['lambda_gp'] == 0: self.gradient_penalty = self.calc_gradient_penalty_cond( real_data.data, real_labels, fake.data) self.d_cost += self.gradient_penalty * self.hyp['lambda_gp'] self.wasserstein_d = self.d_real - self.d_fake self.d_cost.backward() self.optimizer.step() self.w_loss_meter.add(self.wasserstein_d.detach().cpu()) self.d_loss_meter.add(self.d_cost.detach().cpu()) self.r_loss_meter.add(self.d_real.detach().cpu()) self.f_loss_meter.add(self.d_fake.detach().cpu()) if not self.hyp['lambda_gp'] == 0: self.gp_loss_meter.add(self.gradient_penalty.detach().cpu()) def calc_gradient_penalty_cond(self, real_data, real_labels, fake_data): """ Calculates Gradient Penalty. 
        :param real_data:
        :param real_labels:
        :param fake_data:
        :return:
        """
        alpha = torch.rand(real_data.size()[0], 1, 1, 1).expand(
            real_data.size()).cuda(self.gpu_id)
        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        interpolates = interpolates.cuda(self.gpu_id).requires_grad_(True)
        real_labels.requires_grad_(True)
        disc_interpolates = self(interpolates, real_labels)
        gradients = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=[interpolates, real_labels],
            grad_outputs=torch.ones(disc_interpolates.size()).cuda(self.gpu_id),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
            allow_unused=True)[0]
        gradient_penalty = ((gradients.norm(2, dim=1) - 1)**2).mean()
        return gradient_penalty
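# calc_gradient_penalty_cond above penalises the critic's gradient norm on
# label-conditioned interpolates.  For reference, a minimal unconditional
# WGAN-GP penalty is sketched below; the function name and the per-sample
# flattening of the gradients are choices of this sketch, not of the class
# above.
import torch


def gradient_penalty(critic, real, fake, device="cpu"):
    alpha = torch.rand(real.size(0), 1, 1, 1, device=device)
    inter = (alpha * real + (1 - alpha) * fake).requires_grad_(True)
    scores = critic(inter)
    grads = torch.autograd.grad(outputs=scores,
                                inputs=inter,
                                grad_outputs=torch.ones_like(scores),
                                create_graph=True,
                                retain_graph=True)[0]
    grads = grads.view(grads.size(0), -1)  # flatten per sample
    return ((grads.norm(2, dim=1) - 1) ** 2).mean()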
def train(self): torch.cuda.empty_cache() ###################### # Save / Load model ###################### if self.opt.continue_train: try: self.continue_from_latest_checkpoint() except Exception as e: self.logger.error(e) return else: self.reset_save() self.logger.add_file_logger(self.opt.log_path) ###################### # Dataset ###################### dataset = ClassifierDataset(self.opt.genreA, self.opt.genreB, 'train') test_dataset = ClassifierDataset(self.opt.genreA, self.opt.genreB, 'test') dataset_size = len(dataset) iter_num = int(dataset_size / self.opt.batch_size) plot_every = iter_num // 10 self.logger.info( f'Dataset loaded, genreA: {self.opt.genreA}, genreB: {self.opt.genreB}, total size: {dataset_size}.' ) ###################### # Initiate ###################### softmax_criterion = nn.BCELoss() Loss_meter = MovingAverageValueMeter(self.opt.plot_every) losses = {} ###################### # Start Training ###################### test_data = torch.from_numpy(test_dataset.get_data()).to( self.device, dtype=torch.float) gaussian_noise = torch.normal(mean=torch.zeros(test_data.shape), std=self.opt.gaussian_std).to( self.device, dtype=torch.float) # test_data += gaussian_noise real_test_label = torch.from_numpy(test_dataset.get_labels()).view( -1, 2).to(self.device, dtype=torch.float) for epoch in range(self.opt.start_epoch, self.opt.max_epoch): loader = DataLoader(dataset, batch_size=self.opt.batch_size, shuffle=True, num_workers=self.opt.num_threads, drop_last=True) epoch_start_time = time.time() for i, batch in enumerate(loader): data = batch[0].to(self.device, dtype=torch.float) real_label = batch[1].view(self.opt.batch_size, 2).to(self.device, dtype=torch.float) self.classifier_optimizer.zero_grad() estimate_train = self.classifier(data) loss = softmax_criterion(estimate_train, real_label) loss.backward() self.classifier_optimizer.step() Loss_meter.add(loss.item()) # test if i % plot_every == 0: with torch.no_grad(): estimate_test = self.classifier(test_data) estimate_test = nn.functional.softmax(estimate_test, dim=1) test_prediction = torch.argmax(estimate_test, 1).eq( torch.argmax(real_test_label, 1)) test_accuracy = torch.mean( test_prediction.type(torch.float32)).cpu() self.logger.info( 'Epoch {} progress {:.2%}: Loss: {}, Accuracy: {}\n'. format(epoch, i / iter_num, Loss_meter.value()[0], test_accuracy)) if epoch % self.opt.save_every == 0 or epoch == self.opt.max_epoch - 1: self.save_model(epoch) self.classifier_scheduler.step(epoch) epoch_time = int(time.time() - epoch_start_time) self.logger.info( f'Epoch {epoch} finished, cost time {epoch_time}\n')
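# Loss_meter above is torchnet's MovingAverageValueMeter, which reports a
# windowed mean (and standard deviation) of the most recent values so the
# logged loss curve is smoother.  A tiny stand-in with the same add()/value()
# interface is sketched here purely as an illustration, not as torchnet's
# implementation.
import statistics
from collections import deque


class WindowedMeter:
    def __init__(self, windowsize=20):
        self.values = deque(maxlen=windowsize)

    def add(self, value):
        self.values.append(float(value))

    def value(self):
        if not self.values:
            return 0.0, 0.0
        mean = statistics.mean(self.values)
        std = statistics.pstdev(self.values) if len(self.values) > 1 else 0.0
        return mean, std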
def adjust_learning_rate(optimizer, epoch, init_lr, lr_decay_factor=0.1,
                         lr_decay_epoch=10):
    """Sets the learning rate to the initial LR decayed by lr_decay_factor
    every lr_decay_epoch epochs"""
    # lr_decay_factor default of 0.1 is an assumption; the call below only
    # overrides lr_decay_epoch.
    if epoch % lr_decay_epoch == 0:
        lr = init_lr * (lr_decay_factor**(epoch // lr_decay_epoch))
        print('LR is set to {}'.format(lr))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr


model = faster_rcnn(20, backbone='vgg16')
if torch.cuda.is_available():
    model = model.cuda()
optimizer = model.get_optimizer(is_adam=False)

avg_loss = AverageValueMeter()
ma20_loss = MovingAverageValueMeter(windowsize=20)

model.train()
for epoch in range(15):
    adjust_learning_rate(optimizer, epoch, 0.001, lr_decay_epoch=10)
    for i in range(len(trainval_dataset)):
        # retrieve a sample directly from the dataset (easy method)
        img, bbox, label = trainval_dataset[i]
        img = img / 255
        loss = model.loss(img, bbox, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_value = loss.cpu().data.numpy()
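# adjust_learning_rate above implements a step decay by hand; the same
# schedule (decay by lr_decay_factor every lr_decay_epoch epochs) can be
# expressed with torch.optim.lr_scheduler.StepLR, as sketched below.  The
# SGD hyper-parameters here are placeholders.
import torch
from torch.optim.lr_scheduler import StepLR

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)  # 10x decay every 10 epochs

for epoch in range(15):
    optimizer.step()   # stands in for one epoch of training
    scheduler.step()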
def main(): parser = argparse.ArgumentParser(description="Train the SharpNet network") parser.add_argument('--dataset', '-d', dest='dataset', help='Name of the dataset (MLT, NYUv2 or pix3d)') parser.add_argument('--exp_name', dest='experiment_name', help='Custom name of the experiment', type=str, default=None) parser.add_argument('--batch-size', '-b', dest='batch_size', type=int, default=3, help='Batch size') parser.add_argument('--iter-size', dest='iter_size', type=int, default=3, help='Iteration size (for accumulated gradients)') parser.add_argument('--boundary', action='store_true', help='Use boundary decoder') parser.add_argument('--normals', action='store_true', help='Use normals decoder') parser.add_argument('--depth', action='store_true', help='Use depth decoder') parser.add_argument('--consensus', dest='geo_consensus', action='store_true') parser.add_argument('--freeze', dest='decoder_freeze', default='', type=str, help='Decoders to freeze (comma seperated)') parser.add_argument('--verbose', action='store_true', help='Activate to display loss components terms') parser.add_argument('--rootdir', '-r', dest='root_dir', default='', help='Root Directory of the dataset') parser.add_argument( '--nocuda', action="store_true", help='Use flag to use on CPU only (currently not supported)') parser.add_argument('--lr', dest='learning_rate', type=float, default=1e-5, help='Initial learning rate') parser.add_argument('--lr-mode', dest='lr_mode', default='poly', help='Learning rate decay mode') parser.add_argument('--max-epoch', dest='max_epoch', type=int, default=1000, help='MAXITER') parser.add_argument('--step', '-s', dest='gradient_step', default=5e-2, help='gradient step') parser.add_argument('--cuda', dest='cuda_device', default="0", help='CUDA device ID') parser.add_argument('--cpu', dest='num_workers', default=4) parser.add_argument('--pretrained-model', dest='pretrained_model', default=None, help="Choose a model to fine tune") parser.add_argument('--start_epoch', dest='start_epoch', default=0, type=int, help="Starting epoch") parser.add_argument('--bias', action="store_true", help="Flag to learn bias in decoder convnet") parser.add_argument('--optimizer', dest='optimizer', default='SGD', type=str, help="Optimizer type: SGD / Adam") parser.add_argument('--decay', dest='decay', default=5e-5, type=float, help="Weight decay rate") args = parser.parse_args() os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device) cuda = False if args.nocuda else True resnet50_url = 'https://download.pytorch.org/models/resnet50-19c8e357.pth' cuda = cuda and torch.cuda.is_available() device = torch.device("cuda" if cuda else "cpu") if cuda: current_device = torch.cuda.current_device() print("Running on " + torch.cuda.get_device_name(current_device)) else: print("Running on CPU") now = datetime.datetime.now() date_str = now.strftime("%d-%m-%Y_%H-%M") t = [] torch.manual_seed(329) bias = True if args.bias else False # build model model = SharpNet(ResBlock, [3, 4, 6, 3], [2, 2, 2, 2, 2], use_normals=True if args.normals else False, use_depth=True if args.depth else False, use_boundary=True if args.boundary else False, bias_decoder=bias) model_dict = model.state_dict() # Load pretrained weights resnet_path = 'models/resnet50-19c8e357.pth' if not os.path.exists(resnet_path): command = 'wget ' + resnet50_url + ' && mkdir models/ && mv resnet50-19c8e357.pth models/' os.system(command) resnet50_dict = torch.load(resnet_path) resnet_dict = { k.replace('.', '_img.', 1): v for k, v in resnet50_dict.items() if 
k.replace('.', '_img.', 1) in model_dict } # load weights up to pool if args.pretrained_model is not None: model_path = args.pretrained_model tmp_dict = torch.load(model_path) if args.depth: pretrained_dict = { k: v for k, v in tmp_dict.items() if k in model_dict } else: pretrained_dict = { k: v for k, v in tmp_dict.items() if (k in model_dict and not k.startswith('depth_decoder')) } else: pretrained_dict = resnet_dict try: model_dict.update(pretrained_dict) model.load_state_dict(model_dict) print('Successfully loaded pretrained ResNet weights') except: print('Could not load the pretrained model weights') sys.exit(0) model.to(device) model.zero_grad() model.train() freeze_decoders = args.decoder_freeze.split(',') freeze_model_decoders(model, freeze_decoders) if args.dataset != 'NYU': sharpnet_loss = SharpNetLoss( lamb=0.5, mu=1.0, use_depth=True if args.depth else False, use_boundary=True if args.boundary else False, use_normals=True if args.normals else False, use_geo_consensus=True if args.geo_consensus else False) else: sharpnet_loss = SharpNetLoss( lamb=0.5, mu=1.0, use_depth=True if args.depth else False, use_boundary=False, use_normals=False, use_geo_consensus=True if args.geo_consensus else False) if args.optimizer == 'SGD': optimizer = SGD(params=get_params(model), lr=args.learning_rate, weight_decay=args.decay, momentum=0.9) elif args.optimizer == 'Adam': optimizer = Adam(params=get_params(model), lr=args.learning_rate, weight_decay=args.decay) else: print( 'Could not configure the optimizer, please select --optimizer Adam or SGD' ) sys.exit(0) # TensorBoard Logger train_loss_meter = MovingAverageValueMeter(20) val_loss_meter = MovingAverageValueMeter(3) depth_loss_meter = MovingAverageValueMeter(3) if args.depth else None normals_loss_meter = MovingAverageValueMeter( 3) if args.normals and args.dataset != 'NYU' else None grad_loss_meter = MovingAverageValueMeter(3) if args.depth else None boundary_loss_meter = MovingAverageValueMeter( 3) if args.boundary and args.dataset != 'NYU' else None consensus_loss_meter = MovingAverageValueMeter( 3) if args.geo_consensus else None exp_name = args.experiment_name if args.experiment_name is not None else '' print('Experiment Name: {}'.format(exp_name)) log_dir = os.path.join('logs', 'Joint', str(exp_name) + '_' + date_str) cp_dir = os.path.join('checkpoints', 'Joint', str(exp_name) + '_' + date_str) print('Checkpoint Directory: {}'.format(cp_dir)) train_writer = SummaryWriter(os.path.join(log_dir, 'train')) val_writer = SummaryWriter(os.path.join(log_dir, 'val')) if not os.path.exists(cp_dir): os.makedirs(cp_dir) if not os.path.exists(log_dir): os.makedirs(os.path.join(log_dir, 'train')) os.makedirs(os.path.join(log_dir, 'val')) train_dataloader, val_dataloader = get_trainval_splits(args) for epoch in range(args.max_epoch): if args.optimizer == 'SGD': adjust_learning_rate(args.learning_rate, args.lr_mode, args.gradient_step, args.max_epoch, optimizer, epoch) train_epoch(train_dataloader, val_dataloader, model, sharpnet_loss, optimizer, args.start_epoch + epoch, train_writer, val_writer, train_loss_meter, val_loss_meter, depth_loss_meter, grad_loss_meter, normals_loss_meter, date_str=date_str, model_save_path=cp_dir, args=args, boundary_loss_meter=boundary_loss_meter, consensus_loss_meter=consensus_loss_meter) # Save a model if epoch % 2 == 0 and epoch > int(0.9 * args.max_epoch): torch.save( model.state_dict(), os.path.join( cp_dir, 'checkpoint_{}_final.pth'.format(args.start_epoch + epoch)), ) elif epoch % 10 == 0: torch.save( 
                model.state_dict(),
                os.path.join(
                    cp_dir,
                    'checkpoint_{}_final.pth'.format(args.start_epoch + epoch)),
            )

    torch.save(
        model.state_dict(),
        os.path.join(
            cp_dir,
            'checkpoint_{}_final.pth'.format(args.start_epoch + args.max_epoch)),
    )

    return None
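# The ResNet-50 weights above are loaded partially: keys are renamed,
# filtered against the target model's state_dict, and merged before calling
# load_state_dict.  The general pattern, stripped of SharpNet specifics and
# using a stand-in torchvision model, is sketched below.
import torchvision

model = torchvision.models.resnet18()                     # target network
pretrained = torchvision.models.resnet18().state_dict()   # stand-in weights

model_dict = model.state_dict()
# keep only tensors whose key and shape match the target
filtered = {k: v for k, v in pretrained.items()
            if k in model_dict and v.shape == model_dict[k].shape}
model_dict.update(filtered)
model.load_state_dict(model_dict)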
def train(config, cuda): # Auto-tune cuDNN torch.backends.cudnn.benchmark = True # Configuration device = get_device(cuda) CONFIG = Dict(yaml.load(open(config))) # Dataset 10k or 164k dataset = get_dataset(CONFIG.DATASET.NAME)( root=CONFIG.DATASET.ROOT, split=CONFIG.DATASET.SPLIT.TRAIN, base_size=CONFIG.IMAGE.SIZE.TRAIN.BASE, crop_size=CONFIG.IMAGE.SIZE.TRAIN.CROP, mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), warp=CONFIG.DATASET.WARP_IMAGE, scale=CONFIG.DATASET.SCALES, flip=True, ) # DataLoader loader = torch.utils.data.DataLoader( dataset=dataset, batch_size=CONFIG.SOLVER.BATCH_SIZE.TRAIN, num_workers=CONFIG.DATALOADER.NUM_WORKERS, shuffle=True, ) loader_iter = iter(loader) # Model model = setup_model(CONFIG.MODEL.INIT_MODEL, CONFIG.DATASET.N_CLASSES, train=True) model.to(device) # Optimizer optimizer = torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[ { "params": get_params(model.module, key="1x"), "lr": CONFIG.SOLVER.LR, "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.SOLVER.LR, "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.SOLVER.LR, "weight_decay": 0.0, }, ], momentum=CONFIG.SOLVER.MOMENTUM, ) # Learning rate scheduler scheduler = PolynomialLR( optimizer=optimizer, step_size=CONFIG.SOLVER.LR_DECAY, iter_max=CONFIG.SOLVER.ITER_MAX, power=CONFIG.SOLVER.POLY_POWER, ) # Loss definition criterion = nn.CrossEntropyLoss(ignore_index=CONFIG.DATASET.IGNORE_LABEL) criterion.to(device) # TensorBoard logger writer = SummaryWriter(CONFIG.SOLVER.LOG_DIR) average_loss = MovingAverageValueMeter(CONFIG.SOLVER.AVERAGE_LOSS) # Freeze the batch norm pre-trained on COCO model.train() model.module.base.freeze_bn() for iteration in tqdm( range(1, CONFIG.SOLVER.ITER_MAX + 1), total=CONFIG.SOLVER.ITER_MAX, leave=False, dynamic_ncols=True, ): # Clear gradients (ready to accumulate) optimizer.zero_grad() loss = 0 for _ in range(CONFIG.SOLVER.ITER_SIZE): try: images, labels = next(loader_iter) except: loader_iter = iter(loader) images, labels = next(loader_iter) images = images.to(device) labels = labels.to(device) # Propagate forward logits = model(images) # Loss iter_loss = 0 for logit in logits: # Resize labels for {100%, 75%, 50%, Max} logits _, _, H, W = logit.shape labels_ = resize_labels(labels, shape=(H, W)) iter_loss += criterion(logit, labels_) # Backpropagate (just compute gradients wrt the loss) iter_loss /= CONFIG.SOLVER.ITER_SIZE iter_loss.backward() loss += float(iter_loss) average_loss.add(loss) # Update weights with accumulated gradients optimizer.step() # Update learning rate scheduler.step(epoch=iteration) # TensorBoard if iteration % CONFIG.SOLVER.ITER_TB == 0: writer.add_scalar("loss/train", average_loss.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): writer.add_scalar("lr/group{}".format(i), o["lr"], iteration) if False: # This produces a large log file for name, param in model.named_parameters(): name = name.replace(".", "/") # Weight/gradient distribution writer.add_histogram(name, param, iteration, bins="auto") if param.requires_grad: writer.add_histogram(name + "/grad", param.grad, iteration, bins="auto") # Save a model if iteration % CONFIG.SOLVER.ITER_SAVE == 0: torch.save( model.module.state_dict(), osp.join(CONFIG.MODEL.SAVE_DIR, "checkpoint_{}.pth".format(iteration)), ) # To verify progress separately torch.save( model.module.state_dict(), osp.join(CONFIG.MODEL.SAVE_DIR, 
"checkpoint_current.pth"), ) torch.save( model.module.state_dict(), osp.join(CONFIG.MODEL.SAVE_DIR, "checkpoint_final.pth"), )
def train(): """Create the model and start the training.""" # === 1.Configuration print(CONFIG_PATH) # === select which GPU you want to use # === here assume to use 8 GPUs, idx are 0,1,2,3,...,7 os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, CONFIG.EXP.GPU_IDX)) device = get_device(torch.cuda.is_available()) cudnn.benchmark = True comment_init = "" writer = SummaryWriter(comment=comment_init) # Setup loss logger # === MovingAverageValueMeter(self,windowsize) # === - add(value): 记录value # === - reset() # === - value() : 返回MA和标准差 average_loss = MovingAverageValueMeter(CONFIG.SOLVER.AVERAGE_LOSS) if not os.path.exists(CONFIG.MODEL.SAVE_PATH): os.makedirs(CONFIG.MODEL.SAVE_PATH) # Path to save models checkpoint_dir = os.path.join( CONFIG.EXP.OUTPUT_DIR, # ./data "models", CONFIG.MODEL.NAME.lower(), # DeepLabV2_ResNet101_MSC CONFIG.DATASET.SPLIT.TRAIN, # train_aug ) # === checkpoint_dir: ./data/DeepLabV2_ResNet101_MSC/train_aug if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) print("Checkpoint dst:", checkpoint_dir) # === 2.Dataloader === trainloader = data.DataLoader( VOCDataSet( CONFIG.DATASET.DIRECTORY, CONFIG.DATASET.LIST_PATH, max_iters=CONFIG.SOLVER.ITER_MAX * CONFIG.SOLVER.BATCH_SIZE.TRAIN, crop_size=(CONFIG.IMAGE.SIZE.TRAIN, CONFIG.IMAGE.SIZE.TRAIN), scale=CONFIG.DATASET.RANDOM.SCALE, mirror=CONFIG.DATASET.RANDOM.MIRROR, mean=IMG_MEAN, label_path=CONFIG.DATASET.SEG_LABEL), # for training batch_size=CONFIG.SOLVER.BATCH_SIZE.TRAIN, shuffle=True, num_workers=CONFIG.DATALOADER.NUM_WORKERS, pin_memory=True) # 使用iter(dataloader)返回的是一个迭代器,可以使用next访问 # loader_iter = iter(trainloader) # === 3.Create network & weights === print("Model:", CONFIG.MODEL.NAME) # model = DeepLabV2_ResNet101_MSC(n_classes=CONFIG.DATASET.N_CLASSES) model = DeepLabV2_DRN105_MSC(n_classes=CONFIG.DATASET.N_CLASSES) state_dict = torch.load(CONFIG.MODEL.INIT_MODEL) # model.base.load_state_dict(state_dict, strict=False) # to skip ASPP print(" Init:", CONFIG.MODEL.INIT_MODEL) # === show the skip weight for m in model.base.state_dict().keys(): if m not in state_dict.keys(): print(" Skip init:", m) # === DeepLabv2 = Res101+ASPP # === model.base = DeepLabv2 # === model = MSC(DeepLabv2) # model.base.load_state_dict(state_dict, # strict=False) # strict=False to skip ASPP model = nn.DataParallel(model) # multi-GPU model.to(device) # put in GPU is available # === 4.Loss definition criterion = nn.CrossEntropyLoss(ignore_index=CONFIG.DATASET.IGNORE_LABEL) criterion.to(device) # put in GPU is available # === 5.optimizer === optimizer = torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[ { "params": get_params(model.module, key="1x"), "lr": CONFIG.SOLVER.LR, "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.SOLVER.LR, "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.SOLVER.LR, "weight_decay": 0.0, }, ], momentum=CONFIG.SOLVER.MOMENTUM, ) # Learning rate scheduler scheduler = PolynomialLR( optimizer=optimizer, step_size=CONFIG.SOLVER.LR_DECAY, iter_max=CONFIG.SOLVER.ITER_MAX, power=CONFIG.SOLVER.POLY_POWER, ) time_start = time.time() # set start time # === training iteration === for i_iter, batch in enumerate(trainloader, start=1): torch.set_grad_enabled(True) model.train() model.module.base.freeze_bn() optimizer.zero_grad() images, labels, _, _ = batch logits = model(images.to(device)) # <<<<<<<<<<<<<<<<<<<< # === Loss # === logits = [logits] + logits_pyramid + 
        # [logits_max]
        iter_loss = 0
        loss = 0
        for logit in logits:
            # Resize labels for {100%, 75%, 50%, Max} logits
            _, _, H, W = logit.shape
            labels_ = resize_labels(labels, size=(H, W))
            iter_loss += criterion(logit, labels_.to(device))

        # iter_loss /= CONFIG.SOLVER.ITER_SIZE
        iter_loss /= 4
        iter_loss.backward()
        loss += float(iter_loss)
        average_loss.add(loss)

        # Update weights with accumulated gradients
        optimizer.step()

        # Update learning rate
        scheduler.step(epoch=i_iter)

        # TensorBoard
        writer.add_scalar("loss", average_loss.value()[0], global_step=i_iter)
        print('iter/max_iter = [{}/{}] completed, loss = {:4.3} time:{}'.format(
            i_iter, CONFIG.SOLVER.ITER_MAX,
            average_loss.value()[0], show_timing(time_start, time.time())))
        # print('iter = ', i_iter, 'of', args.num_steps, '',
        #       loss.data.cpu().numpy())

        # === save final model
        if i_iter >= CONFIG.SOLVER.ITER_MAX:
            print('save final model as...{}'.format(
                osp.join(CONFIG.MODEL.SAVE_PATH,
                         'VOC12_' + str(CONFIG.SOLVER.ITER_MAX) + '.pth')))
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.MODEL.SAVE_PATH,
                         'VOC12_' + str(CONFIG.SOLVER.ITER_MAX) + '.pth'))
            break

        if i_iter % CONFIG.EXP.EVALUATE_ITER == 0:
            print("Evaluation....")
            evaluate_gpu(model, writer, i_iter)

        # === Save model every 250 iterations =========================
        # because DataParallel adds 'module.' to each layer name,
        # save model.module.state_dict() instead of model.state_dict()
        # =============================================================
        if i_iter % CONFIG.MODEL.SAVE_EVERY_ITER == 0:
            print('saving model ...')
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.MODEL.SAVE_PATH, 'VOC12_{}.pth'.format(i_iter)))
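# resize_labels above shrinks the ground-truth maps to each logit resolution
# so that the multi-scale losses can be summed.  A plausible
# nearest-neighbour implementation is sketched below; the real helper in
# this code base may differ (for example, it may resize via PIL instead of
# F.interpolate).
import torch
import torch.nn.functional as F


def resize_labels_nn(labels, size):
    """labels: LongTensor (N, H, W) -> LongTensor (N, size[0], size[1])."""
    labels = labels.unsqueeze(1).float()                    # (N, 1, H, W)
    labels = F.interpolate(labels, size=size, mode="nearest")
    return labels.squeeze(1).long()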
def train(**kwargs): # first free all GPU memory t.cuda.empty_cache() """ Get options """ opt = Config() print_options(opt) # overwrite options from commandline for k_, v_ in kwargs.items(): setattr(opt, k_, v_) device = t.device('cuda') if opt.gpu else t.device('cpu') # TODO: visualization """ Dataset """ dataset = create_dataset(opt) dataset_size = len(dataset) iter_per_epoch = int(dataset_size / opt.batch_size) print(f'loaded {dataset_size} images for training') """ Create Network Instances """ model_names = ['netG_x', 'netG_y', 'netD_x', 'netD_y'] netG_x = ResnetGenerator(opt) netG_y = ResnetGenerator(opt) # print(netG_x) netD_x = NLayerDiscriminator(opt) netD_y = NLayerDiscriminator(opt) # print(netD_x) if opt.gpu: netG_x.to(device) summary(netG_x, input_size=(3, opt.crop_size, opt.crop_size)) netG_y.to(device) netD_x.to(device) summary(netD_x, input_size=(3, opt.crop_size, opt.crop_size)) netD_y.to(device) """ Define optimizer and Loss """ optimizer_g = t.optim.Adam(itertools.chain(netG_x.parameters(), netG_y.parameters()), lr=opt.g_lr, betas=(opt.beta1, 0.999)) optimizer_d = t.optim.Adam(itertools.chain(netD_x.parameters(), netD_y.parameters()), lr=opt.d_lr, betas=(opt.beta1, 0.999)) optimizers = [optimizer_g, optimizer_d] """ Forward cycle loss: lambda_A * ||G_B(G_A(A)) - A|| (Eqn. (2) in the paper) Backward cycle loss: lambda_B * ||G_A(G_B(B)) - B|| (Eqn. (2) in the paper) Identity loss (optional): lambda_identity * (||G_A(B) - B|| * lambda_B + ||G_B(A) - A|| * lambda_A) (Sec 5.2 "Photo generation from paintings" in the paper) """ lambda_X = 10.0 # weight for cycle loss (A -> B -> A^) lambda_Y = 10.0 # weight for cycle loss (B -> A -> B^) lambda_identity = 0.5 # 定义 GAN 损失,define GAN loss. # it's a MSELoss() when initialized, only calculate later during iteration # criterionGAN = nn.MSELoss().to(device) criterionGAN = GANLoss(gan_mode='lsgan') # cycle loss criterionCycle = nn.L1Loss() # identical loss criterionIdt = nn.L1Loss() # loss meters loss_X_meter = MovingAverageValueMeter(opt.plot_every) loss_Y_meter = MovingAverageValueMeter(opt.plot_every) score_Dx_real_y = MovingAverageValueMeter(opt.plot_every) score_Dx_fake_y = MovingAverageValueMeter(opt.plot_every) losses = {} scores = {} """ use identity mapping. Setting lambda_identity other than 0 has an effect of scaling the weight of the identity mapping loss. For example, if the weight of the identity loss should be 10 times smaller than the weight of the reconstruction loss, please set lambda_identity = 0.1 """ for epoch in range(opt.max_epochs): epoch_start_time = time.time() """ calculate losses, gradients, and update network weights; called in every iteration """ for i, data in enumerate(dataset): real_x = data['A'].to(device) real_y = data['B'].to(device) ###################### # X -> Y' -> X^ cycle ###################### optimizer_g.zero_grad() # set g_x and g_y gradients to zero fake_y = netG_x(real_x) # X -> Y' prediction = netD_x(fake_y) #netD_x provide feedback to netG_x loss_G_X = criterionGAN(prediction, True) # cycle_consistance x_hat = netG_y(fake_y) # Y' -> X^ # Forward cycle loss x^ = || G_y(G_x(real_x)) || loss_cycle_X = criterionCycle(x_hat, real_x) * lambda_X # identity loss if lambda_identity > 0: # netG_x should be identity if real_y is fed: ||netG_x(real_y) - real_y|| idt_x = netG_x(real_y) loss_idt_x = criterionIdt(idt_x, real_y) * lambda_Y * lambda_identity else: loss_idt_x = 0. 
loss_X = loss_G_X + loss_cycle_X + loss_idt_x loss_X.backward(retain_graph=True) optimizer_g.step() loss_X_meter.add(loss_X.item()) ###################### # Y -> X' -> Y^ cycle ###################### optimizer_g.zero_grad() # set g_x and g_y gradients to zero fake_x = netG_y(real_y) # Y -> X' prediction = netD_y(fake_x) loss_G_Y = criterionGAN(prediction, True) # print(f'loss_G_Y = {round(float(loss_G_Y), 3)}') y_hat = netG_x(fake_x) # Y -> X' -> Y^ # Forward cycle loss y^ = || G_x(G_y(real_y)) || loss_cycle_Y = criterionCycle(y_hat, real_y) * lambda_Y # identity loss if lambda_identity > 0: # netG_y should be identiy if real_x is fed: ||netG_y(real_x) - real_x|| idt_y = netG_y(real_x) loss_idt_y = criterionIdt(idt_y, real_x) * lambda_X * lambda_identity else: loss_idt_y = 0. loss_Y = loss_G_Y + loss_cycle_Y + loss_idt_y loss_Y.backward(retain_graph=True) optimizer_g.step() loss_Y_meter.add(loss_Y.item()) ###################### # netD_x ###################### optimizer_d.zero_grad() # loss_real pred_real = netD_x(real_y) loss_D_x_real = criterionGAN(pred_real, True) score_Dx_real_y.add(float(pred_real.data.mean())) # loss_fake pred_fake = netD_x(fake_y) loss_D_x_fake = criterionGAN(pred_fake, False) score_Dx_fake_y.add(float(pred_fake.data.mean())) # loss and backward loss_D_x = (loss_D_x_real + loss_D_x_fake) * 0.5 loss_D_x.backward() optimizer_d.step() ###################### # netD_y ###################### optimizer_d.zero_grad() # loss_real pred_real = netD_y(real_x) loss_D_y_real = criterionGAN(pred_real, True) # loss_fake pred_fake = netD_y(fake_x) loss_D_y_fake = criterionGAN(pred_fake, False) # loss and backward loss_D_y = (loss_D_y_real + loss_D_y_fake) * 0.5 loss_D_y.backward() optimizer_d.step() # save snapshot if i % opt.plot_every == 0: filename = opt.name + '_snap_%03d_%05d.png' % ( epoch, i, ) test_path = os.path.join(opt.checkpoint_path, filename) tv.utils.save_image(fake_y, test_path, normalize=True) print(f'{filename} saved.') losses['loss_X'] = loss_X_meter.value()[0] losses['loss_Y'] = loss_Y_meter.value()[0] scores['score_Dx_real_y'] = score_Dx_real_y.value()[0] scores['score_Dx_fake_y'] = score_Dx_fake_y.value()[0] print(losses) print(scores) # print(f'iteration {i} finished') # save model if epoch % opt.save_every == 0 or epoch == opt.max_epochs - 1: save_filename = f'{opt.name}_netG_{epoch}.pth' save_filepath = os.path.join(opt.model_path, save_filename) t.save(netG_x.state_dict(), save_filepath) print(f'model saved as {save_filename}') # epoch end logs epoech_time = int(time.time() - epoch_start_time) print_options(opt, epoch_log=True, epoch=epoch, time=epoech_time, losses=losses, scores=scores) print()
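# criterionGAN above is GANLoss(gan_mode='lsgan'), i.e. a least-squares GAN
# loss: an MSE against constant real/fake targets.  A minimal stand-in with
# the same calling convention is sketched here; the project's own GANLoss
# class may also support other modes such as 'vanilla' or 'wgangp'.
import torch
import torch.nn as nn


class LSGANLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.loss = nn.MSELoss()

    def forward(self, prediction, target_is_real):
        target = (torch.ones_like(prediction) if target_is_real
                  else torch.zeros_like(prediction))
        return self.loss(prediction, target)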
def main(config, cuda): # Configuration with open(config) as f: CONFIG = yaml.load(f) cuda = cuda and torch.cuda.is_available() # Dataset dataset = get_dataset(CONFIG['DATASET'])( root=CONFIG['ROOT'], split='train', image_size=(CONFIG['IMAGE']['SIZE']['TRAIN'], CONFIG['IMAGE']['SIZE']['TRAIN']), scale=True, flip=True, # preload=True ) # DataLoader loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=CONFIG['BATCH_SIZE'], num_workers=CONFIG['NUM_WORKERS'], shuffle=True) loader_iter = iter(loader) # Model model = DeepLabV2_ResNet101_MSC(n_classes=CONFIG['N_CLASSES']) state_dict = torch.load(CONFIG['INIT_MODEL']) model.load_state_dict(state_dict, strict=False) # Skip "aspp" layer if cuda: model.cuda() # Optimizer optimizer = { 'sgd': torch.optim.SGD( params=[ { 'params': get_1x_lr_params(model), 'lr': float(CONFIG['LR']) }, { 'params': get_10x_lr_params(model), 'lr': 10 * float(CONFIG['LR']) } # NOQA ], lr=float(CONFIG['LR']), momentum=float(CONFIG['MOMENTUM']), weight_decay=float(CONFIG['WEIGHT_DECAY'])), }.get(CONFIG['OPTIMIZER']) # Loss definition criterion = CrossEntropyLoss2d(ignore_index=CONFIG['IGNORE_LABEL']) if cuda: criterion.cuda() # TensorBoard Logger writer = SummaryWriter(CONFIG['LOG_DIR']) loss_meter = MovingAverageValueMeter(20) model.train() for iteration in tqdm(range(1, CONFIG['ITER_MAX'] + 1), total=CONFIG['ITER_MAX'], leave=False, dynamic_ncols=True): # Polynomial lr decay poly_lr_scheduler(optimizer=optimizer, init_lr=float(CONFIG['LR']), iter=iteration - 1, lr_decay_iter=CONFIG['LR_DECAY'], max_iter=CONFIG['ITER_MAX'], power=CONFIG['POLY_POWER']) optimizer.zero_grad() iter_loss = 0 for i in range(1, CONFIG['ITER_SIZE'] + 1): data, target = next(loader_iter) # Image data = data.cuda() if cuda else data data = Variable(data) # Forward propagation outputs = model(data) # Label target = resize_target(target, outputs[0].size(2)) target = target.cuda() if cuda else target target = Variable(target) # Aggregate losses for [100%, 75%, 50%, Max] loss = 0 for output in outputs: loss += criterion(output, target) loss /= CONFIG['ITER_SIZE'] iter_loss += loss.data[0] loss.backward() # Reload dataloader if ((iteration - 1) * CONFIG['ITER_SIZE'] + i) % len(loader) == 0: loader_iter = iter(loader) loss_meter.add(iter_loss) # Back propagation optimizer.step() # TensorBoard if iteration % CONFIG['ITER_TF'] == 0: writer.add_scalar('train_loss', loss_meter.value()[0], iteration) # Save a model if iteration % CONFIG['ITER_SNAP'] == 0: torch.save( model.state_dict(), osp.join(CONFIG['SAVE_DIR'], 'checkpoint_{}.pth.tar'.format(iteration))) # NOQA writer.add_text('log', 'Saved a model', iteration) torch.save(model.state_dict(), osp.join(CONFIG['SAVE_DIR'], 'checkpoint_final.pth.tar'))
def train(config_path, cuda): """ Training DeepLab by v2 protocol """ # Configuration CONFIG = Dict(yaml.load(config_path)) device = get_device(cuda) torch.backends.cudnn.benchmark = True # Dataset dataset = get_dataset(CONFIG.DATASET.NAME)( root=CONFIG.DATASET.ROOT, split=CONFIG.DATASET.SPLIT.TRAIN, ignore_label=CONFIG.DATASET.IGNORE_LABEL, mean_bgr=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), augment=True, base_size=CONFIG.IMAGE.SIZE.BASE, crop_size=CONFIG.IMAGE.SIZE.TRAIN, scales=CONFIG.DATASET.SCALES, flip=True, ) print(dataset) # DataLoader loader = torch.utils.data.DataLoader( dataset=dataset, batch_size=CONFIG.SOLVER.BATCH_SIZE.TRAIN, num_workers=CONFIG.DATALOADER.NUM_WORKERS, shuffle=True, ) loader_iter = iter(loader) # Model check print("Model:", CONFIG.MODEL.NAME) assert ( CONFIG.MODEL.NAME == "DeepLabV2_ResNet101_MSC" ), 'Currently support only "DeepLabV2_ResNet101_MSC"' # Model setup model = eval(CONFIG.MODEL.NAME)(n_classes=CONFIG.DATASET.N_CLASSES) state_dict = torch.load(CONFIG.MODEL.INIT_MODEL) print(" Init:", CONFIG.MODEL.INIT_MODEL) for m in model.base.state_dict().keys(): if m not in state_dict.keys(): print(" Skip init:", m) model.base.load_state_dict(state_dict, strict=False) # to skip ASPP model = nn.DataParallel(model) model.to(device) # Loss definition criterion = nn.CrossEntropyLoss(ignore_index=CONFIG.DATASET.IGNORE_LABEL) criterion.to(device) # Optimizer optimizer = torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[ { "params": get_params(model.module, key="1x"), "lr": CONFIG.SOLVER.LR, "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.SOLVER.LR, "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.SOLVER.LR, "weight_decay": 0.0, }, ], momentum=CONFIG.SOLVER.MOMENTUM, ) # Learning rate scheduler scheduler = PolynomialLR( optimizer=optimizer, step_size=CONFIG.SOLVER.LR_DECAY, iter_max=CONFIG.SOLVER.ITER_MAX, power=CONFIG.SOLVER.POLY_POWER, ) # Setup loss logger writer = SummaryWriter(os.path.join(CONFIG.EXP.OUTPUT_DIR, "logs", CONFIG.EXP.ID)) average_loss = MovingAverageValueMeter(CONFIG.SOLVER.AVERAGE_LOSS) # Path to save models checkpoint_dir = os.path.join( CONFIG.EXP.OUTPUT_DIR, "models", CONFIG.EXP.ID, CONFIG.MODEL.NAME.lower(), CONFIG.DATASET.SPLIT.TRAIN, ) makedirs(checkpoint_dir) print("Checkpoint dst:", checkpoint_dir) # Freeze the batch norm pre-trained on COCO model.train() model.module.base.freeze_bn() for iteration in tqdm( range(1, CONFIG.SOLVER.ITER_MAX + 1), total=CONFIG.SOLVER.ITER_MAX, dynamic_ncols=True, ): # Clear gradients (ready to accumulate) optimizer.zero_grad() loss = 0 for _ in range(CONFIG.SOLVER.ITER_SIZE): try: _, images, labels = next(loader_iter) except: loader_iter = iter(loader) _, images, labels = next(loader_iter) # Propagate forward logits = model(images.to(device)) # Loss iter_loss = 0 for logit in logits: # Resize labels for {100%, 75%, 50%, Max} logits _, _, H, W = logit.shape labels_ = resize_labels(labels, size=(H, W)) iter_loss += criterion(logit, labels_.to(device)) # Propagate backward (just compute gradients wrt the loss) iter_loss /= CONFIG.SOLVER.ITER_SIZE iter_loss.backward() loss += float(iter_loss) #print(loss) average_loss.add(loss) # Update weights with accumulated gradients optimizer.step() # Update learning rate scheduler.step(epoch=iteration) # TensorBoard if iteration % CONFIG.SOLVER.ITER_TB == 0: writer.add_scalar("loss/train", 
                              average_loss.value()[0], iteration)
            for i, o in enumerate(optimizer.param_groups):
                writer.add_scalar("lr/group_{}".format(i), o["lr"], iteration)
            for i in range(torch.cuda.device_count()):
                writer.add_scalar(
                    "gpu/device_{}/memory_cached".format(i),
                    torch.cuda.memory_cached(i) / 1024 ** 3,
                    iteration,
                )

            if False:
                for name, param in model.module.base.named_parameters():
                    name = name.replace(".", "/")
                    # Weight/gradient distribution
                    writer.add_histogram(name, param, iteration, bins="auto")
                    if param.requires_grad:
                        writer.add_histogram(
                            name + "/grad", param.grad, iteration, bins="auto"
                        )

        # Save a model
        if iteration % CONFIG.SOLVER.ITER_SAVE == 0:
            torch.save(
                model.module.state_dict(),
                os.path.join(checkpoint_dir, "checkpoint_{}.pth".format(iteration)),
            )

    torch.save(
        model.module.state_dict(),
        os.path.join(checkpoint_dir, "checkpoint_final.pth")
    )
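# The loop above steps a PolynomialLR scheduler every iteration, i.e. the
# "poly" policy lr = base_lr * (1 - iter / iter_max) ** power.  A minimal
# equivalent built on torch.optim.lr_scheduler.LambdaLR is sketched below;
# the step_size handling of the project's scheduler is omitted and the
# hyper-parameter values are placeholders.
import torch
from torch.optim.lr_scheduler import LambdaLR

ITER_MAX, POWER = 20000, 0.9
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=2.5e-4, momentum=0.9)
scheduler = LambdaLR(optimizer,
                     lr_lambda=lambda it: (1 - it / ITER_MAX) ** POWER)

for iteration in range(ITER_MAX):
    optimizer.step()   # stands in for forward / backward on a batch
    scheduler.step()   # lr decays smoothly towards zero at ITER_MAX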
def main(config, cuda): cuda = cuda and torch.cuda.is_available() device = torch.device("cuda" if cuda else "cpu") if cuda: current_device = torch.cuda.current_device() print("Running on", torch.cuda.get_device_name(current_device)) else: print("Running on CPU") # Configuration CONFIG = Dict(yaml.load(open(config))) dataset = get_dataset(CONFIG.DATASET)( data_path=CONFIG.ROOT, crop_size=256, scale=(0.6, 0.8, 1., 1.2, 1.4), rotation=15, flip=True, mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), ) """ # Dataset 10k or 164k dataset = get_dataset(CONFIG.DATASET)( root=CONFIG.ROOT, split=CONFIG.SPLIT.TRAIN, base_size=513, crop_size=CONFIG.IMAGE.SIZE.TRAIN, mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), warp=CONFIG.WARP_IMAGE, scale=(0.5, 0.75, 1.0, 1.25, 1.5), flip=True, ) """ # DataLoader loader = torch.utils.data.DataLoader( dataset=dataset, batch_size=CONFIG.BATCH_SIZE.TRAIN, num_workers=CONFIG.NUM_WORKERS, shuffle=True, ) loader_iter = iter(loader) # Model model = DeepLabV3Plus_ResNet101_MSC(n_classes=CONFIG.N_CLASSES) state_dict = torch.load(CONFIG.INIT_MODEL) model.load_state_dict(state_dict, strict=False) # Skip "aspp" layer model = nn.DataParallel(model) model.to(device) for name, param in model.named_parameters(): if param.requires_grad: print(name) # Optimizer optimizer = torch.optim.Adam( params=get_params(model.module), lr=CONFIG.LR, weight_decay=CONFIG.WEIGHT_DECAY, ) """ # Optimizer optimizer = torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[ { "params": get_params(model.module, key="1x"), "lr": CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.LR, "weight_decay": 0.0, }, ], momentum=CONFIG.MOMENTUM, ) """ # Loss definition criterion = CrossEntropyLoss2d(ignore_index=CONFIG.IGNORE_LABEL) criterion.to(device) max_pooling_loss = MaxPoolingLoss(ratio=0.3, p=1.7, reduce=True) # TensorBoard Logger writer = SummaryWriter(CONFIG.LOG_DIR) loss_meter = MovingAverageValueMeter(20) model.train() model.module.scale.freeze_bn() for iteration in tqdm( range(1, CONFIG.ITER_MAX + 1), total=CONFIG.ITER_MAX, leave=False, dynamic_ncols=True, ): """ # Set a learning rate poly_lr_scheduler( optimizer=optimizer, init_lr=CONFIG.LR, iter=iteration - 1, lr_decay_iter=CONFIG.LR_DECAY, max_iter=CONFIG.ITER_MAX, power=CONFIG.POLY_POWER, ) """ # Clear gradients (ready to accumulate) optimizer.zero_grad() iter_loss = 0 for i in range(1, CONFIG.ITER_SIZE + 1): try: images, labels = next(loader_iter) except: loader_iter = iter(loader) images, labels = next(loader_iter) images = images.to(device) labels = labels.to(device).unsqueeze(1).float() # Propagate forward logits = model(images) # Loss loss = 0 for logit in logits: # Resize labels for {100%, 75%, 50%, Max} logits labels_ = F.interpolate(labels, logit.shape[2:], mode="nearest") labels_ = labels_.squeeze(1).long() # Compute NLL and MPL nll_loss = criterion(logit, labels_) # loss += nll_loss loss += max_pooling_loss(nll_loss) # Backpropagate (just compute gradients wrt the loss) loss /= float(CONFIG.ITER_SIZE) loss.backward() iter_loss += float(loss) loss_meter.add(iter_loss) # Update weights with accumulated gradients optimizer.step() if iteration % CONFIG.ITER_TB == 0: writer.add_scalar("train_loss", loss_meter.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): 
                writer.add_scalar("train_lr_group{}".format(i), o["lr"], iteration)

            gt_viz, images_viz, predicts_viz = make_vizs(
                images, labels_, logits,
                (CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R))
            writer.add_image("gt/images", torch.from_numpy(images_viz[0]), iteration)
            writer.add_image("gt/labels", torch.from_numpy(gt_viz[0]), iteration)
            for i, predict_viz in enumerate(predicts_viz):
                writer.add_image("predict/" + str(i),
                                 torch.from_numpy(predict_viz[0]), iteration)

            if False:  # This produces a large log file
                for name, param in model.named_parameters():
                    name = name.replace(".", "/")
                    writer.add_histogram(name, param, iteration, bins="auto")
                    if param.requires_grad:
                        writer.add_histogram(name + "/grad", param.grad,
                                             iteration, bins="auto")

        # Save a model
        if iteration % CONFIG.ITER_SAVE == 0:
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.SAVE_DIR, "checkpoint_{}.pth".format(iteration)),
            )

        # Save a model (short term)
        if iteration % 100 == 0:
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.SAVE_DIR, "checkpoint_current.pth"),
            )

    torch.save(model.module.state_dict(),
               osp.join(CONFIG.SAVE_DIR, "checkpoint_final.pth"))
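# The ITER_SIZE loop above accumulates gradients over several sub-batches
# before a single optimizer.step(), which emulates a larger effective batch
# under a fixed memory budget.  A self-contained illustration of the pattern
# with a toy model and random data follows.
import torch
import torch.nn as nn

ITER_SIZE = 2
model = nn.Linear(4, 2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

optimizer.zero_grad()
for _ in range(ITER_SIZE):
    images = torch.randn(3, 4)
    labels = torch.randint(0, 2, (3,))
    # Scale each sub-batch loss so the accumulated gradient matches the
    # average over the full effective batch.
    loss = criterion(model(images), labels) / ITER_SIZE
    loss.backward()
optimizer.step()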
class DataManager: def __init__(self, imagedataset, datadir, inputmix, embedding, device): self.imagedataset = imagedataset self.datadir = datadir self.inputmix = inputmix self.embedding = embedding self.device = device def generateSavepath(self, experimentid): # name the savedir, might add logs/ before the datetime for clarity if experimentid is None: savedir = time.strftime('%Y%m%d%H%M%S') else: savedir = experimentid self.savepath = os.path.join('logs', self.imagedataset, savedir) return self.savepath # getter method def get_savepath(self): return self.savepath def generateTB(self, period): self.writer = SummaryWriter(self.savepath + '/runs') self.loss_meter = MovingAverageValueMeter(20) self.tb = period def get_writer(self): return self.writer def createDirectory(self, values, config, args): try: os.makedirs(self.savepath) # print("Log dir:", savepath) except: pass # now join the path in save_screenshot: if os.path.exists(self.savepath + '/libs'): shutil.rmtree(self.savepath + '/libs') shutil.copytree('./libs/', self.savepath + '/libs') shutil.copy2(osp.abspath(inspect.stack()[0][1]), self.savepath) shutil.copy2(config, self.savepath) args_dict = {} for a in args: args_dict[a] = values[a] with open(self.savepath + '/args.json', 'w') as fp: json.dump(args_dict, fp) def loadClasses(self, bkg): self.seen_classes = np.load( self.datadir + '/split/seen_cls.npy') #only the seen classes if bkg: self.seen_classes = np.asarray(np.concatenate( [np.array([0]), self.seen_classes]), dtype=int) #seen classes + bkg self.novel_classes = np.load(self.datadir + '/split/novel_cls.npy') self.all_labels = np.genfromtxt(self.datadir + '/labels_2.txt', delimiter='\t', usecols=1, dtype='str') self.seen_classes = np.asarray(np.concatenate( [self.seen_classes, np.load(self.datadir + '/split/val_cls.npy')]), dtype=int) self.seen_novel_classes = np.concatenate( [self.seen_classes, self.novel_classes]) self.to_ignore_classes = self.novel_classes if self.inputmix == 'seen': self.visible_classes = self.seen_classes else: self.visible_classes = self.seen_novel_classes print("Seen classes: ") print(self.seen_classes) print("all labels: ") print(self.all_labels) return self.seen_classes, self.novel_classes, self.seen_novel_classes, self.to_ignore_classes, self.visible_classes, self.all_labels def get_Classes(self): return self.seen_classes, self.novel_classes, self.seen_novel_classes, self.to_ignore_classes, self.visible_classes, self.all_labels, self.visibility_mask def loadData(self): self.train = np.load(self.datadir + '/split/train_list.npy') self.novelset = [] self.seenset = [] if self.inputmix == 'seen': self.seenset = range(self.train.shape[0]) else: print("inputmix is not seen") exit() return self.train, self.seenset, self.novelset def get_data(self): return self.train, self.seenset, self.novelset def loadDatasets(self, CONFIG, bs): # Sampler sampler = MyDistributedSampler( self.seenset, self.novelset, num_replicas=torch.distributed.get_world_size(), rank=torch.distributed.get_rank()) self.dataset = get_dataset(CONFIG.DATASET)( train=self.train, test=None, root=CONFIG.ROOT, transform=None, split=CONFIG.SPLIT.TRAIN, base_size=513, crop_size=CONFIG.IMAGE.SIZE.TRAIN, mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), warp=CONFIG.WARP_IMAGE, scale=(0.5, 1.5), flip=True, visibility_mask=self.visibility_mask, ) random.seed(42) # DataLoader self.loader = torch.utils.data.DataLoader( dataset=self.dataset, batch_size=bs, num_workers=CONFIG.NUM_WORKERS, # num_workers = 1, sampler=sampler, 
pin_memory=True) return self.dataset, self.loader def get_datasets(self): return self.dataset, self.loader def loadClassEmbs(self): # Word embeddings if self.embedding == 'word2vec': self.class_emb = pickle.load( open(self.datadir + '/word_vectors/word2vec.pkl', "rb")) elif self.embedding == 'fasttext': self.class_emb = pickle.load( open(self.datadir + '/word_vectors/fasttext.pkl', "rb")) elif self.embedding == 'fastnvec': self.class_emb = np.concatenate([ pickle.load( open(self.datadir + '/word_vectors/fasttext.pkl', "rb")), pickle.load( open(self.datadir + '/word_vectors/word2vec.pkl', "rb")) ], axis=1) else: print("invalid emb ", self.embedding) exit() self.class_emb = F.normalize(torch.tensor(self.class_emb), p=2, dim=1).to(self.device) self.seen_class_emb = self.class_emb[self.seen_classes] self.to_ignore_class_emb = self.class_emb[self.to_ignore_classes] return self.class_emb, self.to_ignore_class_emb, self.seen_class_emb def get_clsEmbs(self): return self.class_emb, self.to_ignore_class_emb, self.seen_class_emb def loadClsMaps(self, bkg): self.seen_map = np.array([-1] * 256) for i, n in enumerate(list(self.seen_classes)): self.seen_map[n] = i self.all_map = np.array([-1] * 256) for i, n in enumerate(list(self.seen_classes)): self.all_map[n] = i for i, n in enumerate(self.to_ignore_classes, len(self.seen_classes)): self.all_map[n] = i self.inverse_map = np.array([-1] * 256) for i, n in enumerate(self.all_map): self.inverse_map[n] = i if bkg: for i, n in enumerate(self.to_ignore_classes): self.seen_map[n] = 0 # viene usata per sapere quali predizioni sono unseen e quali no nel calcolo della percentuale self.cls_map_seen = np.array([0] * 256) for i, n in enumerate(self.to_ignore_classes): self.cls_map_seen[n] = 1 self.cls_map = None self.cls_map = np.array([255] * 256) for i, n in enumerate(self.seen_classes): self.cls_map[n] = i # VISIBILITY MASK self.visibility_mask = {} self.visibility_mask[0] = self.seen_map.copy() print(self.visibility_mask[0]) return self.seen_map, self.cls_map_seen, self.cls_map def getClsMaps(self): return self.seen_map, self.cls_map_seen, self.cls_map, self.inverse_map def savePerIteration(self, iter_loss, optimizer, model, iteration, save): self.loss_meter.add(iter_loss) # TensorBoard if iteration % self.tb == 0: self.writer.add_scalar("train_loss", self.loss_meter.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): self.writer.add_scalar("train_lr_group{}".format(i), o["lr"], iteration) # Save a model (short term) if iteration > 0 and iteration % save == 0: print( "\nIteration: {} \nSaving (short term) model (iteration,state_dict,optimizer) ...\n " .format(iteration)) with open(self.savepath + '/iteration.json', 'w') as fp: json.dump({'iteration': iteration}, fp) name = "checkpoint_current.pth.tar" if "voc" in self.savepath or iteration % 5000 == 0: name = "checkpoint_{}.pth.tar".format(iteration) torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, osp.join(self.savepath, name)) def saveFinal(self, optimizer, model): torch.save( { 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, osp.join(self.savepath, "checkpoint_final.pth.tar"))
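# seen_map / all_map above are 256-entry lookup tables that remap dataset
# label ids to contiguous training ids, with -1 (or 255) marking ignored
# classes.  Applying such a table to a label image is a single
# fancy-indexing step, as sketched here with made-up class ids.
import numpy as np

seen_classes = np.array([0, 3, 7])            # example visible class ids
seen_map = np.full(256, -1, dtype=np.int64)   # everything else -> -1 (ignore)
for new_id, old_id in enumerate(seen_classes):
    seen_map[old_id] = new_id

label_img = np.array([[0, 3], [7, 12]])       # raw annotation; class 12 unseen
remapped = seen_map[label_img]                # [[0, 1], [2, -1]]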
        ],
        momentum=CONFIG.SOLVER.MOMENTUM,
    )

    # Learning rate scheduler
    scheduler = PolynomialLR(
        optimizer=optimizer,
        step_size=CONFIG.SOLVER.LR_DECAY,
        iter_max=CONFIG.SOLVER.ITER_MAX,
        power=CONFIG.SOLVER.POLY_POWER,
    )

    # Setup loss logger
    writer = SummaryWriter(os.path.join("experiment", CONFIG.EXP_ID, "summary"))
    average_loss = MovingAverageValueMeter(CONFIG.SOLVER.AVERAGE_LOSS)

    # Path to save models
    checkpoint_dir = os.path.join("experiment", CONFIG.EXP_ID, "checkpoints")
    makedirs(checkpoint_dir)
    print("Checkpoint dst:", checkpoint_dir)

    # Random Dropout
    model.train()

    for iteration in tqdm(
        range(1, CONFIG.SOLVER.ITER_MAX + 1),
        total=CONFIG.SOLVER.ITER_MAX,
        dynamic_ncols=True,
    ):