def data_loaders(model, loss_func, train_dataset, valid_dataset, test_dataset):
    data_transform = transforms.Compose([
        transforms.Resize(model.input_size[1:]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])
    grayscale = model.input_size[0] != 3

    if loss_func.__name__ != 'TripletLoss':
        train_dataset = SiameseNetworkDataset(imageFolderDataset=train_dataset,
                                              transform=data_transform,
                                              grayscale=grayscale)
        valid_dataset = SiameseNetworkDataset(imageFolderDataset=valid_dataset,
                                              transform=data_transform,
                                              grayscale=grayscale)
    else:
        train_dataset = TripletDataset(imageFolderDataset=train_dataset,
                                       transform=data_transform,
                                       grayscale=grayscale)
        valid_dataset = TripletDataset(imageFolderDataset=valid_dataset,
                                       transform=data_transform,
                                       grayscale=grayscale)
    test_dataset = SiameseNetworkDataset(imageFolderDataset=test_dataset,
                                         transform=data_transform,
                                         grayscale=grayscale)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=Config.train_batch_size,
                                               shuffle=True,
                                               num_workers=Config.num_workers)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=Config.valid_batch_size,
                                               shuffle=True,
                                               num_workers=Config.num_workers)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=1,
                                              shuffle=True,
                                              num_workers=Config.num_workers)
    return train_loader, valid_loader, test_loader
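# A usage sketch for data_loaders above. Config's fields, the model's
# input_size attribute of shape (channels, height, width), and the TripletLoss
# class are assumptions inferred from how the function uses them, not
# definitions taken from this code.
import torchvision.datasets as datasets

train_folder = datasets.ImageFolder('data/train')
valid_folder = datasets.ImageFolder('data/valid')
test_folder = datasets.ImageFolder('data/test')

# model: any network exposing input_size = (channels, height, width)
train_loader, valid_loader, test_loader = data_loaders(
    model, TripletLoss, train_folder, valid_folder, test_folder)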
def main():
    """
    Training.
    """
    global start_epoch, epoch, checkpoint

    # Initialize model or load checkpoint
    if checkpoint is None:
        model = UNet(in_channels, out_channels)
        # Initialize the optimizer
        optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, model.parameters()),
                                     lr=lr)
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to default device
    model = model.to(device)
    criterion = nn.L1Loss().to(device)

    # Custom dataloaders
    train_loader = torch.utils.data.DataLoader(TripletDataset(train_folder, crop_size, scale),
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=workers,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(TripletDataset(test_folder, crop_size, scale),
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=workers,
                                              pin_memory=True)

    # Total number of epochs to train for
    epochs = int(iterations // len(train_loader) + 1)

    # Epochs
    for epoch in range(start_epoch, epochs):
        # One epoch's training
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch,
              epochs=epochs)
        test(test_loader=test_loader, model=model, criterion=criterion)

        # Save checkpoint
        torch.save({
            'epoch': epoch,
            'model': model,
            'optimizer': optimizer
        }, f'checkpoints/checkpoint_unet_{epoch}.pth.tar')
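# Note on the checkpoint format above: torch.save stores the whole model and
# optimizer objects, which ties the file to the exact class definitions at
# save time. A state_dict-based sketch of the same checkpoint (the resume
# branch would then need matching load_state_dict calls) could look like:
torch.save({
    'epoch': epoch,
    'model_state': model.state_dict(),
    'optimizer_state': optimizer.state_dict(),
}, f'checkpoints/checkpoint_unet_{epoch}.pth.tar')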
def train(self, config):
    train_dataset = TripletDataset(config['dataset_path'], 'train',
                                   config['data_augmentation_suffixes'])
    train_dataset.prepare(config['num_train_pairs'])
    val_dataset = TripletDataset(config['dataset_path'], 'validation')
    val_dataset.prepare(config['num_val_pairs'])

    train_generator = TripletDataGenerator(train_dataset,
                                           batch_size=config['batch_size'],
                                           dim=self.config['input_shape'],
                                           shuffle=config['shuffle_training_inputs'])
    val_generator = TripletDataGenerator(val_dataset,
                                         batch_size=config['batch_size'],
                                         dim=self.config['input_shape'],
                                         shuffle=config['shuffle_training_inputs'])

    model_path, _ = os.path.split(self.config['model_filename'])
    callbacks = [
        keras.callbacks.TensorBoard(log_dir=self.log_dir,
                                    histogram_freq=0,
                                    write_graph=True,
                                    write_images=False),
        keras.callbacks.ModelCheckpoint(self.checkpoint_path,
                                        verbose=0,
                                        save_weights_only=True)
    ]

    self.keras_model.compile(loss=utils.l2_loss,
                             optimizer=Adam(lr=config['learning_rate']))
    self.keras_model.fit_generator(generator=train_generator,
                                   validation_data=val_generator,
                                   epochs=config['epochs'],
                                   use_multiprocessing=True,
                                   callbacks=callbacks,
                                   workers=multiprocessing.cpu_count())
    self.keras_model.save(self.config['model_filename'])
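# utils.l2_loss is not shown above; a plausible minimal definition, assuming a
# plain squared-L2 distance between the generator's targets and the model's
# outputs, might look like this (an assumption, not the repo's actual loss):
from keras import backend as K

def l2_loss(y_true, y_pred):
    return K.sum(K.square(y_true - y_pred), axis=-1)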
def __init__(self, path, transform, num_triplets, batchsize, resolution):
    self.path = path
    self.batchsize = batchsize
    self.num_workers = 4
    self.transform = transform
    self.resolution = resolution
    self.num_triplets = num_triplets
    self.dataset = TripletDataset(self.path,
                                  self.transform,
                                  num_triplets=self.num_triplets,
                                  resolution=self.resolution)
    self.dataloader = DataLoader(dataset=self.dataset,
                                 batch_size=self.batchsize,
                                 shuffle=False,
                                 num_workers=self.num_workers)
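# Hypothetical usage of the wrapper above; the enclosing class name
# (TripletLoader here) and the batch structure are assumptions, since the
# snippet only shows the __init__.
import torchvision.transforms as T

transform = T.Compose([T.Resize((224, 224)), T.ToTensor()])
loader = TripletLoader(path='data/faces', transform=transform,
                       num_triplets=10000, batchsize=32, resolution=224)
for anchor, positive, negative in loader.dataloader:
    break  # each batch is assumed to be an (anchor, positive, negative) triple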
def get_triplet_dataloader(root=None, batch_size=1, transforms=None):
    dataset = TripletDataset(root=root, transforms=transforms)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader
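# A hypothetical minimal TripletDataset illustrating the interface the loaders
# in this file assume: __getitem__ yields an (anchor, positive, negative)
# triple. The real implementations referenced here differ per repository.
import random
from torch.utils.data import Dataset

class MinimalTripletDataset(Dataset):
    def __init__(self, samples_by_class):
        # samples_by_class: dict mapping class label -> list of sample tensors
        self.samples_by_class = samples_by_class
        self.labels = list(samples_by_class.keys())
        self.length = sum(len(v) for v in samples_by_class.values())

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        anchor_label = random.choice(self.labels)
        negative_label = random.choice([l for l in self.labels if l != anchor_label])
        # anchor and positive are drawn (with replacement) from the same class
        anchor, positive = random.choices(self.samples_by_class[anchor_label], k=2)
        negative = random.choice(self.samples_by_class[negative_label])
        return anchor, positive, negative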
def main(args):
    assert args.save_interval % 10 == 0, "save_interval must be a multiple of 10"

    # prepare dirs
    os.makedirs(args.log_dir, exist_ok=True)
    os.makedirs(args.save_model, exist_ok=True)

    device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
    print("Device is", device)

    # img path loading
    with open("data/3d_data.pkl", mode='rb') as f:
        data_3d = pickle.load(f)
    train_path_list = data_3d.train_pl
    val_path_list = data_3d.val_pl

    train_dataset = TripletDataset(transform=ImageTransform(), flist=train_path_list)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    val_dataset = TripletDataset(transform=ImageTransform(), flist=val_path_list)
    val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    model = TripletNet()
    model.to(device)
    criterion = nn.MarginRankingLoss(margin=args.margin)

    # choose params to train: only layer4 and the fc head are fine-tuned
    update_params_name = []
    for name, _ in model.named_parameters():
        if 'layer4' in name:
            update_params_name.append(name)
        elif 'fc' in name:
            update_params_name.append(name)
    print("**-----** update params **-----**")
    print(update_params_name)
    print("**-----------------------------**")
    print()
    params_to_update = choose_update_params(update_params_name, model)

    # set optimizer
    optimizer = optim.SGD(params_to_update, lr=1e-4, momentum=0.9)

    # run epoch
    log_writer = SummaryWriter(log_dir=args.log_dir)
    for epoch in range(args.num_epochs):
        print("-" * 80)
        print('Epoch {}/{}'.format(epoch + 1, args.num_epochs))

        epoch_loss, epoch_acc = [], []
        for inputs, labels in tqdm(train_dataloader):
            batch_loss, batch_acc = train_one_batch(inputs, labels, model,
                                                    criterion, optimizer, device)
            epoch_loss.append(batch_loss.item())
            epoch_acc.append(batch_acc.item())
        epoch_loss = np.array(epoch_loss)
        epoch_acc = np.array(epoch_acc)
        print('[Loss: {:.4f}], [Acc: {:.4f}] \n'.format(np.mean(epoch_loss), np.mean(epoch_acc)))
        log_writer.add_scalar("train/loss", np.mean(epoch_loss), epoch + 1)
        log_writer.add_scalar("train/acc", np.mean(epoch_acc), epoch + 1)

        # validation every 10 epochs
        if (epoch + 1) % 10 == 0:
            print("Run Validation")
            epoch_loss, epoch_acc = [], []
            for inputs, labels in tqdm(val_dataloader):
                batch_loss, batch_acc = validation(inputs, labels, model, criterion, device)
                epoch_loss.append(batch_loss.item())
                epoch_acc.append(batch_acc.item())
            epoch_loss = np.array(epoch_loss)
            epoch_acc = np.array(epoch_acc)
            print('[Validation Loss: {:.4f}], [Validation Acc: {:.4f}]'.format(
                np.mean(epoch_loss), np.mean(epoch_acc)))
            log_writer.add_scalar("val/loss", np.mean(epoch_loss), epoch + 1)
            log_writer.add_scalar("val/acc", np.mean(epoch_acc), epoch + 1)

        # save model; since save_interval is a multiple of 10, epoch_loss here
        # is the validation loss from the block above
        if (args.save_interval > 0) and ((epoch + 1) % args.save_interval == 0):
            save_path = os.path.join(args.save_model,
                                     '{}_epoch_{:.1f}.pth'.format(epoch + 1, np.mean(epoch_loss)))
            torch.save(model.state_dict(), save_path)

    log_writer.close()
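# A hedged sketch of what train_one_batch above might do with
# nn.MarginRankingLoss: the network embeds each leg of the triplet, and a
# target of -1 asks that d(anchor, positive) rank below d(anchor, negative)
# by at least the margin. The signature and internals are assumptions, not
# this repo's actual train_one_batch.
import torch

def train_one_batch_sketch(inputs, model, criterion, optimizer, device):
    anchor, positive, negative = (t.to(device) for t in inputs)
    emb_a, emb_p, emb_n = model(anchor), model(positive), model(negative)
    dist_pos = torch.nn.functional.pairwise_distance(emb_a, emb_p)
    dist_neg = torch.nn.functional.pairwise_distance(emb_a, emb_n)
    # MarginRankingLoss(d_pos, d_neg, target=-1) = mean(max(0, d_pos - d_neg + margin))
    target = -torch.ones_like(dist_pos)
    loss = criterion(dist_pos, dist_neg, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # a triplet counts as correct when the positive sits closer than the negative
    acc = (dist_pos < dist_neg).float().mean()
    return loss.detach(), acc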
name = 'arcface1.pt'
load_local_model = False
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# device: cpu or cuda
os.environ['CUDA_VISIBLE_DEVICES'] = '2'  # specify which gpu you want to use
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

df_eval1 = pd.read_csv('../Data/eval_same.csv')
df_eval2 = pd.read_csv('../Data/eval_diff.csv')
df_test = pd.read_csv('../Data/test.csv')

eval_dataset1 = TripletDataset(df_eval1, mode='eval')
eval_dataset2 = TripletDataset(df_eval2, mode='eval')
test_dataset = TripletDataset(df_test, mode='test')

eval_loader1 = DataLoader(eval_dataset1, batch_size=BATCH_SIZE,
                          num_workers=NUM_WORKERS, drop_last=False)
eval_loader2 = DataLoader(eval_dataset2, batch_size=BATCH_SIZE,
                          num_workers=NUM_WORKERS, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         num_workers=NUM_WORKERS, drop_last=False)
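# A hedged sketch of how the two eval loaders might be consumed, assuming each
# batch in mode='eval' yields the two image tensors of a pair: score pairs by
# embedding distance, then compare the 'same' and 'diff' distributions to pick
# a verification threshold. The batch structure and model are assumptions.
import torch
import torch.nn.functional as F

@torch.no_grad()
def pair_distances(model, loader, device):
    model.eval()
    dists = []
    for img1, img2 in loader:
        emb1 = F.normalize(model(img1.to(device)), dim=1)
        emb2 = F.normalize(model(img2.to(device)), dim=1)
        dists.append((emb1 - emb2).norm(dim=1).cpu())
    return torch.cat(dists)

# same_dists = pair_distances(model, eval_loader1, device)  # expected small
# diff_dists = pair_distances(model, eval_loader2, device)  # expected large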
def train_triplet_module():
    """Train the ensemble of triplet modules with a triplet loss plus a
    decorrelating diff loss."""
    # Hyperparameters
    LR = 1e-5             # learning rate
    EPOCH = 15            # number of training epochs
    BATCH_SIZE = 10       # class batch size
    N_CLASS = 10          # number of classes
    num_sub_dataset = 10  # number of sub-datasets
    start_epoch = 0       # start from epoch 0 or last checkpoint epoch
    resume = False        # whether to resume from a checkpoint
    workers = 0           # number of workers for dataloader
    margin = 1e-1         # margin hyperparameter of the triplet loss
    k = 1                 # k in topk
    interval = 5          # every `interval` epochs the diff loss joins the backward pass
    balance = 4e-2        # weight of the diff loss relative to the triplet loss

    # Load the data
    train_dataset = TripletDataset(num_sub_dataset)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, num_workers=workers)
    diff_dataset = DifferenceDataset()

    # Define one model per sub-dataset
    model_set = []
    for i in range(num_sub_dataset):
        model = TripletModule().float().to(device)
        model_set.append(model)

    # Define a single optimizer over all models' parameters
    params_set = []
    for model in model_set:
        params = [{'params': model.parameters(), 'lr': LR}]
        params_set.extend(params)
    optimizer = optim.Adam(params_set, lr=LR)

    # Resume training: load model weights from the checkpoint
    if resume:
        print('==> Resuming from checkpoint..')
        assert os.path.isdir('Checkpoint'), 'Error: no Checkpoint directory found!'
        state = torch.load('Checkpoint/models/ckpt.pth')
        for i in range(num_sub_dataset):
            model = model_set[i]
            model.load_state_dict(state['net'][i])
        optimizer.load_state_dict(state['optim'])
        start_epoch = state['epoch']

    # Loss function
    cosloss = nn.CosineSimilarity(dim=1, eps=1e-6).to(device)

    # Train the models
    for epoch in range(start_epoch, EPOCH):
        print("####################################################################################")
        # Learning-rate schedule
        adjust_learning_rate(optimizer, epoch)
        print('Learning rate is {}'.format(optimizer.param_groups[0]['lr']))

        ############################
        # Training
        ############################
        # Put every sub-model into training mode
        for i in range(num_sub_dataset):
            model = model_set[i]
            model.train()

        cnt = 0                # iteration counter
        sum_triplet_loss = 0.  # accumulated triplet loss
        sum_diff_loss = 0.     # accumulated diff loss
        sum_loss = 0.          # accumulated total loss
        for data in train_loader:
            cnt += 1
            # Load a batch from the triplet dataset
            x, y = data
            batch_size = x.size(0)
            inputs = torch.cat(tuple([x[:, i] for i in range(num_sub_dataset)]), dim=0)
            labels = torch.cat(tuple([y[:, i] for i in range(num_sub_dataset)]), dim=0)
            inputs, labels = inputs.view((-1, inputs.size(-1))), labels.view(-1)
            inputs, labels = inputs.float().to(device), labels.int().to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass: each sub-model embeds its own slice of the batch
            num_subset_sample = batch_size * N_CLASS  # samples per sub-dataset in this batch
            embeddings = torch.cat(tuple([
                model_set[i](inputs[num_subset_sample * i:num_subset_sample * (i + 1)])
                for i in range(num_sub_dataset)
            ]), dim=0)
            triplet_loss = batch_hard_triplet_loss(k, num_subset_sample, labels,
                                                   embeddings, margin=margin,
                                                   device=device)
            sum_triplet_loss += triplet_loss.item()

            # Load a batch from the difference dataset
            x, y = diff_dataset.getsamples(batch_size)
            inputs, labels = torch.from_numpy(x), torch.from_numpy(y)
            inputs, labels = inputs.float().to(device), labels.int().to(device)
            outputs = []
            outputs_sum = None
            for model in model_set:
                output = model(inputs)
                outputs.append(output)
                if outputs_sum is None:
                    outputs_sum = output
                else:
                    outputs_sum += output

            # Diff loss: cosine similarity between each sub-model's embedding and
            # the mean embedding of the remaining sub-models
            diff_loss = 0.
            for output in outputs:
                # diff_loss += torch.sum(torch.abs(cosloss(output, (outputs_sum - output) / (num_sub_dataset - 1)))) / inputs.size(0)
                diff_loss += torch.sum(
                    cosloss(output, (outputs_sum - output) / (num_sub_dataset - 1))
                ) / inputs.size(0)
            diff_loss /= num_sub_dataset
            sum_diff_loss += diff_loss.item()

            loss = triplet_loss + balance * diff_loss
            sum_loss += loss.item()
            # Every `interval` epochs backpropagate the combined loss,
            # otherwise only the triplet loss
            if (epoch + 1) % interval == 0:
                loss.backward()
            else:
                triplet_loss.backward()
            # Update the parameters
            optimizer.step()

            # Logging
            if cnt % 5 == 0 or cnt == len(train_loader):
                print('[%d/%d]--[%d/%d]\tTriplet Loss: %.6f\tDiff Loss: %.6f\tLoss: %.6f'
                      % (epoch + 1, EPOCH, cnt, len(train_loader),
                         sum_triplet_loss / cnt, sum_diff_loss / cnt, sum_loss / cnt))

        # Collect the model states
        net_state_set = [model.state_dict() for model in model_set]
        # Save the checkpoint
        state = {
            'net': net_state_set,
            'optim': optimizer.state_dict(),
            'epoch': epoch
        }
        torch.save(state, './Checkpoint/models/ckpt.pth')
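# batch_hard_triplet_loss above is repo-specific (it takes a top-k parameter
# and per-subset sample counts). For reference, a minimal sketch of standard
# batch-hard triplet mining (Hermans et al., "In Defense of the Triplet
# Loss"), which those variants extend, over one batch of embeddings:
import torch

def batch_hard_triplet_loss_sketch(labels, embeddings, margin):
    dist = torch.cdist(embeddings, embeddings)          # (B, B) pairwise distances
    same = labels.unsqueeze(0) == labels.unsqueeze(1)   # (B, B) same-label mask
    eye = torch.eye(len(labels), dtype=torch.bool, device=dist.device)
    # hardest positive: the farthest same-label sample (excluding self)
    hardest_pos = dist.masked_fill(~same | eye, float('-inf')).max(dim=1).values
    # hardest negative: the closest different-label sample
    hardest_neg = dist.masked_fill(same, float('inf')).min(dim=1).values
    return torch.relu(hardest_pos - hardest_neg + margin).mean()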
def train_worker(dataset, device, rank=0, world_size=None):
    torch.cuda.set_device(device)
    criterion = TripletMarginRankingLoss(args.loss_margin)
    model = TransformerPool(args.vocab_size, args.embedding_dim,
                            args.hidden_dim, pre_trained=GLOVE)
    if args.re_train:
        model.load_state_dict(torch.load(args.train_model,
                                         map_location='cuda:{}'.format(device)))
    else:
        model.apply(init_weights)
    model, criterion = model.to(device), criterion.to(device)

    triplet_dataset = TripletDataset(dataset)
    in_distributed_mode = True if world_size else False
    if in_distributed_mode:
        rank, device = torch.distributed.get_rank(), torch.cuda.current_device()
        print("rank:{}, device:{}".format(rank, device))
        model = DistributedDataParallel(model, device_ids=[device])
        datasampler = DistributedSampler(triplet_dataset)
        dataloader = DataLoader(triplet_dataset, shuffle=False, pin_memory=True,
                                num_workers=0, batch_size=args.batch_size,
                                sampler=datasampler)
    else:
        dataloader = DataLoader(triplet_dataset, shuffle=True, pin_memory=True,
                                num_workers=4, batch_size=args.batch_size)

    optimizer = RAdam(model.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.t_max,
                                                     eta_min=args.eta_min)
    model.train()
    best_avg_loss = None
    t1 = time.time()
    for epoch in range(args.epoch):
        if in_distributed_mode:
            datasampler.set_epoch(epoch)  # reshuffle shards across ranks
        total_loss = []
        bar = tqdm(desc='EPOCH {:02d}'.format(epoch), total=len(dataloader),
                   leave=False) if rank == 0 else None
        for triplet in dataloader:
            optimizer.zero_grad()
            anchor, positive, negative = model(triplet)
            loss = criterion(anchor, positive, negative)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            total_loss.append(loss.item())
            if rank == 0:
                bar.update()
        if rank == 0:
            bar.close()
            epoch_avg_loss = np.mean(total_loss)
            print("Epoch {:02d}, Time {:.02f}s, AvgLoss {:.08f}, lr {:.8f}".format(
                epoch, time.time() - t1, epoch_avg_loss,
                optimizer.param_groups[0]['lr']))
            # keep the checkpoint with the lowest average loss seen so far
            if best_avg_loss is None or epoch_avg_loss < best_avg_loss:
                best_avg_loss = epoch_avg_loss
                state_dict = (model.module.state_dict() if in_distributed_mode
                              else model.state_dict())
                torch.save(state_dict, args.model_path)
            t1 = time.time()
        # step the cosine schedule once per epoch on every rank so the
        # learning rate stays in sync in distributed mode
        scheduler.step()
    torch.cuda.empty_cache()
    return
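# A sketch of how train_worker might be launched for multi-GPU training; the
# process-group setup below is generic PyTorch DDP boilerplate and an
# assumption, since the actual launcher is not part of this snippet.
import os
import torch
import torch.multiprocessing as mp

def launch_distributed(dataset, world_size):
    # spawn one process per GPU; each receives its rank as the first argument
    mp.spawn(_distributed_worker, args=(dataset, world_size), nprocs=world_size)

def _distributed_worker(rank, dataset, world_size):
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    torch.distributed.init_process_group('nccl', rank=rank, world_size=world_size)
    train_worker(dataset, device=rank, rank=rank, world_size=world_size)
    torch.distributed.destroy_process_group()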