def train(epochs, n_train, k_train, q_train, n_eval=1, k_eval=3, q_eval=5,
          episodes_per_epoch=100, num_tasks=1, lr=1e-3, lr_step_size=20,
          lr_gamma=0.5):
    # print parameters
    print('================ parameters ================')
    print('epochs', epochs)
    print('train (n, k, q)', n_train, k_train, q_train)
    print('eval (n, k, q)', n_eval, k_eval, q_eval)
    print('episodes per epoch', episodes_per_epoch)
    print('num_tasks', num_tasks)
    print('learning rate', lr)
    print('learning rate step size', lr_step_size)
    print('learning rate step rate', lr_gamma)
    print('============================================')

    # dataloaders for train and eval
    train_set = OmniglotDataset(subset='background')
    train_loader = DataLoader(train_set,
                              num_workers=0,
                              batch_sampler=FewShotBatchSampler(
                                  train_set,
                                  episodes_per_epoch=episodes_per_epoch,
                                  n=n_train, k=k_train, q=q_train,
                                  num_tasks=num_tasks))
    eval_set = OmniglotDataset(subset='evaluation')
    eval_loader = DataLoader(eval_set,
                             num_workers=0,
                             batch_sampler=FewShotBatchSampler(
                                 eval_set,
                                 episodes_per_epoch=episodes_per_epoch,
                                 n=n_eval, k=k_eval, q=q_eval,
                                 num_tasks=num_tasks))

    # train settings
    model = protonet_embedding_model().to(config.DEVICE)
    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=lr_step_size, gamma=lr_gamma)
    loss_fn = torch.nn.NLLLoss().to(config.DEVICE)
    summary(model, (1, 105, 105))

    # train
    history = {'loss': list(), 'accuracy': list()}
    for epoch in range(1, epochs + 1):
        train_epoch(model, optimizer, scheduler, loss_fn, train_loader,
                    n_train, k_train, q_train, epoch)
        evaluate(model, history, loss_fn, eval_loader, n_eval, k_eval, q_eval,
                 epoch)
        # save model and history
        if epoch == 1 or history['accuracy'][-1] > max(history['accuracy'][:-1]):
            torch.save(model.state_dict(),
                       f'{config.MODEL_PATH}/protonets.ckpt')
            save_history(history)
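# A hedged, self-contained sketch (not from the original) of the prototypical-
# network episode math that the train_epoch/evaluate helpers above are assumed
# to implement: each class prototype is the mean of its support embeddings, and
# queries are scored by negative squared Euclidean distance to each prototype,
# which produces the log-probabilities the NLLLoss in train() consumes.
import torch
import torch.nn.functional as F

def episode_log_probs(support, query, k):
    """support: (k*n, d) embeddings grouped by class; query: (k*q, d)."""
    prototypes = support.reshape(k, -1, support.size(-1)).mean(dim=1)  # (k, d)
    distances = torch.cdist(query, prototypes) ** 2                    # (k*q, k)
    return F.log_softmax(-distances, dim=1)  # log-probabilities for NLLLoss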
train_loader = DataLoader(datasets.CIFAR10('../data', train=True,
                                           transform=transform),
                          batch_size=batch_size, shuffle=True)
test_loader = DataLoader(datasets.CIFAR10('../data', train=False,
                                          transform=transforms.Compose([ToTensor()])),
                         batch_size=batch_size, shuffle=True)

model = BasicCNN()
model.cuda()

optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                      weight_decay=weight_decay)
# optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss().cuda()
scheduler = StepLR(optimizer, step_size=10, gamma=0.3)

train_loss = np.zeros((epochs, 1), dtype=np.float32)
train_acc = np.zeros((epochs, 1), dtype=np.float32)
val_acc = np.zeros(shape=(epochs, 1), dtype=np.float32)
val_loss = np.zeros(shape=(epochs, 1), dtype=np.float32)

def save_checkpoint(state, filename='saved/cifar10_checkpoint_%s.pth.tar' % (numb)):
    torch.save(state, filename)
    if state['is_best']:
        shutil.copyfile(filename, 'saved/cifar10_model_best_%s.pth.tar' % (numb))
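# Hedged usage sketch (not part of the original) showing how save_checkpoint
# above might be called once per epoch; the loop body and the assumption that
# val_acc[epoch] has already been filled in are both hypothetical.
best_acc = 0.0
for epoch in range(epochs):
    # ... train and validate for this epoch, filling val_acc[epoch] ...
    is_best = float(val_acc[epoch]) > best_acc
    best_acc = max(float(val_acc[epoch]), best_acc)
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'is_best': is_best,
    })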
    topk = 20
    config = get_config(model_name, dataset_name)
    model = SASRec(config).to(device)
elif model_name == 'stamp':
    batch_size = 512
    epoch_number = 30
    lr = 0.001
    lr_dc = 0.1
    lr_dc_step = 80
    topk = 20
    config = get_config(model_name, dataset_name)
    model = STAMP(config).to(device)

# ---------------------------------------- init model ----------------------------------------
optimizer = optim.Adam(model.parameters(), lr)
scheduler = StepLR(optimizer, step_size=lr_dc_step, gamma=lr_dc)

# ---------------------------------------- load data ----------------------------------------
if dataset_name == 'yoochoose1_64':
    data_root = './data/yoochoose1_64/'
elif dataset_name == 'diginetica':
    data_root = './data/diginetica/'
train, test = Preprocess(data_root)
train_dataset = RecSysDataset(train)
test_dataset = RecSysDataset(test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=rec15_collate_fn)
test_loader = DataLoader(test_dataset,
def train(args):
    start_epoch = 0
    data_loader = DataLoader(dataset=HellenDataset(True, 224),
                             batch_size=args.batch, shuffle=True,
                             num_workers=16)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    model = CnnAlign()
    print("add graph")
    writer.add_graph(model, torch.zeros((1, 3, 224, 224)))
    print("add graph over")
    if args.pretrained and os.path.exists(MODEL_SAVE_PATH):
        print("loading ...")
        state = torch.load(MODEL_SAVE_PATH)
        model.load_state_dict(state['net'])
        start_epoch = state['epoch']
        print("loading over")
    model = torch.nn.DataParallel(model, device_ids=[0, 1])  # multi-GPU
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)
    scheduler = StepLR(optimizer, step_size=args.step, gamma=args.gama)
    train_loss = 0
    to_pil_img = tfs.ToPILImage()
    to_tensor = tfs.ToTensor()
    for epoch in range(start_epoch, start_epoch + args.epoes):
        model.train()
        prefetcher = DataPrefetcher(data_loader)
        img_tensor, label_tensor = prefetcher.next()
        last_img_tensor = img_tensor
        last_label_tensor = label_tensor
        optimizer.zero_grad()
        i_batch = 0
        while img_tensor is not None:
            last_img_tensor = img_tensor
            last_label_tensor = label_tensor
            output = model(img_tensor)
            loss = torch.nn.functional.smooth_l1_loss(
                output, label_tensor.view(-1, output.size(1)))
            if loss is None:
                img_tensor, label_tensor = prefetcher.next()
                continue
            loss.backward()
            # accumulate gradients and step once every mini_batch iterations
            if i_batch % args.mini_batch == 0:
                optimizer.step()
                optimizer.zero_grad()
            train_loss = loss.item()
            global_step = epoch * len(data_loader) + i_batch
            progress_bar(i_batch, len(data_loader),
                         'loss: %f, epoch: %d' % (train_loss, epoch))
            writer.add_scalar("loss", train_loss, global_step=global_step)
            img_tensor, label_tensor = prefetcher.next()
            i_batch += 1
        # save one pic and its output
        pil_img = to_pil_img(last_img_tensor[0].cpu())
        ann = output[0].cpu().detach().numpy()
        ann = np.resize(ann, (194, 2))
        draw_ann(pil_img, ann.tolist(), font1, font_size)
        writer.add_image("img: " + str(epoch), to_tensor(pil_img))
        scheduler.step()
        if epoch % 10 == 0:
            print('Saving..')
            state = {
                'net': model.module.state_dict(),
                'epoch': epoch,
            }
            torch.save(state, "./output/face_align" + str(epoch) + ".pt")
    if not os.path.isdir('data'):
        os.mkdir('data')
    print('Saving..')
    state = {
        'net': model.module.state_dict(),
        'epoch': epoch,
    }
    torch.save(state, MODEL_SAVE_PATH)
    writer.close()
def main():
    # Training settings
    batch_size = 8
    learning_rate = 0.0001
    gamma = 0.5
    epochs = 50
    lr_scheduler_step_size = 12
    adam_betas = (0.9, 0.999)
    pathToModel = os.path.join(BASEDIR, 'weights.pt')
    restart = True

    # attempt to use GPU if available
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    train_folder = os.path.join(DATA, 'train')
    path_train_csv = os.path.join(DATA, 'labels', 'Train_labels.csv')
    print('Loading training data...')
    trainX, trainY = load_data(train_folder, path_train_csv)
    print('x train shape:', trainX.shape)

    print('Split the train/val data sets 80/20')
    num = int(trainX.shape[0] * 0.2)
    np.random.seed(1234567)
    idxs = np.random.choice(np.arange(trainX.shape[0]), num, replace=False)
    x_val_raw = trainX[idxs]
    y_val = trainY[idxs]
    x_train_raw = np.delete(trainX, idxs, axis=0)
    y_train = np.delete(trainY, idxs, axis=0)
    y_train = np.argmax(y_train, axis=1)
    y_val = np.argmax(y_val, axis=1)
    print('Number of training data:', x_train_raw.shape[0])
    print('Number of validation data:', x_val_raw.shape[0])

    # preprocess training and validation
    print('Preprocessing...')
    x_train = preprocess(np.copy(x_train_raw))
    x_val = preprocess(np.copy(x_val_raw))
    print('Reshaping to have channels first')
    x_train = reshapeInput(x_train)
    x_val = reshapeInput(x_val)

    # load the model
    model = model_generator()
    # model = PhoneLocator().to(device)
    if use_cuda:
        model.cuda()

    # load the optimizer and set up a schedule to reduce the learning rate
    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           betas=adam_betas)
    scheduler = StepLR(optimizer, step_size=lr_scheduler_step_size,
                       gamma=gamma)

    train_dataset = (torch.FloatTensor(x_train), torch.FloatTensor(y_train))
    validation_dataset = (torch.FloatTensor(x_val), torch.FloatTensor(y_val))

    # create train and validation data loaders
    data_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomGrayscale(p=0.05),
        transforms.ToTensor()
    ])
    train_dataset = CustomTensorDataset(tensors=train_dataset,
                                        transform=data_transform)

    # weight samples inversely to their class frequency
    histcount = np.histogram(y_train, bins=7)[0]
    classWeight = 1.0 - histcount / histcount.sum()
    classWeight_tensor = torch.FloatTensor(classWeight).to(device)
    samples_weights = classWeight_tensor[y_train]
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=samples_weights,
        num_samples=len(samples_weights),
        replacement=True)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=sampler,
                                               **kwargs)
    validation_loader = torch.utils.data.DataLoader(
        TensorDataset(*validation_dataset), shuffle=True, **kwargs)

    # load model if path exists
    if os.path.isfile(pathToModel) and not restart:
        print('loading existing model..')
        model.load_state_dict(torch.load(pathToModel))

    # each iteration gathers n=test_batch_size samples and their labels [0,9]
    best_loss = math.inf
    train_loss_save = np.zeros(epochs)
    val_loss_save = np.zeros(epochs)

    print('Beginning to train')
    for epoch in range(1, epochs + 1):
        train_loss = train(model, device, train_loader, optimizer, epoch)
        val_loss = validate(model, device, validation_loader)
        if use_cuda:
            train_loss_save[epoch - 1] = train_loss.cpu().data.numpy()
            val_loss_save[epoch - 1] = val_loss.cpu().data.numpy()
        else:
            train_loss_save[epoch - 1] = train_loss.data.numpy()
            val_loss_save[epoch - 1] = val_loss.data.numpy()
        if val_loss < best_loss:
            print('Loss improved from', best_loss, 'to', val_loss,
                  ': Saving new model to', pathToModel)
            best_loss = val_loss
            torch.save(model.state_dict(), pathToModel)
        scheduler.step()
        np.save('./val_loss.npy', val_loss_save)
        np.save('./train_loss.npy', train_loss_save)
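# A hedged, self-contained sketch (not part of the original script) for
# inspecting the curves written above with np.save; the output image path is
# an assumption.
import numpy as np
import matplotlib.pyplot as plt

train_curve = np.load('./train_loss.npy')
val_curve = np.load('./val_loss.npy')
plt.plot(train_curve, label='train')
plt.plot(val_curve, label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.savefig('loss_curves.png')  # hypothetical output path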
    optimizer = torch.optim.SGD([{'params': model.parameters()},
                                 {'params': metric_fc.parameters()}],
                                lr=opt.lr, weight_decay=opt.weight_decay)
else:
    optimizer = torch.optim.Adam([{'params': model.parameters()},
                                  {'params': metric_fc.parameters()}],
                                 lr=opt.lr, weight_decay=opt.weight_decay)
scheduler = StepLR(optimizer, step_size=opt.lr_step, gamma=0.1)

start = time.time()
for i in range(opt.max_epoch):
    # note: stepping the scheduler at the start of each epoch follows the
    # pre-1.1 PyTorch convention
    scheduler.step()

    model.train()
    for ii, data in enumerate(trainloader):
        data_input, label = data
        data_input = data_input.to(device)
        # print("data_input.shape", data_input.shape)
        label = label.to(device).long()
        feature = model(data_input)
        # print("feature.shape", feature.shape)
        output = metric_fc(feature, label)
        # print("output.shape", output.shape)
    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    test_acc.append(100. * correct / len(test_loader.dataset))

"""# Let's train and test our model"""

from torch.optim.lr_scheduler import StepLR

model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = StepLR(optimizer, step_size=6, gamma=0.1)
EPOCHS = 15
for epoch in range(EPOCHS):
    print("EPOCH:", epoch)
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)
    scheduler.step()  # step the StepLR once per epoch so it actually decays

fig, axs = plt.subplots(2, 2, figsize=(15, 10))
axs[0, 0].plot(train_losses)
axs[0, 0].set_title("Training Loss")
axs[1, 0].plot(train_acc[4000:])
axs[1, 0].set_title("Training Accuracy")
axs[0, 1].plot(test_losses)
axs[0, 1].set_title("Test Loss")
axs[1, 1].plot(test_acc)
axs[1, 1].set_title("Test Accuracy")
def main():
    # ----------------------------------- Model Build -------------------------
    model = UnwarpNet_cmap(combine_num=1)
    args = train_configs.args
    isTrain = True
    model = torch.nn.DataParallel(model.cuda())
    start_epoch = 1

    # Load Parameters
    # if args.pretrained:
    if True:
        print("Loading Pretrained model~")
        # "/home1/quanquan/code/film_code/output/train/aug20201129-210822-VktsHX/cmap_aug_19.pkl"
        # "/home1/quanquan/code/Film-Recovery/cmap_only_45.pkl"
        # "/home1/quanquan/code/Film-Recovery/output/train/new_data20201214-090229-F3z21O/cmap_aug_500.pkl"
        pretrained_dict = torch.load(
            "/home1/quanquan/code/Film-Recovery/cmap_only_45.pkl",
            map_location=None)
        start_lr = pretrained_dict['lr']
        start_epoch = pretrained_dict['epoch'] if pretrained_dict['epoch'] < 100 else 100
        # ----------------------- Load partial model ---------------------
        model_dict = model.state_dict()
        # 1. filter out unnecessary keys
        pretrained_dict = {k: v for k, v in pretrained_dict.items()
                           if k in model_dict}
        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)
        # -----------------------------------------------------------------
        # model.load_state_dict(pretrained_dict['model_state'])
        model.load_state_dict(model_dict)

    # ------------------------------------ Load Dataset -------------------------
    kwargs = {'num_workers': 8, 'pin_memory': True}
    # dataset_test = filmDataset_3(npy_dir="/home1/quanquan/datasets/generate/mesh_film_small/")
    # dataset_test_loader = DataLoader(dataset_test, batch_size=args.test_batch_size, shuffle=False, **kwargs)
    dataset_train = filmDataset_3(
        "/home1/quanquan/datasets/generate/mesh_film_hypo_alpha2/",
        load_mod="new_ab")
    dataset_train_loader = DataLoader(dataset_train,
                                      batch_size=args.batch_size,
                                      shuffle=True, **kwargs)

    # ------------------------------------ Optimizer -------------------------
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    # model, optimizer = amp.initialize(model, optimizer, opt_level='O1', loss_scale="dynamic", verbosity=0)
    # criterion = torch.nn.MSELoss()
    criterion = torch.nn.L1Loss()
    bc_critic = nn.BCELoss()
    if args.visualize_para:
        for name, parameters in model.named_parameters():
            print(name, ':', parameters.size())
    start_lr = args.lr

    # ----------------------------------- Training ---------------------------
    for epoch in range(start_epoch, max_epoch + 1):
        loss_value, loss_cmap_value, loss_ab_value, loss_uv_value, loss_bg_value = 0, 0, 0, 0, 0
        model.train()
        datalen = len(dataset_train)
        print("Output dir:", output_dir)
        for batch_idx, data in enumerate(dataset_train_loader):
            ori_gt = data[0].cuda()
            ab_gt = data[1].cuda()
            dep_gt = data[2].cuda()
            nor_gt = data[3].cuda()
            cmap_gt = data[4].cuda()
            uv_gt = data[5].cuda()
            bg_gt = data[6].cuda()
            optimizer.zero_grad()
            uv, cmap, ab, bg = model(ori_gt)
            # print("ab shapes: ", ab.shape, ab_gt.shape)
            loss_cmap = criterion(cmap, cmap_gt).float()
            loss_ab = criterion(ab, ab_gt).float()
            loss_uv = criterion(uv, uv_gt).float()
            loss_bg = criterion(bg, bg_gt).float()
            loss = loss_cmap + loss_bg  # + loss_ab + loss_uv
            loss.backward()
            optimizer.step()
            loss_value += loss.item()
            loss_cmap_value += loss_cmap.item()
            loss_ab_value += loss_ab.item()
            loss_uv_value += loss_uv.item()
            loss_bg_value += loss_bg.item()
            print("\r Epoch[{}/{}] \t batch:{}/{} \t \t loss: {}".format(
                epoch, max_epoch, batch_idx, datalen,
                loss_value / (batch_idx + 1)), end=" ")
        lr = get_lr(optimizer)
        # w("check code")
        # break
        # scheduler.step()
        writer_tb((loss_value / (batch_idx + 1),
                   loss_ab_value / (batch_idx + 1),
                   loss_uv_value / (batch_idx + 1),
                   loss_cmap_value / (batch_idx + 1),
                   loss_bg_value / (batch_idx + 1),
                   lr), epoch=epoch)
        write_imgs_2((cmap[0, :, :, :], uv[0, :, :, :], ab[0, :, :, :],
                      bg[0, :, :, :], ori_gt[0, :, :, :], cmap_gt[0, :, :, :],
                      uv_gt[0, :, :, :], ab_gt[0, :, :, :], bg_gt[0, :, :, :]),
                     epoch)
        if isTrain and args.save_model and epoch % 10 == 0:
            state = {'epoch': epoch + 1,
                     'lr': lr,
                     'model_state': model.state_dict(),
                     'optimizer_state': optimizer.state_dict()}
            torch.save(state, tfilename(output_dir,
                                        "{}_{}.pkl".format("cmap_aug", epoch)))
def train(data_dir, train_imdb, val_imdb, model_save_path="./model/",
          use_gpu=True):
    # initialize training configuration
    config = Config()
    config.pos_pair_range = 180

    # do data augmentation in PyTorch;
    # you can also do complex data augmentation as in the original paper
    center_crop_size = config.instance_size - config.stride
    random_crop_size = config.instance_size - 2 * config.stride
    train_z_transforms = transforms.Compose([
        RandomStretch(),
        CenterCrop((config.examplar_size, config.examplar_size)),
        ToTensor()
    ])
    train_x_transforms = transforms.Compose([
        RandomStretch(),
        CenterCrop((center_crop_size, center_crop_size)),
        RandomCrop((random_crop_size, random_crop_size)),
        ToTensor()
    ])
    valid_z_transforms = transforms.Compose([
        CenterCrop((config.examplar_size, config.examplar_size)),
        ToTensor(),
    ])
    valid_x_transforms = transforms.Compose([ToTensor()])

    # load data (see details in VIDDataset.py)
    train_dataset = VIDDataset(train_imdb, data_dir, config,
                               train_z_transforms, train_x_transforms,
                               curriculum=True)
    val_dataset = VIDDataset(val_imdb, data_dir, Config(), valid_z_transforms,
                             valid_x_transforms, mode="Validation")

    # create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                              shuffle=True,
                              num_workers=config.train_num_workers,
                              drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size,
                            shuffle=True, num_workers=config.val_num_workers,
                            drop_last=True)

    # create SiamFC network architecture (see details in SiamNet.py)
    net = SiamNet()
    # move network to GPU if using GPU
    if use_gpu:
        net.cuda()

    # define training strategy;
    # the learning rate of the adjust layer (i.e., a conv layer)
    # is set to 0 as in the original paper
    optimizer = torch.optim.SGD([
        {'params': net.feat_extraction.parameters()},
        {'params': net.adjust.bias},
        {'params': net.adjust.weight, 'lr': 0},
    ], config.lr, config.momentum, config.weight_decay)

    # adjust the learning rate in each epoch
    scheduler = StepLR(optimizer, config.step_size, config.gamma)

    # used to control generating labels for training;
    # once generated, they are fixed since the labels for each
    # pair of images (examplar z and search region x) are the same
    train_response_flag = False
    valid_response_flag = False

    f = open('./model/modified_loss/loss_data.txt', 'a')

    # ------------------------ training & validation process ------------------------
    for i in range(config.num_epoch):
        # adjust learning rate
        scheduler.step()

        # ------------------------------ training ------------------------------
        # indicate training (very important for batch normalization)
        net.train()

        # used to collect training loss
        train_loss = []
        train_dataset.set_epoch(i + 1)
        for j, data in enumerate(tqdm(train_loader)):
            # fetch data, i.e., B x C x W x H (batch_size x channel x width x height)
            exemplar_imgs, instance_imgs = data

            # forward pass
            if use_gpu:
                exemplar_imgs = exemplar_imgs.cuda()
                instance_imgs = instance_imgs.cuda()
            output = net.forward(Variable(exemplar_imgs),
                                 Variable(instance_imgs))

            # create label for training (only done once)
            if not train_response_flag:
                # change control flag
                train_response_flag = True
                # get shape of output (i.e., response map)
                response_size = output.shape[2:4]
                # generate label and weight
                train_eltwise_label, train_instance_weight = create_label(
                    response_size, config, use_gpu)

            # clear the gradient
            optimizer.zero_grad()

            # loss
            loss = net.weight_loss(output, train_eltwise_label,
                                   train_instance_weight)

            # backward
            loss.backward()

            # update parameters
            optimizer.step()

            # collect training loss
            train_loss.append(loss.data.item())

        # ------------------------------ saving model ------------------------------
        if not os.path.exists(model_save_path):
            os.makedirs(model_save_path)
        torch.save(net, model_save_path + "SiamFC_" + str(i + 1) + "_model.pth")

        # ------------------------------ validation ------------------------------
        # indicate validation
        net.eval()

        # used to collect validation loss
        val_loss = []
        val_dataset.set_epoch(i + 1)
        for j, data in enumerate(tqdm(val_loader)):
            exemplar_imgs, instance_imgs = data

            # forward pass
            if use_gpu:
                exemplar_imgs = exemplar_imgs.cuda()
                instance_imgs = instance_imgs.cuda()
            output = net.forward(Variable(exemplar_imgs),
                                 Variable(instance_imgs))

            # create label for validation (only done once)
            if not valid_response_flag:
                valid_response_flag = True
                response_size = output.shape[2:4]
                valid_eltwise_label, valid_instance_weight = create_label(
                    response_size, config, use_gpu)

            # loss
            loss = net.weight_loss(output, valid_eltwise_label,
                                   valid_instance_weight)

            # collect validation loss
            val_loss.append(loss.data.item())

        train_loss = np.array(train_loss)
        val_loss = np.array(val_loss)
        f.write('{}, {}\n'.format(np.mean(train_loss), np.mean(val_loss)))
        print("Epoch %d training loss: %f, validation loss: %f" %
              (i + 1, np.mean(train_loss), np.mean(val_loss)))

    f.close()
DATASET_PATH = os.path.join('/tmp/pycharm_project562/16_tcls_movie')
criterion_type = {
    'regression': nn.MSELoss(),
    'classification': nn.CrossEntropyLoss(),
    'bilstmwithattn': nn.CrossEntropyLoss(),
    'cnntext': nn.CrossEntropyLoss(),
    'ImgText2Vec': nn.CrossEntropyLoss()
}
criterion = criterion_type[config.model]
reg_criterion = nn.MSELoss().cuda()
optimizer = optim.Adam(model.parameters(), weight_decay=config.l2,
                       lr=config.learning_rate)
scheduler = StepLR(optimizer, step_size=3, gamma=config.lr_decay)

# DONOTCHANGE: They are reserved for nsml
if config.pause and config.nsml_use:
    nsml.paused(scope=locals())

# Used in training mode (the default)
if config.mode == 'train':
    print('train data loading...')
    # Load the data (note: the class ratios in the dataset are highly imbalanced).
    train_loader = DataLoader(
        dataset=dataset,
        batch_size=config.batch,
        shuffle=True,
def run_main():
    # Check if cuda is available
    use_cuda = torch.cuda.is_available()

    # Set proper device based on cuda availability
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Torch device selected: ", device)

    # Initialize the model and send to device
    model = Net().to(device)

    # Initialize the criterion for loss computation
    criterion = nn.CrossEntropyLoss(reduction='mean')

    # Initialize optimizer type
    if config.optimizer_type == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
        print("Use optimizer type: {}, LR: {}".format(config.optimizer_type,
                                                      config.learning_rate))
    elif config.optimizer_type == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=config.learning_rate,
                              weight_decay=config.weight_decay)
        print("Use optimizer type: {}, LR: {}".format(config.optimizer_type,
                                                      config.learning_rate))
    else:
        print("Select optimizer type from {SGD | Adam}")
        exit(0)

    # Create transformations to apply to each data sample
    # Can specify variations such as image flip, color flip, random crop, ...
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    # Load datasets for training and testing
    # Inbuilt datasets available in torchvision (check documentation online)
    dataset1 = datasets.MNIST('./data/', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('./data/', train=False, transform=transform)
    train_loader = DataLoader(dataset1, batch_size=config.batch_size,
                              shuffle=True, num_workers=4)
    test_loader = DataLoader(dataset2, batch_size=config.batch_size,
                             shuffle=False, num_workers=4)

    # Optionally, use a scheduler to change the learning rate at certain intervals
    # Used for step LR change, cyclic LR change or manual LR change after some epochs
    scheduler = StepLR(optimizer, step_size=config.step_size, gamma=0.1)

    # Init variable to store the best accuracy, used for saving the best model
    best_accuracy = 0.0

    # Create summary writer object in specified folder.
    # Use the same head folder and different sub-folders to easily compare runs
    # E.g. SummaryWriter("my_logs/run1_Adam"), SummaryWriter("my_logs/run2_SGD")
    # This allows tensorboard to easily compare between run1 and run2
    writer = SummaryWriter("my_logs/run1_Adam", comment="Test_01_LR_1e-3")

    # Run training for n_epochs specified in config
    for epoch in range(1, config.n_epochs + 1):
        train_loss, train_accuracy = train(model, device, train_loader,
                                           optimizer, criterion, epoch,
                                           log_interval=50)
        test_loss, test_accuracy = test(model, device, test_loader)
        scheduler.step()

        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/test', test_loss, epoch)
        writer.add_scalar('Accuracy/train', train_accuracy, epoch)
        writer.add_scalar('Accuracy/test', test_accuracy, epoch)
        writer.add_scalar('LR', optimizer.param_groups[0]['lr'], epoch)

        if test_accuracy > best_accuracy and config.save:
            best_accuracy = test_accuracy
            save_file_path = os.path.join(
                config.save_dir,
                'model_{}_{:2.2f}.pth'.format(epoch, best_accuracy))
            states = {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_accuracy': best_accuracy
            }
            try:
                os.mkdir(config.save_dir)
            except FileExistsError:
                pass
            torch.save(states, save_file_path)
            print('Model saved ', str(save_file_path))
            # Alternatively, save the entire model, but that takes more space
            # torch.save(model, save_file_path)

        # if epoch % 5 == 0:
        #     break

    # Flush all logs to the writer and close
    writer.flush()
    writer.close()
    print("Training finished")
def main():
    epoches = 4
    gamma = 0.7
    log_interval = 10
    torch.manual_seed(1)
    save_model = True

    # RNN settings
    RNN = True
    N_STEPS = 28
    N_INPUTS = 28
    N_NEURONS = 150
    N_OUTPUTS = 10

    # Check whether you can use Cuda
    use_cuda = torch.cuda.is_available()
    # Use Cuda if you can
    device = torch.device("cuda" if use_cuda else "cpu")

    # ###################### Torchvision ###########################
    # Use the predefined data loader, pre-process with transforms.Compose,
    # and divide into batches.
    # num_workers uses subprocesses to asynchronously load data, and pinned
    # RAM (pin_memory) speeds up RAM-to-GPU transfers.
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # normalise the input images
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))])
    train_dataset = datasets.MNIST('PATH_TO_STORE_TRAINSET', download=True,
                                   train=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64,
                                               shuffle=True, **kwargs)

    # normalise the test images (train=False selects the test split)
    test_dataset = datasets.MNIST('PATH_TO_STORE_TRAINSET', download=True,
                                  train=False, transform=transform)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1000,
                                              shuffle=True, **kwargs)

    # get some random training images
    dataiter = iter(train_loader)
    images, labels = next(dataiter)
    # img = torchvision.utils.make_grid(images)
    # imsave(img)

    # ##################### Build your network and run ############################
    if RNN:
        model = ImageRNN(64, N_STEPS, N_INPUTS, N_NEURONS, N_OUTPUTS,
                         device).to(device)
    else:
        model = ConvNet()

    if RNN:
        optimizer = optim.Adadelta(model.parameters(), lr=0.01)
    else:
        optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

    for epoch in range(1, epoches + 1):
        if RNN:
            train_rnn(log_interval, model, device, train_loader, optimizer,
                      epoch)
        else:
            train_cnn(log_interval, model, device, train_loader, optimizer,
                      epoch)
        test(model, device, test_loader)
        scheduler.step()

    if save_model:
        torch.save(model.state_dict(), "./results/mnist_cnn.pt")
def train_model(
        batch_size: int = 64,
        test_batch_size: int = 1000,
        epochs: int = 14,
        lr: float = 1.0,
        gamma: float = 0.7,
        no_cuda: bool = False,
        dry_run: bool = False,
        seed: int = 1,
        log_interval: int = 10,
        save_model: bool = False,
        checkpoint_period: int = 5,  # Period between checkpoints in minutes
        checkpoint_input: str = '',
        checkpoint_output: str = ''):
    use_cuda = not no_cuda and torch.cuda.is_available()
    torch.manual_seed(seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    args = argparse.Namespace()
    args.log_interval = log_interval
    args.batch_size = batch_size
    args.dry_run = dry_run

    kwargs = {'batch_size': batch_size}
    if use_cuda:
        kwargs.update({'num_workers': 1, 'pin_memory': True, 'shuffle': True})

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('../data', train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

    last_checkpoint_time = time.time()
    epoch_start = 1
    if checkpoint_input:
        print(f"Attempt loading checkpoint from {checkpoint_input}")
        try:
            checkpoint = torch.load(checkpoint_input)
        except Exception:
            print("Skipping broken checkpoint")
        else:
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            epoch_start = checkpoint['epoch'] + 1  # Start from the next epoch
            print(f"Resuming from checkpoint with epoch: {epoch_start}")

    for epoch in range(epoch_start, epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

        # The following block writes a checkpoint if one has not been written
        # in the past checkpoint_period minutes
        if save_model and (time.time() - last_checkpoint_time) > (checkpoint_period * 60):
            print("*************** Triggering checkpoint ***************")
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()
                }, checkpoint_output)
            last_checkpoint_time = time.time()  # reset the checkpoint timer
def __init__(self, optimizer, gamma=0.96, step_size=100000):
    self.scheduler = StepLR(optimizer, step_size, gamma)
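# A minimal, self-contained sketch of how a wrapper like the one above might
# be used; the enclosing class name (SchedulerWrapper) and its step() method
# are assumptions, since the original only shows __init__.
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import StepLR

class SchedulerWrapper:
    def __init__(self, optimizer, gamma=0.96, step_size=100000):
        self.scheduler = StepLR(optimizer, step_size, gamma)

    def step(self):
        self.scheduler.step()

model = nn.Linear(4, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1)
sched = SchedulerWrapper(optimizer, gamma=0.96, step_size=100000)
for _ in range(3):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 4)).pow(2).mean()
    loss.backward()
    optimizer.step()
    sched.step()  # decays lr by gamma once every step_size calls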
def main():
    # Trainset stats: 2072002577 items from 124950714 sessions
    print('Initializing dataloader...')
    mtrain_loader = SpotifyDataloader(
        config_fpath=args.config,
        mtrain_mode=True,
        data_sel=(0, 99965071),  # 80% for training
        batch_size=TR_BATCH_SZ,
        shuffle=True,
        seq_mode=True)  # seq_mode implemented
    mval_loader = SpotifyDataloader(
        config_fpath=args.config,
        mtrain_mode=True,  # True, because we use part of the trainset as testset
        data_sel=(99965071, 104965071),  # (99965071, 124950714), 20% for testing
        batch_size=TS_BATCH_SZ,
        shuffle=False,
        seq_mode=True)

    # Load Teacher net
    SMT = SeqModel().cuda(GPU)
    checkpoint = torch.load(FPATH_T_NET_CHECKPOINT,
                            map_location='cuda:{}'.format(GPU))
    tqdm.write(
        "Loading saved teacher model from '{0:}'... loss: {1:.6f}".format(
            FPATH_T_NET_CHECKPOINT, checkpoint['loss']))
    SMT.load_state_dict(checkpoint['SM_state'])
    SMT_Enc = nn.Sequential(*list(SMT.children())[:1]).cuda(GPU)
    # SMT_EncFeat = nn.Sequential(*list(SMT.children())[:2])

    # Init Student net --> copy the classifier from the Teacher net
    SM = SeqModel_Student().cuda(GPU)
    SM.feature = deepcopy(SMT.feature)
    for p in list(SM.feature.parameters()):
        p.requires_grad = False
    SM.classifier = deepcopy(SMT.classifier)
    SM.classifier.weight.requires_grad = False
    SM.classifier.bias.requires_grad = False
    SM = SM.cuda(GPU)
    SM_optim = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                       SM.parameters()),
                                lr=LEARNING_RATE)
    SM_scheduler = StepLR(SM_optim, step_size=1, gamma=0.9)

    # Load checkpoint
    if args.load_continue_latest is None:
        START_EPOCH = 0
    else:
        latest_fpath = max(glob.iglob(MODEL_SAVE_PATH + "check*.pth"),
                           key=os.path.getctime)
        checkpoint = torch.load(latest_fpath,
                                map_location='cuda:{}'.format(GPU))
        tqdm.write("Loading saved model from '{0:}'... loss: {1:.6f}".format(
            latest_fpath, checkpoint['loss']))
        SM.load_state_dict(checkpoint['SM_state'])
        SM_optim.load_state_dict(checkpoint['SM_opt_state'])
        SM_scheduler.load_state_dict(checkpoint['SM_sch_state'])
        START_EPOCH = checkpoint['ep']

    # Train
    for epoch in trange(START_EPOCH, EPOCHS, desc='epochs', position=0,
                        ascii=True):
        tqdm.write('Train...')
        tr_sessions_iter = iter(mtrain_loader)
        total_corrects = 0
        total_query = 0
        total_trloss = 0
        for session in trange(len(tr_sessions_iter), desc='sessions',
                              position=1, ascii=True):
            SMT.eval()  # Teacher-net
            SM.train()  # Student-net
            x, labels, y_mask, num_items, index = next(tr_sessions_iter)
            # FIXED 13.Dec. SEPARATE LOGS. QUERY SHOULD NOT INCLUDE LOGS

            # Sample data for 'support' and 'query': e.g. 15 items = 7 support, 8 queries...
            num_support = num_items[:, 0].detach().numpy().flatten(
            )  # If num_items was an odd number, the query gets one more item.
            num_query = num_items[:, 1].detach().numpy().flatten()
            batch_sz = num_items.shape[0]

            # x: the first 10 items out of 20 are support items left-padded
            # with zeros; the last 10 are queries, right-padded.
            x = x.permute(0, 2, 1)  # bx70*20

            # x_feat_T: Teacher-net input, x_feat_S: Student-net input (que-log excluded)
            x_feat_T = torch.zeros(batch_sz, 72, 20)
            x_feat_T[:, :70, :] = x.clone()
            x_feat_T[:, 70, :10] = 1  # Sup/Que state indicator
            x_feat_T[:, 71, :10] = labels[:, :10].clone()

            x_feat_S = x_feat_T.clone()
            x_feat_S[:, :41, 10:] = 0  # remove que-log

            x_feat_T = x_feat_T.cuda(GPU)
            x_feat_S = Variable(x_feat_S).cuda(GPU)

            # Target: prepare the Teacher's intermediate output
            enc_target = SMT_Enc(x_feat_T)
            # target = SMT_EncFeat(x_feat_T)

            # target mask
            target_mask = y_mask.clone().unsqueeze(1).repeat(1, 128, 1).cuda(GPU)
            target_mask_que = target_mask.clone().cuda(GPU)
            target_mask_que[:, :, :10] = 0

            # y_mask
            y_mask_que = y_mask.clone()
            y_mask_que[:, :10] = 0

            # Forward & update
            y_hat_enc, y_hat = SM(x_feat_S)  # y_hat: b*20

            # Calculate distillation loss: IN_10
            loss1 = F.binary_cross_entropy_with_logits(
                input=y_hat_enc * target_mask_que,
                target=torch.sigmoid(enc_target) * target_mask_que)
            loss2 = F.l1_loss(input=y_hat_enc * target_mask_que,
                              target=enc_target * target_mask_que)
            loss = loss1 + loss2
            total_trloss += loss.item()
            SM.zero_grad()
            loss.backward()
            # Gradient clipping
            # torch.nn.utils.clip_grad_norm_(SM.parameters(), 0.5)
            SM_optim.step()

            # Decision
            SM.eval()
            y_prob = torch.sigmoid(
                y_hat * y_mask_que.cuda(GPU)).detach().cpu().numpy()  # bx20
            y_pred = (y_prob[:, 10:] > 0.5).astype(int)  # bx10
            y_numpy = labels[:, 10:].numpy()  # bx10

            # Acc
            total_corrects += np.sum(
                (y_pred == y_numpy) * y_mask_que[:, 10:].numpy())
            total_query += np.sum(num_query)

            # Restore GPU memory
            del loss, y_hat, y_hat_enc

            if (session + 1) % 500 == 0:
                hist_trloss.append(total_trloss / 900)
                hist_tracc.append(total_corrects / total_query)
                # Prepare display
                sample_sup = labels[0, (10 - num_support[0]):10].long().numpy().flatten()
                sample_que = y_numpy[0, :num_query[0]].astype(int)
                sample_pred = y_pred[0, :num_query[0]]
                sample_prob = y_prob[0, 10:10 + num_query[0]]
                tqdm.write("S:" + np.array2string(sample_sup) + '\n' +
                           "Q:" + np.array2string(sample_que) + '\n' +
                           "P:" + np.array2string(sample_pred) + '\n' +
                           "prob:" + np.array2string(sample_prob))
                tqdm.write(
                    "tr_session:{0:} tr_loss:{1:.6f} tr_acc:{2:.4f}".format(
                        session, hist_trloss[-1], hist_tracc[-1]))
                total_corrects = 0
                total_query = 0
                total_trloss = 0

            if (session + 1) % 25000 == 0:
                # Validation
                validate(mval_loader, SM, eval_mode=True, GPU=GPU)
                # Save
                torch.save(
                    {
                        'ep': epoch,
                        'sess': session,
                        'SM_state': SM.state_dict(),
                        'loss': hist_trloss[-1],
                        'hist_vacc': hist_vacc,
                        'hist_vloss': hist_vloss,
                        'hist_trloss': hist_trloss,
                        'SM_opt_state': SM_optim.state_dict(),
                        'SM_sch_state': SM_scheduler.state_dict()
                    }, MODEL_SAVE_PATH +
                    "check_{0:}_{1:}.pth".format(epoch, session))

        # Validation at the end of each epoch
        validate(mval_loader, SM, eval_mode=True, GPU=GPU)
        # Save
        torch.save(
            {
                'ep': epoch,
                'sess': session,
                'SM_state': SM.state_dict(),
                'loss': hist_trloss[-1],
                'hist_vacc': hist_vacc,
                'hist_vloss': hist_vloss,
                'hist_trloss': hist_trloss,
                'SM_opt_state': SM_optim.state_dict(),
                'SM_sch_state': SM_scheduler.state_dict()
            }, MODEL_SAVE_PATH + "check_{0:}_{1:}.pth".format(epoch, session))
        SM_scheduler.step()
def pre_train(hp, models, train_data, test_data):
    print("----------start pre-training models----------")
    view_num = len(models)
    par = []
    for i in range(view_num):
        models[i].cuda()
        models[i].train()
        par.append({'params': models[i].parameters()})
    optimizer = optim.Adam(par, lr=hp['pre_lr'])
    scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
    batch_size = hp['pre_size']
    loss_func = nn.MSELoss()
    for epoch in range(hp['pre_epoch']):
        scheduler.step()
        running_loss = 0.0
        data_num = 0
        for i in range(view_num):
            models[i].train()
        for i in range(3):
            data = train_data[i]
            if data is None:
                continue
            bag_num = len(data)
            data_num += bag_num
            max_step = int(bag_num / batch_size)
            while max_step * batch_size < bag_num:
                max_step += 1
            for step in range(max_step):
                # get data
                step_data = get_batch(
                    data,
                    list(range(step * batch_size,
                               min((step + 1) * batch_size, bag_num))), hp)
                x1, x2, bag1, bag2, y = step_data
                b_y = Variable(y).cuda()
                loss = 0
                if i == 0 or i == 2:
                    x_img = Variable(x1).cuda()
                    h1, _, _ = models[0](x_img, bag1)
                    loss += loss_func(h1, b_y)
                if i == 0 or i == 1:
                    x_text = Variable(x2).cuda()
                    h2, _, _ = models[1](x_text, bag2)
                    loss += loss_func(h2, b_y)
                running_loss += loss.data * x2.size(0)
                # backward
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        # epoch loss
        epoch_loss = running_loss / data_num
        print('epoch {}/{} | Loss: {:.9f}'.format(epoch, hp['pre_epoch'],
                                                  epoch_loss))
        rootpath = "{}{}/".format(hp['modelpath'], str(epoch + 1))
        os.makedirs(rootpath, exist_ok=True)
        save_model(models, rootpath)
        hp['rootdir'] = rootpath
        result = test(test_data, hp, models, 'pretrain')
    print("----------end pre-training models----------")
    return models
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=5, metavar='N',
                        help='number of epochs to train (default: 5)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=123, metavar='S',
                        help='random seed (default: 123)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    print("device: ", device)

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1, 'pin_memory': True, 'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('./MNIST', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('./MNIST', train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader, epoch)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
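# Hedged sketch (an assumption, not the original helpers) of train/test
# functions consistent with how they are called above; this mirrors the
# canonical PyTorch MNIST example, where Net() returns log-probabilities.
import torch
import torch.nn.functional as F

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)            # assumed to be log-probabilities
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                loss.item()))

def test(model, device, test_loader, epoch):
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            correct += output.argmax(dim=1).eq(target).sum().item()
    test_loss /= len(test_loader.dataset)
    print('Epoch {}: test loss {:.4f}, accuracy {}/{}'.format(
        epoch, test_loss, correct, len(test_loader.dataset)))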
def train(hp, models, train_data):
    # if hp['pretrain'] == 1:
    #     models = pre_train(hp, models, train_data)
    print("----------start training models----------")
    view_num = len(models)  # number of views
    l = hp['label']  # number of labels

    # initialize the K0 and M matrices
    k_0 = torch.nn.Softmax()(torch.eye(l))
    k_0 = k_0.data.numpy()
    k_0 = k_0 / np.max(k_0)
    k_0_inv = np.linalg.inv(k_0)
    m = cal_distance_matrix(k_0)
    m = m / np.max(m)

    trade = hp['trade_off']  # trade-off coefficient
    lr = hp['lr']
    ae_coe = hp['ae']
    par = []
    for i in range(view_num):
        models[i].cuda()
        par.append({'params': models[i].parameters()})
    optimizer = optim.Adam(par, lr=lr[0])
    scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
    ae_loss = torch.nn.MSELoss(reduction='elementwise_mean')
    batch_size = hp['batch_size'][0]

    def train_for_dataset(data, train_type):
        loss_record = np.zeros(5)
        if data is None:
            return loss_record
        if train_type in [4, 5] and hp['ae'] == 0:
            return loss_record
        if train_type == 3 and hp['semi'] == 0:
            return loss_record
        bag_num = len(data)
        max_step = int(bag_num / batch_size)
        while max_step * batch_size < bag_num:
            max_step += 1
        for step in range(max_step):
            step_data = get_batch(
                data,
                list(range(step * batch_size,
                           min((step + 1) * batch_size, bag_num))), hp)
            x1, x2, bag1, bag2, y = step_data
            if train_type == 0:
                x_img = Variable(x1).cuda()
                x_text = Variable(x2).cuda()
                b_y = Variable(y).cuda()
                # forward
                h1, fea1, dec1 = models[0](x_img, bag1)
                h2, fea2, dec2 = models[1](x_text, bag2)
                # loss
                w_loss = WassersteinLoss(m, hp['reg'])
                ae_loss1 = ae_loss(fea1, dec1)
                loss1 = w_loss(h1, b_y)
                total_loss = loss1 + hp['ae'] * (ae_loss1)
                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()
                w_loss = WassersteinLoss(m, hp['reg'])
                loss2 = w_loss(h2, b_y)
                ae_loss2 = ae_loss(fea2, dec2)
                total_loss = loss2 + hp['ae'] * (ae_loss2)
                loss_record[0] += loss1.data.cpu().numpy()[0] * x1.size(0)
                loss_record[1] += loss2.data.cpu().numpy()[0] * x1.size(0)
                loss_record[2] += ae_loss1.data.cpu().numpy() * x1.size(0)
                loss_record[3] += ae_loss2.data.cpu().numpy() * x1.size(0)
            elif train_type == 1:
                x_text = Variable(x2).cuda()
                b_y = Variable(y).cuda()
                # forward
                h2, fea2, dec2 = models[1](x_text, bag2)
                # loss
                w_loss = WassersteinLoss(m, hp['reg'])
                loss2 = w_loss(h2, b_y)
                ae_loss2 = ae_loss(fea2, dec2)
                total_loss = loss2 + hp['ae'] * (ae_loss2)
                loss_record[1] += loss2.data.cpu().numpy()[0] * x2.size(0)
                loss_record[3] += ae_loss2.data.cpu().numpy() * x2.size(0)
            elif train_type == 2:
                x_img = Variable(x1).cuda()
                b_y = Variable(y).cuda()
                # forward
                h1, fea1, dec1 = models[0](x_img, bag1)
                # loss
                w_loss = WassersteinLoss(m, hp['reg'])
                loss1 = w_loss(h1, b_y)
                ae_loss1 = ae_loss(fea1, dec1)
                total_loss = loss1 + hp['ae'] * (ae_loss1)
                loss_record[0] += loss1.data.cpu().numpy()[0] * x1.size(0)
                loss_record[2] += ae_loss1.data.cpu().numpy() * x1.size(0)
            elif train_type == 3 and hp['semi'] == 1:
                x_img = Variable(x1).cuda()
                x_text = Variable(x2).cuda()
                # forward
                h1, fea1, dec1 = models[0](x_img, bag1)
                h2, fea2, dec2 = models[1](x_text, bag2)
                # loss
                w_loss = WassersteinLoss(m, hp['reg'])
                semi_loss = w_loss(h1, h2)
                ae_loss1 = ae_loss(fea1, dec1)
                ae_loss2 = ae_loss(fea2, dec2)
                total_loss = semi_loss + hp['ae'] * (ae_loss1 + ae_loss2)
                loss_record[2] += ae_loss1.data.cpu().numpy() * x1.size(0)
                loss_record[3] += ae_loss2.data.cpu().numpy() * x1.size(0)
                loss_record[4] += semi_loss.data.cpu().numpy()[0] * x1.size(0)
            elif train_type == 4 and hp['ae'] != 0:
                x_text = Variable(x2).cuda()
                # forward
                h2, fea2, dec2 = models[1](x_text, bag2)
                # loss
                ae_loss2 = ae_loss(fea2, dec2)
                total_loss = hp['ae'] * ae_loss2
                loss_record[3] += ae_loss2.data.cpu().numpy() * x2.size(0)
            elif train_type == 5 and hp['ae'] != 0:
                x_img = Variable(x1).cuda()
                # forward
                h1, fea1, dec1 = models[0](x_img, bag1)
                # loss
                ae_loss1 = ae_loss(fea1, dec1)
                total_loss = hp['ae'] * ae_loss1
                loss_record[2] += ae_loss1.data.cpu().numpy() * x1.size(0)
            # backward
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
        return loss_record

    store_loss = np.zeros((hp['epoch'] * hp['epoch_1'], 5))
    K = 0
    for epoch in range(hp['epoch']):
        for epoch_1 in range(hp['epoch_1']):
            scheduler.step()
            for t in range(view_num):
                models[t].train()
            for i in range(len(train_data)):
                print(epoch, epoch_1, i)
                data = train_data[i]
                loss_for_dataset = train_for_dataset(data, i)
                store_loss[epoch * hp['epoch_1'] + epoch_1] += \
                    loss_for_dataset.reshape((-1))

        # second stage
        K = 0
        if hp['fixed'] == 0:
            for i in range(view_num):
                models[i].eval()
            T = np.zeros((l, l))
            # calculate T
            for i in range(len(train_data)):
                # get data
                if i > 2:
                    continue
                data = train_data[i]
                if data is None:
                    continue
                for j in range(len(data)):
                    x1, x2, bag1, bag2, b_y = get_batch(data, [j], hp)
                    b_y = b_y.cpu().numpy().reshape((-1,))
                    b_y[b_y <= 0] = 1e-9
                    b_y = b_y / np.sum(b_y)
                    x_img = None
                    x_text = None
                    if i == 0 or i == 2:
                        x_img = Variable(x1).cuda()
                        h = models[0](x_img, bag1)[0].cpu().data.numpy()
                        h[h <= 0] = 1e-9
                        h = h / np.sum(h)
                        Gs = ot.sinkhorn(h.reshape(-1), b_y.reshape(-1),
                                         m / np.max(m), hp['reg'])
                        T += Gs
                    if i == 0 or i == 1:
                        x_text = Variable(x2).cuda()
                        h = models[1](x_text, bag2)[0].cpu().data.numpy()
                        h[h <= 0] = 1e-9
                        h = h / np.sum(h)
                        Gs = ot.sinkhorn(h.reshape(-1), b_y.reshape(-1),
                                         m / np.max(m), hp['reg'])
                        T += Gs
            # T /= (bag_num * view_num)

            # calculate K
            G = np.zeros((l, l))
            for i in range(l):
                for j in range(l):
                    if i == j:
                        for k in range(l):
                            if k != i:
                                G[i][j] -= (T[i][k] + T[k][i])
                    else:
                        G[i][j] = 2 * T[i][j]
            K = np.linalg.inv(k_0_inv - G / trade)
            # K = k_0 + G / trade / np.max(G)
            K = (K + K.T) / 2
            u, v = np.linalg.eig(K)
            u[u < 0] = 0
            K = np.dot(v, np.dot(np.diag(u), v.T))
            # calculate M
            m = cal_distance_matrix(K)
            m = m / np.max(m)

    # save the losses
    np.save("{}loss.npy".format(hp['rootdir']), store_loss)
    # save the corr matrices
    np.save("{}M.npy".format(hp['rootdir']), m)
    np.save("{}K.npy".format(hp['rootdir']), K)
    return models
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=512, metavar='N',
                        help='input batch size for training (default: 512)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=1, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--T', type=int, default=450, metavar='N',
                        help='SNN time window')
    parser.add_argument('--resume', type=str, default=None, metavar='RESUME',
                        help='Resume model from checkpoint')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'batch_size': args.batch_size}
    if use_cuda:
        kwargs.update({'num_workers': 1, 'pin_memory': True, 'shuffle': True})

    transform_train = transforms.Compose(
        [transforms.ToTensor(), AddGaussianNoise(std=0.01)])
    transform = transforms.Compose([transforms.ToTensor()])
    dataset1 = datasets.MNIST('../data', train=True, download=True,
                              transform=transform_train)
    # augment the training set with 30 randomly rotated, noisy copies
    for i in range(30):
        transform_train_1 = transforms.Compose([
            transforms.RandomRotation(10),
            # transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            AddGaussianNoise(std=0.01)
        ])
        dataset1 = dataset1 + datasets.MNIST(
            '../data', train=True, download=True, transform=transform_train_1)
    dataset2 = datasets.MNIST('../data', train=False, transform=transform)
    snn_dataset = SpikeDataset(dataset2, T=args.T)
    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)
    snn_loader = torch.utils.data.DataLoader(snn_dataset, **kwargs)

    model = Net().to(device)
    snn_model = CatNet(args.T).to(device)

    if args.resume is not None:
        load_model(torch.load(args.resume), model)
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    ACC = 0
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        ACC_ = test(model, device, test_loader)
        if ACC_ >= ACC:
            ACC = ACC_
            torch.save(model.state_dict(), "mnist_pretrained.pt")
        scheduler.step()

    # After retraining with the Q function, you can transfer the ANN to an SNN.
    fuse_module(model)
    transfer_model(model, snn_model)
    test(snn_model, device, snn_loader)
def main():
    parser = argparse.ArgumentParser(description="My CNN")
    parser.add_argument("--batch-size", type=int, default=64, metavar='N',
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda") if use_cuda else torch.device("cpu")
    print(device)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    # transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    # trainset = torchvision.datasets.CIFAR10('../data', train=True, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../data', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.5, 0.5, 0.5),
                                                  (0.5, 0.5, 0.5))
                         ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('../data', train=False,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.5, 0.5, 0.5),
                                                  (0.5, 0.5, 0.5))
                         ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "cifar10_cnn.pt")
def get_model_optimizer_scheduler(args, device, train_loader, test_loader,
                                  criterion):
    if args.model == 'lenet':
        model = LeNet().to(device)
        if args.pretrained_model_dir is None:
            optimizer = torch.optim.Adadelta(model.parameters(), lr=1)
            scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    elif args.model == 'vgg16':
        model = VGG(depth=16).to(device)
        if args.pretrained_model_dir is None:
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                                        momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(optimizer,
                                    milestones=[
                                        int(args.pretrain_epochs * 0.5),
                                        int(args.pretrain_epochs * 0.75)
                                    ], gamma=0.1)
    elif args.model == 'vgg19':
        model = VGG(depth=19).to(device)
        if args.pretrained_model_dir is None:
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                                        momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(optimizer,
                                    milestones=[
                                        int(args.pretrain_epochs * 0.5),
                                        int(args.pretrain_epochs * 0.75)
                                    ], gamma=0.1)
    elif args.model == 'resnet18':
        model = ResNet18().to(device)
        if args.pretrained_model_dir is None:
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                                        momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(optimizer,
                                    milestones=[
                                        int(args.pretrain_epochs * 0.5),
                                        int(args.pretrain_epochs * 0.75)
                                    ], gamma=0.1)
    else:
        raise ValueError("model not recognized")

    if args.pretrained_model_dir is None:
        print('start pre-training...')
        best_acc = 0
        for epoch in range(args.pretrain_epochs):
            train(args, model, device, train_loader, criterion, optimizer,
                  epoch)
            scheduler.step()
            acc = test(args, model, device, criterion, test_loader)
            if acc > best_acc:
                best_acc = acc
                state_dict = model.state_dict()
        model.load_state_dict(state_dict)
        acc = best_acc
        torch.save(
            state_dict,
            os.path.join(args.experiment_data_dir,
                         f'pretrain_{args.dataset}_{args.model}.pth'))
        print('Model trained saved to %s' % args.experiment_data_dir)
    else:
        model.load_state_dict(torch.load(args.pretrained_model_dir))
        best_acc = test(args, model, device, criterion, test_loader)

    # setup a new optimizer for pruning
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9,
                                weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer,
                            milestones=[
                                int(args.pretrain_epochs * 0.5),
                                int(args.pretrain_epochs * 0.75)
                            ], gamma=0.1)

    print('Pretrained model acc:', best_acc)
    return model, optimizer, scheduler
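# Hedged usage sketch (not from the original): wiring the factory above into a
# script. The `args` fields and the train/test helpers mirror the names
# referenced inside get_model_optimizer_scheduler and are assumptions.
criterion = torch.nn.CrossEntropyLoss()
model, optimizer, scheduler = get_model_optimizer_scheduler(
    args, device, train_loader, test_loader, criterion)
for epoch in range(args.fine_tune_epochs):  # hypothetical args field
    train(args, model, device, train_loader, criterion, optimizer, epoch)
    scheduler.step()
    test(args, model, device, criterion, test_loader)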
def __init__(self, args):
    super().__init__()
    self.args = args
    self.polar = args.polar
    self.act = nn.ELU()
    # (1, 60, 160)
    self.d1 = 32
    self.conv1 = nn.Conv2d(1, self.d1, kernel_size=3, stride=1, padding=1,
                           padding_mode='replicate')
    # (32, 60, 160)
    self.batchNorm1 = nn.BatchNorm2d(self.d1)
    self.pool1 = nn.MaxPool2d(2)
    self.dropout2d1 = nn.Dropout2d(0.9)
    # (32, 30, 80)
    self.d2 = 32
    self.conv2 = nn.Conv2d(self.d1, self.d2, kernel_size=3, stride=1,
                           padding=1, padding_mode='replicate')
    # (32, 30, 80)
    self.batchNorm2 = nn.BatchNorm2d(self.d2)
    self.pool2 = nn.MaxPool2d(2)
    self.dropout2d2 = nn.Dropout2d(0.9)
    # (32, 15, 40)
    self.d3 = 64
    self.conv3 = nn.Conv2d(self.d2, self.d3, kernel_size=3, stride=1,
                           padding=1, padding_mode='replicate')
    # (64, 15, 40)
    self.batchNorm3 = nn.BatchNorm2d(self.d3)
    self.pool3 = nn.MaxPool2d(2)
    self.dropout2d3 = nn.Dropout2d(0.9)
    # (64, 7, 20)
    self.d4 = 64
    self.conv4 = nn.Conv2d(self.d3, self.d4, kernel_size=3, stride=1,
                           padding=1, padding_mode='replicate')
    # (64, 7, 20)
    self.batchNorm4 = nn.BatchNorm2d(self.d4)
    self.pool4 = nn.MaxPool2d(2)
    self.dropout2d4 = nn.Dropout2d(0.9)
    # (64, 3, 10)
    self.fc1 = nn.Linear(1920, 16)
    self.dropout1 = nn.Dropout(0.8)
    self.fc2 = nn.Linear(16, 16)
    self.dropout2 = nn.Dropout(0.8)
    self.fc3 = nn.Linear(16, 4)

    self.forward_pass = nn.Sequential(
        self.conv1, self.batchNorm1, self.pool1,  # self.dropout2d1,
        self.conv2, self.batchNorm2, self.pool2,  # self.dropout2d2,
        self.conv3, self.batchNorm3, self.pool3,  # self.dropout2d3,
        self.conv4, self.batchNorm4, self.pool4,  # self.dropout2d4,
        nn.Flatten(),
        self.fc1, self.fc2, self.fc3)

    self.optimizer = optim.Adam(self.parameters(), lr=args.lr)
    self.scheduler = StepLR(self.optimizer, step_size=args.step,
                            gamma=args.gamma)
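# A hedged sketch of the module's forward method: the excerpt above only shows
# __init__, so this assumes inference simply runs the nn.Sequential it builds.
def forward(self, x):
    return self.forward_pass(x)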
def train(data_loader, model_index, x_eval_train, gn_fp, dn_fp, ave_fp):
    ### Model Initiation
    gn = GN().cuda()
    dn = DN().cuda()
    ave_state_dict = tor.load(ave_fp)
    gn.load_ave_state(ave_state_dict)
    dn.load_ave_state(ave_state_dict)

    if gn_fp:
        gn_state_dict = tor.load(gn_fp)
        gn.load_state_dict(gn_state_dict)
    if dn_fp:
        dn_state_dict = tor.load(dn_fp)
        dn.load_state_dict(dn_state_dict)

    gn.cuda()
    dn.cuda()

    loss_func = tor.nn.BCELoss().cuda()
    # optim = tor.optim.SGD(fcn.parameters(), lr=LR, momentum=MOMENTUM)
    optim_gn = tor.optim.Adam(gn.parameters(), lr=LR)
    optim_dn = tor.optim.Adam(dn.parameters(), lr=LR)

    lr_step_gn = StepLR(optim_gn, step_size=LR_STEPSIZE, gamma=LR_GAMMA)
    lr_step_dn = StepLR(optim_dn, step_size=LR_STEPSIZE, gamma=LR_GAMMA)

    ### Training
    for epoch in range(EPOCH):
        print("|Epoch: {:>4} |".format(epoch + 1))

        for step, (x_batch, y_batch) in enumerate(data_loader):
            print("Process: {}/{}".format(step, int(AVAILABLE_SIZE[0] / BATCHSIZE)),
                  end="\r")

            ### train on true/false pics
            if (step // PIVOT_STEPS) % 3 != 2:
                # discriminator phase: alternate real batches and generated ones
                out = Variable(x_batch).cuda() if step % 2 == 0 \
                    else gn(Variable(tor.randn(BATCHSIZE, 512)).cuda())
                ans = Variable(tor.ones(BATCHSIZE, 1)).cuda() if step % 2 == 0 \
                    else Variable(tor.zeros(BATCHSIZE, 1)).cuda()
                dis = dn(out)
                optim = optim_dn
            else:
                # generator phase: generated pics labeled as real
                out = gn(Variable(tor.randn(BATCHSIZE, 512)).cuda()).cuda()
                ans = Variable(tor.ones(BATCHSIZE, 1)).cuda()
                dis = dn(out)
                optim = optim_dn

            loss = loss_func(dis, ans)
            print(loss.data)
            loss.backward()

            if (step // PIVOT_STEPS) % 3 != 2:
                optim_dn.step()
            else:
                optim_gn.step()

            optim_dn.zero_grad()
            optim_gn.zero_grad()
            lr_step_dn.step()
            lr_step_gn.step()

            if step % RECORD_JSON_PERIOD == 0:
                x_true = Variable(x_eval_train).cuda()
                out = dn(x_true)
                acc_true = round(int((out > 0.5).sum().data) / EVAL_SIZE, 5)
                x_false = gn(Variable(tor.randn((EVAL_SIZE, 512))).cuda())
                out = dn(x_false)
                acc_false = round(int((out <= 0.5).sum().data) / EVAL_SIZE, 5)
                print("|Acc True: {} |Acc False: {}".format(acc_true, acc_false))
                save_record(model_index, epoch, optim, loss, acc_true, acc_false)

            if step % RECORD_PIC_PERIOD == 0:
                loss = float(loss.data)
                print("|Loss: {:<8}".format(loss))
                save_pic("output_{}".format(model_index), gn, 3)

            if step % (2 * PIVOT_STEPS) == 0:
                pass

            ### Save model
            if step % RECORD_MODEL_PERIOD == 0:
                tor.save(gn.state_dict(),
                         os.path.join(MODEL_ROOT,
                                      "gan_gn_{}_{}.pkl".format(model_index, epoch)))
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch training script for SUN397 dataset')
    parser.add_argument('conf_file')
    parser.add_argument('output_dir', help='Model save directory')
    parser.add_argument('-w', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('-b', '--batch-size', default=64, type=int, metavar='N',
                        help='mini-batch size')
    parser.add_argument('-T', '--tensor-board-dir',
                        help='TensorBoard log dir', default='runs')
    parser.add_argument('--restart', help='Restart from the most recent run',
                        default=False, action='store_true')
    parser.add_argument('--checkpoint', help='checkpoint file')
    parser.add_argument('--eval', default=False, action='store_true',
                        help='run evaluation only, without training')
    args = parser.parse_args()

    conf = load_conf(args.conf_file)
    train_set, val_set, net, criterion, metrics_dict, (
        score_name, score_function) = task_factory(conf['task'])(conf)

    if args.restart:
        run_id = find_recent_output_dir(conf['tag'], args.output_dir)
    else:
        run_id = '%s_%s' % (conf['tag'], datetime.now().strftime('%Y%m%d%H%M'))
    output_dir = os.path.join(args.output_dir, run_id)

    checkpoint_handler = CheckpointManager(output_dir, 'model',
                                           score_name=score_name,
                                           score_function=score_function,
                                           extra={'conf': conf, 'args': vars(args)})
    shutil.copy(args.conf_file, os.path.join(output_dir, 'conf.json'))

    loader_pin_memory = torch.cuda.is_available()
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=loader_pin_memory,
                                               drop_last=False)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=loader_pin_memory,
                                             drop_last=False)

    writer = create_summary_writer(net, train_loader, args.tensor_board_dir, run_id)

    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
        criterion = criterion.cuda()

    optimizer = torch.optim.Adam(net.parameters(), lr=conf['lr'],
                                 weight_decay=conf['weight_decay'])
    trainer = create_supervised_trainer(net, optimizer, criterion,
                                        device=device,
                                        gradient_clip=conf['clip_gradient'])
    train_evaluator = create_supervised_evaluator(net, metrics=metrics_dict, device=device)
    evaluator = create_supervised_evaluator(net, metrics=metrics_dict, device=device)

    step_scheduler = StepLR(optimizer, step_size=conf['lr_step'], gamma=conf['lr_decay'])
    scheduler = LRScheduler(step_scheduler)
    trainer.add_event_handler(Events.EPOCH_STARTED, scheduler)

    all_params = {'model': net, 'optimizer': optimizer, 'lr_scheduler': step_scheduler}
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, all_params)

    desc = "ITERATION - loss: {:.2f}"
    pbar = tqdm(initial=0, leave=False, total=len(train_loader), desc=desc.format(0))
    log_interval = 10

    # load checkpoint
    if args.restart and checkpoint_handler.is_checkpoint_available():
        state_dicts = checkpoint_handler.load_last()
        load_model(all_params, state_dicts)
    elif args.checkpoint is not None:
        state_dicts = checkpoint_handler.load(args.checkpoint)
        load_model(all_params, state_dicts)

    @trainer.on(Events.EPOCH_STARTED)
    def setup_engine(engine):
        if engine.state.epoch == 1:
            engine.state.epoch = checkpoint_handler.epoch_ + 1

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(engine):
        # renamed from `iter` to avoid shadowing the builtin
        it = (engine.state.iteration - 1) % len(train_loader) + 1
        if it % log_interval == 0:
            pbar.desc = desc.format(engine.state.output)
            pbar.update(log_interval)
            writer.add_scalar("training/loss", engine.state.output,
                              engine.state.iteration)
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        train_evaluator.run(train_loader)
        log_results(engine, train_evaluator, "Training", writer)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        checkpoint_handler.epoch_ = engine.state.epoch
        evaluator.run(val_loader)
        log_results(engine, evaluator, "Validation", writer)
        pbar.n = pbar.last_print_n = 0

    if args.eval:
        evaluator.run(val_loader)
        # no trainer engine exists in eval-only mode, so the evaluator is
        # passed for both engine arguments
        log_results(evaluator, evaluator, "Validation", writer)
    else:
        trainer.run(train_loader, max_epochs=conf['epochs'])

    pbar.close()
    print("END")
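# Minimal, self-contained sketch of the ignite pattern used above
# (create_supervised_trainer plus an event handler). The model and data here
# are toy stand-ins, not the SUN397 setup, and assume `pytorch-ignite` is
# installed.
import torch
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import Events, create_supervised_trainer

net = torch.nn.Linear(4, 2)
loader = DataLoader(TensorDataset(torch.randn(32, 4),
                                  torch.randint(0, 2, (32,))), batch_size=8)
trainer = create_supervised_trainer(net,
                                    torch.optim.Adam(net.parameters()),
                                    torch.nn.CrossEntropyLoss())

@trainer.on(Events.ITERATION_COMPLETED)
def log_loss(engine):
    # engine.state.output holds the batch loss returned by the update step
    print(engine.state.iteration, engine.state.output)

trainer.run(loader, max_epochs=1)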
def main():
    # Training settings
    # Use the command line to modify the default settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=256, metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--validation-percentage', type=float, default=15., metavar='P',
                        help='percentage of training data used for validation (default: 15)')
    parser.add_argument('--training-division', type=float, default=1., metavar='D',
                        help='divide the remaining training data by this factor')
    parser.add_argument('--epochs', type=int, default=12, metavar='N',
                        help='number of epochs to train (default: 12)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--step', type=int, default=1, metavar='N',
                        help='number of epochs between learning rate reductions (default: 1)')
    parser.add_argument('--gamma', type=float, default=1, metavar='M',
                        help='learning rate step gamma (default: 1)')
    parser.add_argument('--no-cuda', action='store_true',
                        help='disables CUDA training')
    parser.add_argument('--no-augmentation', action='store_true',
                        help='disables data augmentation')
    parser.add_argument('--seed', type=int, default=2020, metavar='S',
                        help='random seed (default: 2020)')
    parser.add_argument('--log-numbers', type=int, default=1, metavar='N',
                        help='how many entries of logging training status to show per epoch')
    parser.add_argument('--name', type=str, default='default', metavar='name',
                        help='name of the model')
    parser.add_argument('--root', type=str, default='../data/hw03_outputs/', metavar='path',
                        help='path to save all models and plots')
    parser.add_argument('--plot', action='store_true',
                        help='plot the training curve')
    parser.add_argument('--evaluate', action='store_true',
                        help='evaluate your model on the official test set')
    parser.add_argument('--save-model', action='store_true',
                        help='save the current model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Evaluate on the official test set
    if args.evaluate:
        path_model = args.root + args.name + '.pt'
        assert os.path.exists(path_model)

        # Set the test model
        model = Net().to(device)
        model.load_state_dict(torch.load(path_model))

        test_dataset = datasets.MNIST('../data', train=False,
                                      transform=transforms.Compose([
                                          transforms.ToTensor(),
                                          transforms.Normalize((0.1307,), (0.3081,))
                                      ]))
        test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=args.test_batch_size, shuffle=False, **kwargs)

        test_loss, correct, preds = test(model, device, test_loader)
        np.save(args.root + args.name + '_test_loss.npy', test_loss)
        np.save(args.root + args.name + '_test_accuracy.npy',
                correct / len(test_loader.sampler) * 100)
        np.save(args.root + args.name + '_preds.npy', preds)
        return

    # PyTorch has a default MNIST dataloader which loads data at each iteration
    train_dataset = datasets.MNIST('../data', train=True, download=True,
                                   transform=transforms.Compose([
                                       # Data preprocessing
                                       transforms.ToTensor(),
                                       # Add data augmentation here
                                       transforms.Normalize((0.1307,), (0.3081,))
                                   ]))
    train_dataset_augmented = datasets.MNIST('../data', train=True, download=True,
                                             transform=transforms.Compose([
                                                 transforms.RandomAffine(4, translate=(.1, .1),
                                                                         scale=(.9, 1.1), shear=(2, 2, 2, 2)),
                                                 transforms.ToTensor(),
                                                 transforms.Normalize((0.1307,), (0.3081,))
                                             ]))

    # stratified split: permute indices per class, then take the leading
    # fraction for training and the trailing fraction for validation
    train_labels = np.array([data[1] for data in train_dataset])
    labels = np.unique(train_labels)
    rng = np.random.default_rng(args.seed)
    train_label_idc = [rng.permutation(np.argwhere(train_labels == l)) for l in labels]
    subset_indices_train = [idx[0] for idc in train_label_idc
                            for idx in idc[:np.round(len(idc) * (1 - args.validation_percentage / 100)
                                                     / args.training_division).astype(int)]]
    subset_indices_valid = [idx[0] for idc in train_label_idc
                            for idx in idc[np.round(len(idc) * (1 - args.validation_percentage / 100)).astype(int):]]

    if args.no_augmentation:
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size,
            sampler=SubsetRandomSampler(subset_indices_train))
    else:
        train_loader = torch.utils.data.DataLoader(
            train_dataset_augmented, batch_size=args.batch_size,
            sampler=SubsetRandomSampler(subset_indices_train))
    val_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size,
        sampler=SubsetRandomSampler(subset_indices_valid))

    # Load your model [fcNet, ConvNet, Net]
    model = Net().to(device)

    # Try different optimizers here [Adam, SGD, RMSprop]
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    # Set your learning rate scheduler
    scheduler = StepLR(optimizer, step_size=args.step, gamma=args.gamma)

    # Training loop
    train_loss = np.zeros((args.epochs,))
    val_loss = np.zeros((args.epochs,))
    train_correct = np.zeros((args.epochs,))
    val_correct = np.zeros((args.epochs,))
    for epoch in range(args.epochs):
        train(args, model, device, train_loader, optimizer, epoch)
        train_loss[epoch], train_correct[epoch] = test(model, device, train_loader, name='Training')
        val_loss[epoch], val_correct[epoch] = test(model, device, val_loader, name='Validation')
        print()
        scheduler.step()  # learning rate scheduler

    np.save(args.root + args.name + '_train_loss.npy', train_loss)
    np.save(args.root + args.name + '_val_loss.npy', val_loss)
    np.save(args.root + args.name + '_train_accuracy.npy',
            train_correct / len(train_loader.sampler) * 100)
    # fixed: this previously reused the '_train_accuracy' filename and
    # overwrote the training accuracy with the validation accuracy
    np.save(args.root + args.name + '_val_accuracy.npy',
            val_correct / len(val_loader.sampler) * 100)

    if args.save_model:
        torch.save(model.state_dict(), args.root + args.name + '.pt')

    if args.plot:
        fig = plt.figure(figsize=(8, 6), tight_layout=True)
        ax1 = plt.axes()
        ax1.plot(np.arange(args.epochs), train_loss, 'b-', label='Training Loss')
        ax1.plot(np.arange(args.epochs), val_loss, 'r-', label='Validation Loss')
        ax1.set_xlabel('Epochs', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Negative Log Likelihood Loss', fontsize=14, fontweight='bold')
        ax2 = ax1.twinx()
        ax2.plot(np.arange(args.epochs), train_correct / len(train_loader.sampler) * 100,
                 'b:', label='Training Accuracy')
        ax2.plot(np.arange(args.epochs), val_correct / len(val_loader.sampler) * 100,
                 'r:', label='Validation Accuracy')
        ax2.set_ylabel('Accuracy %', fontsize=14, fontweight='bold')
        lines1, line_labels1 = ax1.get_legend_handles_labels()
        lines2, line_labels2 = ax2.get_legend_handles_labels()
        ax2.legend(lines1 + lines2, line_labels1 + line_labels2, loc='right', fontsize=12)
        plt.savefig(args.root + args.name + '.pdf', pad_inches=0, bbox_inches='tight')
        plt.show()
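# Stand-alone sketch of the stratified train/validation split used above,
# run on fake labels instead of MNIST so it executes without downloading
# anything; the class count and split fraction are illustrative.
import numpy as np

labels = np.repeat(np.arange(3), 10)   # 3 classes, 10 samples each
rng = np.random.default_rng(0)
per_class = [rng.permutation(np.argwhere(labels == l)) for l in np.unique(labels)]
val_frac = 0.2
train_idx = [i[0] for idc in per_class
             for i in idc[:int(round(len(idc) * (1 - val_frac)))]]
valid_idx = [i[0] for idc in per_class
             for i in idc[int(round(len(idc) * (1 - val_frac))):]]
print(len(train_idx), len(valid_idx))  # 24 6 -> each class split 8/2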
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=64, metavar="N",
                        help="number of epochs to train (default: 64)")
    parser.add_argument("--learning-rate", type=float, default=0.1, metavar="LR",
                        help="the learning rate (default: 0.1)")
    parser.add_argument("--gamma", type=float, default=0.5, metavar="M",
                        help="learning rate step gamma (default: 0.5)")
    # note: default=True means CUDA stays disabled unless this default is changed
    parser.add_argument("--no-cuda", action="store_true", default=True,
                        help="disables CUDA training")
    parser.add_argument("--dry-run", action="store_true", default=False,
                        help="quickly check a single pass")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=10, metavar="N",
                        help="how many batches to wait before logging training status")
    parser.add_argument("--save-model", action="store_true", default=True,
                        help="for saving the current model")
    parser.add_argument("--load_state_dict", type=str, default="no",
                        help="load the trained model weights or not (default: no)")
    parser.add_argument("--model", type=str, default="LeNet",
                        help="choose the model to train (default: LeNet)")
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()  # precedence: not > and > or
    print("use cuda: {}".format(use_cuda))

    torch.manual_seed(args.seed)  # set the random seed for reproducibility

    device = torch.device("cuda" if use_cuda else "cpu")

    train_kwargs = {"batch_size": args.batch_size}
    test_kwargs = {"batch_size": args.test_batch_size}
    # *args and **kwargs in a function definition let the function accept an
    # arbitrary number of positional/keyword arguments, so the argument count
    # need not be fixed in advance.
    if use_cuda:
        cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        # Normalize(mean, std, inplace=False): per-channel mean and standard
        # deviation of the dataset; output = (input - mean) / std, i.e.
        # standardization (not strictly a mapping to [-1, 1])
        transforms.Normalize((0.1307, ), (0.3081, ))
    ])
    dataset1 = datasets.MNIST("./data", train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST("./data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model_name = args.model.lower()
    if model_name == "lenet":
        model = LeNet().to(device)
    elif model_name == "defaultnet":
        model = DefaultNet().to(device)
    elif model_name == "mynetv1":
        model = MyNetV1().to(device)
    elif model_name == "mynetv2":
        model = MyNetV2().to(device)
    elif model_name == "myfullconvnet":
        model = MyFullConvNet().to(device)
    elif model_name == "myvggnet":
        model = MyVggNet().to(device)
    # model = Net().to(device)

    model_path = Path("./model/weights/{}.pt".format(model_name))
    if model_path.exists() and args.load_state_dict == "yes":
        model.load_state_dict(torch.load(model_path))
        print("Load the last trained model.")

    optimizer = optim.Adadelta(model.parameters(), lr=args.learning_rate)
    # optimizer_path = Path("./model/weights/")

    # the scheduler adjusts the learning rate (LambdaLR, StepLR, ...); with
    # StepLR, lr = lr * gamma ** (epoch // step_size)
    scheduler = StepLR(optimizer, step_size=5, gamma=args.gamma)
    # StepLR arguments:
    #   optimizer (Optimizer): the optimizer whose learning rate is adjusted
    #   step_size (int): decay the learning rate every step_size epochs
    #   gamma (float): multiplicative factor of learning-rate decay
    #   last_epoch (int): index of the last epoch; when resuming a model that
    #       already trained for several epochs, set this to the loaded model's
    #       epoch. The default -1 means training starts from scratch (epoch 1).
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "./model/weights/{}.pt".format(model_name))

    # record the training results
    create_loss_txt_path = "./model/result/{}_loss.txt".format(model_name)
    create_acc_txt_path = "./model/result/{}_acc.txt".format(model_name)
    with open(create_loss_txt_path, "w+") as f:
        for loss in graph_loss:
            f.write("{}\n".format(loss))
    with open(create_acc_txt_path, "w+") as f:
        for acc in graph_acc:
            f.write("{}\n".format(acc))
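# Worked example of the StepLR rule documented above:
# lr = initial_lr * gamma ** (epoch // step_size). The values (lr=0.1,
# step_size=5, gamma=0.5) are illustrative, not the script's settings.
import torch
from torch.optim.lr_scheduler import StepLR

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = StepLR(opt, step_size=5, gamma=0.5)
for epoch in range(12):
    opt.step()   # training would happen here
    sched.step()
    print(epoch, opt.param_groups[0]["lr"])
# epochs 0-3 end with lr=0.1, epochs 4-8 with 0.05, epochs 9-11 with 0.025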
def train_net(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = 0
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        if args.network == 'r18':
            model = resnet18(args)
        elif args.network == 'r34':
            model = resnet34(args)
        elif args.network == 'r50':
            model = resnet50(args)
        elif args.network == 'r101':
            model = resnet101(args)
        elif args.network == 'r152':
            model = resnet152(args)
        else:
            model = resnet_face18(args.use_se)
        model = nn.DataParallel(model)
        metric_fc = ArcMarginModel(args)
        metric_fc = nn.DataParallel(metric_fc)

        # one optimizer over two parameter groups: the backbone and the ArcFace head
        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD([{'params': model.parameters()},
                                         {'params': metric_fc.parameters()}],
                                        lr=args.lr, momentum=args.mom,
                                        weight_decay=args.weight_decay)
        else:
            optimizer = torch.optim.Adam([{'params': model.parameters()},
                                          {'params': metric_fc.parameters()}],
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        metric_fc = checkpoint['metric_fc']
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)
    metric_fc = metric_fc.to(device)

    # Loss function
    if args.focal_loss:
        criterion = FocalLoss(gamma=args.gamma).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    train_dataset = ArcFaceDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # note: since PyTorch 1.1 the recommended order is optimizer.step()
        # before scheduler.step(); stepping at the top of the epoch shifts
        # the decay schedule forward by one epoch
        scheduler.step()

        if args.full_log:
            lfw_acc, threshold = lfw_test(model)
            writer.add_scalar('LFW_Accuracy', lfw_acc, epoch)
            full_log(epoch)

        start = datetime.now()

        # One epoch's training
        train_loss, train_top5_accs = train(train_loader=train_loader,
                                            model=model,
                                            metric_fc=metric_fc,
                                            criterion=criterion,
                                            optimizer=optimizer,
                                            epoch=epoch,
                                            logger=logger)
        # train_dataset.shuffle()
        writer.add_scalar('Train_Loss', train_loss, epoch)
        writer.add_scalar('Train_Top5_Accuracy', train_top5_accs, epoch)

        end = datetime.now()
        delta = end - start
        print('{} seconds'.format(delta.seconds))

        # One epoch's validation
        if epoch > 10 and epoch % 2 == 0 and not args.full_log:
            start = datetime.now()
            lfw_acc, threshold = lfw_test(model)
            # tag unified with the one used above (was 'LFW Accuracy')
            writer.add_scalar('LFW_Accuracy', lfw_acc, epoch)

            # Check if there was an improvement
            is_best = lfw_acc > best_acc
            best_acc = max(lfw_acc, best_acc)
            if not is_best:
                epochs_since_improvement += 1
                print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
            else:
                epochs_since_improvement = 0

            # Save checkpoint
            save_checkpoint(epoch, epochs_since_improvement, model, metric_fc,
                            optimizer, best_acc, is_best)

            end = datetime.now()
            delta = end - start
            print('{} seconds'.format(delta.seconds))
def main():
    # parse the options
    opts = parse_args()

    # create the dataloaders
    dataloader = {
        "train": create_dataloader("train_valid" if opts.no_validation else "train", opts),
        "valid": create_dataloader("valid", opts),
    }

    # create the model
    model = Prover(opts)
    model.to(opts.device)

    # create the optimizer
    optimizer = torch.optim.RMSprop(
        model.parameters(),
        lr=opts.learning_rate,
        momentum=opts.momentum,
        weight_decay=opts.l2,
    )
    if opts.no_validation:
        scheduler = StepLR(optimizer, step_size=opts.lr_reduce_steps, gamma=0.1)
    else:
        scheduler = ReduceLROnPlateau(optimizer, patience=opts.lr_reduce_patience, verbose=True)

    # load the checkpoint
    start_epoch = 0
    if opts.resume is not None:
        log("loading model checkpoint from %s.." % opts.resume)
        if opts.device.type == "cpu":
            checkpoint = torch.load(opts.resume, map_location="cpu")
        else:
            checkpoint = torch.load(opts.resume)
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        start_epoch = checkpoint["n_epoch"] + 1
        model.to(opts.device)

    agent = Agent(model, optimizer, dataloader, opts)

    best_acc = -1.0
    for n_epoch in range(start_epoch, start_epoch + opts.num_epochs):
        log("EPOCH #%d" % n_epoch)

        # training
        loss_train = agent.train(n_epoch)

        # save the model checkpoint
        if n_epoch % opts.save_model_epochs == 0:
            agent.save(n_epoch, opts.checkpoint_dir)

        # validation
        if not opts.no_validation:
            loss_valid = agent.valid(n_epoch)

        # reduce the learning rate: StepLR steps unconditionally,
        # ReduceLROnPlateau steps on the monitored validation loss
        if opts.no_validation:
            scheduler.step()
        else:
            scheduler.step(loss_valid)
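# Minimal sketch contrasting the two scheduler branches above:
# ReduceLROnPlateau decays the learning rate only after `patience` epochs
# without improvement in the monitored metric. All numbers are illustrative.
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
plateau = ReduceLROnPlateau(opt, patience=2, factor=0.1)
for epoch, valid_loss in enumerate([1.0, 0.9, 0.9, 0.9, 0.9]):
    opt.step()                 # training would happen here
    plateau.step(valid_loss)   # unlike StepLR.step(), takes the metric
    print(epoch, opt.param_groups[0]["lr"])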
def train(train_dir, model_dir, config_path, checkpoint_path, n_steps,
          save_every, test_every, decay_every, n_speakers, n_valids,
          n_utterances, seg_len):
    """Train a d-vector network."""

    # setup
    total_steps = 0

    # load data
    dataset = SEDataset(train_dir, n_utterances, seg_len)
    train_set, valid_set = random_split(dataset, [len(dataset) - n_valids, n_valids])
    train_loader = DataLoader(train_set, batch_size=n_speakers, shuffle=True,
                              num_workers=4, collate_fn=pad_batch, drop_last=True)
    valid_loader = DataLoader(valid_set, batch_size=n_speakers, shuffle=True,
                              num_workers=4, collate_fn=pad_batch, drop_last=True)
    train_iter = iter(train_loader)

    print(f"Training starts with {len(train_set)} speakers. "
          f"(and {len(valid_set)} speakers for validation)")
    assert len(train_set) >= n_speakers
    assert len(valid_set) >= n_speakers

    # build network and training tools
    dvector = DVector().load_config_file(config_path)
    criterion = GE2ELoss()
    optimizer = SGD(list(dvector.parameters()) + list(criterion.parameters()), lr=0.01)
    scheduler = StepLR(optimizer, step_size=decay_every, gamma=0.5)

    # load checkpoint
    if checkpoint_path is not None:
        ckpt = torch.load(checkpoint_path)
        total_steps = ckpt["total_steps"]
        dvector.load_state_dict(ckpt["state_dict"])
        criterion.load_state_dict(ckpt["criterion"])
        optimizer.load_state_dict(ckpt["optimizer"])
        scheduler.load_state_dict(ckpt["scheduler"])

    # prepare for training
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dvector = dvector.to(device)
    criterion = criterion.to(device)
    writer = SummaryWriter(model_dir)
    pbar = tqdm.trange(n_steps)
    min_loss = float("inf")

    # start training
    for step in pbar:
        total_steps += 1

        # draw the next batch, restarting the loader when it is exhausted
        try:
            batch = next(train_iter)
        except StopIteration:
            train_iter = iter(train_loader)
            batch = next(train_iter)

        embd = dvector(batch.to(device)).view(n_speakers, n_utterances, -1)
        loss = criterion(embd)

        optimizer.zero_grad()
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(
            list(dvector.parameters()) + list(criterion.parameters()), max_norm=3)
        # scale selected gradients, as proposed for GE2E training
        dvector.embedding.weight.grad.data *= 0.5
        criterion.w.grad.data *= 0.01
        criterion.b.grad.data *= 0.01
        optimizer.step()
        scheduler.step()

        pbar.set_description(f"global = {total_steps}, loss = {loss:.4f}")
        writer.add_scalar("Training loss", loss, total_steps)
        writer.add_scalar("Training min loss", min_loss, total_steps)
        writer.add_scalar("Gradient norm", grad_norm, total_steps)

        if (step + 1) % test_every == 0:
            batch = next(iter(valid_loader))
            embd = dvector(batch.to(device)).view(n_speakers, n_utterances, -1)
            valid_loss = criterion(embd)
            writer.add_scalar("Validation loss", valid_loss, total_steps)

        if (step + 1) % save_every == 0:
            ckpt_path = os.path.join(model_dir, f"ckpt-{total_steps}.tar")
            ckpt_dict = {
                "total_steps": total_steps,
                "state_dict": dvector.state_dict(),
                "criterion": criterion.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            }
            torch.save(ckpt_dict, ckpt_path)

        if loss.item() < min_loss:
            min_loss = loss.item()
            ckpt_path = os.path.join(model_dir, "dvector-ckpt-min-loss.tar")
            ckpt_dict = {
                "total_steps": total_steps,
                "state_dict": dvector.state_dict(),
                "criterion": criterion.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
            }
            torch.save(ckpt_dict, ckpt_path)
            import json  # ideally hoisted to module level
            with open(os.path.join(model_dir, "min_loss_step.txt"), "w",
                      encoding="utf-8") as f:
                json.dump({'total_steps': total_steps, 'loss': min_loss}, f)

    print("Training completed.")
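# General form of the try/except StopIteration pattern used above to draw
# batches for step-based (rather than epoch-based) training; an illustrative
# helper, not part of the original script.
def infinite_batches(loader):
    """Yield batches forever, restarting the DataLoader when exhausted."""
    while True:
        for batch in loader:
            yield batch

# usage sketch:
# batches = infinite_batches(train_loader)
# for step in range(n_steps):
#     batch = next(batches)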
    # init_weight(model)
    if cuda:
        model = model.cuda()
        # loss_fn = loss_fn.cuda()

    # optimizer = Adam(
    #     [param for param in model.parameters() if param.requires_grad],
    #     lr=base_lr, weight_decay=1e-4)
    # scheduler = StepLR(optimizer, step_size=40, gamma=0.1)
    optimizer = Adam(
        [param for param in model.parameters() if param.requires_grad],
        lr=base_lr, weight_decay=config.weight_decay)
    scheduler = StepLR(optimizer, step_size=1, gamma=config.gamma)

    bind_nsml(model, optimizer, scheduler)
    if config.pause:
        nsml.paused(scope=locals())

    if mode == 'train':
        tr_loader, val_loader, val_label_file = data_loader_with_split(
            root=TRAIN_DATASET_PATH, train_split=train_split)
        num_batches = len(tr_loader)
        # local_eval(model, val_loader, val_label_file)
        # exit(0)