def main(args):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if 'vgg' in args.model_arch.lower():
        model = VGG(args.model_arch, True, args.dataset, 0, 3, 3)
    elif 'res' in args.model_arch.lower():
        model = ResNet_(args.model_arch, True, args.dataset, 0, 3, 3)

    my_dataset = Cifar10(args)
    model = model.to(device)
    # if torch.cuda.device_count() > 1: model = nn.DataParallel(model)
    my_dataset.get_loaders()

    model_path = args.resume
    model, _, _ = helper.load_checkpoint(args, model, optimizer=None, path=None)

    criterion = nn.CrossEntropyLoss()
    top_1_acc, _, _ = trainer.validate(my_dataset.test_loader, model, criterion, args)
def awgn_train(trainloader, valloader, val_set_size, device, args):
    # Define loggers
    log_writer_train = SummaryWriter('logs/train/')
    log_writer_val = SummaryWriter('logs/val/')

    # Set up the model and move it to the GPU
    net = FC_Autoencoder(args.k, args.n_channel)
    net = net.to(device)

    optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)  # optimize all network parameters
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.01)  # decay LR by a factor of 0.01 every 7 epochs
    loss_func = nn.CrossEntropyLoss()  # the target label is not one-hot encoded

    patience = 10  # early-stopping patience: epochs to wait after the last validation-loss improvement
    early_stopping = EarlyStopping(patience=patience, verbose=True)  # initialize the early_stopping object

    loss_vec = []
    start = time.time()
    for epoch in range(args.epochs):
        train_epoch_loss, train_epoch_acc = train(trainloader, net, optimizer, loss_func, device, loss_vec, args)
        val_loss, val_accuracy = validate(net, valloader, loss_func, val_set_size, device, args)
        print('Epoch: ', epoch + 1,
              '| train loss: %.4f' % train_epoch_loss,
              '| train acc: %.4f' % (train_epoch_acc * 100), '%',
              '| val loss: %.4f' % val_loss,
              '| val acc: %.4f' % (val_accuracy * 100), '%')
        log_writer_train.add_scalar('Train/Loss', train_epoch_loss, epoch)
        log_writer_train.add_scalar('Train/Accuracy', train_epoch_acc, epoch)
        log_writer_val.add_scalar('Val/Loss', val_loss, epoch)
        log_writer_val.add_scalar('Val/Accuracy', val_accuracy, epoch)

        early_stopping(val_loss, net)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    time_elapsed = time.time() - start
    print('Training completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    torch.save(net.state_dict(), 'trained_net_74AE.ckpt')  # save the trained net
    generate_encoded_sym_dict(args.n_channel, args.k, net, device)  # generate encoded symbols
    return net
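# A minimal sketch of the EarlyStopping helper assumed by awgn_train above
# (the real implementation may also checkpoint the best model). Only the
# interface used above is reproduced: EarlyStopping(patience, verbose),
# called with (val_loss, net), exposing an .early_stop flag.
class EarlyStopping:
    def __init__(self, patience=10, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, net):
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0
            if self.verbose:
                print('Validation loss improved to %.4f' % val_loss)
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True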
def main():
    with timer('load data'):
        df = pd.read_csv(TRAIN_PATH)
        df["loc_x"] = df["loc_x"] / 100
        df["loc_y"] = df["loc_y"] / 100
        y = df[TARGET_COLUMNS].values
        df = df[[ID_COLUMNS]]
        gc.collect()

    with timer("split data"):
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0).split(df, y)
        for n_fold, (train_index, val_index) in enumerate(folds):
            train_df = df.loc[train_index]
            val_df = df.loc[val_index]
            y_train = y[train_index]
            y_val = y[val_index]
            if n_fold == fold_id:
                break

    with timer('preprocessing'):
        train_augmentation = Compose([
            HorizontalFlip(p=0.5),
            OneOf([
                ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03),
                GridDistortion(p=0.5),
                OpticalDistortion(p=1, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            RandomBrightnessContrast(p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
            Resize(img_size, img_size, p=1)
        ])
        val_augmentation = Compose([
            Resize(img_size, img_size, p=1)
        ])

        train_dataset = KDDataset(train_df, y_train, img_size, IMAGE_PATH, id_colname=ID_COLUMNS,
                                  transforms=train_augmentation)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  num_workers=2, pin_memory=True)
        val_dataset = KDDataset(val_df, y_val, img_size, IMAGE_PATH, id_colname=ID_COLUMNS,
                                transforms=val_augmentation)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                                num_workers=2, pin_memory=True)
        del df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = CnnModel(num_classes=N_CLASSES, encoder="se_resnext50_32x4d",
                         pretrained="../input/pytorch-pretrained-models/se_resnext50_32x4d-a260b3a4.pth",
                         pool_type="avg")
        if model_path is not None:
            model.load_state_dict(torch.load(model_path))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, eps=1e-4)
        # model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

    with timer('train'):
        best_score = 0
        for epoch in range(1, epochs + 1):
            seed_torch(SEED + epoch)
            if epoch == epochs - 3:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = param_group['lr'] * 0.1

            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device, N_CLASSES)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            y_pred, target, val_loss = validate(model, val_loader, criterion, device, N_CLASSES)
            score = roc_auc_score(target, y_pred)
            LOGGER.info('Mean val loss: {}'.format(round(val_loss, 5)))
            LOGGER.info('val score: {}'.format(round(score, 5)))

            if score > best_score:
                best_score = score
                np.save("y_pred.npy", y_pred)
                torch.save(model.state_dict(), save_path)
                np.save("target.npy", target)

    with timer('predict'):
        test_df = pd.read_csv(TEST_PATH)
        test_ids = test_df["id"].values
        test_augmentation = Compose([
            Resize(img_size, img_size, p=1)
        ])
        test_dataset = KDDatasetTest(test_df, img_size, TEST_IMAGE_PATH, id_colname=ID_COLUMNS,
                                     transforms=test_augmentation, n_tta=2)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                 num_workers=2, pin_memory=True)

        model.load_state_dict(torch.load(save_path))
        pred = predict(model, test_loader, device, N_CLASSES, n_tta=2)
        print(pred.shape)
        results = pd.DataFrame({"id": test_ids, "is_star": pred.reshape(-1)})
        results.to_csv("results.csv", index=False)
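# A minimal sketch of the timer context manager used throughout these scripts
# (assumed helper, not shown in this file; the real one presumably logs via
# LOGGER rather than print).
from contextlib import contextmanager
import time

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print('[{}] done in {:.1f} s'.format(name, time.time() - t0))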
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)
        df.drop("EncodedPixels_2", axis=1, inplace=True)
        df = df.rename(columns={"EncodedPixels_3": "EncodedPixels_2"})
        df = df.rename(columns={"EncodedPixels_4": "EncodedPixels_3"})
        y1 = (df.EncodedPixels_1 != "-1").astype("float32").values.reshape(-1, 1)
        y2 = (df.EncodedPixels_2 != "-1").astype("float32").values.reshape(-1, 1)
        y3 = (df.EncodedPixels_3 != "-1").astype("float32").values.reshape(-1, 1)
        #y4 = (df.EncodedPixels_4 != "-1").astype("float32").values.reshape(-1, 1)
        y = np.concatenate([y1, y2, y3], axis=1)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]
        y_train, y_val = y[df.fold_id != FOLD_ID], y[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
            ], p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS,
                                     transforms=train_augmentation, crop_rate=1.0, class_y=y_train)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS,
                                   transforms=val_augmentation)
        train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=8)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Unet('resnet34', encoder_weights="imagenet", classes=N_CLASSES,
                         encoder_se_module=True, decoder_semodule=True, h_columns=False,
                         skip=True, act="swish", freeze_bn=True, classification=CLASSIFICATION,
                         attention_type="cbam", center=True, mode="train")
        model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam([
            {'params': model.decoder.parameters(), 'lr': 3e-3},
            {'params': model.encoder.parameters(), 'lr': 3e-4},
        ])
        if base_model is None:
            scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
            scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=CLR_CYCLE * 2,
                                               after_scheduler=scheduler_cosine)
        else:
            scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)

        if EMA:
            ema_model = copy.deepcopy(model)
            if base_model_ema is not None:
                ema_model.load_state_dict(torch.load(base_model_ema))
            ema_model.to(device)
            ema_model = torch.nn.DataParallel(ema_model)
        else:
            ema_model = None
        model = torch.nn.DataParallel(model)

    with timer('train'):
        train_losses = []
        valid_losses = []
        best_model_loss = 999
        best_model_ema_loss = 999
        best_model_ep = 0
        ema_decay = 0
        checkpoint = base_ckpt + 1

        for epoch in range(1, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)
            if epoch >= EMA_START:
                ema_decay = 0.99

            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device,
                                      cutmix_prob=0.0, classification=CLASSIFICATION,
                                      ema_model=ema_model, ema_decay=ema_decay)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss = validate(model, val_loader, criterion, device, classification=CLASSIFICATION)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))

            if EMA and epoch >= EMA_START:
                ema_valid_loss = validate(ema_model, val_loader, criterion, device,
                                          classification=CLASSIFICATION)
                LOGGER.info('Mean EMA valid loss: {}'.format(round(ema_valid_loss, 5)))
                if ema_valid_loss < best_model_ema_loss:
                    torch.save(ema_model.module.state_dict(),
                               'models/{}_fold{}_ckpt{}_ema.pth'.format(EXP_ID, FOLD_ID, checkpoint))
                    best_model_ema_loss = ema_valid_loss

            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_ckpt{}.pth'.format(EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                #np.save("val_pred.npy", val_pred)

            if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID))
                LOGGER.info('Best valid loss: {} on epoch={}'.format(round(best_model_loss, 5), best_model_ep))
                if EMA:
                    torch.save(ema_model.module.state_dict(),
                               'models/{}_fold{}_latest_ema.pth'.format(EXP_ID, FOLD_ID))
                    LOGGER.info('Best ema valid loss: {}'.format(round(best_model_ema_loss, 5)))
                    best_model_ema_loss = 999
                checkpoint += 1
                best_model_loss = 999

            #del val_pred
            gc.collect()

    LOGGER.info('Best valid loss: {} on epoch={}'.format(round(best_model_loss, 5), best_model_ep))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
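# A minimal sketch of the EMA weight update presumably performed inside
# train_one_epoch above when ema_model/ema_decay are passed (assumed helper,
# not shown here): after each optimizer step, the shadow model's parameters
# track an exponential moving average of the live model's parameters.
def update_ema(model, ema_model, decay):
    with torch.no_grad():
        for ema_p, p in zip(ema_model.parameters(), model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)
        for ema_b, b in zip(ema_model.buffers(), model.buffers()):
            ema_b.copy_(b)  # copy BN running stats directly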
def main(seed):
    with timer('load data'):
        df = pd.read_csv(FOLD_PATH)

    with timer('preprocessing'):
        train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID]

        train_augmentation = Compose([
            Flip(p=0.5),
            OneOf([
                GridDistortion(p=0.5),
                OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5)
            ], p=0.5),
            OneOf([
                RandomGamma(gamma_limit=(100, 140), p=0.5),
                RandomBrightnessContrast(p=0.5),
                RandomBrightness(p=0.5),
                RandomContrast(p=0.5)
            ], p=0.5),
            OneOf([
                GaussNoise(p=0.5),
                Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5)
            ], p=0.5),
            ShiftScaleRotate(rotate_limit=20, p=0.5),
        ])
        val_augmentation = None

        train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS,
                                     transforms=train_augmentation, crop_rate=1.0)
        val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS,
                                   transforms=val_augmentation)
        train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=8)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8)

        del train_df, val_df, df, train_dataset, val_dataset
        gc.collect()

    with timer('create model'):
        model = smp.Unet('se_resnext50_32x4d', encoder_weights="imagenet", classes=N_CLASSES,
                         encoder_se_module=True, decoder_semodule=True, h_columns=False,
                         skip=True, act="swish")
        model = convert_model(model)
        if base_model is not None:
            model.load_state_dict(torch.load(base_model))
        model.to(device)

        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
        if base_model is None:
            scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)
            scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=CLR_CYCLE * 2,
                                               after_scheduler=scheduler_cosine)
        else:
            scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        model = torch.nn.DataParallel(model)

    with timer('train'):
        train_losses = []
        valid_losses = []
        best_model_loss = 999
        best_model_ep = 0
        checkpoint = base_ckpt + 1

        for epoch in range(1, EPOCHS + 1):
            seed = seed + epoch
            seed_torch(seed)

            LOGGER.info("Starting {} epoch...".format(epoch))
            tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device, cutmix_prob=0.0)
            train_losses.append(tr_loss)
            LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5)))

            valid_loss = validate(model, val_loader, criterion, device)
            valid_losses.append(valid_loss)
            LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5)))

            scheduler.step()

            if valid_loss < best_model_loss:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_ckpt{}.pth'.format(EXP_ID, FOLD_ID, checkpoint))
                best_model_loss = valid_loss
                best_model_ep = epoch
                #np.save("val_pred.npy", val_pred)

            if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1:
                torch.save(model.module.state_dict(),
                           'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID))
                LOGGER.info('Best valid loss: {} on epoch={}'.format(round(best_model_loss, 5), best_model_ep))
                checkpoint += 1
                best_model_loss = 999

            #del val_pred
            gc.collect()

    LOGGER.info('Best valid loss: {} on epoch={}'.format(round(best_model_loss, 5), best_model_ep))

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.plot(xs, valid_losses, label='Val loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Epochs')
    plt.savefig("loss.png")
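# A minimal sketch of the seed_torch helper called at the start of each epoch
# above (assumed; a common Kaggle-style reproducibility utility).
import os
import random

def seed_torch(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True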
def main():
    print('=> number of GPU: ', args.gpu_num)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
    save_path = save_path_formatter(args, parser)
    args.save_path = 'checkpoints' / save_path
    print("=> information will be saved in {}".format(args.save_path))
    args.save_path.makedirs_p()
    torch.manual_seed(args.seed)

    img_H = args.height
    img_W = args.width
    if args.evaluate:
        args.epochs = 0
    training_writer = SummaryWriter(args.save_path)

    ###################### Data loading part ##########################
    ## normalize to [-1, 1]
    normalize = Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    if args.dataset == "NYU":
        valid_transform = Compose([
            CenterCrop(size=(img_H, img_W)),
            ArrayToTensor(height=img_H, width=img_W),
            normalize
        ])  ### NYU valid transform ###
    else:
        valid_transform = Compose([
            ArrayToTensor(height=img_H, width=img_W),
            normalize
        ])  ### KITTI valid transform ###

    print("=> fetching scenes in '{}'".format(args.data))
    print("=> Dataset: ", args.dataset)

    if args.dataset == 'KITTI':
        train_transform = Compose([
            RandomHorizontalFlip(),
            RandomScaleCrop(),
            ArrayToTensor(height=img_H, width=img_W),
            normalize
        ])
        train_set = SequenceFolder(args.data, args=args, transform=train_transform,
                                   seed=args.seed, train=True, mode=args.mode)
        if args.real_test is False:
            print("=> test on validation set")
            '''
            val_set = SequenceFolder(
                args.data, args=args, transform=valid_transform,
                seed=args.seed, train=False, mode=args.mode)
            '''
            val_set = TestFolder(args.data, args=args, transform=valid_transform,
                                 seed=args.seed, train=False, mode=args.mode)
        else:
            print("=> test on Eigen test split")
            val_set = TestFolder(args.data, args=args, transform=valid_transform,
                                 seed=args.seed, train=False, mode=args.mode)
    elif args.dataset == 'Make3D':
        train_transform = Compose([
            RandomHorizontalFlip(),
            RandomScaleCrop(),
            ArrayToTensor(height=img_H, width=img_W),
            normalize
        ])
        train_set = Make3DFolder(args.data, args=args, transform=train_transform,
                                 seed=args.seed, train=True, mode=args.mode)
        val_set = Make3DFolder(args.data, args=args, transform=valid_transform,
                               seed=args.seed, train=False, mode=args.mode)
    elif args.dataset == 'NYU':
        if args.mode == 'RtoD':
            print('RtoD transform created')
            train_transform = EnhancedCompose([
                Merge(),
                RandomCropNumpy(size=(251, 340)),
                RandomRotate(angle_range=(-5, 5), mode='constant'),
                Split([0, 3], [3, 4])
            ])
            train_transform_2 = EnhancedCompose([
                CenterCrop(size=(img_H, img_W)),
                RandomHorizontalFlip(),
                [RandomColor(multiplier_range=(0.8, 1.2)), None],
                ArrayToTensor(height=img_H, width=img_W),
                normalize
            ])
        elif args.mode == 'DtoD':
            print('DtoD transform created')
            train_transform = EnhancedCompose([
                Merge(),
                RandomCropNumpy(size=(251, 340)),
                RandomRotate(angle_range=(-4, 4), mode='constant'),
                Split([0, 1])
            ])
            train_transform_2 = EnhancedCompose([
                CenterCrop(size=(img_H, img_W)),
                RandomHorizontalFlip(),
                ArrayToTensor(height=img_H, width=img_W),
                normalize
            ])
        train_set = NYUdataset(args.data, args=args, transform=train_transform,
                               transform_2=train_transform_2, seed=args.seed,
                               train=True, mode=args.mode)
        val_set = NYUdataset(args.data, args=args, transform=valid_transform,
                             seed=args.seed, train=False, mode=args.mode)

    #print('samples_num: {} train scenes: {}'.format(len(train_set), len(train_set.scenes)))
    print('=> samples_num: {} '.format(len(train_set)))
    print('=> samples_num: {}- test'.format(len(val_set)))

    train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.workers, pin_memory=True)

    if args.epoch_size == 0:
        args.epoch_size = len(train_loader)
    cudnn.benchmark = True

    ###################### Setting Network, Loss, Optimizer part ###################
    print("=> creating model")
    if args.mode == 'DtoD':
        print('- DtoD train')
        AE_DtoD = AutoEncoder_DtoD(norm=args.norm, input_dim=1, height=img_H, width=img_W)
        AE_DtoD = nn.DataParallel(AE_DtoD)
        AE_DtoD = AE_DtoD.cuda()
        #AE_DtoD.load_state_dict(torch.load(args.model_dir))
        print('- DtoD model is created')
        optimizer_AE = optim.Adam(AE_DtoD.parameters(), args.lr, [args.momentum, args.beta],
                                  eps=1e-08, weight_decay=5e-4)
        criterion_L2 = nn.MSELoss()
        criterion_L1 = nn.L1Loss()
    elif args.mode == 'RtoD':
        print('- RtoD train')
        AE_DtoD = AutoEncoder_DtoD(norm=args.norm, input_dim=1, height=img_H, width=img_W)
        AE_DtoD = nn.DataParallel(AE_DtoD)
        AE_DtoD = AE_DtoD.cuda()
        AE_DtoD.load_state_dict(torch.load(args.model_dir))
        AE_DtoD.eval()
        print('- pretrained DtoD model is created')
        AE_RtoD = AutoEncoder_2(norm=args.norm, input_dim=3, height=img_H, width=img_W)
        AE_RtoD = nn.DataParallel(AE_RtoD)
        AE_RtoD = AE_RtoD.cuda()
        #AE_RtoD.load_state_dict(torch.load(args.RtoD_model_dir))
        print('- RtoD model is created')
        optimizer_AE = optim.Adam(AE_RtoD.parameters(), args.lr, [args.momentum, args.beta],
                                  eps=1e-08, weight_decay=5e-4)
        criterion_L2 = nn.MSELoss()
        criterion_L1 = nn.L1Loss()
    elif args.mode == 'RtoD_single':
        print('- RtoD single train')
        AE_DtoD = None
        AE_RtoD = AutoEncoder_2(norm=args.norm, input_dim=3, height=img_H, width=img_W)
        AE_RtoD = nn.DataParallel(AE_RtoD)
        AE_RtoD = AE_RtoD.cuda()
        #AE_RtoD.load_state_dict(torch.load(args.RtoD_model_dir))
        print('- RtoD model is created')
        optimizer_AE = optim.Adam(AE_RtoD.parameters(), args.lr, [args.momentum, args.beta],
                                  eps=1e-08, weight_decay=5e-4)
        criterion_L2 = nn.MSELoss()
        criterion_L1 = nn.L1Loss()
    elif args.mode == 'DtoD_test':
        print('- DtoD test')
        AE_DtoD = AutoEncoder_DtoD(norm=args.norm, input_dim=1, height=img_H, width=img_W)
        AE_DtoD = nn.DataParallel(AE_DtoD)
        AE_DtoD = AE_DtoD.cuda()
        AE_DtoD.load_state_dict(torch.load(args.model_dir))
        print('- pretrained DtoD model is created')
    elif args.mode == 'RtoD_test':
        print('- RtoD test')
        AE_RtoD = AutoEncoder(norm=args.norm, height=img_H, width=img_W)
        #AE_RtoD = AutoEncoder_2(norm=args.norm, input_dim=3, height=img_H, width=img_W)
        AE_RtoD = nn.DataParallel(AE_RtoD)
        AE_RtoD = AE_RtoD.cuda()
        AE_RtoD.load_state_dict(torch.load(args.RtoD_model_dir))
        print('- pretrained RtoD model is created')

    ############################ data log #######################################
    if args.evaluate == True:
        logger = TermLogger(n_epochs=args.epochs,
                            train_size=min(len(train_loader), args.epoch_size),
                            valid_size=len(val_loader))
        logger.epoch_bar.start()
    elif args.evaluate == False:
        logger = None
        #logger = TermLogger(n_epochs=args.epochs, train_size=min(len(train_loader), args.epoch_size), valid_size=len(val_loader))
        #logger.epoch_bar.start()

    if logger is not None:
        with open(args.save_path / args.log_summary, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            writer.writerow(['train_loss', 'validation_loss'])
        with open(args.save_path / args.log_full, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            writer.writerow(['train_loss_sum', 'output_loss', 'latent_loss'])

    ############################ Training part ##################################
    if args.mode == 'DtoD':
        loss = train_AE_DtoD(args, AE_DtoD, criterion_L2, criterion_L1, optimizer_AE,
                             train_loader, val_loader, args.batch_size, args.epochs,
                             args.lr, logger, training_writer)
        print('Final loss:', loss.item())
    elif args.mode == 'RtoD' or args.mode == 'RtoD_single':
        loss, output_loss, latent_loss = train_AE_RtoD(args, AE_RtoD, AE_DtoD, criterion_L2,
                                                       criterion_L1, optimizer_AE, train_loader,
                                                       val_loader, args.batch_size, args.epochs,
                                                       args.lr, logger, training_writer)

    ########################### Evaluating part #################################
    if args.mode == 'DtoD_test':
        test_model = AE_DtoD
        print("DtoD_test - switch model to eval mode")
    elif args.mode == 'RtoD_test':
        test_model = AE_RtoD
        print("RtoD_test - switch model to eval mode")
    test_model.eval()

    if (logger is not None) and (args.evaluate == True):
        if args.dataset == 'KITTI':
            logger.reset_valid_bar()
            errors, min_errors, error_names = validate(args, val_loader, test_model, 0, logger, args.mode)
            error_length = 8
        elif args.dataset == 'Make3D':
            logger.reset_valid_bar()
            errors, min_errors, error_names = validate_Make3D(args, val_loader, test_model, 0, logger, args.mode)
            error_length = 4
        elif args.dataset == 'NYU':
            logger.reset_valid_bar()
            errors, min_errors, error_names = validate_NYU(args, val_loader, test_model, 0, logger, args.mode)
            error_length = 8

        for error, name in zip(errors, error_names):
            training_writer.add_scalar(name, error, 0)

        error_string = ', '.join('{} : {:.3f}'.format(name, error)
                                 for name, error in zip(error_names[0:error_length],
                                                        errors[0:error_length]))
        logger.valid_writer.write(' * Avg {}'.format(error_string))
        print("")
        error_string = ', '.join('{} : {:.3f}'.format(name, error)
                                 for name, error in zip(error_names[0:error_length],
                                                        min_errors[0:error_length]))
        logger.valid_writer.write(' * Avg {}'.format(error_string))
        logger.valid_bar.finish()
        print(args.dataset, "validation finish")

    ## Test
    if args.img_save is False:
        print("--only Test mode finish--")
        return

    k = 0
    for gt_data, rgb_data, _ in val_loader:
        if args.mode == 'RtoD' or args.mode == 'RtoD_test':
            gt_data = Variable(gt_data.cuda())
            final_AE_in = rgb_data.cuda()
        elif args.mode == 'DtoD' or args.mode == 'DtoD_test':
            rgb_data = Variable(rgb_data.cuda())
            final_AE_in = gt_data.cuda()
        final_AE_in = Variable(final_AE_in)
        with torch.no_grad():
            final_AE_depth = test_model(final_AE_in, istrain=False)

        img_arr = [final_AE_depth, gt_data, rgb_data]
        folder_name_list = ['/output_depth', '/ground_truth', '/input_rgb']
        img_name_list = ['/final_AE_depth_', '/final_AE_gt_', '/final_AE_rgb_']
        folder_iter = cycle(folder_name_list)
        img_name_iter = cycle(img_name_list)

        for img in img_arr:
            img_org = img.cpu().detach().numpy()
            folder_name = next(folder_iter)
            img_name = next(img_name_iter)
            result_dir = args.result_dir + folder_name
            if not os.path.exists(result_dir):
                os.makedirs(result_dir)
            for t in range(img_org.shape[0]):
                img = img_org[t]
                if img.shape[0] == 3:
                    img_ = np.empty([img_H, img_W, 3])
                    img_[:, :, 0] = img[0, :, :]
                    img_[:, :, 1] = img[1, :, :]
                    img_[:, :, 2] = img[2, :, :]
                elif img.shape[0] == 1:
                    img_ = np.empty([img_H, img_W])
                    img_[:, :] = img[0, :, :]
                # scipy.misc.imsave was removed from SciPy (>=1.2); imageio.imwrite is the drop-in replacement
                imageio.imwrite(result_dir + img_name + '%05d.jpg' % (k + t), img_)
        k += img_org.shape[0]
def main(): device = "cuda:0" if torch.cuda.is_available() else "cpu" parser = argparse.ArgumentParser() parser.add_argument('--image_path', type=str, default="./data/cache/train") parser.add_argument('--label_path', type=str, default="./data/cache/train.csv") parser.add_argument('--kfold_idx', type=int, default=0) # parser.add_argument('--model', type=str, default='CustomModel') parser.add_argument('--model', type=str, default='efficientnet-b0') parser.add_argument('--epochs', type=int, default=2000) parser.add_argument('--batch_size', type=int, default=50) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--patient', type=int, default=8) parser.add_argument('--seed', type=int, default=42) parser.add_argument('--device', type=str, default=device) parser.add_argument('--resume', type=str, default=None) parser.add_argument('--comments', type=str, default=None) args = parser.parse_args() print('=' * 50) print('[info msg] arguments\n') for key, value in vars(args).items(): print(key, ":", value) print('=' * 50) assert os.path.isdir(args.image_path), 'wrong path' assert os.path.isfile(args.label_path), 'wrong path' if (args.resume): assert os.path.isfile(args.resume), 'wrong path' # assert args.kfold_idx < 5 seed_everything(args.seed) data_df = pd.read_csv(args.label_path) sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=args.seed) for train_idx, valid_idx in sss.split(X=data_df['id'], y=data_df['accent']): train_df = data_df.iloc[train_idx] valid_df = data_df.iloc[valid_idx] train_data = dataset.DaconDataset( image_folder=args.image_path, label_df=train_df, ) valid_data = dataset.DaconDataset( image_folder=args.image_path, label_df=valid_df, ) train_sampler = get_sampler( df=train_df, dataset=train_data ) valid_sampler = get_sampler( df=valid_df, dataset=valid_data ) train_data_loader = torch.utils.data.DataLoader( train_data, batch_size=args.batch_size, # shuffle=True, sampler=train_sampler ) valid_data_loader = torch.utils.data.DataLoader( valid_data, batch_size=args.batch_size, # shuffle=False, sampler=valid_sampler ) model = None if args.model == 'CustomModel': model = CustomModel() print('[info msg] {} model is created\n'.format('CustomModel')) else: model = EfficientNet.from_pretrained(args.model, in_channels=1, num_classes=6, dropout_rate=0.3, advprop=True) print('[info msg] {} model is created\n'.format(args.model)) print('=' * 50) if(args.resume): model.load_state_dict(torch.load(args.resume)) print('[info msg] pre-trained weight is loaded !!\n') print(args.resume) print('=' * 50) if args.device == 'cuda' and torch.cuda.device_count() > 1 : model = torch.nn.DataParallel(model) ##### Wandb ###### wandb.init(project='dacon_voice') wandb.run.name = args.comments wandb.config.update(args) wandb.watch(model) ################## model.to(args.device) optimizer = torch.optim.Adam(model.parameters(), args.lr) criterion = torch.nn.CrossEntropyLoss() scheduler = ReduceLROnPlateau( optimizer=optimizer, mode='min', patience=2, factor=0.5, verbose=True ) train_loss = [] train_acc = [] valid_loss = [] valid_acc = [] best_loss = float("inf") patient = 0 date_time = datetime.now().strftime("%m%d%H%M%S") SAVE_DIR = os.path.join('./model', date_time) print('[info msg] training start !!\n') startTime = datetime.now() for epoch in range(args.epochs): print('Epoch {}/{}'.format(epoch+1, args.epochs)) train_epoch_loss, train_epoch_acc = trainer.train( train_loader=train_data_loader, model=model, loss_func=criterion, device=args.device, optimizer=optimizer, ) 
train_loss.append(train_epoch_loss) train_acc.append(train_epoch_acc) valid_epoch_loss, valid_epoch_acc = trainer.validate( valid_loader=valid_data_loader, model=model, loss_func=criterion, device=args.device, scheduler=scheduler, ) valid_loss.append(valid_epoch_loss) valid_acc.append(valid_epoch_acc) wandb.log({ "Train Acc": train_epoch_acc, "Valid Acc": valid_epoch_acc, "Train Loss": train_epoch_loss, "Valid Loss": valid_epoch_loss, }) if best_loss > valid_epoch_loss: patient = 0 best_loss = valid_epoch_loss Path(SAVE_DIR).mkdir(parents=True, exist_ok=True) torch.save(model.state_dict(), os.path.join(SAVE_DIR, 'model_best.pth')) print('MODEL IS SAVED TO {}!!!'.format(date_time)) else: patient += 1 if patient > args.patient - 1: print('=======' * 10) print("[Info message] Early stopper is activated") break elapsed_time = datetime.now() - startTime train_loss = np.array(train_loss) train_acc = np.array(train_acc) valid_loss = np.array(valid_loss) valid_acc = np.array(valid_acc) best_loss_pos = np.argmin(valid_loss) print('=' * 50) print('[info msg] training is done\n') print("Time taken: {}".format(elapsed_time)) print("best loss is {} w/ acc {} at epoch : {}".format(best_loss, valid_acc[best_loss_pos], best_loss_pos)) print('=' * 50) print('[info msg] {} model weight and log is save to {}\n'.format(args.model, SAVE_DIR)) with open(os.path.join(SAVE_DIR, 'log.txt'), 'w') as f: for key, value in vars(args).items(): f.write('{} : {}\n'.format(key, value)) f.write('\n') f.write('total ecpochs : {}\n'.format(str(train_loss.shape[0]))) f.write('time taken : {}\n'.format(str(elapsed_time))) f.write('best_train_loss {} w/ acc {} at epoch : {}\n'.format(np.min(train_loss), train_acc[np.argmin(train_loss)], np.argmin(train_loss))) f.write('best_valid_loss {} w/ acc {} at epoch : {}\n'.format(np.min(valid_loss), valid_acc[np.argmin(valid_loss)], np.argmin(valid_loss))) plt.figure(figsize=(15,5)) plt.subplot(1, 2, 1) plt.plot(train_loss, label='train loss') plt.plot(valid_loss, 'o', label='valid loss') plt.axvline(x=best_loss_pos, color='r', linestyle='--', linewidth=1.5) plt.legend() plt.subplot(1, 2, 2) plt.plot(train_acc, label='train acc') plt.plot(valid_acc, 'o', label='valid acc') plt.axvline(x=best_loss_pos, color='r', linestyle='--', linewidth=1.5) plt.legend() plt.savefig(os.path.join(SAVE_DIR, 'history.png'))
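# A minimal sketch of the get_sampler helper used above (assumed): a
# class-balanced WeightedRandomSampler built from the label frequencies of
# the split's dataframe. The 'accent' column is the label column, as in the
# StratifiedShuffleSplit call above.
def get_sampler(df, dataset):
    class_counts = df['accent'].value_counts()
    weights = df['accent'].map(lambda c: 1.0 / class_counts[c]).values
    return torch.utils.data.WeightedRandomSampler(
        weights=torch.DoubleTensor(weights),
        num_samples=len(dataset),
        replacement=True)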
def main(): args = parser.parse_args() print("=> No Distributed Training") print('=> Index of using GPU: ', args.gpu_num) os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." torch.manual_seed(args.seed) if args.evaluate is True: save_path = save_path_formatter(args, parser) args.save_path = 'checkpoints' / save_path print("=> information will be saved in {}".format(args.save_path)) args.save_path.makedirs_p() training_writer = SummaryWriter(args.save_path) ###################### Data loading part ########################## if args.dataset == 'KITTI': args.max_depth = 80.0 elif args.dataset == 'NYU': args.max_depth = 10.0 if args.result_dir == '': args.result_dir = './' + args.dataset + '_Eval_results' args.log_metric = args.dataset + '_' + args.encoder + args.log_metric test_set = MyDataset(args, train=False) print("=> Dataset: ", args.dataset) print("=> Data height: {}, width: {} ".format(args.height, args.width)) print('=> test samples_num: {} '.format(len(test_set))) test_sampler = None val_loader = torch.utils.data.DataLoader(test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=test_sampler) cudnn.benchmark = True ########################################################################### ###################### setting model list ################################# if args.multi_test is True: print("=> all of model tested") models_list_dir = Path(args.models_list_dir) models_list = sorted(models_list_dir.files('*.pkl')) else: print("=> just one model tested") models_list = [args.model_dir] ###################### setting Network part ################### print("=> creating model") Model = LDRN(args) num_params_encoder = 0 num_params_decoder = 0 for p in Model.encoder.parameters(): num_params_encoder += p.numel() for p in Model.decoder.parameters(): num_params_decoder += p.numel() print("===============================================") print("model encoder parameters: ", num_params_encoder) print("model decoder parameters: ", num_params_decoder) print("Total parameters: {}".format(num_params_encoder + num_params_decoder)) print("===============================================") Model = Model.cuda() Model = torch.nn.DataParallel(Model) if args.evaluate is True: ############################ data log ####################################### logger = TermLogger(n_epochs=args.epochs, train_size=min(len(val_loader), args.epoch_size), valid_size=len(val_loader)) with open(args.save_path / args.log_metric, 'w') as csvfile: writer = csv.writer(csvfile, delimiter='\t') if args.dataset == 'KITTI': writer.writerow([ 'Filename', 'Abs_diff', 'Abs_rel', 'Sq_rel', 'a1', 'a2', 'a3', 'RMSE', 'RMSE_log' ]) elif args.dataset == 'Make3D': writer.writerow( ['Filename', 'Abs_diff', 'Abs_rel', 'log10', 'rmse']) elif args.dataset == 'NYU': writer.writerow([ 'Filename', 'Abs_diff', 'Abs_rel', 'log10', 'a1', 'a2', 'a3', 'RMSE', 'RMSE_log' ]) ########################### Evaluating part ################################# test_model = Model print("Model Initialized") test_len = len(models_list) print("=> Length of model list: ", test_len) for i in range(test_len): filename = models_list[i].split('/')[-1] logger.reset_valid_bar() test_model.load_state_dict( torch.load(models_list[i], map_location='cuda:0')) #test_model.load_state_dict(torch.load(models_list[i])) test_model.eval() if args.dataset == 'KITTI': errors, error_names = validate(args, val_loader, test_model, logger, 'KITTI') elif 
args.dataset == 'NYU': errors, error_names = validate(args, val_loader, test_model, logger, 'NYU') for error, name in zip(errors, error_names): training_writer.add_scalar(name, error, 0) logger.valid_writer.write(' * model: {}'.format(models_list[i])) print("") error_string = ', '.join( '{} : {:.3f}'.format(name, error) for name, error in zip( error_names[0:len(error_names)], errors[0:len(errors)])) logger.valid_writer.write(' * Avg {}'.format(error_string)) print("") logger.valid_bar.finish() with open(args.save_path / args.log_metric, 'a') as csvfile: writer = csv.writer(csvfile, delimiter='\t') writer.writerow( ['%s' % filename] + ['%.4f' % (errors[k]) for k in range(len(errors))]) print(args.dataset, " valdiation finish") ## Test if args.img_save is False: print("--only Test mode finish--") return else: test_model = Model test_model.load_state_dict( torch.load(models_list[0], map_location='cuda:0')) #test_model.load_state_dict(torch.load(models_list[0])) test_model.eval() print("=> No validation") test_set = MyDataset(args, train=False, return_filename=True) test_sampler = None val_loader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=test_sampler) if args.img_save is True: cmap = plt.cm.jet print("=> img save start") for idx, (rgb_data, gt_data, gt_dense, filename) in enumerate(val_loader): if gt_data.ndim != 4 and gt_data[0] == False: continue img_H = gt_data.shape[2] img_W = gt_data.shape[3] gt_data = Variable(gt_data.cuda()) input_img = Variable(rgb_data.cuda()) gt_data = gt_data.clamp(0, args.max_depth) if args.use_dense_depth is True: gt_dense = Variable(gt_dense.cuda()) gt_dense = gt_dense.clamp(0, args.max_depth) input_img_flip = torch.flip(input_img, [3]) with torch.no_grad(): _, final_depth = test_model(input_img) _, final_depth_flip = test_model(input_img_flip) final_depth_flip = torch.flip(final_depth_flip, [3]) final_depth = 0.5 * (final_depth + final_depth_flip) final_depth = final_depth.clamp(0, args.max_depth) d_min = min(final_depth.min(), gt_data.min()) d_max = max(final_depth.max(), gt_data.max()) d_min = d_min.cpu().detach().numpy().astype(np.float32) d_max = d_max.cpu().detach().numpy().astype(np.float32) filename = filename[0] img_arr = [ final_depth, final_depth, final_depth, gt_data, rgb_data, gt_dense, gt_dense, gt_dense ] folder_name_list = [ '/output_depth', '/output_depth_cmap_gray', '/output_depth_cmap_jet', '/ground_truth', '/input_rgb', '/dense_gt', '/dense_gt_cmap_gray', '/dense_gt_cmap_jet' ] img_name_list = [ '/' + filename, '/cmap_gray_' + filename, '/cmap_jet_' + filename, '/gt_' + filename, '/rgb_' + filename, '/gt_dense_' + filename, '/gt_dense_cmap_gray_' + filename, '/gt_dense_cmap_jet_' + filename ] if args.use_dense_depth is False: img_arr = img_arr[:5] folder_name_list = folder_name_list[:5] img_name_list = img_name_list[:5] folder_iter = cycle(folder_name_list) img_name_iter = cycle(img_name_list) for img in img_arr: folder_name = next(folder_iter) img_name = next(img_name_iter) if folder_name == '/output_depth_cmap_gray' or folder_name == '/dense_gt_cmap_gray': if args.dataset == 'NYU': img = img * 1000.0 img = img.cpu().detach().numpy().astype(np.uint16) img_org = img.copy() else: img = img * 256.0 img = img.cpu().detach().numpy().astype(np.uint16) img_org = img.copy() elif folder_name == '/output_depth_cmap_jet' or folder_name == '/dense_gt_cmap_jet': img_org = img else: img = (img / img.max()) * 255.0 img_org = img.cpu().detach().numpy().astype(np.float32) result_dir 
= args.result_dir + folder_name for t in range(img_org.shape[0]): img = img_org[t] if folder_name == '/output_depth_cmap_jet' or folder_name == '/dense_gt_cmap_jet': img_ = np.squeeze(img.cpu().numpy().astype(np.float32)) img_ = ((img_ - d_min) / (d_max - d_min)) img_ = cmap(img_)[:, :, :3] * 255 else: if img.shape[0] == 3: img_ = np.empty([img_H, img_W, 3]).astype(img.dtype) ''' img_[:,:,2] = img[0,:,:] img_[:,:,1] = img[1,:,:] img_[:,:,0] = img[2,:,:] # for BGR ''' img_ = img.transpose(1, 2, 0) # for RGB elif img.shape[0] == 1: img_ = np.ones([img_H, img_W]).astype(img.dtype) img_[:, :] = img[0, :, :] if not os.path.exists(result_dir): os.makedirs(result_dir) if folder_name == '/output_depth_cmap_gray' or folder_name == '/dense_gt_cmap_gray': plt.imsave(result_dir + img_name, np.log10(img_), cmap='Greys') elif folder_name == '/output_depth_cmap_jet' or folder_name == '/dense_gt_cmap_jet': img_ = Image.fromarray(img_.astype('uint8')) img_.save(result_dir + img_name) else: imageio.imwrite(result_dir + img_name, img_) if (idx + 1) % 10 == 0: print(idx + 1, "th image is processed..") print("--Test image save finish--") return
def main(cfg):
    cwd = utils.get_original_cwd()
    cfg.cwd = cwd
    cfg.pos_size = 2 * cfg.pos_limit + 2
    logger.info(f'\n{cfg.pretty()}')

    __Model__ = {
        'cnn': models.PCNN,
        'rnn': models.BiLSTM,
        'transformer': models.Transformer,
        'gcn': models.GCN,
        'capsule': models.Capsule,
        'lm': models.LM,
    }

    # device
    if cfg.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')
    logger.info(f'device: {device}')

    # If the preprocessing pipeline itself is unchanged, it is best to comment this
    # step out so the data is not re-preprocessed on every run
    if cfg.preprocess:
        preprocess(cfg)

    train_data_path = os.path.join(cfg.cwd, cfg.out_path, 'train.pkl')
    valid_data_path = os.path.join(cfg.cwd, cfg.out_path, 'valid.pkl')
    test_data_path = os.path.join(cfg.cwd, cfg.out_path, 'test.pkl')
    vocab_path = os.path.join(cfg.cwd, cfg.out_path, 'vocab.pkl')

    if cfg.model_name == 'lm':
        vocab_size = None
    else:
        vocab = load_pkl(vocab_path)
        vocab_size = vocab.count
    cfg.vocab_size = vocab_size

    train_dataset = CustomDataset(train_data_path)
    valid_dataset = CustomDataset(valid_data_path)
    test_dataset = CustomDataset(test_data_path)

    train_dataloader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True,
                                  collate_fn=collate_fn(cfg))
    valid_dataloader = DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=True,
                                  collate_fn=collate_fn(cfg))
    test_dataloader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=True,
                                 collate_fn=collate_fn(cfg))

    model = __Model__[cfg.model_name](cfg)
    model.to(device)
    logger.info(f'\n {model}')

    optimizer = optim.Adam(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=cfg.lr_factor,
                                                     patience=cfg.lr_patience)
    criterion = nn.CrossEntropyLoss()

    best_f1, best_epoch = -1, 0
    es_loss, es_f1, es_epoch, es_patience, best_es_epoch, best_es_f1, es_path, best_es_path = \
        1e8, -1, 0, 0, 0, -1, '', ''
    train_losses, valid_losses = [], []

    if cfg.show_plot and cfg.plot_utils == 'tensorboard':
        writer = SummaryWriter('tensorboard')
    else:
        writer = None

    logger.info('=' * 10 + ' Start training ' + '=' * 10)
    for epoch in range(1, cfg.epoch + 1):
        manual_seed(cfg.seed + epoch)
        train_loss = train(epoch, model, train_dataloader, optimizer, criterion, device, writer, cfg)
        valid_f1, valid_loss = validate(epoch, model, valid_dataloader, criterion, device, cfg)
        scheduler.step(valid_loss)
        model_path = model.save(epoch, cfg)
        # logger.info(model_path)

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        if best_f1 < valid_f1:
            best_f1 = valid_f1
            best_epoch = epoch
        # use the valid loss as the early-stopping criterion
        if es_loss > valid_loss:
            es_loss = valid_loss
            es_f1 = valid_f1
            es_epoch = epoch
            es_patience = 0
            es_path = model_path
        else:
            es_patience += 1
            if es_patience >= cfg.early_stopping_patience:
                best_es_epoch = es_epoch
                best_es_f1 = es_f1
                best_es_path = es_path

    if cfg.show_plot:
        if cfg.plot_utils == 'matplot':
            plt.plot(train_losses, 'x-')
            plt.plot(valid_losses, '+-')
            plt.legend(['train', 'valid'])
            plt.title('train/valid comparison loss')
            plt.show()
        if cfg.plot_utils == 'tensorboard':
            for i in range(len(train_losses)):
                writer.add_scalars('train/valid_comparison_loss', {
                    'train': train_losses[i],
                    'valid': valid_losses[i]
                }, i)
            writer.close()

    logger.info(f'best(valid loss quota) early stopping epoch: {best_es_epoch}, '
                f'this epoch macro f1: {best_es_f1:0.4f}')
    logger.info(f'this model save path: {best_es_path}')
    logger.info(f'total {cfg.epoch} epochs, best(valid macro f1) epoch: {best_epoch}, '
                f'this epoch macro f1: {best_f1:.4f}')

    validate(-1, model, test_dataloader, criterion, device, cfg)
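# A minimal sketch of the collate_fn factory used above (assumed): it closes
# over cfg and pads each batch's token sequences to the batch maximum before
# stacking. The field names ('word', 'label') are illustrative, not the real ones.
def collate_fn(cfg):
    def collate(batch):
        batch.sort(key=lambda x: len(x['word']), reverse=True)
        max_len = len(batch[0]['word'])
        words = torch.zeros(len(batch), max_len, dtype=torch.long)  # zero-padded token ids
        labels = torch.tensor([x['label'] for x in batch], dtype=torch.long)
        for i, x in enumerate(batch):
            words[i, :len(x['word'])] = torch.tensor(x['word'], dtype=torch.long)
        return words, labels
    return collate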
def train_model(args, model, dataset, writer=None, n_rounds=1, lth_pruner=None):
    root = args.exp_name + '/checkpoints/'
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Save the initial weights (needed for lottery-ticket rewinding).
    init_weight_filename = args.exp_name + '/checkpoints/' + 'initial_state.pth.tar'
    helper.save_checkpoint(args, model, optimizer, init_weight_filename)

    for cur_round in range(n_rounds):
        best_acc = 0
        for epoch in range(args.start_epoch, args.epochs):
            helper.adjust_learning_rate(optimizer, epoch, args)
            train_top1, train_top5, train_loss, model = trainer.train(
                dataset.train_loader, model, criterion, optimizer, epoch, args,
                lth_pruner, cur_round, mask_applied=args.mask_applied)
            val_top1, val_top5, val_loss = trainer.validate(
                dataset.test_loader, model, criterion, args)

            if writer is not None:
                writer.add_scalar("loss/train/" + str(cur_round), train_loss, epoch)
                writer.add_scalar("top1/train/" + str(cur_round), train_top1, epoch)
                writer.add_scalar("top5/train/" + str(cur_round), train_top5, epoch)
                writer.add_scalar("loss/val/" + str(cur_round), val_loss, epoch)
                writer.add_scalar("top1/val/" + str(cur_round), val_top1, epoch)
                writer.add_scalar("top5/val/" + str(cur_round), val_top5, epoch)

            if val_top1 >= best_acc:
                best_acc = val_top1
                is_best = True
                filename = root + str(cur_round) + '_model_best.pth'
                helper.save_checkpoint(args, model, optimizer, filename)

            filename = root + str(cur_round) + '_current.pth'
            helper.save_checkpoint(args, model, optimizer, filename, epoch=epoch)
            filename = root + str(cur_round) + '_mask.pkl'

            if epoch in [0, 1, 2, 3]:
                # Save early epochs for late resetting.
                filename = root + 'epoch_' + str(epoch) + '_model.pth'
                helper.save_checkpoint(args, model, optimizer, filename, epoch=epoch)
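# A minimal sketch of helper.adjust_learning_rate as used above and in the
# segmentation script further below (assumed): classic step decay that scales
# the base LR down by 10x every 30 epochs, in the style of the PyTorch
# ImageNet example.
def adjust_learning_rate(optimizer, epoch, args):
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr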
def main():
    args = parser.parse_args()
    print('=> number of GPU: ', args.gpu_num)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
    save_path = save_path_formatter(args, parser)
    args.save_path = 'checkpoints' / save_path
    print("=> information will be saved in {}".format(args.save_path))
    args.save_path.makedirs_p()
    torch.manual_seed(args.seed)

    img_H = args.height
    img_W = args.width
    training_writer = SummaryWriter(args.save_path)

    ###################### Data loading part ##########################
    ## normalize to [-1, 1]
    normalize = Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    if args.dataset == 'NYU':
        valid_transform = Compose([
            CenterCrop(size=(img_H, img_W)),
            ArrayToTensor(height=img_H, width=img_W),
            normalize
        ])  ### NYU valid transform ###
    elif args.dataset == 'KITTI':
        valid_transform = Compose([
            ArrayToTensor(height=img_H, width=img_W),
            normalize
        ])  ### KITTI valid transform ###

    print("=> fetching scenes in '{}'".format(args.data))
    print("=> Dataset: ", args.dataset)
    if args.dataset == 'KITTI':
        print("=> test on Eigen test split")
        val_set = TestFolder(args.data, args=args, transform=valid_transform,
                             seed=args.seed, train=False, mode=args.mode)
    elif args.dataset == 'Make3D':
        val_set = Make3DFolder(args.data, args=args, transform=valid_transform,
                               seed=args.seed, train=False, mode=args.mode)
    elif args.dataset == 'NYU':
        val_set = NYUdataset(args.data, args=args, transform=valid_transform,
                             seed=args.seed, train=False, mode=args.mode)

    print('=> samples_num: {}- test'.format(len(val_set)))
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.workers, pin_memory=True)
    cudnn.benchmark = True

    ###################### setting model list #################################
    if args.multi_test is True:
        print("=> all of the models are tested")
        models_list_dir = Path(args.models_list_dir)
        models_list = sorted(models_list_dir.files('*.pkl'))
    else:
        print("=> just one model is tested")
        models_list = [args.model_dir]

    ###################### setting Network part ###################
    print("=> creating base model")
    if args.mode == 'DtoD_test':
        print('- DtoD test')
        AE_DtoD = AutoEncoder_DtoD(norm=args.norm, input_dim=1, height=img_H, width=img_W)
        AE_DtoD = nn.DataParallel(AE_DtoD)
        AE_DtoD = AE_DtoD.cuda()
    elif args.mode == 'RtoD_test':
        print('- RtoD test')
        #AE_RtoD = AutoEncoder_Unet(norm=args.norm, height=img_H, width=img_W)  # previous gradloss_mask model
        #AE_RtoD = AutoEncoder_2(norm=args.norm, input_dim=3, height=img_H, width=img_W)  # current autoencoder_2 model
        AE_RtoD = AutoEncoder(norm=args.norm, height=img_H, width=img_W)
        AE_RtoD = nn.DataParallel(AE_RtoD)
        AE_RtoD = AE_RtoD.cuda()

    if args.evaluate is True:
        ############################ data log #######################################
        logger = TermLogger(n_epochs=args.epochs,
                            train_size=min(len(val_loader), args.epoch_size),
                            valid_size=len(val_loader))
        #logger.epoch_bar.start()
        with open(args.save_path / args.log_metric, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            if args.dataset == 'KITTI':
                writer.writerow(['Epoch', 'Abs_diff', 'Abs_rel', 'Sq_rel',
                                 'a1', 'a2', 'a3', 'RMSE', 'RMSE_log'])
            elif args.dataset == 'Make3D':
                writer.writerow(['Epoch', 'Abs_diff', 'Abs_rel', 'log10', 'rmse'])
            elif args.dataset == 'NYU':
                writer.writerow(['Epoch', 'Abs_diff', 'Abs_rel', 'log10',
                                 'a1', 'a2', 'a3', 'RMSE', 'RMSE_log'])

        ########################### Evaluating part #################################
        if args.mode == 'DtoD_test':
            test_model = AE_DtoD
            print("DtoD_test - model set to eval mode")
        elif args.mode == 'RtoD_test':
            test_model = AE_RtoD
            print("RtoD_test - model set to eval mode")

        test_len = len(models_list)
        print("=> Length of model list: ", test_len)
        for i in range(test_len):
            logger.reset_valid_bar()
            test_model.load_state_dict(torch.load(models_list[i]))
            test_model.eval()
            if args.dataset == 'KITTI':
                errors, min_errors, error_names = validate(args, val_loader, test_model, 0,
                                                           logger, args.mode)
            elif args.dataset == 'Make3D':
                errors, min_errors, error_names = validate_Make3D(args, val_loader, test_model, 0,
                                                                  logger, args.mode)
            elif args.dataset == 'NYU':
                errors, min_errors, error_names = validate_NYU(args, val_loader, test_model, 0,
                                                               logger, args.mode)
            for error, name in zip(errors, error_names):
                training_writer.add_scalar(name, error, 0)
            logger.valid_writer.write(' * RtoD_model: {}'.format(models_list[i]))
            #error_string = ', '.join('{} : {:.3f}'.format(name, error) for name, error in zip(error_names[0:len(error_names)], errors[0:len(errors)]))
            error_string = ', '.join('{} : {:.3f}'.format(name, error)
                                     for name, error in zip(error_names[0:len(error_names)],
                                                            min_errors[0:len(errors)]))
            logger.valid_writer.write(' * Avg {}'.format(error_string))
            print("")
            #error_string = ', '.join('{} : {:.3f}'.format(name, error) for name, error in zip(error_names[0:8], min_errors[0:8]))
            #logger.valid_writer.write(' * Avg {}'.format(error_string))
            logger.valid_bar.finish()
            with open(args.save_path / args.log_metric, 'a') as csvfile:
                writer = csv.writer(csvfile, delimiter='\t')
                writer.writerow(['%02d' % i] +
                                ['%.4f' % (min_errors[k]) for k in range(len(min_errors))])
        print(args.dataset, " validation finish")

        ## Test
        if args.img_save is False:
            print("--only Test mode finish--")
            return
    else:
        if args.mode == 'DtoD_test':
            test_model = AE_DtoD
            print("DtoD_test - model set to eval mode")
        elif args.mode == 'RtoD_test':
            test_model = AE_RtoD
            print("RtoD_test - model set to eval mode")
        test_model.load_state_dict(torch.load(models_list[0]))
        test_model.eval()
        print("=> No validation")

    k = 0
    print("=> img save start")
    resize_ = Resize()
    for gt_data, rgb_data, filename in val_loader:
        if args.mode == 'RtoD' or args.mode == 'RtoD_test':
            gt_data = Variable(gt_data.cuda())
            final_AE_in = rgb_data.cuda()
        elif args.mode == 'DtoD' or args.mode == 'DtoD_test':
            rgb_data = Variable(rgb_data.cuda())
            final_AE_in = gt_data.cuda()
        final_AE_in = Variable(final_AE_in)
        with torch.no_grad():
            final_AE_depth = test_model(final_AE_in, istrain=False)

        img_arr = [final_AE_depth, gt_data, rgb_data]
        folder_name_list = ['/output_depth', '/ground_truth', '/input_rgb']
        img_name_list = ['/final_AE_depth_', '/final_AE_gt_', '/final_AE_rgb_']
        folder_iter = cycle(folder_name_list)
        img_name_iter = cycle(img_name_list)

        for img in img_arr:
            img_org = img.cpu().detach().numpy()
            folder_name = next(folder_iter)
            img_name = next(img_name_iter)
            result_dir = args.result_dir + folder_name
            for t in range(img_org.shape[0]):
                filename_ = filename[t]
                img = img_org[t]
                if img.shape[0] == 3:
                    img_ = np.empty([img_H, img_W, 3])
                    img_[:, :, 0] = img[0, :, :]
                    img_[:, :, 1] = img[1, :, :]
                    img_[:, :, 2] = img[2, :, :]
                    if args.resize is True:
                        img_ = resize_(img_, (384, 1248), 'rgb')
                elif img.shape[0] == 1:
                    img_ = np.empty([img_H, img_W])
                    img_[:, :] = img[0, :, :]
                    if args.resize is True:
                        img_ = resize_(img_, (384, 1248), 'depth')
                        img_ = img_[:, :, 0]
                if not os.path.exists(result_dir):
                    os.makedirs(result_dir)
                # scipy.misc.imsave was removed from SciPy (>=1.2); imageio.imwrite is the drop-in replacement
                imageio.imwrite(result_dir + img_name + '%05d.jpg' % (k + t), img_)
                #print(img_.shape)
                #print(filename_)
                #print(result_dir)
                #print(result_dir + filename_)
                #imageio.imwrite(result_dir + filename_, img_)
        k += img_org.shape[0]
    print("--Test image save finish--")
    return
def main():
    start_epoch = 0
    max_loss = math.inf
    epochs_since_improvement = 0

    dataset = GaitSequenceDataset(root_dir=data_dir,
                                  longest_sequence=85,
                                  shortest_sequence=55)
    train_sampler, validation_sampler = generate_train_validation_samplers(dataset, validation_split=0.2)

    print('Building dataloaders..')
    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                   sampler=train_sampler, drop_last=True)
    validation_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                        sampler=validation_sampler, drop_last=True)

    if load_pretrained is True:
        print('Loading pretrained model..')
        checkpoint = torch.load(checkpoint_path)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        encoder = checkpoint['encoder']
        decoder = checkpoint['decoder']
        encoder_optimizer = checkpoint['encoder_optimizer']
        decoder_optimizer = checkpoint['decoder_optimizer']
    else:
        print('Creating model..')
        encoder = Encoder(sequence_length, num_features, embedding_dimension)
        decoder = Decoder(embedding_dimension, num_classes, hidden_dimension, sequence_length)
        encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
        decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.MSELoss().to(device)

    if mode == 'train':
        summary = SummaryWriter()
        #summary = None
        encoder.to(device)
        decoder.to(device)
        for epoch in range(start_epoch, start_epoch + num_epochs):
            if epochs_since_improvement == 20:
                break
            if epochs_since_improvement > 0 and epochs_since_improvement % 4 == 0:
                adjust_learning_rate(encoder_optimizer, 0.8)

            train(encoder, decoder, train_dataloader, encoder_optimizer, decoder_optimizer,
                  criterion, clip_gradient, device, epoch, num_epochs, summary, loss_display_interval)
            current_loss = validate(encoder, decoder, validation_dataloader, criterion, device,
                                    epoch, num_epochs, summary, loss_display_interval)

            is_best = max_loss > current_loss
            max_loss = min(max_loss, current_loss)
            if not is_best:
                epochs_since_improvement += 1
                print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
            else:
                epochs_since_improvement = 0

            save_checkpoint(epoch, epochs_since_improvement, encoder, decoder,
                            encoder_optimizer, decoder_optimizer, is_best)
    else:
        print('testing...')
        encoder.to(device)
        decoder.to(device)
        encoder.eval()
        decoder.eval()
        for batch_idx, data in enumerate(validation_dataloader):
            sequence = data['sequence'][0].unsqueeze(0).permute(1, 0, 2).to(device)
            seq_len = data['sequence_length'][0]
            x, (hidden_state, cell_state) = encoder(sequence)
            prediction = decoder(hidden_state)

            sequence = sequence.squeeze(1).detach().cpu().numpy()
            prediction = prediction.squeeze(1).detach().cpu().numpy()
            print(sequence.shape)

            hip_angles_gt = sequence[:seq_len, [0, 3]]
            knee_angles_gt = sequence[:seq_len, [1, 4]]
            ankle_angles_gt = sequence[:seq_len, [2, 5]]
            hip_angles_pred = prediction[:seq_len, [0, 3]]
            knee_angles_pred = prediction[:seq_len, [1, 4]]
            ankle_angles_pred = prediction[:seq_len, [2, 5]]

            time = np.arange(0, len(hip_angles_gt), 1)
            fig, axs = plt.subplots(4)
            # fig.suptitle('Hip angle reconstruction')
            # axs[0].plot(time, hip_angles_gt[:, 0])
            # axs[0].set_title('Left hip ground truth')
            # axs[1].plot(time, hip_angles_pred[:, 0])
            # axs[1].set_title('Left hip prediction')
            # axs[2].plot(time, hip_angles_gt[:, 1])
            # axs[2].set_title('Right hip ground truth')
            # axs[3].plot(time, hip_angles_pred[:, 1])
            # axs[3].set_title('Right hip prediction')

            # fig.suptitle('Knee angle reconstruction')
            # axs[0].plot(time, knee_angles_gt[:, 0])
            # axs[0].set_title('Left knee ground truth')
            # axs[1].plot(time, knee_angles_pred[:, 0])
            # axs[1].set_title('Left knee prediction')
            # axs[2].plot(time, knee_angles_gt[:, 1])
            # axs[2].set_title('Right knee ground truth')
            # axs[3].plot(time, knee_angles_pred[:, 1])
            # axs[3].set_title('Right knee prediction')

            fig.suptitle('Ankle angle reconstruction')
            axs[0].plot(time, ankle_angles_gt[:, 0])
            axs[0].set_title('Left ankle ground truth')
            axs[1].plot(time, ankle_angles_pred[:, 0])
            axs[1].set_title('Left ankle prediction')
            axs[2].plot(time, ankle_angles_gt[:, 1])
            axs[2].set_title('Right ankle ground truth')
            axs[3].plot(time, ankle_angles_pred[:, 1])
            axs[3].set_title('Right ankle prediction')
            plt.show()
            break
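# A minimal sketch of the save_checkpoint helper assumed above, mirroring the
# keys read back in the load_pretrained branch ('epoch',
# 'epochs_since_improvement', 'encoder', 'decoder', 'encoder_optimizer',
# 'decoder_optimizer'); the output file names are an assumption.
def save_checkpoint(epoch, epochs_since_improvement, encoder, decoder,
                    encoder_optimizer, decoder_optimizer, is_best):
    state = {'epoch': epoch,
             'epochs_since_improvement': epochs_since_improvement,
             'encoder': encoder,
             'decoder': decoder,
             'encoder_optimizer': encoder_optimizer,
             'decoder_optimizer': decoder_optimizer}
    torch.save(state, 'checkpoint.pth.tar')
    if is_best:
        # keep a separate copy of the best-so-far checkpoint
        torch.save(state, 'best_checkpoint.pth.tar')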
def main(): with timer('load data'): df = pd.read_csv(FOLD_PATH) with timer('preprocessing'): train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID] train_augmentation = Compose([ Flip(p=0.5), OneOf([ #ElasticTransform(p=0.5, alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03), GridDistortion(p=0.5), OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5) ], p=0.5), #OneOf([ # ShiftScaleRotate(p=0.5), ## RandomRotate90(p=0.5), # Rotate(p=0.5) #], p=0.5), OneOf([ Blur(blur_limit=8, p=0.5), MotionBlur(blur_limit=8,p=0.5), MedianBlur(blur_limit=8,p=0.5), GaussianBlur(blur_limit=8,p=0.5) ], p=0.5), OneOf([ #CLAHE(clip_limit=4, tile_grid_size=(4, 4), p=0.5), RandomGamma(gamma_limit=(100,140), p=0.5), RandomBrightnessContrast(p=0.5), RandomBrightness(p=0.5), RandomContrast(p=0.5) ], p=0.5), OneOf([ GaussNoise(p=0.5), Cutout(num_holes=10, max_h_size=10, max_w_size=20, p=0.5) ], p=0.5) ]) val_augmentation = None train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=train_augmentation) val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=val_augmentation) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2) del train_df, val_df, df, train_dataset, val_dataset gc.collect() with timer('create model'): model = smp.Unet('se_resnext50_32x4d', encoder_weights='imagenet', classes=N_CLASSES) model.to(device) criterion = torch.nn.BCEWithLogitsLoss() optimizer = torch.optim.Adam(model.parameters(), lr=3e-4) scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5) scheduler = GradualWarmupScheduler(optimizer, multiplier=1.1, total_epoch=CLR_CYCLE*2, after_scheduler=scheduler_cosine) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) with timer('train'): train_losses = [] valid_losses = [] best_model_loss = 999 best_model_ep = 0 checkpoint = 0 for epoch in range(1, EPOCHS + 1): if epoch % (CLR_CYCLE * 2) == 0: if epoch != 0: y_val = y_val.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1]) best_pred = best_pred.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1]) for i in range(N_CLASSES): th, score, _, _ = search_threshold(y_val[:, i, :, :], best_pred[:, i, :, :]) LOGGER.info('Best loss: {} Best Dice: {} on epoch {} th {} class {}'.format( round(best_model_loss, 5), round(score, 5), best_model_ep, th, i)) checkpoint += 1 best_model_loss = 999 LOGGER.info("Starting {} epoch...".format(epoch)) tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device) train_losses.append(tr_loss) LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5))) valid_loss, val_pred, y_val = validate(model, val_loader, criterion, device) valid_losses.append(valid_loss) LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5))) scheduler.step() if valid_loss < best_model_loss: torch.save(model.state_dict(), '{}_fold{}_ckpt{}.pth'.format(EXP_ID, FOLD_ID, checkpoint)) best_model_loss = valid_loss best_model_ep = epoch best_pred = val_pred del val_pred gc.collect() with timer('eval'): y_val = y_val.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1]) best_pred = best_pred.reshape(-1, N_CLASSES, IMG_SIZE[0], IMG_SIZE[1]) for i in range(N_CLASSES): th, score, _, _ = search_threshold(y_val[:, i, :, :], best_pred[:, i, :, :]) LOGGER.info('Best loss: {} Best Dice: {} on epoch {} th {} class {}'.format( round(best_model_loss, 5), 
round(score, 5), best_model_ep, th, i)) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss') plt.plot(xs, valid_losses, label='Val loss') plt.legend() plt.xticks(xs) plt.xlabel('Epochs') plt.savefig("loss.png")
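# search_threshold() above is imported from elsewhere in the repo. A minimal
# sketch consistent with how it is called (per-class probability map in, best
# binarization threshold and Dice score out; the two trailing return values are
# placeholders here, and the sweep range is an assumption):
import numpy as np

def search_threshold(y_true, y_prob, thresholds=np.arange(0.3, 0.71, 0.05)):
    best_th, best_dice = 0.5, 0.0
    for th in thresholds:
        y_bin = (y_prob > th).astype(np.float32)
        intersection = (y_bin * y_true).sum()
        dice = (2.0 * intersection + 1e-7) / (y_bin.sum() + y_true.sum() + 1e-7)
        if dice > best_dice:
            best_th, best_dice = th, dice
    return best_th, best_dice, None, None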
def main(args):
    if args.debug:
        import pdb; pdb.set_trace()
    tb_dir = args.exp_name + '/tb_logs/'
    ckpt_dir = args.exp_name + '/checkpoints/'
    if not os.path.exists(args.exp_name):
        os.mkdir(args.exp_name)
        os.mkdir(tb_dir)
        os.mkdir(ckpt_dir)
    # writer = SummaryWriter(tb_dir + '{}'.format(args.exp_name), flush_secs=10)
    writer = SummaryWriter(tb_dir, flush_secs=10)

    # create model
    print("=> creating model: ")
    os.system('nvidia-smi')
    # model = models.__dict__[args.arch]()
    # model = resnet_dilated.Resnet18_32s(num_classes=21)
    print(args.no_pre_train, ' pretrain')
    # model = resnet18_fcn.Resnet18_fcn(num_classes=args.n_classes, pre_train=args.no_pre_train)
    model_map = {
        'deeplabv3_resnet18': arma_network.deeplabv3_resnet18,
        'deeplabv3_resnet50': arma_network.deeplabv3_resnet50,
        'fcn_resnet18': arma_network.fcn_resnet18,
        # 'deeplabv3_resnet101': network.deeplabv3_resnet101,
        # 'deeplabv3plus_resnet18': network.deeplabv3plus_resnet18,
        # 'deeplabv3plus_resnet50': network.deeplabv3plus_resnet50,
        # 'deeplabv3plus_resnet101': network.deeplabv3plus_resnet101
    }
    model = model_map['deeplabv3_resnet50'](arma=False, num_classes=args.n_classes)
    model = model.cuda()
    model = nn.DataParallel(model)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            model, optimizer, args = helper.load_checkpoint(args, model, optimizer)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Use this only when the batch size is fixed: cudnn autotuning is slow at
    # first, but gives large speed-ups once input shapes stop changing.
    cudnn.benchmark = True

    # Load dataloaders
    augmentations = aug.Compose([aug.RandomCrop(512), aug.RandomHorizontallyFlip(0.5),
                                 aug.RandomRotate(30), aug.RandomSizedCrop(512)])
    my_dataset = pascalVOCLoader(args=args, root=args.data, sbd_path=args.data,
                                 augmentations=augmentations)
    my_dataset.get_loaders()
    init_weight_filename = 'initial_state.pth.tar'
    helper.save_checkpoint(args, model, optimizer, custom_name=init_weight_filename)
    with open(args.exp_name + '/' + 'args.pkl', 'wb') as fout:
        pickle.dump(args, fout)

    best_iou = -100.0
    ngpus_per_node = torch.cuda.device_count()  # needed by the rank check below
    for epoch in range(args.start_epoch, args.epochs):
        helper.adjust_learning_rate(optimizer, epoch, args)
        train_loss = trainer.train(my_dataset.train_loader, model, optimizer, epoch, args, writer)
        val_loss, scores, class_iou, running_metrics_val = trainer.validate(
            my_dataset.val_loader, model, epoch, args, writer)
        is_best = scores["Mean IoU : \t"] >= best_iou  # recomputed every epoch
        if is_best:
            best_iou = scores["Mean IoU : \t"]
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            if epoch in [0, 1, 2, 3, 4, 5, 6, 7, 8]:
                helper.save_checkpoint(args, model, optimizer, epoch, custom_name=str(epoch) + '.pth')
            if args.save_freq is None:
                helper.save_checkpoint(args, model, optimizer, epoch, is_best=is_best, periodic=False)
            else:
                helper.save_checkpoint(args, model, optimizer, epoch, is_best=is_best, periodic=True)
        with open(args.exp_name + '/running_metric.pkl', 'wb') as fout:
            pickle.dump(running_metrics_val, fout)
def Process2_PartNet(args): log_now = args.dataset + '/PartNet' process_name = 'partnet' if os.path.isfile(log_now + '/final.txt'): print('the Process2_PartNet is finished') return best_prec1 = 0 model = Model_Construct(args, process_name) model = torch.nn.DataParallel(model).cuda() criterion = nn.BCELoss().cuda() # print(model) # print('the learning rate for the new added layer is set to 1e-3 to slow down the speed of learning.') optimizer = torch.optim.SGD( [{ 'params': model.module.conv_model.parameters(), 'name': 'pre-trained' }, { 'params': model.module.classification_stream.parameters(), 'name': 'new-added' }, { 'params': model.module.detection_stream.parameters(), 'name': 'new-added' }], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) start_epoch = args.start_epoch if args.resume: if os.path.isfile(args.resume): print("==> loading checkpoints '{}'".format(args.resume)) checkpoint = torch.load(args.resume) start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("==> loaded checkpoint '{}'(epoch {})".format( args.resume, checkpoint['epoch'])) args.resume = '' else: raise ValueError('The file to be resumed from is not exited', args.resume) else: if not os.path.isdir(log_now): os.makedirs(log_now) log = open(os.path.join(log_now, 'log.txt'), 'w') state = {k: v for k, v in args._get_kwargs()} log.write(json.dumps(state) + '\n') log.close() cudnn.benchmark = True train_loader, val_loader = generate_dataloader(args, process_name, -1) if args.test_only: validate(val_loader, model, criterion, 2000, args) for epoch in range(start_epoch, args.epochs): # train for one epoch train(train_loader, model, criterion, optimizer, epoch, log_now, process_name, args) # evaluate on the val data prec1 = validate(val_loader, model, criterion, epoch, log_now, process_name, args) # record the best prec1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if is_best: log = open(os.path.join(log_now, 'log.txt'), 'a') log.write("best acc %3f" % (best_prec1)) log.close() save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, log_now) svb_timer = time.time() if args.svb and epoch != (args.epochs - 1): svb(model, args) print( '!!!!!!!!!!!!!!!!!! the svb constrain is only applied on the classification stream.' ) svb_det(model, args) print('the svb time is: ', time.time() - svb_timer) #download_scores(val_loader, model, log_now, process_name, args) log = open(os.path.join(log_now, 'final.txt'), 'w') log.write("best acc %3f" % (best_prec1)) log.close()
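# svb() and svb_det() above come from the repo; the underlying technique is
# singular value bounding (Jia et al., 2017): periodically clip the singular
# values of weight matrices into a narrow band around 1. A sketch for the
# fully-connected case only (the band width eps and the Linear-only scope are
# assumptions; the repo's version takes (model, args) and also handles the
# detection stream separately):
import torch
import torch.nn as nn

@torch.no_grad()
def svb_sketch(model, eps=0.05):
    for m in model.modules():
        if isinstance(m, nn.Linear):
            u, s, v = torch.svd(m.weight.data)
            s = s.clamp(1.0 / (1.0 + eps), 1.0 + eps)
            m.weight.data = u @ torch.diag(s) @ v.t()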
tokenizer = BertTokenizer.from_pretrained(params.bert_type) train_loader, val_loader, test_loader, train_loader2 = get_nlu_dataloader( params, tokenizer) model, optimizer = get_model_and_opt(params) best_intent_acc = -1 best_slot_f1 = -1 best_epoch = -1 for epoch in range(1, params.n_epoch + 1): # if epoch == 5: # print("Switching to target dataloader") for transfer learning # train_loader = train_loader2 print( f'Training Epoch : {epoch}, best results so far : {best_intent_acc}, {best_slot_f1} @ epoch : {best_epoch} (by intent)' ) train(train_loader, model, optimizer, tokenizer) validation_results = validate(val_loader, model, tokenizer) if validation_results['intent_acc'] > best_intent_acc: best_epoch = epoch best_intent_acc = validation_results['intent_acc'] if validation_results['slot_f1'] > best_slot_f1: best_slot_f1 = validation_results['slot_f1'] if epoch == best_epoch: print('Saving model and opt') torch.save(model.state_dict(), save_dir + "/model_" + str(epoch) + ".pt") torch.save(optimizer.state_dict(), save_dir + "/opt_" + str(epoch) + ".pt") with open(save_dir + '/output_slot_outs_' + str(epoch) + '.conll', 'w') as f: f.write('\n'.join(validation_results['output']))
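# validate() above returns 'intent_acc' and 'slot_f1'. One plausible way to
# compute them (an assumption, not taken from this repo) is scikit-learn for
# intent accuracy and seqeval for span-level slot F1 over BIO tag sequences:
from sklearn.metrics import accuracy_score
from seqeval.metrics import f1_score as slot_f1_score

def nlu_metrics(intent_true, intent_pred, slot_true, slot_pred):
    # intent_*: flat label lists; slot_*: lists of BIO tag sequences per utterance
    return {'intent_acc': accuracy_score(intent_true, intent_pred),
            'slot_f1': slot_f1_score(slot_true, slot_pred)}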
def main():
    start_time = datetime.now()
    start_time_str = datetime.strptime(drop_msecond(start_time), "%Y-%m-%d %H:%M:%S")
    args = opts()
    from trainer import train, validate
    # if args.ablation == '':
    #     from trainer import train, validate
    # elif args.ablation == 'baseline':
    #     from trainer_baseline import train, validate
    # elif args.ablation == 'wo_taskt':
    #     from trainer_wo_taskt import train, validate
    # elif args.ablation == 'wo_Mst':
    #     from trainer_wo_Mst import train, validate
    # elif args.ablation == 'wo_confusion':
    #     from trainer_wo_confusion import train, validate
    # elif args.ablation == 'wo_category_confusion':
    #     from trainer_wo_category_confusion import train, validate

    # Fix the shuffled order of every epoch so that repeated training runs stay
    # comparable (reaching the same epoch reproduces the same model). In
    # practice, leaving the seed unfixed rarely changes the outcome much.
    # if args.seed != 666:
    #     if torch.cuda.is_available():
    #         torch.cuda.manual_seed(args.seed)
    #         torch.manual_seed(args.seed)
    #     else:
    #         torch.manual_seed(args.seed)
    # else:
    #     if torch.cuda.is_available():
    #         torch.cuda.manual_seed(666)
    #         torch.manual_seed(args.seed)
    #     else:
    #         torch.manual_seed(666)

    # init models, multi GPU
    # model = nn.DataParallel(resnet(args))  # multi-GPU
    feature_extractor = nn.DataParallel(Extractor(args))
    class_classifier = nn.DataParallel(
        Class_classifier(2048, num_classes=args.num_classes))  # 512 for ResNet18/34, 2048 for ResNet50
    domain_classifier = nn.DataParallel(Domain_classifier(2048, hidden_size=128))
    # print(id(model.module))
    # check_model([3, 200, 200], Extractor(args))
    if torch.cuda.is_available():
        feature_extractor = feature_extractor.cuda()
        class_classifier = class_classifier.cuda()
        domain_classifier = domain_classifier.cuda()

    # optimizer for multiple GPUs
    optimizer = torch.optim.SGD(
        [{'params': feature_extractor.module.parameters(), 'name': 'pre-trained'},
         {'params': class_classifier.module.parameters(), 'name': 'new-added'},
         {'params': domain_classifier.module.parameters(), 'name': 'new-added'}],
        lr=args.lr, momentum=args.momentum,
        weight_decay=args.weight_decay, nesterov=True)

    best_prec1 = 0
    if args.resume:
        if os.path.isfile(args.resume):
            print("==> loading checkpoints '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            # restore the three modules saved by save_checkpoint() below
            # (the old monolithic `model` no longer exists in this script)
            feature_extractor.load_state_dict(checkpoint['feature_extractor_state_dict'])
            class_classifier.load_state_dict(checkpoint['class_classifier_state_dict'])
            domain_classifier.load_state_dict(checkpoint['domain_classifier_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            raise ValueError('The file to be resumed from does not exist', args.resume)

    train_loader_source, train_loader_target, val_loader_target = generate_dataloader(args)
    print('Begin training')
    print(len(train_loader_source), len(train_loader_target))
    train_loader_source_batches = enumerate(train_loader_source)
    train_loader_target_batches = enumerate(train_loader_target)
    if torch.cuda.is_available():
        criterion_y = nn.CrossEntropyLoss().cuda()
        criterion_d = nn.CrossEntropyLoss().cuda()  # not used in this code
    else:
        criterion_y = nn.CrossEntropyLoss()
        criterion_d = nn.CrossEntropyLoss()
    writer = SummaryWriter(log_dir=args.log)

    # for epoch in range(args.start_epoch, args.epochs):
    epoch = args.start_epoch
    epochs_has_not_been_improved = 0
    maximum_gap = 0
    while epoch < args.epochs:
        # train for one epoch; superseded call signatures kept for reference:
        # pred1_acc_train, loss = train(..., model, criterion_y, criterion_d, optimizer_C, optimizer_G, epoch, args)
        # pred1_acc_train, loss_C, loss_G = train(..., model, criterion_y, criterion_d, optimizer_C, optimizer_G, epoch, args)
        # pred1_acc_train, loss_C, loss_G, new_epoch_flag = train(..., model, criterion_y, criterion_d, optimizer_C, optimizer_G, epoch, args)
        # train_loader_source_batches, train_loader_target_batches, epoch, pred1_acc_train, loss_C, loss_G, new_epoch_flag = train(..., model, criterion_y, criterion_d, optimizer_C, optimizer_G, epoch, args)
        # ------------- not yet updated (start); may contain errors -------------
        # ------------- not yet updated (end); may contain errors -------------
        train_loader_source_batches, train_loader_target_batches, epoch, pred1_acc_train, loss_C, loss_G, new_epoch_flag = train(
            train_loader_source, train_loader_source_batches, train_loader_target,
            train_loader_target_batches, feature_extractor, class_classifier,
            domain_classifier, criterion_y, criterion_d, optimizer, epoch, args)
        if new_epoch_flag:
            # check whether removing these two statements causes an exception
            # train_loader_source_batches = enumerate(train_loader_source)
            # (inputs_source, labels_source) = train_loader_source_batches.__next__()[1]

            # evaluate on the val data
            if epoch % args.test_freq == (args.test_freq - 1):
                # prec1, _ = validate(None, val_loader_target, model, criterion_y, criterion_d, epoch, args)
                prec1, _ = validate(None, val_loader_target, feature_extractor,
                                    class_classifier, domain_classifier,
                                    criterion_y, criterion_d, epoch, args)
                is_best = prec1 > best_prec1
                if is_best:
                    epochs_has_not_been_improved = 0
                    best_prec1 = prec1
                    with open(os.path.join(args.log, 'log.txt'), 'a') as fp:
                        fp.write(' \nTarget_T1 acc: %3f' % (best_prec1))
                else:
                    epochs_has_not_been_improved += 1
                writer.add_scalars('data/scalar_group',
                                   {'pred1_acc_valid': prec1, 'best_prec1': best_prec1},
                                   epoch)
                # update the maximum gap between the current and best accuracy
                current_gap = best_prec1 - prec1
                if current_gap > maximum_gap:
                    maximum_gap = current_gap
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        # 'model_state_dict': model.state_dict(),
                        'feature_extractor_state_dict': feature_extractor.state_dict(),
                        'class_classifier_state_dict': class_classifier.state_dict(),
                        'domain_classifier_state_dict': domain_classifier.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict()
                    }, is_best, args, epoch + 1)
    writer.close()

    end_time = datetime.now()
    end_time_str = datetime.strptime(drop_msecond(end_time), "%Y-%m-%d %H:%M:%S")
    through_time = end_time - start_time
    through_time_str = time_delta2str(through_time)
    with open(os.path.join(args.result, 'overview.txt'), 'a') as fp:
        fp.write('%s: \nbest_prec1:%.2f%%, epochs_has_not_been_improved:%d, '
                 'maximum distance between current and best:%.2f%%\n'
                 'start at %s, finish at %s, it takes %s \n'
                 % (args.log.split('/')[1], best_prec1, epochs_has_not_been_improved,
                    maximum_gap, start_time_str, end_time_str, through_time_str))
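# The adversarial coupling between the feature extractor and the domain
# classifier happens inside trainer.train(), which is not shown here. Trainers
# of this shape (DANN, Ganin & Lempitsky, 2015) typically route the domain
# branch through a gradient reversal layer; a standard sketch:
import torch

class GradReverse(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)  # identity on the forward pass

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.neg() * ctx.lambd, None  # reversed, scaled gradient

def grad_reverse(x, lambd=1.0):
    return GradReverse.apply(x, lambd)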
from trainer import train, validate


def get_args():
    '''Get arguments for running the main code.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, required=True,
                        help='Mode (training v/s validation)', choices=['train', 'val'])
    parser.add_argument('--config', type=str, required=True,
                        help='Config file to read from.')
    return parser


if __name__ == '__main__':
    args = get_args().parse_args()
    if not os.path.exists(args.config):
        print('Config file {} does not exist.'.format(args.config))
        sys.exit(1)  # bail out instead of trying to open a missing file
    with open(args.config, 'r') as fi:
        CONFIG = yaml.safe_load(fi.read())  # safe_load avoids arbitrary object construction
    CONFIG = utils.convert_to_lower(CONFIG)
    if args.mode == 'train':
        train(CONFIG)
    else:
        with torch.no_grad():
            validate(CONFIG)
def main(): global args, best_prec1 args = opts() # args = parser.parse_args() model = resnet(args) # define-multi GPU model = torch.nn.DataParallel(model).cuda() print(model) # define loss function(criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() # optimizer = torch.optim.SGD(model.parameters(), # train with stanford dogs from scratch if args.new_fc: optimizer = torch.optim.SGD( [ { 'params': model.module.conv1.parameters(), 'lr': args.lr, 'name': 'pre-trained' }, { 'params': model.module.bn1.parameters(), 'lr': args.lr, 'name': 'pre-trained' }, { 'params': model.module.layer1.parameters(), 'lr': args.lr, 'name': 'pre-trained' }, { 'params': model.module.layer2.parameters(), 'lr': args.lr, 'name': 'pre-trained' }, { 'params': model.module.layer3.parameters(), 'lr': args.lr, 'name': 'pre-trained' }, { 'params': model.module.layer4.parameters(), 'lr': args.lr, 'name': 'pre-trained' }, # {'params': model.module.fc.parameters(), 'lr': args.lr, 'name': 'pre-trained'} { 'params': model.module.fc.parameters(), 'lr': args.lr, 'name': 'new-added' } ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) else: optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("==> loading checkpoints '{}'".format(args.resume)) checkpoint = torch.load(args.resume) # args.start_epoch = checkpoint['epoch'] # best_prec1 = checkpoint['best_prec1'] model_state_dict = checkpoint['target_state_dict'] model_state_dict_tmp = copy.deepcopy(model_state_dict) if args.new_fc: model_state_dict_init = model.state_dict() for k_tmp in model_state_dict_tmp.keys(): if k_tmp.find('.resnet_conv') != -1: k = k_tmp.replace('.resnet_conv', '') model_state_dict[k] = model_state_dict.pop(k_tmp) if args.new_fc: # initialize fc layer if k_tmp.find('.fc') != -1: model_state_dict[k_tmp] = model_state_dict_init[k_tmp] model.load_state_dict(model_state_dict) # optimizer.load_state_dict(checkpoint['optimizer']) print("==> loaded checkpoint '{}'(epoch {})".format( args.resume, checkpoint['epoch'])) else: raise ValueError('The file to be resumed from is not exited', args.resume) # else: if not os.path.isdir(args.log): os.makedirs(args.log) log = open(os.path.join(args.log, 'log.txt'), 'w') state = {k: v for k, v in args._get_kwargs()} log.write(json.dumps(state) + '\n') log.close() cudnn.benchmark = True # process the data and prepare the dataloaders. train_loader, val_loader = generate_dataloader(args) #test only if args.test_only: validate(val_loader, model, criterion, -1, args) return for epoch in range(args.start_epoch, args.epochs): # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args) # evaluate on the val data prec1 = validate(val_loader, model, criterion, epoch, args) # record the best prec1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if is_best: log = open(os.path.join(args.log, 'log.txt'), 'a') log.write(' \nTop1 acc: %3f' % (best_prec1)) log.close() save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, args)
def main(): # train_df = pd.read_csv(TRAIN_PATH).sample(frac=1.0, random_state=seed) # train_size = int(len(train_df) * 0.9) train_df = pd.read_csv(TRAIN_PATH).sample(train_size + valid_size, random_state=seed) LOGGER.info(f'data_size is {len(train_df)}') LOGGER.info(f'train_size is {train_size}') y = np.where(train_df['target'] >= 0.5, 1, 0) y_aux = train_df[AUX_COLUMNS].values identity_columns_new = [] for column in identity_columns + ['target']: train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True, False) if column != "target": identity_columns_new.append(column + "_bin") sample_weights = np.ones(len(train_df), dtype=np.float32) sample_weights += train_df[identity_columns_new].sum(axis=1) sample_weights += train_df['target_bin'] * (~train_df[identity_columns_new]).sum(axis=1) sample_weights += (~train_df['target_bin']) * train_df[identity_columns_new].sum(axis=1) * 5 sample_weights /= sample_weights.mean() with timer('preprocessing text'): # df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]] train_df['comment_text'] = train_df['comment_text'].astype(str) train_df = train_df.fillna(0) with timer('load embedding'): tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True) X_text = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"), max_len, tokenizer) test_df = train_df[train_size:] with timer('train'): X_train, y_train, y_aux_train, w_train = X_text[:train_size], y[:train_size], y_aux[ :train_size], sample_weights[ :train_size] X_val, y_val, y_aux_val, w_val = X_text[train_size:], y[train_size:], y_aux[train_size:], sample_weights[ train_size:] model = BertForSequenceClassification(bert_config, num_labels=n_labels) model.load_state_dict(torch.load(model_path)) model.zero_grad() model = model.to(device) train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float)) valid = torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.long), torch.tensor(y_val, dtype=torch.float)) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size * 2, shuffle=False) sample_weight_train = [w_train.values, np.ones_like(w_train)] sample_weight_val = [w_val.values, np.ones_like(w_val)] param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] num_train_optimization_steps = int(epochs * train_size / batch_size / accumulation_steps) total_step = int(epochs * train_size / batch_size) optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5*gamma, warmup=0.05, t_total=num_train_optimization_steps) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) criterion = torch.nn.BCEWithLogitsLoss().to(device) LOGGER.info(f"Starting 1 epoch...") tr_loss, train_losses = train_one_epoch(model, train_loader, criterion, optimizer, device, accumulation_steps, total_step, n_labels) LOGGER.info(f'Mean train loss: {round(tr_loss,5)}') torch.save(model.state_dict(), '{}_dic'.format(exp)) valid_loss, oof_pred = validate(model, valid_loader, criterion, device, n_labels) del model gc.collect() torch.cuda.empty_cache() test_df["pred"] = oof_pred.reshape(-1) 
test_df = convert_dataframe_to_bool(test_df) bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns) LOGGER.info(bias_metrics_df) score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df)) LOGGER.info(f'final score is {score}') test_df.to_csv("oof.csv", index=False) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss'); plt.legend(); plt.xticks(xs); plt.xlabel('Iter') plt.savefig("loss.png")
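# get_final_metric() above follows the Jigsaw Unintended Bias competition
# definition: a generalized (power) mean over three per-subgroup AUC columns,
# blended with the overall AUC. A sketch under that assumption (p = -5 and a
# 0.25 overall weight, as in the competition benchmark; column names assumed):
import numpy as np

def power_mean(series, p=-5):
    return np.power(np.mean(np.power(series, p)), 1.0 / p)

def get_final_metric(bias_df, overall_auc, p=-5, overall_weight=0.25):
    bias_score = np.mean([power_mean(bias_df['subgroup_auc'], p),
                          power_mean(bias_df['bpsn_auc'], p),
                          power_mean(bias_df['bnsp_auc'], p)])
    return overall_weight * overall_auc + (1 - overall_weight) * bias_score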
def run(args): start_epoch = 1 best = {'L1': 1e+9, 'MAE': 1e+9} # logs if args.expid == '': args.expid = dt.datetime.now().strftime('%Y%m%d%H%M') args.log_dir = os.path.join(args.log_dir, args.expid) if not os.path.exists(args.log_dir): os.makedirs(args.log_dir) os.chmod(args.log_dir, 0o0777) logger = get_logger(os.path.join(args.log_dir, 'main.log')) logger.info(args) writer = SummaryWriter(args.log_dir) args.device = torch.device( 'cuda:0' if torch.cuda.is_available() else 'cpu') # data if args.trainset == 'trainset': train_set = WCTrainset(args.data_root, args.train_csv, args=args) else: train_set = WCDataset(args.data_root, args.train_csv, args=args) valid_set = WCValidset(args.data_root, args.valid_csv, args=args) train_loader = DataLoader(train_set, batch_size=args.batch_size, num_workers=args.n_workers, shuffle=True) valid_loader = DataLoader(valid_set, batch_size=args.batch_size, num_workers=args.n_workers, shuffle=False) # network model = models.__dict__[args.model](args=args) if torch.cuda.device_count() > 1: logger.info('{} GPUs found.'.format(torch.cuda.device_count())) model = nn.DataParallel(model) model = model.to(args.device) # training criterion, valid_loss_fn = get_loss_fn(args) optimizer = get_optimizer(model, args.optim_str) scheduler = get_scheduler(optimizer, args) logger.debug(optimizer) if args.resume: if os.path.isfile(args.resume): checkpoint = torch.load(args.resume) start_epoch = checkpoint['epoch'] + 1 best['L1'] = checkpoint['best/L1'] best['MAE'] = checkpoint['best/MAE'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) logger.info('Loaded checkpoint {} (epoch {})'.format( args.resume, start_epoch - 1)) else: raise IOError('No such file {}'.format(args.resume)) for epoch_i in range(start_epoch, args.epochs + 1): message = '[{}] Epoch {} Train/{} {:.2f} /MAE {:.4f} Valid/L1 {:.2f} /MAE {:.4f} (Best {:.4f}) ' # noqa for param_group in optimizer.param_groups: message += 'LR {:.4f} '.format(param_group['lr']) training = train(train_loader, model, criterion, optimizer, logger=logger, args=args) validation = validate(valid_loader, model, valid_loss_fn, logger=logger, args=args) writer.add_scalar('{}/Train'.format(args.loss), training['loss'], epoch_i) writer.add_scalar('{}/Valid'.format(args.loss), validation['loss'], epoch_i) writer.add_scalar('MAE/Train', training['mae'], epoch_i) writer.add_scalar('MAE/Valid', validation['mae'], epoch_i) writer.add_scalar('Grad/L2/Mean/BeforeClipped/Train', training['grad/L2/BeforeClipped'], epoch_i) writer.add_scalar('Grad/L2/Mean/Clipped/Train', training['grad/L2/Clipped'], epoch_i) writer.add_scalar('Grad/L2/Mean/Train', training['grad/L2/Mean'], epoch_i) if epoch_i % args.freq_to_log_image == 0: writer.add_image('Train/Predict', _get_images(training['pred'], args), epoch_i) writer.add_image('Train/Target', _get_images(training['true'], args), epoch_i) writer.add_image('Valid/Predict', _get_images(validation['pred'], args), epoch_i) writer.add_image('Valid/Target', _get_images(validation['true'], args), epoch_i) is_best = (validation['mae'] < best['MAE'], validation['loss'] < best['L1']) if is_best[0]: best['MAE'] = validation['mae'] if is_best[1]: best['L1'] = validation['loss'] save_checkpoint( { 'epoch': epoch_i, 'state_dict': model.state_dict(), 'valid/L1': validation['loss'], 'valid/MAE': validation['mae'], 'best/L1': best['L1'], 'best/MAE': best['MAE'], 'optimizer': optimizer.state_dict(), }, is_best, args.log_dir) if scheduler is not None: scheduler.step(epoch=epoch_i) 
message = message.format(args.expid, epoch_i, args.loss, training['loss'], training['mae'], validation['loss'], validation['mae'], best['MAE']) logger.info(message)
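# The 'Grad/L2/...' scalars logged above imply that train() measures gradient
# norms around clipping. clip_grad_norm_ returns the total pre-clip norm, so
# both values can be captured in one place; a sketch (helper name assumed):
import torch

def clip_and_measure(model, max_norm):
    params = [p for p in model.parameters() if p.grad is not None]
    before = torch.nn.utils.clip_grad_norm_(params, max_norm)  # norm before clipping
    after = sum(float(p.grad.detach().norm(2) ** 2) for p in params) ** 0.5
    return float(before), after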
def main(): global args, best_prec1, best_test_prec1, cond_best_test_prec1, best_cluster_acc, best_cluster_acc_2 # define model model = Model_Construct(args) print(model) model = torch.nn.DataParallel(model).cuda() # define multiple GPUs # define learnable cluster centers learn_cen = Variable(torch.cuda.FloatTensor(args.num_classes, 2048).fill_(0)) learn_cen.requires_grad_(True) learn_cen_2 = Variable(torch.cuda.FloatTensor(args.num_classes, args.num_neurons * 4).fill_(0)) learn_cen_2.requires_grad_(True) # define loss function/criterion and optimizer criterion = torch.nn.CrossEntropyLoss().cuda() criterion_cons = ConsensusLoss(nClass=args.num_classes, div=args.div).cuda() np.random.seed(1) # may fix test data random.seed(1) torch.manual_seed(1) # apply different learning rates to different layer optimizer = torch.optim.SGD([ {'params': model.module.conv1.parameters(), 'name': 'conv'}, {'params': model.module.bn1.parameters(), 'name': 'conv'}, {'params': model.module.layer1.parameters(), 'name': 'conv'}, {'params': model.module.layer2.parameters(), 'name': 'conv'}, {'params': model.module.layer3.parameters(), 'name': 'conv'}, {'params': model.module.layer4.parameters(), 'name': 'conv'}, {'params': model.module.fc1.parameters(), 'name': 'ca_cl'}, {'params': model.module.fc2.parameters(), 'name': 'ca_cl'}, {'params': learn_cen, 'name': 'conv'}, {'params': learn_cen_2, 'name': 'conv'} ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) # resume epoch = 0 init_state_dict = model.state_dict() if args.resume: if os.path.isfile(args.resume): print("==> loading checkpoints '{}'".format(args.resume)) checkpoint = torch.load(args.resume) epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] best_test_prec1 = checkpoint['best_test_prec1'] cond_best_test_prec1 = checkpoint['cond_best_test_prec1'] model.load_state_dict(checkpoint['state_dict']) learn_cen = checkpoint['learn_cen'] learn_cen_2 = checkpoint['learn_cen_2'] print("==> loaded checkpoint '{}'(epoch {})".format(args.resume, checkpoint['epoch'])) else: raise ValueError('The file to be resumed from does not exist!', args.resume) # make log directory if not os.path.isdir(args.log): os.makedirs(args.log) log = open(os.path.join(args.log, 'log.txt'), 'a') state = {k: v for k, v in args._get_kwargs()} log.write(json.dumps(state) + '\n') log.close() # start time log = open(os.path.join(args.log, 'log.txt'), 'a') log.write('\n-------------------------------------------\n') log.write(time.asctime(time.localtime(time.time()))) log.write('\n-------------------------------------------') log.close() cudnn.benchmark = True # process data and prepare dataloaders train_loader_source, train_loader_target, val_loader_target, val_loader_target_t, val_loader_source = generate_dataloader(args) train_loader_target.dataset.tgts = list(np.array(torch.LongTensor(train_loader_target.dataset.tgts).fill_(-1))) # avoid using ground truth labels of target print('begin training') batch_number = count_epoch_on_large_dataset(train_loader_target, train_loader_source, args) num_itern_total = args.epochs * batch_number new_epoch_flag = False # if new epoch, new_epoch_flag=True test_flag = False # if test, test_flag=True src_cs = torch.cuda.FloatTensor(len(train_loader_source.dataset.tgts)).fill_(1) # initialize source weights count_itern_each_epoch = 0 for itern in range(epoch * batch_number, num_itern_total): # evaluate on the target training and test data if (itern == 0) or (count_itern_each_epoch == batch_number): prec1, 
c_s, c_s_2, c_t, c_t_2, c_srctar, c_srctar_2, source_features, source_features_2, source_targets, target_features, target_features_2, target_targets, pseudo_labels = validate_compute_cen(val_loader_target, val_loader_source, model, criterion, epoch, args) test_acc = validate(val_loader_target_t, model, criterion, epoch, args) test_flag = True # K-means clustering or its variants if ((itern == 0) and args.src_cen_first) or (args.initial_cluster == 2): cen = c_s cen_2 = c_s_2 else: cen = c_t cen_2 = c_t_2 if (itern != 0) and (args.initial_cluster != 0) and (args.cluster_method == 'kernel_kmeans'): cluster_acc, c_t = kernel_k_means(target_features, target_targets, pseudo_labels, train_loader_target, epoch, model, args, best_cluster_acc) cluster_acc_2, c_t_2 = kernel_k_means(target_features_2, target_targets, pseudo_labels, train_loader_target, epoch, model, args, best_cluster_acc_2, change_target=False) elif args.cluster_method != 'spherical_kmeans': cluster_acc, c_t = k_means(target_features, target_targets, train_loader_target, epoch, model, cen, args, best_cluster_acc) cluster_acc_2, c_t_2 = k_means(target_features_2, target_targets, train_loader_target, epoch, model, cen_2, args, best_cluster_acc_2, change_target=False) elif args.cluster_method == 'spherical_kmeans': cluster_acc, c_t = spherical_k_means(target_features, target_targets, train_loader_target, epoch, model, cen, args, best_cluster_acc) cluster_acc_2, c_t_2 = spherical_k_means(target_features_2, target_targets, train_loader_target, epoch, model, cen_2, args, best_cluster_acc_2, change_target=False) # record the best accuracy of K-means clustering log = open(os.path.join(args.log, 'log.txt'), 'a') if cluster_acc != best_cluster_acc: best_cluster_acc = cluster_acc log.write('\n best_cluster acc: %3f' % best_cluster_acc) if cluster_acc_2 != best_cluster_acc_2: best_cluster_acc_2 = cluster_acc_2 log.write('\n best_cluster_2 acc: %3f' % best_cluster_acc_2) log.close() # re-initialize learnable cluster centers if args.init_cen_on_st: cen = (c_t + c_s) / 2# or c_srctar cen_2 = (c_t_2 + c_s_2) / 2# or c_srctar_2 else: cen = c_t cen_2 = c_t_2 #if itern == 0: learn_cen.data = cen.data.clone() learn_cen_2.data = cen_2.data.clone() # select source samples if (itern != 0) and (args.src_soft_select or args.src_hard_select): src_cs = source_select(source_features, source_targets, target_features, pseudo_labels, train_loader_source, epoch, c_t.data.clone(), args) # use source pre-trained model to extract features for first clustering if (itern == 0) and args.src_pretr_first: model.load_state_dict(init_state_dict) if itern != 0: count_itern_each_epoch = 0 epoch += 1 batch_number = count_epoch_on_large_dataset(train_loader_target, train_loader_source, args) train_loader_target_batch = enumerate(train_loader_target) train_loader_source_batch = enumerate(train_loader_source) new_epoch_flag = True del source_features del source_features_2 del source_targets del target_features del target_features_2 del target_targets del pseudo_labels gc.collect() torch.cuda.empty_cache() torch.cuda.empty_cache() elif (args.src.find('visda') != -1) and (itern % int(num_itern_total / 200) == 0): prec1, _, _, _, _, _, _, _, _, _, _, _, _, _ = validate_compute_cen(val_loader_target, val_loader_source, model, criterion, epoch, args, compute_cen=False) test_acc = validate(val_loader_target_t, model, criterion, epoch, args) test_flag = True if test_flag: # record the best prec1 and save checkpoint log = open(os.path.join(args.log, 'log.txt'), 'a') if prec1 > best_prec1: 
                best_prec1 = prec1
                cond_best_test_prec1 = 0
                log.write('\n best val acc till now: %3f' % best_prec1)
            if test_acc > best_test_prec1:
                best_test_prec1 = test_acc
                log.write('\n best test acc till now: %3f' % best_test_prec1)
            is_cond_best = ((prec1 == best_prec1) and (test_acc > cond_best_test_prec1))
            if is_cond_best:
                cond_best_test_prec1 = test_acc
                log.write('\n cond best test acc till now: %3f' % cond_best_test_prec1)
            log.close()
            save_checkpoint({
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'learn_cen': learn_cen,
                'learn_cen_2': learn_cen_2,
                'best_prec1': best_prec1,
                'best_test_prec1': best_test_prec1,
                'cond_best_test_prec1': cond_best_test_prec1,
            }, is_cond_best, args)
            test_flag = False

        # early stop
        if epoch > args.stop_epoch:
            break

        # train for one iteration
        train_loader_source_batch, train_loader_target_batch = train(
            train_loader_source, train_loader_source_batch, train_loader_target,
            train_loader_target_batch, model, learn_cen, learn_cen_2,
            criterion_cons, optimizer, itern, epoch, new_epoch_flag, src_cs, args)
        model = model.cuda()
        new_epoch_flag = False
        count_itern_each_epoch += 1

    log = open(os.path.join(args.log, 'log.txt'), 'a')
    log.write('\n*** best val acc: %3f ***' % best_prec1)
    log.write('\n*** best test acc: %3f ***' % best_test_prec1)
    log.write('\n*** cond best test acc: %3f ***' % cond_best_test_prec1)
    # end time
    log.write('\n-------------------------------------------\n')
    log.write(time.asctime(time.localtime(time.time())))
    log.write('\n-------------------------------------------\n')
    log.close()
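# count_epoch_on_large_dataset() above sizes an "epoch" when iterating two
# loaders of different lengths. Given how it drives the iteration budget, a
# minimal version (an assumption, matching the call's argument order) is:
def count_epoch_on_large_dataset(train_loader_target, train_loader_source, args):
    # one epoch = enough iterations to see the longer of the two datasets once
    return max(len(train_loader_target), len(train_loader_source))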
import trainer

network = trainer.runTraining()
trainer.validate(network)
mode ='encoder', hard_examples = hard_examples) else: train(epoch, model, enc_optimizer, args, use_cuda = use_cuda, mode ='encoder') if args.num_train_dec > 0: for idx in range(args.num_train_dec): if args.hard_example: train_loss, hard_examples = train_hardexample(epoch, model, dec_optimizer, args, use_cuda = use_cuda, mode ='decoder', hard_examples = hard_examples) else: train(epoch, model, dec_optimizer, args, use_cuda = use_cuda, mode ='decoder') this_loss, this_ber = validate(model, general_optimizer, args, use_cuda = use_cuda) report_loss.append(this_loss) report_ber.append(this_ber) if args.print_test_traj == True: print('test loss trajectory', report_loss) print('test ber trajectory', report_ber) print('total epoch', args.num_epoch) ################################################# # Testing Processes ################################################# test(model, args, use_cuda = use_cuda) torch.save(model.state_dict(), './tmp/torch_model_'+identity+'.pt') print('saved model', './tmp/torch_model_'+identity+'.pt')
def main():
    global args, best_prec1
    args = opts()
    # args = parser.parse_args()
    model = resnet(args.arch, args.pretrain, args)
    model = torch.nn.DataParallel(model).cuda()  # define multi-GPU model
    # print(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    # optimizer = torch.optim.SGD(model.parameters(), ...)
    # Apply a different learning rate to the new fc layer than to the
    # pre-trained layers.
    # print(model.module)
    optimizer = torch.optim.SGD(
        [{'params': model.module.conv1.parameters(), 'name': 'pre-trained'},
         {'params': model.module.bn1.parameters(), 'name': 'pre-trained'},
         {'params': model.module.layer1.parameters(), 'name': 'pre-trained'},
         {'params': model.module.layer2.parameters(), 'name': 'pre-trained'},
         {'params': model.module.layer3.parameters(), 'name': 'pre-trained'},
         {'params': model.module.layer4.parameters(), 'name': 'pre-trained'},
         {'params': model.module.fc.parameters(), 'lr': args.lr * 10, 'name': 'new-added'}],
        lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("==> loading checkpoints '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("==> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            raise ValueError('The file to be resumed from does not exist', args.resume)
    else:
        if not os.path.isdir(args.log):
            os.makedirs(args.log)
        log = open(os.path.join(args.log, 'log.txt'), 'w')
        state = {k: v for k, v in args._get_kwargs()}
        log.write(json.dumps(state) + '\n')
        log.close()

    cudnn.benchmark = True
    # process the data and prepare the dataloaders.
    train_loader, val_loader = generate_dataloader(args)

    # test only
    if args.test_only:
        validate(val_loader, model, criterion, -1, args)  # match the signature used in the loop below
        return

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on the val data
        prec1 = validate(val_loader, model, criterion, epoch, args)
        # record the best prec1 and save checkpoint
        is_best = prec1 > best_prec1
        if is_best:
            log = open(os.path.join(args.log, 'log.txt'), 'a')
            log.write(" best result is %3f" % (prec1))
            log.close()
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, epoch, is_best, args)
def main(): start_epoch = 0 max_loss = math.inf epochs_since_improvement = 0 dataset = GaitSequenceDataset(root_dir=data_dir, longest_sequence=85, shortest_sequence=55) train_sampler, validation_sampler = generate_train_validation_samplers( dataset, validation_split=0.2) print('Building dataloaders..') train_dataloader = data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler) validation_dataloader = data.DataLoader(dataset, batch_size=1, sampler=validation_sampler, drop_last=True) model = RNN(num_features, hidden_dimension, num_classes, num_layers=2).to(device) if load_pretrained is True: print('Loading pretrained model..') checkpoint = torch.load(checkpoint_path) start_epoch = checkpoint['epoch'] + 1 epochs_since_improvement = checkpoint['epochs_since_improvement'] model.load_state_dict(checkpoint['model_state_dict']) optimizer = checkpoint['optimizer'] else: print('Creating model..') optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) criterion = nn.CrossEntropyLoss().to(device) if mode == 'train': summary = SummaryWriter() #summary = None model.to(device) print('########### ', model) for epoch in range(start_epoch, start_epoch + num_epochs): if epochs_since_improvement == 20: break if epochs_since_improvement > 0 and epochs_since_improvement % 4 == 0: adjust_learning_rate(optimizer, 0.8) train(model, train_dataloader, optimizer, criterion, clip_gradient, device, epoch, num_epochs, summary, loss_display_interval) current_loss = validate(model, validation_dataloader, criterion, device, epoch, num_epochs, summary, loss_display_interval) is_best = max_loss > current_loss max_loss = min(max_loss, current_loss) if not is_best: epochs_since_improvement += 1 print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, )) else: epochs_since_improvement = 0 save_checkpoint(epoch, epochs_since_improvement, model, optimizer, is_best) print('Current loss : ', current_loss, ' Max loss : ', max_loss) else: print('testing...') model = RNN(num_features, hidden_dimension, num_classes, num_layers=2) checkpoint = torch.load(checkpoint_path) model.load_state_dict(checkpoint['model_state_dict']) model.to(device) print(model) for batch_idx, val_data in enumerate(validation_dataloader): sequence = val_data['sequence'].permute(1, 0, 2).to(device) piano_roll = val_data['piano_roll'].permute(1, 0, 2).squeeze(1).to('cpu') sequence_length = val_data['sequence_length'] file_name = val_data['file_name'] frame = val_data['frame'] leg = val_data['leg'] sonify_sequence(model, sequence, sequence_length) plt.imshow(piano_roll) plt.show() print(file_name, frame, leg) break
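# adjust_learning_rate(optimizer, 0.8) above shrinks the learning rate by a
# constant factor whenever validation has stalled for a few epochs. A minimal
# sketch of such a helper (assumed; the real one is defined elsewhere):
def adjust_learning_rate(optimizer, shrink_factor):
    for param_group in optimizer.param_groups:
        param_group['lr'] *= shrink_factor
    print('New learning rate: {:.6f}'.format(optimizer.param_groups[0]['lr']))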
def main(seed): with timer('load data'): df = pd.read_csv(FOLD_PATH) y1 = (df.EncodedPixels_1 != "-1").astype("float32").values.reshape(-1, 1) y2 = (df.EncodedPixels_2 != "-1").astype("float32").values.reshape(-1, 1) y3 = (df.EncodedPixels_3 != "-1").astype("float32").values.reshape(-1, 1) y4 = (df.EncodedPixels_4 != "-1").astype("float32").values.reshape(-1, 1) y = np.concatenate([y1, y2, y3, y4], axis=1) with timer('preprocessing'): train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID] y_train, y_val = y[df.fold_id != FOLD_ID], y[df.fold_id == FOLD_ID] train_augmentation = Compose([ Flip(p=0.5), OneOf([ GridDistortion(p=0.5), OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5) ], p=0.5), OneOf([ RandomGamma(gamma_limit=(100,140), p=0.5), RandomBrightnessContrast(p=0.5), ], p=0.5), OneOf([ GaussNoise(p=0.5), ], p=0.5), ShiftScaleRotate(rotate_limit=20, p=0.5), ]) val_augmentation = None train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=train_augmentation, crop_rate=1.0, class_y=y_train, gamma=GAMMA) val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=val_augmentation, gamma=GAMMA) train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=8) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) del train_df, val_df, df, train_dataset, val_dataset gc.collect() with timer('create model'): model = smp.Unet('resnet34', encoder_weights="imagenet", classes=N_CLASSES, encoder_se_module=True, decoder_semodule=True, h_columns=False, skip=True, act="swish", freeze_bn=True, classification=CLASSIFICATION, attention_type="cbam", center=True) model = convert_model(model) if base_model is not None: model.load_state_dict(torch.load(base_model)) model.to(device) criterion = torch.nn.BCEWithLogitsLoss() optimizer = torch.optim.SGD( model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0001, nesterov=False, ) scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=0) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) model = torch.nn.DataParallel(model) with timer('train'): train_losses = [] valid_losses = [] best_model_loss = 999 best_model_ep = 0 best_model_score = 0 checkpoint = base_ckpt+1 for epoch in range(84, EPOCHS + 1): seed = seed + epoch seed_torch(seed) LOGGER.info("Starting {} epoch...".format(epoch)) tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device, cutmix_prob=0.0, classification=CLASSIFICATION) train_losses.append(tr_loss) LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5))) valid_loss, val_score = validate(model, val_loader, criterion, device, classification=CLASSIFICATION) valid_losses.append(valid_loss) LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 5))) LOGGER.info('Mean valid score: {}'.format(round(val_score, 5))) scheduler.step() if val_score > best_model_score: torch.save(model.module.state_dict(), 'models/{}_fold{}_ckpt{}_score.pth'.format(EXP_ID, FOLD_ID, checkpoint)) best_model_score = val_score best_model_ep_score = epoch if valid_loss < best_model_loss: torch.save(model.module.state_dict(), 'models/{}_fold{}_ckpt{}.pth'.format(EXP_ID, FOLD_ID, checkpoint)) best_model_loss = valid_loss best_model_ep = epoch if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1: torch.save(model.module.state_dict(), 
'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID)) LOGGER.info('Best valid loss: {} on epoch={}'.format(round(best_model_loss, 5), best_model_ep)) LOGGER.info('Best valid score: {} on epoch={}'.format(round(best_model_score, 5), best_model_ep_score)) checkpoint += 1 best_model_loss = 999 best_model_score = 0 #del val_pred gc.collect() LOGGER.info('Best valid loss: {} on epoch={}'.format(round(best_model_loss, 5), best_model_ep)) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss') plt.plot(xs, valid_losses, label='Val loss') plt.legend() plt.xticks(xs) plt.xlabel('Epochs') plt.savefig("loss.png")
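# seed_torch(seed) is re-invoked with a fresh seed every epoch above. A common
# implementation (assumed here; the helper is defined elsewhere in the repo):
import os
import random
import numpy as np
import torch

def seed_torch(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True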
def main():
    global args, best_score, best_epoch
    # val MAE is minimized below, so the running best must start at +inf
    # (it was initialized to -1, which no MAE could ever beat).
    best_score, best_epoch = float('inf'), -1
    if len(sys.argv) > 1:
        args = parse_args()
        print('----- Experiments parameters -----')
        for k, v in args.__dict__.items():
            print(k, ':', v)
    else:
        print('Please provide some parameters for the current experiment. Check out args.py for more info!')
        sys.exit()

    # init random seeds
    utils.setup_env(args)

    # init tensorboard summary if asked
    tb_writer = SummaryWriter(f'{args.data_dir}/runs/{args.name}/tensorboard') if args.tensorboard else None

    # init data loaders
    loader = get_loader(args)
    train_loader = torch.utils.data.DataLoader(loader(path_to_data=args.data_dir, mode='TRAIN'),
                                               batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(loader(path_to_data=args.data_dir, mode='VAL'),
                                             batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.workers, pin_memory=True)
    exp_logger, lr = None, None
    model = get_model(args)
    criterion = losses.get_criterion(args)

    # optionally resume from a checkpoint
    if args.resume:
        model, exp_logger, args.start_epoch, best_score, best_epoch, lr = load_checkpoint(args, model)
        args.lr = lr
    else:
        # create all output folders
        utils.init_output_env(args)
    if exp_logger is None:
        exp_logger = init_logger(args, model)

    optimizer, scheduler = optimizers.get_optimizer(args, model)
    print(' + Number of params: {}'.format(utils.count_params(model)))
    model.to(args.device)
    criterion.to(args.device)

    if args.test:
        test_loader = torch.utils.data.DataLoader(loader(path_to_data=args.data_dir, mode='TEST'),
                                                  batch_size=args.batch_size, shuffle=False,
                                                  num_workers=args.workers, pin_memory=True)
        trainer.test(args, test_loader, model, criterion, args.start_epoch,
                     eval_score=metrics.accuracy_regression, output_dir=args.out_pred_dir, has_gt=True)
        sys.exit()

    is_best = True
    for epoch in range(args.start_epoch, args.epochs + 1):
        print('Current epoch: ', epoch)
        trainer.train(args, train_loader, model, criterion, optimizer, exp_logger, epoch,
                      eval_score=metrics.accuracy_regression, tb_writer=tb_writer)
        # evaluate on validation set
        val_mae, val_squared_mse, val_loss = trainer.validate(args, val_loader, model, criterion,
                                                              exp_logger, epoch,
                                                              eval_score=metrics.accuracy_regression,
                                                              tb_writer=tb_writer)
        # update learning rate
        if scheduler is None:
            trainer.adjust_learning_rate(args, optimizer, epoch)
        else:
            prev_lr = optimizer.param_groups[0]['lr']
            if 'ReduceLROnPlateau' == args.scheduler:
                scheduler.step(val_loss)
            else:
                scheduler.step()
            print(f"Updating learning rate from {prev_lr} to {optimizer.param_groups[0]['lr']}")

        # remember the best (lowest) MAE and save checkpoint
        is_best = val_mae < best_score
        best_score = min(val_mae, best_score)
        if is_best:
            best_epoch = epoch
        save_checkpoint(args, {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_score': best_score,
            'best_epoch': best_epoch,
            'exp_logger': exp_logger,
        }, is_best)
        # write plots to disk
        generate_plots(args, exp_logger, is_best=is_best)
        # generate html report
        logger.export_logs(args, epoch, best_epoch)

    if args.tensorboard:
        tb_writer.close()
    print("That's all folks!")
def main(seed): with timer('load data'): df = pd.read_csv(FOLD_PATH) soft_df = pd.read_csv(SOFT_PATH) df = df.append(pd.read_csv(PSEUDO_PATH)).reset_index(drop=True) soft_df = soft_df.append( pd.read_csv(PSEUDO_PATH)).reset_index(drop=True) soft_df = df[[ID_COLUMNS]].merge(soft_df, how="left", on=ID_COLUMNS) LOGGER.info(df.head()) LOGGER.info(soft_df.head()) for c in [ "EncodedPixels_1", "EncodedPixels_2", "EncodedPixels_3", "EncodedPixels_4" ]: df[c] = df[c].astype(str) soft_df[c] = soft_df[c].astype(str) df["fold_id"] = df["fold_id"].fillna(FOLD_ID + 1) y = (df.sum_target != 0).astype("float32").values y += (soft_df.sum_target != 0).astype("float32").values y = y / 2 with timer('preprocessing'): train_df, val_df = df[df.fold_id != FOLD_ID], df[df.fold_id == FOLD_ID] train_soft_df, val_soft_df = soft_df[df.fold_id != FOLD_ID], soft_df[ df.fold_id == FOLD_ID] y_train, y_val = y[df.fold_id != FOLD_ID], y[df.fold_id == FOLD_ID] train_augmentation = Compose([ Flip(p=0.5), OneOf([ GridDistortion(p=0.5), OpticalDistortion(p=0.5, distort_limit=2, shift_limit=0.5) ], p=0.5), OneOf([ RandomGamma(gamma_limit=(100, 140), p=0.5), RandomBrightnessContrast(p=0.5), ], p=0.5), OneOf([ GaussNoise(p=0.5), ], p=0.5), ShiftScaleRotate(rotate_limit=20, p=0.5), ]) val_augmentation = None train_dataset = SeverDataset(train_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=train_augmentation, crop_rate=1.0, class_y=y_train, soft_df=train_soft_df) val_dataset = SeverDataset(val_df, IMG_DIR, IMG_SIZE, N_CLASSES, id_colname=ID_COLUMNS, transforms=val_augmentation, soft_df=val_soft_df) train_sampler = MaskProbSampler(train_df, demand_non_empty_proba=0.6) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, num_workers=8) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8) del train_df, val_df, df, train_dataset, val_dataset gc.collect() with timer('create model'): model = smp_old.Unet('resnet34', encoder_weights="imagenet", classes=N_CLASSES, encoder_se_module=True, decoder_semodule=True, h_columns=False, skip=True, act="swish", freeze_bn=True, classification=CLASSIFICATION) model = convert_model(model) if base_model is not None: model.load_state_dict(torch.load(base_model)) model.to(device) criterion = torch.nn.BCEWithLogitsLoss() optimizer = torch.optim.Adam([ { 'params': model.decoder.parameters(), 'lr': 3e-3 }, { 'params': model.encoder.parameters(), 'lr': 3e-4 }, ]) if base_model is None: scheduler_cosine = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5) scheduler = GradualWarmupScheduler( optimizer, multiplier=1.1, total_epoch=CLR_CYCLE * 2, after_scheduler=scheduler_cosine) else: scheduler = CosineAnnealingLR(optimizer, T_max=CLR_CYCLE, eta_min=3e-5) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) model = torch.nn.DataParallel(model) with timer('train'): train_losses = [] valid_losses = [] best_model_loss = 999 best_model_ep = 0 checkpoint = base_ckpt + 1 for epoch in range(1, EPOCHS + 1): seed = seed + epoch seed_torch(seed) LOGGER.info("Starting {} epoch...".format(epoch)) tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device, cutmix_prob=0.0, classification=CLASSIFICATION) train_losses.append(tr_loss) LOGGER.info('Mean train loss: {}'.format(round(tr_loss, 5))) valid_loss, val_score = validate(model, val_loader, criterion, device, classification=CLASSIFICATION) valid_losses.append(valid_loss) LOGGER.info('Mean valid loss: {}'.format(round(valid_loss, 
5))) LOGGER.info('Mean valid score: {}'.format(round(val_score, 5))) scheduler.step() if valid_loss < best_model_loss: torch.save( model.module.state_dict(), 'models/{}_fold{}_ckpt{}.pth'.format( EXP_ID, FOLD_ID, checkpoint)) best_model_loss = valid_loss best_model_ep = epoch #np.save("val_pred.npy", val_pred) if epoch % (CLR_CYCLE * 2) == CLR_CYCLE * 2 - 1: torch.save( model.module.state_dict(), 'models/{}_fold{}_latest.pth'.format(EXP_ID, FOLD_ID)) LOGGER.info('Best valid loss: {} on epoch={}'.format( round(best_model_loss, 5), best_model_ep)) checkpoint += 1 best_model_loss = 999 #del val_pred gc.collect() LOGGER.info('Best valid loss: {} on epoch={}'.format( round(best_model_loss, 5), best_model_ep)) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss') plt.plot(xs, valid_losses, label='Val loss') plt.legend() plt.xticks(xs) plt.xlabel('Epochs') plt.savefig("loss.png")
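# Every main() in these scripts wraps its phases in `with timer(...)`, but the
# helper itself is never shown. A minimal context manager matching that usage
# (logging through the same LOGGER the scripts already use) could be:
import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    LOGGER.info('[{}] done in {:.1f} s'.format(name, time.time() - t0))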
def main(): global args, best_score, best_epoch best_score, best_epoch = -1, -1 if len(sys.argv) > 1: args = parse_args() print('----- Experiments parameters -----') for k, v in args.__dict__.items(): print(k, ':', v) else: print('Please provide some parameters for the current experiment. Check-out args.py for more info!') sys.exit() # init random seeds utils.setup_env(args) # init tensorboard summary is asked tb_writer = SummaryWriter(f'{args.data_dir}/runs/{args.name}/tensorboard') if args.tensorboard else None # init data loaders loader = get_loader(args) train_loader = torch.utils.data.DataLoader(loader(data_dir=args.data_dir, split='train', min_size=args.min_size_train, max_size=args.max_size_train, dataset_size=args.dataset_size_train), batch_size=args.batch_size, shuffle=True, num_workers=args.workers, collate_fn=lambda x: x, pin_memory=True) val_loader = torch.utils.data.DataLoader(loader(data_dir=args.data_dir, split='val', min_size=args.min_size_val, max_size=args.max_size_val, dataset_size=args.dataset_size_val), batch_size=1, shuffle=False, num_workers=args.workers, collate_fn=lambda x: x, pin_memory=True) exp_logger, lr = None, None model = get_model(args) criterion = losses.get_criterion(args) # optionally resume from a checkpoint if args.resume: model, exp_logger, args.start_epoch, best_score, best_epoch, lr = load_checkpoint(args, model) args.lr = lr else: # create all output folders utils.init_output_env(args) if exp_logger is None: exp_logger = init_logger(args, model) optimizer, scheduler = optimizers.get_optimizer(args, model) print(' + Number of params: {}'.format(utils.count_params(model))) model.to(args.device) criterion.to(args.device) if args.test: test_loader = torch.utils.data.DataLoader(loader(data_dir=args.data_dir, split='test', min_size=args.min_size_val, max_size=args.max_size_val, dataset_size=args.dataset_size_val), batch_size=args.batch_size, shuffle=False, num_workers=args.workers, collate_fn=lambda x: x, pin_memory=True) trainer.test(args, test_loader, model, criterion, args.start_epoch, eval_score=metrics.get_score(args.test_type), output_dir=args.out_pred_dir, has_gt=True, print_freq=args.print_freq_val) sys.exit() is_best = True for epoch in range(args.start_epoch, args.epochs + 1): print('Current epoch:', epoch) trainer.train(args, train_loader, model, criterion, optimizer, exp_logger, epoch, eval_score=metrics.get_score(args.train_type), print_freq=args.print_freq_train, tb_writer=tb_writer) # evaluate on validation set mAP, val_loss = trainer.validate(args, val_loader, model, criterion, exp_logger, epoch, eval_score=metrics.get_score(args.val_type), print_freq=args.print_freq_val, tb_writer=tb_writer) # Update learning rate if scheduler is None: trainer.adjust_learning_rate(args, optimizer, epoch) else: prev_lr = optimizer.param_groups[0]['lr'] if 'ReduceLROnPlateau' == args.scheduler: scheduler.step(val_loss) else: scheduler.step() print(f"Updating learning rate from {prev_lr} to {optimizer.param_groups[0]['lr']}") # remember best acc and save checkpoint is_best = mAP > best_score best_score = max(mAP, best_score) if True == is_best: best_epoch = epoch save_checkpoint(args, { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_score': best_score, 'best_epoch': best_epoch, 'exp_logger': exp_logger, }, is_best) if args.tensorboard: tb_writer.close() print(" ***** Processes all done. *****")
def main(): global args, best_prec1 args = opts() # ipdb.set_trace() # args = parser.parse_args() model_source, model_target = resnet(args) # define-multi GPU model_source = torch.nn.DataParallel(model_source).cuda() model_target = torch.nn.DataParallel(model_target).cuda() print('the memory id should be same for the shared feature extractor:') print(id(model_source.module.resnet_conv)) # the memory is shared here print(id(model_target.module.resnet_conv)) print('the memory id should be different for the different classifiers:') print(id(model_source.module.fc)) # the memory id shared here. print(id(model_target.module.fc)) # define loss function(criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() np.random.seed(1) ### fix the random data. random.seed(1) # optimizer = torch.optim.SGD(model.parameters(), # To apply different learning rate to different layer if args.meta_sgd: meta_train_lr = [] for param in model_target.parameters(): meta_train_lr.append( torch.FloatTensor(param.data.size()).fill_( args.meta_train_lr).cuda()) if args.pretrained: print('the pretrained setting of optimizer') if args.auxiliary_dataset == 'imagenet': optimizer = torch.optim.SGD([ { 'params': model_source.module.resnet_conv.parameters(), 'name': 'pre-trained' }, { 'params': model_source.module.fc.parameters(), 'name': 'pre-trained' }, { 'params': model_target.module.fc.parameters(), 'name': 'new-added' }, ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) elif args.auxiliary_dataset == 'l_bird': optimizer = torch.optim.SGD([ { 'params': model_source.module.resnet_conv.parameters(), 'name': 'pre-trained' }, { 'params': model_source.module.fc.parameters(), 'name': 'pre-trained' }, { 'params': model_target.module.fc.parameters(), 'name': 'new-added' }, ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) else: print('the from scratch setting of optimizer') optimizer = torch.optim.SGD([ { 'params': model_source.module.resnet_conv.parameters(), 'name': 'new-added' }, { 'params': model_source.module.fc.parameters(), 'name': 'new-added' }, { 'params': model_target.module.fc.parameters(), 'name': 'new-added' }, ], lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) #optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): # raise ValueError('the resume function is not finished') print("==> loading checkpoints '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] if args.meta_sgd: meta_train_lr = checkpoint['meta_train_lr'] best_prec1 = checkpoint['best_prec1'] model_source.load_state_dict(checkpoint['source_state_dict']) model_target.load_state_dict(checkpoint['target_state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("==> loaded checkpoint '{}'(epoch {})".format( args.resume, checkpoint['epoch'])) else: raise ValueError('The file to be resumed from is not exited', args.resume) if not os.path.isdir(args.log): os.makedirs(args.log) log = open(os.path.join(args.log, 'log.txt'), 'w') state = {k: v for k, v in args._get_kwargs()} log.write(json.dumps(state) + '\n') log.close() cudnn.benchmark = True # process the data and prepare the dataloaders. 
dataloader_returned = generate_dataloader(args) dataloader_number_returned = len(dataloader_returned) print('the number of dataloader number returned is: ', dataloader_number_returned) if dataloader_number_returned != 2: train_loader_source, val_loader_source, train_loader_target, val_loader_target = dataloader_returned else: train_loader_target, val_loader_target = dataloader_returned train_loader_source = None # train_loader, val_loader = generate_dataloader(args) # test only if args.test_only: if dataloader_number_returned == 2: validate(None, val_loader_target, model_source, model_target, criterion, 0, args) else: validate(val_loader_source, val_loader_target, model_source, model_target, criterion, 0, args) # if args.auxiliary_dataset == 'imagenet': # validate(val_loader_source, val_loader_target, model_source, model_target, criterion, 0, args) # else: # validate(None, val_loader_target, model_source, model_target, criterion, 0, args) return print('begin training') if train_loader_source: train_loader_source_batch = enumerate(train_loader_source) else: train_loader_source_batch = None train_loader_target_batch = enumerate(train_loader_target) for epoch in range(args.start_epoch, args.epochs): # train for one epoch if args.meta_sgd: train_loader_source_batch, train_loader_target_batch, meta_train_lr = train( train_loader_source, train_loader_source_batch, train_loader_target, train_loader_target_batch, model_source, model_target, criterion, optimizer, epoch, args, meta_train_lr) else: train_loader_source_batch, train_loader_target_batch = train( train_loader_source, train_loader_source_batch, train_loader_target, train_loader_target_batch, model_source, model_target, criterion, optimizer, epoch, args, None) # train(train_loader, model, criterion, optimizer, epoch, args) # evaluate on the val data if (epoch + 1) % args.test_freq == 0 or (epoch + 1) % args.epochs == 0: if dataloader_number_returned == 2: prec1 = validate(None, val_loader_target, model_source, model_target, criterion, epoch, args) else: prec1 = validate(val_loader_source, val_loader_target, model_source, model_target, criterion, epoch, args) # prec1 = 1 # record the best prec1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if is_best: log = open(os.path.join(args.log, 'log.txt'), 'a') log.write(' \nTarget_T1 acc: %3f' % (best_prec1)) log.close() if args.meta_sgd: save_checkpoint( { 'epoch': epoch + 1, 'meta_train_lr': meta_train_lr, 'arch': args.arch, 'source_state_dict': model_source.state_dict(), 'target_state_dict': model_target.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, args, epoch) else: save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'source_state_dict': model_source.state_dict(), 'target_state_dict': model_target.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, args, epoch + 1)
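# meta_train_lr above holds one learnable learning-rate tensor per target
# parameter, in the spirit of Meta-SGD (Li et al., 2017). The inner-loop update
# that train() presumably performs with it is, schematically (an assumption,
# not this repo's actual code):
import torch

@torch.no_grad()
def meta_sgd_step(params, grads, meta_train_lr):
    # params, grads, meta_train_lr: aligned lists of equally shaped tensors
    return [p - lr * g for p, g, lr in zip(params, grads, meta_train_lr)]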