def main():
    parser = U.get_argparser()
    args = parser.parse_args()

    U.set_manual_seed(args.seed)
    train_session_args = vars(args)
    train_session = U.get_random_name()
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model}_{args.prepare}_{args.augmentation}_{train_session}'

    if args.fold is not None:
        prefix += f'_fold_{args.stratify}_{args.fold}'

    log_dir = os.path.join('runs', prefix)
    exp_dir = os.path.join('experiments', args.model, args.prepare, args.augmentation, prefix)
    os.makedirs(exp_dir, exist_ok=True)

    train_ids = D.all_train_ids()
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    if args.fix_masks:
        masks, changed_ids = D.fix_masks(masks, train_ids)
        with open(os.path.join(exp_dir, 'fixed_masks.txt'), 'w') as f:
            for sample_id in changed_ids:
                f.write(sample_id)
                f.write('\n')
        print(f'Fixed {len(changed_ids)} masks')

    if args.fold is not None:
        train_indexes, test_indexes = D.get_train_test_split_for_fold(
            args.stratify, args.fold, train_ids)
    else:
        train_indexes, test_indexes = train_test_split(
            np.arange(len(train_ids)),
            shuffle=False,
            random_state=args.split_seed,
            test_size=0.2)

    ids_train, ids_test = train_ids[train_indexes], train_ids[test_indexes]
    img_train, img_test = images[train_indexes], images[test_indexes]
    mask_train, mask_test = masks[train_indexes], masks[test_indexes]
    depth_train, depth_test = depths[train_indexes], depths[test_indexes]

    # Here we can exclude some images from training, but keep them in validation
    train_mask = D.drop_some(img_train, mask_train,
                             drop_black=True,
                             drop_vstrips=args.drop_vstrips,
                             drop_few=args.drop_few)
    ids_train = ids_train[train_mask]
    img_train = img_train[train_mask]
    mask_train = mask_train[train_mask]
    depth_train = depth_train[train_mask]

    if not is_sorted(ids_train):
        raise RuntimeError("ids_train is not sorted")
    if not is_sorted(ids_test):
        raise RuntimeError("ids_test is not sorted")

    prepare_fn = D.get_prepare_fn(args.prepare, **train_session_args)
    # This line is only valid if we apply prepare_fn first and then do augmentation
    target_size = prepare_fn.target_size if prepare_fn is not None else D.ORIGINAL_SIZE
    # target_size = D.ORIGINAL_SIZE

    build_augmentation_fn = D.AUGMENTATION_MODES[args.augmentation]
    aug = build_augmentation_fn(target_size, border_mode=args.border_mode)

    train_transform_list = []
    valid_transform_list = []
    if prepare_fn is not None:
        train_transform_list.append(prepare_fn.t_forward)
        valid_transform_list.append(prepare_fn.t_forward)
    train_transform_list.append(aug)

    trainset = D.ImageAndMaskDataset(ids_train, img_train, mask_train, depth_train,
                                     augment=A.Compose(train_transform_list))
    validset = D.ImageAndMaskDataset(ids_test, img_test, mask_test, depth_test,
                                     augment=A.Compose(valid_transform_list))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True,
                             shuffle=True)
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             drop_last=False,
                             shuffle=False)

    # Save train/val split for future use
    train_session_args.update({
        'train_set': list(ids_train),
        'valid_set': list(ids_test)
    })

    # Declare variables we will use during training
    start_epoch = 0
    train_history = pd.DataFrame()
    target_metric = args.target_metric
    target_metric_mode = 'max'
    best_metric_val = 0
    best_lb_checkpoint = os.path.join(exp_dir, f'{prefix}_{target_metric}.pth')

    model = U.get_model(args.model,
                        num_classes=args.num_classes,
                        num_channels=trainset.channels(),
                        abn=args.abn,
                        use_dropout=not args.no_dropout,
                        pretrained=not args.no_pretrain).cuda()

    print('Train set size :', len(ids_train), 'batch size', trainloader.batch_size)
    print('Valid set size :', len(ids_test), 'batch size', validloader.batch_size)
    print('Tile transform :', prepare_fn if prepare_fn is not None else "None")
    print('Model :', args.model, count_parameters(model))
    print('Augmentations :', args.augmentation, args.border_mode)
    print('Input channels :', trainset.channels())
    print('Output classes :', args.num_classes)
    print('Optimizer :', args.optimizer, 'wd', args.weight_decay)
    print('Use of dropout :', not args.no_dropout)
    print('Train session :', train_session)
    print('Freeze encoder :', args.freeze_encoder)
    print('Seed :', args.seed, args.split_seed)
    print('Restart every :', args.restart_every)
    print('Fold :', args.fold, args.stratify)
    print('Fine-tune :', args.fine_tune)
    print('ABN Mode :', args.abn)
    print('Fix masks :', args.fix_masks)

    if args.resume:
        fname = U.auto_file(args.resume)
        start_epoch, train_history, best_score = U.restore_checkpoint(fname, model)
        print(train_history)
        print('Resuming training from epoch', start_epoch, 'and score', best_score, args.resume)

    if args.fine_tune and args.freeze_encoder > 0:
        raise ValueError('Incompatible options --fine-tune and --freeze-encoder')

    writer = SummaryWriter(log_dir)
    writer.add_text('train/params', '```' + json.dumps(train_session_args, indent=2) + '```', 0)

    config_fname = os.path.join(exp_dir, f'{train_session}.json')
    with open(config_fname, 'w') as f:
        f.write(json.dumps(train_session_args, indent=2))

    weights = {
        'mask': 1.0,
        'class': 0.05,
        'dsv': 0.1,
    }

    bce = U.get_loss('bce')
    bce_lovasz = U.get_loss('bce_lovasz')
    bce_jaccard = U.get_loss('bce_jaccard')

    losses = {
        'warmup': {
            'mask': bce,
            'class': bce,
            'dsv': bce,
        },
        'main': {
            'mask': bce_jaccard,
            'class': bce,
            'dsv': bce,
        },
        'annealing': {
            'mask': bce_lovasz,
            'class': bce,
            'dsv': bce,
        }
    }

    epochs = {'warmup': 50, 'main': 250, 'annealing': 50}
    if args.fast:
        for key in epochs.keys():
            epochs[key] = 1

    learning_rates = {
        'warmup': args.learning_rate,
        'main': 1e-3,
        'annealing': 1e-2
    }

    # Warmup phase
    if epochs['warmup']:
        print(torch.cuda.max_memory_allocated(), torch.cuda.max_memory_cached())
        trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = U.get_optimizer(args.optimizer, trainable_parameters,
                                    learning_rates['warmup'],
                                    weight_decay=args.weight_decay)
        scheduler = None  # StepLR(optimizer, gamma=0.5, step_size=50)

        train_history, best_metric_val, start_epoch = train(
            model, losses['warmup'], weights, optimizer, scheduler,
            trainloader, validloader, writer, start_epoch,
            epochs=epochs['warmup'],
            early_stopping=args.early_stopping,
            train_history=train_history,
            experiment_dir=exp_dir,
            target_metric=target_metric,
            best_metric_val=best_metric_val,
            target_metric_mode=target_metric_mode,
            checkpoint_filename=best_lb_checkpoint)

        U.save_checkpoint(os.path.join(exp_dir, f'{prefix}_warmup.pth'),
                          model, start_epoch, train_history,
                          metric_name=target_metric,
                          metric_score=best_metric_val)

        del trainable_parameters, optimizer, scheduler
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print('Finished warmup phase. Starting main train loop.')

    # Main training phase
    print(torch.cuda.max_memory_allocated(), torch.cuda.max_memory_cached())
    trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = U.get_optimizer(args.optimizer, trainable_parameters,
                                learning_rates['main'],
                                weight_decay=args.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', patience=50, factor=0.5, min_lr=1e-5)

    train_history, best_metric_val, start_epoch = train(
        model, losses['main'], weights, optimizer, scheduler,
        trainloader, validloader, writer, start_epoch,
        epochs=epochs['main'],
        early_stopping=args.early_stopping,
        train_history=train_history,
        experiment_dir=exp_dir,
        target_metric=target_metric,
        best_metric_val=best_metric_val,
        target_metric_mode=target_metric_mode,
        checkpoint_filename=best_lb_checkpoint)

    del trainable_parameters, optimizer, scheduler
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    snapshots = [best_lb_checkpoint]
    U.save_checkpoint(os.path.join(exp_dir, f'{prefix}_main.pth'),
                      model, start_epoch, train_history,
                      metric_name=target_metric,
                      metric_score=best_metric_val)
    print('Finished train phase.')

    # Cosine annealing with per-cycle snapshots
    if epochs['annealing']:
        for snapshot in range(5):
            print(f'Starting annealing phase {snapshot}')
            print(torch.cuda.max_memory_allocated(), torch.cuda.max_memory_cached())

            # model.set_fine_tune(True)
            trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())
            optimizer = U.get_optimizer('sgd', trainable_parameters,
                                        learning_rates['annealing'],
                                        weight_decay=args.weight_decay)
            scheduler = CosineAnnealingLR(optimizer, epochs['annealing'], eta_min=1e-7)

            snapshot_name = os.path.join(
                exp_dir, f'{prefix}_{target_metric}_snapshot_{snapshot}.pth')
            snapshots.append(snapshot_name)

            train_history, best_metric_val, start_epoch = train(
                model, losses['annealing'], weights, optimizer, scheduler,
                trainloader, validloader, writer, start_epoch,
                epochs=epochs['annealing'],
                early_stopping=args.early_stopping,
                train_history=train_history,
                experiment_dir=exp_dir,
                target_metric=target_metric,
                best_metric_val=0,
                target_metric_mode=target_metric_mode,
                checkpoint_filename=snapshot_name)

            del trainable_parameters, optimizer, scheduler
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    print('Training finished')
    train_history.to_csv(os.path.join(exp_dir, 'train_history.csv'), index=False)

    for snapshot_file in snapshots:
        generate_model_submission(snapshot_file, config_fname, mine_on_val=True)
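# The sanity checks above guard the fold split with an `is_sorted` helper that
# is defined elsewhere in the repo. A minimal sketch of such a helper for the
# numpy id arrays used here (hypothetical; the repo's version may differ):
def is_sorted(ids):
    """True if the array of sample ids is in non-decreasing order."""
    return bool(np.all(ids[:-1] <= ids[1:]))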
def main():
    parser = U.get_argparser()
    args = parser.parse_args()

    U.set_manual_seed(args.seed)
    train_session_args = vars(args)
    train_session = U.get_random_name()
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model}_{args.prepare}_{args.augmentation}_{train_session}'

    if args.fold is not None:
        prefix += f'_fold_{args.stratify}_{args.fold}'

    log_dir = os.path.join('runs', prefix)
    exp_dir = os.path.join('experiments', args.model, args.prepare, args.augmentation, prefix)
    os.makedirs(exp_dir, exist_ok=True)

    train_ids = D.get_train_ids(drop_black=True,
                                drop_vstrips=args.drop_vstrips,
                                drop_empty=args.drop_empty,
                                drop_few=args.drop_few,
                                fast=args.fast)
    depths = D.read_depths(train_ids)
    images = D.read_train_images(train_ids)
    masks = D.read_train_masks(train_ids)

    if args.fix_masks:
        masks, changed_ids = D.fix_masks(masks, train_ids)
        with open(os.path.join(exp_dir, 'fixed_masks.txt'), 'w') as f:
            for sample_id in changed_ids:
                f.write(sample_id)
                f.write('\n')
        print(f'Fixed {len(changed_ids)} masks')

    if args.fold is not None:
        train_indexes, test_indexes = D.get_train_test_split_for_fold(
            args.stratify, args.fold, train_ids)
    else:
        train_indexes, test_indexes = train_test_split(
            np.arange(len(train_ids)),
            shuffle=False,
            random_state=args.split_seed,
            test_size=0.2)

    ids_train, ids_test = train_ids[train_indexes], train_ids[test_indexes]

    if not is_sorted(ids_train):
        raise RuntimeError("ids_train is not sorted")
    if not is_sorted(ids_test):
        raise RuntimeError("ids_test is not sorted")

    img_train, img_test = images[train_indexes], images[test_indexes]
    mask_train, mask_test = masks[train_indexes], masks[test_indexes]
    depth_train, depth_test = depths[train_indexes], depths[test_indexes]

    prepare_fn = D.get_prepare_fn(args.prepare, **train_session_args)
    # This line is only valid if we apply prepare_fn first and then do augmentation
    target_size = prepare_fn.target_size if prepare_fn is not None else D.ORIGINAL_SIZE
    # target_size = D.ORIGINAL_SIZE

    build_augmentation_fn = D.AUGMENTATION_MODES[args.augmentation]
    aug = build_augmentation_fn(target_size, border_mode=args.border_mode)

    train_transform_list = []
    valid_transform_list = []
    if prepare_fn is not None:
        train_transform_list.append(prepare_fn.t_forward)
        valid_transform_list.append(prepare_fn.t_forward)
    train_transform_list.append(aug)

    trainset = D.ImageAndMaskDataset(ids_train, img_train, mask_train, depth_train,
                                     augment=A.Compose(train_transform_list))
    validset = D.ImageAndMaskDataset(ids_test, img_test, mask_test, depth_test,
                                     augment=A.Compose(valid_transform_list))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True,
                             shuffle=True)
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             pin_memory=True,
                             drop_last=False,
                             shuffle=False)

    # Save train/val split for future use
    train_session_args.update({
        'train_set': list(ids_train),
        'valid_set': list(ids_test)
    })

    # Declare variables we will use during training
    start_epoch = 0
    train_history = pd.DataFrame()
    scheduler = None
    optimizer = None
    target_metric = args.target_metric
    target_metric_mode = 'max'
    best_metric_val = 0
    best_lb_checkpoint = os.path.join(exp_dir, f'{prefix}_{target_metric}.pth')

    model = U.get_model(args.model,
                        num_classes=args.num_classes,
                        num_channels=trainset.channels(),
                        abn=args.abn,
                        use_dropout=not args.no_dropout,
                        pretrained=not args.no_pretrain).cuda()

    print('Train set size :', len(trainset), 'batch size', trainloader.batch_size)
    print('Valid set size :', len(validset), 'batch size', validloader.batch_size)
    print('Tile transform :', prepare_fn if prepare_fn is not None else "None")
    print('Model :', args.model, count_parameters(model))
    print('Augmentations :', args.augmentation, args.border_mode)
    print('Input channels :', trainset.channels())
    print('Output classes :', args.num_classes)
    print('Criterion :', args.loss)
    print('Optimizer :', args.optimizer, args.learning_rate, args.weight_decay)
    print('Use of dropout :', not args.no_dropout)
    print('Train session :', train_session)
    print('Freeze encoder :', args.freeze_encoder)
    print('Seed :', args.seed, args.split_seed)
    print('Restart every :', args.restart_every)
    print('Fold :', args.fold, args.stratify)
    print('Fine-tune :', args.fine_tune)
    print('ABN Mode :', args.abn)
    print('Fix masks :', args.fix_masks)

    if args.resume:
        fname = U.auto_file(args.resume)
        start_epoch, train_history, best_score = U.restore_checkpoint(fname, model)
        print(train_history)
        print('Resuming training from epoch', start_epoch, 'and score', best_score, args.resume)

    segmentation_loss = U.get_loss(args.loss)

    if args.fine_tune and args.freeze_encoder > 0:
        raise ValueError('Incompatible options --fine-tune and --freeze-encoder')

    writer = SummaryWriter(log_dir)
    writer.add_text('train/params', '```' + json.dumps(train_session_args, indent=2) + '```', 0)

    config_fname = os.path.join(exp_dir, f'{train_session}.json')
    with open(config_fname, 'w') as f:
        f.write(json.dumps(train_session_args, indent=2))

    # Start training loop
    no_improvement_epochs = 0
    for epoch in range(start_epoch, start_epoch + args.epochs):
        # On epoch begin
        if U.should_quit(exp_dir) or (args.early_stopping is not None
                                      and no_improvement_epochs > args.early_stopping):
            break

        epochs_trained = epoch - start_epoch
        should_restart_optimizer = (args.restart_every > 0
                                    and epochs_trained % args.restart_every == 0) \
                                   or (epochs_trained == args.freeze_encoder) \
                                   or optimizer is None

        if should_restart_optimizer:
            del optimizer
            if args.fine_tune:
                model.set_fine_tune(args.fine_tune)
            else:
                model.set_encoder_training_enabled(epochs_trained >= args.freeze_encoder)

            trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())
            optimizer = U.get_optimizer(args.optimizer, trainable_parameters,
                                        args.learning_rate,
                                        weight_decay=args.weight_decay)
            print('Restarting optimizer state', epoch, count_parameters(model))

            if args.lr_scheduler:
                scheduler = U.get_lr_scheduler(args.lr_scheduler, optimizer, args.epochs)

        if scheduler is not None and not isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(epochs_trained)

        U.log_learning_rate(writer, optimizer, epoch)

        # Epoch
        train_metrics = process_epoch(model, segmentation_loss, optimizer, trainloader,
                                      epoch, True, writer,
                                      mask_postprocess=prepare_fn.backward)
        valid_metrics = process_epoch(model, segmentation_loss, None, validloader,
                                      epoch, False, writer,
                                      mask_postprocess=prepare_fn.backward)

        all_metrics = {}
        all_metrics.update(train_metrics)
        all_metrics.update(valid_metrics)

        # On epoch end
        summary = {
            'epoch': [int(epoch)],
            'lr': [float(optimizer.param_groups[0]['lr'])]
        }
        for k, v in all_metrics.items():
            summary[k] = [v]

        train_history = train_history.append(pd.DataFrame.from_dict(summary), ignore_index=True)
        print(epoch, summary)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(all_metrics[target_metric], epochs_trained)

        if U.is_better(all_metrics[target_metric], best_metric_val, target_metric_mode):
            best_metric_val = all_metrics[target_metric]
            U.save_checkpoint(best_lb_checkpoint, model, epoch, train_history,
                              metric_name=target_metric,
                              metric_score=best_metric_val)
            print('Checkpoint saved', epoch, best_metric_val, best_lb_checkpoint)
            no_improvement_epochs = 0
        else:
            no_improvement_epochs += 1

    print('Training finished')
    generate_model_submission(best_lb_checkpoint, config_fname, mine_on_val=True)
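# Checkpointing above goes through U.is_better with a metric mode of 'max'.
# A minimal sketch of such a comparator, assuming only 'max' and 'min' modes
# are supported (hypothetical; U's real implementation may differ):
def is_better(current, best, mode):
    """True if `current` improves on `best` under the given metric mode."""
    return current > best if mode == 'max' else current < best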
net_em = models.environment_model.EnvironmentModel(obs_shape, act_n, config)
# net_em.load_state_dict(torch.load(config.EM_FILE_NAME, map_location=lambda storage, loc: storage))
net_em = net_em.to(device)
config.EM_NET = str(net_em)

net_i2a = i2a_model_no_LSTM.I2A_FC(obs_shape, act_n, net_em, net_policy, config).to(device)
config.I2A_NET = str(net_i2a)
config.ROLLOUT_ENCODER = str(net_i2a.encoder)
# net_i2a.load_state_dict(torch.load("saves/03_i2a_test/best_pong_-018.667_1300.dat", map_location=lambda storage, loc: storage))

# print(net_policy)
# print(net_em)
print(net_i2a)
print("em param count: ", common.count_parameters(net_em))
print("net_policy param count: ", common.count_parameters(net_policy))
print("ia policy param count: ", common.count_parameters(net_i2a))

# Sanity-check a forward pass through the I2A net on a single observation
obs = envs[0].reset()
obs_v = ptan.agent.default_states_preprocessor([obs]).to(device)
res = net_i2a(obs_v)

optimizer = optim.RMSprop(net_i2a.parameters(), lr=config.LEARNING_RATE, eps=1e-5)
policy_opt = optim.Adam(net_policy.parameters(), lr=config.POLICY_LR)

trainer = lib.trainer.A2CTrainer(envs, test_env, net_i2a, optimizer, device, config)
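# The parameter counts printed above come from common.count_parameters. The
# usual idiom, shown as a sketch (the repo's helper may count differently,
# e.g. including frozen weights):
def count_parameters(model):
    """Number of trainable parameters in a torch.nn.Module."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)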
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', '--grayscale', action='store_true',
                        help='Whether to use grayscale image instead of RGB')
    parser.add_argument('-m', '--model', required=True, type=str, help='Name of the model')
    parser.add_argument('-p', '--patch-size', type=int, default=224)
    parser.add_argument('-b', '--batch-size', type=int, default=1,
                        help='Batch Size during training, e.g. -b 64')
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-3,
                        help='Initial learning rate')
    parser.add_argument('-l', '--loss', type=str, default='bce', help='Target loss')
    parser.add_argument('-o', '--optimizer', default='SGD', help='Name of the optimizer')
    parser.add_argument('-e', '--epochs', type=int, default=100, help='Epoch to run')
    parser.add_argument('-d', '--dataset', type=str,
                        help='Name of the dataset to use for training.')
    parser.add_argument('-dd', '--data-dir', type=str, default='data',
                        help='Root directory where datasets are located.')
    parser.add_argument('-s', '--steps', type=int, default=128, help='Steps per epoch')
    parser.add_argument('-x', '--experiment', type=str, help='Name of the experiment')
    parser.add_argument('-w', '--workers', default=0, type=int, help='Num workers')
    parser.add_argument('-r', '--resume', action='store_true')
    parser.add_argument('-mem', '--memory', action='store_true')
    parser.add_argument('-sgdr', action='store_true')
    args = parser.parse_args()

    cudnn.benchmark = True

    if args.experiment is None:
        args.experiment = '%s_%s_%d_%s_%s' % (
            args.dataset, args.model, args.patch_size,
            'gray' if args.grayscale else 'rgb', args.loss)

    experiment_dir = os.path.join('experiments', args.dataset, args.loss, args.experiment)
    os.makedirs(experiment_dir, exist_ok=True)

    writer = SummaryWriter(comment='_' + args.experiment)
    with open(os.path.join(experiment_dir, 'arguments.txt'), 'w') as f:
        f.write(' '.join(sys.argv[1:]))

    model = get_model(args.model,
                      patch_size=args.patch_size,
                      num_channels=1 if args.grayscale else 3)

    # Write model graph
    dummy_input = torch.autograd.Variable(
        torch.rand((args.batch_size, 1 if args.grayscale else 3,
                    args.patch_size, args.patch_size)))
    writer.add_graph(model, dummy_input)

    model = model.cuda()
    loss = get_loss(args.loss).cuda()
    optimizer = get_optimizer(args.optimizer, model.parameters(), args.learning_rate)

    metrics = {
        'iou': JaccardScore().cuda(),
        'accuracy': PixelAccuracy().cuda()
    }

    trainset, validset, num_classes = get_dataset(args.dataset, args.data_dir,
                                                  grayscale=args.grayscale,
                                                  patch_size=args.patch_size,
                                                  keep_in_mem=args.memory)

    print('Train set size', len(trainset))
    print('Valid set size', len(validset))
    print('Model ', args.model)
    print('Parameters ', count_parameters(model))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True)
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True)

    start_epoch = 0
    best_loss = np.inf
    train_history = pd.DataFrame()

    # Checkpoint is the train result of the epoch with best loss
    checkpoint_filename = os.path.join(experiment_dir, f'{args.model}_checkpoint.pth')
    # Snapshot is the train result of the last epoch
    snapshot_filename = os.path.join(experiment_dir, f'{args.model}_snapshot.pth')

    if args.resume:
        start_epoch, train_history, best_loss = restore_snapshot(
            model, optimizer, checkpoint_filename)
        print('Resuming training from epoch', start_epoch, 'and loss', best_loss)
        print(train_history)

    scheduler = None
    if args.sgdr:
        scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-8)

    for epoch in range(start_epoch, args.epochs):
        if scheduler is not None:
            scheduler.step(epoch)
            lrs = scheduler.get_lr()
            if len(lrs) > 1:
                writer.add_scalars('train/lr', dict(enumerate(lrs)), global_step=epoch)
            else:
                writer.add_scalar('train/lr', lrs[0], global_step=epoch)

        train_loss, train_scores = train(model, loss, optimizer, trainloader, epoch,
                                         metrics, summary_writer=writer)
        valid_loss, valid_scores = validate(model, loss, validloader, epoch,
                                            metrics, summary_writer=writer)

        summary = {
            'epoch': [epoch],
            'loss': [train_loss.avg],
            'val_loss': [valid_loss.avg]
        }
        for key, value in train_scores.items():
            summary[key] = [value.avg]
        for key, value in valid_scores.items():
            summary['val_' + key] = [value.avg]

        train_history = train_history.append(pd.DataFrame.from_dict(summary), ignore_index=True)
        print(epoch, summary)

        if valid_loss.avg < best_loss:
            save_snapshot(model, optimizer, valid_loss.avg, epoch, train_history,
                          checkpoint_filename)
            best_loss = valid_loss.avg
            print('Checkpoint saved', epoch, best_loss)

        save_snapshot(model, optimizer, valid_loss.avg, epoch, train_history,
                      snapshot_filename)

    print('Training is finished...')

    train_history.to_csv(os.path.join(experiment_dir, args.experiment + '.csv'),
                         index=False,
                         mode='a' if args.resume else 'w',
                         header=not args.resume)
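# save_snapshot/restore_snapshot are defined elsewhere in the repo. Judging
# from the call sites above, a snapshot bundles model and optimizer state with
# the loss, epoch and history. A sketch under that assumption (field names and
# the +1 resume offset are hypothetical):
def save_snapshot(model, optimizer, loss, epoch, train_history, filename):
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'loss': loss,
        'epoch': epoch,
        'train_history': train_history,
    }, filename)

def restore_snapshot(model, optimizer, filename):
    checkpoint = torch.load(filename)
    model.load_state_dict(checkpoint['model'])
    if optimizer is not None:  # the afterburner script passes optimizer=None
        optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch'] + 1, checkpoint['train_history'], checkpoint['loss']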
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', '--grayscale', action='store_true',
                        help='Whether to use grayscale image instead of RGB')
    parser.add_argument('-m', '--model', required=True, type=str, help='Name of the model')
    parser.add_argument('-p', '--patch-size', type=int, default=224)
    parser.add_argument('-b', '--batch-size', type=int, default=1,
                        help='Batch Size during training, e.g. -b 64')
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-3,
                        help='Initial learning rate')
    parser.add_argument('-l', '--loss', type=str, default='bce', help='Target loss')
    parser.add_argument('-o', '--optimizer', default='SGD', help='Name of the optimizer')
    parser.add_argument('-e', '--epochs', type=int, default=100, help='Epoch to run')
    parser.add_argument('-d', '--dataset', type=str,
                        help='Name of the dataset to use for training.')
    parser.add_argument('-dd', '--data-dir', type=str, default='data',
                        help='Root directory where datasets are located.')
    parser.add_argument('-s', '--steps', type=int, default=128, help='Steps per epoch')
    parser.add_argument('-x', '--experiment', type=str, help='Name of the experiment')
    parser.add_argument('-w', '--workers', default=0, type=int, help='Num workers')
    parser.add_argument('-r', '--resume', action='store_true')
    parser.add_argument('-mem', '--memory', action='store_true')
    args = parser.parse_args()

    cudnn.benchmark = True

    if args.experiment is None:
        args.experiment = 'torch_%s_%s_afterburn_%d_%s_%s' % (
            args.dataset, args.model, args.patch_size,
            'gray' if args.grayscale else 'rgb', args.loss)

    experiment_dir = os.path.join('experiments', args.dataset, args.loss, args.experiment)
    os.makedirs(experiment_dir, exist_ok=True)

    writer = SummaryWriter(comment=args.experiment)
    with open(os.path.join(experiment_dir, 'arguments.txt'), 'w') as f:
        f.write(' '.join(sys.argv[1:]))

    trainset, validset, num_classes = TT.get_dataset(args.dataset, args.data_dir,
                                                     grayscale=args.grayscale,
                                                     patch_size=args.patch_size,
                                                     keep_in_mem=args.memory)

    print('Train set size', len(trainset))
    print('Valid set size', len(validset))

    trainloader = DataLoader(trainset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True)
    validloader = DataLoader(validset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.workers,
                             pin_memory=True,
                             drop_last=True)

    head_model = TT.get_model(args.model,
                              patch_size=args.patch_size,
                              num_channels=1 if args.grayscale else 3).cuda()
    TT.restore_snapshot(head_model, None, auto_file('linknet34_checkpoint.pth'))

    # Freeze model training
    for param in head_model.parameters():
        param.requires_grad = False

    afterburner = Afterburner()
    model = nn.Sequential(head_model, nn.Sigmoid(), afterburner).cuda()

    optimizer = TT.get_optimizer(args.optimizer, afterburner.parameters(), args.learning_rate)
    loss = TT.get_loss(args.loss).cuda()

    metrics = {
        'iou': JaccardScore().cuda(),
        'accuracy': PixelAccuracy().cuda()
    }

    start_epoch = 0
    best_loss = np.inf
    train_history = pd.DataFrame()
    checkpoint_filename = os.path.join(experiment_dir, f'{args.model}_checkpoint.pth')

    if args.resume:
        start_epoch, train_history, best_loss = restore_snapshot(
            model, optimizer, checkpoint_filename)
        print('Resuming training from epoch', start_epoch, 'and loss', best_loss)
        print(train_history)

    print('Head :', count_parameters(head_model))
    print('Afterburner:', count_parameters(afterburner))

    for epoch in range(start_epoch, args.epochs):
        train_loss, train_scores = train(model, loss, optimizer, trainloader, epoch,
                                         metrics, summary_writer=writer)
        valid_loss, valid_scores = validate(model, loss, validloader, epoch,
                                            metrics, summary_writer=writer)

        summary = {
            'epoch': [epoch],
            'loss': [train_loss.avg],
            'val_loss': [valid_loss.avg]
        }
        for key, value in train_scores.items():
            summary[key] = [value.avg]
        for key, value in valid_scores.items():
            summary['val_' + key] = [value.avg]

        train_history = train_history.append(pd.DataFrame.from_dict(summary), ignore_index=True)
        print(epoch, summary)

        if valid_loss.avg < best_loss:
            save_snapshot(model, optimizer, valid_loss.avg, epoch, train_history,
                          checkpoint_filename)
            best_loss = valid_loss.avg
            print('Checkpoint saved', epoch, best_loss)

    print('Training is finished...')

    train_history.to_csv(os.path.join(experiment_dir, args.experiment + '.csv'),
                         index=False,
                         mode='a' if args.resume else 'w',
                         header=not args.resume)
device = torch.device(config.DEVICE)

print(config.REPLACEMENT)
print(type(config.REPLACEMENT))

writer = SummaryWriter(comment="_a2c_" + config.build_name_for_writer())
saves_path = writer.logdir

# envs used for sampling tuples of experience
envs = [common.makeCustomizedGridEnv(config) for _ in range(config.NUM_ENVS)]
# env used to test the avg reward produced by the current best net
test_env = common.makeCustomizedGridEnv(config)

net = common.getNet(device, config)
print(common.count_parameters(net))
config.A2CNET = str(net)

# sets seed on torch operations and on all environments
common.set_seed(seed=config.SEED, envs=envs)
common.set_seed(seed=config.SEED, envs=[test_env])

optimizer = optim.Adam(net.parameters(), lr=config.LEARNING_RATE, eps=1e-5)
trainer = lib.trainer.A2CTrainer(envs, test_env, net, optimizer, device, config)

epoch = 0
total_steps = 0
best_reward = None
ts_start = time.time()
def maybe_drop(self, x, p=0.5):
    if self.use_dropout:
        x = F.dropout(x, p, training=self.training)
    return x

def set_fine_tune(self, fine_tune_enabled):
    layers = [self.conv1, self.conv2, self.conv3, self.conv4, self.conv5]
    for layer in layers:
        for param in layer.parameters():
            param.requires_grad = not fine_tune_enabled

def set_encoder_training_enabled(self, enabled):
    # First layer is trainable since we use 1-channel image instead of 3-channel
    layers = [self.conv2, self.conv3, self.conv4, self.conv5]
    for layer in layers:
        for param in layer.parameters():
            param.requires_grad = bool(enabled)


if __name__ == '__main__':
    net = TernausNetOC(num_classes=1, num_channels=1)
    net = net.eval()
    print(count_parameters(net))

    x = {
        'image': torch.rand((4, 1, 128, 128)),
        'depth': torch.rand(4)
    }
    y = net(x)
    print(y['mask'].size())
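# Typical use of the toggles above during staged training: keep the encoder
# frozen while the decoder warms up, then unfreeze it. A sketch (epoch counts
# are illustrative, not from the repo):
net = TernausNetOC(num_classes=1, num_channels=1)
net.set_encoder_training_enabled(False)   # warm up the decoder only
# ... train for a few epochs ...
net.set_encoder_training_enabled(True)    # then unfreeze the encoder
# After toggling, the optimizer must be rebuilt over the new trainable set,
# which is what the training loop's "restart optimizer" branch does:
trainable = filter(lambda p: p.requires_grad, net.parameters())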
net.load_state_dict(
    torch.load(config.A2C_FILE_NAME, map_location=lambda storage, loc: storage))
net = net.to(device)
config.A2CNET = str(net)
# net = common.AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n)

net_em = models.environment_model.EnvironmentModel(
    envs[0].observation_space.shape, envs[0].action_space.n, config).to(device)
# net_em.load_state_dict(torch.load("/home/valy/OneDrive/experiments/repl/9_22/Jan19_20-40-19_valy_em_22_9_True/best_1.4249e-06_195121.dat", map_location=lambda storage, loc: storage))
config.EM_NET = str(net_em)

print(net)
print(net_em)
print("em param count: " + str(common.count_parameters(net_em)))

# sets seed on torch operations and on all environments
common.set_seed(seed=config.SEED, envs=envs)

optimizer = optim.Adam(net_em.parameters(), lr=config.LEARNING_RATE)

epoch = 0
best_loss = np.inf
desc = ""
pbar = trange(config.EM_STEPS, desc='', leave=True)
progress = iter(pbar)

with ptan.common.utils.TBMeanTracker(writer, batch_size=config.BATCH_SIZE) as tb_tracker:
    # obtain batch transitions from the a2c model free agent (st, at, st+1, r)
            encoder_outs.append(before_pool)

        for i, module in enumerate(self.up_convs):
            before_pool = encoder_outs[-(i + 2)]
            x = module(before_pool, x)

        # No softmax is applied here, so raw logits are returned. Use
        # nn.CrossEntropyLoss in your training script, since that loss
        # applies log-softmax internally.
        x = self.drop_final(x)
        x = self.conv_final(x)
        return x

    def set_fine_tune(self, fine_tune_enabled):
        pass

    def set_encoder_training_enabled(self, enabled):
        pass


if __name__ == "__main__":
    """
    testing
    """
    model = UNet(num_classes=1, num_channels=1, depth=5, merge_mode='concat').eval()
    x = torch.rand((1, 1, 128, 128))
    out = model(x)
    print(out.size())
    print(count_parameters(model))
def resnext50(**kwargs):
    return ResNeXt([3, 4, 6, 3], **kwargs)


def resnext101(pretrained=True, input_3x3=True, abn_block=ABN, **kwargs):
    model = ResNeXt([3, 4, 23, 3],
                    input_3x3=input_3x3,
                    abn_block=abn_block,
                    classes=1000,
                    **kwargs)
    if pretrained and input_3x3:
        checkpoint = torch.load(
            os.path.join('pretrain', 'resnext101_ipabn_lr_512.pth.tar'))
        state_dict = checkpoint['state_dict']
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove `module.`
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
    return model


def resnext152(**kwargs):
    return ResNeXt([3, 8, 36, 3], **kwargs)


if __name__ == '__main__':
    print(count_parameters(resnext50()))
    print(count_parameters(resnext101()))
    print(count_parameters(resnext152()))
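# The `k[7:]` slice above strips the 'module.' prefix that nn.DataParallel
# prepends to parameter names when a checkpoint is saved from a wrapped model.
# The same idea as a reusable sketch (hypothetical helper, not part of this file):
def strip_dataparallel_prefix(state_dict):
    return OrderedDict((k[len('module.'):] if k.startswith('module.') else k, v)
                       for k, v in state_dict.items())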