def main(args):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(args, make_communication_groups=False)

    # initialize the experiment
    logger, training_stats = initialize_exp(args, 'epoch', 'iter', 'prec', 'loss',
                                            'prec_val', 'loss_val')

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    if not 'pascal' in args.data_path:
        main_data_path = args.data_path
        args.data_path = os.path.join(main_data_path, 'train')
        train_dataset = load_data(args)
    else:
        train_dataset = VOC2007_dataset(args.data_path, split=args.split)
    args.test = 'val' if args.split == 'train' else 'test'

    if not 'pascal' in args.data_path:
        if args.cross_valid is None:
            args.data_path = os.path.join(main_data_path, 'val')
            val_dataset = load_data(args)
    else:
        val_dataset = VOC2007_dataset(args.data_path, split=args.test)

    if args.cross_valid is not None:
        kfold = KFold(per_target(train_dataset.imgs), args.cross_valid, args.kfold)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=kfold.train,
            num_workers=args.workers,
            pin_memory=True,
        )
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.batch_size,
            sampler=kfold.val,
            num_workers=args.workers,
        )
    else:
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True,
        )
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.workers,
        )

    # prepare the different data transformations
    tr_val, tr_train = get_data_transformations()
    train_dataset.transform = tr_train
    val_dataset.transform = tr_val

    # build model skeleton
    fix_random_seeds(args.seed)
    model = model_factory(args.arch, args.sobel)
    load_pretrained(model, args)

    # keep only conv layers
    model.body.classifier = None
    model.conv = args.conv

    if 'places' in args.data_path:
        nmb_classes = 205
    elif 'pascal' in args.data_path:
        nmb_classes = 20
    else:
        nmb_classes = 1000
    reglog = RegLog(args.arch, nmb_classes, args.conv)

    # distributed training wrapper
    model = to_cuda(model, [args.gpu_to_work_on], apex=False)
    reglog = to_cuda(reglog, [args.gpu_to_work_on], apex=False)
    logger.info('model to cuda')

    # set optimizer
    optimizer = sgd_optimizer(reglog, args.lr, args.wd)

    # variables to fetch in the checkpoint
    to_restore = {'epoch': 0, 'start_iter': 0}

    # restart from checkpoint
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=reglog,
        optimizer=optimizer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']

    model.eval()
    reglog.train()

    # linear training
    for _ in range(args.epoch, args.nepochs):

        logger.info("============ Starting epoch %i ... ============" % args.epoch)

        # train the network for one epoch
        scores = train_network(args, model, reglog, optimizer, train_loader)

        if not 'pascal' in args.data_path:
            scores_val = validate_network(val_loader, [model, reglog], args)
        else:
            scores_val = evaluate_pascal(val_dataset, [model, reglog])
        scores = scores + scores_val

        # save training statistics
        logger.info(scores)
        training_stats.update(scores)
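
# --------------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not this repository's actual RegLog):
# the linear probe instantiated above trains a single classification layer on top
# of frozen conv features. The pooling size and the 9216-d flattened dimension
# below are example values, and `LinearProbe` is a hypothetical name.
# --------------------------------------------------------------------------------
import torch.nn as nn


class LinearProbe(nn.Module):
    """Average-pool a frozen conv feature map and apply one linear layer."""

    def __init__(self, num_classes, pooled_dim=9216):
        super(LinearProbe, self).__init__()
        # fixed spatial output size so the linear layer sees a constant dimension
        self.av_pool = nn.AdaptiveAvgPool2d((6, 6))
        self.linear = nn.Linear(pooled_dim, num_classes)
        self.linear.weight.data.normal_(0, 0.01)
        self.linear.bias.data.zero_()

    def forward(self, x):
        x = self.av_pool(x)
        x = x.view(x.size(0), -1)  # flatten to (batch, pooled_dim)
        return self.linear(x)
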
def main(args):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(args, make_communication_groups=False)

    # initialize the experiment
    logger, training_stats = initialize_exp(args, 'epoch', 'iter', 'prec', 'loss',
                                            'prec_val', 'loss_val')

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    main_data_path = args.data_path
    if args.debug:
        args.data_path = os.path.join(main_data_path, 'val')
    else:
        args.data_path = os.path.join(main_data_path, 'train')
    train_dataset = load_data(args)

    args.data_path = os.path.join(main_data_path, 'val')
    val_dataset = load_data(args)

    # prepare the different data transformations
    tr_val, tr_train = get_data_transformations()
    train_dataset.transform = tr_train
    val_dataset.transform = tr_val
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
    )

    # build model skeleton
    fix_random_seeds(args.seed)
    nmb_classes = 205 if 'places' in args.data_path else 1000
    model = model_factory(args, relu=True, num_classes=nmb_classes)

    # load pretrained weights
    load_pretrained(model, args)

    # merge sobel layers with the first convolution layer
    if args.sobel2RGB:
        sobel2RGB(model)

    # re-initialize the classifier
    if hasattr(model.body, 'classifier'):
        for m in model.body.classifier.modules():
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.fill_(0.1)

    # distributed training wrapper
    model = to_cuda(model, [args.gpu_to_work_on], apex=True)
    logger.info('model to cuda')

    # set optimizer
    optimizer = sgd_optimizer(model, args.lr, args.wd)

    # variables to fetch in the checkpoint
    to_restore = {'epoch': 0, 'start_iter': 0}

    # restart from checkpoint
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=model,
        optimizer=optimizer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']

    if args.evaluate:
        validate_network(val_loader, [model], args)
        return

    # supervised training
    for _ in range(args.epoch, args.nepochs):

        logger.info("============ Starting epoch %i ... ============" % args.epoch)

        fix_random_seeds(args.seed + args.epoch)

        # train the network for one epoch
        adjust_learning_rate(optimizer, args)
        scores = train_network(args, model, optimizer, train_dataset)

        scores_val = validate_network(val_loader, [model], args)

        # save training statistics
        logger.info(scores + scores_val)
        training_stats.update(scores + scores_val)
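
# --------------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not this repository's actual
# adjust_learning_rate): a per-epoch schedule called as above typically decays the
# base learning rate by a fixed factor every few epochs. The helper name, decay
# factor and step size below are hypothetical.
# --------------------------------------------------------------------------------
def step_decay_learning_rate(optimizer, args, decay=0.1, step=30):
    """Set the lr of every param group to args.lr decayed by `decay` every `step` epochs."""
    lr = args.lr * (decay ** (args.epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
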
def main(args):
    """
    This code implements the paper: https://arxiv.org/abs/1905.01278
    The method consists in alternating between a hierarchical clustering of the
    features and learning the parameters of a convnet by predicting both the angle
    of the rotation applied to the input data and the cluster assignments in a
    single hierarchical loss.
    """

    # initialize communication groups
    training_groups, clustering_groups = init_distributed_mode(args)

    # check parameters
    check_parameters(args)

    # initialize the experiment
    logger, training_stats = initialize_exp(args, 'epoch', 'iter', 'prec', 'loss',
                                            'prec_super_class', 'loss_super_class',
                                            'prec_sub_class', 'loss_sub_class')

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    dataset = YFCC100M_dataset(r'./dataset', size=args.size_dataset)

    # prepare the different data transformations
    tr_cluster, tr_train = get_data_transformations(args.rotation * 90)

    # build model skeleton
    fix_random_seeds()
    model = model_factory(args.sobel)
    logger.info('model created')

    # load pretrained weights
    load_pretrained(model, args)

    # convert batch-norm layers to nvidia wrapper to enable batch stats reduction
    model = apex.parallel.convert_syncbn_model(model)

    # distributed training wrapper
    model = to_cuda(model, args.gpu_to_work_on, apex=False)
    logger.info('model to cuda')

    # set optimizer
    optimizer = sgd_optimizer(model, args.lr, args.wd)

    # load cluster assignments
    cluster_assignments = load_cluster_assignments(args, dataset)

    # build prediction layer on the super-classes
    pred_layer, optimizer_pred_layer = build_prediction_layer(
        model.module.body.dim_output_space,
        args,
    )

    # build prediction layer on the sub-classes
    nmb_sub_classes = args.k // args.nmb_super_clusters
    sub_class_pred_layer, optimizer_sub_class_pred_layer = build_prediction_layer(
        model.module.body.dim_output_space,
        args,
        num_classes=nmb_sub_classes,
        group=training_groups[args.training_local_world_id],
    )

    # variables to fetch in the checkpoint
    to_restore = {'epoch': 0, 'start_iter': 0}

    # restart from checkpoint
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=model,
        optimizer=optimizer,
        pred_layer_state_dict=pred_layer,
        optimizer_pred_layer=optimizer_pred_layer,
    )
    pred_layer_name = str(args.training_local_world_id) + '-pred_layer.pth.tar'
    restart_from_checkpoint(
        args,
        ckp_path=os.path.join(args.dump_path, pred_layer_name),
        state_dict=sub_class_pred_layer,
        optimizer=optimizer_sub_class_pred_layer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']

    for _ in range(args.epoch, args.nepochs):

        logger.info("============ Starting epoch %i ... ============" % args.epoch)

        fix_random_seeds(args.epoch)

        # step 1: get the final activations for the whole dataset and cluster them
        if cluster_assignments is None and not args.epoch % args.reassignment:
            logger.info("=> Start clustering step")
            dataset.transform = tr_cluster
            cluster_assignments = get_cluster_assignments(args, model, dataset,
                                                          clustering_groups)

            # reset prediction layers
            if args.nmb_super_clusters > 1:
                pred_layer, optimizer_pred_layer = build_prediction_layer(
                    model.module.body.dim_output_space,
                    args,
                )
            sub_class_pred_layer, optimizer_sub_class_pred_layer = build_prediction_layer(
                model.module.body.dim_output_space,
                args,
                num_classes=nmb_sub_classes,
                group=training_groups[args.training_local_world_id],
            )

        # step 2: train the network with the cluster assignments as labels

        # prepare dataset
        dataset.transform = tr_train
        dataset.sub_classes = cluster_assignments

        # concatenate models and their corresponding optimizers
        models = [model, pred_layer, sub_class_pred_layer]
        optimizers = [optimizer, optimizer_pred_layer, optimizer_sub_class_pred_layer]

        # train the network for one epoch
        scores = train_network(args, models, optimizers, dataset)

        # save training statistics
        logger.info(scores)
        training_stats.update(scores)

        # reassign clusters at the next epoch
        if not args.epoch % args.reassignment:
            cluster_assignments = None
            dataset.subset_indexes = None

        end_of_epoch(args)
        dist.barrier()
def main(args):

    # initialize the experiment
    logger, training_stats = initialize_exp(args, 'epoch', 'iter', 'prec', 'loss',
                                            'prec_val', 'loss_val')

    if not 'VOC2007' in args.data_path:
        main_data_path = args.data_path
        args.data_path = os.path.join(main_data_path, 'train')
        train_dataset = load_data(args)
    else:
        train_dataset = VOC2007_dataset(args.data_path, split=args.split)
    args.test = 'val' if args.split == 'train' else 'test'

    if not 'VOC2007' in args.data_path:
        if args.cross_valid is None:
            args.data_path = os.path.join(main_data_path, 'val')
            val_dataset = load_data(args)
    else:
        val_dataset = VOC2007_dataset(args.data_path, split=args.test)

    if args.cross_valid is not None:
        kfold = KFold(per_target(train_dataset.imgs), args.cross_valid, args.kfold)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=kfold.train,
            num_workers=args.workers,
            pin_memory=True,
        )
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.batch_size,
            sampler=kfold.val,
            num_workers=args.workers,
        )
    else:
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.workers,
            pin_memory=True,
        )
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.workers,
        )

    # prepare the different data transformations
    # use the val transformation for both splits so the preprocessing stays consistent
    tr_val, tr_train = get_data_transformations()
    train_dataset.transform = tr_val
    val_dataset.transform = tr_val

    # build model skeleton
    fix_random_seeds()
    model = model_factory(args)

    # keep only conv layers
    model.body.classifier = None
    load_pretrained(model, args)
    print('extracting features at conv{}'.format(args.conv))
    model.conv = args.conv

    if 'places' in args.data_path:
        nmb_classes = 205
    elif 'VOC2007' in args.data_path:
        nmb_classes = 20
    else:
        nmb_classes = 1000

    # move model to GPU
    model = model.cuda()
    logger.info('model to cuda')

    # variables to fetch in the checkpoint
    to_restore = {'epoch': 0, 'start_iter': 0}

    model.eval()
    save_feature(args, model, train_loader, 'trainval')
    save_feature(args, model, val_loader, 'test')
    print('features saved')
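
# --------------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not this repository's actual save_feature):
# a feature dump of the kind called above runs the frozen model over a loader without
# gradients, stacks the activations and labels, and writes them under args.dump_path.
# The helper name and file naming scheme below are hypothetical.
# --------------------------------------------------------------------------------
import os

import numpy as np
import torch


def dump_features(args, model, loader, split):
    """Forward the frozen model over `loader` and save features / labels for `split`."""
    features, labels = [], []
    with torch.no_grad():
        for inp, target in loader:
            out = model(inp.cuda(non_blocking=True))
            features.append(out.view(out.size(0), -1).cpu().numpy())
            labels.append(target.numpy())
    np.save(os.path.join(args.dump_path, '{}_features.npy'.format(split)),
            np.concatenate(features))
    np.save(os.path.join(args.dump_path, '{}_labels.npy'.format(split)),
            np.concatenate(labels))
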