Esempio n. 1
0
def main(args):
    """Train and evaluate a linear classifier (``RegLog``) on convnet features.

    The function sets up distributed mode, experiment logging and the signal
    handler, builds the train / validation datasets and loaders, loads the
    pretrained backbone with its classifier removed, and then optimizes only
    ``reglog`` while the backbone stays in eval mode. Validation runs after
    every training epoch.

    Args:
        args: experiment namespace. Attributes read here: ``data_path``,
            ``split``, ``cross_valid``, ``kfold``, ``batch_size``,
            ``workers``, ``seed``, ``arch``, ``sobel``, ``conv``,
            ``gpu_to_work_on``, ``lr``, ``wd``, ``nepochs``. Attributes
            written here: ``data_path``, ``test``, ``epoch``, ``start_iter``.
    """
    # multi-GPU / multi-node setup (no communication groups needed here)
    init_distributed_mode(args, make_communication_groups=False)

    # experiment logger + statistics tracker
    logger, training_stats = initialize_exp(
        args, 'epoch', 'iter', 'prec', 'loss', 'prec_val', 'loss_val')

    # SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # ---- training dataset ----
    if 'pascal' in args.data_path:
        train_dataset = VOC2007_dataset(args.data_path, split=args.split)
    else:
        main_data_path = args.data_path
        args.data_path = os.path.join(main_data_path, 'train')
        train_dataset = load_data(args)

    # ---- validation dataset ----
    if args.split == 'train':
        args.test = 'val'
    else:
        args.test = 'test'

    if 'pascal' in args.data_path:
        val_dataset = VOC2007_dataset(args.data_path, split=args.test)
    else:
        # with cross-validation the data path keeps pointing at 'train':
        # the k-fold samplers below split that single folder
        if args.cross_valid is None:
            args.data_path = os.path.join(main_data_path, 'val')
        val_dataset = load_data(args)

    # ---- data loaders ----
    make_loader = torch.utils.data.DataLoader
    if args.cross_valid is None:
        train_loader = make_loader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True,
        )
        val_loader = make_loader(
            val_dataset,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.workers,
        )
    else:
        kfold = KFold(per_target(train_dataset.imgs), args.cross_valid, args.kfold)
        train_loader = make_loader(
            train_dataset,
            batch_size=args.batch_size,
            sampler=kfold.train,
            num_workers=args.workers,
            pin_memory=True,
        )
        val_loader = make_loader(
            val_dataset,
            batch_size=args.batch_size,
            sampler=kfold.val,
            num_workers=args.workers,
        )

    # attach the transformations (the loaders hold references to the datasets,
    # so setting them after loader construction still takes effect)
    val_transform, train_transform = get_data_transformations()
    train_dataset.transform = train_transform
    val_dataset.transform = val_transform

    # ---- backbone ----
    fix_random_seeds(args.seed)
    model = model_factory(args.arch, args.sobel)

    load_pretrained(model, args)

    # keep only conv layers; features are taken at layer `args.conv`
    model.body.classifier = None
    model.conv = args.conv

    # number of target classes, chosen from the data path
    nmb_classes = 1000
    for keyword, class_count in (('places', 205), ('pascal', 20)):
        if keyword in args.data_path:
            nmb_classes = class_count
            break

    reglog = RegLog(args.arch, nmb_classes, args.conv)

    # distributed training wrapper
    model = to_cuda(model, [args.gpu_to_work_on], apex=False)
    reglog = to_cuda(reglog, [args.gpu_to_work_on], apex=False)
    logger.info('model to cuda')

    # only the linear classifier's parameters are given to the optimizer
    optimizer = sgd_optimizer(reglog, args.lr, args.wd)

    # run variables to fetch from the checkpoint, with their defaults
    to_restore = {'epoch': 0, 'start_iter': 0}

    # resume from checkpoint
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=reglog,
        optimizer=optimizer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']

    model.eval()
    reglog.train()

    # ---- linear training ----
    # NOTE(review): args.epoch is not advanced in this function; presumably
    # train_network does it -- confirm.
    for _ in range(args.epoch, args.nepochs):

        logger.info("============ Starting epoch %i ... ============" % args.epoch)

        # one training epoch
        scores = train_network(args, model, reglog, optimizer, train_loader)

        # validation
        if 'pascal' in args.data_path:
            scores_val = evaluate_pascal(val_dataset, [model, reglog])
        else:
            scores_val = validate_network(val_loader, [model, reglog], args)

        epoch_scores = scores + scores_val

        # save training statistics
        logger.info(epoch_scores)
        training_stats.update(epoch_scores)
Esempio n. 2
0
def main(args):
    """Supervised training (or evaluation only) of a pretrained model.

    The function sets up distributed mode, logging and the signal handler,
    loads the train and validation datasets from sub-folders of
    ``args.data_path``, builds the model, loads pretrained weights,
    re-initializes the linear layers of the classifier, and trains the whole
    model with validation after every epoch. When ``args.evaluate`` is set it
    runs a single validation pass and returns.

    Args:
        args: experiment namespace. Attributes read here: ``data_path``,
            ``debug``, ``batch_size``, ``workers``, ``seed``, ``sobel2RGB``,
            ``gpu_to_work_on``, ``lr``, ``wd``, ``evaluate``, ``nepochs``.
            Attributes written here: ``data_path``, ``epoch``,
            ``start_iter``.
    """
    # multi-GPU / multi-node setup
    init_distributed_mode(args, make_communication_groups=False)

    # experiment logger + statistics tracker
    logger, training_stats = initialize_exp(
        args, 'epoch', 'iter', 'prec', 'loss', 'prec_val', 'loss_val')

    # SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # ---- datasets ----
    main_data_path = args.data_path

    # in debug mode the 'val' folder is used as the training set as well
    train_folder = 'val' if args.debug else 'train'
    args.data_path = os.path.join(main_data_path, train_folder)
    train_dataset = load_data(args)

    args.data_path = os.path.join(main_data_path, 'val')
    val_dataset = load_data(args)

    # attach the transformations
    val_transform, train_transform = get_data_transformations()
    train_dataset.transform = train_transform
    val_dataset.transform = val_transform

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
    )

    # ---- model ----
    fix_random_seeds(args.seed)
    if 'places' in args.data_path:
        nmb_classes = 205
    else:
        nmb_classes = 1000
    model = model_factory(args, relu=True, num_classes=nmb_classes)

    # load pretrained weights
    load_pretrained(model, args)

    # merge sobel layers with first convolution layer
    if args.sobel2RGB:
        sobel2RGB(model)

    # re-initialize every linear layer of the classifier, if there is one
    if hasattr(model.body, 'classifier'):
        linear_layers = (
            module for module in model.body.classifier.modules()
            if isinstance(module, nn.Linear)
        )
        for layer in linear_layers:
            layer.weight.data.normal_(0, 0.01)
            layer.bias.data.fill_(0.1)

    # distributed training wrapper
    model = to_cuda(model, [args.gpu_to_work_on], apex=True)
    logger.info('model to cuda')

    # optimizer over the whole model
    optimizer = sgd_optimizer(model, args.lr, args.wd)

    # run variables to fetch from the checkpoint, with their defaults
    to_restore = {'epoch': 0, 'start_iter': 0}

    # resume from checkpoint
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=model,
        optimizer=optimizer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']

    # evaluation-only mode: one validation pass, then stop
    if args.evaluate:
        validate_network(val_loader, [model], args)
        return

    # ---- supervised training ----
    # NOTE(review): args.epoch is not advanced in this function; presumably
    # train_network does it -- confirm.
    for _ in range(args.epoch, args.nepochs):

        logger.info(
            "============ Starting epoch %i ... ============" % args.epoch)

        # a different, reproducible seed for every epoch
        fix_random_seeds(args.seed + args.epoch)

        # one training epoch (train_network receives the dataset, not a loader)
        adjust_learning_rate(optimizer, args)
        scores = train_network(args, model, optimizer, train_dataset)

        scores_val = validate_network(val_loader, [model], args)

        # save training statistics
        logger.info(scores + scores_val)
        training_stats.update(scores + scores_val)
Esempio n. 3
0
File: main.py Progetto: GG-yuki/bugs
def main(args):
    """
    This code implements the paper: https://arxiv.org/abs/1905.01278
    The method consists in alternating between a hierarchical clustering of the
    features and learning the parameters of a convnet by predicting both the
    angle of the rotation applied to the input data and the cluster assignments
    in a single hierarchical loss.

    Args:
        args: experiment namespace. Attributes read here: ``size_dataset``,
            ``rotation``, ``sobel``, ``gpu_to_work_on``, ``lr``, ``wd``, ``k``,
            ``nmb_super_clusters``, ``training_local_world_id``, ``dump_path``,
            ``nepochs``, ``reassignment``. Attributes written here: ``epoch``
            and ``start_iter`` (restored from the checkpoint).
    """

    # initialize communication groups
    training_groups, clustering_groups = init_distributed_mode(args)

    # check parameters
    check_parameters(args)

    # initialize the experiment
    logger, training_stats = initialize_exp(args, 'epoch', 'iter', 'prec',
                                            'loss', 'prec_super_class',
                                            'loss_super_class',
                                            'prec_sub_class', 'loss_sub_class')

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    # NOTE(review): the dataset location is hard-coded to './dataset' rather
    # than taken from args.
    dataset = YFCC100M_dataset(r'./dataset', size=args.size_dataset)

    # prepare the different data transformations
    # (one transform for the clustering pass, one for the training pass)
    tr_cluster, tr_train = get_data_transformations(args.rotation * 90)

    # build model skeleton
    fix_random_seeds()
    model = model_factory(args.sobel)
    logger.info('model created')

    # load pretrained weights
    load_pretrained(model, args)

    # convert batch-norm layers to nvidia wrapper to enable batch stats reduction
    model = apex.parallel.convert_syncbn_model(model)

    # distributed training wrapper
    # (the wrapped network is accessed through `model.module` below)
    model = to_cuda(model, args.gpu_to_work_on, apex=False)
    logger.info('model to cuda')

    # set optimizer
    optimizer = sgd_optimizer(model, args.lr, args.wd)

    # load cluster assignments
    # (may be None, in which case the loop below triggers a clustering step
    # on reassignment epochs)
    cluster_assignments = load_cluster_assignments(args, dataset)

    # build prediction layer on the super_class
    pred_layer, optimizer_pred_layer = build_prediction_layer(
        model.module.body.dim_output_space,
        args,
    )

    # number of sub-classes per super-cluster (integer division of k);
    # the sub-class layer is built with this training process's group
    nmb_sub_classes = args.k // args.nmb_super_clusters
    sub_class_pred_layer, optimizer_sub_class_pred_layer = build_prediction_layer(
        model.module.body.dim_output_space,
        args,
        num_classes=nmb_sub_classes,
        group=training_groups[args.training_local_world_id],
    )

    # variables to fetch in checkpoint
    to_restore = {'epoch': 0, 'start_iter': 0}

    # re start from checkpoint
    # first restore: model, its optimizer, and the super-class prediction layer
    restart_from_checkpoint(
        args,
        run_variables=to_restore,
        state_dict=model,
        optimizer=optimizer,
        pred_layer_state_dict=pred_layer,
        optimizer_pred_layer=optimizer_pred_layer,
    )
    # second restore: the sub-class prediction layer, stored in its own file
    # under args.dump_path and named after args.training_local_world_id
    pred_layer_name = str(args.training_local_world_id) + '-pred_layer.pth.tar'
    restart_from_checkpoint(
        args,
        ckp_path=os.path.join(args.dump_path, pred_layer_name),
        state_dict=sub_class_pred_layer,
        optimizer=optimizer_sub_class_pred_layer,
    )
    args.epoch = to_restore['epoch']
    args.start_iter = to_restore['start_iter']

    # NOTE(review): args.epoch is not advanced in this function; presumably
    # train_network / end_of_epoch do it -- confirm.
    for _ in range(args.epoch, args.nepochs):

        logger.info("============ Starting epoch %i ... ============" %
                    args.epoch)
        fix_random_seeds(args.epoch)

        # step 1: Get the final activations for the whole dataset / Cluster them

        # cluster only when no assignments are available and the epoch is a
        # multiple of args.reassignment
        if cluster_assignments is None and not args.epoch % args.reassignment:

            logger.info("=> Start clustering step")
            dataset.transform = tr_cluster

            cluster_assignments = get_cluster_assignments(
                args, model, dataset, clustering_groups)

            # reset prediction layers
            # the super-class layer is rebuilt only when there is more than
            # one super-cluster; the sub-class layer is always rebuilt
            if args.nmb_super_clusters > 1:
                pred_layer, optimizer_pred_layer = build_prediction_layer(
                    model.module.body.dim_output_space,
                    args,
                )
            sub_class_pred_layer, optimizer_sub_class_pred_layer = build_prediction_layer(
                model.module.body.dim_output_space,
                args,
                num_classes=nmb_sub_classes,
                group=training_groups[args.training_local_world_id],
            )

        # step 2: Train the network with the cluster assignments as labels

        # prepare dataset
        dataset.transform = tr_train
        dataset.sub_classes = cluster_assignments

        # concatenate models and their corresponding optimizers
        # (rebuilt every epoch so freshly reset prediction layers are used)
        models = [model, pred_layer, sub_class_pred_layer]
        optimizers = [
            optimizer, optimizer_pred_layer, optimizer_sub_class_pred_layer
        ]

        # train the network for one epoch
        scores = train_network(args, models, optimizers, dataset)

        ## save training statistics
        logger.info(scores)
        training_stats.update(scores)

        # reassign clusters at the next epoch
        # (dropping the assignments makes the next iteration's step 1 eligible
        # to re-cluster; end_of_epoch is only called on these epochs)
        if not args.epoch % args.reassignment:
            cluster_assignments = None
            dataset.subset_indexes = None
            end_of_epoch(args)

        # synchronization point at the end of every epoch
        dist.barrier()
Esempio n. 4
0
def main(args):
    """Extract convnet features for the train and validation/test sets and save them.

    The function builds both datasets (Pascal VOC2007 when ``'VOC2007'``
    appears in ``args.data_path``, otherwise through ``load_data`` on the
    ``train`` / ``val`` sub-folders), wraps them in non-shuffling loaders,
    loads the pretrained model with its classifier removed, and calls
    ``save_feature`` once per loader.

    Both datasets use the validation transform, and the loaders do not
    shuffle when no cross-validation sampler is in use.

    Args:
        args: experiment namespace. Attributes read here: ``data_path``,
            ``split``, ``cross_valid``, ``kfold``, ``batch_size``,
            ``workers``, ``conv``. Attributes written here: ``data_path``,
            ``test``.
    """
    # initialize the experiment (the returned statistics tracker is not used
    # by this script, only the logger)
    logger, _ = initialize_exp(args, 'epoch', 'iter', 'prec',
                               'loss', 'prec_val', 'loss_val')

    # training dataset
    if 'VOC2007' not in args.data_path:
        main_data_path = args.data_path
        args.data_path = os.path.join(main_data_path, 'train')
        train_dataset = load_data(args)
    else:
        train_dataset = VOC2007_dataset(args.data_path, split=args.split)

    # validation / test dataset
    args.test = 'val' if args.split == 'train' else 'test'
    if 'VOC2007' not in args.data_path:
        # with cross-validation the data path keeps pointing at 'train':
        # the k-fold samplers below split that single folder
        if args.cross_valid is None:
            args.data_path = os.path.join(main_data_path, 'val')
        val_dataset = load_data(args)
    else:
        val_dataset = VOC2007_dataset(args.data_path, split=args.test)

    if args.cross_valid is not None:
        kfold = KFold(per_target(train_dataset.imgs), args.cross_valid,
                      args.kfold)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   sampler=kfold.train,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size,
                                                 sampler=kfold.val,
                                                 num_workers=args.workers)

    else:
        # no shuffling: the loaders are only iterated to dump features
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=False,
                                                   num_workers=args.workers,
                                                   pin_memory=True)
        val_loader = torch.utils.data.DataLoader(val_dataset,
                                                 batch_size=args.batch_size,
                                                 shuffle=False,
                                                 num_workers=args.workers)

    # prepare the data transformations; only the validation transform is
    # needed because both splits must be preprocessed identically
    tr_val, _ = get_data_transformations()
    train_dataset.transform = tr_val
    val_dataset.transform = tr_val

    # build model skeleton
    fix_random_seeds()
    model = model_factory(args)

    # keep only conv layers
    model.body.classifier = None

    load_pretrained(model, args)

    print('feature at conv{} is extracting!'.format(args.conv))
    model.conv = args.conv

    # move the model to the GPU (plain .cuda(), no distributed wrapper)
    model = model.cuda()

    logger.info('model to cuda')

    model.eval()

    save_feature(args, model, train_loader, 'trainval')
    save_feature(args, model, val_loader, 'test')

    print('save finished')