Example #1
def train_ddp(rank, cfg, return_dict):
    cfg.gpu = rank + cfg.base_gpu  # map the process rank onto a physical GPU id
    print(f"Train: running basic DDP example on rank {rank}.")
    setup(rank, cfg.world_size, start_port)  # start_port is defined at module level

    cfg.log_file = 'train_{}.txt'.format(cfg.gpu)
    log_file = os.path.join(cfg.exp_dir, cfg.log_file)
    logging.config.dictConfig(log_utils.get_logging_dict(log_file, mode='a+'))
    cfg.logger = logging.getLogger('train')

    model = net_utils.get_model(cfg)
    cfg.logger.info('Moving the model to GPU {}'.format(cfg.gpu))
    model = net_utils.move_model_to_gpu(cfg, model)
    cfg.logger.info('Model conv 1 initialization {}'.format(
        torch.sum(model.backbone.conv1.weight)))
    if cfg.world_size > 1:
        # keep BatchNorm statistics in sync across DDP processes
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = DDP(model, device_ids=[cfg.gpu], output_device=cfg.gpu)

    trn_pretrain.trn(cfg, model)

    if cfg.gpu == cfg.base_gpu:
        return_dict['ckpt_path'] = None  # only the base process reports back to the parent

    cleanup()
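
Usage note: a worker with the `(rank, cfg, return_dict)` signature above is normally launched via `torch.multiprocessing.spawn`, one process per rank, with a manager dict collecting results. A minimal sketch, assuming `cfg` carries `world_size`; the `launch` helper itself is hypothetical:

import torch.multiprocessing as mp

def launch(cfg):
    manager = mp.Manager()
    return_dict = manager.dict()  # shared dict the workers write into
    mp.spawn(train_ddp,           # spawn passes the rank as the first argument
             args=(cfg, return_dict),
             nprocs=cfg.world_size,
             join=True)
    return return_dict.get('ckpt_path')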
Example #2
def train_dense(cfg, generation):

    model = net_utils.get_model(cfg)

    if cfg.pretrained and cfg.pretrained != 'imagenet':
        net_utils.load_pretrained(cfg.pretrained, cfg.gpu, model, cfg)
        model = net_utils.move_model_to_gpu(cfg, model)
        # re-initialize part of the loaded weights according to the split settings
        net_utils.split_reinitialize(cfg, model, reset_hypothesis=cfg.reset_hypothesis)
    else:
        model = net_utils.move_model_to_gpu(cfg, model)

    cfg.trainer = 'default_cls'
    # cfg.split_rate = 1.0
    # cfg.bias_split_rate = 1.0
    cfg.pretrained = None
    ckpt_path = KE_model.ke_cls_train(cfg, model, generation)

    return ckpt_path
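
Usage note: a helper like `train_dense` is typically invoked once per generation, with each checkpoint seeding the next round of training. A minimal sketch under that assumption; `cfg.num_generations` is a hypothetical field name:

# Illustrative driver loop; cfg.num_generations is an assumed field name.
for generation in range(cfg.num_generations):
    ckpt_path = train_dense(cfg, generation)
    cfg.pretrained = ckpt_path  # the next generation re-loads this checkpoint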
Example #3
def train_ddp(rank, cfg, return_dict):
    cfg.gpu = rank + cfg.base_gpu
    print(f"Train Running basic DDP example on rank {rank}.")
    setup(rank, cfg.world_size, start_port)

    cfg.log_file = 'train_{}.txt'.format(cfg.gpu)
    log_file = os.path.join(cfg.exp_dir, cfg.log_file)
    logging.config.dictConfig(log_utils.get_logging_dict(log_file, mode='a+'))
    cfg.logger = logging.getLogger('train')

    cfg.logger.info('Getting the model')
    pretrain_model = net_utils.get_model(cfg)
    pretrain_model = torch.nn.DataParallel(pretrain_model)
    if cfg.pretrained and cfg.pretrained != 'imagenet':
        net_utils.load_pretrained(cfg.pretrained, cfg.gpu, pretrain_model, cfg)

    classifier_layer = nn.Linear(
        in_features=pretrain_model.module.backbone.output_dim,
        out_features=cfg.num_cls,
        bias=True).cuda()

    # Freeze the pretrained backbone; nn.Parameter always exposes requires_grad,
    # so no hasattr/None guard is needed.
    for p in pretrain_model.parameters():
        p.requires_grad = False
    cfg.logger.info('Start Training: Model conv 1 initialization {}'.format(
        torch.sum(pretrain_model.module.backbone.conv1.weight)))
    # frozen backbone followed by the new trainable classifier head
    model = nn.Sequential(
        pretrain_model.module.backbone,
        classifier_layer,
    )

    cfg.logger.info('Moving the model to GPU {}'.format(cfg.gpu))
    model = net_utils.move_model_to_gpu(cfg, model)
    cfg.logger.info('Model conv 1 initialization {}'.format(
        torch.sum(model[0].conv1.weight)))
    if cfg.world_size > 1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = DDP(model, device_ids=[cfg.gpu], output_device=cfg.gpu)

    trn_classifier.trn(cfg, model)
    if cfg.gpu == cfg.base_gpu:
        return_dict['ckpt_path'] = None

    cleanup()
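
Example #3 implements the standard linear-probe pattern: freeze a pretrained backbone and train only a fresh classification head. A self-contained sketch of the same idea with a torchvision ResNet; the model choice and the 10-class head are illustrative, not taken from the code above:

import torch
import torch.nn as nn
import torchvision.models as models

backbone = models.resnet18(weights=None)  # stand-in for the pretrained backbone
backbone.fc = nn.Identity()               # expose 512-d features instead of logits
for p in backbone.parameters():
    p.requires_grad = False               # freeze everything upstream of the head

head = nn.Linear(512, 10)                 # 512 = resnet18 feature dim; 10 classes assumed
model = nn.Sequential(backbone, head)
optimizer = torch.optim.SGD(head.parameters(), lr=0.01)  # only the head gets gradients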
Example #4
def eval_slim(cfg, generation):
    original_num_epochs = cfg.epochs
    # cfg.epochs = 0
    softmax_criterion = nn.CrossEntropyLoss().cuda()
    epoch = 1
    writer = None
    model = net_utils.get_model(cfg)
    net_utils.load_pretrained(cfg.pretrained, cfg.gpu, model, cfg)
    # if cfg.reset_mask:
    #     net_utils.reset_mask(cfg, model)
    model = net_utils.move_model_to_gpu(cfg, model)

    save_filter_stats = (cfg.arch in ['split_alexnet', 'split_vgg11_bn'])
    if save_filter_stats:
        for n, m in model.named_modules():
            if hasattr(m, "weight") and m.weight is not None:
                if hasattr(m, "mask"):
                    layer_mask = m.mask
                    if m.__class__ == conv_type.SplitConv:
                        # filter_state = [''.join(map(str, ((score_mask == True).type(torch.int).squeeze().tolist())))]
                        # mean |w| over the kept (mask==1) vs. reset (mask==0) weights
                        filter_mag = ['{},{}'.format(
                            float(torch.mean(torch.abs(m.weight[layer_mask.type(torch.bool)]))),
                            float(torch.mean(torch.abs(m.weight[(1 - layer_mask).type(torch.bool)]))))
                        ]
                        os_utils.txt_write(osp.join(cfg.exp_dir, n.replace('.', '_') + '_mean_magnitude.txt'), filter_mag, mode='a+')

    dummy_input_tensor = torch.zeros((1, 3, 224, 224)).cuda()
    total_ops, total_params = model_profile.profile(model, dummy_input_tensor)
    cfg.logger.info("Dense #Ops: %f GOps" % (total_ops / 1e9))
    cfg.logger.info("Dense #Parameters: %f M" % (total_params / 1e6))

    original_split_rate = cfg.split_rate
    original_bias_split_rate = cfg.bias_split_rate

    if cfg.split_mode == 'kels':
        # build a slim model scaled down by the split rate, with no further splitting
        cfg.slim_factor = cfg.split_rate
        cfg.split_rate = 1.0
        cfg.bias_split_rate = 1.0
        split_model = net_utils.get_model(cfg)
        split_model = net_utils.move_model_to_gpu(cfg, split_model)

        total_ops, total_params = model_profile.profile(split_model, dummy_input_tensor)
        cfg.logger.info("Split #Ops: %f GOps" % (total_ops / 1e9))
        cfg.logger.info("Split #Parameters: %f M" % (total_params / 1e6))

        net_utils.extract_slim(split_model, model)  # copy the masked dense weights into the slim model
        dataset = getattr(data, cfg.set)(cfg)
        train, validate = get_trainer(cfg)
        last_val_acc1, last_val_acc5 = validate(dataset.tst_loader, split_model, softmax_criterion, cfg, writer, epoch)
        cfg.logger.info('Split Model : {} , {}'.format(last_val_acc1, last_val_acc5))
    else:
        last_val_acc1 = 0
        last_val_acc5 = 0

    csv_utils.write_cls_result_to_csv(
        ## Validation
        curr_acc1=0,
        curr_acc5=0,
        best_acc1=0,
        best_acc5=0,

        ## Test
        last_tst_acc1=last_val_acc1,
        last_tst_acc5=last_val_acc5,
        best_tst_acc1=0,
        best_tst_acc5=0,

        ## Train
        best_train_acc1=0,
        best_train_acc5=0,

        split_rate='slim',
        bias_split_rate='slim',

        base_config=cfg.name,
        name=cfg.name,
    )

    cfg.epochs = original_num_epochs

    cfg.slim_factor = 1
    cfg.split_rate = original_split_rate
    cfg.bias_split_rate = original_bias_split_rate
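
The `model_profile.profile` calls above follow the common ops/params profiler interface. A sketch of the same measurement with the `thop` package, which is an assumed stand-in rather than necessarily what this repository wraps:

import torch
import torch.nn as nn
from thop import profile  # pip install thop; assumed equivalent of model_profile.profile

net = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.AdaptiveAvgPool2d(1),
                    nn.Flatten(), nn.Linear(16, 10))
dummy_input_tensor = torch.zeros((1, 3, 224, 224))
total_ops, total_params = profile(net, inputs=(dummy_input_tensor,))
print("#Ops: %f GOps" % (total_ops / 1e9))
print("#Parameters: %f M" % (total_params / 1e6))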