def train(x, lr, iters):
     tic = time()
     t = 1
     v = x.zeros_like()
     sqr = x.zeros_like()
     optim = optimizer.Adam(learning_rate=lr)
     for idx in range(iters):
         with autograd.record():
             loss = go(x)
         loss.backward()
         optim.update(t, x, x.grad, [sqr, v])
         nd.waitall()  # TODO: this forces a full synchronization and is expensive
         t = t + 1
         sys.stdout.write('\r training..........%s%%' %
                          (100 * idx // iters + 1))
         sys.stdout.flush()
     print("      all_train_time:", time() - tic)
     return x
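# A minimal sketch of the same Adam step using MXNet's own optimizer state
# instead of the hand-allocated v/sqr buffers above (assumes MXNet 1.x;
# the tensor and gradient below are stand-ins).
import mxnet as mx
from mxnet import nd

w = nd.random.normal(shape=(3,))
adam = mx.optimizer.Adam(learning_rate=0.01)
state = adam.create_state(0, w)   # the (mean, var) buffers Adam expects
grad = nd.ones_like(w)            # stand-in gradient
adam.update(0, w, grad, state)    # one in-place Adam step on w
print(w)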
Example 2
def build_optimizer(type, lr, kerasDefaults):


    if type == 'sgd':
        if kerasDefaults['nesterov_sgd']:
            return optimizer.NAG(learning_rate=lr,
                                 momentum=kerasDefaults['momentum_sgd'],
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)
        else:
            return optimizer.SGD(learning_rate=lr,
                                 momentum=kerasDefaults['momentum_sgd'],
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)
    
    elif type == 'rmsprop':
        return optimizer.RMSProp(learning_rate=lr,
                                 gamma1=kerasDefaults['rho'],
                                 epsilon=kerasDefaults['epsilon'],
                                 centered=False,
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)

    elif type == 'adagrad':
        return optimizer.AdaGrad(learning_rate=lr,
                                 epsilon=kerasDefaults['epsilon'])#,
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'])

    elif type == 'adadelta':
        return optimizer.AdaDelta(epsilon=kerasDefaults['epsilon'],
                                  rho=kerasDefaults['rho'])#,
                                  #rescale_grad=kerasDefaults['clipnorm'],
                                  #clip_gradient=kerasDefaults['clipvalue'])

    elif type == 'adam':
        return optimizer.Adam(learning_rate=lr, beta1=kerasDefaults['beta_1'],
                              beta2=kerasDefaults['beta_2'],
                              epsilon=kerasDefaults['epsilon'])
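# Hypothetical usage sketch for build_optimizer; the kerasDefaults keys are the
# ones read above, the numeric values are only examples, and the snippet assumes
# the same "from mxnet import optimizer" import used by these examples.
kerasDefaults = {'nesterov_sgd': False, 'momentum_sgd': 0.9, 'rho': 0.9,
                 'epsilon': 1e-8, 'beta_1': 0.9, 'beta_2': 0.999}
sgd_opt = build_optimizer('sgd', 0.01, kerasDefaults)
adam_opt = build_optimizer('adam', 0.001, kerasDefaults)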
def cifar_mxnet_objective(config):
    #net = MXNet_AlexNet(config)
    net = gluoncv.model_zoo.get_model('alexnet',
                                      classes=1000,
                                      pretrained=False)
    gpus = mx.test_utils.list_gpus()
    ctx = [mx.gpu(0)] if gpus else [mx.cpu(0)]
    net.initialize(mx.init.Uniform(scale=1), ctx=ctx)
    optim = optimizer.Adam(learning_rate=config['learning_rate'])
    trainer = gluon.Trainer(net.collect_params(), optim)

    train_data = gluon.data.DataLoader(vision.datasets.CIFAR100(
        train=True, transform=transform),
                                       batch_size=config['batch_size'],
                                       shuffle=False)
    val_data = gluon.data.DataLoader(vision.datasets.CIFAR100(
        train=False, transform=transform),
                                     batch_size=config['batch_size'],
                                     shuffle=False)

    #     # Use Accuracy as the evaluation metric.
    #     metric = mx.metric.Accuracy()
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()

    for epoch in tqdm(range(config['epochs'])):
        for data, label in train_data:
            # forward + backward
            with ag.record():
                output = net(data)
                loss = criterion(output, label)
            loss.backward()
            # update parameters
            trainer.step(config['batch_size'])

    # Evaluate on Validation data
    name, val_acc = test(ctx, val_data, net)
    return val_acc, net
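# The loop above feeds batches on whatever context the DataLoader returns; when
# training on a GPU context, batches are usually copied over explicitly first.
# A minimal standalone sketch of that copy (falls back to CPU so it runs anywhere):
import mxnet as mx
from mxnet import nd

ctx0 = mx.gpu(0) if mx.test_utils.list_gpus() else mx.cpu(0)
batch = nd.random.uniform(shape=(8, 3, 32, 32))
batch_on_ctx = batch.as_in_context(ctx0)   # explicit device copy
print(batch_on_ctx.context)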
Example 4
def main(args):
    _seed = 727
    random.seed(_seed)
    np.random.seed(_seed)
    mx.random.seed(_seed)
    ctx = []
    #   cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    #   if len(cvd)>0:
    #     for i in range(len(cvd.split(','))):
    #       ctx.append(mx.gpu(i))
    #   if len(ctx)==0:
    #     ctx = [mx.cpu()]
    #     print('use cpu')
    #   else:
    #     print('gpu num:', len(ctx))
    ctx = [mx.cpu()]
    args.ctx_num = len(ctx)

    args.batch_size = args.per_batch_size * args.ctx_num
    config.per_batch_size = args.per_batch_size

    print('Call with', args, config)
    train_iter = FaceSegIter(
        path_imgrec=os.path.join(config.dataset_path, 'train.rec'),
        batch_size=args.batch_size,
        per_batch_size=args.per_batch_size,
        aug_level=1,
        exf=args.exf,
        args=args,
    )

    data_shape = train_iter.get_data_shape()
    #label_shape = train_iter.get_label_shape()
    sym = sym_heatmap.get_symbol(num_classes=config.num_classes)
    if len(args.pretrained) == 0:
        #data_shape_dict = {'data' : (args.per_batch_size,)+data_shape, 'softmax_label' : (args.per_batch_size,)+label_shape}
        data_shape_dict = train_iter.get_shape_dict()
        arg_params, aux_params = sym_heatmap.init_weights(sym, data_shape_dict)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        #sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        label_names=train_iter.get_label_names(),
    )
    #lr = 1.0e-3
    #lr = 2.5e-4
    _rescale_grad = 1.0 / args.ctx_num
    #_rescale_grad = 1.0/args.batch_size
    #lr = args.lr
    #opt = optimizer.Nadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0)
    if args.optimizer == 'onadam':
        opt = ONadam(learning_rate=args.lr,
                     wd=args.wd,
                     rescale_grad=_rescale_grad,
                     clip_gradient=5.0)
    elif args.optimizer == 'nadam':
        opt = optimizer.Nadam(learning_rate=args.lr,
                              rescale_grad=_rescale_grad)
    elif args.optimizer == 'rmsprop':
        opt = optimizer.RMSProp(learning_rate=args.lr,
                                rescale_grad=_rescale_grad)
    elif args.optimizer == 'adam':
        opt = optimizer.Adam(learning_rate=args.lr, rescale_grad=_rescale_grad)
    else:
        opt = optimizer.SGD(learning_rate=args.lr,
                            momentum=0.9,
                            wd=args.wd,
                            rescale_grad=_rescale_grad)
    initializer = mx.init.Xavier(rnd_type='gaussian',
                                 factor_type="in",
                                 magnitude=2)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)
    _metric = LossValueMetric()
    #_metric = NMEMetric()
    #_metric2 = AccMetric()
    #eval_metrics = [_metric, _metric2]
    eval_metrics = [_metric]
    lr_steps = [int(x) for x in args.lr_step.split(',')]
    print('lr-steps', lr_steps)
    global_step = [0]

    def val_test():
        all_layers = sym.get_internals()
        vsym = all_layers['heatmap_output']
        vmodel = mx.mod.Module(symbol=vsym, context=ctx, label_names=None)
        #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))])
        vmodel.bind(data_shapes=[('data', (args.batch_size, ) + data_shape)])
        arg_params, aux_params = model.get_params()
        vmodel.set_params(arg_params, aux_params)
        for target in config.val_targets:
            _file = os.path.join(config.dataset_path, '%s.rec' % target)
            if not os.path.exists(_file):
                continue
            val_iter = FaceSegIter(
                path_imgrec=_file,
                batch_size=args.batch_size,
                #batch_size = 4,
                aug_level=0,
                args=args,
            )
            _metric = NMEMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            val_iter.reset()
            for i, eval_batch in enumerate(val_iter):
                #print(eval_batch.data[0].shape, eval_batch.label[0].shape)
                batch_data = mx.io.DataBatch(eval_batch.data)
                model.forward(batch_data, is_train=False)
                model.update_metric(val_metric, eval_batch.label)
            nme_value = val_metric.get_name_value()[0][1]
            print('[%d][%s]NME: %f' % (global_step[0], target, nme_value))

    def _batch_callback(param):
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == _lr:
                opt.lr *= 0.2
                print('lr change to', opt.lr)
                break
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        if mbatch > 0 and mbatch % args.verbose == 0:
            val_test()
            if args.ckpt == 1:
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg,
                                         aux)
        if mbatch == lr_steps[-1]:
            if args.ckpt == 2:
                #msave = mbatch//args.verbose
                msave = 1
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg,
                                         aux)
            sys.exit(0)

    train_iter = mx.io.PrefetchingIter(train_iter)

    model.fit(
        train_iter,
        begin_epoch=0,
        num_epoch=9999,
        #eval_data          = val_iter,
        eval_data=None,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=None,
    )
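# The manual "opt.lr *= 0.2" schedule in _batch_callback above can also be expressed
# with an MXNet MultiFactorScheduler (used elsewhere in these examples); a minimal
# sketch with hypothetical step values:
import mxnet as mx

sched = mx.lr_scheduler.MultiFactorScheduler(step=[20000, 30000, 40000], factor=0.2)
opt_sched = mx.optimizer.SGD(learning_rate=0.01, lr_scheduler=sched)
print(sched(25000))   # learning rate after the first drop: 0.01 * 0.2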
Example 5
def main(args):
    _seed = 727
    random.seed(_seed)
    np.random.seed(_seed)
    mx.random.seed(_seed)
    ctx = []
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    #ctx = [mx.gpu(0)]
    args.ctx_num = len(ctx)

    args.batch_size = args.per_batch_size * args.ctx_num
    config.per_batch_size = args.per_batch_size

    print('Call with', args, config)
    train_iter = FaceSegIter(
        path_imgrec=os.path.join(config.dataset_path, 'train.rec'),
        batch_size=args.batch_size,
        per_batch_size=args.per_batch_size,
        aug_level=1,
        exf=args.exf,
        args=args,
    )

    data_shape, data_size = train_iter.get_data_shape()
    #label_shape = train_iter.get_label_shape()
    sym = eval(config.network).get_symbol(num_classes=config.num_classes)
    if len(args.pretrained) == 0:
        #data_shape_dict = {'data' : (args.per_batch_size,)+data_shape, 'softmax_label' : (args.per_batch_size,)+label_shape}
        data_shape_dict = train_iter.get_shape_dict()
        arg_params, aux_params = init_weights(sym, data_shape_dict)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        #sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        label_names=train_iter.get_label_names(),
    )
    #lr = 1.0e-3
    #lr = 2.5e-4
    _rescale_grad = 1.0 / args.ctx_num
    #_rescale_grad = 1.0/args.batch_size
    #lr = args.lr
    #opt = optimizer.Nadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0)
    if args.optimizer == 'onadam':
        opt = ONadam(learning_rate=args.lr,
                     wd=args.wd,
                     rescale_grad=_rescale_grad,
                     clip_gradient=5.0)
    elif args.optimizer == 'nadam':
        opt = optimizer.Nadam(learning_rate=args.lr,
                              rescale_grad=_rescale_grad)
    elif args.optimizer == 'rmsprop':
        opt = optimizer.RMSProp(learning_rate=args.lr,
                                rescale_grad=_rescale_grad)
    elif args.optimizer == 'adam':
        opt = optimizer.Adam(learning_rate=args.lr, rescale_grad=_rescale_grad)
    else:
        opt = optimizer.SGD(learning_rate=args.lr,
                            momentum=0.9,
                            wd=args.wd,
                            rescale_grad=_rescale_grad)
    initializer = mx.init.Xavier(rnd_type='gaussian',
                                 factor_type="in",
                                 magnitude=2)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)
    _metric = LossValueMetric()
    #_metric = NMEMetric()
    #_metric2 = AccMetric()
    #eval_metrics = [_metric, _metric2]
    eval_metrics = [_metric]
    lr_epoch_steps = [int(x) for x in args.lr_epoch_step.split(',')]
    print('lr-epoch-steps', lr_epoch_steps)

    global_step = [0]
    highest_acc = [1.0, 1.0]

    def _batch_callback(param):
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        mepoch = mbatch * args.batch_size // data_size
        pre = mbatch * args.batch_size % data_size
        is_highest = False
        for _lr in lr_epoch_steps[0:-1]:
            if mepoch == _lr and pre < args.batch_size:
                opt.lr *= 0.2
                print('lr change to', opt.lr)
                break
        if mbatch % 1000 == 0:
            print('lr:', opt.lr, 'batch:', param.nbatch, 'epoch:', param.epoch)
        if mbatch > 0 and mbatch % args.verbose == 0:
            acc_list = val_test(sym, model, ctx, data_shape, global_step)
            score = np.mean(acc_list)
            if acc_list[0] < highest_acc[0]:  # ibug
                is_highest = True
                highest_acc[0] = acc_list[0]
            if score < highest_acc[1]:  # mean
                is_highest = True
                highest_acc[1] = score
            if args.ckpt == 1 and is_highest == True:
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg,
                                         aux)
        if mepoch == lr_epoch_steps[-1]:
            if args.ckpt == 1:
                acc_list = val_test(sym, model, ctx, data_shape, global_step)
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg,
                                         aux)
            sys.exit(0)

    train_iter = mx.io.PrefetchingIter(train_iter)

    model.fit(
        train_iter,
        begin_epoch=0,
        num_epoch=9999,
        #eval_data          = val_iter,
        eval_data=None,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=None,
    )
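# A worked sketch of the epoch-boundary test used in _batch_callback above;
# data_size and batch_size are hypothetical. The lr drop fires exactly once,
# on the first batch whose sample count crosses into the target epoch.
data_size, batch_size = 10000, 64
for mbatch in range(1, 400):
    mepoch = mbatch * batch_size // data_size
    pre = mbatch * batch_size % data_size
    if mepoch == 2 and pre < batch_size:
        print('epoch 2 starts at global batch', mbatch)   # prints 313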
Example 6
def train_net(args):
    # decide whether to use GPU or CPU
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))

    # prefix for saving the model
    prefix = os.path.join(args.models_root,
                          '%s-%s-%s' % (args.network, args.loss, args.dataset),
                          'model')
    # directory where the model is saved
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)

    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)

    # number of GPUs (contexts)
    args.ctx_num = len(ctx)

    # total batch size across all devices
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0

    args.image_channel = config.image_shape[2]
    config.batch_size = args.batch_size
    # per-GPU batch size
    config.per_batch_size = args.per_batch_size

    # training data directory
    data_dir = config.dataset_path
    path_imgrec = None
    path_imglist = None

    # image size and sanity checks
    image_size = config.image_shape[0:2]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    print('image_size', image_size)

    # number of identities (classes) in the dataset
    print('num_classes', config.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args, config)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0

    # if no pretrained model is given, initialize the weights
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym = get_symbol(args)  # build the network symbol
        if config.net_name == 'spherenet':
            data_shape_dict = {'data': (args.per_batch_size, ) + data_shape}
            spherenet.init_weights(sym, data_shape_dict, args.num_layers)
    else:  # otherwise load the pretrained model
        print('loading', args.pretrained, args.pretrained_epoch)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            args.pretrained, args.pretrained_epoch)
        sym = get_symbol(args)

    # count the network's floating-point operations (FLOPs)
    if config.count_flops:
        all_layers = sym.get_internals()
        _sym = all_layers['fc1_output']
        FLOPs = flops_counter.count_flops(_sym,
                                          data=(1, 3, image_size[0],
                                                image_size[1]))
        _str = flops_counter.flops_str(FLOPs)
        print('Network FLOPs: %s' % _str)

    # label_name = 'softmax_label'
    # label_shape = (args.batch_size,)
    model = mx.mod.Module(
        context=ctx,  # use the devices detected above
        symbol=sym,
    )
    val_dataiter = None

    # Build the data iterator; the triplet and softmax losses take different input iterators (how they differ is analyzed in a later section)
    if config.loss_name.find('triplet') >= 0:
        from triplet_image_iter import FaceImageIter
        triplet_params = [
            config.triplet_bag_size, config.triplet_alpha,
            config.triplet_max_ap
        ]
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            mean=mean,
            cutoff=config.data_cutoff,
            ctx_num=args.ctx_num,
            images_per_identity=config.images_per_identity,
            triplet_params=triplet_params,
            mx_model=model,
        )
        _metric = LossValueMetric()
        eval_metrics = [mx.metric.create(_metric)]
    else:
        from image_iter import FaceImageIter
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            mean=mean,
            cutoff=config.data_cutoff,
            color_jittering=config.data_color,
            images_filter=config.data_images_filter,
        )
        metric1 = AccMetric()
        eval_metrics = [mx.metric.create(metric1)]
        if config.ce_loss:
            metric2 = LossValueMetric()
            eval_metrics.append(mx.metric.create(metric2))

    # weight initialization
    if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  # resnet style
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    # initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    _rescale = 1.0 / args.ctx_num
    #opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)
    opt = optimizer.Adam(learning_rate=args.lr,
                         wd=args.wd,
                         rescale_grad=_rescale)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)

    # load all verification (test) datasets
    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    # run evaluation on the verification sets
    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            # print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    # highest accuracy so far
    highest_acc = [0.0, 0.0]  # lfw and target

    # for i in range(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        # global global_step

        global_step[0] += 1
        mbatch = global_step[0]
        # reduce the learning rate to one tenth
        for step in lr_steps:
            if mbatch == step:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        #print(param)
        _cb(param)
        # print every 1000 batches
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

        # periodic evaluation and checkpointing
        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False

            # if there are verification sets
            print('-' * 50)
            print(acc_list)
            if len(acc_list) > 0:
                # lfw_score = acc_list[0]
                # if lfw_score>highest_acc[0]:
                #  highest_acc[0] = lfw_score
                #  if lfw_score>=0.998:
                #    do_save = True
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        #print('is_highest = True')
                        is_highest = True
                    else:
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
                    # if lfw_score>=0.99:
                    #  do_save = True
            if is_highest:
                do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3:
                msave = 1

            # save the model
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                if config.ckpt_embedding:
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {}
                    for k in arg:
                        if not k.startswith('fc7'):
                            _arg[k] = arg[k]
                    mx.model.save_checkpoint(prefix, msave, _sym, _arg, aux)
                else:
                    mx.model.save_checkpoint(prefix, msave, model.symbol, arg,
                                             aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if config.max_steps > 0 and mbatch > config.max_steps:
            sys.exit(0)

    epoch_cb = None
    # wrap train_dataiter in an mx.io.PrefetchingIter

    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=999999,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore=args.kvstore,
        optimizer=opt,
        # optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
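# A self-contained sketch of the ckpt_embedding branch above: keep only the
# sub-graph up to fc1 and drop the fc7 classification weights before saving.
# The toy symbol and file prefix below are hypothetical.
import mxnet as mx

data = mx.sym.Variable('data')
fc1 = mx.sym.FullyConnected(data, num_hidden=128, name='fc1')
fc7 = mx.sym.FullyConnected(fc1, num_hidden=10, name='fc7')
out = mx.sym.SoftmaxOutput(fc7, name='softmax')

mod = mx.mod.Module(symbol=out, data_names=['data'], label_names=['softmax_label'])
mod.bind(data_shapes=[('data', (1, 32))], label_shapes=[('softmax_label', (1,))])
mod.init_params()

arg, aux = mod.get_params()
embedding_sym = mod.symbol.get_internals()['fc1_output']
arg_no_fc7 = {k: v for k, v in arg.items() if not k.startswith('fc7')}
mx.model.save_checkpoint('embedding-model', 0, embedding_sym, arg_no_fc7, aux)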
Example 7
    def train(self, model: mdn_rnn):
        """
        Trains a given model on data. Applies truncated BPTT

        :param model (mdn_rnn) 
            the model to be trained

        :param data ((nd.array(float), nd.array(float)) 
            The training data. In our case, this contains an array of hidden states, and an array of actions. 
            The hidden states are of shape [n_episodes, n_timesteps_per_episode, z_dim]
            The actions are of shape [n_episodes, n_timesteps_per_episode, a_dim]

        :param n_epochs (int)
            number of epochs to train

        :return test (int)
            This is a testr
        return:
        model:(mdn_rnn) trained mdn_rnn object
        negative_log_likelihoods: (nd.array(float)) the training losses
        """

        retain_graph = self.args.k1 < self.args.k2
        optim = optimizer.Adam(learning_rate=self.args.rnn_lr)
        trainer = gluon.Trainer(model.collect_params(), optim)
        # losses = np.zeros((self.args.rnn_rounds, 500))
        for epo in range(self.args.rnn_rounds):
            input_data, output_data = self.get_single_rollout()
            observations = input_data.shape[0] - self.args.k2
            hidden_states = [(nd.zeros(
                (1, model.RNN.h_dim)), nd.zeros((1, model.RNN.c_dim)))]
            gc.collect()
            # epo_loss = nd.zeros(observations)
            for t in range(observations):

                print(f"Epoch {epo},  timestep {t}")

                # Re-use previously computed states
                h_cur, c_cur = hidden_states[t]
                za_t = input_data[t]
                z_tplusone = output_data[t]

                with autograd.record():

                    # Model the new prediction, and get updated hidden and output states
                    pz, h_cur, c_cur = model(za_t[None, :], h_cur, c_cur)

                    # Store the hidden states to re-use them later
                    hidden_states.append((h_cur.detach(), c_cur.detach()))

                    # Take k2-1 more steps
                    for j in range(self.args.k2 - 1):

                        # Get new input and target
                        za_t = input_data[t + j + 1]
                        z_tplusone = output_data[t + j + 1]

                        # Make new prediction
                        pz, h_cur, c_cur = model(za_t[None, :], h_cur, c_cur)

                    neg_log_prob = -pz.log_prob(z_tplusone)

                # Do backprop on the current output
                neg_log_prob.backward(retain_graph=retain_graph)

                trainer.step(1, ignore_stale_grad=False)
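# A minimal truncated-BPTT sketch on a plain gluon LSTMCell (not the mdn_rnn
# above): detaching the carried state stops gradients at the truncation
# boundary, mirroring the detached hidden_states[t] re-used in train().
import mxnet as mx
from mxnet import nd, autograd, gluon

cell = gluon.rnn.LSTMCell(hidden_size=8)
cell.initialize()
trainer = gluon.Trainer(cell.collect_params(), 'adam', {'learning_rate': 1e-3})

states = cell.begin_state(batch_size=1)
for t in range(20):
    x = nd.random.normal(shape=(1, 4))        # stand-in input step
    with autograd.record():
        out, states = cell(x, states)
        loss = (out ** 2).mean()              # stand-in loss
    loss.backward()
    trainer.step(1)
    states = [s.detach() for s in states]     # truncate the graph here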
Example 8
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    #args.rescale_threshold = 0
    args.image_channel = 3

    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]
    # path_imgrec = None
    # path_imglist = None
    args.num_classes = 0
    image_size = (64, 64)
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    # if args.network[0]=='s':
    #   data_shape_dict = {'data' : (args.per_batch_size,)+data_shape}
    #   spherenet.init_weights(sym, data_shape_dict, args.num_layers)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(context=ctx,
                          symbol=sym,
                          data_names=['data'],
                          label_names=['label_gender'])
    print(data_shape)
    train_dataiter = SSR_ITER(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        mean=mean,
    )
    val_rec = os.path.join(data_dir, "val.rec")
    val_iter = None
    if os.path.exists(val_rec):
        val_iter = SSR_ITER(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=val_rec,
            shuffle=False,
            mean=mean,
        )
    print(train_dataiter.provide_label)

    initializer = mx.init.Xavier(rnd_type='uniform',
                                 factor_type="in",
                                 magnitude=2)
    # initializer = mx.init.Xavier(rnd_type='uniform')

    _rescale = 1.0 / args.ctx_num
    # opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    opt = optimizer.Adam(learning_rate=base_lr, rescale_grad=_rescale)
    #opt = optimizer.SGD(learning_rate=base_lr)
    #opt = optimizer.Nadam(learning_rate=base_lr, wd=base_wd, rescale_grad=_rescale)
    som = 50
    _cb = mx.callback.Speedometer(args.batch_size, som)

    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [40000, 60000, 80000]
        # if args.loss_type>=1 and args.loss_type<=7:
        #   lr_steps = [100000, 140000, 160000]
        # p = 512.0/args.batch_size
        for l in range(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l])
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt > 1:
                do_save = True
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            # print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None

    a = mx.viz.plot_network(sym,
                            shape={"data": (1, 3, 64, 64)},
                            node_attrs={
                                "shape": 'rect',
                                "fixedsize": 'false'
                            })
    a.render('xx')

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_iter,
        eval_metric='mae',
        kvstore='device',
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
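# A minimal standalone sketch of the mx.viz.plot_network call used above;
# it needs the graphviz package installed, and the toy symbol is hypothetical.
import mxnet as mx

data = mx.sym.Variable('data')
net = mx.sym.FullyConnected(data, num_hidden=64, name='fc1')
net = mx.sym.Activation(net, act_type='relu', name='relu1')
g = mx.viz.plot_network(net,
                        shape={'data': (1, 3, 64, 64)},
                        node_attrs={'shape': 'rect', 'fixedsize': 'false'})
g.render('toy_net')   # writes toy_net (graphviz source) and toy_net.pdf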
Example 9
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()

    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx), ctx, cvd)
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3

    os.environ['BETA'] = str(args.beta)
    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]
    path_imgrec = None
    path_imglist = None
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    # image_size = prop.image_size
    image_size = [int(x) for x in args.image_size.split(',')]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    assert (args.num_classes > 0)
    print('num_classes', args.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    if args.loss_type == 1 and args.num_classes > 20000:
        args.beta_freeze = 5000
        args.gamma = 0.06

    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    arg_params = None
    aux_params = None
    sym, arg_params, aux_params = get_symbol(args,
                                             arg_params,
                                             aux_params,
                                             layer_name='ms1m_fc7')
    fixed_args = [n for n in sym.list_arguments() if 'fc7' in n]

    # sym.get_internals()
    # sym.list_arguments()
    # sym.list_auxiliary_states()
    # sym.list_inputs()
    # sym.list_outputs()

    # label_name = 'softmax_label'
    # label_shape = (args.batch_size,)
    # arg_params['glint_fc7_weight'] = arg_params['fc7_weight'].copy()
    # arg_params['ms1m_fc7_weight'] = arg_params['glint_fc7_weight'].copy()
    assert 'ms1m_fc7_weight' in arg_params
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        fixed_param_names=fixed_args,
    )
    val_dataiter = None

    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        rand_mirror=args.rand_mirror,
        mean=mean,
        cutoff=args.cutoff,
        color_jittering=args.color,
        images_filter=args.images_filter,
    )

    metric1 = AccMetric()
    eval_metrics = [mx.metric.create(metric1)]
    if args.ce_loss:
        metric2 = LossValueMetric()
        eval_metrics.append(mx.metric.create(metric2))

    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  # resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  # inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    # initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    _rescale = 1.0 / args.ctx_num
    # opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    logging.info(f'base lr {base_lr}')
    opt = optimizer.Adam(
        learning_rate=base_lr,
        wd=base_wd,
        rescale_grad=_rescale,
    )
    som = 20
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            # print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    # ver_test( 0 )
    highest_acc = [0.0, 0.0]  # lfw and target
    # for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]

    if len(args.lr_steps) == 0:
        lr_steps = [40000, 60000, 80000]
        if args.loss_type >= 1 and args.loss_type <= 7:
            lr_steps = [100000, 140000, 160000]
        p = 512.0 / args.batch_size
        for l in range(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l] * p)
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        # global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == args.beta_freeze + _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch: lr ', opt.lr, 'nbatch ', param.nbatch,
                  'epoch ', param.epoch, 'mbatch ', mbatch, 'lr_step',
                  lr_steps)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if len(acc_list) > 0:
                # lfw_score = acc_list[0]
                # if lfw_score>highest_acc[0]:
                #  highest_acc[0] = lfw_score
                #  if lfw_score>=0.998:
                #    do_save = True
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        is_highest = True
                    else:
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
                    # if lfw_score>=0.99:
                    #  do_save = True
            if is_highest:
                do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3:
                msave = 1

            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)

            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if mbatch <= args.beta_freeze:
            _beta = args.beta
        else:
            move = max(0, mbatch - args.beta_freeze)
            _beta = max(
                args.beta_min,
                args.beta * math.pow(1 + args.gamma * move, -1.0 * args.power))
        # print('beta', _beta)
        os.environ['BETA'] = str(_beta)
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)
    # model.set_params(arg_params, aux_params)
    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        # optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
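# A worked sketch of the BETA annealing applied in _batch_callback above;
# beta, beta_min, gamma, power and beta_freeze take hypothetical values here.
import math

beta, beta_min, gamma, power, beta_freeze = 1000.0, 5.0, 0.12, 1.0, 5000
for mbatch in (0, 5000, 10000, 50000, 100000):
    move = max(0, mbatch - beta_freeze)
    _beta = beta if mbatch <= beta_freeze else max(
        beta_min, beta * math.pow(1 + gamma * move, -1.0 * power))
    print(mbatch, round(_beta, 3))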
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
      for i in range(len(cvd.split(','))):
        ctx.append(mx.gpu(i))
    if len(ctx)==0:
      ctx = [mx.cpu()]
      print('use cpu')
    else:
      print('gpu num:', len(ctx))
    prefix = os.path.join(args.models_root, '%s-%s-%s'%(args.network, args.loss, args.dataset), 'model')
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
      os.makedirs(prefix_dir)
    args.ctx_num = len(ctx)  #GPU num
    args.batch_size = args.per_batch_size*args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = config.image_shape[2]
    config.batch_size = args.batch_size
    config.per_batch_size = args.per_batch_size

    data_dir = config.dataset_path
    path_imgrec = None
    path_imglist = None
    image_size = config.image_shape[0:2]
    assert len(image_size)==2
    assert image_size[0]==image_size[1]
    print('image_size', image_size)
    print('num_classes', config.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args, config)
    data_shape = (args.image_channel,image_size[0],image_size[1]) # chw
    mean = None #[127.5,127.5,127.5]
    


    begin_epoch = 0
    if len(args.pretrained)==0:
      arg_params = None
      aux_params = None
      sym = get_symbol(args)  
      if config.net_name=='spherenet':
        data_shape_dict = {'data' : (args.per_batch_size,)+data_shape}
        spherenet.init_weights(sym, data_shape_dict, args.num_layers)
    else:  # load the pretrained models; here sym comes from two_sym(args) rather than get_symbol(args)

      sym,sym_high,arg_params,aux_params,t_arg_params, t_aux_params = two_sym(args)
      d_sym = discriminator(args)

      
            
    config.count_flops = False  # added: skip the FLOPs count below
    if config.count_flops:  #true
      all_layers = sym.get_internals()
      _sym = all_layers['fc1_output']  # fc1: the 128-d embedding output of the network
      FLOPs = flops_counter.count_flops(_sym, data=(1,3,image_size[0],image_size[1]))
      _str = flops_counter.flops_str(FLOPs)
      print('Network FLOPs: %s'%_str)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)

    val_dataiter = None

    if config.loss_name.find('triplet')>=0:
      from triplet_image_iter import FaceImageIter
      triplet_params = [config.triplet_bag_size, config.triplet_alpha, config.triplet_max_ap]
      train_dataiter = FaceImageIter(
          batch_size           = args.batch_size,
          data_shape           = data_shape,
          path_imgrec          = path_imgrec,
          shuffle              = True,
          rand_mirror          = config.data_rand_mirror,
        #   rand_resize          = True, # added to vary the image resolution
          mean                 = mean,
          cutoff               = config.data_cutoff,
          ctx_num              = args.ctx_num,
          images_per_identity  = config.images_per_identity,
          triplet_params       = triplet_params,
          mx_model             = model,
      )
      _metric = LossValueMetric()
      eval_metrics = [mx.metric.create(_metric)]
    else:
      from distribute_image_iter import FaceImageIter

      train_dataiter_low = FaceImageIter(  # yields (img, label) batches; low-resolution counterpart of train_dataiter_high
          batch_size           = args.batch_size,
          data_shape           = data_shape,
          path_imgrec          = path_imgrec,
          shuffle              = True,
          rand_mirror          = config.data_rand_mirror, #true
          rand_resize          = True, # added to vary the image resolution
          mean                 = mean,
          cutoff               = config.data_cutoff,  #0
          color_jittering      = config.data_color,  #0
          images_filter        = config.data_images_filter, #0
      )
      source_imgrec = os.path.join("/home/svt/mxnet_recognition/dataes/faces_glintasia","train.rec")
      data2 = FaceImageIter(  # yields (img, label) batches from the source dataset
          batch_size           = args.batch_size,
          data_shape           = data_shape,
          path_imgrec          = source_imgrec,
          shuffle              = True,
          rand_mirror          = config.data_rand_mirror, #true
          rand_resize          = False, # added to vary the image resolution
          mean                 = mean,
          cutoff               = config.data_cutoff,  #0
          color_jittering      = config.data_color,  #0
          images_filter        = config.data_images_filter, #0
      )
      metric1 = AccMetric()  # accuracy metric
      eval_metrics = [mx.metric.create(metric1)]
      if config.ce_loss:  #is True
        metric2 = LossValueMetric()  # loss value metric
        eval_metrics.append( mx.metric.create(metric2) )  #

    if config.net_name=='fresnet' or config.net_name=='fmobilefacenet':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    else:
      initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    _rescale = 1.0/args.ctx_num
    #opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)
    opt = optimizer.Adam(learning_rate=0.0001, beta1=0.5, beta2=0.9, epsilon=1e-08)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)

    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
      path = os.path.join(data_dir,name+".bin")
      if os.path.exists(path):
        data_set = verification.load_bin(path, image_size)
        ver_list.append(data_set)
        ver_name_list.append(name)
        print('ver', name)



    def ver_test(nbatch):
      results = []
      for i in range(len(ver_list)):
        acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], model, args.batch_size, 10, None, None)
        print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
        #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
        print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc2, std2))
        results.append(acc2)
      return results



    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    high_save = 0  # added: fixed checkpoint index reused for the best model
    print('lr_steps', lr_steps)
    def _batch_callback(param):
      #global global_step
      global_step[0]+=1
      mbatch = global_step[0]
      for step in lr_steps:
        if mbatch==step:
          opt.lr *= 0.1
          print('lr change to', opt.lr)
          break

      _cb(param)
      if mbatch%1000==0:
        print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)
      
      if mbatch %4000==0:#(fc7_save):
          name=os.path.join(args.models_root, '%s-%s-%s'%(args.network, args.loss, args.dataset), 'modelfc7')
          arg, aux = model.get_params()
          mx.model.save_checkpoint(name, param.epoch, model.symbol, arg, aux)
          print('save model include fc7 layer')
          print("mbatch",mbatch)
      
      me_msave=0
      if mbatch>=0 and mbatch%args.verbose==0:  # default.verbose = 2000
        acc_list = ver_test(mbatch)
        save_step[0]+=1
        msave = save_step[0]  # with batch size 512, one epoch is roughly 1300 batches
        me_msave=me_msave+1
        do_save = False
        is_highest = False
        # added: save2 flags an extra checkpoint when the LFW score is high
        save2 = False
        if len(acc_list)>0:
          lfw_score = acc_list[0]
          if lfw_score>highest_acc[0]:
            highest_acc[0] = lfw_score
            if lfw_score>=0.9960:
              save2 = True
              
          score = sum(acc_list)
          if acc_list[-1]>=highest_acc[-1]:
            if acc_list[-1]>highest_acc[-1]:
              is_highest = True
            else:
              if score>=highest_acc[0]:
                is_highest = True
                highest_acc[0] = score
            highest_acc[-1] = acc_list[-1]
            #if lfw_score>=0.99:
            #  do_save = True
        # if is_highest:
          # do_save = True
        if args.ckpt==0:
          do_save = False
        elif args.ckpt==2:
          do_save = True
        elif args.ckpt==3 and is_highest:  #me add and is_highest
          high_save = 0   # always save the best-LFW model under the same index, replacing the previous one

        if do_save:  # save the current best parameters
          print('saving high pretrained-epoch always:  ', high_save)
          arg, aux = model.get_params()
          if config.ckpt_embedding:  #true
            all_layers = model.symbol.get_internals()
            _sym = all_layers['fc1_output']
            _arg = {}
            for k in arg:
              if not k.startswith('fc7'):  # skip params starting with fc7 (drop the final classification layer)
                _arg[k] = arg[k]
            mx.model.save_checkpoint(prefix, high_save, _sym, _arg, aux)  # save under the prefix; the saved params only go up to fc1 (the 128-d embedding)
          else:
            mx.model.save_checkpoint(prefix, high_save, model.symbol, arg, aux)
          print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))
          
        if save2:
          arg, aux = model.get_params()
          if config.ckpt_embedding:  #true
            all_layers = model.symbol.get_internals()
            _sym = all_layers['fc1_output']
            _arg = {}
            for k in arg:
              if not k.startswith('fc7'):  # skip params starting with fc7 (drop the final classification layer)
                _arg[k] = arg[k]
            mx.model.save_checkpoint(prefix, (me_msave), _sym, _arg, aux)  # save under the prefix; the saved params only go up to fc1 (the 128-d embedding)
          else:
            mx.model.save_checkpoint(prefix, (me_msave), model.symbol, arg, aux)
          print("save pretrained-epoch :param.epoch + me_msave",param.epoch,me_msave)
          print('[%d]LFW Accuracy>=0.9960: %1.5f'%(mbatch, highest_acc[-1])) # mbatch counts global batches (roughly 13000 per epoch here)
    
      if config.max_steps>0 and mbatch>config.max_steps:
        sys.exit(0)
        
    ###########################################################################
   
    
    
    epoch_cb = None
    train_dataiter_low = mx.io.PrefetchingIter(train_dataiter_low) # multi-threaded prefetching iterator
    data2 = mx.io.PrefetchingIter(data2)  # multi-threaded prefetching iterator

    # Build the models: bind data/label shapes (this allocates device memory),
    # initialize the network parameters, then run the training loop.
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(step=[100, 200, 300], factor=0.1)
    optimizer_params = {'learning_rate':0.01,
                    'momentum':0.9,
                    'wd':0.0005,
                    # 'lr_scheduler':lr_scheduler,
                    "rescale_grad":_rescale}  #���ݶȽ�����ƽ��
    ######################################################################
    # # Teacher network
    data_shapes = [('data', (args.batch_size, 3, 112, 112))]  # the teacher model only needs data, no labels
    t_module = mx.module.Module(symbol=sym_high, context=ctx, label_names=[])
    t_module.bind(data_shapes=data_shapes, for_training=False, grad_req='null')
    t_module.set_params(arg_params=t_arg_params, aux_params=t_aux_params)
    t_model=t_module
    ######################################################################
    ## Student network
    label_shapes = [('softmax_label', (args.batch_size, ))]
    model = mx.mod.Module(
    context       = ctx,
    symbol        = sym,
    label_names=[]
    # data_names    =  # defaults to 'data' plus 'softmax_label'; any extra label names must be passed in explicitly
    )
    # The student network needs both data and labels for training.
    # The teacher network only needs data (no labels); its outputs serve as the targets.
    # print (train_dataiter_low.provide_data)
    # print ((train_dataiter_low.provide_label))
    #opt_d = optimizer.SGD(learning_rate=args.lr*0.01, momentum=args.mom, wd=args.wd, rescale_grad=_rescale) ##lr e-5
    opt_d = optimizer.Adam(learning_rate=0.0001, beta1=0.5, beta2=0.9, epsilon=1e-08)
    model.bind(data_shapes=data_shapes,for_training=True) # no label_shapes: the label input has been removed
    model.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                         allow_missing=True)  # missing parameters are filled in by the initializer
    # model.init_optimizer(kvstore=args.kvstore,optimizer='sgd', optimizer_params=(optimizer_params))
    model.init_optimizer(kvstore=args.kvstore,optimizer=opt_d)
    # metric = eval_metrics  # list of accuracy/loss metrics
    ##########################################################################
    ## Discriminator network
    # The module must be bound before it can be used.
    model_d = mx.module.Module(symbol=d_sym, context=ctx,data_names=['data'], label_names=['softmax_label'])
    data_shapes = [('data', (args.batch_size*2,512))]
    label_shapes = [('softmax_label', (args.batch_size*2,))]  # bind fixes the batch size here; the module can be rebound later if needed
    model_d.bind(data_shapes=data_shapes,label_shapes = label_shapes,inputs_need_grad=True)
    model_d.init_params(initializer=initializer)
    model_d.init_optimizer(kvstore=args.kvstore,optimizer=opt) # discriminator optimizer (may need tuning)
    ## Metrics computed on the discriminator outputs.
    metric_d = AccMetric_d()  # accuracy metric; AccMetric_d is added in metric.py and works on the softmax output
    eval_metrics_d = [mx.metric.create(metric_d)]
    metric2_d = LossValueMetric_d()  # loss metric; LossValueMetric_d is added in metric.py and uses cross entropy
    eval_metrics_d.append( mx.metric.create(metric2_d) )
    metric_d =eval_metrics_d  # mx.metric.create('acc')  # the discriminator symbol has a single softmax output

    global_step=[0]
    batch_num=[0]
    resize_acc=[0]
    for epoch in range(0, 40):
        # if epoch==1 or epoch==2 or epoch==3:
        #     model.init_optimizer(kvstore=args.kvstore,optimizer='sgd', optimizer_params=(optimizer_params))
        if not isinstance(metric_d, mx.metric.EvalMetric):  # wrap into an EvalMetric if it is not one already
            metric_d = mx.metric.create(metric_d)
        # metric_d = mx.metric.create(metric_d)
        metric_d.reset()
        train_dataiter_low.reset()
        data2.reset()
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

        data_iter = iter(train_dataiter_low)
        data2_iter = iter(data2)
        data_len=0
        for batch in data_iter:  # batch is high
            ##   1. Run the teacher network with is_train=False and the student network with is_train=True,
            ##      then concatenate their outputs and label them 1 (teacher) / 0 (student).
            #### The teacher features plus these labels become the discriminator's input data.
            data_len +=len(batch.data[0])
            
            if len(batch.data[0])<args.batch_size:  #batch.data[0] is ����batch 
                print ("���data����batch,����")
                print ("data_len:",data_len)
                break
            if data_len >=2830147: #2830147,Ŀ���������ݳ���
                print ("һ��batch ����")
                break

            batch2 = data2_iter.next()
            t_model.forward(batch2, is_train=False)  #high data,�Լ� low_data,,�������������ݣ����ݿ��Դ�С��ͬ
            t_feat = t_model.get_outputs() # type list   batch.label,type list�����ֻ��fc1
            
            # print(batch.data[0].grad is None)  # not None; batch.data[0].detach().grad is None
            ## batch.data[0].grad is not None, while batch.data[0].detach().grad is None
            ## whether input gradients exist depends on how the module was bound; detach() removes an array from gradient computation
            ## batch.data[0]: the batch is a pair of lists [batch_data], [label], i.e. [array[BCHW]], [array[0 1 ...]]
            ## the student network produces the fake (adversarial) features
            model.forward(batch, is_train=True)  # fc1 output
            g_feat = model.get_outputs()  # outputs of the student symbol; the first element is the fc1 feature
            label_t = nd.ones((args.batch_size,))   # 1 for teacher features
            label_g = nd.zeros((args.batch_size,))  # 0 for student features
            ## concatenate teacher and student features
            label_concat = nd.concat(label_t, label_g, dim=0)
            feat_concat = nd.concat(t_feat[0], g_feat[0], dim=0)  # merge the two NDArrays; nd.L2Normalization is not needed here
            
            ### 2.1 train the discriminator on the concatenated features and update its gradients;
            ##      with is_train=True the input gradients are computed, with False they are not
            ##      (the features are detached here, so no gradient flows back into the student)
            feat_data = mx.io.DataBatch([feat_concat.detach()], [label_concat])
            model_d.forward(feat_data, is_train=True)  # forward to compute the discriminator loss
            model_d.backward()
            # print(feat_data.data[0].grad is None)  # is None
            ## keep a copy of the discriminator gradients
            gradD = [[grad.copyto(grad.context) for grad in grads] for grads in model_d._exec_group.grad_arrays]
            model_d.update()   ## gradient update
            model_d.update_metric(metric_d, [label_concat])
            
            
            ### 2.2 feed only the student features through the discriminator with flipped labels,
            ###     then pass the resulting input gradients back to update the student (the input here is one batch)
            label_g = nd.ones((args.batch_size,))  # set the label to 1 to fool the discriminator

            feat_data = mx.io.DataBatch([g_feat[0]], [label_g])  # this batch has input gradients
            model_d.forward(feat_data, is_train=True)  # is_train=True so the input gradients are computed
            model_d.backward()  ## gradients are not accumulated; a new forward overwrites the previous result


            #### 3. pass the discriminator's input gradients back into the student (generator) network
            g_grad=model_d.get_input_grads()
            model.backward(g_grad)
            model.update()

            ## training flow: feed the teacher and student outputs into the discriminator and update it,
            ## then feed the student output alone with label 1, compute the cross-entropy loss and pass the gradient back
            ## the discriminator input is the concatenation of the teacher and student outputs, labelled 1 and 0

            # gan_label = [nd.empty((args.batch_size*2,2))]  # (batch*2, 2): the two models' outputs concatenated, with 0/1 labels
            # discrim_data = [nd.empty((args.batch_size*2,512))]  # (batch*2, 512)
            # print (gan_label[0].shape)



            lr_steps = [int(x) for x in args.lr_steps.split(',')]
            global_step[0]+=1
            batch_num[0]+=1
            mbatch = global_step[0]
            for step in lr_steps:
                if mbatch==step:
                    opt.lr *= 0.1
                    opt_d.lr*=0.1
                    print('opt.lr ,opt_d.lr lr change to', opt.lr,opt_d.lr)
                    break
            
            if mbatch % 200 == 0 and mbatch > 0:  # (fc7_save)
                print('epoch %d, Training %s' % (epoch, metric_d.get()))

            if mbatch %1000==0 and mbatch >0: 
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, epoch, model.symbol, arg, aux)
                
                arg, aux = model_d.get_params()
                mx.model.save_checkpoint(prefix+"discriminator", epoch, model_d.symbol, arg, aux)
                
                top1,top10 = my_top(epoch)
                yidong_test_top1, yidong_test_top10 = my_top_yidong_test(epoch)
                if top1 >= resize_acc[0]:
                    resize_acc[0]=top1
                    # save the parameters that achieve the best accuracy
                    arg, aux = model.get_params()
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {}
                    for k in arg:
                      if not k.startswith('fc7'):  # skip params whose name starts with 'fc7' (do not save the final classification layer)
                        _arg[k] = arg[k]
                    mx.model.save_checkpoint(prefix+"_best", 1, _sym, _arg, aux)  
                    acc_list = ver_test(mbatch)
                    if len(acc_list)>0:
                        print ("LFW acc is :",acc_list[0])
 
                print("batch_num",batch_num[0],"epoch",epoch, "lr ",opt.lr)
                print('epoch %d, Training %s' % (epoch, metric_d.get()))
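################################################################################
# A standalone Gluon sketch (separate from the module-API training code above) of
# the same adversarial update cycle: the discriminator is trained to tell teacher
# features (label 1) from student features (label 0), then the student is updated
# with the labels flipped to 1. All shapes, layers and data below are illustrative
# placeholders, not the real networks or iterators used above.
import mxnet as mx
from mxnet import nd, gluon, autograd

ctx = mx.cpu()
batch_size, feat_dim = 8, 512

student = gluon.nn.Dense(feat_dim)                 # stand-in for the student backbone (fc1 features)
discrim = gluon.nn.HybridSequential()              # stand-in for the discriminator d_sym
discrim.add(gluon.nn.Dense(256, activation='relu'), gluon.nn.Dense(2))
student.initialize(ctx=ctx)
discrim.initialize(ctx=ctx)

trainer_s = gluon.Trainer(student.collect_params(), 'adam', {'learning_rate': 1e-4})
trainer_d = gluon.Trainer(discrim.collect_params(), 'adam', {'learning_rate': 1e-3})
ce = gluon.loss.SoftmaxCrossEntropyLoss()

x = nd.random.uniform(shape=(batch_size, 128), ctx=ctx)             # placeholder student input
t_feat = nd.random.uniform(shape=(batch_size, feat_dim), ctx=ctx)   # placeholder teacher features

# step 2.1: update the discriminator on fixed features, labels 1 (teacher) / 0 (student)
g_feat = student(x)                                # computed outside record(): no gradient reaches the student here
feat = nd.concat(t_feat, g_feat, dim=0)
label = nd.concat(nd.ones((batch_size,), ctx=ctx),
                  nd.zeros((batch_size,), ctx=ctx), dim=0)
with autograd.record():
    loss_d = ce(discrim(feat), label)
loss_d.backward()
trainer_d.step(batch_size * 2)

# steps 2.2 / 3: recompute the student features under record, label them 1,
# and back-propagate through the discriminator into the student
with autograd.record():
    loss_g = ce(discrim(student(x)), nd.ones((batch_size,), ctx=ctx))
loss_g.backward()
trainer_s.step(batch_size)   # only the student is stepped; the discriminator grads from this pass are discarded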
Esempio n. 11
0
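# Minimal setup (an assumption, since the top of this snippet is truncated) so the
# lines below run: a 4-input / 1-output Dense layer (so the weight prints as a 1x4
# NDArray), a dummy batch, a forward_backward() helper and an SGD trainer with
# learning_rate=1. The printed values in the comments come from the original run
# and will differ for this placeholder setup.
import mxnet as mx
from mxnet import nd, gluon, autograd, optimizer

net = gluon.nn.Dense(1, in_units=4)
net.initialize(mx.init.Xavier())
batch_size = 8
X = nd.random.uniform(shape=(batch_size, 4))

def forward_backward():
    with autograd.record():
        loss = (net(X) ** 2).sum()     # placeholder loss
    loss.backward()

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1})
curr_weight = net.weight.data().copy()
forward_backward()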

# update
trainer.step(batch_size)
print(net.weight.data())
# [[0.31892323 0.21269077 0.34669656 0.29598683]]
# <NDArray 1x4 @cpu(0)>

print(curr_weight - net.weight.data() * 1 / batch_size)
# [[ 0.02714116 -0.03028122 -0.00145487  0.00512915]]
# <NDArray 1x4 @cpu(0)>

################################################################################
# define an optimizer directly and pass it to the trainer
# e.g. using Adam: a popular adaptive optimizer for deep learning
optim = optimizer.Adam(learning_rate=1)
trainer = gluon.Trainer(net.collect_params(), optim)

# update network weights
forward_backward()
trainer.step(batch_size)
print(net.weight.data())
# [[-0.6810826  -0.7873151  -0.65330917 -0.7040191 ]]
# <NDArray 1x4 @cpu(0)>

################################################################################
# changing learning rate

print(trainer.learning_rate)
# 1
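################################################################################
# the learning rate of an existing trainer can also be changed in place;
# the value 0.1 below is just an example
trainer.set_learning_rate(0.1)
print(trainer.learning_rate)
# 0.1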
Esempio n. 12
0
    def do_train(self):

        # decide whether to use GPU or CPU
        ctx = []
        cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()

        if len(cvd) > 0:
            gpu_num_list = cvd.split(",")
            gpu_num_list_len = len(gpu_num_list)
            for i in range(gpu_num_list_len):
                ctx.append(mx.gpu(i))
                pass
            pass

        if len(ctx) == 0:
            ctx = [mx.cpu()]
            print("use cpu")
            pass
        else:
            print("gpu num: {}".format(ctx))
            pass

        assert self.face_shape[0] == self.face_shape[
            1], "face_shape[0] neq face_shape[1]"

        # batch size per GPU
        per_batch_size = self.net_info["per_batch_size"]

        ctx_count = len(ctx)

        batch_size = per_batch_size * ctx_count

        data_shape = [
            self.face_shape[2], self.face_shape[0], self.face_shape[1]
        ]

        data_shape = tuple(data_shape)

        # if a model checkpoint exists, load it; otherwise the weights will be initialized
        if os.path.exists(self.model_path):
            path_info, file_name = os.path.split(self.model_path)
            name_info, suffix_info = os.path.splitext(file_name)
            name_prefix, name_epoch = name_info.split("-")
            model_path_prefix = os.path.join(path_info, name_prefix)
            epoch_num = int(name_epoch)

            symbol, arg_params, aux_params = mx.model.load_checkpoint(
                model_path_prefix, epoch_num)

            # build the model symbol
            symbol_info = self.get_symbol()
            pass
        else:
            arg_params = None
            aux_params = None
            # build the model symbol
            symbol_info = self.get_symbol()
            pass

        # count the model's FLOPs
        if self.count_flops_flag:
            all_layers = symbol_info.get_internals()
            fc1_sym = all_layers['fc1_output']
            flops_info = flops_utils.count_flops(fc1_sym,
                                                 data=(1, data_shape[0],
                                                       data_shape[1],
                                                       data_shape[2]))
            flops_info_str = flops_utils.flops_str(flops_info)
            print("Network flops_info_str: {}".format(flops_info_str))
            pass

        model = mx.mod.Module(
            context=mx.gpu(),
            # context=ctx,
            symbol=symbol_info)

        loss_name = self.loss_info["loss_name"]

        rec_data_path = self.data_info["rec_data_path"]
        idx_data_path = self.data_info["idx_data_path"]

        bin_data_file_path = self.data_info["bin_data_file_path"]
        val_targets_list = self.data_info["val_targets"]

        # load the .bin validation data
        bin_data_list = get_data_utils.load_bin(
            bin_data_file_path=bin_data_file_path,
            bin_name_list=val_targets_list,
            image_shape=self.face_shape)

        # build the data iterator; the triplet and softmax losses use different input iterators
        # (the differences are analysed in a later section)
        if loss_name.find("triplet") >= 0:
            triplet_bag_size = self.loss_info["triplet_bag_size"]
            triplet_alpha = self.loss_info["triplet_alpha"]
            triplet_max_ap = self.loss_info["triplet_max_ap"]
            images_per_identity = self.loss_info["images_per_identity"]
            triplet_params = [triplet_bag_size, triplet_alpha, triplet_max_ap]

            train_data_iter = TripletFaceImageIter(
                rec_data_path=rec_data_path,
                idx_data_path=idx_data_path,
                batch_size=batch_size,
                data_shape=data_shape,
                shuffle_flag=True,
                rand_mirror=self.data_rand_mirror_flag,
                cutoff=self.data_crop_flag,
                ctx_num=ctx_count,
                images_per_identity=images_per_identity,
                triplet_params=triplet_params,
                mx_model=model)

            metric2 = LossValueMetric()
            eval_metrics = [mx.metric.create(metric2)]
            pass
        else:
            train_data_iter = FaceImageIter(
                rec_data_path=rec_data_path,
                idx_data_path=idx_data_path,
                batch_size=batch_size,
                data_shape=data_shape,
                shuffle_flag=True,
                rand_mirror=self.data_rand_mirror_flag,
                cutoff=self.data_crop_flag,
                color_jitter=self.data_color_aug,
                images_filter=self.data_image_filter)

            metric1 = AccMetric()
            eval_metrics = [mx.metric.create(metric1)]

            # focal loss, an improved cross-entropy loss
            if self.ce_loss:
                metric2 = LossValueMetric()
                eval_metrics.append(mx.metric.create(metric2))
                pass
            pass

        # wrap train_data_iter in an mx.io.PrefetchingIter iterator
        train_data_iter = mx.io.PrefetchingIter(train_data_iter)

        net_name = self.net_info["net_name"]

        if net_name == "f_res_net" or net_name == "f_mobile_face_net":
            # resNet style
            initializer = mx.init.Xavier(rnd_type='gaussian',
                                         factor_type="out",
                                         magnitude=2)
            pass
        else:
            initializer = mx.init.Xavier(rnd_type='uniform',
                                         factor_type="in",
                                         magnitude=2)
            pass

        re_scale = 1.0 / ctx_count

        optimize = optimizer.Adam(learning_rate=self.learning_rate,
                                  wd=self.weight_decay,
                                  rescale_grad=re_scale)
        callback_speed = mx.callback.Speedometer(batch_size, self.print_step)

        # highest accuracy on lfw and the target set
        highest_acc = [0.0, 0.0]

        global_step = [0]
        save_step = [0]

        print("learning_rate_step_list: {}".format(
            self.learning_rate_step_list))

        def batch_callback_fun(param):
            global_step[0] += 1
            m_batch = global_step[0]

            for step in self.learning_rate_step_list:
                if m_batch == step:
                    optimize.lr *= 0.1
                    print("learning rate change to: {}".format(optimize.lr))
                    break
                    pass
                pass

            # print(param)
            callback_speed(param)

            # print every 1000 batches
            if m_batch % 1000 == 0:
                print("learning_rate: {}\nbatch: {}\n epoch: {}".format(
                    optimize.lr, param.nbatch, param.epoch))
                pass

            if m_batch >= 0 and m_batch % self.val_step == 0:
                acc_list = self.ver_test(bin_data_list=bin_data_list,
                                         val_targets_list=val_targets_list,
                                         model_net=model,
                                         batch_size=batch_size,
                                         n_batch=m_batch)

                save_step[0] += 1
                m_save = save_step[0]
                do_save_flag = False
                is_highest_flag = False

                print("-" * 100)
                # if there is an evaluation set
                print("acc_list: {}".format(acc_list))
                if len(acc_list) > 0:
                    score = sum(acc_list)
                    if acc_list[-1] >= highest_acc[-1]:
                        if acc_list[-1] > highest_acc[-1]:
                            is_highest_flag = True
                            pass
                        else:
                            if score >= highest_acc[0]:
                                is_highest_flag = True
                                highest_acc[0] = score
                                pass
                            pass

                        highest_acc[-1] = acc_list[-1]
                        pass

                    pass

                # decide whether to save the model
                if is_highest_flag:
                    do_save_flag = True
                    pass
                if self.save_model_num == 0:
                    do_save_flag = False
                    pass
                elif self.save_model_num == 2:
                    do_save_flag = True
                    pass
                elif self.save_model_num == 3:
                    m_save = 1
                    pass

                if do_save_flag:
                    print("m_save: {}".format(m_save))

                    arg, aux = model.get_params()

                    if self.check_feature_flag:
                        all_layers = model.symbol.get_internals()
                        fc1_sym = all_layers["fc1_output"]

                        arg_base = {}
                        for key in arg:
                            if not key.startswith("fc7"):
                                arg_base[key] = arg[key]
                                pass
                            pass
                        mx.model.save_checkpoint(self.save_model_prefix_path,
                                                 m_save, fc1_sym, arg_base,
                                                 aux)
                        pass
                    else:
                        mx.model.save_checkpoint(self.save_model_prefix_path,
                                                 m_save, model.symbol, arg,
                                                 aux)
                        pass
                    pass

                print("highest_acc[m_batch: {}]: {:.5f}".format(
                    m_batch, highest_acc[-1]))
                pass

            # if max_steps > 0 and the training step exceeds it, exit the program
            if self.max_steps > 0 and m_batch > self.max_steps:
                sys.exit(0)
                pass
            pass

        begin_epoch = 0

        # train the model
        model.fit(train_data=train_data_iter,
                  begin_epoch=begin_epoch,
                  num_epoch=999999,
                  eval_metric=eval_metrics,
                  kvstore=self.kv_store,
                  optimizer=optimize,
                  initializer=initializer,
                  arg_params=arg_params,
                  aux_params=aux_params,
                  allow_missing=True,
                  batch_end_callback=batch_callback_fun)

        pass
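################################################################################
# A small, separate sketch (the step values are illustrative) of how the manual
# per-batch learning-rate decay in batch_callback_fun could instead be expressed
# with MXNet's built-in MultiFactorScheduler attached to the optimizer.
import mxnet as mx
from mxnet import optimizer

lr_sched = mx.lr_scheduler.MultiFactorScheduler(step=[100000, 160000], factor=0.1)
optimize = optimizer.Adam(learning_rate=0.001, wd=0.0005,
                          rescale_grad=1.0, lr_scheduler=lr_sched)
# passing this optimizer to model.fit decays the learning rate by 10x at the
# given update counts, without any extra code in the batch callback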