Example #1
0
def test_sma():
    """Smoke-test SynchronousAveragingOptimizer on a scalar quadratic (TF1 graph mode)."""
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    # Wrap a plain SGD optimizer with the KungFu synchronous-averaging wrapper.
    optimizer = SynchronousAveragingOptimizer(
        tf.train.GradientDescentOptimizer(0.1))
    train_op = optimizer.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Make all workers start from identical variable values.
        sess.run(BroadcastGlobalVariablesOp())
        for _step in range(2):
            sess.run(train_op)
Example #2
0
def test_sma():
    """Smoke-test SynchronousAveragingOptimizer in TF2 eager mode."""
    x = tf.Variable(tf.ones([], tf.float32))
    # Wrap Keras SGD with the KungFu synchronous-averaging wrapper.
    opt = SynchronousAveragingOptimizer(tf.keras.optimizers.SGD(0.1))

    @tf.function
    def training_step(x, opt, first_batch):
        _training_step(x, opt, first_batch)

    # first_batch is True only on iteration 0 (triggers the initial broadcast).
    for i in range(5):
        y = training_step(x, opt, i == 0)
Example #3
0
def get_kungfu_opt(kungfu_option, opt):
    """Wrap *opt* with the KungFu distributed optimizer selected by *kungfu_option*.

    Raises RuntimeError for an unrecognized option.
    """
    from kungfu.tensorflow.optimizers import (SynchronousSGDOptimizer,
                                              SynchronousAveragingOptimizer,
                                              PairAveragingOptimizer)
    # KungFu configure: guard-clause dispatch on the requested strategy.
    if kungfu_option == KUNGFU.Sync_sgd:
        return SynchronousSGDOptimizer(opt)
    if kungfu_option == KUNGFU.Sync_avg:
        return SynchronousAveragingOptimizer(opt)
    if kungfu_option == KUNGFU.Pair_avg:
        return PairAveragingOptimizer(opt)
    raise RuntimeError('Unknown distributed training optimizer.')
def build_optimizer(name, n_workers=1):
    """Build a Keras SGD optimizer wrapped by the requested KungFu strategy.

    Parameters
    ----------
    name : str
        One of 'sync-sgd', 'async-sgd', 'sma'.
    n_workers : int
        Degree of data parallelism; the learning rate is scaled by it.

    Raises
    ------
    RuntimeError
        If *name* is not a known optimizer.
    """
    # FIX: `learning_rate` was referenced but never defined in this function
    # (NameError at call time). Define it as in the sibling build_optimizer
    # example, which uses a base rate of 0.1.
    learning_rate = 0.1

    # Scale learning rate according to the level of data parallelism
    optimizer = tf.keras.optimizers.SGD(learning_rate=(learning_rate *
                                                       n_workers))

    # KUNGFU: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        return SynchronousSGDOptimizer(optimizer, use_locking=True)
    elif name == 'async-sgd':
        return PairAveragingOptimizer(optimizer)
    elif name == 'sma':
        return SynchronousAveragingOptimizer(optimizer)
    else:
        raise RuntimeError('unknown optimizer: %s' % name)
def build_optimizer():
    """Build an Adam optimizer (LR scaled by cluster size) wrapped per args.kf_optimizer."""
    # KungFu: adjust learning rate based on the number of workers.
    opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

    # KungFu: wrap tf.compat.v1.train.Optimizer via a dispatch table.
    wrappers = {
        'sync-sgd': SynchronousSGDOptimizer,
        'async-sgd': PairAveragingOptimizer,
        'sma': SynchronousAveragingOptimizer,
    }
    wrapper = wrappers.get(args.kf_optimizer)
    if wrapper is None:
        raise RuntimeError('Unknown KungFu optimizer')
    return wrapper(opt)
Example #6
0
def build_optimizer(name, n_shards=1):
    """Build a GradientDescent optimizer wrapped by the requested KungFu strategy.

    The base learning rate (0.1) is scaled by *n_shards*, the degree of
    data parallelism. Raises RuntimeError for an unknown *name*.
    """
    base_lr = 0.1

    # Scale learning rate according to the level of data parallelism.
    optimizer = tf.train.GradientDescentOptimizer(base_lr * n_shards)

    # KUNGFU: wrap with the distributed optimizer; imports stay local so
    # only the selected strategy is loaded.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    if name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer, fuse_requests=True)
    if name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    raise RuntimeError('unknown optimizer: %s' % name)
def build_optimizer(name, batch_size):
    """Build a GradientDescent optimizer wrapped by the requested KungFu strategy.

    The base learning rate (0.1) is scaled by the current cluster size.
    *batch_size* is accepted for signature compatibility but unused here.
    Raises RuntimeError for an unknown *name*.
    """
    base_lr = 0.1

    # Scale learning rate according to the level of data parallelism.
    optimizer = tf.train.GradientDescentOptimizer(base_lr *
                                                  current_cluster_size())

    # KungFu: wrap with the distributed optimizer; imports stay local so
    # only the selected strategy is loaded.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    if name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer)
    if name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    if name == 'ada-sgd':
        from kungfu.tensorflow.optimizers import AdaptiveSGDOptimizer
        return AdaptiveSGDOptimizer(optimizer, change_step=300)
    raise RuntimeError('unknown optimizer: %s' % name)
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
loss = tf.losses.SparseCategoricalCrossentropy()

# KungFu: adjust learning rate based on number of GPUs.
# opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

# KungFu: wrap tf.compat.v1.train.Optimizer.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)
else:
    raise RuntimeError('Unknown KungFu optimizer')


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

    # KungFu: broadcast is done after the first gradient step to ensure optimizer initialization.
    if first_batch:
Example #9
0
def parallel_train(train_model,dataset,config):
    '''Parallel (multi-worker, KungFu) train pipeline of openpose class models.

    Given a model and a dataset, the train pipeline starts automatically.
    The pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function


    Returns
    -------
    None
    '''

    init_log(config)
    #train hyper params
    #dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    #learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    # Hard-coded decay schedule (in global steps); rescaled below by cluster size.
    lr_decay_steps = [200000,300000,360000,420000,480000,540000,600000,700000,800000,900000]
    weight_decay_factor = config.train.weight_decay_factor
    #log and checkpoint params
    log_interval=config.log.log_interval
    save_interval=config.train.save_interval
    vis_dir=config.train.vis_dir

    #model hyper params
    n_pos = train_model.n_pos
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    model_dir = config.model.model_dir
    pretrain_model_dir=config.pretrain.pretrain_model_dir
    pretrain_model_path=f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    #import kungfu (locally, so single-worker runs don't need it installed)
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer


    print(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    #training dataset configure with shuffle,augmentation,and prefetch
    train_dataset=dataset.get_train_dataset()
    dataset_type=dataset.get_dataset_type()
    parts,limbs,data_format=train_model.parts,train_model.limbs,train_model.data_format
    flip_list=get_flip_list(dataset_type)
    paramed_map_fn=get_paramed_map_fn(hin,win,hout,wout,parts,limbs,flip_list=flip_list,data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    # Each worker processes a disjoint shard of the data.
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(64)

    #train model configure
    step=tf.Variable(1, trainable=False)
    lr=tf.Variable(lr_init,trainable=False)
    if(config.model.model_type==MODEL.Openpose):
        opt=tf.keras.optimizers.RMSprop(learning_rate=lr)
    else:
        opt=tf.keras.optimizers.Adam(learning_rate=lr)
    ckpt=tf.train.Checkpoint(step=step,optimizer=opt,lr=lr)
    ckpt_manager=tf.train.CheckpointManager(ckpt,model_dir,max_to_keep=3)

    #load from ckpt
    # NOTE(review): bare `except:` also hides real failures (e.g. a corrupt
    # checkpoint) — consider `except Exception` with the error logged.
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    #load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,network=train_model.backbone,skip=True)
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized")
    #load model weights
    try:
        train_model.load_weights(os.path.join(model_dir,"newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure: wrap the base optimizer with the selected strategy.
    # NOTE(review): an unrecognized kungfu_option silently leaves `opt`
    # unwrapped (no else branch) — presumably intentional single-worker
    # fallback, but worth confirming.
    kungfu_option=config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt=PairAveragingOptimizer(opt)


    n_step = n_step // current_cluster_size() + 1  # KungFu
    # NOTE(review): BUG — this loop rebinds the name `step`, clobbering the
    # tf.Variable created above; after it runs, `step` is a plain int (the
    # last decay boundary), so the later `step.numpy()` / `step==0` usages
    # no longer refer to the training counter. Use a different loop variable.
    for step_idx,step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = step // current_cluster_size() + 1  # KungFu

    #optimize one step
    @tf.function
    def one_step(image,gt_label,mask,train_model,is_first_batch=False):
        # `step`, `opt`, `n_pos`, `weight_decay_factor` are captured from the
        # enclosing scope.
        step.assign_add(1)
        with tf.GradientTape() as tape:
            # gt_label packs confidence maps first, then PAF maps, on axis 1.
            gt_conf=gt_label[:,:n_pos,:,:]
            gt_paf=gt_label[:,n_pos:,:,:]
            pd_conf,pd_paf,stage_confs,stage_pafs=train_model.forward(image,is_train=True)

            pd_loss,loss_confs,loss_pafs=train_model.cal_loss(gt_conf,gt_paf,mask,stage_confs,stage_pafs)
            re_loss=regulize_loss(train_model,weight_decay_factor)
            total_loss=pd_loss+re_loss

        gradients=tape.gradient(total_loss,train_model.trainable_weights)
        opt.apply_gradients(zip(gradients,train_model.trainable_weights))
        #Kung fu: sync all workers' weights/optimizer state on the first batch.
        if(is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss

    #train each step
    tic=time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {}'.format(
            n_step, batch_size, lr_init, lr_decay_steps, lr_decay_factor))
    for image,gt_label,mask in train_dataset:
        #learning rate decay
        if(step in lr_decay_steps):
            new_lr_decay = lr_decay_factor**(float(lr_decay_steps.index(step)+1))
            # NOTE(review): this rebinds `lr` to a Python float, so the later
            # `lr.numpy()` call would raise, and the optimizer (which captured
            # the original tf.Variable) never sees the decayed rate — should
            # presumably be `lr.assign(...)`. TODO confirm.
            lr=lr_init*new_lr_decay
        #optimize one step
        # NOTE(review): `step` starts at 1, so `step==0` (is_first_batch)
        # appears never to be True and the KungFu broadcast never fires —
        # confirm intended behavior.
        gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss=one_step(image.numpy(),gt_label.numpy(),mask.numpy(),\
            train_model,step==0)
        #save log info periodically
        if((step.numpy()!=0) and (step.numpy()%log_interval)==0):
            # NOTE(review): `tic` is reset immediately before being used in the
            # log line below, so the reported time is always ~0.
            tic=time.time()
            log('Total Loss at iteration {} / {} is: {} Learning rate {} l2_loss {} time:{}'.format(
                step.numpy(), n_step, total_loss, lr.numpy(), re_loss,time.time()-tic))

        #save result and ckpt periodically (rank 0 only)
        if((step!=0) and (step%save_interval)==0 and current_rank()==0):
            log("saving model ckpt and result...")
            draw_results(image.numpy(), gt_conf.numpy(), pd_conf.numpy(), gt_paf.numpy(), pd_paf.numpy(), mask.numpy(),\
                 vis_dir,'train_%d_' % step)
            ckpt_save_path=ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path=os.path.join(model_dir,"newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")

        #training finished
        if(step==n_step):
            break
Example #10
0
def parallel_train(train_model, dataset, config):
    '''Parallel (multi-worker, KungFu) train pipeline of PoseProposal class models.

    Given a model and a dataset, the train pipeline starts automatically.
    The pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function


    Returns
    -------
    None
    '''
    init_log(config)
    #train hyper params
    #dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    #learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    weight_decay_factor = config.train.weight_decay_factor
    #log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    #model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    hnei = train_model.hnei
    wnei = train_model.wnei
    model_dir = config.model.model_dir

    #import kungfu (locally, so single-worker runs don't need it installed)
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    print(
        f"parallel training using learning rate:{lr_init} batch_size:{batch_size}"
    )
    #training dataset configure with shuffle,augmentation,and prefetch
    train_dataset = dataset.get_train_dataset()
    parts, limbs, data_format = train_model.parts, train_model.limbs, train_model.data_format
    paramed_map_fn = get_paramed_map_fn(hin, win, hout, wout, hnei, wnei,
                                        parts, limbs, data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    # Each worker processes a disjoint shard of the data.
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),
                                        index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(buffer_size=2)

    #train model configure
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    #load from ckpt
    # NOTE(review): bare `except:` also hides real failures (e.g. a corrupt
    # checkpoint) — consider `except Exception` with the error logged.
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    #Kungfu configure: wrap the base optimizer with the selected strategy.
    # NOTE(review): an unrecognized kungfu_option silently leaves `opt`
    # unwrapped (no else branch) — confirm this fallback is intended.
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    n_step = n_step // current_cluster_size() + 1  # KungFu

    #optimize one step
    @tf.function
    def one_step(image, targets, train_model, is_first_batch=False):
        # `step`, `opt`, `weight_decay_factor` are captured from the
        # enclosing scope.
        step.assign_add(1)
        with tf.GradientTape() as tape:
            delta, tx, ty, tw, th, te, te_mask = targets
            pc, pi, px, py, pw, ph, pe = train_model.forward(image,
                                                             is_train=True)
            loss_rsp,loss_iou,loss_coor,loss_size,loss_limb=\
                train_model.cal_loss(delta,tx,ty,tw,th,te,te_mask,pc,pi,px,py,pw,ph,pe)
            pd_loss = loss_rsp + loss_iou + loss_coor + loss_size + loss_limb
            re_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss = pd_loss + re_loss

        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        #Kung fu: sync all workers' weights/optimizer state on the first batch.
        if (is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        predicts = (pc, px, py, pw, ph, pe)
        return predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb

    #train each step
    tic = time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log(f'Start - n_step: {n_step} batch_size: {batch_size} lr_init: {lr_init} lr_decay_factor: {lr_decay_factor}'
        )
    # Running loss averages, reported and reset every log_interval steps.
    avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = 0., 0., 0., 0., 0., 0., 0.
    for image, targets in train_dataset:
        #learning rate decay
        # NOTE(review): this rebinds the Python name `lr`; the optimizer
        # captured the original tf.Variable at construction, so the decayed
        # value never reaches it — should presumably be `lr.assign(...)`.
        # TODO confirm.
        lr = lr_init * (1 - step / n_step * lr_decay_factor)
        #optimize one step
        # NOTE(review): unlike the sibling pipelines, `is_first_batch` is
        # never passed here, so the KungFu broadcast in one_step never fires —
        # confirm intended behavior.
        predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb = one_step(
            image, targets, train_model)

        avg_loss_rsp += loss_rsp / log_interval
        avg_loss_iou += loss_iou / log_interval
        avg_loss_coor += loss_coor / log_interval
        avg_loss_size += loss_size / log_interval
        avg_loss_limb += loss_limb / log_interval
        avg_pd_loss += pd_loss / log_interval
        avg_re_loss += re_loss / log_interval

        #save log info periodically
        if ((step != 0) and (step % log_interval) == 0):
            # NOTE(review): `tic` is reset immediately before the log line
            # below reads it, so the reported time is always ~0.
            tic = time.time()
            log(f"worker:{current_rank()} Train iteration {step.numpy()}/{n_step}, learning rate:{lr.numpy()},"+\
                    f"loss_rsp:{avg_loss_rsp},loss_iou:{avg_loss_iou},loss_coor:{avg_loss_coor},loss_size:{avg_loss_size},"+\
                        f"loss_limb:{avg_loss_limb},loss_pd:{avg_pd_loss},loss_re:{avg_re_loss} ,time:{time.time()-tic}")
            avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = 0., 0., 0., 0., 0., 0., 0.

        #save result and ckpt periodically
        if ((step != 0) and (step % save_interval) == 0):
            log("saving model ckpt and result...")
            draw_results(image.numpy(),
                         predicts,
                         targets,
                         parts,
                         limbs,
                         save_dir=vis_dir,
                         name=f"ppn_step_{step.numpy()}")
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")

        #training finished
        if (step == n_step):
            break
Example #11
0
def parallel_train(train_model,dataset,config):
    '''Parallel (multi-worker, KungFu) train pipeline of openpose class models.

    Given a model and a dataset, the train pipeline starts automatically.
    The pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function


    Returns
    -------
    None
    '''

    init_log(config)
    #train hyper params
    #dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    #learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = config.train.lr_decay_steps
    # Warm-up: LR ramps from lr_init*warm_up_decay up to lr_init over
    # the first warm_up_step steps (see the loop below).
    warm_up_step=8000
    warm_up_decay=0.01
    weight_decay_factor = config.train.weight_decay_factor
    #log and checkpoint params
    log_interval=config.log.log_interval
    save_interval=config.train.save_interval
    vis_dir=config.train.vis_dir

    #model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    model_dir = config.model.model_dir
    pretrain_model_dir=config.pretrain.pretrain_model_dir
    pretrain_model_path=f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    #import kungfu (locally, so single-worker runs don't need it installed)
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    log(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    #training dataset configure with shuffle,augmentation,and prefetch
    train_dataset=dataset.get_train_dataset()
    dataset_type=dataset.get_dataset_type()
    parts,limbs,data_format=train_model.parts,train_model.limbs,train_model.data_format
    paramed_map_fn=get_paramed_map_fn(hin,win,hout,wout,parts,limbs,data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    # Each worker processes a disjoint shard of the data.
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn,num_parallel_calls=max(multiprocessing.cpu_count()//2,1))
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(64)

    #train configure
    step=tf.Variable(1, trainable=False)
    lr=tf.Variable(lr_init,trainable=False)
    # NOTE(review): `lr_init` is rebound here from the config float to a
    # tf.Variable; later code calls `lr_init.numpy()` so it relies on this.
    lr_init=tf.Variable(lr_init,trainable=False)
    opt=tf.optimizers.Adam(learning_rate=lr)
    ckpt=tf.train.Checkpoint(step=step,optimizer=opt,lr=lr)
    ckpt_manager=tf.train.CheckpointManager(ckpt,model_dir,max_to_keep=3)

    #load from ckpt
    # NOTE(review): bare `except:` also hides real failures (e.g. a corrupt
    # checkpoint) — consider `except Exception` with the error logged.
    log("loading ckpt...")
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        log("ckpt loaded successfully!")
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")

    #load pretrained backbone
    log("loading pretrained backbone...")
    try:
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,network=train_model.backbone,skip=True)
        log("pretrained backbone loaded successfully")
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized")

    #load model weights
    log("loading saved training model weights...")
    try:
        train_model.load_weights(os.path.join(model_dir,"newest_model.npz"))
        log("saved training model weights loaded successfully")
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure: wrap the base optimizer with the selected strategy.
    # NOTE(review): an unrecognized kungfu_option silently leaves `opt`
    # unwrapped (no else branch) — confirm this fallback is intended.
    kungfu_option=config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt=PairAveragingOptimizer(opt)

    # KungFu adjust
    n_step = n_step // current_cluster_size() + 1  # KungFu
    # NOTE(review): BUG — this loop rebinds the name `step`, clobbering the
    # tf.Variable created above; afterwards `step` is a plain int, so the
    # later `step.numpy()` / `step==0` usages no longer refer to the training
    # counter. Use a different loop variable.
    for step_idx,step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = step // current_cluster_size() + 1  # KungFu

    # Re-apply any decay already passed when resuming from a checkpoint.
    for lr_decay_step in lr_decay_steps:
        if(step>lr_decay_step):
            lr=lr*lr_decay_factor

    #optimize one step
    @tf.function
    def one_step(image,gt_label,mask,train_model,is_first_batch=False):
        # `step`, `opt`, `weight_decay_factor` are captured from the
        # enclosing scope.
        step.assign_add(1)
        with tf.GradientTape() as tape:
            gt_pif_maps,gt_paf_maps=gt_label
            pd_pif_maps,pd_paf_maps=train_model.forward(image,is_train=True)
            loss_pif_maps,loss_paf_maps,total_loss=train_model.cal_loss(pd_pif_maps,pd_paf_maps,gt_pif_maps,gt_paf_maps)
            decay_loss=regulize_loss(train_model,weight_decay_factor)
            total_loss+=decay_loss

        gradients=tape.gradient(total_loss,train_model.trainable_weights)
        opt.apply_gradients(zip(gradients,train_model.trainable_weights))
        #Kung fu: sync all workers' weights/optimizer state on the first batch.
        if(is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return pd_pif_maps,pd_paf_maps,loss_pif_maps,loss_paf_maps,decay_loss,total_loss

    #train each step
    train_model.train()
    tic=time.time()
    avg_time=AvgMetric(name="time_iter",metric_interval=log_interval)
    #total loss metrics
    avg_total_loss=AvgMetric(name="total_loss",metric_interval=log_interval)
    #decay loss metrics
    avg_decay_loss=AvgMetric(name="decay_loss",metric_interval=log_interval)
    #pif loss metrics
    avg_pif_conf_loss=AvgMetric(name="pif_conf_loss",metric_interval=log_interval)
    avg_pif_vec_loss=AvgMetric(name="pif_vec_loss",metric_interval=log_interval)
    avg_pif_scale_loss=AvgMetric(name="pif_scale_loss",metric_interval=log_interval)
    #paf loss metrics
    avg_paf_conf_loss=AvgMetric(name="paf_conf_loss",metric_interval=log_interval)
    avg_paf_src_vec_loss=AvgMetric(name="paf_src_vec_loss",metric_interval=log_interval)
    avg_paf_dst_vec_loss=AvgMetric(name="paf_dst_vec_loss",metric_interval=log_interval)
    avg_paf_src_scale_loss=AvgMetric(name="paf_src_scale_loss",metric_interval=log_interval)
    avg_paf_dst_scale_loss=AvgMetric(name="paf_dst_scale_loss",metric_interval=log_interval)
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {} weight_decay_factor: {}'.format(
            n_step, batch_size, lr_init.numpy(), lr_decay_steps, lr_decay_factor, weight_decay_factor))
    # `labeled` is yielded by the dataset but unused in this loop.
    for image,gt_label,mask,labeled in train_dataset:
        #get losses
        # NOTE(review): `step` starts at 1, so `step==0` (is_first_batch)
        # appears never to be True and the KungFu broadcast never fires —
        # confirm intended behavior.
        pd_pif_maps,pd_paf_maps,loss_pif_maps,loss_paf_maps,decay_loss,total_loss=one_step(image,gt_label,mask,train_model,step==0)
        loss_pif_conf,loss_pif_vec,loss_pif_scale=loss_pif_maps
        loss_paf_conf,loss_paf_src_vec,loss_paf_dst_vec,loss_paf_src_scale,loss_paf_dst_scale=loss_paf_maps
        #update metrics
        avg_time.update(time.time()-tic)
        tic=time.time()
        #update total losses
        avg_total_loss.update(total_loss)
        #update decay loss
        avg_decay_loss.update(decay_loss)
        #update pif_losses metrics
        avg_pif_conf_loss.update(loss_pif_conf)
        avg_pif_vec_loss.update(loss_pif_vec)
        avg_pif_scale_loss.update(loss_pif_scale)
        #update paf_losses metrics
        avg_paf_conf_loss.update(loss_paf_conf)
        avg_paf_src_vec_loss.update(loss_paf_src_vec)
        avg_paf_dst_vec_loss.update(loss_paf_dst_vec)
        avg_paf_src_scale_loss.update(loss_paf_src_scale)
        avg_paf_dst_scale_loss.update(loss_paf_dst_scale)

        #learning rate decay
        # NOTE(review): these branches rebind the Python name `lr`; the
        # optimizer captured the original tf.Variable, so the new value never
        # reaches it — should presumably be `lr.assign(...)`. TODO confirm.
        if(step in lr_decay_steps):
            new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step)+1)
            lr=lr_init*new_lr_decay
        #warm_up learning rate decay
        if(step <= warm_up_step):
            lr=lr_init*warm_up_decay**(1.0-step/warm_up_step)

        #save log info periodically
        if((step.numpy()!=0) and (step.numpy()%log_interval)==0):
            log(f"Train iteration {n_step} / {step.numpy()}, Learning rate:{lr.numpy()} {avg_total_loss.get_metric()} "+\
                f"{avg_pif_conf_loss.get_metric()} {avg_pif_vec_loss.get_metric()} {avg_pif_scale_loss.get_metric()}"+\
                f"{avg_paf_conf_loss.get_metric()} {avg_paf_src_vec_loss.get_metric()} {avg_paf_dst_vec_loss.get_metric()}"+\
                f"{avg_paf_src_scale_loss.get_metric()} {avg_paf_dst_scale_loss.get_metric()} {avg_decay_loss.get_metric()} {avg_time.get_metric()}")

        #save result and ckpt periodically
        if((step.numpy()!=0) and (step.numpy()%save_interval)==0):
            #save ckpt
            log("saving model ckpt and result...")
            ckpt_save_path=ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            #save train model
            model_save_path=os.path.join(model_dir,"newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")
            #draw result
            stride=train_model.stride
            gt_pif_maps,gt_paf_maps=gt_label
            draw_result(image,pd_pif_maps,pd_paf_maps,gt_pif_maps,gt_paf_maps,mask,parts,limbs,stride,save_dir=vis_dir,\
                name=f"train_{step.numpy()}")

        #training finished
        if(step==n_step):
            break
Example #12
0
def parallel_train(train_model, dataset, config, augmentor:BasicAugmentor, \
                        preprocessor:BasicPreProcessor,postprocessor:BasicPostProcessor,visualizer=BasicVisualizer):
    '''Parallel (distributed) train pipeline of Openpose-class models using KungFu.

    Given a model and a dataset, the train pipeline starts automatically.
    The pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    train_model : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    dataset : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function

    augmentor : BasicAugmentor
        data augmentation strategy applied inside the dataset map function

    preprocessor : BasicPreProcessor
        converts raw samples into training targets

    postprocessor : BasicPostProcessor
        unused here directly; kept for a uniform pipeline signature

    visualizer : BasicVisualizer
        used to dump prediction/target comparison images at vis_interval

    Returns
    -------
    None
    '''

    # train hyper params
    # dataset params
    total_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    # hard-coded step-decay schedule (in single-worker steps; rescaled for the
    # cluster size below)
    lr_decay_steps = [
        200000, 300000, 360000, 420000, 480000, 540000, 600000, 700000, 800000,
        900000
    ]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    vis_interval = config.train.vis_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    parts, limbs, colors = train_model.parts, train_model.limbs, train_model.colors
    data_format = train_model.data_format
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # metrics
    metric_manager = MetricManager()

    # initializing train dataset
    train_dataset = dataset.get_train_dataset()
    epoch_size = dataset.get_train_datasize() // batch_size
    paramed_map_fn = get_paramed_map_fn(augmentor=augmentor,
                                        preprocessor=preprocessor,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096).repeat()
    train_dataset = train_dataset.map(
        paramed_map_fn, num_parallel_calls=get_num_parallel_calls())
    train_dataset = train_dataset.batch(config.train.batch_size)
    train_dataset = train_dataset.prefetch(3)
    train_dataset_iter = iter(train_dataset)

    #train configure
    # save_step/save_lr are tf.Variables so they are captured by the checkpoint
    # and restored across restarts.
    save_step = tf.Variable(1, trainable=False)
    save_lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.Adam(learning_rate=save_lr)
    domainadapt_flag = config.data.domainadapt_flag
    total_epoch = total_step // epoch_size

    #domain adaptation params
    if (not domainadapt_flag):
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt)
    else:
        log("Domain adaptaion in training enabled!")
        # weight param
        lambda_adapt = 1e-4
        # construct discrminator model
        feature_hin = train_model.hin // train_model.backbone.scale_size
        feature_win = train_model.win // train_model.backbone.scale_size
        in_channels = train_model.backbone.out_channels
        adapt_dis = Discriminator(feature_hin,
                                  feature_win,
                                  in_channels,
                                  data_format=data_format)
        opt_d = tf.keras.optimizers.Adam(learning_rate=save_lr)
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt,
                                   opt_d=opt_d)
        # construct domain adaptation dataset
        dmadapt_train_dataset = dataset.get_dmadapt_train_dataset()
        paramed_dmadapt_map_fn = get_paramed_dmadapt_map_fn(augmentor)
        dmadapt_train_dataset = dmadapt_train_dataset.map(
            paramed_dmadapt_map_fn,
            num_parallel_calls=get_num_parallel_calls())
        dmadapt_train_dataset = dmadapt_train_dataset.shuffle(
            buffer_size=4096).repeat()
        dmadapt_train_dataset = dmadapt_train_dataset.batch(
            config.train.batch_size)
        dmadapt_train_dataset = dmadapt_train_dataset.prefetch(3)
        dmadapt_train_dataset_iter = iter(dmadapt_train_dataset)

    #load from ckpt
    # NOTE(review): bare excepts below swallow every failure (not just a
    # missing file); a corrupted checkpoint would silently start from scratch.
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    #load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone,
                                          skip=True)
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized"
            )
    #load model weights
    try:
        log("loading saved training model weights...")
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")
    if (domainadapt_flag):
        try:
            log("loading saved domain adaptation discriminator weight...")
            adapt_dis.load_weights(
                os.path.join(model_dir, "newest_discriminator.npz"))
        except:
            log("discriminator path doesn't exist, discriminator parameters are initialized"
                )

    log(f"Parallel training using learning rate:{lr_init} batch_size:{batch_size}"
        )
    step = save_step.numpy()
    lr = save_lr.numpy()

    #import kungfu
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    # Rescale the step budget and decay schedule: with N workers each worker
    # only needs ~1/N of the single-worker steps.
    total_step = total_step // current_cluster_size() + 1  # KungFu
    total_epoch = total_epoch // current_cluster_size() + 1  # KungFu
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[
            step_idx] = decay_step // current_cluster_size() + 1  # KungFu

    # optimize one step
    def optimize_step(image, mask, target_x, train_model,
                      metric_manager: MetricManager):
        # tape
        with tf.GradientTape() as tape:
            # ret_backbone is only needed when domain adaptation reuses the
            # backbone features afterwards
            predict_x = train_model.forward(x=image,
                                            is_train=True,
                                            ret_backbone=domainadapt_flag)
            total_loss = train_model.cal_loss(predict_x=predict_x, target_x=target_x, \
                                                        mask=mask, metric_manager=metric_manager)
        # optimize model
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        return predict_x

    # one adversarial domain-adaptation step: generator (backbone) tries to
    # fool the discriminator on dst images; discriminator learns src vs dst
    def optimize_step_dmadapt(image_src, image_dst, train_model,
                              adapt_dis: Discriminator,
                              metric_manager: MetricManager):
        # persistent tape: we take two separate gradients (g and d) from it
        with tf.GradientTape(persistent=True) as tape:
            # feature extraction
            # src feature
            predict_src = train_model.forward(x=image_src,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_src = predict_src["backbone_features"]
            adapt_pd_src = adapt_dis.forward(backbone_feature_src)
            # dst feature
            predict_dst = train_model.forward(x=image_dst,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_dst = predict_dst["backbone_features"]
            adapt_pd_dst = adapt_dis.forward(backbone_feature_dst)

            # loss calculation
            # loss of g
            g_adapt_loss = adapt_dis.cal_loss(x=adapt_pd_dst,
                                              label=True) * lambda_adapt
            # loss of d
            d_adapt_loss_src = adapt_dis.cal_loss(x=adapt_pd_src, label=True)
            d_adapt_loss_dst = adapt_dis.cal_loss(x=adapt_pd_dst, label=False)
            d_adapt_loss = (d_adapt_loss_src + d_adapt_loss_dst) / 2

        # optimize model
        g_gradient = tape.gradient(g_adapt_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(g_gradient, train_model.trainable_weights))
        metric_manager.update("model/g_adapt_loss", g_adapt_loss)
        # optimize dis
        d_gradients = tape.gradient(d_adapt_loss, adapt_dis.trainable_weights)
        opt_d.apply_gradients(zip(d_gradients, adapt_dis.trainable_weights))
        metric_manager.update("dis/d_adapt_loss_src", d_adapt_loss_src)
        metric_manager.update("dis/d_adapt_loss_dst", d_adapt_loss_dst)
        # delete persistent tape
        del tape
        return predict_dst

    # formal training procedure

    # KungFu configure
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    train_model.train()
    cur_epoch = step // epoch_size + 1
    log(f"Start Training- total_epoch: {total_epoch} total_step: {total_step} current_epoch:{cur_epoch} "\
        +f"current_step:{step} batch_size:{batch_size} lr_init:{lr_init} lr_decay_steps:{lr_decay_steps} "\
        +f"lr_decay_factor:{lr_decay_factor} weight_decay_factor:{weight_decay_factor}" )
    for epoch_idx in range(cur_epoch, total_epoch):
        log(f"Epoch {epoch_idx}/{total_epoch}:")
        for _ in tqdm(range(0, epoch_size)):
            step += 1
            metric_manager.start_timing()
            image, mask, target_list = next(train_dataset_iter)
            # extract gt_label
            # targets come through tf.data as pickled bytes; unpickle and
            # re-stack them into one batched dict of tensors
            target_list = [
                cPickle.loads(target) for target in target_list.numpy()
            ]
            target_x = {key: [] for key, value in target_list[0].items()}
            target_x = reduce(
                lambda x, y:
                {key: x[key] + [y[key]]
                 for key, value in x.items()}, [target_x] + target_list)
            target_x = {
                key: np.stack(value)
                for key, value in target_x.items()
            }
            target_x = to_tensor_dict(target_x)

            # learning rate decay
            # NOTE(review): this only updates the local Python `lr`; `save_lr`
            # (the tf.Variable the Adam optimizer actually reads) is only
            # assigned at checkpoint time below — confirm the decay is meant
            # to take effect immediately.
            if (step in lr_decay_steps):
                new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step) +
                                                 1)
                lr = lr_init * new_lr_decay

            # optimize one step
            predict_x = optimize_step(image, mask, target_x, train_model,
                                      metric_manager)

            # optimize domain adaptation
            if (domainadapt_flag):
                src_image = image
                dst_image = next(dmadapt_train_dataset_iter)
                predict_dst = optimize_step_dmadapt(src_image, dst_image,
                                                    train_model, adapt_dis,
                                                    metric_manager)

            # KungFu: broadcast initial weights/optimizer state from rank 0
            # NOTE(review): `step` starts from the restored save_step, so when
            # resuming from a checkpoint with step>0 this broadcast never
            # fires — verify that is intended.
            if (step == 1):
                broadcast_variables(train_model.all_weights)
                broadcast_variables(opt.variables())

            # log info periodly
            if ((step != 0) and (step % log_interval) == 0):
                log(f"Train Epoch={epoch_idx} / {total_epoch}, Step={step} / {total_step}: learning_rate: {lr:.6e} {metric_manager.report_timing()}\n"\
                        +f"{metric_manager.report_train()} ")

            # visualize periodly
            # only rank 0 writes visualizations/checkpoints to avoid
            # concurrent-writer corruption
            if ((step != 0) and (step % vis_interval) == 0
                    and current_rank() == 0):
                log(f"Visualizing prediction maps and target maps")
                visualizer.visual_compare(image_batch=image.numpy(), mask_batch=mask.numpy(), predict_x=predict_x, target_x=target_x,\
                                                    name=f"train_{step}")

            # save result and ckpt periodly
            if ((step != 0) and (step % save_interval) == 0
                    and current_rank() == 0):
                # save ckpt
                log("saving model ckpt and result...")
                save_step.assign(step)
                save_lr.assign(lr)
                ckpt_save_path = ckpt_manager.save()
                log(f"ckpt save_path:{ckpt_save_path} saved!\n")
                # save train model
                model_save_path = os.path.join(model_dir, "newest_model.npz")
                train_model.save_weights(model_save_path)
                log(f"model save_path:{model_save_path} saved!\n")
                # save discriminator model
                if (domainadapt_flag):
                    dis_save_path = os.path.join(model_dir,
                                                 "newest_discriminator.npz")
                    adapt_dis.save_weights(dis_save_path)
                    log(f"discriminator save_path:{dis_save_path} saved!\n")
Exemple #13
0
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# KungFu: adjust learning rate based on number of GPUs.
opt = keras.optimizers.Adadelta(1.0 * current_cluster_size())

# KungFu: wrap distributed optimizers.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt, with_keras=True)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt, with_keras=True)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt, with_keras=True)
else:
    # BUG FIX: `name` was undefined here, so an unrecognized option raised a
    # NameError instead of the intended message; report the bad value itself.
    raise RuntimeError('unknown optimizer: %s' % args.kf_optimizer)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

# KungFu: sync initial weights from rank 0 before training starts.
callbacks = [BroadcastGlobalVariablesCallback(with_keras=True)]

# KungFu: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if current_rank() == 0:
    callbacks.append(
        keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
model.fit(x_train,
# derive runtime flags from CLI arguments
args.cuda = not args.no_cuda
# reshape is passed to the KungFu optimizer as an int flag (1 = enabled)
reshape = 1 if args.reshape_on else 0

# Set up standard model.
# picks a tf.keras.applications architecture by name, randomly initialized
model = getattr(applications, args.model)(weights=None)

# opt = tf.optimizers.SGD(0.01)
opt = tf.compat.v1.train.GradientDescentOptimizer(0.01)

# KungFu: wrap tf.compat.v1.train.Optimizer.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt,reshape=args.reshape_on, use_locking=True)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt) #match this to resnet KF
else:
    raise RuntimeError('Unknown KungFu optimizer')

# synthetic ImageNet-shaped benchmark batch: images and random class targets
data = tf.random.uniform([args.batch_size, 224, 224, 3])
target = tf.random.uniform([args.batch_size, 1],
                           minval=0,
                           maxval=999,
                           dtype=tf.int64)


@tf.function
def benchmark_step(first_batch):
    # reshape strategy here 
    # reshape_strategy(reshape)
    # gradient calculation and updates
Exemple #15
0
def parallel_train(training_dataset, kungfu_option):
    '''Distributed (KungFu) training loop for the TF1 session-based openpose model.

    Shards the dataset across workers, wraps the optimizer with the chosen
    KungFu distributed optimizer, broadcasts initial variables from rank 0,
    then trains until the (cluster-size-rescaled) step budget is reached.
    Worker 0 periodically saves visualizations and model weights.

    Parameters
    ----------
    training_dataset : tf.data.Dataset
        the raw training dataset, sharded per worker below
    kungfu_option : str
        one of 'sync-sgd', 'async-sgd', 'sma'
    '''
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    # each worker sees a disjoint 1/N shard of the shuffled dataset
    ds = training_dataset.shuffle(buffer_size=4096)
    ds = ds.shard(num_shards=current_cluster_size(), index=current_rank())
    ds = ds.repeat(n_epoch)
    ds = ds.map(_map_fn, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=1)

    iterator = ds.make_one_shot_iterator()
    one_element = iterator.get_next()
    net, total_loss, log_tensors = make_model(*one_element,
                                              is_train=True,
                                              reuse=False)
    x_ = net.img  # net input
    last_conf = net.last_conf  # net output
    last_paf = net.last_paf  # net output
    confs_ = net.confs  # GT
    pafs_ = net.pafs  # GT
    mask = net.m1  # mask1, GT
    # net.m2 = m2                 # mask2, GT
    stage_losses = net.stage_losses
    l2_loss = net.l2_loss

    global_step = tf.Variable(1, trainable=False)
    # scaled_lr = lr_init * current_cluster_size()  # Horovod: scale the learning rate linearly
    scaled_lr = lr_init  # Linear scaling rule is not working in openpose training.
    with tf.variable_scope('learning_rate'):
        lr_v = tf.Variable(scaled_lr, trainable=False)

    opt = tf.train.MomentumOptimizer(lr_v, 0.9)

    # KungFu
    if kungfu_option == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif kungfu_option == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown distributed training optimizer.')

    train_op = opt.minimize(total_loss, global_step=global_step)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # KungFu
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    bcast = BroadcastGlobalVariablesOp()

    # rescale the step budget: each of N workers runs ~1/N of the steps
    global n_step, lr_decay_every_step
    n_step = n_step // current_cluster_size() + 1  # KungFu
    lr_decay_every_step = lr_decay_every_step // current_cluster_size(
    ) + 1  # KungFu

    # Start training
    with tf.Session(config=config) as sess:
        init.run()
        # broadcast rank-0 variables so all workers start from identical state
        bcast.run()  # KungFu
        print('Worker{}: Initialized'.format(current_rank()))
        print(
            'Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'
            .format(current_rank(), n_step, batch_size, lr_init,
                    lr_decay_every_step))

        # restore pre-trained weights
        # NOTE(review): bare except hides all load failures, not just a
        # missing file
        try:
            # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
            tl.files.load_and_assign_npz_dict(sess=sess,
                                              name=os.path.join(
                                                  model_path, 'pose.npz'))
        except:
            print("no pre-trained model")

        # train until the end
        while True:
            step = sess.run(global_step)
            if step == n_step:
                break

            tic = time.time()
            # exponential step decay: factor^(number of completed decay periods)
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print(
                'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'
                .format(current_rank(), step, n_step, _loss, lr, _l2,
                        time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}:', current_rank(), 'Network#', ix,
                      'For Branch', ix % 2 + 1, 'Loss:', ll)

            # save intermediate results and model
            # only worker 0 writes to disk to avoid concurrent-writer corruption
            if current_rank() == 0:  # KungFu
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [
                        img_out, confs_ground, pafs_ground, conf_result,
                        paf_result, mask_out
                    ] = sess.run(
                        [x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result,
                                 pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(
                    #    net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path,
                                               'pose' + str(step) + '.npz'),
                                           sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path, 'pose.npz'),
                                           sess=sess)