def test_sma():
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    optimizer = tf.train.GradientDescentOptimizer(0.1)
    # KungFu: wrap the optimizer for synchronous model averaging.
    optimizer = SynchronousAveragingOptimizer(optimizer)
    train_op = optimizer.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # KungFu: start all workers from identical variables.
        sess.run(BroadcastGlobalVariablesOp())
        for _ in range(2):
            sess.run(train_op)
def test_sma():
    x = tf.Variable(tf.ones([], tf.float32))
    opt = tf.keras.optimizers.SGD(0.1)
    # KungFu: wrap the optimizer for synchronous model averaging.
    opt = SynchronousAveragingOptimizer(opt)

    @tf.function
    def training_step(x, opt, first_batch):
        # return the step result so `y` below is not silently None
        return _training_step(x, opt, first_batch)

    for batch in range(5):
        y = training_step(x, opt, batch == 0)
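# The test above delegates to a `_training_step` helper that is not shown in
# this excerpt. A minimal sketch of what such a helper could look like,
# assuming the usual KungFu pattern of broadcasting variables after the first
# gradient step; the helper body here is illustrative, not the library's API.
from kungfu.tensorflow.initializer import broadcast_variables

def _training_step(x, opt, first_batch):
    # One SGD step on y = x * x (hypothetical objective for the test).
    with tf.GradientTape() as tape:
        y = x * x
    grads = tape.gradient(y, [x])
    opt.apply_gradients(zip(grads, [x]))
    # KungFu: broadcast after the first step, so the optimizer's slot
    # variables exist before they are synchronized.
    if first_batch:
        broadcast_variables([x])
        broadcast_variables(opt.variables())
    return y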
def get_kungfu_opt(kungfu_option, opt):
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, \
        SynchronousAveragingOptimizer, PairAveragingOptimizer
    # KungFu configure
    if kungfu_option == KUNGFU.Sync_sgd:
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        opt = PairAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown distributed training optimizer.')
    return opt
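# `get_kungfu_opt` assumes a `KUNGFU` option enum defined elsewhere in the
# project. A minimal sketch of such an enum; the member values are
# illustrative, not taken from the source.
from enum import Enum

class KUNGFU(Enum):
    Sync_sgd = 0   # wrap with SynchronousSGDOptimizer
    Sync_avg = 1   # wrap with SynchronousAveragingOptimizer
    Pair_avg = 2   # wrap with PairAveragingOptimizer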
def build_optimizer(name, n_workers=1):
    learning_rate = 0.1  # base rate, matching the sibling builders below
    # Scale learning rate according to the level of data parallelism
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate * n_workers)
    # KUNGFU: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        return SynchronousSGDOptimizer(optimizer, use_locking=True)
    elif name == 'async-sgd':
        return PairAveragingOptimizer(optimizer)
    elif name == 'sma':
        return SynchronousAveragingOptimizer(optimizer)
    else:
        raise RuntimeError('unknown optimizer: %s' % name)
def build_optimizer():
    # KungFu: adjust learning rate based on number of GPUs.
    # opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
    opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())
    # KungFu: wrap tf.compat.v1.train.Optimizer.
    if args.kf_optimizer == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif args.kf_optimizer == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif args.kf_optimizer == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown KungFu optimizer')
    return opt
def build_optimizer(name, n_shards=1):
    learning_rate = 0.1
    # Scale learning rate according to the level of data parallelism
    optimizer = tf.train.GradientDescentOptimizer(learning_rate * n_shards)
    # KUNGFU: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    elif name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer, fuse_requests=True)
    elif name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    else:
        raise RuntimeError('unknown optimizer: %s' % name)
def build_optimizer(name, batch_size):
    learning_rate = 0.1
    # Scale learning rate according to the level of data parallelism
    optimizer = tf.train.GradientDescentOptimizer(learning_rate * current_cluster_size())
    # KungFu: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    elif name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer)
    elif name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    elif name == 'ada-sgd':
        from kungfu.tensorflow.optimizers import AdaptiveSGDOptimizer
        return AdaptiveSGDOptimizer(optimizer, change_step=300)
    else:
        raise RuntimeError('unknown optimizer: %s' % name)
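# A hedged sketch of how one of these TF1-style builders is typically driven;
# the toy loss and session setup are assumptions, not from the source. Such a
# script is then launched on several workers with KungFu's launcher, e.g.
# `kungfu-run -np 4 python3 train.py` (exact flags depend on the setup).
from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp

w = tf.Variable(tf.ones([10]))
loss = tf.reduce_mean(tf.square(w))  # hypothetical objective
opt = build_optimizer('sync-sgd', batch_size=32)
train_op = opt.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # KungFu: start all workers from identical weights.
    sess.run(BroadcastGlobalVariablesOp())
    sess.run(train_op)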
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])

loss = tf.losses.SparseCategoricalCrossentropy()

# KungFu: adjust learning rate based on number of GPUs.
# opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

# KungFu: wrap tf.compat.v1.train.Optimizer.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)
else:
    raise RuntimeError('Unknown KungFu optimizer')


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)
    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
    # KungFu: broadcast is done after the first gradient step to ensure
    # optimizer initialization. (The excerpt is cut off here; the two
    # broadcast calls below follow the pattern used by the other snippets.)
    if first_batch:
        broadcast_variables(mnist_model.variables)
        broadcast_variables(opt.variables())
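# A hedged sketch of the loop that drives training_step; `dataset` (a
# tf.data.Dataset of (images, labels) batches) and the step count are
# assumptions, not from the source.
for batch_idx, (images, labels) in enumerate(dataset.take(10000)):
    # KungFu: the first batch triggers the variable broadcast above.
    training_step(images, labels, batch_idx == 0)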
def parallel_train(train_model, dataset, config):
    '''Parallel train pipeline of openpose class models

    input model and dataset, the train pipeline will start automatically
    the train pipeline will:
    1.store and restore ckpt in directory ./save_dir/model_name/model_dir
    2.log loss information in directory ./save_dir/model_name/log.txt
    3.visualize model output periodically during training in directory
      ./save_dir/model_name/train_vis_dir
    the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    init_log(config)
    # train hyper params
    # dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = [200000, 300000, 360000, 420000, 480000, 540000,
                      600000, 700000, 800000, 900000]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    n_pos = train_model.n_pos
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, \
        SynchronousAveragingOptimizer, PairAveragingOptimizer

    print(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    # training dataset configure with shuffle, augmentation, and prefetch
    train_dataset = dataset.get_train_dataset()
    dataset_type = dataset.get_dataset_type()
    parts, limbs, data_format = train_model.parts, train_model.limbs, train_model.data_format
    flip_list = get_flip_list(dataset_type)
    paramed_map_fn = get_paramed_map_fn(hin, win, hout, wout, parts, limbs,
                                        flip_list=flip_list, data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),
                                        index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(64)

    # train model configure
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    if config.model.model_type == MODEL.Openpose:
        opt = tf.keras.optimizers.RMSprop(learning_rate=lr)
    else:
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    # load from ckpt
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    # load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone, skip=True)
    except:
        log("pretrained backbone doesn't exist, the model backbone is initialized")
    # load model weights
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimizer!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    # KungFu: split the global step budget across workers
    n_step = n_step // current_cluster_size() + 1
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = decay_step // current_cluster_size() + 1

    # optimize one step
    @tf.function
    def one_step(image, gt_label, mask, train_model, is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            gt_conf = gt_label[:, :n_pos, :, :]
            gt_paf = gt_label[:, n_pos:, :, :]
            pd_conf, pd_paf, stage_confs, stage_pafs = train_model.forward(image, is_train=True)
            pd_loss, loss_confs, loss_pafs = train_model.cal_loss(gt_conf, gt_paf, mask,
                                                                  stage_confs, stage_pafs)
            re_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss = pd_loss + re_loss
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        # KungFu: broadcast weights and optimizer state after the first step
        if is_first_batch:
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return gt_conf, gt_paf, pd_conf, pd_paf, total_loss, re_loss

    # train each step
    tic = time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {}'.format(
        n_step, batch_size, lr_init, lr_decay_steps, lr_decay_factor))
    for image, gt_label, mask in train_dataset:
        # learning rate decay
        if step in lr_decay_steps:
            new_lr_decay = lr_decay_factor**(float(lr_decay_steps.index(step) + 1))
            # assign so the optimizer's learning-rate variable is actually updated
            lr.assign(lr_init * new_lr_decay)
        # optimize one step
        gt_conf, gt_paf, pd_conf, pd_paf, total_loss, re_loss = one_step(
            image.numpy(), gt_label.numpy(), mask.numpy(), train_model, step == 0)
        # save log info periodically
        if (step.numpy() != 0) and (step.numpy() % log_interval) == 0:
            log('Total Loss at iteration {} / {} is: {} Learning rate {} l2_loss {} time:{}'.format(
                step.numpy(), n_step, total_loss, lr.numpy(), re_loss, time.time() - tic))
            tic = time.time()  # reset after logging so the elapsed time is meaningful
        # save result and ckpt periodically
        if (step != 0) and (step % save_interval) == 0 and current_rank() == 0:
            log("saving model ckpt and result...")
            draw_results(image.numpy(), gt_conf.numpy(), pd_conf.numpy(), gt_paf.numpy(),
                         pd_paf.numpy(), mask.numpy(), vis_dir, 'train_%d_' % step)
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")
        # training finished
        if step == n_step:
            break
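# Note on the KungFu step scaling above: `n_step // current_cluster_size() + 1`
# splits the global step budget evenly across workers. For example, a
# 1,000,000-step schedule on 4 workers gives each worker 250,001 steps, and the
# lr_decay_steps boundaries are rescaled the same way (e.g. 200,000 -> 50,001)
# so that decay fires at the same point in the effective schedule.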
def parallel_train(train_model, dataset, config):
    '''Parallel train pipeline of PoseProposal class models

    input model and dataset, the train pipeline will start automatically
    the train pipeline will:
    1.store and restore ckpt in directory ./save_dir/model_name/model_dir
    2.log loss information in directory ./save_dir/model_name/log.txt
    3.visualize model output periodically during training in directory
      ./save_dir/model_name/train_vis_dir
    the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    init_log(config)
    # train hyper params
    # dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    hnei = train_model.hnei
    wnei = train_model.wnei
    model_dir = config.model.model_dir

    # import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, \
        SynchronousAveragingOptimizer, PairAveragingOptimizer

    print(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    # training dataset configure with shuffle, augmentation, and prefetch
    train_dataset = dataset.get_train_dataset()
    parts, limbs, data_format = train_model.parts, train_model.limbs, train_model.data_format
    paramed_map_fn = get_paramed_map_fn(hin, win, hout, wout, hnei, wnei, parts,
                                        limbs, data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),
                                        index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(buffer_size=2)

    # train model configure
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    # load from ckpt
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimizer!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    # KungFu: split the global step budget across workers
    n_step = n_step // current_cluster_size() + 1

    # optimize one step
    @tf.function
    def one_step(image, targets, train_model, is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            delta, tx, ty, tw, th, te, te_mask = targets
            pc, pi, px, py, pw, ph, pe = train_model.forward(image, is_train=True)
            loss_rsp, loss_iou, loss_coor, loss_size, loss_limb = \
                train_model.cal_loss(delta, tx, ty, tw, th, te, te_mask,
                                     pc, pi, px, py, pw, ph, pe)
            pd_loss = loss_rsp + loss_iou + loss_coor + loss_size + loss_limb
            re_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss = pd_loss + re_loss
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        # KungFu: broadcast weights and optimizer state after the first step
        if is_first_batch:
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        predicts = (pc, px, py, pw, ph, pe)
        return predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, \
            loss_coor, loss_size, loss_limb

    # train each step
    tic = time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log(f'Start - n_step: {n_step} batch_size: {batch_size} lr_init: {lr_init} lr_decay_factor: {lr_decay_factor}')
    avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = \
        0., 0., 0., 0., 0., 0., 0.
    for image, targets in train_dataset:
        # learning rate decay (linear); assign so the optimizer sees the update
        lr.assign(lr_init * (1 - step.numpy() / n_step * lr_decay_factor))
        # optimize one step
        predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb = \
            one_step(image, targets, train_model)
        avg_loss_rsp += loss_rsp / log_interval
        avg_loss_iou += loss_iou / log_interval
        avg_loss_coor += loss_coor / log_interval
        avg_loss_size += loss_size / log_interval
        avg_loss_limb += loss_limb / log_interval
        avg_pd_loss += pd_loss / log_interval
        avg_re_loss += re_loss / log_interval
        # save log info periodically
        if (step != 0) and (step % log_interval) == 0:
            log(f"worker:{current_rank()} Train iteration {step.numpy()}/{n_step}, learning rate:{lr.numpy()},"
                + f"loss_rsp:{avg_loss_rsp},loss_iou:{avg_loss_iou},loss_coor:{avg_loss_coor},loss_size:{avg_loss_size},"
                + f"loss_limb:{avg_loss_limb},loss_pd:{avg_pd_loss},loss_re:{avg_re_loss} ,time:{time.time()-tic}")
            tic = time.time()  # reset after logging so the elapsed time is meaningful
            avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = \
                0., 0., 0., 0., 0., 0., 0.
        # save result and ckpt periodically
        if (step != 0) and (step % save_interval) == 0:
            log("saving model ckpt and result...")
            draw_results(image.numpy(), predicts, targets, parts, limbs,
                         save_dir=vis_dir, name=f"ppn_step_{step.numpy()}")
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")
        # training finished
        if step == n_step:
            break
def parallel_train(train_model, dataset, config):
    '''Parallel train pipeline of openpose class models

    input model and dataset, the train pipeline will start automatically
    the train pipeline will:
    1.store and restore ckpt in directory ./save_dir/model_name/model_dir
    2.log loss information in directory ./save_dir/model_name/log.txt
    3.visualize model output periodically during training in directory
      ./save_dir/model_name/train_vis_dir
    the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    init_log(config)
    # train hyper params
    # dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = config.train.lr_decay_steps
    warm_up_step = 8000
    warm_up_decay = 0.01
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, \
        SynchronousAveragingOptimizer, PairAveragingOptimizer

    log(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    # training dataset configure with shuffle, augmentation, and prefetch
    train_dataset = dataset.get_train_dataset()
    dataset_type = dataset.get_dataset_type()
    parts, limbs, data_format = train_model.parts, train_model.limbs, train_model.data_format
    paramed_map_fn = get_paramed_map_fn(hin, win, hout, wout, parts, limbs,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),
                                        index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn,
                                      num_parallel_calls=max(multiprocessing.cpu_count() // 2, 1))
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(64)

    # train configure
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    lr_init = tf.Variable(lr_init, trainable=False)
    opt = tf.optimizers.Adam(learning_rate=lr)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    # load from ckpt
    log("loading ckpt...")
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        log("ckpt loaded successfully!")
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    # load pretrained backbone
    log("loading pretrained backbone...")
    try:
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone, skip=True)
        log("pretrained backbone loaded successfully")
    except:
        log("pretrained backbone doesn't exist, the model backbone is initialized")
    # load model weights
    log("loading saved training model weights...")
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
        log("saved training model weights loaded successfully")
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimizer!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    # KungFu adjust: split the global step budget across workers
    n_step = n_step // current_cluster_size() + 1
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = decay_step // current_cluster_size() + 1
    # re-apply the decay schedule for a step restored from ckpt
    for lr_decay_step in lr_decay_steps:
        if step > lr_decay_step:
            lr.assign(lr * lr_decay_factor)

    # optimize one step
    @tf.function
    def one_step(image, gt_label, mask, train_model, is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            gt_pif_maps, gt_paf_maps = gt_label
            pd_pif_maps, pd_paf_maps = train_model.forward(image, is_train=True)
            loss_pif_maps, loss_paf_maps, total_loss = train_model.cal_loss(
                pd_pif_maps, pd_paf_maps, gt_pif_maps, gt_paf_maps)
            decay_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss += decay_loss
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        # KungFu: broadcast weights and optimizer state after the first step
        if is_first_batch:
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return pd_pif_maps, pd_paf_maps, loss_pif_maps, loss_paf_maps, decay_loss, total_loss

    # train each step
    train_model.train()
    tic = time.time()
    avg_time = AvgMetric(name="time_iter", metric_interval=log_interval)
    # total loss metrics
    avg_total_loss = AvgMetric(name="total_loss", metric_interval=log_interval)
    # decay loss metrics
    avg_decay_loss = AvgMetric(name="decay_loss", metric_interval=log_interval)
    # pif loss metrics
    avg_pif_conf_loss = AvgMetric(name="pif_conf_loss", metric_interval=log_interval)
    avg_pif_vec_loss = AvgMetric(name="pif_vec_loss", metric_interval=log_interval)
    avg_pif_scale_loss = AvgMetric(name="pif_scale_loss", metric_interval=log_interval)
    # paf loss metrics
    avg_paf_conf_loss = AvgMetric(name="paf_conf_loss", metric_interval=log_interval)
    avg_paf_src_vec_loss = AvgMetric(name="paf_src_vec_loss", metric_interval=log_interval)
    avg_paf_dst_vec_loss = AvgMetric(name="paf_dst_vec_loss", metric_interval=log_interval)
    avg_paf_src_scale_loss = AvgMetric(name="paf_src_scale_loss", metric_interval=log_interval)
    avg_paf_dst_scale_loss = AvgMetric(name="paf_dst_scale_loss", metric_interval=log_interval)
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {} weight_decay_factor: {}'.format(
        n_step, batch_size, lr_init.numpy(), lr_decay_steps, lr_decay_factor, weight_decay_factor))
    for image, gt_label, mask, labeled in train_dataset:
        # get losses
        pd_pif_maps, pd_paf_maps, loss_pif_maps, loss_paf_maps, decay_loss, total_loss = \
            one_step(image, gt_label, mask, train_model, step == 0)
        loss_pif_conf, loss_pif_vec, loss_pif_scale = loss_pif_maps
        loss_paf_conf, loss_paf_src_vec, loss_paf_dst_vec, loss_paf_src_scale, loss_paf_dst_scale = loss_paf_maps
        # update metrics
        avg_time.update(time.time() - tic)
        tic = time.time()
        # update total losses
        avg_total_loss.update(total_loss)
        # update decay loss
        avg_decay_loss.update(decay_loss)
        # update pif_losses metrics
        avg_pif_conf_loss.update(loss_pif_conf)
        avg_pif_vec_loss.update(loss_pif_vec)
        avg_pif_scale_loss.update(loss_pif_scale)
        # update paf_losses metrics
        avg_paf_conf_loss.update(loss_paf_conf)
        avg_paf_src_vec_loss.update(loss_paf_src_vec)
        avg_paf_dst_vec_loss.update(loss_paf_dst_vec)
        avg_paf_src_scale_loss.update(loss_paf_src_scale)
        avg_paf_dst_scale_loss.update(loss_paf_dst_scale)
        # learning rate decay; assign so the optimizer sees the update
        if step in lr_decay_steps:
            new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step) + 1)
            lr.assign(lr_init * new_lr_decay)
        # warm-up learning rate decay
        if step <= warm_up_step:
            lr.assign(lr_init.numpy() * warm_up_decay**(1.0 - step.numpy() / warm_up_step))
        # save log info periodically
        if (step.numpy() != 0) and (step.numpy() % log_interval) == 0:
            log(f"Train iteration {step.numpy()} / {n_step}, Learning rate:{lr.numpy()} {avg_total_loss.get_metric()} "
                + f"{avg_pif_conf_loss.get_metric()} {avg_pif_vec_loss.get_metric()} {avg_pif_scale_loss.get_metric()} "
                + f"{avg_paf_conf_loss.get_metric()} {avg_paf_src_vec_loss.get_metric()} {avg_paf_dst_vec_loss.get_metric()} "
                + f"{avg_paf_src_scale_loss.get_metric()} {avg_paf_dst_scale_loss.get_metric()} {avg_decay_loss.get_metric()} {avg_time.get_metric()}")
        # save result and ckpt periodically
        if (step.numpy() != 0) and (step.numpy() % save_interval) == 0:
            # save ckpt
            log("saving model ckpt and result...")
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            # save train model
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")
            # draw result
            stride = train_model.stride
            gt_pif_maps, gt_paf_maps = gt_label
            draw_result(image, pd_pif_maps, pd_paf_maps, gt_pif_maps, gt_paf_maps,
                        mask, parts, limbs, stride, save_dir=vis_dir,
                        name=f"train_{step.numpy()}")
        # training finished
        if step == n_step:
            break
def parallel_train(train_model, dataset, config, augmentor: BasicAugmentor,
                   preprocessor: BasicPreProcessor, postprocessor: BasicPostProcessor,
                   visualizer=BasicVisualizer):
    '''Parallel train pipeline of Openpose class models

    input model and dataset, the train pipeline will start automatically
    the train pipeline will:
    1.store and restore ckpt in directory ./save_dir/model_name/model_dir
    2.log loss information in directory ./save_dir/model_name/log.txt
    3.visualize model output periodically during training in directory
      ./save_dir/model_name/train_vis_dir
    the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    # train hyper params
    # dataset params
    total_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = [200000, 300000, 360000, 420000, 480000, 540000,
                      600000, 700000, 800000, 900000]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    vis_interval = config.train.vis_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    parts, limbs, colors = train_model.parts, train_model.limbs, train_model.colors
    data_format = train_model.data_format
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # metrics
    metric_manager = MetricManager()

    # initializing train dataset
    train_dataset = dataset.get_train_dataset()
    epoch_size = dataset.get_train_datasize() // batch_size
    paramed_map_fn = get_paramed_map_fn(augmentor=augmentor,
                                        preprocessor=preprocessor,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096).repeat()
    train_dataset = train_dataset.map(paramed_map_fn,
                                      num_parallel_calls=get_num_parallel_calls())
    train_dataset = train_dataset.batch(config.train.batch_size)
    train_dataset = train_dataset.prefetch(3)
    train_dataset_iter = iter(train_dataset)

    # train configure
    save_step = tf.Variable(1, trainable=False)
    save_lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.Adam(learning_rate=save_lr)
    domainadapt_flag = config.data.domainadapt_flag
    total_epoch = total_step // epoch_size

    # domain adaptation params
    if not domainadapt_flag:
        ckpt = tf.train.Checkpoint(save_step=save_step, save_lr=save_lr, opt=opt)
    else:
        log("Domain adaptation in training enabled!")
        # weight param
        lambda_adapt = 1e-4
        # construct discriminator model
        feature_hin = train_model.hin // train_model.backbone.scale_size
        feature_win = train_model.win // train_model.backbone.scale_size
        in_channels = train_model.backbone.out_channels
        adapt_dis = Discriminator(feature_hin, feature_win, in_channels,
                                  data_format=data_format)
        opt_d = tf.keras.optimizers.Adam(learning_rate=save_lr)
        ckpt = tf.train.Checkpoint(save_step=save_step, save_lr=save_lr,
                                   opt=opt, opt_d=opt_d)
        # construct domain adaptation dataset
        dmadapt_train_dataset = dataset.get_dmadapt_train_dataset()
        paramed_dmadapt_map_fn = get_paramed_dmadapt_map_fn(augmentor)
        dmadapt_train_dataset = dmadapt_train_dataset.map(
            paramed_dmadapt_map_fn, num_parallel_calls=get_num_parallel_calls())
        dmadapt_train_dataset = dmadapt_train_dataset.shuffle(buffer_size=4096).repeat()
        dmadapt_train_dataset = dmadapt_train_dataset.batch(config.train.batch_size)
        dmadapt_train_dataset = dmadapt_train_dataset.prefetch(3)
        dmadapt_train_dataset_iter = iter(dmadapt_train_dataset)

    # load from ckpt
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    # load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone, skip=True)
    except:
        log("pretrained backbone doesn't exist, the model backbone is initialized")
    # load model weights
    try:
        log("loading saved training model weights...")
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")
    if domainadapt_flag:
        try:
            log("loading saved domain adaptation discriminator weight...")
            adapt_dis.load_weights(os.path.join(model_dir, "newest_discriminator.npz"))
        except:
            log("discriminator path doesn't exist, discriminator parameters are initialized")

    log(f"Parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    step = save_step.numpy()
    lr = save_lr.numpy()

    # import kungfu
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, \
        SynchronousAveragingOptimizer, PairAveragingOptimizer

    # KungFu: split the step and epoch budget across workers
    total_step = total_step // current_cluster_size() + 1
    total_epoch = total_epoch // current_cluster_size() + 1
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = decay_step // current_cluster_size() + 1

    # optimize one step
    def optimize_step(image, mask, target_x, train_model,
                      metric_manager: MetricManager):
        # tape
        with tf.GradientTape() as tape:
            predict_x = train_model.forward(x=image, is_train=True,
                                            ret_backbone=domainadapt_flag)
            total_loss = train_model.cal_loss(predict_x=predict_x, target_x=target_x,
                                              mask=mask, metric_manager=metric_manager)
        # optimize model
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        return predict_x

    def optimize_step_dmadapt(image_src, image_dst, train_model,
                              adapt_dis: Discriminator,
                              metric_manager: MetricManager):
        # tape
        with tf.GradientTape(persistent=True) as tape:
            # feature extraction
            # src feature
            predict_src = train_model.forward(x=image_src, is_train=True, ret_backbone=True)
            backbone_feature_src = predict_src["backbone_features"]
            adapt_pd_src = adapt_dis.forward(backbone_feature_src)
            # dst feature
            predict_dst = train_model.forward(x=image_dst, is_train=True, ret_backbone=True)
            backbone_feature_dst = predict_dst["backbone_features"]
            adapt_pd_dst = adapt_dis.forward(backbone_feature_dst)
            # loss calculation
            # loss of g
            g_adapt_loss = adapt_dis.cal_loss(x=adapt_pd_dst, label=True) * lambda_adapt
            # loss of d
            d_adapt_loss_src = adapt_dis.cal_loss(x=adapt_pd_src, label=True)
            d_adapt_loss_dst = adapt_dis.cal_loss(x=adapt_pd_dst, label=False)
            d_adapt_loss = (d_adapt_loss_src + d_adapt_loss_dst) / 2
        # optimize model
        g_gradient = tape.gradient(g_adapt_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(g_gradient, train_model.trainable_weights))
        metric_manager.update("model/g_adapt_loss", g_adapt_loss)
        # optimize dis
        d_gradients = tape.gradient(d_adapt_loss, adapt_dis.trainable_weights)
        opt_d.apply_gradients(zip(d_gradients, adapt_dis.trainable_weights))
        metric_manager.update("dis/d_adapt_loss_src", d_adapt_loss_src)
        metric_manager.update("dis/d_adapt_loss_dst", d_adapt_loss_dst)
        # delete persistent tape
        del tape
        return predict_dst

    # formal training procedure
    # KungFu configure
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimizer!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    train_model.train()
    cur_epoch = step // epoch_size + 1
    log(f"Start Training- total_epoch: {total_epoch} total_step: {total_step} current_epoch:{cur_epoch} "
        + f"current_step:{step} batch_size:{batch_size} lr_init:{lr_init} lr_decay_steps:{lr_decay_steps} "
        + f"lr_decay_factor:{lr_decay_factor} weight_decay_factor:{weight_decay_factor}")
    for epoch_idx in range(cur_epoch, total_epoch):
        log(f"Epoch {epoch_idx}/{total_epoch}:")
        for _ in tqdm(range(0, epoch_size)):
            step += 1
            metric_manager.start_timing()
            image, mask, target_list = next(train_dataset_iter)
            # extract gt_label
            target_list = [cPickle.loads(target) for target in target_list.numpy()]
            target_x = {key: [] for key, value in target_list[0].items()}
            target_x = reduce(
                lambda x, y: {key: x[key] + [y[key]] for key, value in x.items()},
                [target_x] + target_list)
            target_x = {key: np.stack(value) for key, value in target_x.items()}
            target_x = to_tensor_dict(target_x)

            # learning rate decay
            if step in lr_decay_steps:
                new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step) + 1)
                lr = lr_init * new_lr_decay
                # keep the optimizer's learning-rate variable in sync
                save_lr.assign(lr)

            # optimize one step
            predict_x = optimize_step(image, mask, target_x, train_model, metric_manager)
            # optimize domain adaptation
            if domainadapt_flag:
                src_image = image
                dst_image = next(dmadapt_train_dataset_iter)
                predict_dst = optimize_step_dmadapt(src_image, dst_image, train_model,
                                                    adapt_dis, metric_manager)
            # KungFu: broadcast weights and optimizer state after the first step
            if step == 1:
                broadcast_variables(train_model.all_weights)
                broadcast_variables(opt.variables())

            # log info periodically
            if (step != 0) and (step % log_interval) == 0:
                log(f"Train Epoch={epoch_idx} / {total_epoch}, Step={step} / {total_step}: "
                    + f"learning_rate: {lr:.6e} {metric_manager.report_timing()}\n"
                    + f"{metric_manager.report_train()} ")
            # visualize periodically
            if (step != 0) and (step % vis_interval) == 0 and current_rank() == 0:
                log("Visualizing prediction maps and target maps")
                visualizer.visual_compare(image_batch=image.numpy(), mask_batch=mask.numpy(),
                                          predict_x=predict_x, target_x=target_x,
                                          name=f"train_{step}")
            # save result and ckpt periodically
            if (step != 0) and (step % save_interval) == 0 and current_rank() == 0:
                # save ckpt
                log("saving model ckpt and result...")
                save_step.assign(step)
                save_lr.assign(lr)
                ckpt_save_path = ckpt_manager.save()
                log(f"ckpt save_path:{ckpt_save_path} saved!\n")
                # save train model
                model_save_path = os.path.join(model_dir, "newest_model.npz")
                train_model.save_weights(model_save_path)
                log(f"model save_path:{model_save_path} saved!\n")
                # save discriminator model
                if domainadapt_flag:
                    dis_save_path = os.path.join(model_dir, "newest_discriminator.npz")
                    adapt_dis.save_weights(dis_save_path)
                    log(f"discriminator save_path:{dis_save_path} saved!\n")
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# KungFu: adjust learning rate based on number of GPUs.
opt = keras.optimizers.Adadelta(1.0 * current_cluster_size())

# KungFu: wrap distributed optimizers.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt, with_keras=True)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt, with_keras=True)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt, with_keras=True)
else:
    # fixed: this previously referenced an undefined `name`
    raise RuntimeError('unknown optimizer: %s' % args.kf_optimizer)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [BroadcastGlobalVariablesCallback(with_keras=True)]

# KungFu: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if current_rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train,
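# The `model.fit` call above is cut off in the source. A hedged completion in
# the usual KungFu Keras style; `y_train`, `batch_size`, and `epochs` are
# assumptions, not from the source.
model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=callbacks,
          # KungFu: let only worker 0 print progress to keep logs readable.
          verbose=1 if current_rank() == 0 else 0)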
args.cuda = not args.no_cuda
reshape = 1 if args.reshape_on else 0

# Set up standard model.
model = getattr(applications, args.model)(weights=None)

# opt = tf.optimizers.SGD(0.01)
opt = tf.compat.v1.train.GradientDescentOptimizer(0.01)

# KungFu: wrap tf.compat.v1.train.Optimizer.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt, reshape=args.reshape_on, use_locking=True)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)  # match this to resnet KF
else:
    raise RuntimeError('Unknown KungFu optimizer')

data = tf.random.uniform([args.batch_size, 224, 224, 3])
target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)


@tf.function
def benchmark_step(first_batch):
    # reshape strategy here
    # reshape_strategy(reshape)
    # gradient calculation and updates
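# The body of benchmark_step is cut off in the source. A hedged completion in
# the style of KungFu's synthetic benchmarks; the loss choice and broadcast
# placement are assumptions, not from the source.
from kungfu.tensorflow.initializer import broadcast_variables

@tf.function
def benchmark_step(first_batch):
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(target, probs))
    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    # KungFu: broadcast after the first step so optimizer state exists
    # before it is synchronized.
    if first_batch:
        broadcast_variables(model.variables)
        broadcast_variables(opt.variables())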
def parallel_train(training_dataset, kungfu_option):
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, \
        SynchronousAveragingOptimizer, PairAveragingOptimizer

    ds = training_dataset.shuffle(buffer_size=4096)
    ds = ds.shard(num_shards=current_cluster_size(), index=current_rank())
    ds = ds.repeat(n_epoch)
    ds = ds.map(_map_fn, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=1)
    iterator = ds.make_one_shot_iterator()
    one_element = iterator.get_next()
    net, total_loss, log_tensors = make_model(*one_element, is_train=True, reuse=False)
    x_ = net.img  # net input
    last_conf = net.last_conf  # net output
    last_paf = net.last_paf  # net output
    confs_ = net.confs  # GT
    pafs_ = net.pafs  # GT
    mask = net.m1  # mask1, GT
    # net.m2 = m2  # mask2, GT
    stage_losses = net.stage_losses
    l2_loss = net.l2_loss

    global_step = tf.Variable(1, trainable=False)
    # scaled_lr = lr_init * current_cluster_size()  # Horovod: scale the learning rate linearly
    scaled_lr = lr_init  # Linear scaling rule is not working in openpose training.
    with tf.variable_scope('learning_rate'):
        lr_v = tf.Variable(scaled_lr, trainable=False)

    opt = tf.train.MomentumOptimizer(lr_v, 0.9)

    # KungFu
    if kungfu_option == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif kungfu_option == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown distributed training optimizer.')
    train_op = opt.minimize(total_loss, global_step=global_step)

    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # KungFu: broadcast the initial variables from worker 0
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    bcast = BroadcastGlobalVariablesOp()

    global n_step, lr_decay_every_step
    n_step = n_step // current_cluster_size() + 1  # KungFu
    lr_decay_every_step = lr_decay_every_step // current_cluster_size() + 1  # KungFu

    # Start training
    with tf.Session(config=config) as sess:
        init.run()
        bcast.run()  # KungFu
        print('Worker{}: Initialized'.format(current_rank()))
        print('Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'.format(
            current_rank(), n_step, batch_size, lr_init, lr_decay_every_step))

        # restore pre-trained weights
        try:
            # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
            tl.files.load_and_assign_npz_dict(sess=sess,
                                              name=os.path.join(model_path, 'pose.npz'))
        except:
            print("no pre-trained model")

        # train until the end
        while True:
            step = sess.run(global_step)
            if step == n_step:
                break

            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print('Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'.format(
                current_rank(), step, n_step, _loss, lr, _l2, time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                # fixed: format the worker id instead of printing a bare 'Worker{}:'
                print('Worker{}:'.format(current_rank()), 'Network#', ix,
                      'For Branch', ix % 2 + 1, 'Loss:', ll)

            # save intermediate results and model
            if current_rank() == 0:  # KungFu
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [img_out, confs_ground, pafs_ground, conf_result, paf_result, mask_out] = \
                        sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result, pafs_ground,
                                 paf_result, mask_out, 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(model_path, 'pose' + str(step) + '.npz'),
                                           sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(model_path, 'pose.npz'),
                                           sess=sess)