# NOTE(review): fragment of a training script — the argparse setup above and
# the rest of the graph/training code below are outside this view.
parser.add_argument('--quant-delay', type=int, default=-1)
args = parser.parse_args()

modelpath = logpath = '../models/train/'

if args.gpus <= 0:
    raise Exception('gpus <= 0')

# define input placeholder
set_network_input_wh(args.input_width, args.input_height)
scale = 4
if args.model in ['cmu', 'vgg'] or 'mobilenet' in args.model:
    # these backbones downsample by a factor of 8 rather than 4
    scale = 8
set_network_scale(scale)

# output map resolution is the input resolution divided by the network stride
output_w, output_h = args.input_width // scale, args.input_height // scale

logger.info('define model+')
# placeholders pinned to CPU; shapes: image NHWC, 38 PAF channels, 19 heatmap channels
with tf.device(tf.DeviceSpec(device_type="CPU")):
    input_node = tf.placeholder(tf.float32,
                                shape=(args.batchsize, args.input_height, args.input_width, 3),
                                name='image')
    vectmap_node = tf.placeholder(tf.float32,
                                  shape=(args.batchsize, output_h, output_w, 38),
                                  name='vectmap')
    heatmap_node = tf.placeholder(tf.float32,
                                  shape=(args.batchsize, output_h, output_w, 19),
                                  name='heatmap')

    # prepare data
    # NOTE(review): collapsed source makes the exact nesting ambiguous; the
    # data-prep statements are assumed to sit inside the device scope — confirm.
    df = get_dataflow_batch(args.datapath, True, args.batchsize, img_path=args.imgpath)
    enqueuer = DataFlowToQueue(df, [input_node, heatmap_node, vectmap_node], queue_size=100)
    q_inp, q_heat, q_vect = enqueuer.dequeue()

    df_valid = get_dataflow_batch(args.datapath, False, args.batchsize, img_path=args.imgpath)
    df_valid.reset_state()
            # Best-effort close: the queue may already be closed at shutdown,
            # so any failure here is deliberately swallowed.
            self.close_op.run()
        except Exception:
            pass
        logger.info("{} Exited.".format(self.name))

    def dequeue(self):
        # Returns the TF dequeue op; tensors materialize when the op is run.
        return self.queue.dequeue()


if __name__ == '__main__':
    # Smoke test for dataflow throughput — force CPU-only execution.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''

    from pose_augment import set_network_input_wh, set_network_scale

    # set_network_input_wh(368, 368)
    set_network_input_wh(480, 320)
    set_network_scale(8)

    # df = get_dataflow('/data/public/rw/coco/annotations', True, '/data/public/rw/coco/')
    df = _get_dataflow_onlyread('/data/public/rw/coco/annotations', True, '/data/public/rw/coco/')
    # df = get_dataflow('/root/coco/annotations', False, img_path='http://gpu-twg.kakaocdn.net/braincloud/COCO/')

    from tensorpack.dataflow.common import TestDataSpeed
    TestDataSpeed(df).start()
    # NOTE(review): everything below sys.exit(0) is unreachable dead code.
    sys.exit(0)

    with tf.Session() as sess:
        df.reset_state()
        t1 = time.time()
        for idx, dp in enumerate(df.get_data()):
            if idx == 0:
                # (fragment ends here; the loop body continues outside this view)
def _str2bool(text):
    """Parse a command-line boolean value.

    argparse's ``type=bool`` is a known trap: ``bool('False')`` is ``True``
    because every non-empty string is truthy.  This converter parses the
    text explicitly and raises so argparse reports a clean usage error.
    """
    if isinstance(text, bool):
        return text
    if text.lower() in ('1', 'true', 't', 'yes', 'y'):
        return True
    if text.lower() in ('0', 'false', 'f', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % text)


def train():
    """Build and train the MobileNetV2-backed OpenPose network.

    Parses command-line options, wires a COCO dataflow into a TF queue,
    builds the PAF/heatmap stages on top of MobileNetV2 features, and runs
    the training loop with periodic summaries, validation and checkpoints.

    Returns:
        0 once ``--max_echos`` epochs have completed.
    """
    parser = argparse.ArgumentParser(
        description='Training codes for Openpose using Tensorflow')
    # BUG FIX: numeric options were declared type=str, so any value passed on
    # the command line arrived as a string and broke arithmetic/comparisons
    # below (the int defaults masked this when flags were omitted).
    parser.add_argument('--batch_size', type=int, default=10)
    # BUG FIX: type=bool parses '--continue_training False' as True; use an
    # explicit converter instead.
    parser.add_argument('--continue_training', type=_str2bool, default=False)
    parser.add_argument('--checkpoint_path', type=str,
                        default='checkpoints/train/mn_sepconv_33')
    # parser.add_argument('--backbone_net_ckpt_path', type=str, default='checkpoints/vgg/vgg_19.ckpt')
    parser.add_argument(
        '--backbone_net_ckpt_path', type=str,
        default='checkpoints/mobilenet/mobilenet_v2_1.0_96.ckpt')
    parser.add_argument('--train_vgg', type=_str2bool, default=True)
    parser.add_argument('--annot_path', type=str, default='./COCO/annotations/')
    parser.add_argument('--img_path', type=str, default='./COCO/images/')
    parser.add_argument('--save_checkpoint_frequency', type=int, default=1000)
    parser.add_argument('--save_summary_frequency', type=int, default=100)
    parser.add_argument('--stage_num', type=int, default=6)
    parser.add_argument('--hm_channels', type=int, default=19)
    parser.add_argument('--paf_channels', type=int, default=38)
    parser.add_argument('--input-width', type=int, default=368)
    parser.add_argument('--input-height', type=int, default=368)
    parser.add_argument('--max_echos', type=int, default=5)
    parser.add_argument('--use_bn', type=_str2bool, default=False)
    parser.add_argument('--loss_func', type=str, default='l2')
    args = parser.parse_args()

    # Fresh runs get a timestamped checkpoint directory; resumed runs reuse
    # the directory given on the command line.
    if not args.continue_training:
        start_time = time.localtime(time.time())
        checkpoint_path = args.checkpoint_path + ('%d-%d-%d-%d-%d-%d' % start_time[0:6])
        os.mkdir(checkpoint_path)
    else:
        checkpoint_path = args.checkpoint_path

    # Log both to the console and to a file inside the checkpoint directory.
    logger = logging.getLogger('train')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(checkpoint_path + '/train_log.log')
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('checkpoint_path: ' + checkpoint_path)

    # define input placeholder
    with tf.name_scope('inputs'):
        raw_img = tf.placeholder(tf.float32, shape=[args.batch_size, 368, 368, 3])
        hm = tf.placeholder(dtype=tf.float32,
                            shape=[args.batch_size, 46, 46, args.hm_channels])
        paf = tf.placeholder(
            dtype=tf.float32,
            shape=[args.batch_size, 46, 46, args.paf_channels])

    # define data loader
    logger.info('initializing data loader...')
    set_network_input_wh(args.input_width, args.input_height)
    scale = 8
    set_network_scale(scale)
    df = get_dataflow_batch(args.annot_path, True, args.batch_size, img_path=args.img_path)
    steps_per_echo = df.size()
    enqueuer = DataFlowToQueue(df, [raw_img, hm, paf], queue_size=100)
    q_inp, q_heat, q_vect = enqueuer.dequeue()
    # Single-GPU setup: tf.split with 1 keeps the multi-tower code shape.
    q_inp_split, q_heat_split, q_vect_split = \
        tf.split(q_inp, 1), tf.split(q_heat, 1), tf.split(q_vect, 1)

    img_normalized = q_inp_split[0] / 255 - 0.5  # [-0.5, 0.5]

    df_valid = get_dataflow_batch(args.annot_path, False, args.batch_size, img_path=args.img_path)
    df_valid.reset_state()
    validation_cache = []

    logger.info('initializing model...')
    layers = {}
    name = ""
    with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()):
        logits, endpoints = mobilenet_v2.mobilenet(img_normalized)
    for k, tensor in sorted(list(endpoints.items()), key=lambda x: x[0]):
        layers['%s%s' % (name, k)] = tensor

    def upsample(tensor, ref):
        # Resize ``tensor`` to ``ref``'s spatial size so they can be concatenated.
        # (renamed from ``input`` to avoid shadowing the builtin)
        return tf.image.resize_bilinear(
            tensor,
            tf.constant([ref.shape[1].value, ref.shape[2].value]),
            align_corners=False)

    # Fuse a mid-level and an upsampled deeper MobileNetV2 feature map.
    mobilenet_feature = tf.concat([
        layers['layer_7/output'],
        upsample(layers['layer_14/output'], layers['layer_7/output'])
    ], 3)

    # get net graph
    net = PafNet(inputs_x=mobilenet_feature, stage_num=args.stage_num,
                 hm_channel_num=args.hm_channels, use_bn=args.use_bn)
    hm_pre, paf_pre, added_layers_out = net.gen_net()

    # two kinds of loss: plain squared error or tf.nn.l2_loss, one term per stage
    losses = []
    with tf.name_scope('loss'):
        for idx, (l1, l2) in enumerate(zip(hm_pre, paf_pre)):
            if args.loss_func == 'square':
                hm_loss = tf.reduce_sum(
                    tf.square(tf.concat(l1, axis=0) - q_heat_split[0]))
                paf_loss = tf.reduce_sum(
                    tf.square(tf.concat(l2, axis=0) - q_vect_split[0]))
                losses.append(tf.reduce_sum([hm_loss, paf_loss]))
                logger.info('use square loss')
            else:
                hm_loss = tf.nn.l2_loss(
                    tf.concat(l1, axis=0) - q_heat_split[0])
                paf_loss = tf.nn.l2_loss(
                    tf.concat(l2, axis=0) - q_vect_split[0])
                losses.append(tf.reduce_mean([hm_loss, paf_loss]))
                logger.info('use l2 loss')
        loss = tf.reduce_sum(losses) / args.batch_size

    global_step = tf.Variable(0, name='global_step', trainable=False)
    # Halve the learning rate once per epoch worth of steps.
    learning_rate = tf.train.exponential_decay(1e-4, global_step,
                                               steps_per_echo, 0.5, staircase=True)
    trainable_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope='openpose_layers')
    if args.train_vgg:
        # Also fine-tune the backbone (flag name kept for CLI compatibility).
        trainable_var_list = trainable_var_list + tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='MobilenetV2')

    with tf.name_scope('train'):
        train = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       epsilon=1e-8).minimize(
            loss=loss, global_step=global_step, var_list=trainable_var_list)

    logger.info('initialize saver...')
    # restorer loads only backbone weights; saver checkpoints all trainables
    restorer = tf.train.Saver(tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='MobilenetV2'),
        name='mobilenet_restorer')
    saver = tf.train.Saver(trainable_var_list)

    logger.info('initialize tensorboard')
    tf.summary.scalar("lr", learning_rate)
    tf.summary.scalar("loss2", loss)
    tf.summary.histogram('img_normalized', img_normalized)
    tf.summary.histogram('mobilenet_outputs', logits)
    tf.summary.histogram('added_layers_out', added_layers_out)
    # Channel-wise image summaries: move channels to the batch axis.
    tf.summary.image('mobilenet_out',
                     tf.transpose(logits[0:1, :, :, :], perm=[3, 1, 2, 0]),
                     max_outputs=512)
    tf.summary.image('added_layers_out',
                     tf.transpose(added_layers_out[0:1, :, :, :], perm=[3, 1, 2, 0]),
                     max_outputs=128)
    tf.summary.image('paf_gt',
                     tf.transpose(q_vect_split[0][0:1, :, :, :], perm=[3, 1, 2, 0]),
                     max_outputs=38)
    tf.summary.image('hm_gt',
                     tf.transpose(q_heat_split[0][0:1, :, :, :], perm=[3, 1, 2, 0]),
                     max_outputs=19)
    for i in range(args.stage_num):
        tf.summary.image('hm_pre_stage_%d' % i,
                         tf.transpose(hm_pre[i][0:1, :, :, :], perm=[3, 1, 2, 0]),
                         max_outputs=19)
        tf.summary.image('paf_pre_stage_%d' % i,
                         tf.transpose(paf_pre[i][0:1, :, :, :], perm=[3, 1, 2, 0]),
                         max_outputs=38)
    tf.summary.image('input', img_normalized, max_outputs=4)

    logger.info('initialize session...')
    merged = tf.summary.merge_all()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        writer = tf.summary.FileWriter(checkpoint_path, sess.graph)
        sess.run(tf.group(tf.global_variables_initializer()))
        if args.backbone_net_ckpt_path is not None:
            logger.info('restoring mobilenet weights from %s' % args.backbone_net_ckpt_path)
            restorer.restore(sess, args.backbone_net_ckpt_path)
        if args.continue_training:
            saver.restore(
                sess,
                tf.train.latest_checkpoint(checkpoint_dir=checkpoint_path))
            logger.info('restoring from checkpoint...')
        logger.info('start training...')
        coord = tf.train.Coordinator()
        enqueuer.set_coordinator(coord)
        enqueuer.start()

        # BUG FIX: best_checkpoint used to be reset to +inf at the top of every
        # pass through the outer loop, so "best" never compared across epochs.
        best_checkpoint = float('inf')
        while True:
            for _ in tqdm(range(steps_per_echo)):
                total_loss, _, gs_num = sess.run([loss, train, global_step])
                echo = gs_num / steps_per_echo

                if gs_num % args.save_summary_frequency == 0:
                    total_loss, gs_num, summary, lr = sess.run(
                        [loss, global_step, merged, learning_rate])
                    writer.add_summary(summary, gs_num)
                    logger.info('echos=%f, setp=%d, total_loss=%f, lr=%f' %
                                (echo, gs_num, total_loss, lr))

                if gs_num % args.save_checkpoint_frequency == 0:
                    valid_loss = 0
                    # Lazily cache the validation set in memory on first use,
                    # then release the dataflow.
                    if len(validation_cache) == 0:
                        for images_test, heatmaps, vectmaps in tqdm(df_valid.get_data()):
                            validation_cache.append((images_test, heatmaps, vectmaps))
                        df_valid.reset_state()
                        del df_valid
                        df_valid = None
                    for images_test, heatmaps, vectmaps in validation_cache:
                        valid_loss += sess.run(loss, feed_dict={
                            q_inp: images_test,
                            q_vect: vectmaps,
                            q_heat: heatmaps
                        })
                    mean_valid_loss = valid_loss / len(validation_cache)
                    if mean_valid_loss <= best_checkpoint:
                        best_checkpoint = mean_valid_loss
                        saver.save(sess,
                                   save_path=checkpoint_path + '/' + 'model',
                                   global_step=gs_num)
                        logger.info(
                            'best_checkpoint = %f, saving checkpoint to ' % best_checkpoint
                            + checkpoint_path + '/' + 'model-%d' % gs_num)
                    else:
                        # BUG FIX: '%' binds tighter than '/', so the original
                        # ('loss = %f drop' % valid_loss / len(...)) divided a
                        # formatted *string* by an int and raised TypeError.
                        logger.info('loss = %f drop' % mean_valid_loss)

                if echo >= args.max_echos:
                    sess.close()
                    return 0
from pose_dataset import get_dataflow_batch
from pose_augment import set_network_input_wh, set_network_scale


def _str2bool(text):
    """Parse a command-line boolean value.

    argparse's ``type=bool`` is a known trap: ``bool('False')`` is ``True``
    because every non-empty string is truthy.  This converter parses the
    text explicitly and raises so argparse reports a clean usage error.
    """
    if isinstance(text, bool):
        return text
    if text.lower() in ('1', 'true', 't', 'yes', 'y'):
        return True
    if text.lower() in ('0', 'false', 'f', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % text)


if __name__ == '__main__':
    """
    OpenPose Data Preparation might be a bottleneck for training.
    You can run multiple workers to generate input batches in multi-nodes to make training process faster.
    """
    parser = argparse.ArgumentParser(
        description='Worker for preparing input batches.')
    parser.add_argument('--datapath', type=str, default='/coco/annotations/')
    parser.add_argument('--imgpath', type=str, default='/coco/')
    parser.add_argument('--batchsize', type=int, default=64)
    # BUG FIX: was type=bool, so '--train False' still parsed as True.
    parser.add_argument('--train', type=_str2bool, default=True)
    parser.add_argument('--master', type=str,
                        default='tcp://csi-cluster-gpu20.dakao.io:1027')
    parser.add_argument('--input-width', type=int, default=368)
    parser.add_argument('--input-height', type=int, default=368)
    parser.add_argument('--scale-factor', type=int, default=2)
    args = parser.parse_args()

    # Configure augmentation geometry before building the dataflow.
    set_network_input_wh(args.input_width, args.input_height)
    set_network_scale(args.scale_factor)

    df = get_dataflow_batch(args.datapath, args.train, args.batchsize, args.imgpath)

    # Stream prepared batches to the master trainer over ZMQ.
    send_dataflow_zmq(df, args.master, hwm=10)
import argparse

from tensorpack.dataflow.remote import send_dataflow_zmq

from pose_dataset import get_dataflow_batch
from pose_augment import set_network_input_wh, set_network_scale


def _str2bool(text):
    """Parse a command-line boolean value.

    argparse's ``type=bool`` is a known trap: ``bool('False')`` is ``True``
    because every non-empty string is truthy.  This converter parses the
    text explicitly and raises so argparse reports a clean usage error.
    """
    if isinstance(text, bool):
        return text
    if text.lower() in ('1', 'true', 't', 'yes', 'y'):
        return True
    if text.lower() in ('0', 'false', 'f', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % text)


if __name__ == '__main__':
    """
    OpenPose Data Preparation might be a bottleneck for training.
    You can run multiple workers to generate input batches in multi-nodes to make training process faster.
    """
    parser = argparse.ArgumentParser(description='Worker for preparing input batches.')
    parser.add_argument('--datapath', type=str, default='/coco/annotations/')
    parser.add_argument('--imgpath', type=str, default='/coco/')
    parser.add_argument('--batchsize', type=int, default=64)
    # BUG FIX: was type=bool, so '--train False' still parsed as True.
    parser.add_argument('--train', type=_str2bool, default=True)
    parser.add_argument('--master', type=str,
                        default='tcp://csi-cluster-gpu20.dakao.io:1027')
    parser.add_argument('--input-width', type=int, default=368)
    parser.add_argument('--input-height', type=int, default=368)
    parser.add_argument('--scale-factor', type=int, default=2)
    args = parser.parse_args()

    # Configure augmentation geometry before building the dataflow.
    set_network_input_wh(args.input_width, args.input_height)
    set_network_scale(args.scale_factor)

    df = get_dataflow_batch(args.datapath, args.train, args.batchsize, args.imgpath)

    # Stream prepared batches to the master trainer over ZMQ.
    send_dataflow_zmq(df, args.master, hwm=10)
# NOTE(review): fragment of a second training script — argparse setup above
# and the rest of the graph construction below are outside this view.
parser.add_argument('--input-width', type=int, default=368)
parser.add_argument('--input-height', type=int, default=368)
args = parser.parse_args()

if args.gpus <= 0:
    raise Exception('gpus <= 0')

# define input placeholder
set_network_input_wh(args.input_width, args.input_height)
scale = 4
if args.model in ['cmu', 'vgg', 'mobilenet_thin', 'mobilenet_try',
                  'mobilenet_try2', 'mobilenet_try3', 'hybridnet_try']:
    # these variants downsample by a factor of 8 rather than 4
    scale = 8
set_network_scale(scale)

# output map resolution is the input resolution divided by the network stride
output_w, output_h = args.input_width // scale, args.input_height // scale

logger.info('define model+')
# placeholders pinned to GPU 0; shapes: image NHWC, 38 PAF channels, 19 heatmap channels
with tf.device(tf.DeviceSpec(device_type="GPU", device_index=0)):
    input_node = tf.placeholder(tf.float32,
                                shape=(args.batchsize, args.input_height, args.input_width, 3),
                                name='image')
    vectmap_node = tf.placeholder(tf.float32,
                                  shape=(args.batchsize, output_h, output_w, 38),
                                  name='vectmap')
    heatmap_node = tf.placeholder(tf.float32,
                                  shape=(args.batchsize, output_h, output_w, 19),
                                  name='heatmap')

    # prepare data
    # NOTE(review): collapsed source makes the exact nesting ambiguous; the
    # data-prep statements are assumed to sit inside the device scope — confirm.
    if not args.remote_data:
        df = get_dataflow_batch(args.datapath, True, args.batchsize, img_path=args.imgpath)
    else:
        # transfer inputs from ZMQ
        df = RemoteDataZMQ(args.remote_data, hwm=3)
    enqueuer = DataFlowToQueue(df, [input_node, heatmap_node, vectmap_node], queue_size=100)