Example #1
def train_ops(points, covars, labels, true_state, is_training, num_classes,
              voxel_num, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init, global_step, decay_steps,
                                        FLAGS.decay_rate, staircase=True)
        lr = tf.maximum(FLAGS.lr_clip, lr)
        tf.summary.scalar('learning rate', lr)

        decay_steps = epoch_batch_num * FLAGS.recon_decay_epoch
        recon_loss_ratio = tf.train.exponential_decay(1.0, global_step, decay_steps,
                                                      FLAGS.recon_decay_rate,
                                                      staircase=True)
        tf.summary.scalar('reconstruction loss ratio', recon_loss_ratio)

        opt = tf.train.AdamOptimizer(lr)

        with tf.name_scope('split_data'):
            tower_points = tf.split(points, FLAGS.num_gpus)
            tower_covars = tf.split(covars, FLAGS.num_gpus)
            tower_labels = tf.split(labels, FLAGS.num_gpus)
            tower_true_state = tf.split(true_state, FLAGS.num_gpus)

        reuse = False
        tower_grads = []
        tower_recon_grads = []
        tower_losses, tower_recon_losses = [], []
        tower_logits = []
        tower_voxel_state = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    loss, recon_loss, logits, voxel_state = tower_loss(
                        tower_points[i], tower_covars[i], tower_labels[i],
                        tower_true_state[i], is_training, num_classes,
                        voxel_num, reuse)

                    grad = opt.compute_gradients(loss + (recon_loss * recon_loss_ratio))
                    tower_grads.append(grad)

                    all_var = tf.trainable_variables()
                    recon_var = [var for var in all_var
                                 if var.name.startswith('point_mlp') or
                                 var.name.startswith('fc') or
                                 var.name.startswith('voxel_state')]
                    recon_grad = opt.compute_gradients(recon_loss * recon_loss_ratio,
                                                       var_list=recon_var)
                    tower_recon_grads.append(recon_grad)

                    tower_losses.append(loss)
                    tower_recon_losses.append(recon_loss)
                    tower_logits.append(logits)
                    tower_voxel_state.append(voxel_state)
                    update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                    reuse = True

        avg_grad = average_gradients(tower_grads)
        avg_recon_grad = average_gradients(tower_recon_grads)

        with tf.control_dependencies(update_op):
            apply_grad_op = opt.apply_gradients(avg_grad, global_step=global_step)

        apply_recon_grad_op = opt.apply_gradients(avg_recon_grad)

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        total_loss_op = tf.add_n(tower_losses) / FLAGS.num_gpus
        total_recon_loss_op = tf.add_n(tower_recon_losses) / FLAGS.num_gpus

        logits_op = tf.concat(tower_logits, axis=0)
        preds_op = tf.argmax(logits_op, axis=1)
        correct_num_op = tf.reduce_sum(tf.cast(tf.equal(preds_op, labels), tf.float32))

        voxel_state_op = tf.concat(tower_voxel_state, axis=0)

        ops['total_loss'] = total_loss_op
        ops['total_recon_loss'] = total_recon_loss_op
        ops['apply_grad'] = apply_grad_op
        ops['apply_recon_grad'] = apply_recon_grad_op
        ops['logits'] = logits_op
        ops['preds'] = preds_op
        ops['correct_num'] = correct_num_op
        ops['summary'] = summary_op
        ops['global_step'] = global_step
        ops['voxel_state'] = voxel_state_op

    return ops
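
Each of these examples calls an average_gradients helper that is not shown on this page. The following is a minimal sketch of the usual multi-GPU gradient-averaging pattern; it is an assumption about what that helper does, not the source's implementation.

import tensorflow as tf

def average_gradients(tower_grads):
    # tower_grads: one list of (gradient, variable) pairs per tower, all
    # referring to the same shared variables.
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        if not grads:
            continue
        # average the per-tower gradients for this variable
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        # every tower shares the variable, so take it from the first entry
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads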
Example #2
def train_ops(points, covars, grids, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init,
                                        global_step,
                                        decay_steps,
                                        FLAGS.decay_rate,
                                        staircase=True)

        decay_steps = epoch_batch_num * 15
        reverse_color_ratio = tf.train.exponential_decay(0.99,
                                                         global_step,
                                                         decay_steps,
                                                         0.9,
                                                         staircase=True)

        color_ratio = tf.constant(1.0, tf.float32) - reverse_color_ratio

        tf.summary.scalar('learning rate', lr)
        tf.summary.scalar('color_ratio', color_ratio)

        opt = tf.train.AdamOptimizer(lr)

        tower_points = tf.split(points, FLAGS.num_gpus)
        tower_covars = tf.split(covars, FLAGS.num_gpus)

        reuse = False
        tower_grads = []
        tower_losses = []
        tower_gen_pts = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    loss, gen_pts = tower_loss(tower_points[i],
                                               tower_covars[i], grids,
                                               color_ratio, reuse, 3)
                    # print tf.trainable_variables()
                    grad = opt.compute_gradients(loss,
                                                 tf.trainable_variables())
                    tower_grads.append(grad)
                    tower_losses.append(loss)
                    tower_gen_pts.append(gen_pts)

                    reuse = True

        avg_grad = average_gradients(tower_grads)
        update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.control_dependencies(update_op):
            apply_grad_op = opt.apply_gradients(avg_grad,
                                                global_step=global_step)

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        total_loss_op = tf.add_n(tower_losses)

        gen_pts_op = tf.concat(tower_gen_pts, axis=0)

        ops['total_loss'] = total_loss_op
        ops['apply_grad'] = apply_grad_op
        ops['gen_pts'] = gen_pts_op
        ops['summary'] = summary_op
        ops['global_step'] = global_step

    return ops
Example #3
def train_ops(feats, labels, is_training, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init, global_step,
                                        decay_steps, FLAGS.decay_rate, True)
        tf.summary.scalar('learning rate', lr)

        opt = tf.train.AdamOptimizer(lr)

        tower_feats = tf.split(feats, FLAGS.num_gpus)
        tower_labels = tf.split(labels, FLAGS.num_gpus)

        reuse = False
        tower_grads = []
        tower_losses = []
        tower_logits = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    loss, logits = tower_loss(tower_feats[i], tower_labels[i],
                                              FLAGS.num_classes, is_training,
                                              reuse)
                    # print tf.trainable_variables()
                    grad = opt.compute_gradients(loss,
                                                 tf.trainable_variables())
                    tower_grads.append(grad)
                    tower_losses.append(loss)
                    tower_logits.append(logits)

                    batchnorm_updates = tf.get_collection(
                        tf.GraphKeys.UPDATE_OPS)
                    # print batchnorm_updates
                    reuse = True

        # TODO: will the batchnorm updates be copied to the other GPUs?
        avg_grad = average_gradients(tower_grads)
        with tf.control_dependencies(batchnorm_updates):
            apply_grad_op = opt.apply_gradients(avg_grad,
                                                global_step=global_step)

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        preds = tf.argmax(tf.concat(tower_logits, axis=0), axis=1)
        correct_pred_op = tf.reduce_sum(
            tf.cast(tf.equal(preds, labels), tf.float32))
        total_loss_op = tf.add_n(tower_losses)

        ops['correct_num'] = correct_pred_op
        ops['total_loss'] = total_loss_op
        ops['apply_grad'] = apply_grad_op
        ops['summary'] = summary_op
        ops['global_step'] = global_step
        ops['preds'] = preds

    return ops
Example #4
def train_ops(points, labels, is_training, num_classes, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init,
                                        global_step,
                                        decay_steps,
                                        FLAGS.decay_rate,
                                        staircase=True)
        lr = tf.maximum(FLAGS.lr_clip, lr)
        tf.summary.scalar('learning rate', lr)

        opt = tf.train.AdamOptimizer(lr)

        reuse = False
        tower_grads = []
        tower_losses = []
        tower_logits = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    # print points[i],labels[i]
                    loss, logits = tower_loss(points[i], labels[i],
                                              is_training, num_classes, reuse)

                    tower_grads.append(opt.compute_gradients(loss))
                    tower_losses.append(loss)
                    tower_logits.append(tf.squeeze(logits, axis=0))
                    update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    reuse = True

        avg_grad = average_gradients(tower_grads)

        with tf.control_dependencies(update_op):
            apply_grad_op = tf.group(
                opt.apply_gradients(avg_grad, global_step=global_step))

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        total_loss_op = tf.add_n(tower_losses) / FLAGS.num_gpus

        logits_op = tf.concat(tower_logits, axis=0)
        preds_op = tf.argmax(logits_op, axis=1)

        flatten_labels = []
        for i in range(FLAGS.num_gpus):
            flatten_labels.append(
                tf.squeeze(labels[i],
                           axis=0,
                           name='squeeze_labels_{}'.format(i)))

        flatten_labels = tf.concat(flatten_labels, axis=0)
        correct_num_op = tf.reduce_sum(
            tf.cast(tf.equal(preds_op, flatten_labels), tf.float32))

        ops['total_loss'] = total_loss_op
        ops['apply_grad'] = apply_grad_op
        ops['logits'] = logits_op
        ops['preds'] = preds_op
        ops['correct_num'] = correct_num_op
        ops['summary'] = summary_op
        ops['global_step'] = global_step

    return ops
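
A minimal, assumed driver loop for the ops dictionary returned by train_ops is sketched below; the placeholder and iterator names (pls_points, pls_labels, pls_is_training, next_batch) are hypothetical and only illustrate how the returned ops are typically run.

import tensorflow as tf

config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('log', sess.graph)
    for batch_points, batch_labels in next_batch():  # hypothetical data source
        _, loss_v, correct_v, step_v, summary_str = sess.run(
            [ops['apply_grad'], ops['total_loss'], ops['correct_num'],
             ops['global_step'], ops['summary']],
            feed_dict={pls_points: batch_points,
                       pls_labels: batch_labels,
                       pls_is_training: True})
        if step_v % 100 == 0:
            writer.add_summary(summary_str, step_v)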
Example #5
def train(cfg, logger, model_name):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    model_dir = os.path.join(checkpoint_dir, model_name)
    print('[!!!] model name:{}'.format(model_dir))
    logger.info('[!!!] model name:{}'.format(model_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    ckpt_path = os.path.join(model_dir, model_name)
    best_model_ckpt_dir = os.path.join(model_dir, 'best_model')

    if not os.path.exists(best_model_ckpt_dir):
        os.makedirs(best_model_ckpt_dir)
    best_ckpt_path = os.path.join(best_model_ckpt_dir, 'best_model')

    trick_model_ckpt_dir = os.path.join(model_dir, 'trick_model')
    if not os.path.exists(trick_model_ckpt_dir):
        os.makedirs(trick_model_ckpt_dir)
    trick_ckpt_path = os.path.join(trick_model_ckpt_dir, 'trick_model')

    lr = cfg.lr

    with tf.device('/cpu:0'):
        g_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='g_step')

        assert cfg.opt in ['Adam', 'SGD'], '[!!!] wrong optimizer name'
        if cfg.opt == 'Adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                               name='optimizer')
        elif cfg.opt == 'SGD':
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr,
                                                          name='optimizer')
        else:
            print('[!!!] wrong optimizer name')

        small_batch = cfg.batch_size // gpu_nums
        iterator = data_iterator(args.dataset,
                                 'train',
                                 cfg,
                                 small_batch,
                                 tfrecord_root_dir=tfrecord_root_dir,
                                 logger=logger)
        # generator_tower_grads
        tower_grads = []

        mae_losses = []
        mse_losses = []
        losses = []

        for i in range(gpu_nums):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)) as scope:
                    images, labels = iterator.get_next()
                    # labels_resized = tf.image.resize_images(labels,
                    #                                         [img_rows // fac, img_cols // fac])

                    with tf.variable_scope('model', reuse=(i > 0)):
                        model_b = CSRNet(cfg, images, small_batch, 'b')
                        # [batch, h, w, c]

                    outputs = model_b.output
                    if i == 0:
                        # print model
                        model_b.full_model.summary()
                        model_b.full_model.summary(print_fn=logger.info)

                    loss = compute_euclidean_distance(outputs, labels)
                    losses.append(loss)
                    mae_loss = compute_mae_error(outputs, labels)
                    mae_losses.append(mae_loss)
                    mse_loss = compute_mse_error(outputs, labels)
                    mse_losses.append(mse_loss)

                    # reuse variables
                    tf.get_variable_scope().reuse_variables()

                    # add summaries
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)
                    summaries.append(
                        tf.summary.scalar(tensor=loss, name='loss'))
                    summaries.append(
                        tf.summary.scalar(tensor=mae_loss, name='mae_loss'))
                    summaries.append(
                        tf.summary.scalar(tensor=mse_loss, name='mse_loss'))
                    summaries.append(
                        tf.summary.image(tensor=images, name='images'))
                    summaries.append(
                        tf.summary.image(tensor=tf.map_fn(
                            lambda img: colorize(img, cmap='jet'), labels),
                                         name='label'))
                    summaries.append(
                        tf.summary.image(tensor=tf.map_fn(
                            lambda img: colorize(img, cmap='jet'),
                            tf.image.resize_images(outputs, [224, 224])),
                                         name='outputs'))

                    if fine_tuned == 1:
                        vars = [
                            var for var in tf.trainable_variables()
                            if "dil" in var.name
                        ]
                    else:
                        train_vars = tf.trainable_variables()
                        vars = [
                            var for var in train_vars if "model" in var.name
                        ]

                        # equal: vars2 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                        #                           scope='tower_{}/model'.format(i))

                    grads = optimizer.compute_gradients(loss, var_list=vars)
                    tower_grads.append(grads)

        # compute all the losses
        average_loss = average_losses(losses)
        average_mae_loss = average_losses(mae_losses)
        average_mse_loss = average_losses(mse_losses)
        # compute the averaged gradients on the CPU
        grads = average_gradients(tower_grads)

        # update
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            # calculate gradients
            train_op = optimizer.apply_gradients(grads, global_step=g_step)

        # add histogram summaries for the model variables and gradients
        for var in vars:
            summaries.append(tf.summary.histogram('Model/' + var.op.name, var))

        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram('Model/' + var.op.name + '/gradients',
                                         grad))

        # create a saver
        saver = tf.train.Saver(max_to_keep=1)
        # saver_for_best = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=3)
        saver_for_best = tf.train.Saver(max_to_keep=3)
        if use_trick:
            saver_for_trick = tf.train.Saver(var_list=tf.global_variables(),
                                             max_to_keep=1)

        # build summary
        summary_op = tf.summary.merge(summaries)

    # start training session
    # "allow_soft_placement" must be set to True to build towers on GPU,
    # as some of the ops do not have GPU implementations.
    # "log_device_placement" set to True will print device place
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True

    if for_remote != 1:
        # fraction of overall amount of memory that each GPU should be allocated
        config.gpu_options.per_process_gpu_memory_fraction = 0.8

    tf_sess = tf.Session(config=config)
    K_B.set_session(tf_sess)
    with K_B.get_session() as sess:
        # summaries
        summary_path = os.path.join(summary_dir, model_name)
        summary_writer = tf.summary.FileWriter(summary_path, graph=sess.graph)

        init_or_restore(sess, saver, ckpt_path, logger)

        volumes_per_step = small_batch * gpu_nums

        step = -1
        min_train_loss = np.inf
        min_train_loss_step = 0
        patient = 0
        if type(cfg.epochs) is int:
            for epoch in range(cfg.epochs):
                print("EPOCH: {}".format(epoch + 1))
                logger.info("EPOCH: {}".format(epoch + 1))
                sess.run(iterator.initializer)
                while True:
                    try:
                        step += 1
                        # the validation branch of tf.cond also calls get_next;
                        # avoid exhausting the iterator there first, which would
                        # break out of this loop
                        _, average_loss_v, average_mae_loss_v, average_mse_loss_v, g_step_v, \
                        summary_str = sess.run(
                            [train_op, average_loss, average_mae_loss, average_mse_loss, g_step,
                             summary_op])

                        assert (not np.isnan(average_loss_v)), 'Model diverged with ' \
                                                                 'loss = NaN'

                        # duration = time.time() - start_time
                        # batch_per_sec = volumes_per_step / duration

                        if step % 10 == 0:
                            print("----- step:{} train loss:{:04f}".format(
                                step, average_loss_v))
                            print("----- step:{} train mae_loss:{:04f}".format(
                                step, average_mae_loss_v))
                            print("----- step:{} train mse_loss:{:04f}".format(
                                step, average_mse_loss_v))
                            logger.info(
                                "----- step:{} train loss:{:04f}".format(
                                    step, average_loss_v))
                            logger.info(
                                "----- step:{} train mae_loss:{:04f}".format(
                                    step, average_mae_loss_v))
                            logger.info(
                                "----- step:{} train mse_loss:{:04f}".format(
                                    step, average_mse_loss_v))
                        if step % 100 == 0:
                            summary_writer.add_summary(summary_str, step)
                        if step % 1000 == 0:
                            saver.save(sess, ckpt_path, global_step=step)
                    except tf.errors.OutOfRangeError:
                        print('train dataset finished')
                        logger.info('train dataset finished')
                        break
                    # catch all other errors
                    except BaseException as e:
                        print('[!!!]An exception occurred: {}'.format(e))

                if average_loss_v < min_train_loss:
                    min_train_loss = average_loss_v
                    min_train_loss_step = step
                    patient = 0
                    saver_for_best.save(sess, best_ckpt_path, global_step=step)
                    print(
                        "update best model, min_train_loss is [{:04f}] in step [{}]"
                        .format(average_loss_v, step))
                    logger.info(
                        "update best model, min_train_loss is [{:04f}] in step [{}]"
                        .format(average_loss_v, step))
                else:
                    patient += 1
                    if patient >= cfg.patient:
                        print(
                            '[!!!] Early stop for no reducing in train loss ' +
                            'after [{}] epochs, '.format(cfg.patient) +
                            'min_train_loss is [{:04f}]'.format(min_train_loss)
                            + ', in step [{}]'.format(min_train_loss_step))
                        print(
                            'mae_loss is [{:04f}]'.format(average_mae_loss_v))
                        print(
                            'mse_loss is [{:04f}]'.format(average_mse_loss_v))
                        logger.warning(
                            '[!!!] Early stop for no reducing in train loss ' +
                            'after [{}] epochs, '.format(cfg.patient) +
                            'min_train_loss is [{:04f}]'.format(min_train_loss)
                            + ', in step [{}]'.format(min_train_loss_step))
                        logger.warning(
                            'mae_loss is [{:04f}]'.format(average_mae_loss_v))
                        logger.warning(
                            'mse_loss is [{:04f}]'.format(average_mse_loss_v))
                        break

    print('[!!!] model name:{}'.format(model_name))
    logger.info('[!!!] model name:{}'.format(model_name))
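
Example #5 also relies on average_losses and init_or_restore, neither of which is shown on this page. The sketches below are assumptions about what such helpers commonly do, not the original code.

import os
import tensorflow as tf

def average_losses(losses):
    # mean of the per-tower scalar losses
    return tf.add_n(losses) / float(len(losses))

def init_or_restore(sess, saver, ckpt_path, logger):
    # restore the latest checkpoint if one exists, otherwise initialize
    ckpt = tf.train.latest_checkpoint(os.path.dirname(ckpt_path))
    if ckpt is not None:
        saver.restore(sess, ckpt)
        logger.info('restored from {}'.format(ckpt))
    else:
        sess.run(tf.global_variables_initializer())
        logger.info('initialized from scratch')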
Example #6
def train_ops(points, covars, labels, true_state, is_training, num_classes,
              voxel_num, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init,
                                        global_step,
                                        decay_steps,
                                        FLAGS.decay_rate,
                                        staircase=True)
        lr = tf.maximum(FLAGS.lr_clip, lr)
        tf.summary.scalar('learning rate', lr)

        opt = tf.train.AdamOptimizer(lr)

        with tf.name_scope('split_data'):
            tower_points = tf.split(points, FLAGS.num_gpus)
            tower_covars = tf.split(covars, FLAGS.num_gpus)
            tower_labels = tf.split(labels, FLAGS.num_gpus)
            tower_true_state = tf.split(true_state, FLAGS.num_gpus)

        reuse = False
        tower_grads = []
        tower_recon_losses = []
        tower_voxel_state = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    recon_loss, voxel_state = tower_loss(
                        tower_points[i], tower_covars[i], tower_labels[i],
                        tower_true_state[i], is_training, num_classes,
                        voxel_num, reuse)

                    grad = opt.compute_gradients(recon_loss)
                    tower_grads.append(grad)

                    tower_recon_losses.append(recon_loss)
                    tower_voxel_state.append(voxel_state)

                    reuse = True

        avg_grad = average_gradients(tower_grads)
        update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.control_dependencies(update_op):
            apply_grad_op = opt.apply_gradients(avg_grad,
                                                global_step=global_step)

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        total_recon_loss_op = tf.add_n(tower_recon_losses) / FLAGS.num_gpus

        voxel_state_op = tf.concat(tower_voxel_state, axis=0)

        ops['total_recon_loss'] = total_recon_loss_op
        ops['apply_grad'] = apply_grad_op
        ops['summary'] = summary_op
        ops['global_step'] = global_step
        ops['voxel_state'] = voxel_state_op

    return ops