Example #1
def main(_):
    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        with KittiLoader(object_dir=os.path.join(dataset_dir, 'training'), queue_size=50, require_shuffle=True,
                         is_testset=False, batch_size=args.single_batch_size * cfg.GPU_USE_COUNT, use_multi_process_num=8, multi_gpu_sum=cfg.GPU_USE_COUNT, aug=True) as train_loader, \
            KittiLoader(object_dir=os.path.join(dataset_dir, 'testing'), queue_size=50, require_shuffle=True,
                        is_testset=False, batch_size=args.single_batch_size * cfg.GPU_USE_COUNT, use_multi_process_num=8, multi_gpu_sum=cfg.GPU_USE_COUNT, aug=False) as valid_loader:

            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
                                        visible_device_list=cfg.GPU_AVAILABLE,
                                        allow_growth=True)
            config = tf.ConfigProto(
                gpu_options=gpu_options,
                device_count={
                    "GPU": cfg.GPU_USE_COUNT,
                },
                allow_soft_placement=True,
            )
            with tf.Session(config=config) as sess:
                model = RPN3D(
                    cls=cfg.DETECT_OBJ,
                    single_batch_size=args.single_batch_size,
                    learning_rate=args.lr,
                    max_gradient_norm=5.0,
                    is_train=True,
                    alpha=1.5,
                    beta=1,
                    avail_gpus=cfg.GPU_AVAILABLE.split(',')
                )
                # param init/restore
                if tf.train.get_checkpoint_state(save_model_dir):
                    print("Reading model parameters from %s" % save_model_dir)
                    model.saver.restore(
                        sess, tf.train.latest_checkpoint(save_model_dir))
                else:
                    print("Created model with fresh parameters.")
                    tf.global_variables_initializer().run()

                # train and validate
                iter_per_epoch = int(
                    len(train_loader) / (args.single_batch_size * cfg.GPU_USE_COUNT))
                is_summary, is_summary_image, is_validate = False, False, False

                summary_interval = 5
                summary_image_interval = 20
                save_model_interval = int(iter_per_epoch / 3)
                validate_interval = 60

                summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
                while model.epoch.eval() < args.max_epoch:
                    is_summary, is_summary_image, is_validate = False, False, False
                    iter = model.global_step.eval()
                    if not iter % summary_interval:
                        is_summary = True
                    if not iter % summary_image_interval:
                        is_summary_image = True
                    if not iter % save_model_interval:
                        model.saver.save(sess, os.path.join(
                            save_model_dir, 'checkpoint'), global_step=model.global_step)
                    if not iter % validate_interval:
                        is_validate = True
                    if not iter % iter_per_epoch:
                        sess.run(model.epoch_add_op)
                        print('train epoch {} of {}'.format(
                            model.epoch.eval(), args.max_epoch))

                    ret = model.train_step(
                        sess, train_loader.load(), train=True, summary=is_summary)
                    print('train: {}/{} @ epoch:{}/{} loss: {} reg_loss: {} cls_loss: {} {}'.format(
                        iter, iter_per_epoch * args.max_epoch, model.epoch.eval(),
                        args.max_epoch, ret[0], ret[1], ret[2], args.tag))

                    if is_summary:
                        summary_writer.add_summary(ret[-1], iter)

                    if is_summary_image:
                        ret = model.predict_step(
                                sess, valid_loader.load(), summary=True)
                        summary_writer.add_summary(ret[-1], iter)

                    if is_validate:
                        ret = model.validate_step(
                                sess, valid_loader.load(), summary=True)
                        summary_writer.add_summary(ret[-1], iter)

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess, os.path.join(
                            save_model_dir, 'checkpoint'), global_step=model.global_step)
                        print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                print('train done. total epoch:{} iter:{}'.format(
                    model.epoch.eval(), model.global_step.eval()))

                # finally save model
                model.saver.save(sess, os.path.join(
                    save_model_dir, 'checkpoint'), global_step=model.global_step)
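
All six examples poll a check_if_should_pause(args.tag) helper to decide when to checkpoint and stop, but none of them defines it. A minimal sketch, assuming the helper watches for a marker file named after the run tag (the file name and location are hypothetical):

import os

def check_if_should_pause(tag):
    # Hypothetical: an external process requests a pause by creating a
    # marker file named 'pause-<tag>' in the working directory.
    return os.path.exists('pause-{}'.format(tag))
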
Example #2
def main(_):
    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
        )
        with tf.Session(config=config) as sess:
            model = RPN3D(
                cls=cfg.DETECT_OBJ,
                single_batch_size=args.single_batch_size,
                learning_rate=args.lr,
                max_gradient_norm=5.0,
                alpha=args.alpha,
                beta=args.beta,
                avail_gpus=cfg.GPU_AVAILABLE  #.split(',')
            )
            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess,
                                    tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False

            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            # training
            for epoch in range(start_epoch, args.max_epoch):
                counter = 0
                batch_time = time.time()
                for batch in iterate_data(train_dir,
                                          shuffle=True,
                                          aug=True,
                                          is_testset=False,
                                          batch_size=args.single_batch_size *
                                          cfg.GPU_USE_COUNT,
                                          multi_gpu_sum=cfg.GPU_USE_COUNT):

                    counter += 1
                    global_counter += 1

                    if counter % summary_interval == 0:
                        is_summary = True
                    else:
                        is_summary = False

                    start_time = time.time()
                    ret = model.train_step(sess,
                                           batch,
                                           train=True,
                                           summary=is_summary)
                    forward_time = time.time() - start_time
                    batch_time = time.time() - batch_time

                    print(
                        'train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                        .format(counter, epoch, args.max_epoch, ret[0], ret[1],
                                ret[2], ret[3], ret[4], forward_time, batch_time))

                    with open('log/train.txt', 'a') as f:
                        f.write(
                            'train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f} \n'
                            .format(counter, epoch, args.max_epoch, ret[0],
                                    ret[1], ret[2], ret[3], ret[4],
                                    forward_time, batch_time))

                    if counter % summary_interval == 0:
                        print("summary_interval now")
                        summary_writer.add_summary(ret[-1], global_counter)

                    if counter % summary_val_interval == 0:
                        print("summary_val_interval now")
                        batch = sample_test_data(
                            val_dir,
                            args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT)

                        ret = model.validate_step(sess, batch, summary=True)
                        summary_writer.add_summary(ret[-1], global_counter)

                        try:
                            ret = model.predict_step(sess, batch, summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)
                        except Exception:
                            print("prediction skipped due to error")

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess,
                                         os.path.join(save_model_dir,
                                                      'checkpoint'),
                                         global_step=model.global_step)
                        print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                    batch_time = time.time()

                sess.run(model.epoch_add_op)

                model.saver.save(sess,
                                 os.path.join(save_model_dir, 'checkpoint'),
                                 global_step=model.global_step)

                # dump test data every 10 epochs
                if (epoch + 1) % 10 == 0:
                    # create output folder
                    os.makedirs(os.path.join(args.output_path, str(epoch)),
                                exist_ok=True)
                    os.makedirs(os.path.join(args.output_path, str(epoch),
                                             'data'),
                                exist_ok=True)
                    if args.vis:
                        os.makedirs(os.path.join(args.output_path, str(epoch),
                                                 'vis'),
                                    exist_ok=True)

                    for batch in iterate_data(
                            val_dir,
                            shuffle=False,
                            aug=False,
                            is_testset=False,
                            batch_size=args.single_batch_size *
                            cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):

                        if args.vis:
                            tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                sess, batch, summary=False, vis=True)
                        else:
                            tags, results = model.predict_step(sess,
                                                               batch,
                                                               summary=False,
                                                               vis=False)

                        for tag, result in zip(tags, results):
                            of_path = os.path.join(args.output_path,
                                                   str(epoch), 'data',
                                                   tag + '.txt')
                            with open(of_path, 'w+') as f:
                                labels = box3d_to_label([result[:, 1:8]],
                                                        [result[:, 0]],
                                                        [result[:, -1]],
                                                        coordinate='lidar')[0]
                                for line in labels:
                                    f.write(line)
                                print('write out {} objects to {}'.format(
                                    len(labels), tag))
                        # dump visualizations
                        if args.vis:
                            for tag, front_image, bird_view, heatmap in zip(
                                    tags, front_images, bird_views, heatmaps):
                                front_img_path = os.path.join(
                                    args.output_path, str(epoch), 'vis',
                                    tag + '_front.jpg')
                                bird_view_path = os.path.join(
                                    args.output_path, str(epoch), 'vis',
                                    tag + '_bv.jpg')
                                heatmap_path = os.path.join(
                                    args.output_path, str(epoch), 'vis',
                                    tag + '_heatmap.jpg')
                                cv2.imwrite(front_img_path, front_image)
                                cv2.imwrite(bird_view_path, bird_view)
                                cv2.imwrite(heatmap_path, heatmap)

                    # execute evaluation code
                    cmd_1 = "./kitti_eval/launch_test.sh"
                    cmd_2 = os.path.join(args.output_path, str(epoch))
                    cmd_3 = os.path.join(args.output_path, str(epoch), 'log')
                    os.system(" ".join([cmd_1, cmd_2, cmd_3]))

            print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally save model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
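
Examples #2 and #4 shell out to the KITTI evaluation script with os.system, whose return code is discarded. An equivalent launch via subprocess (an alternative shown for illustration, not what the originals use) surfaces a failing evaluation instead of ignoring it:

import os
import subprocess

result_dir = os.path.join(args.output_path, str(epoch))
# check=True raises CalledProcessError when launch_test.sh exits non-zero.
subprocess.run(['./kitti_eval/launch_test.sh',
                result_dir,
                os.path.join(result_dir, 'log')],
               check=True)
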
Example #3
def main(_):

    with tf.Graph().as_default():

        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)

        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
        )

        with tf.Session(config=config) as sess:
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          decrease=args.decrease,
                          minimize=args.minimize,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))

            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess,
                                    tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False

            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            # training
            for epoch in range(start_epoch, args.max_epoch):
                counter = 0
                batch_time = time.time()
                for batch in iterate_data(train_dir,
                                          shuffle=True,
                                          aug=True,
                                          is_testset=False,
                                          batch_size=args.single_batch_size *
                                          cfg.GPU_USE_COUNT,
                                          multi_gpu_sum=cfg.GPU_USE_COUNT):

                    counter += 1
                    global_counter += 1

                    if counter % summary_interval == 0:
                        is_summary = True
                    else:
                        is_summary = False

                    start_time = time.time()
                    ret = model.train_step(sess,
                                           batch,
                                           train=True,
                                           summary=is_summary)
                    forward_time = time.time() - start_time
                    batch_time = time.time() - batch_time

                    print(
                        'train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                        .format(counter, epoch + 1, args.max_epoch, ret[0],
                                ret[1], ret[2], ret[3], ret[4], forward_time,
                                batch_time))
                    with open(os.path.join('log', 'train.txt'), 'a') as f:
                        f.write(
                            'train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f} \n'
                            .format(counter, epoch + 1, args.max_epoch, ret[0],
                                    ret[1], ret[2], ret[3], ret[4],
                                    forward_time, batch_time))

                    if counter % summary_interval == 0:
                        print("summary_interval now")
                        summary_writer.add_summary(ret[-1], global_counter)

                    if counter % summary_val_interval == 0:
                        print("summary_val_interval now")
                        batch = sample_test_data(
                            val_dir,
                            args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT)

                        ret = model.validate_step(sess, batch, summary=True)
                        summary_writer.add_summary(ret[-1], global_counter)

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess,
                                         os.path.join(save_model_dir,
                                                      'checkpoint'),
                                         global_step=model.global_step)
                        print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                    batch_time = time.time()

                sess.run(model.epoch_add_op)

                model.saver.save(sess,
                                 os.path.join(save_model_dir, 'checkpoint'),
                                 global_step=model.global_step)

                # dump test data every 10 epochs
                if (epoch + 1) % 10 == 0:
                    os.makedirs(os.path.join(res_dir, str(epoch)),
                                exist_ok=True)
                    os.makedirs(os.path.join(res_dir, str(epoch), 'data'),
                                exist_ok=True)

                    for batch in iterate_data(
                            val_dir,
                            shuffle=False,
                            aug=False,
                            is_testset=False,
                            batch_size=args.single_batch_size *
                            cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):

                        tags, results = model.predict_step(sess,
                                                           batch,
                                                           summary=False,
                                                           vis=False)

                        for tag, result in zip(tags, results):
                            of_path = os.path.join(res_dir, str(epoch), 'data',
                                                   tag + '.txt')
                            with open(of_path, 'w+') as f:
                                labels = box3d_to_label([result[:, 1:8]],
                                                        [result[:, 0]],
                                                        [result[:, -1]],
                                                        coordinate='lidar')[0]
                                for line in labels:
                                    f.write(line)
                                print('write out {} objects to {}'.format(
                                    len(labels), tag))

            # finally save model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
Example #4
def main(_):
    global log_f
    timestr = time.strftime("%b-%d_%H-%M-%S", time.localtime())
    log_f = open('log/train_{}.txt'.format(timestr), 'w')
    log_print(str(cfg))
    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
        )
        with tf.Session(config=config) as sess:
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))
            # param init/restore
            if args.restore and tf.train.get_checkpoint_state(save_model_dir):
                log_print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess,
                                    tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                log_print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

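            # Optionally warm-start the feature extractor from a pretrained
            # (V)AE checkpoint: only the variables under the encoder's scope
            # are restored; the rest of the network keeps its fresh init.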
            if cfg.FEATURE_NET_TYPE == 'FeatureNet_AE' and cfg.FeatureNet_AE_WPATH:
                ae_checkpoint_file = tf.train.latest_checkpoint(
                    cfg.FeatureNet_AE_WPATH)
                log_print("Load Pretrained FeatureNet_AE weights %s" %
                          ae_checkpoint_file)
                ae_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='ae_encoder')
                ae_saver = tf.train.Saver(
                    var_list={v.op.name: v
                              for v in ae_vars})
                ae_saver.restore(sess, ae_checkpoint_file)
            if cfg.FEATURE_NET_TYPE == 'FeatureNet_VAE' and cfg.FeatureNet_VAE_WPATH:
                vae_checkpoint_file = tf.train.latest_checkpoint(
                    cfg.FeatureNet_VAE_WPATH)
                log_print("Load Pretrained FeatureNet_VAE weights %s" %
                          vae_checkpoint_file)
                vae_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope='vae_encoder')
                vae_saver = tf.train.Saver(
                    var_list={v.op.name: v
                              for v in vae_vars})
                vae_saver.restore(sess, vae_checkpoint_file)

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False

            summary_interval = 5
            summary_val_interval = 20
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            parameter_num = np.sum(
                [np.prod(v.shape.as_list()) for v in tf.trainable_variables()])
            log_print('Parameter number: {}'.format(parameter_num))

            # training
            for epoch in range(start_epoch, args.max_epoch):
                counter = 0
                batch_time = time.time()
                for batch in iterate_data(train_dir,
                                          db_sampler=sampler,
                                          shuffle=True,
                                          aug=AUG_DATA,
                                          is_testset=False,
                                          batch_size=args.single_batch_size *
                                          cfg.GPU_USE_COUNT,
                                          multi_gpu_sum=cfg.GPU_USE_COUNT):
                    counter += 1
                    global_counter += 1

                    if counter % summary_interval == 0:
                        is_summary = True
                    else:
                        is_summary = False

                    start_time = time.time()
                    ret = model.train_step(sess,
                                           batch,
                                           train=True,
                                           summary=is_summary)
                    forward_time = time.time() - start_time
                    batch_time = time.time() - batch_time

                    log_print(
                        'train: {} @ epoch:{}/{} loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                        .format(counter, epoch, args.max_epoch, ret[0], ret[1],
                                ret[2], ret[3], ret[4], forward_time,
                                batch_time),
                        write=is_summary)

                    if counter % summary_interval == 0:
                        log_print("summary_interval now")
                        summary_writer.add_summary(ret[-1], global_counter)

                    if counter % summary_val_interval == 0:
                        log_print("summary_val_interval now")
                        # Random sample single batch data
                        batch = sample_test_data(
                            val_dir,
                            args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT)

                        ret = model.validate_step(sess, batch, summary=True)
                        summary_writer.add_summary(ret[-1], global_counter)
                        log_print(
                            'validation: loss: {:.4f} reg_loss: {:.4f} cls_loss: {:.4f} '
                            .format(ret[0], ret[1], ret[2]))

                        with warnings.catch_warnings():
                            warnings.filterwarnings('error')
                            try:
                                ret = model.predict_step(sess,
                                                         batch,
                                                         summary=True)
                                summary_writer.add_summary(
                                    ret[-1], global_counter)
                            except Exception:
                                log_print('prediction skipped due to error',
                                          'red')

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess,
                                         os.path.join(save_model_dir, timestr),
                                         global_step=model.global_step)
                        log_print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                    batch_time = time.time()

                sess.run(model.epoch_add_op)

                model.saver.save(sess,
                                 os.path.join(save_model_dir, timestr),
                                 global_step=model.global_step)

                # dump test data every 10 epochs
                if (epoch + 1) % 10 == 0:
                    # create output folder
                    os.makedirs(os.path.join(args.output_path, str(epoch)),
                                exist_ok=True)
                    os.makedirs(os.path.join(args.output_path, str(epoch),
                                             'data'),
                                exist_ok=True)
                    if args.vis:
                        os.makedirs(os.path.join(args.output_path, str(epoch),
                                                 'vis'),
                                    exist_ok=True)

                    for batch in iterate_data(
                            val_dir,
                            shuffle=False,
                            aug=False,
                            is_testset=False,
                            batch_size=args.single_batch_size *
                            cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):
                        if args.vis:
                            tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                sess, batch, summary=False, vis=True)
                        else:
                            tags, results = model.predict_step(sess,
                                                               batch,
                                                               summary=False,
                                                               vis=False)

                        for tag, result in zip(tags, results):
                            of_path = os.path.join(args.output_path,
                                                   str(epoch), 'data',
                                                   tag + '.txt')
                            with open(of_path, 'w+') as f:
                                P, Tr, R = load_calib(
                                    os.path.join(cfg.CALIB_DIR, tag + '.txt'))
                                labels = box3d_to_label([result[:, 1:8]],
                                                        [result[:, 0]],
                                                        [result[:, -1]],
                                                        coordinate='lidar',
                                                        P2=P,
                                                        T_VELO_2_CAM=Tr,
                                                        R_RECT_0=R)[0]
                                for line in labels:
                                    f.write(line)
                                log_print('write out {} objects to {}'.format(
                                    len(labels), tag))
                        # dump visualizations
                        if args.vis:
                            for tag, front_image, bird_view, heatmap in zip(
                                    tags, front_images, bird_views, heatmaps):
                                front_img_path = os.path.join(
                                    args.output_path, str(epoch), 'vis',
                                    tag + '_front.jpg')
                                bird_view_path = os.path.join(
                                    args.output_path, str(epoch), 'vis',
                                    tag + '_bv.jpg')
                                heatmap_path = os.path.join(
                                    args.output_path, str(epoch), 'vis',
                                    tag + '_heatmap.jpg')
                                cv2.imwrite(front_img_path, front_image)
                                cv2.imwrite(bird_view_path, bird_view)
                                cv2.imwrite(heatmap_path, heatmap)

                    # execute evaluation code
                    cmd_1 = "./kitti_eval/launch_test.sh"
                    cmd_2 = os.path.join(args.output_path, str(epoch))
                    cmd_3 = os.path.join(args.output_path, str(epoch), 'log')
                    os.system(" ".join([cmd_1, cmd_2, cmd_3]))

            log_print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally save model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
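
Example #4 routes all console output through a log_print helper that mirrors messages into the global log_f file opened at the top of main; the helper itself is not shown. A minimal sketch consistent with the calls above (the color handling and the default argument values are assumptions):

def log_print(msg, color=None, write=True):
    # Hypothetical reconstruction: echo to the console (optionally in red)
    # and append the line to the global log file when write is True.
    if color == 'red':
        print('\033[91m{}\033[0m'.format(msg))
    else:
        print(msg)
    if write:
        log_f.write(msg + '\n')
        log_f.flush()
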
Example #5
def main(_):
    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
        )
        with tf.Session(config=config) as sess:
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          is_train=True,
                          alpha=args.alpha,
                          beta=args.beta,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))
            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess,
                                    tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False

            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

            # training
            for epoch in range(start_epoch, args.max_epoch):
                counter = 0
                for batch in iterate_data(train_dir,
                                          shuffle=True,
                                          aug=True,
                                          is_testset=False,
                                          batch_size=args.single_batch_size *
                                          cfg.GPU_USE_COUNT,
                                          multi_gpu_sum=cfg.GPU_USE_COUNT):

                    counter += 1
                    global_counter += 1

                    if counter % summary_interval == 0:
                        is_summary = True
                    else:
                        is_summary = False

                    start_time = time.time()
                    ret = model.train_step(sess,
                                           batch,
                                           train=True,
                                           summary=is_summary)
                    times = time.time() - start_time

                    print(
                        'train: {} @ epoch:{}/{} loss: {} reg_loss: {} cls_loss: {} time: {}'
                        .format(counter, epoch, args.max_epoch, ret[0], ret[1],
                                ret[2], times))
                    with open('log/train.txt', 'a') as f:
                        f.write(
                            'train: {} @ epoch:{}/{} loss: {} reg_loss: {} cls_loss: {} time: {} \n'
                            .format(counter, epoch, args.max_epoch, ret[0],
                                    ret[1], ret[2], times))

                    if counter % summary_interval == 0:
                        print("summary_interval now")
                        summary_writer.add_summary(ret[-1], global_counter)

                    if counter % summary_val_interval == 0:
                        print("summary_val_interval now")
                        batch = sample_test_data(
                            val_dir,
                            args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT)

                        ret = model.validate_step(sess, batch, summary=True)
                        summary_writer.add_summary(ret[-1], global_counter)

                        try:
                            ret = model.predict_step(sess, batch, summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)
                        except Exception:
                            print("prediction skipped due to error")

                    if check_if_should_pause(args.tag):
                        model.saver.save(sess,
                                         os.path.join(save_model_dir,
                                                      'checkpoint'),
                                         global_step=model.global_step)
                        print('pause and save model @ {} steps:{}'.format(
                            save_model_dir, model.global_step.eval()))
                        sys.exit(0)

                sess.run(model.epoch_add_op)

                model.saver.save(sess,
                                 os.path.join(save_model_dir, 'checkpoint'),
                                 global_step=model.global_step)

            print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally save model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
Example #6
def main(_):
	with tf.Graph().as_default():
		global save_model_dir
		with KittiLoader(object_dir=os.path.join(dataset_dir, 'object', 'training'), require_shuffle=True,
						 split_file=os.path.join(cfg.ROOT_DIR, 'DataSplits', 'train.txt'),
						 is_testset=False, batch_size=args.single_batch_size, aug=False, aug_num=0) as train_loader, \
			KittiLoader(object_dir=os.path.join(dataset_dir, 'object', 'training'), require_shuffle=False,
						split_file=os.path.join(cfg.ROOT_DIR, 'DataSplits', 'val.txt'),
						is_testset=False, batch_size=args.single_batch_size, aug=False, aug_num=0) as valid_loader:

			gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
												visible_device_list=cfg.GPU_AVAILABLE,
												allow_growth=True)
			config = tf.ConfigProto(gpu_options=gpu_options, device_count={"GPU": cfg.GPU_USE_COUNT}, allow_soft_placement=True)
			with tf.Session(config=config) as sess:
				premodelTime = time.time()
				model = VoxelNet(cls=args.cls, single_batch_size=args.single_batch_size,
								 learning_rate=args.learning_rate, max_gradient_norm=5.0,
								 is_train=True, alpha=1.5, beta=1, avail_gpus=cfg.GPU_AVAILABLE.split(','))
				postmodelTime = time.time()
				getTotalNumberOfParams(model)
				print("It took {} seconds to create model".format(postmodelTime - premodelTime))

				# Restore from checkpoint if it exists
				if tf.train.get_checkpoint_state(save_model_dir):
					print("Reading model parameters from ", save_model_dir)
					prereadTime = time.time()
					model.saver.restore(sess, tf.train.latest_checkpoint(save_model_dir))
					postreadTime = time.time()
					print("It took {} seconds to read parameters from file".format(postreadTime - prereadTime))
				else: # No checkpoint exists
					print("Initializing model parameters")
					preInitTime = time.time()
					tf.global_variables_initializer().run()
					postInitTime = time.time()
					print("It took {} seconds to freshly initialize model parameters".format(postInitTime - preInitTime))

				# Train and validate
				iter_per_epoch = int(len(train_loader) / (args.single_batch_size*cfg.GPU_USE_COUNT))
				is_summary, is_summary_image, is_validate = False, False, False
				
				summary_interval = 5
				summary_image_interval = 20
				save_model_interval = int(iter_per_epoch / 3)
				validate_interval = 60
				bestValLoss = 100000

				print('\n--------------------------------------------------------------')
				print('Training parameters')
				print('batch size={} with {} augmented members added per batch'.format(args.single_batch_size, args.num_aug_per_batch))
				print('\tnum members per pass {}'.format(args.single_batch_size+args.num_aug_per_batch))
				print('max epoch={}'.format(args.max_epoch))
				print('iter_per_epoch={}'.format(iter_per_epoch))
				print('current epoch={}'.format(model.epoch.eval()))
				print('summary_interval={}'.format(summary_interval))
				print('summary_image_interval={}'.format(summary_image_interval))
				print('save_model_interval={}'.format(save_model_interval))
				print('validate_interval={}'.format(validate_interval))

				summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
				startTraining = time.time()
				while model.epoch.eval() < args.max_epoch:
					is_summary, is_summary_image, is_validate = False, False, False
					iter = model.global_step.eval()
					print('iteration = {}'.format(iter))
					if not iter % summary_interval:
						is_summary = True
					if not iter % summary_image_interval:
						is_summary_image = True
					if not iter % save_model_interval:
						model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'), global_step=model.global_step)
					if not iter % validate_interval:
						is_validate = True
					if not iter % iter_per_epoch:
						sess.run(model.epoch_add_op)
						print('training epoch {} of {} total'.format(model.epoch.eval(), args.max_epoch))
					flag, data = train_loader.load(args.single_batch_size)
					if flag:
						train_loader.reset()
					ret = model.train_step(sess, data, train=True, summary=is_summary)
					print('train: {}/{} @ epoch:{}/{} loss: {} reg_loss: {} cls_loss: {} {}'.format(
						iter, iter_per_epoch * args.max_epoch, model.epoch.eval(),
						args.max_epoch, ret[0], ret[1], ret[2], args.tag))
					print('Time since training started {} secs'.format(time.time() - startTraining))

					if is_summary:
						print('\twriting summary')
						summary_writer.add_summary(ret[-1], iter)

					if is_summary_image:
						print('\tmaking images')
						flag, valdat = valid_loader.load(args.single_batch_size)
						if flag:
							valid_loader.reset()
						ret = model.predict_step(sess, valdat, summary=True)
						summary_writer.add_summary(ret[-1], iter)

					if is_validate:
						print('\trunning validate')
						losses = []
						for i in range(50):
							flag, valdat = valid_loader.load(args.single_batch_size)
							if flag:
								valid_loader.reset()
							ret = model.validate_step(sess, valdat, summary=True)
							losses.append(ret[0])
						ave_loss = np.average(np.array(losses))
						if ave_loss < bestValLoss:
							print('\tnew best average validation loss for 50 forward passes was {} now {} at iteration {}'.format(bestValLoss, ave_loss, iter))
							bestValLoss = ave_loss
							model.saver.save(sess, os.path.join(save_best_dir, 'checkpoint'), global_step=model.global_step)

					if check_if_should_pause(args.tag):
						print('\tsaving model')
						model.saver.save(sess, os.path.join(save_model_dir, 'checkpoint'), global_step=model.global_step)

				stopTraining = time.time()
				print('Training took a total of {} secs for {} total iterations'.format(stopTraining - startTraining, args.max_epoch*iter_per_epoch))
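
Example #6 calls a getTotalNumberOfParams(model) helper that is not shown; Example #4 computes the same quantity inline. A sketch along those lines (the exact signature and output format are assumptions):

import numpy as np
import tensorflow as tf

def getTotalNumberOfParams(model):
    # Sum the element counts of every trainable variable in the current
    # graph, mirroring the inline computation in Example #4.
    total = int(np.sum([np.prod(v.shape.as_list())
                        for v in tf.trainable_variables()]))
    print('Total number of trainable parameters: {}'.format(total))
    return total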