Example #1
def main():
    checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/LSTM_east/'
    idname1 = '20180924-191410'
    idname2 = '20180924-191410-5001'
    test_data_path = '/media/dragonx/DataLight/ICDAR2013/test/'
    save_path = '/media/dragonx/DataLight/ICDAR2013/test_results_lstm/'
    filename = '/media/dragonx/DataLight/ICDAR2013/test/Video_6_3_2.mp4'
    idx = 0  # initial frame number
    config = get_config(FLAGS)
    config.batch_size = 1
    config.num_layers = 3
    config.num_steps = 10
    #>>>>>>>>>>>>>>>>>>>>>>Sort test video>>>>>>>>>>>>>>>>>>>>>>>>>>>#
    video_set = []
    for root, dirs, files in os.walk(test_data_path):
        for file in files:
            if file.endswith('.mp4'):
                video_set.append(os.path.splitext(file)[0])
    index = range(0, 1)
    # parser for running outside
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--checkpoint-path', default=checkpoint_path)
    # args = parser.parse_args()
    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))

    logger.info('loading model')
    #>>>>>>>>>>>>>>>>>>>>>>> Loading Model >>>>>>>>>>>>>>>>>>>>>>>>>#
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32,
                                  shape=[None, None, None, 3],
                                  name='input_images')
    # global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

    # Global initializer for Variables in the model
    # log: May 3rd, we need to adapt the model input, with config
    # with tf.name_scope("Train"):
    #     # use placeholder to stand for input and targets
    #     initializer = tf.random_normal_initializer()
    #     x_train = tf.placeholder(tf.float32, shape=[None, config.num_steps, None, None, 3])
    #     m = ArrayModel(True, config, x_train, reuse_variables=None, initializer=initializer)
    with tf.name_scope("Val"):
        # use placeholder to stand for input and targets
        initializer = tf.random_normal_initializer()
        x_val = tf.placeholder(tf.float32,
                               shape=[None, config.num_steps, None, None, 3])
        model = ArrayModel(False,
                           config,
                           x_val,
                           reuse_variables=None,
                           initializer=initializer)
    var_total = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    print(var_total)
    #>>>>>>>>>>>>>>>>>>>>>>>> restore the model from weights>>>>>>>>#
    soft_placement = False
    # var_list1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='feature_fusion')
    # var_list2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='resnet_v1_50')
    # var_list3 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='multi_rnn_cell')
    # var_list4 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred_module')
    # var_list = var_list1 + var_list2 + var_list3 + var_list4
    # saver = tf.train.Saver({v.op.name: v for v in var_list})
    saver = tf.train.Saver()
    config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
    # with sv.managed_session(config=config_proto) as session:
    #     if FLAGS.restore:
    #         print('continue training from previous checkpoint')
    #         # ckpt = tf.train.latest_checkpoint(FLAGS.checkpoints_path)
    #         ckpt = checkpoint_path + idname1 + '/' + idname2
    #         sv.saver.restore(session, ckpt)
    model_path = checkpoint_path + idname1 + '/' + idname2
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    #>>>>>>>>>>>>>>>>>>>>>>Start evaluation>>>>>>>>>>>>>>>>>>>>>>>>>#
    P_test = []
    R_test = []
    f1_test = []
    for k in index:
        P_video = []
        R_video = []
        f1_video = []
        video_save = save_path + video_set[k] + idname1 + '_' + idname2 + '.avi'
        t_start = time.time()
        # sort up all the paths
        xml_solo_path = test_data_path + video_set[k]
        raw_video_path = test_data_path + video_set[k] + '.mp4'
        cap = cv2.VideoCapture(raw_video_path)
        frame_width = int(cap.get(3))
        frame_height = int(cap.get(4))
        cnt_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        out = cv2.VideoWriter(video_save,
                              cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                              (frame_width, frame_height))
        # 1. load both polys and tags; 2. generate geo maps(the format of polys and tags need to match)
        polys_array_list, tags_array_list, id_list_list, frame_num = load_annotations_solo(xml_solo_path, \
                    1, cnt_frame, frame_width, frame_height)
        #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>loop over frames in the time steps >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        for i in range(int(cnt_frame / config.num_steps)):
            data_seq = np.zeros((1, config.num_steps, 512, 512, 3),
                                dtype=np.float32)
            data_original = np.zeros(
                (1, config.num_steps, frame_height, frame_width, 3),
                dtype=np.float32)
            frames_ok = True
            for j in range(config.num_steps):
                ret, frame = cap.read()
                if not ret:
                    frames_ok = False
                    break
                # im_resized = cv2.resize(frame, (int(512), int(512)))
                im_resized = frame[0:512, 0:512, :]  # crop the top-left 512x512 patch
                data_original[0, j, :, :, :] = frame
                data_seq[0, j, :, :, :] = im_resized
            if not frames_ok:
                break
            #>>>>>>>>>>>>>>>>>>>>>>>>>Now it's time to run the model>>>>>>>>>>>>>>>>>>>>>>>>>>
            state = sess.run(model.initial_state)
            # tensors dict to run
            fetches = {
                "score_map": model.score_map_set,
                "geometry_map": model.geometry_set
            }
            feed_dict = {}
            feed_dict[model.input_data] = data_seq
            for layer, (c, h) in enumerate(model.initial_state):
                feed_dict[c] = state[layer].c
                feed_dict[h] = state[layer].h
            timer = collections.OrderedDict([('net', 0), ('restore', 0),
                                             ('nms', 0)])
            start = time.time()
            vals = sess.run(fetches, feed_dict=feed_dict)
            timer['net'] = time.time() - start
            #>>>>>>>>>>>>>>>>>>>>>>>>Now we can evaluate the results>>>>>>>>>>>>>>>>>>>
            for j in range(config.num_steps):
                rtparams = collections.OrderedDict()
                rtparams['start_time'] = datetime.datetime.now().isoformat()
                rtparams['image_size'] = '{}x{}'.format(
                    frame_width, frame_height)
                # im_resized, (ratio_h, ratio_w) = resize_image(img)
                ratio_h, ratio_w = 512 / frame_height, 512 / frame_width
                rtparams['working_size'] = '{}x{}'.format(512, 512)
                # results refinement via NMS
                score = vals["score_map"][j]
                geometry = vals["geometry_map"][j]
                boxes, timer = detect(score_map=score,
                                      geo_map=geometry,
                                      timer=timer)
                logger.info(
                    'net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                        timer['net'] * 1000, timer['restore'] * 1000,
                        timer['nms'] * 1000))
                if boxes is not None:
                    scores = boxes[:, 8].reshape(-1)
                    boxes = boxes[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h

                duration = time.time() - start
                timer['overall'] = duration
                logger.info('[timing] {}'.format(duration))
                text_lines = []
                if boxes is not None:
                    text_lines = []
                    for box, score in zip(boxes, scores):
                        box = sort_poly(box.astype(np.int32))
                        if np.linalg.norm(box[0] -
                                          box[1]) < 5 or np.linalg.norm(
                                              box[3] - box[0]) < 5:
                            continue
                        tl = collections.OrderedDict(
                            zip([
                                'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'
                            ], map(float, box.flatten())))
                        tl['score'] = float(score)
                        text_lines.append(tl)
                pred = {
                    'text_lines': text_lines,
                    'rtparams': rtparams,
                    'timing': timer,
                }
                text_polys, text_tags = polys_array_list[
                    i * config.num_steps + j], tags_array_list[i * config.num_steps + j]
                text_polys, text_tags = check_and_validate_polys(
                    text_polys, text_tags, (frame_height, frame_width))
                # out.write(new_img)
                #>>>>>>>>>>>>>>>>>>>>>>>>Evaluation>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
                targets = text_polys
                precision, recall, f1 = eval_single_frame(targets, pred)
                P_video.append(precision)
                R_video.append(recall)
                f1_video.append(f1)
                img = data_original[0, j, :, :, :].astype(np.uint8)
                new_img = draw_illu(img.copy(), pred)
                new_img1 = draw_illu_gt(new_img.copy(), targets, precision,
                                        recall, f1)
                out.write(new_img1)
                # using for pre-testing
                if j == 0 and FLAGS.vis:
                    fig1 = plt.figure(figsize=(20, 10))
                    fig1.add_subplot(1, 2, 1)
                    plt.imshow(new_img)
                    plt.title("Text Detection with fine-tuned EAST")
                    fig1.add_subplot(1, 2, 2)
                    plt.imshow(new_img1)
                    plt.title('Text Detection Results Comparison')
                    plt.show()
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
                # time.sleep(.100)
            # evaluation on ret and gt
        P_test.append(np.array(P_video, dtype=np.float32))
        R_test.append(np.array(R_video, dtype=np.float32))
        f1_test.append(np.array(f1_video, dtype=np.float32))
        print(P_video)
        print(R_video)
        print(f1_video)
        print("testing results are P:{}, R:{}, F1:{} on ".format(
            sum(P_video) / cnt_frame,
            sum(R_video) / cnt_frame,
            sum(f1_video) / cnt_frame) + video_set[k])
        cap.release()
        out.release()  # results refinement via NMS
        cv2.destroyAllWindows()
    print('here is the precision')
    for item in P_test:
        print(np.mean(item))
    print('here is the recall')
    for item in R_test:
        print(np.mean(item))
    print('here is the f-score')
    for item in f1_test:
        print(np.mean(item))
    print(video_set)
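
Note: the per-frame precision/recall/F1 above comes from eval_single_frame, which is not part of this listing. Below is a minimal, hypothetical sketch of how such a metric could be computed by greedy IoU matching of the predicted quadrilaterals in `pred['text_lines']` against the ground-truth polygons in `targets`, using shapely. The function names, the 0.5 threshold, and the matching strategy are assumptions, not the project's actual implementation.

# Hypothetical sketch (not the project's eval_single_frame): greedy IoU matching.
import numpy as np
from shapely.geometry import Polygon

def quad_iou(a, b):
    # IoU of two quadrilaterals given as (4, 2) arrays of corner coordinates
    pa, pb = Polygon(a), Polygon(b)
    if not pa.is_valid or not pb.is_valid:
        return 0.0
    union = pa.union(pb).area
    return pa.intersection(pb).area / union if union > 0 else 0.0

def prf1(targets, pred, iou_thresh=0.5):
    # targets: (N, 4, 2) ground-truth quads; pred: dict with 'text_lines' as built above
    det = [np.array([[t['x0'], t['y0']], [t['x1'], t['y1']],
                     [t['x2'], t['y2']], [t['x3'], t['y3']]])
           for t in pred['text_lines']]
    matched_gt = set()
    tp = 0
    for d in det:
        for gi, g in enumerate(targets):
            if gi not in matched_gt and quad_iou(d, g) >= iou_thresh:
                matched_gt.add(gi)
                tp += 1
                break
    precision = tp / len(det) if det else 0.0
    recall = tp / len(targets) if len(targets) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1
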
Example #2
def main():
    #>>>>>>>>>>>>>>>>>>>>>define data/model path>>>>>>>>>>>>>>>>>>>>>#
    #checkpoint_path = '/home/dragonx/Documents/VideoText2018/EAST-master/weights/east_icdar2015_resnet_v1_50_rbox/'
    #checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/LSTM_east/20180908-124306'
    checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/east/'
    idname1 = '20180921-135717'
    idname2 = 'model.ckpt-56092'
    test_data_path = '/media/dragonx/DataLight/ICDAR2015/test/'
    save_path = '/media/dragonx/DataLight/ICDAR2015/test_results/'
    video_set = []
    for root, dirs, files in os.walk(test_data_path):
        for file in files:
            if file.endswith('.mp4'):
                video_set.append(os.path.splitext(file)[0])
    index = range(0, len(video_set))
    if not os.path.exists(checkpoint_path):
        raise RuntimeError(
            'Checkpoint `{}` not found'.format(checkpoint_path))

    # read images until it is completed
    logger.info('loading model')
    #>>>>>>>>>>>>>>>>>>>>>>> Loading Model >>>>>>>>>>>>>>>>>>>>>>>>>#
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
    global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    f_score, f_geometry, _ = model.model(input_images, is_training=False)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    # restore the model from weights
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # model_path= tf.train.latest_checkpoint(checkpoint_path)
    # ckpt_state = tf.train.get_checkpoint_state(checkpoint_path)
    # model_path = os.path.join(checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    model_path = checkpoint_path + idname1 + '/' + idname2
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    # get infos for video writing
    # (assumption: open the first test video; `cap` and `index` are used in the loop below)
    cap = cv2.VideoCapture(test_data_path + video_set[0] + '.mp4')
    index = 0
    frame_width = int(cap.get(3))
    frame_height = int(cap.get(4))
    # Define the codec and create VideoWriter object.The output is stored in 'outpy.avi' file.
    video_save = '/media/dragonx/DataLight/ICDAR2015/test_results/EAST_' + idname1 + '_' + idname2 + '.avi'
    out = cv2.VideoWriter(video_save, cv2.VideoWriter_fourcc('M','J','P','G'), 10, (frame_width,frame_height))
    while(cap.isOpened()):
        ret, frame = cap.read()
        index = index+1
        if ret == True:
            cv2.imshow('Frame', frame)
            print('Processing %d frame with '%(index), frame.shape)
            ######### Use EAST text detector ###########
            start_time = time.time()
            img = frame
            rtparams = collections.OrderedDict()
            rtparams['start_time'] = datetime.datetime.now().isoformat()
            rtparams['image_size'] = '{}x{}'.format(img.shape[1], img.shape[0])
            timer = collections.OrderedDict([
                ('net', 0),
                ('restore', 0),
                ('nms', 0)
            ])

            im_resized, (ratio_h, ratio_w) = resize_image(img)
            rtparams['working_size'] = '{}x{}'.format(
                im_resized.shape[1], im_resized.shape[0])
            start = time.time()
            score, geometry = sess.run(
                [f_score, f_geometry],
                feed_dict={input_images: [im_resized[:,:,::-1]]})
            timer['net'] = time.time() - start

            boxes, timer = detect(score_map=score, geo_map=geometry, timer=timer)
            logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                timer['net']*1000, timer['restore']*1000, timer['nms']*1000))

            if boxes is not None:
                scores = boxes[:,8].reshape(-1)
                boxes = boxes[:, :8].reshape((-1, 4, 2))
                boxes[:, :, 0] /= ratio_w
                boxes[:, :, 1] /= ratio_h

            duration = time.time() - start_time
            timer['overall'] = duration
            logger.info('[timing] {}'.format(duration))

            text_lines = []
            if boxes is not None:
                text_lines = []
                for box, score in zip(boxes, scores):
                    box = sort_poly(box.astype(np.int32))
                    if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(box[3]-box[0]) < 5:
                        continue
                    tl = collections.OrderedDict(zip(
                        ['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                        map(float, box.flatten())))
                    tl['score'] = float(score)
                    text_lines.append(tl)
            ret = {
                'text_lines': text_lines,
                'rtparams': rtparams,
                'timing': timer,
            }
            new_img = draw_illu(img.copy(), ret)
            cv2.imshow('Annotated Frame with EAST', new_img)
            out.write(new_img)
            fig1 = plt.figure(figsize=(20, 10))
            fig1.add_subplot(1, 2, 1)
            plt.imshow((np.squeeze(score)*255).astype(np.uint8))
            plt.title("Score Map")
            fig1.add_subplot(1, 2, 2)
            plt.imshow(geometry[0, :,:,1])
            plt.title('Geometry map')
            plt.show()
            # Quit when Q is pressed
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
            time.sleep(.100)
        else:
            break


    cap.release()
    out.release()
    cv2.destroyAllWindows()
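
Note: Example #2 resizes frames with resize_image and rescales the detected boxes by the returned (ratio_h, ratio_w). A minimal sketch of such a helper is shown below, assuming the usual EAST convention of clamping the longer side and rounding both dimensions to multiples of 32; the helper actually used by this project may differ.

import cv2

def resize_image(im, max_side_len=2400):
    # Sketch of an EAST-style resize: limit the longer side, then round height and
    # width down to multiples of 32 so the fully-convolutional strides divide evenly.
    h, w = im.shape[:2]
    ratio = 1.0
    if max(h, w) > max_side_len:
        ratio = float(max_side_len) / max(h, w)
    resize_h = max(32, int(h * ratio / 32) * 32)
    resize_w = max(32, int(w * ratio / 32) * 32)
    im_resized = cv2.resize(im, (resize_w, resize_h))
    return im_resized, (resize_h / float(h), resize_w / float(w))
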
Example #3
def main():
    #>>>>>>>>>>>>>>>>>>>>>define data/model path>>>>>>>>>>>>>>>>>>>>>#
    #checkpoint_path = '/home/dragonx/Documents/VideoText2018/EAST-master/weights/east_icdar2015_resnet_v1_50_rbox/'
    #checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/LSTM_east/20180908-124306'
    checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/east/'
    idname1 = '20180921-173054'
    idname2 = 'model.ckpt-56092'
    test_data_path = '/media/dragonx/DataLight/ICDAR2013/test/'
    save_path = '/media/dragonx/DataLight/ICDAR2013/test_results1/'
    filename = '/media/dragonx/DataLight/ICDAR2013/test/Video_6_3_2.mp4'
    idx = 0  # initial frame number
    if platform.uname()[1] != 'dragonx-H97N-WIFI':
        print("Now it knows it's in a remote cluster")
        checkpoint_path = '/work/cascades/lxiaol9/ARC/EAST/checkpoints/east/'
        idname1 = '20180921-173054'
        idname2 = 'model.ckpt-56092'
        test_data_path = '/work/cascades/lxiaol9/ARC/EAST/data/ICDAR2013/test/'
        save_path = '/work/cascades/lxiaol9/ARC/EAST/data/ICDAR2013/test_results1/'
    #>>>>>>>>>>>>>>>>>>>>>>Sort test video>>>>>>>>>>>>>>>>>>>>>>>>>>>#
    video_set = []
    for root, dirs, files in os.walk(test_data_path):
        for file in files:
            if file.endswith('.mp4'):
                video_set.append(os.path.splitext(file)[0])
    index = range(1, 6)
    # parser for running outside
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--checkpoint-path', default=checkpoint_path)
    # args = parser.parse_args()
    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))

    logger.info('loading model')
    #>>>>>>>>>>>>>>>>>>>>>>> Loading Model >>>>>>>>>>>>>>>>>>>>>>>>>#
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32,
                                  shape=[None, None, None, 3],
                                  name='input_images')
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    f_score, f_geometry, _ = model.model(input_images, is_training=False)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    #>>>>>>>>>>>>>>>>>>>>>>>> restore the model from weights>>>>>>>>#
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    model_path = checkpoint_path + idname1 + '/' + idname2
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    #>>>>>>>>>>>>>>>>>>>>>> construct KF filter model here>>>>>>>>>>#
    # tracker = KalmanRBOXTracker()
    #>>>>>>>>>>>>>>>>>>>>>>Start evaluation>>>>>>>>>>>>>>>>>>>>>>>>>#
    P_test = []
    R_test = []
    f1_test = []
    for k in index:
        P_video = []
        R_video = []
        f1_video = []
        video_save = save_path + video_set[
            k] + idname1 + '_' + idname2 + '_tracking.avi'
        file_txt = save_path + video_set[k] + '.txt'
        file1 = open(file_txt, "w+")
        file1.close()
        t_start = time.time()
        # sort up all the paths
        xml_solo_path = test_data_path + video_set[k]
        raw_video_path = test_data_path + video_set[k] + '.mp4'
        cap = cv2.VideoCapture(raw_video_path)
        frame_width = int(cap.get(3))
        frame_height = int(cap.get(4))
        cnt_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        out = cv2.VideoWriter(video_save,
                              cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                              (frame_width, frame_height))
        # 1. load both polys and tags; 2. generate geo maps(the format of polys and tags need to match)
        # polys_array_list, tags_array_list, id_list_list, frame_num = load_annotations_solo(xml_solo_path, \
        #             1, cnt_frame, frame_width, frame_height)
        #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>loop over frames>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        # we will initialize a tracker object for every video
        # mot_tracker = motion_bayestrack()
        for m in range(cnt_frame):
            ret, frame = cap.read()
            # text_polys, text_tags = load_annoataion(txt_fn)
            # text_polys, text_tags = polys_array_list[m], tags_array_list[m]
            # text_polys, text_tags = check_and_validate_polys(text_polys, text_tags, (frame_height, frame_width))
            #     # im, text_polys, text_tags = crop_area(im, text_polys, text_tags, crop_background=False)
            # if text_polys.shape[0] == 0:
            #     continue
            if ret == True:
                # print('Processing %d frame with '%(m), frame.shape)
                start_time = time.time()
                img = frame
                rtparams = collections.OrderedDict()
                rtparams['start_time'] = datetime.datetime.now().isoformat()
                rtparams['image_size'] = '{}x{}'.format(
                    img.shape[1], img.shape[0])
                timer = collections.OrderedDict([('net', 0), ('restore', 0),
                                                 ('nms', 0)])
                # im_resized, (ratio_h, ratio_w) = resize_image(img)
                im_resized = cv2.resize(frame, (int(512), int(512)))
                ratio_h, ratio_w = 512 / frame_height, 512 / frame_width
                rtparams['working_size'] = '{}x{}'.format(
                    im_resized.shape[1], im_resized.shape[0])
                start = time.time()
                score, geometry = sess.run(
                    [f_score, f_geometry],
                    feed_dict={input_images: [im_resized[:, :, ::-1]]})
                timer['net'] = time.time() - start
                boxes, timer = detect(score_map=score,
                                      geo_map=geometry,
                                      timer=timer)
                logger.info(
                    'net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                        timer['net'] * 1000, timer['restore'] * 1000,
                        timer['nms'] * 1000))
                # we will store the text boxes here:

                if boxes is not None:
                    with open(file_txt, "a+") as f:
                        print("Writing frame {:d}".format(m))
                        for box in boxes:
                            f.write(
                                "{:d},".format(m) + '-1,' +
                                ','.join(["{:2.3f}".format(x)
                                          for x in box]) + ',-1,-1,-1\n')
                    scores = boxes[:, 8].reshape(-1)
                    boxes = boxes[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h
                #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Motion model for every box>>>>>>>>>>>>>>>#
                # predict search region,
                # Kalman Filter Updates
                # if(display):
                #     plt.ion()
                #     fig = plt.figure()
                #     ax1 = fig.add_subplot(111, aspect='equal')
                #     fn = 'mot_benchmark/%s/%s/img1/%06d.jpg'%(phase,seq,frame)
                #     im =io.imread(fn)
                #     ax1.imshow(im)
                #     plt.title(seq+' Tracked Targets')
                # start_time = time.time()
                # trackers = mot_tracker.update(boxes, scores)
                # cycle_time = time.time() - start_time
                # total_time += cycle_time

                # for d in trackers:
                #     print('%d,%d,%.2f,%.2f,%.2f,%.2f,1,-1,-1,-1'%(frame,d[4],d[0],d[1],d[2]-d[0],d[3]-d[1]),file=out_file)
                #     if(display):
                #         d = d.astype(np.int32)
                #         ax1.add_patch(patches.Rectangle((d[0],d[1]),d[2]-d[0],d[3]-d[1],fill=False,lw=3,ec=colours[d[4]%32,:]))
                #         ax1.set_adjustable('box-forced')
                #
                # if(display):
                #     fig.canvas.flush_events()
                #     plt.draw()
                #     ax1.cla()
                #>>>>>>>>>>>>>>>>>>>>>>>>> KF end>>>>>>>>>>>>>>>>>>>>>>>>>>>>#
                duration = time.time() - start_time
                timer['overall'] = duration
                logger.info('[timing] {}'.format(duration))

                text_lines = []
                if boxes is not None:
                    text_lines = []
                    for box, score in zip(boxes, scores):
                        box = sort_poly(box.astype(np.int32))
                        if np.linalg.norm(box[0] -
                                          box[1]) < 5 or np.linalg.norm(
                                              box[3] - box[0]) < 5:
                            continue
                        tl = collections.OrderedDict(
                            zip([
                                'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'
                            ], map(float, box.flatten())))
                        tl['score'] = float(score)
                        text_lines.append(tl)
                pred = {
                    'text_lines': text_lines,
                    'rtparams': rtparams,
                    'timing': timer,
                }
                new_img = draw_illu(img.copy(), pred)
                # out.write(new_img)
                #>>>>>>>>>>>>>>>>>>>>>>>>Evaluation>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
                # targets = text_polys
                # precision, recall, f1 = eval_single_frame(targets, pred)
                precision, recall, f1 = 0, 0, 0
                P_video.append(precision)
                R_video.append(recall)
                f1_video.append(f1)
                # new_img1 = draw_illu_gt(new_img.copy(), targets, precision, recall, f1)
                out.write(new_img)
                # if m == 0:
                #     fig1 = plt.figure(figsize=(20, 10))
                #     fig1.add_subplot(1, 2, 1)
                #     plt.imshow(new_img )
                #     plt.title("Text Detection with fine-tuned EAST")
                #     fig1.add_subplot(1, 2, 2)
                #     plt.imshow(new_img1)
                #     plt.title('Text Detection Results Comparison')
                #     plt.show()
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
                # time.sleep(.100)
            else:
                break
            # evaluation on ret and gt
        P_test.append(np.array(P_video, dtype=np.float32))
        R_test.append(np.array(R_video, dtype=np.float32))
        f1_test.append(np.array(f1_video, dtype=np.float32))
        print(P_video)
        print(R_video)
        print(f1_video)
        print("testing results are P:{}, R:{}, F1:{} on ".format(
            sum(P_video) / cnt_frame,
            sum(R_video) / cnt_frame,
            sum(f1_video) / cnt_frame) + video_set[k])
        cap.release()
        out.release()
        cv2.destroyAllWindows()
    print('here is the precision')
    for item in P_test:
        print(np.mean(item))
    print('here is the recall')
    for item in R_test:
        print(np.mean(item))
    print('here is the f-score')
    for item in f1_test:
        print(np.mean(item))
    print(video_set)
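
Note: the example above appends one line per detected box to <video>.txt in a MOT-style layout: frame id, a -1 placeholder, the nine raw box values (eight corner coordinates plus the detector score), and three trailing -1 fields. A small hedged sketch of reading that file back into per-frame arrays, assuming exactly the format written above:

import numpy as np

def read_detections(txt_path):
    # Parse lines of the form "frame,-1,x0,y0,...,x3,y3,score,-1,-1,-1".
    per_frame = {}
    with open(txt_path) as f:
        for line in f:
            parts = line.strip().split(',')
            if len(parts) < 11:
                continue
            frame_id = int(parts[0])
            values = np.array([float(v) for v in parts[2:11]])  # 8 coords + score
            quad = values[:8].reshape(4, 2)
            score = values[8]
            per_frame.setdefault(frame_id, []).append((quad, score))
    return per_frame
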
Example #4
def main():
    vis_flag = True
    #>>>>>>>>>>>>>>>>>>>>>>> all the path needed >>>>>>>>>>>>>>>>>>>>>#
    pth_namepool = '/home/lxiaol9/ARC/EASTRNN/data/GAP_process/'  # picked videos
    pth_gt_raw = '/home/lxiaol9/ARC/EASTRNN/data/ICDAR/train/'  # GT data
    pth_gt_rbox = '/home/lxiaol9/ARC/EASTRNN/checkpoints/LSTM/'  # RBOX arrays
    pth_save_avi = '/home/lxiaol9/ARC/EASTRNN/checkpoints/LSTM/RBOX/'  # path for results storage
    #==================================================================#
    if platform.uname()[1] == 'dragonx-H97N-WIFI':
        print("Now code running in local machine")
        #>>>>>>>>>>>>>>>>>>>>>>> add paths here >>>>>>>>>>>>>>>>>>>>>#
        pth_namepool = '/media/dragonx/DataStorage/ARC/EASTRNN/data/GAP_process/'  # picked video
        pth_gt_raw = '/media/dragonx/DataStorage/temporary/Video_text/ICDAR/train/'  # GT data
        pth_gt_rbox = '/media/dragonx/DataStorage/ARC/EASTRNN/checkpoints/LSTM/'  # RBOX Array
        pth_save_avi = '/media/dragonx/DataStorage/ARC/EASTRNN/checkpoints/LSTM/RBOX/'  # path for results storage
        #============================================================#
    items = os.listdir(pth_namepool)
    newlist = []
    for names in items:
        if names.endswith(".avi"):
            newlist.append(os.path.splitext(names)[0])
    # video names in the selected pool
    print(newlist)
    #>>>>>>>>>>>>>>>>>>>>>>>>>choose the Video No. here >>>>>>>>>>>>>>>#
    k = 1
    #==================================================================#
    sample = newlist[k]
    filename = pth_gt_raw + sample + '.mp4'
    XML_filepath = pth_gt_rbox + sample + '_GT.xml'
    if not os.path.exists(filename):
        raise RuntimeError('Video `{}` not found'.format(filename))
    if not os.path.exists(pth_save_avi):
        os.makedirs(pth_save_avi)
    # read video and get resized frames later
    cap = cv2.VideoCapture(filename)
    index = 0
    logger.info('########### Now loading the array data #############')
    # logger.info('loading model')
    # gpu_options = tf.GPUOptions(allow_growth=True)
    # input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
    # global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    # f_score, f_geometry, v_feature = model.model(input_images, is_training=False)
    #
    # variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    # saver = tf.train.Saver(variable_averages.variables_to_restore())
    # # restore the model from weights
    # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # ckpt_state = tf.train.get_checkpoint_state(checkpoint_path)
    # model_path = os.path.join(checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    # logger.info('Restore from {}'.format(model_path))
    # saver.restore(sess, model_path)
    # get infos for video written
    # load the precomputed maps (assumes score.npy holds the (N, 512, 512) score maps
    # and geo.npy the (N, 512, 512, 5) geometry maps, matching the indexing below)
    if k == 1:
        score_maps = np.load(pth_gt_rbox + 'score.npy')
        geo_maps = np.load(pth_gt_rbox + 'geo.npy')
    else:
        score_maps = np.load(pth_gt_rbox + 'score' + str(k-1) + '.npy')
        geo_maps = np.load(pth_gt_rbox + 'geo' + str(k-1) + '.npy')
    # frame_width =  int(cap.get(3))
    # frame_height = int(cap.get(4))
    # Define the codec and create VideoWriter object.The output is stored in 'outpy.avi' file.
    out = cv2.VideoWriter(pth_save_avi+sample+'.avi', cv2.VideoWriter_fourcc('M','J','P','G'), 10, (512,512))
    index = 0
    while(cap.isOpened()):
        ret, frame = cap.read()
        index = index+1
        if ret == True:
            # prepare data used for one frame
            frame_score = np.zeros((1, 512, 512, 1), dtype=np.float32)
            frame_geo = np.zeros((1, 512, 512, 5))
            frame_score[0, :, :, 0] = score_maps[index-1, :, :]
            frame_geo[0,:,:,:] = geo_maps[index-1, :, :, :]
            frame_rsz = cv2.resize(frame, (512, 512))
            if vis_flag is True:
                cv2.imshow('Frame', frame_rsz)
                # cv2.imshow('Score map', score_maps[index-1, :, :])
            print('Processing %d frame with '%(index), frame.shape)
            # for i in range(512):
            #     for j in range(512):
            #         if geo_maps[index - 1, i, j, 1] != 0:
            #             print(geo_maps[index - 1, i, j, :])
            ######### Use EAST text detector ###########
            start_time = time.time()
            img = frame_rsz
            rtparams = collections.OrderedDict()
            rtparams['start_time'] = datetime.datetime.now().isoformat()
            rtparams['image_size'] = '{}x{}'.format(img.shape[1], img.shape[0])
            timer = collections.OrderedDict([
                ('net', 0),
                ('restore', 0),
                ('nms', 0)
            ])
            print('score shape {:s}, geometry shape {:s}'.format(str(frame_score.shape), str(frame_geo.shape)))
            boxes, timer = detect(score_map=frame_score, geo_map=frame_geo, timer=timer)
            logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                timer['net']*1000, timer['restore']*1000, timer['nms']*1000))

            if boxes is not None:
                scores = boxes[:, 8].reshape(-1)
                boxes = boxes[:, :8].reshape((-1, 4, 2))
                # maps are already at the 512x512 working resolution, so no rescaling is needed
                boxes[:, :, 0] /= 1
                boxes[:, :, 1] /= 1

            duration = time.time() - start_time
            timer['overall'] = duration
            logger.info('[timing] {}'.format(duration))

            text_lines = []
            if boxes is not None:
                text_lines = []
                for box, score in zip(boxes, scores):
                    box = sort_poly(box.astype(np.int32))
                    if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(box[3]-box[0]) < 5:
                        continue
                    tl = collections.OrderedDict(zip(
                        ['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                        map(float, box.flatten())))
                    print(tl)
                    tl['score'] = float(score)
                    text_lines.append(tl)
            ret = {
                'text_lines': text_lines,
                # 'rtparams': rtparams,
                # 'timing': timer,
                # 'geometry': geometry,
                # 'score':float(score),
            }
            # # 1. print boxes number
            # print('%d Boxs found'%(len(text_lines)))
            # # 2. eval_single_frame(target, box)
            # p, r, f1 = eval_single_frame(target, ret)
            # print('Precision %f, recall %f, F_measure %f' % (p, r, f1))
            # # 3. save files into directory
            # jsonfile = json.dumps(ret)
            # directory = save_path+sample
            # if not os.path.exists(directory):
            #     os.makedirs(directory+'/json/')
            #     os.makedirs(directory + '/npy/')
            #     os.makedirs(directory + '/score/')
            #
            # jsonfname = directory+'/json/frame'+format(index, '03d')+'.json'
            # npyname   = directory+'/npy/frame'+format(index, '03d')+'.npy'
            # scorename = directory + '/score/frame' + format(index, '03d') + '.npy'
            # np.save(npyname, feature)
            # np.save(scorename, score_m)
            # f = open(jsonfname,"w")
            # f.write(jsonfile)
            # f.close()
            # visualization
            new_img = draw_illu(img.copy(), ret)
            if vis_flag is True:
                cv2.imshow('Images with BBOX', new_img)
            #new_img1 = draw_illu_gt(new_img.copy(), target)
            #cv2.imshow('Annotated Frame with EAST', new_img1)
            out.write(new_img)
            # Quit when Q is pressed
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
            time.sleep(0.02)
        else:
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()
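
Note: Example #4 replaces the network forward pass with precomputed arrays loaded from score*.npy / geo*.npy. From the way the arrays are indexed, the per-frame score maps are assumed to be stored as (num_frames, 512, 512) and the geometry maps as (num_frames, 512, 512, 5). A hedged sketch of producing arrays with those shapes is shown below; the original export script is not part of this listing, and the paths and frame count are placeholders.

import numpy as np

# Hypothetical export matching the shapes Example #4 indexes into.
num_frames = 100  # placeholder for illustration
score_maps = np.zeros((num_frames, 512, 512), dtype=np.float32)
geo_maps = np.zeros((num_frames, 512, 512, 5), dtype=np.float32)
# ... fill score_maps[i] and geo_maps[i] per frame, e.g. from a per-frame EAST forward pass
np.save('/path/to/LSTM/score.npy', score_maps)
np.save('/path/to/LSTM/geo.npy', geo_maps)
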
Example #5
def main():
    #>>>>>>>>>>>>>>>>>>>>>define data/model path>>>>>>>>>>>>>>>>>>>>>#
    #checkpoint_path = '/home/dragonx/Documents/VideoText2018/EAST-master/weights/east_icdar2015_resnet_v1_50_rbox/'
    #checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/LSTM_east/20180908-124306'
    checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/east/'
    idname1 = '20180921-135717'
    idname2 = 'model.ckpt-56092'
    test_data_path = '/media/dragonx/DataLight/ICDAR2015/test/'
    save_path = '/media/dragonx/DataLight/ICDAR2015/test_results1/'
    filename = '/media/dragonx/DataLight/ICDAR2013/test/Video_6_3_2.mp4'
    idx = 0  # initial frame number
    #>>>>>>>>>>>>>>>>>>>>>>Sort test video>>>>>>>>>>>>>>>>>>>>>>>>>>>#
    video_set = []
    for root, dirs, files in os.walk(test_data_path):
        for file in files:
            if file.endswith('.mp4'):
                video_set.append(os.path.splitext(file)[0])
    index = range(0, len(video_set))
    # parser for running outside
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--checkpoint-path', default=checkpoint_path)
    # args = parser.parse_args()
    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))

    logger.info('loading model')
    #>>>>>>>>>>>>>>>>>>>>>>> Loading Model >>>>>>>>>>>>>>>>>>>>>>>>>#
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32,
                                  shape=[None, None, None, 3],
                                  name='input_images')
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    f_score, f_geometry, _ = model.model(input_images, is_training=False)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    #>>>>>>>>>>>>>>>>>>>>>>>> restore the model from weights>>>>>>>>#
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    model_path = checkpoint_path + idname1 + '/' + idname2
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    #>>>>>>>>>>>>>>>>>>>>>>Start evaluation>>>>>>>>>>>>>>>>>>>>>>>>>#
    P_test = []
    R_test = []
    f1_test = []
    for k in index:
        P_video = []
        R_video = []
        f1_video = []
        video_save = save_path + video_set[k] + idname1 + '_' + idname2 + '.avi'
        t_start = time.time()
        # sort up all the paths
        xml_solo_path = test_data_path + video_set[k]
        raw_video_path = test_data_path + video_set[k] + '.mp4'
        cap = cv2.VideoCapture(raw_video_path)
        frame_width = int(cap.get(3))
        frame_height = int(cap.get(4))
        cnt_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        out = cv2.VideoWriter(video_save,
                              cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                              (frame_width, frame_height))
        # 1. load both polys and tags; 2. generate geo maps(the format of polys and tags need to match)
        # polys_array_list, tags_array_list, id_list_list, frame_num = load_annotations_solo(xml_solo_path, \
        #             1, cnt_frame, frame_width, frame_height)
        #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>loop over frames>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        for m in range(cnt_frame):
            ret, frame = cap.read()
            # text_polys, text_tags = load_annoataion(txt_fn)
            # text_polys, text_tags = polys_array_list[m], tags_array_list[m]
            # text_polys, text_tags = check_and_validate_polys(text_polys, text_tags, (frame_height, frame_width))
            #     # im, text_polys, text_tags = crop_area(im, text_polys, text_tags, crop_background=False)
            # if text_polys.shape[0] == 0:
            #     continue
            if ret == True:
                # print('Processing %d frame with '%(m), frame.shape)
                start_time = time.time()
                img = frame
                rtparams = collections.OrderedDict()
                rtparams['start_time'] = datetime.datetime.now().isoformat()
                rtparams['image_size'] = '{}x{}'.format(
                    img.shape[1], img.shape[0])
                timer = collections.OrderedDict([('net', 0), ('restore', 0),
                                                 ('nms', 0)])
                # im_resized, (ratio_h, ratio_w) = resize_image(img)
                im_resized = cv2.resize(frame, (int(512), int(512)))
                ratio_h, ratio_w = 512 / frame_height, 512 / frame_width
                rtparams['working_size'] = '{}x{}'.format(
                    im_resized.shape[1], im_resized.shape[0])
                start = time.time()
                score, geometry = sess.run(
                    [f_score, f_geometry],
                    feed_dict={input_images: [im_resized[:, :, ::-1]]})
                timer['net'] = time.time() - start
                boxes, timer = detect(score_map=score,
                                      geo_map=geometry,
                                      timer=timer)
                logger.info(
                    'net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                        timer['net'] * 1000, timer['restore'] * 1000,
                        timer['nms'] * 1000))

                if boxes is not None:
                    scores = boxes[:, 8].reshape(-1)
                    boxes = boxes[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h

                duration = time.time() - start_time
                timer['overall'] = duration
                logger.info('[timing] {}'.format(duration))

                text_lines = []
                if boxes is not None:
                    text_lines = []
                    for box, score in zip(boxes, scores):
                        box = sort_poly(box.astype(np.int32))
                        if np.linalg.norm(box[0] -
                                          box[1]) < 5 or np.linalg.norm(
                                              box[3] - box[0]) < 5:
                            continue
                        tl = collections.OrderedDict(
                            zip([
                                'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'
                            ], map(float, box.flatten())))
                        tl['score'] = float(score)
                        text_lines.append(tl)
                pred = {
                    'text_lines': text_lines,
                    'rtparams': rtparams,
                    'timing': timer,
                }
                new_img = draw_illu(img.copy(), pred)
                # out.write(new_img)
                #>>>>>>>>>>>>>>>>>>>>>>>>Evaluation>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
                # targets = text_polys
                # precision, recall, f1 = eval_single_frame(targets, pred)
                precision, recall, f1 = 0, 0, 0
                P_video.append(precision)
                R_video.append(recall)
                f1_video.append(f1)
                # new_img1 = draw_illu_gt(new_img.copy(), targets, precision, recall, f1)
                out.write(new_img)
                # if m == 0:
                #     fig1 = plt.figure(figsize=(20, 10))
                #     fig1.add_subplot(1, 2, 1)
                #     plt.imshow(new_img )
                #     plt.title("Text Detection with fine-tuned EAST")
                #     fig1.add_subplot(1, 2, 2)
                #     plt.imshow(new_img1)
                #     plt.title('Text Detection Results Comparison')
                #     plt.show()
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
                # time.sleep(.100)
            else:
                break
            # evaluation on ret and gt
        P_test.append(np.array(P_video, dtype=np.float32))
        R_test.append(np.array(R_video, dtype=np.float32))
        f1_test.append(np.array(f1_video, dtype=np.float32))
        print(P_video)
        print(R_video)
        print(f1_video)
        print("testing results are P:{}, R:{}, F1:{} on ".format(
            sum(P_video) / cnt_frame,
            sum(R_video) / cnt_frame,
            sum(f1_video) / cnt_frame) + video_set[k])
        cap.release()
        out.release()
        cv2.destroyAllWindows()
    print('here is the precision')
    for item in P_test:
        print(np.mean(item))
    print('here is the recall')
    for item in R_test:
        print(np.mean(item))
    print('here is the f-score')
    for item in f1_test:
        print(np.mean(item))
    print(video_set)
Example #6
def main():
    checkpoint_path = '/home/dragonx/Documents/VideoText2018/EAST-master/weights/east_icdar2015_resnet_v1_50_rbox/'
    filename = '/media/dragonx/752d26ef-8f47-416d-b311-66c6dfabf4a3/Video Detection/ICDAR/train/Video_16_3_2.mp4'

    cap = cv2.VideoCapture(filename)
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint-path', default=checkpoint_path)
    args = parser.parse_args()
    checkpoint_path = args.checkpoint_path

    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))
    # read images until it is completed
    index = 0
    logger.info('loading model')

    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32,
                                  shape=[None, None, None, 3],
                                  name='input_images')
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)

    f_score, f_geometry = model.model(input_images, is_training=False)

    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    # restore the model from weights
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    ckpt_state = tf.train.get_checkpoint_state(checkpoint_path)
    model_path = os.path.join(
        checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    # get infos for video written
    frame_width = int(cap.get(3))
    frame_height = int(cap.get(4))
    # Define the codec and create VideoWriter object.The output is stored in 'outpy.avi' file.
    out = cv2.VideoWriter('EAST_testDemo1.avi',
                          cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                          (frame_width, frame_height))
    while (cap.isOpened()):
        ret, frame = cap.read()
        index = index + 1
        if ret == True:
            cv2.imshow('Frame', frame)
            print('Processing %d frame with ' % (index), frame.shape)
            ######### Use EAST text detector ###########
            start_time = time.time()
            img = frame
            rtparams = collections.OrderedDict()
            rtparams['start_time'] = datetime.datetime.now().isoformat()
            rtparams['image_size'] = '{}x{}'.format(img.shape[1], img.shape[0])
            timer = collections.OrderedDict([('net', 0), ('restore', 0),
                                             ('nms', 0)])

            im_resized, (ratio_h, ratio_w) = resize_image(img)
            rtparams['working_size'] = '{}x{}'.format(im_resized.shape[1],
                                                      im_resized.shape[0])
            start = time.time()
            score, geometry = sess.run(
                [f_score, f_geometry],
                feed_dict={input_images: [im_resized[:, :, ::-1]]})
            timer['net'] = time.time() - start

            boxes, timer = detect(score_map=score,
                                  geo_map=geometry,
                                  timer=timer)
            logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                timer['net'] * 1000, timer['restore'] * 1000,
                timer['nms'] * 1000))

            if boxes is not None:
                scores = boxes[:, 8].reshape(-1)
                boxes = boxes[:, :8].reshape((-1, 4, 2))
                boxes[:, :, 0] /= ratio_w
                boxes[:, :, 1] /= ratio_h

            duration = time.time() - start_time
            timer['overall'] = duration
            logger.info('[timing] {}'.format(duration))

            text_lines = []
            if boxes is not None:
                text_lines = []
                for box, score in zip(boxes, scores):
                    box = sort_poly(box.astype(np.int32))
                    if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(
                            box[3] - box[0]) < 5:
                        continue
                    tl = collections.OrderedDict(
                        zip(['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                            map(float, box.flatten())))
                    tl['score'] = float(score)
                    text_lines.append(tl)
            ret = {
                'text_lines': text_lines,
                'rtparams': rtparams,
                'timing': timer,
            }

            new_img = draw_illu(img.copy(), ret)
            cv2.imshow('Annotated Frame with EAST', new_img)
            out.write(new_img)
            # Quit when Q is pressed
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
            time.sleep(.100)
        else:
            break

    cap.release()
    out.release()
    cv2.destroyAllWindows()
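
Note: every example filters and reorders the detected quadrilaterals with sort_poly before building text_lines. A common implementation in EAST demo code picks the vertex with the smallest x+y sum as the starting corner and keeps the ordering along the longer edge; it is shown here as a sketch, assuming the helper used by this project behaves the same way.

import numpy as np

def sort_poly(p):
    # Rotate the (4, 2) quad so the vertex closest to the origin (min x+y) comes first,
    # then make sure the second vertex follows the more horizontal edge.
    min_axis = np.argmin(np.sum(p, axis=1))
    p = p[[min_axis, (min_axis + 1) % 4, (min_axis + 2) % 4, (min_axis + 3) % 4]]
    if abs(p[0, 0] - p[1, 0]) > abs(p[0, 1] - p[1, 1]):
        return p
    return p[[0, 3, 2, 1]]
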