# Shared imports for the scripts below; the project-local helpers (get_config,
# FLAGS, ArrayModel, model, detect, sort_poly, resize_image, draw_illu,
# draw_illu_gt, eval_single_frame, load_annotations_solo,
# check_and_validate_polys, logger) are assumed to come from the surrounding
# EAST/EASTRNN code base.
import argparse
import collections
import datetime
import os
import platform
import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


def main():
    checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/LSTM_east/'
    idname1 = '20180924-191410'
    idname2 = '20180924-191410-5001'
    test_data_path = '/media/dragonx/DataLight/ICDAR2013/test/'
    save_path = '/media/dragonx/DataLight/ICDAR2013/test_results_lstm/'
    filename = '/media/dragonx/DataLight/ICDAR2013/test/Video_6_3_2.mp4'
    idx = 0  # initial frame number
    config = get_config(FLAGS)
    config.batch_size = 1
    config.num_layers = 3
    config.num_steps = 10
    #>>>>>>>>>>>>>>>>>>>>>>Sort test videos>>>>>>>>>>>>>>>>>>>>>>>>>#
    video_set = []
    for root, dirs, files in os.walk(test_data_path):
        for file in files:
            if file.endswith('.mp4'):
                video_set.append(os.path.splitext(file)[0])
    index = range(0, 1)
    # parser for running outside
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--checkpoint-path', default=checkpoint_path)
    # args = parser.parse_args()
    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))
    logger.info('loading model')
    #>>>>>>>>>>>>>>>>>>>>>>> Loading Model >>>>>>>>>>>>>>>>>>>>>>>>>#
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3],
                                  name='input_images')
    # global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    # Global initializer for variables in the model.
    # log: May 3rd, we need to adapt the model input, with config
    # with tf.name_scope("Train"):
    #     # use placeholders to stand for input and targets
    #     initializer = tf.random_normal_initializer()
    #     x_train = tf.placeholder(tf.float32, shape=[None, config.num_steps, None, None, 3])
    #     m = ArrayModel(True, config, x_train, reuse_variables=None, initializer=initializer)
    with tf.name_scope("Val"):
        # use placeholders to stand for input and targets
        initializer = tf.random_normal_initializer()
        x_val = tf.placeholder(tf.float32,
                               shape=[None, config.num_steps, None, None, 3])
        model = ArrayModel(False, config, x_val, reuse_variables=None,
                           initializer=initializer)
    var_total = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    print(var_total)
    #>>>>>>>>>>>>>>>>>>>>>>>> restore the model from weights>>>>>>>>#
    soft_placement = False
    # var_list1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='feature_fusion')
    # var_list2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='resnet_v1_50')
    # var_list3 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='multi_rnn_cell')
    # var_list4 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred_module')
    # var_list = var_list1 + var_list2 + var_list3 + var_list4
    # saver = tf.train.Saver({v.op.name: v for v in var_list})
    saver = tf.train.Saver()
    config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
    # with sv.managed_session(config=config_proto) as session:
    #     if FLAGS.restore:
    #         print('continue training from previous checkpoint')
    #         # ckpt = tf.train.latest_checkpoint(FLAGS.checkpoints_path)
    #         ckpt = checkpoint_path + idname1 + '/' + idname2
    #         sv.saver.restore(session, ckpt)
    model_path = checkpoint_path + idname1 + '/' + idname2
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    #>>>>>>>>>>>>>>>>>>>>>>Start evaluation>>>>>>>>>>>>>>>>>>>>>>>>>#
    P_test = []
    R_test = []
    f1_test = []
    for k in index:
        P_video = []
        R_video = []
        f1_video = []
        video_save = save_path + video_set[k] + idname1 + '_' + idname2 + '.avi'
        t_start = time.time()
        # sort out all the paths
        xml_solo_path = test_data_path + video_set[k]
        raw_video_path = test_data_path + video_set[k] + '.mp4'
        cap = cv2.VideoCapture(raw_video_path)
        frame_width = int(cap.get(3))
        frame_height = int(cap.get(4))
        cnt_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        out = cv2.VideoWriter(video_save,
                              cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                              (frame_width, frame_height))
        # 1. load both polys and tags; 2. generate geo maps
        # (the format of polys and tags needs to match)
        polys_array_list, tags_array_list, id_list_list, frame_num = \
            load_annotations_solo(xml_solo_path, 1, cnt_frame,
                                  frame_width, frame_height)
        #>>>>>>>>>>>>>>>>>>>>>>loop over frames in the time steps>>>>>>>>>>>>>>>>>>>>>>#
        for i in range(int(cnt_frame / config.num_steps)):
            data_seq = np.zeros((1, config.num_steps, 512, 512, 3), dtype=np.float32)
            data_original = np.zeros((1, config.num_steps, frame_height, frame_width, 3),
                                     dtype=np.float32)
            for j in range(config.num_steps):
                ret, frame = cap.read()
                # im_resized = cv2.resize(frame, (512, 512))
                im_resized = frame[0:512, 0:512, :]
                data_original[0, j, :, :, :] = frame
                data_seq[0, j, :, :, :] = im_resized
            #>>>>>>>>>>>>>>>>>>>>>>Now it's time to run the model>>>>>>>>>>>>>>>>>>>>>>#
            state = sess.run(model.initial_state)
            # tensors dict to fetch
            fetches = {
                "score_map": model.score_map_set,
                "geometry_map": model.geometry_set,
            }
            feed_dict = {model.input_data: data_seq}
            # NOTE: the original used `i` as the enumerate variable here, which
            # shadowed the chunk index `i` and corrupted the ground-truth lookup
            # below; renamed to `layer`.
            for layer, (c, h) in enumerate(model.initial_state):
                feed_dict[c] = state[layer].c
                feed_dict[h] = state[layer].h
            timer = collections.OrderedDict([('net', 0), ('restore', 0), ('nms', 0)])
            start = time.time()
            vals = sess.run(fetches, feed_dict=feed_dict)
            timer['net'] = time.time() - start
            #>>>>>>>>>>>>>>>>>>>>>>Okay! We can evaluate the results now>>>>>>>>>>>>>>>#
            for j in range(config.num_steps):
                rtparams = collections.OrderedDict()
                rtparams['start_time'] = datetime.datetime.now().isoformat()
                rtparams['image_size'] = '{}x{}'.format(frame_width, frame_height)
                # im_resized, (ratio_h, ratio_w) = resize_image(img)
                ratio_h, ratio_w = 512 / frame_height, 512 / frame_width
                rtparams['working_size'] = '{}x{}'.format(512, 512)
                # results refinement via NMS
                score = vals["score_map"][j]
                geometry = vals["geometry_map"][j]
                boxes, timer = detect(score_map=score, geo_map=geometry, timer=timer)
                logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                    timer['net'] * 1000, timer['restore'] * 1000, timer['nms'] * 1000))
                if boxes is not None:
                    scores = boxes[:, 8].reshape(-1)
                    boxes = boxes[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h
                duration = time.time() - start
                timer['overall'] = duration
                logger.info('[timing] {}'.format(duration))
                text_lines = []
                if boxes is not None:
                    for box, score in zip(boxes, scores):
                        box = sort_poly(box.astype(np.int32))
                        if np.linalg.norm(box[0] - box[1]) < 5 or \
                                np.linalg.norm(box[3] - box[0]) < 5:
                            continue
                        tl = collections.OrderedDict(
                            zip(['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                                map(float, box.flatten())))
                        tl['score'] = float(score)
                        text_lines.append(tl)
                pred = {
                    'text_lines': text_lines,
                    'rtparams': rtparams,
                    'timing': timer,
                }
                frame_id = i * config.num_steps + j
                text_polys, text_tags = polys_array_list[frame_id], tags_array_list[frame_id]
                text_polys, text_tags = check_and_validate_polys(
                    text_polys, text_tags, (frame_height, frame_width))
                #>>>>>>>>>>>>>>>>>>>>>>Evaluation>>>>>>>>>>>>>>>>>>>>>>#
                targets = text_polys
                precision, recall, f1 = eval_single_frame(targets, pred)
                P_video.append(precision)
                R_video.append(recall)
                f1_video.append(f1)
                img = data_original[0, j, :, :, :]
                new_img = draw_illu(img.copy(), pred)
                new_img1 = draw_illu_gt(new_img.copy(), targets, precision, recall, f1)
                out.write(new_img1)
                # used for pre-testing
                if j == 0 and FLAGS.vis:
                    fig1 = plt.figure(figsize=(20, 10))
                    fig1.add_subplot(1, 2, 1)
                    plt.imshow(new_img)
                    plt.title("Text Detection with fine-tuned EAST")
                    fig1.add_subplot(1, 2, 2)
                    plt.imshow(new_img1)
                    plt.title('Text Detection Results Comparison')
                    plt.show()
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
                # time.sleep(.100)
        # evaluation on predictions and ground truth
        P_test.append(np.array(P_video, dtype=np.float32))
        R_test.append(np.array(R_video, dtype=np.float32))
        f1_test.append(np.array(f1_video, dtype=np.float32))
        print(P_video)
        print(R_video)
        print(f1_video)
        print("testing results are P:{}, R:{}, F1:{} on ".format(
            sum(P_video) / cnt_frame, sum(R_video) / cnt_frame,
            sum(f1_video) / cnt_frame) + video_set[k])
        cap.release()
        out.release()
        cv2.destroyAllWindows()
    print('here is the precision')
    for item in P_test:
        print(np.mean(item))
    print('here is the recall')
    for item in R_test:
        print(np.mean(item))
    print('here is the f-score')
    for item in f1_test:
        print(np.mean(item))
    print(video_set)
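# Sketch: the commented-out restore block in main() above selects variables by
# scope before building the Saver. A minimal standalone version of that
# pattern (the scope names are taken from those comments and assumed to exist
# in the graph):
def build_scoped_saver(scopes=('feature_fusion', 'resnet_v1_50',
                               'multi_rnn_cell', 'pred_module')):
    var_list = []
    for scope in scopes:
        var_list += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)
    # Map each variable's op name to the variable so checkpoint keys line up.
    return tf.train.Saver({v.op.name: v for v in var_list})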
def main():
    #>>>>>>>>>>>>>>>>>>>>>define data/model path>>>>>>>>>>>>>>>>>>>>>#
    # checkpoint_path = '/home/dragonx/Documents/VideoText2018/EAST-master/weights/east_icdar2015_resnet_v1_50_rbox/'
    # checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/LSTM_east/20180908-124306'
    checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/east/'
    idname1 = '20180921-135717'
    idname2 = 'model.ckpt-56092'
    test_data_path = '/media/dragonx/DataLight/ICDAR2015/test/'
    save_path = '/media/dragonx/DataLight/ICDAR2015/test_results/'
    video_set = []
    for root, dirs, files in os.walk(test_data_path):
        for file in files:
            if file.endswith('.mp4'):
                video_set.append(os.path.splitext(file)[0])
    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))
    # read frames until the video is completed
    logger.info('loading model')
    #>>>>>>>>>>>>>>>>>>>>>>> Loading Model >>>>>>>>>>>>>>>>>>>>>>>>>#
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3],
                                  name='input_images')
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    f_score, f_geometry, _ = model.model(input_images, is_training=False)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    # restore the model from weights
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # model_path = tf.train.latest_checkpoint(checkpoint_path)
    # ckpt_state = tf.train.get_checkpoint_state(checkpoint_path)
    # model_path = os.path.join(checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    # NOTE: the original referenced an undefined `idname`; it is composed from
    # idname1/idname2 here and below, as in the sibling scripts.
    model_path = checkpoint_path + idname1 + '/' + idname2
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    # NOTE: the original used `cap` without ever opening a capture; assuming
    # the first video of the test set here.
    cap = cv2.VideoCapture(test_data_path + video_set[0] + '.mp4')
    # get infos for video writing
    frame_width = int(cap.get(3))
    frame_height = int(cap.get(4))
    # Define the codec and create a VideoWriter object; the output is stored as an .avi file.
    video_save = save_path + 'EAST_' + idname1 + '_' + idname2 + '.avi'
    out = cv2.VideoWriter(video_save,
                          cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                          (frame_width, frame_height))
    index = 0  # frame counter (the original reused the video index range here)
    while cap.isOpened():
        ret, frame = cap.read()
        index = index + 1
        if ret:
            cv2.imshow('Frame', frame)
            print('Processing %d frame with ' % index, frame.shape)
            ######### Use EAST text detector ###########
            start_time = time.time()
            img = frame
            rtparams = collections.OrderedDict()
            rtparams['start_time'] = datetime.datetime.now().isoformat()
            rtparams['image_size'] = '{}x{}'.format(img.shape[1], img.shape[0])
            timer = collections.OrderedDict([('net', 0), ('restore', 0), ('nms', 0)])
            im_resized, (ratio_h, ratio_w) = resize_image(img)
            rtparams['working_size'] = '{}x{}'.format(im_resized.shape[1],
                                                      im_resized.shape[0])
            start = time.time()
            score, geometry = sess.run(
                [f_score, f_geometry],
                feed_dict={input_images: [im_resized[:, :, ::-1]]})
            timer['net'] = time.time() - start
            boxes, timer = detect(score_map=score, geo_map=geometry, timer=timer)
            logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                timer['net'] * 1000, timer['restore'] * 1000, timer['nms'] * 1000))
            if boxes is not None:
                scores = boxes[:, 8].reshape(-1)
                boxes = boxes[:, :8].reshape((-1, 4, 2))
                boxes[:, :, 0] /= ratio_w
                boxes[:, :, 1] /= ratio_h
            duration = time.time() - start_time
            timer['overall'] = duration
            logger.info('[timing] {}'.format(duration))
            text_lines = []
            if boxes is not None:
                for box, score in zip(boxes, scores):
                    box = sort_poly(box.astype(np.int32))
                    if np.linalg.norm(box[0] - box[1]) < 5 or \
                            np.linalg.norm(box[3] - box[0]) < 5:
                        continue
                    tl = collections.OrderedDict(
                        zip(['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                            map(float, box.flatten())))
                    tl['score'] = float(score)
                    text_lines.append(tl)
            ret = {
                'text_lines': text_lines,
                'rtparams': rtparams,
                'timing': timer,
            }
            new_img = draw_illu(img.copy(), ret)
            cv2.imshow('Annotated Frame with EAST', new_img)
            out.write(new_img)
            fig1 = plt.figure(figsize=(20, 10))
            fig1.add_subplot(1, 2, 1)
            plt.imshow((np.squeeze(score) * 255).astype(np.uint8))
            plt.title("Score Map")
            fig1.add_subplot(1, 2, 2)
            plt.imshow(geometry[0, :, :, 1])
            plt.title('Geometry map')
            plt.show()
            # Quit when Q is pressed
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
            time.sleep(0.1)
        else:
            break
    cap.release()
    out.release()
    cv2.destroyAllWindows()
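# `resize_image` is imported from the EAST code base and not defined in this
# excerpt. A minimal sketch of the usual behavior (rescale so both sides are
# multiples of 32 and return the (ratio_h, ratio_w) used to map boxes back);
# this is an assumption, not necessarily the project's exact helper:
def resize_image_sketch(im, max_side_len=2400):
    h, w = im.shape[:2]
    # shrink first if the longest side exceeds the limit
    ratio = float(max_side_len) / max(h, w) if max(h, w) > max_side_len else 1.0
    resize_h = max(32, int(h * ratio / 32) * 32)
    resize_w = max(32, int(w * ratio / 32) * 32)
    im_resized = cv2.resize(im, (resize_w, resize_h))
    return im_resized, (resize_h / float(h), resize_w / float(w))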
def main():
    #>>>>>>>>>>>>>>>>>>>>>define data/model path>>>>>>>>>>>>>>>>>>>>>#
    # checkpoint_path = '/home/dragonx/Documents/VideoText2018/EAST-master/weights/east_icdar2015_resnet_v1_50_rbox/'
    # checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/LSTM_east/20180908-124306'
    checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/east/'
    idname1 = '20180921-173054'
    idname2 = 'model.ckpt-56092'
    test_data_path = '/media/dragonx/DataLight/ICDAR2013/test/'
    save_path = '/media/dragonx/DataLight/ICDAR2013/test_results1/'
    filename = '/media/dragonx/DataLight/ICDAR2013/test/Video_6_3_2.mp4'
    idx = 0  # initial frame number
    if platform.uname()[1] != 'dragonx-H97N-WIFI':
        print("Now it knows it's in a remote cluster")
        checkpoint_path = '/work/cascades/lxiaol9/ARC/EAST/checkpoints/east/'
        idname1 = '20180921-173054'
        idname2 = 'model.ckpt-56092'
        test_data_path = '/work/cascades/lxiaol9/ARC/EAST/data/ICDAR2013/test/'
        save_path = '/work/cascades/lxiaol9/ARC/EAST/data/ICDAR2013/test_results1/'
    #>>>>>>>>>>>>>>>>>>>>>>Sort test videos>>>>>>>>>>>>>>>>>>>>>>>>>#
    video_set = []
    for root, dirs, files in os.walk(test_data_path):
        for file in files:
            if file.endswith('.mp4'):
                video_set.append(os.path.splitext(file)[0])
    index = range(1, 6)
    # parser for running outside
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--checkpoint-path', default=checkpoint_path)
    # args = parser.parse_args()
    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))
    logger.info('loading model')
    #>>>>>>>>>>>>>>>>>>>>>>> Loading Model >>>>>>>>>>>>>>>>>>>>>>>>>#
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3],
                                  name='input_images')
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    f_score, f_geometry, _ = model.model(input_images, is_training=False)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    #>>>>>>>>>>>>>>>>>>>>>>>> restore the model from weights>>>>>>>>#
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    model_path = checkpoint_path + idname1 + '/' + idname2
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    #>>>>>>>>>>>>>>>>>>>>>> construct KF filter model here>>>>>>>>>>#
    # tracker = KalmanRBOXTracker()
    #>>>>>>>>>>>>>>>>>>>>>>Start evaluation>>>>>>>>>>>>>>>>>>>>>>>>>#
    P_test = []
    R_test = []
    f1_test = []
    for k in index:
        P_video = []
        R_video = []
        f1_video = []
        video_save = save_path + video_set[k] + idname1 + '_' + idname2 + '_tracking.avi'
        file_txt = save_path + video_set[k] + '.txt'
        file1 = open(file_txt, "w+")  # truncate any previous detection file
        file1.close()
        t_start = time.time()
        # sort out all the paths
        xml_solo_path = test_data_path + video_set[k]
        raw_video_path = test_data_path + video_set[k] + '.mp4'
        cap = cv2.VideoCapture(raw_video_path)
        frame_width = int(cap.get(3))
        frame_height = int(cap.get(4))
        cnt_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        out = cv2.VideoWriter(video_save,
                              cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                              (frame_width, frame_height))
        # 1. load both polys and tags; 2. generate geo maps
        # (the format of polys and tags needs to match)
        # polys_array_list, tags_array_list, id_list_list, frame_num = load_annotations_solo(
        #     xml_solo_path, 1, cnt_frame, frame_width, frame_height)
        #>>>>>>>>>>>>>>>>>>>>>>loop over frames>>>>>>>>>>>>>>>>>>>>>>#
        # we would initialize a tracker object for every video
        # mot_tracker = motion_bayestrack()
        for m in range(cnt_frame):
            ret, frame = cap.read()
            # text_polys, text_tags = load_annoataion(txt_fn)
            # text_polys, text_tags = polys_array_list[m], tags_array_list[m]
            # text_polys, text_tags = check_and_validate_polys(text_polys, text_tags, (frame_height, frame_width))
            # im, text_polys, text_tags = crop_area(im, text_polys, text_tags, crop_background=False)
            # if text_polys.shape[0] == 0:
            #     continue
            if ret:
                # print('Processing %d frame with ' % m, frame.shape)
                start_time = time.time()
                img = frame
                rtparams = collections.OrderedDict()
                rtparams['start_time'] = datetime.datetime.now().isoformat()
                rtparams['image_size'] = '{}x{}'.format(img.shape[1], img.shape[0])
                timer = collections.OrderedDict([('net', 0), ('restore', 0), ('nms', 0)])
                # im_resized, (ratio_h, ratio_w) = resize_image(img)
                im_resized = cv2.resize(frame, (512, 512))
                ratio_h, ratio_w = 512 / frame_height, 512 / frame_width
                rtparams['working_size'] = '{}x{}'.format(im_resized.shape[1],
                                                          im_resized.shape[0])
                start = time.time()
                score, geometry = sess.run(
                    [f_score, f_geometry],
                    feed_dict={input_images: [im_resized[:, :, ::-1]]})
                timer['net'] = time.time() - start
                boxes, timer = detect(score_map=score, geo_map=geometry, timer=timer)
                logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                    timer['net'] * 1000, timer['restore'] * 1000, timer['nms'] * 1000))
                # store the text boxes here
                if boxes is not None:
                    with open(file_txt, "a+") as f:
                        print("Writing frame {:d}".format(m))
                        for box in boxes:
                            f.write("{:d},".format(m) + '-1,' +
                                    ','.join(["{:2.3f}".format(x) for x in box]) +
                                    ',-1,-1,-1\n')
                    scores = boxes[:, 8].reshape(-1)
                    boxes = boxes[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h
                #>>>>>>>>>>>>>>>>>>>>>>Motion model for every box>>>>>>>>>>>>>>>#
                # predict the search region, then Kalman Filter updates
                # if display:
                #     plt.ion()
                #     fig = plt.figure()
                #     ax1 = fig.add_subplot(111, aspect='equal')
                #     fn = 'mot_benchmark/%s/%s/img1/%06d.jpg' % (phase, seq, frame)
                #     im = io.imread(fn)
                #     ax1.imshow(im)
                #     plt.title(seq + ' Tracked Targets')
                # start_time = time.time()
                # trackers = mot_tracker.update(boxes, scores)
                # cycle_time = time.time() - start_time
                # total_time += cycle_time
                # for d in trackers:
                #     print('%d,%d,%.2f,%.2f,%.2f,%.2f,1,-1,-1,-1' % (frame, d[4], d[0], d[1], d[2]-d[0], d[3]-d[1]), file=out_file)
                #     if display:
                #         d = d.astype(np.int32)
                #         ax1.add_patch(patches.Rectangle((d[0], d[1]), d[2]-d[0], d[3]-d[1], fill=False, lw=3, ec=colours[d[4] % 32, :]))
                #         ax1.set_adjustable('box-forced')
                # if display:
                #     fig.canvas.flush_events()
                #     plt.draw()
                #     ax1.cla()
                #>>>>>>>>>>>>>>>>>>>>>>>>> KF end>>>>>>>>>>>>>>>>>>>>>>>>>>>>#
                duration = time.time() - start_time
                timer['overall'] = duration
                logger.info('[timing] {}'.format(duration))
                text_lines = []
                if boxes is not None:
                    for box, score in zip(boxes, scores):
                        box = sort_poly(box.astype(np.int32))
                        if np.linalg.norm(box[0] - box[1]) < 5 or \
                                np.linalg.norm(box[3] - box[0]) < 5:
                            continue
                        tl = collections.OrderedDict(
                            zip(['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                                map(float, box.flatten())))
                        tl['score'] = float(score)
                        text_lines.append(tl)
                pred = {
                    'text_lines': text_lines,
                    'rtparams': rtparams,
                    'timing': timer,
                }
                new_img = draw_illu(img.copy(), pred)
                #>>>>>>>>>>>>>>>>>>>>>>Evaluation>>>>>>>>>>>>>>>>>>>>>>#
                # targets = text_polys
                # precision, recall, f1 = eval_single_frame(targets, pred)
                precision, recall, f1 = 0, 0, 0  # ground truth disabled above
                P_video.append(precision)
                R_video.append(recall)
                f1_video.append(f1)
                # new_img1 = draw_illu_gt(new_img.copy(), targets, precision, recall, f1)
                out.write(new_img)
                # if m == 0:
                #     fig1 = plt.figure(figsize=(20, 10))
                #     fig1.add_subplot(1, 2, 1)
                #     plt.imshow(new_img)
                #     plt.title("Text Detection with fine-tuned EAST")
                #     fig1.add_subplot(1, 2, 2)
                #     plt.imshow(new_img1)
                #     plt.title('Text Detection Results Comparison')
                #     plt.show()
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
                # time.sleep(.100)
            else:
                break
        # evaluation on predictions and ground truth
        P_test.append(np.array(P_video, dtype=np.float32))
        R_test.append(np.array(R_video, dtype=np.float32))
        f1_test.append(np.array(f1_video, dtype=np.float32))
        print(P_video)
        print(R_video)
        print(f1_video)
        print("testing results are P:{}, R:{}, F1:{} on ".format(
            sum(P_video) / cnt_frame, sum(R_video) / cnt_frame,
            sum(f1_video) / cnt_frame) + video_set[k])
        cap.release()
        out.release()
        cv2.destroyAllWindows()
    print('here is the precision')
    for item in P_test:
        print(np.mean(item))
    print('here is the recall')
    for item in R_test:
        print(np.mean(item))
    print('here is the f-score')
    for item in f1_test:
        print(np.mean(item))
    print(video_set)
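# The loop above appends detections to `file_txt` in a MOT-challenge-like CSV
# layout: frame, -1, then the raw 9-value `boxes` row (8 polygon coordinates
# plus score, in the 512x512 working space), then -1, -1, -1. A minimal sketch
# of reading those rows back (a hypothetical helper, not part of the source):
def load_detections(path):
    per_frame = {}
    rows = np.loadtxt(path, delimiter=',', ndmin=2)
    for row in rows:
        frame_id = int(row[0])
        poly = row[2:10].reshape(4, 2)  # polygon corners, 512x512 space
        score = row[10]
        per_frame.setdefault(frame_id, []).append((poly, score))
    return per_frame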
def main():
    vis_flag = True
    #>>>>>>>>>>>>>>>>>>>>>>> all the paths needed >>>>>>>>>>>>>>>>>>>>>#
    pth_namepool = '/home/lxiaol9/ARC/EASTRNN/data/GAP_process/'       # picked videos
    pth_gt_raw = '/home/lxiaol9/ARC/EASTRNN/data/ICDAR/train/'         # GT data
    pth_gt_rbox = '/home/lxiaol9/ARC/EASTRNN/checkpoints/LSTM/'        # RBOX arrays
    pth_save_avi = '/home/lxiaol9/ARC/EASTRNN/checkpoints/LSTM/RBOX/'  # results storage
    #==================================================================#
    if platform.uname()[1] == 'dragonx-H97N-WIFI':
        print("Now code running in local machine")
        #>>>>>>>>>>>>>>>>>>>>>>> add paths here >>>>>>>>>>>>>>>>>>>>>#
        pth_namepool = '/media/dragonx/DataStorage/ARC/EASTRNN/data/GAP_process/'    # picked videos
        pth_gt_raw = '/media/dragonx/DataStorage/temporary/Video_text/ICDAR/train/'  # GT data
        pth_gt_rbox = '/media/dragonx/DataStorage/ARC/EASTRNN/checkpoints/LSTM/'     # RBOX arrays
        pth_save_avi = '/media/dragonx/DataStorage/ARC/EASTRNN/checkpoints/LSTM/RBOX/'  # results storage
        #============================================================#
    items = os.listdir(pth_namepool)
    newlist = []
    for names in items:
        if names.endswith(".avi"):
            newlist.append(os.path.splitext(names)[0])  # video names in the selected pool
    print(newlist)
    #>>>>>>>>>>>>>>>>>>>>>>>>>choose the video No. here >>>>>>>>>>>>>>>#
    k = 1
    #==================================================================#
    sample = newlist[k]
    filename = pth_gt_raw + sample + '.mp4'
    XML_filepath = pth_gt_rbox + sample + '_GT.xml'
    # read the video and get resized frames later
    cap = cv2.VideoCapture(filename)
    if not os.path.exists(filename):
        raise RuntimeError('Video `{}` not found'.format(filename))
    if not os.path.exists(pth_save_avi):
        os.makedirs(pth_save_avi)
    index = 0
    logger.info('########### Now loading the array data #############')
    # logger.info('loading model')
    # gpu_options = tf.GPUOptions(allow_growth=True)
    # input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='input_images')
    # global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    # f_score, f_geometry, v_feature = model.model(input_images, is_training=False)
    # variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    # saver = tf.train.Saver(variable_averages.variables_to_restore())
    # # restore the model from weights
    # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # ckpt_state = tf.train.get_checkpoint_state(checkpoint_path)
    # model_path = os.path.join(checkpoint_path, os.path.basename(ckpt_state.model_checkpoint_path))
    # logger.info('Restore from {}'.format(model_path))
    # saver.restore(sess, model_path)
    # get infos for video writing
    # NOTE: the original loaded 'score.npy' into geo_maps and 'geo.npy' into
    # score_maps; the assignments are swapped back here so the names match the
    # file contents.
    if k == 1:
        score_maps = np.load(pth_gt_rbox + 'score.npy')
        geo_maps = np.load(pth_gt_rbox + 'geo.npy')
    else:
        score_maps = np.load(pth_gt_rbox + 'score' + str(k - 1) + '.npy')
        geo_maps = np.load(pth_gt_rbox + 'geo' + str(k - 1) + '.npy')
    # frame_width = int(cap.get(3))
    # frame_height = int(cap.get(4))
    # Define the codec and create a VideoWriter object; the output is stored as an .avi file.
    out = cv2.VideoWriter(pth_save_avi + sample + '.avi',
                          cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10, (512, 512))
    index = 0
    while cap.isOpened():
        ret, frame = cap.read()
        index = index + 1
        if ret:
            # prepare the data used for one frame
            frame_score = np.zeros((1, 512, 512, 1), dtype=np.float32)
            frame_geo = np.zeros((1, 512, 512, 5))
            frame_score[0, :, :, 0] = score_maps[index - 1, :, :]
            frame_geo[0, :, :, :] = geo_maps[index - 1, :, :, :]
            frame_rsz = cv2.resize(frame, (512, 512))
            if vis_flag:
                cv2.imshow('Frame', frame_rsz)
                # cv2.imshow('Score map', score_maps[index-1, :, :])
            print('Processing %d frame with ' % index, frame.shape)
            # for i in range(512):
            #     for j in range(512):
            #         if geo_maps[index - 1, i, j, 1] != 0:
            #             print(geo_maps[index - 1, i, j, :])
            ######### Use EAST text detector ###########
            start_time = time.time()
            img = frame_rsz
            rtparams = collections.OrderedDict()
            rtparams['start_time'] = datetime.datetime.now().isoformat()
            rtparams['image_size'] = '{}x{}'.format(img.shape[1], img.shape[0])
            timer = collections.OrderedDict([('net', 0), ('restore', 0), ('nms', 0)])
            print('score shape {:s}, geometry shape {:s}'.format(
                str(frame_score.shape), str(frame_geo.shape)))
            boxes, timer = detect(score_map=frame_score, geo_map=frame_geo, timer=timer)
            logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                timer['net'] * 1000, timer['restore'] * 1000, timer['nms'] * 1000))
            if boxes is not None:
                scores = boxes[:, 8].reshape(-1)
                boxes = boxes[:, :8].reshape((-1, 4, 2))
                boxes[:, :, 0] /= 1  # maps are already in 512x512 space
                boxes[:, :, 1] /= 1
            duration = time.time() - start_time
            timer['overall'] = duration
            logger.info('[timing] {}'.format(duration))
            text_lines = []
            if boxes is not None:
                for box, score in zip(boxes, scores):
                    box = sort_poly(box.astype(np.int32))
                    if np.linalg.norm(box[0] - box[1]) < 5 or \
                            np.linalg.norm(box[3] - box[0]) < 5:
                        continue
                    tl = collections.OrderedDict(
                        zip(['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                            map(float, box.flatten())))
                    print(tl)
                    tl['score'] = float(score)
                    text_lines.append(tl)
            ret = {
                'text_lines': text_lines,
                # 'rtparams': rtparams,
                # 'timing': timer,
                # 'geometry': geometry,
                # 'score': float(score),
            }
            # # 1. print boxes number
            # print('%d Boxes found' % (len(text_lines)))
            # # 2. eval_single_frame(target, box)
            # p, r, f1 = eval_single_frame(target, ret)
            # print('Precision %f, recall %f, F_measure %f' % (p, r, f1))
            # # 3. save files into the directory
            # jsonfile = json.dumps(ret)
            # directory = save_path + sample
            # if not os.path.exists(directory):
            #     os.makedirs(directory + '/json/')
            #     os.makedirs(directory + '/npy/')
            #     os.makedirs(directory + '/score/')
            # jsonfname = directory + '/json/frame' + format(index, '03d') + '.json'
            # npyname = directory + '/npy/frame' + format(index, '03d') + '.npy'
            # scorename = directory + '/score/frame' + format(index, '03d') + '.npy'
            # np.save(npyname, feature)
            # np.save(scorename, score_m)
            # f = open(jsonfname, "w")
            # f.write(jsonfile)
            # f.close()
            # visualization
            new_img = draw_illu(img.copy(), ret)
            if vis_flag:
                cv2.imshow('Images with BBOX', new_img)
                # new_img1 = draw_illu_gt(new_img.copy(), target)
                # cv2.imshow('Annotated Frame with EAST', new_img1)
            out.write(new_img)
            # Quit when Q is pressed
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
            time.sleep(0.02)
        else:
            break
    cap.release()
    out.release()
    cv2.destroyAllWindows()
def main():
    #>>>>>>>>>>>>>>>>>>>>>define data/model path>>>>>>>>>>>>>>>>>>>>>#
    # checkpoint_path = '/home/dragonx/Documents/VideoText2018/EAST-master/weights/east_icdar2015_resnet_v1_50_rbox/'
    # checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/LSTM_east/20180908-124306'
    checkpoint_path = '/media/dragonx/DataStorage/ARC/EAST/checkpoints/east/'
    idname1 = '20180921-135717'
    idname2 = 'model.ckpt-56092'
    test_data_path = '/media/dragonx/DataLight/ICDAR2015/test/'
    save_path = '/media/dragonx/DataLight/ICDAR2015/test_results1/'
    filename = '/media/dragonx/DataLight/ICDAR2013/test/Video_6_3_2.mp4'
    idx = 0  # initial frame number
    #>>>>>>>>>>>>>>>>>>>>>>Sort test videos>>>>>>>>>>>>>>>>>>>>>>>>>#
    video_set = []
    for root, dirs, files in os.walk(test_data_path):
        for file in files:
            if file.endswith('.mp4'):
                video_set.append(os.path.splitext(file)[0])
    index = range(0, len(video_set))
    # parser for running outside
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--checkpoint-path', default=checkpoint_path)
    # args = parser.parse_args()
    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))
    logger.info('loading model')
    #>>>>>>>>>>>>>>>>>>>>>>> Loading Model >>>>>>>>>>>>>>>>>>>>>>>>>#
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3],
                                  name='input_images')
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    f_score, f_geometry, _ = model.model(input_images, is_training=False)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    #>>>>>>>>>>>>>>>>>>>>>>>> restore the model from weights>>>>>>>>#
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    model_path = checkpoint_path + idname1 + '/' + idname2
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    #>>>>>>>>>>>>>>>>>>>>>>Start evaluation>>>>>>>>>>>>>>>>>>>>>>>>>#
    P_test = []
    R_test = []
    f1_test = []
    for k in index:
        P_video = []
        R_video = []
        f1_video = []
        video_save = save_path + video_set[k] + idname1 + '_' + idname2 + '.avi'
        t_start = time.time()
        # sort out all the paths
        xml_solo_path = test_data_path + video_set[k]
        raw_video_path = test_data_path + video_set[k] + '.mp4'
        cap = cv2.VideoCapture(raw_video_path)
        frame_width = int(cap.get(3))
        frame_height = int(cap.get(4))
        cnt_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        out = cv2.VideoWriter(video_save,
                              cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                              (frame_width, frame_height))
        # 1. load both polys and tags; 2. generate geo maps
        # (the format of polys and tags needs to match)
        # polys_array_list, tags_array_list, id_list_list, frame_num = load_annotations_solo(
        #     xml_solo_path, 1, cnt_frame, frame_width, frame_height)
        #>>>>>>>>>>>>>>>>>>>>>>loop over frames>>>>>>>>>>>>>>>>>>>>>>#
        for m in range(cnt_frame):
            ret, frame = cap.read()
            # text_polys, text_tags = load_annoataion(txt_fn)
            # text_polys, text_tags = polys_array_list[m], tags_array_list[m]
            # text_polys, text_tags = check_and_validate_polys(text_polys, text_tags, (frame_height, frame_width))
            # im, text_polys, text_tags = crop_area(im, text_polys, text_tags, crop_background=False)
            # if text_polys.shape[0] == 0:
            #     continue
            if ret:
                # print('Processing %d frame with ' % m, frame.shape)
                start_time = time.time()
                img = frame
                rtparams = collections.OrderedDict()
                rtparams['start_time'] = datetime.datetime.now().isoformat()
                rtparams['image_size'] = '{}x{}'.format(img.shape[1], img.shape[0])
                timer = collections.OrderedDict([('net', 0), ('restore', 0), ('nms', 0)])
                # im_resized, (ratio_h, ratio_w) = resize_image(img)
                im_resized = cv2.resize(frame, (512, 512))
                ratio_h, ratio_w = 512 / frame_height, 512 / frame_width
                rtparams['working_size'] = '{}x{}'.format(im_resized.shape[1],
                                                          im_resized.shape[0])
                start = time.time()
                score, geometry = sess.run(
                    [f_score, f_geometry],
                    feed_dict={input_images: [im_resized[:, :, ::-1]]})
                timer['net'] = time.time() - start
                boxes, timer = detect(score_map=score, geo_map=geometry, timer=timer)
                logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                    timer['net'] * 1000, timer['restore'] * 1000, timer['nms'] * 1000))
                if boxes is not None:
                    scores = boxes[:, 8].reshape(-1)
                    boxes = boxes[:, :8].reshape((-1, 4, 2))
                    boxes[:, :, 0] /= ratio_w
                    boxes[:, :, 1] /= ratio_h
                duration = time.time() - start_time
                timer['overall'] = duration
                logger.info('[timing] {}'.format(duration))
                text_lines = []
                if boxes is not None:
                    for box, score in zip(boxes, scores):
                        box = sort_poly(box.astype(np.int32))
                        if np.linalg.norm(box[0] - box[1]) < 5 or \
                                np.linalg.norm(box[3] - box[0]) < 5:
                            continue
                        tl = collections.OrderedDict(
                            zip(['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                                map(float, box.flatten())))
                        tl['score'] = float(score)
                        text_lines.append(tl)
                pred = {
                    'text_lines': text_lines,
                    'rtparams': rtparams,
                    'timing': timer,
                }
                new_img = draw_illu(img.copy(), pred)
                #>>>>>>>>>>>>>>>>>>>>>>Evaluation>>>>>>>>>>>>>>>>>>>>>>#
                # targets = text_polys
                # precision, recall, f1 = eval_single_frame(targets, pred)
                precision, recall, f1 = 0, 0, 0  # ground truth disabled above
                P_video.append(precision)
                R_video.append(recall)
                f1_video.append(f1)
                # new_img1 = draw_illu_gt(new_img.copy(), targets, precision, recall, f1)
                out.write(new_img)
                # if m == 0:
                #     fig1 = plt.figure(figsize=(20, 10))
                #     fig1.add_subplot(1, 2, 1)
                #     plt.imshow(new_img)
                #     plt.title("Text Detection with fine-tuned EAST")
                #     fig1.add_subplot(1, 2, 2)
                #     plt.imshow(new_img1)
                #     plt.title('Text Detection Results Comparison')
                #     plt.show()
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
                # time.sleep(.100)
            else:
                break
        # evaluation on predictions and ground truth
        P_test.append(np.array(P_video, dtype=np.float32))
        R_test.append(np.array(R_video, dtype=np.float32))
        f1_test.append(np.array(f1_video, dtype=np.float32))
        print(P_video)
        print(R_video)
        print(f1_video)
        print("testing results are P:{}, R:{}, F1:{} on ".format(
            sum(P_video) / cnt_frame, sum(R_video) / cnt_frame,
            sum(f1_video) / cnt_frame) + video_set[k])
        cap.release()
        out.release()
        cv2.destroyAllWindows()
    print('here is the precision')
    for item in P_test:
        print(np.mean(item))
    print('here is the recall')
    for item in R_test:
        print(np.mean(item))
    print('here is the f-score')
    for item in f1_test:
        print(np.mean(item))
    print(video_set)
def main():
    # NOTE: `checkpoint_path` existed only as a comment in the original, which
    # made the references below fail; restoring the commented path here is an
    # assumption.
    checkpoint_path = '/home/dragonx/Documents/VideoText2018/EAST-master/weights/east_icdar2015_resnet_v1_50_rbox/'
    filename = '/media/dragonx/752d26ef-8f47-416d-b311-66c6dfabf4a3/Video Detection/ICDAR/train/Video_16_3_2.mp4'
    cap = cv2.VideoCapture(filename)
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint-path', default=checkpoint_path)
    args = parser.parse_args()
    if not os.path.exists(checkpoint_path):
        raise RuntimeError('Checkpoint `{}` not found'.format(checkpoint_path))
    # read frames until the video is completed
    index = 0
    logger.info('loading model')
    gpu_options = tf.GPUOptions(allow_growth=True)
    input_images = tf.placeholder(tf.float32, shape=[None, None, None, 3],
                                  name='input_images')
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    f_score, f_geometry = model.model(input_images, is_training=False)
    variable_averages = tf.train.ExponentialMovingAverage(0.997, global_step)
    saver = tf.train.Saver(variable_averages.variables_to_restore())
    # restore the model from weights
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    ckpt_state = tf.train.get_checkpoint_state(checkpoint_path)
    model_path = os.path.join(checkpoint_path,
                              os.path.basename(ckpt_state.model_checkpoint_path))
    logger.info('Restore from {}'.format(model_path))
    saver.restore(sess, model_path)
    # get infos for video writing
    frame_width = int(cap.get(3))
    frame_height = int(cap.get(4))
    # Define the codec and create a VideoWriter object; the output is stored in 'EAST_testDemo1.avi'.
    out = cv2.VideoWriter('EAST_testDemo1.avi',
                          cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10,
                          (frame_width, frame_height))
    while cap.isOpened():
        ret, frame = cap.read()
        index = index + 1
        if ret:
            cv2.imshow('Frame', frame)
            print('Processing %d frame with ' % index, frame.shape)
            ######### Use EAST text detector ###########
            start_time = time.time()
            img = frame
            rtparams = collections.OrderedDict()
            rtparams['start_time'] = datetime.datetime.now().isoformat()
            rtparams['image_size'] = '{}x{}'.format(img.shape[1], img.shape[0])
            timer = collections.OrderedDict([('net', 0), ('restore', 0), ('nms', 0)])
            im_resized, (ratio_h, ratio_w) = resize_image(img)
            rtparams['working_size'] = '{}x{}'.format(im_resized.shape[1],
                                                      im_resized.shape[0])
            start = time.time()
            score, geometry = sess.run(
                [f_score, f_geometry],
                feed_dict={input_images: [im_resized[:, :, ::-1]]})
            timer['net'] = time.time() - start
            boxes, timer = detect(score_map=score, geo_map=geometry, timer=timer)
            logger.info('net {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                timer['net'] * 1000, timer['restore'] * 1000, timer['nms'] * 1000))
            if boxes is not None:
                scores = boxes[:, 8].reshape(-1)
                boxes = boxes[:, :8].reshape((-1, 4, 2))
                boxes[:, :, 0] /= ratio_w
                boxes[:, :, 1] /= ratio_h
            duration = time.time() - start_time
            timer['overall'] = duration
            logger.info('[timing] {}'.format(duration))
            text_lines = []
            if boxes is not None:
                for box, score in zip(boxes, scores):
                    box = sort_poly(box.astype(np.int32))
                    if np.linalg.norm(box[0] - box[1]) < 5 or \
                            np.linalg.norm(box[3] - box[0]) < 5:
                        continue
                    tl = collections.OrderedDict(
                        zip(['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3'],
                            map(float, box.flatten())))
                    tl['score'] = float(score)
                    text_lines.append(tl)
            ret = {
                'text_lines': text_lines,
                'rtparams': rtparams,
                'timing': timer,
            }
            new_img = draw_illu(img.copy(), ret)
            cv2.imshow('Annotated Frame with EAST', new_img)
            out.write(new_img)
            # Quit when Q is pressed
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
            time.sleep(0.1)
        else:
            break
    cap.release()
    out.release()
    cv2.destroyAllWindows()
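# The scripts above each define main() but the excerpt shows no entry point;
# the usual guard (an assumption, since it is not part of the source) would be:
if __name__ == '__main__':
    main()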