Example #1
def timing(f):
    # Assumed outer decorator (the name `timing` is illustrative); the original snippet
    # only shows the inner wrapper `func`, which times the wrapped callable `f`.
    def func(*args, **kwargs):
        start_time = time.time()
        result = f(*args, **kwargs)
        end_time = time.time()
        logger.info('function {} took {} s'.format(
            f.__name__, end_time - start_time))
        return result
    return func
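A minimal usage sketch of the timing wrapper above (the decorator name `timing`, the logging setup, and the example function are illustrative assumptions, not part of the original snippet):

import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@timing
def slow_add(a, b):
    time.sleep(0.2)
    return a + b

slow_add(1, 2)  # logs something like: function slow_add took 0.20... s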
Example #2
    def process_single(self, item_name: str):
        logger.info(f'processing {item_name}')
        # Clean up variables before starting
        debugger.variables.clean()
        config = self.config

        if config.preload_tpl:
            debugger.variables.resultsTemplate.append({
                "name": "tplImage",
                "text": "模板图",
                "image": {
                    "data": "#{tplImage}"
                }
            })

        rec_data_dir = os.path.join(config.work_dir,
                                    config.recognition_data_dirname)
        rec_img_dir = os.path.join(config.work_dir,
                                   config.recognition_img_dirname)
        rec_data_path = os.path.join(rec_data_dir, item_name) + '.json'
        rec_img_path = os.path.join(rec_img_dir, item_name) + '.jpg'

        # Read the recognition result data
        with open(rec_data_path, mode='r', encoding='utf-8') as f:
            rec_data = json.load(f)
        if not rec_data:
            logger.warning(f'raw_data is not present. path={rec_data_path}')
            return
        rec_img = cv2.imread(rec_img_path)

        if isinstance(rec_data, list):
            rec_data = [_convert_ai_rec_data_item(item) for item in rec_data]
            start_time = time.time()
            structure_result = self.session.process(
                rec_data,
                rec_img,
                class_name=self.config.class_name,
                primary_class=self.config.primary_class,
                secondary_class=self.config.secondary_class,
                ltrb=False)
        else:  # new raw data format
            start_time = time.time()
            request, rpc_name = _convert_request(rec_data, rec_img,
                                                 self.config)
            # Start structuring

            structure_result = self.request_processor.process(
                request,
                rpc_name,
                self.config.preload_tpl,
                item_name=item_name)

        process_duration = time.time() - start_time
        logger.debug(f'time taken: {process_duration}')
        debugger.variables.structuringDuration = process_duration

        # Collect the structuring results
        self._pack_debug_data(structure_result)
        self._dump_debug_data(item_name)
Example #3
 def norm_match(self, node_items: Dict[str, NodeItem]) -> Tuple[List[NodeItem], List[int]]:
     out = []
     ed_dists = []
     for it in node_items.values():
         matched, ed = self._text_match(it.cn_text, remove_symbols=True, remove_space=True, ed_thresh=self.ed_thresh)
         if matched:
             logger.info(f"bg_item [{self}] match {it} by [norm_match]")
             out.append(it)
             ed_dists.append(ed)
     return out, ed_dists
Example #4
    def parse_template(
        self,
        node_items: Dict[int, TpNodeItem],
        img: np.ndarray,
        debug_data: DebugData = None,
    ):
        """
        :param node_items:
        :param img: ndarray BGR image; some steps may re-run recognition and need the original image
        :return: dict[StructureItem]
        """
        structure_items = {}
        for fg_item in self.fg_items.values():
            fg_item.load_data(node_items)
            item_result = fg_item.run_parse(img, debug_data=debug_data)
            if item_result is None:
                content, scores = "", [0]
            else:
                content, scores = item_result

            si = StructureItem(
                item_name=fg_item.item_name,
                show_name=fg_item.show_name,
                content=content,
                scores=scores,
            )

            structure_items[fg_item.item_name] = si

        for region_item in self.region_items.values():
            region_item.load_data(node_items)
            region_item.run_parse(img, structure_items)

        # Pass the image along with structure_items, since post-processing may need image-related information
        structure_items = self.tmpl_post_proc(structure_items, self.fg_items,
                                              img)

        # Remove structured results whose should_output is False
        for fg_item in self.fg_items.values():
            if fg_item.item_name not in structure_items:
                continue

            if fg_item.should_output is False:
                logger.info(
                    f"Delete structure item should not output: {fg_item.item_name}"
                )
                del structure_items[fg_item.item_name]

        return structure_items
Example #5
def common_matched_rule(node1, node2, node_in_row1: pd.Series,
                        node_in_row2: pd.Series,
                        node_items: Dict[str, NodeItem], rows, fields: Dict):
    """
    遵循一定的原则,判断对应的两行是否是匹配的


    """
    row1_fid_set = set(node_in_row1.fid)
    row2_fid_set = set(node_in_row2.fid)

    # Rule: the two rows must "correspond" at two or more positions, i.e. there are at least
    # two fields where the text types match and the bboxes are aligned.
    matched_fields_count = 0
    for fid in row1_fid_set | row2_fid_set:
        nodes_in_fields_1 = node_in_row1[node_in_row1.fid == fid]
        nodes_in_fields_2 = node_in_row2[node_in_row2.fid == fid]

        # Within each field, at least one matching combination must exist
        matched_text = []
        has_matched = False
        for comb in product(nodes_in_fields_1.iterrows(),
                            nodes_in_fields_2.iterrows()):
            node_in_f1 = node_items[comb[0][1].uid]
            node_in_f2 = node_items[comb[1][1].uid]

            # Mean height of the two boxes, used to normalize the alignment offset
            min_height = (node_in_f1.bbox.height + node_in_f2.bbox.height) / 2

            align_ratio = min(
                abs(node_in_f1.bbox.cx - node_in_f2.bbox.cx),
                abs(node_in_f1.bbox.left - node_in_f2.bbox.left),
                abs(node_in_f1.bbox.right -
                    node_in_f2.bbox.right)) / min_height
            if align_ratio < 0.1:
                has_matched = True
                matched_text = [node_in_f1.text, node_in_f2.text]
                break
        if has_matched:
            logger.info('row {} , {} matched in  {}'.format(
                node1.text, node2.text, matched_text))
            matched_fields_count += 1

    logger.info(
        'matched_fields_count is {} for node {} and node {} , thres is {}'.
        format(matched_fields_count, node1.text, node2.text,
               max(2, node1.num_fid_in_row - 1)))
    if matched_fields_count >= max(2, node1.num_fid_in_row - 1):
        return True

    return False
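The core geometric test above treats two boxes as "aligned" when the smallest of their center, left-edge, or right-edge offsets is below 10% of their mean height. A self-contained sketch of that rule (the Box class and the example values are illustrative assumptions, not from the original codebase):

from dataclasses import dataclass


@dataclass
class Box:
    left: float
    right: float
    height: float

    @property
    def cx(self):
        return (self.left + self.right) / 2


def boxes_aligned(b1: Box, b2: Box, ratio: float = 0.1) -> bool:
    # Smallest of the center / left-edge / right-edge offsets, normalized by the mean height
    mean_height = (b1.height + b2.height) / 2
    offset = min(abs(b1.cx - b2.cx), abs(b1.left - b2.left), abs(b1.right - b2.right))
    return offset / mean_height < ratio


print(boxes_aligned(Box(100, 180, 20), Box(102, 240, 22)))  # True: the left edges nearly coincide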
Example #6
    def filter_redudant_content(self, node_info):

        useless_row_id = []
        for filter_config in self.filterrow_config['filter_content']:
            # Iterate over all filter configurations
            regex_list = filter_config['regex']
            adaptive_fields = filter_config['adaptive_fields']

            for rid, rid_content in self.row_content_in_each_fields.copy(
            ).items():
                # Iterate over all rows
                for fid, content_info in rid_content.items():
                    # Iterate over the fields (columns) involved in this row
                    header_type = content_info['header_type']
                    if header_type not in adaptive_fields:
                        continue

                    row_content_in_field = content_info['content']
                    # print('debug, ', regex_list, row_content_in_field)
                    useless = False
                    for regex in regex_list:
                        if re.match(regex, row_content_in_field):
                            useless = True
                            break
                    if useless:
                        del self.row_content_in_each_fields[rid]
                        row_order = content_info['row_order']
                        logger.info(
                            '{} is not useful'.format(row_content_in_field))
                        useless_row_id.append(row_order)
                        break
        for rid, rid_content in self.rows.copy().items():
            row_content = rid_content.content()
            # print('debugger', row_content)
            need_ignore = False
            for regex in self.filterrow_config['filter_content_in_line']:
                if re.search(regex, row_content, re.IGNORECASE):
                    need_ignore = True
            if need_ignore:
                del self.row_content_in_each_fields[rid]
                useless_row_id.append(rid_content.order)

        useless_row_id = set(useless_row_id)
        if not useless_row_id:
            return node_info

        node_info = node_info[~node_info.row_order.isin(useless_row_id)]
        # TODO: adaptively remove trailing content at the end of the table
        return node_info
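Note that the method above uses re.match on individual cell contents (anchored at the start of the string) but re.search on the joined row content (matches anywhere). A quick standalone illustration of the difference:

import re

text = 'TOTAL AMOUNT 128.00'
print(re.match(r'AMOUNT', text))                             # None: match only tries position 0
print(re.search(r'AMOUNT', text) is not None)                # True: search scans the whole string
print(re.match(r'total', text, re.IGNORECASE) is not None)   # True: the pattern sits at the start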
Example #7
    def h_merge_match(self, node_items: Dict[str, NodeItem]) -> Tuple[List[NodeItem], List[int]]:
        norm_match_res = self.norm_match(node_items)
        if len(norm_match_res[0]) != 0:
            return norm_match_res

        candidate_node_items = {}
        candidate_chars_count = 0

        for node_item in node_items.values():
            if node_item.cn_text:
                for ic, c in enumerate(node_item.text):
                    if c in self.text:
                        s = node_item.split(ic, ic + 1)
                        candidate_node_items[s.uid] = s
                        candidate_chars_count += 1

        # If the total number of candidate characters is less than the length of the background text, return early
        if candidate_chars_count < len(self.text):
            return [], []

        line_groups = NodeItemGroup.find_row_lines(candidate_node_items, y_thresh=0.3)

        out = []
        ed_dists = []
        for group in line_groups:
            if len(group.content()) < len(self.text):
                continue

            _g = group
            # Drop characters whose gap to the previous one is much larger than the average spacing
            if len(group.node_items) >= 3:
                avg_space = 0
                for i in range(len(group.node_items) - 1):
                    avg_space += (group.node_items[i + 1].bbox.cx - group.node_items[i].bbox.cx)
                avg_space /= len(group.node_items)
                __g = NodeItemGroup([group.node_items[0], group.node_items[1]])
                for i in range(2, len(group.node_items)):
                    if group.node_items[i].bbox.cx - group.node_items[i - 1].bbox.cx > 2 * avg_space:
                        continue
                    __g.append(group.node_items[i])
                if len(__g.node_items) != 0:
                    _g = __g

            if _g.content() == self.text:
                new_node = NodeItem(_g.gen_raw_node())
                out.append(new_node)
                ed_dists.append(editdistance.eval(new_node.text, self.text))
                logger.info(f"bg_item [{self}] match node_item {new_node} by [h_merge_match]")
        return out, ed_dists
Example #8
    def group_into_rows(self, node_items):
        # If the angle is small, use the simple grouping flow directly
        logger.info('angle of header is {}'.format(self.angle_of_header))

        self.require_angle_from_node_items(node_items)

        if not self.cfg.LINE_HANDLER.consider_angle:
            # For large angles, the second method is used
            rows = ParagraphHandler.group_into_rows(node_items)
        else:
            rows = ParagraphHandler.group_into_lines(
                node_items, self.angle_of_header,
                self.cfg.LINE_HANDLER.angle_merge_thresh)

        return rows
Example #9
def setup_grpc_server(port):
    cfg = MyConfig()
    grpc_server = grpc.server(futures.ThreadPoolExecutor(
        max_workers=1),
        options=[
            ('grpc.max_receive_message_length', cfg.grpc_max_message_length.value),
            ('grpc.max_send_message_length', cfg.grpc_max_message_length.value)
        ],
        maximum_concurrent_rpcs=cfg.grpc_max_concurrent.value,
    )
    add_StructuringServicer_to_server(servicer=StructuringServer(), server=grpc_server)
    add_MetricsServicer_to_server(servicer=MetricsServer(cfg), server=grpc_server)
    add_HealthServicer_to_server(servicer=HealthServer(), server=grpc_server)
    grpc_server.add_insecure_port('[::]:%s' % port)
    grpc_server.start()
    logger.info('grpc server starts serving at %s' % port)
    return grpc_server
Example #10
    def extract_info(self, block, header_config: ContentConfig):
        block_content = '\n'.join(block.content)

        extract_result = extract_text_from_multiline_text(
            block_content,
            start_key_words=header_config.start_key_words,
            end_key_words=header_config.end_key_words,
            start_exps=header_config.start_exps,
            end_exps=header_config.end_exps,
            start_filter_exps=header_config.start_filter_exps,
            filter_exps=header_config.filter_exps)

        if not extract_result[0]:
            # If a misconfigured filter removes everything, fall back to the original block content
            logger.info('block content {} is filtered to empty !!'.format(
                block_content))
            if extract_result[1]:
                return True, block_content
            return False, ''
        return True, extract_result[0]
Example #11
    def parse_header_requirement(self, fields):
        """
        :param header_group: 表头
        :param fields: 各个列信息
        :return: 返回header_group 是否包含header requirements 所要求的列,并返回 fields 当中的列id
        """
        fields_in_type_req = {
            fid: field
            for fid, field in fields.items()
            if field.header.head_type == self.header_type
        }

        if len(fields_in_type_req) == 0:
            return False, set()

        if self.header_regexs is None:
            logger.info('check header {} by {}'.format([
                f.header.key_node.content for f in fields_in_type_req.values()
            ], self.header_type))
            return True, set(fields_in_type_req.keys())

        regex_check = {
            fid: field
            for fid, field in fields_in_type_req.items() if any([
                re.match(regex, field.header.key_node.content,
                         re.IGNORECASE) for regex in self.header_regexs
            ])
        }

        if len(regex_check) > 0:
            logger.info('check header {} by {}'.format(
                [f.header.key_node.content for f in regex_check.values()],
                self.header_regexs))
            return True, set(regex_check.keys())
        return False, set()
Example #12
def upload_to_server(config):
    result_dir_path = os.path.join(config.work_dir, config.result_dirname)

    if not os.path.exists(result_dir_path):
        logger.info('experiment result does not exist, aborting upload')
        return
    filenames = os.listdir(result_dir_path)
    if len(filenames) == 0:
        return
    data = {
        'items': {},
        'commonVariables': debugger.commonVariables,
    }
    for filename in filenames:
        item_name, _ = os.path.splitext(filename)
        with open(os.path.join(result_dir_path, filename),
                  encoding='utf-8') as f:
            json_data = json.load(f)
            data['items'][item_name] = json_data
    # result_json = json.dumps(data, ensure_ascii=False, default=lambda x: x.__dict__)
    # logger.debug(f'experiment result: {result_json}')
    StClient(config.debug_server_addr).upload_experiment_result(
        config.exp_id, data)
Example #13
    def select_useful_headers(self, header_groups):
        # Return a list for now, in case multiple rows should be output later
        # Deduplicate header_groups by their final bbox (and content)
        bbox_header_group_map = {}
        for header_group in header_groups:
            bbox_info = '_'.join([str(_) for _ in header_group.bbox.rect])
            content_info = '_'.join([_.key_node.content for _ in header_group.finded_header])
            bbox_header_group_map.update({bbox_info + '_' + content_info: header_group})
        # logger
        for header_group in bbox_header_group_map.values():
            content_info = '_|_'.join([_.key_node.content for _ in header_group.finded_header])
            logger.info('find possible header with content {}'.format(content_info))
        filtered_group: List[HeaderGroup] = list(bbox_header_group_map.values())

        # TODO: if multiple headers should be returned
        # clean_group = self.remove_overlap(filtered_group)
        # return clean_group

        # Selection rule 1: keep as much content as possible?
        filtered_group = sorted(filtered_group, key=lambda x: x.evaluation_score, reverse=True)

        filtered_group = [header_group for header_group in filtered_group if
                          header_group.evaluation_score == filtered_group[0].evaluation_score]

        if len(filtered_group) == 1:
            return [filtered_group[0]]
        else:
            # If the candidates sit at roughly the same height, pick the widest one
            filtered_group = sorted(filtered_group, key=lambda x: x.bbox.width, reverse=True)
            if (filtered_group[0].bbox.cy - np.mean(
                    [header_group.bbox.cy for header_group in filtered_group[1:]])) < 10:
                return [filtered_group[0]]
            else:
                # Otherwise take the one with the smallest angle variation
                filtered_group = sorted(filtered_group, key=lambda x: x.angle_score)
                return [filtered_group[0]]
Example #14
    def build_blocks(self,
                     row_id,
                     rows,
                     row_order_id_map,
                     node_info,
                     auto_remove_tail=False):
        """
        :param row_id: 关键行的行号
        :param rows:  list of row , 记录着这条记录涉及到的row order
        :param row_order_id_map: row_order 和 row 的关系
        :param node_info:
        :param auto_remove_tail : 对于最后一行设置这个参数为True ,会对最后一行,考虑一些特殊的过滤规则,去除掉表尾部的内容
        :return:
        """
        lines_in_field = defaultdict(list)

        useful_row = [True] * len(rows)
        if auto_remove_tail and len(rows) >= 2:
            # Adaptively remove some unneeded information.
            # Rule 1: compute the gaps between rows; once a very large gap appears, ignore everything after it
            # Get the bottom coordinate of each row
            row_bottom = [
                self.rows[row_order_id_map[rid]].bbox.bottom for rid in rows
            ]
            row_height = [
                self.rows[row_order_id_map[rid]].bbox.height for rid in rows
            ]
            row_height_diff = np.diff(row_bottom) > 5 * np.mean(row_height)
            after_useless = False
            for idx in range(1, len(rows)):
                if row_height_diff[idx - 1]:
                    after_useless = True
                if after_useless:
                    useful_row[idx] = False

        for row, is_useful in zip(rows, useful_row):
            if not is_useful:
                continue
            row_info = self.row_content_in_each_fields[row_order_id_map[row]]
            for fid, field_info in row_info.items():
                # header_name = self.fields[fid].header.name
                lines_in_field[fid].append({
                    'line_item':
                    field_info['element_group'],
                    'line_content':
                    field_info['content']
                })

        row_info = {}
        for fid, field_info in lines_in_field.items():
            line_content = [line['line_content'] for line in field_info]
            line_item = [line['line_item'] for line in field_info]

            header_name = self.fields[fid].header.name
            header_type = self.fields[fid].header.head_type

            update = False
            if header_type in [
                    self.header_type[htype]
                    for htype in self.cfg.ELEMENT_HANDLER.get(
                        'block_update_config', [])
            ]:
                logger.info('set update True for {}'.format(header_type))
                update = True

            row_info[fid] = Block(fid,
                                  row_id,
                                  header_name,
                                  header_type,
                                  line_content,
                                  line_item,
                                  update=update)

        return row_info
Example #15
    def filter_redudant_line(self,
                             start_filter_line,
                             node_info,
                             possible_key_row=None):
        # Filter the data at the row level
        ignore_bg_lines = []
        for idx, (bg_texts, ed_thresh) in enumerate(
                self.filterrow_config['filter_lines']):
            bg_texts = re.sub('[^0-9A-Za-z]', '', bg_texts).lower()
            self.filterrow_config['filter_lines'][idx] = (bg_texts, ed_thresh)

        # Build a mapping from row_order to rid
        row_order_id_map = {
            self.rows[rid].order: rid
            for rid in self.row_content_in_each_fields
        }
        # Sort by row_order in ascending order
        row_order_id_map = OrderedDict(
            sorted(row_order_id_map.items(), key=lambda x: x[0]))

        after_filter_row = False  # everything after a filtered row is also filtered out
        for order, rid in row_order_id_map.items():
            # Iterate over each row

            if after_filter_row:
                ignore_bg_lines.append(order)
                del self.row_content_in_each_fields[rid]
                continue

            row = self.rows[rid]
            if row.order < start_filter_line:
                continue
            row_content = row.content()
            row_content = re.sub('[^0-9A-Za-z]', '', row_content).lower()
            logger.info('checking whether this row needs filtering: {}'.format(
                row_content))
            filtered_by_line_rule = False
            # print('debug',row_content)
            for bg_texts, ed_thresh in self.filterrow_config['filter_lines']:
                dist = ed.eval(row_content, bg_texts)
                if dist < ed_thresh:
                    del self.row_content_in_each_fields[rid]
                    ignore_bg_lines.append(row.order)
                    after_filter_row = True
                    filtered_by_line_rule = True
                    break

            if filtered_by_line_rule:
                # Already marked as a row to filter; skip the remaining checks
                continue

            for comb in self.filterrow_config['filter_comb']:
                # Get the configuration of each combination
                matched_count = 0
                for header_type_list, regex_config in comb:
                    if isinstance(header_type_list,
                                  self.header_group.header_types):
                        header_type_list = [header_type_list]

                    at_least_succeed = False
                    for header_type in header_type_list:
                        # Iterate over all header_types in this configuration
                        if at_least_succeed:
                            break
                        if isinstance(regex_config, list):
                            # The config for this entry is a plain list of regexes
                            regex_list = regex_config
                            # Get this row's content for fields of this header type
                            content = [
                                fid_info['content'] for fid, fid_info in
                                self.row_content_in_each_fields[rid].items()
                                if fid_info['header_type'] == header_type
                            ]
                            for regex in regex_list:
                                if any([
                                        re.search(regex, text, re.IGNORECASE)
                                        is not None for text in content
                                ]):
                                    matched_count += 1
                                    break
                        elif isinstance(regex_config, dict):
                            regex_list = regex_config['content_regex']
                            header_regex_list = regex_config['header_regex']
                            content_list = [
                                (fid, fid_info['content']) for fid, fid_info in
                                self.row_content_in_each_fields[rid].items()
                                if fid_info['header_type'] == header_type
                            ]

                            # Use the fid to look up the header content for each field content
                            content_list = [(self.fields[fid].header.key_node.content, fid_content) \
                                            for fid, fid_content in content_list]

                            # Keep only the contents whose header matches header_regex_list
                            content_satisfy_header_regex = []
                            for header_content, field_content in content_list:
                                satisfy_regex = False
                                for header_regex in header_regex_list:
                                    if re.search(header_regex, header_content,
                                                 re.IGNORECASE):
                                        satisfy_regex = True
                                        break
                                if satisfy_regex:
                                    content_satisfy_header_regex.append(
                                        field_content)
                            if len(content_satisfy_header_regex) == 0:
                                # No column in this row satisfies the header_regex condition
                                continue
                            for regex in regex_list:
                                if any([
                                        re.search(regex, text,
                                                  re.IGNORECASE) is not None
                                        for text in
                                        content_satisfy_header_regex
                                ]):
                                    matched_count += 1
                                    at_least_succeed = True
                                    break

                if matched_count == len(comb):
                    logger.info('filtered {} by filter_comb'.format(
                        self.rows[rid].content()))
                    del self.row_content_in_each_fields[rid]
                    ignore_bg_lines.append(row.order)
                    after_filter_row = True
                    break

        node_info = node_info[~node_info.row_order.isin(ignore_bg_lines)]

        if possible_key_row is not None:
            possible_key_row = possible_key_row - set(ignore_bg_lines)
        return node_info, possible_key_row
Example #16
    def filter_nodes_below_headers(self, node_items: Dict[str, NodeItem]):
        # Return the node_items that lie in the region below the header.
        # If rbox information is available, more precise handling is possible.
        has_rbox = False
        for _, value in node_items.items():
            if getattr(value, 'rbox', None):
                has_rbox = True

        if has_rbox:
            # Get the lower boundary of all the header's node_items
            nodes = self.get_all_nodes()
            # Find the bottommost of these nodes
            lowest_head_node = sorted(nodes, key=lambda x: x.rbox.cy, reverse=True)[0]
            # Estimate the average angle of the node_items,
            # using only long texts (more than 4 characters) for a meaningful value
            node_item_list = [node for node in node_items.values() if len(node.text) > 4]
            if len(node_item_list) > 0:
                meaningfule_angle_list = np.array([node.rbox.meaningful_angle for node in node_item_list])
                # The median suppresses zero-angle outliers
                median_angle = np.median(meaningfule_angle_list)
                angle_mark_node = node_item_list[np.argmin(np.abs(meaningfule_angle_list - median_angle))].rbox
                header_line = line_utils.gen_parallel_line(angle_mark_node.up_left[0], angle_mark_node.up_left[1],
                                                           angle_mark_node.up_right[0], angle_mark_node.up_right[1],
                                                           lowest_head_node.rbox.down_left[0],
                                                           lowest_head_node.rbox.down_left[1]
                                                           )
                filtered_nodes = dict()
                for node in node_items.values():

                    if header_line.is_under(line_utils.Point(node.rbox.cx, node.rbox.cy)):
                        filtered_nodes[node.uid] = node

                return filtered_nodes

        mean_interval = self.mean_header_interval
        mean_width = self.mean_header_width

        xmin_limit = self.bbox.left - (mean_interval + mean_width)
        xmax_limit = self.bbox.right + (mean_interval + mean_width)

        head_nodes = [set(node.uid for node in header.key_node.node_items) for header in self.finded_header]
        head_nodes = set.union(*head_nodes)

        min_bottom = min([header.key_node.bbox.bottom for header in self.finded_header])
        ymin_limit = min_bottom - 0.2 * np.mean([header.key_node.avg_height for header in self.finded_header])

        filtered_nodes = dict()
        for uid, node in node_items.items():
            if uid in head_nodes:
                continue

            if node.bbox.top <= ymin_limit:
                continue

            if node.bbox.left <= xmin_limit:
                continue
            if node.bbox.right >= xmax_limit:
                continue

            filtered_nodes[uid] = node
        logger.info('filtered {} node on the top of header'.format(len(node_items) - len(filtered_nodes)))
        return filtered_nodes
Example #17
def prepare_data(config, with_cache=True):
    if with_cache:
        if _rec_cache_available(config):
            logger.info(
                'recognition result data and images exist, using cache')
            return
        logger.info(
            'recognition result data and images not found, fetching from server'
        )

    st_client = StClient(config.debug_server_addr)
    raws = st_client.fetch_raw_data_list(config.lab_id, 1, 10000)
    rec_data_dir = os.path.join(config.work_dir,
                                config.recognition_data_dirname)
    rec_img_dir = os.path.join(config.work_dir, config.recognition_img_dirname)
    img_pool_dir = os.path.join(os.path.dirname(config.work_dir),
                                "ocr_structuring_img_pool")
    os.makedirs(img_pool_dir, exist_ok=True)
    os.makedirs(rec_data_dir, exist_ok=True)
    os.makedirs(rec_img_dir, exist_ok=True)

    def get_data(raw):
        if not os.path.exists(
                os.path.join(rec_data_dir, str(raw['id'])) + '.json'):
            with open(os.path.join(rec_data_dir, str(raw['id'])) + '.json',
                      'w',
                      encoding='utf-8') as f:
                json.dump(raw['data'], f, ensure_ascii=False, indent=2)
        if config.use_img:
            assert raw['media_id'] is not None
            if not os.path.exists(
                    os.path.join(img_pool_dir,
                                 str(raw['media_id']) + '.jpg')):
                st_client.download_media(
                    raw['media_id'],
                    os.path.join(img_pool_dir,
                                 str(raw['media_id']) + '.jpg'))
            if not os.path.exists(
                    os.path.join(rec_img_dir,
                                 str(raw['id']) + '.jpg')):
                os.symlink(
                    os.path.join(img_pool_dir,
                                 str(raw['media_id']) + '.jpg'),
                    os.path.join(rec_img_dir,
                                 str(raw['id']) + '.jpg'))

    def get_data_by_gt_id(raw):
        # Use the gt_id as the file name when saving
        with open(os.path.join(rec_data_dir, str(raw['gt_id'])) + '.json',
                  'w',
                  encoding='utf-8') as f:
            json.dump(raw['data'], f, ensure_ascii=False, indent=2)
        if config.use_img:
            assert raw['media_id'] is not None
            st_client.download_media(
                raw['media_id'],
                os.path.join(rec_img_dir,
                             str(raw['gt_id']) + '.jpg'))

    with ThreadPoolExecutor(3) as executor:
        for raw in raws['items']:
            executor.submit(get_data, raw)
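One caveat with the final submit loop: exceptions raised inside get_data are swallowed by the executor unless the returned futures are inspected. A hedged variant of the same loop (reusing get_data and raws from above):

    with ThreadPoolExecutor(3) as executor:
        futures = [executor.submit(get_data, raw) for raw in raws['items']]
        for future in futures:
            # Surface any download/serialization error instead of dropping it silently
            future.result()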
Example #18
    def h_split_match(
            self,
            node_items: Dict[str, NodeItem],
            *,
            sub_seq_max_interval: int = 2,
            sub_seq_pre_func: Callable = None,
    ) -> Tuple[List[NodeItem], List[int], List[NodeItem]]:
        """
        :param node_items:
        :param sub_seq_max_interval: maximum allowed gap between matched indexes in the longest common subsequence
        :param sub_seq_pre_func: preprocessing applied to it.text before calling max_sub_seq_order_dp
        :return:
        """
        splited_ed_dist = []
        splited_key_node = []
        splited_rest_nodes = []
        for it in node_items.values():
            if sub_seq_pre_func is None:
                res, bg_idxes, node_idxes = max_sub_seq_order_dp(self.text, it.text)
            else:
                res, bg_idxes, node_idxes = max_sub_seq_order_dp(self.text, sub_seq_pre_func(it.text))

            ed_dist = abs(len(res) - len(self.text))
            if self.ed_thresh == -1:
                if res != self.text:
                    continue
            else:
                if ed_dist > self.ed_thresh:
                    continue

            # Gaps between matched node indexes must not exceed sub_seq_max_interval
            should_continue = False
            for i in range(len(node_idxes) - 1):
                if node_idxes[i + 1] - node_idxes[i] > sub_seq_max_interval:
                    should_continue = True
                    break
            if should_continue:
                continue

            if node_idxes[0] > 2:
                # The split-out node should start near the beginning of the string
                continue

            if sub_seq_pre_func is not None:
                node_idxes = self.align_node_idxes(node_idxes, sub_seq_pre_func(it.text), it.text)

            start_idx = node_idxes[0]
            end_idx = node_idxes[-1] + 1

            # sub_str_start_idxes = str_util.findall_sub_str_idx(sub_text=self.text, text=it.text)
            # if len(sub_str_start_idxes) != 1:
            #     continue
            # start_idx = sub_str_start_idxes[0]
            # # Assume the background characters to split are near the front
            # if start_idx >= 3:
            #     continue
            # end_idx = sub_str_start_idxes[0] + len(self.text)

            if end_idx > start_idx:
                new_node = it.split(start_idx, end_idx)
                if new_node:
                    splited_key_node.append(new_node)
                    splited_ed_dist.append(ed_dist)

                rest_node = it.split(end_idx, -1)
                if rest_node:
                    splited_rest_nodes.append(rest_node)

        norm_match_res, norm_match_ed_dists = self.norm_match(node_items)
        splited_key_node.extend(norm_match_res)
        splited_ed_dist.extend(norm_match_ed_dists)

        if len(splited_key_node) != 0:
            logger.info(f"bg_item [{self}] match {' '.join(map(str, splited_key_node))} by [split_match]")

        for node in splited_rest_nodes:
            node.is_cut = True

        return splited_key_node, splited_ed_dist, splited_rest_nodes
Example #19
def setup_grpc_server(port):
    cfg = MyConfig()
    grpc_server = grpc.server(futures.ThreadPoolExecutor(
        max_workers=1),
        options=[
            ('grpc.max_receive_message_length', cfg.grpc_max_message_length.value),
            ('grpc.max_send_message_length', cfg.grpc_max_message_length.value)
        ],
        maximum_concurrent_rpcs=cfg.grpc_max_concurrent.value,
    )
    add_StructuringServicer_to_server(servicer=StructuringServer(), server=grpc_server)
    add_MetricsServicer_to_server(servicer=MetricsServer(cfg), server=grpc_server)
    add_HealthServicer_to_server(servicer=HealthServer(), server=grpc_server)
    grpc_server.add_insecure_port('[::]:%s' % port)
    grpc_server.start()
    logger.info('grpc server starts serving at %s' % port)
    return grpc_server


if __name__ == '__main__':
    config = MyConfig()
    multiprocessing.freeze_support()
    server = setup_grpc_server(config.grpc_port.value)
    validator = TimeValidator(datetime(year=2019, month=9, day=24), datetime(year=2029, month=9, day=25))
    try:
        while validator.validate():
            time.sleep(60)  # 1 minute
        server.stop(0)
        while True:
            time.sleep(60 * 60 * 24)  # 1 day
    except KeyboardInterrupt:
        server.stop(0)
        logger.info('grpc server stop serving')
Example #20
    def recheck_rows_v2(self, node_info, row_map, node_items, thresh=2):
        # This method does not solve every case; it only handles the "品"-shaped (staggered) layout problem.
        # The basic idea is simple: check every three adjacent rows. If a node in the first row and a node in
        # the third row are "touching", and the second row contains a node whose vertical position is close to
        # the midpoint between that pair, the three rows can be merged.

        if len(row_map) <= 2:
            return node_info, row_map

        # First, sort row_map from top to bottom
        ordered_row_map = sorted(row_map.items(), key=lambda x: x[1].bbox.top)

        # Initialize the search state
        new_row_group = []
        iter_mask = [False] * len(ordered_row_map)  # records which rows have already been handled

        for i in range(0, len(ordered_row_map)):
            if i in [len(ordered_row_map) - 1, len(ordered_row_map) - 2]:
                iter_mask[i] = True
                new_row_group.append(ordered_row_map[i][1].node_items)
            # Move to the next window of three rows
            if iter_mask[i]:
                continue
            up_row, middle_row, down_row = ordered_row_map[i:i + 3]

            # The comparison currently proceeds from top to bottom
            up_nodes = sorted(up_row[1].node_items, key=lambda x: x.bbox.left)
            down_nodes = sorted(down_row[1].node_items,
                                key=lambda x: x.bbox.left)
            middle_nodes = sorted(middle_row[1].node_items,
                                  key=lambda x: x.bbox.left)

            benchmark_pair = None
            for up_node in up_nodes:
                find_pair = False
                for down_node in down_nodes:
                    left_to_right = down_node.bbox.left - up_node.bbox.right
                    if left_to_right > 0:
                        break
                    left_align = abs(up_node.bbox.left - down_node.bbox.left)
                    right_align = abs(up_node.bbox.right -
                                      down_node.bbox.right)
                    middle_align = abs(up_node.bbox.cx - down_node.bbox.cx)

                    if min(left_align, right_align, middle_align) > np.mean(
                        [up_node.bbox.height, down_node.bbox.height]):
                        continue
                    if abs(down_node.bbox.top -
                           up_node.bbox.bottom) > 0.5 * np.mean(
                               [up_node.bbox.height, down_node.bbox.height]):
                        continue
                    find_pair = True
                    benchmark_pair = (up_node, down_node)
                    logger.info("find 品 shape pair {} , {}".format(
                        up_node.text, down_node.text))
                    break
                if find_pair:
                    break
            # Compute the center position between the benchmark pair
            if benchmark_pair is None:
                new_row_group.append(up_nodes)
                iter_mask[i] = True
                continue

            offset = abs(benchmark_pair[0].bbox.bottom -
                         benchmark_pair[1].bbox.top) / 2
            center_y = min(benchmark_pair[0].bbox.bottom,
                           benchmark_pair[1].bbox.top) + offset
            should_group = False
            for node in middle_nodes:
                if abs(node.bbox.cy - center_y) < node.bbox.height * 0.5:
                    should_group = True
                    break
            if should_group:
                new_group = up_nodes + middle_nodes + down_nodes
                new_row_group.append(new_group)
                iter_mask[i:i + 3] = True, True, True
            else:
                new_row_group.append(up_nodes)
                iter_mask[i] = True

        new_rows = []
        for row in new_row_group:
            new_rows.append(Line(row))

        node_info, row_map = self.make_node_info(new_rows)
        return node_info, row_map
Example #21
    def find_valid_keyrow(self, node_info, fields, rows, header_group):
        # Find a valid key row here; this also completes the filtering of header_group

        # Select the columns involved in the regexes
        used_fid_set = []
        for field in self.adaptive_fields:
            satisfy_status, fid_set = field.parse_header_requirement(fields)
            if not satisfy_status:
                continue
            used_fid_set.append(fid_set)
        # Collect the valid fids
        if not used_fid_set:
            return False, {}
        selected_fid = set.union(*used_fid_set)

        if len(selected_fid) == 0:
            # No column in this data satisfies this rule's header requirement
            return False, {}

        # First, drop rows whose number of columns does not meet the requirement
        filtered = node_info
        # filtered = node_info[node_info.num_fid_in_row >= len(self.adaptive_fields)]
        # if self.check_empty(filtered):
        #     return False, {}
        # Select the content of the columns this rule needs to consider
        filtered = filtered[filtered.fid.isin(selected_fid)]
        if self.check_empty(filtered):
            return False, {}

        # Check whether the characters and character types in these columns meet the requirements
        filtered = filtered[filtered.apply(self.map_func, axis=1)]
        if self.check_empty(filtered):
            return False, {}

        key_row = set(filtered.row_order.unique())

        for _, data in filtered.groupby('row_order'):
            content = '--'.join(data.text.to_list())
            logger.info('check row {} by {}'.format(content, self.regexs))

        # Filter key_row using the configured unexpected content
        after_filter_key_row = []
        for krow in key_row:
            node_in_this_row = node_info[node_info.row_order == krow]
            matched_unexpected = False
            for regexes in self.unexpected_content:
                matched_all = len(regexes) > 0
                for regex in regexes:
                    matched_filter_rule = node_in_this_row[
                        node_in_this_row.text.map(lambda x: re.search(
                            regex, x, re.IGNORECASE) is not None)]
                    if matched_filter_rule.shape[0] == 0:
                        # No element matches this regex
                        matched_all = False
                if matched_all:
                    matched_unexpected = True
            if matched_unexpected:
                continue
            else:
                after_filter_key_row.append(krow)
        key_row = set(after_filter_key_row)

        return True, key_row
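The row selection above relies on standard pandas boolean masking. A minimal standalone sketch of the same patterns (column names and the predicate are invented for illustration):

import pandas as pd

node_info = pd.DataFrame({
    'row_order': [0, 0, 1, 2],
    'text': ['Item', '12.00', 'subtotal', 'Tax'],
})

# Drop rows whose row_order is in an ignore set (cf. node_info.row_order.isin(...))
ignore = {1}
filtered = node_info[~node_info.row_order.isin(ignore)]

# Row-wise predicate via apply(..., axis=1) (cf. filtered.apply(self.map_func, axis=1))
mask = filtered.apply(lambda row: any(ch.isdigit() for ch in row.text), axis=1)
print(filtered[mask])  # keeps only the '12.00' row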
Example #22
    opts = parser.parse_args()
    debugger.enabled = True
    config = ProcessConfig(
        class_name=opts.class_name,
        primary_class=opts.primary_class,
        secondary_class=opts.secondary_class,
        use_img=opts.use_img,
        preload_tpl=opts.preload_tpl,
        process_count=opts.process_count,
        debug_server_addr=opts.debug_server_addr,
        lab_id=opts.lab_id,
        exp_id=opts.exp_id,
        raw_data_id=opts.raw_data_id,
        work_dir=opts.work_dir,
    )

    processor = Processor(config)
    if config.is_single_debug:
        # Process a single item
        processor.process_single(str(config.raw_data_id))
    else:
        logger.info('preparing debug data')
        prepare_data(config)
        logger.info('processing')
        processor.process()
        if config.debug_server_addr:
            logger.info('uploading experiment result...')
            upload_to_server(config)
            logger.info('experiment result uploaded successfully.')