Exemple #1
0
    def go_json(rawData, ruler, cap=None):
        """Extract fields from a single JSON document.

        :param rawData: raw text containing the JSON document
        :param ruler: rule specification; it is expanded by
            ``ExtraHtml.ruler_killer_ex`` into a sequence of pairs whose
            first element is the output key and whose second element is
            the marker searched for in each flattened item
        :param cap: optional two-element sequence; when given, ``rawData``
            is first reduced with
            ``StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])``
            (presumably the text between the two markers - confirm against
            the helper)
        :return: dict mapping rule keys to the extracted values; when
            several items contain the same marker the last one wins
        """
        if cap is not None:
            rawData = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])

        rulers = ExtraHtml.ruler_killer_ex(ruler)
        decoded = demjson.decode(rawData)

        # dic_to_list fills the list passed to it in place.
        flat_items = []
        ExtraJSON.dic_to_list(decoded, flat_items)

        dic = {}
        for item in flat_items:
            for rule in rulers:
                # Every rule is tested against every item, so one item may
                # feed several keys.
                if rule[1] in item:
                    dic[rule[0]] = StringHelper.cut_head(item, rule[1] + ' ')

        return dic
Exemple #2
0
    def ruler_finder_ex(tag, ruler_pair, need_tag=False):
        """
        Core matcher for HTML scraping: test one soup tag against one rule.

        The rule text ``ruler_pair[1]`` selects one of five strategies, tried
        in this order:

        1. content rule - accepted by
           ``ExtraHtml.ruler_finder_condition_content``; the tag text must
           equal the content that helper extracts from the rule;
        2. ``at_loop`` rule - delegated to ``ExtraHtml.ruler_finder_multi_at``
           (an older note marks this as a probably abandoned feature);
        3. parent rule - ``child[ child-condition]:parent[ attr[=value]]``;
        4. self-attribute rule - ``name attr=value``, ``name attr``,
           ``name [keyword]`` or ``name cond1 attr=value``;
        5. bare tag name.

        If a strategy produced a result and the key contains ``*``, the value
        is rebuilt from the tag itself (see the star section at the end).

        :param tag: soup tag element being examined
        :param ruler_pair: ``(key, rule)`` pair describing what to look for
        :param need_tag: forwarded to ``ExtraHtml.finder_need_tag``; per the
            original note, when true the matched child tag is returned
            instead of its text
        :return: ``None`` when the tag does not match; otherwise normally a
            ``(key, value)`` tuple. For a content rule whose key requests
            the next sibling, whatever ``ExtraHtml.finder_need_tag`` returns
            is passed through unchanged.
        """
        result = None
        rule_key = ruler_pair[0]
        rule = ruler_pair[1]

        # Does the key ask for the matched tag's next sibling?
        next_sibling = bool(ExtraHtml.ruler_finder_condition_next_sibling(rule_key))

        # Match on the text between the tags, e.g. <span>hello</span>
        # is matched by the rule [hello].
        if ExtraHtml.ruler_finder_condition_content(rule):
            try:
                if tag.get_text() == ExtraHtml.ruler_finder_condition_content(rule, True):
                    if next_sibling:
                        return ExtraHtml.finder_need_tag(
                            tag, need_tag, next_sibling=True, ruler_pair=ruler_pair)
                    return (rule_key, ExtraHtml.finder_need_tag(tag, need_tag))
            except Exception:
                # Best effort: a tag that cannot be inspected is simply
                # treated as "no match".
                pass

        # Collect all children of the same type under one parent tag.
        # NOTE: this looks like an abandoned feature.
        elif ExtraHtml.ruler_finder_condition(rule) == 'at_loop':
            found = ExtraHtml.ruler_finder_multi_at(tag, rule)
            if found is not None:
                return (rule_key, found)

        # Match on the parent tag. A ':' that only belongs to 'http:' does
        # not count as a parent separator.
        elif rule.count(':') > rule.count('http:'):
            child_part, parent_part = rule.split(':', 1)

            # The child part may carry its own condition after the tag name.
            name_part = child_part.split(' ', 1)
            if tag.name != name_part[0]:
                return None

            # Parent part: tag name, optionally followed by "attr[=value]".
            parent_pair = parent_part.split(' ', 1)
            parent = tag.parent
            if parent is None or not isinstance(parent, Tag) or parent.name != parent_pair[0]:
                return None

            if len(parent_pair) == 1:
                # No attribute requirement: the parent name alone is enough.
                matched = True
            else:
                matched = False
                attr_name, _, attr_value = parent_pair[1].partition('=')
                if attr_name in parent.attrs:
                    actual = parent.attrs[attr_name]
                    if attr_value == '':
                        # "attr" or "attr=": only presence is required.
                        matched = True
                    else:
                        # Multi-valued attributes (e.g. class) are lists,
                        # single-valued ones (e.g. id) are plain strings.
                        if isinstance(actual, list):
                            actual = ' '.join(actual)
                        matched = actual == attr_value

            if matched:
                if len(name_part) > 1 and name_part[1] != '':
                    # The child part has its own condition: re-run the
                    # matcher with the parent requirement stripped off.
                    return ExtraHtml.ruler_finder_ex(tag, (rule_key, child_part), need_tag)
                result = (rule_key, ExtraHtml.finder_need_tag(tag, need_tag))

        # Match on the tag's own attributes.
        elif rule.count(' ') > 0 or rule.count('=') == 2:
            if rule.count('=') == 2:
                # "name cond1 attr=value": check the trailing attr=value,
                # then re-run the matcher on "name cond1".
                parts = rule.split(' ', 2)
                if len(parts) < 3:
                    # Malformed rule: nothing to match against.
                    return None

                if tag.name != parts[0]:
                    return None

                identify = parts[2].split('=', 1)
                try:
                    if tag.attrs[identify[0]] == identify[1]:
                        return ExtraHtml.ruler_finder_ex(
                            tag, (rule_key, parts[0] + ' ' + parts[1]), need_tag)
                except Exception:
                    # Missing attribute or incomplete condition: no match.
                    pass
            else:
                tag_name, condition = rule.split(' ', 1)

                # A single '^' in the name asks for the next sibling.
                next_flag = tag_name.count('^') == 1
                if next_flag:
                    tag_name = Sh.cut_head(tag_name, '^')

                if tag.name != tag_name:
                    return None

                # Keyword lookup: "name [keyword]".
                keyword = Sh.extra_a_to_b_x(condition, '[', ']')
                if condition.count('[') == 1 and condition.count(']') == 1:
                    if keyword in tag.text and len(tag.contents) <= 1:
                        if next_flag:
                            return (rule_key, ExtraHtml.finder_need_tag(tag, need_tag, True))
                        return (rule_key, ExtraHtml.finder_need_tag(tag, need_tag))

                equ_pair = condition.split('=', 1)
                attr_name = equ_pair[0]

                # An optional "(...)" piece in the attribute name is cut out
                # and used below as an operand for RulerExtra.canwecom.
                val = StringHelper.extra_a_to_b(attr_name, '(', ')')
                attr_name = StringHelper.delete_piece(attr_name, val)

                # Drop empty pieces so that "attr=" is handled like "attr".
                equ_pair = [piece for piece in equ_pair if piece != '']

                if len(equ_pair) > 1:
                    expected = equ_pair[1]
                    try:
                        att = tag.attrs[attr_name]
                        actual = ''
                        if isinstance(att, str):
                            actual = att
                        elif isinstance(att, list):
                            actual = " ".join(att)
                        if actual == expected:
                            result = (rule_key, ExtraHtml.finder_need_tag(tag, need_tag))
                    except Exception:
                        # Missing attribute: no match.
                        pass
                else:
                    # No value to compare: return the attribute itself,
                    # combined with the "(...)" operand minus its first and
                    # last characters.
                    try:
                        com = RulerExtra.canwecom(val[1:-1], tag.attrs[attr_name])
                        result = (rule_key, com)
                    except Exception:
                        return None

        # No attributes involved: match on the bare tag name.
        else:
            if tag.name == rule:
                if tag.text is not None and tag.text != '':
                    result = (rule_key, ExtraHtml.finder_need_tag(tag, need_tag))

        # Star handling: a '*' in the key changes how the value is built.
        if result is not None and len(result) > 1 and result[0].count('*') > 0:
            key = ''
            value = ''
            value_list = []

            triple = rule_key.count('***') == 1
            double = rule_key.count('**') == 1
            single = rule_key.count('*') == 1

            if rule_key.count('*') > 0:
                if len(tag.contents) > 1:
                    # Several children: build the value child by child.
                    pieces = []
                    for con in tag:
                        if not isinstance(con, Tag):
                            continue
                        if triple:
                            # '***': collect from <p> children only.
                            if con.name == 'p':
                                ExtraHtml.ruler_finder_recursion_dig_for_p(con, value_list)
                        elif double:
                            # '**': text only, markup dropped.
                            pieces.append(con.get_text())
                        elif single:
                            # '*': markup preserved.
                            pieces.append(str(con))
                    value = "".join(pieces)
                else:
                    # Single child (or none): work on the tag itself.
                    if double:
                        value = tag.get_text()
                    elif single:
                        value = str(tag)

                # Clean-up: '***' and '**' values lose all whitespace, and
                # the key is passed through StringHelper.cutfrom with its
                # star marker.
                if triple:
                    value = "".join("".join(value_list).split())
                    key = StringHelper.cutfrom(rule_key, '***')
                elif double:
                    value = "".join(value.split())
                    key = StringHelper.cutfrom(rule_key, '**')
                elif single:
                    key = StringHelper.cutfrom(rule_key, '*')

            # NOTE(review): the two branches below look unreachable, because
            # every result that gets here was built with ruler_pair[0] as its
            # key, so the '*' test above always succeeds - confirm before
            # removing.
            elif rule_key.count('^') > 0:
                value = "".join(tag.text.split())
                key = StringHelper.cutfrom(rule_key, '^')
            else:
                key = rule_key
                value = tag.text

            if value == '':
                # Fall back to the value found by the matching strategy.
                print('info: in ruler_finder_ex star filter -> ' + result[0] + ' got empty result!')
                value = result[1]

            return (key, value)

        return result