Exemple #1
0
    def looper_html(self, result, raw, exists, ruler, extra3_tup, extra4_tup):
        """
        Extract list entries from ``raw`` and append the not-yet-seen ones to ``result``.

        Iteration stops at the first entry that either has no ``'link'`` or whose
        link is already contained in ``exists``; entries after it are not examined.

        :param result: list mutated in place; accepted entries are appended to it.
        :param raw: markup handed to ``ExtraHtml.any_list_finder_ex``.
        :param exists: container of links collected earlier; only membership is tested.
        :param ruler: rule passed through to ``ExtraHtml.tag_list_to_ruler_list_ex``.
        :param extra3_tup: string parsed by ``Sh.str_to_dictup``; its first two
            elements form the parent-container pair.
        :param extra4_tup: string parsed by ``Sh.str_to_dictup``; its first two
            elements form the list-item pair.
        :return: None; the output is delivered through ``result``.
        """
        parent_parts = Sh.str_to_dictup(extra3_tup)
        parent_container = (parent_parts[0], parent_parts[1])

        item_parts = Sh.str_to_dictup(extra4_tup)
        list_tup = (item_parts[0], item_parts[1])

        # Locate the list entries; each one is converted to a dict below.
        entries = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)

        for entry in entries:
            if len(entry) < 1:
                continue

            dic_list = ExtraHtml.tag_list_to_ruler_list_ex(entry, ruler)

            # Duplicate check used during routine crawling.
            try:
                link = dic_list['link']
            except (LookupError, TypeError):
                # No usable link on this entry: stop, exactly as before, but
                # without swallowing KeyboardInterrupt/SystemExit the way a
                # bare ``except:`` did.
                break

            # Membership test short-circuits on the first hit instead of
            # counting every occurrence in ``exists``.
            if link in exists:
                break

            result.append(dic_list)
Exemple #2
0
    def ruler_finder_multi_at(tag, ruler2):
        """
        Collect converted list items from ``tag`` when it matches the parent part of ``ruler2``.

        ``ruler2`` is split once on ``'@'``. The text after the ``'@'`` is parsed by
        ``Sh.str_to_dictup`` into the parent-container pair, and the text before it
        into the list-item pair.

        :param tag: object exposing ``name`` and ``attrs``; ``str(tag)`` is used as
            the markup to search. NOTE(review): presumably a BeautifulSoup Tag - confirm.
        :param ruler2: rule string of the form ``<list part>@<parent part>``.
        :return: a list with one converted entry per non-empty list item when
            ``tag.name`` and the first parent attribute both match; ``None`` when
            they do not match or when anything inside the ``try`` block raises.
        :raises IndexError: before the ``try`` block, when ``ruler2`` contains no
            ``'@'`` or when the parsed parent dic yields no key (``s_ruler`` stays empty).
        """
        result = None
        pair = ruler2.split('@', 1)

        # Part after '@' describes the parent container.
        tup = Sh.str_to_dictup(pair[1])
        _tag = tup[0]
        dic = tup[1]
        parent_container = (_tag, dic)

        # Part before '@' describes the list items inside that container.
        tup = Sh.str_to_dictup(pair[0])
        _tag = tup[0]
        dic = tup[1]
        list_tup = (_tag, dic)

        p_tag = parent_container[0]
        p_dic = parent_container[1]
        s_ruler = ''

        # Only the first key of the parent dic is used to build "<tag> <key>=<value>".
        for key in p_dic:
            s_ruler = p_tag + ' ' + key + '=' + p_dic[key]
            break
        ruler_pair = ('', s_ruler)

        # Split the rule string back into tag name and "key=value" identifier.
        ruler_pair_part_of = ruler_pair[1].split(' ', 2)
        identify = ruler_pair_part_of[1].split('=', 1)
        if tag.name != ruler_pair_part_of[0]:
            return result

        try:
            # NOTE(review): if attrs values can be lists (multi-valued attributes),
            # this comparison against a string is always False - confirm.
            if tag.attrs[identify[0]] == identify[1]:
                result = []
                raw = str(tag)

                # Find the list entries; each one is converted to a dict below.
                list = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)

                for item in list:
                    if len(item) < 1:
                        continue

                    # NOTE(review): `ruler` is neither a parameter nor a local here.
                    # Unless a module-level `ruler` exists, this raises NameError,
                    # which the bare except below turns into a None result - confirm
                    # which rule was meant to be passed.
                    dic_list = ExtraHtml.tag_list_to_ruler_list_ex(item, ruler)

                    result.append(dic_list)

                return result
        except:
            # Any failure (missing attribute, helper error, ...) discards partial results.
            result = None

        return result
Exemple #3
0
    def looper_html_ex(self, result, raw, ruler, extra3_tup, extra4_tup):
        """
        Append every non-empty list entry found in ``raw`` to ``result``.

        No duplicate check is performed here.

        :param result: list mutated in place; one converted entry is appended per
            non-empty list item.
        :param raw: markup handed to ``ExtraHtml.any_list_finder_ex``.
        :param ruler: rule passed through to ``ExtraHtml.tag_list_to_ruler_list_ex``.
        :param extra3_tup: string parsed by ``Sh.str_to_dictup``; its first two
            elements form the parent-container pair.
        :param extra4_tup: string parsed by ``Sh.str_to_dictup``; its first two
            elements form the list-item pair.
        :return: None; the output is delivered through ``result``.
        """
        container_parts = Sh.str_to_dictup(extra3_tup)
        parent_container = (container_parts[0], container_parts[1])

        entry_parts = Sh.str_to_dictup(extra4_tup)
        list_tup = (entry_parts[0], entry_parts[1])

        # Locate the list entries, then convert each non-empty one to a dict.
        for entry in ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup):
            if len(entry) >= 1:
                converted = ExtraHtml.tag_list_to_ruler_list_ex(entry, ruler)
                result.append(converted)
Exemple #4
0
    def extract_dic_list_from_page(self, result, url, parent_attr, list_attr, ruler, exists:list=None, encode:str=None):
        """
        Fetch a page and append the dicts extracted from its list entries to ``result``.

        :param result: list mutated in place; accepted entries are appended to it.
        :param url: address fetched through ``self.req.get``.
        :param parent_attr: string parsed by ``Sh.str_to_dictup``; its first two
            elements form the parent-container pair.
        :param list_attr: string parsed by ``Sh.str_to_dictup``; its first two
            elements form the list-item pair.
        :param ruler: dict rule passed through to ``ExtraHtml.tag_list_to_ruler_list_ex``.
        :param exists: optional collection of already-known links. When given,
            extraction stops at the first entry whose ``'link'`` is already in it;
            when ``None``, every non-empty entry is appended.
        :param encode: forwarded to ``self.req.get`` as the ``encode`` keyword.
        :return: None; the output is delivered through ``result``.
        """
        page_source = self.req.get(url, encode=encode)

        parent_parts = Sh.str_to_dictup(parent_attr)
        parent_container = (parent_parts[0], parent_parts[1])

        item_parts = Sh.str_to_dictup(list_attr)
        list_tup = (item_parts[0], item_parts[1])

        # Locate the list entries; each one is converted to a dict below.
        entries = ExtraHtml.any_list_finder_ex(page_source, parent_container, list_tup)

        for entry in entries:
            if len(entry) == 0:
                continue

            record = ExtraHtml.tag_list_to_ruler_list_ex(entry, ruler)

            # Without a known-links collection there is nothing to de-duplicate against.
            if exists is None:
                result.append(record)
                continue

            # Duplicate check while crawling: stop at the first already-known link.
            if exists.count(record['link']) >= 1:
                break
            result.append(record)





# Backup 1204: commented-out earlier version of ruler_finder_ex, kept for reference.
# @staticmethod
#     def ruler_finder_ex(tag, ruler_pair, need_tag=False):
#         result = None
#         ruler2 = ruler_pair[1]
#
#         #在某个父tag 下取得所有相同类型的子tag
#         # 这个好像是个废掉的功能
#         if ExtraHtml.ruler_finder_condition(ruler2) == 'at_loop':
#             re = ExtraHtml.ruler_finder_multi_at(tag, ruler2)
#
#             if re is not None:
#                 result = (ruler_pair[0], re)
#                 return result
#         #依据父属性做查找
#         elif ruler_pair[1].count(':') > 0 and ruler_pair[1].count(':') > ruler_pair[1].count('http:'):
#             status = False
#             _pair = ruler_pair[1].split(':', 1)
#             name = _pair[0]#可能 是子标签
#             name_part = []
#
#             if name.count(' ') > 0:#可能 有属性
#                 name_part = name.split(' ', 1)#可能 属性
#                 name = name_part[0]#可能 标签
#
#             if tag.name != name:#可能 不是此标签,跳过
#                 return None
#
#             parent_pair = _pair[1].split(' ', 1)#可能 父标签部分
#             parent_name = parent_pair[0]#可能 父标签名
#
#             if tag.parent == None or not isinstance(tag.parent, Tag) or tag.parent.name != parent_name:#可能 被遍历的tag没有父元素 或者 父元素 不是 tag 或者 父元素名不对 跳过
#                 return None
#
#             if len(parent_pair) == 1:#可能 没有添加属性 那么 父元素的要求已达到
#                 status = True
#
#             parent_attr = parent_pair[1].split('=', 1)#父标签的属性
#
#             if len(parent_attr) == 1 or (len(parent_attr) > 1 and parent_attr[1] == None or parent_attr[1] == ''):#可能* 如果有父标签的属性 或者 等号后为空
#                 try:
#                     attr = tag.parent.attrs[parent_attr[0]]
#                     status = True
#                 except:
#                     pass
#
#             if len(parent_attr) > 1 and parent_attr[1] != None or parent_attr[1] != '' :
#                 try:
#                     if ' '.join(tag.parent.attrs[parent_attr[0]]) == parent_attr[1]:
#                         status = True
#                 except:
#                     pass
#
#             if status:
#                 if len(name_part) > 1 and name_part[1] != None and name_part != '':
#                     return ExtraHtml.ruler_finder_ex(tag,(ruler_pair[0], _pair[0]))
#                 else:
#                     result = (ruler_pair[0], tag.text)
#
#         #依据自身属性
#         elif ruler_pair[1].count(' ') > 0 or ruler_pair[1].count('=') == 2:
#             if ruler_pair[1].count('=') == 2:
#                 ruler_pair_part_of = ruler_pair[1].split(' ', 2)
#                 identify = ruler_pair_part_of[2].split('=', 1)
#
#                 if tag.name != ruler_pair_part_of[0]:
#                     return None
#
#                 try:
#                     if tag.attrs[identify[0]] == identify[1]:
#                         return ExtraHtml.ruler_finder_ex(tag, (ruler_pair[0], ruler_pair_part_of[0] + ' ' + ruler_pair_part_of[1]))
#                 except:
#                     pass
#             else:
#                 _pair = ruler_pair[1].split(' ', 1)
#                 _name = _pair[0]
#
#                 next_flag = False
#
#                 if _name.count('^') == 1:
#                     next_flag = True
#                     _name = Sh.cut_head(_name, '^')
#
#                 if tag.name != _name:
#                     return None
#
#                 # 关键字查询
#                 keyword = Sh.extra_a_to_b_x(_pair[1], '[', ']')
#                 if _pair[1].count('[') == 1 and _pair[1].count(']') == 1:
#                     if keyword in tag.text and len(tag.contents) <= 1:
#                         if next_flag:
#                             return (ruler_pair[0], tag.next_sibling.text)
#                         else:
#                             return (ruler_pair[0], tag.text)
#
#                 equ_pair = _pair[1].split('=',1)
#
#                 _attr = equ_pair[0]
#
#                 val = StringHelper.extra_a_to_b(_attr, '(', ')')
#                 _attr = StringHelper.delete_piece(_attr, val)
#
#                 for item in equ_pair:
#                     if item == '':
#                         equ_pair.remove(item)
#
#                 if len(equ_pair) > 1:
#                     value = equ_pair[1]
#
#                     try:
#                         att = tag.attrs[_attr]
#                         _value = ''
#                         if isinstance(att, str):
#                             _value = att
#                         elif isinstance(att, list):
#                             _value = " ".join(tag.attrs[_attr])
#                         if _value == value:
#                             result = (ruler_pair[0], tag.text)
#                             # return (ruler_pair[0], tag.text)
#                     except:
#                         pass
#                 else:
#                     try:
#                         val = val[1:len(val) - 1]
#
#                         ed = tag.attrs[_attr]
#                         op = val
#                         # com = op + ed
#                         com = RulerExtra.canwecom(op, ed)
#
#                         # return (ruler_pair[0], com)
#                         result = (ruler_pair[0], com)
#                     except:
#                         return None
#                 # return (ruler_pair[0], tag.attrs[_attr])
#
#         # 不通过属性 直接查找
#         else:
#             if tag.name == ruler_pair[1]:
#                 if tag.text is not None and tag.text != '':
#                     # return (ruler_pair[0], tag.text)
#                     result = (ruler_pair[0], tag.text)
#
#         # 星判断
#         if result != None and len(result) > 1 and result[0].count('*') > 0:
#             key = ''
#             value = ''
#             value_list = []
#             if ruler_pair[0].count('*') > 0:
#                 if len(tag.contents) > 1:#多重判断
#                     for con in tag:
#                         if isinstance(con, Tag):
#                             #提取p标签
#                             if ruler_pair[0].count('***') == 1:
#                                 if con.name == 'p':
#                                     # value += con.get_text()
#                                     # value += ExtraHtml.ruler_finder_recursion_dig_for_p(con)
#                                     ExtraHtml.ruler_finder_recursion_dig_for_p(con, value_list)
#                             # 删除所有标签
#                             elif ruler_pair[0].count('**') == 1:
#                                 value += con.get_text()
#                             # 保留标签
#                             elif ruler_pair[0].count('*') == 1:
#                                 value += str(con)
#
#                 else:#单tag 判断
#                     if ruler_pair[0].count('**') == 1:
#                         value += tag.get_text()
#                     elif ruler_pair[0].count('*') == 1:
#                         value += str(tag)
#
#                 #善后
#                 if ruler_pair[0].count('***') == 1:
#                     # value = "".join(value.split())
#                     value = "".join(value_list)
#                     value = "".join(value.split())
#                     key = StringHelper.cutfrom(ruler_pair[0], '***')
#                 elif ruler_pair[0].count('**') == 1:
#                     value = "".join(value.split())
#                     key = StringHelper.cutfrom(ruler_pair[0], '**')
#                 elif ruler_pair[0].count('*') == 1:
#                     key = StringHelper.cutfrom(ruler_pair[0], '*')
#
#             elif ruler_pair[0].count('^') > 0:
#                 value = tag.text
#                 value = "".join(value.split())
#                 key = StringHelper.cutfrom(ruler_pair[0], '^')
#             else:
#                 key = ruler_pair[0]
#                 value = tag.text
#
#             if value == '':
#                 print('info: in ruler_finder_ex star filter -> ' + result[0] + ' got empty result!')
#                 value = result[1]
#
#             return (key, value)
#         elif result == None:
#             return None
#         else:
#             return result