Esempio n. 1
0
    def _htmlparse(self, pq, rule):
        """
        �则解析
        """
        if 'grep' in rule:
            pq = self._grep(pq, rule['grep'])
        if 'not_' in rule:
            pq = pq.not_(rule['not_'])
        if 'is_' in rule:
            pq = pq.is_(rule['is_'])
        if 'eq' in rule:
            idx = int(rule['eq']) or 0
            if idx < 0:
                idx = pq.length + idx
            pq = pq.eq(idx)
        if 'type' in rule:
            onlyOne = int(rule.get('onlyOne', 1))
            target = rule.get('target', None)
            callback = rule.get('callback', None)
            llimiter = rule.get('llimiter', None)
            rlimiter = rule.get('rlimiter', None)
            proccessFun = getattr(self, '_%s' % str(rule['type']))
            content = proccessFun(pq, target=target, onlyOne=onlyOne, callback=callback, llimiter=llimiter, rlimiter=rlimiter)
            if 'match' in rule and rule['match']:
                redata = pcre2re(rule['match']).search(content)
                if not redata:
                    return None
                if 'mkey' in rule:
                    rst = ((rule['mkey'] in redata.groups()) and redata.group(rule['mkey']) or None)
                    if isinstance(rst, six.string_types):
                        return self.patch_result(extract_result(rst, rule, None), rule, None)
                    return rst
                rst = redata.group(1)
                if isinstance(rst, six.string_types):
                    return self.patch_result(extract_result(rst, rule, None), rule, None)
                return rst
            if isinstance(content, six.string_types):
                return self.patch_result(extract_result(content, rule, None), rule, None)
            return content
        elif 'item' in rule:
            onlyOne = int(rule.get('onlyOne', 0))
            if onlyOne:
                parser = PyqueryParser(rule['item'], str(pq))
                return parser.parse()
            else:
                data = []
                for i in range(pq.length):
                    parser = PyqueryParser(rule['item'], str(pq.eq(i)))
                    data.append(parser.parse())

                return data
        else:
            return pq
Esempio n. 2
0
 def _filter(self, doc, rule, onlyOne=1):
     if isinstance(rule, dict):
         if 'filter' in rule:
             if not rule['filter']:
                 return None
             data = self.match(doc, rule['filter'], onlyOne)
             if 'item' in rule:
                 onlyOne = int(rule.get('onlyOne', 0))
                 return self._item_filter(data, rule, onlyOne)
             else:
                 rule.setdefault('type', 'text')
                 callback = rule.get('callback', None)
                 if isinstance(data, list):
                     return [
                         self.patch_result(
                             extract_result(self.f(item, rule), rule, None),
                             rule, callback) for item in data
                     ]
                 return self.patch_result(
                     extract_result(self.f(data, rule), rule, None), rule,
                     callback)
         elif 'item' in rule:
             onlyOne = bool(int(rule.get('onlyOne', 0)))
             return self._item_filter(doc, rule, onlyOne)
         else:
             data = {}
             for k in rule:
                 data[k] = self._filter(doc, rule[k])
             return data
     elif isinstance(rule, list):
         rst = []
         for item in rule:
             data = self._filter(doc, item)
             rst.append(data)
         return rst
     else:
         return self.match(doc, rule)
Esempio n. 3
0
    def process(self, raw_html, final_url, link_hash, encoding=None):
        """
        Build the document for ``raw_html``, record it on the catalogue and,
        if the config carries a custom rule, extract data with that rule.

        With a custom rule that has a truthy ``item`` entry, each item rule
        is extracted (after optionally narrowing the document with the
        rule's ``filter``), every value is wrapped in a list if it is not
        one already, and the result of ``utils.table2kvlist`` is stored in
        ``catalogue.data``.  Otherwise every key of the custom rule except
        ``onlyOne`` is extracted as a field, and ``catalogue.data`` becomes
        a one-element list holding the resulting dict.

        :param raw_html: raw page content handed to ``self.get_document``
        :param final_url: URL stored on the catalogue; falls back to
            ``self.config.final_url`` when falsy
        :param link_hash: value stored on the catalogue as ``_link_hash``
        :param encoding: optional encoding passed to ``self.get_document``
        :return: ``self.catalogue``
        """
        # create document
        doc = self.get_document(raw_html, encoding)

        # catalogue
        self.catalogue._final_url = final_url or self.config.final_url
        self.catalogue._link_hash = link_hash
        self.catalogue._raw_html = raw_html
        self.catalogue._doc = doc
        self.catalogue._raw_doc = deepcopy(doc)

        custom_rule = self.config.custom_rule
        if custom_rule:
            data = {}
            # Read ``onlyOne`` without removing it: popping it from the
            # config's own dict would make every later call fall back to 1.
            onlyOne = custom_rule.get('onlyOne', 1)
            if custom_rule.get('item'):
                if custom_rule.get('filter'):
                    doc = self.extractor.custom_match_elements(
                        custom_rule['filter'], doc=doc)
                self.catalogue._doc = doc
                for key, rule in custom_rule['item'].items():
                    parsed = self.extractor.extract(key, rule, onlyOne)
                    parsed = utils.patch_result(parsed, rule)
                    parsed = utils.extract_result(parsed, rule)
                    data[key] = parsed if isinstance(parsed, list) else [parsed]
                self.catalogue.data = utils.table2kvlist(data)
            else:
                for key, rule in custom_rule.items():
                    if key == 'onlyOne':
                        # An option, not a field rule.
                        continue
                    parsed = self.extractor.extract(key, rule)
                    parsed = utils.patch_result(parsed, rule)
                    parsed = utils.extract_result(parsed, rule)
                    data[key] = parsed
                self.catalogue.data = [data]
        return self.catalogue
Esempio n. 4
0
 def _filter(self, data, rule):
     if isinstance(rule, dict):
         if 'filter' in rule:
             if not rule['filter']:
                 return None
             if rule['filter'].startswith("@json:"):
                 rule['filter'] = rule['filter'][6:]
             rst = self._json_parse(data, rule['filter'])
             if rst is None:
                 return None
             if 'item' in rule:
                 onlyOne = bool(int(rule.get('onlyOne', 0)))
                 return self._item_filter(rst, rule, onlyOne)
             else:
                 callback = rule.get('callback', None)
                 return self.patch_result(utils.extract_result(rst, rule, None), rule, callback)
         elif 'item' in rule:
             onlyOne = bool(int(rule.get('onlyOne', 0)))
             rest = {}
             for item in rule['item']:
                 rest[item] = self._filter(copy.deepcopy(data), rule['item'][item])
             rst = utils.table2kvlist(rest, extend=True)
             if onlyOne:
                 return rst[0]
             return rst
         else:
             rst = {}
             for key, val in rule.items():
                 rst[key] = self._filter(data, val)
             return rst
     elif isinstance(rule, list):
         rst = []
         for item in rule:
             rest = self._filter(copy.deepcopy(data), item)
             if rest:
                 rst.append(rest)
         return rst
     else:
         return self._filter(data, {"filter": rule})
Esempio n. 5
0
 def extract_result(self, data, rule, callback=None):
     """
     Method wrapper that forwards ``data``, ``rule`` and ``callback``
     unchanged to the module-level ``extract_result`` function and returns
     whatever it returns.
     """
     result = extract_result(data, rule, callback)
     return result