Esempio n. 1
0
 def _match(self, pq, onlyOne = True, callback = None, target = None, llimiter = None, rlimiter = None):
     """
     Extract data from the selected elements with a regular expression.

     `target` is converted to a pattern by `pcre2re`. For every hit, capture
     group 1 is wrapped in `llimiter` / `rlimiter` (each defaulting to an
     empty string) and passed to `callback_result` together with `callback`.

     When `onlyOne` is truthy, each element's outer HTML is searched in order
     and the first hit is returned (None when nothing matches). Otherwise
     each element's text is searched and a list with one entry per matching
     element is returned (possibly empty).

     Returns None when `pq` is not a PyQuery instance or selects nothing.
     """
     if not isinstance(pq, PyQuery) or pq.length == 0:
         return None

     prefix = llimiter or ''
     suffix = rlimiter or ''
     regex = pcre2re(target)

     def render(found):
         # Shared formatting of a single regex hit.
         return callback_result(callback, prefix + found.group(1) + suffix)

     if not onlyOne:
         # Collect mode: search the text content of every element.
         results = []
         for index in range(pq.length):
             content = pq.eq(index).text()
             found = regex.search(content) if content else None
             if found:
                 results.append(render(found))
         return results

     # Single mode: search the outer HTML and stop at the first hit.
     for index in range(pq.length):
         markup = pq.eq(index).outer_html()
         found = regex.search(markup) if markup else None
         if found:
             return render(found)
     return None
Esempio n. 2
0
    def _htmlparse(self, pq, rule):
        """
        �则解析
        """
        if 'grep' in rule:
            pq = self._grep(pq, rule['grep'])
        if 'not_' in rule:
            pq = pq.not_(rule['not_'])
        if 'is_' in rule:
            pq = pq.is_(rule['is_'])
        if 'eq' in rule:
            idx = int(rule['eq']) or 0
            if idx < 0:
                idx = pq.length + idx
            pq = pq.eq(idx)
        if 'type' in rule:
            onlyOne = int(rule.get('onlyOne', 1))
            target = rule.get('target', None)
            callback = rule.get('callback', None)
            llimiter = rule.get('llimiter', None)
            rlimiter = rule.get('rlimiter', None)
            proccessFun = getattr(self, '_%s' % str(rule['type']))
            content = proccessFun(pq, target=target, onlyOne=onlyOne, callback=callback, llimiter=llimiter, rlimiter=rlimiter)
            if 'match' in rule and rule['match']:
                redata = pcre2re(rule['match']).search(content)
                if not redata:
                    return None
                if 'mkey' in rule:
                    rst = ((rule['mkey'] in redata.groups()) and redata.group(rule['mkey']) or None)
                    if isinstance(rst, six.string_types):
                        return self.patch_result(extract_result(rst, rule, None), rule, None)
                    return rst
                rst = redata.group(1)
                if isinstance(rst, six.string_types):
                    return self.patch_result(extract_result(rst, rule, None), rule, None)
                return rst
            if isinstance(content, six.string_types):
                return self.patch_result(extract_result(content, rule, None), rule, None)
            return content
        elif 'item' in rule:
            onlyOne = int(rule.get('onlyOne', 0))
            if onlyOne:
                parser = PyqueryParser(rule['item'], str(pq))
                return parser.parse()
            else:
                data = []
                for i in range(pq.length):
                    parser = PyqueryParser(rule['item'], str(pq.eq(i)))
                    data.append(parser.parse())

                return data
        else:
            return pq
Esempio n. 3
0
 def process_fun(el, rule):
     """
     Tell whether the value parsed from `el` with `rule` passes the filter.

     `self`, `match` and `params` are taken from the enclosing scope, which
     is outside this excerpt. With `match` set, the parsed value must be
     truthy and the `match` pattern must be found in it; otherwise, with
     `params` set, the parsed value must equal `params`; otherwise any
     truthy parsed value passes.
     """
     parsed = self._htmlparse(el, rule)
     if match:
         # Regex mode: an empty value is never searched.
         return bool(parsed and pcre2re(match).search(parsed))
     if params:
         # Exact-value mode.
         return bool(params == parsed)
     # Presence mode.
     return bool(parsed)
Esempio n. 4
0
 def _text(self, elements, **kwargs):
     for element in elements:
         text = element.text
         if 'match' in kwargs and kwargs['match']:
             pattern = utils.pcre2re(kwargs['text'])
             if text and pattern.search(text) and self._getable(
                     kwargs, element):
                 return element
         elif 'partial' in kwargs and kwargs['partial']:
             if text and text.find(kwargs['text']) != -1 and self._getable(
                     kwargs, element):
                 return element
         else:
             if text == kwargs['text'] and self._getable(kwargs, element):
                 return element
Esempio n. 5
0
 def _css(self, elements, **kwargs):
     if 'val' not in kwargs:
         raise CDSpiderSettingError('Selenium val must be not none',
                                    self._base_url,
                                    self.final_url,
                                    rule=kwargs)
     for element in elements:
         css = element.value_of_css_property(kwargs['css'])
         if 'match' in kwargs and kwargs['match']:
             pattern = utils.pcre2re(kwargs['val'])
             if css and pattern.search(css) and self._getable(
                     kwargs, element):
                 return element
         elif 'partial' in kwargs and kwargs['partial']:
             if css and css.find(kwargs['val']) != -1 and self._getable(
                     kwargs, element):
                 return element
         else:
             if css == kwargs['val'] and self._getable(kwargs, element):
                 return element
Esempio n. 6
0
 def _attr(self, elements, **kwargs):
     if 'val' not in kwargs:
         raise CDSpiderSettingError('Selenium val must be not none',
                                    self._base_url,
                                    self.final_url,
                                    rule=kwargs)
     for element in elements:
         attr = element.get_attribute(kwargs['attr'])
         if 'match' in kwargs and kwargs['match']:
             pattern = utils.pcre2re(kwargs['val'])
             if attr and pattern.search(attr) and self._getable(
                     kwargs, element):
                 return element
         elif 'partial' in kwargs and kwargs['partial']:
             if attr and attr.find(kwargs['val']) != -1 and self._getable(
                     kwargs, element):
                 return element
         else:
             if attr == kwargs['val'] and self._getable(kwargs, element):
                 return element