Exemple #1
0
    def get_value(self, value, *processors, **kw):
        """Run ``value`` through an optional regex and a chain of processors.

        If a truthy ``re`` keyword is supplied, ``value`` is turned into an
        iterable with ``arg_to_iter``, ``extract_regex`` is applied to every
        element and the results are flattened.

        Each processor is then wrapped with the loader context and called
        with the current value, in order. The chain stops as soon as the
        value is ``None``.
        """
        pattern = kw.get('re')
        if pattern:
            items = arg_to_iter(value)
            value = flatten(extract_regex(pattern, item) for item in items)

        for processor in processors:
            if value is None:
                # Nothing left to process; skip the remaining processors.
                return value
            bound = wrap_loader_context(processor, self.context)
            value = bound(value)
        return value
Exemple #2
0
    def get_value(self, value, *processors, **kw):
        """Return ``value`` after regex extraction and processor application.

        ``kw['re']`` (optional): when truthy, every element of
        ``arg_to_iter(value)`` is passed to ``extract_regex`` and the
        flattened matches replace ``value``.

        ``processors`` are applied left to right, each one wrapped with
        ``self.context`` first; processing is cut short once the value
        is ``None``.
        """
        regex = kw.get('re')
        if regex:
            value = flatten(
                extract_regex(regex, element) for element in arg_to_iter(value)
            )

        remaining = iter(processors)
        while value is not None:
            try:
                step = next(remaining)
            except StopIteration:
                break
            value = wrap_loader_context(step, self.context)(value)
        return value
Exemple #3
0
 def handle_starttag(self, tag, attrs):
     """Update the parser state flags for an opening tag.

     Outside the table, only a ``<table id="dgResult">`` has an effect: it
     sets ``in_table``. Once inside, ``<tr>`` sets ``in_row``, ``<td>``
     sets ``in_cell``, and an ``<a>`` seen while ``current_row`` holds
     exactly seven entries has the numeric argument of its
     ``javascript:VerifyID(...)`` href stored in ``current_cell``.
     """
     attributes = dict(attrs)

     if not self.in_table:
         # Still looking for the table that carries id="dgResult".
         if tag == 'table' and attributes.get('id') == 'dgResult':
             self.in_table = True
         return

     if tag == 'tr':
         self.in_row = True
         return

     if tag == 'td':
         self.in_cell = True
         return

     if tag == 'a' and len(self.current_row) == 7:
         href = attributes['href']
         matches = extract_regex(r'javascript:VerifyID\((\d+)\)', href)
         self.current_cell = matches[0]
Exemple #4
0
    def get_value(self, value, *processors, **kw):
        """Apply an optional regex and then each processor to ``value``.

        A truthy ``re`` keyword causes ``extract_regex`` to be run over
        every element of ``arg_to_iter(value)``, with the results flattened.

        Processors are wrapped with the loader context and called in order
        until the value becomes ``None``.

        Raises:
            ValueError: if a processor raises any ``Exception``; the message
                names the processor's class, the value it was given and the
                original error.
        """
        pattern = kw.get('re')
        if pattern:
            candidates = arg_to_iter(value)
            value = flatten(extract_regex(pattern, c) for c in candidates)

        for original in processors:
            if value is None:
                break
            wrapped = wrap_loader_context(original, self.context)
            try:
                value = wrapped(value)
            except Exception as e:
                # Report the unwrapped processor: the wrapper would hide
                # which processor actually failed.
                details = (
                    original.__class__.__name__,
                    value,
                    type(e).__name__,
                    str(e),
                )
                raise ValueError(
                    "Error with processor %s value=%r error='%s: %s'" % details
                )
        return value
Exemple #5
0
    def get_value(self, value, *processors, **kw):
        """Transform ``value`` with optional regexes and then processors.

        Keyword options:
            re: when truthy, ``extract_regex`` is applied to each element of
                ``arg_to_iter(value)`` and the results are flattened.
            grouped: when truthy, it is compiled with ``re.UNICODE`` and each
                element of the (possibly regex-filtered) value is scanned
                with ``finditer``; the value becomes a list holding the
                ``groupdict()`` of every match.

        Afterwards each processor, wrapped with the loader context, is
        applied in order; the chain stops once the value is ``None``.
        """
        pattern = kw.get('re')
        if pattern:
            value = flatten(
                extract_regex(pattern, item) for item in arg_to_iter(value)
            )

        group_pattern = kw.get('grouped')
        if group_pattern:
            compiled = re.compile(group_pattern, re.UNICODE)
            records = []
            for text in arg_to_iter(value):
                for match in compiled.finditer(text):
                    records.append(match.groupdict())
            value = records

        for processor in processors:
            if value is None:
                break
            value = wrap_loader_context(processor, self.context)(value)
        return value
Exemple #6
0
 def re(self, regex):
     """Apply ``regex`` to this object's extracted content.

     The content comes from ``self.extract()`` and is handed, together
     with ``regex``, to ``extract_regex``, whose result is returned.
     """
     extracted = self.extract()
     return extract_regex(regex, extracted)
Exemple #7
0
 def re(self, regex):
     """Return ``extract_regex(regex, ...)`` run over ``self.extract()``."""
     content = self.extract()
     matches = extract_regex(regex, content)
     return matches
Exemple #8
0
 def re(self, regex):
     """Run ``regex`` over everything currently selected.

     The extracted selections are passed to ``extract_regex`` with
     ``'utf-8'`` as its third argument; the flattened list of unicode
     matches is returned.
     """
     selected = self.extract()
     return extract_regex(regex, selected, 'utf-8')
Exemple #9
0
    def test_deprecated_extract_regex(self):
        """Calling ``extract_regex`` must emit exactly one
        ``ScrapyDeprecationWarning``."""
        with warnings.catch_warnings(record=True) as w:
            # Make sure the warning is always delivered to the recorder.
            # Without this, the filters that happen to be active when the
            # test runs decide whether anything is recorded, which makes
            # the assertions below depend on ambient state. The filter
            # change is undone when the context manager exits.
            warnings.simplefilter('always')
            extract_regex(r'\w+', 'this is a test')

            assert len(w) == 1
            assert issubclass(w[0].category, ScrapyDeprecationWarning)
Exemple #10
0
 def extract_id(self, id_cell):
     """Pull the ``VerifyID`` argument out of the first link in ``id_cell``.

     The ``href`` of the first ``<a>`` found under ``id_cell`` is matched
     against ``javascript:VerifyID(<digits>)``; the captured digits are
     returned after being passed through ``arg_to_iter``.
     """
     first_link = id_cell.xpath('.//a')[0]
     href = first_link.attrib['href']
     matches = extract_regex(r'javascript:VerifyID\((\d+)\)', href)
     return arg_to_iter(matches)