def get_value(self, value, *processors, **kw):
    """Run ``value`` through an optional regex and a chain of processors.

    If the ``re`` keyword argument is truthy, ``value`` is turned into an
    iterable with ``arg_to_iter``, ``extract_regex`` is applied to each
    element and the per-element results are combined with ``flatten``.

    Each processor is then wrapped with ``wrap_loader_context`` (using
    ``self.context``) and called on the current value, in the order given.
    The chain stops as soon as the value is ``None``.

    Returns the final value.
    """
    pattern = kw.get('re', None)
    if pattern:
        candidates = arg_to_iter(value)
        value = flatten(extract_regex(pattern, item) for item in candidates)

    for processor in processors:
        # A None value short-circuits the remaining processors.
        if value is None:
            break
        bound = wrap_loader_context(processor, self.context)
        value = bound(value)
    return value
def handle_starttag(self, tag, attrs):
    """Update parser state for a start tag.

    Until a ``<table>`` whose ``id`` attribute equals ``dgResult`` is seen,
    every tag is ignored. Once inside that table:

    * ``<tr>`` sets ``self.in_row``;
    * ``<td>`` sets ``self.in_cell``;
    * ``<a>``, when ``self.current_row`` already holds seven entries, has the
      number inside ``javascript:VerifyID(<digits>)`` pulled from its
      ``href`` and stored in ``self.current_cell``.

    Anchors without an ``href``, or whose ``href`` does not match the
    ``VerifyID`` pattern, are skipped instead of raising ``KeyError`` /
    ``IndexError`` and aborting the parse.

    :param tag: name of the tag being opened.
    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    attrs = dict(attrs)

    if not self.in_table:
        # Nothing is tracked until the results table itself opens.
        if tag == 'table' and attrs.get('id') == 'dgResult':
            self.in_table = True
        return

    if tag == 'tr':
        self.in_row = True
    elif tag == 'td':
        self.in_cell = True
    elif tag == 'a' and len(self.current_row) == 7:
        href = attrs.get('href')
        if not href:
            # Anchor without a usable href: there is no id to extract.
            return
        matches = extract_regex(r'javascript:VerifyID\((\d+)\)', href)
        if matches:
            self.current_cell = matches[0]
def get_value(self, value, *processors, **kw):
    """Run ``value`` through an optional regex and a chain of processors.

    If the ``re`` keyword argument is truthy, ``value`` is turned into an
    iterable with ``arg_to_iter``, ``extract_regex`` is applied to each
    element and the per-element results are combined with ``flatten``.

    Each processor is then wrapped with ``wrap_loader_context`` (using
    ``self.context``) and called on the current value, in the order given.
    The chain stops as soon as the value is ``None``.

    Raises ``ValueError`` when a processor raises any ``Exception``; the
    message names the class of the unwrapped processor, the value it was
    given, and the type and text of the underlying error.

    Returns the final value.
    """
    pattern = kw.get('re', None)
    if pattern:
        candidates = arg_to_iter(value)
        value = flatten(extract_regex(pattern, item) for item in candidates)

    for processor in processors:
        if value is None:
            break
        # Keep `processor` unwrapped so the error message reports it rather
        # than the context wrapper.
        bound = wrap_loader_context(processor, self.context)
        try:
            value = bound(value)
        except Exception as exc:
            raise ValueError(
                "Error with processor %s value=%r error='%s: %s'"
                % (processor.__class__.__name__, value,
                   type(exc).__name__, str(exc))
            )
    return value
def get_value(self, value, *processors, **kw):
    """Run ``value`` through optional regex steps and a chain of processors.

    Steps, in order:

    1. If the ``re`` keyword argument is truthy, ``extract_regex`` is applied
       to every element of ``arg_to_iter(value)`` and the results are
       combined with ``flatten``.
    2. If the ``grouped`` keyword argument is truthy, it is compiled with
       ``re.UNICODE`` and every match found by ``finditer`` in every element
       of ``arg_to_iter(value)`` contributes its ``groupdict()`` to a list
       that becomes the new value.
    3. Each processor is wrapped with ``wrap_loader_context`` (using
       ``self.context``) and called on the current value; the chain stops as
       soon as the value is ``None``.

    Returns the final value.
    """
    pattern = kw.get('re', None)
    if pattern:
        candidates = arg_to_iter(value)
        value = flatten(extract_regex(pattern, item) for item in candidates)

    grouped = kw.get('grouped')
    if grouped:
        compiled = re.compile(grouped, re.UNICODE)
        group_dicts = []
        for text in arg_to_iter(value):
            for match in compiled.finditer(text):
                group_dicts.append(match.groupdict())
        value = group_dicts

    for processor in processors:
        if value is None:
            break
        bound = wrap_loader_context(processor, self.context)
        value = bound(value)
    return value
def re(self, regex):
    """Apply ``regex`` to this object's extracted content.

    The result of ``self.extract()`` is handed to ``extract_regex`` together
    with ``regex``; whatever ``extract_regex`` returns is returned unchanged.
    """
    extracted = self.extract()
    return extract_regex(regex, extracted)
def re(self, regex):
    """Apply ``regex`` over all current XPath selections.

    The result of ``self.extract()`` is passed to ``extract_regex`` with
    ``'utf-8'`` as its third argument. Returns the flattened matches as a
    list of unicode strings.
    """
    extracted = self.extract()
    return extract_regex(regex, extracted, 'utf-8')
def test_deprecated_extract_regex(self):
    """Calling ``extract_regex`` emits exactly one ScrapyDeprecationWarning."""
    with warnings.catch_warnings(record=True) as caught:
        # Make sure the deprecation warning is always recorded here, whatever
        # filters the test runner or earlier tests have installed; the filter
        # is undone automatically when the context manager exits.
        warnings.simplefilter('always', ScrapyDeprecationWarning)
        extract_regex(r'\w+', 'this is a test')
        assert len(caught) == 1
        assert issubclass(caught[0].category, ScrapyDeprecationWarning)
def extract_id(self, id_cell):
    """Pull the ``VerifyID`` number out of the first link inside ``id_cell``.

    The ``href`` of the first element matched by the XPath ``.//a`` is run
    through ``extract_regex`` with the ``javascript:VerifyID(<digits>)``
    pattern, and the result is returned wrapped by ``arg_to_iter``.
    """
    first_link = id_cell.xpath('.//a')[0]
    href = first_link.attrib['href']
    matches = extract_regex(r'javascript:VerifyID\((\d+)\)', href)
    return arg_to_iter(matches)