def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
             restrict_xpaths=(), tags=('a', 'area'), attrs=('href',),
             canonicalize=True, unique=True, process_value=None,
             deny_extensions=None):
    """Build an SGML-based link extractor.

    Tag/attribute names are normalized into sets, wrapped into membership
    predicates for the underlying ``BaseSgmlLinkExtractor``, and all
    allow/deny filtering options are handed off to the parent class.
    """
    wanted_tags = set(arg_to_iter(tags))
    wanted_attrs = set(arg_to_iter(attrs))
    # Bound __contains__ methods serve as the tag/attr predicates.
    lx = BaseSgmlLinkExtractor(tag=wanted_tags.__contains__,
                               attr=wanted_attrs.__contains__,
                               unique=unique,
                               process_value=process_value)
    super(SgmlLinkExtractor, self).__init__(
        lx, allow, deny, allow_domains, deny_domains,
        restrict_xpaths, canonicalize, deny_extensions)

    # FIXME: was added to fix a RegexLinkExtractor testcase
    self.base_url = None
def __call__(self, value, loader_context=None):
    """Pipe ``value`` through every stored function in order.

    Each function runs on every element of the current value list and its
    (possibly multi-valued) output is flattened into the input for the
    next stage. ``loader_context`` entries override the defaults.
    """
    if loader_context:
        context = MergeDict(loader_context, self.default_loader_context)
    else:
        context = self.default_loader_context
    # Wrap every function up-front so each receives the merged context.
    wrapped = [wrap_loader_context(f, context) for f in self.functions]

    values = arg_to_iter(value)
    for func in wrapped:
        stage_output = []
        for item in values:
            stage_output.extend(arg_to_iter(func(item)))
        values = stage_output
    return values
def get_value(self, value, *processors, **kw):
    """Optionally regex-extract from ``value``, then apply each processor.

    If the ``re`` keyword is given, the value is first coerced to an
    iterable and each element run through ``extract_regex``. Processing
    stops early as soon as any processor returns ``None``.
    """
    regex = kw.get('re')
    if regex:
        value = flatten(
            [extract_regex(regex, item) for item in arg_to_iter(value)])

    for processor in processors:
        if value is None:
            break
        value = wrap_loader_context(processor, self.context)(value)
    return value
def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
             restrict_xpaths, canonicalize, deny_extensions):
    """Store the wrapped extractor plus compiled filtering rules.

    ``allow``/``deny`` accept raw pattern strings or pre-compiled regex
    objects; ``deny_extensions`` falls back to ``IGNORED_EXTENSIONS``
    when not given.
    """
    def _compile(patterns):
        # Pass pre-compiled patterns through; compile raw strings.
        return [p if isinstance(p, _re_type) else re.compile(p)
                for p in arg_to_iter(patterns)]

    self.link_extractor = link_extractor
    self.allow_res = _compile(allow)
    self.deny_res = _compile(deny)
    self.allow_domains = set(arg_to_iter(allow_domains))
    self.deny_domains = set(arg_to_iter(deny_domains))
    self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
    self.canonicalize = canonicalize

    extensions = IGNORED_EXTENSIONS if deny_extensions is None else deny_extensions
    self.deny_extensions = {'.' + ext for ext in arg_to_iter(extensions)}
def iterate_spider_output(result):
    """Normalize a spider callback's output into an iterable.

    A single ``BaseItem`` is wrapped in a one-element list (items may be
    iterable themselves, so they must not be unpacked); anything else is
    coerced with ``arg_to_iter``.
    """
    if isinstance(result, BaseItem):
        return [result]
    return arg_to_iter(result)
def process_item(self, item, spider):
    """Kick off all media requests for ``item``.

    Returns a deferred that fires ``item_completed`` once every media
    request has finished; ``consumeErrors=1`` keeps individual download
    failures from propagating past the DeferredList.
    """
    info = self.spiderinfo
    pending = [
        self._process_request(request, info)
        for request in arg_to_iter(self.get_media_requests(item, info))
    ]
    batch = DeferredList(pending, consumeErrors=1)
    return batch.addCallback(self.item_completed, item, info)
def _add_value(self, field_name, value):
    """Run ``value`` through the field's input processor and append
    the (non-empty) result to the collected values for that field."""
    processed = self._process_input_value(field_name, arg_to_iter(value))
    # Falsy results (e.g. empty output) are dropped entirely.
    if processed:
        self._values[field_name] += arg_to_iter(processed)
def _get_cssvalues(self, csss, **kw):
    """Extract and flatten the results of one or more CSS queries
    evaluated against ``self.selector``."""
    self._check_selector_method()
    per_query = [self.selector.css(query).extract()
                 for query in arg_to_iter(csss)]
    return flatten(per_query)
def _get_xpathvalues(self, xpaths, **kw):
    """Extract and flatten the results of one or more XPath queries
    evaluated against ``self.selector``."""
    self._check_selector_method()
    per_query = [self.selector.xpath(query).extract()
                 for query in arg_to_iter(xpaths)]
    return flatten(per_query)