def _pagination_urls(self, response):
    return [
        url
        for url in unique(
            canonicalize_url(url, keep_fragments=True)
            for url in autopager.urls(response)
        )
        if self.link_extractor.matches(url)
    ]
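# A minimal usage sketch for the _pagination_urls helper above, assuming it is
# defined on a Scrapy spider that also owns a LinkExtractor as
# self.link_extractor. The spider name, start URL, and allow pattern below are
# illustrative assumptions, not part of the original snippet.
import scrapy
from scrapy.linkextractors import LinkExtractor


class ListingSpider(scrapy.Spider):
    name = "listing_example"  # hypothetical spider name
    start_urls = ["https://example.com/listing"]  # hypothetical start URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Only follow pagination URLs that stay within the listing section.
        self.link_extractor = LinkExtractor(allow=r"/listing")

    def parse(self, response):
        # _pagination_urls (assumed to be defined on this class, as above)
        # combines autopager's detected pagination links with the
        # LinkExtractor's allow/deny rules.
        for url in self._pagination_urls(response):
            yield scrapy.Request(url, callback=self.parse)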
def extract_links(self, response):
    base_url = get_base_url(response)
    if self.restrict_xpaths:
        # Limit extraction to the sub-documents selected by restrict_xpaths.
        docs = [
            subdoc
            for x in self.restrict_xpaths
            for subdoc in response.xpath(x)
        ]
    else:
        docs = [response.selector]
    all_links = []
    for doc in docs:
        links = self._extract_links(doc, response.url, response.encoding, base_url)
        all_links.extend(self._process_links(links))
    return unique(all_links)
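# Hedged usage sketch for the extract_links method above, which follows the
# shape of Scrapy's LxmlLinkExtractor: restrict extraction to part of the page
# with restrict_xpaths and follow the resulting Link objects. The XPath and
# callback wiring are illustrative assumptions.
from scrapy.linkextractors import LinkExtractor

def parse(self, response):
    # Only consider links inside a hypothetical pagination container.
    extractor = LinkExtractor(restrict_xpaths='//nav[@class="pagination"]')
    for link in extractor.extract_links(response):
        # link is a scrapy.link.Link with .url and .text attributes.
        yield response.follow(link.url, callback=self.parse)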
def extract_links(self, response):
    from scrapy_balloons.spiders.balloon import balloon_spider

    url_info = urlparse(response.url)
    base_url = balloon_spider.base_url
    if not base_url or not base_url.strip():
        # Fall back to the response's own scheme/netloc when no base URL is configured.
        base_url = "%s://%s" % (url_info.scheme, url_info.netloc)
    all_links = []
    if self.allow_res:
        # Collect every substring of the response body that matches an allow pattern.
        for allow_re in self.allow_res:
            all_links.extend(allow_re.findall(response.body))
    # Run each candidate through process_attr (see LxmlParserLinkExtractor),
    # dropping values it rejects.
    processed = (self.link_extractor.process_attr(url) for url in all_links)
    all_links = [url for url in processed if url is not None]
    all_links = [Link(urljoin(base_url, url), "") for url in all_links]
    return unique(all_links)
def _extract_links(self, response_text, response_url):
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    sel = pyquery.PyQuery(html)
    # Pick up event and announcement links, skipping the "more" items.
    evt_links = sel('.news > li:not(.more) > a')
    ann_links = sel('.announcement > li:not(.more) > a')
    all_links = [
        Link(elem.attrib['href'], text=elem.text)
        for elem in itertools.chain(evt_links, ann_links)
    ]
    return unique(all_links, key=lambda link: link.url)
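# Hedged usage sketch: wiring the pyquery-based _extract_links above into a
# Scrapy callback. The parse/parse_item names and the Request wiring are
# illustrative assumptions; the method itself needs lxml, pyquery, itertools,
# scrapy.link.Link and a unique() helper in scope.
import scrapy

def parse(self, response):
    # response.text is the decoded body; the helper makes the links absolute
    # against response.url before returning de-duplicated Link objects.
    for link in self._extract_links(response.text, response.url):
        yield scrapy.Request(link.url, callback=self.parse_item)  # parse_item is hypothetical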