Code example #1
File: spiders.py Project: inferlinkdev/undercrawler
 def _pagination_urls(self, response):
     # Canonicalize the pagination URLs detected by autopager (keeping
     # fragments), de-duplicate them, and keep only those the spider's
     # link extractor accepts.
     return [
         url for url in unique(
             canonicalize_url(url, keep_fragments=True)
             for url in autopager.urls(response))
         if self.link_extractor.matches(url)
     ]
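For context, here is a minimal standalone sketch of the two helpers this method combines, assuming the usual import locations (w3lib for canonicalize_url, scrapy.utils.python for unique; the project itself may import them from elsewhere):

    from scrapy.utils.python import unique
    from w3lib.url import canonicalize_url

    # canonicalize_url sorts query arguments; keep_fragments=True preserves
    # the #fragment, which matters when pagination is fragment-based.
    print(canonicalize_url('http://example.com/?b=2&a=1#p2', keep_fragments=True))
    # -> http://example.com/?a=1&b=2#p2

    # unique() de-duplicates while preserving first-seen order.
    print(unique(['a', 'b', 'a']))  # -> ['a', 'b']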
Code example #2
File: base_spider.py Project: barravi/undercrawler
 def _pagination_urls(self, response):
     # Identical to example #1 apart from formatting.
     return [
         url for url in unique(
             canonicalize_url(url, keep_fragments=True)
             for url in autopager.urls(response)
         )
         if self.link_extractor.matches(url)
     ]
Code example #3
File: utils.py Project: matthieucham/scrapynuts
 def extract_links(self, response):
     base_url = get_base_url(response)
     # With restrict_xpaths set, only links inside the matching subtrees
     # are considered; otherwise the whole document is searched.
     if self.restrict_xpaths:
         docs = [subdoc
                 for x in self.restrict_xpaths
                 for subdoc in response.xpath(x)]
     else:
         docs = [response.selector]
     all_links = []
     for doc in docs:
         links = self._extract_links(doc, response.url, response.encoding, base_url)
         all_links.extend(self._process_links(links))
     return unique(all_links)
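This is, in substance, the extract_links of Scrapy's built-in link extractors, so the same behavior can be exercised offline through Scrapy's public API; the HTML below is made up for illustration:

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor

    body = (b'<html><body><div id="nav"><a href="/page/2">next</a></div>'
            b'<a href="/other">other</a></body></html>')
    response = HtmlResponse('http://example.com/', body=body, encoding='utf-8')

    # restrict_xpaths limits extraction to the matching subtrees, which is
    # exactly what the `docs` loop above implements.
    extractor = LinkExtractor(restrict_xpaths=['//div[@id="nav"]'])
    for link in extractor.extract_links(response):
        print(link.url, link.text)  # -> http://example.com/page/2 next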
Code example #4
 def extract_links(self, response):
     from scrapy_balloons.spiders.balloon import balloon_spider
     url_info = urlparse(response.url)
     base_url = balloon_spider.base_url
     # Fall back to scheme://netloc when no base_url is configured
     # (the original test only caught whitespace-only strings).
     if not base_url or not base_url.strip():
         base_url = "%s://%s" % (url_info.scheme, url_info.netloc)
     all_links = []
     if self.allow_res:
         for allow_re in self.allow_res:
             # response.text rather than response.body: the patterns are
             # str, and body is bytes on Python 3.
             all_links.extend(allow_re.findall(response.text))
     # Run process_attr once per URL (see LxmlParserLinkExtractor) and
     # drop the values it rejects.
     processed = (self.link_extractor.process_attr(url) for url in all_links)
     all_links = [url for url in processed if url is not None]
     all_links = [Link(urljoin(base_url, url), "") for url in all_links]
     return unique(all_links)
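The balloon_spider import and its base_url are specific to that project; here is a self-contained sketch of the same allow-regex extraction, with a hypothetical pattern and markup:

    import re
    from urllib.parse import urljoin

    # Hypothetical pattern and HTML, for illustration only.
    allow_res = [re.compile(r'href="(/article/\d+)"')]
    text = '<a href="/article/1">one</a> <a href="/about">about</a>'
    base_url = 'http://example.com'

    found = []
    for allow_re in allow_res:
        found.extend(allow_re.findall(text))
    print([urljoin(base_url, url) for url in found])
    # -> ['http://example.com/article/1']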
Code example #5
File: dm.py Project: yankaics/campuspiders
    def _extract_links(self, response_text, response_url):
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        sel = pyquery.PyQuery(html)

        # Select news and announcement entries, skipping the '.more' items.
        evt_links = sel('.news > li:not(.more) > a')
        ann_links = sel('.announcement > li:not(.more) > a')

        all_links = [
            Link(elem.attrib['href'], text=elem.text)
            for elem in itertools.chain(evt_links, ann_links)
        ]

        # De-duplicate by URL, keeping the first Link seen for each.
        return unique(all_links, key=lambda link: link.url)
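The key= argument makes unique() de-duplicate by URL rather than by the whole Link (url plus text); a small sketch, assuming Scrapy's Link and unique:

    from scrapy.link import Link
    from scrapy.utils.python import unique

    links = [Link('http://example.com/a', text='first'),
             Link('http://example.com/a', text='dup'),
             Link('http://example.com/b', text='second')]
    # The first Link seen for each URL wins.
    print([l.text for l in unique(links, key=lambda link: link.url)])
    # -> ['first', 'second']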
Code example #6
File: dm.py Project: lhproject/campuspiders
    def _extract_links(self, response_text, response_url):
        # Identical to example #5 apart from formatting.
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        sel = pyquery.PyQuery(html)

        evt_links = sel('.news > li:not(.more) > a')
        ann_links = sel('.announcement > li:not(.more) > a')

        all_links = [
            Link(elem.attrib['href'], text=elem.text)
            for elem in itertools.chain(evt_links, ann_links)
        ]

        return unique(all_links, key=lambda link: link.url)