def parse_data(self, response):
        item = CrawlerItem()
        # Extract the page title and strip the site-wide suffix
        title = (response.css("head title::text").extract_first() or '').strip()
        if title.endswith(' | University of Illinois at Chicago'):
            title = title[:-len(' | University of Illinois at Chicago')]
        soup = BeautifulSoup(response.text, "html.parser")
        for div in soup.find_all("div", {'class': 'browser-stripe'}):
            div.decompose()

        # Extract page content
        contents = soup.findAll(text=True)
        visible_texts = filter(tag_visible, contents)
        item['content'] = " ".join(t.strip() for t in visible_texts)

        outlinks = []
        le = LxmlLinkExtractor(allow_domains=('uic.edu',),
                               deny_domains=('login.uic.edu',),
                               unique=True,
                               canonicalize=True)
        for link in le.extract_links(response):
            outlinks.append(link.url)

        if title != 'UIC Directory' and title != 'Search Help' and 'uic.edu' in response.request.url:
            item['title'] = title
            item['url'] = response.request.url
            item['outlinks'] = outlinks
            yield item
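The callback above relies on a tag_visible helper that is not defined in the snippet. A minimal sketch of the usual BeautifulSoup visibility filter it implies (an assumption, not the original project's code):

from bs4.element import Comment


def tag_visible(element):
    # Assumed helper: discard text nodes that live inside non-rendered
    # containers (scripts, styles, metadata) or inside HTML comments.
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
        return False
    if isinstance(element, Comment):
        return False
    return True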
Example #2
class GovMpSpider(scrapy.Spider):
    name = 'govmp'
    start_urls = ['http://www.monitorpolski.gov.pl/MP/rok/2020']
    allowed_domains = ['gov.pl']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._link_extractor = LxmlLinkExtractor(allow_domains=['gov.pl'])

    def start_requests(self):
        yield Request(url=self.start_urls[0], callback=self.parse_page_list)

    def parse_page_list(self, response):
        for link in self._link_extractor.extract_links(response):
            if '/MP/rok/2020/' in link.url:
                yield response.follow(link, callback=self.parse_filtered_list)

    def parse_filtered_list(self, response):
        for item in response.xpath('//tr[contains(td[3]/a/@href, "pdf")]'):
            # TODO: consider using "title" in filename
            title = item.xpath('td[2]/text()').get().strip()
            href = item.xpath('td[3]/a/@href').get()
            url = response.urljoin(href)
            date = item.xpath('td[4]/text()').get().strip()
            yield PdfItem(file_urls=[url], date=date)
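The PdfItem class is not shown here. For Scrapy's FilesPipeline to download the PDFs listed in file_urls, an item definition roughly like the following would be needed (a sketch based on the standard file_urls/files convention, not the original project's code):

import scrapy


class PdfItem(scrapy.Item):
    # file_urls and files are the field names Scrapy's FilesPipeline expects.
    file_urls = scrapy.Field()
    files = scrapy.Field()
    date = scrapy.Field()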
Example #3
    def parse(self, response):
        item = CrawlerItem()
        item['jpg_urls'] = []
        # Note: the extractor matches both .jpg and .tif URLs,
        # even though the item field is named jpg_urls.
        link_extractor = LxmlLinkExtractor(
            allow=[r'\.jpg', r'\.tif'], deny_extensions=['md5', 'xmp', 'html'])
        for link in link_extractor.extract_links(response):
            item['jpg_urls'].append(link.url)
        return item
Example #4
    def parsePages(self, response):
        # deny_extensions=[] disables the default extension filter, so links
        # to files such as PDFs are kept as well.
        linkExtractor = LxmlLinkExtractor(
            deny_extensions=[], process_value=self.formatter.formatLink)
        item = ScraperdaneItem()
        item["name"] = response.url
        item["children"] = [
            link.url for link in linkExtractor.extract_links(response)
        ]
        return item
Example #5
    def extract_search_links(self, response, source):
        """
		Yields all search links found on the page
		"""
        extractor = LxmlLinkExtractor(allow=r'/s([/?])',
                                      allow_domains=self.allowed_domains)
        links = extractor.extract_links(response)
        for link in links:
            url = link.url
            yield SearchLinkItem(url=url, source=source)
Example #6
    def parse(self, response):
        # Skip any link whose domain appears in DOMAIN or SOCIAL_DOMAINS.
        extractor = LxmlLinkExtractor(deny_domains=[DOMAIN] + SOCIAL_DOMAINS)
        links = extractor.extract_links(response)
        for link in links:
            yield {
                'link': link.url,
                'page': response.url,
                'anchor_text': link.text,
                'response_code': getResponseCode(link.url)
            }
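getResponseCode is not shown in the snippet; it apparently issues a separate, blocking request for every extracted URL. A hypothetical sketch of such a helper using requests (the original may be implemented quite differently):

import requests


def getResponseCode(url):
    # Hypothetical helper: fetch only the headers and return the HTTP status,
    # or None when the request fails. Calling this from a Scrapy callback
    # blocks the reactor, so it is only suitable for small crawls.
    try:
        return requests.head(url, allow_redirects=True, timeout=10).status_code
    except requests.RequestException:
        return None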
Example #7
    def extract_store_links(self, response, store_id):
        """
		Yields all store links found on the page
		"""
        extractor = LxmlLinkExtractor(allow=r'/stores/',
                                      allow_domains=self.allowed_domains)
        links = extractor.extract_links(response)
        for link in links:
            url = link.url
            yield StorePageItem(url=url, store_id=store_id)
Example #8
    def extract_product_links(self, response, source):
        """
		Yields all product links found on the page
		"""
        extractor = LxmlLinkExtractor(allow=r'/dp/',
                                      allow_domains=self.allowed_domains)
        links = extractor.extract_links(response)
        for link in links:
            asin = re.search(r'/dp/(?P<asin>([A-Z0-9]+))',
                             link.url).group('asin')
            yield ProductLinkItem(asin=asin, source=source)
Example #9
    def __init__(self, response):
        # Every keyword argument below mirrors LxmlLinkExtractor's default
        # value, written out explicitly.
        link_extractor = LxmlLinkExtractor(
            allow=(),
            deny=(),
            allow_domains=(),
            deny_domains=(),
            deny_extensions=None,
            restrict_xpaths=(),
            restrict_css=(),
            tags=('a', 'area'),
            attrs=('href',),
            canonicalize=False,
            unique=True,
            process_value=None,
            strip=True)
        self.links = link_extractor.extract_links(response)
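Each element of self.links is a scrapy.link.Link object. A short illustrative method (not part of the original snippet) that could sit on the same class to consume them:

    def iter_urls(self):
        # Illustrative only: scrapy.link.Link exposes url, text, fragment
        # and nofollow attributes.
        return [(link.url, link.text) for link in self.links]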
Example #10
    def parse(self, response):
        selector = Selector(response)
        item = JdbookItem()
        extractor = LxmlLinkExtractor(allow=r'http://item.jd.com/\d.*html')
        link = extractor.extract_links(response)  # extracted but not used below
        try:
            item['_id'] = response.url.split('/')[3].split('.')[0]
            item['url'] = response.url
            item['title'] = selector.xpath(
                '/html/head/title/text()').extract()[0]
            item['keywords'] = selector.xpath(
                '/html/head/meta[2]/@content').extract()[0]
            item['description'] = selector.xpath(
                '/html/head/meta[3]/@content').extract()[0]
            item['img'] = 'http:' + selector.xpath(
                '//*[@id="spec-n1"]/img/@src').extract()[0]
            item['channel'] = selector.xpath(
                '//*[@id="root-nav"]/div/div/strong/a/text()').extract()[0]
            item['tag'] = selector.xpath(
                '//*[@id="root-nav"]/div/div/span[1]/a[1]/text()').extract()[0]
            item['sub_tag'] = selector.xpath(
                '//*[@id="root-nav"]/div/div/span[1]/a[2]/text()').extract()[0]
            item['value'] = selector.xpath(
                '//*[@id="root-nav"]/div/div/span[1]/a[2]/text()').extract()[0]
            comments = list()
            node_comments = selector.xpath('//*[@id="hidcomment"]/div')
            for node_comment in node_comments:
                comment = dict()
                node_comment_attrs = node_comment.xpath(
                    './/div[contains(@class, "i-item")]')
                for attr in node_comment_attrs:
                    url = attr.xpath('.//div/strong/a/@href').extract()[0]
                    comment['url'] = 'http:' + url
                    content = attr.xpath('.//div/strong/a/text()').extract()[0]
                    comment['content'] = content
                    time = attr.xpath('.//div/span[2]/text()').extract()[0]
                    comment['time'] = time
                comments.append(comment)
            item['comments'] = comments
        except Exception as ex:
            print('something wrong', str(ex))
        print('success, go for next')
        yield item
        next_url = self.get_next_url(response.url)  # response.url is the originally requested URL
        if next_url is not None:  # a next page was found, so request it
            yield Request(next_url,
                          callback=self.parse,
                          headers=self.headers,
                          cookies=self.cookies,
                          meta=self.meta)
Example #11
def get_links(url, body):
    from scrapy.http import HtmlResponse
    from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

    start_url = url
    if '.html' in start_url:
        start_url = start_url.rsplit('/', 1)[0]

    response = HtmlResponse(url=start_url, body=body, encoding='utf8')

    link_extractor = LxmlLinkExtractor(allow=[start_url],
                                       deny=[],
                                       tags='a',
                                       attrs='href',
                                       canonicalize=True)

    return link_extractor.extract_links(response)
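A minimal usage sketch for the helper above (the URL and HTML body are invented for illustration):

html = b'<html><body><a href="http://example.com/docs/page2.html">next</a></body></html>'
for link in get_links('http://example.com/docs/index.html', html):
    print(link.url)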
Example #12
class LinkProcedure(BaseProcedure):
    """
    基于scrapy的LxmlLinkExtractor的链接提取器
    link xpath css
    xpath string|array  参考LxmlLinkExtractor的restrict_xpaths
    css string|array  参考LxmlLinkExtractor的restrict_css
    """
    def __init__(self, *args):
        xpath = args[0]
        css = args[1] if len(args) >= 2 else None
        self._extractor = LxmlLinkExtractor(restrict_xpaths=xpath, restrict_css=css)

    def do(self, input_, **kwargs):
        if isinstance(input_, Response):
            links = self._extractor.extract_links(input_)
            return [i.url.strip() for i in links]
        else:
            raise Exception('link input error')
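A hypothetical usage sketch for the procedure above (the URL, HTML, and XPath are invented; BaseProcedure is assumed to add no extra constructor requirements):

from scrapy.http import HtmlResponse

html = b'<html><body><div id="nav"><a href="/about">About</a></div></body></html>'
response = HtmlResponse(url='http://example.com/', body=html, encoding='utf-8')
proc = LinkProcedure('//div[@id="nav"]')  # restrict extraction to the nav <div>
print(proc.do(response))                  # -> ['http://example.com/about']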
Example #13
    def parse(self, response):
        #print('\n\nPARSE')
        #print(response.body_as_unicode())
        allowed_domains = getattr(self, 'allowed_domains', None)
        link_extractor = LxmlLinkExtractor(allow_domains=allowed_domains,
                                           unique=True)

        for link in link_extractor.extract_links(response):
            url = link.url
            # quick fix
            if '%5C%22' in url:
                url = url.split('%5C%22')[1]
                url = url.replace(':/w', '://w')
            normalized_url = self.normalize_url(
                url) if self.normalize_urls else url
            if self.distributeur_list_url_pattern is not NotImplemented:
                if self.distributeur_list_url_pattern.match(normalized_url):
                    yield Request(url, callback=self.parse)
            if self.distributeur_url_pattern.match(normalized_url):
                yield self.distributeur_page_request(url)
Example #14
def get_links(url, body):
    start_url = url
    if '.html' in start_url:
        start_url = start_url.rsplit('/', 1)[0]

    response = HtmlResponse(
        url=start_url,
        body=body,
        encoding='utf8'
    )

    link_extractor = LxmlLinkExtractor(
        allow=[start_url],
        deny=[],
        tags='a',
        attrs='href',
        canonicalize=True
    )

    return link_extractor.extract_links(response)
Example #15
    def handle_html(self, response, html_selector):
        """
        Parse HTML and extract links

        :type response: scrapy.http.Response
        :type html_selector: scrapy.selector.Selector
        :yields: dict, scrapy.Request
        """
        # @TODO handles for different parts of the HTML. eg. body, head, frameset
        log = structlog.get_logger().bind(
            event = 'PARSE_HTML',
            module = __file__,
            source_url = response.url,
            content_type = 'HTML')

        crawl_depth = response.meta.get('crawl_depth', self._crawl_depth)
        title = response.data.get('title', response.url)

        try:
            body = html_selector.xpath('//body')[0]
        except IndexError:
            body = selector.Selector(text='')

        yield dict(
            source_url = response.url,
            crawl_timestamp = self._crawl_start_datetime.strftime('%Y-%m-%dT%H:%M:%SZ'),
            title = title,
            content_type = 'HTML',
            content = body.extract())

        # add domain to set of traversed domains
        parsed_resp_url = http.urlparse(response.url.encode('utf8')).decode()
        self._traversed_domains.add(parsed_resp_url.netloc)

        # extract links
        linkextractor = LxmlLinkExtractor(
            allow = self._patterns_url_whitelist,
            deny = self._patterns_url_blacklist,
            allow_domains = self._patterns_domain_whitelist,
            deny_domains = self._patterns_domain_blacklist)
        href_list = linkextractor.extract_links(response)
        for link in href_list:
            # get the URL in string format
            href = link.url

            # separate meaningful pieces of the URL
            try:
                parsed_href = http.urlparse(href.encode('utf8')).decode()
            except Exception:
                # typically the href URL is invalid
                log.error(error = "INVALID_URL", href=href)
                continue

            # only parse HTTP links
            if parsed_href.scheme.upper() in ['HTTP', 'HTTPS']:
                # split the query string from the href, do not follow _href!
                _href = ''.join([
                    parsed_href.netloc,
                    parsed_href.path])

                # determine file type from the URL
                content_type = self.identify_type_from_url(_href)

                # make routing decision based on content type
                route = None
                if content_type in ['HTML']:
                    route = response.follow(
                        href,
                        callback = self.parse,
                        errback = self.errback,
                        meta = dict(
                            crawl_depth = crawl_depth - 1,
                            splash = {
                                'endpoint': 'render.json',
                                'args': {
                                    'html': 1,
                                    'iframes': 1,
                                    'timeout': 10,
                                }
                            }
                        )
                    )
                elif content_type in self._processable_ext:
                    log.info('@TODO')     # @TODO

                # is crawl at 0 depth?
                conditions = any([
                    crawl_depth > 0,
                    all([
                        crawl_depth <= 0,
                        parsed_href.netloc in self._traversed_domains
                        ]),
                    ])
                if conditions and route is not None:
                    yield route
Example #16
    def extract_links(self, response):
        links = LxmlLinkExtractor.extract_links(self, response)
        for x in links:
            x.url = LanguageLinkExtractor.addParams(x.url)
        # links = super(LxmlLinkExtractor, self).extract_links(response);
        return links
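The addParams helper referenced above is not shown. Judging by the class name, it rewrites each extracted URL to carry a language parameter; a hypothetical sketch of such a helper (the parameter name and value, and the use of w3lib, are assumptions):

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from w3lib.url import add_or_replace_parameter  # w3lib is a Scrapy dependency


class LanguageLinkExtractor(LxmlLinkExtractor):
    # The extract_links() override shown above would live on this class.

    @staticmethod
    def addParams(url):
        # Hypothetical behaviour: force a 'lang' query parameter onto every
        # extracted URL; the parameter name and value are invented.
        return add_or_replace_parameter(url, 'lang', 'en')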