def parse_data(self, response):
    item = CrawlerItem()

    # Extract page title; default to '' so strip() cannot fail on a missing <title>
    title = response.css("head title::text").extract_first(default='').strip()
    suffix = ' | University of Illinois at Chicago'
    if title.endswith(suffix):
        title = title[:-len(suffix)]

    # Drop navigation stripes, then extract the visible page content
    soup = BeautifulSoup(response.text, "html.parser")
    for div in soup.find_all("div", {'class': 'browser-stripe'}):
        div.decompose()
    contents = soup.find_all(text=True)
    visible_texts = filter(tag_visible, contents)
    item['content'] = " ".join(t.strip() for t in visible_texts)

    # Collect outgoing links within uic.edu, skipping the login host
    outlinks = []
    le = LxmlLinkExtractor(allow_domains=('uic.edu',),
                           deny_domains=('login.uic.edu',),
                           unique=True, canonicalize=True)
    for link in le.extract_links(response):
        outlinks.append(link.url)

    if title not in ('UIC Directory', 'Search Help') and 'uic.edu' in response.request.url:
        item['title'] = title
        item['url'] = response.request.url
        item['outlinks'] = outlinks
        yield item
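The parser above relies on a `tag_visible` helper that is not defined in the snippet. A minimal sketch of the usual BeautifulSoup visibility filter (an assumption, not the original code):

from bs4.element import Comment

def tag_visible(element):
    # Skip text nodes a browser would not render
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
        return False
    if isinstance(element, Comment):
        return False
    return True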
class GovMpSpider(scrapy.Spider):
    name = 'govmp'
    start_urls = ['http://www.monitorpolski.gov.pl/MP/rok/2020']
    allowed_domains = ['gov.pl']

    def __init__(self, *args, **kwargs):
        # Forward *args as well, so positional spider arguments are not dropped
        super().__init__(*args, **kwargs)
        self._link_extractor = LxmlLinkExtractor(allow_domains=['gov.pl'])

    def start_requests(self):
        yield Request(url=self.start_urls[0], callback=self.parse_page_list)

    def parse_page_list(self, response):
        for link in self._link_extractor.extract_links(response):
            if '/MP/rok/2020/' in link.url:
                yield response.follow(link, callback=self.parse_filtered_list)

    def parse_filtered_list(self, response):
        # Table rows whose third cell links to a PDF
        for item in response.xpath('//tr[contains(td[3]/a/@href, "pdf")]'):
            # TODO: consider using "title" in filename
            title = item.xpath('td[2]/text()').get().strip()
            href = item.xpath('td[3]/a/@href').get()
            url = response.urljoin(href)
            date = item.xpath('td[4]/text()').get().strip()
            yield PdfItem(file_urls=[url], date=date)
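`PdfItem` is not shown. Assuming the downloads go through Scrapy's FilesPipeline, which expects `file_urls` and `files` fields on the item, a minimal sketch would be:

import scrapy

class PdfItem(scrapy.Item):
    # `file_urls` / `files` are the field names Scrapy's FilesPipeline uses
    file_urls = scrapy.Field()
    files = scrapy.Field()
    date = scrapy.Field()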
def parse(self, response):
    item = CrawlerItem()
    item['jpg_urls'] = []
    link_extractor = LxmlLinkExtractor(
        allow=[r'\.jpg', r'\.tif'],
        deny_extensions=['md5', 'xmp', 'html'])
    for link in link_extractor.extract_links(response):
        item['jpg_urls'].append(link.url)
    return item
def parsePages(self, response):
    # deny_extensions=[] disables the default extension blacklist,
    # so links to binaries and documents are kept too
    linkExtractor = LxmlLinkExtractor(
        deny_extensions=[],
        process_value=self.formatter.formatLink)
    item = ScraperdaneItem()
    item["name"] = response.url
    item["children"] = [
        link.url for link in linkExtractor.extract_links(response)
    ]
    return item
def extract_search_links(self, response, source):
    """ Yields all search links found on the page """
    extractor = LxmlLinkExtractor(allow=r'/s([/?])',
                                  allow_domains=self.allowed_domains)
    links = extractor.extract_links(response)
    for link in links:
        url = link.url
        yield SearchLinkItem(url=url, source=source)
def parse(self, response):
    extractor = LxmlLinkExtractor(deny_domains=[DOMAIN] + SOCIAL_DOMAINS)
    links = extractor.extract_links(response)
    for link in links:
        yield {
            'link': link.url,
            'page': response.url,
            'anchor_text': link.text,
            'response_code': getResponseCode(link.url)
        }
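The `getResponseCode` helper is not defined in the snippet; a plausible sketch using the `requests` library (an assumption about the original helper):

import requests

def getResponseCode(url):
    # HEAD keeps the check cheap; return 0 when the request fails outright
    try:
        return requests.head(url, allow_redirects=True, timeout=10).status_code
    except requests.RequestException:
        return 0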
def extract_store_links(self, response, store_id):
    """ Yields all store links found on the page """
    extractor = LxmlLinkExtractor(allow=r'/stores/',
                                  allow_domains=self.allowed_domains)
    links = extractor.extract_links(response)
    for link in links:
        url = link.url
        yield StorePageItem(url=url, store_id=store_id)
def extract_product_links(self, response, source):
    """ Yields all product links found on the page """
    extractor = LxmlLinkExtractor(allow=r'/dp/',
                                  allow_domains=self.allowed_domains)
    links = extractor.extract_links(response)
    for link in links:
        # Guard against URLs where no ASIN follows /dp/; calling
        # .group() on a failed search would raise AttributeError
        match = re.search(r'/dp/(?P<asin>[A-Z0-9]+)', link.url)
        if match:
            yield ProductLinkItem(asin=match.group('asin'), source=source)
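`ProductLinkItem` (like `SearchLinkItem` and `StorePageItem` in the sibling snippets) is not shown; a minimal hypothetical definition:

import scrapy

class ProductLinkItem(scrapy.Item):
    asin = scrapy.Field()    # Amazon Standard Identification Number
    source = scrapy.Field()  # page the link was found on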
def __init__(self, response):
    # Every argument spelled out with its default value; note that `attrs`
    # must be an iterable of attribute names, hence the one-element tuple
    link_extractor = LxmlLinkExtractor(allow=(),
                                       deny=(),
                                       allow_domains=(),
                                       deny_domains=(),
                                       deny_extensions=None,
                                       restrict_xpaths=(),
                                       restrict_css=(),
                                       tags=('a', 'area'),
                                       attrs=('href',),
                                       canonicalize=False,
                                       unique=True,
                                       process_value=None,
                                       strip=True)
    self.links = link_extractor.extract_links(response)
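A quick way to exercise the same extractor defaults without a running spider (URL and HTML are made up for illustration):

from scrapy.http import HtmlResponse

html = b'<html><body><a href="/about">About</a><area href="/map"/></body></html>'
response = HtmlResponse(url='http://example.com/', body=html)
# Both <a> and <area> hrefs are picked up and resolved to absolute URLs
for link in LxmlLinkExtractor(tags=('a', 'area'), attrs=('href',)).extract_links(response):
    print(link.url)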
def parse(self, response):
    selector = Selector(response)
    item = JdbookItem()
    extractor = LxmlLinkExtractor(allow=r'http://item.jd.com/\d.*html')
    link = extractor.extract_links(response)  # note: never used below
    try:
        item['_id'] = response.url.split('/')[3].split('.')[0]
        item['url'] = response.url
        item['title'] = selector.xpath(
            '/html/head/title/text()').extract()[0]
        item['keywords'] = selector.xpath(
            '/html/head/meta[2]/@content').extract()[0]
        item['description'] = selector.xpath(
            '/html/head/meta[3]/@content').extract()[0]
        item['img'] = 'http:' + selector.xpath(
            '//*[@id="spec-n1"]/img/@src').extract()[0]
        item['channel'] = selector.xpath(
            '//*[@id="root-nav"]/div/div/strong/a/text()').extract()[0]
        item['tag'] = selector.xpath(
            '//*[@id="root-nav"]/div/div/span[1]/a[1]/text()').extract()[0]
        item['sub_tag'] = selector.xpath(
            '//*[@id="root-nav"]/div/div/span[1]/a[2]/text()').extract()[0]
        item['value'] = selector.xpath(
            '//*[@id="root-nav"]/div/div/span[1]/a[2]/text()').extract()[0]
        comments = list()
        node_comments = selector.xpath('//*[@id="hidcomment"]/div')
        for node_comment in node_comments:
            comment = dict()
            node_comment_attrs = node_comment.xpath(
                './/div[contains(@class, "i-item")]')
            for attr in node_comment_attrs:
                url = attr.xpath('.//div/strong/a/@href').extract()[0]
                comment['url'] = 'http:' + url
                content = attr.xpath('.//div/strong/a/text()').extract()[0]
                comment['content'] = content
                time = attr.xpath('.//div/span[2]/text()').extract()[0]
                comment['time'] = time
            comments.append(comment)
        item['comments'] = comments
    except Exception as ex:
        print('something wrong', str(ex))
    print('success, go for next')
    yield item
    next_url = self.get_next_url(response.url)  # response.url is the URL of the original request
    if next_url is not None:  # a new URL was returned, so queue it
        yield Request(next_url,
                      callback=self.parse,
                      headers=self.headers,
                      cookies=self.cookies,
                      meta=self.meta)
def get_links(url, body):
    from scrapy.http import HtmlResponse
    from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

    # Resolve links relative to the page's directory, not the page itself
    start_url = url
    if '.html' in start_url:
        start_url = start_url.rsplit('/', 1)[0]
    response = HtmlResponse(url=start_url, body=body, encoding='utf8')
    link_extractor = LxmlLinkExtractor(allow=[start_url],
                                       deny=[],
                                       tags=('a',),
                                       attrs=('href',),
                                       canonicalize=True)
    return link_extractor.extract_links(response)
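A short usage sketch (the URL and HTML body are illustrative, not from the original project):

html = b'<html><body><a href="http://example.com/docs/page2.html">next</a></body></html>'
for link in get_links('http://example.com/docs/index.html', html):
    print(link.url, link.text)  # only links under http://example.com/docs survive the allow filter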
class LinkProcedure(BaseProcedure):
    """Link extractor built on scrapy's LxmlLinkExtractor

    link xpath css
        xpath   string|array    see LxmlLinkExtractor's restrict_xpaths
        css     string|array    see LxmlLinkExtractor's restrict_css
    """

    def __init__(self, *args):
        xpath = args[0]
        # args[1] is optional; fall back to None when absent or falsy
        css = len(args) >= 2 and args[1] or None
        self._extractor = LxmlLinkExtractor(restrict_xpaths=xpath,
                                            restrict_css=css)

    def do(self, input_, **kwargs):
        if isinstance(input_, Response):
            links = self._extractor.extract_links(input_)
            return [i.url.strip() for i in links]
        else:
            raise Exception('link input error')
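A hypothetical invocation, assuming `BaseProcedure` needs no further setup and `response` is a `scrapy.http.Response`:

proc = LinkProcedure('//div[@id="content"]')  # restrict extraction to one region
urls = proc.do(response)                      # list of stripped URL strings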
def parse(self, response):
    allowed_domains = getattr(self, 'allowed_domains', None)
    link_extractor = LxmlLinkExtractor(allow_domains=allowed_domains,
                                       unique=True)
    for link in link_extractor.extract_links(response):
        url = link.url
        # quick fix for escaped quotes that leak into hrefs
        if '%5C%22' in url:
            url = url.split('%5C%22')[1]
            url = url.replace(':/w', '://w')
        normalized_url = self.normalize_url(url) if self.normalize_urls else url
        if self.distributeur_list_url_pattern is not NotImplemented:
            if self.distributeur_list_url_pattern.match(normalized_url):
                yield Request(url, callback=self.parse)
        if self.distributeur_url_pattern.match(normalized_url):
            yield self.distributeur_page_request(url)
def handle_html(self, response, html_selector):
    """
    Parse HTML and extract links

    :type response: scrapy.http.Response
    :type html_selector: scrapy.selector.Selector
    :yields: dict, scrapy.Request
    """
    # @TODO handles for different parts of the HTML. eg. body, head, frameset
    log = structlog.get_logger().bind(
        event='PARSE_HTML',
        module=__file__,
        source_url=response.url,
        content_type='HTML')

    crawl_depth = response.meta.get('crawl_depth', self._crawl_depth)
    title = response.data.get('title', response.url)
    try:
        body = html_selector.xpath('//body')[0]
    except IndexError:
        body = selector.Selector(text='')

    yield dict(
        source_url=response.url,
        crawl_timestamp=self._crawl_start_datetime.strftime('%Y-%m-%dT%H:%M:%SZ'),
        title=title,
        content_type='HTML',
        content=body.extract())

    # add domain to set of traversed domains
    parsed_resp_url = http.urlparse(response.url.encode('utf8')).decode()
    self._traversed_domains.add(parsed_resp_url.netloc)

    # extract links
    linkextractor = LxmlLinkExtractor(
        allow=self._patterns_url_whitelist,
        deny=self._patterns_url_blacklist,
        allow_domains=self._patterns_domain_whitelist,
        deny_domains=self._patterns_domain_blacklist)
    href_list = linkextractor.extract_links(response)
    for link in href_list:
        # get the URL in string format
        href = link.url

        # separate meaningful pieces of URL
        try:
            parsed_href = http.urlparse(href.encode('utf8')).decode()
        except Exception:
            # typically the href URL is invalid
            log.error(error="INVALID_URL", href=href)
            continue

        # only parse HTTP links
        if parsed_href.scheme.upper() in ['HTTP', 'HTTPS']:
            # split the query string from the href; do not follow _href!
            _href = ''.join([parsed_href.netloc, parsed_href.path])

            # determine file type from the URL
            content_type = self.identify_type_from_url(_href)

            # make routing decision based on content type
            route = None
            if content_type in ['HTML']:
                route = response.follow(
                    href,
                    callback=self.parse,
                    errback=self.errback,
                    meta=dict(
                        crawl_depth=crawl_depth - 1,
                        splash={
                            'endpoint': 'render.json',
                            'args': {
                                'html': 1,
                                'iframes': 1,
                                'timeout': 10,
                            }
                        }
                    )
                )
            elif content_type in self._processable_ext:
                log.info('@TODO')  # @TODO

            # follow while depth remains, or when the link stays on a
            # domain we have already traversed
            conditions = any([
                crawl_depth > 0,
                all([
                    crawl_depth <= 0,
                    parsed_href.netloc in self._traversed_domains
                ]),
            ])
            if conditions and route is not None:
                yield route
def extract_links(self, response):
    links = LxmlLinkExtractor.extract_links(self, response)
    for x in links:
        # Append the language parameters to every extracted URL
        x.url = LanguageLinkExtractor.addParams(x.url)
    return links
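`addParams` is not shown; a hypothetical static helper that merges a language parameter into each URL's query string might look like:

from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

class LanguageLinkExtractor(LxmlLinkExtractor):

    @staticmethod
    def addParams(url, lang='en'):
        # Hypothetical: add `lang` to the query string unless already present
        parts = urlparse(url)
        query = dict(parse_qsl(parts.query))
        query.setdefault('lang', lang)
        return urlunparse(parts._replace(query=urlencode(query)))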