def parse(self, response: Response):
    image_elements = response.xpath("//img/@src")
    text_elements = response.xpath(
        "//*[not(self::script)][not(self::style)][not(self::title)][string-length(normalize-space(text())) > 0]/text()"
    )
    yield {
        'url': response.url,
        # Use list comprehensions instead of lazy map objects so the item can be serialized.
        'text_elements': [text.get().strip() for text in text_elements],
        'image_elements': [
            'https://kpi.ua' + image.get() if image.get().startswith('/') else image.get()
            for image in image_elements
        ],
    }
    if response.url == self.start_urls[0]:
        link_elems = response.xpath(
            "//a/@href[starts-with(., 'https://kpi.ua/') or starts-with(., '/')]"
        )
        links = [link.get() for link in link_elems if link.get() != "/"]
        for link in links[:20]:
            if link.startswith("/"):
                link = "https://kpi.ua" + link
            yield scrapy.Request(link, self.parse)

def parse(self, response: Response):
    img_elems = response.xpath("//img/@data-src[starts-with(., 'http')]")
    text_elems = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 20]/text()"
    )
    yield {
        'url': response.url,
        'payload': [
            {'type': 'text', 'data': text.get().strip()} for text in text_elems
        ] + [
            {'type': 'image', 'data': image.get()} for image in img_elems
        ],
    }
    if response.url == self.start_urls[0]:
        link_elems = response.xpath(
            "//a/@href[starts-with(., 'https://isport.ua/') or starts-with(., '/')]"
        )
        links = [link.get() for link in link_elems if link.get() != "/"]
        for link in links[:19]:
            if link.startswith("/"):
                link = "https://isport.ua" + link
            yield scrapy.Request(link, self.parse)

def parse(self, response: Response):
    """
    Transform the scraped page into items.
    :param response: the fetched response
    :return: yields items directly
    """
    # First, extract all of the images.
    image_lists = response.xpath('.//div[@id = "list_img"]//img')
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    for image in image_lists:
        description = image.xpath('.//@alt').extract()[0]
        src = image.xpath('.//@src').extract_first()
        item = SecretSpiderItem(image_description=description)
        if src.startswith('http'):  # covers both http:// and https:// URLs
            item['image_urls'] = [src]
        else:
            full_url = SITE_BASE_URL + src
            item['image_urls'] = [full_url]
        yield item
    # Check whether a "next page" link exists and, if so, build the request for it
    # (the response itself does not reliably tell us whether there is a next page).
    pages = response.xpath('//div[@class="page_num"]//a')
    next_page_url = ''
    for page in pages:
        page_text = page.xpath('./text()').extract_first()
        page_url = page.xpath('./@href').extract_first()
        if page_text == '下一页':  # link text meaning "next page"
            next_page_url = page_url
    if next_page_url != '':
        yield Request(url=next_page_url, callback=self.parse)

def parse(self, response: Response):
    images = response.xpath("//img/@src")
    texts = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
    )
    hyperlinks = response.xpath("//a/@href")
    yield {
        'url': response.url,
        'payload': [
            {'type': 'text', 'data': text.get()} for text in texts
        ] + [
            {'type': 'image', 'data': image.get()} for image in images
        ] + [
            {'type': 'hyperlink', 'data': hyperlink.get()} for hyperlink in hyperlinks
        ],
    }
    if response.url == self.start_urls[0]:
        links = response.xpath("//a/@href")
        selected_links = list(set(link.get() for link in links))[:19]
        for link in selected_links:
            # urljoin resolves both relative and absolute hrefs instead of blindly
            # prefixing the domain, which would corrupt links that are already absolute.
            yield scrapy.Request(response.urljoin(link), self.parse)

def parse(self, response: Response):
    pictures = response.xpath("//img/@src[starts-with(., 'http')]")
    strings = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
    )
    yield {
        'url': response.url,
        'payload': [
            {'type': 'text', 'data': text.get().strip()} for text in strings
        ] + [
            {'type': 'image', 'data': image.get()} for image in pictures
        ],
    }
    if response.url == self.start_urls[0]:
        refs = response.xpath("//a/@href")
        # Take each link's own value; the original called refs.get(), which repeats the first match.
        ref = [r.get() for r in refs][:15]
        for r in ref:
            yield scrapy.Request('http://www.posolstva.org.ua' + r, self.parse)

def parse(self, response: Response):
    all_images = response.xpath("//img/@src[starts-with(., 'http')]")
    all_text = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
    )
    yield {
        'url': response.url,
        'payload': [
            {'type': 'text', 'data': text.get().strip()} for text in all_text
        ] + [
            {'type': 'image', 'data': image.get()} for image in all_images
        ],
    }
    if response.url == self.start_urls[0]:
        all_links = response.xpath(
            "//a/@href[starts-with(., '//www.ukr.net/')][substring(., string-length() - 4) = '.html']"
        )
        selected_links = [link.get() for link in all_links][:19]
        for link in selected_links:
            link = 'https:' + link
            yield scrapy.Request(link, self.parse)

def parse(self, response: Response):
    all_images = response.xpath("//img/@src[starts-with(., 'http')]")
    all_text = response.xpath(
        "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
    )
    yield {
        'url': response.url,
        'payload': [
            {'type': 'text', 'data': text.get().strip()} for text in all_text
        ] + [
            {'type': 'image', 'data': image.get()} for image in all_images
        ],
    }
    if response.url == self.start_urls[0]:
        link_elems = response.xpath(
            "//a/@href[starts-with(., 'https://kpi.ua/') or starts-with(., '/')]"
        )
        links = [
            link.get() for link in link_elems if link.get() != "https://kpi.ua/"
        ][:19]
        for l in links:
            # urljoin keeps absolute kpi.ua links intact and resolves site-relative ones,
            # instead of prefixing the domain onto links that already contain it.
            yield scrapy.Request(response.urljoin(l), self.parse)

def parse_movie(self, response: Response):
    item = {}
    item['entity'] = 'movie'
    item['movie'] = response.xpath(
        '//h1/span[@property="v:itemreviewed"]/text()').get().split()[0]
    item['year'] = response.xpath(
        '//h1/span[@class="year"]/text()').get()[1:-1]
    item['score'] = response.xpath('//strong/text()').get()
    item['director'] = response.xpath(
        '//a[@rel="v:directedBy"]/text()').getall()
    item['actor'] = response.xpath(
        '//a[@rel="v:starring"]/text()').getall()
    item['genre'] = response.xpath(
        '//span[@property="v:genre"]/text()').getall()
    info = ''.join(response.xpath('//div[@id="info"]/text()').getall())
    item['country'] = info.replace('/', '').split()[0]
    item['length'] = re.search(
        r'\d+',
        response.xpath('//span[@property="v:runtime"]/text()').get()).group()
    item['rank'] = re.search(
        r'\d+',
        response.xpath('//span[@class="top250-no"]/text()').get()).group()
    item['img_url'] = response.xpath(
        '//div[@id="mainpic"]//img/@src').get()
    # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
    # item['name'] = response.xpath('//div[@id="name"]').get()
    # item['description'] = response.xpath('//div[@id="description"]').get()
    return item

def parse_sv_links(self, response: Response) -> FoundLink:
    """
    Yields FoundLink items for broken links not caught by an error or exception
    and moves them through the broken_link_detector pipeline.

    :param response: A response produced by a Rule
    :return: A FoundLink item to be passed to the pipeline
    """
    title = response.css('title::text').get()
    if self.css:
        links = response.css(self.css)
    else:
        links = response
    links = links.xpath('./descendant::*[@href]')
    for link in links:
        if 'vufind' in link.attrib['href'] or 'sfx' in link.attrib['href']:
            link_obj = FoundLink()
            link_obj['a_origin'] = response.url
            link_obj['b_title'] = title
            link_obj['c_url'] = assemble_absolute_link(
                response.url, link.attrib['href'])
            link_obj['d_text'] = link.xpath('./text()').get()
            yield link_obj

def _parse_sections(self, response: Response, folder_root: pathlib.Path):
    section_name = response.css(".ds-section-headline::text").get()
    meta = {'folder_root': folder_root}
    yield from response.follow_all(
        css=".layout-weekly-edition-section .teaser a.headline-link",
        callback=self._parse_article,
        meta=meta)

def parse(self, response: Response, **kwargs):
    articles = response.xpath('//div[@class="pad5 english_article persian_article small_font"]')
    for article in articles:
        download_url = article.css('.article_links').xpath('(.//a)[3]/@href').get()
        download_url = response.urljoin(download_url)
        info_url = article.css('.article_links').xpath('(.//a)[4]/@href').get()
        info_url = response.urljoin(info_url)
        yield Request(info_url,
                      cb_kwargs={'download_url': download_url},
                      callback=self.parse_info)

def get_api_hastag_posts(self, response: Response):
    hashtag = response.json()['data']['hashtag']
    url = self.get_url_to_query_next_posts_for_api(hashtag)
    if url:
        yield response.follow(url, callback=self.get_api_hastag_posts)
    posts: list = hashtag['edge_hashtag_to_media']['edges']
    for post in posts:
        yield InstaPostItem(data=post['node'])
        if (post['node']['edge_media_to_comment']['count'] > 30
                or post['node']['edge_liked_by']['count'] > 100):
            yield response.follow(f'/p/{post["node"]["shortcode"]}/',
                                  callback=self.post_page_parse)

def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})
    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:
        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None

def parse(self, response: Response): all_images = response.xpath("//div[@class='foto']/@style[starts-with(., 'background-image: url(/')]") all_text = response.xpath("//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()") yield { 'url': response.url, 'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] + [{'type': 'image', 'data': 'https://stejka.com' + image.get()[22:len(image.get())-2]} for image in all_images] } if response.url == self.start_urls[0]: all_links = response.xpath( "//a/@href[starts-with(., '/rus/')]") selected_links = ['https://stejka.com' + link.get() for link in all_links][:20] for link in selected_links: yield scrapy.Request(link, self.parse)
async def parse_book(self, response: Response) -> dict:
    url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
    page = response.meta["playwright_page"]
    await page.screenshot(
        path=Path(__file__).parent / "books" / f"{url_sha256}.png", full_page=True
    )
    await page.close()
    return {
        "url": response.url,
        "title": response.css("h1::text").get(),
        "price": response.css("p.price_color::text").get(),
        "breadcrumbs": response.css(".breadcrumb a::text").getall(),
        "image": f"books/{url_sha256}.png",
    }

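# Hedged sketch, not taken from the original spider: parse_book reads
# response.meta["playwright_page"], which scrapy-playwright only provides when the
# request carries the meta keys shown below. The URL and method placement are
# illustrative assumptions.
def start_requests(self):
    yield scrapy.Request(
        "https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",  # assumed example URL
        callback=self.parse_book,
        meta={"playwright": True, "playwright_include_page": True},
    )
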
def parse_forum_page(self, response: Response, forum_url: str = None) -> None:
    """
    Forum page callback. Parses TopicItem. Follows next forum page and threads.

    :param forum_url: forum url, from the first page. Extracted from response meta if not provided.
    :param response: scrapy crawl response
    """
    if forum_url is None:
        forum_url = response.meta['forum_url']
    # threads = response.css('a.topictitle')
    threads = response.css(
        'div.topic_read,div.topic_read_hot,div.topic_read_locked,div.topic_moved,div.sticky_read,'
        'div.sticky_read_locked,div.announce_read,div.announce_read_locked'
    )
    # if len(threads) != len(threads2):
    #     print(response.url)
    too_old_thread_found = False
    for thread_container in threads:
        thread = thread_container.css('a.topictitle')
        topic_loader = ItemLoader(item=TopicItem(), response=response)
        thread_href_selector = thread.css('a::attr(href)')
        thread_link = response.urljoin(thread_href_selector.get())
        topic_loader.add_value('id', thread_href_selector.re(r'-(t[0-9]*).html'))
        topic_loader.add_value('thread_link', thread_link)
        topic_loader.add_value('forum_link', forum_url)
        topic_loader.add_value('name', thread.css('a::text').get())
        yield topic_loader.load_item()
        if not self.full_crawl:
            last_post_date_candidates = thread_container.css(
                'span.post-date::text').getall()
            last_post_date = max(
                map(lambda x: parse_date(x), last_post_date_candidates))
            if last_post_date < self.start_date:
                too_old_thread_found = True
                continue
        yield scrapy.Request(thread_link + "?sd=d", callback=self.parse_thread)
    next_page = response.css('a[rel=next]::attr(href)').get()
    if next_page and not too_old_thread_found:
        next_request = response.urljoin(next_page)
        yield scrapy.Request(next_request,
                             callback=self.parse_forum_page,
                             meta={'forum_url': forum_url})

def parse_word(self, response: Response) -> dict:
    """
    Parses the word and extracts its type (f, m, adj, v or v*), the url and
    the message to send.

    :param response: scrapy.http.response.Response
    :return: dict
    """
    # Extract the type, one of: (f, m, adj, v or v*)
    l_items = response.css(
        r"tr>td[colspan='2'][valign='TOP'][width='650']>font>i::text"
    ).extract()
    l_items = list(map(lambda item: item.strip(), l_items))
    type_possibilities = ["m", "f", "adj", "adv", "v", "v*", "pl", 'símb']
    l_type = list(filter(lambda item: item in type_possibilities, l_items))
    # There should be at least one type; if not, raise, because this is a case we do not handle.
    l_type = [item.strip() for item in l_type]
    try:
        s_type = l_type[0]
    except IndexError:
        str_err = "Something wrong with this l_items: '{}' in url: '{}'".format(
            l_items, response.url)
        logger.error(str_err)
        raise IndexError(str_err)
    # If the type is plural, append an "s" to the type.
    if len(l_type) > 1:
        if "pl" == l_type[1]:
            s_type += "s"
    # Get the word from the title.
    word = response.css(r"span[class='enc']::text").extract()[0].strip()
    data = {
        'word': word,  # it's only 1 element
        'type': s_type,
        'url': response.url,
        'used': False,
        'next_dict_id': self.start_id
    }
    # Create the message to send to Twitter, depending on the type of the word.
    data["msg"] = return_twitter_msg(data)
    print(data)
    yield data

def extract_market(response: Response) -> dict:
    data = {}
    for field in response.xpath("//div[contains(@class, 'group-ema-referral-overview')]/dl/dl"):
        key = '\n'.join(field.xpath("dt[@role='heading']/button/text()").getall())
        value = '\n'.join(field.xpath("dd[@role='region']/div/p/text()").getall())
        data[key] = value
    return data

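# Hedged, standalone illustration (not from the original project): build a small
# HtmlResponse with the nested <dl> structure the XPath in extract_market expects,
# to show the {heading: text} dict it produces. The markup and values are assumptions.
if __name__ == "__main__":
    from scrapy.http import HtmlResponse

    body = b"""
    <div class="group-ema-referral-overview">
      <dl>
        <dl>
          <dt role="heading"><button>Market size</button></dt>
          <dd role="region"><div><p>12 GWh</p></div></dd>
        </dl>
      </dl>
    </div>
    """
    response = HtmlResponse(url="https://example.com/market", body=body, encoding="utf-8")
    print(extract_market(response))  # prints a dict mapping headings to values, e.g. {'Market size': '12 GWh'}
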
def errback_broken_link(self, failure: Failure) -> FoundLink:  # Failure may not be the right typehint
    """
    Handles behavior for links which cause Twisted failures - which is most of
    the broken links this spider hopes to find.

    :param failure: A Twisted failure raised by the Retry middleware
    :return: None
    """
    # Structure of this function heavily inspired by:
    # https://docs.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-errbacks

    # If it's a TCP or DNS error, short-circuit to the pipeline
    if failure.check(DNSLookupError, TCPTimedOutError):
        self.logger.info(f'Handled DNS/TCP related error. {failure.request.url}')
        request = failure.request
        dummy_response = Response(
            url=request.url,
            status=404,  # Kind of a lie
            request=request
        )
        yield from self.parse_broken_link(dummy_response)
    # If the client timed out, report that
    elif failure.check(TimeoutError):
        self.logger.info(f'Client timeout. {failure.request.url}')
    self.logger.error(repr(failure))

def parse(self, response: Response, **kwargs):
    try:
        js_data = self.get_js_shared_data(response)
        yield scrapy.FormRequest(
            self.__login_url,
            method='POST',
            callback=self.parse,
            formdata={
                'username': self.__login,
                'enc_password': self.__password
            },
            headers={'X-CSRFToken': js_data['config']['csrf_token']})
    except AttributeError as e:
        if response.json().get('authenticated'):
            yield response.follow(self.__tag_url,
                                  callback=self.first_tag_page_parse)

def first_tag_page_parse(self, response: Response):
    js_data = self.get_js_shared_data(response)
    hashtag: dict = js_data['entry_data']['TagPage'][0]['graphql']['hashtag']
    url = self.get_url_to_query_next_posts_for_api(hashtag)
    if url:
        yield response.follow(url, callback=self.get_api_hastag_posts)
    hashtag['posts_count'] = hashtag['edge_hashtag_to_media']['count']
    posts = hashtag.pop('edge_hashtag_to_media')['edges']
    yield InstaHashTagItem(data=hashtag)
    for post in posts:
        yield InstaPostItem(data=post['node'])
        if (post['node']['edge_media_to_comment']['count'] > 30
                or post['node']['edge_liked_by']['count'] > 100):
            yield response.follow(f'/p/{post["node"]["shortcode"]}/',
                                  callback=self.post_page_parse)

def parse(self, response: Response): all_images = response.xpath("//img/@src[starts-with(., 'https')]") yield { 'url': response.url, 'payload': [{ 'type': 'image', 'data': image.get() } for image in all_images] } if response.url == self.start_urls[0]: all_links = response.xpath( "//a/@href[starts-with(., 'https://uahotels.info/')]") selected_links = [link.get() for link in all_links][:19] for link in selected_links: yield scrapy.Request(link, self.parse)
def _parse_world_this_week(self, response: Response, folder_root: pathlib.Path):
    meta = {'folder_root': folder_root}
    yield from response.follow_all(
        css=".layout-weekly-edition-wtw .weekly-edition-wtw__item a",
        callback=self._parse_article,
        meta=meta)

def _load_model(response: Response) -> Dict:
    script = response.xpath(
        "/html/body/script[text()[contains(.,'window.jsonModel = ')]]/text()"
    ).extract_first()
    jsmodel = script[len("window.jsonModel = "):]
    model = json.loads(jsmodel)
    return model

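# Hedged, standalone illustration (not from the original project): running _load_model
# on a minimal page that embeds "window.jsonModel = {...}" in a script tag. The
# embedded JSON content is a made-up assumption.
if __name__ == "__main__":
    from scrapy.http import HtmlResponse

    body = b'<html><body><script>window.jsonModel = {"properties": []}</script></body></html>'
    response = HtmlResponse(url="https://example.com/search", body=body, encoding="utf-8")
    print(_load_model(response))  # prints: {'properties': []}
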
def test_log_formatter_scrapy_1():
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()

    for case in get_test_requests():
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, foo_spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, foo_spider)
        assert result["args"]["request"] == str(original)

        record = LogRecord(name="logger", pathname="n/a", lineno=1, exc_info=None, **result)
        logstr = formatter.format(record)
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=original, url=original.url)
        assert logstr == expected

def parse(self, response: Response): all_images = response.xpath("//img/@data-src[starts-with(., 'http')]") all_text = response.xpath("//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > " "30]/text()") yield { 'url': response.url, 'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] + [{'type': 'image', 'data': image.get()} for image in all_images] } if response.url == self.start_urls[0]: all_links = response.xpath( "//a/@href[starts-with(., '/')]") selected_links = ['https://isport.ua' '' + link.get() for link in all_links][:20] for link in selected_links: yield scrapy.Request(link, self.parse)
def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
    if request.meta.get('dont_cache', False):
        return response

    # Skip cached responses and uncacheable requests
    if 'cached' in response.flags or '_dont_cache' in request.meta:
        request.meta.pop('_dont_cache', None)
        return response

    # RFC2616 requires origin server to set Date header,
    # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
    if 'Date' not in response.headers:
        response.headers['Date'] = formatdate(usegmt=True)

    # Do not validate first-hand responses
    cachedresponse = request.meta.pop('cached_response', None)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/firsthand', spider=spider)
        self._cache_response(spider, response, request, cachedresponse)
        return response

    if self.policy.is_cached_response_valid(cachedresponse, response, request):
        self.stats.inc_value('httpcache/revalidate', spider=spider)
        return cachedresponse

    self.stats.inc_value('httpcache/invalidate', spider=spider)
    self._cache_response(spider, response, request, cachedresponse)
    return response

def parse(self, response: Response) -> Iterator[Union[Request, Dict]]:
    """HTTP response handler for the site.

    Parses the data tables and issues requests for the next pages.

    :param response: the response received from Scrapy
    :return: an iterator over dicts with the parsed results and over requests
        for the next pages
    """
    symbol = response.meta['symbol']
    link_extractor = LinkExtractor(
        allow=rf'https://www\.nasdaq\.com/symbol/{symbol.lower()}/insider-trades\?page=\d+'
    )
    link: Link
    for link in link_extractor.extract_links(response):
        match_page_number: Optional[Match] = re.search(r'page=(\d+)', link.url)
        if match_page_number is not None:
            page_number: int = int(match_page_number.group(1))
            if page_number <= MAX_PAGE:
                yield Request(link.url, meta={'symbol': symbol})
    for row in response.xpath(
            '//div[@id="content_main"]//div[@class="genTable"]/table[@class="certain-width"]/tr'
    ):
        raw_row = RawRow.from_selector(row, symbol)
        try:
            yield ParsedRow.from_raw_row(raw_row).as_dict()
        except ValueError:
            logging.exception(
                'Error while parsing a row of the insider-trades table.')

def parse_forum(self, response: Response) -> None:
    """
    Forum callback. Parses ForumItem. Follows subforum links and thread links
    (through the self.parse_forum_page() method).

    :param response: scrapy crawl response
    """
    forum_loader = ItemLoader(item=ForumItem(), response=response)
    forum_loader.add_value('link', response.request.url)
    forum_loader.add_css('name', 'h2 > a::text')
    yield forum_loader.load_item()

    subforums = response.css('a.forumtitle::attr(href)').getall()
    for forum in subforums:
        next_request = response.urljoin(forum)
        yield scrapy.Request(next_request, callback=self.parse_forum)

    yield from self.parse_forum_page(response, response.url)

def _get_floorplan_images(self, response: Response) -> List[str]:
    xpath = "//div[@id = 'floorplan-1']//div[contains(@class, 'ui-modal-gallery__asset')]/@style"
    style = response.xpath(xpath).extract_first()
    if style:
        match = re.match(r".*url\('(.*)'\).*", style)
        if match:
            return [match.group(1)]
    return []

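# Hedged illustration (not part of the original spider): a standalone check of the
# floorplan regex used above, run on a made-up style attribute value to show what
# the capture group returns.
if __name__ == "__main__":
    import re

    sample_style = "background-image: url('https://example.com/images/floorplan-1.png');"
    match = re.match(r".*url\('(.*)'\).*", sample_style)
    if match:
        print(match.group(1))  # prints: https://example.com/images/floorplan-1.png
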
def start_requests(self):
    cookies = build_cookies(self)
    file_dir = os.getcwd()
    sub_dir = os.sep.join(['ship', 'shipping'])
    x = 0
    for fn_item in os.walk(sub_dir):
        for fn in fn_item[2]:
            self.source = fn.split(u' - ')[0]
            file_path = os.sep.join([file_dir, sub_dir, fn])
            response = Response(file_path,
                                body=''.join(open(file_path, u'r').readlines()))
            response.body_as_unicode = lambda: response.body
            hxs = HtmlXPathSelector(response)
            a_tags = hxs.select('//table[@class="text2"]//a[@class="links2"]')
            for a_tag in a_tags:
                detail_url = a_tag.select('@href').extract()[0]
                try:
                    title = a_tag.select('text()').extract()[0]
                except Exception as e:
                    continue
                yield Request(self.home_page + detail_url, self.parse, cookies=cookies)
    print(x)

def replace(self, *args, **kwargs):
    kwargs.setdefault('encoding', self.encoding)
    return Response.replace(self, *args, **kwargs)

def replace(self, *args, **kwargs): kwargs.setdefault("encoding", getattr(self, "_encoding", None)) return Response.replace(self, *args, **kwargs)