def parse(self, response: Response, **kwargs):
    articles = response.xpath('//div[@class="pad5 english_article persian_article small_font"]')
    for article in articles:
        download_url = article.css('.article_links').xpath('(.//a)[3]/@href').get()
        download_url = response.urljoin(download_url)
        info_url = article.css('.article_links').xpath('(.//a)[4]/@href').get()
        info_url = response.urljoin(info_url)
        yield Request(info_url, cb_kwargs={'download_url': download_url}, callback=self.parse_info)
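
# cb_kwargs injects download_url directly into the callback's signature, so
# parse_info receives it as a plain argument. A minimal sketch of a matching
# callback (the yielded fields are illustrative assumptions):
def parse_info(self, response: Response, download_url: str):
    yield {
        'download_url': download_url,
        'title': response.css('title::text').get(),  # hypothetical field
    }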
def parse_forum_page(self, response: Response, forum_url: str = None):
    """
    Forum page callback. Parses TopicItem. Follows the next forum page and threads.

    :param forum_url: forum url, from the first page. Extracted from response meta if not provided.
    :param response: scrapy crawl response
    """
    if forum_url is None:
        forum_url = response.meta['forum_url']
    # match every thread container variant (read/hot/locked/moved/sticky/announcement)
    threads = response.css(
        'div.topic_read,div.topic_read_hot,div.topic_read_locked,div.topic_moved,div.sticky_read,'
        'div.sticky_read_locked,div.announce_read,div.announce_read_locked'
    )
    too_old_thread_found = False
    for thread_container in threads:
        thread = thread_container.css('a.topictitle')
        topic_loader = ItemLoader(item=TopicItem(), response=response)
        thread_href_selector = thread.css('a::attr(href)')
        thread_link = response.urljoin(thread_href_selector.get())
        topic_loader.add_value('id', thread_href_selector.re(r'-(t[0-9]*).html'))
        topic_loader.add_value('thread_link', thread_link)
        topic_loader.add_value('forum_link', forum_url)
        topic_loader.add_value('name', thread.css('a::text').get())
        yield topic_loader.load_item()
        if not self.full_crawl:
            # skip threads whose newest post predates the configured start date
            last_post_date_candidates = thread_container.css('span.post-date::text').getall()
            last_post_date = max(map(parse_date, last_post_date_candidates))
            if last_post_date < self.start_date:
                too_old_thread_found = True
                continue
        yield scrapy.Request(thread_link + "?sd=d", callback=self.parse_thread)
    next_page = response.css('a[rel=next]::attr(href)').get()
    if next_page and not too_old_thread_found:
        next_request = response.urljoin(next_page)
        yield scrapy.Request(next_request, callback=self.parse_forum_page,
                             meta={'forum_url': forum_url})
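
# The date comparisons above rely on a parse_date() helper that is not shown
# in this section. A minimal sketch of what it could look like, assuming the
# forum prints dates in a fixed "DD-MM-YYYY, HH:MM" format (the format string
# is an assumption, not necessarily the site's actual one):
from datetime import datetime

def parse_date(date_string: str) -> datetime:
    """Turn a raw forum date string into a comparable datetime object."""
    return datetime.strptime(date_string.strip(), '%d-%m-%Y, %H:%M')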
def parse_forum(self, response: Response):
    """
    Forum callback. Parses ForumItem. Follows subforum links and thread links
    (through the self.parse_forum_page() method).

    :param response: scrapy crawl response
    """
    forum_loader = ItemLoader(item=ForumItem(), response=response)
    forum_loader.add_value('link', response.request.url)
    forum_loader.add_css('name', 'h2 > a::text')
    yield forum_loader.load_item()
    subforums = response.css('a.forumtitle::attr(href)').getall()
    for forum in subforums:
        next_request = response.urljoin(forum)
        yield scrapy.Request(next_request, callback=self.parse_forum)
    yield from self.parse_forum_page(response, response.url)
def parse(self, response: Response):
    """
    Default scrapy callback. To be used on the forum main page. Follows subforum links.

    :param response: scrapy crawl response
    :returns: :class:`hyperreal.crawler.hypercrawler.items.PostItem`,
        :class:`hyperreal.crawler.hypercrawler.items.ForumItem`,
        :class:`hypercrawler.items.TopicItem`
    """
    date = self.settings.get('START_DATE')
    self.full_crawl = date is None
    if not self.full_crawl:
        self.start_date = date
    subforums = response.css('a.forumtitle::attr(href)').getall()
    for forum in subforums:
        next_request = response.urljoin(forum)
        yield scrapy.Request(next_request, callback=self.parse_forum)
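
# self.start_date is later compared against datetime objects returned by
# parse_date(), so the START_DATE setting must hold a datetime. A sketch of
# launching a date-limited crawl (the spider class name and its import path
# are hypothetical):
from datetime import datetime

from scrapy.crawler import CrawlerProcess

from hyperreal.crawler.hypercrawler.spiders import HyperrealSpider  # hypothetical path

process = CrawlerProcess(settings={'START_DATE': datetime(2021, 1, 1)})
process.crawl(HyperrealSpider)  # posts older than 2021-01-01 are skipped
process.start()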
def parse(self, response: Response):
    current_code1 = None
    for code in response.css("#main > #content > #content-inner > span.text"):
        if len(code.css('b')) != 0:
            # a bold span opens a new top-level code section
            current_code1 = code.css('b::text').get().strip()
            continue
        current_item = IarcItem()
        current_item['code1'] = current_code1
        current_item['code2'] = code.css('::text').get().strip()
        current_item['code2_name'] = code.css('a::text').get().strip()
        yield Request(
            response.urljoin(code.css('a').xpath('@href').get()),
            self.parse_code2,
            meta={'item': current_item})
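
# The partially-filled IarcItem travels to parse_code2 through the request
# meta. A minimal sketch of how that callback could pick the item up and
# finish it (the extra field and the selector are illustrative assumptions):
def parse_code2(self, response: Response):
    item = response.meta['item']                    # IarcItem started in parse()
    item['code3'] = response.css('h1::text').get()  # hypothetical extra field
    yield item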
def parse_main_page(self, response: Response):
    book_urls = response.xpath(BOOK_URL)
    genre_urls = response.xpath(GENRE_URL)
    for url in book_urls:
        short_name = get_book_name_from_url(url.get())
        yield Request(
            url=BASE_URL.format(short_name),
            callback=self.parse_book_info,
            cb_kwargs=dict(short_name=short_name)
        )
    genre_urls = [x.get() for x in genre_urls]
    # deduplicate while preserving order, so genres[k] still corresponds to genre_urls[k]
    genres = list(dict.fromkeys(x.replace('/', '') for x in genre_urls))
    for k in range(2):
        yield Request(
            url=response.urljoin(genre_urls[k]),
            callback=self.parse_books_in_page
        )
        yield Request(
            url=GENRE_LIST_URL.format(genres[k]),
            callback=self.parse_genre_list,
            cb_kwargs=dict(genre=genres[k])
        )
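
# get_book_name_from_url() and the URL constants used above are defined
# elsewhere in the project. A minimal sketch of plausible definitions, assuming
# book links end in a "/<short-name>" path segment (every value below is an
# illustrative assumption, not the site's real URL scheme or markup):
BASE_URL = 'https://example.com/{}'
GENRE_LIST_URL = 'https://example.com/genre/{}'
BOOK_URL = '//a[@class="book"]/@href'      # XPath selecting book hrefs
GENRE_URL = '//a[@class="genre"]/@href'    # XPath selecting genre hrefs

def get_book_name_from_url(url: str) -> str:
    """Extract the short book name from the last path segment of its URL."""
    return url.rstrip('/').rsplit('/', 1)[-1]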
def parse_thread(self, response: Response):
    """
    Thread page callback. Parses PostItem. Follows the next thread page.

    :param response: scrapy crawl response
    """
    posts = response.css('div.post.panel-body')
    post_number = 1
    too_old_post_found = False
    for post in posts:
        post_loader = ItemLoader(item=PostItem(), selector=post)
        post_loader.add_value(
            'username',
            post.css('a.username-coloured::text,a.username::text').get())
        post_date_string = post.css('div.post-date::text')[1].get()
        if post_date_string is None:
            continue
        # drop the three leading characters and one trailing character around the date
        post_date = parse_date(post_date_string[3:-1])
        post_loader.add_value('date', str(post_date))
        post_loader.add_value(
            'post_id',
            post.css('div.post-date > a::attr(href)').re(r'.html#(.*)'))
        post_loader.add_value('thread_url', response.request.url)
        post_loader.add_value('post_number', post_number)
        post_number += 1
        post_loader.add_value('content', post.css('div.content').get())
        if not self.full_crawl:
            if post_date < self.start_date:
                too_old_post_found = True
                continue
        yield post_loader.load_item()
    next_page = response.css('a[rel=next]::attr(href)').get()
    if next_page and not too_old_post_found:
        next_request = response.urljoin(next_page)
        yield scrapy.Request(next_request, callback=self.parse_thread)
def parse_book_info(self, response: Response, short_name):
    # Get the book's full name and author
    loader = ItemLoader(item=BookInfo(), response=response)
    # Find elements
    loader.add_css(FULL_NAME, BOOK_FULL_NAME_PATH)
    loader.add_css(AUTHOR, BOOK_AUTHOR_PATH)
    loader.add_css(LAST_CHAPTER, BOOK_LAST_CHAPTER_PATH)
    loader.add_css(CHAPTERS, BOOK_CHAPTER_PATH)
    # Extract the data
    page = loader.load_item()
    last_chapter = int(page.get(LAST_CHAPTER))
    yield {
        SHORT_NAME: short_name,
        FULL_NAME: page.get(FULL_NAME),
        AUTHOR: page.get(AUTHOR),
        LAST_CHAPTER: last_chapter
    }
    urls = tuple(response.urljoin(x) for x in page.get(CHAPTERS))
    for url in urls:
        try:
            # chapter URLs contain a "chuong-<index>-" segment; pull out the index
            chapter_part = url.split('chuong-')[1]
            chapter_index = int(chapter_part.split('-')[0])
            yield SplashRequest(
                url=url,
                callback=self.parse_chapter,
                cb_kwargs=dict(short_name=short_name, chapter_index=chapter_index),
                args={'lua_source': WAIT_FOR_ELEMENT.format('#borderchapter')})
        except Exception as e:
            logging.error(str(e))
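
# WAIT_FOR_ELEMENT is a Lua script template handed to Splash so the chapter
# body is fully rendered before parse_chapter runs. A minimal sketch of such a
# template, assuming the placeholder receives a CSS selector like
# '#borderchapter' (the polling loop is illustrative, not the project's script):
WAIT_FOR_ELEMENT = """
function main(splash)
    splash:go(splash.args.url)
    -- poll until the target element appears, then return the rendered HTML
    while not splash:select('{}') do
        splash:wait(0.1)
    end
    return splash:html()
end
"""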
def parse_article(self, response: Response):
    """Specific parsing logic for Geotribu articles

    :param Response response: HTTP response returned by the requested URL
    """
    logging.info("Start parsing ARTICLE: {}".format(
        response.css("title::text").get()))
    item = ArticleItem()

    # article content
    art = response.css("article")[0]

    # title
    art_title_section = art.css("div.title-and-meta")
    art_title = art_title_section.css("h2.node__title a::text").get()
    item["title"] = art_title

    # article kind - until 2013, press reviews were articles like any other
    # and were not as structured
    if "revue de presse" in art_title.lower():
        item["kind"] = "rdp"
    else:
        item["kind"] = "art"

    # url (made absolute to match the field name)
    art_rel_url = art_title_section.css("h2.node__title a::attr(href)").get()
    item["url_full"] = response.urljoin(art_rel_url)

    # publication date
    art_date = art.css("div.date")
    art_date_day = art_date.css("span.day::text").get()
    art_date_month = art_date.css("span.month::text").get()
    art_date_year = art_date.css("span.year::text").get()
    item["published_date"] = (art_date_day, art_date_month, art_date_year)

    # tags
    item["tags"] = art_title_section.css("span.taxonomy-tag a::text").getall()

    # grab the intro
    try:
        item["intro"] = art.css("div.field-name-field-introduction").getall()[0]
    except IndexError:
        logging.debug("Article doesn't have an introduction.")
        item["intro"] = None

    # body
    art_raw_body = art.css("div.field-name-body")
    art_out_body = []
    for el in art_raw_body:
        art_out_body.append(el.get())
    item["body"] = art_out_body

    # image URLs (converted into absolute)
    item["image_urls"] = [
        response.urljoin(i) for i in art.css("img").xpath("@src").getall()
    ]

    # author
    author_block = art.css("div.view.view-about-author")
    if author_block:
        # author thumbnail
        thumbnails = author_block.css("img").xpath("@src").getall()
        thumbnail = thumbnails[0] if thumbnails else "?"

        # author name
        names = author_block.css(
            "div.views-field.views-field-field-nom-complet").css(
                "div.field-content::text").getall()
        name = names[0] if names else "?"

        item["author"] = {
            "thumbnail": thumbnail,
            "name": name,
            "description": author_block.css(
                "div.views-field.views-field-field-description p").getall(),
        }
    else:
        item["author"] = {
            "thumbnail": "?",
            "name": art_title_section.css("span.username a::text").get(),
            "description": "",
        }

    yield item
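
# The image_urls field follows the convention of Scrapy's built-in
# ImagesPipeline, which downloads every URL listed there and records the
# results in the item's images field. A sketch of the settings that would
# activate it (the storage path is an assumption):
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/tmp/geotribu-images'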
def parse(self, response: Response):
    # the XPath template is indexed by two positional counters
    for i in range(3, 9):
        for j in range(1, 3):
            piece_url = response.xpath(BASE_PIECE_XPATH.format(i, j)).get()
            yield Request(url=response.urljoin(piece_url),
                          callback=self.parse_chess_piece)
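
# BASE_PIECE_XPATH is the template indexed by the two loop counters above. A
# minimal sketch of what it could look like, assuming the pieces sit in an
# HTML table with one link per cell (the layout is an assumption):
BASE_PIECE_XPATH = '//table//tr[{}]/td[{}]//a/@href'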