def parse_article(self, response):
    """Parse a Middle East news article page into a NewsCrawlerItem.

    Extracts body paragraphs, og:title headline, last-modified
    timestamp (normalised to ISO 8601), categories, and response
    metadata, then returns the loaded item.
    """
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'middle east')
    article.add_value('language', 'english')
    article.add_value('stopwords', ['al', 'jazeera'])
    article.nested_css('div.article-body').add_xpath('body', './p//text()')
    article.nested_css('meta[property="og:title"]').add_xpath(
        'headline', './@content')

    def _to_iso8601(value):
        # datetime.strptime never returns None -- it raises ValueError on
        # malformed input, so the previous `is None` guard was dead code and
        # a bad date string crashed the spider.  Map parse failures to ''
        # (the clearly intended fallback) and parse only once, not twice.
        try:
            return datetime.strptime(
                value, '%a, %d %B %Y %H:%M:%S %Z').isoformat(sep='T')
        except ValueError:
            return ''

    # Input processor: join extracted strings, then normalise to ISO 8601.
    published_time_in = Compose(Join(), _to_iso8601)
    article.nested_css('meta[name="LastModifiedDate"]').add_xpath(
        'published_time',
        './@content',
        published_time_in,
    )
    article.nested_css('span.article-topics').add_xpath(
        'category', './/text()')
    article.nested_css('meta[property="ContentType"]').add_xpath(
        'category', './@content')
    article.add_value('url', response.url)
    article.add_value('encoding', response.encoding)
    return article.load_item()
def parse_recipe(self, response):
    """Build a RecipesItem from a recipe detail page."""
    loader = ItemLoader(item=RecipesItem(), selector=response)
    # Flat fields read straight off the page.
    flat_fields = (
        ('name', 'div.container h1.main-title span ::text'),
        ('image', 'div.product-area div.cuisine-page img::attr(src)'),
        ('proportion', 'div.product-area div.people div.number ::text'),
        ('time_prep',
         'div.product-area div.columns :nth-child(3) div.number ::text'),
        ('time_cook',
         'div.product-area div.columns :last-child div.number ::text'),
        ('category_name', 'div.main-content li:nth-child(3) a span ::text'),
        ('desc', 'div.product-area div.description p::text'),
    )
    for field, selector in flat_fields:
        loader.add_css(field, css=selector)
    # Repeated sub-structures are scoped through nested loaders.
    loader.nested_css('div.ingredient-area div.ingredient li').add_css(
        'ingredients', css='::text')
    loader.nested_css('div.preparation li').add_css(
        'preparation', css='div > p ::text')
    loader.nested_css('div.name-area div.item').add_css(
        'areas', css='div.title a ::text')
    yield loader.load_item()
def parse_single(response):
    """Load a PageItem from a single wine-review page."""
    loader = ItemLoader(item=PageItem(), response=response)
    loader.add_value('url', response.url)
    for field, selector in (('title', 'div.article-title'),
                            ('vintage', 'div.article-title'),
                            ('rating', '#points'),
                            ('description', 'p.description')):
        loader.add_css(field, selector)
    primary = loader.nested_css('ul.primary-info')
    appellation = primary.nested_css('li.row:nth-last-child(2) div.info')
    primary.add_css('price', 'li.row:nth-child(1) div.info')
    # A five-row primary-info list carries an extra "designation" entry.
    if len(primary.selector.css('li.row')) == 5:
        primary.add_css('designation', 'li.row:nth-child(2) div.info')
    primary.add_css('varietal', 'li.row:nth-last-child(3) div.info')
    # Geography links are addressed from the end of the span inward.
    for field, pos in (('subsubregion', 4), ('subregion', 3),
                       ('region', 2), ('country', 1)):
        appellation.add_css(field, 'span a:nth-last-child({})'.format(pos))
    primary.add_css('winery', 'li.row:nth-last-child(1) div.info')
    secondary = loader.nested_css('ul.secondary-info')
    secondary.add_css('alcohol', 'li.row:nth-child(1) div.info')
    secondary.add_css('category', 'li.row:nth-child(3) div.info')
    yield loader.load_item()
def parse(self, response):
    """Extract a RecipeItem (name, image, ingredients, preparation)."""
    loader = ItemLoader(item=RecipeItem(), selector=response)
    loader.add_css('recipe_name',
                   css='div.container h1.main-title span ::text')
    loader.add_css(
        'recipe_image',
        css='div.container div.product-area div.image ::attr(src)')
    # One nested loader per repeated list structure.
    loader.nested_css('div.ingredient li').add_css(
        'recipe_ingredients', css='::text')
    loader.nested_css('div.preparation li').add_css(
        'recipe_prepa', css='div > p ::text')
    yield loader.load_item()
def parse_rent(self, response: HtmlResponse) -> Iterator[LyEstateRentItem]:
    """Parse a rental-listing detail page into a LyEstateRentItem.

    Pulls the title, the tabular attributes from the ``.dtl_frinfo``
    info table, the free-text description and the gallery image URLs.
    """
    loader = ItemLoader(item=LyEstateRentItem(), selector=response)
    loader.add_css('title', 'h1::text')
    # All structured attributes live in one table; scope a nested loader.
    info_loader = loader.nested_css('.dtl_frinfo')
    info_loader.add_css('per_month_price', 'tr:nth-child(1) b::text')
    # r'\d+': '\d' is an invalid escape sequence in a plain string literal
    # (SyntaxWarning on modern Python); a raw string keeps the regex intact.
    info_loader.add_css('area', 'tr:nth-child(2) td:nth-child(1)::text',
                        re=r'\d+')
    info_loader.add_css('style', 'tr:nth-child(2) td:nth-child(2)::text',
                        re=self._INFO_PATTERN)
    info_loader.add_css('direction', 'tr:nth-child(3) td:nth-child(1)::text',
                        re=self._INFO_PATTERN)
    info_loader.add_css('floor', 'tr:nth-child(3) td:nth-child(2)::text',
                        re=self._INFO_PATTERN)
    info_loader.add_css('house_type',
                        'tr:nth-child(4) td:nth-child(1) a::text')
    info_loader.add_css('decoration', 'tr:nth-child(4) td:nth-child(2)::text',
                        re=self._INFO_PATTERN)
    info_loader.add_css('age', 'tr:nth-child(5) td::text',
                        re=self._INFO_PATTERN)
    info_loader.add_css('address', 'tr:nth-child(7) td::text',
                        re=self._INFO_PATTERN)
    loader.add_css('description', '.dtl_content *::text')
    loader.add_css('images', '.dtl_pics img::attr("src")')
    yield loader.load_item()
def parse_forum_post(self, response: HtmlResponse):
    """Parse one page of a forum thread.

    Yields a LyCommunityUserItem for each post author and a
    LyCommunityCommentItem for each post body.  Floor numbers are made
    absolute across pages via ``response.meta['page'] * 10 + floor``
    (presumably 10 posts per page -- confirm against the pagination).
    """
    for floor, post_block in enumerate(
            response.css('#postlist div.postaaa'), start=1):
        # =============== parse user data =======================
        user_block = post_block.css('.favatar')
        username = user_block.css('.xw1::text').extract_first()
        user_loader = ItemLoader(item=LyCommunityUserItem(),
                                 selector=user_block,
                                 base_url='http://www.lysq.com/')
        user_loader.add_value('username', username)
        user_loader.add_css('avatar_url', '.avtm img::attr("src")')
        user_loader.add_css('medal_list', 'p.md_ctrl img::attr("alt")')
        # r'\d+': '\d' in a plain string is an invalid escape sequence
        # (SyntaxWarning on modern Python); use a raw string for the regex.
        user_loader.add_css('coin_count', 'dl.pil dd::text', re=r'\d+')
        user_loader.add_css(
            'user_group',
            'a[href^="home.php?mod=spacecp&ac=usergroup"]::text')
        user_loader.add_css('signature', 'p.xg1::text')
        user_data_loader = user_loader.nested_css('div.tns')
        user_data_loader.add_css('topic_count', 'th:nth-child(1) a::text')
        user_data_loader.add_css('post_count', 'th:nth-child(2) a::text')
        user_data_loader.add_css('credit_count', 'td span::attr("title")')
        user_data_loader.add_css('credit_count', 'td a::text')
        yield user_loader.load_item()

        # =============== parse post comment =================
        comment_block = post_block.css('td.plc')
        content = comment_block.css('.pcb .t_fsz').extract_first()
        comment_loader = ItemLoader(item=LyCommunityCommentItem(),
                                    selector=comment_block,
                                    base_url='http://www.lysq.com/')
        comment_loader.add_value('post_url', response.meta['post_url'])
        comment_loader.add_value('author_username', username)
        comment_loader.add_value('content', content)
        comment_loader.add_value('image_urls', content, re=RE_IMG_SRC)
        comment_header_loader = comment_loader.nested_css('div.pi')
        comment_header_loader.add_css('floor', 'strong em::text')
        comment_header_loader.add_value('floor',
                                        response.meta['page'] * 10 + floor)
        comment_header_loader.add_css('created_time', 'div.authi em::text',
                                      re=RE_DATETIME)
        yield comment_loader.load_item()
def parse_actress_detail_cn(self, response: HtmlResponse):
    """Scrape the Chinese actress page, then follow to the English one.

    The partially-loaded item rides along in ``meta['item']`` so the
    English callback can complete it.
    """
    loader = ItemLoader(item=AvmooActressItem(), selector=response)
    box = loader.nested_css('.avatar-box')
    for field, selector in (('avatar', 'img::attr(src)'),
                            ('name_cn', 'span::text'),
                            ('info', 'p::text')):
        box.add_css(field, selector)
    english_url = response.url.replace('/cn/', '/en/')
    yield response.follow(english_url,
                          callback=self.parse_actress_detail_en,
                          meta={'item': loader.load_item()})
def parse_article(self, response):
    """Parse a UK article page into a NewsCrawlerItem and return it."""
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value("country", 'uk')
    article.add_value("language", 'english')
    article.add_value("url", response.url)
    article.add_value("encoding", response.encoding)
    article.nested_css("div.main-content-column").add_xpath(
        "body", './div/p//text()')
    article.add_xpath("headline",
                      '//head/meta[@property="og:title"]/@content')

    def _iso_or_empty(raw):
        # Normalise the published timestamp to ISO 8601; '' if unparsable.
        parsed = ciso8601.parse_datetime(raw)
        return '' if parsed is None else parsed.isoformat(sep='T')

    article.nested_css('meta[property="article:published_time"]').add_xpath(
        'published_time',
        './@content',
        Compose(Join(), _iso_or_empty),
    )
    article.add_xpath("category",
                      '//head/meta[@property="article:section"]/@content')
    article.add_xpath("keywords", '//head/meta[@name="keywords"]/@content')
    return article.load_item()
def parse_article(self, response):
    """Parse a UK article page into a NewsCrawlerItem and return it."""
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'uk')
    article.add_xpath('language', '//html/@lang')
    article.add_xpath('headline',
                      '//head/meta[@property="og:title"]/@content')
    article.add_value('url', response.url)

    def _iso_or_empty(raw):
        # Normalise the published timestamp to ISO 8601; '' if unparsable.
        parsed = ciso8601.parse_datetime(raw)
        return '' if parsed is None else parsed.isoformat(sep='T')

    article.nested_css('meta[property="article:published_time"]').add_xpath(
        'published_time',
        './@content',
        Compose(Join(), _iso_or_empty),
    )

    def _split_keywords(values):
        # The keywords meta holds a comma-separated list in one attribute.
        return values[0].split(',') if values else None

    article.add_xpath('category',
                      '//head/meta[@property="keywords"]/@content',
                      _split_keywords)
    article.add_value('encoding', response.encoding)
    article.nested_css('div[itemprop="articleBody"]').add_xpath(
        'body', './p//text()')
    return article.load_item()
def parse_article(self, response):
    """Parse a US article page into a NewsCrawlerItem and return it."""
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'usa')
    article.add_value('language', 'english')
    article.nested_css('meta[property="og:title"]').add_xpath(
        'headline', './@content')
    article.add_value('url', response.url)

    def _iso_or_empty(raw):
        # Normalise the published timestamp to ISO 8601; '' if unparsable.
        parsed = ciso8601.parse_datetime(raw)
        return '' if parsed is None else parsed.isoformat(sep='T')

    article.nested_css('meta[name="pubdate"]').add_xpath(
        'published_time',
        './@content',
        Compose(Join(), _iso_or_empty),
    )
    article.add_xpath('category', '//head/meta[@name="section"]/@content')
    article.add_xpath('keywords',
                      '//head/meta[@itemprop="keywords"]/@content',
                      re=r'(.*) -')
    article.add_value('encoding', response.encoding)
    body_loader = article.nested_css('div.pg-rail-tall__body').nested_css(
        'div.l-container')
    body_loader.add_xpath('body', './/div[re:test(@class, "zn-.*")]/text()')
    return article.load_item()
def parse_book_brief(self, response):
    """Yield a QuanbenBookItem, then follow its chapter-list link.

    The loaded item is forwarded via ``meta['book_item']`` so the
    chapter-list callback can attach chapters to it.
    """
    loader = ItemLoader(item=QuanbenBookItem(), selector=response)
    box = loader.nested_css('div.box')
    for field, selector in (
            ('cover', 'img::attr(src)'),
            ('name', 'h3 span::text'),
            ('author_name', 'span[itemprop="author"]::text'),
            ('category', 'span[itemprop="category"]::text'),
            ('status', 'p:last-child span::text'),
            ('brief', 'div.description *::text')):
        box.add_css(field, selector)
    item = loader.load_item()
    yield item
    chapter_list_url = response.css(
        'div.box a.button.s1::attr(href)').extract_first()
    yield response.follow(chapter_list_url,
                          callback=self.parse_chapter_list,
                          headers=self._HEADERS,
                          meta={'book_item': item})
def parse_article(self, response):
    """Parse a BBC article page into a NewsCrawlerItem and return it."""
    article = ItemLoader(item=NewsCrawlerItem(), response=response)
    article.add_value('country', 'uk')
    article.add_value('language', 'english')
    article.add_value('url', response.url)
    article.nested_css('meta[property="og:title"]').add_xpath(
        'headline', './@content', re=r'(.*) - BBC')

    def _iso_or_empty(raw):
        # Normalise the published timestamp to ISO 8601; '' if unparsable.
        parsed = ciso8601.parse_datetime(raw)
        return '' if parsed is None else parsed.isoformat(sep='T')

    # The timestamp sits inside an embedded JSON-LD script block.
    article.add_xpath(
        'published_time',
        '//*[@id="responsive-news"]/head/script[1]/text()',
        Compose(Join(), _iso_or_empty),
        re=r'"datePublished": "(.*)"',
    )
    article.nested_css('meta[property="article:section"]').add_xpath(
        'category', './@content')
    article.add_value('encoding', response.encoding)
    # Collect body paragraphs from both candidate containers.
    article.nested_css('div.story-body__inner').add_xpath(
        'body', './p//text()')
    article.nested_css('map-body').add_xpath('body', './p//text()')
    return article.load_item()
def parse_person(self, response):
    """Parse a person profile page into a Person item.

    Overview labels (born, gender, location, website, ...) are matched
    dynamically to Item fields; labels without a matching field are
    skipped.
    """
    loader = ItemLoader(item=Person(), response=response)
    loader.default_input_processor = processors.MapCompose(
        w3lib.html.remove_tags)
    loader.default_output_processor = processors.TakeFirst()
    loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
    loader.add_value('url', response.url)
    loader.add_xpath(
        'primary_role',
        '//*[@id="info-card-overview-content"]/div/dl/div/dd')
    # Fields expected: born, gender, location, website
    overview = response.xpath(
        '//*[@id="info-card-overview-content"]/div/dl/dt/text()')
    overview_loader = loader.nested_xpath(
        '//*[@id="info-card-overview-content"]/div/dl')
    # enumerate(..., start=1) replaces the range(len(...)) anti-pattern;
    # XPath positions are 1-based, so the dd index lines up directly.
    for position, label in enumerate(overview, start=1):
        text = label.extract()
        key = text[:text.find(':')].lower()
        try:
            overview_loader.add_xpath(key, 'dd[{}]/text()'.format(position))
        except KeyError:
            # Ignore labels with no corresponding field on the Item.
            pass
    loader.add_xpath('facebook',
                     '(//a[contains(@class,"facebook")])[1]/@href')
    loader.add_xpath('twitter',
                     '(//a[contains(@class,"twitter")])[1]/@href')
    loader.add_xpath('linkedin',
                     '(//a[contains(@class,"linkedin")])[1]/@href')
    loader.add_xpath('description', '//*[@id="description"]/span/div')
    loader.add_css('current_jobs', '.current_job')
    loader.add_css('past_jobs', '.past_job')
    loader.nested_css('.advisory_roles').add_xpath('board_advisors',
                                                   './/ul/li')
    loader.nested_css('table.investors').add_xpath(
        'investments', './/tr[not(@class="thead")]')
    loader.nested_css('.education').add_xpath('education', './/ul/li')
    return loader.load_item()
def season_matches_parse(self, response):
    """
    Callback to handle parsing of the match list for each tournament season.

    Parses url such as https://www.rugbypass.com/{tournament}/matches/{season}

    Yields one Match item per listed game, then follows each game's
    stats-page link with the item attached in ``meta['match']``.
    """
    print(
        f" Parsing match list for tournament '{response.meta['tournament']}' in season '{response.meta['season']}'"
    )
    self.logger.info(
        f"############### Parsing match list for tournament '{response.meta['tournament']}' in season '{response.meta['season']}' ###############"
    )

    # CSS selectors for the per-match metadata fields.
    field_selectors = {
        'match_id': ' ::attr(data-id)',
        'date': ' ::attr(data-date)',
        'home_team_id': " ::attr(data-home)",
        'away_team_id': " ::attr(data-away)",
        'status': " ::attr(data-status)",
        'start_time': " [itemprop='startDate']::attr(content)",
        'end_time': " [itemprop='endDate']::attr(content)",
        'performers': " [itemprop='performer']::attr(content)",
        'description': " [itemprop='description']::attr(content)",
        'rugbypass_price': " [itemprop='price']::attr(content)",
        'rugbypass_price_curr': " [itemprop='priceCurrency']::attr(content)",
        'title': "a.link-box::attr(href)",
    }

    listed_ids = response.css(
        "div[class=game-round] div[itemscope] ::attr(data-id)").getall()
    for match_id in listed_ids:
        # Fresh loader per match, scoped to that match's block only.
        loader = ItemLoader(item=Match(), response=response)
        scoped = loader.nested_css(f"[data-id='{match_id}']")
        for field, selector in field_selectors.items():
            scoped.add_css(field, selector)
        scoped.add_value('tournament', response.meta['tournament'])
        scoped.add_value('season', response.meta['season'])
        match = scoped.load_item()
        yield match

        # Follow through to this match's stats page, carrying the item.
        stats_href = response.css(
            f"[data-id='{match_id}'] a.link-box::attr(href)").get()
        yield response.follow(
            url=urljoin(stats_href, f"{response.meta['season']}/stats"),
            callback=self.match_stats_page_parse,
            meta={'match': match})