def parse_category(self, response):
    meta = dict(response.meta)

    # Navigate to article
    article_urls = response.css(
        "div.main-content .late-news-lst li .late-news-tit a::attr(href)"
    ).extract()
    self.logger.info("Parse url {}, Num Article urls : {}".format(
        response.url, len(article_urls)))
    for article_url in article_urls:
        if utils.is_valid_url(article_url):
            yield Request(article_url,
                          self.parse_article,
                          meta={"category": meta["category"]},
                          errback=self.errback)

    # Navigate to next page
    if meta["page_idx"] < self.page_per_category_limit and len(
            article_urls) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(next_page,
                      self.parse_category,
                      meta=meta,
                      errback=self.errback)
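# The parsers above and below rely on a small `utils` helper module that is not
# shown here. A minimal sketch of the assumed `utils.is_valid_url` check
# follows; the project's real helper may apply stricter rules (this version
# only verifies that the URL is absolute with an http(s) scheme and a host).
from urllib.parse import urlparse


def is_valid_url(url):
    """Return True if `url` looks like an absolute http(s) URL."""
    if not url:
        return False
    parsed = urlparse(url)
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)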
def parse_category(self, response):
    meta = response.meta

    # Navigate to article
    articles_urls = response.css("li>div> a::attr(href)").extract()
    self.logger.info("Parse url {}, Num Article urls : {}".format(
        response.url, len(articles_urls)))
    for article_url in articles_urls:
        article_url = self.base_url + article_url
        if utils.is_valid_url(article_url):
            yield Request(article_url,
                          self.parse_article,
                          meta=meta,
                          errback=self.errback)

    # Navigate to next page
    if meta["page_idx"] < self.page_per_category_limit and len(
            articles_urls) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(
            meta["category_id"], meta["page_idx"])
        yield Request(next_page,
                      self.parse_category,
                      meta=meta,
                      errback=self.errback)
def parse_category_type2(self, response):
    # Example: thanhnien.vn/giao-duc
    meta = response.meta

    # Navigate to article
    article_urls = response.css(
        ".cate-content .zone--timeline article>a::attr(href)").extract()
    self.logger.info("Parse url {}, Num Article urls : {}".format(
        response.url, len(article_urls)))
    for article_url in article_urls:
        article_url = meta["base_url"] + article_url
        if utils.is_valid_url(article_url):
            yield Request(article_url,
                          self.parse_article_type2,
                          meta={"category": meta["category"]},
                          errback=self.errback)

    # Navigate to next page
    if meta["page_idx"] < self.page_per_category_limit and len(
            article_urls) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(next_page,
                      self.parse_category_type2,
                      meta=meta,
                      errback=self.errback)
def parse_category(self, response):
    meta = response.meta

    # Navigate to article
    article_urls = []
    article_urls.extend(
        response.css(
            "section.featured .title_news a:first-child::attr(href)").
        extract())
    article_urls.extend(
        response.css(
            "section.sidebar_1 .title_news a:first-child::attr(href)").
        extract())
    article_urls = list(set(article_urls))
    self.logger.info("Parse url {}, Num Article urls : {}".format(
        response.url, len(article_urls)))
    for article_url in article_urls:
        if utils.is_valid_url(article_url):
            yield Request(article_url,
                          self.parse_article,
                          meta={"category": meta["category"]},
                          errback=self.errback)

    # Navigate to next page
    if meta["page_idx"] < self.page_per_category_limit and len(
            article_urls) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(next_page,
                      self.parse_category,
                      meta=meta,
                      errback=self.errback)
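# Every Request above registers `self.errback`. The project's real handler is
# not shown; a plausible minimal sketch (an assumption) simply logs the failed
# request so broken category or article pages do not stop the crawl silently.
def errback(self, failure):
    """Log request failures passed in by Scrapy."""
    request = failure.request
    self.logger.error("Request failed: {} ({})".format(
        request.url, failure.getErrorMessage()))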
def parse_category(self, response):
    meta = dict(response.meta)

    # Navigate to article
    article_urls = response.css(
        ".contentpage .listhlv21 a::attr(href)").extract()
    article_urls.extend(
        response.css(
            ".contentpage .listitem .item-bt>a::attr(href)").extract())
    self.logger.info("Parse url {}, Num Article urls : {}".format(
        response.url, len(article_urls)))
    for article_url in article_urls:
        article_url = self.base_url + article_url
        if utils.is_valid_url(article_url):
            yield Request(article_url,
                          self.parse_article,
                          meta={"category": meta["category"]},
                          errback=self.errback)

    # Navigate to next page
    if meta["page_idx"] < self.page_per_category_limit and len(
            article_urls) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(next_page,
                      self.parse_category,
                      meta=meta,
                      errback=self.errback)
def parse_category(self, response):
    meta = response.meta

    # The category endpoint returns JavaScript of the form "retvar = [...]";
    # strip the prefix and parse the remainder as JSON.
    prefix_str = "retvar ="
    data = response.css("::text").extract_first()
    data = json.loads(data[len(prefix_str):])

    # Navigate to article
    self.logger.info("Parse url {}, Num Article urls : {}".format(
        response.url, len(data)))
    for article in data:
        time = '_'.join([article["publishdate"], article["publishtime"]])
        time = self.transform_time_fmt(time, src_fmt="%d/%m/%Y_%H:%M")
        article_info = {
            "category": meta["category"],
            "title": article["title"],
            "intro": article["lead"],
            "time": time
        }
        article_url = article["link"]
        if utils.is_valid_url(article_url):
            yield Request(article_url,
                          self.parse_article,
                          meta=article_info,
                          errback=self.errback)

    # Navigate to next page
    if meta["page_idx"] < self.page_per_category_limit and len(data) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["c_query"],
                                                    meta["page_idx"])
        yield Request(next_page,
                      self.parse_category,
                      meta=meta,
                      errback=self.errback)
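# `self.transform_time_fmt` above is assumed to normalise a source-specific
# timestamp string into the spider's canonical format. A minimal sketch,
# assuming the canonical output format is "%Y-%m-%d %H:%M" (the real helper
# and its default format may differ):
from datetime import datetime


def transform_time_fmt(self, time_str, src_fmt, dst_fmt="%Y-%m-%d %H:%M"):
    """Re-render `time_str` from `src_fmt` into `dst_fmt`."""
    return datetime.strptime(time_str, src_fmt).strftime(dst_fmt)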