def parse_article(self, response): article_div = response.css("#main-article-body article.main-article") url = response.url lang = self.lang title = article_div.css("#headline ::text").extract_first() category = response.meta["category"] intro = article_div.css(".cms-desc ::text").extract_first() content = ' '.join(article_div.css("#article-body ::text").extract()) time = article_div.css( "#article-meta .byline-dateline time::text").extract_first() # Transform time to uniform format if time is not None: time = self.transform_time_fmt(time, src_fmt="%d/%m/%Y %H:%M") self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format( self.name, self.article_scraped_count)) yield Article(url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time)
def parse_article(self, response):
    url = response.url
    lang = self.lang
    category = response.meta["category"]
    title = response.css(".article-title ::text").extract_first()
    intro = ' '.join(response.css(".article-detail-hd>p ::text").extract())
    content = ' '.join(
        response.xpath(
            "//div[@class='article-detail']//"
            "p[not(ancestor::div[@class='article-detail-hd'])]//text()")
        .extract()[:-2])  # drop the last two trailing text nodes
    time = response.css(".datetime span::text").extract()
    # Transform time to uniform format
    if time:  # extract() returns a list, never None, so test for emptiness
        time = "".join(time)
        time = self.transform_time_fmt(time, src_fmt="%H:%M %d/%m/%Y")
    else:
        time = None
    self.article_scraped_count += 1
    self.print_num_scraped_items(every=20)
    yield Article(url=url,
                  lang=lang,
                  title=title,
                  category=category,
                  intro=intro,
                  content=content,
                  time=time)

def parse_article(self, response): meta = response.meta url = response.url lang = self.lang title = response.css( ".postDetail>.detail_product .product_name::text").extract_first() category = meta["category"] intro = "" content = ' '.join( response.css( ".postDetail .full_description_inside ::text").extract()) time = "" self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format( self.name, self.article_scraped_count)) yield Article(url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time)
def parse_article(self, response): content_div = response.xpath("//div[@id='ctl00_IDContent_ctl00_divContent']") url = response.url lang = self.lang title = content_div.css("h1.fon31.mgb15::text").extract_first() category = response.meta["category"] intro = ' '.join(content_div.css("h2.fon33::text").extract()) content = ' '.join(content_div.css("#divNewsContent ::text").extract()) time = content_div.css("div.box26>span::text").extract_first() # Transform time to uniform format if time is not None: time = time[time.find(", ") + 2:] time = '_'.join(time.split(" - ")) time = self.transform_time_fmt(time, src_fmt="%d/%m/%Y_%H:%M") self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format(self.name, self.article_scraped_count)) yield Article( url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time )
def parse_article(self, response): section = response.css("section.sidebar_1") url = response.url lang = self.lang title = section.css(".title_news_detail::text").extract_first() category = response.meta["category"] intro = section.css(".description::text").extract_first() content = section.css("article.content_detail ::text").extract() content = ' '.join(content) time = section.css("span.time::text").extract() # Transform time to uniform format if time is not None: time = ", ".join(time) time = time.split(", ") time[-1] = time[-1][:5] time = '_'.join(time[1:]) time = self.transform_time_fmt(time, src_fmt="%d/%m/%Y_%H:%M") self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format(self.name, self.article_scraped_count)) yield Article( url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time )
def parse_article_type1(self, response):
    # Example: game.thanhnien.vn
    url = response.url
    lang = self.lang
    title = response.css(".main-title::text").extract_first()
    category = response.meta["category"]
    intro = ' '.join(
        response.css(".details-content .sapo ::text").extract())
    content = ' '.join(
        response.xpath(
            "//div[@id='abody']//text()[not(ancestor::script)]").extract())
    time = response.css(".details-heading time ::text").extract_first()
    # Transform time to uniform format
    if time is not None:
        time = time.split()
        # Keep the first token (HH:MM) and the last (date), drop the rest
        time = ' '.join([time[0], time[-1]])
        time = self.transform_time_fmt(time, src_fmt="%H:%M %d/%m/%Y")
    self.article_scraped_count += 1
    if self.article_scraped_count % 100 == 0:
        self.logger.info("Spider {}: Crawled {} items".format(
            self.name, self.article_scraped_count))
    yield Article(url=url,
                  lang=lang,
                  title=title,
                  category=category,
                  intro=intro,
                  content=content,
                  time=time)

def parse_article(self, response): article_div = response.css("#main-content article") url = response.url lang = self.lang title = article_div.css(".post-title ::text").extract_first() category = response.meta["category"] intro = '' content = ' '.join( article_div.xpath( ".//div[@class='entry']//text()[not(ancestor::script)]"). extract()) time = article_div.css(".updated ::text").extract_first() # Transform time to uniform format if time is not None: time = self.transform_time_fmt(time, src_fmt="%Y-%m-%d") self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format( self.name, self.article_scraped_count)) yield Article(url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time)
def parse_article(self, response): url = response.url lang = self.lang title = response.css(".header .title::text").extract_first() category = response.meta["category"] intro = ' ' content = ' '.join(response.xpath("//div[@id='NewsContent']//text()[not(ancestor::script)]").extract()) time = response.css( ".header .info_item_popup .note_gera:first-child span::text").extract_first() # Transform time to uniform format if time is not None: time = self.transform_time_fmt(time, src_fmt="%d/%m/%Y") self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format(self.name, self.article_scraped_count)) yield Article( url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time )
def parse_article(self, response): content_div = response.css(".contentleft") url = response.url lang = self.lang title = content_div.css(".titledetail h1::text").extract_first() category = response.meta["category"] intro = content_div.css( "#ContentRightHeight .sapo::text").extract_first() content = ' '.join( content_div.css( "#ContentRightHeight #divNewsContent ::text").extract()) time = content_div.css( "#ContentRightHeight .ngayxuatban::text").extract_first() # Transform time to uniform format if time is not None: time = time.strip() time = self.transform_time_fmt(time, src_fmt="%d/%m/%Y %H:%M") self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format( self.name, self.article_scraped_count)) yield Article(url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time)
def parse_article(self, response): table = response.css("div.media table") url = response.url lang = self.lang title = table.css("div.ndtitle ::text").extract_first() category = response.meta["category"] intro = table.css("div.ndcontent.ndb p ::text").extract_first() content = table.css("div[class=ndcontent] ::text").extract() content = ' '.join(content) time = table.css("div.icon_date_top>div.pull-left::text").extract_first() # Transform time to uniform format if time is not None: time = '_'.join(time.split(", ")[1:]) time = self.transform_time_fmt(time, src_fmt="%d/%m/%Y_%H:%M:%S") self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format(self.name, self.article_scraped_count)) yield Article( url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time )
def parse_article(self, response): meta = response.meta section = response.css("section.sidebar_1") url = response.url lang = self.lang title = meta["title"] category = meta["category"] intro = meta["intro"] content = ' '.join(response.xpath("//div[@id='ArticleContent']//text()").extract()) time = meta["time"] self.article_scraped_count += 1 if self.article_scraped_count % 100 == 0: self.logger.info("Spider {}: Crawl {} items".format(self.name, self.article_scraped_count)) yield Article( url=url, lang=lang, title=title, category=category, intro=intro, content=content, time=time )