def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    ``timestamp`` and ``date`` are always emitted as empty strings by this
    parser; ``subhead`` is '' when the teaser block is missing or empty.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    :raises AttributeError: If the page has no h1 text node (``.get()``
        returns None and ``.strip()`` is called on it)
    """
    author = self.parse_author(response)
    # Evaluate the teaser XPath once instead of twice; a missing or empty
    # result collapses to ''.
    subhead = response.xpath(
        '//div[@class="teaser_detail"]/text()').get() or ''
    item = NewsItem(
        title=response.xpath('//h1/text()').get().strip(),
        timestamp='',
        content_html=response.xpath('//div[@class="content_detail"]').get(),
        body=self.parse_body(response),
        link=response.url,
        subhead=subhead,
        pic=self.parse_pictures(response),
        date='',
        author=author,
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    """
    author = self.parse_author(response)
    time_format, short_date = self.parse_date(response)
    # Select the article container once; it feeds both content_html and body.
    content_html = response.xpath('//*[@class="Content"]').get()
    item = NewsItem(
        title=response.xpath(
            '//*[@id="ARTICLEVIEW"]//p[@class="SGTOTitle"]/text()').get(),
        timestamp=time_format,
        content_html=content_html,
        body=html2text.html2text(content_html),
        link=response.url,
        # default='' keeps an article without a summary from failing on
        # None.strip(); the subhead is then simply empty.
        subhead=response.xpath(
            '//div[@id="ARTICLEVIEW"]//*[@class="SGTOSummary"]/text()'
        ).get(default='').strip(),
        pic=self.parse_pictures(response),
        date=short_date,
        author=author,
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    The subhead is always '' for this parser.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    """
    byline = self.parse_author(response)
    timestamp, date = self.parse_date(response)
    headline = response.xpath('//h1/text()').get()
    # The same section is stored raw (content_html) and converted (body).
    section_html = response.xpath(
        '//section[@class="cb-entry-content clearfix"]').get()
    section_text = html2text.html2text(section_html)
    pictures = self.parse_pictures(response)
    yield NewsItem(
        title=headline,
        timestamp=timestamp,
        content_html=section_html,
        body=section_text,
        link=response.url,
        subhead='',
        pic=pictures,
        date=date,
        author=byline,
    )
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    The subhead is always '' for this parser.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    """
    byline = self.parse_author(response)
    timestamp, date = self.parse_date(response)
    headline = response.xpath('//h2[@class="post-title"]/text()').get()
    # The same wrapper div is stored raw (content_html) and converted (body).
    wrapper_html = response.xpath(
        '//div[@class="elementor-widget-wrap"]').get()
    wrapper_text = html2text.html2text(wrapper_html)
    pictures = self.parse_pictures(response)
    yield NewsItem(
        title=headline,
        timestamp=timestamp,
        content_html=wrapper_html,
        body=wrapper_text,
        link=response.url,
        subhead='',
        pic=pictures,
        date=date,
        author=byline,
    )
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    Title and subhead are the html2text conversions of the elements with
    ids ``pt1:pbl16`` and ``pt1:pbl18``. ``pic`` and ``author`` are always
    '' for this parser.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    """
    timestamp, date = self.parse_date(response)
    # First value of parse_content is stored as body, second as content_html.
    body, content_html = self.parse_content(response)
    title_html = response.xpath('//*[@id="pt1:pbl16"]').get()
    title = html2text.html2text(title_html)
    subhead_html = response.xpath('//*[@id="pt1:pbl18"]').get()
    subhead = html2text.html2text(subhead_html)
    yield NewsItem(
        title=title,
        timestamp=timestamp,
        content_html=content_html,
        body=body,
        link=response.url,
        subhead=subhead,
        pic='',
        date=date,
        author='',
    )
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    """
    author = self.parse_author(response)
    time_format, short_date = self.parse_date(response)
    # Select the description container once; it feeds content_html and body.
    content_html = response.xpath('//div[@class="description"]').get()
    item = NewsItem(
        title=response.xpath('//h1/text()').get(),
        timestamp=time_format,
        content_html=content_html,
        body=html2text.html2text(content_html),
        link=response.url,
        # default='' keeps an article without a short description from
        # failing on None.strip(); the subhead is then simply empty.
        subhead=response.xpath(
            '//div[@class="shortDesc"]/text()').get(default='').strip(),
        pic=self.parse_pictures(response),
        date=short_date,
        author=author,
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    The subhead is the html2text conversion of the ``brief-detail`` div, or
    '' when the page has no such div.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    """
    time_format, short_date = self.parse_date(response)
    # Select the content container once; it feeds content_html and body.
    content_html = response.xpath('//div[@class="content-detail"]').get()
    brief_html = response.xpath('//div[@class="brief-detail"]').get()
    # html2text cannot convert None, so a missing brief becomes ''.
    subhead = html2text.html2text(brief_html) if brief_html is not None else ''
    item = NewsItem(
        title=response.xpath('//h1[@itemprop="headline"]/text()').get(),
        timestamp=time_format,
        content_html=content_html,
        body=html2text.html2text(content_html),
        link=response.url,
        subhead=subhead,
        pic=self.parse_pictures(response),
        date=short_date,
        author=self.parse_author(response),
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    :raises AttributeError: If the page has no h1 text node (``.get()``
        returns None and ``.strip()`` is called on it)
    """
    author = self.parse_author(response)
    time_format, short_date = self.parse_date(response)
    item = NewsItem(
        title=response.xpath('//h1/text()').get().strip(),
        timestamp=time_format,
        content_html=response.xpath('//*[@id="cotent_detail"]').get(),
        body=self.parse_body(response),
        link=response.url,
        # default='' keeps an article without a sapo from failing on
        # None.strip(); the subhead is then simply empty.
        subhead=response.xpath(
            '//div[@class="sapo_detail fr"]/text()').get(default='').strip(),
        pic=self.parse_pictures(response),
        date=short_date,
        author=author,
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    :raises AttributeError: If the page has no h1 text node (``.get()``
        returns None and ``.strip()`` is called on it)
    """
    author = self.parse_author(response)
    time_format, short_date = self.parse_date(response)
    item = NewsItem(
        title=response.xpath('//h1/text()').get().strip(),
        timestamp=time_format,
        content_html=response.xpath(
            '//div[@class="article-content __MASTERCMS_CONTENT __MB_CONTENT_FOR_PRINTER"]'
        ).get(),
        body=self.parse_body(response),
        link=response.url,
        # default='' keeps an article without a sapo from failing on
        # None.strip(); the subhead is then simply empty.
        subhead=response.xpath(
            '//p[@class="detail-sapo"]/text()').get(default='').strip(),
        pic=self.parse_pictures(response),
        date=short_date,
        author=author,
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    Fields set on the item:
        'title': string or None,   # Text of the first h1
        'author',                  # Result of self.parse_author
        'subhead': '',             # Always empty for this parser
        'print': '',               # Page number; always empty here
        'date': string in '%Y-%m-%d' format,  # The publish date
        'body': string,            # Article body converted by html2text
        'pic_list': string in      # Picture links, or '' if there are none
            f"{link1}|{text2}&&..." format
        'original_link': string,   # The url of the article
        'source': ''               # Always empty for this parser

    The timestamp returned by parse_date is not stored on the item.

    :param response: The scrapy response
    :return: A generator yielding a single NewsItem
    """
    author = self.parse_author(response)
    # Only the short date is used; the first value of parse_date is dropped.
    _, short_date = self.parse_date(response)
    item = NewsItem(
        title=response.xpath('//h1/text()').get(),
        print='',
        body=html2text.html2text(
            response.xpath(
                '//div[@class="news_content entry-content"]').get()),
        original_link=response.url,
        subhead='',
        pic_list=self.parse_pictures(response),
        date=short_date,
        author=author,
        source='',
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    Nothing is yielded when the page has no (non-empty) main title.

    Fields set on the item:
        'title': string,           # The title of the article
        'author': '',              # Always empty for this parser
        'subhead': string,         # The subtitle, or '' if there is none
        'date': string in '%Y-%m-%d' format,  # The publish date
        'timestamp',               # First value returned by parse_date
        'content_html', 'body',    # The two values returned by parse_content
        'pic': string in           # Picture links, or '' if there are none
            f"{link1}|{text2}&&..." format
        'link': string             # The url of the article

    :param response: The scrapy response
    :return: A generator yielding at most one NewsItem
    """
    time_format, short_date = self.parse_date(response)
    content, html = self.parse_content(response)
    # Evaluate the title XPath once; it is both the guard and the value.
    title = response.xpath(
        '//h1[@class="post-title main-title"]/text()').get()
    if not title:
        return
    # NOTE(review): the value unpacked as `content` is stored in
    # `content_html` and `html` in `body`; the names suggest the opposite.
    # Mapping kept as-is -- confirm the return order of parse_content.
    yield NewsItem(
        title=title,
        timestamp=time_format,
        content_html=content,
        body=html,
        link=response.url,
        # default='' honours the documented "'' if there is no subtitle".
        subhead=response.xpath(
            '//h2[@class="post-sapo"]/strong/text()').get(default=''),
        pic=self.parse_pictures(response),
        date=short_date,
        author='',
    )
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    Fields set on the item:
        'title': string,           # Stripped text of the h1 with a title attribute
        'author': string,          # Text of the first cite, or '' if none
        'subhead': string,         # First h2 of the body via html2text, or ''
        'date': string in '%Y-%m-%d' format,  # The publish date
        'body': string,            # The body element converted by html2text
        'pic_list': string in      # Picture links, or '' if there are none
            f"{link1}|{text2}&&..." format
        'original_link': string    # The url of the article

    The timestamp returned by parse_date and the raw body HTML are not
    stored on the item.

    :param response: The scrapy response
    :return: A generator yielding a single NewsItem
    :raises AttributeError: If the title node is missing (``.get()``
        returns None and ``.strip()`` is called on it)
    """
    # Only the short date is used; the first value of parse_date is dropped.
    _, short_date = self.parse_date(response)
    body_html = response.xpath('//*[@id="noidung"]').get()
    lead_html = response.xpath('//*[@id="noidung"]/h2').get()
    # html2text cannot convert None, so a body without an h2 gives ''.
    subhead = html2text.html2text(lead_html) if lead_html is not None else ''
    item = NewsItem(
        title=response.xpath('//h1[@title]/text()').get().strip(),
        body=html2text.html2text(body_html),
        original_link=response.url,
        subhead=subhead,
        pic_list=self.parse_pictures(response),
        date=short_date,
        # default='' honours the documented "'' if none" for the author.
        author=response.xpath('//cite/text()').get(default=''),
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    Fields set on the item:
        'title': string or None,   # Text of the first h1
        'author',                  # Result of self.parse_author
        'subhead': string,         # The subtitle, or '' if there is none
        'date': string in '%Y-%m-%d' format,  # The publish date
        'timestamp',               # First value returned by parse_date
        'content_html': string,    # Raw HTML of the content container
        'body': string,            # The same container converted by html2text
        'pic': string in           # Picture links, or '' if there are none
            f"{link1}|{text2}&&..." format
        'link': string             # The url of the article

    :param response: The scrapy response
    :return: A generator yielding a single NewsItem
    """
    author = self.parse_author(response)
    time_format, short_date = self.parse_date(response)
    # Select the content container once; it feeds content_html and body.
    content_html = response.xpath('//div[@class="col-md-12"]').get()
    item = NewsItem(
        title=response.xpath('//h1/text()').get(),
        timestamp=time_format,
        content_html=content_html,
        body=html2text.html2text(content_html),
        link=response.url,
        # default='' honours the documented "'' if there is no subtitle".
        subhead=response.xpath(
            '//div[@class="des f-roboto-b t-16-mb dt-des"]/text()'
        ).get(default=''),
        pic=self.parse_pictures(response),
        date=short_date,
        author=author,
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem from an article page and yield it.

    Fields set on the item:
        'title': string or None,   # Text of the headline element
        'author',                  # Result of self.parse_author
        'subhead': string,         # The sapo div via html2text, or '' if none
        'date',                    # self.parse_timestamp(response, 'date')
        'timestamp',               # self.parse_timestamp(response, 'iso'),
                                   # documented as an ISO 8601 string
        'content_html': string,    # Raw HTML of the element with id "abody"
        'body': string,            # The same element converted by html2text
        'pic',                     # Result of self.parse_pic
        'link': string             # The url of the article

    :param response: The scrapy response
    :return: A generator yielding a single NewsItem
    """
    content_html = response.xpath('//*[@id="abody"]').get()
    summary_html = response.xpath('//div[@class="sapo"]').get()
    # html2text cannot convert None, so a page without a sapo div gives ''.
    subhead = html2text(summary_html) if summary_html is not None else ''
    item = NewsItem(
        title=response.xpath(
            '//*[@class="details__headline"]/text()').get(),
        timestamp=self.parse_timestamp(response, 'iso'),
        content_html=content_html,
        body=html2text(content_html),
        link=response.url,
        subhead=subhead,
        pic=self.parse_pic(response),
        date=self.parse_timestamp(response, 'date'),
        author=self.parse_author(response),
    )
    yield item
def parse_post(self, response):
    """
    Build a NewsItem (including tags and category) from an article page
    and yield it.

    The author is the second text node found under the ``author`` div, or
    '' when that div has fewer than two text nodes.

    :param response: The scrapy response of the article page
    :return: A generator yielding a single NewsItem
    """
    tags = response.xpath(
        '//div[@class="tags-container"]/ul/li/a/@title').extract()
    cate = response.xpath(
        '//meta[@property="article:section"]//@content').get()
    # Extract the author text nodes once and check the length before
    # indexing: indexing first raised IndexError on pages with fewer than
    # two nodes, so the intended '' fallback could never apply.
    author_texts = response.xpath('//div[@class="author"]').css(
        '::text').extract()
    author = (author_texts[1] if len(author_texts) > 1 else '') or ''
    # Select the content container once; it feeds content_html and body.
    content_html = response.xpath('//div[@class="main-content-body"]').get()
    item = NewsItem(
        title=response.xpath('//h1[@class="article-title"]/text()').get(),
        timestamp=self.parse_timestamp(response),
        content_html=content_html,
        body=html2text.html2text(content_html),
        tags=tags,
        category=cate,
        link=response.url,
        subhead=response.xpath('//h2[@class="sapo"]/text()').get(),
        pic=self.parse_pictures(response),
        date=self.parse_date(response),
        author=author,
    )
    yield item