def parse_detail(self, response):
    """Build the article item for a single detail page.

    Title, summary, body, category and publish date are read from
    ``response`` with CSS selectors. The body paragraphs are passed through
    ``bodyCleaner`` (defined elsewhere) and then stripped of a few
    boilerplate phrases.

    Args:
        response: Page response exposing ``.css()`` and ``.url``
            (presumably a Scrapy ``Response`` -- confirm against the caller).

    Returns:
        dict: Keys ``source``, ``url``, ``title``, ``sapo``, ``body``,
        ``cates``, ``tags`` and ``publish``. ``publish`` is a naive
        ``datetime.datetime``, or ``''`` when the page has no date text.

    Raises:
        ValueError: If date text is present but matches neither of the two
            supported formats.
    """
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    def parse_publish(raw):
        # ``.get()`` yields None when the node is missing; strptime(None)
        # would raise TypeError, which the ValueError fallback never catches.
        text = (raw or '').strip()
        if not text:
            return ''
        try:
            return datetime.datetime.strptime(text, '%d/%m/%Y %H:%M')
        except ValueError:
            # Second layout seen on the site: time first, then the date.
            return datetime.datetime.strptime(text, '%H:%M, %d/%m/%Y')

    # Phrases removed from the cleaned body, in this order: the video-player
    # fallback notice and two Vietnamese link labels.
    boilerplate = (
        'To view this video please enable JavaScript, and consider upgrading to a web browser that\n supports HTML5 video',
        'Xem thêm video:',
        'Xem chi tiết tại đây',
    )

    # Assumes bodyCleaner returns a str (it is used with str.replace below).
    body = bodyCleaner(response.css('.bodyArt p').getall())
    for phrase in boilerplate:
        body = body.replace(phrase, '')

    date = parse_publish(response.css('.dateArt::text').get())

    return {
        # Third "/"-separated piece of the URL, i.e. the host part.
        'source': response.url.split("/")[2],
        'url': response.url,
        'title': extract_with_css('.postTit::text'),
        'sapo': extract_with_css('.descArt::text'),
        'body': body,
        'cates': [extract_with_css('.cate_breadcrumb > .listCate a.current::text')],
        'tags': [],
        'publish': date,
    }
def parse_detail(self, response):
    """Assemble the article item for one detail page.

    The main content is passed through ``bodyCleaner`` (defined elsewhere);
    the publish timestamp is pulled out of the ``.edn_metaDetails1`` text
    with a ``dd/mm/yyyy | HH:MM`` pattern.

    Args:
        response: Page response exposing ``.css()`` and ``.url``.

    Returns:
        dict: Keys ``source``, ``url``, ``title``, ``sapo``, ``body``,
        ``cates``, ``tags`` and ``publish``. ``publish`` is a naive
        ``datetime.datetime``, or ``''`` when no timestamp was matched.
    """
    def first_text(selector):
        # First match for the selector, whitespace-trimmed; '' when absent.
        found = response.css(selector)
        return found.get(default='').strip()

    cleaned_body = bodyCleaner(response.css('.main_content').getall())

    stamps = response.css('.edn_metaDetails1::text').re(
        r'([0-9]{,2}\/[0-9]{,2}\/[0-9]{4} \| [0-9]{,2}:[0-9]{,2})')
    published = (
        datetime.datetime.strptime(stamps[0], '%d/%m/%Y | %H:%M')
        if stamps
        else ''
    )

    item = {
        # Third "/"-separated piece of the URL, i.e. the host part.
        'source': response.url.split("/")[2],
        'url': response.url,
        'title': first_text('.article.details>h1::text'),
        'sapo': first_text('.Detail_Summary p::text'),
        'body': cleaned_body,
        'cates': response.css(
            '.eds_breadCrumbs span[itemprop="itemListElement"] a>span::text'
        ).getall(),
        'tags': [],
        'publish': published,
    }
    return item
def parse_detail(self, response):
    """Build the article item for a single detail page.

    Title, summary, body, category and publish date are read from
    ``response`` with CSS selectors; the body paragraphs are passed through
    ``bodyCleaner`` (defined elsewhere).

    Args:
        response: Page response exposing ``.css()`` and ``.url``
            (presumably a Scrapy ``Response`` -- confirm against the caller).

    Returns:
        dict: Keys ``source``, ``url``, ``title``, ``sapo``, ``body``,
        ``cates``, ``tags`` and ``publish``. ``publish`` is a naive
        ``datetime.datetime``, or ``''`` when the page has no date text.

    Raises:
        ValueError: If date text is present but matches neither of the two
            supported formats.
    """
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    def parse_publish(raw):
        # ``.get()`` yields None when the node is missing; strptime(None)
        # would raise TypeError, which the ValueError fallback never catches.
        text = (raw or '').strip()
        if not text:
            return ''
        try:
            return datetime.datetime.strptime(text, '%d/%m/%Y %H:%M')
        except ValueError:
            # Second layout seen on the site: time first, then the date.
            return datetime.datetime.strptime(text, '%H:%M, %d/%m/%Y')

    body = bodyCleaner(response.css('.bodyArt p').getall())
    date = parse_publish(response.css('.dateArt::text').get())

    return {
        # Third "/"-separated piece of the URL, i.e. the host part.
        'source': response.url.split("/")[2],
        'url': response.url,
        'title': extract_with_css('.postTit::text'),
        'sapo': extract_with_css('.descArt::text'),
        'body': body,
        'cates': [extract_with_css('.cate_breadcrumb > .listCate a.current::text')],
        'tags': [],
        'publish': date,
    }
def parse_detail(self, response):
    """Build the article item for a single detail page.

    The summary and publish timestamp come from ``<meta>`` tags; the title,
    body and tags come from the page markup. The body is passed through
    ``bodyCleaner`` (defined elsewhere).

    Args:
        response: Page response exposing ``.css()`` and ``.url``
            (presumably a Scrapy ``Response`` -- confirm against the caller).

    Returns:
        dict: Keys ``source``, ``url``, ``title``, ``sapo``, ``body``,
        ``cates``, ``tags`` and ``publish``. ``sapo`` is ``''`` when the
        ``og:description`` meta tag is absent; ``publish`` is a
        timezone-aware ``datetime.datetime``, or ``''`` when the ``pubdate``
        meta tag is absent or empty.

    Raises:
        ValueError: If the ``pubdate`` value is present but is not in
            ``%Y-%m-%dT%H:%M:%S%z`` form.
    """
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    def meta_content(selector):
        # Read the attribute value itself rather than regex-matching the
        # serialized tag: that approach depended on ``content`` being the
        # last attribute, broke on multi-line values, and raised IndexError
        # from the unguarded ``[0]`` when the tag was missing.
        return response.css(selector + '::attr(content)').get(default='')

    body = bodyCleaner(response.css('.post-content').getall())

    raw_publish = meta_content('meta[name="pubdate"]').strip()
    if raw_publish:
        publish = datetime.datetime.strptime(raw_publish, "%Y-%m-%dT%H:%M:%S%z")
    else:
        publish = ''

    return {
        # Third "/"-separated piece of the URL, i.e. the host part.
        'source': response.url.split("/")[2],
        'url': response.url,
        'title': extract_with_css('.title-detail::text'),
        'sapo': meta_content('meta[property="og:description"]'),
        'body': body,
        'cates': [],
        'tags': response.css('.tags a::text').getall(),
        'publish': publish,
    }
def parse_detail(self, response):
    """Build the article item for a single detail page.

    The summary and tags come from the ``description`` and ``keywords``
    ``<meta>`` tags; the title, body, breadcrumb categories and publish time
    come from the page markup. The body is passed through ``bodyCleaner``
    (defined elsewhere).

    Args:
        response: Page response exposing ``.css()`` and ``.url``
            (presumably a Scrapy ``Response`` -- confirm against the caller).

    Returns:
        dict: Keys ``source``, ``url``, ``title``, ``sapo``, ``body``,
        ``cates``, ``tags`` and ``publish``. ``tags`` is always a list
        (empty when there are no keywords); ``publish`` is a naive
        ``datetime.datetime``, or ``''`` when no timestamp was matched.
    """
    def extract_with_css(query):
        return response.css(query).get(default='').strip()

    def meta_content(name):
        # Read the attribute value itself rather than regex-matching the
        # serialized tag, which depended on attribute order and broke on
        # multi-line values.
        selector = 'meta[name="%s"]::attr(content)' % name
        return response.css(selector).get(default='')

    sapo = meta_content('description')

    # Always a list (previously '' when the keywords tag was missing), with
    # empty fragments from stray commas dropped.
    keywords = (part.strip() for part in meta_content('keywords').split(','))
    tags = [keyword for keyword in keywords if keyword]

    body = bodyCleaner(response.css('#abody.cms-body.detail').getall())

    metaDate = response.css('.details__meta .meta time::text').re(
        r'([0-9]{,2}:[0-9]{,2} - [0-9]{,2}\/[0-9]{,2}\/[0-9]{4})')
    if metaDate:
        date = datetime.datetime.strptime(metaDate[0], '%H:%M - %d/%m/%Y')
    else:
        date = ''

    return {
        # Third "/"-separated piece of the URL, i.e. the host part.
        'source': response.url.split("/")[2],
        'url': response.url,
        'title': extract_with_css('.details__headline::text'),
        'sapo': sapo,
        'body': body,
        'cates': response.css('.breadcrumbs span a span::text').getall(),
        'tags': tags,
        'publish': date,
    }