def getContentAndSaveData(self, response, title, description):
    """Extract the article body text from *response* and append it,
    together with the title and the cleaned description, to the crawl log.
    """
    # Every text node inside the article's main content container.
    selector = ("#wrapper > div.container.plr-0.content-container > "
                "div.item-wrapper > div > div > div:nth-child(1) > "
                "div > div.item-content ::text")
    fragments = response.css(selector).getall()
    # Merge fragments into one string and strip every newline.
    body = re.sub('\n+', '', ''.join(fragments))
    cleaned_description = self.cleanText(description)
    write_log(crawl_newspaper, self.category,
              title + '\n' + cleaned_description + body)
def getContentAndSaveData(self, response, category, title, description):
    """Extract the article body from a detail page, remove the author
    signature, and append title + description + body to the crawl log.
    """
    article = response.css('section.sidebar_1').css(
        'article.content_detail.fck_detail.width_common.block_ads_connect')
    # One line per <p class="Normal"> paragraph, inner newlines removed.
    paragraphs = [
        ''.join(p.css(".Normal ::text").getall()).replace('\n', '')
        for p in article.css('p')
    ]
    content = ''.join(par + '\n' for par in paragraphs)
    # The author name appears inside <p><strong>; drop it from the body.
    author_name = ''.join(article.css('p').css('strong ::text').getall())
    content = content.replace(author_name, '')
    # Collapse consecutive blank lines and trim the result.
    content = re.sub('\n+', '\n', content).strip()
    # write_log(self.name, category, content)
    # write all data
    write_log(crawl_newspaper, category,
              title.rstrip() + '\n' + description.rstrip() + content)
def parse(self, response):
    """Parse a category listing page: log the title and the summary of
    every non-advertisement item in the vertical news list.
    """
    # Derive the category slug from the page URL.
    category = str(response.url).replace(
        'https://www.doisongphapluat.com/', '').replace('/', '').replace('.htm', '')
    list_news = response.css('ul.module-vertical-list')
    for new in list_news.css('li'):
        # Skip advertisement entries (labelled "QC").
        # Guard clause replaces the original `if ...: pass else:` pattern.
        if new.css('label._mB::text').get() == 'QC':
            continue
        # get titles
        title = ''.join(new.css('h4.title ::text').getall())
        write_log(self.name, category, self.cleanText(title))
        # get summary
        des = ''.join(new.css('p.desc ::text').getall())
        write_log(self.name, category, self.cleanText(des))
def getContentAndSaveData(self, response, category, title, description=''):
    """Extract the article body, dropping video captions, script text and
    table text, then append title + description + body to the crawl log.
    """
    body = response.css("div.l-content")
    raw_content = body.css("div#abody.cms-body.detail ::text").getall()
    # Select the article node once instead of re-querying it three times.
    abody = body.css("div#abody.cms-body.detail")
    # Text fragments that are not part of the article proper ("no information").
    trash = abody.css(".video ::text").getall()
    trash += abody.css("script ::text").getall()
    trash += abody.css("table ::text").getall()
    # Only membership is tested, so a set turns the O(n*m) filter into O(n).
    trash_set = set(trash)
    clean_text = [text for text in raw_content if text not in trash_set]
    content = ''.join(clean_text).replace('\r', '')
    content = re.sub('\n+', '\n', content).strip()
    # NOTE(review): logs under self.category although a `category` parameter
    # is accepted (sibling spiders log the parameter) — kept as-is to
    # preserve behavior; confirm which is intended.
    write_log(crawl_newspaper, self.category, title + '\n' + description + content)
def saveData(self, response, title, description):
    """Append the article body extracted from *response*, prefixed with
    the title and description, to the crawl log.
    """
    selector = ("div.left-sidebar.row > div.articleCon > div > "
                "div.wrapper-main-content > article > "
                "div.article-content ::text")
    pieces = response.css(selector).getall()
    # Join fragments, drop carriage returns, collapse blank lines.
    body = ''.join(pieces).replace('\r', '')
    body = re.sub('\n+', '\n', body).strip()
    write_log(crawl_newspaper, self.category, title + '\n' + description + body)
def getContentAndSaveData(self, response, title, description):
    """Collect all paragraph text from the CMS article body and append
    title + description + body to the crawl log.
    """
    paragraph_text = response.css("div#article-body.cms-body").css("p ::text")
    body = ''.join(paragraph_text.getall())
    write_log(crawl_newspaper, self.category, title + '\n' + description + body)
def getContentAndSaveData(self, response, title, description=''):
    """Extract paragraph text from #divNewsContent and append
    title + description + body to the crawl log.
    """
    content = ''.join(response.css("#divNewsContent").css("p ::text").getall())
    # Removed the original's second ''.join(content): joining an
    # already-joined string is a no-op.
    content = content.replace('\r', '')
    content = re.sub('\n+', '\n', content).strip()
    write_log(crawl_newspaper, self.category, title + '\n' + description + content)