Code example #1
 def getContentAndSaveData(self, response, title, description):
     """Extract the article body text from *response*, strip newlines,
     and log it together with the title and the cleaned description."""
     selector = (
         "#wrapper > div.container.plr-0.content-container > div.item-wrapper"
         " > div > div > div:nth-child(1) > div > div.item-content ::text"
     )
     fragments = response.css(selector).getall()
     body = re.sub('\n+', '', ''.join(fragments))
     write_log(crawl_newspaper, self.category,
               title + '\n' + self.cleanText(description) + body)
Code example #2
 def getContentAndSaveData(self, response, category, title, description):
     """Extract the article body, strip the author byline, and log
     title + description + content via write_log.

     :param response: scrapy response for the article page
     :param category: category label used for the log entry
     :param title: article title
     :param description: article summary
     """
     article = response.css('section.sidebar_1').css(
         'article.content_detail.fck_detail.width_common.block_ads_connect')
     paragraphs = article.css('p')
     # Build the body with one join instead of repeated string
     # concatenation (linear instead of quadratic) — this resolves the
     # original "may improve by concat the list" TODO.
     lines = [
         ''.join(p.css(".Normal ::text").getall()).replace('\n', '')
         for p in paragraphs
     ]
     content = ''.join(line + '\n' for line in lines)
     # The author credit appears in a <strong> inside a paragraph;
     # remove it from the body text.
     author_name = ''.join(paragraphs.css('strong ::text').getall())
     content = content.replace(author_name, '')
     content = re.sub('\n+', '\n', content).strip()
     # write all data
     write_log(crawl_newspaper, category,
               title.rstrip() + '\n' + description.rstrip() + content)
Code example #3
    def parse(self, response):
        """Scrape the title and summary of every non-advertisement item
        in the vertical news list and log them under the page category.

        :param response: scrapy response for a category listing page
        """
        # The category slug is the URL path with site root, slashes and
        # the .htm suffix removed.
        category = str(response.url).replace('https://www.doisongphapluat.com/', '').replace('/', '').replace('.htm', '')
        list_news = response.css('ul.module-vertical-list')
        for item in list_news.css('li'):
            # Items labelled 'QC' are advertisements; skip them
            # (guard clause instead of an empty pass/else branch).
            if item.css('label._mB::text').get() == 'QC':
                continue

            # get title
            title = ''.join(item.css('h4.title ::text').getall())
            write_log(self.name, category, self.cleanText(title))

            # get summary
            summary = ''.join(item.css('p.desc ::text').getall())
            write_log(self.name, category, self.cleanText(summary))
Code example #4
    def getContentAndSaveData(self, response, category, title, description=''):
        """Extract the article body — excluding video captions, inline
        scripts and table text — and log title + description + content.

        :param response: scrapy response for the article page
        :param category: category label used for the log entry
        :param title: article title
        :param description: optional article summary
        """
        body = response.css("div.l-content")
        # Hoist the repeated article selector instead of re-running it
        # for every trash query.
        article = body.css("div#abody.cms-body.detail")
        raw_content = body.css("div#abody.cms-body.detail ::text").getall()
        # Non-article text fragments; a set makes the filter below O(1)
        # per fragment instead of scanning a list each time.
        trash = set(article.css(".video ::text").getall())
        trash.update(article.css("script ::text").getall())
        trash.update(article.css("table ::text").getall())
        clean_text = [text for text in raw_content if text not in trash]

        content = ''.join(clean_text).replace('\r', '')
        content = re.sub('\n+', '\n', content).strip()
        # NOTE(review): the original logged self.category, leaving the
        # *category* parameter unused; use the parameter, consistent
        # with the sibling crawlers that share this signature.
        write_log(crawl_newspaper, category,
                  title + '\n' + description + content)
Code example #5
 def saveData(self, response, title, description):
     """Pull the article body text out of *response* and log it together
     with the title and description."""
     css_path = ("div.left-sidebar.row > div.articleCon > div >"
                 " div.wrapper-main-content > article > div.article-content ::text")
     fragments = response.css(css_path).getall()
     body = ''.join(fragments).replace('\r', '')
     body = re.sub('\n+', '\n', body).strip()
     write_log(crawl_newspaper, self.category, title + '\n' + description + body)
Code example #6
 def getContentAndSaveData(self, response, title, description):
     """Collect the paragraph text of the article body and log it with
     the title and description."""
     paragraphs = response.css("div#article-body.cms-body").css(
         "p ::text").getall()
     write_log(crawl_newspaper, self.category,
               title + '\n' + description + ''.join(paragraphs))
Code example #7
 def getContentAndSaveData(self, response, title, description=''):
     """Extract paragraph text from #divNewsContent, normalise line
     endings, and log it with the title and optional description."""
     content = ''.join(response.css("#divNewsContent").css("p ::text").getall())
     # Dropped the original's redundant second ''.join(content): joining
     # a str over '' is a no-op that rebuilds it character by character.
     content = content.replace('\r', '')
     content = re.sub('\n+', '\n', content).strip()
     write_log(crawl_newspaper, self.category, title + '\n' + description + content)