Example #1
    def parse_news(self, response):
        log.msg("Start to parse news " + response.url, level=log.INFO)
        item = SpiderNewsAllItem()
        day = title = _type = keywords = url = article = ''
        url = response.url
        day = response.meta['day']
        title = response.meta['title']
        _type = response.meta['_type']
        soup = BeautifulSoup(response.body, 'html.parser')
        try:
            # Join every keyword tag into one space-separated string.
            for tag in soup.find_all(class_='hotword'):
                keywords += tag.text.strip() + ' '
        except AttributeError:
            log.msg("News " + title + " has no keywords!", level=log.INFO)
        try:
            article = soup.find(id='newscontent').text.strip()
        except AttributeError:
            log.msg("News " + title + " has no article!", level=log.INFO)
        item['title'] = title
        item['day'] = day
        item['_type'] = _type
        item['url'] = url
        item['keywords'] = keywords
        item['article'] = article
        item['site'] = u'证券日报网'
        return item
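
All five examples assume the same surrounding Scrapy project. Below is a minimal sketch of the module-level setup they rely on; the import paths and the exact Field list are assumptions inferred from the snippets, not taken from the original source.

# Hypothetical scaffold inferred from the snippets; adjust paths to your project.
import re

from bs4 import BeautifulSoup
from scrapy import log              # legacy Scrapy logging API (deprecated since Scrapy 1.0)
from scrapy.item import Item, Field
from tomd import Tomd               # HTML-to-Markdown converter used in Examples #3 and #5


class SpiderNewsAllItem(Item):
    # One Field per key the parse_news() callbacks assign.
    title = Field()
    day = Field()
    _type = Field()
    url = Field()
    keywords = Field()
    article = Field()
    site = Field()
    markdown = Field()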
Example #2
    def parse_news(self, response):
        log.msg("Start to parse news " + response.url, level=log.INFO)
        item = SpiderNewsAllItem()
        day = title = _type = keywords = url = article = ''
        # Read the fields passed along in the request meta.
        url = response.url
        day = response.meta['day']
        title = response.meta['title']
        _type = response.meta['_type']
        soup = BeautifulSoup(response.body, 'html.parser')

        # Locate the article body.
        try:
            # article = soup.find(class_='postTitle').text.strip()
            article = soup.find(id='news_body').text.strip()
        except AttributeError:
            log.msg("News " + title + " has no article!", level=log.INFO)
        item['title'] = title
        item['day'] = day
        item['_type'] = _type
        item['url'] = url
        item['keywords'] = keywords
        item['article'] = article
        item['site'] = u'博客园'
        return item
Example #3
    def parse_news(self, response):
        log.msg("Start to parse news " + response.url, level=log.INFO)
        item = SpiderNewsAllItem()
        day = title = _type = keywords = url = article = markdown = ''
        url = response.url
        day = response.meta['day']
        title = response.meta['title']
        _type = response.meta['_type']
        soup = BeautifulSoup(response.body, 'html.parser')
        #        try:
        #            items_keywords = soup.find(class_='ar_keywords').find_all('a')
        #            for i in range(0, len(items_keywords)):
        #                keywords += items_keywords[i].text.strip() + ' '
        #        except:
        #            log.msg("News " + title + " has no keywords!", level=log.INFO)

        try:
            # Pick the tag that holds the news body, depending on the page type.
            if re.search("translate", url):
                # Translated articles: the body is split across several divs.
                tags = soup.find_all("div", class_="translate-content")
                markdown = "".join(str(tag) for tag in tags)
                markdown = Tomd(markdown).markdown
                article = ''.join(tag.text.strip() for tag in tags)
            else:
                if re.match("https://gitee.com", url):
                    # "Gitee Recommended": project summary, usually the README.md content.
                    # CSS selector: #git-readme > div > div.file_content.markdown-body
                    article = soup.find("div", class_="file_content markdown-body")
                elif re.match("https://blog.gitee.com", url):
                    # "Gitee Weekly"
                    article = soup.find("div", class_="entry-content")
                elif soup.find("div", class_=["content", "box-aw main"]):
                    # Other common page layouts.
                    article = soup.find("div", class_=["content", "box-aw main"])
                else:
                    article = soup.find("section", class_=["wrap cke_editable cke_editable_themed cke_contents_ltr cke_show_borders clearfix"])

                # Drop the inline ad block before converting.
                if article and article.find("div", class_="ad-wrap") is not None:
                    article.find("div", class_="ad-wrap").extract()

                markdown = Tomd(str(article)).markdown
                article = article.text.strip()  # extract the tag's plain text
        except AttributeError:
            log.msg("News " + title + " has no article!", level=log.INFO)
        item['title'] = title
        item['day'] = day
        item['_type'] = _type
        item['url'] = url
        item['keywords'] = keywords
        item['article'] = article
        item['site'] = '开源中国'
        item['markdown'] = markdown
        return item
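
Examples #3 and #5 convert the extracted HTML fragment to Markdown with Tomd, whose .markdown property returns the converted string. A minimal, self-contained illustration with a made-up fragment:

# Quick illustration of the Tomd call used above; the HTML input is made up.
from tomd import Tomd

fragment = '<h2>Release notes</h2><p>Bug fixes and <strong>performance</strong> work.</p>'
print(Tomd(fragment).markdown)  # prints a Markdown rendering of the fragment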
Example #4
    def parse_news(self, response):
        log.msg("Start to parse news " + response.url, level=log.INFO)
        item = SpiderNewsAllItem()
        day = title = _type = keywords = url = article = ''
        url = response.url
        day = response.meta['day']
        title = response.meta['title']
        soup = BeautifulSoup(response.body, 'html.parser')
        try:
            article = soup.find(id='ozoom').text.strip()
        except AttributeError:
            log.msg("News " + title + " has no article!", level=log.INFO)
        item['title'] = title
        item['day'] = day
        item['_type'] = _type
        item['url'] = url
        item['keywords'] = keywords
        item['article'] = article
        item['site'] = u'证券日报'
        return item
Example #5
    def parse_news(self, response):
        log.msg("Start to parse news " + response.url, level=log.INFO)
        item = SpiderNewsAllItem()
        day = title = _type = keywords = url = article = markdown = ''
        url = response.url
        day = response.meta['day']
        title = response.meta['title']
        _type = response.meta['_type']
        soup = BeautifulSoup(response.body, 'html.parser')
        #        try:
        #            items_keywords = soup.find(class_='ar_keywords').find_all('a')
        #            for i in range(0, len(items_keywords)):
        #                keywords += items_keywords[i].text.strip() + ' '
        #        except:
        #            log.msg("News " + title + " has no keywords!", level=log.INFO)

        try:
            content_paragraph = soup.find("div", class_="text_info")
            # The body is everything before the "clear" divider; previous_siblings
            # runs backwards, so re-insert at index 0 to restore document order.
            article = []
            for tag in content_paragraph.find("div", class_="clear").previous_siblings:
                article.insert(0, tag)

            html = ''.join(str(tag) for tag in article)
            markdown = Tomd(html).markdown.decode("unicode-escape")
            article = BeautifulSoup(html, 'html.parser').get_text().strip()
        except AttributeError:
            log.msg("News " + title + " has no article!", level=log.INFO)
        item['title'] = title
        item['day'] = day
        item['_type'] = _type
        item['url'] = url
        item['keywords'] = keywords
        item['article'] = article
        item['site'] = 'InfoQ'
        item['markdown'] = markdown
        return item
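
A note on Example #5: BeautifulSoup's previous_siblings yields siblings nearest-first, i.e. in reverse document order, which is why each tag is re-inserted at index 0. A self-contained sketch of that pattern with made-up markup:

# previous_siblings walks backwards from the anchor tag; insert(0, ...) re-orders.
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<div class="text_info"><p>one</p><p>two</p><div class="clear"></div></div>',
    'html.parser')
parts = []
for tag in soup.find('div', class_='clear').previous_siblings:
    parts.insert(0, tag)                     # restore document order
print(''.join(str(t) for t in parts))        # -> <p>one</p><p>two</p>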