Example #1
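# A hedged sketch of the imports these crawler examples rely on; the standard-library
# and BeautifulSoup imports are evident from the code, while the project-internal
# module paths (Logging, BaiduCrawler, Article, Comment) are not shown in the original
# and are left commented out as placeholders.
import datetime
import json
import re
import time

from bs4 import BeautifulSoup

# from logging_util import Logging            # project logger wrapper (assumed path)
# from baidu_crawler import BaiduCrawler      # Baidu site-search helper (assumed path)
# from entity import Article, Comment         # data model classes (assumed path)
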
class PeopleNewsCrawler(object):
    '''
    Crawler for news articles on people.com.cn (People's Daily Online)
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        self.site = 'people.com.cn'  # search site

        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = self.baiduCrawler.session
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        Search for articles matching the keyword list within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        count = 0
        for url in urls:
            article = self.crawlArticle(url)
            #self.logger.debug(article)

            # The same article may appear several times in the search results: Baidu's result URLs are unique, but several of them can point to the same article, so deduplicate here
            if article is not None and article not in articleList:
                #count = count +1
                #self.logger.debug(u'文章数量%d',count)
                articleList.append(article)
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing is returned; the statistics are written into the article instance
        '''
        pass

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420'
        html = self.session.download(url,
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)

        if not html:
            return None
        soup = BeautifulSoup(html['html'], 'html.parser')
        try:
            meta = soup.find('meta').attrs['content']
        except:
            self.logger.warn(u'找不到meta里的content')
            return
        # self.logger.error('%s',meta)

        if "GB2312" in meta:
            encoding1 = 'GB2312'
        elif "UTF-8" in meta:
            encoding1 = 'UTF-8'
        elif "utf-8" in meta:
            encoding1 = 'utf-8'
        else:
            encoding1 = 'gbk'

        html = self.session.download(url,
                                     encoding=encoding1,
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        # The page was downloaded again above with the detected encoding; parse that version
        if html:
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('body')
            article_url = html['url']
            # self.logger.debug(article_url)
            if article_url.find(self.channel.url) < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None

            # if '.html' not in article_url:
            #     self.logger.error(u'非文章类型网址:%s ',article_url)
            #     return

            try:
                article_url = re.findall(
                    r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            except:
                self.logger.error(u'网址后缀不符合:%s ', article_url)
                return

            self.logger.debug('[peoplenews]' + article_url)
            articleid = article_url
            articleid = articleid[articleid.find('cn/') +
                                  3:-5]  # the database column is limited in length, so only part of the URL is used as the ID
            self.logger.warn(u'地址 %s 以url地址部分字段作为tid', article_url)

            if 'bbs1' not in article_url:

                main1 = soup.find(
                    'div', attrs={'class': "i_cont"}
                )  #http://health.people.com.cn/n1/2017/1011/c14739-29579836.html
                main2 = soup.find(
                    'div', attrs={'class': "text_c"}
                )  #http://rencai.people.com.cn/n/2014/0721/c244800-25311391.html

                if (main1 is None) and (main2 is None):
                    self.logger.debug(u'走main')
                    try:
                        Ttitle = main.find('h1').text.strip()  # title
                    except:
                        self.logger.error(u'Ttitle存在走了main部分却不满足其他格式的的url::%s',
                                          article_url)
                        return

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle_crawl = soup.find('div',
                                                 attrs={'class':
                                                        "box01"})  #对应一种格式
                        if Ttitle_crawl is None:
                            self.logger.error(
                                u'Ttitle_crawl存在走了main部分却不满足其他格式的的url::%s',
                                article_url)
                            return
                        try:
                            Tpublishtime = Ttitle_crawl.find('div',
                                                             attrs={
                                                                 'class': "fl"
                                                             }).text.strip()
                        except:
                            self.logger.error(u'main中发布时间不匹配')
                            return

                        if Tpublishtime is None:
                            self.logger.error(
                                u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return
                        else:
                            # self.logger.error(Tpublishtime)
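                            # Capture year, month and day plus the time; the last group holds the optional ":SS" seconds part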
                            Tpublishtime = re.findall(
                                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                                Tpublishtime)[0]

                            # Tpublishtime = Tpublishtime[:18]
                            if len(Tpublishtime[4]) > 1:
                                # Tpublishtime[3] already includes the seconds when they are present
                                Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                               Tpublishtime[3]
                            else:
                                Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                               Tpublishtime[3] + ':00'

                            # Tpublishtime = Tpublishtime.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')

                            Tauthor = Ttitle_crawl.find(
                                'a', attrs={'target': "_blank"})
                            if Tauthor is not None:
                                Tauthor = Tauthor.text.strip()
                            else:
                                Tauthor = 'None'
                            Tcontent = soup.find('div',
                                                 attrs={'class': "box_con"})
                            if Tcontent is not None:
                                Tcontent = Tcontent.text.strip()
                                Tcontent = re.sub(r'\n|\t', '', Tcontent)
                            else:
                                self.logger.error(
                                    u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                    article_url)
                                return

                            Treply = None  # these statistics are not available in this layout
                            meta_info = None

                            article = Article(articleid,
                                              self.channel.channel_id,
                                              Ttitle,
                                              Tcontent,
                                              Tpublishtime,
                                              article_url,
                                              None,
                                              Tauthor,
                                              meta_info=meta_info)
                            article.statistics.reply_count = Treply
                            #self.logger.info(article)
                            return article

                elif (main1 is not None):
                    self.logger.debug(u'走main1')
                    Ttitle = main1.find('h2')  # title

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle = Ttitle.text.strip()

                    try:
                        Tpublishtime = main1.find('div',
                                                  attrs={
                                                      'class': "artOri"
                                                  }).text.strip()
                    except:
                        self.logger.error(u'main1中发布时间不匹配')
                        return

                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Tpublishtime = re.findall(
                            r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                            Tpublishtime)[0]
                        # self.logger.error(Tpublishtime)
                        # Tpublishtime = Tpublishtime[:18]
                        if len(Tpublishtime[4]) > 1:
                            # Tpublishtime[3] already includes the seconds when they are present
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3]
                        else:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3] + ':00'

                        Tauthor = main1.find('div', attrs={
                            'class': "artOri"
                        }).find('a', attrs={'target': "_blank"})
                        # self.logger.debug(u"作者:%s",Tauthor)
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'

                        Tcontent = main1.find('div', attrs={'class': "artDet"})
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return

                        Treply = None  # these statistics are not available in this layout
                        meta_info = None

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply
                        self.logger.info(article)
                        return article

                elif (main2 is not None):
                    self.logger.debug(u'走main2')
                    Ttitle = main2.find('h2', attrs={'class': "one"})  # 标题

                    if Ttitle is None:
                        self.logger.error(
                            u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Ttitle = Ttitle.text.strip()
                    try:
                        Tpublishtime = main2.find('span',
                                                  attrs={
                                                      'id': "p_publishtime"
                                                  }).text.strip()
                    except:
                        self.logger.error(u'main2中发布时间不匹配')
                        return

                    if Tpublishtime is None:
                        self.logger.error(
                            u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                            article_url)
                        return
                    else:
                        Tpublishtime = re.findall(
                            r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                            Tpublishtime)[0]
                        # self.logger.error(Tpublishtime)
                        # Tpublishtime = Tpublishtime[:18]
                        if len(Tpublishtime[4]) > 1:
                            # Tpublishtime[3] already includes the seconds when they are present
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3]
                        else:
                            Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \
                                           Tpublishtime[3] + ':00'

                        Tauthor = main2.find('span', attrs={
                            'id': "p_origin"
                        }).find('a', attrs={'target': "_blank"})
                        # self.logger.debug(u"作者:%s",Tauthor)
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'

                        Tcontent = main2.find('div',
                                              attrs={'class': "show_text"})
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ',
                                article_url)
                            return

                        Treply = None  # these statistics are not available in this layout
                        meta_info = None

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply
                        self.logger.info(article)
                        return article

                else:
                    self.logger.warn(u'存在另外一种html格式 %s', article_url)

            # elif 'bbs1' in article_url: #bbs1的格式
            #     self.logger.debug(u'走bbs1')
            #     if main is not None:
            #         main_crawl = main.find('div',attrs={'class':"navBar"})
            #         Ttitle = main_crawl.find('h2').text
            #
            #         if Ttitle is None:
            #             self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s', article_url)
            #             return
            #         else:
            #             statice_crawl = soup.find('p', attrs={'class': "replayInfo"})
            #             Tpublishtime = statice_crawl.find('span',attrs={'class':"float_l mT10"}).text.strip()
            #             Tpublishtime = Tpublishtime[-19:]
            #             if Tpublishtime is None:
            #                 self.logger.error(u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url)
            #                 return
            #             else:
            #                 Tpublishtime = Tpublishtime.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
            #
            #                 Tauthor = None
            #
            #                 Tcontent_crawl = soup.find('article')
            #                 Tcontent_crawl1 = Tcontent_crawl.find('div').attrs['content_path']
            #                 Tcontent_html = self.session.download(Tcontent_crawl1,encoding='utf-8', data=None, isJson=False, timeout=10, retry=3)
            #                 soup1 = BeautifulSoup(Tcontent_html, 'html.parser')
            #                 Tcontent = soup1.text.strip()
            #                 if Tcontent is not None:
            #                     Tcontent = re.sub(r'\n|\t', '', Tcontent)
            #                 else:
            #                     self.logger.error(u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url)
            #                     return
            #
            #                 Tread = statice_crawl.find('span',attrs={'class':"readNum"}).text.strip()
            #                 Treply = statice_crawl.find('span', attrs={'class': "replayNum"}).text.strip()
            #                 Tlike = statice_crawl.find('span',attrs={'class':"float_l supportBtn"}).attrs['overscore']
            #
            #                 meta_info = None ##这里保存class= replyInfo
            #
            #                 article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime,
            #                                   article_url, None, Tauthor, meta_info=meta_info)
            #                 article.statistics.reply_count = Treply
            #                 article.statistics.read_count = Tread
            #                 article.statistics.like_count = Tlike
            #
            #                 # self.logger.info(article)
            #                 return article

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances and hasnext indicates whether there are more comments to crawl
        '''

        cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420'
        html = self.session.download(article.url,
                                     encoding='gbk',
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        article_url = article.url

        soup = BeautifulSoup(html['html'], 'html.parser')
        try:
            sid = soup.find('meta', attrs={
                'name': "contentid"
            }).attrs['content']
        except:
            return (list(), False)
        sid = re.sub(r'\D', '', sid)
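        # The comments live on bbs1.people.com.cn and are looked up by the page's contentid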
        bbs = 'http://bbs1.people.com.cn/postLink.do?nid=' + sid
        # bbs = soup.find('div', attrs={'class': "message"})
        # if bbs:
        # bbs = bbs.find('a')
        # if bbs:
        # bbs = bbs.attrs['href']
        # else:
        # bbs = 'http://bbs1.people.com.cn/postLink.do?nid='
        # print bbs
        # else:
        # return None

        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        html1 = self.session.download(bbs,
                                      encoding='gbk',
                                      data=None,
                                      isJson=False,
                                      timeout=10,
                                      retry=3)
        soup1 = BeautifulSoup(html1, 'html.parser')
        id = soup1.find('meta', attrs={'name': "contentid"})
        if id:
            id = id.attrs['content']
            id = re.sub(r'\D', '', id)
            re_url = 'http://bbs1.people.com.cn/api/postApi.do'
            page = 1
            while page < 30:
                data1 = {
                    'action': 'postDetailByParentId',
                    'replayPostId': id,
                    'pageNo': page
                }
                html2 = self.session.download(re_url,
                                              encoding='utf-8',
                                              data=data1,
                                              isJson=False,
                                              timeout=10,
                                              retry=3)
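                # The response wraps the reply list in an escaped JSON string; strip the escaping so json.loads can parse it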
                html2 = re.sub(r'\\\\\\', '', html2)
                html2 = re.sub(r'"\[\\"', '[', html2)
                html2 = re.sub(r'\\"\]"', ']', html2)
                html2 = re.sub(r'\\",\\"', ',', html2)
                html2 = json.loads(html2)
                totalCount = html2['totalCount']
                if totalCount == 0:
                    break
                replayPosts = html2['replayPosts']
                if replayPosts:
                    for i in replayPosts:
                        cid = i['id']
                        user_id = i['userId']
                        user_name = i['userNick']
                        user_ip = i['userIP']
                        # ip_address = get_ip_address(str(user_ip))
                        # ip_address = ''
                        user_head = ''
                        publish_datetime = time.strftime(
                            '%Y-%m-%d %H:%M:%S',
                            time.localtime(float(i['createTime']) / 1000))
                        reply_userid = i['parentId']
                        like_count = i['vote_yes']
                        unlike_count = i['vote_no']
                        read_count = i['readCount']
                        reply_count = i['replyCount']
                        source_url = article_url
                        content = i['contentText']
                        heat = 0
                        location_country = 'CN'
                        # print cid, user_id, user_name, user_ip, ip_address, user_head, publish_datetime, reply_userid
                        # print like_count,unlike_count,read_count,reply_count,source_url
                        commentList.append(
                            Comment(
                                article.tid,
                                self.channel.channel_id,
                                cid,
                                add_datetime,
                                publish_datetime,
                                user_ip,
                                location_country,
                                None,
                                None,  # ip_address is not implemented yet
                                user_id,
                                user_name,
                                content,
                                reply_userid,
                                None,
                                like_count,
                                reply_count,
                                dislike_count=None))
                    pageCount = html2['pageCount']  # total number of comment pages
                    if pageCount == page:
                        break
                    page = page + 1  # move to the next page of comments
                else:
                    break
        return (commentList, False)
Example #2
class XinHuaNewsCrawler(object):
    '''
    Crawler for Xinhua News articles
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        :param channel: media channel
        :param logger: logger
        :return:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = self.baiduCrawler.session
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        Search for articles matching the keyword list within the time range ending at endTime
        :param keywordList: list of keywords
        :param endTime: end of the search time range
        :return: list of articles
        '''
        startTime = endTime - datetime.timedelta(days=1)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for url in urls:
            self.logger.debug(url)
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                # The same article may appear several times in the search results: Baidu's result URLs are unique but can point to the same article, so deduplicate
                articleList.append(article)
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        :param article: the article
        :return: nothing is returned; the statistics are written into the article instance
        '''
        pass

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        :param url:
        :return: an Article instance
        '''
        cookies = None
        html = self.session.download(url=url,
                                     encoding='utf-8',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            article_id = re.findall(r'c_\d+', article_url)[0]
            article_id = article_id[2:]
            soup = BeautifulSoup(html['html'], "html.parser")
            main1 = soup.find('div', attrs={'class': "widthMain main"})
            main2 = soup.find('div', attrs={'class': "main pagewidth"})
            main3 = soup.find('body', attrs={'class': "streamline-page"})
            main4 = soup.find('div', attrs={'class': "h-title"})
            main5 = soup.find('div', attrs={'id': "article"})
            main6 = soup.find('div', attrs={'id': "Title"})
            main7 = soup.find('div', attrs={'class': "article"})

            if main1 is not None:
                self.logger.debug("main1")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain1(main1)
                if Ttitle is not None and Ttime is not None and Tcontent is not None:
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main2 is not None:
                self.logger.debug("main2")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain2(main2)
                if Ttitle is not None and Ttime is not None and Tcontent is not None:
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main3 is not None:
                self.logger.debug("main3")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain3(main3)
                if Ttitle is not None and Ttime is not None and Tcontent is not None:
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main3 is None and main4 is not None:
                self.logger.debug("main4")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain4(soup)
                if Ttitle is not None and Ttime is not None and Tcontent is not None:
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main5 is not None:
                self.logger.debug("main5")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain5(main5)
                if Ttitle is not None and Ttime is not None and Tcontent is not None:
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main6 is not None:
                self.logger.debug("main6")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain6(soup)
                if Ttitle is not None and Ttime is not None and Tcontent is not None:
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if main7 is not None:
                self.logger.debug("main7")
                Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain7(soup)
                if Ttitle is not None and Ttime is not None and Tcontent is not None:
                    article = Article(tid=article_id,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      author_name=Tauthor,
                                      publish_datetime=Ttime,
                                      content=Tcontent,
                                      url=article_url)
                    return article

            if (main1 is None and main2 is None and main3 is None
                    and main4 is None and main5 is None and main6 is None
                    and main7 is None):
                self.logger.error(u"存在另外一种html格式::%s", url)
                return

    def refreshSearch(self):
        '''
        Reset the search state
        :return:
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler
        :return:
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list
        :param article: the article
        :return: a (commentList, hasnext) tuple; commentList is a list of Comment instances and hasnext indicates whether there are more comments to crawl
        '''
        return ([], False)

    def crawlMain1(self, main1):
        # get the title
        Ttitle = main1.find('h1')
        if Ttitle is None:
            self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        else:
            Ttitle = Ttitle.text.strip()

        # get the publish time
        Tinfo = main1.find('div', attrs={'class': "info"})
        if Tinfo is not None:
            Ttime = Tinfo.find('span', attrs={'class': "h-time"})
            Ttime = Ttime.text.strip()
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None

        if len(Ttime) == 16:
            Ttime = Ttime + ':00'

        # get the author
        Tauthor = Tinfo.find('em', attrs={'id': "source"})
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None

        # get the article content
        Tcontent = main1.find('div', attrs={'id': "content"})

        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent

    def crawlMain2(self, main2):
        # get the title
        Ttitle = main2.find('h1')
        if Ttitle is None:
            self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        else:
            Ttitle = Ttitle.text.strip()

        # get the publish time
        Tinfo = main2.find('div', attrs={'class': "info"})
        if Tinfo is not None:
            Ttime = Tinfo.find('span', attrs={'id': "pubtime"})
            Ttime = Ttime.text.strip()
            if (Ttime == ""):
                Ttime = Tinfo.text.strip()
                Ttime = re.findall(u'\d{4}年\d{2}月\d{2}日.\d{2}:\d{2}:\d{2}',
                                   Ttime)[0]
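            # Convert the Chinese "YYYY年MM月DD日 HH:MM:SS" form into "YYYY-MM-DD HH:MM:SS"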
            timeArray = time.strptime(Ttime, u"%Y年%m月%d日 %H:%M:%S")
            Ttime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None

        if len(Ttime) == 16:
            Ttime = Ttime + ':00'

        # get the author
        Tauthor = Tinfo.find('em', attrs={'id': "source"})
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None

        # get the article content
        Tcontent = main2.find('div', attrs={'id': "content"})
        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent

    def crawlMain3(self, main3):
        # get the title
        Ttitle = main3.find('div', attrs={'class': "h-title"})
        if Ttitle is None:
            self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        else:
            Ttitle = Ttitle.text.strip()

        # get the publish time
        Tinfo = main3.find('div', attrs={'class': "h-info"})
        if Tinfo is not None:
            Ttime = Tinfo.find('span', attrs={'class': "h-time"})
            Ttime = Ttime.text.strip()
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None

        if len(Ttime) == 16:
            Ttime = Ttime + ':00'

        # get the author
        Tauthor = Tinfo.find_all('span')[1]
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None

        # get the article content
        Tcontent = main3.find('div', attrs={'class': "h-title"})
        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent

    def crawlMain4(self, main4):
        # get the title
        Ttitle = main4.find('div', attrs={'class': "h-title"})
        if Ttitle is None:
            self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        else:
            Ttitle = Ttitle.text.strip()

        # get the publish time
        Tinfo = main4.find('div', attrs={'class': "h-info"})
        if Tinfo is not None:
            Ttime = Tinfo.find('span', attrs={'class': "h-time"})
            Ttime = Ttime.text.strip()
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None

        if len(Ttime) == 16:
            Ttime = Ttime + ':00'

        # get the author
        Tauthor = Tinfo.find_all('span')[1]
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None

        # get the article content
        Tcontent = main4.find('div', attrs={'id': "p-detail"})
        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent

    def crawlMain5(self, main5):
        # get the title
        Ttitle = main5.find('h1')
        if Ttitle is None:
            self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        else:
            Ttitle = Ttitle.text.strip()

        # get the publish time
        Tinfo = main5.find('div', attrs={'class': "source"})
        Ttime = Tinfo.find('span', attrs={'class': "time"})
        if Ttime is not None:
            Ttime = Ttime.text.strip()
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None

        if len(Ttime) == 16:
            Ttime = Ttime + ':00'

        # get the author
        Tauthor = Tinfo.find('em', attrs={'id': "source"})
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None

        # get the article content
        Tcontent = main5.find('div', attrs={'class': "article"})
        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent

    def crawlMain6(self, main6):
        # get the title
        Ttitle_div = main6.find('div', attrs={'id': "Title"})
        if Ttitle_div is None:
            self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        else:
            Ttitle = Ttitle_div.text.strip()

        # get the publish time
        title_parents = Ttitle_div.find_parents()
        Tinfo = title_parents[2].find_all('td')[1]
        Ttime = Tinfo.text.strip()
        try:
            Ttime = re.findall(u'\d{4}年\d{2}月\d{2}日.\d{2}:\d{2}:\d{2}',
                               Ttime)[0]
        except:
            self.logger.error(traceback.format_exc())
            Ttime = ""
        if Ttime == "":
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        else:
            timeArray = time.strptime(Ttime, u"%Y年%m月%d日 %H:%M:%S")
            Ttime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)

        Tauthor = Tinfo.find('font')
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None

        # get the article content
        Tcontent = main6.find('div', attrs={'id': "Content"})
        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent

    def crawlMain7(self, main7):
        # get the title
        Ttitle = main7.find('h1')
        if Ttitle is None:
            self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        else:
            Ttitle = Ttitle.text.strip()

        # get the publish time
        Ttime = main7.find('span', attrs={'class': "time"})
        if Ttime is not None:
            Ttime = Ttime.text.strip()
            timeArray = time.strptime(Ttime, u"%Y年%m月%d日 %H:%M:%S")
            Ttime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None

        if len(Ttime) == 16:
            Ttime = Ttime + ':00'

        # get the author
        Tauthor = main7.find('em', attrs={'id': "source"})
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None

        # get the article content
        Tcontent = main7.find('div', attrs={'class': "article"})
        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews]' +
                              u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent
Example #3
class TencentNewsCrawler(object):
    '''
    Crawler for Tencent News articles (news.qq.com, new.qq.com, gd.qq.com)
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = self.baiduCrawler.session
        self.channel = channel
        self.lastCommentId = None

    def searchArticle(self, keywordList, endTime):
        '''
        Search for articles matching the keyword list within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())

        # news.qq.com alone yields too few articles, so two more URL prefixes are included
        channel_prefix = ['news', 'new', 'gd']
        channel_urls = self.channel.url.split('|')
        channel_dict = dict(zip(channel_prefix, channel_urls))
        websites = dict()
        for prefix, channel_url in channel_dict.items():
            websites[prefix] = self.baiduCrawler.search(
                channel_url, keywordList, startTimeIntSecond, endTimeIntSecond)

        articleList = self.crawlArticle(channel_prefix, websites)

        return articleList

    def crawlArticle(self, channel_prefix, websites):
        '''
        :param channel_prefix: list of URL prefixes
        :param websites: dict mapping each URL prefix to its list of article urls
        :return:
        '''

        articleList = list()
        for prefix, url_list in websites.items():
            if prefix == channel_prefix[0]:
                for i in range(len(url_list)):
                    article = self.crawlNewsArticle(url_list[i])
                    if article is not None and article not in articleList:
                        # The same article can appear several times in the search results, so deduplicate
                        articleList.append(article)

            if prefix == channel_prefix[1]:
                for i in range(len(url_list)):
                    article = self.crawlNewArticle(url_list[i])
                    if article is not None and article not in articleList:
                        # The same article can appear several times in the search results, so deduplicate
                        articleList.append(article)

            if prefix == channel_prefix[2]:
                for i in range(len(url_list)):
                    article = self.crawlNewsArticle(url_list[i])
                    if article is not None and article not in articleList:
                        # The same article can appear several times in the search results, so deduplicate
                        articleList.append(article)

        return articleList

    def crawlNewsArticle(self, url):
        '''
        Crawl articles whose url prefix is news.qq.com or gd.qq.com
        :param url:
        :return:
        '''
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        if html:
            article_url = html['url']
            if article_url.find('news.qq.com') < 0 and article_url.find(
                    'gd.qq.com') < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[TencentNews]' + article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('div', attrs={'id': "Main-Article-QQ"})
            main1 = soup.find('div', attrs={'id': "Main-P-QQ"})
            if main is not None:
                Ttitle = main.find('h1').text.strip()  # title
                Ttime = main.find('span', attrs={'class':
                                                 "article-time"})  # publish time
                Ttime1 = main.find('span', attrs={'class': "a_time"})
                Ttime2 = main.find('span', attrs={'class': "pubTime"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                elif Ttime1 is not None:
                    Ttime1 = Ttime1.text.strip()
                    Ttime = Ttime1
                elif Ttime2 is not None:
                    Ttime2 = Ttime2.text.strip()
                    Ttime = Ttime2
                else:
                    Ttime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                Tauthor = main.find('span', attrs={'class': "a_source"})
                Tauthor1 = main.find('span', attrs={'class': "color-a-1"})
                if Tauthor is not None:
                    #Tauthor = Tauthor.find('a').text.strip()
                    Tauthor = Tauthor.text.strip()
                elif Tauthor1 is not None:
                    #Tauthor1 = Tauthor1.find('a').text.strip()
                    Tauthor1 = Tauthor1.text.strip()
                    Tauthor = Tauthor1
                else:
                    Tauthor = None
                Tcontent = main.find('div',
                                     attrs={'id': "Cnt-Main-Article-QQ"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    Tcontent = None
                articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
                try:
                    commentid = re.findall(r'cmt_id = (\d+);', html['html'])[0]
                    meta_info = '{"commentid":"%s"}' % commentid
                except:
                    commentid = None
                    meta_info = None
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Ttime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                if commentid is not None:
                    try:
                        re_url = 'http://coral.qq.com/article/' + commentid + '/commentnum'
                        html1 = json.loads(
                            self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  timeout=10,
                                                  retry=3))
                        Treply = int(html1['data']['commentnum'])
                    except Exception:
                        traceInfo = traceback.format_exc()
                        self.logger.error(
                            'Failed to parse comment for %s (cid=%s):%s',
                            articleid, commentid, traceInfo)
                        Treply = None
                    article.statistics.reply_count = Treply
                return article
            elif main1 is not None:
                Ttitle = soup.find('meta', attrs={
                    'name': "Description"
                }).attrs['content']  # title
                Ttime = re.findall(
                    r"pubtime\D+(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})\',",
                    html['html'])
                if Ttime is not None:
                    Ttime = Ttime[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[
                        2] + ' ' + Ttime[3]
                else:
                    Ttime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'
                Tauthor = re.findall(r'para = {\s+name: \"(.*)\",',
                                     html['html'])
                if Tauthor is not None:
                    Tauthor = Tauthor[0]
                else:
                    Tauthor = None
                con_url = re.sub(r'\.htm\?.*', '.hdBigPic.js', article_url)
                con_html = self.session.download(con_url,
                                                 encoding='gbk',
                                                 data=None,
                                                 timeout=10,
                                                 retry=3)
                con_list = re.findall(r'<p>(.*?)</p>', con_html)
                if con_list is not None:
                    TT = []
                    for i in con_list:
                        if i.strip() not in TT:
                            TT.append(i)
                    Tcontent = ''.join(TT)
                else:
                    Tcontent = None
                articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0]
                try:
                    commentid = re.findall(r'aid\D+(\d+)\",', html['html'])[0]
                    meta_info = '{"commentid":"%s"}' % commentid
                except:
                    commentid = None
                    meta_info = None
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Ttime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                try:
                    if commentid is not None:
                        re_url = 'http://coral.qq.com/article/batchcommentnum'
                        data1 = {'targetid': articleid}
                        html1 = json.loads(
                            self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=data1,
                                                  timeout=10,
                                                  retry=3))
                        Treply = int(html1['data'][0]['commentnum'])
                    else:
                        Treply = None
                except:
                    Treply = None
                article.statistics.reply_count = Treply
                return article
        return None

    def crawlNewArticle(self, url):
        '''
        Crawl articles whose url prefix is new.qq.com
        :param url:
        :return:
        '''
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)
        if html:
            article_url = html['url']
            if article_url.find('new.qq.com/omn') < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return

            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[TencentNew]' + article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')

            script_tags = soup.head.find_all('script')
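            # The article metadata is embedded as a JSON object assigned to window.DATA in one of the <script> tags; extract and parse it below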
            data = dict()
            for tag in script_tags:
                text = re.search(r'window.DATA = (.*)', tag.text, re.S)
                if text:
                    data = json.loads(text.group(1))

            tid = data['article_id']
            title = data['title']
            author_name = data['media']
            author_id = data['media_id']
            publish_datetime = data['pubtime']
            comment_id = data['comment_id']

            main = soup.find('div', attrs={'class': 'qq_conent clearfix'})
            t_content = ''
            if main is not None:
                contents = main.find_all('p', {'class': 'one-p'})
                for content in contents:
                    if content.string is None:
                        continue
                    t_content += str(content.get_text().strip())

            get_comment_count_url = 'https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum' % (
                comment_id, comment_id)
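            # coral.qq.com answers with JSONP (_article<comment_id>commentnum(<json>));
            # the regex below strips the callback wrapper so the JSON payload can be parsed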
            comment_data = self.session.download(get_comment_count_url)
            comment_data = re.search(
                r'_article%scommentnum\((.*)\)' % comment_id, comment_data)

            comment_dict = json.loads(comment_data.group(1))  # parse the JSON payload instead of eval'ing downloaded data
            reply_count = comment_dict['data']['commentnum']
            meta_info = '{"commentid":"%s"}' % comment_id

            article = Article(tid=tid,
                              channel_id=self.channel.channel_id,
                              title=title,
                              content=t_content,
                              publish_datetime=publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info=meta_info)
            article.statistics.reply_count = reply_count
            return article
        return None

    def crawlStatistics(self, article):
        meta_info = article.meta_info
        if meta_info is None:
            return
        jo = json.loads(meta_info)
        if "commentid" not in jo:
            return
        commentid = jo["commentid"]
        re_url = 'http://coral.qq.com/article/batchcommentnum'
        data1 = {'targetid': commentid}
        html1 = json.loads(
            self.session.download(re_url,
                                  encoding='utf-8',
                                  data=data1,
                                  timeout=10,
                                  retry=3))
        article.statistics.reply_count = int(html1['data'][0]['commentnum'])

    def refreshSearch(self):
        pass

    def refreshCommentCrawler(self):
        self.lastCommentId = None

    def crawlComment(self, article):
        # Fetch the article's comments
        meta_info = article.meta_info
        if meta_info is None:
            return (list(), False)
        jo = json.loads(meta_info)
        if "commentid" not in jo:
            return (list(), False)
        commentid = jo["commentid"]
        cookies = 'pac_uid=0_58ec8106620c1; gj_mpvid=80515918; ad_play_index=97; dsp_cookiemapping0=1492586667155; pgv_info=ssid=s9259450720; ts_last=news.qq.com/a/20170415/002007.htm; ts_refer=www.baidu.com/link; pgv_pvid=1281052383; ts_uid=1143064466; ptag=www_baidu_com|'
        re_url = 'http://coral.qq.com/article/' + commentid + '/comment'
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
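        # Cursor-style paging: pass the id of the last comment seen ('0' for the
        # first page) and request up to 50 comments per call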
        data1 = {
            'commentid':
            self.lastCommentId if self.lastCommentId is not None else '0',
            'reqnum': '50'
        }
        html = self.session.download(re_url,
                                     encoding='utf-8',
                                     cookies=cookies,
                                     data=data1,
                                     timeout=10,
                                     retry=3)
        jo = json.loads(html)
        if jo['errCode'] != 0:
            return ([], False)
        if jo['data']['retnum'] == 0:
            return ([], False)
        self.lastCommentId = jo['data']['last']
        for i in jo['data']['commentid']:
            cid = i['id']
            user_id = i['userinfo']['userid']
            user_name = i['userinfo']['nick']
            user_ip = ''
            location = i['userinfo']['region'].replace(u'市', '').replace(
                u'自治', '').replace(u'新区', '').replace(u'区',
                                                      '').replace(u'洲', '')
            location_list = location.split(':')

            location_country = location_list[0]
            if len(location_list) > 1:
                location_region = location_list[1]
            else:
                location_region = ''
            if len(location_list) > 2:
                location_city = location_list[2]
            else:
                location_city = ''
            #user_head = i['userinfo']['head']

            publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(i['time']))
            reply_userid = str(i['replyuserid'])
            like_count = i['up']
            reply_count = i['rep']
            content = i['content']
            # print cid, user_id, user_name, user_ip, ip_address, user_head, publish_datetime, reply_userid
            # print like_count,unlike_count,read_count,reply_count,source_url
            commentList.append(
                Comment(article.tid, self.channel.channel_id, cid,
                        add_datetime, publish_datetime, user_ip,
                        location_country, location_region, location_city,
                        user_id, user_name, content, reply_userid, None,
                        like_count, reply_count, None))
        return (commentList, jo['data']['hasnext'])
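
All of these example crawlers expose the same duck-typed interface (searchArticle, crawlArticle, crawlStatistics, refreshSearch, refreshCommentCrawler, crawlComment). The sketch below spells that shared contract out as an abstract base class; the name NewsCrawlerInterface is hypothetical and not part of the original code.

import abc


class NewsCrawlerInterface(object):
    '''
    Hypothetical description of the contract the example crawlers share.
    '''
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def searchArticle(self, keywordList, endTime):
        '''Return a deduplicated list of Article instances matching the keywords.'''

    @abc.abstractmethod
    def crawlArticle(self, url):
        '''Return an Article instance for the url, or None if the page cannot be parsed.'''

    @abc.abstractmethod
    def crawlStatistics(self, article):
        '''Update article.statistics in place; returns nothing.'''

    @abc.abstractmethod
    def crawlComment(self, article):
        '''Return a (commentList, hasnext) tuple for one page of comments.'''

    def refreshSearch(self):
        pass

    def refreshCommentCrawler(self):
        pass
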
Example #4
0
class IFengNewsCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''

        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = self.baiduCrawler.session
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        Search for articles by a list of keywords within the configured time range
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)

        articleList = list()
        for url in urls:
            article = self.crawlArticle(url)
            # self.crawlStatistics(article)
            if article is not None and article not in articleList:
                # The same article may appear multiple times in the search results; Baidu result urls are unique but may point to the same article, so deduplicate
                articleList.append(article)
        return articleList

    def crawlStatistics(self, article):  # TODO: fetch only the needed statistics next time instead of crawling the whole page
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        meta_info = article.meta_info  # for ifeng, meta_info stores the doc_url value; it comes in several formats, either subxxxx_0 or the article url
        # print len(meta_info)
        data1 = {
            'callback': 'newCommentListCallBack',
            'doc_url': meta_info,
            'job': '1'
        }
        re_url = 'http://comment.ifeng.com/get.php'
        html1 = self.session.download(re_url,
                                      encoding='gbk',
                                      data=data1,
                                      timeout=10,
                                      retry=3,
                                      addr=False,
                                      isJson=True)

        article.statistics.reply_count = html1['count']  # add further statistics here if needed

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        html = self.session.download(url,
                                     encoding='utf-8',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True)

        if html:
            article_url = html['url']
            if article_url.find(self.channel.url) < 0:
                self.logger.warn('Unrelated url found:%s', url)
                return None
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.debug('[iFengnews]' + article_url)
            soup = BeautifulSoup(html['html'], 'lxml')  # lxml parser
            main = soup.find('div', attrs={'class': "main"})
            main1 = soup.find('div', attrs={'class': "yc_main"})

            if main is not None:
                self.logger.debug(u'Using the first page layout')
                Ttitle = main.find('h1').text.strip()  # title
                if Ttitle is None:
                    self.logger.error(u'Missing title, cannot build the article; the page layout may have changed, stop crawling this page:: url %s ',
                                      url)
                    return
                else:
                    Tpublishtime = main.find('span', attrs={
                        'class': "ss01"
                    }).text.strip()
                    if Tpublishtime is None:
                        self.logger.error(
                            u'Missing publish time, cannot build the article; the page layout may have changed, stop crawling this page:: url %s ', url)
                        return
                    else:
                        Tpublishtime = Tpublishtime.replace(u'年', '-').replace(
                            u'月', '-').replace(u'日', '')

                        Tauthor = main.find('a', attrs={'target': "_blank"})
                        if Tauthor is not None:
                            Tauthor = Tauthor.text.strip()
                        else:
                            Tauthor = 'None'
                        Tcontent = main.find('div',
                                             attrs={'id': "main_content"})
                        # print Tcontent
                        # Tcontent = Tcontent.find('p')
                        if Tcontent is not None:
                            Tcontent = Tcontent.text.strip()
                            Tcontent = re.sub(r'\n|\t', '', Tcontent)
                        else:
                            self.logger.error(
                                u'Missing content, cannot build the article; the page layout may have changed, stop crawling this page:: url %s ',
                                url)
                            return

                        doc_url = re.findall(r'"commentUrl":"(.*)",',
                                             html['html'])
                        if doc_url:
                            doc_url = doc_url[0]
                        else:
                            doc_url = url

                        data1 = {
                            'callback': 'newCommentListCallBack',
                            # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml',
                            'doc_url': doc_url,
                            'job': '1'
                        }
                        re_url = 'http://comment.ifeng.com/get.php'
                        html1 = self.session.download(re_url,
                                                      encoding='gbk',
                                                      data=data1,
                                                      timeout=10,
                                                      retry=3,
                                                      addr=False,
                                                      isJson=True)
                        # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] )

                        Treply = html1['count']
                        if len(html1['comments']) != 0:
                            articleid = html1['comments'][0]['article_id']
                        else:
                            articleid = article_url
                            articleid = articleid[articleid.find('a/') + 2:
                                                  -6]  # the database field length is limited, so use part of the url as the ID
                            self.logger.warn(u'No comments for %s, so part of the url is used as the tid',
                                             article_url)

                        meta_info = doc_url

                        article = Article(articleid,
                                          self.channel.channel_id,
                                          Ttitle,
                                          Tcontent,
                                          Tpublishtime,
                                          article_url,
                                          None,
                                          Tauthor,
                                          meta_info=meta_info)
                        article.statistics.reply_count = Treply

                        self.logger.info(article)
                        return article

            ## Crawl the second page layout
            if main1 is not None:
                self.logger.debug(u'Using the second page layout')
                Ttitle = main1.find('h1').text.strip()
                if Ttitle is None:
                    self.logger.error(u'Missing title, cannot build the article; the page layout may have changed, stop crawling this page:: url %s ',
                                      url)
                    return
                else:
                    Tpublishtime = main1.find('span').text.strip()
                    if Tpublishtime is None:
                        self.logger.error(
                            u'Missing publish time, cannot build the article; the page layout may have changed, stop crawling this page:: url %s ', url)
                        return

                # return Tpublishtime
                Tauthor = main1.find('a', attrs={'target': "_blank"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = 'None'
                Tcontent = main1.find('div', attrs={'class': "yc_con_txt"})

                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.warn(
                        u'Missing content, cannot build the article; the page layout may have changed, stop crawling this page:: url %s ', url)
                    return

                doc_url = re.findall(r'"commentUrl":"(.*)",', html['html'])
                if doc_url:
                    doc_url = doc_url[0]
                else:
                    doc_url = url

                data1 = {
                    'callback': 'newCommentListCallBack',
                    # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml',
                    'doc_url': doc_url,
                    'job': '1'
                }
                re_url = 'http://comment.ifeng.com/get.php'
                html1 = self.session.download(re_url,
                                              encoding='gbk',
                                              data=data1,
                                              timeout=10,
                                              retry=3,
                                              addr=False,
                                              isJson=True)
                # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] )
                try:
                    Treply = html1['count']
                except:
                    Treply = None

                if len(html1['comments']) != 0:
                    articleid = html1['comments'][0]['article_id']
                else:
                    articleid = url.strip()
                    articleid = articleid[articleid.find('a/') +
                                          2:-6]  # the database field length is limited, so use part of the url as the ID
                    self.logger.warn(u'No comments for %s, so part of the url is used as the tid',
                                     article_url)

                meta_info = doc_url
                article = Article(articleid,
                                  self.channel.channel_id,
                                  Ttitle,
                                  Tcontent,
                                  Tpublishtime,
                                  article_url,
                                  None,
                                  Tauthor,
                                  meta_info=meta_info)
                article.statistics.reply_count = Treply
                self.logger.info(article)
                return article

            if (main is None) and (main1 is None):
                self.logger.warn(u"Unknown html layout encountered::%s", url)
                return

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        self.lastCommentId = None

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext indicates whether more comments remain to be crawled
        '''
        #self.logger.debug('Article:%s', article)

        html = self.session.download(article.url,
                                     encoding='utf-8',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     isJson=False)
        # meta_info = article.meta_info
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # add_datetime = time.mktime(time.strptime('','%Y-%m-%d'))
        commentList = list()
        page = 1
        while page < 30:
            doc_url = re.findall(r'"commentUrl":"(.*)",', html['html'])
            if doc_url:
                doc_url = doc_url[0]
            else:
                doc_url = article.url

            data1 = {
                'callback': 'newCommentListCallBack',
                'orderby': '',
                'docUrl': doc_url,
                'job': '1',
                'p': page
            }
            re_url = 'http://comment.ifeng.com/get.php'
            html1 = self.session.download(re_url,
                                          encoding='gbk',
                                          data=data1,
                                          timeout=10,
                                          retry=3,
                                          isJson=True)
            totalcount = html1['count']  # total comment count
            if totalcount == 0:
                break
            comments = html1['comments']
            if comments:
                for comment in comments:
                    cid = comment['comment_id']
                    user_id = comment['user_id']
                    user_name = comment['uname']
                    user_ip = comment['client_ip']
                    # ip_address = get_ip_address(self, str(user_ip))  # never returned a value
                    # if ip_address is '':
                    try:
                        ip_address = comment['ip_from']
                    except:
                        ip_address = None
                    # ip_address = comment['ip_from']
                    user_head = comment['user_url']
                    publish_datetime = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(float(comment['create_time'])))
                    reply_userid = comment['parent']  # the comment being replied to
                    if reply_userid:
                        reply_userid = comment['parent'][0]['user_id']
                    else:
                        reply_userid = ''
                    like_count = comment['uptimes']
                    unlike_count = None
                    read_count = None
                    reply_count = None
                    source_url = article.url
                    content = comment['comment_contents']
                    heat = 0
                    location_country = 'CN'

                    if ip_address is None:
                        commentList.append(
                            Comment(
                                article.tid,
                                self.channel.channel_id,
                                cid,
                                add_datetime,
                                publish_datetime,
                                user_ip,
                                None,
                                None,
                                None,  ### ip_address lookup not implemented yet
                                user_id,
                                user_name,
                                content,
                                reply_userid,
                                None,
                                like_count,
                                reply_count,
                                dislike_count=None))
                    else:
                        try:
                            location_region = ip_address[:ip_address.
                                                         find(u'省') + 1]
                            location_city = ip_address[ip_address.find(u'省') +
                                                       1:]
                        except:
                            location_region = None
                            location_city = None
                        commentList.append(
                            Comment(article.tid,
                                    self.channel.channel_id,
                                    cid,
                                    add_datetime,
                                    publish_datetime,
                                    user_ip,
                                    location_country,
                                    location_region,
                                    location_city,
                                    user_id,
                                    user_name,
                                    content,
                                    reply_userid,
                                    None,
                                    like_count,
                                    reply_count,
                                    dislike_count=None))

                page = page + 1
                totalpage = math.ceil(totalcount / 20.0)  # total number of comment pages, rounded up
                if totalpage < page:
                    break
            else:
                break
        return (commentList, False)  # during testing, article[0][222].content retrieves the content of the 222nd comment
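
The (commentList, hasnext) value returned by crawlComment pairs with refreshCommentCrawler to drive a fetch-everything loop. A minimal driver sketch under that assumption is shown below; collect() and its argument names are placeholders, not part of the original code.

import datetime


def collect(crawler, keywordList):
    # 'crawler' is any of the example crawler classes, already constructed with its channel.
    articles = crawler.searchArticle(keywordList, datetime.datetime.now())
    results = []
    for article in articles:
        crawler.refreshCommentCrawler()  # reset the paging cursor for each article
        comments = []
        hasnext = True
        while hasnext:
            page, hasnext = crawler.crawlComment(article)
            comments.extend(page)
        results.append((article, comments))
    return results
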
Example #5
0
class SinaNewsCrawler(object):
    '''
    classdocs
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = self.baiduCrawler.session
        self.channel = channel
    
    def searchArticle(self, keywordList, endTime):
        '''
        Search for articles by a list of keywords within the configured time range
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList, startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for url in urls:
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                # The same article may appear multiple times in the search results; Baidu result urls are unique but may point to the same article, so deduplicate
                articleList.append(article)
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        cookies = 'U_TRS1=000000fa.9fe376b4.58573ebc.bde2f2c3; UOR=,vip.stock.finance.sina.com.cn,; vjuids=3923fcfb8.15914cd122a.0.e347599b65a6; SINAGLOBAL=183.63.92.250_1482112700.861930; SUB=_2AkMvC7H0f8NhqwJRmP4WzWzrb4xwzgnEieLBAH7sJRMyHRl-yD83qlNetRBAqqE4nv4pjjxQaUfLZo_Os-Bxsw..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFZzJ6nbHTRfVEqOXp-S.5z; SGUID=1482721389362_efec0e8d; vjlast=1488765553.1489054965.10; bdshare_firstime=1492414283526; _ct_uid=58f46f61.537a7929; lxlrtst=1492423120_o; rotatecount=2; Apache=59.42.29.149_1492670298.869113; ULV=1492670299361:18:6:6:59.42.29.149_1492670298.869113:1492670298484; afpCT=1; CNZZDATA1252916811=1442218969-1492654141-http%253A%252F%252Fnews.sina.com.cn%252F%7C1492664941; UM_distinctid=15b8a154522e79-0a3f79bddc9d05-4e45042e-100200-15b8a154523a49; CNZZDATA5399792=cnzz_eid%3D349789736-1492650802-http%253A%252F%252Fnews.sina.com.cn%252F%26ntime%3D1492667002; U_TRS2=00000095.1c285e96.58f85761.e07aa962; lxlrttp=1492423120'
        html = self.session.download(article.url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies)
        re_url = 'http://comment5.news.sina.com.cn/page/info'
        channel = re.findall(r"channel: '(.*)',", html['html'])[0]
        newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
        data = {
            'format': 'js',
            'channel': channel,
            'newsid': newsid,
            'group': '',
            'compress': '1',
            'ie': 'gbk',
            'oe': 'gbk',
            'page': '1',
            'page_size': '20'
        }
        try:
            html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
            html1 = re.sub(r'(.*=)\{', '{', html1)
            html1 = json.loads(html1)
            article.statistics.reply_count = html1['result']['count']['show']
        except:
            self.logger.error('[SinaStatistics]url:' + article.url + ', tid:' + article.tid + ', ' + traceback.format_exc())
            return
    
    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        cookies = 'U_TRS1=000000fa.9fe376b4.58573ebc.bde2f2c3; UOR=,vip.stock.finance.sina.com.cn,; vjuids=3923fcfb8.15914cd122a.0.e347599b65a6; SINAGLOBAL=183.63.92.250_1482112700.861930; SUB=_2AkMvC7H0f8NhqwJRmP4WzWzrb4xwzgnEieLBAH7sJRMyHRl-yD83qlNetRBAqqE4nv4pjjxQaUfLZo_Os-Bxsw..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFZzJ6nbHTRfVEqOXp-S.5z; SGUID=1482721389362_efec0e8d; vjlast=1488765553.1489054965.10; bdshare_firstime=1492414283526; _ct_uid=58f46f61.537a7929; lxlrtst=1492423120_o; rotatecount=2; Apache=59.42.29.149_1492670298.869113; ULV=1492670299361:18:6:6:59.42.29.149_1492670298.869113:1492670298484; afpCT=1; CNZZDATA1252916811=1442218969-1492654141-http%253A%252F%252Fnews.sina.com.cn%252F%7C1492664941; UM_distinctid=15b8a154522e79-0a3f79bddc9d05-4e45042e-100200-15b8a154523a49; CNZZDATA5399792=cnzz_eid%3D349789736-1492650802-http%253A%252F%252Fnews.sina.com.cn%252F%26ntime%3D1492667002; U_TRS2=00000095.1c285e96.58f85761.e07aa962; lxlrttp=1492423120'
        html = self.session.download(url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.info('[SinaNews]'+article_url)
            # Extract the publish date from the url
            date = re.findall(r'/(\d{4}-\d{2}-\d{2})/', article_url)
            if len(date) == 0:
                return None
            # if date[0] < '2015-07-01':
            #     html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True)

            soup = BeautifulSoup(html['html'], 'lxml')
            main = soup.find('div', attrs={'class': "wrap-inner"})
            main1 = soup.find('div', attrs={'class': "Main clearfix"})
            main2 = soup.find('div', attrs ={'class': "main-content w1240"})
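            # Sina article pages come in (at least) three layouts; use whichever
            # known container is present on this page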

            # First page layout
            if main is not None:
                self.logger.debug('Using the first page layout')
                # Title
                Ttitle = main.find('h1', attrs={'id': "artibodyTitle"})
                if Ttitle is None:
                    self.logger.error('[SinaNews] Missing title; the page layout may have changed, stop crawling this article')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # Publish time
                Ttime = main.find('span', attrs={'class': 'time-source'})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}).*', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews] Missing publish time; the page layout may have changed, stop crawling this article')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # Author
                Tauthor = soup.find('span', attrs={'class': "time-source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None
                else:
                    Tauthor = None

                # Content
                Tcontent = main.find('div', attrs={'id': "artibody"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews] Missing content; the page layout may have changed, stop crawling this article')
                    return

                # Comment count
                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # Extract the article id from the url
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article

            # Second page layout
            elif main1 is not None:
                self.logger.debug('Using the second page layout')
                # Title
                Ttitle = main1.find('h1', attrs={'id': "artibodyTitle"})
                if Ttitle is None:
                    self.logger.error('[SinaNews] Missing title; the page layout may have changed, stop crawling this article')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # Publish time
                Ttime = main1.find('span', attrs={'id': "pub_date"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews] Missing publish time; the page layout may have changed, stop crawling this article')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # Author
                Tauthor = main1.find('span', attrs={'id': "media_name"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a').text.strip()
                else:
                    Tauthor = None

                # Content
                Tcontent = main1.find('div', attrs={'id': "artibody"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews] Missing content; the page layout may have changed, stop crawling this article')
                    return

                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # Extract the article id from the url
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id,title=Ttitle, content=Tcontent, publish_datetime=Ttime, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article

            # Third page layout
            elif main2 is not None:
                self.logger.debug(u'Using the third page layout')
                # Title
                Ttitle = main2.find('div', attrs={'class': "second-title"})

                if Ttitle is None:
                    self.logger.error('[SinaNews] Missing title; the page layout may have changed, stop crawling this article')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # Publish time
                Ttime = main2.find('span', attrs={'class': "date"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D (\d{2}:\d{2}).*', Ttime)[0]
                    Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3]
                else:
                    self.logger.error('[SinaNews] Missing publish time; the page layout may have changed, stop crawling this article')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # Author
                Tauthor = main2.find('a', attrs={'class': "source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = None

                # Content
                Tcontent = main2.find('div', attrs={'id': "article"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error('[SinaNews] Missing content; the page layout may have changed, stop crawling this article')
                    return

                # Comment count
                try:
                    channel = re.findall(r"channel: '(.*)',", html['html'])[0]
                    newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
                    data = {
                        'format': 'js',
                        'channel': channel,
                        'newsid': newsid,
                        'group': '',
                        'compress': '1',
                        'ie': 'gbk',
                        'oe': 'gbk',
                        'page': '1',
                        'page_size': '20'
                    }
                    re_url = 'http://comment5.news.sina.com.cn/page/info'
                    html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False)
                    html1 = re.sub(r'(.*=)\{', '{', html1)
                    html1 = json.loads(html1)
                    totalcount = html1['result']['count']['show']
                    Treply = totalcount
                except:
                    Treply = None

                # Extract the article id from the url
                articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0]

                article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor)
                article.statistics.reply_count = Treply
                return article

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass
    
    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        pass
        
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext indicates whether more comments remain to be crawled
        '''
        html = self.session.download(article.url, encoding='utf-8', data=False, timeout=10, retry=3, addr=True)
        channel = re.findall(r"channel: '(.*)',", html['html'])[0]
        newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        page = 1
        self.logger.info(article.url)
        try:
            while page < 30:
                data ={
                    'channel': channel,
                    'newsid': newsid,
                    'group': '',
                    'compress': '1',
                    'ie': 'gbk',
                    'oe': 'gbk',
                    'page': page,
                    'page_size': '20'
                }
                re_url = 'http://comment5.news.sina.com.cn/page/info'
                html1 = self.session.download(url=re_url, encoding='utf-8', data=data, timeout=10, retry=3, addr=True)
                html1 = html1["html"]
                html1 = re.sub(r'(.*=)\{', '{', html1)
                html1 = json.loads(html1)
                totalcount = html1['result']['count']['show']
                if totalcount == 0:
                    break
                cmntlist = html1["result"]["cmntlist"]
                for i in cmntlist:
                    cid = i["mid"]
                    user_id = i["uid"]
                    user_name = i["nick"]
                    user_ip = i["ip"]
                    publish_datetime = i["time"]
                    like_count = i["agree"]
                    content = i["content"]
                    commentList.append(Comment(article.tid, self.channel.channel_id, cid,add_datetime, publish_datetime, user_ip, None, None, None,user_id, user_name,content,None, None, like_count, None, None))

                totalpage = math.ceil(totalcount / 20.0)

                if totalpage < page:
                    break
                page = page + 1
        except:
            self.logger.error('Fail to parse comment:' + traceback.format_exc())
        finally:
            return (commentList, False)
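
The Sina comment endpoint returns its JSON body prefixed with a JavaScript assignment (roughly varname={...}), which the code above normalizes with re.sub before json.loads. A standalone helper that captures the same trick is sketched below; the function name is illustrative, not part of the original code.

import json
import re


def parse_sina_comment_payload(raw):
    # Strip everything up to the '{' that opens the JSON object, then parse it.
    return json.loads(re.sub(r'(.*=)\{', '{', raw))

# Example: parse_sina_comment_payload('var data={"result": {"count": {"show": 3}}}')
# returns {'result': {'count': {'show': 3}}}.
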
Example #6
0
class ZhihuCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = SessionCrawler()
        self.channel = channel
        self.nextCommentUrl = None

    def searchArticle(self, keywordList, endTime):
        '''
        Search for articles by a list of keywords within the configured time range
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for baiduUrl in urls:
            url = self.__fetchRealUrlFromBaiduUrl(baiduUrl)
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                # The same article may appear multiple times in the search results; Baidu result urls are unique but may point to the same article, so deduplicate
                articleList.append(article)
        return articleList

    def __fetchRealUrlFromBaiduUrl(self, baiduUrl):
        '''
        Follow Baidu's 302 redirect to obtain the real target url
        '''
        response = self.session.session.get(baiduUrl, allow_redirects=False)
        if response.status_code == 302:
            return response.headers['Location']

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        # Check the url format: results from Baidu are not necessarily questions, and only questions are crawled for now
        if url.find('question') < 0:
            self.logger.warn('Question supported only:%s', url)
            return None
        article_id = re.findall(r'question/(\d+)', url)[0]
        self.session.randomSleep()
        response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
        soup = BeautifulSoup(response)
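        # Zhihu question pages embed the full page state as JSON in the data-state
        # attribute of <div id="data">; the question lives under
        # entities.questions[<question id>]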
        main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
        articleJson = json.loads(main)
        questionJson = articleJson['entities']['questions'][article_id]
        title = questionJson['title']
        contentSoup = BeautifulSoup(questionJson['editableDetail'])
        content = contentSoup.text
        author_id = questionJson['author']['id']
        author_name = questionJson['author']['name']
        createTimeInFloat = questionJson['created']
        publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(createTimeInFloat))
        reply_count = questionJson['commentCount']
        read_count = questionJson['visitCount']
        collect_count = questionJson['followerCount']
        article = Article(article_id, self.channel.channel_id, title, content,
                          publish_datetime, url, author_id, author_name)
        article.statistics.reply_count = reply_count
        article.statistics.read_count = read_count
        article.statistics.collect_count = collect_count
        return article

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        articleCopy = self.crawlArticle(article.url)
        article.statistics.reply_count = articleCopy.statistics.reply_count
        article.statistics.read_count = articleCopy.statistics.read_count
        article.statistics.collect_count = articleCopy.statistics.collect_count

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        self.nextCommentUrl = None

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext indicates whether more comments remain to be crawled
        '''
        if self.nextCommentUrl is None:
            curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
            curl = curl.replace('#', '%')
        else:
            curl = self.nextCommentUrl
        self.session.randomSleep()
        result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
        jo = json.loads(result)
        paging = jo['paging']
        hasnext = not paging['is_end']
        self.nextCommentUrl = paging['next']
        dataList = jo['data']
        add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        commentList = list()
        for data in dataList:
            #self.logger.debug('[Zhihu]Comment data keys:%s', data.keys())
            #self.logger.debug('[ZHIHU]Comment url for %s:%s', article.title, data['url'])
            publish_datetime = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(data['updated_time']))
            comment = Comment(article.tid,
                              article.channel_id,
                              data['id'],
                              add_datetime,
                              publish_datetime,
                              ip_address=None,
                              location_country=None,
                              location_region=None,
                              location_city=None,
                              author_id=data['author']['id'],
                              author_name=data['author']['name'],
                              content=data['content'],
                              reply_author_id=None,
                              read_count=None,
                              like_count=data['voteup_count'],
                              reply_count=data['comment_count'],
                              dislike_count=None)
            commentList.append(comment)
        return (commentList, hasnext)
Example #7
0
class NeteaseNewsCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = self.baiduCrawler.session
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        Search for articles by a list of keywords within the configured time range
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for url in urls:
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                # The same article may appear multiple times in the search results; Baidu result urls are unique but may point to the same article, so deduplicate
                articleList.append(article)
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
        try:
            self.logger.info("[crawlStatistics]" + article.tid)
            if len(article.tid) != 16:
                articleid = article.tid[3:len(article.tid) - 2]
            else:
                articleid = article.tid
            re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
            html1 = self.session.download(url=re_url,
                                          encoding='utf-8',
                                          data=None,
                                          isJson=True,
                                          timeout=10,
                                          retry=3)
            article.statistics.reply_count = html1["tcount"]
        except:
            self.logger.error('[NeteaseStatistics]url:' + article.url + ', tid:' +
                              article.tid + ', ' + traceback.format_exc())
            return

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
        html = self.session.download(url,
                                     encoding='gbk',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(
                r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.info(article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('div', attrs={'class': "post_content_main"})
            main1 = soup.find('div', attrs={'class': "ep-content-main"})

            # First page layout
            if main is not None:

                # Title
                Ttitle = main.find('h1')
                if Ttitle is None:
                    self.logger.error('[NeteaseNews] Missing title; the page layout may have changed, stop crawling this article')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # Publish time
                Ttime = main.find('div', attrs={'class': "post_time_source"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                       Ttime)[0]
                else:
                    self.logger.error('[NeteaseNews] Missing publish time; the page layout may have changed, stop crawling this article')
                    return

                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # Author
                Tauthor = main.find('div', attrs={'class': "post_time_source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None

                # Content
                Tcontent = main.find('div', attrs={'class': "post_text"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                    dr = re.compile(r'<[^>]+>', re.S)
                    Tcontent = dr.sub('', Tcontent)
                else:
                    self.logger.error('[NeteaseNews] Missing content; the page layout may have changed, stop crawling this article')
                    return

                # Comment count
                articleid = ""
                try:
                    articleid = re.findall(r'"docId" : "(.*)",',
                                           html['html'])[0]
                    re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
                    html1 = self.session.download(url=re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  isJson=True,
                                                  timeout=10,
                                                  retry=3)
                    Treply = html1["tcount"]
                except:
                    Treply = None
                    self.logger.error('[NeteaseComment]url:' + article_url +
                                      ', tid:' + articleid + ', %s' +
                                      traceback.format_exc())
                finally:
                    article = Article(tid=articleid,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      content=Tcontent,
                                      publish_datetime=Ttime,
                                      url=article_url,
                                      author_name=Tauthor)
                    article.statistics.reply_count = Treply
                    return article

            # Second page layout
            elif main1 is not None:

                # Title
                Ttitle = main1.find('h1')
                if Ttitle is None:
                    self.logger.error('[NeteaseNews] Missing title; the page layout may have changed, stop crawling this article')
                    return
                else:
                    Ttitle = Ttitle.text.strip()

                # Publish time
                Ttime = main1.find('div',
                                   attrs={'class': "ep-time-source cDGray"})
                Ttime1 = main1.find('div', attrs={'class': "ep-info cDGray"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                elif Ttime1 is not None:
                    Ttime = Ttime1.text.strip()
                    Ttime = re.findall(
                        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                else:
                    self.logger.error('[NeteaseNews] Missing publish time; the page layout may have changed, stop crawling this article')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'

                # Author
                Tauthor = main1.find('div',
                                     attrs={'class': "ep-time-soure cDGray"})
                Tauthor1 = main1.find('div',
                                      attrs={'class': "ep-source cDGray"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None
                elif Tauthor1 is not None:
                    Tauthor = Tauthor1.find('span')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                        print Tauthor
                        Tauthor = re.findall(r'来源:(.*)"', Tauthor)[0]
                    else:
                        Tauthor = None
                else:
                    Tauthor = None

                # Content
                Tcontent = main1.find('div', attrs={'id': "endText"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                    dr = re.compile(r'<[^>]+>', re.S)
                    Tcontent = dr.sub('', Tcontent)
                else:
                    self.logger.error('[NeteaseNews] Missing content; the page layout may have changed, stop crawling this article')
                    return

                # Comment count
                try:
                    articleid = re.findall(r'"docId" : "(.*)",',
                                           html['html'])[0]
                    re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
                    html1 = self.session.download(re_url,
                                                  encoding='utf-8',
                                                  data=None,
                                                  isJson=True,
                                                  timeout=10,
                                                  retry=3)
                    Treply = html1['tcount']
                except:
                    Treply = None
                    self.logger.error('[NeteaseComment]url:' + article_url +
                                      ', tid:' + articleid + ', %s' +
                                      traceback.format_exc())
                finally:
                    article = Article(tid=articleid,
                                      channel_id=self.channel.channel_id,
                                      title=Ttitle,
                                      content=Tcontent,
                                      publish_datetime=Ttime,
                                      url=article_url,
                                      author_name=Tauthor)
                    self.logger.debug("[crawlArticle]" + article.tid)
                    article.statistics.reply_count = Treply
                    return article

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawling state
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return them
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment
                 instances, hasnext indicates whether more comments remain to crawl
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
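        # The comment endpoint expects a 16-character thread id; tids of other
        # lengths are trimmed (3 leading and 2 trailing characters dropped) first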
        if len(article.tid) != 16:
            articleid = article.tid[3:len(article.tid) - 2]
        else:
            articleid = article.tid
        re_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid + '/comments/newList'
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        page = 0
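        # Page through the comment API: 'offset' advances by the page size (30)
        # per request, with 750 as a hard cap to keep one article's crawl bounded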
        while page < 750:
            data1 = {
                'offset': page,
                'limit': 30,
                'showLevelThreshold': 72,
                'headLimit': 1,
                'tailLimit': 2,
                'ibc': 'newspc'
            }
            try:
                html1 = self.session.download(url=re_url,
                                              encoding='utf-8',
                                              cookies=cookies,
                                              data=data1,
                                              timeout=10,
                                              retry=3)
                html1 = json.loads(html1)
                totalcount = html1["newListSize"]

                if totalcount == 0:
                    break
                for i in html1['comments'].itervalues():
                    cid = i['commentId']
                    user_id = i['user']['userId']
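                    # userId 0 is treated as an anonymous commenter with no nickname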
                    if user_id == 0:
                        user_name = ''
                    else:
                        user_name = i['user']['nickname']
                    user_ip = ''
                    # Split the location string into country/region/city parts by
                    # replacing common administrative suffixes with a separator
                    location = i['user']['location']
                    for suffix in (u'市', u'自治', u'新区', u'区', u'洲'):
                        location = location.replace(suffix, ':')
                    location_list = location.split(':')

                    location_country = location_list[0]
                    if len(location_list) > 1:
                        location_region = location_list[1]
                    else:
                        location_region = ''
                    if len(location_list) > 2:
                        location_city = location_list[2]
                    else:
                        location_city = ''
                    publish_datetime = i['createTime']
                    like_count = i['vote']
                    unlike_count = i['against']
                    # Strip any HTML tags from the comment body
                    dr = re.compile(r'<[^>]+>', re.S)
                    content = dr.sub('', i['content'])
                    commentList.append(
                        Comment(articleid, self.channel.channel_id, cid,
                                add_datetime, publish_datetime, user_ip,
                                location_country, location_region,
                                location_city, user_id, user_name, content,
                                None, None, like_count, None, unlike_count))
                # print page, totalcount
                if page > int(totalcount):
                    break
                page = page + 30
            except:
                self.logger.error('[NeteaseComment]url:' + article.url +
                                  ', tid:' + article.tid + ', ' +
                                  traceback.format_exc())
                break
        # No incremental paging across calls is implemented, so hasnext is always False
        return (commentList, False)
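
# A minimal, illustrative usage sketch. The class name, the constructor signature
# and searchArticle() are assumed to follow the same conventions as the other
# crawlers in this project; adjust the names to the actual definitions.
#
#   crawler = NeteaseNewsCrawler(channel)
#   articles = crawler.searchArticle([u'关键词'], datetime.datetime.now())
#   for art in articles:
#       commentList, hasnext = crawler.crawlComment(art)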