Example #1
class WechatCrawler(object):
    '''
    Crawler for WeChat articles found through the Sogou WeChat search
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.session_public = SessionCrawler(logger=self.logger)
        self.session_public_article = SessionCrawler(logger=self.logger)
        self.channel = channel
        self.entityId = 'SYSU'

        self.ip_list = None
        self.proxies = None
        self.monitor_title = '微信爬虫监控'
        self.email = SendEmail()
        self.db = InsertDB()

    def searchArticle(self, keywordList, endTime):
        '''
        Crawl articles within the configured search time range for the given keywords
        @param keywordList: list of keywords
        @param endTime: end time of the search
        '''
        run_msg = '微信爬虫开始运行'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeStr = startTime.strftime('%Y-%m-%d')
        endTimeStr = endTime.strftime('%Y-%m-%d')

        # startTime = endTime - datetime.timedelta(days=1)
        # startTimeStr = startTime.strftime('%Y-%m-%d')
        # endTimeStr=startTime.strftime('%Y-%m-%d')

        self.logger.debug('startTime:%s', startTimeStr)
        self.logger.debug('endTime:%s', endTimeStr)

        # Randomly pick a proxy
        # proxy_crawler = ProxyCrawler()
        # proxies = proxy_crawler.get_random_proxy()

        # publicList = self.getPublic()
        # articleList = self.searchPublic(publicList)
        articleList = list()
        urlList = list()

        for keyword in keywordList:
            # Skip the first request; it does not carry a time range
            pageUrl = (SOUGO_WEIXIN_URL_INIT % (urllib.quote(
                keyword.encode('utf-8')), int(time.time() * 1000))).replace(
                    '#', '%')
            self.logger.debug('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            lastPageUrl = pageUrl
            # Crawl WeChat content within the search time range
            pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote(
                keyword.encode('utf-8')), startTimeStr, endTimeStr)
            self.logger.debug('pageUrl:%s', pageUrl)
            # Fetch the proxy IP queue
            self.ip_list = getIp()
            ip = self.ip_list.dequeue()
            self.proxies = {"http": "http://" + ip}

            while True:
                # proxies = {"http": "http://" + ip}
                headers = HEADERS_SOGOU.copy()
                headers['Referer'] = lastPageUrl
                try:
                    response = self.session.get(pageUrl,
                                                allow_redirects=False,
                                                headers=headers,
                                                proxies=self.proxies)
                    soup = BeautifulSoup(response, 'lxml')
                    main = soup.find('ul', {'class': "news-list"})
                    while True:
                        if main is None:
                            # self.logger.error('Fail to parse: ip被封,更新ip')
                            content = 'ip被封,更新ip'
                            # self.email.send(self.monitor_title, content)
                            self.db.Insert(self.channel.channel_id,
                                           self.entityId, content)
                            temp = self.ip_list.dequeue()
                            if self.ip_list.isempty():
                                self.ip_list = getIp()
                            self.proxies = {"http": "http://" + temp}
                            # while True:
                            #     try:
                            response = self.session.get(pageUrl,
                                                        allow_redirects=False,
                                                        headers=headers,
                                                        proxies=self.proxies)
                            soup = BeautifulSoup(response, 'lxml')
                            main = soup.find('ul', {'class': "news-list"})
                            #     break
                            # except:
                            #     ip_unuseful_content = '此ip是不合格的ip,更新ip'
                            #     # self.email.send(self.monitor_title, ip_unuseful_content)
                            #     self.db.Insert(self.channel.channel_id,self.entityId,ip_unuseful_content)
                            #     tmp = self.ip_list.dequeue()
                            #     if self.ip_list.isempty():
                            #         self.ip_list = getIp()
                            #     self.proxies = {"http": "http://" + tmp}
                        else:
                            break

                    li_list = main.findAll('li')
                    # li_list may be empty even when more pages remain
                    for li in li_list:
                        a_list = li.findAll('a')
                        try:
                            publish_datetime = li.select_one('.s-p').get('t')
                            publish_datetime = time.strftime(
                                '%Y-%m-%d %H:%M:%S',
                                time.localtime(int(publish_datetime)))
                        except Exception as e:
                            self.logger.debug(
                                'Publish_datetime crawl failed, use now time')
                            publish_datetime = datetime.datetime.now(
                            ).strftime('%Y-%m-%d %H:%M:%S')
                        self.logger.debug(publish_datetime)

                        for a in a_list:
                            if a['uigs'].startswith('article_title'):
                                # self.logger.debug('Article title:%s',a.text)
                                urlList.append((a['href'], pageUrl, a.text,
                                                publish_datetime))
                                break
                    pageBarList = soup.findAll('div',
                                               {'id': 'pagebar_container'})
                    if len(pageBarList) == 0:
                        # No pagination bar, stop here
                        break
                    pageBar = pageBarList[0]
                    aList = pageBar.findAll('a')
                    foundNextPage = False
                    for a in aList:
                        if a['uigs'] == 'page_next':
                            foundNextPage = True
                            lastPageUrl = pageUrl
                            pageUrl = SOGOU_URL + a['href']
                            self.logger.debug('Found next page:%s', a.text)
                            break
                    if foundNextPage is False:
                        break
                except:
                    ip_unuseful_content = '此ip是不合格的ip,更新ip'
                    # self.email.send(self.monitor_title,ip_unuseful_content)
                    self.db.Insert(self.channel.channel_id, self.entityId,
                                   ip_unuseful_content)
                    tmp = self.ip_list.dequeue()
                    if self.ip_list.isempty():
                        self.ip_list = getIp()
                    self.proxies = {"http": "http://" + tmp}
        for item in urlList:
            article = self.crawlArticle(item[0],
                                        referer=item[1],
                                        title=item[2],
                                        publish_datetime=item[3],
                                        flag=0)

            if article is not None:
                if article not in articleList:
                    # The same article may appear several times in the search results; as with Baidu, distinct result URLs may point to the same article, so deduplicate
                    articleList.append(article)
        if not articleList:
            monitor_content = '微信没有数据,或者微信爬虫挂了'
            self.email.send(self.monitor_title, monitor_content)
            self.db.Insert(self.channel.channel_id, self.entityId,
                           monitor_content)
        end_msg = '微信爬虫结束'
        self.db.Insert(self.channel.channel_id, self.entityId, end_msg)
        return articleList

    def searchPublic(self, keywordList):
        '''
        Search WeChat official accounts for the given keywords
        :param keywordList: list of keywords (keyword[0] holds the account name)
        :return: list of articles from the matching accounts
        '''
        articleList = list()

        for keyword in keywordList:
            self.logger.debug(keyword)
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword[0])
            self.logger.info('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('ul', {'class': "news-list2"})

            if main is None:
                self.logger.error('Fail to parse:%s', response.text)
            try:
                li_list = main.findAll('li')
            except Exception as e:
                self.logger.error(e)
                continue

            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    if title == keyword[0]:
                        self.logger.debug(title)
                        a_href = a_title.find('a')['href']
                        sub_articleList = self.crawlWetchartpublic(a_href)
                        for article in sub_articleList:
                            articleList.append(article)
        return articleList
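
A minimal usage sketch for Example #1, assuming the project's helpers used above (SessionCrawler, InsertDB, SendEmail, Logging, getIp and the SOUGO_* constants) are importable. DummyChannel is a hypothetical stand-in exposing only the attributes searchArticle actually reads (channel_id and search_ranges):

# Hypothetical driver, not part of the original project
import datetime

class DummyChannel(object):
    channel_id = 'wechat_demo'   # assumed identifier
    search_ranges = 1            # crawl articles from the last day

crawler = WechatCrawler(DummyChannel())
articles = crawler.searchArticle([u'example keyword'], datetime.datetime.now())
print 'Got %d articles' % len(articles)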
Example #2
class WeiboCrawler(object):
    '''
    Crawler for Sina Weibo search results
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        self.entityId = 'SYSU'
        # self.user_name = "*****@*****.**"
        # self.session = SessionCrawler(sleepRange=[3,8])
        # self.pass_word = "810214bee810214"
        # self.user_name = "15088137907"
        # self.pass_word = "4p2yhynrb7"
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        self.logger.info('username: %s' % self.user_name)

        self.email = SendEmail()
        self.db = InsertDB()
        self.monitor_title = '微博爬虫监控'
        self.proxies = ''

        self.session = SessionCrawler(sleepRange=[3, 8])

        # user_name_password3 = '15767199023:j980216'
        # user_name_password1 = '13427287354:4ova7zixzj'
        # user_name_password2 = '13532011721:1emr41761u'
        # user_name_password3 = '13640792755:1eek9uuym4'
        # user_name_password4 = '13697726577:7hviv4old0'####
        # user_name_password5 = '13794342903:6imuw2cdya'

        # Weibo accounts on machine 197
        # user_name_password1 = '17825769929:4ms7e2v3zx'
        # user_name_password2 = '18211493432:7fagvqyi9p'
        # user_name_password3 = '17827278983:0nenzag325'
        # user_name_password4 = '13922771190:5aqa10wvwf'
        # user_name_password5 = '15999916968:2i45j5b49y'

        # 15119820746 - ---0htkvsq5h6
        # 15986585396 - ---5gsmhx3e8k
        # 13430915912 - ---8s1nif2d50
        # 15012471375 - ---3qwlffw8vv
        # 17880567972 - ---6jrlzr2fqe
        # 17876156948 - ---5g5w4i43f3
        # 15915132451 - ---2rl2v9hy9t
        # 13543985544 - ---8x0pqi3as7
        # 13717382951 - ---5p2d39l19r
        # 13640695490 - ---6nxc4vou4o
    def change_cookie(self):
        '''
        Randomly pick a username whose locally cached cookie will be used
        :return: username
        '''
        # usename_list = [
        #             '18814095644','13432881156','*****@*****.**','15018377821','*****@*****.**',
        #             '15767199023','13427287354','13532011721','13640792755','13794342903',
        #             '17825769929','18211493432','17827278983','13922771190','15999916968',
        #             '15119820746','15986585396','13430915912','15012471375','17880567972',
        #             '17876156948','15915132451','13543985544','13717382951','13640695490',
        #             '15711707673','13680181412','13414759320','17820956139','18476072534',
        #             '17806699214','13418852766','17827181603','15919354070','15088137907'
        #                ]
        usename_list = [
            '18814095644',
            '13432881156',
            '*****@*****.**',
            '15018377821',
            '*****@*****.**',
        ]
        usename = random.choice(usename_list)

        return usename

    def get_username_password(self):
        '''
        Randomly pick a username/password pair
        :return: [username, password]
        '''
        user_name_password1 = '18814095644:ljda.18814095644'
        user_name_password2 = '13432881156:liang452035397'
        user_name_password3 = '[email protected]:810214bee810214'
        user_name_password4 = '15018377821:zzm15331411'
        user_name_password5 = '15767199023:j980216'
        user_name_password6 = '[email protected]:uwinvip'

        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5, user_name_password6
        ]

        user_choice = random.choice(user_list)
        user_name_password = user_choice.split(':')
        return user_name_password

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles for the given keywords within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        run_msg = '微博爬虫开始运行'
        self.db.Insert(self.channel.channel_id, self.entityId, run_msg)
        startTime = endTime - datetime.timedelta(hours=2)
        # startTime=datetime.datetime(2017,11,20,23)
        page = 1
        articleList = list()
        hasnext = True
        while hasnext:
            data = self.__searchByPage(keywordList, startTime, endTime, page)

            (articleListInPage, hasnext) = self.__parseSearchPage(data)

            articleList.extend(articleListInPage)
            page += 1
        if not articleList:
            article_msg = '微博没有爬取到数据'
            self.email.send(self.monitor_title, article_msg)
            self.db.Insert(self.channel.channel_id, self.entityId, article_msg)
        end_msg = '微博爬虫结束'
        self.db.Insert(self.channel.channel_id, self.entityId, end_msg)
        return articleList

    def __searchByPage(self, keywordList, startTime, endTime, page):
        query = urllib.quote(' '.join(keywordList).encode('utf-8'))
        params = {
            'typeall': '1',
            'suball': '1',  # include everything
            'timescope': 'custom:%s:%s' % (startTime.strftime("%Y-%m-%d"),
                                           endTime.strftime("%Y-%m-%d")),  # time range
            # Weibo search only accepts dates here; %Y-%m-%d-%H is not allowed
            'Refer': 'SWeibo_box',
            'page': page
        }
        user_agent = random.choice(AGENTS)
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            # 'Host': 's.weibo.com',
            # 'Referer': 'http://s.weibo.com/',
            'User-Agent': user_agent
        }
        index_url = 'http://s.weibo.com/weibo/' + query  # search landing page

        usename_cookie = self.change_cookie()
        self.logger.debug('Use Cookie %s' % usename_cookie)
        try:
            cookies = self.__load_cookies_from_lwp(usename_cookie)
            html = self.session.get(index_url,
                                    params=params,
                                    headers=headers,
                                    cookies=cookies)  # use locally cached cookies
            lindex = html.find(
                '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"'
            )
            rindex = html[lindex:].find('</script>')
            rindex = lindex + rindex - 1

            lindex = lindex + len(
                '<script>STK && STK.pageletM && STK.pageletM.view(')
            jo = json.loads(html[lindex:rindex])
            data = jo['html']  # real-time feed HTML
            self.logger.debug('Get data')
            return data
        except Exception as e:
            self.logger.debug('ERROR %s' % e)
            loginFlag = self.__login()
            self.logger.debug('Use username: %s' % self.user_name)
            if loginFlag is False:
                self.logger.error('Fail to logon')
                login_msg = '微博登录失败'
                self.email.send(self.monitor_title, login_msg)
                self.db.Insert(self.channel.channel_id, self.entityId,
                               login_msg)
                return

            cookies = self.__load_cookies_from_lwp(self.user_name)
            self.logger.debug('Get a new Cookie: %s' % cookies)
            html = self.session.get(index_url,
                                    params=params,
                                    headers=headers,
                                    cookies=cookies)  # use locally cached cookies
            lindex = html.find(
                '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"'
            )
            rindex = html[lindex:].find('</script>')
            rindex = lindex + rindex - 1
            lindex = lindex + len(
                '<script>STK && STK.pageletM && STK.pageletM.view(')
            jo = json.loads(html[lindex:rindex])
            data = jo['html']  # real-time feed HTML
            return data

            # self.logger.warning('Crawler failed: %s' % e)
            # msg = '没有获取到json数据,说明微博爬虫挂了'
            # self.email.send(self.monitor_title,msg)
            # self.db.Insert(self.channel.channel_id,self.entityId,msg)

    def __parseSearchPage(self, data):
        '''
        @return: (articleList,hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        self.logger.info(soup)
        # check if no result
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)

        # find page bar to check if more

        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')

        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        # self.logger.debug(root_1)
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1  %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            # self.logger.debug(article_url)
            root_content = root_2.find('p', {'class': "comment_txt"})

            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = '  原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception as e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
                content = content + link_content
                # self.logger.error(content)

            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']

            try:
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)

            # fetch statistics
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)
            # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \
            #     publish_datetime, reply_count, read_count, like_count, collect_count, forward_count
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
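
The try block in __searchByPage above slices the embedded STK.pageletM.view(...) payload out of the raw search HTML before handing its 'html' field to __parseSearchPage. A standalone sketch of that slicing on a made-up miniature response, so the index arithmetic can be checked in isolation:

import json

# Hypothetical miniature of a s.weibo.com search response
html = ('<script>STK && STK.pageletM && STK.pageletM.view('
        '{"pid":"pl_weibo_direct","html":"<div>feed markup</div>"})</script>')

lindex = html.find('<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"')
rindex = lindex + html[lindex:].find('</script>') - 1   # stop before the closing ')'
lindex = lindex + len('<script>STK && STK.pageletM && STK.pageletM.view(')
jo = json.loads(html[lindex:rindex])
print jo['html']                                         # the feed markup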
Example #3
class WeiboCrawler(object):
    '''
    Crawler for Sina Weibo search results
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.channel = channel
        # self.user_name = "*****@*****.**"
        # self.session = SessionCrawler(sleepRange=[3,8])
        # self.pass_word = "810214bee810214"
        # self.user_name = "15018377821"
        # self.pass_word = "zzm15331411"
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        self.logger.info('username: %s' % self.user_name)

        self.session = SessionCrawler(sleepRange=[3, 8])

    def get_username_password(self):
        '''
        Randomly pick a username/password pair
        :return: [username, password]
        '''
        user_name_password1 = '18814095644:ljda.18814095644'
        user_name_password2 = '13432881156:liang452035397'
        user_name_password3 = '[email protected]:810214bee810214'
        user_name_password4 = '15018377821:zzm15331411'
        user_name_password5 = '[email protected]:uwinvip'
        user_name_password6 = '15767199023:j980216'

        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5, user_name_password6
        ]

        user_choice = random.choice(user_list)
        user_name_password = user_choice.split(':')
        return user_name_password

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles for the given keywords within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''

        # time_now = time.time()
        # date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        # com_time_low = time.mktime(time.strptime(date + ' 07:00:00', '%Y-%m-%d %H:%M:%S'))
        # com_time_hig = time.mktime(time.strptime(date + ' 07:59:58', '%Y-%m-%d %H:%M:%S'))
        # if time_now > com_time_low and time_now < com_time_hig:
        # loginFlag = self.__login()
        # if loginFlag is False:
        #     self.logger.error('Fail to logon')
        #     return

        startTime = endTime - datetime.timedelta(hours=2)
        # startTime=datetime.datetime(2017,11,20,23)
        page = 1
        articleList = list()
        hasnext = True
        while hasnext:
            data = self.__searchByPage(keywordList, startTime, endTime, page)
            (articleListInPage, hasnext) = self.__parseSearchPage(data)

            articleList.extend(articleListInPage)
            page += 1
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics for an article
        @return: nothing; the statistics are written into the article instance
        '''
        # return
        try:
            (data, check) = self.__fetchSingleArticle(article)

            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'要访问的网页404了:%s', article.url)
                return
            else:
                self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s', article.url)
                return
        except:
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __save_cookies_lwp(self, cookiejar):
        """
        保存cookies到本地
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self):
        """
        读取本地cookies
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        lwp_cookiejar.load(filename, ignore_discard=True)
        # self.logger.debug(lwp_cookiejar)
        return lwp_cookiejar

    def __parseStatistics(self, article, soup_li_list):
        # new page layout
        collect_count = soup_li_list[0].find('span').text
        collect_count = re.findall(r'\d+', collect_count)
        if len(collect_count) > 0:
            collect_count = int(collect_count[0])
        else:
            collect_count = 0
        forward_count = soup_li_list[1].find('span').text

        forward_count = re.findall(r'\d+', forward_count)
        if len(forward_count) > 0:
            forward_count = int(forward_count[0])
        else:
            forward_count = 0

        reply_count = soup_li_list[2].find('span').text
        reply_count = re.findall(r'\d+', reply_count)
        if len(reply_count) > 0:
            reply_count = int(reply_count[0])
        else:
            reply_count = 0

        like_count = soup_li_list[3].find('span').text
        like_count = re.findall(r'\d+', like_count)
        if len(like_count) > 0:
            like_count = int(like_count[0])
        else:
            like_count = 0
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count

    def __parseSearchPage(self, data):
        '''
        @return: (articleList,hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        # self.logger.info(soup)
        #check if no result
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)

        #find page bar to check if more

        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')

        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        # self.logger.debug(root_1)
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1  %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            # self.logger.debug(article_url)
            root_content = root_2.find('p', {'class': "comment_txt"})

            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = '  原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception as e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
                content = content + link_content
                # self.logger.error(content)

            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']

            try:
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)

            #fetch statistics
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)
            # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \
            #     publish_datetime, reply_count, read_count, like_count, collect_count, forward_count
            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
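
Example #3's __save_cookies_lwp / __load_cookies_from_lwp persist the logged-in session's cookies as an LWP file so later runs can reuse them instead of logging in again. A compact round-trip sketch of the same idea; the requests session and the httpbin URL are stand-ins for the crawler's own SessionCrawler and weibo.com:

import cookielib
import requests

filename = 'sinaweibocookies'                           # same file name as the crawler
session = requests.Session()
session.get('http://httpbin.org/cookies/set?demo=1')    # any response that sets a cookie

# save: requests stores cookielib.Cookie objects, so they drop straight into an LWP jar
lwp_jar = cookielib.LWPCookieJar()
for c in session.cookies:
    lwp_jar.set_cookie(c)
lwp_jar.save(filename, ignore_discard=True)             # keep session cookies too

# load: a later run rebuilds the jar from disk and passes it as cookies=...
restored = cookielib.LWPCookieJar()
restored.load(filename, ignore_discard=True)
print [c.name for c in restored]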
Example #4
class WeiboCrawler(object):
    '''
    Crawler for Sina Weibo search results
    '''
    def __init__(self, channel=None, logger=None):
        '''
        Constructor
        '''
        # if logger is None:
        #     self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        # else:
        #     self.logger = logger
        self.logger = logging.getLogger()

        self.channel = channel
        self.user_name_password = self.get_username_password()
        self.user_name = self.user_name_password[0]
        self.pass_word = self.user_name_password[1]
        # self.logger.info('username: %s' % self.user_name)
        print 'username: %s' % self.user_name

        self.session = SessionCrawler(sleepRange=[3, 8])

    def get_username_password(self):
        '''
        Randomly pick a username/password pair
        :return: [username, password]
        '''
        # Weibo accounts on supercomputing node 1
        # user_name_password1 = '18814095644:ljda.18814095644'
        # user_name_password2 = '13432881156:liang452035397'
        # user_name_password3 = '[email protected]:810214bee810214'
        # user_name_password4 = '15018377821:zzm15331411'
        # user_name_password5 = '[email protected]:uwinvip'

        # Weibo accounts on supercomputing node 2
        # user_name_password1 = '13432881156:liang452035397'
        # user_name_password2 = '[email protected]:uwinvip'
        # user_name_password3 = '15767199023:j980216'
        # user_name_password1 = '13427287354:4ova7zixzj'
        # user_name_password2 = '13532011721:1emr41761u'
        # user_name_password3 = '13640792755:1eek9uuym4'
        # user_name_password4 = '13697726577:7hviv4old0'
        # user_name_password5 = '13794342903:6imuw2cdya'

        # Weibo accounts on machine 197
        user_name_password1 = '17825769929:4ms7e2v3zx'
        user_name_password2 = '18211493432:7fagvqyi9p'
        user_name_password3 = '17827278983:0nenzag325'
        user_name_password4 = '13922771190:5aqa10wvwf'
        user_name_password5 = '15999916968:2i45j5b49y'

        user_list = [
            user_name_password1, user_name_password2, user_name_password3,
            user_name_password4, user_name_password5
        ]

        # user_list = [user_name_password1, user_name_password2, user_name_password3]

        user_choice = random.choice(user_list)
        user_name_password = user_choice.split(':')
        return user_name_password

    # Randomly pick a User-Agent
    def get_random_agent(self):
        user_agent = [
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;  Trident/5.0)",
        ]
        random_header = random.choice(user_agent)
        return random_header

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles for the given keywords within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        self.__login()
        # startTime = endTime - datetime.timedelta(hours=2)
        # # startTime=datetime.datetime(2017,11,20,23)
        # page = 1
        # articleList = list()
        # hasnext = True
        # while hasnext:
        #     data = self.__searchByPage(keywordList, startTime,endTime, page)
        #     (articleListInPage,hasnext) = self.__parseSearchPage(data)
        #
        #     articleList.extend(articleListInPage)
        #     page+=1
        # return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics for an article
        @return: nothing; the statistics are written into the article instance
        '''
        # return
        try:
            (data, check) = self.__fetchSingleArticle(article)

            if check == '0':
                soup = BeautifulSoup(data, 'lxml')
                ulList = soup.findAll(
                    'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'})
                li_list = ulList[0].findAll('li')
                self.__parseStatistics(article, li_list)
            elif check == '1':
                self.logger.warning(u'要访问的网页404了:%s', article.url)
                return
            else:
                self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s', article.url)
                return
        except:
            self.logger.error('Fail to fetch statistics for:%s, %s',
                              article.url, traceback.format_exc())
            return

    def __save_cookies_lwp(self, cookiejar):
        """
        保存cookies到本地
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        for c in cookiejar:
            args = dict(vars(c).items())
            args['rest'] = args['_rest']
            del args['_rest']
            c = cookielib.Cookie(**args)
            lwp_cookiejar.set_cookie(c)
        lwp_cookiejar.save(filename, ignore_discard=True)

    def __load_cookies_from_lwp(self):
        """
        读取本地cookies
        """
        filename = 'sinaweibocookies'
        lwp_cookiejar = cookielib.LWPCookieJar()
        lwp_cookiejar.load(filename, ignore_discard=True)
        return lwp_cookiejar

    def __parseStatistics(self, article, soup_li_list):
        # new page layout
        collect_count = soup_li_list[0].find('span').text
        collect_count = re.findall(r'\d+', collect_count)
        if len(collect_count) > 0:
            collect_count = int(collect_count[0])
        else:
            collect_count = 0
        forward_count = soup_li_list[1].find('span').text

        forward_count = re.findall(r'\d+', forward_count)
        if len(forward_count) > 0:
            forward_count = int(forward_count[0])
        else:
            forward_count = 0

        reply_count = soup_li_list[2].find('span').text
        reply_count = re.findall(r'\d+', reply_count)
        if len(reply_count) > 0:
            reply_count = int(reply_count[0])
        else:
            reply_count = 0

        like_count = soup_li_list[3].find('span').text
        like_count = re.findall(r'\d+', like_count)
        if len(like_count) > 0:
            like_count = int(like_count[0])
        else:
            like_count = 0
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count

    def __parseSearchPage(self, data):
        '''
        @return: (articleList,hasnext)
        '''
        articleList = list()
        hasnext = False
        soup = BeautifulSoup(data, "lxml")
        # self.logger.info(soup)
        #check if no result
        noResultDivList = soup.findAll('div', {'class': 'pl_noresult'})
        if len(noResultDivList) > 0:
            hasnext = False
            self.logger.info('No result')
            return (articleList, hasnext)

        #find page bar to check if more

        pageDivList = soup.findAll('div', {'class': 'W_pages'})
        if len(pageDivList) > 0:
            pageDiv = pageDivList[0]
            if len(pageDiv.findAll('a',
                                   {'class': 'page next S_txt1 S_line1'})) > 0:
                hasnext = True
        if hasnext is False:
            self.logger.info('The last page')

        root_1 = soup.findAll('div', {"action-type": "feed_list_item"})
        # self.logger.debug(root_1)
        for r in root_1:
            root_2 = r.find('div', {'class': "content clearfix"})
            mid = r.attrs['mid']
            article_url = root_2.find('div', {
                'class': "feed_from W_textb"
            }).findNext('a').attrs['href']
            self.logger.debug('1  %s', article_url)
            if not article_url.startswith('http:'):
                article_url = 'http:' + article_url
            # self.logger.debug(article_url)
            root_content = root_2.find('p', {'class': "comment_txt"})

            long_content = root_content.find('a', {'action-type': "fl_unfold"})
            try:
                link_content = root_content.find('a').attrs['href']
                link_content = '  原文链接: ' + link_content
            except:
                link_content = ''
            if long_content:
                content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[
                    'action-data']
                self.session.randomSleep()
                response = self.session.get(content_url, textRspOnly=False)
                try:
                    content_html = response.json()['data']['html']
                    content = BeautifulSoup(
                        content_html, 'html.parser').text.strip().replace(
                            "'", "''").replace("%", "\%").replace(":", "\:")
                except Exception as e:
                    self.logger.debug('Exception: %s' % e)
                    continue
            else:
                content = root_content.text.strip().replace("'", "''").replace(
                    "%", "\%").replace(":", "\:")
                content = content + link_content
                # self.logger.error(content)

            title = content[:30].replace("'", "''").replace("%", "\%").replace(
                ":", "\:") + ' '
            author_id = r.attrs['tbinfo']
            author_id = re.findall(r'ouid=(\d+)', author_id)[0]
            author_name = root_2.find('a').attrs['nick-name']
            publish_datetime = root_2.find('a', {
                'class': "W_textb"
            }).attrs['date']

            try:
                publish_datetime = time.strftime(
                    '%Y-%m-%d %H:%M:%S',
                    time.localtime(float(publish_datetime) / 1000))
            except:
                continue
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime,
                              url=article_url,
                              author_id=author_id,
                              author_name=author_name)

            #fetch statistics
            root_3 = r.find('div', {'class': "feed_action clearfix"})
            soup_li_list = root_3.findAll('li')
            self.__parseStatistics(article, soup_li_list)

            if article not in articleList:
                articleList.append(article)
        return (articleList, hasnext)
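
__parseStatistics above takes the four action-bar items (collect, forward, reply, like) and keeps the first run of digits in each span's text, defaulting to 0 when the label carries no number yet. The same rule as a tiny helper applied to hypothetical label strings:

# -*- coding: utf-8 -*-
import re

def first_int(text, default=0):
    # first digit run wins, otherwise the default, exactly as in __parseStatistics
    found = re.findall(r'\d+', text)
    return int(found[0]) if found else default

labels = [u'收藏', u'转发 12', u'评论 3', u'赞 847']   # hypothetical action-bar texts
print [first_int(t) for t in labels]                    # -> [0, 12, 3, 847]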
Example #5
class WeiXinCrawlerByTopic(object):
    def __init__(self, sleepRange, logger):
        '''
        Constructor
        '''
        self.logger = logger
        self.session = SessionCrawler(None, sleepRange, self.logger)

    def __querySogou(self, sougoUrl):
        '''
        Query the Sogou search results page and crawl every article found,
        including its comments
        '''
        self.logger.debug('Query sougo %s', sougoUrl)
        content = self.session.get(sougoUrl, SOUGO_HEADER)
        soup = BeautifulSoup(content)
        ul = soup.findAll('ul',{'class':'news-list'})[0]
        liList = ul.findAll('li')
        for li in liList:
            aList = li.findAll('a')
            articleUrl = None
            for a in aList:
                if a['uigs'].startswith('article_title'):
                    articleUrl = a['href']
                    break
            if articleUrl is not None:
                header = WX_HEADER.copy()
                header['Referer'] = sougoUrl
                self.session.randomSleep()
                content = self.session.get(articleUrl, header)
                article = self.parseArticle(content)
                if article is None:
                    # parseArticle returns None for banned/empty posts
                    continue
                article.contentUrl = articleUrl
                queryComment(self.session, articleUrl, article)

    def parseArticle(self, content):
        soup = BeautifulSoup(content)
        article = Article()
        #content
        div = soup.findAll('div',{'class':'rich_media_content'})
        if len(div)==0:
            # it may be because this post has been banned
            self.logger.warn('No content')
            return None
            #raise Exception('No content for %s'% article.title)
        article.content = div[0].text

        #title in <title> ... </title>
        title = soup.findNext('title')
        
        article.title = title.text
        article.wid = hash(article.title)
        
        #find meta list
        divMeta = soup.findAll('div',{'class':'rich_media_meta_list'})[0]
        
        #post date
        emPostdate = divMeta.findAll('em',{'id':'post-date'})[0]
        article.publishDateTime = time.mktime(datetime.datetime.strptime(emPostdate.text, '%Y-%m-%d').timetuple())
        
        #author
        emAuthorList = divMeta.findAll('em',{'class':'rich_media_meta rich_media_meta_text'})
        for em in emAuthorList:
            if 'id' not in em.attrs:
                article.author = em.text
                break
        
        #profile
        divProfile = divMeta.findAll('div',{'class':'profile_inner'})[0]
        ##nickname
        strong = divProfile.findAll('strong',{'class':'profile_nickname'})[0]
        article.userName = strong.text
        ##userid
        article.userId = strong.findNext('span').text
        
        return article
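
parseArticle above stores publishDateTime as a Unix timestamp built from the em#post-date text, which only carries a date. A quick sketch of that conversion and the reverse, so the round trip is visible (the date string is made up):

import time
import datetime

post_date = '2018-03-01'    # hypothetical em#post-date text
ts = time.mktime(datetime.datetime.strptime(post_date, '%Y-%m-%d').timetuple())
print ts                                              # seconds since the epoch, local time
print time.strftime('%Y-%m-%d', time.localtime(ts))   # back to '2018-03-01'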
Example #6
class WeiXinCralwer(object):
    '''
    Crawler for a WeChat official account's article list and comments via Sogou
    '''


    def __init__(self, sleepRange, logger):
        '''
        Constructor
        '''
        self.logger = logger
        self.session = SessionCrawler(None, sleepRange, self.logger)

    def __querySogou(self, sougoUrl):
        '''
        Given the official account id, we expect to uniquely find one and only one item
        Return the url to the official account
        '''
        self.logger.debug('Query sougo %s', sougoUrl)
        content = self.session.get(sougoUrl, SOUGO_HEADER)
        soup = BeautifulSoup(content)
        item = soup.findAll('a',{'uigs':'account_name_0'})[0]
        return item['href']

    def __queryArticleList(self, sougoUrl, officialAccountUrl):
        self.logger.debug('Query ariticle list for %s', officialAccountUrl)
        header = WX_HEADER.copy()
        header['Referer'] = sougoUrl
        self.session.randomSleep()
        content = self.session.get(officialAccountUrl, header)
        lindex = content.find(WX_ARTICLE_LIST_PREFIX)+len(WX_ARTICLE_LIST_PREFIX)
        rindex = content.find(WX_ARTICLE_LIST_SUFFIX)
        rindex = lindex + content[lindex:rindex].rfind(';')
        js = content[lindex:rindex]
        jo = json.loads(js)
        aList = jo['list']
        articleList = list()
        for item in aList:
            app_msg_ext_info = item['app_msg_ext_info']
            comm_msg_info = item['comm_msg_info']            
            article = self.__fetchArticle(app_msg_ext_info)
            article.publishDateTime = comm_msg_info['datetime']
            articleList.append(article)
            if 'multi_app_msg_item_list' in item:
                for embedItem in item['multi_app_msg_item_list']:
                    article = self.__fetchArticle(embedItem)
                    article.publishDateTime = comm_msg_info['datetime']
                    articleList.append(article)
        return articleList
    
    def __fetchArticle(self, item):
        article = Article()
        article.title = item['title']
        article.wid = str(hash(article.title))
        article.author = item['author']
        article.contentUrl = item['content_url']
        article.digest = item['digest']
        article.fileid = item['fileid']
        article.sourceUrl = item['source_url']
        #print article.title,":",article.contentUrl,'\n'
        return article
    
    def __queryComment(self, articleList, referer):
        mainPageHeader = WX_HEADER.copy()
        mainPageHeader['Referer'] = referer
        for article in articleList:
            self.logger.debug('Query comment for %s', article.title)
            #find the signature
            lindex = article.contentUrl.find('signature=')
            rindex = article.contentUrl[lindex:].find('&')
            if rindex > 0:
                signature = article.contentUrl[lindex:lindex + rindex]
            else:
                signature = article.contentUrl[lindex:]
            #find the timestamp
            lindex= article.contentUrl.find('timestamp=')+len('timestamp=')
            timestamp = int(article.contentUrl[lindex:lindex+10])
            self.session.randomSleep()
            #query main page
            mainUrl = 'http://mp.weixin.qq.com'+article.contentUrl.replace('&amp;','&')
            self.session.randomSleep()
            content = self.session.get(mainUrl, mainPageHeader)
            soup = BeautifulSoup(content)
            div = soup.findAll('div',{'class':'rich_media_content'})
            if len(div)==0:
                # it may be because this post has been banned
                self.logger.warn('No content for %s', article.title)
                continue
                #raise Exception('No content for %s'% article.title)
            article.content = div[0].text
            #query comment page
            currentTime = int(time.time())
            url = WX_COMMENT_URL % (timestamp, signature)
            #print url
            header = WX_COMMENT_HEADER.copy()
            header['Referer'] = mainUrl
            self.session.randomSleep()
            content = self.session.get(url, header)
            jo = json.loads(content)
            #print jo.keys()
            article.readCount = jo['read_num']
            article.likeCount = jo['like_num']
            commentList = jo['comment']
            for item in commentList:
                comment = Comment()
                comment.commenterNickName = item['nick_name']
                comment.likeCount = item['like_num']
                comment.content = item['content']
                comment.contentId = item['content_id']
                comment.createTime = item['create_time']
                for replyItem in item['reply']['reply_list']:
                    reply = Reply()
                    reply.content = replyItem['content']
                    reply.createTime = replyItem['create_time']
                    reply.uin = replyItem['uin']
                    reply.toUin = replyItem['to_uin']
                    reply.replyId = replyItem['reply_id']
                    comment.replyList.append(reply)
                article.commentList.append(comment)

    def crawl(self, officialAccountId):
        
        sougoUrl = SOUGO_QUERY_URL % (1, quote(officialAccountId))
        officialAccountUrl = self.__querySogou(sougoUrl)
        articleList = self.__queryArticleList(sougoUrl, officialAccountUrl)
        self.__queryComment(articleList, officialAccountUrl)
        return articleList
        #self.__writeCsv(officialAccountId+'.csv', articleList)

    def writeDb(self, dbConf, officialAccountId, articleList):
        dbProxy = MySqlProxy(host=dbConf['dbHost'], 
                             port=3306, user=dbConf['dbUser'], 
                             passwd=dbConf['dbPasswd'], db=dbConf['dbName'])
        weixinSql = 'INSERT INTO T_WEIXIN (pid, wid, author, title, digest, content, publish_datetime, read_count, like_count) values '
        commentSql = 'INSERT INTO T_WEIXIN_COMMENT(pid, cid, wid, content, publisher_name, publish_datetime,like_count) values '
        replySql = 'INSERT INTO T_WEIXIN_REPLY (rid, cid, content, publish_datetime, uin, touin) values '
        weixinValueList = list()
        commentValueList = list()
        replyValueList = list()
        widSet = set()
        for article in articleList:
            weixinValueList.append('("%s","%s","%s","%s","%s","%s","%s",%d,%d)'%(
                                    officialAccountId,
                                    str(article.wid),
                                    article.author.replace('"','\\"'),
                                    article.title.replace('"','\\"'),
                                    article.digest.replace('"','\\"'),
                                    article.content.replace('"','\\"'),
                                    time.strftime(ISOTIMEFORMAT, time.localtime(article.publishDateTime)),
                                    article.readCount,
                                    article.likeCount
                                        ))
            widSet.add(article.fileid)
            for comment in article.commentList:
                commentValueList.append('("%s","%s","%s","%s","%s","%s",%d)'%(
                                            officialAccountId,
                                            str(comment.contentId),
                                            str(article.wid),
                                            comment.content.replace('"','\\"'),
                                            comment.commenterNickName.replace('"','\\"'),
                                            time.strftime(ISOTIMEFORMAT, time.localtime(comment.createTime)),
                                            comment.likeCount
                                        ))
                for reply in comment.replyList:
                    replyValueList.append('("%s","%s","%s","%s","%s","%s")'%(
                                            str(reply.replyId),
                                            str(comment.contentId),
                                            reply.content.replace('"','\\"'),
                                            time.strftime(ISOTIMEFORMAT, time.localtime(reply.createTime)),
                                            reply.uin,
                                            reply.toUin
                                        ))

        # clear the db first
        sql = 'delete from T_WEIXIN where wid in (%s) and pid="%s"' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId)
        dbProxy.execute(sql)
        sql = 'delete from T_WEIXIN_REPLY where cid in (select cid from T_WEIXIN_COMMENT where wid in (%s) and pid="%s")' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId)
        dbProxy.execute(sql)
        sql = 'delete from T_WEIXIN_COMMENT where wid in (%s) and pid="%s"' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId)
        dbProxy.execute(sql)

        #insert to db
        if len(weixinValueList)>0:
            self.logger.info('Insert %d records to weixin', len(weixinValueList))
            dbProxy.execute(weixinSql +','.join(weixinValueList))
        if len(commentValueList)>0:
            self.logger.info('Insert %d records to comment', len(commentValueList))
            dbProxy.execute(commentSql +','.join(commentValueList))
        if len(replyValueList)>0:
            self.logger.info('Insert %d records to reply', len(replyValueList))
            dbProxy.execute(replySql +','.join(replyValueList))
        
    def __writeCsv(self, fileName, articleList):
        #f = codecs.open(fileName,"w","utf-8")
        csvfile = open(fileName, 'w')
        csvfile.write(codecs.BOM_UTF8)
        writer = csv.writer(csvfile)
        header = ['Title', 'Digest', 'Author', 'readCount', 'likeCount', 'publishDateTime', 'Comment-NickName', 'Comment-Content', 'Comment-likeCount', 'Comment-CreateTime']
        writer.writerow(header)
        for article in articleList:
            writer.writerow(
                            (
                              article.title.encode('utf8'),
                              article.digest.encode('utf8'),
                              article.author.encode('utf8'),
                              article.readCount,
                              article.likeCount,
                              time.strftime(ISOTIMEFORMAT, time.localtime(article.publishDateTime)),
                              '',
                              '',
                              '',
                              ''
                             )
                            )
            for comment in article.commentList:
                writer.writerow(
                                (
                                    '',
                                    '',
                                    '',
                                    '',
                                    '',
                                    '',
                                    comment.commenterNickName.encode('utf8'),
                                    comment.content.encode('utf8'),
                                    comment.likeCount,
                                    time.strftime(ISOTIMEFORMAT, time.localtime(comment.createTime))
                                 )
                                )
        csvfile.close()
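A minimal sketch of the comment batch insert above using DB-API parameter binding instead of hand-escaped value strings. The MySQLdb connection, the column names, and the ISOTIMEFORMAT value are assumptions here (commentSql and the schema are built outside the code shown), so treat it as an illustration rather than the module's actual insert path.

# Hedged sketch: let the driver escape values instead of replace('"', '\\"').
import time
import MySQLdb

ISOTIMEFORMAT = '%Y-%m-%d %H:%M:%S'   # assumed to match the module-level constant


def insert_comments(conn, officialAccountId, article):
    # Column names are illustrative; adjust them to the real T_WEIXIN_COMMENT schema.
    sql = ('INSERT INTO T_WEIXIN_COMMENT '
           '(pid, cid, wid, content, nickname, create_time, like_count) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    rows = [(officialAccountId,
             str(comment.contentId),
             str(article.wid),
             comment.content,
             comment.commenterNickName,
             time.strftime(ISOTIMEFORMAT, time.localtime(comment.createTime)),
             comment.likeCount) for comment in article.commentList]
    cursor = conn.cursor()
    cursor.executemany(sql, rows)      # every value is escaped by the driver
    conn.commit()

# Illustrative usage: conn = MySQLdb.connect(host='localhost', user='u', passwd='p', db='sa')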
Exemple #7
0
class WetchartpublicCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by a list of keywords within the time range ending at endTime
        :param keywordList: list of keywords
        :param endTime: end of the search time range
        :return:
        '''
        for keyword in keywordList:
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword)
            self.logger.debug('pageUrl:%s', pageUrl)
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('ul', {'class': "news-list2"})
            if main is None:
                self.logger.error('Fail to parse:%s', response.text)
                continue
            li_list = main.findAll('li')

            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    if title == keyword:
                        a_href = a_title.find('a')['href']
                        self.logger.debug(a_href)
                        self.crawlWetchartpublic(a_href)

    def crawlWetchartpublic(self, url):
        response = self.session.get(url,
                                    textRspOnly=False,
                                    headers=HEADERS_SOGOU_PUBLIC)
        soup = BeautifulSoup(response.text, 'lxml')
        script_list = soup.findAll('script')
        for li in script_list:
            li_str = str(li.text)
            sub_str1 = "msgList = "
            sub_str2 = '}]};'
            if li_str.find(sub_str1) != -1:
                index1 = li_str.find(sub_str1)
                index2 = li_str.find(sub_str2)
                main = str(li.text)[index1 + len(sub_str1):index2 + 3]
                articleJson = json.loads(main)
                articlelist = articleJson['list']
                for item in articlelist:
                    mes_info = item['app_msg_ext_info']
                    url = 'https://mp.weixin.qq.com' + mes_info['content_url']
                    url = url.replace('amp;', '')
                    self.crawlArticle(url)
                    multi_item_list = mes_info['multi_app_msg_item_list']
                    for multi_item in multi_item_list:
                        multi_url = 'https://mp.weixin.qq.com' + multi_item[
                            'content_url']
                        multi_url = multi_url.replace('amp;', '')
                        self.crawlArticle(multi_url)

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics from the given url
        :return: an article instance
        '''
        self.session.randomSleep()
        response = self.session.get(url, textRspOnly=False)

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        :param article:
        :return: nothing; statistics are written into the article instance
        '''
        pass

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list
        :param article:
        :return: a (commentList, hasnext) tuple; commentList is a list of Comment instances and hasnext indicates whether more comments remain to be crawled
        '''
        pass
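A minimal sketch of the msgList extraction that crawlWetchartpublic above performs with find() and string slicing, done with a regular expression instead. It assumes the same page structure the method already relies on (a script block containing msgList = {"list": [...]};).

import json
import re


def extract_msg_list(html):
    # Capture the JSON object assigned to msgList, up to the closing '}]};' marker.
    match = re.search(r'msgList\s*=\s*(\{.*?\}\]\})\s*;', html, re.S)
    if match is None:
        return []
    data = json.loads(match.group(1))
    return data.get('list', [])

# Illustrative usage, mirroring the loop in crawlWetchartpublic:
# for item in extract_msg_list(response.text):
#     url = 'https://mp.weixin.qq.com' + item['app_msg_ext_info']['content_url'].replace('amp;', '')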
Exemple #8
0
class ZhihuCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = SessionCrawler()
        self.channel = channel
        self.nextCommentUrl = None

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by a list of keywords within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for baiduUrl in urls:
            url = self.__fetchRealUrlFromBaiduUrl(baiduUrl)
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                #The same article may appear several times in the search results; Baidu result URLs are unique but may point to the same article, so deduplicate
                articleList.append(article)
        return articleList

    def __fetchRealUrlFromBaiduUrl(self, baiduUrl):
        '''
        Resolve the real target URL behind a Baidu redirect URL (via the 302 Location header)
        '''
        response = self.session.session.get(baiduUrl, allow_redirects=False)
        if response.status_code == 302:
            return response.headers['Location']

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics from the given url
        @return: an Article instance
        '''
        #Check the URL format: Baidu results are not always question pages, and only questions are crawled for now
        if url.find('question') < 0:
            self.logger.warn('Question supported only:%s', url)
            return None
        article_id = re.findall(r'question/(\d+)', url)[0]
        self.session.randomSleep()
        response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
        soup = BeautifulSoup(response, 'lxml')
        main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
        articleJson = json.loads(main)
        questionJson = articleJson['entities']['questions'][article_id]
        title = questionJson['title']
        contentSoup = BeautifulSoup(questionJson['editableDetail'], 'lxml')
        content = contentSoup.text
        author_id = questionJson['author']['id']
        author_name = questionJson['author']['name']
        createTimeInFloat = questionJson['created']
        publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(createTimeInFloat))
        reply_count = questionJson['commentCount']
        read_count = questionJson['visitCount']
        collect_count = questionJson['followerCount']
        article = Article(article_id, self.channel.channel_id, title, content,
                          publish_datetime, url, author_id, author_name)
        article.statistics.reply_count = reply_count
        article.statistics.read_count = read_count
        article.statistics.collect_count = collect_count
        return article

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        articleCopy = self.crawlArticle(article.url)
        article.statistics.reply_count = articleCopy.statistics.reply_count
        article.statistics.read_count = articleCopy.statistics.read_count
        article.statistics.collect_count = articleCopy.statistics.collect_count

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        self.nextCommentUrl = None

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list (see the paging sketch after this example)
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances and hasnext indicates whether more comments remain to be crawled
        '''
        if self.nextCommentUrl is None:
            curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
            curl = curl.replace('#', '%')
        else:
            curl = self.nextCommentUrl
        self.session.randomSleep()
        result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
        jo = json.loads(result)
        paging = jo['paging']
        hasnext = not paging['is_end']
        self.nextCommentUrl = paging['next']
        dataList = jo['data']
        add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        commentList = list()
        for data in dataList:
            #self.logger.debug('[Zhihu]Comment data keys:%s', data.keys())
            #self.logger.debug('[ZHIHU]Comment url for %s:%s', article.title, data['url'])
            publish_datetime = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(data['updated_time']))
            comment = Comment(article.tid,
                              article.channel_id,
                              data['id'],
                              add_datetime,
                              publish_datetime,
                              ip_address=None,
                              location_country=None,
                              location_region=None,
                              location_city=None,
                              author_id=data['author']['id'],
                              author_name=data['author']['name'],
                              content=data['content'],
                              reply_author_id=None,
                              read_count=None,
                              like_count=data['voteup_count'],
                              reply_count=data['comment_count'],
                              dislike_count=None)
            commentList.append(comment)
        return (commentList, hasnext)
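A minimal usage sketch, not taken from the original, of the paging contract above: crawlComment returns (commentList, hasnext) and stores paging['next'] in self.nextCommentUrl, so a caller only needs to reset the crawler and loop until hasnext is False. crawl_all_comments is a hypothetical helper name.

def crawl_all_comments(crawler, article):
    # Reset nextCommentUrl before starting a new article.
    crawler.refreshCommentCrawler()
    allComments = []
    hasnext = True
    while hasnext:
        commentList, hasnext = crawler.crawlComment(article)
        allComments.extend(commentList)
    return allComments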
Exemple #9
0
class WechatCrawler(object):
    '''
    classdocs
    '''
    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger

        self.session = SessionCrawler(logger=self.logger)
        self.session_public = SessionCrawler(logger=self.logger)
        self.session_public_article = SessionCrawler(logger=self.logger)
        self.channel = channel
        self.entityId = 'SYSU'

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by a list of keywords within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(
            days=self.channel.search_ranges)
        startTimeStr = startTime.strftime('%Y-%m-%d')
        endTimeStr = endTime.strftime('%Y-%m-%d')
        self.logger.debug('startTime:%s', startTimeStr)
        self.logger.debug('endTime:%s', endTimeStr)
        urlList = list()
        publicList = self.getPublic()
        articleList = self.searchPublic(publicList, endTime)
        for keyword in keywordList:
            # Ignore the first request; it does not carry a time range
            pageUrl = (SOUGO_WEIXIN_URL_INIT % (urllib.quote(
                keyword.encode('utf-8')), int(time.time() * 1000))).replace(
                    '#', '%')
            self.logger.debug('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            lastPageUrl = pageUrl
            pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote(
                keyword.encode('utf-8')), startTimeStr, endTimeStr)
            self.logger.debug('pageUrl:%s', pageUrl)
            while True:
                # Use the previous page URL as the Referer for this request
                headers = HEADERS_SOGOU.copy()
                headers['Referer'] = lastPageUrl
                self.session.randomSleep()
                response = self.session.get(pageUrl,
                                            textRspOnly=False,
                                            headers=headers)
                soup = BeautifulSoup(response.text, 'lxml')
                main = soup.find('ul', {'class': "news-list"})
                if main is None:
                    self.logger.error('Fail to parse:%s', response.text)
                    return []
                li_list = main.findAll('li')
                #li_list may be empty, but there may still be further pages
                for li in li_list:
                    a_list = li.findAll('a')
                    for a in a_list:
                        if a['uigs'].startswith('article_title'):
                            #self.logger.debug('Article title:%s',a.text)
                            urlList.append((a['href'], pageUrl, a.text))
                            break
                pageBarList = soup.findAll('div', {'id': 'pagebar_container'})
                if len(pageBarList) == 0:
                    #No pagination bar; exit the loop
                    break
                pageBar = pageBarList[0]
                aList = pageBar.findAll('a')
                foundNextPage = False
                for a in aList:
                    if a['uigs'] == 'page_next':
                        foundNextPage = True
                        lastPageUrl = pageUrl
                        pageUrl = SOGOU_URL + a['href']
                        self.logger.debug('Found next page:%s', a.text)
                        break
                if foundNextPage is False:
                    break
        for item in urlList:
            article = self.crawlArticle(item[0],
                                        referer=item[1],
                                        title=item[2],
                                        flag=0)
            if article is not None:
                if article not in articleList:
                    #The same article may appear several times in the search results; result URLs are unique but may point to the same article, so deduplicate
                    articleList.append(article)
        return articleList

    def searchPublic(self, keywordList, endTime):
        '''
        Search official accounts by a list of keywords within the time range ending at endTime
        :param keywordList:
        :param endTime:
        :return:
        '''
        articleList = list()
        for keyword in keywordList:
            self.logger.debug(keyword)
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword[0])
            self.logger.info('pageUrl:%s', pageUrl)
            self.session.randomSleep()
            response = self.session.get(pageUrl,
                                        textRspOnly=False,
                                        headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('ul', {'class': "news-list2"})

            if main is None:
                self.logger.error('Fail to parse:%s', response.text)
                continue
            li_list = main.findAll('li')

            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    if title == keyword[0]:
                        self.logger.debug(title)
                        a_href = a_title.find('a')['href']
                        sub_articleList = self.crawlWetchartpublic(a_href)
                        for article in sub_articleList:
                            articleList.append(article)
        return articleList

    def crawlWetchartpublic(self, url):
        '''
        Crawl articles from an official account page
        :param url:
        :return:
        '''
        self.logger.debug(url)
        self.session_public.randomSleep()
        response = self.session_public.get(url,
                                           textRspOnly=False,
                                           headers=HEADERS_ARTICLE)
        soup = BeautifulSoup(response.text, 'lxml')
        self.logger.debug(soup)
        script_list = soup.findAll('script')
        # if len(script_list) == 0:
        # script_list = self.isCrawlerPublic(url)

        articleList = list()
        for li in script_list:
            li_str = str(li.text)
            sub_str1 = "msgList = "
            sub_str2 = '}]};'
            if li_str.find(sub_str1) != -1:
                index1 = li_str.find(sub_str1)
                index2 = li_str.find(sub_str2)
                main = str(li.text)[index1 + len(sub_str1):index2 + 3]
                articleJson = json.loads(main)
                articlelistJson = articleJson['list']
                for item in articlelistJson:
                    mes_info = item['app_msg_ext_info']
                    url = 'https://mp.weixin.qq.com' + mes_info['content_url']
                    url = url.replace('amp;', '')
                    self.logger.debug('article_url:' + url)
                    article = self.crawlArticle(url, flag=1)
                    articleList.append(article)
                    multi_item_list = mes_info['multi_app_msg_item_list']
                    for multi_item in multi_item_list:
                        multi_url = 'https://mp.weixin.qq.com' + multi_item[
                            'content_url']
                        multi_url = multi_url.replace('amp;', '')
                        self.logger.debug('article_url:' + multi_url)
                        article = self.crawlArticle(multi_url, flag=1)
                        if article is not None:
                            articleList.append(article)
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing; statistics are written into the article instance
        '''
        pass

    def crawlArticle(self, url, **kwargs):
        '''
        Crawl the article content and statistics from the given url
        @return: an Article instance
        '''
        #TBD: reposted/forwarded articles are not handled for now
        searchTitle = ''
        referer = None
        if (kwargs['flag'] == 0):
            if 'referer' not in kwargs:
                return None
            if 'title' in kwargs:
                searchTitle = kwargs['title']
            else:
                searchTitle = ''
        try:
            self.session_public_article.randomSleep()
            if (kwargs['flag'] == 0):
                referer = kwargs['referer']
                # Carry the search page URL as the Referer
                headers = HEADERS_ARTICLE.copy()
                headers['Referer'] = referer
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=headers)
            else:
                response = self.session_public_article.get(
                    url, textRspOnly=False, headers=HEADER_PUBLIC_ARTICLE)
            mid = re.findall(r'var mid = .*"(\d+)";',
                             response.text)[0] + '-' + re.findall(
                                 r'var idx = .*"(\d+)";', response.text)[0]
            soup = BeautifulSoup(response.text, 'lxml')
            main = soup.find('div', {'id': "img-content"})
            title = main.find('h2').text.strip()
            content = main.find('div', {'id': "js_content"}).text.strip()
            profile = main.find('div', {'class': "profile_inner"})
            author_id = profile.find('span').text.strip()
            author_name = profile.find('strong').text.strip()
            try:
                publish_datetime = main.find('em', {
                    'id': "post-date"
                }).text.strip() + ' 00:00:00'
                publish_datetime = datetime.datetime.strptime(
                    publish_datetime, '%Y-%m-%d %H:%M:%S')
            except:
                self.logger.warn(
                    'Fail to parse publish_datetime, using current time instead')
                publish_datetime = datetime.datetime.now()
            article = Article(mid,
                              self.channel.channel_id,
                              title,
                              content,
                              publish_datetime=publish_datetime,
                              url=url,
                              author_id=author_id,
                              author_name=author_name,
                              meta_info='{refer="%s"}' % referer)
            self.logger.debug('Successfully parse article:%s', title)

            return article
        except:
            self.logger.error('Fail to get article for %s: %s due to %s', url,
                              searchTitle, traceback.format_exc())
            return None

    def refreshSearch(self):
        '''
        Reset the search state
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler state
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances and hasnext indicates whether more comments remain to be crawled
        '''
        return (list(), False)

    def getPublic(self):
        self.conf = self.__readDbConf()
        publictablename = Constants.TABLE_SA_WETCHARTPUBLIC + Constants.TABLE_NAME_DELIMITER + self.entityId
        sql = '''
                SELECT public_name FROM %s
              ''' % (publictablename)
        dbProxy = MySqlProxy(self.conf[CONF_FILE_DBCONF][CONF_FILE_DBHOST],
                             3306,
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBUSER],
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBPASS],
                             self.conf[CONF_FILE_DBCONF][CONF_FILE_DBNAME])
        # dbProxy = MySqlProxy('localhost', 3306, 'root', 'zzm15331411', 'sentiment_re')
        # dbProxy = MySqlProxy('112.124.47.197', 3306, 'test', 'test', 'sa2')
        dbProxy.execute(sql)
        resultList = dbProxy.fetchall()
        # resultList = [(u'今日中大',),]
        return resultList

    def __readDbConf(self):
        fileName = os.path.join('conf', CONF_FILE_NAME)
        c = Configuration(fileName)
        return c.readConfig()

    def getProxyPageList(self, url):
        url_list = []
        for i in range(1, 100):
            url_new = url + str(i)
            url_list.append(url_new)

        return url_list

    def getProxyIpList(self, url):
        response = self.session.get(url,
                                    textRspOnly=False,
                                    headers=PROXY_IP_URL_HEADER,
                                    timeout=2)
        host_list = etree.HTML(response.text).xpath(
            '//table[contains(@id,"ip_list")]/tr/td[2]/text()')
        port_list = etree.HTML(response.text).xpath(
            '//table[contains(@id,"ip_list")]/tr/td[3]/text()')
        ip_list = list()
        for i in range(0, len(host_list)):
            ip = host_list[i] + r':' + port_list[i]
            ip_list.append(ip)
        return ip_list

    def verifyIp(self, ip):
        proxy = {'http': 'http://' + ip}
        proxy_handler = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(proxy_handler)
        urllib2.install_opener(opener)

        test_url = 'http://www.baidu.com'
        req = urllib2.Request(url=test_url, headers=PROXY_IP_URL_HEADER)
        try:
            res = urllib2.urlopen(req)
            content = res.read()
            if content:
                self.logger.debug(r'https://' + ip + ' is OK')
            else:
                self.logger.debug(r'https://' + ip + ' is BAD')
                ip = ""
            return ip
        except urllib2.URLError as e:
            self.logger.debug(r'https://' + ip + ' ' + str(e.reason))
            return ""
        except:
            self.logger.debug(r'https://' + ip + ' Other Error')
            return ""

    def getVaildIp(self):
        page_url_list = self.getProxyPageList(PROXY_IP_URL)
        ip_list = list()
        for page_url in page_url_list:
            page_ip_list = self.getProxyIpList(page_url)
            for ip in page_ip_list:
                ip = self.verifyIp(ip)
                if ip != "":
                    ip_list.append(ip)
        return ip_list

    def isCrawlerPublic(self, url):
        data_script_list = list()
        page_url_list = self.getProxyPageList(PROXY_IP_URL)
        flag = 0
        for page_url in page_url_list:
            page_ip_list = self.getProxyIpList(page_url)
            flag = 0
            for ip in page_ip_list:
                flag = 0
                ip = self.verifyIp(ip)
                if ip != "":
                    # self.session_public.randomSleep()
                    proxy = {'http': r'http://' + ip}
                    try:
                        response = self.session_public.get(
                            url,
                            textRspOnly=True,
                            headers=HEADERS_ARTICLE,
                            proxies=proxy)
                        soup = BeautifulSoup(response.text, 'lxml')
                        self.logger.debug(response.text)
                        script_list = soup.findAll('script')
                        if len(script_list) != 0:
                            flag = 0
                            for li in script_list:
                                li_str = str(li.text)
                                sub_str1 = "msgList = "
                                if li_str.find(sub_str1) != -1:
                                    data_script_list.append(li)
                                    flag = 1
                                    break
                            if flag == 1:
                                break
                    except:
                        self.logger.debug(
                            'This proxy IP cannot be used to crawl the official account page')
            if flag == 1:
                break
        if (flag == 0):
            self.logger.debug('No usable proxy IP was found to crawl the official account page')

        return data_script_list
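A minimal sketch of the proxy check that verifyIp above performs with a global urllib2 opener, rewritten with the requests library; using requests here is an assumption (the original code relies on urllib2), so this is an illustration rather than a drop-in replacement.

import requests


def verify_ip(ip, timeout=3):
    # Route a test request through the candidate proxy and keep the ip only if it answers.
    proxies = {'http': 'http://' + ip}
    try:
        resp = requests.get('http://www.baidu.com', proxies=proxies, timeout=timeout)
        return ip if resp.status_code == 200 and resp.content else ''
    except requests.RequestException:
        return ''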