コード例 #1
0
ファイル: test_threadPool.py プロジェクト: LYQCOOL/neteasy
 def __init__(self, workqueue):
     """Set up the worker thread and start it.

     workqueue: queue object stored on ``self.workQueue``.
     """
     threading.Thread.__init__(self)
     self.workQueue = workqueue
     self.mongodb = MongoDB()
     self.checkMongoDB = TempMongoDB()
     self.setDaemon(True)
     # Start last: previously the thread was started before the two store
     # attributes were assigned, so code running in the new thread could
     # observe an instance that did not have them yet.
     self.start()
コード例 #2
0
ファイル: test_threadPool.py プロジェクト: LYQCOOL/neteasy
 def __init__(self, workQueue, saveQueue, timeout=30):
     """Store the two queues and the timeout, open the Mongo store, start the thread.

     workQueue: queue kept on ``self.workQueue``.
     saveQueue: queue kept on ``self.saveQueue``.
     timeout:   value kept on ``self.timeout`` (defaults to 30).
     """
     threading.Thread.__init__(self)
     self.workQueue, self.saveQueue = workQueue, saveQueue
     self.timeout = timeout
     self.mongodb = MongoDB()
     # Daemon flag has to be set before the thread is started.
     self.daemon = True
     self.start()
コード例 #3
0
ファイル: ReNews.py プロジェクト: LYQCOOL/Souhu
 def __init__(self):
     """Create the comment crawler, Mongo store, MySQL connection/cursor and message crawler."""
     self.comment = NewsComment()
     self.mongo = MongoDB()
     # NOTE(review): host, account and password are hard-coded here;
     # consider moving them to configuration.
     db_settings = dict(
         host='localhost',
         user='root',
         passwd='1995',
         db='newsurl',
         charset='utf8',
         use_unicode=True,
     )
     self.conn = MySQLdb.connect(**db_settings)
     self.cursor = self.conn.cursor()
     self.message = NewsMessage()
コード例 #4
0
ファイル: gen_zong.py プロジェクト: LYQCOOL/xinhua
 def __init__(self):
     """Create the collaborators this object keeps as attributes."""
     # Comment crawler instance.
     self.comment = NewsComment()
     # Mongo-backed store instance.
     self.mongo = MongoDB()
     # huanCun instance (presumably a cache helper; "huan cun" is pinyin for "cache" - TODO confirm).
     self.huan = huanCun()
コード例 #5
0
class NewsComment(object):
    """Download one page of comments for an article and store each comment."""

    def __init__(self):
        self.mongo = MongoDB()

    def run(self, news_url, page):
        """Fetch comment page ``page`` for ``news_url`` and save every comment.

        The comment endpoint is requested up to five times; if every attempt
        fails the method returns without storing anything.  Each comment in
        the response's ``contentAll`` list is turned into a dict and handed to
        ``self.mongo.put_comment``.

        news_url: article URL; the part between ``c_`` and ``.htm`` is used
                  as the news id.
        page:     integer page number (formatted with ``%d``).
        """
        # Escape the dot so only a literal ".htm" is treated as a separator.
        bu = re.split(r'c_|\.htm', news_url)[1]
        comment_url = 'http://comment.home.news.cn/a/newsCommAll.do?&newsId=1-%s&pid=%d' % (
            bu, page)

        json_object = None
        for _attempt in range(5):
            try:
                # The body is "var commentJsonVarStr___=<json>;" - strip the
                # variable prefix and the trailing character before parsing.
                json_object = json.loads(
                    requests.get(comment_url, timeout=30).content.replace(
                        'var commentJsonVarStr___=', '')[:-1])
                break
            except Exception as e:
                print("获取评论错误: %s" % e)
        else:
            # Five failed attempts: give up on this page.
            return

        # A response without a comment list is treated as "no comments"
        # instead of raising KeyError.
        for item in json_object.get('contentAll') or []:
            ping_lun_id = item["userId"]
            # A fresh dict per comment, so a store that keeps a reference to
            # the document never sees it overwritten by the next comment.
            comment_dict = {
                # comment text
                'ping_lun_nei_rong': item["content"],
                # comment time
                'ping_lun_shi_jian': item["commentTime"],
                # reply count (not collected)
                'hui_fu_shu': None,
                # like count
                'dian_zan_shu': item["upAmount"],
                # comment id
                # NOTE(review): this is the author's userId, so two comments
                # by one user on one article share the same _id - confirm
                # whether the API exposes a per-comment id.
                'ping_lun_id': ping_lun_id,
                # user nickname
                'yong_hu_ming': item["nickName"],
                # gender (not collected)
                'xing_bie': None,
                # user level (not collected)
                'yong_hu_deng_ji': None,
                # user province
                'yong_hu_sheng_fen': item["ipInfo"],
                # crawl time
                'do_time': time.time(),
                # crawled site
                'zhan_dian': u'新华网',
                # primary key
                '_id': str(ping_lun_id) + news_url,
            }
            self.mongo.put_comment(comment_dict)
コード例 #6
0
ファイル: NewsMessage.py プロジェクト: LYQCOOL/Souhu
class NewsMessage(object):
    """Crawl Sohu article pages and store their content and comments.

    Article URLs come from ``NewsUrl.Run()``; each parsed article is written
    with ``self.mongo.put_content`` and its comment pages are fetched through
    ``self.comment.run``.
    """

    def __init__(self):
        self.comment = NewsComment()
        self.mongo = MongoDB()
        # self.update = update_uel()
        # self.conn = MySQLdb.connect('localhost', 'root', '1995', 'newsurl', charset='utf8', use_unicode=True)
        # self.cursor = self.conn.cursor()

    def getNewsMessage(self):
        """Process every URL from ``NewsUrl.Run()``.

        URLs matching the sports.sohu.com pattern are parsed with the first
        branch, everything else with the second.  Only articles whose publish
        time contains today's date are stored.  Returns None; the method
        returns early (abandoning the remaining URLs) when a page cannot be
        downloaded after ten attempts.
        """
        count = 0
        for news_url in NewsUrl.Run():
            # Pre-flight request: skip URLs that urllib2 cannot open.
            # NOTE(review): the response object is neither read nor closed,
            # and the page is downloaded again below with requests.
            req = urllib2.Request(news_url)
            try:
                urllib2.urlopen(req)
            except urllib2.URLError, e:
                if hasattr(e, 'code'):
                    print 'Error code: ', e.code
                elif hasattr(e, 'reason'):
                    print 'Reason: ', e.reason
                continue

            # NOTE(review): the dots in this pattern are unescaped, so they match any character.
            re_ = 'http://sports.sohu.com/\d*?/[n]\d*?.shtml'
            if (re.match(re_, news_url)):
                # ---- branch 1: sports.sohu.com article pages ----
                print news_url
                html = ''
                flag = 1
                # Retry the download; flag counts failures (starting at 1).
                while 1:
                    try:
                        html = requests.get(news_url, timeout=30)
                        html.encoding = 'gb2312'
                        break
                    except Exception as e:
                        flag += 1
                        print e
                    if flag > 10:
                        # Ten failures: leaves the whole method, not just this URL.
                        return

                soup = BeautifulSoup(html.text, 'html.parser')
                # The article id is the digit run after "n" in the file name.
                re_ = '.*[n](\d*?).shtml'
                _id = re.match(re_, news_url).group(1)
                title = soup.find_all('title')[0].text
                # Skip Sohu's "page no longer exists" placeholder.
                # NOTE(review): the literal is a byte string while `title` is
                # presumably unicode; under Python 2 that only compares equal
                # if the default encoding was changed - confirm.
                if (title == "404,您访问的页面已经不存在!"):
                    continue
                # (bare string below, kept verbatim) this section fetches the read count and comment count
                """这一段代码是用来获取阅读数和评论数的"""
                comment_number = self.getCommentNumber(news_url, _id)
                if comment_number:
                    yue_du_shu = comment_number[0]
                    ping_lun_shu_liang = comment_number[1]
                else:
                    yue_du_shu = 0
                    ping_lun_shu_liang = 0
                # (disabled) MySQL lookup that skipped articles whose stored
                # comment count equals the current one:
                # select_sql = """
                #               select ping_lun_shu_liang from news where url = %s"""
                # if (self.cursor.execute(select_sql, news_url)):
                #     data = self.cursor.fetchone()
                #     # print  data[0]
                #     if (data[0] == ping_lun_shu_liang):
                #         continue
                # else:
                message_dict = dict()
                ping_dic = dict()
                # publish time: keep only articles whose timestamp contains today's date

                shijian = time.strftime('%Y-%m-%d',
                                        time.localtime(time.time()))
                fa_bu_shi_jian = soup.find_all(id='pubtime_baidu')[0].text
                if (not re.search(shijian, fa_bu_shi_jian)):
                    continue

                message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
                # print fa_bu_shi_jian

                # article URL
                wen_zhang_wang_zhi = news_url
                message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi

                # article title: the part of <title> before the first "-"
                wen_zhang_biao = soup.title.string.encode('utf-8')
                # print wen_zhang_biao
                wen_ = '(.*?)\-.*?'
                wen_zhang_biao_ti = re.search(wen_, wen_zhang_biao).group(1)
                # print  wen_zhang_biao_ti
                message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti

                # comment count
                ping_lun_shu_liang = ping_lun_shu_liang
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang

                # article source

                wen_zhang_lai_yuan = soup.find_all(
                    id="media_span")[0].text.encode('utf-8')

                message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan

                # article body: every <p> inside div#contentText, comma-joined
                li = []
                for i in soup.select("div#contentText"):
                    for wen_zhang_zheng_wen in i.select('p'):
                        li.append(wen_zhang_zheng_wen.text.encode('utf-8'))
                message_dict['wen_zhang_zheng_wen'] = ",".join(li)

                # crawl time
                do_time = time.time()
                message_dict['do_time'] = do_time

                # crawled site
                zhan_dian = u'搜狐网'
                message_dict['zhan_dian'] = zhan_dian

                # image links (not collected in this branch)
                tu_pian_lian_jie = None
                message_dict['tu_pian_lian_jie'] = tu_pian_lian_jie

                # article column
                # NOTE(review): this concatenates a unicode literal with a
                # UTF-8 encoded byte string; in Python 2 that presumably
                # raises UnicodeDecodeError for non-ASCII text unless the
                # default encoding was changed - confirm.
                wen_zhang_lan_mu = u'搜狐体育' + soup.select(
                    "div#mypos")[0].text.encode('utf-8')

                try:
                    message_dict[
                        'wen_zhang_lan_mu'] = wen_zhang_lan_mu.replace(
                            '>', '->')
                except Exception as e:
                    print e
                    message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu

                # article author
                wen_zhang_zuo_zhe = soup.find_all(
                    id="author")[0].text.encode('utf-8')
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe

                # keywords (not collected)
                guan_jian_ci = None
                message_dict['guan_jian_ci'] = guan_jian_ci

                # related tags (not collected)
                xiang_guan_biao_qian = None
                message_dict['xiang_guan_biao_qian'] = xiang_guan_biao_qian

                # read count
                yue_du_shu = yue_du_shu
                message_dict['yue_du_shu'] = yue_du_shu

                # primary key
                message_dict['_id'] = _id + '|_|' + news_url

                count += 1
                # print count
                # ping_dic is filled but only used by the disabled call below.
                ping_dic['url'] = news_url
                ping_dic['_id'] = _id
                ping_dic['ping_lun_shu_liang'] = ping_lun_shu_liang
                # self.update.process_item(ping_dic)

                # print json.dumps(message_dict, ensure_ascii=False, indent=4)
                self.mongo.put_content(message_dict)

                # NOTE(review): flag1 is assigned but never read.
                flag1 = 0
                if ping_lun_shu_liang > 0:
                    # Page count is ceil(comment count / 10).
                    all_page = int(math.ceil(ping_lun_shu_liang / 10.0))
                    for page in xrange(1, all_page + 1):
                        try:
                            self.comment.run(news_url, _id, page)
                        except Exception as e:
                            print e
                            # One retry; a second failure here is not caught
                            # and propagates out of getNewsMessage.
                            self.comment.run(news_url, _id, page)
                            continue
            else:
                # ---- branch 2: every other URL ----
                print news_url
                html1 = ''
                flag = 1
                # Same retry loop as above, decoding the page as UTF-8.
                while 1:
                    try:
                        html1 = requests.get(news_url, timeout=30)
                        html1.encoding = 'utf-8'
                        break
                    except Exception as e:
                        flag += 1
                        print e
                    if flag > 10:
                        # Ten failures: leaves the whole method, not just this URL.
                        return
                # NOTE(review): `tree` is built but never used in this method.
                tree = etree.HTML(html1.text)
                soup = BeautifulSoup(html1.text, 'html.parser')
                # print soup.text
                re_ = "http://www.sohu.com/a/(\d*?)\_"
                title = soup.find_all('title')[0].text
                # Same 404-placeholder check (and same str/unicode caveat) as in branch 1.
                if (title == "404,您访问的页面已经不存在!"):
                    continue
                # print soup.select("#mp-comment")
                # The id comes from the "sid" attribute of #mp-comment; when
                # that is 0, fall back to "mp_" + the number in the URL.
                # Pages without #mp-comment are skipped.
                if (soup.select("#mp-comment") != []):
                    _id = soup.select("#mp-comment")[0]['sid'].encode("utf-8")
                    # print _id
                    if (int(_id) == 0):
                        _id = 'mp_' + re.search(re_, news_url).group(1)
                else:
                    continue
                # (bare string below, kept verbatim) this section fetches the read count and comment count
                """这一段代码是用来获取阅读数和评论数的"""
                comment_number = self.getCommentNumber(news_url, _id)
                if comment_number:
                    yue_du_shu = comment_number[0]
                    ping_lun_shu_liang = comment_number[1]
                else:
                    yue_du_shu = 0
                    ping_lun_shu_liang = 0
                # (disabled) same MySQL "comment count unchanged" check as in branch 1:
                # select_sql = """  select ping_lun_shu_liang from news where url = %s"""
                # if(self.cursor.execute(select_sql, news_url)):
                #     data = self.cursor.fetchone()
                #     #print  data[0]
                #     if (data[0] == ping_lun_shu_liang):
                #         continue
                # else:

                message_dict = dict()
                ping_dic = dict()
                # publish time: keep only articles whose timestamp contains today's date
                shijian = time.strftime('%Y-%m-%d',
                                        time.localtime(time.time()))
                try:
                    fa_bu_shi_jian = soup.select('span#news-time')[0].text
                except:
                    # Fallback selector; an IndexError here is not caught.
                    fa_bu_shi_jian = soup.select('span.time')[0].text
                if (not re.search(shijian, fa_bu_shi_jian)):
                    continue

                message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
                # article URL
                wen_zhang_wang_zhi = news_url
                message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi

                # article title: the part of <title> before the first "_"
                wen_zhang_biao = soup.title.string.encode('utf-8')
                wen_ = '(.*?)\_.*?'
                try:
                    wen_zhang_biao_ti = re.search(wen_,
                                                  wen_zhang_biao).group(1)
                    message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
                except:
                    # Placeholder title ("none") when the pattern does not match.
                    message_dict['wen_zhang_biao_ti'] = "无"

                # comment count
                ping_lun_shu_liang = ping_lun_shu_liang
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang

                # article source
                try:
                    wen_zhang_lai_yuan = soup.select(
                        "#user-info h4 a")[0].text.encode('utf-8')
                    message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
                except:
                    # Placeholder source ("empty") when the element is missing.
                    message_dict['wen_zhang_lai_yuan'] = "空"

                # article body: every <p> inside article.article, comma-joined
                li = []
                for i in soup.select("article.article"):
                    for wen_zhang_zheng_wen in i.select('p'):
                        li.append(wen_zhang_zheng_wen.text.encode('utf-8'))
                message_dict['wen_zhang_zheng_wen'] = ','.join(li)
                # crawl time
                do_time = time.time()
                message_dict['do_time'] = do_time

                # crawled site
                zhan_dian = u'搜狐网'
                message_dict['zhan_dian'] = zhan_dian

                # image links: space-joined src values; "http:" is prepended
                # to any src that does not contain "http"
                if (not soup.select('.article img')):
                    tu_pian_lian_jie = None
                    message_dict['tu_pian_lian_jie'] = tu_pian_lian_jie
                else:
                    tu_pian = soup.select('.article img')
                    tu = []
                    for tu_pian_lian_jie in tu_pian:
                        if (not re.search('http', tu_pian_lian_jie['src'])):
                            tu.append("http:" + tu_pian_lian_jie['src'])

                        else:
                            tu.append(tu_pian_lian_jie['src'])
                    message_dict['tu_pian_lian_jie'] = " ".join(tu)

                # article column (empty string when the breadcrumb is missing)
                try:
                    wen_zhang_lan_mu = soup.select(
                        ".location.area")[0].text.encode('utf-8')
                except:
                    wen_zhang_lan_mu = ""
                try:
                    message_dict[
                        'wen_zhang_lan_mu'] = wen_zhang_lan_mu.replace(
                            '>', '->')
                except Exception as e:
                    print e
                    message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu

                # article author (not collected in this branch)
                wen_zhang_zuo_zhe = None
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe

                # keywords (not collected)
                guan_jian_ci = None
                message_dict['guan_jian_ci'] = guan_jian_ci

                # related tags (not collected)
                xiang_guan_biao_qian = None
                message_dict['xiang_guan_biao_qian'] = xiang_guan_biao_qian

                # read count
                yue_du_shu = yue_du_shu
                message_dict['yue_du_shu'] = yue_du_shu

                # primary key
                message_dict['_id'] = _id + '|_|' + news_url

                count += 1

                # print count
                # ping_dic is filled but only used by the disabled call below.
                ping_dic['url'] = news_url
                ping_dic['_id'] = _id
                ping_dic['ping_lun_shu_liang'] = ping_lun_shu_liang
                # self.update.process_item(ping_dic)

                # print json.dumps(message_dict, ensure_ascii=False, indent=4)

                self.mongo.put_content(message_dict)
                # NOTE(review): flag2 is assigned but never read.
                flag2 = 0
                if ping_lun_shu_liang > 0:
                    # Page count is ceil(comment count / 10).
                    all_page = int(math.ceil(ping_lun_shu_liang / 10.0))
                    for page in xrange(1, all_page + 1):
                        try:
                            self.comment.run(news_url, _id, page)
                        except Exception as e:
                            print e
                            # One retry; a second failure here is not caught
                            # and propagates out of getNewsMessage.
                            self.comment.run(news_url, _id, page)
                            continue
コード例 #7
0
ファイル: NewsMessage.py プロジェクト: LYQCOOL/xinhua
 def __init__(self):
     """Create the collaborators this crawler keeps as attributes."""
     # Comment crawler instance.
     self.comment = NewsComment()
     # Mongo-backed store instance.
     self.mongo = MongoDB()
     # huanCun instance (presumably a cache helper - TODO confirm).
     self.huan = huanCun()
     # The bare string below is a disabled assignment kept by the author.
     '''self.genzong = genZong()'''
コード例 #8
0
ファイル: NewsMessage.py プロジェクト: LYQCOOL/xinhua
class NewsMessage(object):
    """Crawl Xinhua article pages and store their content and comments.

    Article URLs come from ``NewsUrl.Run()``; each parsed article is written
    with ``self.mongo.put_content`` and its comment pages are fetched through
    ``self.comment.run``.
    """

    def __init__(self):
        self.comment = NewsComment()
        self.mongo = MongoDB()
        self.huan = huanCun()
        # Disabled by the author: self.genzong = genZong()

    def getNewsMessage(self):
        """Download, parse and store every article from ``NewsUrl.Run()``.

        Articles without a publish time are not stored; their URL is appended
        to ``ERROR.text`` instead.  Returns None.  The method returns early
        (abandoning the remaining URLs) when a page cannot be downloaded
        after ten attempts.
        """
        # Disabled by the author: self.genzong.run()
        for news_url in NewsUrl.Run():
            url_parts = re.split(r"/", news_url)
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0",
                "Host": url_parts[2],
            }
            html = ''
            for _attempt in range(10):
                try:
                    html = requests.get(news_url, headers=headers,
                                        timeout=30).content
                    break
                except Exception as e:
                    print("RREQUESTERROR %s" % e)
                    print("URL:" + news_url)
            else:
                # Ten failed downloads: stop the whole crawl (as before).
                return
            tree = etree.HTML(html)

            # Read count and comment count.
            comment_number = self.getCommentNumber(news_url)
            if comment_number:
                yue_du_shu = comment_number[0]
                ping_lun_shu_liang = comment_number[1]
            else:
                yue_du_shu = 0
                ping_lun_shu_liang = 0

            message_dict = dict()
            # article URL
            message_dict['wen_zhang_wang_zhi'] = news_url

            # article title
            message_dict['wen_zhang_biao_ti'] = pathOneNode(
                tree,
                ".//*[@class='h-title']/text()|.//*[@class='btt']/h1/text()|.//*[@class='tit']/h1/text()|.//*[@class='sm01']/text()|.//*[@id='title']/text()"
            )

            # publish time
            message_dict['fa_bu_shi_jian'] = pathOneNode(
                tree,
                ".//*[@class='h-time']/text()|.//*[@class='time']/text()|.//*[@class='gnxx']/div[2]/text()|.//*[@class='tm']/text()|.//*[@class='sm02']/text()|.//*[@id='pubtime']/text()"
            )

            # comment count
            message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang

            # article source: last matching text node with the label and
            # whitespace removed; falls back to all text under #source.
            try:
                wen_zhang_lai_yuan = tree.xpath(
                    ".//*[@class='ly']/a/text()|.//*[@class='gnxx']/div[1]/text()|.//*[@class='sus']/a/text()|.//*[@class='sm02']/a/text()|.//*[@id='source']/text()"
                )[-1].replace(u'来源:', '').replace('\r\n', '').replace(' ', '')
            except Exception:
                wen_zhang_lai_yuan = pathAllNode(tree,
                                                 ".//*[@id='source']//text()")
            message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan

            # article body: all paragraph text nodes, concatenated
            try:
                wen_zhang_zheng_wen = tree.xpath(
                    ".//*[@id='p-detail']//p/text()|.//*[@class='content']//p/text()|.//*[@id='content']//p/text()|.//*[@id='content']//p/text()"
                )
            except Exception:
                wen_zhang_zheng_wen = pathAllNode(tree, ".//*[@id='xhw']")
            # NOTE(review): the two "space" literals below look identical
            # here; the first may originally be a non-breaking or full-width
            # space - verify against the original file before merging them.
            message_dict['wen_zhang_zheng_wen'] = ''.join(
                i.replace(u' ', '').replace('\r\n', '').replace(' ', '')
                for i in wen_zhang_zheng_wen)

            # crawl time
            message_dict['do_time'] = time.time()

            # crawled site
            message_dict['zhan_dian'] = u'新华网'

            # image links: each src is appended to the directory part of the
            # article URL.  The prefix lookup now lives inside the try block:
            # previously a URL matching none of the patterns raised IndexError
            # and aborted the whole crawl.
            tu_pian_lian = ''
            try:
                photo_URL_qian = re.findall(
                    r'http://[a-z|A-Z]+.xinhuanet.com/[a-z|A-Z]+/\d+-\d+/\d+/|http://ent.news.cn/\d+-\d+/\d+/|http://www.sc.xinhuanet.com/[a-z|A-Z]+/\d+-\d+/\d+/',
                    news_url)[0]
                tu_pian_lian_jie = tree.xpath(
                    ".//*[@align='center']/img/@src|.//*[@align='center']/span/img/@src"
                )
                for i in tu_pian_lian_jie:
                    tu_pian_lian = tu_pian_lian + ' ' + photo_URL_qian + i
            except Exception:
                print("photo Error:" + news_url)
            message_dict['tu_pian_lian_jie'] = tu_pian_lian

            # article column, derived from the first path segment of the URL
            section = url_parts[3]
            if section == 'politics':
                wen_zhang_lan_mu = section
            elif section == 'c':
                wen_zhang_lan_mu = 'sport'
            elif section == 'content':
                wen_zhang_lan_mu = 'bendi'
            else:
                wen_zhang_lan_mu = 'yule'
            message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu

            # article author: editor line with labels and punctuation removed
            try:
                try:
                    con = tree.xpath(".//*[@class='tiyi1']/../text()")[-1]
                    wen_zhang_zuo_zhe = ''.join(con)
                except Exception:
                    wen_zhang_zuo_zhe = pathAllNode(
                        tree,
                        ".//*[@class='p-jc']|.//*[@class='bjn']|.//*[@class='bj']|.//*[@class='editor']"
                    )
                # NOTE(review): the two colon literals look identical here;
                # one may originally be a full-width colon - keep both.
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe.replace(
                    u'【纠错】',
                    '').replace(u'责任编辑', '').replace(u'体育—', '').replace(
                        '\r\n', '').replace(u':', '').replace(':', '').replace(
                            '[', '').replace(']', '')
            except Exception:
                message_dict['wen_zhang_zuo_zhe'] = None

            # keywords, from the <meta name="keywords"> content
            try:
                guan_jian_ci = tree.xpath(
                    './/*[@name="keywords"]/@content')[0].replace('\r\n', '')
            except Exception:
                guan_jian_ci = None
            message_dict['guan_jian_ci'] = guan_jian_ci

            # related tags (not collected)
            message_dict['xiang_guan_biao_qian'] = None

            # read count
            message_dict['yue_du_shu'] = yue_du_shu

            # primary key
            message_dict['_id'] = news_url
            if message_dict['fa_bu_shi_jian'] is None:
                # No publish time found: log the URL instead of storing it.
                with open("ERROR.text", "a") as error_log:
                    error_log.write(news_url + "\n")
            else:
                self.mongo.put_content(message_dict)

            if ping_lun_shu_liang > 0:
                # NOTE(review): when the count is an exact multiple of 20
                # this requests one page more than count/20 rounded up -
                # confirm the endpoint's page size before tightening.
                all_page = ping_lun_shu_liang // 20
                for page in xrange(1, all_page + 2):
                    self.comment.run(news_url, page)
            # Disabled by the author (tracking):
            # dict_zhui = {}
            # dict_zhui['url'] = news_url
            # dict_zhui['num'] = ping_lun_shu_liang
            # dict_zhui['_id'] = news_url
            # self.huan.put_zhuizong(dict_zhui)

    def getCommentNumber(self, news_url):
        """Return ``(read_count, comment_count)`` for an article.

        ``read_count`` is always None; ``comment_count`` is the ``totalRows``
        value of the comment endpoint's response.  Returns None when the
        endpoint could not be fetched and parsed in five attempts.
        """
        # Escape the dot so only a literal ".htm" is treated as a separator.
        bu = re.split(r'c_|\.htm', news_url)[1]
        comment_url = 'http://comment.home.news.cn/a/newsCommAll.do?newsId=1-' + bu
        json_object = None
        for _attempt in range(5):
            try:
                # The body is "var commentJsonVarStr___=<json>;" - strip the
                # variable prefix and the trailing character before parsing.
                json_object = json.loads(
                    requests.get(comment_url, timeout=30).content.replace(
                        'var commentJsonVarStr___=', '')[:-1])
                break
            except Exception as e:
                print(e)
        else:
            return None
        # read count (not provided by this endpoint)
        yue_du_shu = None
        # comment count
        ping_lun_shu_liang = json_object['totalRows']
        return yue_du_shu, ping_lun_shu_liang
コード例 #9
0
 def __init__(self):
     """Create the collaborators and open the working file."""
     # Comment crawler instance.
     self.comment = NewsComment()
     # Mongo-backed store instance.
     self.mongo = MongoDB()
     # Opened for reading and updating; the file must already exist.
     # NOTE(review): nothing visible here closes this handle - confirm it is closed elsewhere.
     self.f = open('test.txt', "r+")
     # Counter, starts at zero.
     self.i = 0
     # Starts as an empty list (presumably collected URLs - TODO confirm).
     self.url_list =[]
コード例 #10
0
ファイル: test_threadPool.py プロジェクト: LYQCOOL/neteasy
class MyFetchThreadSecond(threading.Thread):
    """Worker thread that turns queued list items into stored article documents.

    Each item taken from ``workQueue`` is a dict describing one article.  For
    items labelled u'其它' and dated today, the article page is downloaded,
    a document is written with ``self.mongodb.put_content`` and a
    ``(url_match, article_url)`` tuple is put on ``saveQueue``.
    """

    def __init__(self, workQueue, saveQueue, timeout=30):
        """Store the queues and the timeout, open the Mongo store, start the thread."""
        threading.Thread.__init__(self)
        # NOTE(review): self.timeout is stored but the visible code always
        # passes timeout=30 to requests - confirm which one is intended.
        self.timeout = timeout
        self.setDaemon(True)
        self.workQueue = workQueue
        self.saveQueue = saveQueue
        self.mongodb = MongoDB()
        # Started last so run() never sees a half-initialised instance.
        self.start()

    def working_one(self, item, label):
        """Download the article at ``item`` and return it parsed by lxml.

        item:  article URL.
        label: channel name; 'shehui' and 'guonei' map to the ``news``
               sub-domain, anything else to ``<label>.163.com``.

        The download is attempted up to ten times; if all attempts fail the
        parser is handed an empty string.
        """
        if label in ('shehui', 'guonei'):
            label = 'news.163.com'
        else:
            label = '%s.163.com' % label
        header = {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': label,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/59.0.3071.115 Safari/537.36'
        }
        html = ''
        for _attempt in range(10):
            try:
                # Pages are served as GBK; re-encode to UTF-8 for the parser.
                html = requests.get(url=item, headers=header, timeout=30). \
                    content.decode('gbk').encode('utf-8')
                break
            except Exception as e:
                print(e)
        wenzhang_tree = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        return wenzhang_tree

    def Xpath_for_content(self, tree, path):
        """Evaluate ``path`` on ``tree``; return the result, or None on any error."""
        try:
            return tree.xpath(path)
        except Exception as e:
            # The original format string had no %s placeholder, so this
            # handler itself raised TypeError instead of reporting the error.
            print("ERROR: Locate in the FetchSecond's Xpath_for_content method, exception: %s" % e)
            return None

    def run(self):
        """Drain ``workQueue``, processing one item at a time until it is empty."""
        try:
            from Queue import Empty  # Python 2
        except ImportError:
            from queue import Empty  # Python 3
        while True:
            # A non-blocking get avoids the empty()/get() race: with several
            # workers, another thread can take the last item between the two
            # calls and leave this one blocked forever.
            try:
                item = self.workQueue.get_nowait()
            except Empty:
                break
            try:
                self._process_item(item)
            except Exception as e:
                print("ERROR: Locate in the FetchSecond's run method 'while not Queue empty', exception: %s" % e)

    def _process_item(self, item):
        """Build, store and queue the document for one list item."""
        # Only items labelled u'其它' and published today are processed.
        if item['label'] != u'其它':
            return
        current_time = time.strftime("%m/%d/%Y", time.localtime(time.time()))
        if re.search(current_time, item['time']) is None:
            return

        wenzhang_tree = self.working_one(item['docurl'], item['channelname'])
        message_dict = dict()
        # article source (first matching text node; a missing node raises
        # and the item is skipped by run())
        wen_zhang_lai_yuan = self.Xpath_for_content(wenzhang_tree, '//*[@id="ne_article_source"]/text()')
        message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan[0]
        # article body: newline followed by all paragraph text nodes
        wen_zhang_zheng_wen = self.Xpath_for_content(wenzhang_tree, '//*[@id="endText"]//p/text()')
        message_dict['wen_zhang_zheng_wen'] = '\n' + ''.join(wen_zhang_zheng_wen)
        # article column: newline followed by the breadcrumb links, each
        # followed by a space
        wen_zhang_lan_mu = self.Xpath_for_content(wenzhang_tree,
                                                  '//*[@id="ne_wrap"]/body//div/div[@class="clearfix'
                                                  '"]/div[@class="post_crumb"]//a/text()')
        message_dict['wen_zhang_lan_mu'] = '\n' + ''.join(
            crumb + ' ' for crumb in wen_zhang_lan_mu)
        # comment URL
        message_dict['ping_lun_wang_zhi'] = item['commenturl']
        # article URL
        wen_zhang_wang_zhi = item['docurl']
        message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi
        # article title
        message_dict['wen_zhang_biao_ti'] = item['title']
        # publish time
        message_dict['fa_bu_shi_jian'] = item['time']
        # number of participants (stored as the comment count)
        ping_lun_shu_liang = item['tienum']
        message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
        # crawl time
        message_dict['do_time'] = time.time()
        # crawled site
        message_dict['zhan_dian'] = u'网易新闻'
        # image links (not collected)
        message_dict['tu_pian_lian_jie'] = None
        # article author (not collected)
        message_dict['wen_zhang_zuo_zhe'] = None
        # keywords: name of the first keyword, None when unavailable
        try:
            message_dict['guan_jian_ci'] = item['keywords'][0]['keyname']
        except Exception as e:
            message_dict['guan_jian_ci'] = None
            print("ERROR: Locate in the FetchSecond's run method for guan_jian_ci, exception: %s" % e)
        # related tags (not collected)
        message_dict['xiang_guan_biao_qian'] = None
        # read count: the participant count is reused here
        message_dict['yue_du_shu'] = ping_lun_shu_liang
        # primary key
        message_dict['_id'] = wen_zhang_wang_zhi
        self.mongodb.put_content(message_dict)
        # Hand the parsed URL (sub-domain and document id groups) to the
        # consumer of saveQueue.
        url_info = re.match(r'https?://(.*?)\.163\.com/\d*?/\d*?/\d*?/(.*?)\.html', wen_zhang_wang_zhi)
        self.saveQueue.put((url_info, wen_zhang_wang_zhi))
コード例 #11
0
ファイル: test_threadPool.py プロジェクト: LYQCOOL/neteasy
 def __init__(self, savequeue):
     """Keep the save queue and create the two Mongo store objects.

     savequeue: queue object stored on ``self.saveQueue``.
     """
     self.saveQueue = savequeue
     # TempMongoDB instance, kept as the "check" store.
     self.check_mongodb = TempMongoDB()
     # MongoDB instance, kept as the "update" store.
     self.update_mongodb = MongoDB()
コード例 #12
0
ファイル: test_threadPool.py プロジェクト: LYQCOOL/neteasy
class CheckUpdate(object):
    """Re-check already crawled NetEase articles for new comments.

    Every record returned by ``check_mongodb.get()`` is compared with the
    live comment counter of its article.  Articles that gained comments are
    pushed to ``saveQueue`` as ``(url_match, article_url)``; articles without
    new comments are deleted from the check store.
    """

    # Article URL shape.  Both http and https are accepted.  Group 1 is used
    # as the comment host sub-domain, group 2 as the thread id.
    _ARTICLE_URL = re.compile(r'https?://(.*?).163.com/\d*?/\d*?/\d*?/(.*?).html')

    # Number of download attempts made by working() before giving up.
    _MAX_ATTEMPTS = 5

    def __init__(self, savequeue):
        """Keep the output queue and create the two store handles."""
        self.saveQueue = savequeue
        self.check_mongodb = TempMongoDB()
        self.update_mongodb = MongoDB()

    def run(self):
        """Check every stored record once and print a summary at the end.

        A record that cannot be processed (unsupported URL, malformed
        record, store error) is reported and skipped; it no longer stops
        the check of the remaining records.
        """
        old_data = self.check_mongodb.get()
        count_for_news = 0
        count_for_comment = 0
        try:
            for every_data in old_data:
                try:
                    num = self._check_one(every_data)
                except Exception as e:
                    print("ERROR: Locate in the CheckUpdate, exception: %s" % e)
                    continue
                if num > 0:
                    count_for_news += 1
                    count_for_comment += num
        except Exception as e:
            # Raised by the iteration over old_data itself.
            print("ERROR: Locate in the CheckUpdate, exception: %s" % e)
        finally:
            print("UPDATE: There are %d news has been updated, and there has %d comments been updated" %
                  (count_for_news, count_for_comment))

    def _check_one(self, every_data):
        """Process one stored record and return its number of new comments.

        Returns 0 when nothing was queued: either the counter did not grow
        (the record is deleted from the check store) or the counter could
        not be downloaded (the record is left untouched for a later run).

        :raises ValueError: when the stored URL does not look like an article URL.
        """
        ping_lun_shu = every_data['ping_lun_shu']
        content_url = every_data['_id']
        info = self._ARTICLE_URL.match(content_url)
        if info is None:
            raise ValueError("unsupported article url: %r" % (content_url,))
        default_url = 'http://comment.%s.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/' \
                      'comments/newList?offset=0&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2' % \
                      (info.group(1), info.group(2))
        json_data = self.working(info=info, comment_url=default_url)
        if json_data is None:
            # Every download attempt failed; keep the record so it is retried.
            return 0
        num = json_data['newListSize'] - ping_lun_shu
        if num > 0:
            self.saveQueue.put((info, content_url))
            self.update_mongodb.update_content({'_id': content_url}, {'do_time': time.time()})
            return num
        self.check_mongodb.delete({"_id": content_url})
        return 0

    def working(self, info, comment_url):
        """Download and decode the comment JSON found at ``comment_url``.

        :param info: URL match object; ``group(1)`` builds the Host header.
        :param comment_url: URL to request, also sent as Referer.
        :return: the decoded JSON, or ``None`` after ``_MAX_ATTEMPTS`` failures.
        """
        header = {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'comment.%s.163.com' % (info.group(1)),
            'Referer': comment_url,
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/59.0.3071.115 Safari/537.36'
        }
        for _ in range(self._MAX_ATTEMPTS):
            try:
                return json.loads(requests.get(url=comment_url, headers=header, timeout=30).content)
            except Exception as e:
                print("ERROR: Failed to get the comment's json, exception: %s" % e)
        return None
コード例 #13
0
ファイル: test_threadPool.py プロジェクト: LYQCOOL/neteasy
class MyCommentThread(threading.Thread):
    """Worker thread that downloads and stores the comments of queued articles.

    Queue items are ``(url_match, article_url)`` tuples.  ``url_match`` must
    expose ``group(1)`` (comment host sub-domain) and ``group(2)`` (thread id).
    """

    # Comments requested per page (the URL below asks for limit=30).
    _PAGE_SIZE = 30

    # Number of download attempts per page before giving up.
    _MAX_ATTEMPTS = 5

    # Filled with (sub-domain, thread id, offset).
    _COMMENT_URL = ('http://comment.%s.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/%s/'
                    'comments/newList?offset=%d&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2')

    def __init__(self, workqueue):
        """Create the store handles, then start the thread.

        The thread is started last: ``run`` uses ``self.mongodb`` and
        ``self.checkMongoDB``, so starting earlier lets the new thread race
        the rest of the constructor.
        """
        threading.Thread.__init__(self)
        self.workQueue = workqueue
        self.mongodb = MongoDB()
        self.checkMongoDB = TempMongoDB()
        self.daemon = True
        self.start()

    def run(self):
        """Consume the queue until it is empty.

        Items are taken without blocking.  With several workers on one queue
        an ``empty()`` check followed by a blocking ``get()`` can wait forever
        once another worker takes the last item.
        """
        while True:
            try:
                task = self.workQueue.get(block=False)
            except Exception:
                # Queue is drained (Empty) or unusable: this worker is done.
                break
            try:
                info, wenzhang_Url = task
                first_page = self._COMMENT_URL % (info.group(1), info.group(2), 0)
                pages = self.working(wenzhang_Url, first_page, info) or 0
                for i in range(1, pages + 1):
                    page_url = self._COMMENT_URL % (info.group(1), info.group(2), i * self._PAGE_SIZE)
                    self.working(wenzhang_Url, page_url, info)
            except Exception as e:
                print("ERROR: Locate in the CommentThread's run method 'while not Queue empty', exception: %s" % e)

    def working(self, content_url, the_comment_url, info):
        """Download one comment page, store its comments and return the page count.

        :param content_url: article URL; part of every stored ``_id``.
        :param the_comment_url: comment page URL, also sent as Referer.
        :param info: URL match object; ``group(1)`` builds the Host header.
        :return: ``newListSize // 30`` of the downloaded page, or 0 when the
            page has no comments, could not be downloaded or could not be parsed.
        """
        header = {
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'comment.%s.163.com' % (info.group(1)),
            'Referer': the_comment_url,
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/59.0.3071.115 Safari/537.36'
        }
        json_data = self._fetch_json(the_comment_url, header)
        if json_data is None:
            return 0
        pages = 0
        try:
            for comment_id, comment in json_data['comments'].items():
                # A missing user, nickname or location is stored as None
                # rather than aborting the rest of the page.
                user = comment.get('user') or {}
                do_time = time.time()
                comment_dict = {
                    'ping_lun_nei_rong': comment['content'],      # comment text
                    'ping_lun_shi_jian': comment['createTime'],   # comment time
                    'hui_fu_shu': None,                           # reply count (not collected)
                    'dian_zan_shu': comment['vote'],              # like count
                    'ping_lun_id': comment_id,                    # comment id
                    'yong_hu_ming': user.get('nickname'),         # user nickname
                    'xing_bie': None,                             # gender (not collected)
                    'yong_hu_deng_ji': None,                      # user level (not collected)
                    'yong_hu_sheng_fen': user.get('location'),    # user province
                    'do_time': do_time,                           # crawl time
                    'zhan_dian': u'网易新闻',                      # source site
                    '_id': comment_id + content_url,              # primary key
                }
                ping_lun_shu = json_data['newListSize']
                pages = ping_lun_shu // self._PAGE_SIZE
                self.mongodb.put_comment(comment_dict)
                # Record the article and its counter for the update check.
                self.checkMongoDB.put({
                    '_id': content_url,
                    'do_time': do_time,
                    'ping_lun_shu': ping_lun_shu,
                })
            return pages
        except Exception as e:
            print("ERROR: Locate in the CommentThread's working method for parsing json data, exception: %s,"
                  "and json data is %s" % (e, json_data))
            return 0

    def _fetch_json(self, url, header):
        """Return the decoded JSON at ``url`` or None after ``_MAX_ATTEMPTS`` failures."""
        for _ in range(self._MAX_ATTEMPTS):
            try:
                return json.loads(requests.get(url=url, headers=header, timeout=30).content)
            except Exception as e:
                print("ERROR: Failed to get the comment's json, exception: %s" % e)
        return None
コード例 #14
0
ファイル: test_threadPool.py プロジェクト: LYQCOOL/neteasy
        # One crawl round: re-check old articles, fetch today's articles and
        # their comments, print statistics, then sleep until the next round.
        # (StartUrlQueue, Json_Url_Queue, urlInfo_Queue, Urls and start_moment
        # are defined outside this snippet.)
        updateUrl_Queue = Queue.Queue()
        # First: check the already stored news urls for new comments
        Updater = CheckUpdate(savequeue=updateUrl_Queue)
        Updater.run()
        # Let a pool of comment workers consume what the update check queued
        CommentPool = ThreadPoolOfCommentThread(num_of_threads=10, workqueue=updateUrl_Queue)
        CommentPool.wait_for_complete()
        print "UPDATE: Complete checking for the old news!"
        # Second: get all news urls for json
        for url in Urls:
            StartUrlQueue.put(url)
        fetch_1 = MyFetchThreadFirst(workQueue=StartUrlQueue, saveQueue=Json_Url_Queue)
        # Wait for the first fetch stage before reading Json_Url_Queue
        fetch_1.join()
        # Third: parse the json data, get each news html, build the message_dict and save it
        print "FETCH: There are %d newest news has been fetched today" % Json_Url_Queue.qsize()
        FetchSecondPool = ThreadPoolOfFetchSecond(num_of_threads=6, workqueue=Json_Url_Queue, savequeue=urlInfo_Queue)
        FetchSecondPool.wait_for_complete()
        # Fourth: request the comment urls and save each comment's message_dict
        CommentPool = ThreadPoolOfCommentThread(num_of_threads=10, workqueue=urlInfo_Queue)
        CommentPool.wait_for_complete()
        # Query the comments whose do_time falls inside [start_uptime, end_uptime)
        # for this site
        start_uptime, end_uptime = getCondition()
        time_condition = {"do_time": {"$gte": start_uptime, "$lt": end_uptime},
                          "zhan_dian": "网易新闻"}
        temp_mongo = MongoDB()
        num_of_comments = temp_mongo.get_comment_data(time_condition)
        print "FETCH: There are %d comments has been fetched today" % num_of_comments
        print "TIME: Total spent %d seconds" % (time.time() - start_moment)
        print "SLEEP: Mission complete, start to sleeping.... "
        # DeltaSeconds() supplies the number of seconds to sleep
        sleep_seconds = DeltaSeconds()
        time.sleep(sleep_seconds)

コード例 #15
0
ファイル: NewsComment.py プロジェクト: countingstars112/ifeng
class NewsComment(object):
    """Download one page of ifeng.com comments for an article and store them."""

    # Number of download attempts before a page is given up.
    _MAX_ATTEMPTS = 3

    def __init__(self):
        """Create the store handle used by ``run``."""
        self.mongo = MongoDB()

    def run(self, news_url, page):
        """Fetch comment page ``page`` of ``news_url`` and store every comment.

        Each comment is stored through ``self.mongo.put_comment`` as its own
        dict, so one comment's data cannot be changed by the next one.
        Nothing is stored when the page cannot be downloaded or carries no
        ``comments`` list.

        :param news_url: article URL; also part of every stored ``_id``.
        :param page: page number, formatted with ``%d``.
        :return: always ``None``.
        """
        comment_url = 'http://comment.ifeng.com/get.php?docUrl=%s&format=js&job=1&p=%d&pageSize=20' % (
            news_url, page)
        json_object = self._fetch_json(comment_url)
        if json_object is None:
            return
        for item in json_object.get('comments') or []:
            ping_lun_id = item["comment_id"]
            comment_dict = {
                'ping_lun_nei_rong': item["comment_contents"],  # comment text
                'ping_lun_shi_jian': item["create_time"],       # comment time
                'hui_fu_shu': None,                             # reply count (not collected)
                'dian_zan_shu': None,                           # like count (not collected)
                'ping_lun_id': ping_lun_id,                     # comment id
                'yong_hu_ming': item["uname"],                  # user nickname
                'xing_bie': None,                               # gender (not collected)
                'yong_hu_deng_ji': None,                        # user level (not collected)
                'yong_hu_sheng_fen': item["ip_from"],           # user province
                'do_time': time.time(),                         # crawl time
                'zhan_dian': u'凤凰网',                          # source site
                '_id': ping_lun_id + news_url,                  # primary key
            }
            self.mongo.put_comment(comment_dict)

    def _fetch_json(self, comment_url):
        """Return the decoded payload of ``comment_url`` or None after repeated failures.

        The response is a JavaScript assignment; the variable prefix and the
        last character are removed before JSON decoding.
        """
        for _ in range(self._MAX_ATTEMPTS):
            try:
                return json.loads(
                    requests.get(comment_url, timeout=30).content.replace(
                        'var commentJsonVarStr___=', '')[:-1])
            except Exception as e:
                print("%s %s" % ("获取评论错误:", e))
        return None
コード例 #16
0
class NewsComment(object):
    """Download one page of sohu.com comments for an article and store them."""

    # Number of download attempts per request before giving up.
    _MAX_ATTEMPTS = 5

    def __init__(self):
        """Create the store handle used by ``run``."""
        self.mongo = MongoDB()

    def run(self, news_url, _id, page):
        """Fetch comment page ``page`` of ``news_url`` and store every comment.

        URLs ending in ``shtml`` are ignored.  For other URLs the last path
        segment must look like ``<source_id>_<media_id>``.  The topic id is
        looked up first and then used to request the comment page.  Each
        comment is stored through ``self.mongo.put_comment`` as its own dict.

        :param news_url: article URL; also part of every stored ``_id``.
        :param _id: value sent as ``topic_source_id`` in the topic lookup.
        :param page: page number sent as ``page_no``.
        :return: always ``None``.
        """
        if news_url.endswith('shtml'):
            return
        tow_ids = news_url.split('/')[-1].split('_')
        if len(tow_ids) < 2:
            # Without both ids no comment URL can be built; skip the article.
            print("ERROR: cannot derive the comment ids from url: %r" % (news_url,))
            return
        source_id, media_id = tow_ids[0], tow_ids[1]

        comment_url = 'http://apiv2.sohu.com/api/topic/load?callback=jQuery1124008187733188312629_1539945526218&page_size=10' \
                      '&topic_source_id=%s&page_no=1&media_id=%s&source_id=mp_%s' % (_id, media_id, source_id)
        json_object = self._load_jsonp(comment_url)
        if json_object is None:
            return
        topic = json_object.get(u'jsonObject') or {}
        if u'topic_id' not in topic:
            print("暂时无法获取topic_id")
            return

        comment_URL = 'http://apiv2.sohu.com/api/topic/load?callback=jQuery1124008187733188312629_1539945526218&page_size=10' \
                      '&topic_id=%s&page_no=%s&media_id=%s&source_id=mp_%s' % (
                          topic[u'topic_id'], page, media_id, source_id)
        Json_object = self._load_jsonp(comment_URL)
        if Json_object is None:
            return

        for item in (Json_object.get(u'jsonObject') or {}).get(u'comments') or []:
            passport = item.get(u'passport') or {}
            ping_lun_id = item["comment_id"]
            comment_dict = {
                'ping_lun_nei_rong': item["content"],            # comment text
                'ping_lun_shi_jian': item["create_time"],        # comment time
                'hui_fu_shu': item["reply_count"],               # reply count
                'dian_zan_shu': item["support_count"],           # like count
                'ping_lun_id': ping_lun_id,                      # comment id
                'yong_hu_ming': passport.get(u'nickname'),       # user nickname
                'xing_bie': None,                                # gender (not collected)
                'yong_hu_deng_ji': None,                         # user level (not collected)
                'yong_hu_sheng_fen': item["ip_location"],        # user province
                'do_time': time.time(),                          # crawl time
                'zhan_dian': u'搜狐网',                           # source site
                '_id': str(ping_lun_id) + '|_|' + news_url,      # primary key
            }
            self.mongo.put_comment(comment_dict)

    def _http_get(self, url):
        """Return the raw response body of ``url``."""
        return requests.get(url, timeout=30).content

    @staticmethod
    def _strip_jsonp(body):
        """Return the argument text of a ``callback(...);`` response.

        The text between the first ``(`` and the last ``)`` is taken, so a
        ``);`` inside a comment does not cut the payload short.

        :raises ValueError: when ``body`` contains no parenthesised payload.
        """
        start = body.find('(')
        end = body.rfind(')')
        if start == -1 or end < start:
            raise ValueError('response is not a JSONP call')
        return body[start + 1:end]

    def _load_jsonp(self, url):
        """Return the decoded JSONP payload of ``url`` or None after repeated failures."""
        for _ in range(self._MAX_ATTEMPTS):
            try:
                return json.loads(self._strip_jsonp(self._http_get(url)))
            except Exception as e:
                print("%s %s" % ("获取评论错误:", e))
        return None
コード例 #17
0
 def __init__(self):
     """Create the store handle kept in ``self.mongo``.

     ``MongoDB`` is defined outside this snippet.
     """
     self.mongo = MongoDB()
コード例 #18
0
class NewsMessage(object):
    """Crawl ifeng.com articles, store them and keep a re-check list on disk.

    ``test.txt`` holds ``{"url": [...]}``.  Every entry is a JSON string with
    the article URL (``wen_zhang_wang_zhi``) and the comment count seen when
    the article was stored (``ping_lun_shu_liang``).
    """

    def __init__(self):
        """Create the collaborators and open the state file.

        The state file is created when it does not exist yet, so a first run
        starts with an empty re-check list.
        """
        self.comment = NewsComment()
        self.mongo = MongoDB()
        try:
            self.f = open('test.txt', "r+")
        except IOError:
            self.f = open('test.txt', "w+")
        self.i = 0
        self.url_list = []

    def getNewsTotleUrl(self):
        """Crawl every URL from ``NewsUrl.Run()``, then refresh the state file."""
        for news_url in NewsUrl.Run():
            self.getNewsMessage(news_url)
        self._refresh_tracked_urls()

    def _refresh_tracked_urls(self):
        """Re-check the entries saved by the previous run and rewrite the state file.

        An empty state file is treated as "nothing saved yet".
        """
        saved = self.f.read()
        url_list_before = []
        if saved:
            url_list_before = self._recheck(json.loads(saved)['url'])
        self.file_close(url_list_before)

    def _recheck(self, tracked):
        """Return the saved entries whose comment count changed, re-crawling each.

        Entries whose count is unchanged are dropped.  Every entry is examined
        exactly once; the result is built as a new list so no entry is skipped.
        """
        kept = []
        for url_before in tracked:
            record = json.loads(url_before)
            url_json = record['wen_zhang_wang_zhi']
            if self.getPinglun(url_json) != record['ping_lun_shu_liang']:
                self.getNewsMessage(url_json)
                # NOTE(review): the old entry is kept next to the fresh one that
                # getNewsMessage appends to self.url_list - confirm this is intended.
                kept.append(url_before)
        return kept

    @staticmethod
    def _page_count(total, page_size=20):
        """Return how many pages of ``page_size`` comments cover ``total`` comments."""
        if total <= 0:
            return 0
        return (total + page_size - 1) // page_size

    def _download(self, news_url):
        """Return the body of ``news_url`` or None after 10 failed attempts."""
        for _ in range(10):
            try:
                return requests.get(news_url, timeout=30).content
            except Exception as e:
                print(e)
        return None

    def getNewsMessage(self, news_url):
        """Download one article, store it and fetch all of its comment pages.

        The article is stored only when both a title and a body were found.
        In that case its URL and comment count are appended to
        ``self.url_list`` and every comment page, including a last partial
        one, is requested through ``self.comment.run``.

        :param news_url: article URL; also used as the stored ``_id``.
        :return: always ``None``.
        """
        self.i += 1
        print(self.i)
        print(news_url)
        html = self._download(news_url)
        if html is None:
            return
        tree = etree.HTML(html)

        # Read count and comment count (both 0 when they cannot be fetched)
        comment_number = self.getCommentNumber(news_url)
        if comment_number:
            yue_du_shu, ping_lun_shu_liang = comment_number[0], comment_number[1]
        else:
            yue_du_shu = ping_lun_shu_liang = 0

        # Article title, without the site/channel suffixes
        wen_zhang_biao_ti = pathOneNode(tree, '//title/text()')
        if wen_zhang_biao_ti is not None:
            for noise in ('_', u"娱乐频道", u"凤凰网", u'凤凰体育', u'凤凰财经'):
                wen_zhang_biao_ti = wen_zhang_biao_ti.replace(noise, '')

        # Publish time
        fa_bu_shi_jian = pathOneNode(tree, '//span[@itemprop="datePublished"]/text()')
        if fa_bu_shi_jian is None:
            fa_bu_shi_jian = pathOneNode(tree, '//*[@id="titL"]/p/span/text()')

        # Article source, defaulting to the site name
        wen_zhang_lai_yuan = pathOneNode(tree, '//span[@itemprop="publisher"]/span/a/text()')
        if wen_zhang_lai_yuan is None:
            wen_zhang_lai_yuan = pathOneNode(tree, '//*[@id="artical_sth"]/p/span[3]/span/text()')
        if wen_zhang_lai_yuan is None:
            wen_zhang_lai_yuan = u'凤凰网'

        # Article body: main content first, then the G_listdata titles found
        # in the page source, then the slide description
        wen_zhang_zheng_wen = pathAllNode(tree, '//div[@id="main_content"]')
        if wen_zhang_zheng_wen is None:
            try:
                re_ = "G_listdata=..\n{1,}.*({title:\'[\S\s]+?])"
                re__ = "{title:\'([\S\s]+?)\',"
                text_first = re.findall(re_, html)
                text_conten = re.findall(re__, text_first[0])
                wen_zhang_zheng_wen = "".join(text_conten)
            except Exception:
                try:
                    wen_zhang_zheng_wen = pathAllNode(tree, '//*[@id="slidedesc2"]')
                except Exception:
                    wen_zhang_zheng_wen = None

        # Image links, joined by a single space
        tu_pian_lian_jie = pathGetImg(tree, '//*[@id="main_content"]//img[@alt]/@src')
        if tu_pian_lian_jie:
            tu_pian_lian_jie = " ".join(tu_pian_lian_jie)
        else:
            tu_pian_lian_jie = None

        # Article column (breadcrumb), tried with three class names
        wen_zhang_lan_mu = pathAllNode(tree, '//div[@class="theCurrent cDGray js_crumb"]')
        if wen_zhang_lan_mu is None:
            wen_zhang_lan_mu = pathAllNode(tree, '//div[@class="speNav js_crumb"]')
        if wen_zhang_lan_mu is None:
            wen_zhang_lan_mu = pathAllNode(tree, '//div[@class="cmtNav js_crumb"]')
        try:
            wen_zhang_lan_mu = wen_zhang_lan_mu.replace('>', '->')
        except Exception:
            # No breadcrumb (or not a string): store the value unchanged.
            pass

        message_dict = {
            'wen_zhang_wang_zhi': news_url,              # article URL
            'wen_zhang_biao_ti': wen_zhang_biao_ti,
            'fa_bu_shi_jian': fa_bu_shi_jian,
            'ping_lun_shu_liang': ping_lun_shu_liang,    # comment count
            'wen_zhang_lai_yuan': wen_zhang_lai_yuan,
            'wen_zhang_zheng_wen': wen_zhang_zheng_wen,
            'do_time': time.time(),                      # crawl time
            'zhan_dian': u'凤凰网',                       # source site
            'tu_pian_lian_jie': tu_pian_lian_jie,
            'wen_zhang_lan_mu': wen_zhang_lan_mu,
            'wen_zhang_zuo_zhe': None,                   # author (not collected)
            'guan_jian_ci': None,                        # keywords (not collected)
            'xiang_guan_biao_qian': None,                # related tags (not collected)
            'yue_du_shu': yue_du_shu,                    # read count
            '_id': news_url,                             # primary key
        }
        message_url = {
            'wen_zhang_wang_zhi': news_url,
            'ping_lun_shu_liang': ping_lun_shu_liang,
        }

        if wen_zhang_zheng_wen is not None and wen_zhang_biao_ti is not None:
            self.mongo.put_content(message_dict)
            self.url_list.append(json.dumps(message_url, sort_keys=True, indent=4))
            print(message_dict)
            # Round up so the last, partially filled page is fetched too.
            for page in range(1, self._page_count(ping_lun_shu_liang) + 1):
                self.comment.run(news_url, page)

    def getPinglun(self, news_url):
        """Return the current comment count of ``news_url`` (0 when unavailable)."""
        comment_number = self.getCommentNumber(news_url)
        if comment_number:
            return comment_number[1]
        return 0

    def file_close(self, url_list_before):
        """Overwrite the state file with this run's entries plus ``url_list_before``, then close it."""
        url_dirc = {'url': self.url_list + url_list_before}
        self.f.truncate(0)
        self.f.seek(0, 0)
        self.f.write(json.dumps(url_dirc))
        self.f.close()

    def getCommentNumber(self, news_url):
        """Return ``(read_count, comment_count)`` for ``news_url``.

        :return: the ``join_count`` and ``count`` fields of the comment
            payload, or ``None`` when the payload cannot be downloaded after
            5 attempts or does not carry both fields.
        """
        comment_url = 'http://comment.ifeng.com/get.php?doc_url=%s&format=js&job=1' % news_url
        json_object = None
        for _ in range(5):
            try:
                json_object = json.loads(
                    requests.get(comment_url, timeout=30).content.replace('var commentJsonVarStr___=', '')[:-1])
                break
            except Exception as e:
                print(e)
        if json_object is None:
            return None
        try:
            return json_object['join_count'], json_object['count']
        except (KeyError, TypeError):
            return None