Example #1
 def tryparse(self, str):
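     # Parse one captured ifeng packet: decode the JSON envelope, map the request
     # URL to a section, then dispatch each feed item to Analysis_fenghuang.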
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     result = urllib.parse.urlparse(url)
     params = urllib.parse.parse_qs(result.query, True)
     category = ""  #类型
     #区分栏目
     if url.find('api.3g.ifeng.com/get_pic_list?channel=news') > -1:
         category = "图片"
         categorytag = self.categroytag["%s" % category]
     elif url.find('api.iclient.ifeng.com/ClientNews') > -1:
         category = params['id'][0]
         if category == "SYLB10,SYDT10,SYRECOMMEND" or category == "SYLB10,SYDT10":
             category = "头条"
             categorytag = self.categroytag["%s" % category]
         elif category == "RECOMVIDEO":
             category = "视频"
             categorytag = self.categroytag["%s" % category]
         elif category == "YAOWEN223":
             category = "要闻"
             categorytag = self.categroytag["%s" % category]
         elif category == "VIDEOSHORT":
             category = "小视频"
             categorytag = self.categroytag["%s" % category]
         else:
             SingleLogger().log.debug("有不正确的url1")
             return
     else:
         SingleLogger().log.debug("有不正确的url2")
         return
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     data = json.loads(data)
     lable = ""
     # the "图片" (pictures) section stores its items differently
     if category == "图片":
         item = data['body']['item']
         for y, x in enumerate(item):
             self.Analysis_fenghuang(x, category, crawltime, y, categorytag,
                                     lable)
     else:
         for y1, curobj1 in enumerate(data):
             item = curobj1['item']
             lable = curobj1['type']
             if lable == "top":
                 lable = "置顶"
             else:
                 lable = ""
             for y2, curobj2 in enumerate(item):
                 self.Analysis_fenghuang(curobj2, category, crawltime, y2,
                                         categorytag, lable)
Example #2
    def tryparse(self, str):
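        # Parse one captured Sina news packet: map the channel parameter to a
        # section, then dispatch regular feed entries and ad entries to Analysis_sina.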
        # decode the byte string
        strjson = str.decode("UTF-8", "ignore")
        # parse into a JSON object
        strjson = json.loads(strjson)
        url = strjson['url']
        result = urllib.parse.urlparse(url)
        params = urllib.parse.parse_qs(result.query, True)
        crawltime = strjson['time']

        # work out which section the request belongs to
        category = ""  # section name
        try:
            category = params['channel'][0]
        except:
            SingleLogger().log.debug("=====category======>%s" % params)
        if category == "news_jingyao":
            category = "要闻"
            categorytag = self.categroytag["%s" % category]
        elif category == "news_toutiao":
            category = "推荐"
            categorytag = self.categroytag["%s" % category]
        elif category == "news_video":
            category = "视频"
            categorytag = self.categroytag["%s" % category]
        elif category == "news_pic":
            category = "图片"
            categorytag = self.categroytag["%s" % category]
        else:
            SingleLogger().log.debug("=====有不正确的栏目======>%s" % category)
            SingleLogger().log.debug("有不正确的栏目")
            return

        # extract the captured payload
        data = strjson['data']
        data = json.loads(data)
        # feed
        if data['data']['feed'] and data['data']['feed'] != '':
            feed = data['data']['feed']
            for y1, curobj1 in enumerate(feed):
                self.Analysis_sina(curobj1, category, crawltime, y1,
                                   categorytag)
        # ad
        try:
            if data['data']['ad']['feed'] and data['data']['ad']['feed'] != '':
                ad = data['data']['ad']['feed']
                for y2, curobj2 in enumerate(ad):
                    self.Analysis_sina(curobj2, category, crawltime, y2,
                                       categorytag)
        except:
            pass  # no ad block in this packet
Example #3
 def getHtmlImages(self, url):
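     # Fetch a page and collect the data-src/src attributes of its <img> tags
     # into one comma-separated string.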
     html = Http.get(url)
     soup = BeautifulSoup(html, "html.parser")  # parsed document
     imgStr = ""
     for k in soup.find_all('img'):  # every <img> tag
         try:
             imgStr += k['data-src'] + ","
         except:
             SingleLogger().log.debug("没有找到标签")
         try:
             imgStr += k['src'] + ","
         except:
             SingleLogger().log.debug("没有找到src标签")
     return imgStr
Example #4
 def tryparse(self, str):
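     # Parse one captured packet: map the chlid parameter to a section and
     # dispatch every newslist item to Analysis_ten.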
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     result = urllib.parse.urlparse(url)
     params = urllib.parse.parse_qs(result.query, True)
     category = ""  #类型
     try:
         category = params['chlid'][0]
         if category == "news_news_top":
             category = "要闻"  # key news
             categorytag = self.categroytag["%s" % category]
         elif category == "news_news_lianghui":
             category = "两会"  # Two Sessions
             categorytag = self.categroytag["%s" % category]
         elif category == "news_video_top":
             category = "视频"  # video
             categorytag = self.categroytag["%s" % category]
         elif category == "news_video_main":
             category = "图片"  # pictures
             categorytag = self.categroytag["%s" % category]
         else:
             SingleLogger().log.debug("not one of the four known categories")
             return
     except:
         SingleLogger().log.debug("no chlid parameter")
         return
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     try:
         data = json.loads(data)
     except:
         pass  # data may already be a dict
     # guard against malformed payloads
     try:
         # the "两会" (Two Sessions) section nests its newslist differently
         if category == "两会":
             list = data['idlist'][0]['newslist']
         else:
             list = data['newslist']
     except:
         return
     for y, x in enumerate(list):
         self.Analysis_ten(x, category, crawltime, y, categorytag)
Example #5
 def tryparse(self, str):
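     # Parse one captured packet: map the request URL / channel_id to a section
     # and dispatch items to Analysis_ydzx, skipping the first or last row for
     # some sections.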
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     result = urllib.parse.urlparse(url)
     params = urllib.parse.parse_qs(result.query, True)
     if url.find('news-list-for-best-channel') > -1:
         category = "推荐"  # recommended
         categorytag = self.categroytag["%s" % category]
     elif url.find('news-list-for-hot-channel') > -1:
         category = "要闻"  # key news
         categorytag = self.categroytag["%s" % category]
     elif url.find('news-list-for-channel') > -1:
         channel_id = params['channel_id'][0]
         if channel_id == "21044074964":
             category = "美图"  # beautiful pictures
             categorytag = self.categroytag["%s" % category]
         elif channel_id == "21044074724":
             category = "视频"  # video
             categorytag = self.categroytag["%s" % category]
         elif channel_id == "21044074756":
             category = "图片"  # pictures
             categorytag = self.categroytag["%s" % category]
         else:
             SingleLogger().log.debug(url)
             return
     else:
         SingleLogger().log.debug(url)
         return
     crawltime = int(strjson['time'])
     # extract the captured payload
     data = strjson['data']
     data = json.loads(data)
     list = data['result']
     datalen = len(list)
     for y, x in enumerate(list):
         if category == "要闻" or category == "图片":
             if datalen == y + 1:
                 continue
         elif category == "视频" or category == "美图":
             if y == 0:
                 continue
         self.Analysis_ydzx(x, category, crawltime, y, categorytag)
Example #6
    def get(url, referer=None):
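        # HTTP GET with browser-like headers; returns response.content on
        # HTTP 200 and an empty string otherwise.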
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'gzip',
            'Connection': 'close',
            'referer': referer
            }

        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            SingleLogger().log.debug("==========response.text===========>%s" % response.text)
            SingleLogger().log.debug("==========response.content===========>%s" % response.content)

            return response.content
        else:
            return ""
Example #7
 def tryparse(self, str):
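     # Parse one captured news.go packet: map channelId to a section and
     # dispatch the article lists to Analysis_shxw.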
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     result = urllib.parse.urlparse(url)
     params = urllib.parse.parse_qs(result.query, True)
     channelId = params['channelId'][0]
     if url.find('v6/news.go') > -1:
         if channelId == "1":
             category = "要闻"  # key news
             categorytag = self.categroytag["%s" % category]
         elif channelId == "13557":
             category = "推荐"  # recommended
             categorytag = self.categroytag["%s" % category]
         else:
             SingleLogger().log.debug(url)
             return
     elif url.find('v5/news.go') > -1:
         if channelId == "4313":
             category = "两会"  # Two Sessions
             categorytag = self.categroytag["%s" % category]
         else:
             SingleLogger().log.debug(url)
             return
     else:
         SingleLogger().log.debug(url)
         return
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     data = json.loads(data)
     list = data['recommendArticles']
     for y, x in enumerate(list):
         self.Analysis_shxw(x, category, crawltime, y, categorytag)
     if category == "要闻":
         list = data['trainArticles']['trainList']
         for y, x in enumerate(list):
             self.Analysis_shxw(x, category, crawltime, y, categorytag)
Example #8
 def tryparse(self, str):
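     # Parse one captured packet for the single hard-coded 推荐 (recommended)
     # section and dispatch each item to Analysis_bdxw.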
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     # only one section is captured for now, so it is hard-coded
     category = "推荐"  # recommended
     categorytag = self.categroytag["%s" % category]
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     try:
         data = json.loads(data)
         list = data['data']
         for y, x in enumerate(list):
             self.Analysis_bdxw(x, category, crawltime, y, categorytag)
     except:
         SingleLogger().log.debug("抓取数据不正常")
Example #9
 def tryparse(self, str):
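     # Parse one captured packet: map the category parameter (or, failing that,
     # the URL) to a section and dispatch each item to Analysis_sntt.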
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     result = urllib.parse.urlparse(url)
     params = urllib.parse.parse_qs(result.query, True)
     category = ""  #类型
     try:
         category = params['category'][0]
         if category == "news_hot":
             category = "热点"  # hot
             categorytag = self.categroytag["%s" % category]
         elif category == "hotsoon_video":
             category = "小视频"  # short video
             categorytag = self.categroytag["%s" % category]
         elif category == "video":
             category = "视频"  # video
             categorytag = self.categroytag["%s" % category]
         elif category == "组图":
             category = "图片"  # pictures
             categorytag = self.categroytag["%s" % category]
         elif category == "image_wonderful":
             category = "美图"  # beautiful pictures
             categorytag = self.categroytag["%s" % category]
     except:
         if url.find('wenda/v1/native/feedbrow') > -1:
             category = "问答"  # Q&A
             categorytag = self.categroytag["%s" % category]
         else:
             category = "推荐"  # recommended
             categorytag = self.categroytag["%s" % category]
             SingleLogger().log.debug("no category parameter")
     if category != "两会" and category != "问答" and category != "热点" and category != "视频" and category != "小视频" and category != "推荐" and category != "图片" and category != "美图":
         return
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     data = json.loads(data)
     list = data['data']
     for y, x in enumerate(list):
         self.Analysis_sntt(x, category, crawltime, y, categorytag)
Example #10
 def tryparse(self, str):
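     # Parse one captured packet: map the API path to a section and dispatch
     # items to Analysis_bdxw, with special handling for 推荐 and 图片.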
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     if url.find('api/feed_feedlist') > -1:
         category = "推荐"  # recommended
         categorytag = self.categroytag["%s" % category]
     elif url.find('api/newchosenlist') > -1:
         category = "视频"  # video
         categorytag = self.categroytag["%s" % category]
     elif url.find('api/newslist') > -1:
         category = "两会"  # Two Sessions
         categorytag = self.categroytag["%s" % category]
     elif url.find('api/medianewslist') > -1:
         category = "图片"  # pictures
         categorytag = self.categroytag["%s" % category]
     else:
         SingleLogger().log.debug(url)
         return
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     data = json.loads(data)
     list = data['data']
     if category == "推荐":
         data = list['top']
         for y, x in enumerate(data):
             self.Analysis_bdxw(x, category, crawltime, y,categorytag)
         data = list['news']
         for y, x in enumerate(data):
             self.Analysis_bdxw(x, category, crawltime, y,categorytag)
     else:
         data = list['news']
         datalen = len(data)
         for y, x in enumerate(data):
             # 若类型等于图片时,则最后一次循环时进行跳出
             if category == "图片":
                 if datalen == y + 1:
                     return
             self.Analysis_bdxw(x, category, crawltime, y,categorytag)
Example #11
 def tryparse(self, str):
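     # Parse one captured aweme feed packet and dispatch every aweme_list entry
     # to Analysis_bdxw.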
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     if url.find('/aweme/v1/feed/') > -1:
         category = "推荐"  # recommended
         categorytag = self.categroytag["%s" % category]
     elif url.find('/aweme/v1/nearby/feed/') > -1:
         category = "附近"  # nearby
         categorytag = self.categroytag["%s" % category]
     else:
         SingleLogger().log.debug(url)
         return
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     data = json.loads(data)
     list = data['aweme_list']
     for y, x in enumerate(list):
         self.Analysis_bdxw(x, category, crawltime, y, categorytag)
Example #12
 def tryparse(self, str):
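     # Parse one captured hot-feed packet and dispatch every feeds entry to
     # Analysis_ks.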
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     if url.find('rest/n/feed/hot') > -1:
         category = "发现"  # discover
         categorytag = self.categroytag["%s" % category]
     else:
         SingleLogger().log.debug(url)
         return
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     try:
         data = json.loads(data)
     except:
         pass  # data may already be a dict
     list = data['feeds']
     for y, x in enumerate(list):
         self.Analysis_ks(x, category, crawltime, y, categorytag)
Example #13
    def Analysis_bdxw(self, data, category, crawltime, y, categorytag):
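        # Normalize one feed item (title, cover, detail images and videos) into
        # an sdata record and persist it via self.db.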
        seq = y + 1  # position in the list
        title = ""  # title
        articleid = ""  # article id
        restype = 1  # type: 1 article, 2 pictures, 3 video
        logo = ""  # cover image(s)
        source = ""  # source
        abstract = ""  # summary
        tab = ""  # label
        gallary = ""  # detail images / videos
        content = ""  # body content
        audio = ''  # audio
        video = ''  # video
        try:
            corner_type = data['tips']
            if corner_type == "":
                restype = 1
                content = self.getHtmlBodyInnerText(data['share_url'])
            elif corner_type == "视频":  # video
                restype = 3
                content = self.getHtmlVideos(data['share_url'])
                video = content
            elif corner_type == "广告":  # ad: skip
                return
        except:
            SingleLogger().log.debug("not a video/picture item")
        title = data['title']
        abstract = data['introduction']
        url = data['share_url']
        source = data['source_name']
        articleid = data['id']
        publish_time = data['publish_time']
        img_url = data['cover']
        for i in img_url:
            if i != "":
                logo += i + ","

        gallary = self.getHtmlImages(url)

        # strip a trailing comma from gallary, if present
        if gallary.endswith(","):
            gallary = gallary[:-1]

        # for videos the address was already captured above; skip the extra fetch
        if restype != 3:
            video += self.getHtmlVideos(url)

        crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                     time.localtime(crawltime / 1000))
        publish_timestr = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
        SingleLogger().log.debug(title)
        # strip a trailing comma from the cover-image list, if present
        if logo.endswith(","):
            logo = logo[:-1]
        sdata = {
            "title": title,
            "description": abstract,
            "content": content,
            "source": source,
            "pubtimestr": publish_timestr,
            "pubtime": publish_time,
            "crawltimestr": crawltimestr,
            "crawltime": crawltime,
            "status": 0,
            "shorturl": url,
            "logo": logo,
            "labels": tab,
            "keyword": "",
            "seq": seq,
            "identity": str(articleid),
            "appname": self.appname,
            "app_tag": self.apptag,
            "category_tag": categorytag,
            "category": category,
            "restype": restype,
            "gallary": gallary,  #里面的所有图片地址
            "video": video,
            "audio": audio
        }
        self.db(sdata, articleid, title)
Example #14
 def Analysis_wyxw(self, data, category, crawltime, y, categorytag):
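     # Normalize one NetEase news item, fetching article/photo-set/Q&A detail
     # endpoints on c.m.163.com as needed, then persist it via self.db.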
     title = ""  # 标题
     articleid = ""  # 文章标识
     restype = 1  # 类型 1 图文 2 图片 3 视频
     logo = ""  # 图片
     source = ""  # 来源
     abstract = ""  # 摘要
     content = ""  # 内容
     gallary = ""
     tab = ""  # 标签
     video = ''  # 视频
     audio = ''  # 音频
     try:
         title = data['title']
     except:
         SingleLogger().log.debug("no title")
     publish_time = ""  # publish time (epoch)
     publish_timestr = ""  # publish time (formatted)
     if category == "视频":
         restype = 3
         abstract = data['description']
         logo = data['cover']
         source = data['topicName']
         articleid = data['vid']
     elif category == "图片":
         abstract = data['desc']
         publish_timestr = data['createdate']
         timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
         publish_time = int(time.mktime(timeArray))
         title = data['setname']
         img_list = data['pics']
         for i in img_list:
             if i != "":
                 logo += i + ","
         url = data['seturl']
         articleid = data['setid']
         restype = 2
     else:
         try:
             abstract = data['digest']
         except:
             SingleLogger().log.debug("no digest")
         try:
             logo = data['imgsrc']
         except:
             SingleLogger().log.debug("no imgsrc")
         try:
             source = data['source']
         except:
             SingleLogger().log.debug("no source")
         try:
             if category == "问吧":
                 articleid = data['docid']
             else:
                 articleid = data['id']
         except:
             SingleLogger().log.debug("no id")
         # if the unique id is empty, use the photo-set id: this is a picture item
         if articleid == "":
             articleid = data['photosetID']
             restype = 2
         try:
             TAG = data['TAG']
             if TAG == "视频":  # video
                 restype = 3
         except:
             SingleLogger().log.debug("no TAG")
         try:
             img_list = data['imgnewextra']
             for z in img_list:
                 if z['imgsrc'] != "":
                     logo += "," + z['imgsrc']
         except:
             SingleLogger().log.debug("only one image, or none")
         try:
             tab = data['interest']
             if tab == "S":
                 tab = "置顶"  # pinned
         except:
             SingleLogger().log.debug("no interest")
         if category == "热点":
             try:
                 tab = data['recReason']
                 if tab == "大家都在看":
                     tab = "热"
             except:
                 SingleLogger().log.debug("无recReason")
     seq = y + 1  # position in the list
     if publish_timestr == "":
         try:
             publish_timestr = data['ptime']
             timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
             publish_time = int(time.mktime(timeArray))
         except:
             try:
                 publish_time = data['recTime']
                 publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
             except:
                 SingleLogger().log.debug("neither ptime nor recTime present")
     crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
     # build the article detail URL
     news_detail_url = 'https://c.m.163.com/nc/article/' + str(articleid) + '/full.html'
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
     # restype == 3 means a video item; fetch the video address
     if restype == 3:
         url = 'https://c.m.163.com/nc/video/detail/' + str(articleid) + '.html'
         if category == "视频":
             content = data['mp4_url']
         elif category == "热点":
             news_detail = requests.get(url, headers=headers).json()
             content = news_detail['mp4_url']
         else:
             content = data['videoinfo']['mp4_url']
         video = content
     elif restype == 2:
         if category == "图片":
             strarr = url.split('/')
             first = strarr[4][-4:]
             second = articleid
         else:
             strarr = articleid.split('|')
             first = strarr[0][-4:]
             second = strarr[1]
         news_detail_url = 'https://c.m.163.com/photo/api/set/' + str(first) + '/' + str(second) + '.json'
         news_detail = requests.get(news_detail_url, headers=headers).json()
         url = news_detail['url']
         tdata = news_detail['photos']
         for t in tdata:
             if t['imgurl'] != "":
                 gallary += t['imgurl'] + ","
             content += t['note'] + "<br/>"
     elif restype == 1:
         if category == "问吧":
             news_detail_url = 'https://wenba.m.163.com/wenda/mob/answer/detail.do?uuid=' + str(articleid)
             news_detail = requests.get(news_detail_url, headers=headers).json()['data']
             content = news_detail['answer']['content']
             # collect the Q&A images
             image_list = news_detail['answer']['images']
             for i in image_list:
                 if i['src'] != "":
                     gallary += i['src'] + ","
             url = "https://c.m.163.com/news/ans/" + articleid + ".html"
             # comments endpoint: https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/<replyid field value>/app/comments/newList
         else:
             try:
                 news_detail = requests.get(news_detail_url, headers=headers)
                 news_detail = news_detail.json()[str(articleid)]
             except:
                 time.sleep(20)
                 news_detail = requests.get(news_detail_url, headers=headers).json()[str(articleid)]
             # collect the article images
             image_list = news_detail['img']
             for i in image_list:
                 if i['src'] != "":
                     gallary += i['src'] + ","
             # collect the article videos
             try:
                 video_list = news_detail['video']
                 for v in video_list:
                     if v['url_mp4'] != "":
                         video += v['url_mp4'] + ","
             except:
                 SingleLogger().log.debug("no video")
             content = news_detail['body']
             # append any special-info sections
             try:
                 spinfo_list = news_detail['spinfo']
                 for s in spinfo_list:
                     if s['spcontent'] != "":
                         content += s['spcontent']
             except:
                 pass  # no spinfo block
             url = news_detail['shareLink']
     sdata = {
         "title": title,
         "description": abstract,
         "content": content,
         "source": source,
         "pubtimestr": publish_timestr,
         "pubtime": publish_time,
         "crawltimestr": crawltimestr,
         "crawltime": crawltime,
         "status": 0,
         "shorturl": url,
         "logo": logo,
         "labels": tab,
         "keyword": "",
         "seq": seq,
         "identity": str(articleid),
         "appname": self.appname,
         "app_tag": self.apptag,
         "category_tag":categorytag,
         "category": category,
         "restype": restype,
         "gallary": gallary,
         "video": video,
         "audio": audio
     }
     self.db(sdata, articleid, title)
Example #15
 def Analysis_dftt(self, data, category, crawltime, y, categorytag):
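     # Normalize one dftt feed item: handle hot/special-topic labels, rolling
     # video blocks and regular articles, then persist it via self.db.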
     seq = y + 1  # position in the list
     title = ""  # title
     articleid = ""  # article id
     restype = 1  # type: 1 article, 2 pictures, 3 video
     logo = ""  # cover image(s)
     source = ""  # source
     abstract = ""  # summary
     tab = ""  # label
     gallary = ""  # detail images
     content = ""  # body content
     video = ''  # video
     audio = ''  # audio
     articleid = data['rowkey']
     title = data['topic']
     try:
         hotnews = data['hotnews']
         if hotnews == "1":
             tab = '热门'  # hot
     except:
         SingleLogger().log.debug('not a hot item')
     try:
         issptopic = data['issptopic']
         if issptopic == "1":
             if tab == '':
                 tab = '专题'  # special topic
             else:
                 tab += ',专题'
     except:
         SingleLogger().log.debug('not a special topic')
     try:
         imglist = data['miniimg']
         for i in imglist:
             if i['src'] != "":
                 logo += i['src'] + ","
     except:
         SingleLogger().log.debug('no cover image')
     try:
         url = data['shareurl']
     except:
         url = data['url']
     source = data['source']
     # strip a trailing comma from the cover-image list, if present
     if logo.endswith(","):
         logo = logo[:-1]
     publish_time = data['ctrtime']
     crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
     publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
     # rolling-video block
     additional02 = ""
     try:
         additional02 = data['additional02']
     except:
         SingleLogger().log.debug('not a rolling video')
     if len(additional02) > 0:
         for i in additional02:
             imglist = i['imgjs']
         for i in imglist:
             if i['src'] != "":
                 logo += i['src'] + ","
         source = i['source']
         videos = i['videojs']
         for v in videos:
             if v['src'] != "":
                 video += v['src'] + ","
                 content += v['src'] + ","
         articleid = i['8413148441964056151']
         sdata = {
             "title": title,
             "description": abstract,
             "content": content,
             "source": source,
             "pubtimestr": publish_timestr,
             "pubtime": publish_time,
             "crawltimestr": crawltimestr,
             "crawltime": crawltime,
             "status": 0,
             "shorturl": url,
             "logo": logo,
             "labels": tab,
             "keyword": "",
             "seq": seq,
             "identity": str(articleid),
             "appname": self.appname,
             "app_tag": self.apptag,
             "category_tag": categorytag,
             "category": category,
             "restype": restype,
             "gallary": gallary,
             "video": video,
             "audio": audio
         }
         self.db(sdata, articleid, title)
     else:
         isvideo = data['videonews']
         if isvideo == "1":
             restype = 3
             content = data['video_link']
             video = data['video_link']
         # regular article
         if restype == 1:
             if tab.find('专题') > -1:  # special topic: link only
                 content = url
             else:
                 dataurl = url + ""
                 try:
                     gallary = self.getHtmlImages(url)
                 except:
                     SingleLogger().log.debug("no gallary")
                 try:
                     content = self.getHtmlBodyInnerText(url)
                 except:
                     SingleLogger().log.debug("no article body")
                 try:
                     videos = self.getHtmlVideos(url)
                     if videos != '':
                         video += videos
                 except:
                     SingleLogger().log.debug("no video in the detail page")
         sdata = {
             "title": title,
             "description": abstract,
             "content": content,
             "source": source,
             "pubtimestr": publish_timestr,
             "pubtime": publish_time,
             "crawltimestr": crawltimestr,
             "crawltime": crawltime,
             "status": 0,
             "shorturl": url,
             "logo": logo,
             "labels": tab,
             "keyword": "",
             "seq": seq,
             "identity": str(articleid),
             "appname": self.appname,
             "app_tag": self.apptag,
             "category_tag":categorytag,
             "category": category,
             "restype": restype,
             "gallary": gallary,
             "video": video,
             "audio": audio
         }
         self.db(sdata, articleid, title)
Example #16
 def tryparse(self, str):
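     # Parse one captured NetEase packet: map the API path to a section, pick
     # the matching list from the payload and dispatch it to Analysis_wyxw.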
     # decode the byte string
     strjson = str.decode("UTF-8", "ignore")
     wdid = ""
     # parse into a JSON object
     strjson = json.loads(strjson)
     url = strjson['url']
     result = urllib.parse.urlparse(url)
     params = urllib.parse.parse_qs(result.query, True)
     if url.find('recommend/getSubDocPic') > -1:
         try:
             category = params['from'][0]
             if category == "toutiao":
                 category = "头条"  # headlines
                 categorytag = self.categroytag["%s" % category]
         except:
             category = "热点"  # hot
             categorytag = self.categroytag["%s" % category]
     elif url.find('recommend/getChanListNews') > -1:
         category = "视频"  # video
         categorytag = self.categroytag["%s" % category]
     elif url.find('recommend/getComRecNews') > -1:
         wdidstr = url.split('?')[0].split('/')
         wdid = wdidstr[5]
         category = "问吧"  # Q&A forum
         categorytag = self.categroytag["%s" % category]
     elif url.find('recommend/useraction') > -1:
         if url.find('recommend/useraction?info=') > -1:
             SingleLogger().log.debug(url)
             return
         category = "两会"  # Two Sessions
         categorytag = self.categroytag["%s" % category]
     elif url.find('photo/api') > -1:
         # photo/api/set is a picture detail page, not a list
         if url.find('photo/api/set') > -1:
             SingleLogger().log.debug(url)
             return
         category = "图片"  # pictures
         categorytag = self.categroytag["%s" % category]
     else:
         return
     crawltime = strjson['time']
     # extract the captured payload
     data = strjson['data']
     try:
         data = json.loads(data)
     except:
         SingleLogger().log.debug("无效抓取")
         return
     if category == "热点":
         list = data['推荐']
     elif category == "头条":
         list = data['T1348647909107']
     elif category == "视频":
         list = data['视频']
     elif category == "图片":
         list = data
     elif category == "问吧":
         list = data[wdid]
     for y, x in enumerate(list):
         self.Analysis_wyxw(x, category, crawltime, y, categorytag)
Example #17
    def Analysis_sina(self, data, category, crawltime, y, categorytag):
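        # Normalize one Sina feed item according to its actionType (video,
        # picture, ad, headline or plain news) and persist it via self.db.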
        video = ''  # video
        audio = ''  # audio
        title = ""  # title
        abstract = ""  # summary
        articleid = ""  # article id
        tab = ""  # label
        source = ""  # source
        logo = ""  # list cover image
        url = ""  # short article URL
        actionType = ""  # article display type (see the branches below)
        layoutStyle = ""  # layout style
        publish_time = ""  # publish time (epoch)
        publish_timestr = ""  # publish time (formatted)
        crawltimestr = ""  # capture time
        restype = 1  # type: 1 article, 2 pictures, 3 video
        keywords = ""  # keywords
        content = ""  # body content
        gallary = ""  # image addresses for picture news
        # layout style; 20 is the channel-style row
        layoutStyle = data['layoutStyle']
        # article display type (14 headlines, 3 video, 1 ad, 6 picture, else plain news)
        actionType = data['actionType']
        # layout style 20 with a non-empty action type is a channel entry; skip it
        if layoutStyle == 20 and actionType != '':
            return
        # layout style 36 with a type other than 23 is the key-news carousel; skip it
        if layoutStyle == 36 and actionType != 23:
            return
        # title
        try:
            if data['intro'] and data['intro'] != "":
                title = data['intro']
            if data['longTitle'] and data['longTitle'] != "":
                title = data['longTitle']
            elif data['title'] and data['title'] != "":
                title = data['title']
            else:
                SingleLogger().log.debug('no title (1)')
        except:
            SingleLogger().log.debug('no title (2)')

        # publish time
        try:
            publish_time = data['pubDate']
            if publish_time and publish_time != "":
                publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                                time.localtime(publish_time))
        except:
            SingleLogger().log.debug("no publish time")

        # capture time
        try:
            crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime(crawltime /
                                                        1000))
        except:
            SingleLogger().log.debug("failed to compute capture time")

        # source
        try:
            source = data['source']
        except:
            SingleLogger().log.debug("no source")

        # short URL
        try:
            url = data['link']
        except:
            SingleLogger().log.debug("no short URL")

        # article id
        try:
            articleid = data['newsId']
        except:
            SingleLogger().log.debug("no article id")

        # summary
        try:
            abstract = data['intro']
        except:
            SingleLogger().log.debug('no summary')

        # list cover images
        try:
            images = data['pics']['list']
            for imgobj in images:
                logo += imgobj['pic'] + ","
        except:
            logo = data['pic']

        # label
        try:
            tab = data['showTag']
        except:
            SingleLogger().log.debug('no label')

        # per-type handling
        # video
        if actionType == 3:
            SingleLogger().log.debug('video')
            restype = 3
            # guard against missing fields
            try:
                videoInfo = data['videoInfo']  # video details
                logo = videoInfo['pic']
                content = videoInfo['url']
                video = content
            except:
                SingleLogger().log.debug("failed to read video details")

        # picture set
        elif actionType == 6:
            SingleLogger().log.debug('picture')
            restype = 2
            try:
                logo = data['pic']
                images = data['pics']['list']
                for imgobj in images:
                    gallary += imgobj['pic'] + ","
                    content += imgobj['alt'] + "<br>"
            except:
                SingleLogger().log.debug('failed to read picture details')

        # ad
        elif actionType == 1 and layoutStyle == 3:
            SingleLogger().log.debug('ad')
            content = url

        # "tomorrow's headlines" module
        elif actionType == 14:
            SingleLogger().log.debug('tomorrow headlines')
            mrttList = data['mrttList']
            title = mrttList[0]['alt']
            logo = mrttList[0]['kpic']
            articleid = mrttList[0]['newsId']

        # plain news
        else:
            SingleLogger().log.debug('plain news')
            if tab.find('专题') > -1:  # special topic: link only
                content = url
            else:
                # guard against missing fields
                if url != '':
                    try:
                        gallary = self.getHtmlImages(url)
                    except:
                        SingleLogger().log.debug("no gallary")
                    try:
                        content = self.getHtmlBodyInnerText(url)
                    except:
                        SingleLogger().log.debug("no article body")
                    try:
                        videos = self.getHtmlVideos(url)
                        if videos != '':
                            video += videos
                    except:
                        SingleLogger().log.debug("no video in the detail page")

        sdata = {
            "title": title,
            "description": abstract,
            "content": content,
            "source": source,
            "pubtimestr": publish_timestr,
            "pubtime": publish_time,
            "crawltimestr": crawltimestr,  # capture time
            "crawltime": crawltime,
            "status": 0,
            "shorturl": url,
            "logo": logo,
            "labels": tab,
            "keyword": "",
            "seq": y + 1,  # position in the list
            "identity": str(articleid),
            "appname": self.appname,
            "app_tag": self.apptag,
            "category_tag": categorytag,
            "category": category,  # section
            "restype": restype,
            "gallary": gallary,  # all image addresses found in the article
            "video": video,
            "audio": audio
        }
        SingleLogger().log.debug("=====sina======>%s" % sdata)
        self.db(sdata, articleid, title)
Example #18
 def add_ydzx_db(self, data, category, crawltime, y, categorytag):
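     # Normalize one Yidian Zixun item (video, slide-show, picture or ad
     # variants), fetch the article body when needed, then persist via self.db.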
     seq = y + 1  # position in the list
     title = ""  # title
     articleid = ""  # article id
     restype = 1  # type: 1 article, 2 pictures, 3 video
     logo = ""  # cover image(s)
     source = ""  # source
     abstract = ""  # summary
     tab = ""  # label
     gallary = ""  # detail images
     IsArtID = False  # whether this is an ad item
     content = ""  # body content
     publish_timestr = ""
     publish_time = ""
     url = ""  # target URL
     video = ''  # video
     audio = ''  # audio
     title = data['title']
     source = data['source']
     try:
         abstract = data['summary']
     except:
         SingleLogger().log.debug("no summary")
     try:
         articleid = data['docid']
     except:
         SingleLogger().log.debug("ad item")
         articleid = data['aid']
         if title == "":
             title = abstract
     try:
         image_list = data['image_urls']
         for i in image_list:
             if i != "":
                 logo += i + ","
     except:
         SingleLogger().log.debug("no image")
     try:
         card_label = data['card_label']['text']
         tab = card_label
     except:
         SingleLogger().log.debug("no label")
     try:
         url = data['url']
     except:
         SingleLogger().log.debug("no url")
     try:
         publish_timestr = data['date']
         timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
         publish_time = int(time.mktime(timeArray))
     except:
         SingleLogger().log.debug("no date")
     try:
         content_type = data['content_type']
         if content_type == "video":
             restype = 3
             content = data['video_url']
             video = content
         elif content_type == "slides":
             restype = 2
             gallery_items = data['gallery_items']
             for g in gallery_items:
                 if g['img'] != "":
                     gallary += g['img'] + ","
                 if g['desc'] != "":
                     content += g['desc'] + "<br/>"
         elif content_type == "picture":
             logo = data['image']
     except:
         ctype = data['ctype']
         if ctype == "advertisement":
             IsArtID = True
             tab = data['tag']
             SingleLogger().log.debug("ad")
     crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                  time.localtime(crawltime / 1000))
     # build the article detail URL
     news_detail_url = 'https://a1.go2yd.com/Website/contents/content?docid=' + str(
         articleid)
     headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
     }
     # for non-ad items, fetch the full article details (plain articles only)
     if not IsArtID:
         if restype == 1:
             news_detail = requests.get(news_detail_url).json()['documents']
             if category == "美图":
                 news_title = news_detail[0]['title']
                 if news_title != "":
                     title = news_title
                     abstract = news_detail[0]['summary']
             content = news_detail[0]['content']
     # strip a trailing comma from the cover-image list, if present
     if logo.endswith(","):
         logo = logo[:-1]
     sdata = {
         "title": title,
         "description": abstract,
         "content": content,
         "source": source,
         "pubtimestr": publish_timestr,
         "pubtime": publish_time,
         "crawltimestr": crawltimestr,
         "crawltime": crawltime,
         "status": 0,
         "shorturl": url,
         "logo": logo,
         "labels": tab,
         "keyword": "",
         "seq": seq,
         "identity": str(articleid),
         "appname": self.appname,
         "app_tag": self.apptag,
         "category_tag": categorytag,
         "category": category,
         "restype": restype,
         "gallary": gallary,
         "video": video,
         "audio": audio
     }
     self.db(sdata, articleid, title)
Example #19
 def Analysis_shxw(self, data, category, crawltime, y, categorytag):
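     # Normalize one Sohu news item, resolving video URLs through the
     # s1.api.tv.itc.cn video API, then persist it via self.db.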
     video = ''  # video
     audio = ''  # audio
     seq = y + 1  # position in the list
     title = ""  # title
     articleid = ""  # article id
     restype = 1  # type: 1 article, 2 pictures, 3 video
     logo = ""  # cover image(s)
     source = ""  # source
     abstract = ""  # summary
     tab = ""  # label
     gallary = ""  # detail images
     IsArtID = False  # whether this is an ad item
     content = ""  # body content
     publish_timestr = ""
     publish_time = ""
     url = ""  # target URL
     try:
         articleid = data['newsId']
     except:
         SingleLogger().log.debug("ad")
     try:
         title = data['title']
     except:
         SingleLogger().log.debug("no title")
     try:
         source = data['media']
     except:
         SingleLogger().log.debug("no source")
     try:
         abstract = data['description']
     except:
         SingleLogger().log.debug("no description")
     try:
         tab = data['recomReasons']
     except:
         SingleLogger().log.debug("no label")
     try:
         img_list = data['pics']
         for i in img_list:
             if i != "":
                 logo += i + ","
     except:
         SingleLogger().log.debug("no image")
     templateType = data['templateType']
     if templateType == 14:
         IsArtID = True
         tab = "广告"  # ad
         articleid = data['data']['adid']
         title = data['data']['resource']['text']
         logo = data['data']['resource1']['file']
         source = data['data']['resource2']['text']
     elif templateType == 37:
         restype = 3
     try:
         publish_time = data['time']
         publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
     except:
         SingleLogger().log.debug("no time")
     crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
     news_detail_url = 'https://zcache.k.sohu.com/api/news/cdn/v5/article.go/' + str(articleid) + '/0/0/0/3/1/18/40/5/1/1/1522743246021.json'
     headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
     if not IsArtID:
         if restype == 3:
             vid = data['vid']
             if vid != 0:
                 news_detail_url = 'https://s1.api.tv.itc.cn/v4/video/info/' + str(vid) + '.json?site=2&api_key=695fe827ffeb7d74260a813025970bd5'
                 news_detail = requests.get(news_detail_url, headers=headers).json()
                 content = news_detail['data']['download_url']
                 url = news_detail['data']['url_html5']
                 video = content
         else:
             news_detail = requests.get(news_detail_url, headers=headers).json()
             content = news_detail['content']
             gallary_list = news_detail['photos']
             for g in gallary_list:
                 if g['pic'] != "":
                     gallary += g['pic'] + ","
             tvInfos = news_detail['tvInfos']
             for t in tvInfos:
                 if t['tvUrl'] != "":
                     # if this is not a direct video URL, resolve it through the video API by vid
                     if t['tvUrl'] == "urlNor&prod=news&prod=h5,":
                         vid = t['vid']
                         news_detail_url = 'https://s1.api.tv.itc.cn/v4/video/info/' + str(vid) + '.json?site=2&api_key=695fe827ffeb7d74260a813025970bd5'
                         news_detail = requests.get(news_detail_url, headers=headers).json()
                         video += news_detail['data']['download_url'] + ","
                     else:
                         video += t['tvUrl'] + ","
     sdata = {
         "title": title,
         "description": abstract,
         "content": content,
         "source": source,
         "pubtimestr": publish_timestr,
         "pubtime": publish_time,
         "crawltimestr": crawltimestr,
         "crawltime": crawltime,
         "status": 0,
         "shorturl": url,
         "logo": logo,
         "labels": tab,
         "keyword": "",
         "seq": seq,
         "identity":str(articleid),
         "appname": self.appname,
         "app_tag": self.apptag,
         "category_tag": categorytag,
         "category": category,
         "restype": restype,
         "gallary": gallary,
         "video": video,
         "audio": audio
     }
     self.db(sdata, articleid, title)
Example #20
    def Analysis_bdxw(self, data, category, crawltime, y, categorytag):
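        # Normalize one RECOMMENDED_MESSAGE feed item (video, picture set or
        # plain text) and persist it via self.db.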

        seq = y + 1  # position in the list
        title = ""  # title
        articleid = ""  # article id
        restype = 1  # type: 1 article, 2 pictures, 3 video
        logo = ""  # cover image(s)
        source = ""  # source
        abstract = ""  # summary
        tab = ""  # label
        gallary = ""  # images inside the article
        content = ""  # body content
        video = ''  # video
        audio = ''  # audio

        if data['type'] == 'RECOMMENDED_MESSAGE':
            articleid = data['id']
            # this is a news item
            try:
                # probe for a video payload
                videofind = data['item']['video']
                isvideo = 1
            except:
                isvideo = 0
            if isvideo == 1:
                # video item
                restype = 3
                # cover image
                logo = data['item']['video']['image']['picUrl']
                # # short URL
                # if data['item']['linkInfo']:
                #     url = data['item']['linkInfo']['originalLinkUrl']
                #     video = self.getVideo(url)
                #     if video and video != '':
                #         gallary += video + ","
            elif len(data['item']['pictures']) > 0:
                # non-empty pictures means an image post
                restype = 2
                # collect the cover images
                for picUrl in data['item']['pictures']:
                    logo += picUrl['picUrl'] + ","
                    gallary += picUrl['picUrl'] + ","
            else:
                # plain text
                restype = 1
            # title and source
            title = data['item']['topic']['content']
            source = data['item']['topic']['content']
            # content
            content = data['item']['content']
            if restype == 3:
                video = content
            # timestamps
            publish_timestr = data['item']['createdAt'][:-5].replace("T", " ")
            timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
            publish_time = int(time.mktime(timeArray))  # string to epoch timestamp
            crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime(crawltime / 1000))
            # short URL
            if data['item']['linkInfo']:
                url = data['item']['linkInfo']['originalLinkUrl']
            # strip trailing commas from logo and gallary, if present
            if logo.endswith(","):
                logo = logo[:-1]
            if gallary.endswith(","):
                gallary = gallary[:-1]
        else:
            return

        SingleLogger().log.debug(title)
        sdata = {
            "title": title,
            "description": abstract,
            "content": content,
            "source": source,
            "pubtimestr": publish_timestr,
            "pubtime": publish_time,
            "crawltimestr": crawltimestr,
            "crawltime": crawltime,
            "status": 0,
            "shorturl": url,
            "logo": logo,
            "labels": tab,
            "keyword": "",
            "seq": seq,
            "identity": str(articleid),
            "appname": self.appname,
            "app_tag": self.apptag,
            "category_tag": categorytag,
            "category": category,
            "restype": restype,
            "gallary": gallary,  #里面的所有图片地址
            "video": video,
            "audio": audio
        }
        self.db(sdata, articleid, title)
Example #21
 def Analysis_bdxw(self, data, category, crawltime, y, categorytag):
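     # Normalize one feed item with inline content blocks (text and images)
     # and persist it via self.db.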
     seq = y + 1  # position in the list
     title = ""  # title
     articleid = ""  # article id
     restype = 1  # type: 1 article, 2 pictures, 3 video
     logo = ""  # cover image(s)
     source = ""  # source
     abstract = ""  # summary
     tab = ""  # label
     gallary = ""  # detail images
     content = ""  # body content
     video = ''  # video
     audio = ''  # audio
     try:
         ctag = data['ctag']["name"]
         if ctag == "专题":  # special topic: skip
             return
         elif ctag == "置顶":  # pinned
             tab = ctag
     except:
         SingleLogger().log.debug("no label")
     title = data['title']
     abstract = data['abs']
     url = data['url']
     source = data['site']
     articleid = data['nid']
     publish_time = data['sourcets']
     img_url = data['imageurls']
     for i in img_url:
         if i['url'] != "":
             logo += i['url'] + ","
     try:
         corner_type = data['corner_type']
         if corner_type == "video":
             restype = 3
             content = data['video']['url']
             video = content
         elif corner_type == "image":
             restype = 2
     except:
         SingleLogger().log.debug("not a video/picture item")
     if restype != 3:
         content_data = data['content']
         for c in content_data:
             if c['type'] == "image":
                 gallary += c['data']['original']['url'] + ","
             elif c['type'] == "text":
                 content += c['data'] + "<br/>"
     crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
     publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
     SingleLogger().log.debug(title)
     # strip a trailing comma from the cover-image list, if present
     if logo.endswith(","):
         logo = logo[:-1]
     sdata = {
         "title": title,
         "description": abstract,
         "content": content,
         "source": source,
         "pubtimestr": publish_timestr,
         "pubtime": publish_time,
         "crawltimestr": crawltimestr,
         "crawltime": crawltime,
         "status": 0,
         "shorturl": url,
         "logo": logo,
         "labels": tab,
         "keyword": "",
         "seq": seq,
         "identity": str(articleid),
         "appname": self.appname,
         "app_tag": self.apptag,
         "category_tag":categorytag,
         "category": category,
         "restype": restype,
         "gallary": gallary,
         "video": video,
         "audio": audio
     }
     self.db(sdata, articleid, title)
Example #22
    def Analysis_ten(self, data, category, crawltime, y, categorytag):
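        # Normalize one Tencent news item according to its articletype
        # (hot picks, video, live stream, plain or picture news).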
        video = ''  # video
        audio = ''  # audio
        # title
        title = ""
        try:
            title = data['title']
        except:
            SingleLogger().log.debug('no title')
        # summary
        abstract = ""
        try:
            abstract = data['abstract']
        except:
            SingleLogger().log.debug("no summary")
        # article id
        articleid = ""
        try:
            articleid = data['id']
            # if the id is empty, skip this item
            if articleid == "":
                return
        except:
            SingleLogger().log.debug("no article id")
        menulable = ""
        #当前栏目标签
        try:
            menulable = data['uinname']
        except:
            SingleLogger().log.debug("无栏目标签")
        # cover image
        logo = ""
        # source
        source = ""
        # article URL
        url = ""
        try:
            source = data['source']
        except:
            SingleLogger().log.debug("no source")
        try:
            url = data['url']  # share URL
            if not (url) or url == "":
                url = data['short_url']  # fallback share URL
                if not (url) or url == "":
                    url = data['surl']  # fallback share URL
        except:
            SingleLogger().log.debug("no article URL")
        # publish time (epoch)
        publish_time = ""
        # publish time (formatted)
        publish_timestr = ""
        try:
            publish_time = data['timestamp']
            if publish_time and publish_time != "":
                publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                                time.localtime(publish_time))
        except:
            SingleLogger().log.debug("no publish time")

        # capture time
        crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                     time.localtime(crawltime / 1000))
        # type: 1 article, 2 pictures, 3 video
        restype = 1
        # position in the list
        seq = y + 1
        # keywords
        keywords = ""
        # label
        tab = ""
        try:
            zdTab = data['labelList'][0]['word']
            if tab == "":
                tab = zdTab
            else:
                tab += "," + zdTab
        except:
            SingleLogger().log.debug("无标签")
        try:
            zdTab = data['up_labelList'][0]['word']
            if tab != '':
                tab += ',' + zdTab
            else:
                tab = zdTab
        except:
            SingleLogger().log.debug("无特殊标签")
        # article display type (528 hot picks, 88 Q&A)
        articletype = ""
        try:
            articletype = data['articletype']
        except:
            SingleLogger().log.debug("no articletype")
            return

        # body content
        content = ""

        # image addresses for picture news
        gallary = ""

        # list-image display type (1 none, 0 one small, 3 one large, 2 three small)
        picShowType = ""
        try:
            picShowType = data['picShowType']
        except:
            SingleLogger().log.debug("no picShowType")

        # logo: list images (not used for the video board)
        if picShowType == 1:  # no image
            # the list row has no image
            logo = ""
        else:
            # take values from thumbnails_qqnews, with fallbacks
            try:
                image_list = data['thumbnails_qqnews']
                if not (image_list):
                    image_list = data['thumbnails_qqnews_photo']
                    if not (image_list):
                        image_list = data['thumbnails']
                logo = ""
                for i in image_list:
                    if i != "":
                        logo += i + ","
            except:
                SingleLogger().log.debug("no list image; possibly imageless or a video")

        if articletype == "528" or articletype == "525":  #528,525-热点精选
            if tab == "":
                tab = "热点精选"
            else:
                tab += ",热点精选"
            try:
                #热点新闻,取里面第一个列表的
                childList = data['newsModule']['newslist'][0]
                title = childList['title']
                source = childList['source']
                abstract = childList['abstract']  # 摘要
                logo = childList['thumbnails_qqnews'][0]
                if not (logo):
                    logo = childList['thumbnails_qqnews_photo'][0]
                    if not (logo):
                        logo = childList['thumbnails'][0]
                try:
                    url = data['url']  # 分享地址
                    if not (url) or url == "":
                        url = data['short_url']  # 分享地址
                        if not (url) or url == "":
                            url = data['surl']  # 分享地址
                except:
                    SingleLogger().log.debug("无资讯地址")
                content = url
            except:
                SingleLogger().log.debug("该条热点消息无内容")

        elif articletype == "4" or articletype == "101":  #视频新闻 4,101
            restype = 3  #视频
            try:
                videoData = data["video_channel"]["video"]
                logo = videoData["img"]
                content = videoData["playurl"]
                video = content
            except:
                SingleLogger().log.debug('无视频')

        elif articletype == "533":  #直播
            restype = 3  # 视频
            if tab == "":
                tab = "直播"
            else:
                tab += ",直播"
            liveVideo = data["newsModule"]["newslist"][0]
            title = liveVideo['title']
            source = liveVideo['source']
            abstract = liveVideo['abstract']
            logo = liveVideo['thumbnails_qqnews'][0]
            if not (logo):
                logo = liveVideo['thumbnails_qqnews_photo'][0]
                if not (logo):
                    logo = liveVideo['thumbnails'][0]
            try:
                publish_time = liveVideo['timestamp']  # 发布时间 时间戳
                if publish_time and publish_time != "":
                    publish_timestr = time.strftime(
                        "%Y-%m-%d %H:%M:%S",
                        time.localtime(publish_time))  # 发布时间 标准时间
            except:
                SingleLogger().log.debug("无发布时间")
            try:
                url = liveVideo['url']  # 分享地址
                if not (url) or url == "":
                    url = liveVideo['short_url']  # 分享地址
                    if not (url) or url == "":
                        url = liveVideo['surl']  # 分享地址
                try:
                    video_channel = liveVideo['video_channel']['video'][
                        'playurl']
                    content = video_channel
                except:
                    content = url
                video = content
            except:
                SingleLogger().log.debug("无分享地址")

        elif articletype == "526":  #标签列表,不是新闻return
            return
            # 普通新闻 图片新闻
        elif articletype == "0" or articletype == "12" or articletype == "1":
            news_detail_url = 'http://r.inews.qq.com/getSimpleNews/1.3.1_qqnews_5.5.90/' + str(
                menulable) + '/' + str(articleid)
            news_detail = rq.get(news_detail_url).json()
            if articletype == "0" or articletype == "12":
                content = news_detail['content']['text']
                attribute = news_detail['attribute']
                for a in attribute:
                    try:
                        # 判断gallary是否存在此链接地址,存在则跳出此次循环,不存在则进行拼接
                        if gallary.find(attribute[a]["url"]) > -1:
                            continue
                        else:
                            gallary += attribute[a]["url"] + ","
                    except:
                        try:
                            video += attribute[a]["playurl"] + ","
                        except:
                            try:
                                audio += attribute[a]["murl"] + ","
                            except:
                                SingleLogger().log.debug(json.dumps(attribute))
            elif articletype == "1":
                restype = 2  #图片新闻
                attribute = news_detail['attribute']
                for a in attribute:
                    try:
                        # 判断gallary是否存在此链接地址,存在则跳出此次循环,不存在则进行拼接
                        if gallary.find(attribute[a]["url"]) > -1:
                            continue
                        else:
                            gallary += attribute[a]["url"] + ","
                            content += attribute[a]['desc'] + "<br/>"
                    except:
                        SingleLogger().log.debug(json.dumps(attribute))
            #专题新闻
        elif articletype == "100":
            content = url
        sdata = {
            "title": title,
            "description": abstract,
            "content": content,
            "source": source,
            "pubtimestr": publish_timestr,
            "pubtime": publish_time,
            "crawltimestr": crawltimestr,  #抓包时间
            "crawltime": crawltime,
            "status": 0,
            "shorturl": url,
            "logo": logo,
            "labels": tab,
            "keyword": "",
            "seq": seq,
            "identity": str(articleid),
            "appname": self.appname,
            "app_tag": self.apptag,
            "category_tag": categorytag,
            "category": category,  #栏目
            "restype": restype,  #类型
            "gallary": gallary,
            "video": video,
            "audio": audio
        }
        self.db(sdata, articleid, title)
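
The fallback chain over thumbnails_qqnews, thumbnails_qqnews_photo, and thumbnails is repeated three times in this example. It could be consolidated with a small helper; the sketch below assumes each key holds a list of URL strings, as in the code above, and first_thumbnail is a hypothetical name, not part of the original class:

def first_thumbnail(item):
    # Hypothetical helper: walk the same keys in the same order as the
    # parser above and return the first non-empty thumbnail URL.
    for key in ('thumbnails_qqnews', 'thumbnails_qqnews_photo', 'thumbnails'):
        for u in (item.get(key) or []):
            if u:
                return u
    return ""

# e.g. logo = first_thumbnail(childList)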
Exemple #23
    def Analysis_bdxw(self, data, category, crawltime, y, categorytag):
        seq = y + 1  # ordering
        title = ""  # title
        articleid = ""  # article ID
        restype = 1  # type: 1 = article, 2 = photo set, 3 = video
        logo = ""  # list image(s)
        source = ""  # source
        abstract = ""  # abstract
        tab = ""  # label
        gallary = ""  # detail images / video
        content = ""  # content
        audio = ''  # audio
        url = data['url']
        # the headline channel has so far only carried image-and-text news
        restype = 1
        content = self.getHtmlBodyInnerText(url)

        title = data['topic']

        source = data['source']
        articleid = data['rowkey']

        publish_time = data['date']
        img_url = data['miniimg']
        for i in img_url:
            if i['src'] != "":
                logo += i['src'] + ","

        gallary = self.getHtmlImages(url)
        video = self.getHtmlVideos(url)

        crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                     time.localtime(crawltime / 1000))
        publish_timestr = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
        SingleLogger().log.debug(title)
        # strip the trailing comma from the list cover-image string, if present
        logo = logo.rstrip(',')
        sdata = {
            "title": title,
            "description": abstract,
            "content": content,
            "source": source,
            "pubtimestr": publish_timestr,
            "pubtime": publish_time,
            "crawltimestr": crawltimestr,
            "crawltime": crawltime,
            "status": 0,
            "shorturl": url,
            "logo": logo,
            "labels": tab,
            "keyword": "",
            "seq": seq,
            "identity": str(articleid),
            "appname": self.appname,
            "app_tag": self.apptag,
            "category_tag": categorytag,
            "category": category,
            "restype": restype,
            "gallary": gallary,  #里面的所有图片地址
            "video": video,
            "audio": audio
        }
        self.db(sdata, articleid, title)
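
Since the miniimg loop above only exists to build a comma-separated string, a join over the non-empty sources would avoid the trailing-comma trim entirely; a minimal sketch, assuming the same miniimg list-of-dicts layout as the code above:

# Hypothetical rewrite of the logo-building loop: join produces no
# trailing comma, so the rstrip step becomes unnecessary.
logo = ",".join(i['src'] for i in data['miniimg'] if i['src'])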
Exemple #24
 def Analysis_sntt(self, x, category, crawltime, y, categorytag):
     data = x['content']
     video = ''  # video
     audio = ''  # audio
     data = json.loads(data)
     title = ""  # title
     articleid = ""  # article ID
     restype = 1  # type: 1 = article, 2 = photo set, 3 = video
     logo = ""  # image(s)
     source = ""  # source
     abstract = ""  # abstract
     try:
         abstract = data['abstract']  # abstract
     except:
         SingleLogger().log.debug("no abstract")
     seq = y + 1  # ordering
     keywords = ""  # keywords
     gallary = ""  # image URLs for photo stories
     IsV = False  # whether this is a big-V (verified account) post
     IsArtID = False  # whether the item lacks a unique ID and is an ad
     url = ""  # article URL
     content = ""  # content
     publish_time = ""  # publish time
     publish_timestr = ""  # publish time (formatted)
     hot = 0  # trending flag
     tab = ""  # label
     try:
         tab = data['label']
     except:
         tab = ""
         SingleLogger().log.debug("not a pinned article")
     try:
         url = data['display_url']  # share URL
     except:
         try:
             url = data['share_url']  # share URL
         except:
             try:
                 url = data['url']  # share URL
             except:
                 SingleLogger().log.debug("no article URL")
     try:
         publish_time = data['publish_time']
     except:
         SingleLogger().log.debug("no publish time")
     # beautiful-pictures (美图) channel
     if category == "美图":
         try:
             title = data['content']
         except:
             SingleLogger().log.debug("no text, image only")
         articleid = data['group_id']  # article ID
         logo = data['large_image']['url']
         publish_time = data['create_time']
     # short-video (小视频) channel
     elif category == "小视频":
         xdata = data['raw_data']
         title = xdata['title']
         articleid = xdata['group_id']
         image_list = xdata['large_image_list']
         for w in image_list:
             if w['url'] != "":
                 logo += w['url'] + ","
         url = xdata['share']['share_url']
         publish_time = xdata['create_time']
         restype = 3
     # Q&A (问答) channel
     elif category == "问答":
         seq = y
         try:
             qdata = data['question']
             qdata = json.loads(qdata)  # parse the nested JSON
             title = qdata['title']
             articleid = qdata['qid']
             publish_time = qdata['create_time']
             image_list = qdata['content']['large_image_list']
             for w in image_list:
                 if w['url'] != "":
                     logo += w['url'] + ","
         except:
             return
     else:
         try:
             title = data['title']
         except:
             SingleLogger().log.debug("no title")
         try:
             articleid = data['group_id']  # article ID
         except:
             try:
                 articleid = data['thread_id']  # article ID
                 IsV = True
                 try:
                     # take the article image list
                     large_image_list = data['large_image_list']
                     for i, j in enumerate(large_image_list):
                         if j['url'] != "":
                             # the first 3 images become list images
                             if i < 3:
                                 logo += j['url'] + ","
                             gallary += j['url'] + ","
                 except:
                     SingleLogger().log.debug("big-V post has no images")
             except:
                 # an ad without a unique ID gets a generated one
                 if tab == "广告":
                     articleid = uuid.uuid1()  # article ID
                     IsArtID = True
                 SingleLogger().log.debug("no unique ID:")
         # has_video == true marks video news
         if data['has_video']:
             restype = 3
         else:
             try:
                 keywords = data['keywords']  # keywords
             except:
                 SingleLogger().log.debug("no keywords")
         try:
             logo = data['middle_image']['url']
         except:
             SingleLogger().log.debug("no cover image")
     # skip the item if the article ID is empty
     if articleid == "":
         return
     try:
         source = data['source']
     except:
         SingleLogger().log.debug("no source")
     # photo news?
     try:
         gallary_flag = data['gallary_flag']
         if gallary_flag == 1:
             restype = 2
     except:
         SingleLogger().log.debug("not photo news")
     try:
         hot = data['hot']  # 0 = not trending, 1 = trending
     except:
         SingleLogger().log.debug("not trending")
     if hot == 1:
         if tab == "":
             tab = "热"
         else:
             tab = tab + ",热"
     # restype == 1 (regular news) may carry multiple list images
     if restype == 1:
         try:
             image_list = data['image_list']
             logo = ""
             for i in image_list:
                 if i['url'] != "":
                     logo += i['url'] + ","
         except:
             SingleLogger().log.debug("only one image or none")
     if publish_time != "":
         publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime(publish_time))
     crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                  time.localtime(crawltime / 1000))
     # not an ad and has a unique ID: fetch the article details
     if not IsArtID:
         if category == "问答":
             news_detail_url = 'http://is.snssdk.com/wenda/v2/question/brow/?device_id=48679316565'
             postData = {'qid': str(articleid), 'count': 30}
             # the response is the article JSON
             news_detail = requests.post(news_detail_url,
                                         data=postData).json()
             qdata = news_detail['question']
             content = qdata['content']['text']
             url = qdata['share_data']['share_url']  # share URL
             adata = news_detail['data']
             gallary = ""
             for a in adata:
                 content += "<br/>" + a['answer']['content_abstract']['text']
                 image_list = a['answer']['content_abstract'][
                     'large_image_list']
                 for w in image_list:
                     if w['url'] != "":
                         gallary += w['url'] + ","
             articleid = uuid.uuid1()  # article ID
         else:
             # build the detail-API URL
             news_detail_url = 'http://a3.bytecdn.cn/article/content/15/2/' + str(
                 articleid) + '/' + str(articleid) + '/1/'
             if IsV:
                 news_detail_url = 'http://lf.snssdk.com/ugc/thread/detail/v2/content/?thread_id=' + str(
                     articleid)
                 # the response is the article JSON
                 news_detail = requests.get(news_detail_url).json()
             else:
                 # the response is the article JSON
                 news_detail = requests.get(news_detail_url).json()['data']
             try:
                 content = news_detail['content']
                 # restype == 2 (photo news): collect the image URLs
                 if restype == 2:
                     gallery = news_detail['gallery']
                     for z in gallery:
                         gallary += z['sub_image']['url'] + ","
                 elif restype == 3:
                     video = content
             except:
                 SingleLogger().log.debug("no content / failed to read images")
     sdata = {
         "title": title,
         "description": abstract,
         "content": content,
         "source": source,
         "pubtimestr": publish_timestr,
         "pubtime": publish_time,
         "crawltimestr": crawltimestr,
         "crawltime": crawltime,
         "status": 0,
         "shorturl": url,
         "logo": logo,
         "labels": tab,
         "keyword": keywords,
         "seq": seq,
         "identity": str(articleid),
         "appname": self.appname,
         "app_tag": self.apptag,
         "category_tag": categorytag,
         "category": category,
         "restype": restype,
         "gallary": gallary,
         "video": video,
         "audio": audio
     }
     self.db(sdata, articleid, title)
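
The nested try/except chains above (e.g. display_url, then share_url, then url) can be flattened with a small lookup helper. first_key is a hypothetical name, sketched against plain dicts, not part of the original class:

def first_key(d, *keys, default=""):
    # Hypothetical helper: return the first present, non-empty value
    # among the given keys, mirroring the nested fallbacks above.
    for k in keys:
        v = d.get(k)
        if v:
            return v
    return default

# e.g. url = first_key(data, 'display_url', 'share_url', 'url')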
Exemple #25
    def Analysis_fenghuang(self, data, category, crawltime, y, categorytag,
                           lable):
        title = ""  #标题
        abstract = ""  #摘要
        articleid = ""  #文章标识
        tab = lable  #标签
        source = ""  #来源
        logo = ""  #列表图片
        url = ""  #文章短地址
        articletype = ""  #文章展示类型(phvideo-视频 doc-图文)
        publish_time = ""  #发布时间 时间戳
        publish_timestr = ""  #发布时间 标准时间str
        crawltimestr = ""  #抓包时间
        restype = 1  #类型 1 图文 2 图片 3 视频
        seq = y + 1  #排序
        keywords = ""  #关键字
        content = ""  #内容
        gallary = ""  #图片资讯图片地址
        detailJk = ""  #详情接口
        style = ""  # 是否为热点
        video = ''  # 视频
        audio = ''  # 音频
        # recommended-follow lists are not news; return immediately
        try:
            articletype = data['type']  # article display type
            if articletype == 'marquee2':
                return
        except:
            SingleLogger().log.debug("no articletype")
            return

        # title
        try:
            title = data['title']
        except:
            SingleLogger().log.debug('no title')

        # labels
        try:
            style = data['style']['recomReason']['reasonName']
            if tab == "":
                tab = style
            else:
                tab += "," + style
        except:
            SingleLogger().log.debug('item has no label')
        try:
            style = data['style']['attribute']
            if tab == "":
                tab = style
            else:
                tab += "," + style
        except:
            SingleLogger().log.debug('item has no label')

        # publish time
        try:
            publish_timestr = data['updateTime']  # publish time (formatted string)
            timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
            publish_time = int(time.mktime(timeArray))  # string to epoch seconds
        except:
            SingleLogger().log.debug("no publish time")
            # fall back to the capture time when the publish time is missing
            publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime(crawltime / 1000))
            publish_time = int(crawltime / 1000)  # crawltime is in ms; store epoch seconds

        # capture time
        try:
            crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime(crawltime /
                                                        1000))
        except:
            SingleLogger().log.debug("failed to compute capture time")

        # source
        try:
            source = data['source']
        except:
            SingleLogger().log.debug("no source, or the item is a video/ad")

        # return when the link field is absent
        try:
            linkInfo = data['link']
        except:
            return

        # short URL
        try:
            url = linkInfo['weburl']
        except:
            SingleLogger().log.debug("no short URL")

        # article ID
        try:
            articleid = data['documentId']
        except:
            SingleLogger().log.debug("no article ID, or the item is an ad")

        # list image (logo)
        # fall back to thumbnail when style.images is absent or empty
        try:
            images = data['style']['images']
            if images:
                for imgobj in images:
                    logo += imgobj + ","
            else:
                logo = data['thumbnail']
                if logo == "":
                    SingleLogger().log.debug('no list image')
        except:
            try:
                logo = data['thumbnail']
            except:
                SingleLogger().log.debug('no list image')

        # per-type handling
        # video or short-video channel
        if articletype == 'phvideo' or articletype == "videoshortimg":
            restype = 3
            try:
                source = data['phvideo']['channelName']  # source
            except:
                SingleLogger().log.debug("video has no source")
            # guard against malformed items
            try:
                guid = data['id']  # video API parameter
                articleid = guid  # article ID
                # call the detail API when the mp4 field is absent
                try:
                    content = linkInfo['mp4']
                except:
                    detailJk = "http://api.3g.ifeng.com/api_phoenixtv_details?guid=" + guid  # video detail API
                    postData = {}
                    res = self.httpPost(detailJk, postData)
                    content = res['singleVideoInfo'][0]['videoURLMid']
                video = content
            except:
                SingleLogger().log.debug("failed to fetch video details")

        # photo channel / photo news
        elif articletype == "photo" or articletype == "slide":
            restype = 2
            # guard against malformed items
            try:
                detailJk = linkInfo['url']  # detail API URL
                postData = {}
                res = self.httpPost(detailJk, postData)
                # use the slides field if present; otherwise fall back to the regular-news layout
                try:
                    slides = res['body']['slides']
                    if len(slides) > 0:
                        for sldobj in slides:
                            curDesc = sldobj['description']
                            curImg = sldobj['image']
                            if curDesc != "":
                                content += curDesc + "<br/>"
                            if curImg != "":
                                gallary += curImg + ","
                except:
                    try:
                        content = res['body']['text']
                    except:
                        SingleLogger().log.debug("no text")
                    try:
                        gallaryList = res['body']['img']
                        if len(gallaryList) > 0:
                            for gaobj in gallaryList:
                                gallary += gaobj['url'] + ","
                    except:
                        SingleLogger().log.debug("details contain no images")
                    try:
                        videos = res['body']['videos']
                        for vidobj in videos:
                            video += vidobj['video']['Normal']['src'] + ","
                    except:
                        SingleLogger().log.debug("details contain no videos")
            except:
                SingleLogger().log.debug("failed to fetch photo details")

        # advert
        elif articletype == "advert":
            try:
                articleid = data['pid']  # article ID
            except:
                SingleLogger().log.debug("this ad has no article ID")
            content = url

        # regular news
        elif articletype == "doc":
            # guard against malformed items
            try:
                detailJk = linkInfo['url']  # detail API URL
                postData = {}
                res = self.httpPost(detailJk, postData)
                try:
                    content = res['body']['text']
                except:
                    SingleLogger().log.debug("no text")
                try:
                    gallaryList = res['body']['img']
                    if len(gallaryList) > 0:
                        for gaobj in gallaryList:
                            gallary += gaobj['url'] + ","
                except:
                    SingleLogger().log.debug("details contain no images")
                try:
                    videos = res['body']['videos']
                    for vidobj in videos:
                        video += vidobj['video']['Normal']['src'] + ","
                except:
                    SingleLogger().log.debug("details contain no videos")
            except:
                SingleLogger().log.debug("failed to fetch article details")

        # pinned news
        elif articletype == "topic2":
            content = url

        sdata = {
            "title": title,
            "description": abstract,
            "content": content,  #
            "source": source,
            "pubtimestr": publish_timestr,
            "pubtime": publish_time,
            "crawltimestr": crawltimestr,  #抓包时间
            "crawltime": crawltime,
            "status": 0,
            "shorturl": url,
            "logo": logo,
            "labels": tab,
            "keyword": "",
            "seq": seq,
            "identity": str(articleid),
            "appname": self.appname,
            "app_tag": self.apptag,
            "category_tag": categorytag,
            "category": category,  #栏目
            "restype": restype,  #类型
            "gallary": gallary,  #里面的所有图片地址
            "video": video,
            "audio": audio
        }
        self.db(sdata, articleid, title)
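
The publish-time handling in Analysis_fenghuang converts the formatted updateTime string to an epoch value with strptime/mktime, and builds formatted strings back with strftime/localtime. A self-contained sketch of that round trip, assuming the same "%Y-%m-%d %H:%M:%S" format used throughout these parsers (helper names are illustrative, not part of the original class):

import time

def to_epoch(timestr, fmt="%Y-%m-%d %H:%M:%S"):
    # formatted string -> epoch seconds, as in the updateTime branch above
    return int(time.mktime(time.strptime(timestr, fmt)))

def to_timestr(epoch_seconds, fmt="%Y-%m-%d %H:%M:%S"):
    # epoch seconds -> formatted string, as used for crawltimestr above
    return time.strftime(fmt, time.localtime(epoch_seconds))

# e.g. to_timestr(to_epoch("2019-06-01 08:30:00")) == "2019-06-01 08:30:00"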