Example #1
def getExtraPageInfo(infoNum):
#     infoNum: the number of news items to fetch; usually a multiple of 10
    # http://api.roll.news.sina.com.cn/zt_list?channel=video&cat_3==1||=2&tag=1&show_ext=1&show_all=1&show_cat=1&format=json&show_num=80&page=1&callback=newsloadercallback&_=1406814194374
#     url=r'http: //api.roll.news.sina.com.cn/zt_list?channel=video&cat_3==1||=2&tag=1&show_ext=1&show_all=1&show_cat=1&format=json&show_num='+str(infoNum)+r'&page=1&callback=newsloadercallback&_='    
#     timeStamp=(long)(time.time()-60)*1000
#     print timeStamp
#     url+=str(timeStamp)
    url=r'http://api.roll.news.sina.com.cn/zt_list?channel=video&cat_3==1||=2&tag=1&show_ext=1&show_all=1&show_cat=1&format=json&show_num='+str(infoNum)+r'&page=1&callback=newsloadercallback&_=1406814194375'
    print url
    content=getHtml(url)    
    vInfoList=[]
    if content:
        info=json.loads(r1('.*?(\{.*\})',content),encoding='utf-8')
    #     print info
        if info.has_key('result'):
            tResult=info['result']  
            if tResult.has_key('data'):
                infoList=tResult['data']
                for info in infoList:
                    vInfo={}
                    vInfo['vid']=r1(r'-(\d+)-',info['ext1'])
                    vInfo['title']= info['title'] 
                    vInfo['url']=info['url']                    
                    vInfo['thumb']=info['img']
                    vInfo['summary']=info['ext5']
                    vInfo['keywords']=info['keywords']
                    vInfo['newsid']=info['id']                
                    vInfo['vtype']=info['ext4']
                    vInfo['loadtime']=timeformat.getTimeStamp((long)(info['createtime']))
                    vInfo['duration']=''
                    vInfo['web']=ctable
                    try:
                        subContent=getHtml(vInfo['url'])
                        subSoup=BeautifulSoup(subContent, from_encoding='utf-8')
    #                     tblock=subSoup.find('p',{'class':"channel"})
    #                     vInfo['vtype']= tblock.find('a').string
                        fblock=subSoup.find('p',{'class':"from"})
                        vInfo['source']= fblock.find_all('span')[1].string.replace(u'来源:','')
#                         block1=subSoup.find('div',{'class':"relatedVido favVideo"})
#                         reList=block1.find_all('li')
#                         strList=''
#                         for i in range(len(reList)-1):
#                             strList+=reList[i].get('video-id')+','
#                         strList+=reList[len(reList)-1].get('video-id')
#                         vInfo['related']=strList
                        vInfo['related']='' # related news is not needed
                        vInfoList.append(vInfo)
                        print vInfo['loadtime'],vInfo['url']
                    except:
                        print 'Error: ',vInfo['url']
#                         logging.error('Error: '+vInfo['url'])
    return vInfoList
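
Both Sina examples rely on two helpers, r1 and getHtml, that are defined elsewhere in the module. A minimal sketch of what they might look like, assuming Python 2 and only the standard library (the timeout value is an arbitrary choice):

import re
import urllib2

def r1(pattern, text):
    # return the first captured group of the first match, or None if nothing matches
    m = re.search(pattern, text)
    if m:
        return m.group(1)

def getHtml(url):
    # fetch the raw response body; return None when the request fails
    try:
        return urllib2.urlopen(url, timeout=30).read()
    except Exception:
        return None
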
Example #2
def getHiddenPageInfo():
#     takes no arguments; the API URL requests a fixed batch of 20 items (page_num=20)

    # url=r'http://interest.mix.sina.com.cn/api/cate/video?page_num=20&page=1&callback=newsloadercallback&_=1406282282089'
#     url=r'http://interest.mix.sina.com.cn/api/cate/video?page_num=20&page=1&callback=newsloadercallback&_='
#     timeStamp=(long)(time.time()-60)*1000
#     url+=str(timeStamp)
    url=r'http://interest.mix.sina.com.cn/api/cate/video?page_num=20&page=1&callback=newsloadercallback&_=1406282282089'
#     print url
    content=getHtml(url)
#     print content
   
    vInfoList=[]
    if content:
        info=json.loads(r1('.*?(\{.*\})',content),encoding='utf-8')
        if info.has_key('result'):
            tResult=info['result']  
            if tResult.has_key('data'):
                infoList=tResult['data']
                for info in infoList:
                    vInfo={}
                    vInfo['vid']=info['vid']
                    vInfo['title']= info['title']
                    vInfo['url']=info['url']
                    vInfo['thumb']=info['thumb']
                    vInfo['summary']=info['lsummary']
                    vInfo['keywords']=info['keywords']
                    vInfo['newsid']=r1(r'.*?:(.*?):',info['commentid'])
                    vInfo['loadtime']=timeformat.getTimeStamp((long)(info['ctime']))
                    vInfo['duration']=''
                    vInfo['web']=ctable
                    try:
                        subContent=getHtml(vInfo['url'])
                        subSoup=BeautifulSoup(subContent, from_encoding='utf-8')
                        tblock=subSoup.find('p',{'class':"channel"})
                        vInfo['vtype']= tblock.find('a').string
                        fblock=subSoup.find('p',{'class':"from"})
                        vInfo['source']= fblock.find_all('span')[1].string.replace(u'来源:','')
#                         block1=subSoup.find('div',{'class':"relatedVido favVideo"})
#                         reList=block1.find_all('li')
#                         strList=''
#                         for i in range(len(reList)-1):
#                             strList+=reList[i].get('video-id')+','
#                         strList+=reList[len(reList)-1].get('video-id')
#                         vInfo['related']=strList
                        vInfo['related']='' # related news is not needed
                        vInfoList.append(vInfo)
                        print vInfo['loadtime'],vInfo['url']
                    except:
                        print 'Error: ',vInfo['url']
#                         logging.error('Error: '+vInfo['url'])
    return vInfoList
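
The API responses are JSONP, so both functions strip the newsloadercallback(...) wrapper with r1('.*?(\{.*\})', content) before calling json.loads. A quick illustration with a made-up payload:

import json
import re

sample = 'newsloadercallback({"result": {"status": {"code": 0}, "data": []}});'
payload = re.search(r'.*?(\{.*\})', sample).group(1)
info = json.loads(payload)
print info['result']['data']   # prints []
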
Example #3
def trackUser(web,vid,userid,userip,mode):
    # web,vid,vtype,mvid,mtype,userid,userip,requesttime,click
    try:
        if mode in click_mod.values() and mode!=click_mod['auto']:
            tablemerge.increaseClick(web, vid)
        rows=tablemerge.getRecordsByWebVid(mergetable, web, vid)
        if rows !=-1 and len(rows)>0:
            title,vtype,mvid,mtype=rows[0]
            requesttime=timeformat.getTimeStamp()
            data=(web,vid,title,vtype,mvid,mtype,userid,userip,requesttime,mode)
            tablerequest.InsertItem(requesttable, data)
    except:
        logging.error('trackUser database visit error')
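
trackUser only increments the click counter when the mode is a known value other than click_mod['auto']; every mode still gets a request row written afterwards. A self-contained sketch of that guard, with hypothetical mode names and values (only 'auto' appears in the original code):

click_mod = {'auto': 0, 'click': 1, 'search': 2}   # hypothetical mapping

def shouldCountClick(mode):
    # count only explicit user actions, never autoplay
    return mode in click_mod.values() and mode != click_mod['auto']

print shouldCountClick(click_mod['click'])   # True
print shouldCountClick(click_mod['auto'])    # False
print shouldCountClick(99)                   # False: unknown mode
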
Example #4
def getPageInfo(page):
#     page: index into categoryList
# http://v.qq.com/c/todayHot.js
#     url=r'http://v.qq.com/c/media.js'
    url=categoryList[page]
#     tDir=r'e:\tmp'
#     fileName=r'china.html'
#     filePath=os.path.join(tDir,fileName)   
     
    content=getHtmlwithQQCookie(url)
#     print content 
#     if content:    
#         fileKit.writeFileBinary(filePath, content)
#     content=fileKit.readFileBinary(filePath)
    
    vInfoList=[] 
    if content:
        news = json.loads(r1('.*?(\{.*\})',content),encoding='utf-8')        
        videoList=[]
        if news.has_key('data'):
            videoList=news['data']
        for item in videoList:
            vInfo={}     
            if item.has_key('id'): # the feed comes in two JSON formats; this branch handles items with an 'id' field
                vInfo['url']=item['url']
                vInfo['vid']=item['vid']
                vInfo['newsid']=item['id']
                vInfo['title']=item['title']
                vInfo['thumb']=item['image']
                vInfo['loadtime']=item['dateline']
                try:
                    vInfo['duration']='{:02d}:{:02d}:{:02d}'.format(int(item['hour']),int(item['minute']),int(item['second']))
                except:
                    vInfo['duration']=''
                vInfo['web']=ctable
                vInfo['vtype']=item['tag'][0] if len(item['tag'])>0 else item['column']
                vInfo['summary']=item['video_comment']
                vInfo['source']='qq'   
                vInfo['keywords']=vInfo['title']
                vInfo['related']=''   
            else: # the sports feed uses a different field layout
                vInfo['url']=item['url']
                vInfo['vid']=item['vid']
                vInfo['newsid']=item['aid']
                vInfo['title']=item['title']
                vInfo['thumb']=item['img']
                # this format has no publish time, so approximate loadtime as one hour ago
                vInfo['loadtime']=timeformat.getTimeStamp(time.time()-3600)
                try:
                    vInfo['duration']='{:02d}:{:02d}:{:02d}'.format(int(item['hour']),int(item['min']),int(item['second']))
                except:
                    vInfo['duration']=''
                vInfo['web']=ctable
                vInfo['vtype']=item['tag'][0] if len(item['tag'])>0 else categoryNameList[page]
                vInfo['summary']=item['title']
                vInfo['source']='qq'   
                vInfo['keywords']=vInfo['title']
                vInfo['related']=''  
                
            print vInfo['loadtime'],vInfo['url']
            vInfoList.append(vInfo)                                
    return vInfoList
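
The duration string is built with zero-padded fields and falls back to an empty string when any time field is missing or non-numeric (note the sports format uses 'min' instead of 'minute'). A quick check against a made-up item:

item = {'hour': '0', 'minute': '3', 'second': '27'}
try:
    duration = '{:02d}:{:02d}:{:02d}'.format(int(item['hour']), int(item['minute']), int(item['second']))
except (KeyError, ValueError):
    duration = ''
print duration   # 00:03:27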