def getExtraPageInfo(infoNum):
    # infoNum: the number of news items to fetch; usually a multiple of 10
    # Example request:
    # http://api.roll.news.sina.com.cn/zt_list?channel=video&cat_3==1||=2&tag=1&show_ext=1&show_all=1&show_cat=1&format=json&show_num=80&page=1&callback=newsloadercallback&_=1406814194374
    # The trailing '_' parameter is a cache-busting timestamp; it could be
    # regenerated with long(time.time()-60)*1000, but a fixed value works too.
    url = r'http://api.roll.news.sina.com.cn/zt_list?channel=video&cat_3==1||=2&tag=1&show_ext=1&show_all=1&show_cat=1&format=json&show_num=' \
          + str(infoNum) + r'&page=1&callback=newsloadercallback&_=1406814194375'
    print url
    content = getHtml(url)
    vInfoList = []
    if content:
        # strip the JSONP wrapper (newsloadercallback(...)) down to the bare JSON object
        info = json.loads(r1(r'.*?(\{.*\})', content), encoding='utf-8')
        if 'result' in info:
            tResult = info['result']
            if 'data' in tResult:
                infoList = tResult['data']
                for item in infoList:
                    vInfo = {}
                    vInfo['vid'] = r1(r'-(\d+)-', item['ext1'])
                    vInfo['title'] = item['title']
                    vInfo['url'] = item['url']
                    vInfo['thumb'] = item['img']
                    vInfo['summary'] = item['ext5']
                    vInfo['keywords'] = item['keywords']
                    vInfo['newsid'] = item['id']
                    vInfo['vtype'] = item['ext4']
                    vInfo['loadtime'] = timeformat.getTimeStamp(long(item['createtime']))
                    vInfo['duration'] = ''
                    vInfo['web'] = ctable
                    try:
                        # fetch the article page to scrape the video source
                        subContent = getHtml(vInfo['url'])
                        subSoup = BeautifulSoup(subContent, from_encoding='utf-8')
                        fblock = subSoup.find('p', {'class': 'from'})
                        # drop the leading "来源:" ("source:") label
                        vInfo['source'] = fblock.find_all('span')[1].string.replace(u'来源:', '')
                        vInfo['related'] = ''  # related news is not needed
                        vInfoList.append(vInfo)
                        print vInfo['loadtime'], vInfo['url']
                    except:
                        print 'Error: ', vInfo['url']
                        # logging.error('Error: ' + vInfo['url'])
    return vInfoList
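
# A minimal usage sketch for the roll-news fetcher above. It assumes the
# module-level helpers (getHtml, r1, timeformat) and the ctable constant are
# defined elsewhere in this file; _demoExtraPageInfo itself is a hypothetical
# helper added for illustration, not part of the crawler.
def _demoExtraPageInfo():
    for v in getExtraPageInfo(10):  # show_num is normally a multiple of 10
        print v['loadtime'], v['vtype'], v['url']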
def getHiddenPageInfo():
    # fetches the hidden "interest" video feed; the API is queried for 20 items per page
    # The trailing '_' parameter is a cache-busting timestamp; it could be
    # regenerated with long(time.time()-60)*1000, but a fixed value works too.
    url = r'http://interest.mix.sina.com.cn/api/cate/video?page_num=20&page=1&callback=newsloadercallback&_=1406282282089'
    content = getHtml(url)
    vInfoList = []
    if content:
        # strip the JSONP wrapper down to the bare JSON object
        info = json.loads(r1(r'.*?(\{.*\})', content), encoding='utf-8')
        if 'result' in info:
            tResult = info['result']
            if 'data' in tResult:
                infoList = tResult['data']
                for item in infoList:
                    vInfo = {}
                    vInfo['vid'] = item['vid']
                    vInfo['title'] = item['title']
                    vInfo['url'] = item['url']
                    vInfo['thumb'] = item['thumb']
                    vInfo['summary'] = item['lsummary']
                    vInfo['keywords'] = item['keywords']
                    # the news id sits between the first two colons of commentid
                    vInfo['newsid'] = r1(r'.*?:(.*?):', item['commentid'])
                    vInfo['loadtime'] = timeformat.getTimeStamp(long(item['ctime']))
                    vInfo['duration'] = ''
                    vInfo['web'] = ctable
                    try:
                        # scrape the article page for the channel and the video source
                        subContent = getHtml(vInfo['url'])
                        subSoup = BeautifulSoup(subContent, from_encoding='utf-8')
                        tblock = subSoup.find('p', {'class': 'channel'})
                        vInfo['vtype'] = tblock.find('a').string
                        fblock = subSoup.find('p', {'class': 'from'})
                        # drop the leading "来源:" ("source:") label
                        vInfo['source'] = fblock.find_all('span')[1].string.replace(u'来源:', '')
                        vInfo['related'] = ''  # related news is not needed
                        vInfoList.append(vInfo)
                        print vInfo['loadtime'], vInfo['url']
                    except:
                        print 'Error: ', vInfo['url']
                        # logging.error('Error: ' + vInfo['url'])
    return vInfoList
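
# Usage sketch for the "interest" feed above, under the same assumptions about
# the module-level helpers; _demoHiddenPageInfo is hypothetical.
def _demoHiddenPageInfo():
    for v in getHiddenPageInfo():
        print v['source'], v['title']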
def trackUser(web, vid, userid, userip, mode):
    # logs one request row: web,vid,title,vtype,mvid,mtype,userid,userip,requesttime,mode
    try:
        # only genuine user clicks count; auto-played videos do not
        if mode in click_mod.values() and mode != click_mod['auto']:
            tablemerge.increaseClick(web, vid)
        rows = tablemerge.getRecordsByWebVid(mergetable, web, vid)
        if rows != -1 and len(rows) > 0:
            title, vtype, mvid, mtype = rows[0]
            requesttime = timeformat.getTimeStamp()
            data = (web, vid, title, vtype, mvid, mtype, userid, userip, requesttime, mode)
            tablerequest.InsertItem(requesttable, data)
    except:
        logging.error('trackUser database visit error')
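
# Usage sketch for trackUser. click_mod is assumed to be a module-level dict of
# mode codes with at least an 'auto' key (as used above); the 'click' key and
# all argument values below are hypothetical illustrations.
def _demoTrackUser():
    # a manual click: bumps the click counter and inserts one request row
    trackUser('sina', '138764531', 'user001', '127.0.0.1', click_mod['click'])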
def getPageInfo(page):
    # page: an index into categoryList
    # e.g. http://v.qq.com/c/todayHot.js or http://v.qq.com/c/media.js
    url = categoryList[page]
    content = getHtmlwithQQCookie(url)
    vInfoList = []
    if content:
        # strip the JSONP wrapper down to the bare JSON object
        news = json.loads(r1(r'.*?(\{.*\})', content), encoding='utf-8')
        videoList = []
        if 'data' in news:
            videoList = news['data']
        for item in videoList:
            vInfo = {}
            if 'id' in item:  # the feeds come in two JSON formats
                vInfo['url'] = item['url']
                vInfo['vid'] = item['vid']
                vInfo['newsid'] = item['id']
                vInfo['title'] = item['title']
                vInfo['thumb'] = item['image']
                vInfo['loadtime'] = item['dateline']
                try:
                    vInfo['duration'] = '{:02d}:{:02d}:{:02d}'.format(int(item['hour']), int(item['minute']), int(item['second']))
                except:
                    vInfo['duration'] = ''
                vInfo['web'] = ctable
                vInfo['vtype'] = item['tag'][0] if len(item['tag']) > 0 else item['column']
                vInfo['summary'] = item['video_comment']
                vInfo['source'] = 'qq'
                vInfo['keywords'] = vInfo['title']
                vInfo['related'] = ''
            else:  # the sports feed uses a different format
                vInfo['url'] = item['url']
                vInfo['vid'] = item['vid']
                vInfo['newsid'] = item['aid']
                vInfo['title'] = item['title']
                vInfo['thumb'] = item['img']
                # this feed carries no timestamp; approximate as one hour ago
                vInfo['loadtime'] = timeformat.getTimeStamp(time.time() - 3600)
                try:
                    vInfo['duration'] = '{:02d}:{:02d}:{:02d}'.format(int(item['hour']), int(item['min']), int(item['second']))
                except:
                    vInfo['duration'] = ''
                vInfo['web'] = ctable
                vInfo['vtype'] = item['tag'][0] if len(item['tag']) > 0 else categoryNameList[page]
                vInfo['summary'] = item['title']
                vInfo['source'] = 'qq'
                vInfo['keywords'] = vInfo['title']
                vInfo['related'] = ''
            print vInfo['loadtime'], vInfo['url']
            vInfoList.append(vInfo)
    return vInfoList
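
# Usage sketch that walks every configured QQ feed. It assumes categoryList and
# categoryNameList are parallel module-level lists of feed URLs and category
# names, as getPageInfo above requires; _demoAllPages is hypothetical.
def _demoAllPages():
    for page in range(len(categoryList)):
        items = getPageInfo(page)
        print categoryNameList[page], len(items)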