def getFirstPageInfo():
    """Collect metadata for the videos listed on the Sina video news page.

    The listing page is loaded through ``getHtml`` and parsed as GBK; every
    ``div`` carrying ``suda-uatrack-key="news_video"`` is treated as one
    video.  Each video's own page (its ``data-url``) is then loaded and
    parsed as UTF-8 to fill in the channel, source and time fields.

    Returns:
        list of dict: one dict per successfully parsed video with the keys
        ``vid``, ``title``, ``url``, ``thumb``, ``summary``, ``keywords``,
        ``newsid``, ``duration`` (always ``''``), ``web`` (the module-level
        ``ctable``), ``vtype``, ``source``, ``related`` (always ``''``) and
        ``loadtime`` (the result of ``timeformat.extractTimeStamp``).
        The list is empty when ``getHtml`` returns a falsy value for the
        listing page.

    A video whose listing entry or detail page cannot be parsed is reported
    on stdout and skipped; it never aborts the whole run.
    """
    url = r'http://video.sina.com.cn/news/'
    content = getHtml(url)
    vInfoList = []
    if not content:
        return vInfoList

    soup = BeautifulSoup(content, from_encoding='gbk')
    for item in soup.find_all('div', {'suda-uatrack-key': "news_video"}):
        vInfo = {}
        try:
            # Read the URL first so the error report below can name the
            # entry even when a later lookup fails.
            vInfo['url'] = item.get('data-url')
            vInfo['vid'] = item.find(
                'div', {'class': "news-item-count"}).get('data-vid-count')
            vInfo['title'] = item.get('data-title')
            vInfo['thumb'] = item.find('img').get('src')
            vInfo['summary'] = item.find('p', {'class': "desc"}).string
            vInfo['keywords'] = item.get('data-key')
            vInfo['newsid'] = item.get('data-newsid')
            vInfo['duration'] = ''
            vInfo['web'] = ctable
            vInfo['related'] = ''  # related videos are not needed

            # The detail page supplies channel, source and publish time.
            subContent = getHtml(vInfo['url'])
            subSoup = BeautifulSoup(subContent, from_encoding='utf-8')

            channelBlock = subSoup.find('p', {'class': "channel"})
            vInfo['vtype'] = channelBlock.find('a').string

            # <p class="from"> holds both the source span and the time <em>.
            fromBlock = subSoup.find('p', {'class': "from"})
            sourceText = fromBlock.find_all('span')[1].string
            vInfo['source'] = sourceText.replace(u'来源:', '')
            timeStr = fromBlock.find('em').string
            vInfo['loadtime'] = timeformat.extractTimeStamp(timeStr)

            vInfoList.append(vInfo)
            print('%s %s' % (vInfo['loadtime'], vInfo['url']))
        except Exception:
            # Best-effort scraping: a missing element (None from find),
            # a short span list or a failed download only skips this video.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('%s %s' % ('Error: ', vInfo.get('url')))
    return vInfoList
def getPageInfo(page):
    """Collect metadata for the videos on one Sohu TV listing page.

    The listing page is loaded through ``getHtml`` and parsed as UTF-8; each
    ``li`` inside ``<ul class="st-list short cfix">`` is treated as one
    video.  Each video's own page is then loaded and parsed as GBK to fill
    in keywords, summary, source, type and time.

    Args:
        page: page number; it is converted with ``str`` and spliced into
            the listing URL (``1`` yields ``..._p101_p110.html``).

    Returns:
        list of dict: one dict per successfully parsed video with the keys
        ``vid``, ``url``, ``title``, ``newsid``, ``thumb``, ``duration``
        (``'MM:SS'`` or ``''``), ``web`` (the module-level ``ctable``),
        ``keywords``, ``summary``, ``vtype``, ``source``, ``loadtime`` (the
        result of ``timeformat.extractTimeStamp``) and ``related`` (always
        ``''``).  The list is empty when ``getHtml`` returns a falsy value
        for the listing page or the listing ``<ul>`` is not present.

    A video whose listing entry or detail page cannot be parsed is reported
    on stdout and skipped; it never aborts the whole page.
    """
    # Example listing URL for page 1:
    # http://so.tv.sohu.com/list_p1122_p20_p3_p40_p5_p6_p73_p8_p90_p101_p110.html
    url = (r'http://so.tv.sohu.com/list_p1122_p20_p3_p40_p5_p6_p73_p8_p90_p10'
           + str(page) + r'_p110.html')
    content = getHtml(url)
    vInfoList = []
    if not content:
        return vInfoList

    soup = BeautifulSoup(content, from_encoding='utf-8')
    soup_content = soup.find('ul', {'class': "st-list short cfix"})
    if soup_content is None:
        # Layout changed or an error page was served: nothing to collect.
        return vInfoList

    for item in soup_content.find_all('li'):
        vInfo = {}
        try:
            # --- fields available on the listing entry itself ---
            st_pic_a = item.find('div', {'class': "st-pic"}).find('a')
            vInfo['vid'] = st_pic_a.get('_s_v')
            vInfo['url'] = st_pic_a.get('href')
            vInfo['title'] = str(item.find('strong').find('a').string)
            vInfo['newsid'] = r1(r'/n(\d+)\.', vInfo['url'])
            vInfo['thumb'] = st_pic_a.find('img').get('src')

            # Duration is taken from the overlay text on the thumbnail.
            # NOTE(review): the lazy ``.*?`` may match nothing, so a text
            # containing a single two-digit number is split into two
            # groups -- confirm the overlay always has minutes and seconds.
            dustr = str(st_pic_a.find('span', {'class': "maskTx"}).string)
            m = re.search(r'(\d{1,2}).*?(\d{1,2})', dustr)
            if m:
                vInfo['duration'] = '{:02d}:{:02d}'.format(
                    int(m.group(1)), int(m.group(2)))
            else:
                vInfo['duration'] = ''
            vInfo['web'] = ctable

            # --- fields that live on the video's own page ---
            subContent = getHtml(vInfo['url'])
            subSoup = BeautifulSoup(subContent, from_encoding='gbk')
            vInfo['keywords'] = subSoup.find(
                'meta', {'name': "keywords"}).get('content')

            info_con = subSoup.find('div', {'class': "info info-con"})
            sum_p = str(info_con.find('p', {'class': "intro"}))
            vInfo['summary'] = r1(
                r'<p class="intro">(.*?)<a class', sum_p).replace('简介:', '')

            timeStr = ''
            vInfo['vtype'] = ''
            vInfo['source'] = ''
            block1 = info_con.find('ul', {'class': "u cfix"})
            if block1:
                # Layout with a <ul class="u cfix"> info list: the first
                # <li> carries the time text, the "h" items carry source
                # (index 0) and type (index 2).
                timeStr = str(block1.find('li').string)
                tblock = block1.find_all('li', {'class': "h"})
                vInfo['source'] = tblock[0].string.replace(
                    u'来源:', '').strip()
                vInfo['vtype'] = str(tblock[2].find('a').string)
            else:
                # Alternative layout: only the time text is extracted here.
                block1 = subSoup.find('div', {'class': "vInfo clear"})
                if block1:
                    timeStr = str(
                        block1.find('div', {'class': "wdA l"}).string)
            vInfo['loadtime'] = timeformat.extractTimeStamp(timeStr)
            vInfo['related'] = ''  # related videos are not collected

            vInfoList.append(vInfo)
            print('%s %s' % (vInfo['loadtime'], vInfo['url']))
        except Exception:
            # Best-effort scraping: a missing element (None from find),
            # a short list or a failed download only skips this video.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('%s %s' % ('Error: ', vInfo.get('url')))
    return vInfoList