def getVideoByVid(vid):
    """Look up the direct video URL for a QQ video id.

    Fetches the ``geturl`` XML document for *vid* and returns the text of
    its first ``<url>`` element as extracted by ``r1``. Returns None when
    the document could not be fetched.
    """
    page = getHtml(
        'http://vv.video.qq.com/geturl?otype=xml&platform=1&vid=%s&format=2' % vid
    )
    if not page:
        return None
    return r1(r'<url>(.*?)</url>', page)
def _huxiu_relative_time_to_epoch(timeStr):
    """Turn a relative age label into an approximate Unix timestamp.

    *timeStr* is the text of an article's time label. Three forms are
    recognised, tried in this order: "N minutes ago", "N hours ago" and
    "N days ago" (in Chinese). The matching amount is subtracted from the
    current time; when nothing matches the current time is returned.
    """
    now = time.time()
    units = (
        (u"(\d{1,2})分钟前", 60),        # "N minutes ago"
        (u"(\d{1,2})小时前", 3600),      # "N hours ago"
        (u"(\d{1,2})天前", 24 * 3600),   # "N days ago"
    )
    for pattern, unit_seconds in units:
        amount = r1(pattern, timeStr)
        if amount:
            return now - unit_seconds * long(amount)
    return now


def getHtmlInfo():
    """Scrape the huxiu.com front page and return its article entries.

    Returns a list of dicts, one per article block, with the keys ``url``,
    ``title``, ``newsid``, ``summary``, ``description``, ``thumb``,
    ``keywords``, ``ctime``, ``author`` and ``source``. The list is empty
    when the page could not be fetched.

    An article block without a header, an ``<h3>`` title or a link with an
    ``href`` is skipped. Missing optional parts (summary, thumbnail, time
    label, author) fall back to a default instead of aborting the whole
    scrape.
    """
    url = r"http://www.huxiu.com"
    newsList = []

    content = getHtml(url)
    if not content:
        return newsList

    soup = BeautifulSoup(content, "html.parser", from_encoding="utf-8")
    itemList = soup.find_all("div", {"class": "mod-b mod-art "})
    itemList += soup.find_all("div", {"class": "mod-b mod-art mod-b-push"})

    for item in itemList:
        head = item.find("", {"class": "mob-ctt"})
        if not head:
            continue
        title = head.find("h3")
        if not title:
            continue
        link = title.find("a")
        # Without a link target there is no article URL (and no id) to store.
        if link is None or not link.get("href"):
            continue

        nInfo = {}
        nInfo["url"] = url + link.get("href")
        nInfo["title"] = link.getText()
        nInfo["newsid"] = getMd5(nInfo["url"])

        summary_div = item.find("div", {"class": "mob-sub"})
        nInfo["summary"] = summary_div.getText() if summary_div is not None else ""
        nInfo["description"] = nInfo["summary"]

        # None matches what .get() yields when the attribute itself is absent.
        thumb_img = item.find("img", {"class": "lazy"})
        nInfo["thumb"] = thumb_img.get("data-original") if thumb_img is not None else None
        nInfo["keywords"] = ""

        time_span = head.find("span", {"class": "time"})
        if time_span is not None:
            nInfo["ctime"] = _huxiu_relative_time_to_epoch(time_span.getText())
        else:
            # No age label: treat the article as published just now.
            nInfo["ctime"] = time.time()

        nInfo["author"] = ""
        author_div = item.find("div", {"class": "mob-author"})
        if author_div:
            author_span = author_div.find("span", {"class": "author-name "})
            nInfo["author"] = author_span.getText() if author_span else ""

        nInfo["source"] = ctable
        newsList.append(nInfo)
    return newsList
def getVideoByUrl(url):
    """Resolve the direct video URL for a QQ video page URL.

    The video id is whatever follows ``vid=`` in *url*, e.g.
    http://v.qq.com/news/?tag=hot&vid=a00153364t6 ; it is handed to
    getVideoByVid and that function's result is returned.
    """
    return getVideoByVid(r1(r'.*?vid=(.*)', url))
def getVideoByUrl(url):
    """Resolve the direct video URL from a video page.

    The page is fetched and searched for a ``<video ... src='...'>`` tag.
    If none is found, the embedded player's ``src`` and ``data-vid``
    attributes are read; when the player source mentions "ku6" the id is
    resolved through getKu6VideoByVid.

    Returns the video URL, or a falsy value (None / empty string) when the
    page could not be fetched or no video could be located.
    """
    content = getHtml(url)
    if not content:
        return None

    videoUrl = r1(r"<video.*?src='(.*?)'", content)
    if videoUrl:
        return videoUrl

    sourceWeb = r1(r'src="(.*?)" data-vid', content)
    dataVid = r1(r'data-vid="(.*?)"', content)
    # sourceWeb is None when the page has no embedded player; test it first
    # so the membership check cannot raise TypeError.
    if sourceWeb and dataVid and "ku6" in sourceWeb:
        return getKu6VideoByVid(dataVid)
    return videoUrl
def getVideoByUrl(url):
    """Resolve the direct video URL from a video page.

    Fetches *url* and returns the ``videoUrl=`` value found inside the
    page's ``<param ...>`` tag, as extracted by ``r1``. Returns None when
    the page could not be fetched.
    """
    page = getHtml(url)
    if not page:
        return None
    return r1(r'<param.*?videoUrl=(.*?)"', page)
def _sohu_clip_urls(info, vid):
    """Build the real URL of every clip listed in a Sohu video-info reply.

    *info* is the decoded JSON reply and *vid* is the id passed on to
    real_url for each clip.
    """
    host = info['allot']
    tvid = info['tvid']
    data = info['data']
    assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
    return [
        real_url(host, vid, tvid, new, urlparse(clip).path, ck)
        for new, clip, ck in zip(data['su'], data['clipsURL'], data['ck'])
    ]


def getVideoByUrl(url):
    """Return the list of clip URLs for a Sohu video page.

    The numeric video id comes from the ``id=`` query value for
    share.vrs.sohu.com links, otherwise from a ``vid`` assignment found in
    the fetched page. tv.sohu.com pages are looked up through the
    ``vrs_flash`` endpoint, following the first alternative-quality id
    that is non-zero and differs from the page's id; every other page
    goes through the ``videonew.do`` endpoint.

    Raises AssertionError when no video id is found or when the reply's
    clip lists have different lengths.
    """
    if re.match(r'http://share.vrs.sohu.com', url):
        vid = r1('id=(\d+)', url)
    else:
        html = getHtml(url)
        vid = r1(r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?', html)
    assert vid

    if not re.match(r'http://tv.sohu.com/', url):
        info = json.loads(getHtml('http://my.tv.sohu.com/play/videonew.do?vid=%s&referer=http://my.tv.sohu.com' % vid))
        return _sohu_clip_urls(info, vid)

    info = json.loads(getHtml('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % vid))
    # Quality keys are tried in this order; the first usable id wins.
    for qtyp in ["oriVid", "superVid", "highVid", "norVid", "relativeId"]:
        hqvid = info['data'][qtyp]
        if hqvid != 0 and hqvid != vid:
            info = json.loads(getHtml('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % hqvid))
            break
    return _sohu_clip_urls(info, hqvid)
def getVideoByUrl(url):
    """Resolve the playable video URL for an ifeng video page.

    *url* looks like
    http://v.ifeng.com/news/world/201408/015041f2-2979-9982-9fb1-950a9390ac64.shtml
    The id is the last path segment without its extension. The matching
    info document lives at
    http://v.ifeng.com/video_info_new/<c>/<cc>/<id>.xml
    where <cc> is the last two characters of the id and <c> the first of
    those two.

    Returns the ``VideoPlayUrl`` attribute of the info document's first
    child element. Returns None when *url* is empty, no id can be
    extracted, the document cannot be fetched, or it has no child element.
    """
    if not url:
        return None

    d1 = r1(r'.*/(.*?)\.', url)
    if not d1:
        # No "/<id>." segment in the URL: nothing to look up.
        return None

    tail = d1[-2:]
    vInfo_url = (r'http://v.ifeng.com/video_info_new/'
                 + tail[0] + r'/' + tail + r'/' + d1 + r'.xml')

    content = getHtml(vInfo_url)
    if not content:
        return None

    root = ET.fromstring(content)
    # len() rather than truth-testing: an Element's truth value reflects its
    # child count and relying on it is discouraged.
    if len(root) == 0:
        return None
    return root[0].attrib.get('VideoPlayUrl')
def getVInfoUrl(url):
    """Build the ifeng video-info XML URL for a video page URL.

    The id is the last path segment of *url* without its extension. The
    result has the form
    http://v.ifeng.com/video_info_new/<c>/<cc>/<id>.xml
    where <cc> is the last two characters of the id and <c> the first of
    those two.

    Returns None when *url* is empty or no id can be extracted from it.
    """
    if not url:
        return None

    d1 = r1(r'.*/(.*?)\.', url)
    if not d1:
        return None

    tail = d1[-2:]
    return (r'http://v.ifeng.com/video_info_new/'
            + tail[0] + r'/' + tail + r'/' + d1 + r'.xml')
def get(self, call):
    """Resolve the playable video URL(s) for a site/vid pair and render them.

    Request arguments:
      web    -- site key: 'china', 'ifeng', 'kankan', 'qq', 'sina',
                'sohu' or 'v1'; any other value yields no URLs
      vid    -- video id, interpreted by the matching site module
      userid -- optional, defaults to 'anonymous'
      mode   -- optional, defaults to videoinfo.click_mod['auto']

    The request is first recorded through videoinfo.trackUser. Page and
    info documents are fetched with yielded AsyncHTTPClient calls, so this
    is a generator (presumably driven by a Tornado coroutine decorator
    applied outside this block -- confirm). Any failure during resolution
    is logged and leaves the URL set empty; the response is always
    rendered from 'video.xml' with Content-Type application/xml.

    *call* is not used by the body.
    """
    web = str(self.get_argument('web'))
    vid = str(self.get_argument('vid'))
    userid = str(self.get_argument('userid', 'anonymous'))
    userip = str(self.request.remote_ip)
    mode = str(self.get_argument('mode', videoinfo.click_mod['auto']))

    # User tracking.
    videoinfo.trackUser(web, vid, userid, userip, mode)

    # Video address parsing.
    urls = None
    html_url = None
    try:
        http_client = tornado.httpclient.AsyncHTTPClient()

        if web == 'china':
            url = china.getUrlByVid(vid)
            html_url = url
            videoUrl = None
            if url:
                response = yield http_client.fetch(url)
                if response:
                    content = response.body
                    videoUrl = r1(r"<video.*?src='(.*?)'", content)
                    if not videoUrl:
                        sourceWeb = r1(r'src="(.*?)" data-vid', content)
                        dataVid = r1(r'data-vid="(.*?)"', content)
                        # sourceWeb is None when there is no embedded
                        # player; test it before the membership check.
                        if sourceWeb and dataVid and 'ku6' in sourceWeb:
                            url = r'http://v.ku6.com/fetchVideo4Player/' + dataVid + r'.html'
                            resp2 = yield http_client.fetch(url)
                            if resp2:
                                content = resp2.body
                                videoUrl = china.getKu6VideoUrlByContent(content)
            urls = videoUrl

        elif web == 'ifeng':
            url = ifeng.getUrlByVid(vid)
            html_url = url
            url = ifeng.getVInfoUrl(url)
            videoUrl = None
            if url:
                response = yield http_client.fetch(url)
                if response:
                    content = response.body
                    videoUrl = ifeng.getVideoUrlByContent(content)
            urls = videoUrl

        elif web == 'kankan':
            url = kankan.getUrlByVid(vid)
            html_url = url
            videoUrl = None
            if url:
                response = yield http_client.fetch(url)
                if response:
                    content = response.body
                    videoUrl = kankan.getVideoDirectByContent(content)
                    if not videoUrl:
                        part1 = r1(r'(/\d{4}-\d{2}-\d{2}/\w*?)\.', url)
                        # Without the date/id part there is no XML document
                        # to request.
                        if part1:
                            xml_url = r'http://www.kankanews.com/vxml%s.xml' % part1
                            resp2 = yield http_client.fetch(xml_url)
                            if resp2:
                                content = resp2.body
                                videoUrl = kankan.getVideoInfoByContent(content)
            urls = videoUrl

        elif web == 'qq':
            url = 'http://vv.video.qq.com/geturl?otype=xml&platform=1&vid=%s&format=2' % vid
            videoUrl = None
            response = yield http_client.fetch(url)
            if response:
                content = response.body
                if content:
                    videoUrl = r1(r'<url>(.*?)</url>', content)
            urls = videoUrl

        elif web == 'sina':
            urls = sina.getVideoByVid(vid)

        elif web == 'sohu':
            url = sohu.getUrlByVid(vid)
            videoUrl = []
            if re.match(r'http://tv.sohu.com/', url):
                json_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % vid
                response = yield http_client.fetch(json_url)
                if response:
                    content = response.body
                    try:
                        info = json.loads(content)
                        # Quality keys are tried in this order; the first
                        # usable id wins and its info replaces the original.
                        for qtyp in ["oriVid", "superVid", "highVid", "norVid", "relativeId"]:
                            hqvid = info['data'][qtyp]
                            if hqvid != 0 and hqvid != vid:
                                resp2 = yield http_client.fetch('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % hqvid)
                                info = json.loads(resp2.body)
                                break
                        videoUrl = sohu.getRealUrlByInfo(info, hqvid)
                    except Exception:
                        # Best effort: keep the empty list, but record why.
                        logging.info('sohu video parse error:%s', json_url, exc_info=True)
            else:
                json_url = 'http://my.tv.sohu.com/play/videonew.do?vid=%s&referer=http://my.tv.sohu.com' % vid
                response = yield http_client.fetch(json_url)
                if response:
                    content = response.body
                    try:
                        info = json.loads(content)
                        videoUrl = sohu.getRealUrlByInfo(info, vid)
                    except Exception:
                        # Best effort: keep the empty list, but record why.
                        logging.info('sohu video parse error:%s', json_url, exc_info=True)
            urls = videoUrl

        elif web == 'v1':
            url = v1.getUrlByVid(vid)
            html_url = url
            videoUrl = None
            if url:
                response = yield http_client.fetch(url)
                if response:
                    content = response.body
                    if content:
                        videoUrl = r1(r'<param.*?videoUrl=(.*?)"', content)
            urls = videoUrl
    except Exception:
        # "except Exception" rather than a bare except: GeneratorExit and
        # KeyboardInterrupt must propagate out of a generator that yields
        # inside this try block.
        logging.info('video parse error:%s', html_url, exc_info=True)

    # Render the result.
    records = self.getRecords(urls)
    self.set_header('Content-Type', 'application/xml')
    # Echo the user's IP address to stdout.
    print(self.request.remote_ip)
    self.render2('video.xml', records=records)