def getVideoByVid(vid):
    """Look up a video URL through the vv.video.qq.com ``geturl`` endpoint.

    The endpoint is queried with ``otype=xml`` and the response is searched
    for a ``<url>...</url>`` element.

    :param vid: video id, interpolated into the ``vid=`` query parameter.
    :return: the result of ``r1`` for the ``<url>`` pattern, or ``None`` when
        ``getHtml`` returns nothing.
    """
    query = 'http://vv.video.qq.com/geturl?otype=xml&platform=1&vid=%s&format=2' % vid
    response = getHtml(query)
    if not response:
        return None
    # NOTE(review): r1 is defined elsewhere; presumably it returns the first
    # capture group of the first match, or a falsy value when nothing
    # matches -- confirm against its definition.
    return r1(r'<url>(.*?)</url>', response)
def getVideoByUrl(url):
    """Extract the ``videoUrl=`` value from a ``<param>`` tag on a page.

    :param url: address of the page to download via ``getHtml``.
    :return: the result of ``r1`` for the ``<param ... videoUrl=...">``
        pattern, or ``None`` when the page could not be fetched.
    """
    page = getHtml(url)
    # NOTE(review): r1 is defined elsewhere; presumably it yields the captured
    # group (text between ``videoUrl=`` and the next double quote) or a falsy
    # value on no match -- confirm against its definition.
    return r1(r'<param.*?videoUrl=(.*?)"', page) if page else None
def _sohuResolveClipUrls(info, vid):
    """Resolve every clip in a Sohu video-info payload to its real URL.

    :param info: decoded JSON payload; must provide ``allot``, ``tvid`` and a
        ``data`` dict with ``clipsURL``, ``clipsBytes``, ``su`` and ``ck``.
    :param vid: video id forwarded unchanged to ``real_url``.
    :return: list with one ``real_url`` result per clip, in payload order.
    :raises AssertionError: when the clip lists have different lengths.
    """
    host = info['allot']
    tvid = info['tvid']
    data = info['data']
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not (len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])):
        raise AssertionError('clipsURL, clipsBytes and su differ in length')
    urls = []
    for new, clip, ck in zip(data['su'], data['clipsURL'], data['ck']):
        # Only the path component of the clip address is sent to the host.
        clipURL = urlparse(clip).path
        urls.append(real_url(host, vid, tvid, new, clipURL, ck))
    return urls


def getVideoByUrl(url):
    """Return the list of real clip URLs for a Sohu video page.

    The numeric video id is taken from the ``id=`` query parameter for
    ``share.vrs.sohu.com`` links, otherwise it is scraped from the page
    source.  ``tv.sohu.com`` pages are resolved through
    ``hot.vrs.sohu.com/vrs_flash.action``; every other page goes through
    ``my.tv.sohu.com/play/videonew.do``.

    :param url: address of the video page.
    :return: list of URLs produced by ``real_url``, one per clip.
    :raises AssertionError: when no video id can be found, or the clip lists
        in the payload have inconsistent lengths.
    """
    if re.match(r'http://share.vrs.sohu.com', url):
        vid = r1(r'id=(\d+)', url)
    else:
        html = getHtml(url)
        vid = r1(r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?', html)
    if not vid:
        # Same exception type the original ``assert vid`` produced.
        raise AssertionError('no video id found for %s' % url)

    if re.match(r'http://tv.sohu.com/', url):
        info = json.loads(getHtml('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % vid))
        # Walk the candidate ids in the listed order and switch to the first
        # one that is non-zero and differs from ``vid``.
        # NOTE(review): when no candidate qualifies, ``hqvid`` is left holding
        # the last ("relativeId") value and is still passed on below --
        # behaviour kept from the original; confirm it is intended.
        for qtyp in ["oriVid", "superVid", "highVid", "norVid", "relativeId"]:
            hqvid = info['data'][qtyp]
            if hqvid != 0 and hqvid != vid:
                info = json.loads(getHtml('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % hqvid))
                break
        return _sohuResolveClipUrls(info, hqvid)

    info = json.loads(getHtml('http://my.tv.sohu.com/play/videonew.do?vid=%s&referer=http://my.tv.sohu.com' % vid))
    return _sohuResolveClipUrls(info, vid)
def _huxiuRelativeTimeToEpoch(timeStr, now):
    """Turn a relative time label into epoch seconds.

    Recognises "N minutes ago", "N hours ago" and "N days ago" as written in
    Chinese on the page (checked in that order); any other label leaves
    ``now`` unchanged.

    :param timeStr: text of the item's time label.
    :param now: reference time in epoch seconds.
    :return: ``now`` minus the offset named by the label.
    """
    # NOTE(review): r1 is defined elsewhere; presumably it returns the
    # captured digits or a falsy value when the pattern does not match.
    units = (
        (u"(\\d{1,2})分钟前", 60),          # minutes ago
        (u"(\\d{1,2})小时前", 3600),        # hours ago
        (u"(\\d{1,2})天前", 24 * 3600),     # days ago
    )
    for pattern, unitSeconds in units:
        amount = r1(pattern, timeStr)
        if amount:
            # int() replaces the Python-2-only long(); the result is the same.
            return now - unitSeconds * int(amount)
    return now


def getHtmlInfo():
    """Scrape the article teasers from the www.huxiu.com front page.

    :return: list of dicts, one per article, with the keys ``url``,
        ``title``, ``newsid``, ``summary``, ``description``, ``thumb``,
        ``keywords``, ``ctime``, ``author`` and ``source``.  The list is
        empty when the page could not be fetched.
    """
    url = r"http://www.huxiu.com"
    newsList = []
    content = getHtml(url)
    if not content:
        return newsList

    soup = BeautifulSoup(content, "html.parser", from_encoding="utf-8")
    itemList = soup.find_all("div", {"class": "mod-b mod-art "})
    itemList += soup.find_all("div", {"class": "mod-b mod-art mod-b-push"})
    for item in itemList:
        head = item.find("", {"class": "mob-ctt"})
        if not head:
            continue
        title = head.find("h3")
        if not title:
            continue
        # An item without a usable link is skipped, like the two guards
        # above, instead of aborting the whole scrape.
        link = title.find("a")
        if not link or not link.get("href"):
            continue

        nInfo = {}
        nInfo["url"] = url + link.get("href")
        nInfo["title"] = link.getText()
        nInfo["newsid"] = getMd5(nInfo["url"])

        summaryDiv = item.find("div", {"class": "mob-sub"})
        nInfo["summary"] = summaryDiv.getText() if summaryDiv else ""
        nInfo["description"] = nInfo["summary"]

        thumbImg = item.find("img", {"class": "lazy"})
        # None mirrors what .get() already yields when the attribute is absent.
        nInfo["thumb"] = thumbImg.get("data-original") if thumbImg else None
        nInfo["keywords"] = ""

        timeSpan = head.find("span", {"class": "time"})
        timeStr = timeSpan.getText() if timeSpan else u""
        nInfo["ctime"] = _huxiuRelativeTimeToEpoch(timeStr, time.time())

        nInfo["author"] = ""
        author_div = item.find("div", {"class": "mob-author"})
        if author_div:
            author_span = author_div.find("span", {"class": "author-name "})
            nInfo["author"] = author_span.getText() if author_span else ""

        # ``ctable`` is a module-level name defined outside this function.
        nInfo["source"] = ctable
        newsList.append(nInfo)
    return newsList
def getVideoByUrl(url):
    """Find the video URL embedded in a page.

    A ``<video ... src='...'>`` tag is tried first.  When that yields
    nothing, the page is searched for an embed carrying a ``data-vid``
    attribute; if the embed's ``src`` mentions ``ku6`` the id is resolved
    through ``getKu6VideoByVid``.

    :param url: address of the page to download via ``getHtml``.
    :return: the video URL, ``None`` when the page could not be fetched, or
        the falsy ``r1`` result when no video could be located.
    """
    content = getHtml(url)
    if not content:
        return None

    # NOTE(review): r1 is defined elsewhere; presumably it returns the
    # captured group or a falsy value (likely None) when nothing matches.
    videoUrl = r1(r"<video.*?src='(.*?)'", content)
    if videoUrl:
        return videoUrl

    sourceWeb = r1(r'src="(.*?)" data-vid', content)
    dataVid = r1(r'data-vid="(.*?)"', content)
    # Check sourceWeb for truthiness first: `"ku6" in None` raises TypeError
    # when the page has no matching embed.
    if sourceWeb and dataVid and "ku6" in sourceWeb:
        return getKu6VideoByVid(dataVid)
    return videoUrl
def getVideoByUrl(url):
    """Resolve a v.ifeng.com video page URL to its ``VideoPlayUrl``.

    The page URL looks like
    ``http://v.ifeng.com/news/world/201408/015041f2-2979-9982-9fb1-950a9390ac64.shtml``.
    The id (last path segment up to its first dot) selects an info document
    at ``http://v.ifeng.com/video_info_new/<c>/<cc>/<id>.xml``, where ``<c>``
    is the second-to-last character of the id and ``<cc>`` its last two
    characters.

    :param url: address of the video page.
    :return: the ``VideoPlayUrl`` attribute of the first child of the XML
        root, or ``None`` when the id cannot be extracted, the info document
        cannot be fetched, or the root has no children.
    """
    vInfo_url_prefix = r'http://v.ifeng.com/video_info_new/'
    # NOTE(review): r1 is defined elsewhere; presumably it returns the
    # captured group or a falsy value when the URL does not match.
    d1 = r1(r'.*/(.*?)\.', url)
    # The directory scheme needs at least two characters of the id.
    if not d1 or len(d1) < 2:
        return None

    vInfo_url = vInfo_url_prefix + d1[-2] + r'/' + d1[-2:] + r'/' + d1 + r'.xml'
    content = getHtml(vInfo_url)
    if not content:
        return None

    root = ET.fromstring(content)
    # A root without children would make root[0] raise IndexError.
    if len(root) == 0:
        return None
    return root[0].attrib.get('VideoPlayUrl')
def real_url(host,vid,tvid,new,clipURL,ck):
    """Fetch the ``url`` field the given host reports for one clip.

    A request URL is assembled on ``host`` from the clip path, the ``new``
    and ``ck`` tokens and the video id, plus a millisecond timestamp
    (``uid``) and a random float (``t``).  The response is parsed as JSON.

    :param host: host name the request is sent to.
    :param vid: video id; converted with ``str()``.
    :param tvid: accepted for interface compatibility; not used here.
    :param new: value of the ``new`` query parameter.
    :param clipURL: value of the ``file`` query parameter.
    :param ck: value of the ``key`` query parameter.
    :return: the ``url`` entry of the decoded JSON response.
    """
    millis = str(int(time.time()*1000))
    nonce = str(random.random())
    pieces = [
        'http://', host,
        '/?prot=9&prod=flash&pt=1&file=', clipURL,
        '&new=', new,
        '&key=', ck,
        '&vid=', str(vid),
        '&uid=', millis,
        '&t=', nonce,
    ]
    # join() only accepts strings, so a non-string piece raises TypeError
    # just as the original concatenation did.
    request = ''.join(pieces)
    reply = json.loads(getHtml(request))
    return reply['url']