Esempio n. 1
0
def getVideoByVid(vid):
    """Look up the play address of a QQ video from its id.

    Requests the ``geturl`` XML endpoint for *vid* through ``getHtml`` and
    extracts the text of the first ``<url>`` element with ``r1``.

    Returns None when ``getHtml`` yields nothing; otherwise whatever ``r1``
    returns for the ``<url>`` pattern.
    """
    query = 'http://vv.video.qq.com/geturl?otype=xml&platform=1&vid=%s&format=2' % vid
    xml_text = getHtml(query)
    if not xml_text:
        return None
    return r1(r'<url>(.*?)</url>', xml_text)
Esempio n. 2
0
def _relativeTimeToEpoch(timeStr, now):
    """Turn a relative "N minutes/hours/days ago" label into epoch seconds.

    The label is matched against the minute, hour and day patterns in that
    order; the first one that matches wins. *now* is returned unchanged when
    none of them match.
    """
    units = (
        (u"(\\d{1,2})分钟前", 60),
        (u"(\\d{1,2})小时前", 3600),
        (u"(\\d{1,2})天前", 24 * 3600),
    )
    for pattern, seconds in units:
        amount = r1(pattern, timeStr)
        if amount:
            return now - seconds * int(amount)
    return now


def getHtmlInfo():
    """Scrape the huxiu.com front page and return its article entries.

    Returns:
        A list of dicts, one per article block that has a headline link, with
        the keys ``url``, ``title``, ``newsid``, ``summary``, ``description``,
        ``thumb``, ``keywords``, ``ctime``, ``author`` and ``source``.
        The list is empty when the page could not be fetched.

    Article blocks without a headline link are skipped. Optional pieces
    (summary, thumbnail, timestamp, author) fall back to an empty string or
    to the current time instead of aborting the whole scrape.
    """
    url = r"http://www.huxiu.com"
    newsList = []
    content = getHtml(url)
    if not content:
        return newsList

    soup = BeautifulSoup(content, "html.parser", from_encoding="utf-8")
    itemList = soup.find_all("div", {"class": "mod-b mod-art "})
    itemList += soup.find_all("div", {"class": "mod-b mod-art mod-b-push"})
    for item in itemList:
        head = item.find("", {"class": "mob-ctt"})
        if head is None:
            continue
        heading = head.find("h3")
        if heading is None:
            continue
        # find() returns None when the element is absent, so check the link
        # and its href before using them.
        link = heading.find("a")
        href = link.get("href") if link is not None else None
        if not href:
            continue

        nInfo = {}
        nInfo["url"] = url + href
        nInfo["title"] = link.getText()
        nInfo["newsid"] = getMd5(nInfo["url"])

        summary_div = item.find("div", {"class": "mob-sub"})
        nInfo["summary"] = summary_div.getText() if summary_div is not None else ""
        nInfo["description"] = nInfo["summary"]

        thumb_img = item.find("img", {"class": "lazy"})
        nInfo["thumb"] = thumb_img.get("data-original") if thumb_img is not None else ""
        nInfo["keywords"] = ""

        # The page only shows relative times; with no label the article is
        # stamped with the time of the scrape.
        time_span = head.find("span", {"class": "time"})
        timeStr = time_span.getText() if time_span is not None else u""
        nInfo["ctime"] = _relativeTimeToEpoch(timeStr, time.time())

        nInfo["author"] = ""
        author_div = item.find("div", {"class": "mob-author"})
        if author_div is not None:
            author_span = author_div.find("span", {"class": "author-name "})
            if author_span is not None:
                nInfo["author"] = author_span.getText()

        nInfo["source"] = ctable
        newsList.append(nInfo)
    return newsList
Esempio n. 3
0
def getVideoByUrl(url):
    """Resolve the play address for a v.qq.com page URL.

    Everything after the first ``vid=`` in *url* is taken as the video id
    (e.g. http://v.qq.com/news/?tag=hot&vid=a00153364t6) and handed to
    ``getVideoByVid``, whose result is returned as-is.
    """
    return getVideoByVid(r1(r'.*?vid=(.*)', url))
Esempio n. 4
0
def getVideoByUrl(url):
    """Extract the video address from a page fetched from *url*.

    The page is first searched for a ``<video ... src='...'>`` tag. If there
    is none, the page is checked for an embedded player carrying a
    ``data-vid`` attribute; when that player's ``src`` mentions ``ku6`` the
    id is resolved through ``getKu6VideoByVid``.

    Returns None when the page could not be fetched; otherwise the address
    found, or the (falsy) result of the ``<video>`` search when nothing
    matched.
    """
    content = getHtml(url)
    if not content:
        return None

    videoUrl = r1(r"<video.*?src='(.*?)'", content)
    if videoUrl:
        return videoUrl

    sourceWeb = r1(r'src="(.*?)" data-vid', content)
    dataVid = r1(r'data-vid="(.*?)"', content)
    # sourceWeb is checked for truthiness first: `"ku6" in None` would raise
    # TypeError if r1 returns None when the pattern is absent.
    if sourceWeb and dataVid and "ku6" in sourceWeb:
        return getKu6VideoByVid(dataVid)
    return videoUrl
Esempio n. 5
0
def getVideoByUrl(url):
    """Extract the ``videoUrl=`` value from a ``<param>`` tag of the page.

    Returns None when ``getHtml`` yields nothing for *url*; otherwise
    whatever ``r1`` returns for the ``<param ... videoUrl=...">`` pattern.
    """
    page = getHtml(url)
    if not page:
        return None
    return r1(r'<param.*?videoUrl=(.*?)"', page)
Esempio n. 6
0
def getVideoByUrl(url):
    """Build the list of clip URLs for a Sohu video page.

    The video id is read from the URL itself for share.vrs.sohu.com links and
    from the page source otherwise. For tv.sohu.com pages the first quality
    id (original, super, high, normal, relative) that is non-zero and differs
    from the page id replaces the initial info record. Every clip described
    by the final record is converted with ``real_url``.

    Returns:
        A list with one ``real_url`` result per clip.

    Raises:
        AssertionError: when no video id is found, or when the ``clipsURL``,
            ``clipsBytes`` and ``su`` lists differ in length.
    """
    if re.match(r'http://share.vrs.sohu.com', url):
        vid = r1('id=(\d+)', url)
    else:
        page = getHtml(url)  # .decode('gbk')
        vid = r1(r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?', page)
    assert vid

    if re.match(r'http://tv.sohu.com/', url):
        info = json.loads(getHtml('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % vid))
        for quality_key in ["oriVid", "superVid", "highVid", "norVid", "relativeId"]:
            hqvid = info['data'][quality_key]
            if hqvid == 0 or hqvid == vid:
                continue
            info = json.loads(getHtml('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % hqvid))
            break
        # The id of the last quality entry examined is the one passed to
        # real_url, whether or not the loop ended through `break`.
        clip_vid = hqvid
    else:
        info = json.loads(getHtml('http://my.tv.sohu.com/play/videonew.do?vid=%s&referer=http://my.tv.sohu.com' % vid))
        clip_vid = vid

    host = info['allot']
    tvid = info['tvid']
    data = info['data']
    assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])

    urls = []
    for new, clip, ck in zip(data['su'], data['clipsURL'], data['ck']):
        urls.append(real_url(host, clip_vid, tvid, new, urlparse(clip).path, ck))
    return urls
Esempio n. 7
0
def getVideoByUrl(url):
    """Resolve the play address of a v.ifeng.com video page.

    The page URL looks like
    http://v.ifeng.com/news/world/201408/015041f2-2979-9982-9fb1-950a9390ac64.shtml
    and the matching info document lives at
    http://v.ifeng.com/video_info_new/<c>/<cc>/<id>.xml
    where <id> is the file name without extension, <cc> its last two
    characters and <c> the first of those two.

    Returns:
        The ``VideoPlayUrl`` attribute of the first child of the info
        document's root, or None when *url* is empty, carries no id, the
        info document could not be fetched, or its root has no children.

    Raises:
        Whatever ``ET.fromstring`` raises for malformed XML.
    """
    if not url:
        return None
    vInfo_url_prefix = r'http://v.ifeng.com/video_info_new/'
    d1 = r1(r'.*/(.*?)\.', url)
    # Without an id there is no info URL to build; previously this fell
    # through to len(None) and raised TypeError.
    if not d1:
        return None
    tail = d1[-2:]
    vInfo_url = vInfo_url_prefix + tail[0] + r'/' + tail + r'/' + d1 + r'.xml'

    content = getHtml(vInfo_url)
    if not content:
        return None
    root = ET.fromstring(content)
    if len(root) == 0:
        return None
    return root[0].attrib.get('VideoPlayUrl')
Esempio n. 8
0
def getVInfoUrl(url):
    """Build the video-info XML address for a v.ifeng.com page URL.

    The id is the page's file name without its extension; the info document
    is stored under ``video_info_new/<c>/<cc>/<id>.xml`` where <cc> is the
    last two characters of the id and <c> the first of those two.

    Returns:
        The info URL, or None when *url* is empty or no id can be extracted
        from it.
    """
    if not url:
        return None
    d1 = r1(r'.*/(.*?)\.', url)
    # No id means no info URL; returning None avoids the TypeError that
    # len(None) used to raise here.
    if not d1:
        return None
    tail = d1[-2:]
    return r'http://v.ifeng.com/video_info_new/' + tail[0] + r'/' + tail + r'/' + d1 + r'.xml'
Esempio n. 9
0
    def get(self, call):
        """Resolve the play address(es) of a video and render them as XML.

        Request arguments:
            web    -- site key; the branches below handle 'china', 'ifeng',
                      'kankan', 'qq', 'sina', 'sohu' and 'v1'.
            vid    -- video id on that site.
            userid -- optional, defaults to 'anonymous'.
            mode   -- optional, defaults to videoinfo.click_mod['auto'].

        The click is recorded through ``videoinfo.trackUser`` first. Remote
        pages are then fetched with Tornado's asynchronous HTTP client; this
        method contains ``yield`` and is therefore a generator (presumably
        wrapped by a Tornado coroutine decorator outside this view; confirm).
        Any failure while resolving is logged and leaves the result empty, so
        the template is always rendered.

        *call* is not used in the body.
        """
        web = str(self.get_argument('web'))
        vid = str(self.get_argument('vid'))
        userid = str(self.get_argument('userid', 'anonymous'))
        userip = str(self.request.remote_ip)
        mode = str(self.get_argument('mode', videoinfo.click_mod['auto']))

        # User tracking happens before, and independently of, the parsing.
        videoinfo.trackUser(web, vid, userid, userip, mode)

        urls = None
        html_url = None  # only used for the error log below
        try:
            http_client = tornado.httpclient.AsyncHTTPClient()
            if web == 'china':
                url = china.getUrlByVid(vid)
                html_url = url
                videoUrl = None
                if url:
                    response = yield http_client.fetch(url)
                    if response:
                        content = response.body
                        videoUrl = r1(r"<video.*?src='(.*?)'", content)
                        if not videoUrl:
                            sourceWeb = r1(r'src="(.*?)" data-vid', content)
                            dataVid = r1(r'data-vid="(.*?)"', content)
                            # Truthiness check first: `'ku6' in None` would
                            # raise TypeError on pages without an embed.
                            if sourceWeb and dataVid and 'ku6' in sourceWeb:
                                url = r'http://v.ku6.com/fetchVideo4Player/' + dataVid + r'.html'
                                resp2 = yield http_client.fetch(url)
                                if resp2:
                                    videoUrl = china.getKu6VideoUrlByContent(resp2.body)
                urls = videoUrl
            elif web == 'ifeng':
                url = ifeng.getUrlByVid(vid)
                html_url = url
                url = ifeng.getVInfoUrl(url)
                videoUrl = None
                if url:
                    response = yield http_client.fetch(url)
                    if response:
                        videoUrl = ifeng.getVideoUrlByContent(response.body)
                urls = videoUrl
            elif web == 'kankan':
                url = kankan.getUrlByVid(vid)
                html_url = url
                videoUrl = None
                if url:
                    response = yield http_client.fetch(url)
                    if response:
                        videoUrl = kankan.getVideoDirectByContent(response.body)
                        if not videoUrl:
                            part1 = r1(r'(/\d{4}-\d{2}-\d{2}/\w*?)\.', url)
                            # Skip the XML fallback when the page URL has no
                            # date/name part to build the XML address from.
                            if part1:
                                xml_url = r'http://www.kankanews.com/vxml%s.xml' % part1
                                resp2 = yield http_client.fetch(xml_url)
                                if resp2:
                                    videoUrl = kankan.getVideoInfoByContent(resp2.body)
                urls = videoUrl
            elif web == 'qq':
                url = 'http://vv.video.qq.com/geturl?otype=xml&platform=1&vid=%s&format=2' % vid
                html_url = url
                videoUrl = None
                response = yield http_client.fetch(url)
                if response:
                    content = response.body
                    if content:
                        videoUrl = r1(r'<url>(.*?)</url>', content)
                urls = videoUrl
            elif web == 'sina':
                urls = sina.getVideoByVid(vid)
            elif web == 'sohu':
                url = sohu.getUrlByVid(vid)
                html_url = url
                videoUrl = []
                if re.match(r'http://tv.sohu.com/', url):
                    json_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % vid
                    response = yield http_client.fetch(json_url)
                    if response:
                        try:
                            info = json.loads(response.body)
                            # Switch to the first quality id that is set and
                            # differs from the requested one.
                            for qtyp in ["oriVid", "superVid", "highVid", "norVid", "relativeId"]:
                                hqvid = info['data'][qtyp]
                                if hqvid != 0 and hqvid != vid:
                                    resp2 = yield http_client.fetch('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % hqvid)
                                    info = json.loads(resp2.body)
                                    break
                            videoUrl = sohu.getRealUrlByInfo(info, hqvid)
                        except Exception:
                            # Best effort: keep the empty list, but leave a
                            # trace instead of failing silently.
                            logging.info('sohu video info error:%s', json_url, exc_info=True)
                else:
                    json_url = 'http://my.tv.sohu.com/play/videonew.do?vid=%s&referer=http://my.tv.sohu.com' % vid
                    response = yield http_client.fetch(json_url)
                    if response:
                        try:
                            info = json.loads(response.body)
                            videoUrl = sohu.getRealUrlByInfo(info, vid)
                        except Exception:
                            logging.info('sohu video info error:%s', json_url, exc_info=True)
                urls = videoUrl
            elif web == 'v1':
                url = v1.getUrlByVid(vid)
                html_url = url
                videoUrl = None
                if url:
                    response = yield http_client.fetch(url)
                    if response:
                        content = response.body
                        if content:
                            videoUrl = r1(r'<param.*?videoUrl=(.*?)"', content)
                urls = videoUrl
        except Exception:
            # Resolution is best effort; the response is still rendered with
            # whatever `urls` holds at this point.
            logging.info('video parse error:%s', html_url, exc_info=True)

        records = self.getRecords(urls)
        self.set_header('Content-Type', 'application/xml')
        # Debug output of the client's IP address; parenthesised so the line
        # parses on both Python 2 and Python 3.
        print(self.request.remote_ip)
        self.render2('video.xml', records=records)