def tudou_with_youku_info(url, vidfmt):
    """Resolve a Tudou page that embeds a Youku player.

    Scrapes the ``vcode`` (the Youku video id) out of the Tudou page,
    rebuilds the canonical Youku watch URL and delegates to the FLVCD
    parser for the actual download info.
    """
    page = HttpUtil().get(url)
    match = re.search(r'vcode\s*[:=]\s*\'([^\']+)\'', page)
    youku_url = 'http://v.youku.com/v_show/id_{0}.html'.format(match.group(1))
    import flvcd
    return flvcd.FLVCD().info(youku_url, vidfmt)
def __init__(self, axel, proxy=None, log=None):
    """Initialise the m3u8 stream worker.

    @param axel: downloader that executes queued UrlTask objects
    @param proxy: optional proxy passed straight to the HTTP client
    @param log: logger forwarded to the ThreadBase base class
    """
    ThreadBase.__init__(self, log=log)
    self.__axel = axel
    self.__http = HttpUtil()
    if proxy:
        self.__http.set_proxy(proxy)
    # bookkeeping: clip urls already scheduled, and their pending tasks
    self.__oldurls = []
    self.__urltsks_q = Queue.Queue()
    self.__progress_bar = ProgressBar()
def info(self, url):
    """Return (album title, list of episode page urls) for a sohu.com url.

    Raises ValueError when the url is not a sohu.com address.
    """
    if url.find('sohu.com') < 0:
        raise ValueError('not a sohu.com video url')
    import json
    import re
    page = HttpUtil().get(url)
    playlist_id = re.findall(r'var playlistId="(?P<s>[^"]*?)";', page)[0]
    listing_url = r'http://pl.hd.sohu.com/videolist?playlistid=%s' % playlist_id
    # the playlist endpoint answers in GBK-encoded JSON
    data = json.loads(HttpUtil().get(listing_url), encoding='gbk')
    title = data['albumName']
    items = [video['pageUrl'] for video in data['videos']]
    return title, items
def tudou_download(url, vidfmt):
    """Download a Tudou video given its page url.

    Extracts the internal item id (``iid``) and title from the page and
    hands off to tudou_download_by_iid.
    """
    http = HttpUtil()
    page = http.get(url).decode(http.parse_charset())
    # two known markup variants carry the item id
    iid = r1(r'"k":([^,]*),', page) or r1(r'iid\s*[:=]\s*(\d+)', page)
    assert iid
    title = r1(r"kw\s*[:=]\s*['\"]([^']+)['\"]", page)
    assert title
    return tudou_download_by_iid(iid, unescape_html(title))
def info(self, url):
    """Return (title, items) scraped from a youku.com video page.

    Raises ValueError when the url is not a youku.com address.
    """
    if 'youku.com' not in url:
        raise ValueError('not a youku.com video url')
    page = HttpUtil().get(url)
    soup = BeautifulSoup(page)
    self.title = self.__title(page, soup)
    self.items = self.__items(page, soup)
    return self.title, self.items
def __info(self, url, vidfmt):
    """Collect (track name, playable url) pairs for a yytingting.com book.

    @param url: a yytingting.com page url containing ``bookId=<digits>``
    @param vidfmt: unused here, kept for interface parity with siblings
    @return: list of (resName, resolved play url) tuples
    """
    parse_url = 'http://www.yytingting.com/bookstore/playAndDownload.action?' \
                'id=%s&pageNo=%d&pageSize=%d'
    book_id = _util.r1('bookId=(\d+)', url)
    http = HttpUtil()
    http.add_header('Referer', url)
    # first request only to learn the paging parameters
    first = json.loads(http.get(parse_url % (book_id, 1, 20)))
    pageSize = first['data']['pageSize']
    total = first['data']['total']
    # BUG FIX: the old code iterated range(total/pageSize), which drops the
    # final partial page (e.g. total=45, pageSize=20 fetched only 2 of 3
    # pages). Round up instead.
    page_count = (total + pageSize - 1) // pageSize
    fmt = 'http://www.yytingting.com/resource/getPlayUrl.action?id=%d&type=6'
    named = []
    for i in range(page_count):
        js = json.loads(http.get(parse_url % (book_id, i + 1, pageSize)))
        named += [(d['resName'], fmt % d['resId'])
                  for d in js['data']['data']]
    # second pass: resolve every resource id into its real media url
    urls = []
    for name, res_url in named:
        js = json.loads(http.get(res_url))
        urls.append((name, js['data']['url']))
    return urls
def info(self, url, merge=True, vidfmt=0):
    """Resolve download urls for a Sohu video page.

    format_op = ["norVid", "highVid", "superVid", "oriVid"]

    @param vidfmt: quality index into format_op (0..3)
    @return: (urls, title, 'mp4', 5, None)

    FIX: the first parameter was misspelled ``slef``; renamed to ``self``
    (it was never referenced in the body, so behavior is unchanged).
    """
    assert vidfmt in (0, 1, 2, 3)
    http = HttpUtil()
    vid_page = http.get(url)
    vid = r1('vid="(\d+)"', vid_page)
    if not vid:
        vid = r1('vid:\s*\'(\d+)\'', vid_page)
    assert vid
    import json
    html = http.get('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % vid)
    data = json.loads(html.decode(http.parse_charset()))
    if vidfmt > 0:
        # re-query with the vid of the requested quality level
        format_op = ["norVid", "highVid", "superVid", "oriVid"]
        vid = data['data'][format_op[vidfmt]]
        html = http.get('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % vid)
        data = json.loads(html.decode(http.parse_charset()))
    host = data['allot']
    prot = data['prot']
    urls = []
    data = data['data']
    title = data['tvName']
    size = sum(data['clipsBytes'])
    assert len(data['clipsURL']) == len(data['clipsBytes']) == len(data['su'])
    # each clip url must be signed via real_url before it is fetchable
    # (loop var renamed from `file`, which shadowed the builtin)
    for clip, new in zip(data['clipsURL'], data['su']):
        urls.append(real_url(host, prot, clip, new))
    assert data['clipsURL'][0].endswith('.mp4')
    return urls, title, 'mp4', 5, None
def __get_content_len(self, url):
    """Return the Content-Length of *url* in bytes, or 0 when unknown.

    Tries a cheap HEAD first; falls back to opening a GET response and
    reading its headers. 0 is the sentinel for "could not determine"
    (matching the URLError path of the original code).
    """
    http = HttpUtil()
    if self.proxy:
        http.set_proxy(self.proxy)
    info = http.head(url)
    if 200 <= info.status < 300:
        # `in` instead of the Py2-only dict.has_key()
        if 'Content-Length' in info.msg.dict:
            return int(info.getheader('Content-Length'))
    try:
        resp = http.get_response(url)
    except urllib2.URLError as e:
        self.log.warn('%s \n %s', e.reason, url)
        return 0
    try:
        if 200 <= resp.code < 300:
            # assert resp.has_header('Accept-Ranges')
            # FIX: guard against a missing header (int(None) raised before)
            length = resp.headers.get('Content-Length')
            return int(length) if length is not None else 0
        # FIX: the old code fell off the end here, implicitly returning
        # None while every other failure path returned 0.
        return 0
    finally:
        # FIX: the response is now closed on every path, not only success
        resp.close()
def __init__(self):
    """Prepare the HTTP client to impersonate the ifeng flash player
    and reset all per-session state."""
    self.http = HttpUtil(charset="utf-8")
    self.http.header_refer_ = "http://v.ifeng.com/include/ifengLivePlayer_v1.40.4.swf"
    self.http.header_user_agent_ = r"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
    # extra headers the live server expects from the flash client
    for key, value in (("x-flash-version", "11,5,502,146"),
                       ("Accept-Language", "zh-CN"),
                       ("Accept", "*/*"),
                       ("Proxy-Connection", "Keep-Alive")):
        self.http.add_header(key, value)
    # per-session scraping state
    self.uuid = ""
    self.flv_location = ""
    self.schedule_json = None
    self.channels = {}
    self.down_handle = None
def info(self, url, vidfmt): parse_url = 'http://www.flvcd.com/parse.php?' parse_url += 'kw=' + quote(url) parse_url += '&flag=one' format = ['', 'high', 'super', 'real'] if vidfmt > 0: parse_url += '&format=%s' % format[vidfmt] parse_url += "&Go=1&go=1" # 20150723 http = HttpUtil() http.add_header('Referer', parse_url) print parse_url try: html = http.get(parse_url).decode('gb2312', 'ignore') from bs4 import BeautifulSoup soup = BeautifulSoup(html) m3u = soup.find('input', attrs={'name': 'inf'}).get('value') title = soup.find('input', attrs={'name': 'name'}).get('value') except Exception as e: # raise ValueError('not support') return [], '', None, 0, None urls = [u for u in m3u.split('|')] npf, headers = host_filter(url) return urls, title, None, npf, headers
def info(self, url, vidfmt): parse_url = 'http://www.flvcd.com/parse.php?' parse_url += 'kw='+ quote(url) parse_url += '&flag=one' format = ['', 'high', 'super', 'real'] if vidfmt > 0: parse_url += '&format=%s'%format[vidfmt] parse_url += "&Go=1&go=1" # 20150723 http = HttpUtil() http.add_header('Referer', parse_url) print parse_url try: html = http.get(parse_url).decode('gb2312', 'ignore') from bs4 import BeautifulSoup soup = BeautifulSoup(html) m3u = soup.find('input', attrs={'name': 'inf'}).get('value') title = soup.find('input', attrs={'name': 'name'}).get('value') except Exception as e: # raise ValueError('not support') return [], '', None, 0, None urls = [u for u in m3u.split('|')] npf, headers = host_filter(url) return urls, title, None, npf, headers
def w56_download_by_id(id, refer, vidfmt=0, merge=True):
    """Resolve the direct media url for a 56.com video id.

    @param vidfmt: 0 'normal', 1 'clear', 2 'super'
    @return: ([url], title, ext, 1, None)
    """
    meta = json.loads(
        HttpUtil().get('http://vxml.56.com/json/%s/?src=site' % id))['info']
    title = meta['Subject']
    assert vidfmt in (0, 1, 2)
    quality = ['normal', 'clear', 'super'][vidfmt]
    # exactly one rfile entry is expected per quality level
    matches = [entry for entry in meta['rfiles'] if entry['type'] == quality]
    assert len(matches) == 1
    size = int(matches[0]['filesize'])
    url = matches[0]['url']
    ext = r1(r'\.([^.]+)\?', url)
    assert ext in ('flv', 'mp4')
    return [url], title, str(ext), 1, None
class M3u8Stream(ThreadBase):
    """Records a live m3u8 stream: polls the playlist, downloads new clips
    through an external downloader (axel) and appends them, in order, to an
    output stream."""

    def __init__(self, axel, proxy=None, log=None):
        # axel: downloader that executes queued UrlTask objects
        ThreadBase.__init__(self, log=log)
        self.__oldurls = []              # clip urls already scheduled (dedup)
        self.__urltsks_q = Queue.Queue() # pending tasks, in playlist order
        self.__axel = axel
        self.__http = HttpUtil()
        self.__progress_bar = ProgressBar()
        if proxy:
            self.__http.set_proxy(proxy)

    def recode(self, url, duration, vfmt, fp, npf, freq=10, detach=False):
        """ @param npf: download url stream by n parts per file
            @param vfmt: live video format """
        # duration: stop after this many seconds (0/None = run until stopped)
        # fp: writable output stream receiving the merged clips
        # freq: seconds between playlist refreshes
        # detach: True -> run in the background thread, False -> run inline
        self.m3u8url = url
        self.duration = duration
        self.vfmt = int(vfmt)  # TODO: ugly conversion
        self.__ostream = fp
        self.__npf = npf
        self.__freq = freq
        if detach:
            self.start()
        else:
            self.run()

    def run(self):
        # Thread entry point: drive the polling loop and always drain the
        # task queue on the way out so no UrlTask leaks its resources.
        try:
            self.__loop()
        except:
            raise
        finally:
            while not self.__urltsks_q.empty():
                self.__urltsks_q.get().cleanup()
            self.log.debug('[M3u8Stream] stop')

    def __loop(self):
        # Main polling loop: refresh the playlist every self.__freq seconds,
        # schedule unseen clips, and merge finished clips into the output
        # stream strictly in queue (playlist) order.
        last_clip_at = 0
        buff_stream_len = 0     # seconds of stream merged so far (estimate)
        targetduration = 2      # per-clip duration; updated from the playlist
        start_at = time.time()
        stop_at = 0
        if self.duration:
            stop_at = start_at + self.duration
        curr_tsk = None         # clip currently waiting to be merged
        while not self.isSetStop():
            start_at = time.time()
            self.__progress_bar.display()
            if self.duration and start_at >= stop_at:
                self.log.info("[DownloadLiveStream] time's up")
                return
            # get index page every 10s
            if last_clip_at + self.__freq < start_at:
                urls, targetduration = self.__get_curr_m3u8_file(self.m3u8url)
                for url in urls:
                    if url not in self.__oldurls:
                        # each clip is downloaded into memory; patch read()
                        # so the merge step below can pull the whole buffer
                        memfile = BytesIO()
                        memfile.read = memfile.getvalue
                        urltask = UrlTask(url, out=memfile, npf=self.__npf,
                                          bar=self.__progress_bar, log=self.log)
                        self.__oldurls.append(url)
                        self.__axel.addTask(urltask)
                        self.__urltsks_q.put(urltask)
                # keep the dedup list bounded
                if len(self.__oldurls) > 100:
                    self.__oldurls = self.__oldurls[-20:]
                last_clip_at = start_at
            # append to stream; handle error; get a new clip
            if curr_tsk:
                if curr_tsk.isArchived():
                    self.log.debug('[M3u8Stream] merge clip, %s', curr_tsk.url)
                    self.__ostream.write(curr_tsk.out.read())
                    curr_tsk.out.close()
                    curr_tsk.cleanup()
                    curr_tsk = None
                    buff_stream_len += targetduration
                elif curr_tsk.isError():
                    self.log.error('[M3u8Stream] error: %s', curr_tsk.url)
                    curr_tsk.cleanup()
                    # NOTE(review): bare `raise` with no active exception —
                    # in CPython 2 this re-raises the last exception or
                    # raises TypeError; looks questionable, confirm intent.
                    raise
            elif not self.__urltsks_q.empty():
                curr_tsk = self.__urltsks_q.get()
            if time.time() - start_at < 1:
                sleep(1)

    def __get_curr_m3u8_file(self, m3u8url, n=3):
        # Fetch and parse the playlist at m3u8url.
        # Returns (clip urls, targetduration). Recurses into the first
        # suitable variant playlist (chosen by self.vfmt) when the index is
        # a master playlist. Network errors are logged and yield ([], 0).
        # NOTE(review): parameter `n` is unused — confirm before removing.
        urls = []
        sub_m3u8s = []
        targetduration = 0
        try:
            m3u8 = self.__http.get(m3u8url)
            for line in m3u8.splitlines(False):
                line = line.strip(' \n')
                if line == '':
                    continue
                if line.startswith('#'):
                    # e.g. "#EXT-X-TARGETDURATION:10"
                    if line.lower().find('targetduration') > 0:
                        targetduration = int(line.split(':')[1])
                        self.log.debug('[M3u8Stream] targetduration=%d',
                                       targetduration)
                else:
                    if line.startswith('http'):
                        urls.append(line)
                    else:
                        # relative entry: resolve against the playlist host
                        url = urllib.basejoin(M3u8Stream.host_filter(m3u8url),
                                              line)
                        if line.endswith('.m3u8'):
                            sub_m3u8s.append(url)
                        else:
                            urls.append(url)
            sm_len = len(sub_m3u8s)
            if sm_len > 0:
                # master playlist: pick the variant for self.vfmt, clamped
                fmt_index = self.vfmt if self.vfmt < sm_len else sm_len-1
                self.log.debug('[M3u8Stream] use sub m3u8 url: %s',
                               sub_m3u8s[fmt_index])
                return self.__get_curr_m3u8_file(sub_m3u8s[fmt_index])
        except urllib2.URLError as e:
            self.log.warn('[M3u8Stream] network not working: %s', e.message)
        except _socket_timeout:
            self.log.warn('[M3u8Stream] connection timeout')
        except:
            raise
        return urls, targetduration

    @staticmethod
    def host_filter(url):
        # Base url used to resolve relative playlist entries: ifeng urls are
        # joined against the host root only, others against the full path.
        if url.find('ifeng.com') > 0:
            return re.match('(^http[s]?://[^/?]*/)', url).group(0)
        else:
            return re.match('(^http[s]?://.*/)', url).group(0)

# if __name__ == "__main__":
#     main()
class Spider:
    """Scrapes ifeng.com live-stream metadata (channel list, stream uuid,
    signed flv location) and records a channel to a local .flv file."""

    def __init__(self):
        # HTTP client impersonating the ifeng flash live player
        self.http = HttpUtil(charset="utf-8")
        self.http.header_refer_ = "http://v.ifeng.com/include/ifengLivePlayer_v1.40.4.swf"
        self.http.header_user_agent_ = r"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        self.http.add_header("x-flash-version", "11,5,502,146")
        self.http.add_header("Accept-Language", "zh-CN")
        self.http.add_header("Accept", "*/*")
        self.http.add_header("Proxy-Connection", "Keep-Alive")
        # per-session scraping state
        self.uuid = ""
        self.flv_location = ""
        self.schedule_json = None
        self.channels = {}
        self.down_handle = None

    def start_recode(self, channel_name, duration, output='./'):
        # Record `channel_name` for `duration` seconds into a timestamped
        # .flv file under `output` (directory is created if missing).
        output = os.path.abspath(output)
        if not os.path.isdir(output):
            os.mkdir(output)
        outfile = os.path.join(output, util.get_time_string() + ".flv")
        LOG.info("[channel] %s", channel_name)
        uuid = self._get_uuid(channel_name)
        flv_location = self._get_flv_location(uuid)
        LOG.info("[location] %s", flv_location)
        LOG.info("[output] %s", outfile)
        LOG.info("[start.... ] %s", util.get_time_string())
        # NOTE(review): file is opened in text mode "w" for binary flv data
        # — probably should be "wb"; confirm on Windows.
        # NOTE(review): attribute is `download_handle` here but __init__
        # defines `down_handle` — likely a naming slip; confirm.
        self.download_handle = DownloadStreamHandler(open(outfile,"w"), duration)
        self.http.fetch(flv_location, self.download_handle)
        LOG.info("[stop..... ] %s", util.get_time_string())

    def get_channel_info(self):
        # Download the schedule script and build {name: {uuid, url}} for all
        # channels; also caches the raw schedule JSON text.
        data = self.http.get(r'http://v.ifeng.com/live/js/scheduleurls.js?37')
        # the script assigns a JS object literal; cut it out and repair the
        # quoting so it parses as JSON
        tmp = util.reg_helper(data,r'g_scheduelUrl\s=\s(?P<f>.*)}')[0] + '}'
        tmp = tmp.replace("\'","\"").decode(encoding="utf-8")
        js = json.loads(s=tmp, encoding="utf-8")
        for uuid, channel in js.items():
            name = channel['name']
            self.channels[name] = {'uuid': uuid, 'url': channel['url']}
        self.schedule_json = tmp
        return self.channels, self.schedule_json

    def _get_uuid(self,channel_name):
        # Resolve the stream uuid for a channel by scraping its page.
        self.get_channel_info()
        url = self.channels[channel_name]['url']
        data = self.http.get(url)
        html = data.decode(CHARSET)
        # two known page variants embed the uuid differently
        if html.find(r'uuid=') > 0:
            reg_str = r'uuid=(?P<f>[^|]*)'
        else:
            reg_str = r'http://biz.vsdn.tv380.com/playlive.php\?(?P<f>[^|]*)'
        self.uuid = util.reg_helper(html,reg_str)[0]
        LOG.info("[UUID] %s", self.uuid)
        return self.uuid

    def _get_param(self, uuid):
        # Build the signed query string the stream server expects: an md5
        # over a fixed secret, an expiry ~300s in the future and the uuid.
        time_string = str(int(time.time() + 300))
        hash_string = "ifeng" + "7171537bdc0b95c6a23d9e21ea6615ebet720se2zjw" + time_string + uuid + "1" + "ifenuserid="
        hash_result = hashlib.md5(hash_string).hexdigest()
        param = uuid + "&swax_rt=js&ifenai=ifeng&ifenfg=&ifents=" + time_string + "&ifenv=1&ifensg="\
                + hash_result[5:15] + "&ifenuserid="
        return param

    def _get_flv_location(self, uuid):
        # Two-step resolution of the playable flv url: ask soooner.com with
        # the signed params, then follow the returned playurl once more.
        param = self._get_param(uuid)
        url = r'http://ifenglive.soooner.com/?uuid=%s' % (param)
        data = self.http.get(url)
        html = data.decode(CHARSET)
        reg_str = r'playurl="(?P<f>[^"]*)"'
        self.flv_location = util.reg_helper(html,reg_str)[0]
        # NOTE(review): this overwrites the playurl just extracted with the
        # *request* url — looks like it should be
        # self.flv_location.replace(...); confirm against a live response.
        self.flv_location = url.replace("rtmp://", "http://")
        LOG.info("[flv] %s", self.flv_location)
        data = self.http.get(self.flv_location)
        html = data.decode(CHARSET)
        reg_str = r'playurl="(?P<f>[^"]*)"'
        self.flv_location = util.reg_helper(html, reg_str)[0]
        self.flv_location = self.flv_location.replace("rtmp://", "http://")
        return self.flv_location
def real_url(host, prot, file, new):
    """Sign a Sohu clip url.

    Queries the allotted host with the protocol/file/new-name tuple and
    rebuilds the fetchable url from the pipe-separated reply.
    """
    query = 'http://%s/?prot=%s&file=%s&new=%s' % (host, prot, file, new)
    reply = HttpUtil().get(query)
    # reply is 9 pipe-separated fields; we need the base (minus its
    # trailing char) and the signing key
    start, _, host, key, _, _, _, _, _ = reply.split('|')
    return '%s%s?key=%s' % (start[:-1], new, key)
class M3u8Stream(ThreadBase):
    """Records a live m3u8 stream: polls the playlist, schedules new clips
    on an external downloader (axel) and appends finished clips, in order,
    to an output stream."""

    def __init__(self, axel, proxy=None, log=None):
        # axel: downloader that executes queued UrlTask objects
        ThreadBase.__init__(self, log=log)
        self.__oldurls = []              # clip urls already scheduled (dedup)
        self.__urltsks_q = Queue.Queue() # pending tasks, in playlist order
        self.__axel = axel
        self.__http = HttpUtil()
        self.__progress_bar = ProgressBar()
        if proxy:
            self.__http.set_proxy(proxy)

    def recode(self, url, duration, vfmt, fp, npf, freq=10, detach=False):
        """ @param npf: download url stream by n parts per file
            @param vfmt: live video format """
        # duration: stop after this many seconds (0/None = run until stopped)
        # fp: writable output stream receiving the merged clips
        # freq: seconds between playlist refreshes
        # detach: True -> run on the background thread, False -> run inline
        self.m3u8url = url
        self.duration = duration
        self.vfmt = int(vfmt)  # TODO: ugly conversion
        self.__ostream = fp
        self.__npf = npf
        self.__freq = freq
        if detach:
            self.start()
        else:
            self.run()

    def run(self):
        # Thread entry point: run the loop and always drain the task queue
        # on exit so no UrlTask leaks its resources.
        try:
            self.__loop()
        except:
            raise
        finally:
            while not self.__urltsks_q.empty():
                self.__urltsks_q.get().cleanup()
            self.log.debug('[M3u8Stream] stop')

    def __loop(self):
        # Main polling loop: refresh the playlist every self.__freq seconds,
        # schedule unseen clips, and merge finished clips into the output
        # stream strictly in queue (playlist) order.
        last_clip_at = 0
        buff_stream_len = 0     # seconds of stream merged so far (estimate)
        targetduration = 2      # per-clip duration; updated from the playlist
        start_at = time.time()
        stop_at = 0
        if self.duration:
            stop_at = start_at + self.duration
        curr_tsk = None         # clip currently waiting to be merged
        while not self.isSetStop():
            start_at = time.time()
            self.__progress_bar.display()
            if self.duration and start_at >= stop_at:
                self.log.info("[DownloadLiveStream] time's up")
                return
            # get index page every 10s
            if last_clip_at + self.__freq < start_at:
                urls, targetduration = self.__get_curr_m3u8_file(self.m3u8url)
                for url in urls:
                    if url not in self.__oldurls:
                        # clips are downloaded into memory; patch read() so
                        # the merge step below can pull the whole buffer
                        memfile = BytesIO()
                        memfile.read = memfile.getvalue
                        urltask = UrlTask(url, out=memfile, npf=self.__npf,
                                          bar=self.__progress_bar, log=self.log)
                        self.__oldurls.append(url)
                        self.__axel.addTask(urltask)
                        self.__urltsks_q.put(urltask)
                # keep the dedup list bounded
                if len(self.__oldurls) > 100:
                    self.__oldurls = self.__oldurls[-20:]
                last_clip_at = start_at
            # append to stream; handle error; get a new clip
            if curr_tsk:
                if curr_tsk.isArchived():
                    self.log.debug('[M3u8Stream] merge clip, %s', curr_tsk.url)
                    self.__ostream.write(curr_tsk.out.read())
                    curr_tsk.out.close()
                    curr_tsk.cleanup()
                    curr_tsk = None
                    buff_stream_len += targetduration
                elif curr_tsk.isError():
                    self.log.error('[M3u8Stream] error: %s', curr_tsk.url)
                    curr_tsk.cleanup()
                    # NOTE(review): bare `raise` with no active exception —
                    # in CPython 2 this re-raises the last exception or
                    # raises TypeError; looks questionable, confirm intent.
                    raise
            elif not self.__urltsks_q.empty():
                curr_tsk = self.__urltsks_q.get()
            if time.time() - start_at < 1:
                sleep(1)

    def __get_curr_m3u8_file(self, m3u8url, n=3):
        # Fetch and parse the playlist at m3u8url.
        # Returns (clip urls, targetduration). Recurses into the variant
        # playlist chosen by self.vfmt when the index is a master playlist.
        # Network errors are logged and yield ([], 0).
        # NOTE(review): parameter `n` is unused — confirm before removing.
        urls = []
        sub_m3u8s = []
        targetduration = 0
        try:
            m3u8 = self.__http.get(m3u8url)
            for line in m3u8.splitlines(False):
                line = line.strip(' \n')
                if line == '':
                    continue
                if line.startswith('#'):
                    # e.g. "#EXT-X-TARGETDURATION:10"
                    if line.lower().find('targetduration') > 0:
                        targetduration = int(line.split(':')[1])
                        self.log.debug('[M3u8Stream] targetduration=%d',
                                       targetduration)
                else:
                    if line.startswith('http'):
                        urls.append(line)
                    else:
                        # relative entry: resolve against the playlist host
                        url = urllib.basejoin(M3u8Stream.host_filter(m3u8url),
                                              line)
                        if line.endswith('.m3u8'):
                            sub_m3u8s.append(url)
                        else:
                            urls.append(url)
            sm_len = len(sub_m3u8s)
            if sm_len > 0:
                # master playlist: pick the variant for self.vfmt, clamped
                fmt_index = self.vfmt if self.vfmt < sm_len else sm_len - 1
                self.log.debug('[M3u8Stream] use sub m3u8 url: %s',
                               sub_m3u8s[fmt_index])
                return self.__get_curr_m3u8_file(sub_m3u8s[fmt_index])
        except urllib2.URLError as e:
            self.log.warn('[M3u8Stream] network not working: %s', e.message)
        except _socket_timeout:
            self.log.warn('[M3u8Stream] connection timeout')
        except:
            raise
        return urls, targetduration

    @staticmethod
    def host_filter(url):
        # Base url used to resolve relative playlist entries: ifeng urls are
        # joined against the host root only, others against the full path.
        if url.find('ifeng.com') > 0:
            return re.match('(^http[s]?://[^/?]*/)', url).group(0)
        else:
            return re.match('(^http[s]?://.*/)', url).group(0)

# if __name__ == "__main__":
#     main()