import json
import logging
import re
import urllib.parse
from html import unescape

from lxml import etree

# NOTE: the project-local helpers used below (get_content, match1, news_type,
# real_url, url_info, download_urls) and globals (SITES, COD, output_dir,
# headers) are assumed to be provided by the surrounding crawler package.


def eastday_video_download(url):
    html = get_content(url)
    title = match1(html, r'var\s*redirect_topic\s*=\s*[\'"](.*?)[\'"];')
    if title is None:
        title = match1(
            html,
            r'<meta\s*name=[\'"]description[\'"]\s*content=[\'"](.*?)[\'"]/>')
    source = match1(html, r'var\s*d_source\s*=\s*[\'"](.*?)[\'"];')
    if source is None:
        source = "crawl"
    thumbnail_url = match1(html,
                           r'var\s*global_share_img\s*=\s*[\'"](.*?)[\'"];')
    video_url = match1(html, r'var\s*mp4\s*=\s*[\'"](.*?)[\'"];')
    # Page URLs may be protocol-relative ("//host/..."); prepend a scheme.
    if not video_url.startswith("http"):
        video_url = "http:{}".format(video_url)
    if not thumbnail_url.startswith("http"):
        thumbnail_url = "http:{}".format(thumbnail_url)
    data = {
        "type": 'video',
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": None,
        "size": None,
    }
    return data

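# The match1 helper used throughout this module is project-local; a minimal
# reference sketch of the assumed behaviour (modelled on you-get's match1:
# return the first capture group of the first matching pattern, else None)
# follows. The name _match1_sketch is hypothetical and unused by the module.
def _match1_sketch(text, *patterns):
    for pattern in patterns:
        found = re.search(pattern, text)
        if found:
            return found.group(1)
    return None
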
def ku6_download(url):
    html = get_content(url)
    type = news_type(url)
    title = match1(
        html,
        r"\$\(['\"]#video-title['\"]\)\.text\(['\"]([\s\S\w\W]+?)['\"]\);")
    if title is None:
        title = match1(html,
                       r"document\.title\s*=\s*['\"]([\s\S\w\W]+?)['\"];")
    title = title.strip()
    source = match1(
        html, r"\$\(['\"]#video-author['\"]\)\.text\(['\"](.*?)['\"]\);")
    img_url = match1(
        html, r'[\'"]poster[\'"]:\s*[\'"](.*?)[\'"],\s*[\'"]controls[\'"]:')
    video_url = match1(
        html,
        r'this\.src\(\{type:\s*[\'"]video/mp4[\'"], src: [\'"](.*?)[\'"]}\);')
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [img_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": None,
        "size": None,
    }
    return data

def ifeng_download(url,
                   title=None,
                   output_dir=output_dir,
                   merge=True,
                   info_only=False,
                   **kwargs):
    # Old pattern: /uuid.shtml; newer pages carry the uuid after '#'.
    id = match1(
        url,
        r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})')
    if id:
        return ifeng_download_by_id(
            id, None, output_dir=output_dir, merge=merge, info_only=info_only)
    html = get_content(url)
    uuid_pattern = (r'"([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-'
                    r'[0-9a-f]{4}-[0-9a-f]{12})"')
    id = match1(
        html,
        r'var vid="([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})"'
    )
    if id is None:
        video_pattern = r'"vid"\s*:\s*' + uuid_pattern
        id = match1(html, video_pattern)
    assert id, "can't find video id"
    return ifeng_download_by_id(
        id,
        title=title,
        output_dir=output_dir,
        merge=merge,
        info_only=info_only)

def check_url(url, username, mongo):
    """
    Check that the URL is well-formed and that its site supports
    one-click conversion.
    :return: site key in SITES
    """
    video_host = match1(url, r'http[s]?://([^/]+)/')
    logging.debug(video_host)
    video_url = match1(url, r'http[s]?://[^/]+(.*)')
    logging.debug(video_url)
    if not (video_host and video_url):
        if mongo.exists(url):
            mongo.update(url, COD.FORMAT)
            raise AssertionError("malformed url")
        else:
            info = mongo.info(url, COD.FORMAT, username)
            mongo.insert(info)
            raise AssertionError("malformed url: {}".format(url))
    # Drop the query string; ku6 is the exception and keeps it.
    if re.search(r'\?', video_url) and video_host != "www.ku6.com":
        url = url.replace(match1(video_url, r'(\?.*)'), '')
    if video_host.endswith('.com.cn') or video_host.endswith('.ac.cn'):
        video_host = video_host[:-3]
    domain = match1(video_host, r'(\.[^.]+\.[^.]+)$') or video_host
    k = match1(domain, r'([^.]+)')
    # Temporary workaround for acfun.
    if k == "acfun":
        url = url.replace("https", "http")
    # Temporary workaround for qq.
    if re.search(r"new\.qq\.com", url) or re.search(r"v\.qq\.com", url):
        k = "qq"
    logging.debug("site is {}".format(k))
    if k not in SITES:
        if mongo.exists(url):
            mongo.update(url, COD.URLES)
        else:
            info = mongo.info(url, COD.URLES)
            mongo.insert(info)
        raise AssertionError('unsupported url, k={}'.format(k))
    if mongo.exists(url):
        if mongo.block(url):
            mongo.update(url, COD.URLEX)
            raise AssertionError('duplicate url')
    else:
        info = mongo.info(url, COD.BEGIN, username)
        mongo.insert(info)
    return k, url

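# Usage sketch (hypothetical wrapper, not in the original module): check_url()
# both validates the URL and registers it in Mongo, raising AssertionError
# with a user-facing message on rejection, so callers typically guard it.
def try_check_url(url, username, mongo):
    try:
        return check_url(url, username, mongo)  # -> (site_key, cleaned_url)
    except AssertionError as err:
        logging.warning("rejected %s: %s", url, err)
        return None
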
def baomihua_download(url):
    html = get_content(url)
    type = news_type(url)
    title = match1(html, r"var\s*temptitle\s*=\s*'(.*?)';")
    source = match1(html, r"var\s*appName\s*=\s*\"(.*?)\";")
    img_url = match1(html, r"var\s*pic\s*=\s*\"(.*?)\";")
    _id = match1(html, r'flvid\s*=\s*(\d+)')
    if type == "video":
        return baomihua_download_by_id(_id, title, source, img_url, type)

def baomihua_download_by_id(_id, title, source, img_url, type):
    html = get_content(
        'http://play.baomihua.com/getvideourl.aspx?flvid={}&devicetype='
        'phone_app'.format(_id))
    host = match1(html, r'host=([^&]*)')
    _type = match1(html, r'videofiletype=([^&]*)')
    vid = match1(html, r'&stream_name=([^&]*)')
    dir_str = match1(html, r'&dir=([^&]*)').strip()
    video_url = 'http://{}/{}/{}.{}'.format(host, dir_str, vid, _type)
    logging.debug("url is {}".format(video_url))
    if title is None:
        title = match1(html, r'&title=([^&]*)')
        title = urllib.parse.unquote(title)
    if source is None:
        return None
    if img_url is None:
        img_url = match1(html, r'&video_img=([^&]*)')
    ext = _type
    # The response reports the size in bytes; convert to MB, two decimals.
    size = int(match1(html, r'&videofilesize=([^&]*)'))
    size = float("{:.2f}".format(size / 1024 / 1024))
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [img_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": ext,
        "size": size,
    }
    return data

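# Sketch (assumption, not in the original): getvideourl.aspx answers with a
# querystring-like "key=value&key=value" body, which is why the per-field
# regexes above work. urllib.parse.parse_qs can extract the same fields; the
# field names are the ones the regexes reference.
def _parse_baomihua_response_sketch(body):
    fields = urllib.parse.parse_qs(body)

    def first(name, default=None):
        return fields.get(name, [default])[0]

    return 'http://{}/{}/{}.{}'.format(
        first('host'), (first('dir') or '').strip(),
        first('stream_name'), first('videofiletype'))
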
def ifeng_download_by_id(id,
                         title=None,
                         output_dir=output_dir,
                         merge=True,
                         info_only=False):
    assert match1(
        id,
        r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'), id
    url = 'http://vxml.ifengimg.com/video_info_new/{}/{}/{}.xml'.format(
        id[-2], id[-2:], id)
    xml = get_content(url)
    # Title
    title_real = match1(xml, r'Name="([^"]+)"')
    title_real = unescape(title_real)
    # Source (column name)
    source = match1(xml, r'ColumnName="([^"]+)"')
    source = unescape(source)
    # Thumbnail
    thumbnail_urls = match1(xml, r'SmallPosterUrl="([^"]+)"')
    # Video download link
    video_url = match1(xml, r'VideoPlayUrl="([^"]+)"')
    video_url = video_url.replace('http://wideo.ifeng.com/',
                                  'http://ips.ifeng.com/wideo.ifeng.com/')
    type, ext, size = url_info(video_url)
    data = {
        "title": title_real,
        "source": source,
        "thumbnail_urls": thumbnail_urls,
        "video_url": video_url,
    }
    if not info_only:
        download_urls([video_url],
                      title,
                      ext,
                      size,
                      output_dir,
                      merge=merge,
                      headers=headers)
    return data

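# The metadata URL above is derived from the uuid alone; for example (uuid
# made up), with id = '01234567-89ab-cdef-0123-456789abcdef', id[-2] is 'e'
# and id[-2:] is 'ef', giving
# http://vxml.ifengimg.com/video_info_new/e/ef/01234567-89ab-cdef-0123-456789abcdef.xml
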
def bilibili_download(url):
    response = get_content(url)
    html = etree.HTML(response)
    # Prefer <title>, then the itemprop/og meta tags, guarding against pages
    # where a tag is missing (xpath() returns an empty list).
    if html.xpath('//title/text()'):
        title = html.xpath('//title/text()')[0]
    elif html.xpath('//meta[@itemprop="name"]/@content'):
        title = html.xpath('//meta[@itemprop="name"]/@content')[0]
    else:
        title = html.xpath('//meta[@property="og:title"]/@content')[0]
    title = match1(title, r'(.*?)_哔哩哔哩')
    if html.xpath('//meta[@itemprop="thumbnailUrl"]/@content'):
        thumbnail_url = html.xpath('//meta[@itemprop="thumbnailUrl"]/@content')
    elif html.xpath('//meta[@itemprop="image"]/@content'):
        thumbnail_url = html.xpath('//meta[@itemprop="image"]/@content')
    else:
        thumbnail_url = html.xpath('//meta[@property="og:image"]/@content')
    source = html.xpath('//meta[@itemprop="author"]/@content')[0]
    # Stream URL extraction is not implemented here; metadata only.
    video_url = None
    type = news_type(url)
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": thumbnail_url,
        "image_urls": None,
        "video_url": video_url,
        "ext": None,
        "size": None,
    }
    return data

def sohu_video_download(url):
    if re.match(r'http[s]?://share\.vrs\.sohu\.com', url):
        vid = match1(url, r'id=(\d+)')
        source = None
    else:
        html = get_content(url, charset="GBK")
        vid = match1(html, r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?;')
        wm_match = re.search(r"var\s*wm_username='(.*?)';", html)
        source = wm_match.group(1) if wm_match else None
    assert vid, "failed to extract vid, please check the url"
    if not re.match(r'http[s]?://tv\.sohu\.com/', url):
        return None
    info = json.loads(
        get_content(
            'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.format(vid)))
    if info.get("data"):
        # Probe higher-quality vids; keep the first one that resolves.
        for qtyp in ['oriVid', 'superVid', 'highVid', 'norVid', 'relativeId']:
            hqvid = info['data'][qtyp] if 'data' in info else info[qtyp]
            if hqvid != 0 and hqvid != vid:
                info = json.loads(
                    get_content(
                        'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.
                        format(hqvid)))
                if 'allot' not in info:
                    continue
                break
        host = info['allot']
        tvid = info['tvid']
        urls = []
        if not source:
            source = info.get("wm_data", {}).get("wm_username", "crawl")
        data = info['data']
        title = data['tvName']
        thumbnail_url = data["coverImg"]
        size = sum(data['clipsBytes'])
        assert len(data['clipsURL']) == len(data['clipsBytes']) == len(
            data['su'])
        for fileName, key in zip(data['su'], data['ck']):
            urls.append(real_url(fileName, key, data['ch']))
    else:
        # No data from the hot.vrs API; fall back to the my.tv endpoint.
        info = json.loads(
            get_content(
                'http://my.tv.sohu.com/play/videonew.do?vid={}&referer='
                'http://my.tv.sohu.com'.format(vid)))
        host = info['allot']
        tvid = info['tvid']
        urls = []
        if not source:
            source = info.get("wm_data", {}).get("wm_username", "crawl")
        data = info['data']
        title = data['tvName']
        thumbnail_url = data["coverImg"]
        size = sum(map(int, data['clipsBytes']))
        assert len(data['clipsURL']) == len(data['clipsBytes']) == len(
            data['su'])
        for fileName, key in zip(data['su'], data['ck']):
            urls.append(real_url(fileName, key, data['ch']))
    return {
        "type": 'video',
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": urls,
        "ext": None,
        "size": size,
    }

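# A minimal end-to-end sketch (not part of the original module): route a URL
# to one of the downloaders defined above by matching a site key against the
# host. The DOWNLOADERS table and the __main__ demo are hypothetical glue.
DOWNLOADERS = {
    "eastday": eastday_video_download,
    "ku6": ku6_download,
    "ifeng": ifeng_download,
    "baomihua": baomihua_download,
    "bilibili": bilibili_download,
    "sohu": sohu_video_download,
}

if __name__ == "__main__":
    import sys

    demo_url = sys.argv[1]
    demo_host = match1(demo_url, r'http[s]?://([^/]+)/') or ''
    handler = next(
        (fn for key, fn in DOWNLOADERS.items() if key in demo_host), None)
    if handler:
        print(handler(demo_url))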