def eastday_video_download(url):
    html = get_content(url)
    title = match1(html, r'var\s*redirect_topic\s*=\s*[\'|"](.*?)[\'|"];')
    if title is None:
        title = match1(
            html,
            r'<meta\s*name=[\'|"]description[\'|"]\s*content=[\'|"](.*?)[\'|"]/>'
        )
    source = match1(html, r'var\s*d_source\s*=\s*[\'|"](.*?)[\'|"];')
    if source is None:
        source = "crawl"
    thumbnail_url = match1(html, r'var\s*global_share_img\s*=\s*[\'|"](.*?)[\'|"];')
    video_url = match1(html, r'var\s*mp4\s*=\s*[\'|"](.*?)[\'|"];')
    # Prepend a scheme to protocol-relative URLs; check the prefix rather
    # than searching for "http" anywhere in the string.
    if not re.match(r"https?:", video_url):
        video_url = "http:{}".format(video_url)
    if not re.match(r"https?:", thumbnail_url):
        thumbnail_url = "http:{}".format(thumbnail_url)
    data = {
        "type": 'video',
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": None,
        "size": None,
    }
    return data
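
# `match1` is imported from a shared helper module and not defined in this
# file. Below is a minimal stand-in illustrating the contract the callers in
# this module rely on (first capture group of the first matching pattern,
# else None); the semantics are inferred from usage here, not from the
# upstream source:
def _match1_sketch(text, *patterns):
    """Hypothetical equivalent of the shared `match1` helper."""
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return None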
def baomihua_download_by_id(_id, title, source, img_url, type):
    html = get_content(
        'http://play.baomihua.com/getvideourl.aspx?flvid={}&devicetype='
        'phone_app'.format(_id))
    host = match1(html, r'host=([^&]*)')
    _type = match1(html, r'videofiletype=([^&]*)')
    vid = match1(html, r'&stream_name=([^&]*)')
    dir_str = match1(html, r'&dir=([^&]*)').strip()
    video_url = 'http://{}/{}/{}.{}'.format(host, dir_str, vid, _type)
    logging.debug("url is {}".format(video_url))
    if title is None:
        title = match1(html, r'&title=([^&]*)')
        title = urllib.parse.unquote(title)
    if source is None:
        return None
    if img_url is None:
        img_url = match1(html, r'&video_img=([^&]*)')
    ext = _type
    # Size in MB, rounded to two decimals.
    size = int(match1(html, r'&videofilesize=([^&]*)'))
    size = float("{:.2f}".format(size / 1024 / 1024))
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [img_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": ext,
        "size": size,
    }
    return data
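
# The getvideourl.aspx response above is a flat key=value&... string, so the
# per-field regexes could be swapped for standard query-string parsing. A
# hedged alternative sketch (assumes the response really is URL-encoded form
# data; `_parse_baomihua_fields` is a hypothetical name):
def _parse_baomihua_fields(html):
    """Parse the playback response with urllib instead of per-field regexes."""
    fields = {k: v[0] for k, v in urllib.parse.parse_qs(html).items()}
    # e.g. fields.get('host'), fields.get('videofiletype'), fields.get('stream_name')
    return fields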
def miaopai_download(url):
    mobile_page = get_content(url, headers=fake_headers_mobile)
    try:
        title = re.search(r'([\'"])title\1:\s*([\'"])(.+?)\2,',
                          mobile_page).group(3)
    except AttributeError:
        title = re.search(r'([\'"])status_title\1:\s*([\'"])(.+?)\2,',
                          mobile_page).group(3)
    title = title.replace('\n', '_')
    source = re.search(r'([\'"])screen_name\1:\s*([\'"])(.+?)\2,',
                       mobile_page).group(3)
    stream_url = re.search(r'([\'"])stream_url\1:\s*([\'"])(.+?)\2,',
                           mobile_page).group(3)
    thumbnail_url = re.search(
        r'[\'"]page_pic[\'"]:[\s\W\S\w]*[\'"]url[\'"]:\s*[\'"](.*?)[\'"],[\s\W\S\w]*},',
        mobile_page
    ).group(1)
    ext = 'mp4'
    type = news_type(url)
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": [stream_url],
        "ext": ext,
        "size": None,
    }
    return data
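
# `news_type` is referenced by most downloaders in this module but defined
# elsewhere. A purely hypothetical sketch of what the call sites assume:
# classify a URL as 'video' or 'news' from its shape. The heuristic below is
# illustrative only, not the real helper:
def _news_type_sketch(url):
    """Hypothetical stand-in for the shared `news_type` helper."""
    return 'video' if re.search(r'video|v\.qq\.com|miaopai|bilibili', url) else 'news'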
def ku6_download(url):
    html = get_content(url)
    type = news_type(url)
    title = match1(
        html,
        r"\$\(['\"]#video-title['\"]\)\.text\(['\"]([\s\S\w\W]+?)['\"]\);")
    if title is None:
        title = match1(html, r"document\.title\s*=\s*['\"]([\s\S\w\W]+?)['\"];")
    title = title.strip()
    source = match1(
        html,
        r"\$\(['\"]#video-author['\"]\)\.text\(['\"](.*?)['\"]\);")
    img_url = match1(
        html,
        r'[\'|"]poster[\'|"]:\s*[\'|"](.*?)[\'|"],\s*[\'|"]controls[\'|"]:')
    video_url = match1(
        html,
        r'this\.src\(\{type:\s*[\'|"]video/mp4[\'|"], src: [\'|"](.*?)[\'|"]}\);'
    )
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [img_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": None,
        "size": None,
    }
    return data
def ifeng_download(url,
                   title=None,
                   output_dir=output_dir,
                   merge=True,
                   info_only=False,
                   **kwargs):
    # Old pattern: /uuid.shtml; now it can also be #uuid
    id = match1(
        url,
        r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})')
    if id:
        return ifeng_download_by_id(id,
                                    None,
                                    output_dir=output_dir,
                                    merge=merge,
                                    info_only=info_only)
    html = get_content(url)
    uuid_pattern = r'"([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})"'
    id = match1(
        html,
        r'var vid="([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})"'
    )
    if id is None:
        video_pattern = r'"vid"\s*:\s*' + uuid_pattern
        id = match1(html, video_pattern)
    assert id, "can't find video id"
    return ifeng_download_by_id(id,
                                title=title,
                                output_dir=output_dir,
                                merge=merge,
                                info_only=info_only)
def acfun_download(url):
    response = get_content(url)
    # Fall back through several title locations. Test the match object before
    # calling .group(); a failed search returns None and would otherwise raise
    # AttributeError before the fallbacks can run.
    match = re.search(r'data-title="(.*?)"', response, re.S)
    if match:
        title = match.group(1)
    else:
        match = re.search(r'<title>(.*?)\s-\sAcFun弹幕视频网.*</title>',
                          response, re.S)
        if match:
            title = match.group(1)
        else:
            title = re.search(r'data-proof="(.*?)"', response, re.S).group(1)
    thumbnail_url = re.search(r'"coverImage":"(.*?)"', response).group(1)
    match = re.search(r'data-uname="(.*?)"', response, re.S)
    if match:
        source = match.group(1)
    else:
        match = re.search(r'"username":"(.*?)"', response, re.S)
        if match:
            source = match.group(1)
        else:
            source = re.search(r'data-name="(.*?)"', response, re.S).group(1)
    video_url = None
    type = news_type(url)
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": video_url,
        "ext": None,
        "size": None,
    }
    return data
def baomihua_download(url):
    html = get_content(url)
    type = news_type(url)
    title = match1(html, r"var\s*temptitle\s*=\s*'(.*?)';")
    source = match1(html, r"var\s*appName\s*=\s*\"(.*?)\";")
    img_url = match1(html, r"var\s*pic\s*=\s*\"(.*?)\";")
    _id = match1(html, r'flvid\s*=\s*(\d+)')
    if type == "video":
        return baomihua_download_by_id(_id, title, source, img_url, type)
def wangyi_news_download(url):
    html = get_content(url, charset="GBK")
    doc = pq(html)
    # Title
    title = doc('div.post_content_main h1').text()
    assert title, "failed to extract article title"
    # Source
    source = doc(
        'div.post_content_main div.post_time_source a#ne_article_source').text()
    assert source, "failed to extract article source"
    # Pre-process the article body
    # content = doc('div.post_content_main div.post_body').html()
    content = doc('div.post_content_main div.post_body div.post_text').html()
    back = re.compile(
        r"<div\s*class=['|\"]ep-source\s*cDGray['|\"]>[\s\S\w\W]*?</div>")
    # Note: the third positional argument of sub() is `count`, not flags, so
    # the original `back.sub('', content, re.S)` silently capped replacements.
    content = back.sub('', content)
    content = cleaner(str(content))
    assert content, "failed to extract article content"
    # Collect in-article images
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    if not image_urls:
        # findall returns an empty list when nothing matches
        image_urls = re.findall(r'data-original=[\'|"](.*?)[\'|"]', content,
                                re.S)
    assert image_urls, "failed to extract article images"
    image_urls_final = []
    for image_url in image_urls:
        if re.match(r'http:|https:', image_url):
            image_urls_final.append(image_url)
        else:
            image_urls_final.append('http:' + image_url)
    # Thumbnail
    thumbnail_urls = [image_urls_final[0]]
    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }
    return data
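
# The scheme-fixing loop above (prepend "http:" to protocol-relative "//..."
# image URLs) recurs in almost every news downloader in this module. A shared
# helper would remove the duplication; a hypothetical addition, not part of
# the original module:
def _absolutize(urls, prefix='http:'):
    """Prepend `prefix` to any URL that lacks an http(s) scheme."""
    return [u if re.match(r'https?:', u) else prefix + u for u in urls]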
def zaker_news_download(url):
    html = get_content(url)
    doc = pq(html)
    # Title
    title = doc('div#article div.article_header h1').text()
    assert title, "failed to extract article title"
    # Source
    source = doc(
        'div#article div.article_header div.article_tips a span.auther').text()
    assert source, "failed to extract article source"
    # Pre-process the article body
    content = doc('div#article div.article_content div#content').html()
    content = cleaner(str(content))
    assert content, "failed to extract article content"
    # Collect in-article images
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    if not image_urls:
        # findall returns an empty list when nothing matches
        image_urls = re.findall(r'data-original=[\'|"](.*?)[\'|"]', content,
                                re.S)
    assert image_urls, "article contains no images"
    image_urls_final = []
    for image_url in image_urls:
        if re.match(r'http:|https:', image_url):
            image_urls_final.append(image_url)
        else:
            image_urls_final.append('http:' + image_url)
    # Thumbnail
    thumbnail_urls = [image_urls_final[0]]
    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }
    return data
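
# `cleaner` is assumed to sanitize the extracted article HTML (strip scripts,
# styles, comments and tracking markup). A rough stand-in built on lxml's
# Cleaner, illustrative only -- the real helper may do site-specific work:
def _cleaner_sketch(fragment):
    """Hypothetical stand-in for the shared `cleaner` helper."""
    from lxml.html.clean import Cleaner  # moved to lxml_html_clean in newer lxml
    return Cleaner(scripts=True, style=True, comments=True).clean_html(fragment)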
def ifeng_download_by_id(id,
                         title=None,
                         output_dir=output_dir,
                         merge=True,
                         info_only=False):
    assert match1(
        id,
        r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'), id
    url = 'http://vxml.ifengimg.com/video_info_new/{}/{}/{}.xml'.format(
        id[-2], id[-2:], id)
    xml = get_content(url)
    # Title
    title_real = match1(xml, r'Name="([^"]+)"')
    title_real = unescape(title_real)
    # Source
    source = match1(xml, r'ColumnName="([^"]+)"')
    source = unescape(source)
    # Thumbnail
    thumbnail_url = match1(xml, r'SmallPosterUrl="([^"]+)"')
    # Video download URL
    video_url = match1(xml, r'VideoPlayUrl="([^"]+)"')
    video_url = video_url.replace('http://wideo.ifeng.com/',
                                  'http://ips.ifeng.com/wideo.ifeng.com/')
    type, ext, size = url_info(video_url)
    # print_info(site_info, title, ext, size)
    data = {
        "title": title_real,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "video_url": video_url,
    }
    if not info_only:
        # Fall back to the extracted title when the caller passed none.
        download_urls([video_url],
                      title or title_real,
                      ext,
                      size,
                      output_dir,
                      merge=merge,
                      headers=headers)
    return data
def btime_news_download(url):
    html = get_content(url)
    doc = pq(html)
    # Title
    title = doc('div.article-container div.article h1#title').text()
    # Source
    source = doc('div.content-info span.col.cite').text()
    # Pre-process the article body
    content = doc('div.content-text div#content-pure').children()
    content = cleaner(str(content))
    assert content, "failed to extract article content"
    # Collect in-article images
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    assert image_urls, "article contains no images"
    image_urls_final = []
    for image_url in image_urls:
        if re.match(r'http:|https:', image_url):
            image_urls_final.append(image_url)
        else:
            image_urls_final.append('http:' + image_url)
    # Thumbnail
    thumbnail_urls = [image_urls_final[0]]
    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }
    return data
def bilibili_download(url):
    response = get_content(url)
    html = etree.HTML(response)
    # xpath() returns a list; index it only after checking it is non-empty,
    # otherwise an empty result raises IndexError before the fallbacks run.
    title = None
    for xpath_expr in ('//title/text()',
                       '//meta[@itemprop="name"]/@content',
                       '//meta[@property="og:title"]/@content'):
        result = html.xpath(xpath_expr)
        if result and result[0]:
            title = result[0]
            break
    title = match1(title, r'(.*?)_哔哩哔哩')
    if html.xpath('//meta[@itemprop="thumbnailUrl"]/@content'):
        thumbnail_url = html.xpath('//meta[@itemprop="thumbnailUrl"]/@content')
    elif html.xpath('//meta[@itemprop="image"]/@content'):
        thumbnail_url = html.xpath('//meta[@itemprop="image"]/@content')
    else:
        thumbnail_url = html.xpath('//meta[@property="og:image"]/@content')
    source = html.xpath('//meta[@itemprop="author"]/@content')[0]
    video_url = None
    type = news_type(url)
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": thumbnail_url,
        "image_urls": None,
        "video_url": video_url,
        "ext": None,
        "size": None,
    }
    return data
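
# Picking the first non-empty result from a list of XPath expressions is the
# pattern bilibili_download needs in two places; a small helper keeps such
# fallback chains flat. A hypothetical convenience, not in the original
# module:
def _first_xpath(html, *exprs):
    """Return the first item of the first expression that yields results."""
    for expr in exprs:
        result = html.xpath(expr)
        if result:
            return result[0]
    return None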
def lieqi_news_download(url):
    i = 1
    content_list = []
    title = None
    source = None
    thumbnail_urls = None
    while True:
        if i == 1:
            detail_url = url
        else:
            detail_url = url.replace(".html", '-{}.html'.format(i))
        try:
            html = get_content(detail_url)
        except Exception:
            raise Exception("timed out fetching article content")
        if re.search(r"很抱歉!您访问页面被外星人劫持了", html):
            break
        doc = pq(html)
        if i == 1:
            # Title
            title = doc('title').text()
            if not title:
                title = doc("div.contentLtopCnt.clearfix h1.title").text()
            # Source
            source = doc(
                'div.contentLtopCnt.clearfix div.sourceShare div.source'
            ).children()
            # Thumbnail
            try:
                thumbnail_url = re.search(
                    r'var\s*detail_poster_src\s*=\s*[\'|"](.*?)[\'|"]', html
                ).group(1)
                if not re.match(r"http[s]?:", thumbnail_url):
                    thumbnail_url = "http:" + thumbnail_url
                thumbnail_urls = [thumbnail_url]
            except AttributeError:
                pass
            try:
                source = re.search(r"</span>\s*<span>(.*?)</span>",
                                   str(source)).group(1)
            except AttributeError:
                raise AttributeError("failed to extract source")
        # Pre-process the article body
        div = doc('div.contentLtopCnt.clearfix div.contentTextCnt').html()
        content_list.append(str(div))
        i += 1
        # Page cap
        if i >= 30:
            break
    try:
        content = ''.join(content_list)
        content = cleaner(content)
        logging.debug('cleaning finished')
    except Exception:
        raise AssertionError("failed to extract article content")
    # Collect in-article images
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    assert image_urls, "article contains no images"
    image_urls_final = []
    for image_url in image_urls:
        if re.match(r'http:|https:', image_url):
            image_urls_final.append(image_url)
        else:
            image_urls_final.append('http:' + image_url)
    # Thumbnail falls back to the first in-article image
    if not thumbnail_urls:
        thumbnail_urls = [image_urls_final[0]]
    if not (title and source):
        raise Exception("failed to extract title and source")
    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }
    return data
def eastday_news_download(url):
    i = 1
    content_list = []
    title = None
    source = None
    while True:
        if i == 1:
            detail_url = url
        else:
            detail_url = url.replace(".html", '-{}.html'.format(i))
        try:
            html = get_content(detail_url)
        except Exception:
            raise Exception("timed out fetching article content")
        if re.search(r'<div class="detail_room">', html):
            logging.debug('东方号 content, publish failed')
            raise Exception('东方号 content, publish failed')
        if re.search(r"404 很抱歉!您访问页面被外星人劫持了", html):
            break
        doc = pq(html)
        if i == 1:
            # Title
            title = doc(
                'div.detail_left_cnt div.J-title_detail.title_detail h1 span'
            ).text()
            # Source
            source = doc(
                'div.detail_left_cnt div.J-title_detail.title_detail '
                'div.share_cnt_p.clearfix div.fl'
            ).children()
            try:
                source = re.search(r"</i>\s*<i>(.*?)</i>",
                                   str(source)).group(1)
            except AttributeError:
                source = re.search(r"</i>\s*<a.*>(.*?)</a>", str(source),
                                   re.S).group(1)
        # Pre-process the article body
        div = doc('div#J-contain_detail_cnt').html()
        content_list.append(str(div))
        i += 1
        # Page cap
        if i >= 30:
            break
    try:
        content = ''.join(content_list)
        content = cleaner(content)
        logging.debug('cleaning finished')
    except Exception:
        raise AssertionError("failed to extract article content")
    # Collect in-article images
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    assert image_urls, "article contains no images"
    image_urls_final = []
    for image_url in image_urls:
        if re.match(r'http:|https:', image_url):
            image_urls_final.append(image_url)
        else:
            image_urls_final.append('http:' + image_url)
    # Thumbnail
    thumbnail_urls = [image_urls_final[0]]
    if not (title and source):
        raise Exception("failed to extract title and source")
    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }
    return data
def qq_video_download(url):
    type = "video"
    video_url = None
    if re.search(r"new\.qq\.com/omv/video/", url):
        vid = re.search(r"^http[s]?://new\.qq\.com/omv/video/(.*?)$",
                        url).group(1)
        detail_url = "https://pacaio.match.qq.com/vlike/detail?vid={}".format(vid)
        response = get_content(detail_url)
        info = json.loads(response)
        title = info.get("data").get("title")
        source = info.get("data").get("source")
        if not source:
            source = "腾讯视频"
        if info.get("data").get("imgs").get("228X128"):
            thumbnail_url = info.get("data").get("imgs").get("228X128")
        elif info.get("data").get("imgs").get("496X280"):
            thumbnail_url = info.get("data").get("imgs").get("496X280")
        else:
            thumbnail_url = info.get("data").get("img")
    elif re.search(r"v\.qq\.com/x/page/", url) or re.search(
            r"v\.qq\.com/x/cover", url):
        response = get_content(url)
        title = re.search(r"<title>(.*?)</title>", response).group(1)
        if not title:
            title = re.search(
                r'<meta\s*itemprop=[\'|"]name[\'|"]\s*name=[\'|"]title[\'|"]\s*content=[\'|"](.*?)[\'|"]>',
                response).group(1)
        if not title:
            title = re.search(
                r'<meta\s*name=[\'|"]twitter:title[\'|"]\s*property=[\'|"]og:title[\'|"]'
                r'\s*content=[\'|"](.*?)[\'|"]\s*/>',
                response).group(1)
        title = re.sub(r"_.*$", '', title)
        try:
            source = re.search(
                r'<span\s*class=[\'|"]user_name[\'|"]>(.*?)</span>',
                response).group(1)
        except AttributeError:
            source = re.search(
                r'<strong\s*class=[\'|"]player_title[\'|"]>(.*?)</strong>',
                response).group(1)
        if not source:
            source = "腾讯视频"
        thumbnail_url = re.search(
            r'<meta\s*itemprop=[\'|"]image[\'|"]\s*content=[\'|"](.*?)[\'|"]>',
            response).group(1)
        if thumbnail_url is None:
            thumbnail_url = re.search(
                r'<meta\s*itemprop=[\'|"]thumbnailUrl[\'|"]\s*content=[\'|"](.*?)[\'|"]>',
                response).group(1)
        # Fall back to an in-page image when the meta URL has no scheme;
        # check the prefix directly instead of calling .group() on a search
        # that may return None.
        if not re.match(r"http[s]?:", thumbnail_url):
            thumbnail_url = re.search(
                r'[\'|"]pic_640_360[\'|"]:[\'|"](.*?)[\'|"],',
                response).group(1)
    elif re.search(r"sports\.qq\.com", url):
        return {"message": "Tencent exclusive, not supported yet"}
    else:
        title = None
        source = None
        thumbnail_url = None
    data = {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": video_url,
        "ext": None,
        "size": None,
    }
    return data
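
# qq_video_download chains dict.get() calls on the vlike JSON, which raises
# AttributeError as soon as one level is missing. A defensive lookup helper
# (hypothetical, not part of the original module) makes those probes safe:
def _dig(mapping, *keys):
    """Walk nested dicts, returning None instead of raising on a missing level."""
    for key in keys:
        if not isinstance(mapping, dict):
            return None
        mapping = mapping.get(key)
    return mapping
# e.g. _dig(info, 'data', 'imgs', '228X128') or _dig(info, 'data', 'img')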
def zhonghua_news_download(url):
    i = 1
    content_list = []
    title = None
    source = None
    thumbnail_urls = None
    while True:
        if i == 1:
            detail_url = url
        else:
            detail_url = url.replace(".html", '_{}.html'.format(i))
        try:
            html = get_content(detail_url)
        except Exception:
            raise Exception("timed out fetching article content")
        doc = pq(html)
        if i == 1:
            # Title
            title = doc("div.pleft.mt10 div.article-header h1.title").text()
            # Source
            source = doc(
                'div.pleft.mt10 div.article-header div.info div.left '
                'small#article-source').text()
        # Pre-process the article body
        div = doc('div.pleft.mt10 div.viewbox div#main-content').html()
        content_list.append(str(div))
        i += 1
        if not re.search(r"下一页</a>", html):
            break
        # Page cap
        if i >= 30:
            break
    try:
        content = ''.join(content_list)
        content = cleaner(content)
        logging.debug('cleaning finished')
    except Exception:
        raise AssertionError("failed to extract article content")
    # Collect in-article images
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    assert image_urls, "article contains no images"
    image_urls_final = []
    for image_url in image_urls:
        if re.match(r'http:|https:', image_url):
            image_urls_final.append(image_url)
        else:
            # Relative paths resolve against kan.china.com
            image_urls_final.append('http://kan.china.com' + image_url)
    # Thumbnail
    if not thumbnail_urls:
        thumbnail_urls = [image_urls_final[0]]
    if not (title and source):
        raise Exception("failed to extract title and source")
    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }
    return data
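
# lieqi, eastday and zhonghua all paginate article bodies the same way: the
# first page is the bare URL, later pages add a numeric suffix, capped at 30.
# A generic sketch of that traversal (hypothetical refactor; `sep` covers the
# '-' and '_' variants seen above):
def _paged_urls(url, sep='-', limit=30):
    """Yield the detail URL followed by its page-2, page-3, ... variants."""
    yield url
    for page in range(2, limit):
        yield url.replace('.html', '{}{}.html'.format(sep, page))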
def sohu_news_download(url):
    html = get_content(url)
    doc = pq(html)
    if "www.sohu.com/a/" in url:
        # Title
        title = doc('div.text div.text-title h1').text()
        if not title:
            title = doc(
                'div.content.area div.article-box.l h3.article-title').text()
        if re.match(r"原创", title):
            title = title.replace("原创", '')
        # Source
        source = doc('div.column.left div.user-info h4 a').text()
        if not source:
            source = doc(
                'div.right-author-info.clearfix div.l.clearfix a.name.l').text()
        # Pre-process the article body
        content = doc('div.text article.article').html()
        if not content:
            content = doc('article.article-text').html()
        backsohu = re.compile(r"<span\s*class=['|\"]backword['|\"]>.*?</span>")
        editor_name = re.compile(
            r"<p\s*data-role=['|\"]editor-name['|\"]>.*</p>")
        content = backsohu.sub('', content)
        content = editor_name.sub('', content)
        if re.search(r"(搜狐.*?独家出品 未经许可严禁转载)", content):
            content = re.sub(r'(搜狐.*?独家出品 未经许可严禁转载)', '', content)
        content = cleaner(str(content))
        assert content, "failed to extract article content"
        # Collect in-article images
        image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
        assert image_urls, "article contains no images"
        image_urls_final = []
        for image_url in image_urls:
            if re.match(r'http:|https:', image_url):
                image_urls_final.append(image_url)
            else:
                image_urls_final.append('http:' + image_url)
        # Thumbnail
        thumbnail_urls = [image_urls_final[0]]
    elif "sh.focus.cn/zixun/" in url:
        # Title
        title = doc('div.main-content h1').text()
        if re.match(r"原创", title):
            title = title.replace("原创", '')
        # Source
        source = doc(
            'div.main-content div.s-pic-info div.info-source span a').text()
        # Pre-process the article body
        content = doc('div.main-content div.info-content').html()
        backsohu = re.compile(r"<span\s*class=['|\"]backword['|\"]>.*?</span>")
        editor_name = re.compile(
            r"<p\s*data-role=['|\"]editor-name['|\"]>.*</p>")
        content = backsohu.sub('', content)
        content = editor_name.sub('', content)
        if re.search(r"(搜狐.*?独家出品 未经许可严禁转载)", content):
            content = re.sub(r'(搜狐.*?独家出品 未经许可严禁转载)', '', content)
        content = cleaner(str(content))
        assert content, "failed to extract article content"
        # Collect in-article images
        image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
        assert image_urls, "article contains no images"
        image_urls_final = []
        for image_url in image_urls:
            if re.match(r'http:|https:', image_url):
                image_urls_final.append(image_url)
            else:
                image_urls_final.append('http:' + image_url)
        # Thumbnail
        thumbnail_urls = [image_urls_final[0]]
    else:
        raise AssertionError("URL pattern not recognized by this crawler")
    data = {
        "type": 'news',
        "title": title,
        "source": source,
        "content": content,
        "thumbnail_urls": thumbnail_urls,
        "image_urls": image_urls_final,
    }
    return data
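
# Both branches of sohu_news_download repeat the same collect-images /
# pick-thumbnail sequence; the duplication could collapse into one helper
# built from the module's own regexes. A hypothetical sketch:
def _collect_images(content):
    """Return (image_urls, thumbnail_urls) extracted from cleaned HTML."""
    image_urls = re.findall(r'src=[\'|"](.*?)[\'|"]', content, re.S)
    assert image_urls, "article contains no images"
    final = [u if re.match(r'https?:', u) else 'http:' + u for u in image_urls]
    return final, [final[0]]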
def sohu_video_download(url):
    if re.match(r'http[s]?://share\.vrs\.sohu\.com', url):
        vid = match1(url, r'id=(\d+)')
        source = None
    else:
        html = get_content(url, charset="GBK")
        vid = match1(html, r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?;')
        wm_match = re.search(r"var\s*wm_username='(.*?)';", html)
        source = wm_match.group(1) if wm_match else None
    assert vid, "failed to extract vid, check the url"
    if re.match(r'http[s]?://tv\.sohu\.com/', url):
        info = json.loads(
            get_content(
                'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.format(vid)))
        if info.get("data"):
            # Probe the higher-quality variants and switch to the first one
            # that resolves to a dispatchable stream.
            for qtyp in ['oriVid', 'superVid', 'highVid', 'norVid',
                         'relativeId']:
                if 'data' in info:
                    hqvid = info['data'][qtyp]
                else:
                    hqvid = info[qtyp]
                if hqvid != 0 and hqvid != vid:
                    info = json.loads(
                        get_content(
                            'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.
                            format(hqvid)))
                    if 'allot' not in info:
                        continue
                    break
            host = info['allot']
            tvid = info['tvid']
            urls = []
            if not source:
                if 'wm_data' in info and 'wm_username' in info['wm_data']:
                    source = info['wm_data']['wm_username']
                else:
                    source = "crawl"
            data = info['data']
            title = data['tvName']
            thumbnail_url = data["coverImg"]
            size = sum(data['clipsBytes'])
            assert len(data['clipsURL']) == len(data['clipsBytes']) == len(
                data['su'])
            for fileName, key in zip(data['su'], data['ck']):
                urls.append(real_url(fileName, key, data['ch']))
        else:
            info = json.loads(
                get_content(
                    'http://my.tv.sohu.com/play/videonew.do?vid={}&referer='
                    'http://my.tv.sohu.com'.format(vid)))
            host = info['allot']
            tvid = info['tvid']
            urls = []
            if not source:
                if 'wm_data' in info and 'wm_username' in info['wm_data']:
                    source = info['wm_data']['wm_username']
                else:
                    source = "crawl"
            data = info['data']
            title = data['tvName']
            thumbnail_url = data["coverImg"]
            size = sum(map(int, data['clipsBytes']))
            assert len(data['clipsURL']) == len(data['clipsBytes']) == len(
                data['su'])
            for fileName, key in zip(data['su'], data['ck']):
                urls.append(real_url(fileName, key, data['ch']))
        data = {
            "type": 'video',
            "title": title,
            "source": source,
            "thumbnail_urls": [thumbnail_url],
            "image_urls": None,
            "video_url": urls,
            "ext": None,
            "size": size,
        }
        return data
    else:
        return None
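
# `real_url` resolves a Sohu clip descriptor (su fileName, ck key, ch channel)
# to a playable URL but is defined elsewhere. The sketch below reflects the
# dispatcher endpoint commonly used for this; the endpoint, query fields and
# response shape are all assumptions, not confirmed from this module:
def _real_url_sketch(file_name, key, ch):
    """Hypothetical equivalent of `real_url`."""
    url = ('https://data.vod.itc.cn/ip?new={}&num=1&key={}&ch={}'
           '&pt=1&pg=2&prod=h5n'.format(file_name, key, ch))
    return json.loads(get_content(url))['servers'][0]['url']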