class Sina_mobile(Crawler):
    NAME = 'sina_mobile'
    HEADERS = makeUTF8({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    })
    HEADERS_NO_UTF8 = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    }

    def normalize_url(self, link):
        return link

    def unique_id(self, link):
        for item in link.split('/'):
            if item.isdigit():
                return item

    def expand_url(self, num):
        return "https://m.weibo.cn/detail/" + str(num)

    def run(self, content, link):
        # the status JSON is embedded in the first <body> script as
        # `var $render_data = [{...}][0]`
        soup = BeautifulSoup(content, "lxml")
        data = str(soup.select("body script")[0]).split(
            'var $render_data = [')[1].split('][0]')[0]
        status = json.loads(data)["status"]
        html = status["text"]
        soup = BeautifulSoup(html, "lxml")
        a_list = soup.findAll('a')
        text = html_to_plain_text(html)
        hyperlinks = []
        for a in a_list:
            if 'm.weibo.cn/search?' in a.get('href'):
                hyperlinks.append(a.get('href'))
        # substitute each 'HYPERLINK' placeholder emitted by
        # html_to_plain_text with the matching search URL, in order
        for url in hyperlinks:
            text = text.replace('HYPERLINK', url, 1)
        return makeResponseSuccess({
            "unique_id": "weibo:%s" % status["id"],
            'uploadDate': status["created_at"],  # created_at format: Tue Feb 18 02:48:31 +0800 2020
            'users': status["user"]["screen_name"],
            'thumbnailURL': status["page_info"]["page_pic"]["url"],
            'title': status["page_info"]["title"],
            #'stream_url_hd': status["page_info"]["media_info"]["stream_url_hd"],
            'desc': text  # plain text with hyperlinks substituted
        })

    async def run_async(self, content, link):
        return self.run(self=self, content=content, link=link)
class SinaMobile(Crawler):
    NAME = 'weibo-mobile'
    SHORT_PATTERN = r''
    PATTERN = r'^(https:\/\/|http:\/\/)?m\.weibo\.(com|cn)\/detail\/(\d+)'
    HEADERS = makeUTF8({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    })
    HEADERS_NO_UTF8 = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
    }

    def normalize_url(self, link):
        ret = re.search(self.PATTERN, link)
        vid = ret.group(3)
        return f"https://m.weibo.cn/detail/{vid}"

    def unique_id(self, link):
        ret = re.search(self.PATTERN, link)
        vid = ret.group(3)
        return f'weibo-mobile:{vid}'

    def expand_url(self, num):
        return "https://m.weibo.cn/detail/" + str(num)

    def run(self, content, xpath, link, update_video_detail):
        soup = BeautifulSoup(content, "lxml")
        data = str(soup.select("body script")[0]).split(
            'var $render_data = [')[1].split('][0]')[0]
        status = json.loads(data)["status"]
        html = status["text"]
        soup = BeautifulSoup(html, "lxml")
        a_list = soup.findAll('a')
        text = html_to_plain_text(html)
        hyperlinks = []
        for a in a_list:
            if 'm.weibo.cn/search?' in a.get('href'):
                hyperlinks.append(a.get('href'))
        for url in hyperlinks:
            text = text.replace('HYPERLINK', url, 1)
        return makeResponseSuccess({
            "unique_id": self.unique_id(self=self, link=link),
            'uploadDate': parse(status["created_at"]).astimezone(timezone.utc),  # created_at format: Tue Feb 18 02:48:31 +0800 2020
            'thumbnailURL': status["page_info"]["page_pic"]["url"],
            'title': status["page_info"]["title"],
            'site': 'weibo-mobile',
            'desc': text,  # plain text with hyperlinks substituted
            'utags': []
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self, content=content, xpath=xpath, link=link,
                        update_video_detail=update_video_detail)
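# `html_to_plain_text` is defined elsewhere in this codebase; the loops above
# assume it emits one literal 'HYPERLINK' placeholder per anchor tag. A
# hypothetical illustration of the substitution (the input string is made up
# here, not taken from a real page):
#
#   text = html_to_plain_text('<a href="https://m.weibo.cn/search?q=cat">#cat#</a> meow')
#   # -> 'HYPERLINK#cat# meow'
#   # the replace(..., 1) loop then turns this into
#   # -> 'https://m.weibo.cn/search?q=cat#cat# meow'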
class Zcool(Crawler):
    NAME = 'zcool'
    PATTERN = r'^https:\/\/www\.zcool\.com\.cn\/work\/[0-9a-zA-Z=]*\.html'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Referer': 'https://www.zcool.com.cn/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.zcool.com.cn/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    DESC_REGEX_OBJ = re.compile(r"share_description\s*=\s*\'(.*)\'\s*;", re.MULTILINE)
    COVER_REGEX_OBJ = re.compile(
        r'share_description_split,\s*title:\s*\".*\",\s*pic:\s*\"(.*)\"', re.MULTILINE)
    UID_REGEX_OBJ = re.compile(
        r"^https:\/\/www\.zcool\.com\.cn\/work\/([0-9a-zA-Z=]*)\.html", re.MULTILINE)

    def normalize_url(self, link):
        return link

    def expand_url(self, short):
        return short

    def unique_id(self, link):
        return 'zcool:%s' % self.UID_REGEX_OBJ.search(link).group(1)

    def run(self, content, xpath, link, update_video_detail):
        if 'J_prismPlayer0' not in content:
            return makeResponseFailed('NOT_ZCOOL_VIDEO')
        zcool_id = self.UID_REGEX_OBJ.search(link).group(1)
        title = xpath.xpath('//span[@class="fw-bold"]/text()')[0]
        desc = self.DESC_REGEX_OBJ.search(content).group(1)
        desc = desc.replace('<br>', '\n')
        upload_time = xpath.xpath('//p[@class="title-time"]/@title')[0].split(':')[-1]
        upload_time = parse(upload_time) - timedelta(hours=8)
        cover = self.COVER_REGEX_OBJ.search(content).group(1)
        cover = cover.split('|')[0].strip().split('@')[0]
        return makeResponseSuccess({
            'thumbnailURL': cover,
            'title': title,
            'desc': desc,
            'site': 'zcool',
            'uploadDate': upload_time,
            "unique_id": "zcool:%s" % zcool_id,
            "utags": []
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self, content=content, xpath=xpath, link=link,
                        update_video_detail=update_video_detail)
class Bilibili(Crawler):
    NAME = 'bilibili'
    PATTERN = r'^(https:\/\/|http:\/\/)?((www|m)\.)?(bilibili\.com\/video\/[aA][vV][\d]+|b23\.tv\/[aA][vV][\d]+)'
    SHORT_PATTERN = r'^[aA][Vv][\d]+$'
    HEADERS = makeUTF8({
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }

    def get_cookie(self):
        return {
            'SESSDATA': Config.BILICOOKIE_SESSDATA,
            'bili_jct': Config.BILICOOKIE_bili_jct
        }

    def normalize_url(self, link):
        link = link.lower()
        return "https://www.bilibili.com/video/" + link[link.rfind("av"):]

    def expand_url(self, short):
        return "https://www.bilibili.com/video/" + short.lower()

    def unique_id(self, link):
        link = link.lower()
        return 'bilibili:%s' % link[link.rfind("av"):]

    def run(self, content, xpath, link, update_video_detail):
        raise NotImplementedError()

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        link = link.lower()
        vidid = link[link.rfind("av"):]
        if False:  # disabled: use biliplus to recover metadata of deleted videos
            api_url = f"https://www.biliplus.com/api/view?id={vidid[2:]}"
            async with aiohttp.ClientSession() as session:
                async with session.get(api_url) as resp:
                    if resp.status == 200:
                        apirespond = await resp.text()
                        respond_json = loads(apirespond)
                        if 'code' in respond_json and respond_json['code'] == -404:
                            raise Exception('Video not found in biliplus, it is gone forever')
                        thumbnailURL = respond_json['pic']
                        title = respond_json['title']
                        desc = respond_json['description']
                        uploadDate = parse(respond_json['created_at']) - timedelta(hours=8)  # convert from Beijing time to UTC
                        utags = respond_json['tag']
                        return makeResponseSuccess({
                            'thumbnailURL': thumbnailURL,
                            'title': title,
                            'desc': desc,
                            'site': 'bilibili',
                            'uploadDate': uploadDate,
                            "unique_id": "bilibili:%s" % vidid,
                            "utags": utags
                        })
        try:
            thumbnailURL = xpath.xpath('//meta[@itemprop="thumbnailUrl"]/@content')[0]
            title = xpath.xpath('//h1[@class="video-title"]/@title')[0]
            desc = getInnerText(xpath.xpath('//div[@class="info open"]/node()'))
            uploadDate = parse(xpath.xpath('//meta[@itemprop="uploadDate"]/@content')[0]) - timedelta(hours=8)  # convert from Beijing time to UTC
            utags = xpath.xpath('//meta[@itemprop="keywords"]/@content')[0]
            utags = list(filter(None, utags.split(',')[1:-4]))
        except:
            # page no longer exists; return a placeholder entry
            return makeResponseSuccess({
                'thumbnailURL': '',
                'title': '已失效视频',
                'desc': '已失效视频',
                'site': 'bilibili',
                'uploadDate': datetime.now(),
                "unique_id": "bilibili:%s" % vidid,
                "utags": [],
                "placeholder": True
            })
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'bilibili',
            'uploadDate': uploadDate,
            "unique_id": "bilibili:%s" % vidid,
            "utags": utags
        })
class Youtube(Crawler):
    NAME = 'youtube'
    PATTERN = r'^((https:\/\/)?(www\.|m\.)?youtube\.com\/watch\?v=[-\w]+|(https:\/\/)?youtu\.be\/(watch\?v=[-\w]+|[-\w]+))'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Referer': 'https://www.youtube.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.youtube.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    API_KEYs = os.getenv('GOOGLE_API_KEYs', "").split(',')

    def normalize_url(self, link):
        if 'youtube.com' in link:
            vidid = link[link.rfind('=') + 1:]
        elif 'youtu.be' in link:
            if 'watch?v=' in link:
                vidid = link[link.rfind('=') + 1:]
            else:
                vidid = link[link.rfind('/') + 1:]
        return "https://www.youtube.com/watch?v=" + vidid

    def expand_url(self, short):
        return short

    def unique_id(self, link):
        if 'youtube.com' in link:
            vidid = link[link.rfind('=') + 1:]
        elif 'youtu.be' in link:
            if 'watch?v=' in link:
                vidid = link[link.rfind('=') + 1:]
            else:
                vidid = link[link.rfind('/') + 1:]
        return "youtube:%s" % vidid

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    def run(self, content, xpath, link):
        if 'youtube.com' in link:
            vidid = link[link.rfind('=') + 1:]
        elif 'youtu.be' in link:
            if 'watch?v=' in link:
                vidid = link[link.rfind('=') + 1:]
            else:
                vidid = link[link.rfind('/') + 1:]
        for key in Config.YOUTUBE_API_KEYS.split(","):
            api_url = ("https://www.googleapis.com/youtube/v3/videos?id=" + vidid +
                       "&key=" + key + "&part=snippet,contentDetails,statistics,status")
            apirespond = requests.get(api_url)  # fetch the API response
            if apirespond.status_code == 200:
                break
            else:
                log_ne(op='youtube_run', level='WARN', obj={
                    'msg': 'FETCH_FAILED',
                    'key': key,
                    'resp': apirespond.content,
                    'url': api_url
                })
        snippet = apirespond.json()['items'][0]['snippet']
        uploadDate = parse(snippet['publishedAt']).astimezone(timezone.utc)  # upload time, e.g. 2019-04-27 04:58:45+00:00
        title = snippet['title']
        desc = snippet['description']
        thumbnailURL = snippet['thumbnails']['medium']['url']  # thumbnail URL, size 320x180
        utags = snippet['tags'] if 'tags' in snippet else []
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'youtube',
            'uploadDate': uploadDate,
            "unique_id": "youtube:%s" % vidid,
            "utags": utags
        })

    async def run_async(self, content, xpath, link, update_video_detail):
        if 'youtube.com' in link:
            vidid = link[link.rfind('=') + 1:]
        elif 'youtu.be' in link:
            if 'watch?v=' in link:
                vidid = link[link.rfind('=') + 1:]
            else:
                vidid = link[link.rfind('/') + 1:]
        keys = Config.YOUTUBE_API_KEYS.split(",")
        while keys:
            key = random.choice(keys)
            api_url = ("https://www.googleapis.com/youtube/v3/videos?id=" + vidid +
                       "&key=" + key + "&part=snippet,contentDetails,statistics,status")
            async with aiohttp.ClientSession() as session:
                async with session.get(api_url, headers=self.HEADERS_NO_UTF8) as resp:
                    apirespond = await resp.text()
                    if resp.status == 200:
                        break
                    else:
                        log_ne(op='youtube_run_async', level='WARN', obj={
                            'msg': 'FETCH_FAILED',
                            'key': key,
                            'resp': apirespond,
                            'url': api_url
                        })
                        keys.remove(key)  # drop the failing key and retry with another
        # note: if every key fails, the last failed payload is parsed below
        snippet = loads(apirespond)['items'][0]['snippet']
        uploadDate = parse(snippet['publishedAt']).astimezone(timezone.utc)
        title = snippet['title']
        desc = snippet['description']
        thumbnailURL = snippet['thumbnails']['medium']['url']
        utags = snippet['tags'] if 'tags' in snippet else []
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'youtube',
            'uploadDate': uploadDate,
            "unique_id": "youtube:%s" % vidid,
            "utags": utags
        })
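# The YouTube Data API v3 `videos` endpoint returns JSON shaped roughly like
# the following (abridged; only the fields read above are shown, and the
# values are placeholders):
#
#   {"items": [{"snippet": {
#       "publishedAt": "2019-04-27T04:58:45Z",
#       "title": "...",
#       "description": "...",
#       "thumbnails": {"medium": {"url": "...", "width": 320, "height": 180}},
#       "tags": ["..."]}}]}
#
# `tags` is optional in the response, hence the `if 'tags' in snippet` guard.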
class Acfun(Crawler):
    NAME = 'acfun'
    PATTERN = r'^(https:\/\/|http:\/\/)?(www\.)?acfun\.cn\/v\/[aA][cC][\d]+'
    SHORT_PATTERN = r'^[aA][cC][\d]+$'
    HEADERS = makeUTF8({
        'Referer': 'https://www.acfun.cn/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.acfun.cn/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    THUMBNAIL_URL = re.compile(r'"coverUrl":"([\d\w:\/\-\.]+)\?')
    #THUMBNAIL_URL = re.compile(r'https:\/\/imgs\.aixifan\.com\/[\w\-]{8,}')
    THUMBNAIL_URL_2 = re.compile(
        r'https:\/\/cdn\.aixifan\.com\/dotnet\/[\/\w]+\.(jpg|png|jpeg)')
    EXTRACT_NUM = re.compile(r'^[\d]+')

    def normalize_url(self, link):
        link = link.lower()
        return "https://www.acfun.cn/v/" + link[link.rfind("ac"):]

    def expand_url(self, short):
        return "https://www.acfun.cn/v/" + short.lower()

    def unique_id(self, link):
        link = link.lower()
        return 'acfun:%s' % link[link.rfind("ac"):]

    def run(self, content, xpath, link, update_video_detail):
        link = link.lower()
        vidid = link[link.rfind("ac"):]
        thumbnailURL = self.THUMBNAIL_URL.search(content)
        if thumbnailURL:
            thumbnailURL = thumbnailURL.group(1)
        else:
            thumbnailURL = self.THUMBNAIL_URL_2.search(content)
            if thumbnailURL:
                thumbnailURL = thumbnailURL[0]
            else:
                thumbnailURL = ''
        title = xpath.xpath('//h1[@class="title"]/text()')[0]
        desc = try_get_xpath(xpath, [
            '//div[@class="description-container"]/text()',
            '//div[@class="J_description"]/text()',
            '//div[@class="sp1 J_description"]/text()'
        ])[0]
        desc = re.sub(r'<br\s*?\/?>', '\n', desc)
        uploadDate = xpath.xpath('//div[@class="publish-time"]/text()')[0]
        utags = xpath.xpath('//meta[@name="keywords"]/@content')[0]
        utags = list(filter(None, utags.split(',')[1:-4]))
        try:
            uploadDate = parse(uploadDate) - timedelta(hours=8)  # convert from Beijing time to UTC
        except:
            # the page may show a relative time (e.g. "3小时前", "3 hours ago")
            # instead of an absolute date; take the leading number as hours
            hrs_prior = self.EXTRACT_NUM.match(uploadDate)
            if hrs_prior:
                hrs_prior = int(hrs_prior.group(0))
            else:
                hrs_prior = 0
            uploadDate = datetime.utcnow() - timedelta(hours=hrs_prior)
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'acfun',
            'uploadDate': uploadDate,
            "unique_id": "acfun:%s" % vidid,
            "utags": utags
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self, content=content, xpath=xpath, link=link,
                        update_video_detail=update_video_detail)
class BilibiliAudio(Crawler):
    NAME = 'bilibili_audio'
    PATTERN = r'^(http(s)?:\/\/)?(www\.)?bilibili\.com\/audio\/au(\d+)'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }

    def normalize_url(self, link):
        url_result = re.search(self.PATTERN, link)
        if url_result:
            return f'https://www.bilibili.com/audio/au{url_result.group(4)}'
        else:
            return 'https://www.bilibili.com/audio/au0'

    def unique_id(self, link):
        url_result = re.search(self.PATTERN, link)
        if url_result:
            return f'bilibili_audio:{url_result.group(4)}'
        else:
            return ''

    def run(self, content, xpath, link, update_video_detail):
        raise NotImplementedError()

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        url_result = re.search(self.PATTERN, link)
        if url_result:
            auid = url_result.group(4)
        else:
            raise NotImplementedError()
        api_url = f'https://www.bilibili.com/audio/music-service-c/web/song/info?sid={auid}'
        async with aiohttp.ClientSession() as session:
            async with session.get(api_url) as resp:
                api_resp = await resp.json()
                if api_resp['code'] == 0:
                    api_resp = api_resp['data']
                    thumbnailURL = api_resp['cover']
                    title = api_resp['title']
                    desc = api_resp['intro']
                    uploadDate = datetime.fromtimestamp(api_resp['passtime']).astimezone(timezone.utc)
                    uid = f'bilibili_audio:{auid}'
                    utags = []
                    user_space_urls = [f'https://space.bilibili.com/{api_resp["uid"]}']
                    return makeResponseSuccess({
                        'thumbnailURL': thumbnailURL,
                        'title': title,
                        'desc': desc,
                        'site': 'bilibili_audio',
                        'uploadDate': uploadDate,
                        "unique_id": uid,
                        "utags": utags,
                        "user_space_urls": user_space_urls,
                        'extra': {
                            'vip_info': api_resp['vipInfo']
                        }
                    })
                else:
                    raise UserError(f'Bilibili API resp code = {api_resp["code"]}')
class Twitter(Crawler):
    NAME = 'twitter'
    PATTERN = r'^(https:\/\/)?(www\.|mobile\.)?twitter\.com\/[\w]+\/status\/[\d]+'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',  # noqa
        'Accept-Charset': 'UTF-8,*;q=0.5',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',  # noqa
    })
    HEADERS_NO_UTF8 = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',  # noqa
        'Accept-Charset': 'UTF-8,*;q=0.5',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',  # noqa
    }

    def normalize_url(self, link):
        if re.match(r'https?://mobile', link):  # normalize mobile URL
            link = 'https://' + match1(link, r'//mobile\.(.+)')
        item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', link)
        return "https://twitter.com/i/status/" + item_id

    def unique_id(self, link):
        if re.match(r'https?://mobile', link):  # normalize mobile URL
            link = 'https://' + match1(link, r'//mobile\.(.+)')
        item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', link)
        return "twitter:%s" % item_id

    def run(self, content, xpath, link):
        if re.match(r'https?://mobile', link):  # normalize mobile URL
            link = 'https://' + match1(link, r'//mobile\.(.+)')
        screen_name = r1(r'twitter\.com/([^/]+)', link) or r1(r'data-screen-name="([^"]*)"', content) or \
            r1(r'<meta name="twitter:title" content="([^"]*)"', content)
        item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', link) or r1(r'data-item-id="([^"]*)"', content) or \
            r1(r'<meta name="twitter:site:id" content="([^"]*)"', content)
        authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
        ga_url = 'https://api.twitter.com/1.1/guest/activate.json'
        ga_content = post_content(ga_url, headers={'authorization': authorization})
        guest_token = json.loads(ga_content)['guest_token']
        api_url = 'https://api.twitter.com/1.1/statuses/show.json?id=%s' % item_id
        api_content = get_content(api_url, headers={
            'authorization': authorization,
            'x-guest-token': guest_token
        })
        info = json.loads(api_content)
        if 'extended_entities' not in info:
            return makeResponseFailed('Not a twitter video')
        desc = info['text']
        cover = info['extended_entities']['media'][0]['media_url']
        user_name = info['user']['name']
        screen_name = info['user']['screen_name']
        uploadDate = parse(info['created_at']).astimezone(timezone.utc)
        return makeResponseSuccess({
            'thumbnailURL': cover,
            'title': '%s @%s' % (user_name, screen_name),
            'desc': desc,
            'site': 'twitter',
            'uploadDate': uploadDate,
            "unique_id": "twitter:%s" % item_id,
            "url_overwrite": f'https://twitter.com/{screen_name}/status/{item_id}',
            "utags": []
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        if re.match(r'https?://mobile', link):  # normalize mobile URL
            link = 'https://' + match1(link, r'//mobile\.(.+)')
        screen_name = r1(r'twitter\.com/([^/]+)', link) or r1(r'data-screen-name="([^"]*)"', content) or \
            r1(r'<meta name="twitter:title" content="([^"]*)"', content)
        item_id = r1(r'twitter\.com/[^/]+/status/(\d+)', link) or r1(r'data-item-id="([^"]*)"', content) or \
            r1(r'<meta name="twitter:site:id" content="([^"]*)"', content)
        authorization = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
        ga_url = 'https://api.twitter.com/1.1/guest/activate.json'
        async with aiohttp.ClientSession() as session:
            async with session.post(ga_url, headers={'authorization': authorization}) as resp:
                ga_content = await resp.text()
            guest_token = json.loads(ga_content)['guest_token']
            api_url = 'https://api.twitter.com/1.1/statuses/show.json?id=%s' % item_id
            async with session.get(api_url, headers={
                'authorization': authorization,
                'x-guest-token': guest_token
            }) as resp:
                api_content = await resp.text()
        info = json.loads(api_content)
        if 'extended_entities' not in info:
            return makeResponseFailed('Not a twitter video')
        desc = info['text']
        cover = info['extended_entities']['media'][0]['media_url']
        user_name = info['user']['name']
        screen_name = info['user']['screen_name']
        uploadDate = parse(info['created_at']).astimezone(timezone.utc)
        return makeResponseSuccess({
            'thumbnailURL': cover,
            'title': f'{user_name} @{screen_name}',
            'desc': desc,
            'site': 'twitter',
            'uploadDate': uploadDate,
            "unique_id": "twitter:%s" % item_id,
            "url_overwrite": f'https://twitter.com/{screen_name}/status/{item_id}',
            "utags": []
        })
class Nicovideo(Crawler):
    NAME = 'nicovideo'
    PATTERN = r'^(https:\/\/|http:\/\/)?(www\.|sp\.)?(nicovideo\.jp\/watch\/(s|n)m[\d]+|nico\.ms\/(s|n)m[\d]+)'
    SHORT_PATTERN = r'^(s|n)m[\d]+$'
    HEADERS = makeUTF8({
        'Referer': 'https://www.nicovideo.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.nicovideo.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    THUMBNAIL_PATTERN = r'\"(https:\\\\/\\\\/img\.cdn\.nimg\.jp\\\\/s\\\\/nicovideo\\\\/thumbnails\\\\/\d+\\\\/\d+\.\w+\\\\/\w+\?key=\w+)\"'
    USER_ID_MATCHER = r"www\.nicovideo\.jp\\\/user\\\/([\d]+)"

    #def get_cookie(self):
    #    return {
    #        'user_session': 'user_session_69318161_02257179b85d2430deb42ca8763071423671fbf8f531ddcf43185de2e376f686',
    #        'user_session_secure': 'NjkzMTgxNjE6ZXIwOW4yM29YdXUueHFCY2d0Qk5mZHlvOVNROGpjTjV1emRaWFRHZDJqMQ',
    #    }

    def normalize_url(self, link):
        link = link.lower()
        return "https://www.nicovideo.jp/watch/" + link[link.rfind("m") - 1:]

    def expand_url(self, short):
        return "https://www.nicovideo.jp/watch/" + short

    def unique_id(self, link):
        link = link.lower()
        return "nicovideo:%s" % link[link.rfind("m") - 1:]

    def run(self, content, xpath, link, update_video_detail):
        link = link.lower()
        vidid = link[link.rfind("m") - 1:]
        thumbnailURL = try_get_xpath(xpath, [
            '//meta[@itemprop="thumbnailUrl"]/@content',
            '//meta[@name="thumbnail"]/@content'
        ])
        if thumbnailURL:
            thumbnailURL = thumbnailURL[0]
        else:
            # fall back to the JSON-escaped thumbnail URL embedded in the page
            url_result = re.search(self.THUMBNAIL_PATTERN, content)
            if url_result:
                thumbnailURL = url_result.group(1).replace('\\\\/', '/')
                import sys
                print(thumbnailURL, file=sys.stderr)  # debug output
            else:
                thumbnailURL = ''
        title = try_get_xpath(xpath, [
            '//meta[@itemprop="name"]/@content',
            '//meta[@property="og:title"]/@content'
        ])[0]
        jsons = try_get_xpath(xpath, ['//script[@type="application/ld+json"]/text()'])
        desc = None
        for json_str in jsons:
            json_obj = json.loads(json_str)
            if '@type' in json_obj and json_obj['@type'] == 'VideoObject':
                desc = json_obj['description']
                break
        if desc is None:
            desc = try_get_xpath(xpath, [
                ('//p[@itemprop="description"]',
                 lambda ret: [tostring(ret[0], encoding='UTF-8').decode()]),
                '//meta[@itemprop="description"]/@content',
                '//meta[@name="description"]/@content'
            ])[0]
        uploadDate = try_get_xpath(xpath, [
            '//meta[@property="video:release_date"]/@content',
            '//meta[@name="video:release_date"]/@content'
        ])[0]
        desc = re.sub(r'<br\s*?\/?>', '\n', desc)
        soup = BeautifulSoup(desc, features="lxml")
        desc_textonly = ''.join(soup.findAll(text=True))
        uploadDate = parse(uploadDate).astimezone(timezone.utc)
        utags = try_get_xpath(xpath, [
            '//meta[@property="og:video:tag"]/@content',
            '//meta[@itemprop="og:video:tag"]/@content',
            '//meta[@name="og:video:tag"]/@content'
        ])
        user_id = ''
        user_id_match_result = re.search(self.USER_ID_MATCHER, content)
        if user_id_match_result:
            user_id = user_id_match_result.group(1)
        if utags:
            utags = [str(ut) for ut in utags]
        else:
            utags = []
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc_textonly,
            'site': 'nicovideo',
            'uploadDate': uploadDate,
            "unique_id": "nicovideo:%s" % vidid,
            "user_space_urls": [f"https://www.nicovideo.jp/user/{user_id}"] if user_id else [],
            "utags": utags
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self, content=content, xpath=xpath, link=link,
                        update_video_detail=update_video_detail)
class Bilibili(Crawler):
    NAME = 'bilibili'
    PATTERN = r'^(https:\/\/|http:\/\/)?((www|m)\.)?(bilibili\.com\/video\/([aA][vV][\d]+|BV[a-zA-Z0-9]+)|b23\.tv\/([aA][vV][\d]+|BV[a-zA-Z0-9]+))'
    SHORT_PATTERN = r'^([aA][Vv][\d]+|BV[a-zA-Z0-9]+)$'
    VID_MATCH_REGEX = r"([aA][Vv][\d]+|BV[a-zA-Z0-9]+)"
    HEADERS = makeUTF8({
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    BV2AV = _bv2av()

    def get_cookie(self):
        return {
            'SESSDATA': Config.BILICOOKIE_SESSDATA,
            'bili_jct': Config.BILICOOKIE_bili_jct
        }

    def extract_link(self, link):
        ret = re.search(self.VID_MATCH_REGEX, link)
        vid = ret.group(1)
        if vid[:2].lower() == 'av':
            vid = vid.lower()
        if vid[:2].upper() == 'BV':
            vid = 'BV' + vid[2:]
            vid = 'av' + str(self.BV2AV.dec(vid))  # convert BV id to av number
        return vid

    def normalize_url(self, link):
        return "https://www.bilibili.com/video/" + self.extract_link(self=self, link=link)

    def expand_url(self, short):
        return "https://www.bilibili.com/video/" + short

    def unique_id(self, link):
        return 'bilibili:%s' % self.extract_link(self=self, link=link)

    def run(self, content, xpath, link, update_video_detail):
        raise NotImplementedError()

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        vidid = self.extract_link(self=self, link=link)
        try:
            thumbnailURL = xpath.xpath('//meta[@itemprop="thumbnailUrl"]/@content')[0]
            title = xpath.xpath('//h1[@class="video-title"]/@title')[0]
            desc = getInnerText(xpath.xpath('//div[@class="info open"]/node()'))
            uploadDate = parse(xpath.xpath('//meta[@itemprop="uploadDate"]/@content')[0]) - timedelta(hours=8)  # convert from Beijing time to UTC
            utags = xpath.xpath('//meta[@itemprop="keywords"]/@content')[0]
            utags = list(filter(None, utags.split(',')[1:-4]))
        except:
            return makeResponseSuccess({
                'thumbnailURL': '',
                'title': '已失效视频',
                'desc': '已失效视频',
                'site': 'bilibili',
                'uploadDate': datetime.now(),
                "unique_id": "bilibili:%s" % vidid,
                "utags": [],
                "placeholder": True
            })
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'bilibili',
            'uploadDate': uploadDate,
            "unique_id": "bilibili:%s" % vidid,
            "utags": utags
        })
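# `_bv2av` is defined elsewhere in this codebase; only its `dec()` method is
# used above. A minimal sketch of a compatible implementation, assuming the
# widely circulated base-58 table mapping for classic (pre-2023) BV ids; the
# table and constants below come from that public description, not from this
# codebase, and the class name is hypothetical:
class _bv2av_sketch:
    TABLE = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
    S = [11, 10, 3, 8, 4, 6]  # positions of the six payload characters
    XOR = 177451812
    ADD = 8728348608

    def __init__(self):
        # map each table character to its base-58 digit value
        self.tr = {c: i for i, c in enumerate(self.TABLE)}

    def dec(self, bv):
        # e.g. dec('BV17x411w7KC') == 170001, i.e. av170001
        r = sum(self.tr[bv[self.S[i]]] * 58 ** i for i in range(6))
        return (r - self.ADD) ^ self.XOR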
class Xigua(Crawler):
    NAME = 'Xigua'
    PATTERN = r'^https?\:\/\/(www\.)?ixigua.com/(\d+)'
    SHORT_PATTERN = r''
    HEADERS = makeUTF8({
        'Referer': 'https://www.ixigua.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.ixigua.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    META_MATCH_OBJ = r"<script\s+data-react-helmet=\"true\"\s+type=\"application/ld\+json\">\s*(.*?)\s*</script>"
    USER_ID_MATCH_OBJ = r"\"user_id\":(\d+)"

    def get_cookie(self):
        return {
            'ttwid': '1%7Cfjd63xV6vk-PylvXbSpJ6X3A6TA9GDxriyUbQWjDsBs%7C1614899329%7C131ef1e44612efea0743459a6fe967e70a2ca5ece23fda9c9b3983354d3d00fe'
        }

    def normalize_url(self, link):
        ret = re.search(self.PATTERN, link)
        vid = ret.group(2)
        return f'https://www.ixigua.com/{vid}'

    def expand_url(self, short):
        return short

    def unique_id(self, link):
        ret = re.search(self.PATTERN, link)
        vid = ret.group(2)
        return f'xigua:{vid}'

    def run(self, content, xpath, link, update_video_detail):
        metadata = re.search(self.META_MATCH_OBJ, content)
        user_id = re.search(self.USER_ID_MATCH_OBJ, content)
        if user_id:
            user_id = user_id.group(1)
        if metadata:
            metadata = loads(metadata.group(1))
            title = metadata['name'].removesuffix(' - 西瓜视频')
            desc = metadata['description']
            cover = metadata['thumbnailUrl'][0] if 'thumbnailUrl' in metadata else metadata['image'][0]
            upload_time = parse(metadata['datePublished']).astimezone(timezone.utc)
        else:
            raise Exception('Cannot find metadata or user_id object')
        return makeResponseSuccess({
            'thumbnailURL': cover,
            'title': title,
            'desc': desc,
            'site': 'xigua',
            'uploadDate': upload_time,
            "unique_id": self.unique_id(self=self, link=link),
            "user_space_urls": [f"https://www.ixigua.com/home/{user_id}"] if user_id else [],
            "utags": []
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self, content=content, xpath=xpath, link=link,
                        update_video_detail=update_video_detail)
class Nicovideo(Crawler):
    NAME = 'nicovideo'
    PATTERN = r'^(https:\/\/|http:\/\/)?(www\.)?(nicovideo\.jp\/watch\/(s|n)m[\d]+|nico\.ms\/(s|n)m[\d]+)'
    SHORT_PATTERN = r'^(s|n)m[\d]+$'
    HEADERS = makeUTF8({
        'Referer': 'https://www.nicovideo.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.nicovideo.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }

    def normalize_url(self, link):
        link = link.lower()
        return "https://www.nicovideo.jp/watch/" + link[link.rfind("m") - 1:]

    def expand_url(self, short):
        return "https://www.nicovideo.jp/watch/" + short

    def unique_id(self, link):
        link = link.lower()
        return "nicovideo:%s" % link[link.rfind("m") - 1:]

    def run(self, content, xpath, link, update_video_detail):
        link = link.lower()
        vidid = link[link.rfind("m") - 1:]
        thumbnailURL = try_get_xpath(xpath, [
            '//meta[@itemprop="thumbnailUrl"]/@content',
            '//meta[@name="thumbnail"]/@content'
        ])[0]
        title = try_get_xpath(xpath, [
            '//meta[@itemprop="name"]/@content',
            '//meta[@property="og:title"]/@content'
        ])[0]
        jsons = try_get_xpath(xpath, ['//script[@type="application/ld+json"]/text()'])
        desc = None
        for json_str in jsons:
            json_obj = json.loads(json_str)
            if '@type' in json_obj and json_obj['@type'] == 'VideoObject':
                desc = json_obj['description']
                break
        if desc is None:
            desc = try_get_xpath(xpath, [
                ('//p[@itemprop="description"]',
                 lambda ret: [tostring(ret[0], encoding='UTF-8').decode()]),
                '//meta[@itemprop="description"]/@content',
                '//meta[@name="description"]/@content'
            ])[0]
        uploadDate = try_get_xpath(xpath, [
            '//meta[@property="video:release_date"]/@content',
            '//meta[@name="video:release_date"]/@content'
        ])[0]
        desc = re.sub(r'<br\s*?\/?>', '\n', desc)
        soup = BeautifulSoup(desc, features="lxml")
        desc_textonly = ''.join(soup.findAll(text=True))
        uploadDate = parse(uploadDate).astimezone(timezone.utc)
        utags = try_get_xpath(xpath, [
            '//meta[@property="og:video:tag"]/@content',
            '//meta[@itemprop="og:video:tag"]/@content',
            '//meta[@name="og:video:tag"]/@content'
        ])
        utags = [str(ut) for ut in utags]
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc_textonly,
            'site': 'nicovideo',
            'uploadDate': uploadDate,
            "unique_id": "nicovideo:%s" % vidid,
            "utags": utags
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self, content=content, xpath=xpath, link=link,
                        update_video_detail=update_video_detail)
class Bilibili(Crawler):
    NAME = 'bilibili'
    PATTERN = r'^((https:\/\/|http:\/\/)?((www|m)\.)?(bilibili\.com\/video\/([aA][vV][\d]+|BV[a-zA-Z0-9]+)).*|https:\/\/b23\.tv\/\w+)'
    SHORT_PATTERN = r'^([aA][Vv][\d]+|[Bb][Vv][a-zA-Z0-9]+)$'
    VID_MATCH_REGEX = r"([aA][Vv][\d]+|[Bb][Vv][a-zA-Z0-9]+)"
    AID_MATCH_REGEX = r"__INITIAL_STATE__\s*=\s*{\"aid\"\:(\d+),"
    HEADERS = makeUTF8({
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    })
    HEADERS_NO_UTF8 = {
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': '"Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/65.0"'
    }
    BV2AV = _bv2av()

    def get_cookie(self):
        return {
            'SESSDATA': Config.BILICOOKIE_SESSDATA,
            'bili_jct': Config.BILICOOKIE_bili_jct
        }

    # TODO: cannot detect here whether the requested p number exceeds the
    # video's actual part count (run_async checks it against the pagelist API)
    def extract_link(self, link):
        ret = re.search(self.VID_MATCH_REGEX, link)
        if ret is None and 'b23.tv' in link:
            # b23.tv short link whose path is not an av/BV id; signal the
            # caller to resolve the id from the page content instead
            return None, None, True
        parsed_link = urlparse(link)
        qs_dict = parse_qs(parsed_link.query)
        p_num = 1
        try:
            p_num = int(qs_dict['p'][0])
        except:
            pass
        vid = ret.group(1)
        if vid[:2].lower() == 'av':
            vid = vid.lower()
        if vid[:2].upper() == 'BV':
            vid = 'BV' + vid[2:]
            vid = 'av' + str(self.BV2AV.dec(vid))
        return vid, p_num, False

    def normalize_url(self, link):
        vidid, p_num, b23vid = self.extract_link(self=self, link=link)
        if b23vid:
            return link
        else:
            return f"https://www.bilibili.com/video/{vidid}?p={p_num}"

    def expand_url(self, short):
        if short[:2].lower() == 'av':
            short = short.lower()
        if short[:2].upper() == 'BV':
            short = 'BV' + short[2:]
            short = 'av' + str(self.BV2AV.dec(short))
        return f"https://www.bilibili.com/video/{short}?p=1"

    def unique_id(self, link):
        vidid, p_num, b23vid = self.extract_link(self=self, link=link)
        if b23vid:
            return ''
        else:
            return 'bilibili:%s-%d' % (vidid, p_num)

    def run(self, content, xpath, link, update_video_detail):
        raise NotImplementedError()

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        uid = ''
        new_url = ''
        try:
            aid, p_num, b23vid = self.extract_link(self=self, link=link)
            if b23vid:
                # b23.tv short link: recover the av number from the page's
                # __INITIAL_STATE__ blob
                aid_match = re.search(self.AID_MATCH_REGEX, content)
                aid = 'av' + aid_match.group(1)
                new_url = f"https://www.bilibili.com/video/{aid}?p=1"
                p_num = 1
                uid = 'bilibili:%s-1' % aid
            else:
                new_url = link
                uid = self.unique_id(self=self, link=link)
            aid = aid[2:]  # remove 'av'
            api_url = f'http://api.bilibili.com/x/web-interface/view?aid={aid}'
            async with aiohttp.ClientSession() as session:
                async with session.get(api_url) as resp:
                    api_content = await resp.json()
            code = api_content['code']
            if code != 0 or 'data' not in api_content:
                raise Exception(f'api request failed, message:\n{api_content}')
            data = api_content['data']
            thumbnailURL = data['pic']
            title = data['title']
            desc = data['desc']
            uploadDate = datetime.fromtimestamp(data['pubdate']).astimezone(timezone.utc)
            api_url = f'http://api.bilibili.com/x/tag/archive/tags?aid={aid}'
            async with aiohttp.ClientSession() as session:
                async with session.get(api_url) as resp:
                    api_content = await resp.json()
            code = api_content['code']
            if code != 0 or 'data' not in api_content:
                utags = []
            else:
                utags = [item['tag_name'] for item in api_content['data']]
            if 'staff' in data:
                user_space_urls = ['https://space.bilibili.com/%d' % x['mid'] for x in data['staff']]
            elif 'owner' in data:
                user_space_urls = ['https://space.bilibili.com/%d' % data['owner']['mid']]
            else:
                user_space_urls = []  # defensive default, added; API responses normally include 'owner'
            cid = 0
            async with aiohttp.ClientSession() as session:
                async with session.get(
                        f'https://api.bilibili.com/x/player/pagelist?aid={aid}&jsonp=jsonp') as resp:
                    api_content = await resp.text()
                    if resp.status == 200:
                        api_obj = loads(api_content)
                        num_parts = len(api_obj['data'])
                        if p_num < 1 or p_num > num_parts:
                            raise UserError(f'P number out of range, should be in [1, {num_parts}]')
                        part_name = api_obj['data'][p_num - 1]['part']
                        cid = api_obj['data'][p_num - 1]['cid']
                    else:
                        raise Exception(f'api request failed, message:\n{api_content}')
        except UserError as ex:
            raise ex
        except:
            return makeResponseSuccess({
                'thumbnailURL': '',
                'title': '已失效视频',
                'desc': '已失效视频',
                'site': 'bilibili',
                'uploadDate': datetime.now(),
                "unique_id": uid,
                "utags": [],
                "url_overwrite": new_url,
                "placeholder": True
            })
        return makeResponseSuccess({
            'thumbnailURL': thumbnailURL,
            'title': title,
            'desc': desc,
            'site': 'bilibili',
            'uploadDate': uploadDate,
            "unique_id": uid,
            "utags": utags,
            "url_overwrite": new_url,
            "user_space_urls": user_space_urls,
            'extra': {
                'part_name': part_name,
                'cid': cid
            }
        })
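# A worked example of extract_link, assuming the classic BV->av mapping noted
# earlier (BV17x411w7KC corresponds to av170001 under that mapping):
#
#   Bilibili.extract_link(self=Bilibili,
#       link='https://www.bilibili.com/video/BV17x411w7KC?p=2')
#   # -> ('av170001', 2, False)
#
#   Bilibili.extract_link(self=Bilibili, link='https://b23.tv/AbCdEf')
#   # -> (None, None, True)   # short link; run_async resolves the id from the page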
class SinaPC(Crawler):
    NAME = 'weibo-pc'
    SHORT_PATTERN = r''
    PATTERN = r'^(https:\/\/|http:\/\/)?weibo\.(com|cn)\/tv\/v\/(\w+)\?fid=(\d+:\d+)'
    Cookie = 'SINAGLOBAL=4002460776686.9824.1585321155178; UOR=,,m.weibo.cn; YF-V5-G0=125128c5d7f9f51f96971f11468b5a3f; _s_tentry=-; Apache=8703795817895.288.1585556164345; ULV=1585556164370:2:2:1:8703795817895.288.1585556164345:1585321155211; YF-Page-G0=091b90e49b7b3ab2860004fba404a078|1585563210|1585563210; WBStorage=42212210b087ca50|undefined; login_sid_t=52bd5c499b65543341c46965f3d3267b; cross_origin_proto=SSL; Ugrow-G0=7e0e6b57abe2c2f76f677abd9a9ed65d; wb_view_log=2560*14401; WBtopGlobal_register_version=3d5b6de7399dfbdb; crossidccode=CODE-yf-1JiRvU-226WPk-QGmVB9KHtatfm2Ec7ed84; ALF=1617099576; SSOLoginState=1585563577; SUB=_2A25zhbfpDeRhGeBI7lIV9i_IzTuIHXVQ8q4hrDV8PUNbmtANLXDukW9NRpACXBpVshRIjli1oSoWs_HnV-7brere; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhMKKdEqihzDJl7MPTpyF_b5JpX5KzhUgL.FoqcSK5XSo2XSoM2dJLoI7LpUcf.eh.RShqt; SUHB=0WGMeWn5GWqB9T; wvr=6'
    HEADERS = makeUTF8({
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'cookie': Cookie
    })
    HEADERS_NO_UTF8 = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'cookie': Cookie
    }

    def get_cookie(self):
        return {
            'SINAGLOBAL': '4002460776686.9824.1585321155178',
            'UOR': ',,m.weibo.cn',
            'YF-V5-G0': '125128c5d7f9f51f96971f11468b5a3f',
            '_s_tentry': '-',
            'Apache': '8703795817895.288.1585556164345',
            'ULV': '1585556164370:2:2:1:8703795817895.288.1585556164345:1585321155211',
            'YF-Page-G0': '091b90e49b7b3ab2860004fba404a078|1585563210|1585563210',
            'WBStorage': '42212210b087ca50|undefined',
            'login_sid_t': '52bd5c499b65543341c46965f3d3267b',
            'cross_origin_proto': 'SSL',
            'Ugrow-G0': '7e0e6b57abe2c2f76f677abd9a9ed65d',
            'wb_view_log': '2560*14401',
            'WBtopGlobal_register_version': '3d5b6de7399dfbdb',
            'crossidccode': 'CODE-yf-1JiRvU-226WPk-QGmVB9KHtatfm2Ec7ed84',
            'ALF': '1617099576',
            'SSOLoginState': '1585563577',
            'SUB': '_2A25zhbfpDeRhGeBI7lIV9i_IzTuIHXVQ8q4hrDV8PUNbmtANLXDukW9NRpACXBpVshRIjli1oSoWs_HnV-7brere',
            'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9WhMKKdEqihzDJl7MPTpyF_b5JpX5KzhUgL.FoqcSK5XSo2XSoM2dJLoI7LpUcf.eh.RShqt',
            'SUHB': '0WGMeWn5GWqB9T',
            'wvr': '6'
        }

    def normalize_url(self, link):
        ret = re.search(self.PATTERN, link)
        vid = ret.group(4)
        rnd = ret.group(3)
        return f"https://weibo.com/tv/v/{rnd}?fid={vid}"

    def unique_id(self, link):
        ret = re.search(self.PATTERN, link)
        vid = ret.group(4)
        return f'weibo-pc:{vid}'

    def run(self, content, xpath, link, update_video_detail):
        soup = BeautifulSoup(content, "lxml")
        description = soup.find('div', class_='info_txt W_f14')
        description = description.get_text()
        #user = soup.find('span', class_='W_f14 L_autocut bot_name W_fl')
        #user_name = user.get_text()
        add_time = soup.find('div', class_='broad_time W_f12')
        add_time = add_time.get_text()
        vidid = self.unique_id(self, link)
        return makeResponseSuccess({
            'thumbnailURL': '',
            'title': description,
            'desc': description,
            'site': 'weibo-pc',
            'uploadDate': parse(add_time).astimezone(timezone.utc),
            "unique_id": vidid,
            'utags': []
        })

    async def unique_id_async(self, link):
        return self.unique_id(self=self, link=link)

    async def run_async(self, content, xpath, link, update_video_detail):
        return self.run(self=self, content=content, xpath=xpath, link=link,
                        update_video_detail=update_video_detail)
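# How these crawler classes are selected and invoked is not shown in this
# section. A minimal dispatch sketch, assuming a registry of the classes above
# and following their convention of passing the class object itself as `self`
# rather than instantiating; the function and variable names are hypothetical:
def dispatch_crawler(link, crawler_classes):
    # return (crawler, normalized URL, unique id) for the first PATTERN match
    for c in crawler_classes:
        pattern = getattr(c, 'PATTERN', '')
        if pattern and re.search(pattern, link):
            return c, c.normalize_url(self=c, link=link), c.unique_id(self=c, link=link)
    return None, None, None

# Example: dispatch_crawler('https://www.acfun.cn/v/ac123456', [Acfun, Youtube])
# matches Acfun and yields ('https://www.acfun.cn/v/ac123456', 'acfun:ac123456').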