def get_category_from_web_page(self): category_dict = {'0': {'title': u'全部', 'url': HOME_URL, 'subs': []}} node = category_dict['0'] url = node['url'] result = BeautifulSoup(utils.get_page_content(url), "html.parser").findAll('li', {'class': 'm-i'}) for item in result: if len(item['class']) != 1: continue tid = item['data-tid'] title = item.em.contents[0] url = 'http:' + item.a['href'] category_dict[tid] = {'title': title, 'url': url, 'subs': []} node['subs'].append(tid) #Fix video and movie if '11' not in category_dict['0']['subs']: category_dict['0']['subs'].append('11') if '23' not in category_dict['0']['subs']: category_dict['0']['subs'].append('23') category_dict['11'] = { 'title': u'电视剧', 'url': 'http://bangumi.bilibili.com/tv/', 'subs': [] } category_dict['23'] = { 'title': u'电影', 'url': 'http://bangumi.bilibili.com/movie/', 'subs': [] } for sub in category_dict['0']['subs']: node = category_dict[sub] url = node['url'] result = BeautifulSoup(utils.get_page_content(url), "html.parser").select('ul.n_num li') for item in result[1:]: if not item.has_attr('tid'): continue if not hasattr(item, 'a'): continue if item.has_attr('class'): continue tid = item['tid'] title = item.a.contents[0] if item.a['href'][:2] == '//': url = 'http:' + item.a['href'] else: url = HOME_URL + item.a['href'] category_dict[tid] = {'title': title, 'url': url, 'subs': []} node['subs'].append(tid) return category_dict
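# Hedged usage sketch, not part of the original sources: get_category_from_web_page above
# returns a dict keyed by tid, where every node carries 'title', 'url' and a 'subs' list of
# child tids and '0' is the root. The walker and sample data below assume only that shape.
def walk_categories(category_dict, tid='0', depth=0):
    node = category_dict[tid]
    print('  ' * depth + node['title'] + ' -> ' + node['url'])
    for sub in node['subs']:
        walk_categories(category_dict, sub, depth + 1)

sample = {
    '0': {'title': u'全部', 'url': 'http://www.bilibili.com/', 'subs': ['11']},
    '11': {'title': u'电视剧', 'url': 'http://bangumi.bilibili.com/tv/', 'subs': []},
}
walk_categories(sample)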
def get_captcha(self, path=None): if not requests.utils.dict_from_cookiejar(self.cj).has_key('sid'): utils.get_page_content( LOGIN_CAPTCHA_URL.format(random.random()), headers={'Referer': 'https://passport.bilibili.com/login'}) result = utils.get_page_content( LOGIN_CAPTCHA_URL.format(random.random()), headers={'Referer': 'https://passport.bilibili.com/login'}) if path == None: path = tempfile.gettempdir() + '/captcha.jpg' with open(path, 'wb') as f: f.write(result) return path
def get_captcha(self, path=None): utils.get_page_content('https://passport.bilibili.com/login') result = utils.get_page_content( LOGIN_CAPTCHA_URL, headers={ 'Referer': 'https://passport.bilibili.com/ajax/miniLogin/minilogin' }) if path == None: path = tempfile.gettempdir() + '/captcha.jpg' with open(path, 'wb') as f: f.write(result) return path
def _parse_urls(self, page_content, need_subtitle = True): self._print_info('Parsing page') url_params = self.URL_PARAMS.findall(page_content) interface_full_url = '' # If the first regex matched successfully if url_params and len(url_params) == 1 and url_params[0]: interface_full_url = self.INTERFACE_URL.format(str(url_params[0])) # If it did not match, fall back to the second regex if not url_params: self._print_info('Parsing page by another regex') url_params = self.URL_PARAMS2.findall(page_content) if url_params and len(url_params) == 1 and url_params[0]: interface_full_url = self.INTERFACE_URL.format(str(url_params[0])) if interface_full_url: self._print_info('Interface url: ' + interface_full_url) # Parse the RSS page self._print_info('Getting video address by interface page') content = utils.get_page_content(interface_full_url) self._print_info('Interface page length: ' + str(len(content))) doc = minidom.parseString(content) parts = doc.getElementsByTagName('durl') self._print_info('Video parts found: ' + str(len(parts))) result = [] # Collect all video URLs for part in parts: urls = part.getElementsByTagName('url') if len(urls) > 0: result.append(urls[0].firstChild.nodeValue) if need_subtitle: return (result, self._parse_subtitle(url_params[0])) else: return (result, '') else: self._print_info('Interface url not found!') return ([], '')
def _parse_urls(self, page_content): url_params = self.URL_PARAMS.findall(page_content) interface_full_url = '' # If the first regex matched successfully if url_params and len(url_params) == 1 and url_params[0]: interface_full_url = self.INTERFACE_URL.format(str(url_params[0])) # If it did not match, fall back to the second regex if not url_params: url_params = self.URL_PARAMS2.findall(page_content) if url_params and len(url_params) == 1 and url_params[0]: interface_full_url = self.INTERFACE_URL.format(str(url_params[0])) if interface_full_url: # Parse the RSS page content = utils.get_page_content(interface_full_url) doc = minidom.parseString(content) parts = doc.getElementsByTagName('durl') result = [] # Collect all video URLs for part in parts: urls = part.getElementsByTagName('url') if len(urls) > 0: result.append(urls[0].firstChild.nodeValue) return (result, self._parse_subtitle(url_params[0])) print interface_full_url return ([], '')
def get_video_urls(self, url, need_subtitle=True): self._print_info('Getting video address') page_full_url = self.BASE_URL + url self._print_info('Page url: ' + page_full_url) page_content = utils.get_page_content(page_full_url) self._print_info('Origin page length: ' + str(len(page_content))) return self._parse_urls(page_content, need_subtitle)
def get_hot_items(self, category): self._print_info('Getting HOT CAT Items') self._print_info(category) cat_url = self._get_cat_url(category) self._print_info(cat_url) #parse_result = feedparser.parse(cat_url) self._print_info('HOT CAT Items fetched succeeded!') html = utils.get_page_content(cat_url) temp = [] pager = re.compile('<ul class="rlist">(.+?)</ul>').findall(html) if len(pager): links = re.compile( '<a href="/video/(.+?)/" title="(.+?)" target="_blank">(.+?)</a>' ).findall(pager[0]) for p in links: img = re.compile('<img src="(.+?)"').findall(p[2]) temp.append({ 'title': p[1], 'link': p[0], 'category': category, 'description': p[0], 'thumbnail': img[0], 'published': p[0] }) return temp
def _get_index_items(self, url): pickle_file_by_word = tempfile.gettempdir() + '/' + url.split('/')[-1].strip() + '_word_tmp.pickle' pickle_file_by_month = tempfile.gettempdir() + '/' + url.split('/')[-1].strip() + '_month_tmp.pickle' if os.path.exists(pickle_file_by_word) and os.path.exists(pickle_file_by_month) and not self._need_rebuild(pickle_file_by_word) and not self._need_rebuild(pickle_file_by_month): return pickle.load(open(pickle_file_by_word, 'rb')), pickle.load(open(pickle_file_by_month, 'rb')) else: page_content = utils.get_page_content(url) results_dict = dict() results_month_dict = dict() parts = page_content.split('<h3>') for part in parts: results = self.ITEMS.findall(part) key = part[0] results_dict[key] = [] for r in results: results_dict[key].append((r[1], r[2], r[0])) if r[0] in results_month_dict.keys(): results_month_dict[r[0]].append((r[1], r[2])) else: results_month_dict[r[0]] = [(r[1], r[2])] word_file = open(pickle_file_by_word, 'wb') month_file = open(pickle_file_by_month, 'wb') pickle.dump(results_dict, word_file) pickle.dump(results_month_dict, month_file) return results_dict, results_month_dict
def get_video_list(self, av_id): page_full_url = self.BASE_URL + 'video/av' + str(av_id) + '/' page_content = utils.get_page_content(page_full_url) parts = self.PARTS.findall(page_content) if len(parts) == 0: return [(u'播放', 'video/av' + str(av_id) + '/')] else: return [(part[1], part[0][1:]) for part in parts]
def get_dynamic(self, page=1, pagesize=10): if self.is_login == False: return [] url = DYNAMIC_URL.format(pagesize, page) result = json.loads(utils.get_page_content(url)) total_page = int( (result['data']['page']['count'] + pagesize - 1) / pagesize) return result['data']['feeds'], total_page
def get_bangumi_detail(self, season_id): url = BANGUMI_SEASON_URL.format(season_id) result = utils.get_page_content(url) if result[0] != '{': start = result.find('(') + 1 end = result.find(');') result = result[start:end] result = json.loads(result) return result['result']
def fetch(cls, url, use_cache=True): m = re.match(r'^http://([a-z]{2})\.wikipedia\.org', url) page_lang = m.group(1).encode('utf8') page_title = extract_page_title(url, page_lang) wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang) return cls( page_title, get_page_content(wp, page_title, page_lang, use_cache) or '', page_lang)
def get_cat_items(self, category): self._print_info('Getting CAT Items') self._print_info(category) cat_url = self._get_cat_url(category) self._print_info(cat_url) #parse_result = feedparser.parse(cat_url) self._print_info('CAT Items fetched succeeded!') html = utils.get_page_content(cat_url) attrs = re.compile( '<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview" title="(.+?)"><img src="(.+?)" alt="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title" title="(.+?)">(.+?)</a>' ).findall(html) #<a href="/video/av2315815/" target="_blank" class="preview" title="【犯罪】【1990】极道之妻 最后的战争 主演: 岩下志麻 导演: 山下耕作"><img src="http://i2.hdslb.com/320_180/video/0f/0f3662fcc909ad31da2963108ea4d9f6.jpg" alt="【犯罪】【1990】极道之妻 最后的战争 主演: 岩下志麻 导演: 山下耕作"></a> temp = [{ 'title': i[1], 'description': i[1], 'link': '', 'category': category, 'thumbnail': i[2], 'published': i[0] } for i in attrs] #for t in temp: # for tt in t: # self._print_info(t[tt]) pager = re.compile('<div class="pagelistbox">(.+?)</div>').findall( html) if len(pager): links = re.compile('href="/video/(.+?)">(.+?)</a>').findall( pager[0]) for p in links: if p[1] == '下页' or p[1] == '末页' or p[1] == '首页 ' or p[ 1] == '上页 ': temp.append({ 'title': p[1], 'link': p[0], 'category': p[0], 'description': p[1], 'thumbnail': p[1], 'published': p[0] }) else: temp.append({ 'title': '第' + p[1] + '页', 'link': p[0], 'category': p[0], 'description': p[1], 'thumbnail': p[1], 'published': p[0] }) return temp
def get_history(self, page=1, pagesize=10): if self.is_login == False: return [] url = HISTORY_URL.format(page, pagesize) result = json.loads(utils.get_page_content(url)) if len(result['data']) >= int(pagesize): total_page = int(page) + 1 else: total_page = int(page) return result['data'], total_page
def get_cat_items(self, category): self._print_info('Getting CAT Items') self._print_info(category) cat_url = self._get_cat_url(category) self._print_info(cat_url) #parse_result = feedparser.parse(cat_url) self._print_info('CAT Items fetched succeeded!') html = utils.get_page_content(cat_url) attrs = re.compile( '<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview"><img src="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title">(.+?)</a>' ).findall(html) temp = [{ 'title': i[3], 'description': i[3], 'link': '', 'category': category, 'thumbnail': i[1], 'published': i[0] } for i in attrs] #for t in temp: # for tt in t: # self._print_info(t[tt]) pager = re.compile('<div class="pagelistbox">(.+?)</div>').findall( html) if len(pager): links = re.compile('href="/video/(.+?)">(.+?)</a>').findall( pager[0]) for p in links: if p[1] == '下页' or p[1] == '末页' or p[1] == '首页 ' or p[ 1] == '上页 ': temp.append({ 'title': p[1], 'link': p[0], 'category': p[0], 'description': p[1], 'thumbnail': p[1], 'published': p[0] }) else: temp.append({ 'title': '第' + p[1] + '页', 'link': p[0], 'category': p[0], 'description': p[1], 'thumbnail': p[1], 'published': p[0] }) return temp
def get_encryped_pwd(self, pwd): import rsa result = json.loads( utils.get_page_content( LOGIN_HASH_URL.format(random.random()), headers={'Referer': 'https://passport.bilibili.com/login'})) pwd = result['hash'] + pwd key = result['key'] pub_key = rsa.PublicKey.load_pkcs1_openssl_pem(key) pwd = rsa.encrypt(pwd.encode('utf-8'), pub_key) pwd = base64.b64encode(pwd) pwd = urllib.quote(pwd) return pwd
def get_av_list_detail(self, aid, page=1, fav=0, pagesize=10): params = {'id': aid, 'page': page} if fav != 0: params['fav'] = fav url = VIEW_URL.format(self.api_sign(params)) result = json.loads(utils.get_page_content(url)) results = [result] if (int(page) < result['pages']) and (pagesize > 1): results += self.get_av_list_detail(aid, int(page) + 1, fav, pagesize=pagesize - 1)[0] return results, result['pages']
def get_video_parts(self, category, video, part): self._print_info('Getting Video Parts') self._print_info(video) #parse_result = feedparser.parse(cat_url) self._print_info('Parts fetched succeeded!') video_urls = [] title = part description = part thumbnail = video id = video #url=urllib.urlencode(video) #p_url='http://www.flvcd.com/parse.php?format=&'+url p_url = 'http://api.xinfan.tv:9999/vids/' + video videourl = utils.get_page_content(p_url) self._print_info(videourl) decodejson = {} if videourl: decodejson = json.loads(videourl) self._print_info('p_url') self._print_info(p_url) self._print_info(videourl) if decodejson['status']: for url in decodejson['urls']: video_urls.append({ 'title': '直接播放', 'link': url, 'category': category, 'description': description, 'thumbnail': thumbnail, 'published': id }) else: video_urls.append({ 'title': '无法播放', 'link': videourl, 'category': category, 'description': description, 'thumbnail': thumbnail, 'published': id }) return video_urls
def _get_search_items(self, keyword): search_url = r'http://www.bilibili.tv/search?keyword='+keyword+'&pagesize=500' pickle_file = tempfile.gettempdir() + '/' + keyword + '_tmp.pickle' if os.path.exists(pickle_file) and not self._need_rebuild(pickle_file): return pickle.load(open(pickle_file, 'rb')) else: page_content = utils.get_page_content(search_url) r = self.SEARCH.findall(page_content) results = [] for li,na in r: na = self.NOTAG.sub('',na) results.append((li,na)) word_file = open(pickle_file, 'wb') pickle.dump(results, word_file) return results
def get_video_urls(self, cid): m = hashlib.md5() m.update(INTERFACE_PARAMS.format(str(cid), SECRETKEY_MINILOADER)) url = INTERFACE_URL.format(str(cid), m.hexdigest()) doc = minidom.parseString(utils.get_page_content(url)) urls = [ durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl') ] urls = [ url if not re.match(r'.*\.qqvideo\.tc\.qq\.com', url) else re.sub( r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', url) for url in urls ] return urls
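# Hedged sketch of the signing pattern used by the two get_video_urls variants above: the
# playurl request is authorised by appending an md5 of the query parameters concatenated
# with an app secret. The templates and secret below are placeholders for illustration,
# not the real INTERFACE_PARAMS / INTERFACE_URL / SECRETKEY_MINILOADER values.
import hashlib

DUMMY_INTERFACE_PARAMS = 'cid={0}&player=1{1}'  # placeholder format string, secret appended last
DUMMY_INTERFACE_URL = 'http://interface.example.com/playurl?cid={0}&sign={1}'  # placeholder
DUMMY_SECRET = 'not-the-real-secret'

def build_signed_url(cid):
    signed = DUMMY_INTERFACE_PARAMS.format(str(cid), DUMMY_SECRET)
    sign = hashlib.md5(signed.encode('utf-8')).hexdigest()
    return DUMMY_INTERFACE_URL.format(str(cid), sign)

# build_signed_url(1234) -> 'http://interface.example.com/playurl?cid=1234&sign=<32-char md5>'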
def get_video_urls(cid): interface_full_url = INTERFACE_URL.format(str(cid)) print_info('Interface url: ' + interface_full_url) # Parse the RSS page content = utils.get_page_content(interface_full_url) doc = minidom.parseString(content) parts = doc.getElementsByTagName('durl') print_info('Video parts found: ' + str(len(parts))) result = [] # Collect all video URLs for part in parts: urls = part.getElementsByTagName('url') if len(urls) > 0: result.append(urls[0].firstChild.nodeValue) return result
def thread_func(): for index, link in enumerate(links): if self._stop_event.is_set(): return file_name, start_identifier = link.href.split('#') next_file_name, end_identifier = links[index + 1].href.split( '#') if index + 1 < len(links) else [None, None] content = book_file.get_item_with_href(file_name).get_content() page_content = None if next_file_name == file_name: page_content = get_page_content(content, start_identifier, end_identifier) else: page_content = get_page_content(content, start_identifier, None) parsed_content = parse_content(page_content) page = Page(parsed_content) self._add_page(page, index)
def get_video_paths(self,category,video): self._print_info('Getting Video Urls') video_urls=[] title=video description=video thumbnail=video id=video p_url='http://www.bilibili.com/video/'+id html = utils.get_page_content(p_url) self._print_info(p_url) self._print_info('Video url fetched succeeded!') attrs = re.compile("<option value='(.+?)'>(.+?)</option>").findall(html) parts =False for i in attrs: parts=True video_urls.append({ 'title': i[1], 'link': i[0], 'part': i[0], 'category':category, 'description': i[1], 'thumbnail':i[1], 'published': id}) if parts: self._print_info('has parts') else: self._print_info('no parts') self._print_info('End of fetch') return video_urls
def _get_index_items_from_web(self, url): page_content = utils.get_page_content(url) results_dict = dict() results_month_dict = dict() parts = page_content.split('<h3>') for part in parts: results = self.ITEMS.findall(part) key = part[0] results_dict[key] = [] for r in results: results_dict[key].append((r[1], r[2], r[0])) if r[0] in results_month_dict.keys(): results_month_dict[r[0]].append((r[1], r[2])) else: results_month_dict[r[0]] = [(r[1], r[2])] return results_dict, results_month_dict
def get_subtitle(cid): url = COMMENT_URL.format(cid) print_info('Page full url: ' + url) input = get_tmp_dir() + '/tmp.xml' output = get_tmp_dir() + '/tmp.ass' local_file = open(input, "w") local_file.write(utils.get_page_content(url)) local_file.close() Danmaku2ASS(input, output, WIDTH, HEIGHT, font_size=FONT_SIZE, text_opacity=TEXT_OPACITY, is_reduce_comments=IS_REDUCE_COMMENTS, duration_marquee=DURATION_MARQUEE, duration_still=DURATION_STILL ) return output
def get_video_parts2(self, category, video, part): # Multi-part videos are handled by this function, starting from the index # self._print_info('Getting Video Parts') self._print_info(video) #parse_result = feedparser.parse(cat_url) self._print_info('Parts fetched succeeded!') video_urls = [] title = part description = part thumbnail = video id = video url = urllib.urlencode({'kw': 'http://www.bilibili.com' + part}) p_url = 'http://www.flvcd.com/parse.php?format=&' + url html2 = utils.get_page_content(p_url) html2 = html2.decode('gbk').encode('utf-8') attrs2 = re.compile( '<input type="hidden" name="(.+?)" value="(.*?)"').findall(html2) filename = "" inf = "" for i in (attrs2): if i[0] == "filename": filename = i[1] if i[0] == "inf": inf = i[1] link = inf video_urls.append({ 'title': '播放', 'link': link, 'category': category, 'description': description, 'thumbnail': thumbnail, 'published': id }) return video_urls
def get_video_paths(self, category, video): self._print_info('Getting Video Urls') video_urls = [] title = video description = video thumbnail = video id = video p_url = 'http://www.bilibili.com/video/' + id html = utils.get_page_content(p_url) self._print_info(p_url) self._print_info('Video url fetched succeeded!') attrs = re.compile("<option value='(.+?)'>(.+?)</option>").findall( html) parts = False for i in attrs: parts = True url = 'http://www.bilibili.com' + i[0] self._print_info(url) video_urls.append({ 'title': i[1], 'link': i[0], 'part': i[0], 'category': category, 'description': i[1], 'thumbnail': i[1], 'published': url }) if parts: self._print_info('has parts') else: self._print_info('no parts') self._print_info('End of fetch') return video_urls
def get_anime_series(self, anime): self._print_info('Getting ANIME Item') anime_url = self.BASE_URL + anime self._print_info(anime_url) #parse_result = feedparser.parse(cat_url) self._print_info('ANIME Item fetched succeeded!') html = utils.get_page_content(anime_url) #attrs = re.compile('<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview"><img src="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title">(.+?)</a>').findall(html) temp = [] html = html.replace('\r', '') html = html.replace('\n', '') spid = re.compile('var spid = "(.+?)";').findall(html) series = re.compile('<option value="(.+?)">(.+?)</option>').findall( html) if len(spid): spid = spid[0] self._print_info('spid ' + spid) if len(series): for s in series: self._print_info('title' + s[1]) self._print_info('series number ' + s[0]) link = spid + '-' + s[0] #http://www.bilibili.com/sppage/bangumi-13294-1816-1.html temp.append({ 'title': s[1], 'link': link, 'spid': spid, 'seriesid': s[1], 'type': 'series', 'thumbnail': s[0], 'published': s[0] }) else: temp = self.get_next_page(temp, spid, 1) return temp
def get_hotjson_items(self, type, category): self._print_info('Getting HOT CAT JSON Items') self._print_info(category) json_url = 'http://www.bilibili.com/index/rank/all-' + type + '-' + category + '.json' self._print_info(json_url) self._print_info('HOT CAT Items fetched succeeded!') html = utils.get_page_content(json_url) data = json.loads(html) data = (data['rank']['list']) temp = [] for i in range(0, len(data)): #pprint(data[i]) aid = 'av' + str(data[i]['aid']) title = (data[i]['title']) author = (data[i]['author']) description = (data[i]['description']) mid = (data[i]['mid']) pic = (data[i]['pic']) link = 'http://www.bilibili.com/video/' + str(aid) temp.append({ 'title': title, 'link': link, 'category': category, 'description': description, 'thumbnail': pic, 'published': aid }) #temp.append({ # 'title': title, # 'link': 'http://www.bilibili.com/video/av'+aid, # 'category':category, # 'description':description, # 'thumbnail':pic, # 'published': 'av'+aid }) return temp
def login(self, userid, pwd, captcha): #utils.get_page_content('http://www.bilibili.com') if self.is_login == True: return True, '' pwd = self.get_encryped_pwd(pwd) data = 'cType=2&vcType=1&captcha={}&user={}&pwd={}&keep=true&gourl=http://www.bilibili.com/'.format( captcha, userid, pwd) result = utils.get_page_content( LOGIN_URL, data, { 'Origin': 'https://passport.bilibili.com', 'Referer': 'https://passport.bilibili.com/login' }) if not requests.utils.dict_from_cookiejar( self.cj).has_key('DedeUserID'): return False, LOGIN_ERROR_MAP[json.loads(result)['code']] self.cj.save() self.is_login = True self.mid = str( requests.utils.dict_from_cookiejar(self.cj)['DedeUserID']) return True, ''
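# Hedged sketch of how get_captcha, get_encryped_pwd and login above fit together.
# `client` stands for a hypothetical object exposing those methods; it is not defined in
# the original sources, and raw_input matches the Python 2 style of the surrounding code.
def interactive_login(client, userid, password):
    captcha_path = client.get_captcha()        # download the captcha image to a temp file
    print('Captcha image saved to ' + captcha_path)
    captcha = raw_input('Enter the captcha text: ')
    ok, message = client.login(userid, password, captcha)  # login() encrypts the password itself
    return ok, message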
def get_tag_category(self, cat_id): url = "http://www.bilibili.com/index/catalog_tags.json" html = utils.get_page_content(url) jsondata = json.loads(html) self._print_info('get_tag_category') items = [] for i in jsondata: if i == cat_id: _jsondata = jsondata[i] for j in _jsondata: items.append({ 'label': j, 'thumbnail': j, 'path': json.dumps(j, ensure_ascii=False).encode('utf8') }) return items
def get_category_list(self, tid=0, order='default', days=30, page=1, pagesize=10): params = { 'tid': tid, 'order': order, 'days': days, 'page': page, 'pagesize': pagesize } url = LIST_URL.format(self.api_sign(params)) result = json.loads(utils.get_page_content(url)) results = [] for i in range(pagesize): if result['list'].has_key(str(i)): results.append(result['list'][str(i)]) else: break return results, result['pages']
def get_next_page(self, list, bangumi, page): p_url = 'http://www.bilibili.com/sppage/bangumi-' + str( bangumi) + '-' + str(page) + '.html' html = utils.get_page_content(p_url) html = html.replace('\r', '') html = html.replace('\n', '') checknext = re.compile('<div class="no_more">(.+?)</div>').findall( html) if len(checknext): if checknext[0] == "没有更多信息": return list else: thumbnail = "" cover = re.compile( '<div class="cover"><img src="(.+?)"></div>').findall(html) if len(cover): thumbnail = cover[0] series = re.compile( '<a class="t" href="/video/(.+?)" target="_blank" title="(.+?)">(.+?)</a>' ).findall(html) if len(series): for s in series: list.append({ 'title': s[1].strip(), 'link': s[0].strip(), 'type': 'bangumi', 'page': page, 'thumbnail': thumbnail, 'published': s[0].strip() }) return self.get_next_page(list, bangumi, page + 1)
def get_anime_list(self, list): self._print_info('Getting ANIME LIST') anime_url = self.BASE_URL + list self._print_info(anime_url) #parse_result = feedparser.parse(cat_url) self._print_info('ANIME LIST fetched succeeded!') html = utils.get_page_content(anime_url) #attrs = re.compile('<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview"><img src="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title">(.+?)</a>').findall(html) temp = [] #temp= [{ # 'title': i[3], # 'description': i[3], # 'link':'', # 'category':category, # 'thumbnail':i[1], # 'published': i[0]} for i in attrs] #for t in temp: # for tt in t: # self._print_info(t[tt]) html = html.replace('\r', '') html = html.replace('\n', '') anime = re.compile('<ul class="v_ul">(.+?)</ul>').findall(html) if len(anime): anime = anime[0].replace('\r', '') anime = anime.replace('\n', '') anime = re.compile('<li>(.+?)</li>').findall(anime) for _anime in anime: #print _anime #_anime=_anime.replace(' ', '') #print _anime cover = re.compile('<div class="cover">(.+?)</div>').findall( _anime) info_wrp = re.compile( '<div class="info_wrp">(.+?)</div>').findall(_anime) info_series = re.compile('<p class="num">(.+?)</p>').findall( _anime) image = "" link = "" title = "" if len(cover): _cover = re.compile( '<a href="(.+?)" target="_blank"><img src="(.+?)" /></a>' ).findall(cover[0]) if len(_cover): image = _cover[0][1] link = _cover[0][0] if len(info_wrp): _info_wrp = re.compile( '<a title="(.+?)" href="(.+?)" target="_blank">(.+?)</a>' ).findall(info_wrp[0]) if len(_info_wrp): title = _info_wrp[0][0] if len(info_series): info_series = info_series[0].replace('<b>', '') info_series = info_series.replace('</b>', '') title = title + " " + info_series temp.append({ 'title': title, 'link': link, 'type': 'sp', 'thumbnail': image, 'published': link }) pager = re.compile('<div class="pagelistbox">(.+?)</div>').findall( html) if len(pager): links = re.compile('href="(.+?)">(.+?)</a>').findall(pager[0]) for p in links: if p[1] == '下页' or p[1] == '末页' or p[1] == '首页 ' or p[ 1] == '上页 ': temp.append({ 'title': p[1], 'link': p[0], 'type': 'list', 'thumbnail': p[1], 'published': p[0] }) else: temp.append({ 'title': '第' + p[1] + '页', 'link': p[0], 'type': 'list', 'thumbnail': p[1], 'published': p[0] }) return temp
def get_tag_videos(self, cat_id, tagname, page=1): print tagname tagname = tagname[1:-1] print tagname print tagname url = "http://www.bilibili.com/index/tag/" + cat_id + "/default/" + page + "/" + tagname + ".json" print url html = utils.get_page_content(url) jsondata = json.loads(html) self._print_info('get_tag_videos') print jsondata items = [] item = 0 for i in jsondata: _jsondata = jsondata[i] if type(_jsondata) is list: for j in _jsondata: items.append({ 'label': j['title'], 'thumbnail': j['pic'], 'type': 'video', 'path': 'av' + j['aid'] }) item = item + 1 else: if i == 'results': results = _jsondata if i == 'num': num = _jsondata if i == 'pages': self._print_info('page: ' + page) pages = _jsondata items.append({ 'label': '首页', 'thumbnail': '首页', 'type': 'pager', 'path': 1 }) if int(page) < int(pages): items.append({ 'label': '下一页', 'thumbnail': '下一页', 'type': 'pager', 'path': (int(page) + 1) }) if int(page) > 1: items.append({ 'label': '上一页', 'thumbnail': '上一页', 'type': 'pager', 'path': (int(page) - 1) }) items.append({ 'label': '末页', 'thumbnail': '末页', 'type': 'pager', 'path': int(pages) }) return items
category_re['en'] = re.compile(r'\[\[Category:(.+?)(?:\|.*?)?\]\]') category_re['fr'] = re.compile(r'\[\[Cat\xe9gorie:(.+?)\]\]') for rg_id, rg_gid, rg_name, ac_name, rg_sec_types, processed in db.execute(query, query_params): colored_out(bcolors.OKBLUE, 'Looking up release group "%s" http://musicbrainz.org/release-group/%s' % (rg_name, rg_gid)) matches = wps.query(escape_query(rg_name), defType='dismax', qf='name', rows=100).results last_wp_request = time.time() for match in matches: title = match['name'] if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(rg_name) and mangle_name(title) != mangle_name(rg_name): continue delay = time.time() - last_wp_request if delay < 1.0: time.sleep(1.0 - delay) last_wp_request = time.time() page_orig = get_page_content(wp, title, wp_lang) if not page_orig: continue page_title = title url = 'http://%s.wikipedia.org/wiki/%s' % (wp_lang, quote_page_title(page_title),) colored_out(bcolors.HEADER, ' * trying article %s' % (title,)) page = mangle_name(page_orig) is_canonical, reason = wp_is_canonical_page(title, page_orig) if (not is_canonical): out(' * %s, skipping' % reason) continue categories = category_re[wp_lang].findall(page_orig) is_album_page = False for category in categories:
def add_history(self, aid, cid): url = ADD_HISTORY_URL.format(str(cid), str(aid)) utils.get_page_content(url)