def BaiduPanHandler(url):
    import json
    o = urlparse.urlparse(url)
    if not o[1] or not o[1].endswith(('pan.baidu.com', 'yun.baidu.com')):
        return None

    # For simplicity, use a community-made site to resolve the real download link.
    # To reduce this dependency later, the code from
    # https://github.com/banbanchs/pan-baidu-download
    # and https://github.com/xuanqinanhai/bleed-baidu-white
    # could be integrated here instead.
    url = 'http://daimajia.duapp.com/baidu/?url=%s' % url
    opener = URLOpener()
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        return None

    linkinfo = json.loads(result.content.decode('utf-8'))
    filename = linkinfo.get('name')
    if '\u' in filename:
        try:
            filename = filename.decode('unicode-escape')
        except:
            pass
    link = linkinfo.get('download')
    return (filename, link) if link else None
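# Hedged usage sketch (not in the original source): how a caller might consume the
# (filename, link) tuple returned above. The share URL below is hypothetical, and the
# sketch assumes urlparse/URLOpener are already imported by this module as the handler expects.
#
#   info = BaiduPanHandler('http://pan.baidu.com/s/1exampleShareId')
#   if info:
#       filename, link = info
#       print('resolved %s -> %s' % (filename, link))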
def ParseFeedUrls(self):
    urls = []  # holds the article tuples
    # Loop over the topic pages defined in self.feeds
    for feed in self.feeds:
        # Unpack the topic title and its URL
        topic, url = feed[0], feed[1]
        # Request the topic page
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        # If the request succeeded and the page is not empty
        if result.status_code == 200 and result.content:
            # Parse the page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'lxml')
            # Find all article entries in the list on this page
            items = soup.find_all(name='div', id='list')
            # Process each article entry
            for item in items:
                title = item.a.string  # article title
                link = item.a.get('href')  # article link
                link = BaseFeedBook.urljoin(url, link)  # resolve to an absolute URL
                urls.append((topic, title, link, None))  # append the article tuple
        # If the request failed, report it in the log
        else:
            self.log.warn('Fetch article failed(%s):%s' %
                          (URLOpener.CodeMap(result.status_code), url))
    # Return all collected article tuples
    return urls
def postprocess(self, content):
    pn = re.compile(ur'<a href="(\S*?)">本话题在纽约时报有.*?条讨论,点击查看。</a>', re.I)
    comment = ''
    mt = pn.search(content)
    url = mt.group(1) if mt else None
    if url:
        opener = URLOpener(url, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    comment = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    return content

            pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
            mt = pn.search(comment)
            if mt:
                comment_json = mt.group(1)
                j = json.loads(comment_json)
                soup = BeautifulSoup(content, "lxml")
                for c in j['comments']:
                    u = c['user']['screen_name']
                    t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']), "lxml")
                    for img in t.find_all('img', alt=True):
                        img.replace_with(t.new_string(img['alt']))
                    soup.html.body.append(t.p)
                content = unicode(soup)
    return content
def fetcharticle(self, url, decoder):
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    if self.page_encoding:
        content = content.decode(self.page_encoding)
    else:
        content = decoder.decode(content, url)

    m = re.search(r'<iframe.*?src="(.*?)".*?>', content)
    if m:
        newurl = m.group(1)
        result = opener.open(newurl)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, newurl))
            return None
        if self.page_encoding:
            content = content.decode(self.page_encoding)
        else:
            content = decoder.decode(content, newurl)
    return content
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    tnow = datetime.datetime.utcnow()
    urladded = set()

    for feed in self.feeds:
        section, url = feed[0], feed[1].replace('gzh', 'gzhjs')
        isfulltext = feed[2] if len(feed) > 2 else False
        timeout = self.timeout + 10 if isfulltext else self.timeout
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    content = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
            else:
                content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
            content = content[content.index('{'):content.index('}') + 1]
            content = json.loads(content)

            for e in content['items'][:self.max_articles_per_feed]:
                e = feedparser.parse(e)['entries'][0]
                updated = None
                if hasattr(e, 'lastmodified') and e.lastmodified:
                    updated = float(e.lastmodified)

                if self.oldest_article > 0 and updated:
                    updated = datetime.datetime.utcfromtimestamp(updated)
                    delta = tnow - updated
                    if self.oldest_article > 365:
                        threshold = self.oldest_article  # interpreted as seconds
                    else:
                        threshold = 86400 * self.oldest_article  # interpreted as days
                    if delta.days * 86400 + delta.seconds > threshold:
                        self.log.info("Skip old article(%s): %s" %
                                      (updated.strftime('%Y-%m-%d %H:%M:%S'), e.href))
                        continue

                # prefer HTTPS links if the feed itself is served over HTTPS
                if hasattr(e, 'href'):
                    if url.startswith('https://'):
                        urlfeed = e.href.replace('http://', 'https://')
                    else:
                        urlfeed = e.href
                    if urlfeed in urladded:
                        continue
                else:
                    urlfeed = ''

                desc = None
                urls.append((section, e.title, urlfeed, desc))
                urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.status_code, url))

    return urls
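# Hedged sketch of the age-threshold rule used above (illustration only, not original code):
# oldest_article values above 365 are treated as seconds, smaller values as days.
#
#   def _age_threshold_seconds(oldest_article):
#       return oldest_article if oldest_article > 365 else 86400 * oldest_article
#
#   _age_threshold_seconds(7)     # 604800 -> keep articles younger than 7 days
#   _age_threshold_seconds(3600)  # 3600   -> keep articles younger than 1 hour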
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    url = r'http://cctv.cntv.cn/lm/xinwenyijiayi/video/index.shtml'
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
        return []

    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)

    list_pattern = re.compile(r'{\'title\':\'.*?\'<!--VIDEOSTR-->\'}', re.S)
    file_name_search = re.compile(r'\d{4}/\d{2}/\d{2}').search
    l = re.findall(list_pattern, content)
    tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    for i in l[:5]:
        item = eval(i)
        try:
            pubdate = datetime.datetime.strptime(file_name_search(item["link_add"]).group(0), '%Y/%m/%d')
        except Exception as e:
            continue
        delta = tnow - pubdate
        if self.oldest_article > 0 and delta.days > self.oldest_article:
            continue
        urls.append((u'新闻1+1', item['title'], item['link_add'], None))
    return urls
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("http://"):
        url = url.replace('http://', 'https://')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    allComicTable = soup.find_all('table', {'width': '800', 'align': 'center'})
    if (allComicTable is None):
        self.log.warn('allComicTable is not exist.')
        return chapterList

    for comicTable in allComicTable:
        comicVolumes = comicTable.find_all('a', {'target': '_blank'})
        if (comicVolumes is None):
            self.log.warn('comicVolumes is not exist.')
            return chapterList
        for volume in comicVolumes:
            href = self.urljoin(self.host, volume.get("href"))
            chapterList.append((unicode(volume.string), href))

    return chapterList
def SaveToInstapaper(self, user, action, orgUrl):
    web.header('Content-type', "text/html; charset=utf-8")
    T_INFO = u"""<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <title>%s</title></head><body><p style="text-align:center;font-size:1.5em;">%s</p></body></html>"""
    if not user.instapaper_username or not user.instapaper_password:
        info = T_INFO % ('No authorize info',
                         'Instapaper username and password have to be provided first!<br/>Please fill them in your KindleEar application.')
        return info.encode('utf-8')

    title = web.input().get('t', '')
    name = web.input().get("n", '')
    if user.instapaper_username != name:
        info = T_INFO % ('Action rejected',
                         'Username does not match!<br/>KindleEar refuses to execute your command.')
        return info.encode('utf-8')

    opener = URLOpener()
    password = ke_decrypt(user.instapaper_password, user.secret_key or '')
    apiParameters = {'username': user.instapaper_username, 'password': password,
                     'title': title.encode('utf-8'), 'selection': 'KindleEar', 'url': orgUrl}
    ret = opener.open(INSTAPAPER_API_ADD_URL, data=apiParameters)
    if ret.status_code in (200, 201):
        info = _("'%s'<br/><br/>Saved to your Instapaper account.") % title
        info += '<br/><p style="text-align:right;color:red;">by KindleEar </p>'
        info = T_INFO % ('Saved to Instapaper', info)
    elif ret.status_code == 403:
        info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Invalid username or password.") % title
        info += '<br/><p style="text-align:right;color:red;">by KindleEar </p>'
        info = T_INFO % ('Failed to save', info)
    else:
        info = _("Failed save to Instapaper<br/>'%s'<br/><br/>Reason : Unknown(%d).") % (title, ret.status_code)
        info += '<br/><p style="text-align:right;color:red;">by KindleEar </p>'
        info = T_INFO % ('Failed to save', info)
    return info.encode('utf-8')
def POST(self):
    user = self.getcurrentuser(forAjax=True)
    web.header('Content-Type', 'application/json')
    webInput = web.input()
    category = webInput.get('category', '')
    title = webInput.get('title')
    feedUrl = webInput.get("url")
    isfulltext = bool(webInput.get('isfulltext', '').lower() == 'true')
    creator = webInput.get('creator', '')

    if not title or not feedUrl:
        return json.dumps({'status': _("Title or Url is empty!")})

    opener = URLOpener()
    srvUrl = urlparse.urljoin('http://kindleear.appspot.com/', SharedLibrarykindleearAppspotCom.__url__)
    data = {'category': category,
            'title': title,
            'url': feedUrl,
            'creator': creator,
            'isfulltext': 'true' if isfulltext else 'false',
            'key': 'kindleear.lucky!'}
    result = opener.open(srvUrl, data)
    if result.status_code == 200 and result.content:
        return result.content
    else:
        return json.dumps({'status': _('Cannot submit data to kindleear.appspot.com, status: %s' %
                                       URLOpener.CodeMap(result.status_code))})
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("https://www.manhuagui.com"):
        url = url.replace('https://www.manhuagui.com', 'https://m.manhuagui.com')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    invisible_input = soup.find("input", {"id": '__VIEWSTATE'})
    if invisible_input:
        lz_encoded = invisible_input.get("value")
        lz_decoded = decompressFromBase64(lz_encoded)
        soup = BeautifulSoup(lz_decoded, 'html.parser')
    else:
        soup = soup.find("div", {"class": 'chapter-list', "id": 'chapterList'})

    lias = soup.findAll('a')
    for aindex in range(len(lias)):
        rindex = len(lias) - 1 - aindex
        href = "https://m.manhuagui.com" + lias[rindex].get("href")
        chapterList.append(href)
    return chapterList
def ParseFeedUrls(self):
    """ return list like [(section,title,url),..] """
    urls = []
    for feed in self.feeds:
        section, url = feed[0], feed[1]
        isfulltext = feed[2] if len(feed) > 2 else False
        timeout = CONNECTION_TIMEOUT + 15 if isfulltext else CONNECTION_TIMEOUT
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                feed = feedparser.parse(result.content.decode(self.feed_encoding))
            else:
                feed = feedparser.parse(AutoDecoder().decode(result.content))

            urladded = []  # guard against duplicate articles produced by some RSS feeds
            for e in feed['entries'][:self.max_articles_per_feed]:
                url = e.link
                if url not in urladded:
                    if isfulltext:
                        desc = e.content[0].value if hasattr(e, 'content') and e.content[0].value else e.summary
                        urls.append((section, e.title, url,
                                     desc if desc else u'Has no summary, is it fulltext feed?'))
                    else:
                        urls.append((section, e.title, url, None))
                    urladded.append(url)
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.status_code, url))
    return urls
def get_chapter_list_from_mobile_url(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(addreferer=False, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn("fetch comic page failed: %s" % result.status_code)
        return []

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    if "obj_id" not in content:
        self.log.warn(u"Can't find obj_id from {}".format(url))
        return []

    comic_id = re.search('obj_id = "(\d+)"', content).group(1)
    data_match = re.search("initIntroData\(([^;]+)\);", content)
    if not data_match:
        return self.get_chapter_list_from_api(comic_id)

    datas = json.loads(data_match.group(1))
    chapter_datas = []
    for data in datas:
        chapter_datas += data["data"]
    if not chapter_datas:
        return self.get_chapter_list_from_api(comic_id)

    chapter_datas.sort(key=lambda d: d["id"])
    chapters = []
    for chapter in chapter_datas:
        chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
            chapter_id=chapter["id"], comic_id=comic_id)
        chapters.append((chapter["chapter_name"], chapter_url))
    return chapters
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host)
    chapterList = []

    url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn(
            "fetch comic page failed: {} (status code {}, content {})".format(
                url, result.status_code, result.content))
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, "html.parser")

    chapter_datas = []
    for link in soup.find_all("a", {"class": "chapteritem"}):
        chapter_datas.append({
            "chapter_id": int(re.search("m(\d+)", link.get("href")).group(1)),
            "chapter_title": unicode(link.string),
        })
    chapter_datas.sort(key=lambda d: d["chapter_id"])
    for chapter in chapter_datas:
        chapter_url = "http://www.manhuaren.com/m{}/".format(chapter["chapter_id"])
        chapterList.append((chapter["chapter_title"], chapter_url))
    return chapterList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(url)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn(
            "fetch comic page failed: {} (status code {}, content {})".format(
                url, result.status_code, result.content))
        return []

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, "html.parser")
    scripts = soup.findAll("script", {"type": "text/javascript"})
    packed_js = None
    for script in scripts:
        if "newImgs" in script.text:
            packed_js = script.text
            break
    if not packed_js:
        self.log.warn("Can't find js")
        return []

    codes = decode_packed_codes(packed_js)
    return re.findall("'(.+?)'", codes)
def getImgList(self, chapterJson, comic_id):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    cid = list(chapterJson.keys())[0]
    getImgListUrl = 'http://ac.qq.com/ComicView/index/id/{0}/cid/{1}'.format(comic_id, cid)
    result = opener.open(getImgListUrl)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % getImgListUrl)
        return imgList

    content = result.content
    cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
    filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
    if len(filter_result) != 0:
        base64data = filter_result[0][1:]
        img_detail_json = json.loads(base64.decodestring(base64data))
        for img_url in img_detail_json.get('picture', []):
            if ('url' in img_url):
                imgList.append(img_url['url'])
            else:
                self.log.warn('no url in img_url:%s' % img_url)
    return imgList
def getChapterList(self, comic_id):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []
    getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(comic_id)
    result = opener.open(getChapterListUrl)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % getChapterListUrl)
        return chapterList

    content = result.content
    content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
    contentJson = json.loads(content)
    count = contentJson.get('length', 0)
    if (count != 0):
        for i in range(count + 1):
            for item in contentJson:
                if isinstance(contentJson[item], dict) and contentJson[item].get('seq') == i:
                    chapterList.append({item: contentJson[item]})
                    break
    else:
        self.log.warn('comic count is zero.')
    return chapterList
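# Hedged note (assumption about the JSON shape implied above: every chapter entry is a
# dict carrying a numeric 'seq'): the nested scan over range(count + 1) can also be
# expressed as a single sort, which avoids the quadratic lookup. Sketch only:
#
#   items = [(k, v) for k, v in contentJson.items() if isinstance(v, dict) and 'seq' in v]
#   items.sort(key=lambda kv: kv[1]['seq'])
#   chapterList = [{k: v} for k, v in items]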
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    for feed in self.feeds:
        section, url = feed[0], feed[1]
        isfulltext = feed[2] if len(feed) > 2 else False
        timeout = self.timeout + 10 if isfulltext else self.timeout
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                feed = feedparser.parse(result.content.decode(self.feed_encoding))
            else:
                feed = feedparser.parse(AutoDecoder().decode(result.content))

            urladded = set()  # guard against duplicate articles produced by some RSS feeds
            for e in feed['entries'][:self.max_articles_per_feed]:
                # prefer HTTPS links if the feed itself is served over HTTPS
                urlfeed = e.link.replace('http://', 'https://') if url.startswith('https://') else e.link
                if urlfeed not in urladded:
                    desc = None
                    if isfulltext:
                        if hasattr(e, 'content') and e.content[0].value:
                            desc = e.content[0].value
                        elif hasattr(e, 'summary'):
                            desc = e.summary
                        else:
                            self.log.warn('feed item invalid, link to webpage for article.(%s)' % e.title)
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.status_code, url))
    return urls
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    url = r'http://cctv.cntv.cn/lm/jiaodianfangtan/index.shtml'
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
        return []

    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)

    soup = BeautifulSoup(content, 'lxml')
    file_name_search = re.compile(r'\d{4}/\d{2}/\d{2}').search
    tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    for li in soup.find_all('div', attrs={'class': 'text'}):
        a = li.find('a')
        href = a['href']
        try:
            pubdate = datetime.datetime.strptime(file_name_search(href).group(0), '%Y/%m/%d')
        except Exception as e:
            continue
        delta = tnow - pubdate
        if self.oldest_article > 0 and delta.days > self.oldest_article:
            continue
        urls.append((u'焦点访谈', a.string, href, None))
    return urls
def getImgUrlList(self, url):
    imgUrlList = []
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    sel = soup.find('select')  # page-number row: extract every page from it
    if (sel is None):
        self.log.warn('soup select is not exist.')
        return None

    ulist = sel.find_all('option') if sel else None
    if not ulist:
        self.log.warn('select option is not exist.')
        return None

    for ul in ulist:
        if ul.get('value') == None:
            ulist.remove(ul)
        else:
            href = self.host + '/comic/' + ul.get('value')
            imgUrlList.append(href)
    return imgUrlList
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    tnow = datetime.utcnow()
    urladded = set()

    for feed in self.feeds:
        section, url = feed[0], feed[1]
        partition = feed[2]
        timeout = 30
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            feed = json.loads(result.content.decode(self.feed_encoding))

            for item in feed[partition]:
                for e in item['items']:
                    # prefer HTTPS links if the feed itself is served over HTTPS
                    urlfeed = e['share_url'].replace('http://', 'https://') if url.startswith('https://') else e['share_url']
                    if urlfeed in urladded:
                        self.log.warn('skipped %s' % urlfeed)
                        continue

                    desc = None
                    urls.append((section, e['title'], urlfeed, desc))
                    urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.status_code, url))

    #self.log.warn('%s' % json.dumps(urls))
    return urls
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("https://m.733.so"):
        url = url.replace('https://m.733.so', 'https://www.733.so')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    soup = soup.find('div', {"class": "cy_plist"})
    if (soup is None):
        self.log.warn('cy_plist is not exist.')
        return chapterList

    lias = soup.findAll('a')
    if (lias is None):
        self.log.warn('chapterList href is not exist.')
        return chapterList

    for aindex in range(len(lias)):
        rindex = len(lias) - 1 - aindex
        href = "https://www.733.so" + lias[rindex].get("href")
        chapterList.append((lias[rindex].get_text(), href))
    return chapterList
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    urladded = set()
    url = self.url4forwarder(self.feeds[0][1])
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    if result.status_code == 200 and result.content:
        feed = json.loads(result.content.decode(self.feed_encoding))

        for partition, section in self.partitions:
            for item in feed[partition]:
                urlfeed = item['share_url']
                if urlfeed in urladded:
                    self.log.info('duplicated, skipped %s' % urlfeed)
                    continue

                urls.append((section, item['title'], self.url4forwarder(urlfeed), None))
                urladded.add(urlfeed)
    else:
        self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url))

    return urls

#def fetcharticle(self, url, opener, decoder):
#    result = opener.open(self.url4forwarder(url))
#    status_code, content = result.status_code, result.content
#    if status_code != 200 or not content:
#        self.log.warn('fetch article failed(%s):%s.' % (URLOpener.CodeMap(status_code), url))
#        return None
#
#    if self.page_encoding:
#        return content.decode(self.page_encoding)
#    else:
#        return decoder.decode(content, url, result.headers)
def POST(self, verType):
    INSTAPAPER_API_AUTH_URL = "https://www.instapaper.com/api/authenticate"
    web.header('Content-Type', 'application/json')
    respDict = {'status': 'ok', 'correct': 0}

    if verType.lower() != 'instapaper':
        respDict['status'] = _('Request type[%s] unsupported') % verType
        return json.dumps(respDict)

    user = self.getcurrentuser()
    username = web.input().get('username', '')
    password = web.input().get('password', '')

    opener = URLOpener()
    apiParameters = {'username': username, 'password': password}
    ret = opener.open(INSTAPAPER_API_AUTH_URL, data=apiParameters)
    if ret.status_code in (200, 201):
        respDict['correct'] = 1
    elif ret.status_code == 403:
        respDict['correct'] = 0
    else:
        respDict['status'] = _("The Instapaper service encountered an error. Please try again later.")

    return json.dumps(respDict)
def get_chapter_list_from_api(self, comic_id):
    opener = URLOpener(addreferer=False, timeout=60)
    json_result = opener.open(
        "http://v3api.dmzj.com/comic/{comic_id}.json".format(comic_id=comic_id))
    if json_result.status_code != 200 or not json_result.content:
        self.log.info(
            "fetch v3 chapter list failed: %s, try v2" % json_result.status_code)
        json_result = opener.open(
            "http://v2.api.dmzj.com/comic/{comic_id}.json?channel=Android&version=2.6.004".format(
                comic_id=comic_id))
        if json_result.status_code != 200 or not json_result.content:
            self.log.warn(
                "fetch v2 chapter list failed: %s" % json_result.status_code)
            return []

    data = json.loads(json_result.content)
    chapter_datas = []
    for chapters_data in data["chapters"]:
        chapter_datas += chapters_data["data"]
    chapter_datas.sort(key=lambda d: d["chapter_id"])
    chapters = []
    for chapter in chapter_datas:
        chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
            chapter_id=chapter["chapter_id"], comic_id=comic_id)
        chapters.append((chapter["chapter_title"], chapter_url))
    return chapters
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    scripts = soup.findAll("script", {"type": "text/javascript"})
    for script in scripts:
        if script.text != "":
            raw_content = script.text
            break

    res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
    lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
    lz_decoded = decompressFromBase64(lz_encoded)
    res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)",
                 "'%s'.split('|')" % (lz_decoded), res)
    codes = self.get_node_online(res)
    pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

    cid = self.getChapterId(url)
    md5 = pages_opts["sl"]["md5"]
    images = pages_opts["images"]
    for img in images:
        img_url = u'https://i.hamreus.com{}?cid={}&md5={}'.format(img, cid, md5)
        imgList.append(img_url)
    return imgList
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    url = r'http://opinion.people.com.cn/GB/40604/index.html'
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
        return []

    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)

    soup = BeautifulSoup(content, 'lxml')
    box = soup.find('div', attrs={'class': 'p2j_list'})
    for li in box.find_all('li'):
        a = li.find('a')
        # print a['href'], a.string
        title = a.string
        if u'人民日报' in title:
            urls.append((u'人民日报', a.string, r'%s%s' % (r'http://opinion.people.com.cn', a['href']), None))
        else:
            urls.append((u'人民网', a.string, r'%s%s' % (r'http://opinion.people.com.cn', a['href']), None))
    return urls
def postprocess(self, content):
    pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>', re.I)
    comment = ''
    mt = pn.search(content)
    url = mt.group(1) if mt else None
    if url:
        opener = URLOpener(url, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    comment = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    return content

            pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
            mt = pn.search(comment)
            comment_json = mt.group(1) if mt else None
            if comment_json:
                j = json.loads(comment_json)
                soup = BeautifulSoup(content, "lxml")
                for c in j['comments']:
                    u = c['user']['screen_name']
                    t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']), "lxml")
                    for img in t.find_all('img', alt=True):
                        img.replace_with(t.new_string(img['alt']))
                    soup.html.body.append(t.p)
                content = unicode(soup)
    return content
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
    if ((u"mh" in urlpaths) and (urlpaths.index(u"mh") + 2 < len(urlpaths))):
        tid = str(time.time()).replace(".", "1")
        if len(tid) == 12:
            tid = tid + "1"
        cid = urlpaths[urlpaths.index(u"mh") + 1]
        pid = urlpaths[urlpaths.index(u"mh") + 2].replace(".html", "")
    else:
        self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

    res = re.search(r'var qTcms_S_m_murl_e=".*";', content).group()
    if (res is None):
        self.log.warn('var qTcms_S_m_murl_e is not exist.')
        return imgList

    list_encoded = res.split('\"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if (images is None):
        self.log.warn('image list is not exist.')
        return imgList

    for img in images:
        if "http://www.baidu1.com/" in img:
            b64str = img.replace("http://www.baidu1.com/", "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        elif "http://ac.tc.qq.com/" in img:
            b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        else:
            img_url = img

        self.log.info('The image href is: %s' % img_url)
        imgList.append(img_url)
    return imgList
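# Hedged helper sketch (assumption only, it simply mirrors the string handling above):
# building the img_733.234us.com proxy URL for a single image path.
#
#   def _build_proxy_img_url(img, tid, cid, pid):
#       b64str = img.replace("http://www.baidu1.com/", "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
#       return u'http://img_733.234us.com/newfile.php?data={}'.format(b64encode(b64str))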
def SendNewSubscription(self, title, url):
    opener = URLOpener()
    path = SharedLibraryMgrkindleearAppspotCom.__url__.split('/')
    path[-1] = 'subscribedfromshared'
    srvUrl = urlparse.urljoin('http://kindleear.appspot.com/', '/'.join(path))
    data = {'title': title, 'url': url}
    result = opener.open(srvUrl, data)  # fire and forget: no need to check whether it succeeded
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    tnow = datetime.datetime.utcnow()
    urladded = set()

    for feed in self.feeds:
        section, url = feed[0], feed[1]
        isfulltext = feed[2] if len(feed) > 2 else False
        timeout = self.timeout + 10 if isfulltext else self.timeout
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    content = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    content = AutoDecoder(True).decode(result.content, url)
            else:
                content = AutoDecoder(True).decode(result.content, url)
            feed = feedparser.parse(content)

            for e in feed['entries'][:self.max_articles_per_feed]:
                updated = None
                if hasattr(e, 'updated_parsed') and e.updated_parsed:
                    updated = e.updated_parsed
                elif hasattr(e, 'published_parsed') and e.published_parsed:
                    updated = e.published_parsed
                elif hasattr(e, 'created_parsed'):
                    updated = e.created_parsed

                if self.oldest_article > 0 and updated:
                    delta = tnow - datetime.datetime(*(updated[0:6]))
                    if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article:
                        self.log.info("Skip old article: %s" % e.link)
                        continue

                # prefer HTTPS links if the feed itself is served over HTTPS
                urlfeed = e.link.replace('http://', 'https://') if url.startswith('https://') else e.link
                if urlfeed in urladded:
                    continue

                desc = None
                if isfulltext:
                    if hasattr(e, 'description'):
                        desc = e.description
                    elif hasattr(e, 'content') and e.content[0]['value']:
                        desc = e.content[0]['value']
                    else:
                        self.log.warn('fulltext feed item has no desc, link to webpage for article.(%s)' % e.title)
                urls.append((section, e.title, urlfeed, desc))
                urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.status_code, url))

    return urls
def fetcharticle(self, url, decoder):
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(self.url4forwarder(url))
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    if self.page_encoding:
        return content.decode(self.page_encoding)
    else:
        return decoder.decode(content, url)
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    tnow = datetime.utcnow()
    urladded = set()

    for feed in self.feeds:
        section, url = feed[0], feed[1]
        isfulltext = feed[2] if len(feed) > 2 else False
        timeout = self.timeout + 10 if isfulltext else self.timeout
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                content = result.content.decode(self.feed_encoding)
            else:
                content = AutoDecoder(True).decode(result.content, url)
            feed = feedparser.parse(content)

            for e in feed['entries'][:self.max_articles_per_feed]:
                if self.oldest_article > 0 and hasattr(e, 'updated_parsed'):
                    updated = e.updated_parsed
                    if updated:
                        delta = tnow - datetime(*(updated[0:6]))
                        if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article:
                            self.log.debug("article '%s' is too old" % e.title)
                            continue

                # prefer HTTPS links if the feed itself is served over HTTPS
                urlfeed = e.link.replace('http://', 'https://') if url.startswith('https://') else e.link
                if urlfeed in urladded:
                    continue

                desc = None
                if isfulltext:
                    if hasattr(e, 'content') and e.content[0]['value']:
                        desc = e.content[0]['value']
                    elif hasattr(e, 'description'):
                        desc = e.description
                    else:
                        self.log.warn('fulltext feed item has no desc, link to webpage for article.(%s)' % e.title)
                urls.append((section, e.title, urlfeed, desc))
                urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.status_code, url))

    return urls
def getRealUrl(self, url, try_count=1):
    if try_count > 3:
        return url
    try:
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url, None, self.http_headers)
        if result.status_code > 400:
            return self.getRealUrl(url, try_count + 1)
        else:
            return opener.realurl
    except:
        return self.getRealUrl(url, try_count + 1)
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
    if ((u"mh" in urlpaths) and (urlpaths.index(u"mh") + 2 < len(urlpaths))):
        tid = str(time.time()).replace(".", "1")
        if len(tid) == 12:
            tid = tid + "1"
        cid = urlpaths[urlpaths.index(u"mh") + 1]
        pid = urlpaths[urlpaths.index(u"mh") + 2].replace(".html", "")
    else:
        self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

    res = re.search(r'var qTcms_S_m_murl_e=".*";', content).group()
    if (res is None):
        self.log.warn('var qTcms_S_m_murl_e is not exist.')
        return imgList

    list_encoded = res.split('\"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if (images is None):
        self.log.warn('image list is not exist.')
        return imgList

    for img in images:
        if "http://www.baidu1.com/" in img:
            b64str = img.replace("http://www.baidu1.com/", "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        elif "http://ac.tc.qq.com/" in img:
            b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        else:
            img_url = img

        self.log.info('The image href is: %s' % img_url)
        imgList.append(img_url)
    return imgList
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
    if ((u"id" in urlpaths) and (urlpaths.index(u"id") + 1 < len(urlpaths))):
        comic_id = urlpaths[urlpaths.index(u"id") + 1]

    if ((not comic_id.isdigit()) or (comic_id == "")):
        self.log.warn('can not get comic id: %s' % url)
        return chapterList

    url = 'https://m.ac.qq.com/comic/chapterList/id/{}'.format(comic_id)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    # <section class="chapter-list-box list-expanded" data-vip-free="1">
    section = soup.find('section', {'class': 'chapter-list-box list-expanded'})
    if (section is None):
        self.log.warn('chapter-list-box is not exist.')
        return chapterList

    # <ul class="chapter-list normal">
    # <ul class="chapter-list reverse">
    reverse_list = section.find('ul', {'class': 'chapter-list reverse'})
    if (reverse_list is None):
        self.log.warn('chapter-list is not exist.')
        return chapterList

    for item in reverse_list.find_all('a'):
        # <a class="chapter-link lock" data-cid="447" data-seq="360" href="/chapter/index/id/531490/cid/447">360</a>
        # https://m.ac.qq.com/chapter/index/id/511915/cid/1
        href = 'https://m.ac.qq.com' + item.get('href')
        isVip = "lock" in item.get('class')
        if isVip == True:
            self.log.info("Chapter {} is Vip, waiting for free.".format(href))
            continue
        chapterList.append((item.get_text(), href))
    return chapterList
def fetcharticle(self, url, decoder):
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(self.url4forwarder(url))
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    if self.page_encoding:
        return content.decode(self.page_encoding)
    else:
        return decoder.decode(content, url)
def getImgUrl(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)

    url = self.host + "/comic/" + url
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
    return comicImgTag.get('src') if comicImgTag else None
def getImgUrl(self, url):
    opener = URLOpener(self.host, timeout=60)
    headers = {
        'Host': "img_733.234us.com",
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25'
    }
    result = opener.open(url, headers=headers)
    if result.status_code != 200 or opener.realurl == url:
        self.log.warn('can not get real comic url for : %s' % url)
        return None
    return opener.realurl
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = result.content
    cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
    filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
    # "picture": [{},...{}]}
    if len(filter_result) != 0:
        # "picture" > InBpY3R1cmUi
        # picture": > cGljdHVyZSI6
        # icture":[ > aWN0dXJlIjpb
        if "InBpY3R1cmUi" in filter_result[0]:
            base64data = filter_result[0].split("InBpY3R1cmUi")[1]
            self.log.warn('found flag string: %s' % "InBpY3R1cmUi")
        elif "cGljdHVyZSI6" in filter_result[0]:
            base64data = filter_result[0].split("cGljdHVyZSI6")[1]
            self.log.warn('found flag string: %s' % "cGljdHVyZSI6")
        elif "aWN0dXJlIjpb" in filter_result[0]:
            base64data = filter_result[0].split("aWN0dXJl")[1]
            self.log.warn('found flag string: %s' % "aWN0dXJlIjpb")
        else:
            self.log.warn('can not find flag string in data: %s' % filter_result[0])
            return imgList

        decodeData = base64.decodestring(base64data)
        startIndex = decodeData.find('[')
        endIndex = decodeData.find(']')
        if startIndex > -1 and endIndex > -1:
            img_detail_json = json.loads(decodeData[startIndex:endIndex + 1])
            for img_url in img_detail_json:
                if ('url' in img_url):
                    imgList.append(img_url['url'])
                else:
                    self.log.warn('no url in img_url:%s' % img_url)
        else:
            self.log.warn('can not find [] in decodeData:%s' % decodeData)
    else:
        self.log.warn('can not find filter_result with data: .')
    return imgList
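# Hedged illustration (standard base64, not part of the original code): the three flag
# strings searched for above are base64 of the '"picture"' key at the three possible
# alignments of the encoded payload, e.g.
#
#   base64.b64encode('"picture"')  == 'InBpY3R1cmUi'
#   base64.b64encode('picture":')  == 'cGljdHVyZSI6'
#   base64.b64encode('icture":[')  == 'aWN0dXJlIjpb'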
def getPageContent(u_url):
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(u_url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, u_url))
        return ''

    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    return content
def fetcharticle(self, url, decoder):
    # fetch a single article synchronously
    if self.fulltext_by_instapaper and not self.fulltext_by_readability:
        url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    if self.page_encoding:
        return content.decode(self.page_encoding)
    else:
        return decoder.decode(content, url)
def fetcharticle(self, url, decoder):
    # fetch a single article synchronously
    if self.fulltext_by_instapaper and not self.fulltext_by_readability:
        url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    if self.page_encoding:
        return content.decode(self.page_encoding)
    else:
        return decoder.decode(content, url)
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    url = self.feeds[0][1]
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
        return []

    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)

    soup = BeautifulSoup(content, 'lxml')
    for article in soup.find_all('div', attrs={'class': 'post'}):
        title = article.find('a', attrs={'class': 'title'})
        if not title or not title.string.startswith(u'安邦'):
            continue

        # extract the publication date
        pubdate = article.find('span', attrs={'class': 'date'})
        if not pubdate:
            continue
        mt = re.match(ur'(\d{4})年(\d{1,2})月(\d{1,2})日', pubdate.string)
        if not mt:
            continue
        pubdate = datetime.datetime(int(mt.group(1)), int(mt.group(2)), int(mt.group(3)))

        # decide whether the article should be pushed; the timezone is fixed to Beijing time
        tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
        delta = tnow - pubdate
        if self.oldest_article > 0 and delta.days > self.oldest_article:
            continue

        urls.append((u'安邦咨询', title.string, title['href'], None))
    return urls
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    sel = soup.find('select')  # page-number row: extract every page from it
    if (sel is None):
        self.log.warn('soup select is not exist.')
        return imgList

    ulist = sel.find_all('option') if sel else None
    if not ulist:
        self.log.warn('select option is not exist.')
        return imgList

    for ul in ulist:
        if ul.get('value') == None:
            ulist.remove(ul)
    listLen = len(ulist)

    firstPageTag = soup.find('img', {'oncontextmenu': 'return false'})
    firstPage = firstPageTag.get('src') if firstPageTag else None
    if firstPage != None:
        firstPage = "https://www.cartoonmad.com/{}".format(firstPage)
        base, length, type = self.getImgStr(firstPage)
        for index in range(len(ulist)):
            imgUrl = "{}{}.{}".format(base, str(index + 1).zfill(length), type)
            imgList.append(imgUrl)
        if imgList[0] == firstPage and imgList[listLen - 1] == self.getImgUrl(ulist[listLen - 1].get('value')):
            return imgList
        else:
            imgList = []
            for ul in ulist:
                imgList.append("https://www.cartoonmad.com/{}".format(self.getImgUrl(ul.get('value'))))
            return imgList
    return imgList
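# Hedged illustration of the sequential-page construction above (the return shape of
# self.getImgStr is an assumption: a URL prefix, the zero-pad width, and the extension):
#
#   base, length, type = ('https://www.cartoonmad.com/.../', 3, 'jpg')   # hypothetical values
#   "{}{}.{}".format(base, str(1).zfill(length), type)   # -> '.../001.jpg'
#   "{}{}.{}".format(base, str(10).zfill(length), type)  # -> '.../010.jpg'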
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

    # var chapterPath = "images/comic/31/61188/";
    chapterPath = re.search(r'(var chapterPath = ")(.*)(";var chapterPrice)', content)
    if (chapterPath is None):
        self.log.warn('var chapterPath is not exist.')
        return imgList
    else:
        chapterPath = chapterPath.group(2)

    # var pageImage = "https://res.gufengmh.com/gufeng/images/";
    imgPrefix = re.search(r'(var pageImage = ")(.*)(gufeng/images/)', content)
    if (imgPrefix is None):
        self.log.warn('"https://res.gufengmh.com/gufeng/images/ is not exist.')
        return imgList
    else:
        imgPrefix = imgPrefix.group(2) + "/"

    # var chapterImages = ["",""];
    images = re.search(r'(var chapterImages = \[)(.*)(\];)', content)
    if (images is None):
        self.log.warn('var chapterImages is not exist.')
        return imgList
    else:
        images = images.group(2).split(',')

    for img in images:
        img_url = imgPrefix + chapterPath + img.replace("\"", "")
        imgList.append(img_url)
    return imgList
def getChapterList(self, url):
    if url.startswith("https://m.dmzj.com"):
        return self.get_chapter_list_from_mobile_url(url)

    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(addreferer=False, timeout=60)
    chapterList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn("fetch comic page failed: %s" % result.status_code)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    comic_id = re.search('g_comic_id = "([^"]+)', content).group(1)

    # try to get the chapters from the html first
    soup = BeautifulSoup(content, "html.parser")
    chapter_datas = []
    for comic_classname in ["cartoon_online_border", "cartoon_online_border_other"]:
        divs = soup.find_all("div", attrs={"class": comic_classname})
        if not divs:
            continue
        for div in divs:
            for link in div.find_all("a"):
                chapter_datas.append({
                    "chapter_id": int(re.search("\/(\d+)\.shtml", link.get("href")).group(1)),
                    "chapter_title": unicode(link.string),
                })

    if chapter_datas:
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["chapter_id"], comic_id=comic_id)
            chapterList.append((chapter["chapter_title"], chapter_url))
        return chapterList
    else:
        return self.get_chapter_list_from_api(comic_id)
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    for feed in self.feeds:
        feedtitle, url = feed[0], feed[1]
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
            continue

        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
        else:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'lxml')
        for article in soup.find_all('div', attrs={'class': 'feed_item_question'}):
            title = article.find('a', attrs={'class': 'question_link'})
            if not title:
                continue

            # extract the publication date
            pubdate = article.find('span', attrs={'class': 'timestamp'})
            if not pubdate:
                continue
            try:
                pubdate = datetime.datetime.strptime(pubdate.string, '%Y-%m-%d')
            except Exception as e:
                self.log.warn('parse pubdate failed for [%s] : %s' % (url, str(e)))
                continue

            # decide whether the article should be pushed; the timezone is fixed to Beijing time
            tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
            delta = tnow - pubdate
            if self.oldest_article > 0 and delta.days > self.oldest_article:
                continue

            href = title['href'] if title['href'].startswith('http') else self.urljoin(url, title['href'])
            urls.append((feedtitle, string_of_tag(title), href, None))
    return urls
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    try:
        # function base64decode(str){*};
        func = re.search(r'function\ base64decode\(str\){.*};', content).group()
        func = func.split('base64decode')[1].replace('};', '}')
        # packed="*";
        packed = re.search(r'packed=".*";', content).group()
        packed = packed.split('\"')[1]
    except:
        self.log.warn('var photosr is not exist.')
        return imgList

    # eval(function(str){*}("*").slice(4))
    lz_input = "eval(function{}(\"{}\").slice(4))".format(func, packed)
    lz_nodejs = self.get_node_online(lz_input)
    if (lz_nodejs is None):
        self.log.warn('image list is not exist.')
        return imgList

    # photosr[1]="images/2019/11/08/09/19904f5d64.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
    images = lz_nodejs.split("\"")
    # http://res.img.220012.net/2017/08/22/13/343135d67f.jpg
    for img in images:
        if ".jpg" in img:
            img_url = self.urljoin("http://res.img.220012.net", img)
            imgList.append(img_url)
    return imgList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    try:
        func = re.search(r'function\ base64decode\(str\){.*};', content).group()
        packed = re.search(r'packed=".*";', content).group()
    except:
        self.log.warn('var photosr is not exist in {}.'.format(url))
        return imgList

    # eval(function(str){*}("*").slice(4))
    lz_input = "{}var photosr = new Array();{}console.log(eval(base64decode(packed).slice(4)));".format(func, packed)
    lz_nodejs = self.get_node_online(lz_input)
    if (lz_nodejs is None):
        self.log.warn('image list is not exist.')
        return imgList

    images = lz_nodejs.split("\"")
    self.log.info(images)
    for img in images:
        # photosr[1]="images/2020/05/03/17/516bbfddb4.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
        # http://res.img.fffimage.com/images/2020/05/03/17/516bbfddb4.jpg/0
        # photosr[1]="images/2020/04/21/09/3706a024c8.png/0";...photosr[12]="images/2020/04/21/09/3732355905.png/0";
        # http://res.img.fffimage.com/images/2020/04/21/09/3706a024c8.png/0
        if ".jpg" in img or ".png" in img:
            img_url = self.urljoin("http://res.img.fffimage.com/", img)
            imgList.append(img_url)
    return imgList
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("https://www.gufengmh.com"):
        url = url.replace('https://www.gufengmh.com', 'https://m.gufengmh.com')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    # <ul class="Drama autoHeight" data-sort="asc" id="chapter-list-1">
    soup = soup.find('ul', {"class": "Drama autoHeight", "id": "chapter-list-1"})
    if (soup is None):
        self.log.warn('chapter-list is not exist.')
        return chapterList

    lias = soup.findAll('a')
    if (lias is None):
        self.log.warn('chapterList href is not exist.')
        return chapterList

    for index, a in enumerate(lias):
        href = self.urljoin("https://m.gufengmh.com", a.get('href', ''))
        span = a.find("span")
        if span is None:
            chapterList.append((u'第%d话' % (index + 1), href))
        else:
            chapterList.append((unicode(span.contents[0]), href))
    return chapterList
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    url = url.replace("https://www.manhuagui.com", "https://m.manhuagui.com")
    url = url.replace("https://tw.manhuagui.com", "https://m.manhuagui.com")

    for njRetry in range(10):
        try:
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                break
        except Exception, e:
            if njRetry < 5:
                self.log.warn('fetch comic page failed: %s, retry' % url)
                continue
            else:
                self.log.warn('fetch comic page failed: %s' % url)
                return chapterList