def ParseFeedUrls(self):
    main = 'http://bbstsg.vip.qikan.com/text/Mag.aspx?issn=ACB37AEA-8FB7-4855-B7CA-D228E972162F'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []

    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)

    soup = BeautifulSoup(content, "lxml")

    for section in soup.find_all('dl'):
        dt = section.find('dt')
        span = dt.find('span')
        if span:
            sectitle = string_of_tag(span).strip()
        for dd in section.find_all('dd'):
            a = dd.find('a', href=True)
            title = string_of_tag(a).strip()
            url = a['href']
            if url.startswith('Article'):
                url = 'http://bbstsg.vip.qikan.com/text/' + url
            urls.append((sectitle, title, url, None))

    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1].replace('gzh', 'gzhjs') isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout+10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) content = content[content.index('{'):content.index('}')+1] content = json.loads(content) for e in content['items'][:self.max_articles_per_feed]: e = feedparser.parse(e)['entries'][0] updated = None if hasattr(e, 'lastmodified') and e.lastmodified: updated = float(e.lastmodified) if self.oldest_article > 0 and updated: updated = datetime.datetime.utcfromtimestamp(updated) delta = tnow - updated if self.oldest_article > 365: threshold = self.oldest_article #以秒为单位 else: threshold = 86400*self.oldest_article #以天为单位 if delta.days*86400+delta.seconds > threshold: self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href)) continue #支持HTTPS if hasattr(e, 'href'): if url.startswith('https://'): urlfeed = e.href.replace('http://','https://') else: urlfeed = e.href if urlfeed in urladded: continue else: urlfeed = '' desc = None urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url)) return urls
def Items(self, opts=None, user=None):
    """ Generator that yields tuples.
    For HTML: section, url, title, content, brief, thumbnail
    For images: mime, url, filename, content, brief, thumbnail
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    opener = URLOpener(self.host, timeout=self.timeout)
    decoder = AutoDecoder(False)
    for section, ftitle, url, desc in urls:
        if not desc:  # not a full-text RSS feed
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect the encoding for every section
                prevsection = section
                opener = URLOpener(self.host, timeout=self.timeout)
                if self.needs_subscription:
                    self.login(opener, decoder)

            article = self.fetcharticle(url, opener, decoder)
            if not article:
                continue
        else:
            article = self.FragToXhtml(desc, ftitle)

        # for images, title holds the mime type
        for title, imgurl, imgfn, content, brief, thumbnail in readability(article, url, opts, user):
            if title.startswith(r'image/'):  # image
                yield (title, imgurl, imgfn, content, brief, thumbnail)
            else:
                if not title:
                    title = ftitle
                content = self.postprocess(content)
                yield (section, url, title, content, brief, thumbnail)
def Items(self, opts=None, user=None):
    """ Generator that yields tuples.
    For HTML: section, url, title, content, brief, thumbnail
    For images: mime, url, filename, content, brief, thumbnail
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    opener = URLOpener(self.host, timeout=self.timeout)
    decoder = AutoDecoder(False)
    for section, ftitle, url, desc in urls:
        if not desc:  # not a full-text RSS feed
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect the encoding for every section
                prevsection = section
                opener = URLOpener(self.host, timeout=self.timeout)
                if self.needs_subscription:
                    self.login(opener, decoder)

            article = self.fetcharticle(url, opener, decoder)
            if not article:
                continue
        else:
            article = self.FragToXhtml(desc, ftitle)

        # for images, title holds the mime type
        for title, imgurl, imgfn, content, brief, thumbnail in readability(article, url, opts, user, ftitle):
            if title.startswith(r'image/'):  # image
                yield (title, imgurl, imgfn, content, brief, thumbnail)
            else:
                if not title:
                    title = ftitle
                if self.force_ftitle:
                    title = ftitle
                content = self.postprocess(content)
                yield (section, url, title, content, brief, thumbnail)
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("https://m.733.so"):
        url = url.replace('https://m.733.so', 'https://www.733.so')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    soup = soup.find('div', {"class": "cy_plist"})
    if soup is None:
        self.log.warn('cy_plist does not exist.')
        return chapterList

    lias = soup.findAll('a')
    if lias is None:
        self.log.warn('chapterList href does not exist.')
        return chapterList

    # the site lists chapters newest-first; walk the list backwards to get chronological order
    for aindex in range(len(lias)):
        rindex = len(lias) - 1 - aindex
        href = "https://www.733.so" + lias[rindex].get("href")
        chapterList.append(href)
    return chapterList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

    # re.search() returns None when nothing matches; check before calling group()
    match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
    if match is None:
        self.log.warn('var qTcms_S_m_murl_e does not exist.')
        return imgList
    res = match.group()

    list_encoded = res.split('"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list does not exist.')
        return imgList

    for img in images:
        imgb64 = b64encode(img.replace("http://www.baidu1.com/", ""))
        img_url = u'http://new.234us.com:8989/img_new.php?data={}'.format(imgb64)
        imgList.append(img_url)
    return imgList
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host)
    chapterList = []

    url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn(
            "fetch comic page failed: {} (status code {}, content {})".format(
                url, result.status_code, result.content))
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, "html.parser")

    chapter_datas = []
    for link in soup.find_all("a", {"class": "chapteritem"}):
        chapter_datas.append({
            "chapter_id": int(re.search(r"m(\d+)", link.get("href")).group(1)),
            "chapter_title": unicode(link.string),
        })
    chapter_datas.sort(key=lambda d: d["chapter_id"])

    for chapter in chapter_datas:
        chapter_url = "http://www.manhuaren.com/m{}/".format(chapter["chapter_id"])
        chapterList.append((chapter["chapter_title"], chapter_url))
    return chapterList
def getChapterList(self, comic_id):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(comic_id)
    result = opener.open(getChapterListUrl)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % getChapterListUrl)
        return chapterList

    content = result.content
    content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
    contentJson = json.loads(content)
    count = contentJson.get('length', 0)
    if count != 0:
        # chapters are keyed by id in the JSON; order them by their 'seq' field
        for i in range(count + 1):
            for item in contentJson:
                if isinstance(contentJson[item], dict) and contentJson[item].get('seq') == i:
                    chapterList.append({item: contentJson[item]})
                    break
    else:
        self.log.warn('comic count is zero.')
    return chapterList
def getImgList(self, chapterJson, comic_id):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []

    cid = list(chapterJson.keys())[0]
    getImgListUrl = 'http://ac.qq.com/ComicView/index/id/{0}/cid/{1}'.format(comic_id, cid)
    result = opener.open(getImgListUrl)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % getImgListUrl)
        return imgList

    content = result.content
    cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
    filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
    if len(filter_result) != 0:
        base64data = filter_result[0][1:]
        img_detail_json = json.loads(base64.decodestring(base64data))
        for img_url in img_detail_json.get('picture', []):
            if 'url' in img_url:
                imgList.append(img_url['url'])
            else:
                self.log.warn('no url in img_url:%s' % img_url)
    return imgList
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("http://"):
        url = url.replace('http://', 'https://')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    allComicTable = soup.find_all('table', {'width': '800', 'align': 'center'})
    for comicTable in allComicTable:
        comicVolumes = comicTable.find_all('a', {'target': '_blank'})
        for volume in comicVolumes:
            href = self.urljoin(self.host, volume.get('href'))
            chapterList.append(href)
    return chapterList
def get_chapter_list_from_mobile_url(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(addreferer=False, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn("fetch comic page failed: %s" % result.status_code)
        return []

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    if "obj_id" not in content:
        self.log.warn(u"Can't find obj_id from {}".format(url))
        return []

    comic_id = re.search(r'obj_id = "(\d+)"', content).group(1)
    data_match = re.search(r"initIntroData\(([^;]+)\);", content)
    if not data_match:
        return self.get_chapter_list_from_api(comic_id)

    datas = json.loads(data_match.group(1))
    chapter_datas = []
    for data in datas:
        chapter_datas += data["data"]
    if not chapter_datas:
        return self.get_chapter_list_from_api(comic_id)

    chapter_datas.sort(key=lambda d: d["id"])
    chapters = []
    for chapter in chapter_datas:
        chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
            chapter_id=chapter["id"], comic_id=comic_id)
        chapters.append((chapter["chapter_name"], chapter_url))
    return chapters
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(url)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn(
            "fetch comic page failed: {} (status code {}, content {})".format(
                url, result.status_code, result.content))
        return []

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, "html.parser")
    scripts = soup.findAll("script", {"type": "text/javascript"})
    packed_js = None
    for script in scripts:
        if "newImgs" in script.text:
            packed_js = script.text
            break
    if not packed_js:
        self.log.warn("Can't find js")
        return []

    codes = decode_packed_codes(packed_js)
    return re.findall("'(.+?)'", codes)
def getImgUrl(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
    if comicImgTag is None:
        self.log.warn('can not find image href.')
        return None

    imgUrl = self.host + "/comic/" + comicImgTag.get('src')
    headers = {'Referer': url}
    result = opener.open(imgUrl, headers=headers)
    if result.status_code != 200 or opener.realurl == url:
        self.log.warn('can not get real comic url for : %s' % url)
        return None
    return opener.realurl
def getImgUrlList(self, url):
    imgUrlList = []
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    sel = soup.find('select')  # page-number row; every page is listed in its options
    if sel is None:
        self.log.warn('soup select does not exist.')
        return None

    ulist = sel.find_all('option') if sel else None
    if not ulist:
        self.log.warn('select option does not exist.')
        return None

    # skip options without a value instead of removing them while iterating
    for ul in ulist:
        if ul.get('value') is None:
            continue
        href = self.host + '/comic/' + ul.get('value')
        imgUrlList.append(href)

    return imgUrlList
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    url = url.replace("https://m.tohomh123.com", "https://www.tohomh123.com")
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    soup = soup.find("ul", {"id": 'detail-list-select-2'})
    if not soup:
        self.log.warn('chapterList does not exist.')
        return chapterList

    lias = soup.findAll('a')
    if not lias:
        self.log.warn('chapterList href does not exist.')
        return chapterList

    for a in lias:
        href = "https://www.tohomh123.com" + a.get("href")
        chapterList.append((unicode(a.contents[0]), href))
    return chapterList
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    if url.startswith("https://www.manhuagui.com"):
        url = url.replace('https://www.manhuagui.com', 'https://m.manhuagui.com')

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    # the chapter list may be hidden in an LZString-compressed __VIEWSTATE input
    invisible_input = soup.find("input", {"id": '__VIEWSTATE'})
    if invisible_input:
        lz_encoded = invisible_input.get("value")
        lz_decoded = decompressFromBase64(lz_encoded)
        soup = BeautifulSoup(lz_decoded, 'html.parser')
    else:
        soup = soup.find("div", {"class": 'chapter-list', "id": 'chapterList'})

    lias = soup.findAll('a')
    # chapters are listed newest-first; walk the list backwards to get chronological order
    for aindex in range(len(lias)):
        rindex = len(lias) - 1 - aindex
        href = "https://m.manhuagui.com" + lias[rindex].get("href")
        chapterList.append(href)
    return chapterList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    scripts = soup.findAll("script", {"type": "text/javascript"})
    for script in scripts:
        if script.text != "":
            raw_content = script.text
            break

    res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
    lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
    lz_decoded = decompressFromBase64(lz_encoded)
    res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", "'%s'.split('|')" % (lz_decoded), res)
    codes = self.get_node_online(res)
    pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

    cid = self.getChapterId(url)
    md5 = pages_opts["sl"]["md5"]
    images = pages_opts["images"]
    for img in images:
        img_url = u'https://i.hamreus.com{}?cid={}&md5={}'.format(img, cid, md5)
        imgList.append(img_url)
    return imgList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

    # re.search() returns None when nothing matches; check before calling group()
    match = re.search(r'qTcms_S_m_murl_e="(.*)";', content)
    if match is None:
        self.log.warn(content)
        self.log.warn('var qTcms_S_m_murl_e does not exist.')
        return imgList
    res = match.group()

    list_encoded = res.split('"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list does not exist.')
        return imgList

    for img in images:
        imgList.append(img)
    return imgList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
    if (u"mh" in urlpaths) and (urlpaths.index(u"mh") + 2 < len(urlpaths)):
        tid = str(time.time()).replace(".", "1")
        if len(tid) == 12:
            tid = tid + "1"
        cid = urlpaths[urlpaths.index(u"mh") + 1]
        pid = urlpaths[urlpaths.index(u"mh") + 2].replace(".html", "")
    else:
        self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

    # re.search() returns None when nothing matches; check before calling group()
    match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
    if match is None:
        self.log.warn('var qTcms_S_m_murl_e does not exist.')
        return imgList
    res = match.group()

    list_encoded = res.split('"')[1]
    lz_decoded = b64decode(list_encoded)
    images = lz_decoded.split("$qingtiandy$")
    if not images:
        self.log.warn('image list does not exist.')
        return imgList

    for img in images:
        if "http://www.baidu1.com/" in img:
            b64str = img.replace("http://www.baidu1.com/", "") + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        elif "http://ac.tc.qq.com/" in img:
            b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
            imgb64 = b64encode(b64str)
            img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(imgb64)
        else:
            img_url = img
        self.log.info('The image href is: %s' % img_url)
        imgList.append(img_url)
    return imgList
def GetNewComic(self):
    urls = []
    if not self.feeds:
        return []

    userName = self.UserName()
    decoder = AutoDecoder(isfeed=False)
    for item in self.feeds:
        title, url = item[0], item[1]

        lastCount = LastDelivered.all().filter('username = ', userName).filter('bookname = ', title).get()
        if not lastCount:
            self.log.info('There is no log in db LastDelivered for name: %s, set to 0' % title)
            oldNum = 0
        else:
            oldNum = lastCount.num

        opener = URLOpener(self.host, timeout=60)
        result = opener.open(url)
        if result.status_code != 200:
            self.log.warn('fetch index page for %s failed[%s] : %s' % (
                title, URLOpener.CodeMap(result.status_code), url))
            continue

        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.feed_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'lxml')

        allComicTable = soup.find_all('table', {'width': '688'})
        addedForThisComic = False
        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            for volume in comicVolumes:
                texts = volume.text.split(' ')
                if len(texts) > 2 and texts[1].isdigit() and volume.get('href'):
                    num = int(texts[1])
                    if num > oldNum:
                        oldNum = num
                        href = self.urljoin(self.host, volume.get('href'))
                        urls.append((title, num, href))
                        addedForThisComic = True
                        break  # push only one volume per run (a single volume may already contain many images)
            if addedForThisComic:
                break

    return urls
def getChapterList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    chapterList = []

    urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
    comic_id = ''
    if (u"id" in urlpaths) and (urlpaths.index(u"id") + 1 < len(urlpaths)):
        comic_id = urlpaths[urlpaths.index(u"id") + 1]

    if (not comic_id.isdigit()) or (comic_id == ""):
        self.log.warn('can not get comic id: %s' % url)
        return chapterList

    url = 'https://m.ac.qq.com/comic/chapterList/id/{}'.format(comic_id)
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    # <section class="chapter-list-box list-expanded" data-vip-free="1">
    section = soup.find('section', {'class': 'chapter-list-box list-expanded'})
    if section is None:
        self.log.warn('chapter-list-box does not exist.')
        return chapterList

    # <ul class="chapter-list normal">
    # <ul class="chapter-list reverse">
    reverse_list = section.find('ul', {'class': 'chapter-list reverse'})
    if reverse_list is None:
        self.log.warn('chapter-list does not exist.')
        return chapterList

    for item in reverse_list.find_all('a'):
        # <a class="chapter-link lock" data-cid="447" data-seq="360" href="/chapter/index/id/531490/cid/447">360</a>
        # https://m.ac.qq.com/chapter/index/id/511915/cid/1
        href = 'https://m.ac.qq.com' + item.get('href')
        isVip = "lock" in item.get('class')
        if isVip:
            self.log.info("Chapter {} is Vip, waiting for free.".format(href))
            continue
        chapterList.append((item.get_text(), href))
    return chapterList
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    i = 0
    for feed in self.feeds:
        feedtitle, url = feed[0], feed[1]
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url))
            continue

        if self.feed_encoding:
            try:
                content = result.content.decode(self.feed_encoding)
            except UnicodeDecodeError:
                content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
        else:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'lxml')
        for article in soup.findAll('div', {"class": "text"}):
            if article.find("h2") and article.find("a"):
                title = article.a.contents[0].strip()
                if not title:
                    continue
                href = self.url_prefix + article.a['href']
                urls.append((feedtitle, title, href, None))
                if i > 3:
                    break
                else:
                    i = i + 1
    return urls
def ParseFeedUrls(self):
    urls = []  # the list to return
    newComicUrls = self.GetNewComic()  # returns [(title, num, url), ...]
    if not newComicUrls:
        return []

    decoder = AutoDecoder(isfeed=False)
    for title, num, url in newComicUrls:
        opener = URLOpener(self.host, timeout=60)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            continue

        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
        bodySoup = BeautifulSoup(content, 'lxml')

        sel = bodySoup.find('select')  # page-number row; every page is listed in its options
        ul = sel.find_all('option') if sel else None
        if not ul:
            continue

        for comicPage in ul:
            href = comicPage.get('value')
            if href:
                pageHref = self.urljoin(url, href)
                result = opener.open(pageHref)
                if result.status_code != 200:
                    self.log.warn('fetch comic page failed: %s' % pageHref)
                    continue

                content = result.content
                content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
                soup = BeautifulSoup(content, 'lxml')
                comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
                comicSrc = comicImgTag.get('src') if comicImgTag else None
                if comicSrc:
                    urls.append((title, comicPage.text, comicSrc, None))

        self.UpdateLastDelivered(title, num)

    return urls
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = result.content
    cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)

    filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
    # "picture": [{},...{}]}
    if len(filter_result) != 0:
        # "picture" > InBpY3R1cmUi
        # picture": > cGljdHVyZSI6
        # icture":[ > aWN0dXJlIjpb
        if "InBpY3R1cmUi" in filter_result[0]:
            base64data = filter_result[0].split("InBpY3R1cmUi")[1]
            self.log.warn('found flag string: %s' % "InBpY3R1cmUi")
        elif "cGljdHVyZSI6" in filter_result[0]:
            base64data = filter_result[0].split("cGljdHVyZSI6")[1]
            self.log.warn('found flag string: %s' % "cGljdHVyZSI6")
        elif "aWN0dXJlIjpb" in filter_result[0]:
            base64data = filter_result[0].split("aWN0dXJl")[1]
            self.log.warn('found flag string: %s' % "aWN0dXJlIjpb")
        else:
            self.log.warn('can not find flag string in data: %s' % filter_result[0])
            return imgList

        decodeData = base64.decodestring(base64data)
        startIndex = decodeData.find('[')
        endIndex = decodeData.find(']')
        if startIndex > -1 and endIndex > -1:
            img_detail_json = json.loads(decodeData[startIndex:endIndex + 1])
            for img_url in img_detail_json:
                if 'url' in img_url:
                    imgList.append(img_url['url'])
                else:
                    self.log.warn('no url in img_url:%s' % img_url)
        else:
            self.log.warn('can not find [] in decodeData:%s' % decodeData)
    else:
        self.log.warn('can not find filter_result with data: .')
    return imgList
def getImgUrl(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    url = self.host + "/comic/" + url
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return None

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')
    comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
    return comicImgTag.get('src') if comicImgTag else None
def getChapterList(self, url):
    if url.startswith("https://m.dmzj.com"):
        return self.get_chapter_list_from_mobile_url(url)

    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(addreferer=False, timeout=60)
    chapterList = []

    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn("fetch comic page failed: %s" % result.status_code)
        return chapterList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    comic_id = re.search(r'g_comic_id = "([^"]+)', content).group(1)

    # try get chapters from html
    soup = BeautifulSoup(content, "html.parser")
    chapter_datas = []
    for comic_classname in ["cartoon_online_border", "cartoon_online_border_other"]:
        divs = soup.find_all("div", attrs={"class": comic_classname})
        if not divs:
            continue
        for div in divs:
            for link in div.find_all("a"):
                chapter_datas.append({
                    "chapter_id": int(re.search(r"\/(\d+)\.shtml", link.get("href")).group(1)),
                    "chapter_title": unicode(link.string),
                })

    if chapter_datas:
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["chapter_id"], comic_id=comic_id)
            chapterList.append((chapter["chapter_title"], chapter_url))
        return chapterList
    else:
        return self.get_chapter_list_from_api(comic_id)
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

    # var chapterPath = "images/comic/31/61188/";
    chapterPath = re.search(r'(var chapterPath = ")(.*)(";var chapterPrice)', content)
    if chapterPath is None:
        self.log.warn('var chapterPath does not exist.')
        return imgList
    else:
        chapterPath = chapterPath.group(2)

    # var pageImage = "https://res.gufengmh.com/gufeng/images/";
    imgPrefix = re.search(r'(var pageImage = ")(.*)(gufeng/images/)', content)
    if imgPrefix is None:
        self.log.warn('"https://res.gufengmh.com/gufeng/images/ does not exist.')
        return imgList
    else:
        imgPrefix = imgPrefix.group(2) + "/"

    # var chapterImages = ["",""];
    images = re.search(r'(var chapterImages = \[)(.*)(\];)', content)
    if images is None:
        self.log.warn('var chapterImages does not exist.')
        return imgList
    else:
        images = images.group(2).split(',')

    for img in images:
        img_url = imgPrefix + chapterPath + img.replace("\"", "")
        imgList.append(img_url)
    return imgList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    sel = soup.find('select')  # page-number row; every page is listed in its options
    if sel is None:
        self.log.warn('soup select does not exist.')
        return imgList

    ulist = sel.find_all('option') if sel else None
    if not ulist:
        self.log.warn('select option does not exist.')
        return imgList

    # drop options without a value (filter instead of removing while iterating)
    ulist = [ul for ul in ulist if ul.get('value') is not None]
    listLen = len(ulist)

    firstPageTag = soup.find('img', {'oncontextmenu': 'return false'})
    firstPage = firstPageTag.get('src') if firstPageTag else None
    if firstPage is not None:
        firstPage = "https://www.cartoonmad.com/{}".format(firstPage)
        base, length, type = self.getImgStr(firstPage)
        for index in range(len(ulist)):
            imgUrl = "{}{}.{}".format(base, str(index + 1).zfill(length), type)
            imgList.append(imgUrl)
        if imgList[0] == firstPage and imgList[listLen - 1] == self.getImgUrl(ulist[listLen - 1].get('value')):
            return imgList
        else:
            imgList = []
            for ul in ulist:
                imgList.append("https://www.cartoonmad.com/{}".format(self.getImgUrl(ul.get('value'))))
            return imgList
    return imgList
def ParseFeedUrls(self):
    urls = []
    userName = self.UserName()
    decoder = AutoDecoder(isfeed=False)

    lastCount = LastDelivered.all().filter('username = ', userName).filter('bookname = ', self.title).get()
    if not lastCount:
        oldNum = 0
        oldChapterTitle = ''
    else:
        oldNum = lastCount.num
        oldChapterTitle = lastCount.record

    opener = URLOpener(self.host, timeout=60)
    result = opener.open(self.feeds)
    if result.status_code != 200:
        self.log.warn('fetch index page for %s failed[%s] : %s' % (
            self.title, URLOpener.CodeMap(result.status_code), self.feeds))
        return []

    # get the chapter list from the index page
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'lxml')
    chapterList = self.GetChapterList(soup)

    chapterNum = 0
    for chapter in chapterList:
        if chapterNum >= self.limit:
            break
        url = chapter.get('href')
        num = self.GetChapterNum(url)
        if num > oldNum:
            oldNum = num
            oldChapterTitle = chapter.text
            chapterNum += 1
            urls.append((self.title, oldChapterTitle, self.urljoin(self.host, url), ''))
            self.UpdateLastDelivered(self.title, oldNum, oldChapterTitle)

    return urls
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    scripts = soup.findAll("script", {"type": "text/javascript"})
    raw_content = None
    for script in scripts:
        if "window[\"\\x65\\x76\\x61\\x6c\"]" in script.text:
            raw_content = script.text
            break
    if raw_content is None:
        self.log.warn('raw_content does not exist.')
        return imgList

    res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
    lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
    lz_decoded = decompressFromBase64(lz_encoded)
    res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", "'%s'.split('|')" % (lz_decoded), res)
    codes = self.get_node_online(res)
    pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

    # cid = self.getChapterId(url)
    m = pages_opts["sl"]["m"]
    e = pages_opts["sl"]["e"]
    images = pages_opts["images"]
    if images is None:
        self.log.warn('image list does not exist.')
        return imgList

    for img in images:
        # e.g. https://i.hamreus.com/ps3/p/pingxingtt_gbl/%E7%AC%AC117%E8%AF%9D/1_7684.jpg.webp?e=1769209619&m=MOn_QAAi-qwQBaRjlmNYkA
        img_url = u'https://i.hamreus.com{}?e={}&m={}'.format(img, e, m)
        imgList.append(img_url)
    return imgList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

    try:
        # function base64decode(str){*};
        func = re.search(r'function\ base64decode\(str\){.*};', content).group()
        func = func.split('base64decode')[1].replace('};', '}')
        # packed="*";
        packed = re.search(r'packed=".*";', content).group()
        packed = packed.split('"')[1]
    except:
        self.log.warn('var photosr does not exist.')
        return imgList

    # eval(function(str){*}("*").slice(4))
    lz_input = "eval(function{}(\"{}\").slice(4))".format(func, packed)
    lz_nodejs = self.get_node_online(lz_input)
    if lz_nodejs is None:
        self.log.warn('image list does not exist.')
        return imgList

    # photosr[1]="images/2019/11/08/09/19904f5d64.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
    images = lz_nodejs.split("\"")
    # http://res.img.220012.net/2017/08/22/13/343135d67f.jpg
    for img in images:
        if ".jpg" in img:
            img_url = self.urljoin("http://res.img.220012.net", img)
            imgList.append(img_url)
    return imgList
def getImgList(self, url):
    decoder = AutoDecoder(isfeed=False)
    opener = URLOpener(self.host, timeout=60)
    imgList = []
    result = opener.open(url)
    if result.status_code != 200 or not result.content:
        self.log.warn('fetch comic page failed: %s' % url)
        return imgList

    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'html.parser')

    try:
        func = re.search(r'function\ base64decode\(str\){.*};', content).group()
        packed = re.search(r'packed=".*";', content).group()
    except:
        self.log.warn('var photosr does not exist in {}.'.format(url))
        return imgList

    # eval(function(str){*}("*").slice(4))
    lz_input = "{}var photosr = new Array();{}console.log(eval(base64decode(packed).slice(4)));".format(func, packed)
    lz_nodejs = self.get_node_online(lz_input)
    if lz_nodejs is None:
        self.log.warn('image list does not exist.')
        return imgList

    images = lz_nodejs.split("\"")
    self.log.info(images)
    for img in images:
        # photosr[1]="images/2020/05/03/17/516bbfddb4.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
        # http://res.img.fffimage.com/images/2020/05/03/17/516bbfddb4.jpg/0
        # photosr[1]="images/2020/04/21/09/3706a024c8.png/0";...photosr[12]="images/2020/04/21/09/3732355905.png/0";
        # http://res.img.fffimage.com/images/2020/04/21/09/3706a024c8.png/0
        if ".jpg" in img or ".png" in img:
            img_url = self.urljoin("http://res.img.fffimage.com/", img)
            imgList.append(img_url)
    return imgList
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    tnow = datetime.datetime.utcnow()
    urladded = set()

    for feed in self.feeds:
        section, url = feed[0], feed[1]
        isfulltext = feed[2] if len(feed) > 2 else False
        timeout = self.timeout + 10 if isfulltext else self.timeout
        opener = URLOpener(self.host, timeout=timeout)
        id = urlparse.urlparse(url).query.split("=")[1]

        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    content = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
            else:
                content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
        else:
            self.log.warn("fetch rss failed(%d):%s" % (result.status_code, url))
            continue

        eqs, ekv = process_eqs(content)
        url = WEIXIN_URL.format(id=id, eqs=urllib.quote(eqs), ekv=ekv, t=int(time.time() * 1000))

        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    content = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
            else:
                content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)

            content = content[content.find("{"):content.rfind("}") + 1]
            try:
                content = json.loads(content)
            except ValueError:
                continue

            for e in content["items"][:self.max_articles_per_feed]:
                e = feedparser.parse(e)["entries"][0]
                updated = None
                if hasattr(e, "lastmodified") and e.lastmodified:
                    updated = float(e.lastmodified)

                if self.oldest_article > 0 and updated:
                    updated = datetime.datetime.utcfromtimestamp(updated)
                    delta = tnow - updated
                    if self.oldest_article > 365:
                        threshold = self.oldest_article  # in seconds
                    else:
                        threshold = 86400 * self.oldest_article  # in days
                    if delta.days * 86400 + delta.seconds > threshold:
                        self.log.info("Skip old article(%s): %s" % (updated.strftime("%Y-%m-%d %H:%M:%S"), e.href))
                        continue

                # keep HTTPS if the feed itself is served over HTTPS
                if hasattr(e, "href"):
                    if url.startswith("https://"):
                        urlfeed = e.href.replace("http://", "https://")
                    else:
                        urlfeed = e.href
                    if urlfeed in urladded:
                        continue
                else:
                    urlfeed = ""

                desc = None
                urls.append((section, e.title, urlfeed, desc))
                urladded.add(urlfeed)
        else:
            self.log.warn("fetch rss failed(%d):%s" % (result.status_code, url))

    return urls
def Items(self, opts=None, user=None):
    """ Generator that yields tuples.
    For HTML: section, url, title, content, brief, thumbnail
    For images: mime, url, filename, content, brief, thumbnail
    For images, only the first one returns thumbnail=True; the rest return None.
    """
    decoder = AutoDecoder(False)
    timeout = self.timeout
    for section, url in self.feeds:
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
            continue

        if self.page_encoding:
            try:
                content = content.decode(self.page_encoding)
            except UnicodeDecodeError:
                content = decoder.decode(content, opener.realurl, result.headers)
        else:
            content = decoder.decode(content, opener.realurl, result.headers)

        content = self.preprocess(content)
        soup = BeautifulSoup(content, "lxml")

        head = soup.find('head')
        if not head:
            head = soup.new_tag('head')
            soup.html.insert(0, head)
        if not head.find('title'):
            t = soup.new_tag('title')
            t.string = section
            head.append(t)

        try:
            title = soup.html.head.title.string
        except AttributeError:
            title = section
            #self.log.warn('object soup invalid!(%s)'%url)
            #continue

        title = self.processtitle(title)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    keep_only_tags = [self.keep_only_tags]
                else:
                    keep_only_tags = self.keep_only_tags
                for spec in keep_only_tags:
                    for tag in soup.find('body').find_all(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replace_with(body)
            except AttributeError:  # soup has no body element
                pass

        for spec in self.remove_tags_after:
            tag = soup.find(**spec)
            remove_beyond(tag, 'next_sibling')

        for spec in self.remove_tags_before:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previous_sibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs
        for tag in soup.find_all(remove_tags):
            tag.decompose()
        for id in remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        # remove all attributes from <body> so InsertToc can match <body> with a regular expression
        body = soup.html.body
        bodyattrs = [attr for attr in body.attrs]
        for attr in bodyattrs:
            del body[attr]

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        has_imgs = False
        thumbnail = None

        if self.keep_image:
            self.soupbeforeimage(soup)
            for img in soup.find_all('img'):
                # more and more sites lazy-load images, so handle data-src style attributes here
                # note: if the data-src attribute does not hold the real url, there is nothing we can do
                imgurl = img['src'] if 'src' in img.attrs else ''
                if not imgurl:
                    for attr in img.attrs:
                        if attr != 'src' and 'src' in attr:  # many sites use data-src
                            imgurl = img[attr]
                            break
                if not imgurl:
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue

                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(imgresult.content, opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    if len(imgcontent) < self.img_min_size:  # rexdf: image too small
                        img.decompose()
                        continue
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        # use the first image as the TOC thumbnail
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None, True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' % (imgresult.status_code, imgurl))
                    img.decompose()

            # remove links wrapping images
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        content = unicode(soup)

        # use the beginning of the article text as the TOC brief
        brief = u''
        if GENERATE_TOC_DESC:
            for h in body.find_all(['h1', 'h2']):  # remove h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        content = self.postprocess(content)
        yield (section, url, title, content, brief, thumbnail)