Code Example #1
File: Shiyue.py Project: richars/KindleEar
 def ParseFeedUrls(self):
     main = 'http://bbstsg.vip.qikan.com/text/Mag.aspx?issn=ACB37AEA-8FB7-4855-B7CA-D228E972162F'
     urls = []
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(main)
     if result.status_code != 200:
         self.log.warn('fetch webpage failed:%s'%main)
         return []
     if self.feed_encoding:
         try:
             content = result.content.decode(self.feed_encoding)
         except UnicodeDecodeError:
             content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     else:
         content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     soup = BeautifulSoup(content, "lxml")
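     # the magazine TOC is a series of <dl> blocks: the <dt><span> carries the section title and each <dd><a> is one article link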
     for section in soup.find_all('dl'):
         dt=section.find('dt')
         span=dt.find('span')
         if span:
             sectitle = string_of_tag(span).strip()
         for dd in section.find_all('dd'):
             a=dd.find('a', href=True)
             title = string_of_tag(a).strip()
             url = a['href']
             if url.startswith('Article'):
                 url = 'http://bbstsg.vip.qikan.com/text/'+url
             urls.append((sectitle,title,url,None))
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
Code Example #2
File: weixinbase.py Project: a3587556/KindleEar
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1].replace('gzh', 'gzhjs')
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout+10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
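                # the gzhjs endpoint wraps the feed JSON in JavaScript, so slice out the brace-delimited portion before parsing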
                content = content[content.index('{'):content.index('}')+1]
                content = json.loads(content)

                for e in content['items'][:self.max_articles_per_feed]:
                    e = feedparser.parse(e)['entries'][0]
                    updated = None
                    if hasattr(e, 'lastmodified') and e.lastmodified:
                        updated = float(e.lastmodified)

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime.utcfromtimestamp(updated)
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article  # value is in seconds
                        else:
                            threshold = 86400*self.oldest_article  # value is in days

                        if delta.days*86400+delta.seconds > threshold:
                            self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href))
                            continue

                    # support HTTPS
                    if hasattr(e, 'href'):
                        if url.startswith('https://'):
                            urlfeed = e.href.replace('http://','https://')
                        else:
                            urlfeed = e.href

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ''

                    desc = None
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%d):%s'%(result.status_code,url))

        return urls
Code Example #3
    def Items(self, opts=None, user=None):
        """
        Generator yielding tuples:
        for HTML: (section, url, title, content, brief, thumbnail)
        for images: (mime, url, filename, content, brief, thumbnail)
        """
        urls = self.ParseFeedUrls()
        readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
        prevsection = ''
        opener = URLOpener(self.host, timeout=self.timeout)
        decoder = AutoDecoder(False)
        for section, ftitle, url, desc in urls:
            if not desc:  # not a full-text RSS feed
                if section != prevsection or prevsection == '':
                    decoder.encoding = ''  # re-detect the encoding for each new section
                    prevsection = section
                    opener = URLOpener(self.host, timeout=self.timeout)
                    if self.needs_subscription:
                        self.login(opener, decoder)

                article = self.fetcharticle(url, opener, decoder)
                if not article:
                    continue
            else:
                article = self.FragToXhtml(desc, ftitle)

            # for images, title holds the mime type
            for title, imgurl, imgfn, content, brief, thumbnail in readability(
                    article, url, opts, user):
                if title.startswith(r'image/'):  # an image
                    yield (title, imgurl, imgfn, content, brief, thumbnail)
                else:
                    if not title: title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief, thumbnail)
Code Example #4
File: base.py Project: binbin/KindleEar
 def Items(self, opts=None, user=None):
     """
     Generator yielding tuples:
     for HTML: (section, url, title, content, brief, thumbnail)
     for images: (mime, url, filename, content, brief, thumbnail)
     """
     urls = self.ParseFeedUrls()
     readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
     prevsection = ''
     opener = URLOpener(self.host, timeout=self.timeout)
     decoder = AutoDecoder(False)
     for section, ftitle, url, desc in urls:
         if not desc: # not a full-text RSS feed
             if section != prevsection or prevsection == '':
                 decoder.encoding = '' # re-detect the encoding for each new section
                 prevsection = section
                 opener = URLOpener(self.host, timeout=self.timeout)
                 if self.needs_subscription:
                     self.login(opener, decoder)
     
             article = self.fetcharticle(url, opener, decoder)
             if not article:
                 continue
         else:
             article = self.FragToXhtml(desc, ftitle)
         
         # for images, title holds the mime type
         for title, imgurl, imgfn, content, brief, thumbnail in readability(article,url,opts,user,ftitle):
             if title.startswith(r'image/'): # an image
                 yield (title, imgurl, imgfn, content, brief, thumbnail)
             else:
                 if not title: title = ftitle
                 if self.force_ftitle: title = ftitle
                 content = self.postprocess(content)
                 yield (section, url, title, content, brief, thumbnail)
Code Example #5
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith("https://m.733.so"):
            url = url.replace('https://m.733.so', 'https://www.733.so')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        soup = soup.find('div', {"class": "cy_plist"})
        if soup is None:
            self.log.warn('cy_plist does not exist.')
            return chapterList

        lias = soup.findAll('a')
        if not lias:
            self.log.warn('chapterList href does not exist.')
            return chapterList

        for aindex in range(len(lias)):
            rindex = len(lias) - 1 - aindex
            href = "https://www.733.so" + lias[rindex].get("href")
            chapterList.append(href)

        return chapterList
Code Example #6
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

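        # the page stores its image list in the JS variable qTcms_S_m_murl_e: a base64 string whose decoded entries are joined by "$qingtiandy$"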
        match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
        if match is None:
            self.log.warn('var qTcms_S_m_murl_e does not exist.')
            return imgList
        res = match.group()

        list_encoded = res.split('\"')[1]
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if not images:
            self.log.warn('image list is empty.')
            return imgList

        for img in images:
            imgb64 = b64encode(img.replace("http://www.baidu1.com/", ""))
            img_url = u'http://new.234us.com:8989/img_new.php?data={}'.format(
                imgb64)
            imgList.append(img_url)

        return imgList
Code Example #7
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host)
        chapterList = []

        url = url.replace("http://www.dm5.com", "https://www.manhuaren.com")

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".
                format(url, result.status_code, result.content))
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, "html.parser")

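        # chapter links look like /m<chapter_id>/; collect (id, title) pairs and sort ascending so the oldest chapter comes first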
        chapter_datas = []
        for link in soup.find_all("a", {"class": "chapteritem"}):
            chapter_datas.append({
                "chapter_id":
                int(re.search("m(\d+)", link.get("href")).group(1)),
                "chapter_title":
                unicode(link.string),
            })
        chapter_datas.sort(key=lambda d: d["chapter_id"])
        for chapter in chapter_datas:
            chapter_url = "http://www.manhuaren.com/m{}/".format(
                chapter["chapter_id"])
            chapterList.append((chapter["chapter_title"], chapter_url))
        return chapterList
Code Example #8
    def getChapterList(self, comic_id):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(comic_id)
        result = opener.open(getChapterListUrl)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % getChapterListUrl)
            return chapterList

        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)

        contentJson = json.loads(content)
        count = contentJson.get('length', 0)
        if (count != 0):
            for i in range(count + 1):
                for item in contentJson:
                    if isinstance(contentJson[item], dict) and contentJson[item].get('seq') == i:
                        chapterList.append({item: contentJson[item]})
                        break
        else:
            self.log.warn('comic count is zero.')

        return chapterList
Code Example #9
    def getImgList(self, chapterJson, comic_id):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        cid = list(chapterJson.keys())[0]
        getImgListUrl = 'http://ac.qq.com/ComicView/index/id/{0}/cid/{1}'.format(comic_id, cid)
        result = opener.open(getImgListUrl)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % getImgListUrl)
            return imgList

        content = result.content
        cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding, opener.realurl, result.headers)
        filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
        if len(filter_result) != 0:
            base64data = filter_result[0][1:]
            img_detail_json = json.loads(base64.decodestring(base64data))
            for img_url in img_detail_json.get('picture', []):
                if ( 'url' in img_url ):
                    imgList.append(img_url['url'])
                else:
                    self.log.warn('no url in img_url:%s' % img_url)

        return imgList
Code Example #10
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith("http://"):
            url = url.replace('http://', 'https://')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        allComicTable = soup.find_all('table', {
            'width': '800',
            'align': 'center'
        })
        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            for volume in comicVolumes:
                href = self.urljoin(self.host, volume.get('href'))
                chapterList.append(href)

        return chapterList
Code Example #11
File: dmzjbase.py Project: SPPIDER/kindleEar-new
    def get_chapter_list_from_mobile_url(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(addreferer=False, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn("fetch comic page failed: %s" % result.status_code)
            return []

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        if "obj_id" not in content:
            self.log.warn(u"Can't find obj_id from {}".format(url))
            return []

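        # the mobile page inlines its chapter groups through initIntroData([...]); if that script block is missing, fall back to the JSON API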
        comic_id = re.search('obj_id = "(\d+)"', content).group(1)
        data_match = re.search("initIntroData\(([^;]+)\);", content)
        if not data_match:
            return self.get_chapter_list_from_api(comic_id)
        datas = json.loads(data_match.group(1))
        chapter_datas = []
        for data in datas:
            chapter_datas += data["data"]
        if not chapter_datas:
            return self.get_chapter_list_from_api(comic_id)
        chapter_datas.sort(key=lambda d: d["id"])
        chapters = []
        for chapter in chapter_datas:
            chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                chapter_id=chapter["id"], comic_id=comic_id)
            chapters.append((chapter["chapter_name"], chapter_url))
        return chapters
Code Example #12
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(url)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn(
                "fetch comic page failed: {} (status code {}, content {})".
                format(url, result.status_code, result.content))
            return []

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, "html.parser")
        scripts = soup.findAll("script", {"type": "text/javascript"})
        packed_js = None
        for script in scripts:
            if "newImgs" in script.text:
                packed_js = script.text
                break
        if not packed_js:
            self.log.warn("Can't find js")
            return []
        codes = decode_packed_codes(packed_js)
        return re.findall("'(.+?)'", codes)
Code Example #13
    def getImgUrl(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.page_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
        if (comicImgTag is None):
            self.log.warn('can not find image href.')
            return None
        imgUrl = self.host + "/comic/" + comicImgTag.get('src')

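        # request the image with the chapter page as Referer so the site redirects to the real image; opener.realurl then holds that final address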
        headers = {'Referer': url}
        result = opener.open(imgUrl, headers=headers)
        if result.status_code != 200 or opener.realurl == url:
            self.log.warn('can not get real comic url for : %s' % url)
            return None

        return opener.realurl
Code Example #14
    def getImgUrlList(self, url):
        imgUrlList = []
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.page_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        sel = soup.find('select')  # the page-number dropdown; every page is listed as an <option>
        if sel is None:
            self.log.warn('page select element does not exist.')
            return None

        ulist = sel.find_all('option') if sel else None
        if not ulist:
            self.log.warn('select options do not exist.')
            return None

        for ul in ulist:
            value = ul.get('value')
            if value is None:  # skip placeholder options without a value (avoid mutating the list while iterating)
                continue
            imgUrlList.append(self.host + '/comic/' + value)

        return imgUrlList
Code Example #15
File: tohomhbase.py Project: SPPIDER/kindleEar-new
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        url = url.replace("https://m.tohomh123.com",
                          "https://www.tohomh123.com")

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        soup = soup.find("ul", {"id": 'detail-list-select-2'})
        if not soup:
            self.log.warn('chapterList does not exist.')
            return chapterList

        lias = soup.findAll('a')
        if not lias:
            self.log.warn('chapterList href does not exist.')
            return chapterList

        for a in lias:
            href = "https://www.tohomh123.com" + a.get("href")
            chapterList.append((unicode(a.contents[0]), href))

        return chapterList
Code Example #16
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        if url.startswith( "https://www.manhuagui.com" ):
            url = url.replace('https://www.manhuagui.com', 'https://m.manhuagui.com')

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'html.parser')
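        # the chapter list may be hidden in an LZString-compressed, base64-encoded __VIEWSTATE input; otherwise it sits in the visible #chapterList div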
        invisible_input = soup.find("input", {"id":'__VIEWSTATE'})
        if invisible_input:
            lz_encoded=invisible_input.get("value")
            lz_decoded = decompressFromBase64(lz_encoded)
            soup = BeautifulSoup(lz_decoded, 'html.parser')
        else:
            soup = soup.find("div", {"class": 'chapter-list', "id":'chapterList'})

        lias = soup.findAll('a')
        for aindex in range(len(lias)):
            rindex = len(lias)-1-aindex
            href = "https://m.manhuagui.com" + lias[rindex].get("href")
            chapterList.append(href)

        return chapterList
Code Example #17
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        scripts = soup.findAll("script", {"type": "text/javascript"})
        raw_content = None
        for script in scripts:
            if script.text != "":
                raw_content = script.text
                break
        if raw_content is None:
            self.log.warn('packed script is not found.')
            return imgList

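        # window["\x65\x76\x61\x6c"] is just window["eval"]: extract the packed script, replace its
        # base64/LZString dictionary with the decompressed split('|') form, evaluate it via node,
        # and read the page options out of the SMH.reader({...}).preInit() call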
        res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
        lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
        lz_decoded = decompressFromBase64(lz_encoded)
        res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", "'%s'.split('|')"%(lz_decoded), res)
        codes = self.get_node_online(res)
        pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

        cid = self.getChapterId(url)
        md5 = pages_opts["sl"]["md5"]
        images = pages_opts["images"]
        for img in images:
            img_url = u'https://i.hamreus.com{}?cid={}&md5={}'.format(img, cid, md5)
            imgList.append(img_url)

        return imgList
Code Example #18
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        match = re.search(r'qTcms_S_m_murl_e="(.*)";', content)
        if match is None:
            self.log.warn(content)
            self.log.warn('var qTcms_S_m_murl_e does not exist.')
            return imgList
        res = match.group()

        list_encoded = res.split('\"')[1]
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if not images:
            self.log.warn('image list is empty.')
            return imgList

        for img in images:
            imgList.append(img)

        return imgList
Code Example #19
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

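        # the image proxy needs a pseudo timestamp (tid) plus the comic id (cid) and page id (pid) taken from a /mh/<cid>/<pid>.html style path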
        urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
        if ((u"mh" in urlpaths)
                and (urlpaths.index(u"mh") + 2 < len(urlpaths))):
            tid = str(time.time()).replace(".", "1")
            if len(tid) == 12:
                tid = tid + "1"
            cid = urlpaths[urlpaths.index(u"mh") + 1]
            pid = urlpaths[urlpaths.index(u"mh") + 2].replace(".html", "")
        else:
            self.log.warn('Can not get cid and pid from URL: {}.'.format(url))
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        match = re.search(r'var qTcms_S_m_murl_e=".*";', content)
        if match is None:
            self.log.warn('var qTcms_S_m_murl_e does not exist.')
            return imgList
        res = match.group()

        list_encoded = res.split('\"')[1]
        lz_decoded = b64decode(list_encoded)
        images = lz_decoded.split("$qingtiandy$")

        if not images:
            self.log.warn('image list is empty.')
            return imgList

        for img in images:
            if "http://www.baidu1.com/" in img:
                b64str = img.replace("http://www.baidu1.com/",
                                     "") + '|{}|{}|{}|pc'.format(
                                         tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(
                    imgb64)
            elif "http://ac.tc.qq.com/" in img:
                b64str = img + '|{}|{}|{}|pc'.format(tid, cid, pid)
                imgb64 = b64encode(b64str)
                img_url = u'http://img_733.234us.com/newfile.php?data={}'.format(
                    imgb64)
            else:
                img_url = img

            self.log.info('The image href is: %s' % img_url)
            imgList.append(img_url)

        return imgList
Code Example #20
    def GetNewComic(self):
        urls = []

        if not self.feeds:
            return []

        userName = self.UserName()
        decoder = AutoDecoder(isfeed=False)
        for item in self.feeds:
            title, url = item[0], item[1]

            # NOTE: part of this query was masked in the source; the filter arguments and the
            # if/log branch are reconstructed from context (look up the last delivered record
            # for this user and comic title)
            lastCount = LastDelivered.all().filter('username = ', userName).filter('bookname = ', title).get()
            if lastCount is None:
                self.log.info('There is no log in db LastDelivered for name: %s, set to 0' % title)
                oldNum = 0
            else:
                oldNum = lastCount.num

            opener = URLOpener(self.host, timeout=60)
            result = opener.open(url)
            if result.status_code != 200:
                self.log.warn(
                    'fetch index page for %s failed[%s] : %s' %
                    (title, URLOpener.CodeMap(result.status_code), url))
                continue
            content = result.content
            content = self.AutoDecodeContent(content, decoder,
                                             self.feed_encoding,
                                             opener.realurl, result.headers)

            soup = BeautifulSoup(content, 'lxml')

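            # each volume link's text carries its volume number; deliver the first volume newer than the last delivered one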
            allComicTable = soup.find_all('table', {'width': '688'})
            addedForThisComic = False
            for comicTable in allComicTable:
                comicVolumes = comicTable.find_all('a', {'target': '_blank'})
                for volume in comicVolumes:
                    texts = volume.text.split(' ')
                    if len(texts) > 2 and texts[1].isdigit() and volume.get(
                            'href'):
                        num = int(texts[1])
                        if num > oldNum:
                            oldNum = num
                            href = self.urljoin(self.host, volume.get('href'))
                            urls.append((title, num, href))
                            addedForThisComic = True
                            break  # deliver only one volume per run (a single volume can already hold many images)

                if addedForThisComic:
                    break

        return urls
Code Example #21
    def getChapterList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        chapterList = []

        comic_id = ''
        urlpaths = urlparse.urlsplit(url.lower()).path.split("/")
        if ((u"id" in urlpaths)
                and (urlpaths.index(u"id") + 1 < len(urlpaths))):
            comic_id = urlpaths[urlpaths.index(u"id") + 1]

        if ((not comic_id.isdigit()) or (comic_id == "")):
            self.log.warn('can not get comic id: %s' % url)
            return chapterList

        url = 'https://m.ac.qq.com/comic/chapterList/id/{}'.format(comic_id)
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        soup = BeautifulSoup(content, 'html.parser')
        # <section class="chapter-list-box list-expanded" data-vip-free="1">
        section = soup.find('section',
                            {'class': 'chapter-list-box list-expanded'})
        if (section is None):
            self.log.warn('chapter-list-box does not exist.')
            return chapterList

        # <ul class="chapter-list normal">
        # <ul class="chapter-list reverse">
        reverse_list = section.find('ul', {'class': 'chapter-list reverse'})
        if (reverse_list is None):
            self.log.warn('chapter-list does not exist.')
            return chapterList

        for item in reverse_list.find_all('a'):
            # <a class="chapter-link lock" data-cid="447" data-seq="360" href="/chapter/index/id/531490/cid/447">360</a>
            # https://m.ac.qq.com/chapter/index/id/511915/cid/1
            href = 'https://m.ac.qq.com' + item.get('href')
            isVip = "lock" in item.get('class')
            if isVip == True:
                self.log.info(
                    "Chapter {} is Vip, waiting for free.".format(href))
                continue

            chapterList.append((item.get_text(), href))

        return chapterList
Code Example #22
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        i = 0
        for feed in self.feeds:
            feedtitle, url = feed[0], feed[1]
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code != 200 or not result.content:
                self.log.warn('fetch webpage failed(%d):%s.' %
                              (result.status_code, url))
                continue

            if self.feed_encoding:
                try:
                    content = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    content = AutoDecoder(False).decode(
                        result.content, opener.realurl, result.headers)
            else:
                content = AutoDecoder(False).decode(result.content,
                                                    opener.realurl,
                                                    result.headers)

            soup = BeautifulSoup(content, 'lxml')
            for article in soup.findAll('div', {"class": "text"}):
                if article.find("h2") and article.find("a"):
                    title = article.a.contents[0].strip()
                    if not title:
                        continue
                    href = self.url_prefix + article.a['href']

                    urls.append((feedtitle, title, href, None))
                    if i > 3:
                        break
                    else:
                        i = i + 1

        return urls
Code Example #23
    def ParseFeedUrls(self):
        urls = []  # the list to be returned

        newComicUrls = self.GetNewComic()  # returns [(title, num, url), ...]
        if not newComicUrls:
            return []

        decoder = AutoDecoder(isfeed=False)
        for title, num, url in newComicUrls:
            opener = URLOpener(self.host, timeout=60)
            result = opener.open(url)
            if result.status_code != 200 or not result.content:
                self.log.warn('fetch comic page failed: %s' % url)
                continue

            content = result.content
            content = self.AutoDecodeContent(content, decoder,
                                             self.page_encoding,
                                             opener.realurl, result.headers)

            bodySoup = BeautifulSoup(content, 'lxml')
            sel = bodySoup.find('select')  # the page-number dropdown; every page must be extracted
            ul = sel.find_all('option') if sel else None
            if not ul:
                continue

            for comicPage in ul:
                href = comicPage.get('value')
                if href:
                    pageHref = self.urljoin(url, href)
                    result = opener.open(pageHref)
                    if result.status_code != 200:
                        self.log.warn('fetch comic page failed: %s' % pageHref)
                        continue

                    content = result.content
                    content = self.AutoDecodeContent(content, decoder,
                                                     self.page_encoding,
                                                     opener.realurl,
                                                     result.headers)
                    soup = BeautifulSoup(content, 'lxml')

                    comicImgTag = soup.find('img',
                                            {'oncontextmenu': 'return false'})
                    comicSrc = comicImgTag.get('src') if comicImgTag else None
                    if comicSrc:
                        urls.append((title, comicPage.text, comicSrc, None))

            self.UpdateLastDelivered(title, num)

        return urls
Code Example #24
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = result.content
        cid_page = self.AutoDecodeContent(content, decoder, self.page_encoding,
                                          opener.realurl, result.headers)
        filter_result = re.findall(r"data\s*:\s*'(.+?)'", cid_page)
        # "picture": [{},...{}]}
        if len(filter_result) != 0:
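            # the string "picture" can land at any of three base64 alignments, so probe each encoded form to find where the picture array begins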
            # "picture" > InBpY3R1cmUi
            # picture": > cGljdHVyZSI6
            # icture":[ > aWN0dXJlIjpb
            if "InBpY3R1cmUi" in filter_result[0]:
                base64data = filter_result[0].split("InBpY3R1cmUi")[1]
                self.log.warn('found flag string: %s' % "InBpY3R1cmUi")
            elif "cGljdHVyZSI6" in filter_result[0]:
                base64data = filter_result[0].split("cGljdHVyZSI6")[1]
                self.log.warn('found flag string: %s' % "cGljdHVyZSI6")
            elif "aWN0dXJlIjpb" in filter_result[0]:
                base64data = filter_result[0].split("aWN0dXJl")[1]
                self.log.warn('found flag string: %s' % "aWN0dXJlIjpb")
            else:
                self.log.warn('can not found flag string in data: %s' %
                              filter_result[0])
                return imgList
            decodeData = base64.decodestring(base64data)
            startIndex = decodeData.find('[')
            endIndex = decodeData.find(']')

            if startIndex > -1 and endIndex > -1:
                img_detail_json = json.loads(decodeData[startIndex:endIndex +
                                                        1])
                for img_url in img_detail_json:
                    if ('url' in img_url):
                        imgList.append(img_url['url'])
                    else:
                        self.log.warn('no url in img_url:%s' % img_url)
            else:
                self.log.warn('can not found [] in decodeData:%s' % decodeData)
        else:
            self.log.warn('can not find filter_result in the data.')

        return imgList
Code Example #25
    def getImgUrl(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)

        url = self.host + "/comic/" + url
        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return None

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        comicImgTag = soup.find('img', {'oncontextmenu': 'return false'})
        return comicImgTag.get('src') if comicImgTag else None
Code Example #26
File: dmzjbase.py Project: SPPIDER/kindleEar-new
    def getChapterList(self, url):
        if url.startswith("https://m.dmzj.com"):
            return self.get_chapter_list_from_mobile_url(url)
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(addreferer=False, timeout=60)
        chapterList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn("fetch comic page failed: %s" % result.status_code)
            return chapterList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        comic_id = re.search('g_comic_id = "([^"]+)', content).group(1)

        # try to get the chapters from the html first
        soup = BeautifulSoup(content, "html.parser")
        chapter_datas = []
        for comic_classname in [
                "cartoon_online_border", "cartoon_online_border_other"
        ]:
            divs = soup.find_all("div", attrs={"class": comic_classname})
            if not divs:
                continue
            for div in divs:
                for link in div.find_all("a"):
                    chapter_datas.append({
                        "chapter_id":
                        int(
                            re.search("\/(\d+)\.shtml",
                                      link.get("href")).group(1)),
                        "chapter_title":
                        unicode(link.string),
                    })
        if chapter_datas:
            chapter_datas.sort(key=lambda d: d["chapter_id"])
            for chapter in chapter_datas:
                chapter_url = "https://m.dmzj.com/view/{comic_id}/{chapter_id}.html".format(
                    chapter_id=chapter["chapter_id"], comic_id=comic_id)
                chapterList.append((chapter["chapter_title"], chapter_url))
            return chapterList
        else:
            return self.get_chapter_list_from_api(comic_id)
Code Example #27
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        #var chapterPath = "images/comic/31/61188/";
        chapterPath = re.search(
            r'(var chapterPath = ")(.*)(";var chapterPrice)', content)
        if (chapterPath is None):
            self.log.warn('var chapterPath does not exist.')
            return imgList
        else:
            chapterPath = chapterPath.group(2)

        #var pageImage = "https://res.gufengmh.com/gufeng/images/";
        imgPrefix = re.search(r'(var pageImage = ")(.*)(gufeng/images/)',
                              content)
        if (imgPrefix is None):
            self.log.warn(
                'var pageImage prefix "https://res.gufengmh.com/gufeng/images/" does not exist.')
            return imgList
        else:
            imgPrefix = imgPrefix.group(2) + "/"

        #var chapterImages = ["",""];
        images = re.search(r'(var chapterImages = \[)(.*)(\];)', content)
        if (images is None):
            self.log.warn('var chapterImages does not exist.')
            return imgList
        else:
            images = images.group(2).split(',')

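        # final image URL = pageImage CDN prefix + chapterPath + the bare filename from chapterImages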
        for img in images:
            img_url = imgPrefix + chapterPath + img.replace("\"", "")
            imgList.append(img_url)

        return imgList
Code Example #28
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.page_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        sel = soup.find('select')  # the page-number dropdown; every page is listed as an <option>
        if sel is None:
            self.log.warn('page select element does not exist.')
            return imgList

        ulist = sel.find_all('option') if sel else None
        if not ulist:
            self.log.warn('select options do not exist.')
            return imgList

        # drop placeholder options that have no value (avoid mutating the list while iterating over it)
        ulist = [ul for ul in ulist if ul.get('value') is not None]

        listLen = len(ulist)
        firstPageTag = soup.find('img', {'oncontextmenu': 'return false'})
        firstPage = firstPageTag.get('src') if firstPageTag else None

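        # guess sequential image URLs from the first page's filename (base, zero-padded index, extension via getImgStr), then verify the first and last; fall back to fetching every option page when the guess does not hold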
        if firstPage != None:
            firstPage = "https://www.cartoonmad.com/{}".format(firstPage)
            base, length, type = self.getImgStr(firstPage)
            for index in range(len(ulist)):
                imgUrl = "{}{}.{}".format(base, str(index+1).zfill(length), type)
                imgList.append(imgUrl)

        if imgList and imgList[0] == firstPage and imgList[listLen-1] == self.getImgUrl(ulist[listLen-1].get('value')):
            return imgList
        else:
            imgList = []
            for ul in ulist:
                imgList.append("https://www.cartoonmad.com/{}".format(self.getImgUrl(ul.get('value'))))
            return imgList

        return imgList
Code Example #29
File: Novelbase.py Project: dragonhors/KindleEar
    def ParseFeedUrls(self):
        urls = []
        userName = self.UserName()
        decoder = AutoDecoder(isfeed=False)

        # NOTE: part of this query was masked in the source; the filter arguments and the
        # if-branch defaults are reconstructed from context (look up the last delivered
        # chapter for this user and book)
        lastCount = LastDelivered.all().filter('username = ', userName).filter('bookname = ', self.title).get()
        if lastCount is None:
            oldNum = 0
            oldChapterTitle = ''
        else:
            oldNum = lastCount.num
            oldChapterTitle = lastCount.record

        opener = URLOpener(self.host, timeout=60)
        result = opener.open(self.feeds)
        if result.status_code != 200:
            self.log.warn('fetch index page for %s failed[%s] : %s' %
                          (self.title, URLOpener.CodeMap(
                              result.status_code), self.feeds))
            return []

        # get the chapter list from the index page
        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, 'lxml')
        chapterList = self.GetChapterList(soup)

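        # deliver only chapters whose number is greater than the last recorded one, at most self.limit per run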
        chapterNum = 0
        for chapter in chapterList:
            if chapterNum >= self.limit:
                break
            url = chapter.get('href')
            num = self.GetChapterNum(url)
            if num > oldNum:
                oldNum = num
                oldChapterTitle = chapter.text
                chapterNum += 1
                urls.append(
                    (self.title, oldChapterTitle, self.urljoin(self.host,
                                                               url), ''))

        self.UpdateLastDelivered(self.title, oldNum, oldChapterTitle)
        return urls
Code Example #30
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding, opener.realurl, result.headers)
        soup = BeautifulSoup(content, 'html.parser')
        scripts = soup.findAll("script", {"type": "text/javascript"})
        raw_content = None
        for script in scripts:
            if "window[\"\\x65\\x76\\x61\\x6c\"]" in script.text:
                raw_content = script.text
                break

        if (raw_content is None):
            self.log.warn('raw_content does not exist.')
            return imgList

        res = re.search(r'window\["\\x65\\x76\\x61\\x6c"\](.*\))', raw_content).group(1)
        lz_encoded = re.search(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", res).group(1)
        lz_decoded = decompressFromBase64(lz_encoded)
        res = re.sub(r"'([A-Za-z0-9+/=]+)'\['\\x73\\x70\\x6c\\x69\\x63'\]\('\\x7c'\)", "'%s'.split('|')"%(lz_decoded), res)
        codes = self.get_node_online(res)
        pages_opts = json.loads(re.search(r'^SMH.reader\((.*)\)\.preInit\(\);$', codes).group(1))

        # cid = self.getChapterId(url)
        m = pages_opts["sl"]["m"]
        e = pages_opts["sl"]["e"]
        images = pages_opts["images"]

        if not images:
            self.log.warn('image list is empty.')
            return imgList

        for img in images:
            # https://i.hamreus.com/ps3/p/pingxingtt_gbl/%E7%AC%AC117%E8%AF%9D/1_7684.jpg.webp?e=1769209619&m=MOn_QAAi-qwQBaRjlmNYkA
            img_url = u'https://i.hamreus.com{}?e={}&m={}'.format(img, e, m)
            imgList.append(img_url)

        return imgList
Code Example #31
File: pufeibase.py Project: woshialanyh/kindleyang
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)

        try:
            # function base64decode(str){*};
            func = re.search(r'function\ base64decode\(str\){.*};',
                             content).group()
            func = func.split('base64decode')[1].replace('};', '}')

            # packed="*";
            packed = re.search(r'packed=".*";', content).group()
            packed = packed.split('\"')[1]
        except:
            self.log.warn('var photosr does not exist.')
            return imgList

        # eval(function(str){*}("*").slice(4))
        lz_input = "eval(function{}(\"{}\").slice(4))".format(func, packed)
        lz_nodejs = self.get_node_online(lz_input)

        if (lz_nodejs is None):
            self.log.warn('image list does not exist.')
            return imgList

        # photosr[1]="images/2019/11/08/09/19904f5d64.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
        images = lz_nodejs.split("\"")
        # http://res.img.220012.net/2017/08/22/13/343135d67f.jpg
        for img in images:
            if ".jpg" in img:
                img_url = self.urljoin("http://res.img.220012.net", img)
                imgList.append(img_url)

        return imgList
Code Example #32
    def getImgList(self, url):
        decoder = AutoDecoder(isfeed=False)
        opener = URLOpener(self.host, timeout=60)
        imgList = []

        result = opener.open(url)
        if result.status_code != 200 or not result.content:
            self.log.warn('fetch comic page failed: %s' % url)
            return imgList

        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, 'html.parser')

        try:
            func = re.search(r'function\ base64decode\(str\){.*};',
                             content).group()
            packed = re.search(r'packed=".*";', content).group()
        except:
            self.log.warn('var photosr does not exist in {}.'.format(url))
            return imgList

        # eval(function(str){*}("*").slice(4))
        lz_input = "{}var photosr = new Array();{}console.log(eval(base64decode(packed).slice(4)));".format(
            func, packed)
        lz_nodejs = self.get_node_online(lz_input)
        if (lz_nodejs is None):
            self.log.warn('image list does not exist.')
            return imgList

        images = lz_nodejs.split("\"")
        self.log.info(images)
        for img in images:
            # photosr[1]="images/2020/05/03/17/516bbfddb4.jpg/0";...photosr[98]="images/2019/11/08/09/22abc96bd2.jpg/0";
            # http://res.img.fffimage.com/images/2020/05/03/17/516bbfddb4.jpg/0

            # photosr[1]="images/2020/04/21/09/3706a024c8.png/0";...photosr[12]="images/2020/04/21/09/3732355905.png/0";
            # http://res.img.fffimage.com/images/2020/04/21/09/3706a024c8.png/0
            if ".jpg" in img or ".png" in img:
                img_url = self.urljoin("http://res.img.fffimage.com/", img)
                imgList.append(img_url)

        return imgList
Code Example #33
File: weixinbase.py Project: mikezhouhan/KindleEar
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1]
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout + 10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)

            id = urlparse.urlparse(url).query.split("=")[1]

            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
            else:
                self.log.warn("fetch rss failed(%d):%s" % (result.status_code, url))
                continue

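            # process_eqs and WEIXIN_URL are assumed to be module-level helpers: the first pulls the
            # eqs/ekv tokens out of the fetched page, the second rebuilds the JSON feed URL from them
            # plus a millisecond timestamp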
            eqs, ekv = process_eqs(content)
            url = WEIXIN_URL.format(id=id, eqs=urllib.quote(eqs), ekv=ekv, t=int(time.time() * 1000))

            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers)
                content = content[content.find("{") : content.rfind("}") + 1]
                try:
                    content = json.loads(content)
                except ValueError:
                    continue

                for e in content["items"][: self.max_articles_per_feed]:
                    e = feedparser.parse(e)["entries"][0]
                    updated = None
                    if hasattr(e, "lastmodified") and e.lastmodified:
                        updated = float(e.lastmodified)

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime.utcfromtimestamp(updated)
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article  # value is in seconds
                        else:
                            threshold = 86400 * self.oldest_article  # value is in days

                        if delta.days * 86400 + delta.seconds > threshold:
                            self.log.info("Skip old article(%s): %s" % (updated.strftime("%Y-%m-%d %H:%M:%S"), e.href))
                            continue

                    # support HTTPS
                    if hasattr(e, "href"):
                        if url.startswith("https://"):
                            urlfeed = e.href.replace("http://", "https://")
                        else:
                            urlfeed = e.href

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ""

                    desc = None
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn("fetch rss failed(%d):%s" % (result.status_code, url))

        return urls
Code Example #34
File: base.py Project: binbin/KindleEar
    def Items(self, opts=None, user=None):
        """
        Generator yielding tuples:
        for HTML: (section, url, title, content, brief, thumbnail)
        for images: (mime, url, filename, content, brief, thumbnail)
        for images, only the first image's thumbnail is True; the rest are None
        """
        decoder = AutoDecoder(False)
        timeout = self.timeout
        for section, url in self.feeds:
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            status_code, content = result.status_code, result.content
            if status_code != 200 or not content:
                self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
                continue

            if self.page_encoding:
                try:
                    content = content.decode(self.page_encoding)
                except UnicodeDecodeError:
                    content = decoder.decode(content,opener.realurl,result.headers)
            else:
                content = decoder.decode(content,opener.realurl,result.headers)

            content =  self.preprocess(content)
            soup = BeautifulSoup(content, "lxml")

            head = soup.find('head')
            if not head:
                head = soup.new_tag('head')
                soup.html.insert(0, head)
            if not head.find('title'):
                t = soup.new_tag('title')
                t.string = section
                head.append(t)
                
            try:
                title = soup.html.head.title.string
            except AttributeError:
                title = section
                #self.log.warn('object soup invalid!(%s)'%url)
                #continue

            title = self.processtitle(title)

            if self.keep_only_tags:
                body = soup.new_tag('body')
                try:
                    if isinstance(self.keep_only_tags, dict):
                        keep_only_tags = [self.keep_only_tags]
                    else:
                        keep_only_tags = self.keep_only_tags
                    for spec in keep_only_tags:
                        for tag in soup.find('body').find_all(**spec):
                            body.insert(len(body.contents), tag)
                    soup.find('body').replace_with(body)
                except AttributeError: # soup has no body element
                    pass

            for spec in self.remove_tags_after:
                tag = soup.find(**spec)
                remove_beyond(tag, 'next_sibling')

            for spec in self.remove_tags_before:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previous_sibling')

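            # apply the recipe's built-in and user-defined removal rules: whole tags, then ids, classes, and attributes, plus HTML comments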
            remove_tags = self.insta_remove_tags + self.remove_tags
            remove_ids = self.insta_remove_ids + self.remove_ids
            remove_classes = self.insta_remove_classes + self.remove_classes
            remove_attrs = self.insta_remove_attrs + self.remove_attrs
            for tag in soup.find_all(remove_tags):
                tag.decompose()
            for id in remove_ids:
                for tag in soup.find_all(attrs={"id":id}):
                    tag.decompose()
            for cls in remove_classes:
                for tag in soup.find_all(attrs={"class":cls}):
                    tag.decompose()
            for attr in remove_attrs:
                for tag in soup.find_all(attrs={attr:True}):
                    del tag[attr]
            for cmt in soup.find_all(text=lambda text:isinstance(text, Comment)):
                cmt.extract()

            # strip all attributes from <body> so InsertToc can match <body> with a regular expression
            body = soup.html.body
            bodyattrs = [attr for attr in body.attrs]
            for attr in bodyattrs:
                del body[attr]

            if self.extra_css:
                sty = soup.new_tag('style', type="text/css")
                sty.string = self.extra_css
                soup.html.head.append(sty)

            has_imgs = False
            thumbnail = None
            if self.keep_image:
                self.soupbeforeimage(soup)
                for img in soup.find_all('img'):
                    # more and more sites lazy-load their images, so handle that here
                    # note: if the data-src-like attribute does not hold the real URL, nothing can be done
                    imgurl = img['src'] if 'src' in img.attrs else ''
                    if not imgurl:
                        for attr in img.attrs:
                            if attr != 'src' and 'src' in attr: # many sites use data-src
                                imgurl = img[attr]
                                break
                    if not imgurl:
                        img.decompose()
                        continue
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered:%s' % imgurl)
                        img.decompose()
                        continue
                    imgresult = opener.open(imgurl)
                    imgcontent = self.process_image(imgresult.content,opts) if imgresult.status_code==200 else None
                    if imgcontent:
                        if len(imgcontent) < self.img_min_size: # rexdf: skip images that are too small
                            img.decompose()
                            continue

                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype=='jpeg' else imgtype)
                            img['src'] = fnimg

                            # use the first image as the TOC thumbnail
                            if not has_imgs:
                                has_imgs = True
                                thumbnail = imgurl
                                yield (imgmime, imgurl, fnimg, imgcontent, None, True)
                            else:
                                yield (imgmime, imgurl, fnimg, imgcontent, None, None)
                        else:
                            img.decompose()
                    else:
                        self.log.warn('fetch img failed(err:%d):%s' % (imgresult.status_code,imgurl))
                        img.decompose()

                # unwrap links that directly wrap an image
                for img in soup.find_all('img'):
                    if img.parent and img.parent.parent and \
                        img.parent.name == 'a':
                        img.parent.replace_with(img)

            else:
                for img in soup.find_all('img'):
                    img.decompose()

            self.soupprocessex(soup)
            content = unicode(soup)

            # take the beginning of the article text as the brief/summary
            brief = u''
            if GENERATE_TOC_DESC:
                for h in body.find_all(['h1','h2']): # drop h1/h2 to avoid repeating the title
                    h.decompose()
                for s in body.stripped_strings:
                    brief += unicode(s) + u' '
                    if len(brief) >= TOC_DESC_WORD_LIMIT:
                        brief = brief[:TOC_DESC_WORD_LIMIT]
                        break
            soup = None

            content =  self.postprocess(content)
            yield (section, url, title, content, brief, thumbnail)