Beispiel #1
0
    def get_sin(self, l):
        """Download one chapter and write it to ``z<chapter number>.txt``.

        The page at ``l`` is fetched (retrying while the server answers with a
        "502 Bad Gateway" page), obfuscated-font glyphs are mapped back to real
        characters when a lookup table is available, and the title, body and
        author's note are written to a text file in the current working
        directory.

        Args:
            l: Chapter URL; must be an element of ``self.href_list``. The text
                after its second ``=`` is used as the chapter number.

        Side effects:
            * May download and cache a glyph table under ``<self.path>/Fonts``.
            * May append the font name to ``self.fontlist``.
            * Appends the zero-padded chapter number to ``self.failInfo`` when
              no body text was found.
            * Increments ``self.percent`` once the chapter file is written.
        """
        titleOrigin = l.split('=')
        chapter_no = titleOrigin[2]
        i = self.href_list.index(l)

        def unescape(text):
            # Undo the three HTML entities the chapter list escapes.
            return text.replace('&amp;', '&').replace('&lt;', '<').replace(
                '&gt;', '>')

        # Fetch the page. NOTE(review): this retries without an upper bound
        # for as long as the server keeps answering with a 502 page.
        while True:
            cont = requests.get(l, headers=self.headerss)
            dot = etree.HTML(
                cont.content.decode('gb18030',
                                    "ignore").encode("utf-8").decode('utf-8'))
            codetext = etree.tostring(dot, encoding="utf-8").decode()
            if '<h1>502 Bad Gateway</h1>' not in codetext:
                break
            time.sleep(1)

        # Anti-scraping font: the page references a per-request web font whose
        # glyphs stand in for real characters.
        fontfamily = ''
        glyph_map = {}
        fontsrc = re.findall(
            r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
        if fontsrc:
            fontsrc = "http:" + fontsrc[0]
            fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '',
                              fontsrc)
            fontname = re.sub('.h=my.jjwxc.net', '', fontname)
            fontfamily = re.sub('.woff2', '', fontname)
            table_path = self.path + "/Fonts/" + fontfamily + ".txt"
            if not os.path.exists(table_path):
                # No cached table yet: fetch the JSON mapping and store it as
                # "&#x<hex>;-<char>" lines.
                r = requests.get('http://jjwxc.yooooo.us/' + fontfamily +
                                 '.json')
                fonttxt = re.sub('{"status": 0, "data": ', '', r.text)
                fonttxt = re.sub('}}', '}', fonttxt)
                cdic = json.loads(fonttxt)
                os.makedirs(self.path + "/Fonts", exist_ok=True)
                with open(table_path, "w", encoding='utf-8') as f:
                    f.write(''.join('&#x' + s + ';-' + v + '\n'
                                    for s, v in cdic.items()))
            # Reading the table is best effort: without it the text is written
            # undecoded and the font is only recorded in self.fontlist.
            table_lines = []
            try:
                with open(table_path, "r", encoding='utf-8') as f:
                    table_lines = f.readlines()
            except (OSError, ValueError):
                table_lines = []
            for entry in table_lines:
                # Split on the first '-' only so a mapped '-' survives; lines
                # without a separator are skipped instead of breaking the map.
                code, sep, char = entry.partition('-')
                if sep:
                    glyph_map[code] = char
            if table_lines:
                fontfamily += '_c'
            elif fontfamily not in self.fontlist:
                self.fontlist.append(fontfamily)

        # tex: body text
        tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
        # tex1: author's note
        tex1 = dot.xpath("//div[@class='readsmall']/text()")
        # sign: tells whether the author's note precedes the body
        sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

        # Build the Chinese-variant converter once per chapter, not per line.
        if self.state == 's':
            convert = OpenCC('t2s').convert
        elif self.state == 't':
            convert = OpenCC('s2t').convert
        else:
            convert = None

        title = ''
        # Zero-padded chapter number
        if self.titleInfo[0] == '1':
            title = str(chapter_no).zfill(self.fillNum) + "#"
        # Chapter name
        if self.titleInfo[1] == '1':
            title = title + " " + self.titleindex[i].strip()
        # Chapter summary
        if self.titleInfo[2] == '1':
            title = title + " " + self.Summary[i].strip()
        title = unescape(title)
        if convert is not None:
            title = convert(title)

        # Volume heading, when this chapter opens a volume.
        has_roll = self.href_list[i] in self.rollSignPlace
        roll_title = ''
        if has_roll:
            roll_title = self.rollSign[self.rollSignPlace.index(l)]
            if convert is not None:
                roll_title = convert(roll_title)

        def clean(raw, is_note):
            # Strip the site watermark and spaces, unescape entities and
            # convert the Chinese variant; notes get a break after the label.
            text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
            text = re.sub(' ', '', text)
            text = re.sub(' +', ' ', text).strip()
            text = unescape(text)
            if convert is not None:
                text = convert(text)
            if is_note:
                text = re.sub('作者有话要说:', '作者有话要说:\n', text)
            return text

        def write_lines(out, raws, is_note):
            # Write every non-empty cleaned line.
            for raw in raws:
                text = clean(raw, is_note)
                if text != "":
                    out.write(text + "\n")

        # Create the chapter file; `with` closes it even if writing fails.
        with open("z" + str(chapter_no.zfill(4)) + ".txt",
                  'w',
                  encoding='utf-8') as fo:
            if has_roll:
                # Volume heading followed by the chapter title
                roll_title = unescape(roll_title)
                fo.write("\r\n\r\n" + roll_title.rstrip() + '\r\n')
                print("\r\n" + roll_title + "\r\n")
                fo.write(title + '\r\n')
            else:
                # Chapter title only
                fo.write("\r\n\r\n" + title + "\r\n")

            if len(tex) == 0:
                self.failInfo.append(chapter_no.zfill(self.fillNum))
                fo.write('下载失败!')
            else:
                # Decode the table keys ("&#xNNNN;") into characters once,
                # then replace them literally in every body line.
                glyphs = []
                for code, char in glyph_map.items():
                    if code == '&#x78"/;':
                        continue
                    glyph = re.sub(r'&#x', r'\\u', code)
                    glyph = re.sub(
                        ';', '',
                        glyph).encode('utf-8').decode('unicode_escape')
                    if glyph:
                        glyphs.append((glyph, char.strip()))
                if glyphs:
                    decoded = []
                    for raw in tex:
                        text = str(raw)
                        for glyph, char in glyphs:
                            text = text.replace(glyph, char)
                        decoded.append(text)
                    tex = decoded

                if str(sign) == "['readsmall']":
                    # Author's note comes before the body
                    write_lines(fo, tex1, True)
                    if len(tex1) != 0:
                        fo.write("\n*\r\n")
                    write_lines(fo, tex, False)
                else:
                    # Author's note comes after the body
                    write_lines(fo, tex, False)
                    if len(tex1) != 0:
                        fo.write("\n*\r\n")
                    write_lines(fo, tex1, True)
        self.percent += 1
Beispiel #2
0
    def get_txt(self, txt_id, threadnum):
        intro = ''
        ids = str(txt_id)
        self.percent = 0
        self.index = []
        self.titleindex = []
        self.Summary = []
        self.fillNum = 0
        self.rollSign = []
        self.rollSignPlace = []
        self.state = ''
        self.href_list = []
        self.td = []
        self.failInfo = []
        self.path = ''
        self.textEdit.clear()
        section_ct = 9999

        # 获取文章网址
        req_url = ids
        nid = ids.split('=')[1]
        apireq = 'https://app.jjwxc.net/androidapi/novelbasicinfo?novelId=' + nid
        apivol = 'https://app.jjwxc.net/androidapi/chapterList?novelId=' + nid + '&more=0&whole=1'

        # 通过cookie获取文章信息
        res = requests.get(req_url, headers=self.headerss)
        apires = requests.get(apireq, headers=self.headerss)
        apicont = json.loads(apires.text)
        if "message" in apicont and not "novelIntro" in apicont:
            v = apicont["message"]
            QMessageBox.warning(self, '警告', apicont["message"],
                                QMessageBox.Yes)
            apicont = {
                "message": v,
                "novelId": "",
                "novelName": "",
                "authorId": "",
                "authorName": "",
                "novelClass": "",
                "novelTags": "",
                "novelTagsId": "",
                "novelCover": "",
                "originalCover": "",
                "novelStep": "",
                "novelIntro": "",
                "novelIntroShort": "",
                "isVip": "",
                "isPackage": "",
                "novelSize": "",
                "novelsizeformat": "",
                "novelChapterCount": "",
                "renewDate": "",
                "renewChapterId": "",
                "renewChapterName": "",
                "novelScore": "",
                "islock": "",
                "novelbefavoritedcount": "",
                "novelbefavoritedcountformat": "",
                "type_id": "",
                "age": "",
                "maxChapterId": "",
                "chapterdateNewest": "",
                "local": "",
                "localImg": "",
                "novelStyle": "",
                "series": "",
                "protagonist": "",
                "costar": "",
                "other": "",
                "comment_count": "",
                "nutrition_novel": "",
                "ranking": "",
                "novip_clicks": "",
                "vipChapterid": "",
                "isSign": "",
                "ILTC": "",
                "mainview": "",
                "codeUrl": "",
                "novelReviewScore": "",
                "authorsayrule": "",
                "copystatus": "",
                "yellowcard": []
            }

        else:
            # 获取目录
            rc = requests.get(apivol, headers=self.headerss)
            cdic = json.loads(rc.text)
            cdic = cdic["chapterlist"]
            # 对文案进行编码
            ress = etree.HTML(
                res.content.decode("GB18030",
                                   "ignore").encode("utf-8",
                                                    "ignore").decode('utf-8'))
            res.close()

            # 获取文案
            if self.special.isChecked():
                intro = ress.xpath(
                    "//html/body/table/tr/td[1]/div[2]/div[@id='novelintro']")
            else:
                intro = apicont["novelIntro"]
                intro = re.sub("&lt;br/&gt;", "\n", intro).splitlines()
            # 获取标签
            info = []
            info.append("<b>标签:</b>" + apicont["novelTags"])
            info.append(apicont["protagonist"])
            info.append(apicont["costar"])
            info.append(apicont["other"])
            info.append("<b>简介:</b>" + apicont["novelIntroShort"])

            infox = []
            infox.append("文章类型:" + apicont["novelClass"])
            infox.append("作品视角:" + apicont["mainview"])
            infox.append("作品风格:" + apicont["novelStyle"])
            infox.append("所属系列:" + apicont["series"])
            if apicont["novelClass"] == "1":
                infox.append("文章进度:连载")
            elif apicont["novelClass"] == "2":
                infox.append("文章进度:完结")
            elif apicont["novelClass"] == "0":
                infox.append("文章进度:暂停")
            infox.append("全文字数:" + apicont["novelSize"] + "字")

            # 获取封面
            cover = apicont["novelCover"]

            if cover != '':
                try:
                    pres = requests.get(cover)
                except Exception:
                    img = "0"
                    self.textEdit.append("【封面下载失败!请检查网络或尝试科学上网。】\n")
                    self.textEdit.moveCursor(self.textEdit.textCursor().End)

                else:
                    img = pres.content
            else:
                img = "0"

            fpi = re.findall(r'static.jjwxc.net/novelimage.php.novelid', cover)
            if fpi:
                img = '0'
            # 获取标题和作者
            xtitle = apicont["novelName"]
            xaut = apicont["authorName"]
            ti = xtitle + '-' + xaut

            if self.state == 's':
                ti = OpenCC('t2s').convert(ti)
            elif self.state == 't':
                ti = OpenCC('s2t').convert(ti)
            self.textEdit.append("网址:" + ids + "\n小说信息:" + str(ti) + "\n")
            self.textEdit.moveCursor(self.textEdit.textCursor().End)
            self.setWindowTitle("正在下载:" + xtitle + '-' + xaut)

            # 获取所有章节网址、标题、内容提要、卷标
            loc = []
            vcount = 0
            for i in cdic:
                if i["chaptertype"] == "1":
                    vcount += 1
                    v = i["chaptername"]
                    v = re.sub('&', '&amp;', v)
                    v = re.sub('<', '&lt;', v)
                    v = re.sub('>', '&gt;', v)
                    if self.format.currentText() == "txt":
                        v = re.sub('</?\w+[^>]*>', '', v).strip()
                    v = "§ " + v + " §"
                    if self.selfvol.isChecked():
                        v = re.sub('\$1', str(vcount), self.voledit.text())
                        v = re.sub('\$2', i["chaptername"], v)
                    self.rollSign.append(v)
                    self.rollSignPlace.append(i["chapterid"])
                else:
                    u = "https://app.jjwxc.net/androidapi/chapterContent?novelId=" + nid + "&chapterId=" + i[
                        "chapterid"]
                    self.href_list.append(u)
                    v = i["chaptername"]
                    v = re.sub('&', '&amp;', v)
                    v = re.sub('&&amp;#', '&#', v)
                    v = re.sub('</?\w+[^>]*>', '', v)
                    if self.format.currentText() == "txt":
                        v = re.sub('</?\w+[^>]*>', '', v)
                    self.titleindex.append(v.strip())
                    v = i["chapterintro"]
                    v = re.sub('&', '&amp;', v)
                    v = re.sub('&&amp;#', '&#', v)
                    if self.format.currentText() == "txt":
                        v = re.sub('</?\w+[^>]*>', '', v)
                    self.Summary.append(v.strip())
                    if i["islock"] != "0":
                        loc.append(i["chapterid"])

            section_ct = len(self.href_list)
            lockinfo = ''

            self.textEdit.append("可下载章节数:" + str(section_ct) + "\n")
            self.textEdit.moveCursor(self.textEdit.textCursor().End)
            if loc != []:
                i = ""
                for x in loc:
                    i = i + x + " "
                self.textEdit.append("被锁章节:" + i + "\n")
                self.textEdit.moveCursor(self.textEdit.textCursor().End)
                if self.format.currentText() == "txt":
                    lockinfo = "被锁章节:" + i + "\n"
                else:
                    lockinfo = "<p><em>被锁章节:" + i + "</em></p>"
                if self.state == 's':
                    lockinfo = OpenCC('t2s').convert(lockinfo)
                elif self.state == 't':
                    lockinfo = OpenCC('s2t').convert(lockinfo)

            # fillNum:填充序号的长度,例如:若全文有1437章,则每章序号有四位,依次为0001、0002……
            self.fillNum = len(str(len(self.href_list)))

            # 对标题进行操作,删除违规字符等
            ti = re.sub('[\/:*?"<>|]', '_', ti)
            ti = re.sub('&', '&amp;', ti)

            xauthref = "http://www.jjwxc.net/oneauthor.php?authorid=" + apicont[
                "authorId"]

            # 若文件名不想加编号,可以将这行删除
            ti = ti + '.' + ids.split('=')[1]
            ti = re.sub('\r', '', ti)

            v = ""
            # 打开小说文件写入小说相关信息
            path = os.getcwd()
            self.path = path
            if os.path.exists(ti):
                os.chdir(ti)
            else:
                os.mkdir(ti)
                os.chdir(ti)
            ppp = os.getcwd()
            for vol in range(len(self.rollSignPlace)):
                self.rollSignPlace[vol] = self.rollSignPlace[vol].strip()
                volt = self.rollSignPlace[vol]
                ros = self.rollSign[vol]
                nm = 'z' + str(int(volt) - 1).zfill(4) + '_vol.xhtml'
                if self.state == 's':
                    ros = OpenCC('t2s').convert(ros)
                elif self.state == 't':
                    rose = OpenCC('s2t').convert(ros)
                with open(nm, 'w', encoding='utf-8') as f:
                    f.write((
                        '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>''' + ros + '''</title><meta charset="utf-8"/>
<link href="sgc-nav.css" rel="stylesheet" type="text/css"/></head>
<body><h1>''' + ros + '''</h1></body></html>'''))
            self.index = []
            # 保存封面图片
            if img != "0" and self.cover.isChecked(
            ) and not self.format.currentText() == "txt":
                with open("p.jpg", 'wb') as pic:
                    pic.write(img)

                # 写入封面
                with open("C.xhtml", 'w', encoding='utf-8') as f:
                    f.write(
                        '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Cover</title></head>
<body><div style="text-align: center; padding: 0pt; margin: 0pt;">
<svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet" version="1.1" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">
<image width="100%" xlink:href="p.jpg"/></svg></div></body></html>''')

            # 写入文章信息页
            if self.format.currentText() == "txt":
                TOC = xtitle + '\n'
                TOC += '作者:' + xaut + "\n"
                TOC += '源网址:' + req_url + '\n'
            else:
                TOC = "<h1 class='title' title='" + xtitle + "-" + xaut + "'><a href='" + req_url + "'>" + xtitle + "</a></h1>"
                TOC += "<h2 class='sigil_not_in_toc title'>作者:<a href='" + xauthref + "'>" + xaut + "</a></h2>"
                TOC += '''<blockquote>'''

            # 生成目录文字
            for l in self.href_list:
                title = ''
                titleOrigin = l.split('=')
                if titleOrigin[2] in loc:
                    title += "[锁]"

                i = self.href_list.index(l)
                #
                title += str(titleOrigin[2]).zfill(self.fillNum) + " "
                #
                title = title + self.titleindex[i].strip() + " "
                #
                title = title + self.Summary[i].strip()
                if self.state == 's':
                    title = OpenCC('t2s').convert(title)
                elif self.state == 't':
                    title = OpenCC('s2t').convert(title)
                self.index.append(title)

            for ix in infox:
                ix = ix.strip()
                ix = re.sub('\n', '', ix)
                ix = re.sub(' +', '', ix)
                if self.format.currentText() == "txt":
                    TOC += ix + "\n"
                else:
                    TOC += "<p>" + ix + "</p>"
            if self.format.currentText() == "txt":
                TOC += "文案:\n"
            else:
                TOC += "</blockquote>"
                TOC += "<hr/><p><b>文案:</b></p>"
            if self.special.isChecked():
                v = etree.tostring(intro[0], encoding="utf-8").decode()
                TOC += v
            else:
                for nx in intro:
                    v = re.sub(' +', ' ', str(nx)).strip()
                    v = re.sub('>', '&gt;', v)
                    v = re.sub('<', '&lt;', v)
                    if v != "" and self.format.currentText() == "txt":
                        TOC += v + "\n"
                    elif v:
                        TOC += "<p>" + v + "</p>"
                if "立意:" in TOC:
                    TOC = re.sub('<p>立意:', '<hr/><p><b>立意</b>:', TOC)
                else:
                    TOC += '<hr/>'
            if self.format.currentText() == "txt":
                for v in info:
                    TOC += re.sub("<.*?>", "", v) + '\n'
                if self.state == 's':
                    TOC = OpenCC('t2s').convert(TOC)
                elif self.state == 't':
                    TOC = OpenCC('s2t').convert(TOC)
                with open("info.txt", 'w', encoding='utf-8') as fo:
                    fo.write(TOC.strip() + '\n')
                    fo.write(lockinfo.strip() + '\n')
            else:
                for v in info:
                    v = re.sub("主角:", "<b>主角:</b>", v)
                    v = re.sub("配角:", "<b>配角:</b>", v)
                    v = re.sub("其它:", "<b>其它:</b>", v)
                    TOC += "<p>" + v + "</p>"
                if self.state == 's':
                    TOC = OpenCC('t2s').convert(TOC)
                elif self.state == 't':
                    TOC = OpenCC('s2t').convert(TOC)
                with open("info.xhtml", 'w', encoding='utf-8') as fo:
                    fo.write(
                        '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title></title><meta charset="utf-8"/>
<link href="sgc-nav.css" rel="stylesheet" type="text/css"/></head>
<body>''' + TOC + lockinfo + '''</body></html>''')
            tlist = []
            # 获取每一章内容

            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=threadnum) as executor:
                tlist = {
                    executor.submit(self.get_sin, i): i
                    for i in self.href_list
                }
                for future in concurrent.futures.as_completed(tlist):
                    if self.percent < section_ct:
                        self.progressBar.setValue(
                            int(100 * self.percent / section_ct))
                        self.progressBar.update()
                        self.pct.setText(
                            str(self.percent) + '/' + str(section_ct))
                        self.setWindowTitle("正在下载:" + xtitle + '-' + xaut +
                                            " (" + self.pct.text() + ")")
                        QApplication.processEvents()
                if self.percent < section_ct:
                    QMessageBox.warning(
                        self, '警告', '请检查cookie是否正确!\n章节:' + self.currentTitle,
                        QMessageBox.Yes)
                self.progressBar.setValue(int(100 * self.percent / section_ct))
                self.progressBar.update()
                self.textEdit.append('\n 下载完成,总进度:' + str(self.percent) + '/' +
                                     str(section_ct))
                self.textEdit.moveCursor(self.textEdit.textCursor().End)
                time.sleep(0.1)
                self.pct.setText(str(self.percent) + '/' + str(section_ct))
            '''
            for i in self.href_list:
                self.get_sin(i)
            '''
            if self.failInfo:
                self.failInfo.sort()
                vs = ""
                for ss in self.failInfo:
                    vs = vs + ss + "|"
                self.textEdit.append("\n未购买或加载失败章节:")
                self.textEdit.append(vs[:-1] + "\n")
                self.textEdit.moveCursor(self.textEdit.textCursor().End)

            if self.format.currentText() == "txt":
                # txt整合
                os.chdir(path)
                f = open(ti + ".txt", 'w', encoding='utf-8')
                filenames = os.listdir(ppp)
                i = 0
                for filename in filenames:
                    filepath = ppp + '\\' + filename
                    for line in open(filepath,
                                     encoding='utf-8',
                                     errors='ignore'):
                        f.writelines(line)
                f.close()
                shutil.rmtree(ppp)
                self.textEdit.append("\ntxt文件整合完成")
                self.textEdit.moveCursor(self.textEdit.textCursor().End)

            else:
                # 保存为epub
                os.chdir(path)
                epub_name = ti + ".epub"
                epub = zipfile.ZipFile(epub_name, 'w')
                if self.format.currentText() == "epub2":
                    epubfile = EPUB2.epubfile()
                    if self.hvol.isChecked():
                        epubfile.htmlvol = 1
                else:
                    epubfile = EPUB3.epubfile()
                epubfile.csstext = self.cssedit.toPlainText()
                epubfile.createEpub(epub, xaut, xtitle, ti, self.index,
                                    self.rollSign, path)
                self.textEdit.append("\nepub打包完成")
                self.textEdit.moveCursor(self.textEdit.textCursor().End)
            self.setWindowTitle("下载完成:" + xtitle + '-' + xaut + " (" +
                                self.pct.text() + ")")
Beispiel #3
0
    def get_sin(self, l):
        titleOrigin = l.split('=')
        i = self.href_list.index(l)

        #dot=etree.HTML(cont.content)
        fontfamily = ''
        cvlist = []
        cvdic = []
        cont = ''
        dot = ''
        codetext = ''
        badgateway = True
        while (badgateway):
            cont = requests.get(l, headers=self.headerss)
            dot = etree.HTML(
                cont.content.decode('gb18030',
                                    "ignore").encode("utf-8").decode('utf-8'))
            codetext = etree.tostring(dot, encoding="utf-8").decode()
            bdw = re.findall('<h1>502 Bad Gateway</h1>', codetext)
            if bdw == []:
                badgateway = False
            else:
                time.sleep(1)

        #字体反爬虫
        fontsrc = re.findall(
            r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
        if fontsrc != []:
            fontsrc = "http:" + fontsrc[0]
            fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '',
                              fontsrc)
            fontname = re.sub('.h=my.jjwxc.net', '', fontname)
            fontfamily = re.sub('.woff2', '', fontname)
            cvdic = []
            if not os.path.exists(self.path + "/Fonts/" + fontfamily + '.txt'):
                #解析json文件
                r = requests.get('http://jjwxc.yooooo.us/' + fontfamily +
                                 '.json')
                fonttxt = re.sub('{"status": 0, "data": ', '', r.text)
                fonttxt = re.sub('}}', '}', fonttxt)
                cdic = json.loads(fonttxt)
                fonttxt = ''
                f = open(self.path + "/Fonts/" + fontfamily + ".txt",
                         "w",
                         encoding='utf-8')
                for s, v in cdic.items():
                    fonttxt = fonttxt + '&#x' + s + ';-' + v + '\n'
                fonttxt.strip()
                f.write(fonttxt)
                f.close()
                '''
                #若需要下载ttf文件,可运行下方代码
                fontwb=requests.get(re.sub('woff2','ttf',fontsrc)).content
                fontf=open(self.path+"/Fonts/"+fontfamily+'.ttf','wb')
                fontf.write(fontwb)
                fontf.close()
                '''
            try:
                with open(self.path + "/Fonts/" + fontfamily + ".txt",
                          "r",
                          encoding='utf-8') as f:
                    cvlist = f.readlines()
                    for y in range(len(cvlist)):
                        cvdic.append(cvlist[y].split('-'))
                    cvdic = dict(cvdic)
            except:
                t = 1
            if cvlist != []:
                fontfamily += '_c'
            elif fontfamily not in self.fontlist:
                self.fontlist.append(fontfamily)
                self.fontcss += '''@font-face{font-family: "%s";
src:url("%s") format('woff2'),
url("../font/%s") format('woff2'),
url("../font/%s.ttf") format("truetype");}
.%s{font-family:"%s",serif;}
''' % (fontfamily, fontsrc, fontname, fontfamily, fontfamily, fontfamily)

        #tex:正文
        tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')

        #tex1:作话
        tex1 = dot.xpath("//div[@class='readsmall']/text()")
        #sign:作话位置
        sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

        title = ''
        #序号填充
        if self.titleInfo[0] == '1':
            title = str(titleOrigin[2]).zfill(self.fillNum)

        #章节名称
        if self.titleInfo[1] == '1':
            title = title + " " + self.titleindex[i].strip()

        #内容提要
        if self.titleInfo[2] == '1':
            title = title + " " + self.Summary[i].strip()

        title = title.strip()

        if self.state == 's':
            title = OpenCC('t2s').convert(title)
        elif self.state == 't':
            title = OpenCC('s2t').convert(title)
        if self.href_list[i] in self.rollSignPlace:
            v = self.rollSign[self.rollSignPlace.index(l)]
            if self.state == 's':
                v = OpenCC('t2s').convert(
                    self.rollSign[self.rollSignPlace.index(l)])
            elif self.state == 't':
                v = OpenCC('s2t').convert(
                    self.rollSign[self.rollSignPlace.index(l)])

            #创建章节文件
        fo = open("z" + str(titleOrigin[2].zfill(4)) + ".xhtml",
                  'w',
                  encoding='utf-8')

        fo.write('''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>''' + title + '''</title>
<meta charset="utf-8"/>
<link href="sgc-nav.css" rel="stylesheet" type="text/css"/>
</head><body class="''' + fontfamily + '''">''')
        #写入卷标
        if self.href_list[i] in self.rollSignPlace:
            fo.write("<h1>" + v.rstrip() + "</h1>")
            print("\r\n" + v + "\r\n")
            fo.write("<h2 id='v'>" + title + "</h2>")
        #写入标题
        else:
            fo.write('<h2>' + title + "</h2>")
        if len(tex) == 0:
            self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
            #print("第"+titleOrigin[2]+"章未购买或加载失败")
        else:
            #反爬虫处理,必须把对照表TXT文件下载至Fonts文件夹
            if cvdic != []:
                for y in range(len(tex)):
                    for s, v in cvdic.items():
                        if not s == '&#x78"/;':
                            s = re.sub(r'&#x', r'\\u', s)
                            s = re.sub(
                                ';', '',
                                s).encode('utf-8').decode('unicode_escape')
                            tex[y] = re.sub(s, v.strip(), tex[y])
            cvdic = cvlist = []
            #作话在文前的情况
            if str(sign) == "['readsmall']":
                fo.write('''<blockquote>''')
                for m in tex1:  #删除无用文字及多余空格空行
                    vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                    v = re.sub(' ', '', vv)
                    v = re.sub(' +', ' ', v).strip()
                    v = re.sub('&', '&amp;', v)
                    v = re.sub('>', '&gt;', v)
                    v = re.sub('<', '&lt;', v)
                    if self.state == 's':
                        v = OpenCC('t2s').convert(v)
                    elif self.state == 't':
                        v = OpenCC('s2t').convert(v)
                    v = re.sub('作者有话要说:', '<b>作者有话要说</b>:</p><p>', v)
                    if v != "":  #按行写入正文
                        fo.write("<p>" + v + "</p>")
                fo.write("</blockquote>")
                if len(tex1) != 0:
                    fo.write("<hr/>")
                for tn in tex:
                    vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                    v = re.sub(' ', '', vv)
                    v = re.sub(' +', ' ', v).strip()
                    v = re.sub('&', '&amp;', v)
                    v = re.sub('>', '&gt;', v)
                    v = re.sub('<', '&lt;', v)
                    if self.state == 's':
                        v = OpenCC('t2s').convert(v)
                    elif self.state == 't':
                        v = OpenCC('s2t').convert(v)
                    if v != "":
                        fo.write("<p>" + v + "</p>")
            else:  #作话在文后的情况
                for tn in tex:
                    vv = re.sub('@无限好文,尽在晋江文学城', '', str(tn))
                    v = re.sub(' ', '', vv)
                    v = re.sub(' +', ' ', v).strip()
                    v = re.sub('&', '&amp;', v)
                    v = re.sub('>', '&gt;', v)
                    v = re.sub('<', '&lt;', v)
                    if self.state == 's':
                        v = OpenCC('t2s').convert(v)
                    elif self.state == 't':
                        v = OpenCC('s2t').convert(v)
                    if v != "":
                        fo.write("<p>" + v + "</p>")
                if len(tex1) != 0:
                    fo.write("<hr/>")
                    fo.write('''<blockquote>''')
                for m in tex1:
                    vv = re.sub('@无限好文,尽在晋江文学城', '', str(m))
                    v = re.sub(' ', '', vv)
                    v = re.sub(' +', ' ', v).strip()
                    v = re.sub('&', '&amp;', v)
                    v = re.sub('>', '&gt;', v)
                    v = re.sub('<', '&lt;', v)
                    if self.state == 's':
                        v = OpenCC('t2s').convert(v)
                    elif self.state == 't':
                        v = OpenCC('s2t').convert(v)
                    v = re.sub('作者有话要说:', '<b>作者有话要说</b>:</p><p>', v)
                    if v != "":
                        fo.write("<p>" + v + "</p>")
                if len(tex1) != 0:
                    fo.write("</blockquote>")
        fo.write("</body></html>")
        fo.close()
        self.percent += 1
Beispiel #4
0
    def get_sin(self, l):
        """Download one chapter and save it as a plain-text ``z<NNNN>.txt`` file.

        The page at ``l`` is fetched and decoded as GB18030. If the page
        references an anti-scraping web font, the font file is downloaded into
        ``<self.path>/Fonts`` (unless already there) and, when a matching
        ``<font>.txt`` substitution table exists in that folder, the table is
        used to restore the obfuscated characters of the chapter body. The
        title, optional volume label, body and author's note are then written
        to a text file in the current working directory.

        Args:
            l: Chapter URL. It is looked up in ``self.href_list`` and split on
                ``=``; the third ``=``-separated field is used as the chapter
                number.

        Side effects:
            Writes ``z<chapter number zero-padded to 4>.txt``; may write the
            font file; may append to ``self.fontlist`` and ``self.failInfo``;
            prints the volume label when there is one; increments
            ``self.percent``.

        Raises:
            IndexError: if ``l`` has fewer than three ``=``-separated fields.
            Whatever ``self.href_list.index``, ``requests.get`` or the file
            operations raise is propagated unchanged.
        """
        titleOrigin = l.split('=')
        i = self.href_list.index(l)
        cont = requests.get(l, headers=self.headerss).content
        dot = etree.HTML(
            cont.decode("GB18030", "ignore").encode("utf-8",
                                                    "ignore").decode('utf-8'))

        # (obfuscated character, replacement text) pairs for the anti-scraping
        # font; stays empty when the page uses no font or no table is usable.
        substitutions = []

        # Anti-scraping font handling
        codetext = etree.tostring(dot, encoding="utf-8").decode()
        fontsrc = re.findall(
            r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
        if fontsrc != []:
            fontsrc = "http:" + fontsrc[0]
            fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '',
                              fontsrc)
            fontname = re.sub('.h=my.jjwxc.net', '', fontname)
            fontfamily = re.sub('.woff2', '', fontname)
            font_dir = self.path + "/Fonts/"

            # The table is optional: a missing or unreadable file simply means
            # the text is written without character restoration.
            table_lines = []
            try:
                with open(font_dir + fontfamily + ".txt",
                          "r",
                          encoding='utf-8') as f:
                    table_lines = f.readlines()
            except (OSError, ValueError):
                table_lines = []

            # Each line looks like "&#xHHHH;-<text>". Split on the first '-'
            # only so a replacement containing '-' stays intact; keying by the
            # raw entity keeps "last duplicate wins" semantics.
            table = {}
            for entry in table_lines:
                key, sep, value = entry.partition('-')
                if sep:
                    table[key] = value

            # Decode every entity once, up front, instead of once per text
            # line. Keys that do not decode to a character are skipped rather
            # than aborting the whole chapter.
            for key, value in table.items():
                escaped = re.sub(r';', '', re.sub(r'&#x', r'\\u', key))
                try:
                    glyph = escaped.encode('utf-8').decode('unicode_escape')
                except UnicodeDecodeError:
                    continue
                if glyph:
                    substitutions.append((glyph, value.strip()))

            if not os.path.exists(font_dir + fontname):
                fontwb = requests.get(fontsrc).content
                with open(font_dir + fontname, 'wb') as fontf:
                    fontf.write(fontwb)

            # Fonts without a table on disk are recorded in self.fontlist.
            if not table_lines and fontfamily not in self.fontlist:
                self.fontlist.append(fontfamily)

        # tex: chapter body
        tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')

        # tex1: author's note
        tex1 = dot.xpath("//div[@class='readsmall']/text()")
        # sign: tells whether the author's note precedes the body
        sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

        # Build the converter once per chapter; constructing OpenCC for every
        # single line is needlessly expensive.
        if self.state == 's':
            convert = OpenCC('t2s').convert
        elif self.state == 't':
            convert = OpenCC('s2t').convert
        else:
            convert = None

        title = ''
        # Zero-padded chapter number
        if self.titleInfo[0] == '1':
            title = str(titleOrigin[2]).zfill(self.fillNum) + "#"

        # Chapter name
        if self.titleInfo[1] == '1':
            title = title + " " + self.titleindex[i].strip()

        # Chapter summary
        if self.titleInfo[2] == '1':
            title = title + " " + self.Summary[i].strip()

        if convert is not None:
            title = convert(title)

        has_volume = self.href_list[i] in self.rollSignPlace
        volume = ''
        if has_volume:
            volume = self.rollSign[self.rollSignPlace.index(l)]
            if convert is not None:
                volume = convert(volume)

        def clean(raw):
            # Drop the site watermark and whitespace, then convert the script.
            text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
            text = re.sub(' +', ' ', text).strip()
            # NOTE(review): this removes every remaining space, which makes
            # the collapse above redundant; possibly a full-width space was
            # intended here - confirm against the upstream project.
            text = re.sub(' ', '', text)
            if convert is not None:
                text = convert(text)
            return text

        def write_body(out):
            # One cleaned body paragraph per line; empty ones are skipped.
            for paragraph in tex:
                line = clean(paragraph)
                if line != "":
                    out.write(line + "\n")

        def write_note(out):
            # Same as the body, plus a line break after the note heading.
            for paragraph in tex1:
                line = clean(paragraph)
                line = re.sub('作者有话要说:', '作者有话要说:\n', line)
                if line != "":
                    out.write(line + "\n")

        # Create the chapter file; `with` closes it even if writing fails.
        with open("z" + str(titleOrigin[2].zfill(4)) + ".txt",
                  'w',
                  encoding='utf-8') as fo:
            if has_volume:
                # Volume label followed by the chapter title
                fo.write("\r\n\r\n" + volume.rstrip() + '\r\n')
                print("\r\n" + volume + "\r\n")
                fo.write(title + '\r\n')
            else:
                # Chapter title only
                fo.write("\r\n\r\n" + title + "\r\n")

            if len(tex) == 0:
                # No body text was found on the page: record the failure.
                self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
                fo.write('下载失败!')
            else:
                # Restore obfuscated characters. Literal replacement is used
                # on purpose: the table entries are plain characters, not
                # regular expressions or replacement templates.
                if substitutions:
                    restored = []
                    for paragraph in tex:
                        text = str(paragraph)
                        for glyph, real in substitutions:
                            text = text.replace(glyph, real)
                        restored.append(text)
                    tex = restored

                if str(sign) == "['readsmall']":
                    # Author's note comes before the body
                    write_note(fo)
                    if len(tex1) != 0:
                        fo.write("\n*\r\n")
                    write_body(fo)
                else:
                    # Author's note comes after the body
                    write_body(fo)
                    if len(tex1) != 0:
                        fo.write("\n*\r\n")
                    write_note(fo)
        self.percent += 1