def get_sin(self, l):
    """Download one chapter and save it as ``z<chapter id>.txt``.

    ``l`` is a chapter URL from ``self.href_list``; the text after its
    second ``=`` is used as the chapter id.  The page is requested again
    for as long as it contains the "502 Bad Gateway" heading.  When the
    page references an obfuscation font, the replacement table is read from
    ``<self.path>/Fonts/<font>.txt`` (created from the jjwxc.yooooo.us JSON
    first when missing) and applied to the chapter text.  A chapter with no
    body text is recorded in ``self.failInfo``.  ``self.percent`` is
    incremented after the file has been written.
    """
    converters = {}

    def convert(text):
        # Simplified/traditional conversion selected by self.state; the
        # converter is built once per call instead of once per line.
        mode = {'s': 't2s', 't': 's2t'}.get(self.state)
        if mode is None:
            return text
        if mode not in converters:
            converters[mode] = OpenCC(mode)
        return converters[mode].convert(text)

    def clean(raw, is_note=False):
        # Drop the site watermark and spaces, then convert the script.
        # NOTE(review): the substitutions of '&', '<' and '>' by themselves
        # that used to follow were no-ops and have been removed; if entity
        # decoding was intended, add html.unescape() here.
        text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
        text = re.sub(' ', '', text)
        text = re.sub(' +', ' ', text).strip()
        text = convert(text)
        if is_note:
            text = re.sub('作者有话要说:', '作者有话要说:\n', text)
        return text

    titleOrigin = l.split('=')
    i = self.href_list.index(l)

    # Keep requesting while the server answers with its 502 page.
    while True:
        cont = requests.get(l, headers=self.headerss)
        dot = etree.HTML(
            cont.content.decode('gb18030',
                                "ignore").encode("utf-8").decode('utf-8'))
        codetext = etree.tostring(dot, encoding="utf-8").decode()
        if not re.findall('<h1>502 Bad Gateway</h1>', codetext):
            break
        time.sleep(1)

    # Font-based anti-scraping: locate the custom font and its table.
    fontfamily = ''
    cvlist = []
    cvdic = {}
    fontsrc = re.findall(
        r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
    if fontsrc:
        fontsrc = "http:" + fontsrc[0]
        fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '', fontsrc)
        fontname = re.sub('.h=my.jjwxc.net', '', fontname)
        fontfamily = re.sub('.woff2', '', fontname)
        table_path = self.path + "/Fonts/" + fontfamily + ".txt"
        if not os.path.exists(table_path):
            # Build the table file from the JSON glyph map.
            r = requests.get('http://jjwxc.yooooo.us/' + fontfamily + '.json')
            fonttxt = re.sub('{"status": 0, "data": ', '', r.text)
            fonttxt = re.sub('}}', '}', fonttxt)
            cdic = json.loads(fonttxt)
            os.makedirs(self.path + "/Fonts", exist_ok=True)
            with open(table_path, "w", encoding='utf-8') as f:
                f.write(''.join('&#x' + s + ';-' + v + '\n'
                                for s, v in cdic.items()))
        try:
            with open(table_path, "r", encoding='utf-8') as f:
                cvlist = f.readlines()
        except (OSError, ValueError):
            # No readable table: the text is written without substitution.
            cvlist = []
        for line in cvlist:
            # Split on the first '-' only so that a replacement that is
            # itself '-' does not invalidate the whole table.
            key, sep, value = line.partition('-')
            if sep:
                cvdic[key] = value
        if cvlist:
            fontfamily += '_c'
        elif fontfamily not in self.fontlist:
            self.fontlist.append(fontfamily)

    # tex: chapter body
    tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
    # tex1: author's note
    tex1 = dot.xpath("//div[@class='readsmall']/text()")
    # sign: tells whether the author's note precedes the body
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

    title = ''
    # zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum) + "#"
    # chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    title = convert(title)

    volume = None
    if l in self.rollSignPlace:
        volume = convert(self.rollSign[self.rollSignPlace.index(l)])

    # Create the chapter file.
    with open("z" + str(titleOrigin[2].zfill(4)) + ".txt", 'w',
              encoding='utf-8') as fo:
        if volume is not None:
            # volume label followed by the chapter title
            fo.write("\r\n\r\n" + volume.rstrip() + '\r\n')
            print("\r\n" + volume + "\r\n")
            fo.write(title + '\r\n')
        else:
            fo.write("\r\n\r\n" + title + "\r\n")

        if len(tex) == 0:
            self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
            fo.write('下载失败!')
        else:
            # Decode the table keys once, then substitute literally: the
            # obfuscated characters are plain text, not regex patterns.
            replacements = []
            for s, v in cvdic.items():
                if s == 'x"/;':
                    continue
                s = re.sub(r'&#x', r'\\u', s)
                s = re.sub(';', '',
                           s).encode('utf-8').decode('unicode_escape')
                if s:
                    replacements.append((s, v.strip()))
            for y in range(len(tex)):
                for s, v in replacements:
                    tex[y] = tex[y].replace(s, v)

            body = [clean(tn) for tn in tex]
            notes = [clean(m, is_note=True) for m in tex1]
            if str(sign) == "['readsmall']":
                # author's note first
                first, second = notes, body
            else:
                # author's note last
                first, second = body, notes
            for text in first:
                if text != "":
                    fo.write(text + "\n")
            if len(tex1) != 0:
                fo.write("\n*\r\n")
            for text in second:
                if text != "":
                    fo.write(text + "\n")
    self.percent += 1
def get_txt(self, txt_id, threadnum):
    """Download a whole novel and pack it as a txt or epub file.

    ``txt_id`` is the novel URL; the text after its first ``=`` is used as
    the novel id.  The method resets the per-download attributes, reads the
    novel information and chapter list from the app API, creates a working
    directory named after the title, author and id, writes the volume,
    cover and information pages there, runs ``self.get_sin`` for every
    chapter URL on a thread pool of ``threadnum`` workers while updating
    the progress widgets, and finally either merges the working directory
    into one ``.txt`` file or hands it to the EPUB2/EPUB3 packer.
    """

    def convert(text):
        # Simplified/traditional conversion selected by self.state.
        # NOTE(review): self.state is reset to '' below, so inside this
        # method the conversion only happens if something else changes it.
        if self.state == 's':
            return OpenCC('t2s').convert(text)
        if self.state == 't':
            return OpenCC('s2t').convert(text)
        return text

    xhtml_open = (
        '<?xml version="1.0" encoding="UTF-8" standalone="no" ?> '
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
        '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> '
        '<html xmlns="http://www.w3.org/1999/xhtml"> ')

    ids = str(txt_id)
    self.percent = 0
    self.index = []
    self.titleindex = []
    self.Summary = []
    self.fillNum = 0
    self.rollSign = []
    self.rollSignPlace = []
    self.state = ''
    self.href_list = []
    self.td = []
    self.failInfo = []
    self.path = ''
    self.textEdit.clear()
    is_txt = self.format.currentText() == "txt"

    # Build the request URLs.
    req_url = ids
    nid = ids.split('=')[1]
    apireq = 'https://app.jjwxc.net/androidapi/novelbasicinfo?novelId=' + nid
    apivol = ('https://app.jjwxc.net/androidapi/chapterList?novelId=' + nid
              + '&more=0&whole=1')

    # Fetch the novel page and the novel information.
    res = requests.get(req_url, headers=self.headerss)
    apires = requests.get(apireq, headers=self.headerss)
    apicont = json.loads(apires.text)
    # Stays empty when the API reports an error, so the chapter loop below
    # no longer fails on an unbound name.
    cdic = []
    if "message" in apicont and "novelIntro" not in apicont:
        v = apicont["message"]
        QMessageBox.warning(self, '警告', apicont["message"], QMessageBox.Yes)
        # Placeholder record so the field lookups below do not fail.
        blanks = dict.fromkeys(
            ("novelId", "novelName", "authorId", "authorName", "novelClass",
             "novelTags", "novelTagsId", "novelCover", "originalCover",
             "novelStep", "novelIntro", "novelIntroShort", "isVip",
             "isPackage", "novelSize", "novelsizeformat",
             "novelChapterCount", "renewDate", "renewChapterId",
             "renewChapterName", "novelScore", "islock",
             "novelbefavoritedcount", "novelbefavoritedcountformat",
             "type_id", "age", "maxChapterId", "chapterdateNewest", "local",
             "localImg", "novelStyle", "series", "protagonist", "costar",
             "other", "comment_count", "nutrition_novel", "ranking",
             "novip_clicks", "vipChapterid", "isSign", "ILTC", "mainview",
             "codeUrl", "novelReviewScore", "authorsayrule", "copystatus"),
            "")
        apicont = {"message": v, **blanks, "yellowcard": []}
    else:
        # Fetch the chapter list.
        rc = requests.get(apivol, headers=self.headerss)
        cdic = json.loads(rc.text)["chapterlist"]

    # Decode the novel page.
    ress = etree.HTML(
        res.content.decode("GB18030", "ignore").encode("utf-8",
                                                       "ignore").decode('utf-8'))
    res.close()

    # Description: the page element when "special" is checked, otherwise
    # the API text split into lines.
    if self.special.isChecked():
        intro = ress.xpath(
            "//html/body/table/tr/td[1]/div[2]/div[@id='novelintro']")
    else:
        intro = apicont["novelIntro"]
        intro = re.sub("<br/>", "\n", intro).splitlines()

    # Tags and other information lines.
    info = [
        "<b>标签:</b>" + apicont["novelTags"],
        apicont["protagonist"],
        apicont["costar"],
        apicont["other"],
        "<b>简介:</b>" + apicont["novelIntroShort"],
    ]
    infox = [
        "文章类型:" + apicont["novelClass"],
        "作品视角:" + apicont["mainview"],
        "作品风格:" + apicont["novelStyle"],
        "所属系列:" + apicont["series"],
    ]
    # NOTE(review): the progress label is derived from "novelClass", which
    # is also printed above as the genre; "novelStep" looks like the
    # intended field - confirm against the API before changing.
    if apicont["novelClass"] == "1":
        infox.append("文章进度:连载")
    elif apicont["novelClass"] == "2":
        infox.append("文章进度:完结")
    elif apicont["novelClass"] == "0":
        infox.append("文章进度:暂停")
    infox.append("全文字数:" + apicont["novelSize"] + "字")

    # Cover image ("0" means no usable cover).
    cover = apicont["novelCover"]
    if cover != '':
        try:
            pres = requests.get(cover)
        except Exception:
            img = "0"
            self.textEdit.append("【封面下载失败!请检查网络或尝试科学上网。】\n")
            self.textEdit.moveCursor(self.textEdit.textCursor().End)
        else:
            img = pres.content
    else:
        img = "0"
    if re.findall(r'static.jjwxc.net/novelimage.php.novelid', cover):
        img = '0'

    # Title and author.
    xtitle = apicont["novelName"]
    xaut = apicont["authorName"]
    ti = convert(xtitle + '-' + xaut)
    self.textEdit.append("网址:" + ids + "\n小说信息:" + str(ti) + "\n")
    self.textEdit.moveCursor(self.textEdit.textCursor().End)
    self.setWindowTitle("正在下载:" + xtitle + '-' + xaut)

    # Collect chapter URLs, titles, summaries and volume labels.
    # NOTE(review): substitutions of '&', '<' and '>' by themselves that
    # used to run on these strings were no-ops and have been removed.
    tag_pattern = re.compile(r'</?\w+[^>]*>')
    loc = []
    vcount = 0
    for entry in cdic:
        if entry["chaptertype"] == "1":
            vcount += 1
            v = entry["chaptername"]
            if is_txt:
                v = tag_pattern.sub('', v).strip()
            v = "§ " + v + " §"
            if self.selfvol.isChecked():
                # Literal replacement: a chapter name containing
                # backslashes must not be read as a regex template.
                v = self.voledit.text().replace('$1', str(vcount))
                v = v.replace('$2', entry["chaptername"])
            self.rollSign.append(v)
            self.rollSignPlace.append(entry["chapterid"])
        else:
            u = ("https://app.jjwxc.net/androidapi/chapterContent?novelId="
                 + nid + "&chapterId=" + entry["chapterid"])
            self.href_list.append(u)
            v = entry["chaptername"]
            v = re.sub('&&#', '&#', v)
            v = tag_pattern.sub('', v)
            self.titleindex.append(v.strip())
            v = entry["chapterintro"]
            v = re.sub('&&#', '&#', v)
            if is_txt:
                v = tag_pattern.sub('', v)
            self.Summary.append(v.strip())
            if entry["islock"] != "0":
                loc.append(entry["chapterid"])

    section_ct = len(self.href_list)
    lockinfo = ''
    self.textEdit.append("可下载章节数:" + str(section_ct) + "\n")
    self.textEdit.moveCursor(self.textEdit.textCursor().End)
    if loc:
        locked = "".join(x + " " for x in loc)
        self.textEdit.append("被锁章节:" + locked + "\n")
        self.textEdit.moveCursor(self.textEdit.textCursor().End)
        if is_txt:
            lockinfo = "被锁章节:" + locked + "\n"
        else:
            lockinfo = "<p><em>被锁章节:" + locked + "</em></p>"
        lockinfo = convert(lockinfo)

    # fillNum: width of the zero-padded chapter number, e.g. four digits
    # (0001, 0002, ...) for a novel with 1437 chapters.
    self.fillNum = len(str(len(self.href_list)))

    # Make the title usable as a file name.
    ti = re.sub(r'[\/:*?"<>|]', '_', ti)
    xauthref = ("http://www.jjwxc.net/oneauthor.php?authorid="
                + apicont["authorId"])
    # Delete the next line to leave the novel id out of the file name.
    ti = ti + '.' + ids.split('=')[1]
    ti = ti.replace('\r', '')

    # Enter the working directory for this novel.
    path = os.getcwd()
    self.path = path
    os.makedirs(ti, exist_ok=True)
    os.chdir(ti)
    ppp = os.getcwd()

    # One page per volume label.
    for vol in range(len(self.rollSignPlace)):
        self.rollSignPlace[vol] = self.rollSignPlace[vol].strip()
        volt = self.rollSignPlace[vol]
        # The traditional-Chinese result used to be stored in an unused
        # variable, so it never reached the page.
        ros = convert(self.rollSign[vol])
        nm = 'z' + str(int(volt) - 1).zfill(4) + '_vol.xhtml'
        with open(nm, 'w', encoding='utf-8') as f:
            f.write(xhtml_open + '<head><title>' + ros
                    + '</title><meta charset="utf-8"/> '
                    '<link href="sgc-nav.css" rel="stylesheet" '
                    'type="text/css"/></head> <body><h1>' + ros
                    + '</h1></body></html>')
    self.index = []

    # Save the cover image and its page.
    if img != "0" and self.cover.isChecked() and not is_txt:
        with open("p.jpg", 'wb') as pic:
            pic.write(img)
        with open("C.xhtml", 'w', encoding='utf-8') as f:
            f.write(
                xhtml_open + '<head><title>Cover</title></head> '
                '<body><div style="text-align: center; padding: 0pt; '
                'margin: 0pt;"> <svg xmlns="http://www.w3.org/2000/svg" '
                'height="100%" preserveAspectRatio="xMidYMid meet" '
                'version="1.1" width="100%" '
                'xmlns:xlink="http://www.w3.org/1999/xlink"> '
                '<image width="100%" xlink:href="p.jpg"/></svg></div>'
                '</body></html>')

    # Information page.
    if is_txt:
        TOC = xtitle + '\n'
        TOC += '作者:' + xaut + "\n"
        TOC += '源网址:' + req_url + '\n'
    else:
        TOC = ("<h1 class='title' title='" + xtitle + "-" + xaut
               + "'><a href='" + req_url + "'>" + xtitle + "</a></h1>")
        TOC += ("<h2 class='sigil_not_in_toc title'>作者:<a href='"
                + xauthref + "'>" + xaut + "</a></h2>")
        TOC += '''<blockquote>'''

    # Table-of-contents entries: only the lock marker is emitted.
    locked_ids = set(loc)
    for l in self.href_list:
        title = "[锁]" if l.split('=')[2] in locked_ids else ''
        self.index.append(convert(title))

    for ix in infox:
        ix = ix.strip()
        ix = re.sub('\n', '', ix)
        ix = re.sub(' +', '', ix)
        if is_txt:
            TOC += ix + "\n"
        else:
            TOC += "<p>" + ix + "</p>"
    if is_txt:
        TOC += "文案:\n"
    else:
        TOC += "</blockquote>"
        TOC += "<hr/><p><b>文案:</b></p>"
    if self.special.isChecked():
        # The page may lack the description element (e.g. after an API
        # error), so guard the lookup.
        if intro:
            TOC += etree.tostring(intro[0], encoding="utf-8").decode()
    else:
        for nx in intro:
            v = re.sub(' +', ' ', str(nx)).strip()
            if v != "" and is_txt:
                TOC += v + "\n"
            elif v:
                TOC += "<p>" + v + "</p>"
    if "立意:" in TOC:
        TOC = re.sub('<p>立意:', '<hr/><p><b>立意</b>:', TOC)
    else:
        TOC += '<hr/>'
    if is_txt:
        for v in info:
            TOC += re.sub("<.*?>", "", v) + '\n'
        TOC = convert(TOC)
        with open("info.txt", 'w', encoding='utf-8') as fo:
            fo.write(TOC.strip() + '\n')
            fo.write(lockinfo.strip() + '\n')
    else:
        for v in info:
            v = re.sub("主角:", "<b>主角:</b>", v)
            v = re.sub("配角:", "<b>配角:</b>", v)
            v = re.sub("其它:", "<b>其它:</b>", v)
            TOC += "<p>" + v + "</p>"
        TOC = convert(TOC)
        with open("info.xhtml", 'w', encoding='utf-8') as fo:
            fo.write(xhtml_open + '<head><title></title>'
                     '<meta charset="utf-8"/> '
                     '<link href="sgc-nav.css" rel="stylesheet" '
                     'type="text/css"/></head> <body>' + TOC + lockinfo
                     + '</body></html>')

    # Download every chapter on the thread pool.
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=threadnum) as executor:
        tlist = {
            executor.submit(self.get_sin, url): url
            for url in self.href_list
        }
        for _ in concurrent.futures.as_completed(tlist):
            if self.percent < section_ct:
                self.progressBar.setValue(
                    int(100 * self.percent / section_ct))
                self.progressBar.update()
                self.pct.setText(str(self.percent) + '/' + str(section_ct))
                self.setWindowTitle("正在下载:" + xtitle + '-' + xaut + " ("
                                    + self.pct.text() + ")")
                QApplication.processEvents()
    if self.percent < section_ct:
        # currentTitle is not assigned in this method, so fall back to an
        # empty string if it is missing.
        QMessageBox.warning(
            self, '警告',
            '请检查cookie是否正确!\n章节:' + getattr(self, 'currentTitle', ''),
            QMessageBox.Yes)
    if section_ct:
        # Guarded: with no chapters the ratio would divide by zero.
        self.progressBar.setValue(int(100 * self.percent / section_ct))
    self.progressBar.update()
    self.textEdit.append('\n 下载完成,总进度:' + str(self.percent) + '/'
                         + str(section_ct))
    self.textEdit.moveCursor(self.textEdit.textCursor().End)
    time.sleep(0.1)
    self.pct.setText(str(self.percent) + '/' + str(section_ct))

    if self.failInfo:
        self.failInfo.sort()
        self.textEdit.append("\n未购买或加载失败章节:")
        self.textEdit.append("|".join(self.failInfo) + "\n")
        self.textEdit.moveCursor(self.textEdit.textCursor().End)

    os.chdir(path)
    if is_txt:
        # Merge the working directory into one txt file.  os.listdir gives
        # no ordering guarantee, so sort to keep chapters in sequence.
        with open(ti + ".txt", 'w', encoding='utf-8') as merged:
            for filename in sorted(os.listdir(ppp)):
                filepath = os.path.join(ppp, filename)
                with open(filepath, encoding='utf-8',
                          errors='ignore') as part:
                    merged.writelines(part)
        shutil.rmtree(ppp)
        self.textEdit.append("\ntxt文件整合完成")
        self.textEdit.moveCursor(self.textEdit.textCursor().End)
    else:
        # Pack as epub.
        epub_name = ti + ".epub"
        epub = zipfile.ZipFile(epub_name, 'w')
        if self.format.currentText() == "epub2":
            epubfile = EPUB2.epubfile()
            if self.hvol.isChecked():
                epubfile.htmlvol = 1
        else:
            epubfile = EPUB3.epubfile()
        epubfile.csstext = self.cssedit.toPlainText()
        epubfile.createEpub(epub, xaut, xtitle, ti, self.index,
                            self.rollSign, path)
        self.textEdit.append("\nepub打包完成")
        self.textEdit.moveCursor(self.textEdit.textCursor().End)
    self.setWindowTitle("下载完成:" + xtitle + '-' + xaut + " ("
                        + self.pct.text() + ")")
def get_sin(self, l):
    """Download one chapter and save it as ``z<chapter id>.xhtml``.

    ``l`` is a chapter URL from ``self.href_list``; the text after its
    second ``=`` is used as the chapter id.  The page is requested again
    for as long as it contains the "502 Bad Gateway" heading.  When the
    page references an obfuscation font, the replacement table is read from
    ``<self.path>/Fonts/<font>.txt`` (created from the jjwxc.yooooo.us JSON
    first when missing) and applied to the chapter text; when no table is
    available the font is registered in ``self.fontlist`` and an
    ``@font-face`` rule is appended to ``self.fontcss``.  A chapter with no
    body text is recorded in ``self.failInfo``.  ``self.percent`` is
    incremented after the file has been written.
    """
    import html

    converters = {}

    def convert(text):
        # Simplified/traditional conversion selected by self.state; the
        # converter is built once per call instead of once per line.
        mode = {'s': 't2s', 't': 's2t'}.get(self.state)
        if mode is None:
            return text
        if mode not in converters:
            converters[mode] = OpenCC(mode)
        return converters[mode].convert(text)

    def clean(raw, is_note=False):
        # Drop the site watermark and spaces, escape the text for XHTML,
        # then convert the script.  The escaping replaces three
        # substitutions that mapped '&', '>' and '<' to themselves and so
        # let raw markup characters through into the page.
        text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
        text = re.sub(' ', '', text)
        text = re.sub(' +', ' ', text).strip()
        text = html.escape(text, quote=False)
        text = convert(text)
        if is_note:
            text = re.sub('作者有话要说:', '<b>作者有话要说</b>:</p><p>', text)
        return text

    titleOrigin = l.split('=')
    i = self.href_list.index(l)

    # Keep requesting while the server answers with its 502 page.
    while True:
        cont = requests.get(l, headers=self.headerss)
        dot = etree.HTML(
            cont.content.decode('gb18030',
                                "ignore").encode("utf-8").decode('utf-8'))
        codetext = etree.tostring(dot, encoding="utf-8").decode()
        if not re.findall('<h1>502 Bad Gateway</h1>', codetext):
            break
        time.sleep(1)

    # Font-based anti-scraping: locate the custom font and its table.
    fontfamily = ''
    cvlist = []
    cvdic = {}
    fontsrc = re.findall(
        r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
    if fontsrc:
        fontsrc = "http:" + fontsrc[0]
        fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '', fontsrc)
        fontname = re.sub('.h=my.jjwxc.net', '', fontname)
        fontfamily = re.sub('.woff2', '', fontname)
        table_path = self.path + "/Fonts/" + fontfamily + ".txt"
        if not os.path.exists(table_path):
            # Build the table file from the JSON glyph map.
            r = requests.get('http://jjwxc.yooooo.us/' + fontfamily + '.json')
            fonttxt = re.sub('{"status": 0, "data": ', '', r.text)
            fonttxt = re.sub('}}', '}', fonttxt)
            cdic = json.loads(fonttxt)
            os.makedirs(self.path + "/Fonts", exist_ok=True)
            with open(table_path, "w", encoding='utf-8') as f:
                f.write(''.join('&#x' + s + ';-' + v + '\n'
                                for s, v in cdic.items()))
        try:
            with open(table_path, "r", encoding='utf-8') as f:
                cvlist = f.readlines()
        except (OSError, ValueError):
            # No readable table: fall back to embedding the font.
            cvlist = []
        for line in cvlist:
            # Split on the first '-' only so that a replacement that is
            # itself '-' does not invalidate the whole table.
            key, sep, value = line.partition('-')
            if sep:
                cvdic[key] = value
        if cvlist:
            fontfamily += '_c'
        elif fontfamily not in self.fontlist:
            self.fontlist.append(fontfamily)
            self.fontcss += (
                '''@font-face{font-family: "%s"; src:url("%s") format('woff2'), '''
                '''url("../font/%s") format('woff2'), '''
                '''url("../font/%s.ttf") format("truetype");} '''
                '''.%s{font-family:"%s",serif;} '''
            ) % (fontfamily, fontsrc, fontname, fontfamily, fontfamily,
                 fontfamily)

    # tex: chapter body
    tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
    # tex1: author's note
    tex1 = dot.xpath("//div[@class='readsmall']/text()")
    # sign: tells whether the author's note precedes the body
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

    title = ''
    # zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum)
    # chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    title = convert(title.strip())

    volume = None
    if l in self.rollSignPlace:
        volume = convert(self.rollSign[self.rollSignPlace.index(l)])

    # Create the chapter file.
    with open("z" + str(titleOrigin[2].zfill(4)) + ".xhtml", 'w',
              encoding='utf-8') as fo:
        fo.write(
            '<?xml version="1.0" encoding="UTF-8" standalone="no" ?> '
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
            '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> '
            '<html xmlns="http://www.w3.org/1999/xhtml"> '
            '<head> <title>' + title + '</title> '
            '<meta charset="utf-8"/> '
            '<link href="sgc-nav.css" rel="stylesheet" type="text/css"/> '
            '</head><body class="' + fontfamily + '">')
        if volume is not None:
            # volume label followed by the chapter title
            fo.write("<h1>" + volume.rstrip() + "</h1>")
            print("\r\n" + volume + "\r\n")
            fo.write("<h2 id='v'>" + title + "</h2>")
        else:
            fo.write('<h2>' + title + "</h2>")

        if len(tex) == 0:
            self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
        else:
            # Decode the table keys once, then substitute literally: the
            # obfuscated characters are plain text, not regex patterns.
            replacements = []
            for s, v in cvdic.items():
                if s == 'x"/;':
                    continue
                s = re.sub(r'&#x', r'\\u', s)
                s = re.sub(';', '',
                           s).encode('utf-8').decode('unicode_escape')
                if s:
                    replacements.append((s, v.strip()))
            for y in range(len(tex)):
                for s, v in replacements:
                    tex[y] = tex[y].replace(s, v)

            body = [clean(tn) for tn in tex]
            notes = [clean(m, is_note=True) for m in tex1]
            if str(sign) == "['readsmall']":
                # author's note first
                fo.write('''<blockquote>''')
                for text in notes:
                    if text != "":
                        fo.write("<p>" + text + "</p>")
                fo.write("</blockquote>")
                if len(tex1) != 0:
                    fo.write("<hr/>")
                for text in body:
                    if text != "":
                        fo.write("<p>" + text + "</p>")
            else:
                # author's note last
                for text in body:
                    if text != "":
                        fo.write("<p>" + text + "</p>")
                if len(tex1) != 0:
                    fo.write("<hr/>")
                    fo.write('''<blockquote>''')
                for text in notes:
                    if text != "":
                        fo.write("<p>" + text + "</p>")
                if len(tex1) != 0:
                    fo.write("</blockquote>")
        fo.write("</body></html>")
    self.percent += 1
def get_sin(self, l):
    """Download one chapter and save it as ``z<chapter id>.txt``.

    ``l`` is a chapter URL from ``self.href_list``; the text after its
    second ``=`` is used as the chapter id.  When the page references an
    obfuscation font, the replacement table
    ``<self.path>/Fonts/<font>.txt`` is applied to the chapter text if it
    can be read, and the font file itself is downloaded into the same
    directory if it is not there yet.  A chapter with no body text is
    recorded in ``self.failInfo``.  ``self.percent`` is incremented after
    the file has been written.
    """
    converters = {}

    def convert(text):
        # Simplified/traditional conversion selected by self.state; the
        # converter is built once per call instead of once per line.
        mode = {'s': 't2s', 't': 's2t'}.get(self.state)
        if mode is None:
            return text
        if mode not in converters:
            converters[mode] = OpenCC(mode)
        return converters[mode].convert(text)

    def clean(raw, is_note=False):
        # Drop the site watermark and spaces, then convert the script.
        text = re.sub('@无限好文,尽在晋江文学城', '', str(raw))
        text = re.sub(' +', ' ', text).strip()
        text = re.sub(' ', '', text)
        text = convert(text)
        if is_note:
            text = re.sub('作者有话要说:', '作者有话要说:\n', text)
        return text

    titleOrigin = l.split('=')
    i = self.href_list.index(l)
    cont = requests.get(l, headers=self.headerss).content
    dot = etree.HTML(
        cont.decode("GB18030", "ignore").encode("utf-8",
                                                "ignore").decode('utf-8'))

    # Font-based anti-scraping: locate the custom font and its table.
    fontfamily = ''
    cvlist = []
    cvdic = {}
    codetext = etree.tostring(dot, encoding="utf-8").decode()
    fontsrc = re.findall(
        r'//static.jjwxc.net/tmp/fonts/.*?woff2.h=my.jjwxc.net', codetext)
    if fontsrc:
        fontsrc = "http:" + fontsrc[0]
        fontname = re.sub('http://static.jjwxc.net/tmp/fonts/', '', fontsrc)
        fontname = re.sub('.h=my.jjwxc.net', '', fontname)
        fontfamily = re.sub('.woff2', '', fontname)
        try:
            with open(self.path + "/Fonts/" + fontfamily + ".txt", "r",
                      encoding='utf-8') as f:
                cvlist = f.readlines()
        except (OSError, ValueError):
            # No readable table: the text is written without substitution.
            cvlist = []
        for line in cvlist:
            # Split on the first '-' only so that a replacement that is
            # itself '-' does not invalidate the whole table.
            key, sep, value = line.partition('-')
            if sep:
                cvdic[key] = value
        font_path = self.path + "/Fonts/" + fontname
        if not os.path.exists(font_path):
            fontwb = requests.get(fontsrc).content
            os.makedirs(self.path + "/Fonts", exist_ok=True)
            with open(font_path, 'wb') as fontf:
                fontf.write(fontwb)
        if cvlist:
            fontfamily = ''
        elif fontfamily not in self.fontlist:
            self.fontlist.append(fontfamily)

    # tex: chapter body
    tex = dot.xpath('//*[@id="oneboolt"]/tr[2]/td[1]/div/text()')
    # tex1: author's note
    tex1 = dot.xpath("//div[@class='readsmall']/text()")
    # sign: tells whether the author's note precedes the body
    sign = dot.xpath("//*[@id='oneboolt']/tr[2]/td[1]/div/div[4]/@class")

    title = ''
    # zero-padded chapter number
    if self.titleInfo[0] == '1':
        title = str(titleOrigin[2]).zfill(self.fillNum) + "#"
    # chapter name
    if self.titleInfo[1] == '1':
        title = title + " " + self.titleindex[i].strip()
    # chapter summary
    if self.titleInfo[2] == '1':
        title = title + " " + self.Summary[i].strip()
    title = convert(title)

    volume = None
    if l in self.rollSignPlace:
        volume = convert(self.rollSign[self.rollSignPlace.index(l)])

    # Create the chapter file.
    with open("z" + str(titleOrigin[2].zfill(4)) + ".txt", 'w',
              encoding='utf-8') as fo:
        if volume is not None:
            # volume label followed by the chapter title
            fo.write("\r\n\r\n" + volume.rstrip() + '\r\n')
            print("\r\n" + volume + "\r\n")
            fo.write(title + '\r\n')
        else:
            fo.write("\r\n\r\n" + title + "\r\n")

        if len(tex) == 0:
            self.failInfo.append(titleOrigin[2].zfill(self.fillNum))
            fo.write('下载失败!')
        else:
            # Decode the table keys once, then substitute literally: the
            # obfuscated characters are plain text, not regex patterns.
            replacements = []
            for s, v in cvdic.items():
                s = re.sub(r'&#x', r'\\u', s)
                s = re.sub(r';', '',
                           s).encode('utf-8').decode('unicode_escape')
                if s:
                    replacements.append((s, v.strip()))
            for y in range(len(tex)):
                for s, v in replacements:
                    tex[y] = tex[y].replace(s, v)

            body = [clean(tn) for tn in tex]
            notes = [clean(m, is_note=True) for m in tex1]
            if str(sign) == "['readsmall']":
                # author's note first
                first, second = notes, body
            else:
                # author's note last
                first, second = body, notes
            for text in first:
                if text != "":
                    fo.write(text + "\n")
            if len(tex1) != 0:
                fo.write("\n*\r\n")
            for text in second:
                if text != "":
                    fo.write(text + "\n")
    self.percent += 1