def process_novel(self, novel: Novel, url: str):
    """Download one novel's metadata and all chapter pages, then persist it.

    The whole download is retried up to ``self.retry_times`` times.  On
    success the chapter text is written to ``novels/<id>_<title>.txt``, the
    novel row is inserted via ``self._db`` and the outcome is logged to
    ``self.log_file``.

    Args:
        novel: partially populated Novel (id/title/author already set by
            the caller); ``count`` and ``type`` are filled in here.
        url: the novel's detail-page URL.
    """
    retry_time = 0
    while retry_time < self.retry_times:
        try:
            result = self._opener.open(url).read().decode()
        except Exception as e:
            print(e)
            # BUG FIX: the original bare ``continue`` never counted this
            # failure, so a persistently unreachable URL spun forever.
            retry_time += 1
            continue
        # print(novel)
        # Scrape the metadata off the detail page.
        novel.count = int(
            re.findall('<div> 总字数:([0-9]*?)</div>', result).pop())
        novel.type = re.findall(
            '<div>类别:<a href="/wapsort/[0-9]*?_[0-9]*?\.html" title=".*?">(.*?)</a></div>',
            result).pop()
        # URL of chapter 1 and the total chapter count.
        page = BASE_URL + re.findall(
            '<a href="(/novel/[0-9]*?/[0-9]*?\.html)" title="' + PAGE + '">1</a>',
            result).pop()
        max_page = int(
            re.findall(
                '<span style="color:#666;font-size:11px;line-height: 22px;">共([0-9]*?)章节</span>',
                result).pop())
        # print(page)
        result = self._opener.open(page).read().decode()
        # NOTE(review): opened with the platform default encoding — may fail
        # for CJK text on some systems; confirm before hardening.
        with open('novels/%d_%s.txt' % (novel.id, novel.title), 'w') as f:
            try:
                for i in range(1, max_page + 1):
                    # Strip the HTML wrapping from the chapter body.
                    content = re.findall(
                        '<div id="nr1" style="font-size:18px;">([\s\S]*?)</div>',
                        result).pop().replace(
                            '</p>\r\n<p></p>', '').replace('<p>', '').replace('</p>', '').replace(
                                ' ', ' ')
                    # print(content)
                    print('%s->%d/%d' % (novel.title, i, max_page))
                    f.write(content)
                    if i == max_page:
                        break
                    # Follow the "next chapter" link, politely throttled.
                    page = re.findall(
                        '<td class="next"><a id="pt_next" href="(' + BASE_URL + '/novel/[0-9]*?/[0-9]*?\.html)">下一章</a></td>',
                        result).pop()
                    # print(page)
                    result = self._opener.open(page).read().decode()
                    time.sleep(1)
                self._db.insert_novel(novel)
                print(novel.title + ' Done')
                self.log_file.write(novel.title + " Done\n")
                return
            except Exception as e:
                # Any mid-download failure counts against the retry budget.
                print(e)
                retry_time += 1
    print("!!!Fail to save %s" % novel.title)
    self.log_file.write("!!!Fail to save %s\n" % novel.title)
def start(self):
    """Crawl the category listing and queue every unseen novel for download.

    Reads the listing's own pagination widget to find the page count, then
    walks each listing page, extracts id/title/url/author/description for
    every novel, skips ones already in the database and submits the rest to
    the worker pool.
    """
    result = self._opener.open(BASE_URL + '/wapsort/11_1.html').read().decode()
    # The page-jump input advertises the last page number in its max attr.
    max_page = int(
        re.findall(
            '<input style="width: 50%;" type="number" name="page" value="" id="go_page" min="1" max="([0-9]*?)" />',
            result).pop())
    # max_page = 5
    # BUG FIX: range(1, max_page) skipped the final listing page.
    for i in range(1, max_page + 1):
        result = self._opener.open(BASE_URL + '/wapsort/11_%d.html' % i).read().decode()
        # Parallel lists — index j lines up across all five of them.
        ids = re.findall(
            '<h3><a href="' + BASE_URL + '/novel/([0-9]*)\.html">.*?</a></h3>',
            result)
        titles = re.findall(
            '<h3><a href="' + BASE_URL + '/novel/[0-9]*\.html">(.*?)</a></h3>',
            result)
        urls = re.findall(
            '<h3><a href="(' + BASE_URL + '/novel/[0-9]*\.html)">', result)
        authors = re.findall('<p>作者:<strong>(.*?)</strong></p>', result)
        descriptions = re.findall(
            '<span class="abstract"><a href="' + BASE_URL + '/novel/[0-9]*\.html">([\s\S]*?)</a></span>',
            result)
        for j in range(len(titles)):
            novel = Novel(titles[j], authors[j], descriptions[j], id=int(ids[j]))
            if self._db.check_novel_exists(novel.id):
                print("Skip existing: %s" % novel.title)
                continue
            # BUG FIX: ``self.process_novel`` is already bound; the original
            # passed ``self`` again, which shifted it into the ``novel``
            # parameter and broke every worker invocation.
            self._pool.submit(self.process_novel, novel, urls[j])
def html_parse(html, fileHandle):
    """Parse the reading-list HTML into Novel objects or a save file.

    Args:
        html: raw HTML of the reading-list page.
        fileHandle: truthy -> return a list of Novel objects; falsy ->
            write ``readingList.txt`` instead (work-in-progress offline
            reading-list feature) and return None.

    Exits the process if the chapter links do not pair up.
    """
    reading_list = []
    soup = BeautifulSoup(html, 'html.parser')
    print("\t Parsing Reading List")
    print(" ====================================")
    title_names = soup.find_all("tr", attrs={"class": "rl_links"})
    chapters = soup.find_all("a", attrs={"class": "chp-release"})
    counter = 0
    # Chapters come in (current, latest) pairs, so the count must be even.
    if (len(chapters) % 2) != 0:
        print("Error: Incorrect number of chapters")
        sys.exit()
    if fileHandle:
        for i in range(len(title_names)):
            title = title_names[i].attrs["data-title"]
            # ``i + counter`` steps through the flat list two entries per title.
            current_chapter = chapters[i + counter].get_text()
            latest_chapter = chapters[i + counter + 1].get_text()
            counter += 1
            # NOTE(review): constructed as (title, latest, current) but the
            # save file below writes (title, current, latest) — confirm the
            # intended Novel argument order.
            reading_list.append(Novel(title, latest_chapter, current_chapter))
        return reading_list
    else:
        # BUG FIX: the save file was never closed; ``with`` guarantees the
        # buffered writes are flushed even on error.
        with open("readingList.txt", "w") as saveFile:
            counter = 0
            for i in range(len(title_names)):
                saveFile.write(title_names[i].attrs["data-title"] + ",")
                saveFile.write(chapters[i + counter].get_text() + ",")
                saveFile.write(chapters[i + counter + 1].get_text() + "\n")
                counter += 1
        print("The save file,'readingList.txt' has been created...")
def search_novel():
    """Handle the search form: build Novel objects from the column-wise
    search() result and render the result page."""
    text = request.form.get('content')
    columns = search(text)
    # ``columns`` is six parallel lists; row ``idx`` spans all of them.
    novels = [
        Novel(columns[0][idx], columns[1][idx], columns[2][idx],
              columns[3][idx], columns[4][idx], columns[5][idx])
        for idx in range(len(columns[0]))
    ]
    return render_template('search_result.html', text=text, novel=novels)
def grab_volume(url, output_dir, cover_path, out_format):
    """Fetch one volume and render it to an e-book file.

    Args:
        url: volume page URL supplied by the user.
        output_dir: directory for the generated file.
        cover_path: path of the cover image.
        out_format: desired output format.

    Re-raises any scraping/generation error after printing it.
    """
    try:
        print('Getting:' + url)
        volume = Novel(url=url, single_thread=_SINGLE_THREAD)
        volume.get_novel_information()
        book = Epub(output_dir=output_dir,
                    cover_path=cover_path,
                    out_format=out_format,
                    **volume.novel_information())
        book.generate_file()
    except Exception as e:
        print('错误', str(e) + '\nAt:' + url)
        raise e
def scrape():
    """Flask endpoint: scrape the book at ``url`` (from the JSON body) and
    send the resulting file back.

    In DEBUG mode exceptions propagate for full tracebacks; otherwise any
    scraping failure is reported as a 400 response.
    """
    try:
        url = request.json["url"]
    except KeyError:
        return "Missing url in request", 400

    def _collect():
        # Scrape into a fresh timestamp-named directory and stream the file.
        d = f"{DATA}/{time()}"
        os.mkdir(d)
        fname = Novel(url).collect(d)
        return send_file(os.path.join(d, fname))

    # De-duplicated: both branches previously repeated the whole body.
    if DEBUG:
        return _collect()
    try:
        return _collect()
    except Exception as e:
        print(e)
        return "Failed to scrape book", 400
def grab_volume(url, output_dir, cover_path):
    """Fetch one volume and render it to an EPUB file.

    Args:
        url: volume page URL supplied by the user.
        output_dir: directory for the generated EPUB.
        cover_path: path of the EPUB cover image.

    On failure, optionally notifies the Qt UI, prints the URL and re-raises.
    """
    try:
        print_info('Getting:' + url)
        volume = Novel(url=url, single_thread=SINGLE_THREAD)
        volume.get_novel_information()
        book = Epub(output_dir=output_dir,
                    cover_path=cover_path,
                    **volume.novel_information())
        book.generate_epub()
    except Exception as e:
        if HAS_QT:
            SENDER.sigWarningMessage.emit('错误', str(e) + '\nat:' + url)
            SENDER.sigButton.emit()
        print(url)
        raise e
def fileReader():
    """Load the saved reading list from ``readingList.txt``.

    Returns:
        list: one Novel per comma-separated line of the save file.

    Exits the process if the save file is missing.
    """
    if not os.path.isfile("readingList.txt"):
        print("Error: Save file, 'readingList.txt' is missing")
        sys.exit()
    readingList = []
    # BUG FIX: the file handle was never closed (now ``with``), and the
    # third field was always truncated by one character — which corrupted a
    # final line lacking a trailing newline.  Strip the newline explicitly.
    with open("readingList.txt", "r") as saveFile:
        for line in saveFile:
            data = line.split(",")
            # NOTE(review): field order on disk is title, current, latest;
            # confirm it matches Novel's constructor order.
            readingList.append(Novel(data[0], data[1], data[2].rstrip("\n")))
    return readingList
def __init__(self, bookid):
    """Site-specific downloader setup for www.69shu.io."""
    Novel.__init__(self, bookid)
    # Base URL prepended to the site's relative links.
    self.urlheader = 'https://www.69shu.io'
    # Lines matching these exactly are presumably dropped from chapter
    # text by the base class — confirm against Novel's filtering logic.
    self.blackliststrline = [
        '一秒记住【69书吧www.69shu.io】,更新快,无弹窗,免费读!',
    ]
    # Substring blacklist; empty for this site.
    self.blackliststr = []
def __init__(self, bookid):
    """Site-specific downloader setup for www.biqudu.net."""
    Novel.__init__(self, bookid)
    # Base URL prepended to the site's relative links.
    self.urlheader = 'https://www.biqudu.net'
    # Lines matching these exactly are presumably dropped from chapter
    # text by the base class — confirm against Novel's filtering logic.
    self.blackliststrline = [
        'readx();',
        'chaptererror();',
    ]
    # Substring blacklist; empty for this site.
    self.blackliststr = []
from novel import Novel
from printer import Printer
from librarian import Librarian

# Demo script: create a novel and register it with a librarian and a printer.
novel = Novel("1984", "George Orwell")

librarian = Librarian()
librarian.setNovel(novel)

printer = Printer()
printer.setNovel(novel)

# Update the publication year.
novel.setYear(1949)

# Update the content.
novel.setContent(
    "Freedom is the freedom to say that two plus two make four. If that is granted, all else follows."
)
from novel import Novel

# Analyse "The Three-Body Problem" (三体): count the occurrences of the
# main characters' names in the text and draw the resulting chart.
novel = Novel("三体.txt", "三体主要人物.txt")
novel.draw_picture("三体主要人物名字出现次数")
def analyze_text(self, book_folder, out_folder):
    """Run the full character-extraction pipeline on one book.

    Reads ``book_folder + self.input_file``, extracts person names, clusters
    aliases, de-aliases the text, runs coreference, and writes all
    intermediate artifacts into ``out_folder/<filename>/``.

    Args:
        book_folder: directory containing the input text file.
        out_folder: root directory for per-book result folders.

    Returns:
        The novel's cluster-repetitions DataFrame.

    NOTE: the call order below is significant (see the coreference comment);
    do not reorder the pipeline steps.
    """
    # Result folder is named after the input file minus its first extension.
    filename = self.input_file.split('.')[0]
    result_book_folder = out_folder + filename + "/"
    if not os.path.exists(result_book_folder):
        os.makedirs(os.path.dirname(result_book_folder))
    novel = Novel(book_folder + self.input_file)
    novel.read()
    novel.parse_persons()
    novel.find_persons_title()
    # Raw name list, before any pruning.
    novel.store(filename=result_book_folder + self.all_names, data=novel.persons)
    # if you do not remove single occurrences, eps behaviour will be unstable
    occurrence_limit = 2
    novel.remove_less_than(occurrences=occurrence_limit)
    novel.store(filename=result_book_folder + filename + "_names_more_than_" +
                str(occurrence_limit) + ".csv",
                data=novel.persons)
    # Group name variants into alias clusters, then attach single names.
    novel.cluster_aliases()
    novel.associate_simple_single_names()
    novel.associate_single_names()
    novel.store(filename=result_book_folder + self.clusters,
                data=novel.cluster_repetitions)
    novel.create_cluster_repetitions_df()
    novel.cluster_repetitions_df.to_pickle(result_book_folder + filename + '.pkl')
    # Replace every alias in the text with its canonical name.
    novel.dealiases()
    novel.store(filename=result_book_folder + filename + "_dealiased.txt",
                data=novel.dealiased_text, type='txt')
    # Do the coreference after the dealias, because sometimes the coreference
    # writes a name just after a separation, and this leads to undesirable
    # situations in which names run together (e.g. "Potter,Hermione").
    novel.coreference()
    novel.store(filename=result_book_folder + self.output_file,
                data=novel.dealiased_text, type='txt')
    self.novel = novel
    return novel.cluster_repetitions_df
def analyze_text(self, book_folder, out_folder):
    """Run the (simplified) character-extraction pipeline on one book.

    Reads ``book_folder + self.input_file``, extracts person names, clusters
    aliases, de-aliases the text, and writes all intermediate artifacts into
    ``out_folder/<filename>/``.  Stores the Novel on ``self.novel``; returns
    nothing.

    Args:
        book_folder: directory containing the input text file.
        out_folder: root directory for per-book result folders.

    NOTE: the pipeline steps below are order-dependent; do not reorder.
    """
    # Result folder is named after the input file minus its first extension.
    filename = self.input_file.split('.')[0]
    result_book_folder = out_folder + filename + "/"
    if not os.path.exists(result_book_folder):
        os.makedirs(os.path.dirname(result_book_folder))
    novel = Novel(book_folder + self.input_file)
    novel.read()
    novel.parse_persons()
    # Raw name list, before any pruning.
    novel.store(filename=result_book_folder + self.all_names, data=novel.persons)
    # if you do not remove single occurrences, eps behaviour will be unstable
    occurrence_limit = 2
    novel.remove_less_than(occurrences=occurrence_limit)
    novel.store(filename=result_book_folder + filename + "_names_more_than_" +
                str(occurrence_limit) + ".csv",
                data=novel.persons)
    # Group name variants into alias clusters, then attach single names.
    novel.cluster_aliases()
    novel.associate_single_names()
    novel.store(filename=result_book_folder + self.clusters,
                data=novel.cluster_repetitions)
    # Replace every alias in the text with its canonical name.
    novel.dealiases()
    novel.store(filename=result_book_folder + self.output_file,
                data=novel.dealiased_text, type='txt')
    self.novel = novel
from novel import Novel

if __name__ == '__main__':
    # Load the catalogue and fetch the 'Archfiend' entry.
    Novel.load_novels()
    novel = Novel.get_novel('Archfiend')
    assert isinstance(novel, Novel)
    # Only book "1" is currently selected; books 13 and 15-19 were tried
    # previously and are left disabled.
    novel.add_chosen_book("1")
    novel.process()