def getNextPage(html):
    """Return the address of the next page, or None when there is none.

    The pattern captures the ``href`` of the anchor whose link text is
    "下一页" ("next page") following the ``wp-pagenavi`` pager div.
    ``parse`` is defined elsewhere in this file; it is presumably a thin
    regex-findall wrapper returning a list of captures (confirm).
    """
    pager_pattern = '<div class="wp-pagenavi">.*<a href="(.*?)">下一页</a>'
    candidates = parse(html, pager_pattern)
    # Guard clause: nothing captured means there is no next-page link.
    if len(candidates) == 0:
        return None
    return candidates[0]
# Output directory for generated text files: the project base dir plus a
# "txt" sub-folder, built with backslash separators.
BASE_DIR = getBaseDir() + "\\txt\\"


def getAuthor(html):
    """Return the first author string found in the page HTML.

    The pattern looks for the first ``<p>`` after an ``<h1>`` heading and
    captures the text following a colon inside it.  Matching is done in
    DOTALL mode so the markup may span several lines.

    Raises:
        IndexError: if the pattern does not match anything.
    """
    author_pattern = re.compile('<h1>.*?</h1>.*?<p>.*?:(.*?)</p>', re.S)
    return author_pattern.findall(html)[0]


# --- Fetch the book's index page -------------------------------------------
book_url = "https://www.dhzw.org/book/13/13766/"
prefix = 'https://www.dhzw.org/book/13/13766/'
html = getHtmlByUrl(book_url)

# --- Book title --------------------------------------------------------------
book_name_regex = '<h1>(.*?)</h1>'
result = parse(html, book_name_regex)
# isNull is defined elsewhere; presumably it validates that the match list
# is non-empty (confirm against its definition).
isNull(result)
book_name = result[0]
print(book_name)

# --- Output file paths -------------------------------------------------------
book_txt = BASE_DIR + book_name + '.txt'
book_catalog_txt = BASE_DIR + book_name + "_catalog.txt"
# Removal of previously generated output files is currently disabled:
# deleteFile(book_txt)
# deleteFile(book_catalog_txt)

# --- Chapter catalog ---------------------------------------------------------
# First isolate the <dl> block, then pull (href, title) pairs out of each
# <dd> entry inside it.
catalog_all_regex = '<dl>(.*?)</dl>'
result = parse(html, catalog_all_regex)
isNull(result)
catalogs_regex = '<dd><a href="(.*?)" .*?>(.*?)</a></dd>'
result = parse(result[0], catalogs_regex)
isNull(result)
def formattContent(content):
    """Return *content* with space characters and ``<br />`` tags removed."""
    return content.replace(' ', '').replace('<br />', '')


# --- Fetch the book's reader/index page -------------------------------------
book_url = "http://www.123xiaoqiang.me/modules/article/reader.php?aid=15767"
prefix = 'http://www.123xiaoqiang.me'
BASE_DIR = getBaseDir()
html = getHtmlByUrl(book_url)

# --- Book title --------------------------------------------------------------
title_regex = '<h1>(.*?)</h1>'
title_result = parse(html, title_regex)
# isNull is defined elsewhere; presumably it validates that the match list
# is non-empty (confirm against its definition).
isNull(title_result)
title = title_result[0]

# --- Author ------------------------------------------------------------------
author_regex = '<span>作者:(.*?)</span>'
author_result = parse(html, author_regex)
isNull(author_result)
# Re-attach the "作者:" ("Author:") label in front of the captured name.
author = '作者:' + author_result[0]

# --- Chapter catalog ---------------------------------------------------------
# Narrow ``html`` down to the catalog container; the per-chapter pattern
# captures (href, title) from each list item.
catalog_div_regex = '<div class="liebiao">(.*?)</div>'
catalog_div_result = parse(html, catalog_div_regex)
isNull(catalog_div_result)
html = catalog_div_result[0]
catalog_regex = '<li><a href="(.*?)">(.*?)</a></li>'
def getPicUrl(html):
    """Extract image addresses from a page's HTML.

    The pattern captures the ``src`` of ``<img>`` tags that sit alone in a
    ``<p>`` and carry a class starting with ``alignnone``.  The result of
    ``parse`` (defined elsewhere in this file) is returned unchanged.
    """
    image_pattern = '<p><img src="(.*?)".*?class="alignnone.*?" /></p>'
    return parse(html, image_pattern)
def getPicNum(html):
    """Return the picture count advertised in the page title.

    The count is the run of digits inside a ``[<digits>P]`` marker in the
    ``<title>`` element.  The first capture produced by ``parse`` (defined
    elsewhere in this file) is returned as-is; indexing fails if nothing
    was captured.
    """
    count_pattern = '<title>.*\\[(\\d+)P\\].*</title>'
    counts = parse(html, count_pattern)
    return counts[0]
def getName(html):
    """Return the comic's name taken from the page's entry-title heading.

    The first capture produced by ``parse`` (defined elsewhere in this
    file) for the ``<h1 class="entry-title">`` element is returned;
    indexing fails if nothing was captured.
    """
    title_pattern = '<h1 class="entry-title">(.*?)</h1>'
    titles = parse(html, title_pattern)
    return titles[0]