import random
import re
import time
import zlib

from pyquery import PyQuery

# SpiderTools, NovelResource, DBhelper and default_dbhelper are project-local
# helpers defined elsewhere in this repo; their import paths are not shown here.


def novel_simple_save(self):
    self.__get_category_list()
    # Crawl the novel listings category by category.
    for key in self.category_list:
        html = None
        templet = self.templet_format(self, key)
        if SpiderTools.getRes().category_template is not None:
            # Fetch page 1 first so the total page count can be read from it.
            index_url = templet.format(1)
            html = SpiderTools.get_html(
                url=index_url,
                header_host=SpiderTools.getRes().host,
                encoding=SpiderTools.getRes().encoding,
                network_err_fn=SpiderTools.save_to_file(
                    "pageList.bak", index_url))
            if html is None:
                continue
            self.__get_category_page_count(html)
        # Walk the listing pages of this category.
        for i in range(1, SpiderTools.getRes().category_page_count):
            insertnovels = []
            if i != 1:
                index_url = templet.format(i)
                html = SpiderTools.get_html(
                    url=index_url,
                    header_host=SpiderTools.getRes().host,
                    encoding=SpiderTools.getRes().encoding,
                    network_err_fn=SpiderTools.save_to_file(
                        "pageList.bak", index_url))
            if html is None:
                continue
            novels = SpiderTools.get_pyquery_content(
                html, SpiderTools.getRes().select_novel_line)
            if len(novels) == 0:
                break
            # Assemble the brief info (name, url, author) for each novel row.
            for item in novels:
                novelname = SpiderTools.get_pyquery_content(
                    item, SpiderTools.getRes().select_novel_name).text()
                if novelname is None or novelname == '':
                    continue
                url = SpiderTools.getRes().host + SpiderTools.get_pyquery_content(
                    item, SpiderTools.getRes().select_novel_name).attr("href")
                author = SpiderTools.get_pyquery_content(
                    item,
                    SpiderTools.getRes().select_novel_author).text().split(" ")[0]
                # NOTE: str(tuple) relies on Python repr quoting; a title that
                # contains quote characters can break the assembled SQL below.
                insertnovels.append(
                    str((novelname, url, author,
                         SpiderTools.getRes().source_id)))
            # Persist the brief novel info to the database in one batch.
            if len(insertnovels) == 0:
                break
            sql = "insert into novel (`name`,`source`,`author`,`sourceid`) values "
            sql = sql + ",".join(insertnovels) + \
                " on DUPLICATE key update source = values(source)"
            default_dbhelper.update(sql)
            time.sleep(random.uniform(1, 3))
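
# A minimal sketch of the error-callback contract assumed above; this is NOT
# the real SpiderTools.save_to_file, whose implementation is not shown here.
# The call sites suggest it is curried: invoked eagerly with a file name and a
# payload, it returns a callable that get_html fires on network failure,
# appending the payload to a ".bak" file so failed requests can be replayed.
def save_to_file_sketch(file_name, save_text):
    def network_err_fn():
        # Append-mode write so earlier failures in the same run are kept.
        with open(file_name, "a", encoding="utf-8") as bak:
            bak.write(save_text + "\n")
    return network_err_fn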
def get_page_count(self, html):
    # Default page-count extraction; site-specific spiders are expected to
    # override this when the generic selector cannot be parsed as an int.
    try:
        page_count = SpiderTools.get_pyquery_content(
            html, self.select_category_page_count)
        return int(page_count.text())
    except Exception as e:
        raise ValueError(
            "Override this method: the default cannot extract the page count"
        ) from e
def get_chapters_save(self, url, novel_id):
    # On network failure, log the chapter-list URL so it can be replayed later.
    novel_mulu_fn = SpiderTools.save_to_file(
        file_name="novel_mulu.bak", save_text=url + "," + str(novel_id))
    html = SpiderTools.get_html(
        url,
        encoding=SpiderTools.getRes().encoding,
        network_err_fn=novel_mulu_fn)
    if html is None:
        return 0
    chapters = SpiderTools.get_pyquery_content(
        html, SpiderTools.getRes().select_chapter)
    insertchapters = []
    # Cap a single novel at 2000 chapters.
    for chapter_id in range(len(chapters)):
        if chapter_id > 2000:
            break
        title = chapters.eq(chapter_id).text()
        source = SpiderTools.getRes().chapter_url(
            SpiderTools.getRes(), chapters.eq(chapter_id).attr("href"), url)
        # Double any "%" in the title because the assembled SQL is run through
        # %-style string interpolation downstream.
        insertchapters.append(
            str((novel_id, chapter_id + 1, str(title).replace("%", "%%"),
                 source, SpiderTools.sourceid)))
    # Guard against an empty chapter list, which would produce malformed SQL.
    if len(insertchapters) == 0:
        return 0
    self.init_chapter_table(novel_id)
    sql = "INSERT into %s (novelId,chapterId,title,source,sourceid) VALUES " % \
        SpiderTools.table_name[SpiderTools.sourceid]
    sql = sql + ",".join(insertchapters)
    default_dbhelper.update(sql)
    SpiderTools.total[SpiderTools.sourceid] += len(insertchapters)
    # Once this source has accumulated over five million chapters, close the
    # open-ended router range at the current novel id.
    if SpiderTools.total[SpiderTools.sourceid] > 5000000:
        default_dbhelper.update(
            "update router set novel_id_end = %s where sourceid = %s and novel_id_end is null",
            (novel_id, SpiderTools.sourceid))
    return len(insertchapters)
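
# Worked example of the "%"-doubling above. This assumes default_dbhelper runs
# its SQL through Python %-style interpolation even when no parameters are
# bound, which is what the escaping in get_chapters_save implies:
#
#   "VALUES ('50%% off')" % ()   ->  "VALUES ('50% off')"  doubled "%" survives
#   "VALUES ('50% off')"  % ()   ->  ValueError            lone "%" is read as
#                                                          a format placeholder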
def __get_category_list(self):
    if self.encoding is None:
        self.encoding = "utf-8"
    homepage = SpiderTools.get_html(self.home_page, encoding=self.encoding)
    items = SpiderTools.get_pyquery_content(homepage, self.select_category)
    for item in items:
        # Iterating a PyQuery result yields raw DOM nodes, so re-wrap each one.
        url = PyQuery(item).attr("href")
        if not str(url).startswith("http"):
            # Make relative category links absolute.
            url = self.host + url
        category = PyQuery(item).text()
        self.category_list[category] = url
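
# A minimal sketch of what SpiderTools.get_pyquery_content is assumed to do
# (the real helper lives elsewhere): wrap the html in PyQuery and apply a CSS
# selector. Iterating the result yields raw lxml DOM nodes, which is why
# __get_category_list re-wraps each item with PyQuery(item).
def get_pyquery_content_sketch(html, selector):
    return PyQuery(html)(selector)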
def novel_chapter_detail_save_by_tablename(table_name):
    # Credentials are masked in this excerpt.
    dbhelper = DBhelper(host="localhost", user='******',
                        password='******', database='novels')
    # Claim unfetched chapters (flag = 0) in batches of 100.
    sql = ("select novelId,chapterId,source,sourceid from {} "
           "where flag = 0 limit 100").format(table_name)
    updatesql = ("update {} set flag = 1, content = %s "
                 "where novelId = %s and chapterId = %s").format(table_name)
    result = dbhelper.query(sql)
    while result is not None and len(result) > 0:
        for novelId, chapterId, source, sourceid in result:
            SpiderTools.sourceid = sourceid
            html = SpiderTools.get_html(
                source,
                encoding=SpiderTools.getRes().encoding,
                header_host=SpiderTools.getRes().host,
                network_err_fn=SpiderTools.deal_with_status_500(
                    table_name, novelId, chapterId, dbhelper))
            if html is None:
                time.sleep(random.uniform(0.2, 0.4))
                continue
            # Extract the chapter body, drop inline scripts, and store it
            # zlib-compressed to save space.
            content = SpiderTools.get_pyquery_content(
                html, SpiderTools.getRes().select_chapter_content)
            content.remove("script")
            text = content.text().encode("utf-8", errors="ignore")
            zlib_chapter_text = zlib.compress(text)
            dbhelper.update(updatesql, (zlib_chapter_text, novelId, chapterId))
            if chapterId == 1:
                # Once chapter 1 is stored, flip the novel row to status 2.
                dbhelper.update("update novel set status = 2 where id = %s",
                                (novelId,))
            time.sleep(random.uniform(0.2, 0.4))
        result = dbhelper.query(sql)
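
# Companion sketch: chapter bodies are stored zlib-compressed, so reading one
# back reverses the steps above. The helper itself is hypothetical, and it
# assumes DBhelper.query accepts bind parameters the same way update does.
def load_chapter_text(dbhelper, table_name, novel_id, chapter_id):
    rows = dbhelper.query(
        "select content from {} where novelId = %s and chapterId = %s".format(
            table_name),
        (novel_id, chapter_id))
    if not rows or rows[0][0] is None:
        return None
    # zlib.decompress undoes zlib.compress; the text was utf-8 encoded first.
    return zlib.decompress(rows[0][0]).decode("utf-8", errors="ignore")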
def novel_detail_save(self):
    # Build a name -> id map of genre tags from the dictionary table.
    tags = {}
    tag_list = default_dbhelper.query(
        "SELECT id,`name` from dictionary where type = 'tag'")
    for _item in tag_list:
        tags[_item[1]] = _item[0]
    # Keep going while any novel still lacks its detail fields.
    count_sql = "select count(1) from novel where tagid is null limit 1"
    count = default_dbhelper.query_one(count_sql)
    while int(count[0]) > 0:
        values = default_dbhelper.query(
            "select source,id,sourceid from novel where tagid is null limit 0,500")
        if values is None or len(values) == 0:
            break
        for item in values:
            novel_home_url = item[0]
            novel_id = item[1]
            SpiderTools.sourceid = item[2]
            # Fetch the novel's detail page.
            html = SpiderTools.get_html(
                novel_home_url, encoding=SpiderTools.getRes().encoding)
            if html is None:
                continue
            # Cover image URL.
            cover = SpiderTools.get_pyquery_content(
                html, SpiderTools.getRes().select_novel_cover).attr("src")
            # Genre tag: only the first two characters are matched below.
            tag = str(SpiderTools.get_pyquery_content(
                html, SpiderTools.getRes().select_novel_tag).text())[0:2]
            # Synopsis.
            introduction = SpiderTools.get_pyquery_content(
                html, SpiderTools.getRes().select_novel_introduction).text()
            bconver = None
            if cover is not None:
                # This callback records covers whose download failed.
                img_fn = SpiderTools.save_to_file(
                    "img.bak", cover + "," + str(novel_id))
                bconver = SpiderTools.get_html(
                    cover, return_type="binary", network_err_fn=img_fn)
            # Map the scraped tag text onto a dictionary tag id.
            tag_id = 0
            for t in tags:
                if str(t).find(tag) > -1:
                    tag_id = tags[t]
                    break
            time.sleep(random.uniform(0.5, 1.5))
            # Fetch and save the chapter list, then update the novel row.
            is_update = self.get_chapters_save(
                novel_home_url + SpiderTools.getRes().list_url_template,
                novel_id)
            if is_update == 0:
                continue
            default_dbhelper.update(
                "update novel set tagid = %s, introduction = %s, cover = %s "
                "where id = %s",
                (tag_id, introduction, bconver, novel_id))
            time.sleep(random.uniform(0.5, 1.5))
        count = default_dbhelper.query_one(count_sql)
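
# Hypothetical entry point tying the passes together: novel_simple_save
# collects name/url/author stubs per category, novel_detail_save fills in
# covers, tags, synopses and chapter lists, and the function below downloads
# chapter bodies for one table. The table name "chapter_detail_1" is
# illustrative; real names come from SpiderTools.table_name[sourceid].
if __name__ == "__main__":
    novel_chapter_detail_save_by_tablename("chapter_detail_1")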
def quanwenyuedu_get_page_count(target: NovelResource, html):
    # Site-specific override of get_page_count: the selected text contains
    # several numbers, and the page count is the second one.
    page_count_txt = SpiderTools.get_pyquery_content(
        html, target.select_category_page_count)
    p = re.findall(r'\d+', page_count_txt.text())
    return int(p[1])
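
# Sketch of how a per-site hook like the one above might be wired in. The
# attribute name on NovelResource is an assumption; the explicit target-first
# calling convention mirrors SpiderTools.getRes().chapter_url(
# SpiderTools.getRes(), ...) used in get_chapters_save:
#
#   res = SpiderTools.getRes()
#   res.get_page_count = quanwenyuedu_get_page_count
#   count = res.get_page_count(res, html)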