Esempio n. 1
0
 def __get_category_list(self):
     """Scrape the site home page and fill ``self.category_list`` with
     {category text: absolute URL} entries."""
     if self.encoding is None:
         # Default to UTF-8 when the subclass did not configure one.
         self.encoding = "utf-8"
     page_html = SpiderTools.get_html(self.home_page, encoding=self.encoding)
     anchors = SpiderTools.get_pyquery_content(page_html, self.select_category)
     for anchor in anchors:
         node = PyQuery(anchor)
         link = node.attr("href")
         if not str(link).startswith("http"):
             # Relative link: prefix with the site host.
             link = self.host + link
         self.category_list[node.text()] = link
Esempio n. 2
0
 def get_page_count(self, html):
     """Extract the category page count from *html* and return it as an int.

     Default implementation reads the element selected by
     ``self.select_category_page_count``; subclasses whose sites expose
     the count differently must override this method.

     Raises:
         ValueError: when the selector yields nothing parseable as an int.
     """
     try:
         page_count = SpiderTools.get_pyquery_content(
             html, self.select_category_page_count)
         return int(page_count)
     except Exception as err:
         # Was ``except BaseException`` — that also swallowed
         # KeyboardInterrupt/SystemExit.  Narrow to Exception and chain
         # the original cause for debuggability.
         raise ValueError("请重写此方法,默认方法无法获取count") from err
Esempio n. 3
0
 def get_chapters_save(self, url, novel_id):
     """Fetch the chapter list page at *url* and bulk-insert the chapters
     for novel *novel_id* into the per-source chapter table.

     Returns 0 when the page could not be fetched (caller treats this as
     "nothing saved"); otherwise returns None after inserting.
     """
     # Callback that logs (url, novel_id) to a .bak file so a failed
     # fetch can be retried later.
     novel_mulu_fn = SpiderTools.save_to_file(file_name="novel_mulu.bak",
                                              save_text=url + "," +
                                              str(novel_id))
     html = SpiderTools.get_html(url,
                                 encoding=SpiderTools.getRes().encoding,
                                 network_err_fn=novel_mulu_fn)
     if html is None:
         return 0
     chapters = SpiderTools.get_pyquery_content(
         html,
         SpiderTools.getRes().select_chapter)
     insertchapters = []
     for chapter_id in range(0, len(chapters), 1):
         # Hard cap: never store more than ~2000 chapters per novel.
         if chapter_id > 2000:
             break
         title = chapters.eq(chapter_id).text()
         # chapter_url is a plain function attached to the resource
         # object, hence the explicit getRes() "self" argument.
         source = SpiderTools.getRes().chapter_url(
             SpiderTools.getRes(),
             chapters.eq(chapter_id).attr("href"), url)
         # "%" doubled because the final SQL string goes through a
         # %-style formatting layer in the DB helper.
         insertchapters.append(
             str((novel_id, chapter_id + 1, str(title).replace("%", "%%"),
                  source, SpiderTools.sourceid)))
     self.init_chapter_table(novel_id)
     # NOTE(review): the VALUES clause is built by string concatenation
     # of str(tuple) fragments, not bound parameters — titles containing
     # quotes can break or inject into this statement.  Left as-is here;
     # worth migrating to executemany with placeholders.
     sql = "INSERT into %s (novelId,chapterId,title,source,sourceid) VALUES " % SpiderTools.table_name[
         SpiderTools.sourceid]
     sql = sql + ",".join(insertchapters)
     default_dbhelper.update(sql)
     # Running per-source total; once it crosses 5M chapters, close off
     # the router range for this source.
     SpiderTools.total[SpiderTools.sourceid] = SpiderTools.total[
         SpiderTools.sourceid] + len(insertchapters)
     if SpiderTools.total[SpiderTools.sourceid] > 5000000:
         default_dbhelper.update(
             "update router set novel_id_end = %s where sourceid = %s and novel_id_end is null",
             (novel_id, SpiderTools.sourceid))
Esempio n. 4
0
def novel_chapter_detail_save_by_tablename(table_name):
    """Download and persist chapter bodies for every pending row of *table_name*.

    Repeatedly selects up to 100 chapters with ``flag = 0``, fetches each
    chapter page, strips <script> tags, zlib-compresses the UTF-8 text and
    stores it while marking the row ``flag = 1``.  Loops until no pending
    rows remain.

    NOTE: *table_name* is interpolated directly into the SQL strings; it
    must come from trusted code, never from user input.
    """
    dbhelper = DBhelper(host="localhost",
                        user='******',
                        password='******',
                        database='novels')
    sql = "select novelId,chapterId,source,sourceid from {} where flag = 0 limit 100".format(
        table_name)
    # Loop-invariant: build the UPDATE statement once, not once per batch.
    updatesql = "update {} set flag = 1 ,content = %s where novelId = %s and chapterId = %s".format(
        table_name)
    result = dbhelper.query(sql)
    while result is not None and len(result) > 0:
        for item in result:
            novelId, chapterId, source, sourceid = item[0], item[1], item[
                2], item[3]
            # getRes() dispatches on this module-level sourceid.
            SpiderTools.sourceid = sourceid
            html = SpiderTools.get_html(
                source,
                encoding=SpiderTools.getRes().encoding,
                header_host=SpiderTools.getRes().host,
                network_err_fn=SpiderTools.deal_with_status_500(
                    table_name, novelId, chapterId, dbhelper))
            if html is None:
                # Fetch failed; back off briefly and move on.
                time.sleep(random.uniform(0.2, 0.4))
                continue
            content = SpiderTools.get_pyquery_content(
                html,
                SpiderTools.getRes().select_chapter_content)
            content.remove("script")
            text = content.text().encode("utf-8", errors="ignore")
            zlib_chapter_text = zlib.compress(text)
            dbhelper.update(updatesql, (zlib_chapter_text, novelId, chapterId))
            if chapterId == 1:
                # First chapter saved -> mark the novel as in progress.
                updateNovelSql = "update novel set status = 2 where id =%s"
                # BUG FIX: ``(novelId)`` is just the int — a one-element
                # parameter sequence needs the trailing comma.
                dbhelper.update(updateNovelSql, (novelId,))
            # Polite crawl delay between chapter fetches.
            time.sleep(random.uniform(0.2, 0.4))
        result = dbhelper.query(sql)
Esempio n. 5
0
 def novel_detail_save(self):
     """Fill in detail fields (tag, introduction, cover) for every novel
     row whose ``tagid`` is still NULL, saving chapters along the way.

     Works in batches of 500 and keeps looping until the count of
     untagged novels reaches zero.
     """
     # Map tag name -> dictionary id for the tag lookup below.
     tags = {}
     tag_list = default_dbhelper.query(
         "SELECT id,`name` from dictionary where type = 'tag'")
     for _item in tag_list:
         tags[_item[1]] = _item[0]
     # Fetch the novels that still need updating.
     count_sql = "select count(1) from novel where tagid is null limit 1"
     count = default_dbhelper.query_one(count_sql)
     while int(count[0]) > 0:
         values = default_dbhelper.query(
             "select source,id,sourceid from novel where tagid is null limit 0,500"
         )
         if values is None or len(values) == 0:
             break
         for item in values:
             novel_home_url = item[0]
             novel_id = item[1]
             # getRes() dispatches on this module-level sourceid.
             SpiderTools.sourceid = item[2]
             # Fetch and save the novel's detail page.
             html = SpiderTools.get_html(
                 novel_home_url, encoding=SpiderTools.getRes().encoding)
             if html is None:
                 continue
             # Cover image URL.
             cover = SpiderTools.get_pyquery_content(
                 html,
                 SpiderTools.getRes().select_novel_cover).attr("src")
             # Genre/tag: only the first two characters are matched.
             tag = str(
                 SpiderTools.get_pyquery_content(
                     html,
                     SpiderTools.getRes().select_novel_tag).text())[0:2]
             # Introduction / blurb text.
             introduction = SpiderTools.get_pyquery_content(
                 html,
                 SpiderTools.getRes().select_novel_introduction).text()
             bconver = None
             if cover is not None:
                 # Callback that records cover URLs whose download failed.
                 img_fn = SpiderTools.save_to_file(
                     "img.bak", cover + "," + str(novel_id))
                 # Raw image bytes, stored straight into the DB below.
                 bconver = SpiderTools.get_html(cover,
                                                return_type="binary",
                                                network_err_fn=img_fn)
             # Resolve tag name -> tag_id (substring match; 0 = unknown).
             tag_id = 0
             for t in tags:
                 if str(t).find(tag) > -1:
                     tag_id = tags[t]
                     break
             time.sleep(random.uniform(0.5, 1.5))
             # Fetch and save the chapter list, then update the novel row.
             is_update = self.get_chapters_save(
                 novel_home_url + SpiderTools.getRes().list_url_template,
                 novel_id)
             if is_update == 0:
                 # Chapter page fetch failed — leave tagid NULL for retry.
                 continue
             default_dbhelper.update(
                 " update novel set tagid = %s,introduction = %s,cover = %s where id = %s ",
                 (tag_id, introduction, bconver, novel_id))
             time.sleep(random.uniform(0.5, 1.5))
         count = default_dbhelper.query_one(count_sql)
Esempio n. 6
0
 def novel_simple_save(self):
     """Crawl every category's listing pages and upsert basic novel rows
     (name, source URL, author, sourceid) into the ``novel`` table.
     """
     self.__get_category_list()
     # Iterate category by category.
     for key in self.category_list:
         # NOTE(review): ``count`` is set to 0 and never changed, so the
         # ``count == 0`` test below is always true — looks like a
         # leftover from an earlier pagination scheme.
         count = 0
         html = None
         # templet_format is a plain function attached to the class,
         # hence the explicit self argument.
         templet = self.templet_format(self, key)
         if count == 0 and SpiderTools.getRes(
         ).category_template is not None:
             # Fetch page 1 first to discover how many pages exist.
             index_url = templet.format(1)
             html = SpiderTools.get_html(
                 url=index_url,
                 header_host=SpiderTools.getRes().host,
                 encoding=SpiderTools.getRes().encoding,
                 network_err_fn=SpiderTools.save_to_file(
                     "pageList.bak", index_url))
             if html is None:
                 continue
             self.__get_category_page_count(html)
         # Iterate the listing pages of this category.
         for i in range(1, SpiderTools.getRes().category_page_count, 1):
             insertnovels = []
             if i != 1:
                 # Page 1 was already fetched above; fetch the rest here.
                 index_url = templet.format(i)
                 html = SpiderTools.get_html(
                     url=index_url,
                     header_host=SpiderTools.getRes().host,
                     encoding=SpiderTools.getRes().encoding,
                     network_err_fn=SpiderTools.save_to_file(
                         "pageList.bak", index_url))
                 if html is None:
                     continue
             novels = SpiderTools.get_pyquery_content(
                 html,
                 SpiderTools.getRes().select_novel_line)
             if len(novels) == 0:
                 # Empty listing page — assume we ran past the last page.
                 break
             # Assemble the novel info rows.
             for item in novels:
                 novelname = SpiderTools.get_pyquery_content(
                     item,
                     SpiderTools.getRes().select_novel_name).text()
                 if novelname is None or novelname == '':
                     continue
                 url = SpiderTools.getRes().host + \
                       SpiderTools.get_pyquery_content(item, SpiderTools.getRes().select_novel_name).attr("href")
                 author = \
                 SpiderTools.get_pyquery_content(item, SpiderTools.getRes().select_novel_author).text().split(" ")[0]
                 insertnovels.append(
                     str((novelname, url, author,
                          SpiderTools.getRes().source_id)))
             # Persist this page's novels to the database.
             if len(insertnovels) == 0:
                 break
             # NOTE(review): VALUES clause is string-concatenated from
             # str(tuple) fragments, not bound parameters.
             sql = "insert into novel (`name`,`source`,`author`,`sourceid`) values "
             sql = sql + ",".join(
                 insertnovels
             ) + " on DUPLICATE key update source = values(source)"
             default_dbhelper.update(sql)
             time.sleep(random.uniform(1, 3))
Esempio n. 7
0
def quanwenyuedu_get_page_count(target: NovelResource, html):
    """Site-specific page-count extractor for quanwenyuedu.

    Reads the pagination element selected by
    ``target.select_category_page_count`` and returns the second number
    found in its text as an int.
    """
    pagination = SpiderTools.get_pyquery_content(
        html, target.select_category_page_count)
    numbers = re.findall(r"\d+", pagination.text())
    return int(numbers[1])
Esempio n. 8
0
    select_category_page_count=".box > .list_page >span:eq(1)",
    select_novel_line='.box .top',
    select_novel_name='h3 a',
    select_novel_author='p span',
    list_url_template='xiaoshuo.html',
    select_novel_tag=".top p:eq(2) span",
    select_novel_introduction=".description p",
    select_novel_cover=".top img",
    select_chapter="ul.list li a",
    select_chapter_content="#content > p")

# Attach the site-specific overrides to the quanwenyuedu resource object.
quanwenyuedu.templet_format = quanwenyuedu_templet_format
quanwenyuedu.chapter_url = quanwenyuedu_chapter_url
quanwenyuedu.get_page_count = quanwenyuedu_get_page_count

# Register both site resources so SpiderTools.getRes() can dispatch by id.
SpiderTools.addRes(duyidu.source_id, duyidu)
SpiderTools.addRes(quanwenyuedu.source_id, quanwenyuedu)

if __name__ == '__main__':
    # html = SpiderTools.get_html("http://du1du.org/txt-33897/130802985.htm", encoding='gbk')
    # m = SpiderTools.get_pyquery_content(html, "#txtContent")
    # tag = m.remove("script")
    # print(m.text())
    if len(sys.argv) == 3:
        if int(sys.argv[1]) == 1:
            # 启动 du1du.org
            SpiderTools.sourceid = 1
            duyidu.start(int(sys.argv[2]))
        elif int(sys.argv[1]) == 2:
            # 启动 www.quanwenyuedu.io
            SpiderTools.sourceid = 2