Esempio n. 1
0
 def save_novel_one_chapter(self, index, name, content, chapter_url) -> bool:
     """Insert or update one chapter row for the current novel.

     The novel id comes from ``self.get_nid()``. While it is still negative
     it is looked up by book URL through ``self._mysql`` and cached on
     ``self._info``.

     :param index: chapter index, passed through to the chapter row
     :param name: chapter name
     :param content: chapter content
     :param chapter_url: URL used to locate the chapter row
     :return: ``False`` when the novel id cannot be resolved, the chapter is
         locked, or the database write reports failure; ``True`` otherwise
     """
     parser = self._parser_name
     novel_id = self.get_nid()
     if novel_id < 0:
         book_url = self.get_book_url()
         if self._mysql.novel_info_exist(book_url):
             # The novel row exists: fetch its id and remember it.
             found, looked_up_id = self._mysql.get_novel_id_by_url(book_url)
             if not found:
                 return False
             self._info.set_nid(looked_up_id)
         novel_id = self.get_nid()
     update_time = int(time.time())
     if novel_id < 0:
         log.error(self.get_name() + '|' + self.get_author() + '|' + 'nid 获取失败!')
         return False
     # Locked chapters must not be modified.
     if self._mysql.novel_chapter_is_locked_by_url(chapter_url):
         log.info(self.get_name() + '|' + self.get_author() + '| ' + name + ' 章节信息上锁!')
         return False
     # Update the row when it exists, insert it otherwise, and report the
     # outcome of the write instead of unconditionally claiming success.
     if self._mysql.novel_chapter_exist(chapter_url):
         saved = self._mysql.update_novel_chapter_by_url(
             novel_id, index, chapter_url, name, content, update_time)
     else:
         saved = self._mysql.insert_novel_chapter(
             novel_id, index, chapter_url, parser, name, content, update_time)
     return bool(saved)
Esempio n. 2
0
 def insert_novel_info(self, name: str, author: str, category: str,
                       describe: str, complete: int, parser: str,
                       book_url: str, img_url: str, img_content: str,
                       chapter_base_url: str, create_time: int,
                       update_time: int) -> (bool, int):
     """Insert one row into ``novel_info``.

     Values are handed to the driver as query parameters rather than being
     formatted into the SQL text, so quoting is done by the driver for every
     column (including the numeric ones).

     :return: ``(flag, novel_id)``; ``flag`` is ``True`` and ``novel_id`` is
         the cursor's ``lastrowid`` on success, otherwise ``(False, -1)`` or
         ``(False, <negative id>)``
     """
     flag = False
     novel_id = -1
     msql = ('INSERT INTO `novel_info` (`name`, `author`, `category`, `describe`, `complete`, `parser`, `book_url`,'
             ' `img_url`, `img_content`, `chapter_base_url`, `create_time`, `update_time`)'
             ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);')
     # img_content is stored through str(), exactly as before.
     params = (name, author, category, describe, complete, parser, book_url,
               img_url, str(img_content), chapter_base_url, create_time,
               update_time)
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, params)
         self._connect.commit()
         novel_id = int(cursor.lastrowid)
         if novel_id >= 0:
             flag = True
             log.info(name + '|' + author + '信息保存成功!')
         else:
             log.error(name + '|' + author + '信息保存失败!')
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return flag, novel_id
Esempio n. 3
0
 def insert_passage_info(self, title: str, author: str, pageviews: int, tim: int,
                         textTop: str, img: str, textBottom: str, type: int, url: str):
     """Insert one row into ``article``.

     ``tim`` is a timestamp; it is rendered as a local
     ``YYYY-mm-dd HH:MM:SS`` string before being stored. All values are
     passed to the driver as query parameters instead of being formatted
     into the SQL text.

     :return: ``(flag, row_id)``; ``(False, -1)`` when ``title`` or
         ``author`` fails ``Util.valid`` or the statement raises
     """
     flag = False
     row_id = -1
     # Title and author are mandatory.
     if (not Util.valid(title)) or (not Util.valid(author)):
         return flag, -1
     msql = ('INSERT INTO `article` ('
             '`title`, `author`, `pageviews`, `time`, `textTop`, `img`, `textbottom`, `type`, `url`)'
             ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);')
     params = (title, author, pageviews,
               time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(tim)),
               textTop, img, textBottom, type, url)
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, params)
         self._connect.commit()
         row_id = int(cursor.lastrowid)
         if row_id >= 0:
             flag = True
             log.info('id=%d, name=%s, author=%s,信息保存成功' %
                      (row_id, title, author))
         else:
             log.error(title + '|' + author + '信息保存失败!')
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return flag, row_id
Esempio n. 4
0
 def blog_exist(self, url) -> bool:
     """Report whether ``index.txt`` is present under ``self._dir``.

     ``url`` is accepted for interface compatibility only: the check looks
     solely at ``self._dir``, so the md5 digest that used to be computed
     from ``url`` (and then discarded) has been dropped.

     :return: ``True`` when ``<self._dir>/index.txt`` exists
     """
     if os.path.exists(self._dir + '/index.txt'):
         log.info('blog %s 已存在!' % self._dir)
         return True
     return False
Esempio n. 5
0
 def write_blog(self, url: str, title: str, tim: int, category: str,
                tag: str, spider: str, content: str, imgUrl: str):
     """Write a blog's ``index.txt`` and ``<title>.html`` under ``self._dir``.

     Both files are written as UTF-8 explicitly, so non-ASCII titles and
     content no longer depend on the process locale.

     :return: ``(False, 1)`` when ``title``, ``spider`` or ``url`` fails
         ``Util.valid``; ``(True, 1)`` after both files are written
     """
     # Title, spider name and URL are mandatory.
     if not (Util.valid(title) and Util.valid(spider) and Util.valid(url)):
         log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
         return False, 1
     indexfile = self._dir + '/index.txt'
     log.info('blog index %s 已存在!' % indexfile)
     with open(indexfile, 'w', encoding='utf-8') as fw:
         fw.write('name:%s\n' % title)
         fw.write('url:%s\n' % url)
         fw.write('category:%s\n' % category)
         fw.write('tag:%s\n' % tag)
         fw.write('time:%s\n' % tim)
         fw.write('spider:%s\n' % spider)
         fw.write('img:%s\n' % imgUrl)
     # NOTE(review): the title is used verbatim as a file name; a title
     # containing a path separator would escape self._dir -- confirm upstream.
     passagefile = self._dir + '/%s.html' % title
     log.info('blog passage %s 已存在!' % passagefile)
     with open(passagefile, 'w', encoding='utf-8') as fw:
         fw.write('<!Doctype html> \
                     <html> \
                         <head> \
                             <title>%s</title> \
                         </head> \
                         <body>%s</body> \
                     </html>' % (title, content))
     return True, 1
Esempio n. 6
0
 def blog_delete(self, url) -> bool:
     """Remove the directory ``<self._rootdir>/<md5 of url>`` and its contents.

     ``os.rmdir`` only removes empty directories, so the previous version
     failed on any directory that held files and still reported success.
     The tree is now removed recursively and a failure is reported.

     :return: ``True`` when the directory was removed or was already absent,
         ``False`` when removal failed
     """
     import shutil  # local import: only this method needs it

     name = hashlib.md5(url.encode('utf-8')).hexdigest()
     bpath = self._rootdir + '/' + name
     try:
         shutil.rmtree(bpath)
     except FileNotFoundError as e:
         # Nothing left to delete; still treated as success, as before.
         log.warn(str(e))
         return True
     except Exception as e:
         log.warn(str(e))
         return False
     log.info('blog %s 删除成功!' % bpath)
     return True
Esempio n. 7
0
 def run(self):
     """Crawl the piaoliang list pages and save passages not stored yet.

     Each entry of ``self.get_passage_list()`` is split on ``'|'``: the
     first part is fetched as the list page URL and the second is converted
     to ``int`` and stored as the passage type. Entries whose title or URL
     cannot be parsed, that already exist, or whose page cannot be
     downloaded are skipped.
     """
     parser = get_parser().get_parser(COM_PIAOLIANG_NAME)
     for url in self.get_passage_list():
         arr = url.split('|')
         url = arr[0]
         passage_type = int(arr[1])
         text = Spider.http_get(url)
         if '' == text:
             continue
         doc = parser.parse(
             text, rule='body>.main-wrap>.m-wrap>.main-part>.listitem>ul')
         for ct in doc.children().items():
             passage = XingZuo(COM_PIAOLIANG_NAME)
             flag, name = parser.parse(
                 ct.html(), parse_type=parser.PARSER_PASSAGE_TITLE)
             if flag:
                 passage.set_title(name)
             else:
                 continue
             flag, readnum = parser.parse(
                 ct.html(), parse_type=parser.PARSER_PASSAGE_READ)
             if flag:
                 passage.set_pageviews(int(readnum))
             flag, passage_url = parser.parse(
                 ct.html(), parse_type=parser.PARSER_PASSAGE_URL)
             if not flag:
                 continue
             passage.set_url(passage_url)
             # Skip passages that are already stored.
             if passage.exist(passage_url):
                 log.info(name + '已存在!')
                 continue
             # Download the passage page itself.
             text1 = Spider.http_get(passage_url)
             if '' == text1:
                 continue
             # Date and author; the parse flags are not checked here.
             flag, tm = parser.parse(text1,
                                     parse_type=parser.PARSER_PASSAGE_DATE)
             passage.set_time(tm)
             flag, author = parser.parse(
                 text1, parse_type=parser.PARSER_PASSAGE_AUTHOR)
             passage.set_author(author)
             # Body text (above/below the image) and the image link.
             flag, top, img, bottom = parser.parse(
                 text1, parse_type=parser.PARSER_PASSAGE_CONTENT)
             if flag:
                 passage.set_img(img)
                 passage.set_textTop(top)
                 passage.set_textBottom(bottom)
                 passage.set_type(passage_type)
                 # Persist the passage.
                 passage.save_passage_info()
             else:
                 # The title lives in ``name``; the former reference to an
                 # undefined ``title`` raised NameError on this path.
                 log.warn(name + '抓取失败!')
     log.info(self._name + '执行完成!')
Esempio n. 8
0
 def update_novel_chapter_by_url(self, novel_id: int, index: int,
                                 chapter_url: str, name: str, content: str,
                                 update_time: int) -> bool:
     """Update the ``novel_chapter`` row whose ``chapter_url`` matches.

     Values are passed to the driver as query parameters instead of being
     formatted into the SQL text.

     :return: ``True`` when the statement and commit complete without
         raising, ``False`` otherwise
     """
     msql = ('UPDATE `novel_chapter` SET `nid` = %s, `index` = %s, `name` = %s,'
             ' `content` = %s, `update_time` = %s WHERE `chapter_url` = %s;')
     params = (novel_id, index, name, content, update_time, chapter_url)
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, params)
         self._connect.commit()
         log.info(
             str(index) + '|' + name + '|' + chapter_url + ' 章节信息更新成功!')
     except Exception as e:
         log.error('章节信息更新失败: ' + str(e))
         return False
     return True
Esempio n. 9
0
 def insert_novel_chapter(self, novel_id: int, index: int, chapter_url: str,
                          parser: str, name: str, content: str,
                          update_time: int):
     """Insert one row into ``novel_chapter``.

     Values are passed to the driver as query parameters instead of being
     formatted into the SQL text; ``content`` is stored through ``str()``
     as before.

     :return: ``True`` when the statement and commit complete without
         raising, ``False`` otherwise
     """
     msql = ('INSERT INTO `novel_chapter` (`nid`, `index`, `chapter_url`, `parser`,'
             ' `name`, `content`, `update_time`) VALUES'
             ' (%s, %s, %s, %s, %s, %s, %s);')
     params = (novel_id, index, chapter_url, parser, name, str(content),
               update_time)
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, params)
         self._connect.commit()
         log.info(
             str(index) + '|' + name + '|' + chapter_url + ' 章节信息插入成功!')
     except Exception as e:
         log.error('插入章节' + name + '错误:' + str(e))
         return False
     return True
Esempio n. 10
0
 def save_passage_info(self) -> bool:
     """Store this passage unless a row with the same URL already exists.

     :return: ``False`` only when the insert reports failure; ``True`` when
         the passage was inserted or was already present
     """
     if not self._mysql.passage_info_exist(self.get_url()):
         # Not stored yet: insert it.
         inserted, row_id = self._mysql.insert_passage_info(
             self.get_title(), self.get_author(), self.get_pageviews(),
             self.get_time(), self.get_textTop(), self.get_img(),
             self.get_textBottom(), self.get_type(), self.get_url())
         if not inserted:
             return False
         message = str(row_id) + '|' + self.get_title() + '|' + self.get_author()
         log.info(message + ' 书籍信息插入成功!')
         return True
     # Already stored: only report it.
     log.info(self.get_title() + '|' + self.get_author() + ' 文章已存在!')
     return True
Esempio n. 11
0
 def update_novel_info_by_url(self, book_url: str, name: str, author: str,
                              category: str, describe: str, complete: int,
                              img_url: str, img_content: str,
                              chapter_base_url: str, update_time: int):
     """Update the ``novel_info`` row whose ``book_url`` matches.

     Values are passed to the driver as query parameters instead of being
     formatted into the SQL text; ``img_content`` is stored through
     ``str()`` as before. Errors are logged, not raised.

     :return: always ``None``
     """
     msql = ('UPDATE `novel_info` SET `name` = %s, `author` = %s, `category` = %s,'
             ' `describe` = %s, `complete` = %s, `img_url` = %s, `img_content` = %s,'
             ' `chapter_base_url` = %s, `update_time` = %s WHERE `book_url` = %s;')
     params = (name, author, category, describe, complete, img_url,
               str(img_content), chapter_base_url, update_time, book_url)
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, params)
         self._connect.commit()
         log.info(name + '|' + author + '信息更新成功!')
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return None
Esempio n. 12
0
 def update_novel_info_img_url_by_url(self, book_url: str, img_url: str):
     """Set ``img_url`` (and ``update_time``) on the novel with ``book_url``.

     The "novel not found" branch used to be attached to the ``try``
     statement, so a successful update logged "not found" and returned
     ``False`` while a missing novel returned ``True``. The branch now
     belongs to the existence check. Values are passed as query parameters.

     :return: ``True`` when the row is locked (nothing is changed) or the
         update succeeded; ``False`` when the novel does not exist or the
         statement raised
     """
     if self.novel_info_is_locked_by_url(book_url):
         log.info('书籍信息被锁!不可修改!')
         return True
     if not self.novel_info_exist(book_url):
         log.error('要更新的小说信息不存在!')
         return False
     msql = ('UPDATE `novel_info` SET `img_url` = %s, `update_time` = %s'
             ' WHERE `book_url` = %s;')
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, (str(img_url), int(time.time()), book_url))
         self._connect.commit()
     except Exception as e:
         log.error('书籍封面页URL更新失败:: ' + str(e))
         return False
     return True
Esempio n. 13
0
 def save_novel_info(self) -> bool:
     """Insert this novel's info row, or update it when it already exists.

     :return: ``False`` when the row is locked, the id lookup fails, or the
         insert reports failure; ``True`` otherwise
     """
     # Locked rows are left alone.
     if self._mysql.novel_info_is_locked_by_url(self.get_book_url()):
         log.info(self.get_name() + '|' + self.get_author() + '信息上锁!')
         return False

     if not self._mysql.novel_info_exist(self.get_book_url()):
         # No row yet: insert one and remember the new id.
         ok, nid = self._mysql.insert_novel_info(
             self.get_name(), self.get_author(), self.get_category(),
             self.get_describe(), self.get_complete(), self._parser_name,
             self.get_book_url(), self.get_img_url(), self.get_img_content(),
             self.get_chapter_base_url(), self.get_create_time(),
             self.get_update_time())
         if not ok:
             return False
         self._info.set_nid(nid)
         prefix = str(nid) + '|' + self.get_name() + '|' + self.get_author()
         log.info(prefix + ' 书籍信息插入成功!')
         return True

     # Row exists: resolve its id, remember it, then update the row.
     ok, nid = self._mysql.get_novel_id_by_url(self.get_book_url())
     if not ok:
         return False
     self._info.set_nid(nid)
     self._mysql.update_novel_info_by_url(
         self.get_book_url(), self.get_name(), self.get_author(),
         self.get_category(), self.get_describe(), self.get_complete(),
         self.get_img_url(), self.get_img_content(),
         self.get_chapter_base_url(), self.get_update_time())
     prefix = str(nid) + '|' + self.get_name() + '|' + self.get_author()
     log.info(prefix + ' 书籍信息更新成功!')
     return True
Esempio n. 14
0
 def save(self):
     """Persist the blog and its images through ``self.__save``.

     The blog is written first; each image that is not stored yet is then
     written with the blog id returned by ``write_blog``. When an image
     cannot be written the blog is deleted again.

     :return: ``True`` when the blog already existed or everything was
         written; ``False`` when writing the blog or any image failed
     """
     imgs = [i.get_url() for i in self.__image]
     # Nothing to do when the blog is already stored.
     if self.__save.blog_exist(self.get_url()):
         log.info('文章: %s 已存在!', self.get_title())
         return True
     flag, bid = self.__save.write_blog(
         self.get_url(), self.get_title(), self.get_date(),
         self.get_category(), self.get_tag(), self.get_spider_name(),
         self.get_content(), '|'.join(imgs))
     if not flag:
         return False
     for img in self.__image:
         # Skip images that are already stored.
         if self.__save.image_exist(img.get_url()):
             # Report the image's own URL (the blog URL was logged before).
             log.info('图片: %s 已存在!', img.get_url())
             continue
         flag, iid = self.__save.write_image(
             img.get_name(), img.get_ext_name(), img.get_content(),
             img.get_url(), bid)
         if not flag:
             log.error('图片: %s 保存失败!', img.get_url())
             # Roll back the blog written above.
             self.__save.blog_delete(self.get_url())
             return False
     return True
Esempio n. 15
0
 def run(self):
     """Crawl the xzw list pages and save passages not stored yet.

     Each entry of ``self.get_passage_list()`` is split on ``'|'``: the
     first part is fetched as the list page URL and the second is converted
     to ``int`` and stored as the passage type.
     """
     parser = get_parser().get_parser(COM_XZW_NAME)
     for entry in self.get_passage_list():
         parts = entry.split('|')
         list_url = parts[0]
         passage_type = int(parts[1])
         list_html = Spider.http_get(list_url)
         if list_html == '':
             continue
         doc = parser.parse(list_html,
                            rule='body>.wrapper>#list>.main>.l-item>ul')
         for item in doc.children().items():
             passage = XingZuo(COM_XZW_NAME)
             ok, name = parser.parse(
                 item.html(), parse_type=parser.PARSER_PASSAGE_TITLE)
             if ok:
                 passage.set_title(name)
             ok, passage_url = parser.parse(
                 item.html(), parse_type=parser.PARSER_PASSAGE_URL)
             if not ok:
                 continue
             passage.set_url(passage_url)
             # Skip passages that are already stored.
             if passage.exist(passage_url):
                 log.info(name + '已存在!')
                 continue
             # Download the passage page itself.
             page_html = Spider.http_get(passage_url)
             if page_html == '':
                 continue
             ok, author = parser.parse(
                 page_html, parse_type=parser.PARSER_PASSAGE_AUTHOR)
             passage.set_author(author)
             # Body text (above/below the image) and the image link.
             ok, top, img, bottom = parser.parse(
                 page_html, parse_type=parser.PARSER_PASSAGE_CONTENT)
             if not ok:
                 log.warn(name + '抓取失败!')
                 continue
             passage.set_img(img)
             passage.set_textTop(top)
             passage.set_textBottom(bottom)
             passage.set_type(passage_type)
             # Incomplete passages are not saved.
             if bottom == '' or top == '' or name == '' or passage_url == '':
                 continue
             passage.save_passage_info()
             log.info(name + '|' + author + ' 抓取成功!')
     log.info(self._name + '执行完成!')
Esempio n. 16
0
 def __init__(self):
     """Record the uu234 spider's name and site URL and log them."""
     self._name = CC_UU234_NAME
     self._webURL = CC_UU234_WEB_URL
     banner = 'name:' + self._name + ' url:' + self._webURL
     log.info(banner + ' spider安装成功!')
Esempio n. 17
0
 def run(self):
     """Crawl the uu234 book lists and store new novels with their chapters.

     For every book on each list page the basic info is parsed from the
     list entry; books already known (``novel.has_book``) are skipped. The
     book page then supplies cover URL, description and chapter-index URL.
     After the novel info is saved, each chapter that is not stored yet is
     first saved with empty content and then saved again with the
     downloaded content.
     """
     parser = get_parser().get_parser(CC_UU234_NAME)
     for list_url in self.get_book_list():
         list_html = Spider.http_get(list_url)
         if '' == list_html:
             continue
         doc = parser.parse(list_html, rule='body>div>.listconl>.clearfix')
         for ct in doc.children().items():
             novel = Novel(CC_UU234_NAME)
             # Parse flags are not checked for the fields below.
             flag, name = parser.parse(ct.html(),
                                       parse_type=parser.PARSER_BOOK_NAME)
             novel.set_name(name)
             flag, author = parser.parse(
                 ct.html(), parse_type=parser.PARSER_BOOK_AUTHOR)
             novel.set_author(author)
             flag, book_url = parser.parse(ct.html(),
                                           parse_type=parser.PARSER_BOOK_URL)
             novel.set_book_url(book_url)
             flag, category = parser.parse(
                 ct.html(), parse_type=parser.PARSER_BOOK_CATEGORY)
             novel.set_category(category)
             flag, status = parser.parse(
                 ct.html(), parse_type=parser.PARSER_BOOK_STATUS)
             novel.set_complete(status)
             if novel.has_book(book_url):
                 log.info(name + '|' + author + '已存在!')
                 continue
             book_html = Spider.http_get(novel.get_book_url())
             if '' == book_html:
                 continue
             flag, img_url = parser.parse(
                 book_html, parse_type=parser.PARSER_BOOK_IMG_URL)
             novel.set_img_url(img_url)
             flag, desc = parser.parse(book_html,
                                       parse_type=parser.PARSER_BOOK_DESC)
             novel.set_describe(desc)
             flag, chapter_base_url = parser.parse(
                 book_html, parse_type=parser.PARSER_BOOK_CHAPTER_BASE_URL)
             novel.set_chapter_base_url(chapter_base_url)
             img_content = Spider.http_get(novel.get_img_url(),
                                           resource_method=2)
             novel.set_img_content(img_content)
             index_html = Spider.http_get(novel.get_chapter_base_url())
             if '' == index_html:
                 continue
             if not novel.save_novel_info():
                 continue  # locked or failed: skip this novel
             for index, chapter_name, chapter_url in parser.parse(
                     index_html, parse_type=parser.PARSER_BOOK_CHAPTER_URL):
                 # Skip chapters that are already stored.
                 if novel.has_chapter(chapter_url):
                     continue
                 # Store the chapter with empty content before downloading.
                 content = ''
                 novel.save_novel_one_chapter(index, chapter_name, content,
                                              chapter_url)
                 log.info('正在获取 ' + novel.get_name() + '|' +
                          novel.get_author() + '|' + chapter_name + '|' +
                          chapter_url)
                 c = Spider.http_get(chapter_url)
                 # Check the chapter download itself; the old test looked at
                 # the chapter-index page, which is never empty here.
                 if '' == c:
                     log.error(novel.get_name() + '|' + novel.get_author() +
                               '|' + chapter_name + '下载失败!')
                     continue
                 flag, content = parser.parse(
                     c, parse_type=parser.PARSER_BOOK_CHAPTER_CONTENT)
                 if flag:
                     novel.save_novel_one_chapter(index, chapter_name, content,
                                                  chapter_url)
     log.info(self._name + '执行完成!')
Esempio n. 18
0
 def __init__(self):
     """Record the piaoliang spider's name and site URL and log them."""
     self._name = COM_PIAOLIANG_NAME
     self._webURL = COM_PIAOLIANG_WEB_URL
     banner = 'name:' + self._name + ' url:' + self._webURL
     log.info(banner + ' spider安装成功!')
Esempio n. 19
0
#!/usr/bin/env python3.7
# -*- encoding=utf8 -*-

from frame.log.log import log
from frame.common.param import *
from frame.thread import ThreadPool
from frame.spider_factory import SpiderFactory

from url.com_linuxidc import blog_linuxidc

if __name__ == '__main__':
    # Entry point: register the linuxidc spider with the thread pool and
    # run it to completion.
    log.info('抓取任务开始执行...')
    spiderFactory = SpiderFactory()
    tpool = ThreadPool()
    # --- https://www.linuxidc.com: begin ---
    linuxidc = spiderFactory.get_spider(COM_LINUXIDC_NAME)
    linuxidc.set_seed_urls(blog_linuxidc)
    tpool.set_spider(linuxidc)
    # --- https://www.linuxidc.com: end ---

    tpool.run()
    log.info('抓取任务完成!')
    # ``exit`` is injected by the ``site`` module for interactive use and is
    # not guaranteed to exist; raise SystemExit directly instead.
    raise SystemExit(0)
Esempio n. 20
0
 def working(self, thread_id: int):
     """Worker loop: take spiders from ``self._spider`` until none are left.

     Each spider taken is checked (``spider.check()``) and then run
     (``spider.run()``). The mutex only guards the pop and is released in
     a ``finally`` block so an exception cannot leave it held.

     :param thread_id: identifier used in the log messages
     """
     log.info('spider id: ' + str(thread_id) + ' 启动成功!')
     while True:
         if not self._mutex.acquire():
             continue
         spider = None
         try:
             if len(self._spider) > 0:
                 spider = self._spider.pop()
                 log.info('成功获取spider: ' + spider.get_name())
         finally:
             self._mutex.release()
         if spider is None:
             log.info('任务执行完毕! spider id: ' + str(thread_id) + ' 开始退出...')
             break
         log.info('spider: ' + spider.get_name() + '开始检查!')
         spider.check()
         log.info('spider: ' + spider.get_name() + '开始执行!')
         spider.run()
         log.info('spider: ' + spider.get_name() + '执行完成!')
Esempio n. 21
0
 def __init__(self):
     """Record the linuxidc spider's name, site URL and save type, and log them."""
     self._name = COM_LINUXIDC_NAME
     self._webURL = COM_LINUXIDC_WEB_URL
     self._save = 'file'
     banner = 'name:' + self._name + ' url:' + self._webURL
     log.info(banner + ' save type:' + self._save + ' spider安裝成功!')
Esempio n. 22
0
    def run(self):
        """Crawl the linuxidc list pages and save every blog not stored yet.

        For each list entry the URL and title are parsed from the entry
        itself; content, date, category, tag and image URLs are parsed from
        the downloaded blog page. Any step that fails is logged and the
        entry is skipped. Images are downloaded and attached before saving.
        """
        parser = get_parser().get_parser(self._name)
        for list_url in self.get_passage_list():
            list_html = Spider.http_get(list_url)
            if list_html == '':
                log.error('url:' + list_url + '抓取错误!')
                continue
            doc = parser.parse(list_html,
                               rule='body>div>#middle .mframe>.wrapper>.mm')
            for entry in doc.children().items():
                # Blog URL
                ok, post_url = parser.parse(
                    entry, parse_type=parser.PARSER_PASSAGE_URL)
                if not ok:
                    log.error('url:' + list_url + '解析 url 错误!')
                    continue
                blog = Blog(self._name, post_url, self._save)
                post_url = Util.check_url(post_url, self._webURL)
                blog.set_url(post_url)
                # Skip blogs that are already stored.
                if blog.exist(post_url):
                    log.info('文章url: %s 已存在!', post_url)
                    continue

                # Title
                ok, post_title = parser.parse(
                    entry, parse_type=parser.PARSER_PASSAGE_TITLE)
                if not ok:
                    log.error('url:' + list_url + '解析 title 错误!')
                    continue
                blog.set_title(post_title)

                # Content
                page_html = Spider.http_get(post_url)
                if page_html == '':
                    log.error('url:' + post_url + '获取内容错误!')
                    continue
                ok, post_content = parser.parse(
                    page_html, parse_type=parser.PARSER_PASSAGE_CONTENT)
                if not ok:
                    log.error('url:' + post_url + '解析内容错误!')
                    continue
                blog.set_content(post_content)

                # Date
                ok, post_date = parser.parse(
                    page_html, parse_type=parser.PARSER_PASSAGE_DATE)
                if not ok:
                    log.error('url:' + post_url + '解析日期错误!')
                    continue
                blog.set_date(Util.time_str_stamp(post_date, '%Y-%m-%d'))

                # Category
                ok, post_category = parser.parse(
                    page_html, parse_type=parser.PARSER_PASSAGE_CATEGORY)
                if not ok:
                    log.error('url:' + post_url + '解析分类错误!')
                    continue
                blog.set_category(post_category)

                # Tag
                ok, post_tag = parser.parse(
                    page_html, parse_type=parser.PARSER_PASSAGE_TAG)
                if not ok:
                    log.error('url:' + post_url + '解析标签错误!')
                    continue
                blog.set_tag(post_tag)

                # Image URLs; every image is downloaded and attached.
                ok, image_links = parser.parse(
                    page_html, parse_type=parser.PARSER_PASSAGE_IMGURL)
                if not ok:
                    log.error('url:' + post_url + '解析图片错误!')
                    continue
                for link in image_links:
                    picture = Image()
                    picture_url = Util.check_url(link, self._webURL)
                    picture_data = Spider.http_get(picture_url, 0)
                    picture.set_url(picture_url)
                    base_name, ext_name = Util.image_name(picture_url)
                    picture.set_name(base_name)
                    picture.set_ext_name(ext_name)
                    picture.set_content(picture_data)
                    blog.append_image(picture)

                # Save
                if not blog.save():
                    log.error('文章: %s 保存失败!', blog.get_title())
                else:
                    log.info('文章: %s 保存成功!', blog.get_title())
 def __init__(self):
     """Initialise the base class, then set the piaoliang URL and parser name.

     Both values are stored on the instance; they used to be assigned to
     throw-away locals, so the log line below read whatever the base class
     had left in ``self._webURL`` / ``self._parserName``.
     """
     super().__init__()
     self._webURL = COM_PIAOLIANG_WEB_URL
     self._parserName = COM_PIAOLIANG_NAME
     log.info('name:' + self._parserName + ' url:' + self._webURL +
              ' 解析器安装成功!')
Esempio n. 24
0
 def __init__(self):
     """Initialise the base class, record the d1xz name and URL, and log them."""
     super().__init__()
     self._name = NET_D1XZ_NAME
     self._webURL = NET_D1XZ_WEB_URL
     banner = 'name:' + self._name + ' url:' + self._webURL
     log.info(banner + ' spider安装成功!')
Esempio n. 25
0
 def __init__(self):
     """Initialise the base class, then set the uu234 URL and parser name.

     Both values are stored on the instance; they used to be assigned to
     throw-away locals, so the log line below read whatever the base class
     had left in ``self._webURL`` / ``self._parserName``.
     """
     super().__init__()
     self._webURL = CC_UU234_WEB_URL
     self._parserName = CC_UU234_NAME
     log.info('name:' + self._parserName + ' url:' + self._webURL + ' 解析器安装成功!')
Esempio n. 26
0
 def check(self):
     """Re-check every unlocked uu234 book and fetch chapters not stored yet.

     For each book returned by ``novel.get_unlock_book_by_parser`` the book
     page is downloaded again; a changed cover URL or chapter-index URL is
     written back, and every chapter for which ``novel.none_chapter`` is
     true is downloaded and saved. A summary is logged at the end.

     :return: always ``True``
     """
     check_index = 0
     check_all = 0
     check_error = 0
     check_update = 0
     check_update_chapter = 0
     check_update_exit_chapter = 0
     parser = get_parser().get_parser(CC_UU234_NAME)
     novel = Novel(CC_UU234_NAME)
     for book_url, img_url, chapter_base_url in novel.get_unlock_book_by_parser(
             CC_UU234_NAME):
         check_index += 1
         log.info('开始检查' + str(check_index) + ':' + book_url)
         check_all += 1
         text = Spider.http_get(book_url)
         if '' == text or None is text:
             check_error += 1
             log.error('获取书籍信息出错:' + book_url)
             continue
         flag, img_url_new = parser.parse(
             text, parse_type=parser.PARSER_BOOK_IMG_URL)
         if (img_url != img_url_new) and flag:
             check_update += 1
             # Store the newly parsed URL; the stored one was written back
             # before, which made this update a no-op.
             novel.update_novel_info_img_url(book_url, img_url_new)
             img_content = Spider.http_get(img_url_new)
             if '' != img_content and None is not img_content:
                 check_update += 1
                 novel.update_novel_info_img_content(book_url, img_content)
         flag, chapter_url_new = parser.parse(
             text, parse_type=parser.PARSER_BOOK_CHAPTER_BASE_URL)
         if (chapter_base_url != chapter_url_new) and flag:
             check_update += 1
             novel.update_novel_info_chapter_base(book_url, chapter_url_new)
         text = Spider.http_get(chapter_url_new)
         if '' == text or None is text:
             check_error += 1
             log.error('获取书籍章节信息出错:' + chapter_url_new)
             # Without a chapter index there is nothing to parse.
             continue
         for index, name, chapter_url in parser.parse(
                 text, parse_type=parser.PARSER_BOOK_CHAPTER_URL):
             # Counts every chapter listed in the index.
             check_update_chapter += 1
             if not novel.none_chapter(chapter_url):
                 check_update_exit_chapter += 1
                 continue
             c = Spider.http_get(chapter_url)
             # Check the chapter download itself, not the index page.
             if '' == c:
                 check_error += 1
                 log.error(name + '|' + chapter_url + '|' + name + ' 下载失败!')
                 continue
             flag, content = parser.parse(
                 c, parse_type=parser.PARSER_BOOK_CHAPTER_CONTENT)
             if flag:
                 novel.save_check_novel_one_chapter(index, name, content,
                                                    chapter_url, book_url)
         log.info('检查结束' + str(check_index) + ':' + book_url)
     log.info('检查结果:             \n\t\t总共:' + str(check_all) +
              '\n\t\t失败:' + str(check_error) +
              '\n\t\t成功更新:' + str(check_update) +
              '\n\t\t已有章节:' + str(check_update_exit_chapter) +
              '\n\t\t成功更新章节:' + str(check_update_chapter))
     time.sleep(5)
     return True