def save_novel_one_chapter(self, index, name, content, chapter_url) -> bool:
    """Insert or update a single chapter of the current novel.

    The novel id comes from ``self.get_nid()``. When it is negative and the
    book already exists in storage, the id is looked up by book URL and
    cached through ``self._info.set_nid``.

    Args:
        index: Position of the chapter inside the book.
        name: Chapter title.
        content: Chapter text.
        chapter_url: URL of the chapter; used as the lookup key.

    Returns:
        False when the novel id cannot be resolved, when the chapter row is
        locked, or when the storage layer reports that the write failed;
        True otherwise.
    """
    novel_id = self.get_nid()
    if novel_id < 0:
        book_url = self.get_book_url()
        if self._mysql.novel_info_exist(book_url):
            # The book is already stored: fetch its id and remember it.
            found, novel_id = self._mysql.get_novel_id_by_url(book_url)
            if not found:
                return False
            self._info.set_nid(novel_id)
        novel_id = self.get_nid()

    if novel_id < 0:
        log.error(self.get_name() + '|' + self.get_author() + '|' + 'nid 获取失败!')
        return False

    # Locked chapters must never be overwritten.
    if self._mysql.novel_chapter_is_locked_by_url(chapter_url):
        log.info(self.get_name() + '|' + self.get_author() + '| ' + name + ' 章节信息上锁!')
        return False

    update_time = int(time.time())
    if self._mysql.novel_chapter_exist(chapter_url):
        saved = self._mysql.update_novel_chapter_by_url(
            novel_id, index, chapter_url, name, content, update_time)
    else:
        saved = self._mysql.insert_novel_chapter(
            novel_id, index, chapter_url, self._parser_name, name, content,
            update_time)
    # Only an explicit False from the storage layer counts as a failure, so a
    # backend that returns None on success keeps working.
    return saved is not False
def get_novel_info_by_url(self, url):
    """Load one ``novel_info`` row by its book URL.

    Args:
        url: Book URL used as the lookup key.

    Returns:
        A 17-tuple ``(flag, nid, name, author, category, describe, complete,
        parser, book_url, img_url, img_content, chapter_base_url,
        create_time, update_time, hot, cp, lock)``. ``flag`` is True only
        when a row was found and every column converted; otherwise the
        remaining fields carry the defaults (0 / empty string).
    """
    not_found = (False, 0, '', '', '', '', 0, '', '', '', '', '', 0, 0, 0, 0, 0)
    msql = ('SELECT `nid`, `name`, `author`, `category`, `describe`, `complete`, `parser`, `book_url`,'
            ' `img_url`, `img_content`, `chapter_base_url`, `create_time`, `update_time`,'
            ' `hot`, `cp`, `lock` FROM `novel_info` WHERE `book_url` = %s;')
    try:
        cursor = self._connect.cursor()
        # The driver quotes the value itself.
        cursor.execute(msql, (url,))
        self._connect.commit()
        row = cursor.fetchone()
        if row is None:
            return not_found
        # A NULL image would make bytes() raise and lose the whole row.
        img_content = b'' if row[9] is None else bytes(row[9])
        # The success flag is part of the tuple built *after* conversion, so
        # a failed conversion can no longer report success with default data.
        return (True, int(row[0]), str(row[1]), str(row[2]), str(row[3]),
                str(row[4]), int(row[5]), str(row[6]), str(row[7]),
                str(row[8]), img_content, str(row[10]), int(row[11]),
                int(row[12]), int(row[13]), int(row[14]), int(row[15]))
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return not_found
def write_blog(self, url: str, title: str, tim: int, category: str, tag: str, spider: str, content: str, imgUrl: str):
    """Write a blog post into ``self._dir`` as ``index.txt`` plus an HTML page.

    ``index.txt`` holds the post metadata, one ``key:value`` per line. The
    page is written to ``<title>.html``, with path separators in the title
    replaced by ``_`` so the file always stays inside ``self._dir``.

    Args:
        url: Source URL of the post (required).
        title: Post title (required).
        tim: Post time, written verbatim to the index.
        category: Post category.
        tag: Post tag string.
        spider: Name of the spider that fetched the post (required).
        content: HTML body of the post, embedded unescaped.
        imgUrl: Image URL string written to the index.

    Returns:
        ``(False, 1)`` when a required field is invalid, else ``(True, 1)``.
    """
    import html

    # Required fields must be present.
    if not (Util.valid(title) and Util.valid(spider) and Util.valid(url)):
        log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
        return False, 1

    indexfile = self._dir + '/index.txt'
    log.info('blog index %s 已存在!' % indexfile)
    # Explicit UTF-8: the platform default cannot encode CJK text everywhere.
    with open(indexfile, 'w', encoding='utf-8') as fw:
        fw.write('name:%s\n' % title)
        fw.write('url:%s\n' % url)
        fw.write('category:%s\n' % category)
        fw.write('tag:%s\n' % tag)
        fw.write('time:%s\n' % tim)
        fw.write('spider:%s\n' % spider)
        fw.write('img:%s\n' % imgUrl)

    # A title such as "a/b" would otherwise point into another directory.
    safe_title = title.replace('/', '_').replace('\\', '_')
    passagefile = self._dir + '/%s.html' % safe_title
    log.info('blog passage %s 已存在!' % passagefile)
    page = ('<!Doctype html>\n'
            '<html>\n'
            '<head>\n'
            '<title>%s</title>\n'
            '</head>\n'
            '<body>%s</body>\n'
            '</html>') % (html.escape(title), content)
    with open(passagefile, 'w', encoding='utf-8') as fw:
        fw.write(page)
    return True, 1
def insert_novel_info(self, name: str, author: str, category: str, describe: str, complete: int, parser: str, book_url: str, img_url: str, img_content: str, chapter_base_url: str, create_time: int, update_time: int) -> (bool, int):
    """Insert a new row into ``novel_info``.

    Values are passed as query parameters, so the driver quotes them.
    ``img_content`` is handed over unchanged; the previous ``str()`` call
    turned a bytes image into its ``b'...'`` textual repr.

    Returns:
        ``(flag, novel_id)``: ``flag`` is True when the insert ran and the
        cursor reported a non-negative ``lastrowid``; ``novel_id`` is that
        row id, or -1 when the statement failed.
    """
    flag = False
    novel_id = -1
    msql = ('INSERT INTO `novel_info` (`name`, `author`, `category`, `describe`, `complete`, `parser`,'
            ' `book_url`, `img_url`, `img_content`, `chapter_base_url`, `create_time`, `update_time`)'
            ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);')
    params = (name, author, category, describe, complete, parser, book_url,
              img_url, img_content, chapter_base_url, create_time, update_time)
    try:
        cursor = self._connect.cursor()
        cursor.execute(msql, params)
        self._connect.commit()
        novel_id = int(cursor.lastrowid)
        if novel_id >= 0:
            flag = True
            log.info(name + '|' + author + '信息保存成功!')
        else:
            log.error(name + '|' + author + '信息保存失败!')
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return flag, novel_id
def insert_passage_info(self, title: str, author: str, pageviews: int, tim: int,
                        textTop: str, img: str, textBottom: str, type: int, url: str):
    """Insert one article into the ``article`` table.

    ``tim`` is a Unix timestamp; it is rendered in local time as
    ``YYYY-MM-DD HH:MM:SS``. String fields go through the connection's
    ``escape_string`` before being placed in the statement.

    Returns:
        ``(flag, id)``: ``(False, -1)`` when title or author is invalid or
        the statement fails; otherwise the cursor's ``lastrowid`` with
        ``flag`` True when that id is non-negative.
    """
    # Title and author are mandatory.
    if not (Util.valid(title) and Util.valid(author)):
        return False, -1

    escape = self._connect.escape_string
    template = ('INSERT INTO `article` ('
                '`title`, `author`, `pageviews`, `time`, `textTop`, `img`, `textbottom`, `type`, `url`)'
                ' VALUES ("{title}", "{author}", "{pageviews}", "{tim}", "{textTop}", "{img}", "{textBottom}",'
                ' "{type}", "{url}");')
    statement = template.format(
        title=escape(title),
        author=escape(author),
        pageviews=pageviews,
        tim=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(tim)),
        textTop=escape(textTop),
        img=escape(img),
        textBottom=escape(textBottom),
        type=type,
        url=escape(url))

    saved, row_id = False, -1
    try:
        cur = self._connect.cursor()
        cur.execute(statement)
        self._connect.commit()
        row_id = int(cur.lastrowid)
        saved = row_id >= 0
        if saved:
            log.info('id=%d, name=%s, author=%s,信息保存成功' % (row_id, title, author))
        else:
            log.error(title + '|' + author + '信息保存失败!')
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return saved, row_id
def __init__(self, spiderName, url, save):
    """Set up an empty blog record and open its storage backend.

    Args:
        spiderName: Name of the spider that owns this post.
        url: Post URL; with file storage it selects the post directory.
        save: Storage kind, ``'file'`` or ``'mysql'``. Any other value is
            logged and the process exits with status 1.
    """
    self.__id = 0                # id of the post in the database
    self.__url = ''              # post URL
    self.__title = ''            # title
    self.__date = 0              # date
    self.__category = ''         # category
    self.__tag = ''              # tags
    self.__content = ''          # body, HTML/XML
    self.__sp = spiderName       # spider name
    self.__image = []            # images attached to the post
    self.__saveType = save       # storage kind: 'mysql' or 'file'
    self.__save = None           # storage backend instance

    if self.__saveType == 'file':
        self.__save = BlogFile()
        rooted = self.__save.set_rootdir(LOCAL_SPIDER_DIR + '/' + self.__sp)
        rooted.set_dir(url)
        return

    if self.__saveType == 'mysql':
        self.__save = BlogMysql()
        configured = self.__save.set_ip(MYSQL_HOST)
        configured = configured.set_port(MYSQL_PORT)
        configured = configured.set_usr(MYSQL_USER)
        configured = configured.set_password(MYSQL_PASSWORD)
        configured = configured.set_database(MYSQL_BLOG_DB)
        configured.connect()
        return

    log.error('不支持的数据保存方式!')
    exit(1)
def _parser_book_chapter_content(self, doc: str) -> (bool, str):
    """Extract the chapter text from a chapter page.

    The text is read from the ``.readbg>.content`` element of ``doc``.

    Args:
        doc: HTML of the chapter page.

    Returns:
        ``(flag, text)``: ``flag`` is True only when non-empty text was
        extracted. ``text`` stays None when parsing raised.
    """
    flag = False
    text = None
    try:
        text = pyquery.PyQuery(doc).find('.readbg>.content').text()
        if text is not None and text != '':
            flag = True
    except Exception as e:
        log.error('章节内容解析错误:' + str(e))
    return flag, text
def write_image(self, name: str, extname: str, content: str, url: str, pid: int):
    """Write an image into ``self._dir`` as ``<md5 of url>.<extname>``.

    Args:
        name: Image name (not used for the file name).
        extname: File extension, without the dot.
        content: Image payload; the file is opened in binary mode.
        url: Image URL; its MD5 hex digest becomes the file name.
        pid: Id of the owning post; must not be negative.

    Returns:
        ``(False, 0)`` when the content is invalid or ``pid`` is negative,
        else ``(True, 1)``.
    """
    # The payload and the owning post id are mandatory.
    acceptable = Util.valid(content) and not pid < 0
    if not acceptable:
        log.error('url=%s 信息错误!', url)
        return False, 0

    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    target = self._dir + '/%s.%s' % (digest, extname)
    with open(target, 'wb+') as out:
        out.write(content)
    return True, 1
def set_rootdir(self, rootdir: str):
    """Record the absolute root directory and create it when missing.

    Args:
        rootdir: Directory under which post directories are stored.

    Returns:
        ``self`` for call chaining, or None when the path exists but is not
        a directory or is not readable and writable.
    """
    target = os.path.abspath(rootdir)
    self._rootdir = target

    if not os.path.exists(target):
        os.makedirs(target)
        return self

    if not os.path.isdir(target):
        log.error('不合法的保存文件夹!')
        return None
    if not os.access(target, os.R_OK | os.W_OK):
        log.error('保存文件夹没有读写权限!')
        return None
    return self
def set_dir(self, url: str):
    """Select the directory of one post and create it when missing.

    The directory is ``<self._rootdir>/<md5 hex digest of url>``.

    Args:
        url: Post URL that identifies the directory.

    Returns:
        ``self`` for call chaining, or None when the path exists but is not
        a directory or is not readable and writable.
    """
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    target = os.path.abspath(self._rootdir + '/' + digest)
    self._dir = target

    if not os.path.exists(target):
        os.mkdir(target)
        return self

    if not os.path.isdir(target):
        log.error('不合法的保存文件夹!')
        return None
    if not os.access(target, os.R_OK | os.W_OK):
        log.error('保存文件夹没有读写权限!')
        return None
    return self
def blog_delete(self, url) -> bool:
    """Delete the ``blog_image`` rows whose ``url`` column equals ``url``.

    The URL is passed as a query parameter instead of being interpolated
    into the statement.

    Args:
        url: Value matched against the ``url`` column.

    Returns:
        True when the statement ran and removed at least one row, False
        when nothing matched or the statement failed.
    """
    # NOTE(review): save() passes the post URL here while the statement
    # targets `blog_image` — confirm that this is the intended table.
    msql = 'DELETE FROM `blog_image` WHERE `url` = %s'
    try:
        cursor = self._connect.cursor()
        cursor.execute(msql, (url,))
        self._connect.commit()
        # A DELETE has no result set; fetchone() always gave None, so the
        # affected-row count is the only usable success signal.
        return cursor.rowcount > 0
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return False
def novel_chapter_is_locked_by_url(self, url: str) -> bool:
    """Tell whether the chapter stored under ``url`` is locked.

    Args:
        url: Chapter URL used as the lookup key.

    Returns:
        True only when a row exists and its ``lock`` column equals 1.
        A missing row or a query error gives False.
    """
    statement = 'SELECT `cid`, `lock` FROM `novel_chapter` WHERE chapter_url = "{chapter_url}";'.format(
        chapter_url=self._connect.escape_string(url))
    cur = self._connect.cursor()
    try:
        cur.execute(statement)
        self._connect.commit()
        row = cur.fetchone()
        return row is not None and int(row[1]) == 1
    except Exception as e:
        log.error('SQL 执行错误: ' + str(e))
    return False
def novel_info_exist(self, url: str) -> bool:
    """Tell whether ``novel_info`` has a row for the given book URL.

    Args:
        url: Book URL used as the lookup key.

    Returns:
        True when a row was found, False when none exists or the query
        raised.
    """
    statement = 'SELECT `nid` FROM `novel_info` WHERE book_url = "{book_url}";'.format(
        book_url=self._connect.escape_string(url))
    cur = self._connect.cursor()
    try:
        cur.execute(statement)
        self._connect.commit()
        return cur.fetchone() is not None
    except Exception as e:
        log.error('SQL 执行错误: ' + str(e))
        return False
def get_passage_list(self):
    """Yield the listing-page URLs described by ``self._seedURL``.

    Each seed maps ``'<prefix>|<suffix>'`` to ``'<first>|<stop>'``. For
    every integer from ``first`` up to but excluding ``stop`` the URL
    ``prefix + number + suffix`` is appended to ``self._bookList``. Once all
    seeds are expanded, every entry of ``self._bookList`` is yielded.

    Yields:
        str: One listing-page URL at a time. Nothing is yielded when no
        seed is configured or a seed is malformed; both cases are logged.
    """
    if len(self._seedURL) < 1:
        log.error(self._name + '由于未定义seed url 导致获取book list 失败!')
        return None
    try:
        for pattern, bounds in self._seedURL.items():
            pieces = pattern.split('|')
            limits = bounds.split('|')
            page = int(limits[0])
            stop = int(limits[1])
            while page < stop:
                self._bookList.append(pieces[0] + str(page) + pieces[1])
                page += 1
        for link in self._bookList:
            yield link
    except Exception as e:
        log.error(self._name + '不符合的seed url 设置: ' + str(e))
        return None
def passage_info_exist(self, url) -> bool:
    """Tell whether the ``article`` table has a row for ``url``.

    The URL is passed as a query parameter; the earlier version put it into
    the statement unescaped.

    Args:
        url: Article URL used as the lookup key.

    Returns:
        True when a row was found, False when none exists or the query
        raised.
    """
    msql = 'SELECT `id` FROM `article` WHERE `url` = %s;'
    try:
        cursor = self._connect.cursor()
        cursor.execute(msql, (url,))
        self._connect.commit()
        return cursor.fetchone() is not None
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return False
def novel_chapter_none(self, url: str) -> bool:
    """Tell whether the chapter stored under ``url`` still lacks content.

    Args:
        url: Chapter URL used as the lookup key.

    Returns:
        True when no row exists, when its ``content`` is NULL or empty, or
        when its ``index`` is negative. False otherwise, and also when the
        query raised.
    """
    statement = 'SELECT `cid`, `content`, `index` FROM `novel_chapter` WHERE chapter_url = "{chapter_url}";'.format(
        chapter_url=self._connect.escape_string(url))
    cur = self._connect.cursor()
    try:
        cur.execute(statement)
        self._connect.commit()
        row = cur.fetchone()
        if row is None:
            return True
        if row[1] is None or row[1] == '':
            return True
        return int(row[2]) < 0
    except Exception as e:
        log.error('SQL 执行错误: ' + str(e))
        return False
def get_novel_id_by_url(self, url):
    """Look up the ``nid`` of the book stored under ``url``.

    Args:
        url: Book URL used as the lookup key.

    Returns:
        ``(flag, nid)``: ``flag`` is True when a row was found; ``nid`` is
        its id, or 0 when there is no row or the query raised.
    """
    found, novel_id = False, 0
    statement = 'SELECT `nid` FROM `novel_info` WHERE book_url="{book_url}";'.format(
        book_url=self._connect.escape_string(url))
    try:
        cur = self._connect.cursor()
        cur.execute(statement)
        self._connect.commit()
        row = cur.fetchone()
        if row is not None:
            found = True
            novel_id = int(row[0])
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return found, novel_id
def update_novel_chapter_by_url(self, novel_id: int, index: int, chapter_url: str, name: str, content: str, update_time: int) -> bool:
    """Update the ``novel_chapter`` row identified by ``chapter_url``.

    Values are passed as query parameters, so the driver quotes them and
    the statement no longer carries source indentation.

    Args:
        novel_id: Id of the owning novel.
        index: Position of the chapter inside the book.
        chapter_url: URL that identifies the row.
        name: Chapter title.
        content: Chapter text.
        update_time: Unix timestamp of this update.

    Returns:
        True when the statement ran and was committed, False when it raised.
    """
    msql = ('UPDATE `novel_chapter` SET `nid` = %s, `index` = %s, `name` = %s,'
            ' `content` = %s, `update_time` = %s WHERE `chapter_url` = %s;')
    try:
        cursor = self._connect.cursor()
        cursor.execute(msql, (novel_id, index, name, content, update_time,
                              chapter_url))
        self._connect.commit()
        log.info(str(index) + '|' + name + '|' + chapter_url + ' 章节信息更新成功!')
    except Exception as e:
        log.error('章节信息更新失败: ' + str(e))
        return False
    return True
def insert_novel_chapter(self, novel_id: int, index: int, chapter_url: str, parser: str, name: str, content: str, update_time: int):
    """Insert a new row into ``novel_chapter``.

    Values are passed as query parameters, so the driver quotes them and
    the statement no longer carries source indentation.

    Args:
        novel_id: Id of the owning novel.
        index: Position of the chapter inside the book.
        chapter_url: Chapter URL.
        parser: Name of the parser that produced the chapter.
        name: Chapter title.
        content: Chapter text; stored as ``str(content)``.
        update_time: Unix timestamp of this write.

    Returns:
        True when the statement ran and was committed, False when it raised.
    """
    msql = ('INSERT INTO `novel_chapter` (`nid`, `index`, `chapter_url`, `parser`,'
            ' `name`, `content`, `update_time`)'
            ' VALUES (%s, %s, %s, %s, %s, %s, %s);')
    try:
        cursor = self._connect.cursor()
        cursor.execute(msql, (novel_id, index, chapter_url, parser, name,
                              str(content), update_time))
        self._connect.commit()
        log.info(str(index) + '|' + name + '|' + chapter_url + ' 章节信息插入成功!')
    except Exception as e:
        log.error('插入章节' + name + '错误:' + str(e))
        return False
    return True
def novel_info_unlock_book_url_by_parser(
        self, parser_name: str) -> (str, str, str):
    """Yield the unlocked books that belong to one parser.

    This is a generator, so the query runs when iteration starts. The
    parser name is passed as a query parameter.

    Args:
        parser_name: Value matched against the ``parser`` column.

    Yields:
        ``(book_url, img_url, chapter_base_url)`` for every ``novel_info``
        row with that parser and ``lock`` equal to 0. Nothing is yielded
        when the query fails; the failure is logged.
    """
    msql = ('SELECT `book_url`, `img_url`, `chapter_base_url` FROM `novel_info`'
            ' WHERE `parser` = %s AND `lock` = 0')
    books = []
    cursor = self._connect.cursor()
    try:
        cursor.execute(msql, (parser_name,))
        self._connect.commit()
        # Collect everything first so rows are handed out only after the
        # result set has been read completely.
        for row in cursor.fetchall() or ():
            books.append((row[0], row[1], row[2]))
    except Exception as e:
        log.error('获取所有书籍信息失败:' + str(e))
    for book in books:
        yield book
def update_novel_info_img_url_by_url(self, book_url: str, img_url: str):
    """Store a new cover-image URL for the book identified by ``book_url``.

    Values are passed as query parameters, so the driver quotes them and
    the statement no longer carries source indentation.

    Args:
        book_url: URL that identifies the ``novel_info`` row.
        img_url: New cover-image URL; stored as ``str(img_url)``.

    Returns:
        True when the row is locked (nothing is changed) or the update was
        committed; False when the book does not exist or the update raised.
    """
    if self.novel_info_is_locked_by_url(book_url):
        log.info('书籍信息被锁!不可修改!')
        return True
    if not self.novel_info_exist(book_url):
        log.error('要更新的小说信息不存在!')
        return False

    msql = ('UPDATE `novel_info` SET `img_url` = %s, `update_time` = %s'
            ' WHERE `book_url` = %s;')
    try:
        cursor = self._connect.cursor()
        cursor.execute(msql, (str(img_url), int(time.time()), book_url))
        self._connect.commit()
    except Exception as e:
        log.error('书籍封面页URL更新失败:: ' + str(e))
        return False
    return True
def update_novel_info_by_url(self, book_url: str, name: str, author: str, category: str, describe: str, complete: int, img_url: str, img_content: str, chapter_base_url: str, update_time: int):
    """Overwrite the ``novel_info`` row identified by ``book_url``.

    Values are passed as query parameters, so the driver quotes them.
    ``img_content`` is handed over unchanged; the previous ``str()`` call
    turned a bytes image into its ``b'...'`` textual repr.

    Returns:
        Always None. Failures are logged, not reported to the caller.
    """
    msql = ('UPDATE `novel_info` SET `name` = %s, `author` = %s, `category` = %s,'
            ' `describe` = %s, `complete` = %s, `img_url` = %s, `img_content` = %s,'
            ' `chapter_base_url` = %s, `update_time` = %s WHERE `book_url` = %s;')
    params = (name, author, category, describe, complete, img_url,
              img_content, chapter_base_url, update_time, book_url)
    try:
        cursor = self._connect.cursor()
        cursor.execute(msql, params)
        self._connect.commit()
        log.info(name + '|' + author + '信息更新成功!')
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return None
def write_blog(self, url: str, title: str, tim: int, category: str, tag: str, spider: str, content: str, imgUrl: str):
    """Insert one post into the ``blog_passage`` table.

    ``tim`` is converted with ``Util.stamp_time`` before it is stored.

    Args:
        url: Source URL of the post (required).
        title: Post title (required).
        tim: Post time.
        category: Post category.
        tag: Post tag string.
        spider: Name of the spider that fetched the post (required).
        content: Post body.
        imgUrl: Image URL string stored in ``img_url``.

    Returns:
        ``(flag, id)``: ``(False, -1)`` when a required field is invalid or
        the statement fails; otherwise the cursor's ``lastrowid`` with
        ``flag`` True when that id is non-negative.
    """
    # Required fields must be present.
    if not (Util.valid(title) and Util.valid(spider) and Util.valid(url)):
        log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
        return False, -1

    statement = "INSERT INTO `blog_passage` ( url, title, time, category, tag, spider, content, img_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"
    saved, row_id = False, -1
    try:
        cur = self._connect.cursor()
        values = (url, title, Util.stamp_time(tim), category, tag, spider,
                  content, imgUrl)
        cur.execute(statement, values)
        self._connect.commit()
        row_id = int(cur.lastrowid)
        saved = row_id >= 0
        if not saved:
            log.error('name=%s 信息保存失败', title)
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return saved, row_id
def save(self):
    """Persist the post and its images through the storage backend.

    A post that is already stored is left untouched. Images that are
    already stored are skipped. When an image cannot be written,
    ``blog_delete`` is called with the post URL and saving stops.

    Returns:
        True when the post already existed or everything was written;
        False when writing the post or any image failed.
    """
    # Nothing to do when the post is already stored.
    if self.__save.blog_exist(self.get_url()):
        log.info('文章: %s 已存在!', self.get_title())
        return True

    image_urls = [image.get_url() for image in self.__image]
    flag, bid = self.__save.write_blog(
        self.get_url(), self.get_title(), self.get_date(),
        self.get_category(), self.get_tag(), self.get_spider_name(),
        self.get_content(), '|'.join(image_urls))
    if not flag:
        return False

    for img in self.__image:
        if self.__save.image_exist(img.get_url()):
            # Report the image that was skipped, not the post URL.
            log.info('图片: %s 已存在!', img.get_url())
            continue
        flag, iid = self.__save.write_image(
            img.get_name(), img.get_ext_name(), img.get_content(),
            img.get_url(), bid)
        if not flag:
            log.error('图片: %s 保存失败!', img.get_url())
            self.__save.blog_delete(self.get_url())
            return False
    return True
def write_image(self, name: str, ext_name: str, content: str, url: str, pid: int):
    """Insert one image into the ``blog_image`` table.

    The payload is wrapped in ``pymysql.Binary`` before it is stored.

    Args:
        name: Image name.
        ext_name: File extension of the image.
        content: Image payload.
        url: Image URL.
        pid: Id of the owning post; must be greater than 0.

    Returns:
        ``(flag, id)``: ``(False, -1)`` when the content is invalid,
        ``pid`` is not positive, or the statement fails; otherwise the
        cursor's ``lastrowid`` with ``flag`` True when that id is
        non-negative.
    """
    # The payload and a positive owner id are mandatory.
    acceptable = Util.valid(content) and not pid <= 0
    if not acceptable:
        log.error('url=%s 信息错误!', url)
        return False, -1

    statement = "INSERT INTO `blog_image` ( `url`, `name`, `ext_name`, `context`, `pid`) VALUES (%s, %s, %s, %s, %s);"
    saved, row_id = False, -1
    try:
        cur = self._connect.cursor()
        values = (url, name, ext_name, pymysql.Binary(content), pid)
        cur.execute(statement, values)
        self._connect.commit()
        row_id = int(cur.lastrowid)
        saved = row_id >= 0
        if not saved:
            log.error('name=%s 信息保存失败', name)
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return saved, row_id
def check(self):
    """Re-crawl every unlocked book of this parser and fill in what is missing.

    For each book the cover-image URL, the cover image and the chapter-list
    URL are refreshed when they changed. Every chapter that
    ``novel.none_chapter`` reports as missing is then downloaded and saved.
    A summary of the counters is logged at the end, followed by a
    five-second pause.

    Returns:
        Always True.
    """
    check_index = 0
    check_all = 0
    check_error = 0
    check_update = 0
    check_update_chapter = 0
    check_update_exit_chapter = 0
    parser = get_parser().get_parser(CC_UU234_NAME)
    novel = Novel(CC_UU234_NAME)
    for book_url, img_url, chapter_base_url in novel.get_unlock_book_by_parser(
            CC_UU234_NAME):
        check_index += 1
        log.info('开始检查' + str(check_index) + ':' + book_url)
        check_all += 1
        text = Spider.http_get(book_url)
        if '' == text or None is text:
            check_error += 1
            log.error('获取书籍信息出错:' + book_url)
            continue

        flag, img_url_new = parser.parse(
            text, parse_type=parser.PARSER_BOOK_IMG_URL)
        if (img_url != img_url_new) and flag:
            check_update += 1
            # Store the freshly parsed URL; passing the stored one was a no-op.
            novel.update_novel_info_img_url(book_url, img_url_new)
            img_content = Spider.http_get(img_url_new)
            if '' != img_content and None is not img_content:
                check_update += 1
                novel.update_novel_info_img_content(book_url, img_content)

        flag, chapter_url_new = parser.parse(
            text, parse_type=parser.PARSER_BOOK_CHAPTER_BASE_URL)
        if (chapter_base_url != chapter_url_new) and flag:
            check_update += 1
            novel.update_novel_info_chapter_base(book_url, chapter_url_new)
        # Fall back to the stored URL when the page could not be parsed.
        chapter_list_url = chapter_url_new if flag else chapter_base_url

        text = Spider.http_get(chapter_list_url)
        if '' == text or None is text:
            check_error += 1
            log.error('获取书籍章节信息出错:' + str(chapter_list_url))
            # Without a chapter list there is nothing to iterate over.
            continue

        for index, name, chapter_url in parser.parse(
                text, parse_type=parser.PARSER_BOOK_CHAPTER_URL):
            check_update_chapter += 1
            if not novel.none_chapter(chapter_url):
                check_update_exit_chapter += 1
                continue
            c = Spider.http_get(chapter_url)
            # Test the chapter just downloaded, not the chapter-list page.
            if '' == c or None is c:
                check_error += 1
                log.error(name + '|' + chapter_url + '|' + name + ' 下载失败!')
                continue
            flag, content = parser.parse(
                c, parse_type=parser.PARSER_BOOK_CHAPTER_CONTENT)
            if flag:
                novel.save_check_novel_one_chapter(index, name, content,
                                                   chapter_url, book_url)
        log.info('检查结束' + str(check_index) + ':' + book_url)

    log.info('检查结果:'
             + '\n\t\t总共:' + str(check_all)
             + '\n\t\t失败:' + str(check_error)
             + '\n\t\t成功更新:' + str(check_update)
             + '\n\t\t已有章节:' + str(check_update_exit_chapter)
             + '\n\t\t成功更新章节:' + str(check_update_chapter))
    time.sleep(5)
    return True
def run(self):
    """Crawl every listing page and store the books and chapters found.

    For each book on a listing page the metadata is parsed. Books that
    ``novel.has_book`` reports as stored are skipped. Otherwise the detail
    page, cover image and chapter list are fetched, the book is saved, and
    each chapter not yet stored is saved first as an empty placeholder and
    then with its downloaded text.
    """
    parser = get_parser().get_parser(CC_UU234_NAME)
    for list_url in self.get_book_list():
        list_page = Spider.http_get(list_url)
        if not list_page:
            continue
        doc = parser.parse(list_page, rule='body>div>.listconl>.clearfix')
        for ct in doc.children().items():
            novel = Novel(CC_UU234_NAME)
            item_html = ct.html()
            flag, book_name = parser.parse(
                item_html, parse_type=parser.PARSER_BOOK_NAME)
            novel.set_name(book_name)
            flag, author = parser.parse(
                item_html, parse_type=parser.PARSER_BOOK_AUTHOR)
            novel.set_author(author)
            flag, book_url = parser.parse(
                item_html, parse_type=parser.PARSER_BOOK_URL)
            novel.set_book_url(book_url)
            flag, category = parser.parse(
                item_html, parse_type=parser.PARSER_BOOK_CATEGORY)
            novel.set_category(category)
            flag, status = parser.parse(
                item_html, parse_type=parser.PARSER_BOOK_STATUS)
            novel.set_complete(status)
            if novel.has_book(book_url):
                log.info(book_name + '|' + author + '已存在!')
                continue

            book_page = Spider.http_get(novel.get_book_url())
            if not book_page:
                continue
            flag, img_url = parser.parse(
                book_page, parse_type=parser.PARSER_BOOK_IMG_URL)
            novel.set_img_url(img_url)
            flag, desc = parser.parse(
                book_page, parse_type=parser.PARSER_BOOK_DESC)
            novel.set_describe(desc)
            flag, chapter_base_url = parser.parse(
                book_page, parse_type=parser.PARSER_BOOK_CHAPTER_BASE_URL)
            novel.set_chapter_base_url(chapter_base_url)
            img_content = Spider.http_get(novel.get_img_url(),
                                          resource_method=2)
            novel.set_img_content(img_content)

            chapter_list = Spider.http_get(novel.get_chapter_base_url())
            if not chapter_list:
                continue
            # Skip the book when it is locked or saving it failed.
            if not novel.save_novel_info():
                continue

            for index, name, chapter_url in parser.parse(
                    chapter_list, parse_type=parser.PARSER_BOOK_CHAPTER_URL):
                # Chapters that are already stored are left alone.
                if novel.has_chapter(chapter_url):
                    continue
                # Register the chapter with empty content before downloading.
                novel.save_novel_one_chapter(index, name, '', chapter_url)
                log.info('正在获取 ' + novel.get_name() + '|'
                         + novel.get_author() + '|' + name + '|' + chapter_url)
                c = Spider.http_get(chapter_url)
                # Test the chapter just downloaded, not the chapter list.
                if not c:
                    log.error(novel.get_name() + '|' + novel.get_author()
                              + '|' + name + '下载失败!')
                    continue
                flag, content = parser.parse(
                    c, parse_type=parser.PARSER_BOOK_CHAPTER_CONTENT)
                if flag:
                    novel.save_novel_one_chapter(index, name, content,
                                                 chapter_url)
    log.info(self._name + '执行完成!')
def run(self):
    """Crawl every listing page and save the blog posts found on it.

    For each entry the post URL, title, body, date, category, tags and
    image URLs are parsed in that order. An entry is skipped as soon as one
    step fails or the post is already stored. Every image is downloaded and
    attached before the post is saved.
    """
    parser = get_parser().get_parser(self._name)
    for list_url in self.get_passage_list():
        listing = Spider.http_get(list_url)
        if listing == '':
            log.error('url:' + list_url + '抓取错误!')
            continue

        entries = parser.parse(listing, rule='body>div>#middle .mframe>.wrapper>.mm')
        for entry in entries.children().items():
            # Post URL
            ok, post_url = parser.parse(entry, parse_type=parser.PARSER_PASSAGE_URL)
            if not ok:
                log.error('url:' + list_url + '解析 url 错误!')
                continue
            post = Blog(self._name, post_url, self._save)
            post_url = Util.check_url(post_url, self._webURL)
            post.set_url(post_url)

            # Skip posts that are already stored.
            if post.exist(post_url):
                log.info('文章url: %s 已存在!', post_url)
                continue

            # Title
            ok, post_title = parser.parse(entry, parse_type=parser.PARSER_PASSAGE_TITLE)
            if not ok:
                log.error('url:' + list_url + '解析 title 错误!')
                continue
            post.set_title(post_title)

            # Body
            page = Spider.http_get(post_url)
            if page == '':
                log.error('url:' + post_url + '获取内容错误!')
                continue
            ok, post_body = parser.parse(page, parse_type=parser.PARSER_PASSAGE_CONTENT)
            if not ok:
                log.error('url:' + post_url + '解析内容错误!')
                continue
            post.set_content(post_body)

            # Date
            ok, post_date = parser.parse(page, parse_type=parser.PARSER_PASSAGE_DATE)
            if not ok:
                log.error('url:' + post_url + '解析日期错误!')
                continue
            post.set_date(Util.time_str_stamp(post_date, '%Y-%m-%d'))

            # Category
            ok, post_category = parser.parse(page, parse_type=parser.PARSER_PASSAGE_CATEGORY)
            if not ok:
                log.error('url:' + post_url + '解析分类错误!')
                continue
            post.set_category(post_category)

            # Tags
            ok, post_tag = parser.parse(page, parse_type=parser.PARSER_PASSAGE_TAG)
            if not ok:
                log.error('url:' + post_url + '解析标签错误!')
                continue
            post.set_tag(post_tag)

            # Image URLs; each image is downloaded right away.
            ok, image_links = parser.parse(page, parse_type=parser.PARSER_PASSAGE_IMGURL)
            if not ok:
                log.error('url:' + post_url + '解析图片错误!')
                continue
            for link in image_links:
                picture = Image()
                picture_url = Util.check_url(link, self._webURL)
                payload = Spider.http_get(picture_url, 0)
                picture.set_url(picture_url)
                base_name, extension = Util.image_name(picture_url)
                picture.set_name(base_name)
                picture.set_ext_name(extension)
                picture.set_content(payload)
                post.append_image(picture)

            # Save
            if post.save():
                log.info('文章: %s 保存成功!', post.get_title())
            else:
                log.error('文章: %s 保存失败!', post.get_title())