def write_blog(self, url: str, title: str, tim: int, category: str, tag: str,
               spider: str, content: str, imgUrl: str):
    """Insert one blog passage into the `blog_passage` table.

    Returns (flag, row_id): flag is True on success and row_id is the
    auto-increment id of the new row; (False, -1) on bad input or error.
    """
    flag = False
    row_id = -1  # renamed from `id` to avoid shadowing the builtin
    # Validate the key fields before touching the database.
    if (not Util.valid(title)) or (not Util.valid(spider)) or (not Util.valid(url)):
        log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
        return flag, -1
    msql = """INSERT INTO `blog_passage` (
        url, title, time, category, tag, spider, content, img_url)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""
    try:
        cursor = self._connect.cursor()
        try:
            # Parameterized query; Util.stamp_time converts the unix stamp
            # into the DB's time representation.
            cursor.execute(msql, (url, title, Util.stamp_time(tim), category,
                                  tag, spider, content, imgUrl))
            self._connect.commit()
            row_id = int(cursor.lastrowid)
            if row_id >= 0:
                flag = True
            else:
                log.error('name=%s 信息保存失败', title)
        finally:
            cursor.close()  # fix: the original leaked the cursor
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return flag, row_id
def write_blog(self, url: str, title: str, tim: int, category: str, tag: str,
               spider: str, content: str, imgUrl: str):
    """Write one blog passage to disk: an index.txt metadata file plus a
    <title>.html file containing the content.

    Returns (False, 1) when a key field is invalid, (True, 1) after both
    files are written.
    """
    # Validate the key fields before writing anything.
    if (not Util.valid(title)) or (not Util.valid(spider)) or (not Util.valid(url)):
        log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
        return False, 1
    indexfile = self._dir + '/index.txt'
    # fix: the old message said "已存在" (already exists), but this code is
    # writing the file, not detecting a duplicate.
    log.info('写入 blog index %s', indexfile)
    # Explicit utf-8: titles/tags are Chinese text and the platform default
    # encoding is not guaranteed to handle them.
    with open(indexfile, 'w+', encoding='utf-8') as fw:
        fw.write('name:%s\n' % title)
        fw.write('url:%s\n' % url)
        fw.write('category:%s\n' % category)
        fw.write('tag:%s\n' % tag)
        fw.write('time:%s\n' % tim)
        fw.write('spider:%s\n' % spider)
        fw.write('img:%s\n' % imgUrl)
    # NOTE(review): title is used directly as a filename — characters like
    # '/' would break this; consider sanitizing. TODO confirm upstream titles.
    passagefile = self._dir + '/%s.html' % title
    log.info('写入 blog passage %s', passagefile)
    with open(passagefile, 'w+', encoding='utf-8') as fw:
        fw.write('<!Doctype html> '
                 '<html> '
                 '<head> '
                 '<title>%s</title> '
                 '</head> '
                 '<body>%s</body> '
                 '</html>' % (title, content))
    return True, 1
def insert_passage_info(self, title: str, author: str, pageviews: int, tim: int,
                        textTop: str, img: str, textBottom: str, type: int, url: str):
    """Insert one article row into the `article` table.

    Returns (flag, row_id): flag is True on success and row_id is the new
    auto-increment id; (False, -1) on bad input or error.
    """
    flag = False
    row_id = -1  # renamed from `id` to avoid shadowing the builtin
    # Validate the key fields before touching the database.
    if (not Util.valid(title)) or (not Util.valid(author)):
        return flag, -1
    # Parameterized query instead of escape_string + str.format: the driver
    # does the quoting itself, which closes the SQL-injection hole the
    # string-built statement left open and matches the sibling writers.
    msql = ('INSERT INTO `article` ('
            '`title`, `author`, `pageviews`, `time`, `textTop`, `img`, `textbottom`, `type`, `url`)'
            ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);')
    try:
        cursor = self._connect.cursor()
        try:
            cursor.execute(msql, (
                title,
                author,
                pageviews,
                # unix stamp -> "YYYY-mm-dd HH:MM:SS" in local time
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(tim)),
                textTop,
                img,
                textBottom,
                type,
                url))
            self._connect.commit()
            row_id = int(cursor.lastrowid)
            if row_id >= 0:
                flag = True
                log.info('id=%d, name=%s, author=%s,信息保存成功' % (row_id, title, author))
            else:
                log.error(title + '|' + author + '信息保存失败!')
        finally:
            cursor.close()  # fix: the original leaked the cursor
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return flag, row_id
def _parser_passage_date(self, doc: str) -> (bool, str):
    """Extract the publish time from a passage page.

    Returns (flag, stamp): flag is True when the time text was found,
    stamp is the unix timestamp (epoch-era default when missing).
    """
    flag = False
    tm = pyquery.PyQuery(doc).find(
        'body>.main>.main_left>.art_con_left>.source>p>span').eq(0).text()
    # pyquery yields ''/None for a missing node, so test truthiness rather
    # than `is not None`; any non-empty text counts as a successful parse.
    if tm:
        # fix: the original never set flag True (and bound an unused `arr`),
        # so callers always treated the parse as failed.
        flag = True
    else:
        tm = '1970-12-01 00:00:00'
    return flag, Util.time_str_stamp(tm.strip(), "%Y-%m-%d %H:%M:%S")
def _parser_passage_url(self, doc: str) -> (bool, str):
    """Pull the passage link out of a list-page snippet.

    Returns (flag, url): flag marks whether an href attribute was present;
    url is the href normalized against COM_XZW_WEB_URL ('' when absent).
    """
    href = pyquery.PyQuery(doc).find('h3>a').attr('href')
    if href is None:
        found, href = False, ''
    else:
        found = True
    normalized = Util.check_url(href.strip(), COM_XZW_WEB_URL)
    return found, normalized.strip()
def write_image(self, name: str, extname: str, content: bytes, url: str, pid: int):
    """Write one downloaded image to disk, named by the md5 of its URL.

    `content` is the raw image bytes — the original annotation said str,
    but the file is opened in binary mode and write() needs bytes.
    Returns (False, 0) on bad input, (True, 1) after the file is written.
    """
    # Validate the key fields before writing.
    if not Util.valid(content) or pid < 0:
        log.error('url=%s 信息错误!', url)
        return False, 0
    # md5 of the URL gives a stable, filesystem-safe filename.
    n = hashlib.md5(url.encode('utf-8')).hexdigest()
    imgfile = self._dir + '/%s.%s' % (n, extname)
    with open(imgfile, 'wb+') as fw:
        fw.write(content)
    return True, 1
def _parser_passage_date(self, doc: str) -> (bool, str):
    """Extract the publish date from a passage page.

    Returns (flag, stamp): flag is True when the date text was found,
    stamp is the unix timestamp (epoch-era default when missing).
    """
    flag = False
    tm = pyquery.PyQuery(doc).find(
        'body>.main-wrap>.m-wrap>.main-part>.sbody>.info>.s1').text()
    # pyquery yields ''/None for a missing node, so test truthiness rather
    # than `is not None`.
    if tm:
        # Strip the "时间" label and colon characters, keep the date part.
        tm = re.sub(r'(时间|:|:)', '', tm)
        tm = tm.strip().split(' ')[0]
        # fix: the original never set flag True, so callers always treated
        # the parse as failed.
        flag = True
    else:
        tm = '1970-12-01'
    return flag, Util.time_str_stamp(tm.strip(), "%Y-%m-%d")
def write_image(self, name: str, ext_name: str, content: bytes, url: str, pid: int):
    """Insert one image blob into the `blog_image` table, linked to the
    passage row `pid`.

    `content` is the raw image bytes (pymysql.Binary requires bytes; the
    original str annotation was wrong).
    Returns (flag, row_id): flag is True on success and row_id is the new
    auto-increment id; (False, -1) on bad input or error.
    """
    flag = False
    row_id = -1  # renamed from `id` to avoid shadowing the builtin
    # Validate the key fields before touching the database.
    if not Util.valid(content) or pid <= 0:
        log.error('url=%s 信息错误!', url)
        return flag, -1
    msql = """INSERT INTO `blog_image` (
        `url`, `name`, `ext_name`, `context`, `pid`)
        VALUES (%s, %s, %s, %s, %s);"""
    try:
        cursor = self._connect.cursor()
        try:
            cursor.execute(msql, (url, name, ext_name, pymysql.Binary(content), pid))
            self._connect.commit()
            row_id = int(cursor.lastrowid)
            if row_id >= 0:
                flag = True
            else:
                log.error('name=%s 信息保存失败', name)
        finally:
            cursor.close()  # fix: the original leaked the cursor
    except Exception as e:
        log.error('MySQL 执行错误: ' + str(e))
    return flag, row_id
def run(self):
    """Crawl the passage list for this spider: fetch each list page, parse
    out every blog entry (url, title, content, date, category, tags,
    images), download the images, and save the assembled Blog.

    Any parse/fetch failure for one entry logs an error and skips to the
    next entry; nothing is raised.
    """
    parser = get_parser().get_parser(self._name)
    for url in self.get_passage_list():
        text = Spider.http_get(url)
        if '' == text:
            log.error('url:' + url + '抓取错误!')
            continue
        # Narrow the page down to the container holding the blog entries.
        doc = parser.parse(text, rule='body>div>#middle .mframe>.wrapper>.mm')
        for ct in doc.children().items():
            # Parse the blog URL
            flag, blogUrl = parser.parse(
                ct, parse_type=parser.PARSER_PASSAGE_URL)
            if not flag:
                log.error('url:' + url + '解析 url 错误!')
                continue
            blog = Blog(self._name, blogUrl, self._save)
            # Normalize a possibly-relative href against the site base URL.
            blogUrl = Util.check_url(blogUrl, self._webURL)
            blog.set_url(blogUrl)
            # Skip blogs that were already stored
            if blog.exist(blogUrl):
                log.info('文章url: %s 已存在!', blogUrl)
                continue
            # Parse the blog title
            flag, blogTitle = parser.parse(
                ct, parse_type=parser.PARSER_PASSAGE_TITLE)
            if not flag:
                log.error('url:' + url + '解析 title 错误!')
                continue
            blog.set_title(blogTitle)
            # Fetch and parse the blog content page
            content = Spider.http_get(blogUrl)
            if '' == content:
                log.error('url:' + blogUrl + '获取内容错误!')
                continue
            flag, blogContent = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_CONTENT)
            if not flag:
                log.error('url:' + blogUrl + '解析内容错误!')
                continue
            blog.set_content(blogContent)
            # Parse the blog date
            flag, blogDate = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_DATE)
            if not flag:
                log.error('url:' + blogUrl + '解析日期错误!')
                continue
            # NOTE(review): some date parsers in this file already return a
            # unix stamp from Util.time_str_stamp — confirm this spider's
            # parser returns a '%Y-%m-%d' string, otherwise this converts twice.
            blogDate = Util.time_str_stamp(blogDate, '%Y-%m-%d')
            blog.set_date(blogDate)
            # Parse the blog category
            flag, blogCategory = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_CATEGORY)
            if not flag:
                log.error('url:' + blogUrl + '解析分类错误!')
                continue
            blog.set_category(blogCategory)
            # Parse the blog tags
            flag, blogTag = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_TAG)
            if not flag:
                log.error('url:' + blogUrl + '解析标签错误!')
                continue
            blog.set_tag(blogTag)
            # Parse the image URLs and download each image
            flag, blogImgt = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_IMGURL)
            if not flag:
                log.error('url:' + blogUrl + '解析图片错误!')
                continue
            for im in blogImgt:
                img = Image()
                imgUrl = Util.check_url(im, self._webURL)
                blogImgContent = Spider.http_get(imgUrl, 0)
                img.set_url(imgUrl)
                iname, iename = Util.image_name(imgUrl)
                img.set_name(iname)
                img.set_ext_name(iename)
                img.set_content(blogImgContent)
                blog.append_image(img)
            # Persist the assembled blog (and its images)
            if blog.save():
                log.info('文章: %s 保存成功!', blog.get_title())
            else:
                log.error('文章: %s 保存失败!', blog.get_title())