def create_work_set(self, target_url):
    """Queue every article URL of one Sina blog into ``self.work_set``.

    Starting from the blog's home-page URL, the blog id is extracted with
    ``Match.sinablog_author``.  ``get_sinablog_question_list`` is then
    called with that id and its return value is used as the number of
    articles (per the original author's notes, that call also gathers the
    ``sinablog_info`` data from the "about me" page -- arguably this does
    not belong in this function and could be moved out).  Finally every
    article-list page is downloaded and each item returned by
    ``parse_get_article_list`` is added to ``self.work_set``.

    A URL already present in ``self.task_complete_set`` is skipped.

    :param target_url: URL of the blog's home page
    :return: None
    """
    if target_url in self.task_complete_set:
        return

    result = Match.sinablog_author(target_url)
    sinablog_author_id = int(result.group('sinablog_people_id'))
    article_num = self.get_sinablog_question_list(sinablog_author_id)

    # One article-list page carries 50 links.  The ceiling division is done
    # with ``//`` so page_num stays an int on Python 3 too; plain ``/``
    # yields a float there and range() would raise TypeError.
    articles_per_page = 50
    page_num = (article_num + articles_per_page - 1) // articles_per_page

    # The "about me" page does not contain the article count, so for now
    # it has to be written into the first question_list entry here.
    self.question_list[0]['article_num'] = article_num
    self.task_complete_set.add(target_url)

    # Article-list pages are numbered starting from 1.
    for page in range(1, page_num + 1):
        url = 'http://blog.sina.com.cn/s/articlelist_{}_0_{}.html'.format(sinablog_author_id, page)
        content_article_list = Http.get_content(url)
        article_list = self.parse_get_article_list(content_article_list)
        for item in article_list:
            self.work_set.add(item)
    return
def parse_sinablog_author(command):
    """Build the crawl task for a single Sina blog author.

    The author id is taken from the ``sinablog_people_id`` group of
    ``Match.sinablog_author(command)`` and used to fill in the spider
    URLs and the book SQL filters of a new ``SingleTask``.

    :param command: home-page address of a Sina blog author
    :return: the populated ``SingleTask``
    """
    matched = Match.sinablog_author(command)
    author_id = matched.group('sinablog_people_id')
    Debug.logger.debug(u"sinablog_people_id:" + str(author_id))

    # Both the task and its book are tagged with the same kind.
    kind = 'sinablog_author'

    # Pages belonging to this author: first article-list page, home, profile.
    article_list_url = 'http://blog.sina.com.cn/s/articlelist_{}_0_1.html'.format(author_id)
    home_url = 'http://blog.sina.com.cn/u/{}'.format(author_id)
    profile_url = 'http://blog.sina.com.cn/s/profile_{}.html'.format(author_id)

    # Filters stored on the book's sql object, keyed by the author id.
    info_filter = 'creator_id = "{}"'.format(author_id)
    article_filter = 'author_id = "{}"'.format(author_id)

    task = SingleTask()
    task.author_id = author_id
    task.kind = kind

    task.spider.href_article_list = article_list_url
    task.spider.href = home_url
    task.spider.href_profile = profile_url

    task.book.kind = kind
    task.book.sql.info_extra = info_filter
    task.book.sql.article_extra = article_filter
    task.book.author_id = author_id
    return task