def create_work_set(self, target_url):
    u"""
    Queue every article URL of a Jianshu author into ``self.work_set``.

    ``target_url`` is an author's article listing, for example
    http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles. The author
    id is extracted from it, ``get_jianshu_question_list`` supplies the
    total article count together with an initial list of article URLs, and
    the remaining listing pages (``?page=2`` onwards) are then fetched and
    parsed one by one.

    :param target_url: author listing URL; nothing is done when it is
        already present in ``self.task_complete_set``.
    :return: None
    """
    if target_url in self.task_complete_set:
        return

    id_result = Match.jianshu_author(target_url)
    jianshu_id = id_result.group('jianshu_id')
    article_num, article_list = self.get_jianshu_question_list(target_url)
    self.task_complete_set.add(target_url)

    # Each listing page carries 9 article links.
    articles_per_page = 9
    # Ceiling division done with ``//`` so the result stays an int on both
    # Python 2 and Python 3; a plain ``/`` yields a float on Python 3 and
    # makes the ``range`` call below raise TypeError.
    page_num = (article_num + articles_per_page - 1) // articles_per_page

    self.work_set.update(article_list)

    # The first page's articles were already returned above, so start at 2.
    for page in range(2, page_num + 1):
        url = 'http://www.jianshu.com/users/{}/latest_articles?page={}'.format(jianshu_id, page)
        content_article_list = Http.get_content(url)
        self.work_set.update(self.parse_get_article_list(content_article_list))
    return
def parse_jianshu_author(command):
    u"""
    Build the task describing a Jianshu author from their homepage URL.

    :param command: homepage of an author, e.g.
        http://www.jianshu.com/users/b1dd2b2c87a8/latest_articles
    :return: a ``SingleTask`` whose spider href, book kind, SQL filter
        fragments and author ids are all derived from the author id found
        in ``command``.
    """
    author_id = Match.jianshu_author(command).group('jianshu_id')
    kind = 'jianshu_author'

    task = SingleTask()
    task.author_id = author_id
    task.kind = kind
    # Canonical listing URL, rebuilt from the id instead of reusing `command`.
    task.spider.href = 'http://www.jianshu.com/users/{}/latest_articles'.format(author_id)

    book = task.book
    book.kind = kind
    # Filter fragments selecting this author's rows.
    book.sql.info_extra = 'creator_id = "{}"'.format(author_id)
    book.sql.article_extra = 'author_id = "{}"'.format(author_id)
    book.author_id = author_id
    return task