def create_work_set(self, target_url):
    """Register a column and queue its paginated post-list URLs.

    Fetches the column metadata from the zhuanlan columns API, appends a
    flattened info dict to ``self.info_list``, records ``target_url`` in
    ``self.task_complete_set`` and adds one posts-API URL per page of ten
    articles to ``self.work_set``.

    Nothing is recorded when ``target_url`` was already completed, when
    ``Match.column`` yields no match, or when the metadata request returns
    no content.

    :param target_url: column URL to process.
    :return: always ``None``.

    Errors from ``json.loads`` and from missing keys in the decoded
    payload are not caught here and propagate to the caller.
    """
    if target_url in self.task_complete_set:
        return

    result = Match.column(target_url)
    if result is None:
        # Assumes Match.column signals "no match" with None (as the ``re``
        # matchers do); skip instead of failing on ``None.group``.
        return
    column_id = result.group('column_id')

    content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + column_id)
    if not content:
        return

    raw_info = json.loads(content)
    creator = raw_info['creator']
    creator_avatar = creator['avatar']
    avatar_template = creator_avatar['template']

    info = {
        'creator_id': creator['slug'],
        'creator_hash': creator['hash'],
        'creator_sign': creator['bio'],
        'creator_name': creator['name'],
        'creator_logo': avatar_template.replace('{id}', creator_avatar['id']).replace('_{size}', ''),
        'column_id': raw_info['slug'],
        'name': raw_info['name'],
        # NOTE(review): the column logo combines the *creator's* avatar
        # template with the column's own avatar id. Kept as-is; confirm
        # whether raw_info['avatar']['template'] was intended.
        'logo': avatar_template.replace('{id}', raw_info['avatar']['id']).replace('_{size}', ''),
        'article': raw_info['postsCount'],
        'follower': raw_info['followersCount'],
        'description': raw_info['description'],
    }
    self.info_list.append(info)
    self.task_complete_set.add(target_url)

    detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(column_id)
    # Floor division keeps the page count an int on Python 3 as well;
    # plain ``/`` would hand ``range`` a float there.
    page_count = info['article'] // 10 + 1
    for page in range(page_count):
        self.work_set.add(detect_url + str(page * 10))
    return
def create_work_set(self, target_url):
    """Collect a column's metadata and schedule its post-list pages.

    Side effects on success:

    * ``self.column_id`` is set to the id extracted from ``target_url``
      (this happens before the metadata request is made);
    * a flattened info dict is appended to ``self.info_list``;
    * ``target_url`` is added to ``self.task_complete_set``;
    * one posts-API URL per block of ten articles is added to
      ``self.work_set``.

    The method returns early, recording nothing further, when
    ``target_url`` is already in ``self.task_complete_set``, when
    ``Match.column`` yields no match, or when the metadata request returns
    no content.

    :param target_url: column URL to process.
    :return: always ``None``.

    Exceptions raised by ``json.loads`` or by missing keys in the decoded
    payload are not handled here.
    """
    if target_url in self.task_complete_set:
        return

    result = Match.column(target_url)
    if result is None:
        # Assumes a failed match is reported as None; bail out quietly
        # rather than raising AttributeError on ``None.group``.
        return
    self.column_id = result.group('column_id')

    content = Http.get_content('https://zhuanlan.zhihu.com/api/columns/' + self.column_id)
    if not content:
        return
    raw_info = json.loads(content)

    def fill_avatar(template, avatar_id):
        # Substitute the avatar id and drop the "_{size}" placeholder.
        return template.replace('{id}', avatar_id).replace('_{size}', '')

    creator = raw_info['creator']
    info = {}
    info['creator_id'] = creator['slug']
    info['creator_hash'] = creator['hash']
    info['creator_sign'] = creator['bio']
    info['creator_name'] = creator['name']
    info['creator_logo'] = fill_avatar(creator['avatar']['template'], creator['avatar']['id'])
    info['column_id'] = raw_info['slug']
    info['name'] = raw_info['name']
    # NOTE(review): the column logo is built from the creator's avatar
    # template plus the column's avatar id. Preserved unchanged; confirm
    # whether the column's own template should be used.
    info['logo'] = fill_avatar(creator['avatar']['template'], raw_info['avatar']['id'])
    info['article'] = raw_info['postsCount']
    info['follower'] = raw_info['followersCount']
    info['description'] = raw_info['description']

    self.info_list.append(info)
    self.task_complete_set.add(target_url)

    detect_url = 'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset='.format(self.column_id)
    # ``//`` instead of ``/``: on Python 3 true division produces a float,
    # which ``range`` rejects. Result is unchanged on Python 2.
    for page_index in range(info['article'] // 10 + 1):
        self.work_set.add(detect_url + str(page_index * 10))
    return
def create_work_set(self, target_url):
    """Record column metadata and enqueue the URLs of its post-list pages.

    The column id is read from the ``column_id`` group of
    ``Match.column(target_url)`` and stored on ``self.column_id``. The
    column's JSON description is then fetched and reduced to a flat dict
    that is appended to ``self.info_list``. Finally ``target_url`` is
    added to ``self.task_complete_set`` and the posts-API URLs (offsets
    0, 10, 20, ...) are added to ``self.work_set``.

    Early exits (no info appended, no URLs queued): the URL is already in
    ``self.task_complete_set``; ``Match.column`` yields no match; the
    metadata request returns no content.

    :param target_url: column URL to process.
    :return: always ``None``.

    ``json.loads`` errors and ``KeyError`` for absent payload fields are
    left to propagate.
    """
    if target_url in self.task_complete_set:
        return

    result = Match.column(target_url)
    if result is None:
        # Presumably None means "not a column URL"; nothing to do then.
        return
    self.column_id = result.group("column_id")

    api_root = "https://zhuanlan.zhihu.com/api/columns/"
    content = Http.get_content(api_root + self.column_id)
    if not content:
        return

    raw_info = json.loads(content)
    creator = raw_info["creator"]
    creator_avatar = creator["avatar"]

    creator_logo = creator_avatar["template"].replace("{id}", creator_avatar["id"])
    creator_logo = creator_logo.replace("_{size}", "")

    # NOTE(review): uses the creator's avatar template with the column's
    # avatar id. Behaviour kept; verify against the API payload whether
    # raw_info["avatar"]["template"] was meant.
    column_logo = creator_avatar["template"].replace("{id}", raw_info["avatar"]["id"])
    column_logo = column_logo.replace("_{size}", "")

    info = {
        "creator_id": creator["slug"],
        "creator_hash": creator["hash"],
        "creator_sign": creator["bio"],
        "creator_name": creator["name"],
        "creator_logo": creator_logo,
        "column_id": raw_info["slug"],
        "name": raw_info["name"],
        "logo": column_logo,
        "article": raw_info["postsCount"],
        "follower": raw_info["followersCount"],
        "description": raw_info["description"],
    }
    self.info_list.append(info)
    self.task_complete_set.add(target_url)

    detect_url = "https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=10&offset=".format(self.column_id)
    # Integer division: ``/`` would give ``range`` a float on Python 3.
    pages = info["article"] // 10 + 1
    for page in range(pages):
        self.work_set.add(detect_url + str(page * 10))
    return
def parse_column(command):
    """Build the ``SingleTask`` for the column referenced by ``command``.

    The column id is the ``column_id`` group of ``Match.column(command)``.
    It is substituted into the spider URL and into the two SQL strings
    stored on the task (column info and articles); the question query is
    left empty.

    :param command: text passed to ``Match.column``.
    :return: the populated ``SingleTask``.
    """
    column_id = Match.column(command).group('column_id')

    href_template = 'https://zhuanlan.zhihu.com/{}'
    info_sql_template = 'select * from ColumnInfo where column_id = "{}" '
    article_sql_template = 'select * from Article where column_id = "{}" '

    task = SingleTask()
    task.kind = 'column'
    task.spider.href = href_template.format(column_id)

    task.book.kind = 'column'
    task.book.sql.info = info_sql_template.format(column_id)
    task.book.sql.question = ''
    task.book.sql.answer = article_sql_template.format(column_id)
    return task
def parse_column(command):
    """Translate a column command into a configured ``SingleTask``.

    ``Match.column(command)`` supplies the ``column_id`` group, which is
    used to fill the spider's start URL and the SQL strings selecting the
    column's row from ``ColumnInfo`` and its rows from ``Article``.

    :param command: text passed to ``Match.column``.
    :return: a ``SingleTask`` whose ``kind`` and ``book.kind`` are
        ``'column'``.
    """
    matched = Match.column(command)
    column_id = matched.group('column_id')
    kind = 'column'

    def select_by_column(table):
        # Query string for every row of ``table`` belonging to this column.
        return 'select * from {} where column_id = "{}" '.format(table, column_id)

    task = SingleTask()
    task.kind = kind
    task.spider.href = 'https://zhuanlan.zhihu.com/{}'.format(column_id)
    task.book.kind = kind
    task.book.sql.info = select_by_column('ColumnInfo')
    task.book.sql.question = ''
    task.book.sql.answer = select_by_column('Article')
    return task
def parse_column(command):
    """Return a ``ColumnTask`` for the column id found in ``command``.

    The id is the ``column_id`` group of ``Match.column(command)``.

    :param command: text passed to ``Match.column``.
    :return: ``ColumnTask`` constructed from the extracted column id.
    """
    return ColumnTask(Match.column(command).group(u'column_id'))