class Crawler:
    def __init__(self, max_redirect=10, max_tries=4, max_tasks=10, *,
                 loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.seen_topics = set()
        self.seen_zhuanlan = set()
        # Seed the queue with the first pages of every live listing type.
        for t in LIVE_TYPE:
            for offset in range(max_tasks):
                self.add_url(LIVE_API_URL.format(type=t, offset=offset * 10))
        self.t0 = time.time()
        self.t1 = None
        self.client = ZhihuClient()
        self.headers = {}
        # [@Stephen] Calling auth() invokes its __call__, which hands the auth
        # headers to this object and stores them in self.headers, so the
        # crawler's requests are authenticated.
        self.client.auth(self)
        self._session = None
        # Per-feed flags: set to True once the 'ended'/'ongoing' live listings
        # or the zhuanlan 'posts' feed has no further page to fetch.
        self.__stopped = {}.fromkeys(['ended', 'ongoing', 'posts'], False)

    async def check_token(self):
        # Refresh the OAuth token if an authenticated request comes back 401.
        async with self.session.get(
                LIVE_API_URL.format(type='ended', offset=0)) as resp:
            if resp.status == 401:
                self.client.refresh_token()

    @property
    def session(self):
        # Lazily create one shared aiohttp session carrying the auth headers.
        if self._session is None:
            self._session = aiohttp.ClientSession(
                headers=self.headers, loop=self.loop)
        return self._session

    async def convert_local_image(self, pic):
        # Download a remote image once and return its local path.
        pic_name = pic.split('/')[-1]
        path = os.path.join(IMAGE_FOLDER, pic_name)
        if not os.path.exists(path):
            async with self.session.get(pic) as resp:
                content = await resp.read()
            with open(path, 'wb') as f:
                f.write(content)
        return path

    def close(self):
        self.session.close()

    async def parse_zhuanlan_link(self, response):
        # Match zhuanlan (column) posts to existing Live records and attach
        # their cover images.
        posts = await response.json()

        if response.status == 200 and posts:
            for post in posts:
                cover = post['titleImage']
                if not cover:
                    continue
                s = Live.search()
                title = post['title']
                for sep in ('-', '—'):
                    if sep in title:
                        title = title.split(sep)[-1].strip()
                speaker_id = post['author']['hash']
                zid = post['url'].split('/')[-1]
                s = s.query(Q('match_phrase', subject=title))
                lives = await s.execute()
                for live in lives:
                    if live.speaker and live.speaker.speaker_id == speaker_id:
                        await self.update_live(zid, cover, live)
                        break
                else:
                    # No live matched by speaker: fall back to the live link
                    # embedded in the post body.
                    match = LIVE_REGEX.search(post['content'])
                    if match:
                        live_id = match.group(2)
                        try:
                            live = await Live.get(live_id)
                        except NotFoundError:
                            pass
                        else:
                            await self.update_live(zid, cover, live)

            return get_next_url(response.url)

    async def update_live(self, zid, cover, live):
        if live.id in self.seen_zhuanlan:
            return
        self.seen_zhuanlan.add(live.id)
        zhuanlan_url = ZHUANLAN_URL.format(zid)
        cover = await self.convert_local_image(cover)
        await live.update(cover=cover, zhuanlan_url=zhuanlan_url)

    async def parse_topic_link(self, response):
        rs = await response.json()
        if response.status == 200:
            rs['avatar_url'] = await self.convert_local_image(
                rs['avatar_url'].replace('_s', '_xl'))
            Topic.add_or_update(**flatten_live_dict(rs, TOPIC_KEYS))

    async def parse_live_link(self, response):
        # Flatten each live item in the listing into a Live document, creating
        # the speaker and queueing any unseen topics along the way.
        rs = await response.json()

        if response.status == 200:
            for live in rs['data']:
                speaker = live.pop('speaker')
                speaker_id = speaker['member']['id']
                speaker['member']['avatar_url'] = await self.convert_local_image(  # noqa
                    speaker['member']['avatar_url'].replace('_s', '_xl'))
                user = User.add(speaker_id=speaker_id,
                                **flatten_live_dict(speaker, SPEAKER_KEYS))
                live_dict = flatten_live_dict(live, LIVE_KEYS)
                topics = live_dict.pop('topics')
                for topic in topics:
                    topic_id = topic['id']
                    if topic_id not in self.seen_topics:
                        self.seen_topics.add(topic_id)
                        self.add_url(TOPIC_API_URL.format(topic_id),
                                     self.max_redirect)
                topics = [t['name'] for t in topics]
                tags = ' '.join(set(sum([(t['name'], t['short_name'])
                                         for t in live_dict.pop('tags')], ())))
                live_dict['speaker_id'] = user.id
                live_dict['speaker_name'] = user.name
                live_dict['topics'] = topics
                live_dict['topic_names'] = ' '.join(topics)
                live_dict['seats_taken'] = live_dict.pop('seats')['taken']
                live_dict['amount'] = live_dict.pop('fee')['amount'] / 100
                live_dict['status'] = live_dict['status'] == 'public'
                live_dict['tag_names'] = tags
                live_dict['starts_at'] = datetime.fromtimestamp(
                    live_dict['starts_at'])
                live_dict['live_suggest'] = gen_suggests(
                    live_dict['topic_names'], tags, live_dict['outline'],
                    user.name, live_dict['subject'])
                result = await Live.add(**live_dict)
                # Only count the live toward the speaker on first insertion.
                if result.meta['version'] == 1:
                    user.incr_live_count()

            paging = rs['paging']
            if not paging['is_end']:
                return paging['next']
        else:
            print('HTTP status_code is {}'.format(response.status))

    async def fetch(self, url, max_redirect):
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = await self.session.get(
                    url, allow_redirects=False)
                break
            except aiohttp.ClientError as client_error:
                exception = client_error
            tries += 1
        else:
            # All tries failed; give up on this URL.
            return

        try:
            if 'api.zhihu.com' in url:
                parse_func = (self.parse_topic_link if 'topics' in url
                              else self.parse_live_link)
                next_url = await parse_func(response)
            else:
                next_url = await self.parse_zhuanlan_link(response)
            print('{} has finished'.format(url))
            if next_url is not None:
                self.add_url(next_url, max_redirect)
            else:
                # No next page: mark the matching feed as exhausted.
                for type in self.__stopped:
                    if type in url:
                        self.__stopped[type] = True
        finally:
            response.release()

    async def work(self):
        try:
            while 1:
                url, max_redirect = await self.q.get()
                if url in self.seen_urls:
                    type = url.split('/')[-1].split('?')[0]
                    if not type.isdigit() and not self.__stopped[type]:
                        self.add_url(get_next_url(url), max_redirect)
                await self.fetch(url, max_redirect)
                self.q.task_done()
                await asyncio.sleep(1)
        except asyncio.CancelledError:
            pass

    def add_url(self, url, max_redirect=None):
        if max_redirect is None:
            max_redirect = self.max_redirect
        if url not in self.seen_urls:
            self.seen_urls.add(url)
            self.q.put_nowait((url, max_redirect))

    def add_zhuanlan_urls(self):
        for offset in range(self.max_tasks):
            self.add_url(ZHUANLAN_API_URL.format(offset=offset * 20))

    async def crawl(self):
        # Two phases: drain the live listing queue first, then crawl the
        # zhuanlan feeds that enrich the stored lives.
        await self.check_token()
        self.__workers = [asyncio.Task(self.work(), loop=self.loop)
                          for _ in range(self.max_tasks)]
        self.t0 = time.time()
        await self.q.join()
        self.add_zhuanlan_urls()
        await self.q.join()
        self.t1 = time.time()
        for w in self.__workers:
            w.cancel()
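# A minimal usage sketch (not part of the original module): one way the
# Crawler above could be driven from a script. It assumes the module-level
# imports and constants (asyncio, LIVE_API_URL, LIVE_TYPE, ZHUANLAN_API_URL,
# etc.) and the ZhihuClient credentials are already configured elsewhere.
def main():
    loop = asyncio.get_event_loop()
    crawler = Crawler(max_tries=4, max_tasks=10, loop=loop)
    try:
        loop.run_until_complete(crawler.crawl())
    finally:
        crawler.close()  # release the shared aiohttp session
        loop.close()
    if crawler.t1 is not None:
        print('Finished in {:.2f} seconds'.format(crawler.t1 - crawler.t0))


if __name__ == '__main__':
    main()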
def download_activs():
    # Log in to Zhihu
    s = ZhihuClient(USERNAME, PASSWD).get_session()
    # Add the authorization header for API access
    s.headers.update({'authorization': AUTH})
    download_activs_json(s, START_URL)
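# Illustrative sketch only: download_activs_json() is defined elsewhere in the
# project and its real body is not shown here. The hypothetical version below
# merely demonstrates the pagination pattern the Zhihu API responses above
# use (follow paging['next'] until paging['is_end'] is true).
def download_activs_json(session, url):
    while url:
        rs = session.get(url).json()
        # ... persist rs['data'] here ...
        paging = rs.get('paging', {})
        url = None if paging.get('is_end') else paging.get('next')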