class Anicobin(Collector):
    def __init__(self, reporter, waiter, outdir, useragent) -> None:
        super(Anicobin, self).__init__()
        self.reporter: Reporter = reporter
        self.waiter = waiter
        self.outdir = outdir
        self.useragent = useragent
        self.cacher = Cacher(self.outdir)
        self.semaphore = Semaphore(2)

    async def get(self, url):
        # Return the page HTML, preferring the on-disk cache over the network.
        filename = urllib.parse.quote(url, safe='') + '.html'
        cache, _ = self.cacher.get(filename)
        if cache:
            html = cache
        else:
            await self.waiter.wait(url)
            print('fetching', url)
            async with aiohttp.request(
                    'get', url,
                    headers={'user-agent': self.useragent}) as req:
                content = await req.read()
                html = content.decode(SITE_ENCODING)
                self.cacher.set(filename, html)
        return html

    async def collect(self, base_url, queue_size=3):
        async def f(page):
            # One listing page: visit every post on it and queue downloads
            # for images that are not cached yet.
            print(page)
            html, _ = await self.async_retry(3, self.get, f'{base_url}?p={page}')
            result = []
            for post_url in await self.run_in_executor(get_post_urls, html):
                _html = await self.get(post_url)
                urls = get_pict_urls(_html)
                for url in urls:
                    filename = urllib.parse.quote(url, safe='')
                    content, _ = self.cacher.get(filename, binary=True)
                    if not content:
                        await self.add_future(
                            'dlimage',
                            download_file(url, filename, self.cacher))
                result.extend(urls)
            # True while the listing page still yields image URLs.
            return len(result) > 0

        await self.queued_paging(1, 1000, lambda page: f(page),
                                 queue_size=queue_size)
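# Usage sketch: a minimal driver for Anicobin. The Reporter and Waiter
# instances are assumed to be constructed elsewhere, and the base URL,
# output directory and user-agent values below are placeholders; only the
# constructor and collect() signatures come from the class above.
async def _example_collect_anicobin(reporter: Reporter, waiter: Waiter) -> None:
    collector = Anicobin(reporter, waiter,
                         outdir='cache/anicobin',
                         useragent='example-crawler/0.1')
    # Walks listing pages ?p=1, ?p=2, ... and queues image downloads.
    await collector.collect('https://example-blog.invalid/')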
class Keiba(Collector):
    def __init__(self, reporter, waiter, outdir, useragent) -> None:
        super(Keiba, self).__init__()
        self.reporter: Reporter = reporter
        self.waiter = waiter
        self.outdir = outdir
        self.useragent = useragent
        self.cacher = Cacher(self.outdir)
        self.semaphore = Semaphore(2)

    # Fetch page n of a db.netkeiba.com search. The default `options` dict
    # documents the expected query keys; callers pass real string values.
    async def get_search_page(
            self,
            n: int,
            options: dict = {
                'pid': str,
                'word': str,
                'track[]': str,
                'start_year': str,
                'start_mon': str,
                'end_year': str,
                'end_mon': str,
                'jyo[]': str,
                'kyori_min': str,
                'kyori_max': str,
                'sort': str,
                'list': str,
            }):
        url = 'https://db.netkeiba.com/'
        psuedo_url = f'{url}?{urllib.parse.urlencode(options)}&page=1'
        filename = urllib.parse.quote(psuedo_url + '.html', safe='')
        cache, _ = self.cacher.get(filename)
        if cache:
            search_result = cache
        else:
            await self.waiter.wait(psuedo_url)
            print('fetching', psuedo_url)
            async with aiohttp.request(
                    'post',
                    url=url,
                    headers={
                        'content-type': 'application/x-www-form-urlencoded',
                        'user-agent': self.useragent
                    },
                    data=urllib.parse.urlencode(options)) as req:
                content = await req.read()
                search_result = content.decode(SITE_ENCODING)
                if str(req.url) == url:
                    self.cacher.set(filename, search_result)
                else:
                    print(f'Warning: redirected to {str(req.url)}')
                    self.cacher.set(
                        urllib.parse.quote(str(req.url), safe='') + '.html',
                        search_result)
                    return None
        if n == 1:
            return search_result
        else:
            # Later pages are fetched by re-POSTing the paging form data
            # extracted from the first result page.
            data = await self.run_in_executor(get_nextpage_data, search_result)
            data['page'] = str(n)
            psuedo_url = f'{url}?{urllib.parse.urlencode(options)}&page={n}'
            filename = urllib.parse.quote(psuedo_url + '.html', safe='')
            cache, _ = self.cacher.get(filename)
            if cache:
                result = cache
            else:
                await self.waiter.wait(psuedo_url)
                print('fetching', psuedo_url)
                async with aiohttp.request(
                        'post',
                        url=url,
                        headers={
                            'content-type': 'application/x-www-form-urlencoded',
                            'user-agent': self.useragent
                        },
                        data=urllib.parse.urlencode(
                            data, encoding=SITE_ENCODING)) as req:
                    try:
                        content = await req.read()
                        result = content.decode(SITE_ENCODING)
                        self.cacher.set(filename, result)
                    except Exception as e:
                        print('get_tail', e)
                        return None
            return result

    async def get_race_page(self, url):
        filename = urllib.parse.quote(url, safe='') + '.html'
        cache, _ = self.cacher.get(filename, ext='')
        if cache:
            html = cache
        else:
            await self.waiter.wait(url)
            print('fetching', url)
            async with aiohttp.request(
                    'get', url,
                    headers={'user-agent': self.useragent}) as req:
                content = await req.read()
                html = content.decode(SITE_ENCODING)
                self.cacher.set(filename, html)
        return html

    async def collect(self, year, queue_size=3):
        async def f(page):
            print(page)
            html, _ = await self.async_retry(
                3, self.get_search_page, page, {
                    'pid': 'race_list',
                    'start_year': str(year),
                    'end_year': str(year),
                    'sort': 'date',
                    'list': '100'
                })
            # Keep paging while a full page of 100 race URLs was found.
            return len([
                await self.add_future('get_race', self.get_race_page(race_url))
                for race_url in await self.run_in_executor(get_race_urls, html)
            ]) == 100 if html else False

        await self.queued_paging(1, 1000, lambda page: f(page),
                                 queue_size=queue_size)

    async def collect_horse(self, year, queue_size=3):
        async def f(page):
            print(page)
            html, error = await self.async_retry(
                3, self.get_search_page, page, {
                    'pid': 'horse_list',
                    'list': '100',
                    'birthyear': year,
                })
            if error:
                print('Warning: max retries exceeded')
                return False
            # Keep paging while a full page of 100 horse URLs was found.
            return (len(await self.run_in_executor(get_horse_urls, html)) == 100
                    if html else False)

        await self.queued_paging(1, 1000, lambda page: f(page),
                                 queue_size=queue_size)
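# Usage sketch: driving the Keiba collector for one year of race data. The
# Reporter and Waiter are assumed to be built elsewhere; the year, output
# directory and user-agent values are placeholders.
async def _example_collect_keiba(reporter: Reporter, waiter: Waiter) -> None:
    collector = Keiba(reporter, waiter,
                      outdir='cache/keiba',
                      useragent='example-crawler/0.1')
    # Pages through the race search results for the year and queues a
    # get_race_page() fetch for every race URL found.
    await collector.collect(2020)
    # Optionally walk the horse listing for the same birth year as well.
    await collector.collect_horse(2020)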
class WearCollector(Collector):
    def __init__(self,
                 reporter: Reporter,
                 waiter: Waiter,
                 outdir: str,
                 useragent: str = ''):
        super(WearCollector, self).__init__()
        self.reporter: Reporter = reporter
        self.waiter = waiter
        self.outdir = outdir
        self.useragent = useragent
        self.cacher = Cacher(self.outdir)
        # Limits the number of concurrent asynchronous connections
        self.semaphore = Semaphore(2)
        # File downloader
        self.downloader = Downloader(self.waiter, self.semaphore, self.reporter)

    async def download_user_page(self, url: str, page_num):
        url = url + f'?pageno={page_num}'
        # Use the cache if present
        filename = urllib.parse.quote(url, safe='') + '.html'
        content, info = self.cacher.get(filename)
        if content and info:
            html = content
            realurl = info.get('realurl')
            self.reporter.report(INFO, f'use cache {url}')
        else:
            await self.waiter.wait(url)
            async with self.semaphore:
                self.reporter.report(INFO, f'fetching {url}', type=NETWORK)
                async with aiohttp.request(
                        'get', url,
                        headers={'user-agent': self.useragent}) as res:
                    html = await res.text()
                    realurl = str(res.url)
                    self.cacher.set(filename, html, {
                        'status': res.status,
                        'realurl': realurl
                    })
        # Termination condition: past the first page, a final URL without
        # '?pageno' means there are no more pages.
        if page_num >= 2 and realurl.count('?pageno') == 0:
            return False
        else:
            for url, data in await self.run_in_executor(parse_user, html):
                await self.add_future(
                    'gallery',
                    self.gallery_collector(url, 1, 501, userdata=data))
            return True

    async def user_collector(self, url: str, pagestart: int, pageend: int):
        await self.queued_paging(
            pagestart, pageend,
            lambda page: self.download_user_page(url, page))

    async def download_gallery_page(self, url: str, page_num: int, userdata=None):
        url = url + f'?pageno={page_num}'
        filename = urllib.parse.quote(url, safe='') + '.html'
        content, info = self.cacher.get(filename)
        if content and info:
            html = content
            realurl = info.get('realurl')
            self.reporter.report(INFO, f'use cache {url}')
        else:
            await self.waiter.wait(url)
            async with self.semaphore:
                self.reporter.report(INFO, f'fetching {url}', type=NETWORK)
                async with aiohttp.request(
                        'get', url,
                        headers={'user-agent': self.useragent}) as res:
                    html = await res.text()
                    realurl = str(res.url)
                    self.cacher.set(filename, html, {
                        'status': res.status,
                        'realurl': realurl
                    })
        # Termination condition (same as download_user_page)
        if page_num >= 2 and realurl.count('?pageno') == 0:
            return False
        else:
            for url, data in await self.run_in_executor(
                    parse_gallely, html, userdata):
                imagefile = urllib.parse.quote(url, safe='')
                tmp_save(os.path.join(self.outdir, imagefile + '.json'),
                         json.dumps(data))
                imagepath = os.path.join(self.outdir, imagefile)
                if not os.path.exists(imagepath):
                    await self.add_future(
                        'image',
                        self.downloader.download_file(
                            url, imagepath,
                            headers={'user-agent': self.useragent}))
            return True

    async def gallery_collector(self,
                                url: str,
                                pagestart: int,
                                pageend: int,
                                userdata=None):
        await self.queued_paging(
            pagestart, pageend,
            lambda page: self.download_gallery_page(
                url, page, userdata=userdata))
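# Usage sketch: crawling a listing with WearCollector. The Reporter and
# Waiter are assumed to be built elsewhere; the listing URL, page range,
# output directory and user-agent values are placeholders.
async def _example_collect_wear(reporter: Reporter, waiter: Waiter) -> None:
    collector = WearCollector(reporter, waiter,
                              outdir='cache/wear',
                              useragent='example-crawler/0.1')
    # Walks the listing starting at ?pageno=1; each user page queues a
    # gallery_collector() run, which in turn queues image downloads.
    await collector.user_collector('https://example.invalid/ranking/', 1, 501)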