def cache_html(url, name, attempts=1):
    """Fetch *url*, cache the response body under CACHED_FOLDER/name, and return the bytes.

    If the response looks like a captcha page, sleeps for a linearly growing
    interval (TIMEOUT_SEC * attempts) and retries recursively, giving up after
    MAX_GET_ATTEMPTS tries.

    Args:
        url: URL to GET.
        name: file name for the cached copy inside CACHED_FOLDER.
        attempts: current attempt number; internal to the retry recursion,
            callers should leave it at the default.

    Returns:
        The raw response body as ``bytes``.

    Raises:
        TimeoutError: after MAX_GET_ATTEMPTS captcha-blocked attempts.
    """
    if attempts > MAX_GET_ATTEMPTS:
        logger.critical(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
        raise TimeoutError(f'Tried {MAX_GET_ATTEMPTS} times to get URL {url}')
    logger.info(f'GET: {url}')
    if attempts > 1:
        logger.info(f'attempt: {attempts}')
    site = requests.get(url, headers=HEADERS())
    # NOTE(review): site.content (raw bytes) is what gets cached, so this
    # encoding only affects any site.text access — confirm it is still needed.
    site.encoding = 'utf-8'
    if is_captcha(site.content):
        logger.warning(f'Captcha received for url: {url}')
        logger.warning(f'sleeping for {TIMEOUT_SEC * attempts}s...')
        sleep(TIMEOUT_SEC * attempts)
        return cache_html(url, name, attempts=attempts + 1)
    # Ensure the cache directory exists up front instead of catching
    # FileNotFoundError and repeating the write: parents=True also creates
    # missing intermediate directories, and exist_ok=True avoids a
    # FileExistsError race when two callers create it concurrently.
    Path(CACHED_FOLDER).mkdir(parents=True, exist_ok=True)
    with open(Path(CACHED_FOLDER, name), 'wb') as out:
        out.write(site.content)
    logger.info(f'Cache name: {name}')
    return site.content
async def fetch(session, url):
    """GET *url* through *session* and return the decoded response body."""
    async with session.get(url, headers=HEADERS()) as response:
        body = await response.text()
        return body