import os
from functools import partial

import pytest
from requests_file import FileAdapter
from requests_html import AsyncHTMLSession


@pytest.fixture
def async_get(event_loop):
    """
    The AsyncHTMLSession cannot be created globally, since that would bind
    it to a different event loop than pytest-asyncio's.
    """
    async_session = AsyncHTMLSession()
    async_session.mount('file://', FileAdapter())
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
    return partial(async_session.get, url)
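# Hedged usage sketch (not from the source): a pytest-asyncio test consuming
# the fixture above. async_get returns a partial over session.get, so calling
# it yields a coroutine to await; requests_file's FileAdapter answers file://
# URLs with status 200 when the file is readable.
@pytest.mark.asyncio
async def test_async_get(async_get):
    r = await async_get()
    assert r.status_code == 200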
# Module-level imports this Scrapper method relies on:
from requests import adapters
from requests.exceptions import RequestException
from requests_html import AsyncHTMLSession, HTMLResponse


async def __get_vm(self, stop: str) -> HTMLResponse:
    # 'przystanek' is Polish for '(bus) stop'; it is the query parameter the
    # target site expects, so the identifier is kept as-is.
    url = '{}/?przystanek={}'.format(Scrapper.VM_URL, stop)
    session = AsyncHTMLSession()
    adapter = adapters.HTTPAdapter(max_retries=5)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    result = None
    # Rotate through proxies until one request succeeds.
    while not isinstance(result, HTMLResponse):
        proxy = self.__get_next_proxy()
        try:
            result = await session.get(url,
                                       proxies=Scrapper.__proxies(proxy),
                                       timeout=Scrapper.GET_TIMEOUT)
        except RequestException:
            if len(self.proxies) == 1:
                raise Exception('no working proxy available')
    await session.close()
    return result
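# Hedged sketch (an assumption, not shown in the source) of the two Scrapper
# helpers the method above relies on: __proxies builds the scheme -> address
# mapping requests expects, and __get_next_proxy rotates over the proxy list.
from itertools import cycle


class Scrapper:  # illustrative fragment only; the real class holds more state
    @staticmethod
    def __proxies(proxy: str) -> dict:
        # requests routes both schemes through the same proxy address.
        return {'http': proxy, 'https': proxy}

    def __get_next_proxy(self) -> str:
        # self._proxy_cycle is a hypothetical itertools.cycle(self.proxies)
        # set up in __init__, used here only to illustrate the round-robin.
        return next(self._proxy_cycle)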
import os
import re
import json
from dataclasses import dataclass
from itertools import chain
from multiprocessing import cpu_count, Pool, Manager, Queue, TimeoutError
from urllib.parse import urljoin, urlparse

import requests
from requests_html import AsyncHTMLSession, HTML

from site_feature import SiteFeatureTransformer
from utils import PASS_DOMAIN, geuss_link_url, rm_slash, has_url_html_been_fetched

# One shared async session with an enlarged connection pool, so many
# concurrent requests can reuse keep-alive connections.
asession = AsyncHTMLSession()
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
asession.mount('http://', adapter)
asession.mount('https://', adapter)


def save_html(domain, html):
    # Persist the fetched page under html/<domain>.html.
    path = os.path.join('html', f'{domain}.html')
    with open(path, 'w') as f:
        f.write(html)


def get_data(urls, is_zh_i9t_blog=False):
    res = get_frineds_and_res(urls, is_zh_i9t_blog)
    data = []
    for url, friends, r in res:
        site_feature = SiteFeatureTransformer(r=r, url=url,
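# Hedged usage sketch (not from the source): fetching a batch of pages through
# the pooled session defined above. AsyncHTMLSession.run accepts zero-argument
# callables that return coroutines and gathers them on its own event loop;
# note that result order is not guaranteed to match input order.
from functools import partial


def fetch_batch(urls):
    return asession.run(*(partial(asession.get, url) for url in urls))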