async def fetch_url(url):
    async with aiohttp.ClientSession(
            headers=headers,
            connector=aiohttp.TCPConnector(ssl=False)) as session:
        async with session.get(url) as response:
            return await response.text()
def test_dont_recreate_ssl_context(loop):
    conn = aiohttp.TCPConnector(loop=loop)
    ctx = conn.ssl_context
    assert ctx is conn.ssl_context
def session(self) -> aiohttp.ClientSession:
    # Create the connector only when a new session is actually needed;
    # otherwise a fresh TCPConnector would be built and leaked on every call.
    if self._session is None:
        connector = aiohttp.TCPConnector(ssl=self._ssl)
        self._session = aiohttp.ClientSession(
            connector=connector, timeout=self._timeout)
    return self._session
def test_tcp_connector_ctor_fingerprint_valid(loop):
    valid = b'\xa2\x06G\xad\xaa\xf5\xd8\\J\x99^by;\x06='
    conn = aiohttp.TCPConnector(loop=loop, fingerprint=valid)
    assert conn.fingerprint == valid
def test_tcp_connector_clear_dns_cache_bad_args(loop):
    conn = aiohttp.TCPConnector(loop=loop)
    with pytest.raises(ValueError):
        conn.clear_dns_cache('localhost')
async def on_connect(self):
    await cleanup()
    global session
    connector = aiohttp.TCPConnector(limit=60)
    session = aiohttp.ClientSession(connector=connector)
def request(method, url, *,
            params=None,
            data=None,
            headers=None,
            skip_auto_headers=None,
            cookies=None,
            auth=None,
            allow_redirects=True,
            max_redirects=10,
            encoding='utf-8',
            version=None,
            compress=None,
            chunked=None,
            expect100=False,
            connector=None,
            loop=None,
            read_until_eof=True,
            request_class=None,
            response_class=None,
            proxy=None,
            proxy_auth=None):
    """Constructs and sends a request. Returns response object.

    method - HTTP method
    url - request url
    params - (optional) Dictionary or bytes to be sent in the query
      string of the new request
    data - (optional) Dictionary, bytes, or file-like object to
      send in the body of the request
    headers - (optional) Dictionary of HTTP Headers to send with
      the request
    cookies - (optional) Dict object to send with the request
    auth - (optional) aiohttp.helpers.BasicAuth named tuple representing
      HTTP Basic Auth
    allow_redirects - (optional) If set to False, do not follow redirects
    version - Request HTTP version.
    compress - Set to True if request has to be compressed
      with deflate encoding.
    chunked - Set to chunk size for chunked transfer encoding.
    expect100 - Expect 100-continue response from server.
    connector - BaseConnector sub-class instance to support
      connection pooling.
    read_until_eof - Read response until eof if response
      does not have Content-Length header.
    request_class - (optional) Custom Request class implementation.
    response_class - (optional) Custom Response class implementation.
    loop - Optional event loop.

    Usage::

      >>> import aiohttp
      >>> resp = yield from aiohttp.request('GET', 'http://python.org/')
      >>> resp
      <ClientResponse(python.org/) [200]>
      >>> data = yield from resp.read()

    """
    warnings.warn("Use ClientSession().request() instead",
                  DeprecationWarning)
    if connector is None:
        connector = aiohttp.TCPConnector(loop=loop, force_close=True)

    kwargs = {}

    if request_class is not None:
        kwargs['request_class'] = request_class

    if response_class is not None:
        kwargs['response_class'] = response_class

    session = ClientSession(loop=loop,
                            cookies=cookies,
                            connector=connector,
                            **kwargs)

    return _DetachedRequestContextManager(
        session._request(method, url,
                         params=params,
                         data=data,
                         headers=headers,
                         skip_auto_headers=skip_auto_headers,
                         auth=auth,
                         allow_redirects=allow_redirects,
                         max_redirects=max_redirects,
                         encoding=encoding,
                         version=version,
                         compress=compress,
                         chunked=chunked,
                         expect100=expect100,
                         read_until_eof=read_until_eof,
                         proxy=proxy,
                         proxy_auth=proxy_auth,),
        session=session)
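The deprecation warning above names the replacement API. For reference, a minimal sketch of the same call through a session, assuming aiohttp 3.x and Python 3.7+ for asyncio.run (this sketch is not part of the original example):

import asyncio
import aiohttp

async def main():
    # One long-lived session per application, reused across requests.
    async with aiohttp.ClientSession() as session:
        async with session.get('http://python.org/') as resp:
            print(resp.status)
            data = await resp.read()

asyncio.run(main())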
def __init__(self, endpoint, *, loop):
    self._endpoint = endpoint
    self._session = aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(use_dns_cache=True, loop=loop),
        loop=loop)
    self._base_url = 'http://{0.host}:{0.port}/'.format(endpoint)
async def init():
    bot.session = aiohttp.ClientSession(connector=aiohttp.TCPConnector(
        family=socket.AF_INET))
def request(method, url, *,
            params=None,
            data=None,
            headers=None,
            cookies=None,
            files=None,
            auth=None,
            allow_redirects=True,
            max_redirects=10,
            encoding='utf-8',
            version=aiohttp.HttpVersion11,
            compress=None,
            chunked=None,
            expect100=False,
            connector=None,
            loop=None,
            read_until_eof=True,
            request_class=None,
            response_class=None):
    """Constructs and sends a request. Returns response object.

    :param str method: http method
    :param str url: request url
    :param params: (optional) Dictionary or bytes to be sent in the query
      string of the new request
    :param data: (optional) Dictionary, bytes, or file-like object to
      send in the body of the request
    :param dict headers: (optional) Dictionary of HTTP Headers to send with
      the request
    :param dict cookies: (optional) Dict object to send with the request
    :param auth: (optional) BasicAuth named tuple representing HTTP Basic Auth
    :type auth: aiohttp.helpers.BasicAuth
    :param bool allow_redirects: (optional) Set to True if POST/PUT/DELETE
      redirect following is allowed.
    :param version: Request http version.
    :type version: aiohttp.protocol.HttpVersion
    :param bool compress: Set to True if request has to be compressed
      with deflate encoding.
    :param chunked: Set to chunk size for chunked transfer encoding.
    :type chunked: bool or int
    :param bool expect100: Expect 100-continue response from server.
    :param connector: BaseConnector sub-class instance to support
      connection pooling and session cookies.
    :type connector: aiohttp.connector.BaseConnector
    :param bool read_until_eof: Read response until eof if response
      does not have Content-Length header.
    :param request_class: (optional) Custom Request class implementation.
    :param response_class: (optional) Custom Response class implementation.
    :param loop: Optional event loop.

    Usage::

      >>> import aiohttp
      >>> resp = yield from aiohttp.request('GET', 'http://python.org/')
      >>> resp
      <ClientResponse(python.org/) [200]>
      >>> data = yield from resp.read()

    """
    redirects = 0
    method = method.upper()
    if loop is None:
        loop = asyncio.get_event_loop()
    if request_class is None:
        request_class = ClientRequest
    if connector is None:
        connector = aiohttp.TCPConnector(force_close=True, loop=loop)

    while True:
        req = request_class(
            method, url, params=params, headers=headers, data=data,
            cookies=cookies, files=files, encoding=encoding,
            auth=auth, version=version, compress=compress, chunked=chunked,
            loop=loop, expect100=expect100, response_class=response_class)

        try:
            conn = yield from connector.connect(req)

            resp = req.send(conn.writer, conn.reader)
            try:
                yield from resp.start(conn, read_until_eof)
            except:
                resp.close()
                conn.close()
                raise
        except aiohttp.BadStatusLine as exc:
            raise aiohttp.ClientConnectionError(exc)
        except OSError as exc:
            raise aiohttp.OsConnectionError(exc)

        # redirects
        if resp.status in (301, 302, 303, 307) and allow_redirects:
            redirects += 1
            if max_redirects and redirects >= max_redirects:
                resp.close(force=True)
                break

            # For 301 and 302, mimic IE behaviour, now changed in RFC.
            # Details: https://github.com/kennethreitz/requests/pull/269
            if resp.status != 307:
                method = 'GET'
                data = None
            cookies = resp.cookies

            r_url = resp.headers.get('LOCATION') or resp.headers.get('URI')

            scheme = urllib.parse.urlsplit(r_url)[0]
            if scheme not in ('http', 'https', ''):
                resp.close(force=True)
                raise ValueError('Can redirect only to http or https')
            elif not scheme:
                r_url = urllib.parse.urljoin(url, r_url)

            url = urllib.parse.urldefrag(r_url)[0]
            if url:
                yield from asyncio.async(resp.release(), loop=loop)
                continue

        break

    return resp
def get_outgoing_mappings(self, cfg):
    """Reads the outgoing webhook definitions from the config file.
    This also sets up the HTTP client session for each webhook."""
    bridges = cfg['bridges']
    for bridge in bridges:
        if 'outgoing_webhooks' not in bridge:
            # No outgoing webhooks in this bridge.
            continue

        outgoing_webhooks = bridge['outgoing_webhooks']
        xmpp_endpoints = bridge['xmpp_endpoints']

        # Check whether all normal messages to this bridge should be
        # relayed.
        relay_all_normal = False
        for xmpp_endpoint in xmpp_endpoints:
            if ('relay_all_normal' in xmpp_endpoint
                    and xmpp_endpoint['relay_all_normal'] is True):
                relay_all_normal = True
                break

        for outgoing_webhook in outgoing_webhooks:
            if 'url' not in outgoing_webhook:
                raise InvalidConfigError("Error in config file: "
                                         "'url' is missing from an "
                                         "outgoing webhook definition.")

            # Set up SSL context for certificate pinning.
            if 'cafile' in outgoing_webhook:
                cafile = os.path.abspath(outgoing_webhook['cafile'])
                sslcontext = ssl.create_default_context(cafile=cafile)
                conn = aiohttp.TCPConnector(ssl_context=sslcontext)
                session = aiohttp.ClientSession(loop=self.loop,
                                                connector=conn)
            else:
                session = aiohttp.ClientSession(loop=self.loop)
            # TODO: Handle ConnectionRefusedError.
            outgoing_webhook['session'] = session

            if relay_all_normal:
                self.outgoing_mappings['all_normal'].append(
                    outgoing_webhook)

            for xmpp_endpoint in xmpp_endpoints:
                # Determine whether the JID corresponds to a MUC or a
                # normal chat:
                if 'muc' in xmpp_endpoint:
                    if xmpp_endpoint['muc'] not in self.mucs:
                        raise InvalidConfigError(
                            "Error in config file: XMPP MUC '{}' was not "
                            "defined in the xmpp.mucs section.".format(
                                xmpp_endpoint['muc']))
                    self.outgoing_mappings[xmpp_endpoint['muc']].append(
                        outgoing_webhook)
                elif 'normal' in xmpp_endpoint:
                    if relay_all_normal:
                        # Don't add normal JIDs when all normal messages
                        # are relayed anyway.
                        continue
                    self.outgoing_mappings[xmpp_endpoint['normal']].append(
                        outgoing_webhook)
async def get(url, params=None, headers=None):
    connector = aiohttp.TCPConnector(verify_ssl=False)
    # ClientSession must be entered with "async with", not plain "with".
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url, params=params, headers=headers) as resp:
            return await resp.text()
async def get(url):
    async with aiohttp.ClientSession(
            connector=aiohttp.TCPConnector(ssl=False)) as session:
        async with session.get(url) as res:
            return await res.text()
def default_session():
    connector = aiohttp.TCPConnector(limit=None, verify_ssl=False)
    session = aiohttp.ClientSession(connector=connector)
    return session
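On aiohttp 3.x the same session would be spelled slightly differently: verify_ssl= was deprecated in favor of the single ssl= parameter, and an unlimited pool is requested with limit=0 rather than limit=None. A minimal sketch of the equivalent, assuming aiohttp 3.x:

import aiohttp

def default_session():
    # limit=0 disables the connection limit; ssl=False disables
    # certificate verification (use with care).
    connector = aiohttp.TCPConnector(limit=0, ssl=False)
    return aiohttp.ClientSession(connector=connector)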
def _craft_aiohttp_connector(context):
    return aiohttp.TCPConnector()
async def main(signals=None, site_settings=None):
    template_path = TemplatePathSettings().template_path

    if site_settings is None:
        site_settings = SiteSettings()

    if not site_settings.check_if_valid():
        logger.critical("Settings are not correctly configured. "
                        "Please run 'python main.py --help' for more info. "
                        "Exiting...")
        return

    ssl_context = ssl.create_default_context(cafile=certifi.where())
    conn = aiohttp.TCPConnector(ssl=ssl_context,
                                limit=site_settings.conn_limit,
                                limit_per_host=site_settings.conn_limit_per_host)

    async with monitor.MonitorSession(signals=signals,
                                      raise_for_status=True,
                                      connector=conn,
                                      timeout=aiohttp.ClientTimeout(30)) as session:
        logger.debug(f"Loading template: {template_path}")

        queue = unique_queue.UniqueQueue()
        producers = []
        cancellable_pool = CancellablePool()

        template_file = os.path.join(os.path.dirname(__file__), template_path)
        template = template_parser.Template(path=template_file, signals=signals)
        try:
            template.load()
        except Exception as e:
            logger.critical(f"A critical error occurred while parsing the template."
                            f" {type(e).__name__}: {e}. Exiting...", exc_info=True)
            return

        await template.run_root(producers, session, queue,
                                site_settings=site_settings,
                                cancellable_pool=cancellable_pool)

        user_statistic = asyncio.ensure_future(
            async_user_statistics(session, site_settings.username))

        logger.debug("Checking for update")
        latest_version = await async_get_latest_version(session)
        if latest_version != VERSION:
            logger.info(f"A new update is available. Update with 'git pull'."
                        f" New version: {latest_version}. Current version: {VERSION}")

        logger.debug("Starting consumers")
        consumers = [asyncio.ensure_future(downloader.download_files(session, queue))
                     for _ in range(20)]

        logger.debug("Gathering producers")
        await asyncio.gather(*producers)

        logger.debug("Waiting for queue")
        num_unfinished_downloads = queue.qsize() + queue._unfinished_tasks
        if num_unfinished_downloads:
            logger.info(f"Waiting for {num_unfinished_downloads} "
                        "potential download(s) to finish")
        await queue.join()

        logger.debug("Cancel consumers")
        for c in consumers:
            c.cancel()

        cancellable_pool.shutdown()
        await user_statistic
def __init__(self, ad, name, logger, error, loglevel, args):
    self.AD = ad
    self.logger = logger
    self.error = error
    self.stopping = False
    self.config = args
    self.loglevel = loglevel
    self.ws = None
    self.reading_messages = False
    self.name = name

    self.log("INFO", "HASS Plugin Initializing")

    if "namespace" in args:
        self.namespace = args["namespace"]
    else:
        self.namespace = "default"

    if "verbose" in args:
        self.verbose = args["verbose"]
    else:
        self.verbose = False

    if "ha_key" in args:
        self.ha_key = args["ha_key"]
    else:
        self.ha_key = ""

    if "ha_url" in args:
        self.ha_url = args["ha_url"]
    else:
        self.log("WARN", "ha_url not found in HASS configuration - "
                         "module not initialized")

    if "cert_path" in args:
        self.cert_path = args["cert_path"]
    else:
        self.cert_path = None

    if "timeout" in args:
        self.timeout = args["timeout"]
    else:
        self.timeout = None

    if "cert_verify" in args:
        self.cert_verify = args["cert_verify"]
    else:
        self.cert_verify = True

    if "commtype" in args:
        self.commtype = args["commtype"]
    else:
        self.commtype = "WS"

    #
    # Set up HTTP Client
    #
    conn = aiohttp.TCPConnector()
    self.session = aiohttp.ClientSession(connector=conn)

    self.log("INFO", "HASS Plugin initialization complete")
def init_session(self, loop):
    connector = aiohttp.TCPConnector(ttl_dns_cache=300)
    self._session = aiohttp.ClientSession(connector=connector, loop=loop)
    return self._session
def __init__(self, *, connector=None, loop=None, cookies=None,
             headers=None, skip_auto_headers=None,
             auth=None, request_class=ClientRequest,
             response_class=ClientResponse,
             ws_response_class=ClientWebSocketResponse,
             version=aiohttp.HttpVersion11,
             cookie_jar=None, read_timeout=None, time_service=None):

    implicit_loop = False
    if loop is None:
        if connector is not None:
            loop = connector._loop
        else:
            implicit_loop = True
            loop = asyncio.get_event_loop()

    if connector is None:
        connector = aiohttp.TCPConnector(loop=loop)

    if connector._loop is not loop:
        raise RuntimeError(
            "Session and connector have to use the same event loop")

    self._loop = loop

    # Default, so the attribute is defined even outside debug mode.
    self._source_traceback = None
    if loop.get_debug():
        self._source_traceback = traceback.extract_stack(sys._getframe(1))

    if implicit_loop and not loop.is_running():
        warnings.warn("Creating a client session outside of coroutine is "
                      "a very dangerous idea",
                      ResourceWarning,
                      stacklevel=2)
        context = {'client_session': self,
                   'message': 'Creating a client session outside '
                              'of coroutine'}
        if self._source_traceback is not None:
            context['source_traceback'] = self._source_traceback
        loop.call_exception_handler(context)

    if cookie_jar is None:
        cookie_jar = CookieJar(loop=loop)
    self._cookie_jar = cookie_jar

    if cookies is not None:
        self._cookie_jar.update_cookies(cookies)

    self._connector = connector
    self._default_auth = auth
    self._version = version
    self._read_timeout = read_timeout

    # Convert to list of tuples
    if headers:
        headers = CIMultiDict(headers)
    else:
        headers = CIMultiDict()
    self._default_headers = headers

    if skip_auto_headers is not None:
        self._skip_auto_headers = frozenset([istr(i)
                                             for i in skip_auto_headers])
    else:
        self._skip_auto_headers = frozenset()

    self._request_class = request_class
    self._response_class = response_class
    self._ws_response_class = ws_response_class

    if time_service is not None:
        self._time_service_owner = False
        self._time_service = time_service
    else:
        self._time_service_owner = True
        self._time_service = TimeService(self._loop)
async def connect(self):
    if self.session is None:
        self.session = aiohttp.ClientSession(
            connector=aiohttp.TCPConnector(ssl=False))
def test_default_use_dns_cache(loop):
    conn = aiohttp.TCPConnector(loop=loop)
    assert conn.use_dns_cache
def request_timeout(self, timeout):
    self._request_timeout = timeout
    self.aiohttp_clientsession = aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=self.request_batch_size * 2),
        timeout=aiohttp.ClientTimeout(total=timeout))
def test_tcp_connector_fingerprint_invalid(loop):
    invalid = b'\x00'
    with pytest.raises(ValueError):
        aiohttp.TCPConnector(loop=loop, fingerprint=invalid)
import asyncio
import aiohttp
import async_timeout
import atexit
import re
import json

from .. import exception
from ..api import _methodurl, _which_pool, _fileurl, _guess_filename

_loop = asyncio.get_event_loop()
_pools = {
    'default': aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=10), loop=_loop)
}

_timeout = 30
_proxy = None  # (url, (username, password))


def set_proxy(url, basic_auth=None):
    global _proxy
    if not url:
        _proxy = None
    else:
        _proxy = (url, basic_auth) if basic_auth else (url,)


def _close_pools():
    global _pools
    for s in _pools.values():
        s.close()  # close each pooled ClientSession at exit
def test_ambigous_verify_ssl_and_ssl_context(loop):
    with pytest.raises(ValueError):
        aiohttp.TCPConnector(
            verify_ssl=False,
            ssl_context=ssl.SSLContext(ssl.PROTOCOL_SSLv23),
            loop=loop)
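The test above pins down that verify_ssl and ssl_context cannot be combined. On aiohttp 3.x the single ssl= parameter covers both intents, which removes the ambiguity; a short sketch, assuming aiohttp 3.x:

import ssl
import aiohttp

ctx = ssl.create_default_context()
conn_custom = aiohttp.TCPConnector(ssl=ctx)      # verify against a custom context
conn_insecure = aiohttp.TCPConnector(ssl=False)  # disable verification entirely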
def _create_onetime_pool():
    return aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(limit=1, force_close=True),
        loop=_loop)
def test_respect_precreated_ssl_context(loop):
    ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    conn = aiohttp.TCPConnector(loop=loop, ssl_context=ctx)
    assert ctx is conn.ssl_context
def __init__(
        self,
        info: credentials.ConnectionInfo,
) -> None:
    super().__init__()

    # Some SSL data are not accepted directly, so we have to use temp files.
    tempfiles = _TempFiles()
    ca_path: Optional[str]
    certificate_path: Optional[str]
    private_key_path: Optional[str]

    if info.ca_path and info.ca_data:
        raise credentials.LoginError(
            "Both CA path & data are set. Need only one.")
    elif info.ca_path:
        ca_path = info.ca_path
    elif info.ca_data:
        ca_path = tempfiles[base64.b64decode(info.ca_data)]
    else:
        ca_path = None

    if info.certificate_path and info.certificate_data:
        raise credentials.LoginError(
            "Both certificate path & data are set. Need only one.")
    elif info.certificate_path:
        certificate_path = info.certificate_path
    elif info.certificate_data:
        certificate_path = tempfiles[base64.b64decode(
            info.certificate_data)]
    else:
        certificate_path = None

    if info.private_key_path and info.private_key_data:
        raise credentials.LoginError(
            "Both private key path & data are set. Need only one.")
    elif info.private_key_path:
        private_key_path = info.private_key_path
    elif info.private_key_data:
        private_key_path = tempfiles[base64.b64decode(
            info.private_key_data)]
    else:
        private_key_path = None

    # The SSL part (both client certificate auth and CA verification).
    context: ssl.SSLContext
    if certificate_path and private_key_path:
        context = ssl.create_default_context(
            purpose=ssl.Purpose.CLIENT_AUTH,
            cafile=ca_path)
        context.load_cert_chain(certfile=certificate_path,
                                keyfile=private_key_path)
    else:
        context = ssl.create_default_context(cafile=ca_path)

    if info.insecure:
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE

    # The token auth part.
    headers: Dict[str, str] = {}
    if info.scheme and info.token:
        headers['Authorization'] = f'{info.scheme} {info.token}'
    elif info.scheme:
        headers['Authorization'] = f'{info.scheme}'
    elif info.token:
        headers['Authorization'] = f'Bearer {info.token}'

    # The basic auth part.
    auth: Optional[aiohttp.BasicAuth]
    if info.username and info.password:
        auth = aiohttp.BasicAuth(info.username, info.password)
    else:
        auth = None

    # It is a good practice to self-identify a bit.
    headers['User-Agent'] = f'kopf/unknown'  # TODO: add version someday

    # Generic aiohttp session based on the constructed credentials.
    self.session = aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(
            limit=0,
            ssl=context,
        ),
        headers=headers,
        auth=auth,
    )

    # Add the extra payload information. We avoid overriding the constructor.
    self.server = info.server
    self.default_namespace = info.default_namespace

    # For purging on garbage collection.
    self._tempfiles = tempfiles
    self._discovery_lock = asyncio.Lock()
    self._discovered_resources = {}
async def maigret(username, site_dict, query_notify, logger,
                  proxy=None, timeout=None, recursive_search=False,
                  id_type='username', tags=None, debug=False, forced=False,
                  max_connections=100, no_progressbar=False):
    """Main search func

    Checks for existence of username on various social media sites.

    Keyword Arguments:
    username         -- String indicating username that report
                        should be created against.
    site_dict        -- Dictionary containing all of the site data.
    query_notify     -- Object with base type of QueryNotify().
                        This will be used to notify the caller about
                        query results.
    proxy            -- String indicating the proxy URL
    timeout          -- Time in seconds to wait before timing out request.
                        Default is no timeout.
    recursive_search -- Search for other usernames in website pages
                        & recursive search by them.

    Return Value:
    Dictionary containing results from report. Key of dictionary is the name
    of the social network site, and the value is another dictionary with
    the following keys:
        url_main:      URL of main site.
        url_user:      URL of user on site (if account exists).
        status:        QueryResult() object indicating results of test for
                       account existence.
        http_status:   HTTP status code of query which checked for existence
                       on site.
        response_text: Text that came back from request. May be None if
                       there was an HTTP error when checking for existence.
    """

    # Notify caller that we are starting the query.
    if tags is None:
        tags = set()
    query_notify.start(username, id_type)

    # TODO: connector
    connector = (ProxyConnector.from_url(proxy) if proxy
                 else aiohttp.TCPConnector(ssl=False))
    # connector = aiohttp.TCPConnector(ssl=False)
    connector.verify_ssl = False
    session = aiohttp.ClientSession(connector=connector)

    if logger.level == logging.DEBUG:
        future = session.get(url='https://icanhazip.com')
        ip, status, error, exception = await get_response(future, None, logger)
        if ip:
            logger.debug(f'My IP is: {ip.strip()}')
        else:
            logger.debug(f'IP requesting {error}: {exception}')

    # Results from analysis of all sites
    results_total = {}

    # First create futures for all requests. This allows for the requests
    # to run in parallel
    for site_name, site in site_dict.items():
        fulltags = site.tags

        if site.type != id_type:
            continue

        site_tags = set(fulltags)
        if tags:
            if not set(tags).intersection(site_tags):
                continue

        if site.disabled and not forced:
            continue

        # Results from analysis of this specific site
        results_site = {}

        # Record URL of main site and username
        results_site['username'] = username
        results_site['parsing_enabled'] = recursive_search
        results_site['url_main'] = site.url_main

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
        }

        headers.update(site.headers)

        if 'url' not in site.__dict__:
            logger.error('No URL for site %s', site.name)

        # URL of user on site (if it exists)
        url = site.url.format(
            urlMain=site.url_main,
            urlSubpath=site.url_subpath,
            username=username
        )
        # workaround to prevent slash errors
        url = re.sub('(?<!:)/+', '/', url)

        # Don't make request if username is invalid for the site
        if site.regex_check and re.search(site.regex_check, username) is None:
            # No need to do the check at the site: this username is not
            # allowed.
            results_site['status'] = QueryResult(username,
                                                 site_name,
                                                 url,
                                                 QueryStatus.ILLEGAL)
            results_site["url_user"] = ""
            results_site['http_status'] = ""
            results_site['response_text'] = ""
            query_notify.update(results_site['status'])
        else:
            # URL of user on site (if it exists)
            results_site["url_user"] = url
            url_probe = site.url_probe
            if url_probe is None:
                # Probe URL is normal one seen by people out on the web.
                url_probe = url
            else:
                # There is a special URL for probing existence separate
                # from where the user profile normally can be found.
                url_probe = url_probe.format(
                    urlMain=site.url_main,
                    urlSubpath=site.url_subpath,
                    username=username,
                )

            if site.check_type == 'status_code' and site.request_head_only:
                # In most cases when we are detecting by status code,
                # it is not necessary to get the entire body: we can
                # detect fine with just the HEAD response.
                request_method = session.head
            else:
                # Either this detect method needs the content associated
                # with the GET response, or this specific website will
                # not respond properly unless we request the whole page.
                request_method = session.get

            if site.check_type == "response_url":
                # Site forwards request to a different URL if username not
                # found. Disallow the redirect so we can capture the
                # http status from the original URL request.
                allow_redirects = False
            else:
                # Allow whatever redirect that the site wants to do.
                # The final result of the request will be what is available.
                allow_redirects = True

            # TODO: cookies using
            # def parse_cookies(cookies_str):
            #     cookies = SimpleCookie()
            #     cookies.load(cookies_str)
            #     return {key: morsel.value for key, morsel in cookies.items()}
            #
            # if os.path.exists(cookies_file):
            #     cookies_obj = cookielib.MozillaCookieJar(cookies_file)
            #     cookies_obj.load(ignore_discard=True, ignore_expires=True)

            future = request_method(url=url_probe, headers=headers,
                                    allow_redirects=allow_redirects,
                                    timeout=timeout,
                                    )

            # Store future in data for access later
            # TODO: move to separate obj
            site.request_future = future

        # Add this site's results into final dictionary with all of the
        # other results.
        results_total[site_name] = results_site

    # TODO: move into top-level function
    sem = asyncio.Semaphore(max_connections)

    tasks = []
    for sitename, result_obj in results_total.items():
        update_site_coro = update_site_dict_from_response(sitename, site_dict,
                                                          result_obj, sem,
                                                          logger, query_notify)
        future = asyncio.ensure_future(update_site_coro)
        tasks.append(future)

    if no_progressbar:
        await asyncio.gather(*tasks)
    else:
        for f in tqdm.asyncio.tqdm.as_completed(tasks):
            await f

    await session.close()

    # Notify caller that all queries are finished.
    query_notify.finish()

    return results_total
async def fetch_urls(urls, out_fname, logging_fnames=None):
    tasks = []
    connector = aiohttp.TCPConnector(limit_per_host=1)
    async with aiohttp.ClientSession(
            connector=connector, cookie_jar=aiohttp.DummyCookieJar()) as session:
        # Async fetch the urls
        sem = asyncio.Semaphore(FLAGS.max_parallel_requests)
        for url in urls:
            side_data = {"url": url}
            task = asyncio.ensure_future(
                throttled_fetch_url(url, sem, session, side_data))
            tasks.append(task)
        tf.logging.info("Async requested %d urls", len(urls))

        # Setup output files
        file_handles = []
        out_f = make_tfrecord_writer(out_fname)
        file_handles.append(out_f)

        logging_fnames = logging_fnames or {}
        samples_f = None
        if "samples" in logging_fnames:
            samples_f = tf.gfile.Open(logging_fnames["samples"], "w")
            file_handles.append(samples_f)

        refs_written = [0]  # Made a list so it can be mutated

        def text_extraction_callback(callback_arg):
            url, text = callback_arg
            written = write_ref_content(url, text, out_f)
            if not written:
                return
            if not refs_written[0] % FLAGS.log_every:
                timestamp = datetime.datetime.now().strftime("%H:%M")
                tf.logging.info("%s: Wrote ref %d in group",
                                timestamp, refs_written[0])
            if samples_f is not None:
                samples_f.write(url)
                samples_f.write("\n")
                samples_f.write(text)
                samples_f.write("\n\n---\n\n")
            refs_written[0] += 1

        try:
            # Process each URL as it comes in.
            # Using a multiprocessing Pool because the text extraction is
            # expensive and so we distribute across cores.
            pool = multiprocessing.Pool()
            results = []
            for task in asyncio.as_completed(tasks):
                html, side_data = await task
                url = side_data["url"]
                if not html:
                    continue
                res = pool.apply_async(mp_get_text, (url, html), {},
                                       text_extraction_callback)
                results.append(res)
            for res in results:
                try:
                    res.get(timeout=10)
                except multiprocessing.TimeoutError:
                    pass
        finally:
            for f in file_handles:
                f.close()

    return refs_written[0]