async def _request(self, method, url, headers, data, verify, stream):
    # Note: When using aiobotocore with dynamodb, requests fail on crc32
    # checksum computation as soon as the response data reaches ~5KB.
    # When the AWS response is gzip compressed:
    # 1. aiohttp automatically decompresses the data
    #    (http://aiohttp.readthedocs.io/en/stable/client.html#binary-response-content)
    # 2. botocore computes crc32 on the uncompressed data bytes and fails,
    #    because the crc32 was computed on the compressed data.
    # The following line forces AWS not to use gzip compression.
    # If there is a way to configure aiohttp not to perform decompression,
    # we can remove the following line and take advantage of
    # AWS gzip compression.
    # https://github.com/boto/botocore/issues/1255
    headers['Accept-Encoding'] = 'identity'
    headers_ = MultiDict(
        (z[0], text_(z[1], encoding='utf-8')) for z in headers.items())

    # botocore does this during the request, so we do it here as well
    proxy = self.proxies.get(urlparse(url.lower()).scheme)

    if isinstance(data, io.IOBase):
        data = _IOBaseWrapper(data)

    url = URL(url, encoded=True)
    resp = await self._aio_session.request(method, url=url,
                                           headers=headers_,
                                           data=data,
                                           proxy=proxy,
                                           verify_ssl=verify)

    # If we're not streaming, read the content so we can retry any timeout
    # errors, see:
    # https://github.com/boto/botocore/blob/develop/botocore/vendored/requests/sessions.py#L604
    if not stream:
        await resp.read()

    return resp
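# Hedged illustration (not part of aiobotocore): a tiny self-contained demo of the
# checksum mismatch described in the comments above. A CRC32 computed over the
# gzip-compressed bytes does not match a CRC32 computed over the decompressed bytes,
# so once aiohttp transparently decompresses the body, a check against the
# compressed payload's CRC32 fails. The function name and payload size below are
# illustrative assumptions only.
def _demo_crc32_mismatch():
    import gzip
    import zlib

    payload = b"x" * 6000  # roughly the ~5KB threshold mentioned above
    compressed = gzip.compress(payload)
    # The two checksums disagree, mirroring the failure mode described above.
    print("crc32(decompressed):", zlib.crc32(payload))
    print("crc32(compressed):  ", zlib.crc32(compressed))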
def _aiohttp_do_redirect(session, method, url, headers, data, resp):
    # This is the redirect code from aiohttp, remove once
    # https://github.com/aio-libs/aiobotocore/issues/267 is supported

    # For 301 and 302, mimic IE, now changed in RFC
    # https://github.com/kennethreitz/requests/pull/269
    if (resp.status == 303 and resp.method != hdrs.METH_HEAD) \
            or (resp.status in (301, 302) and resp.method == hdrs.METH_POST):
        method = hdrs.METH_GET
        data = None
        if headers.get(hdrs.CONTENT_LENGTH):
            headers.pop(hdrs.CONTENT_LENGTH)

    r_url = (resp.headers.get(hdrs.LOCATION) or
             resp.headers.get(hdrs.URI))
    if r_url is None:
        return None

    r_url = URL(r_url, encoded=not session.requote_redirect_url)

    scheme = r_url.scheme
    if scheme not in ('http', 'https', ''):
        resp.close()
        raise ValueError('Can redirect only to http or https')
    elif not scheme:
        r_url = url.join(r_url)

    url = r_url
    params = None
    resp.release()

    return method, url, headers, params, data
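# Hedged usage sketch (not part of the original module): one way the redirect helper
# above could be driven in a manual follow-redirect loop. The wrapper name, the
# max_redirects limit and the bare session.request() call are assumptions for
# illustration; url is expected to be a yarl.URL so relative Location headers can
# be joined.
async def _follow_redirects_sketch(session, method, url, headers, data,
                                   max_redirects=10):
    for _ in range(max_redirects):
        resp = await session.request(method, url=url, headers=headers, data=data)
        if resp.status not in (301, 302, 303, 307, 308):
            return resp
        redirect = _aiohttp_do_redirect(session, method, url, headers, data, resp)
        if redirect is None:
            # No Location/URI header; return the redirect response as-is.
            return resp
        method, url, headers, _params, data = redirect
    raise RuntimeError("Too many redirects")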
async def download_if_not_exist(session,
                                path,
                                url,
                                site_settings,
                                cancellable_pool,
                                with_extension=True,
                                session_kwargs=None,
                                allowed_extensions=None,
                                forbidden_extensions=None,
                                checksum=None,
                                signal_handler=None,
                                unique_key=None):
    if session_kwargs is None:
        session_kwargs = {}
    if allowed_extensions is None:
        allowed_extensions = []
    if forbidden_extensions is None:
        forbidden_extensions = []

    allowed_extensions += site_settings.allowed_extensions
    forbidden_extensions += site_settings.forbidden_extensions

    if isinstance(url, str):
        url = URL(url)
    domain = url.host

    timeout = aiohttp.ClientTimeout(total=0)

    if os.path.isabs(path):
        raise ValueError("Absolute paths are not allowed")

    absolute_path = os.path.join(site_settings.base_path, path)

    if not with_extension:
        guess_extension = await cache.check_extension(
            session, str(url), session_kwargs=session_kwargs)
        if guess_extension is None:
            logger.warning(f"Could not retrieve the extension for {url}")
            return
        absolute_path += "." + guess_extension

    force = False
    if checksum is not None:
        force = not cache.is_checksum_same(absolute_path, checksum)
    elif site_settings.force_download and domain not in FORCE_DOWNLOAD_BLACKLIST:
        force = True

    if os.path.exists(absolute_path) and not force:
        return

    if os.path.exists(absolute_path):
        headers = session_kwargs.get("headers", {})
        etag = cache.get_etag(absolute_path)
        if etag is not None:
            headers["If-None-Match"] = etag
        if headers:
            session_kwargs["headers"] = headers

    if os.path.exists(absolute_path):
        action = ACTION_REPLACE
    else:
        action = ACTION_NEW

    file_name = os.path.basename(absolute_path)
    file_extension = get_extension(file_name)

    if is_extension_forbidden(extension=file_extension,
                              forbidden_extensions=forbidden_extensions,
                              allowed_extensions=allowed_extensions):
        return

    async with session.get(url, timeout=timeout, **session_kwargs) as response:
        response.raise_for_status()
        response_headers = response.headers
        if response.status == 304:
            logger.debug(f"File '{absolute_path}' not modified")
            cache.save_checksum(absolute_path, checksum)
            return

        if file_extension.lower() in MOVIE_EXTENSIONS:
            logger.info(f"Starting to download {file_name}")

        pathlib.Path(os.path.dirname(absolute_path)).mkdir(parents=True,
                                                           exist_ok=True)

        if action == ACTION_REPLACE and site_settings.keep_replaced_files:
            dir_path = os.path.dirname(absolute_path)
            pure_name, extension = split_name_extension(file_name)
            old_file_name = f"{pure_name}-old.{extension}"
            old_absolute_path = os.path.join(dir_path, old_file_name)
            os.replace(absolute_path, old_absolute_path)

        try:
            with open(absolute_path, 'wb') as f:
                while True:
                    chunk = await response.content.read(8192)
                    if not chunk:
                        break
                    f.write(chunk)
        except BaseException as e:
            os.remove(absolute_path)
            logger.debug(f"Removed file {absolute_path}")
            raise e

        if site_settings.highlight_difference and \
                action == ACTION_REPLACE and \
                site_settings.keep_replaced_files and \
                file_extension.lower() == "pdf":
            logger.debug(f"Adding highlights to {absolute_path}")
            temp_file_name = f"{pure_name}-temp.{extension}"
            temp_absolute_path = os.path.join(dir_path, temp_file_name)
            future = cancellable_pool.apply(
                functools.partial(pdf_highlighter.add_differ_highlight,
                                  new_path=absolute_path,
                                  old_path=old_absolute_path,
                                  out_path=temp_absolute_path))
            try:
                await future
                os.replace(temp_absolute_path, old_absolute_path)
            except asyncio.CancelledError as e:
                os.replace(old_absolute_path, absolute_path)
                logger.debug(f"Reverted old file {absolute_path}")
                raise e
            except Exception as e:
                logger.warning(f"Could not add pdf highlight to {absolute_path}. "
                               f"{type(e).__name__}: {e}")
                signal_handler.got_warning(
                    unique_key,
                    f"Could not add pdf highlight to {absolute_path}. "
                    f"{type(e).__name__}: {e}")
            finally:
                if os.path.exists(temp_absolute_path):
                    logger.debug(f"Removed temp file {temp_absolute_path}")
                    os.remove(temp_absolute_path)

        if "ETag" in response_headers:
            cache.save_etag(absolute_path, response.headers["ETag"])
        elif domain not in FORCE_DOWNLOAD_BLACKLIST:
            logger.warning(
                f"url: {url} did not have an ETag and is not in the blacklist")

        cache.save_checksum(absolute_path, checksum)

        if action == ACTION_REPLACE:
            if site_settings.keep_replaced_files and os.path.exists(
                    old_absolute_path):
                signal_handler.replaced_file(unique_key, absolute_path,
                                             old_absolute_path)
            else:
                signal_handler.replaced_file(unique_key, absolute_path)
            method_msg = "Replaced"
        elif action == ACTION_NEW:
            signal_handler.added_new_file(unique_key, absolute_path)
            method_msg = "Added new"
        else:
            method_msg = "Unexpected action"

        start = {
            "name": f"{method_msg} file: '{{}}'",
            "var": file_name,
            "priority": 100,
            "cut": "back",
        }

        end = {
            "name": " in '{}'",
            "var": os.path.dirname(absolute_path),
            "priority": -100,
            "cut": "front",
        }

        logger.info(fit_sections_to_console(start, end, margin=1))
async def download_if_not_exist(session,
                                path,
                                url,
                                download_settings,
                                cancellable_pool,
                                with_extension=True,
                                session_kwargs=None,
                                allowed_extensions=None,
                                forbidden_extensions=None,
                                checksum=None,
                                signal_handler=None,
                                unique_key=None):
    if session_kwargs is None:
        session_kwargs = {}
    if allowed_extensions is None:
        allowed_extensions = []
    if forbidden_extensions is None:
        forbidden_extensions = []

    if download_settings.allowed_extensions is not None:
        allowed_extensions += download_settings.allowed_extensions
    if download_settings.forbidden_extensions is not None:
        forbidden_extensions += download_settings.forbidden_extensions

    if isinstance(url, str):
        url = URL(url)
    domain = url.host

    if os.path.isabs(path):
        raise ValueError("Absolute paths are not allowed")

    absolute_path = os.path.join(download_settings.save_path, path)

    if not with_extension:
        guess_extension = await cache.check_extension(
            session, str(url), session_kwargs=session_kwargs)
        if guess_extension is None:
            logger.warning(f"Could not retrieve the extension for {url}")
            return
        absolute_path += "." + guess_extension

    file_name = os.path.basename(absolute_path)
    dir_path = os.path.dirname(absolute_path)
    file_extension = core.utils.get_extension(file_name)

    temp_file_name = core.utils.add_extension(f"{random.getrandbits(64)}",
                                              file_extension)
    temp_absolute_path = os.path.join(core.utils.get_temp_path(), temp_file_name)

    old_file_name = core.utils.insert_text_before_extension(file_name, "-old")
    old_absolute_path = os.path.join(dir_path, old_file_name)

    diff_file_name = core.utils.insert_text_before_extension(file_name, "-diff")
    diff_absolute_path = os.path.join(dir_path, diff_file_name)

    force = False
    if checksum is not None:
        force = not cache.is_checksum_same(absolute_path, checksum)
    elif download_settings.force_download and domain not in FORCE_DOWNLOAD_BLACKLIST:
        force = True

    if os.path.exists(absolute_path) and not force:
        return

    if os.path.exists(absolute_path):
        headers = session_kwargs.get("headers", {})
        etag = cache.get_etag(absolute_path)
        if etag is not None:
            headers["If-None-Match"] = etag
        if headers:
            session_kwargs["headers"] = headers

    if os.path.exists(absolute_path):
        action = ACTION_REPLACE
    else:
        action = ACTION_NEW

    if is_extension_forbidden(extension=file_extension,
                              forbidden_extensions=forbidden_extensions,
                              allowed_extensions=allowed_extensions):
        return

    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=0),
                               **session_kwargs) as response:
            response.raise_for_status()
            response_headers = response.headers
            if response.status == 304:
                logger.debug(f"File '{absolute_path}' not modified")
                cache.save_checksum(absolute_path, checksum)
                return

            if file_extension and file_extension.lower() in MOVIE_EXTENSIONS:
                logger.info(f"Starting to download {file_name}")

            pathlib.Path(os.path.dirname(absolute_path)).mkdir(parents=True,
                                                               exist_ok=True)

            # Keep the existing file in a temp location so it can be restored
            # on failure and reused for the diff/keep-replaced logic below.
            if action == ACTION_REPLACE:
                shutil.move(absolute_path, temp_absolute_path)

            file_hash = hashlib.md5()
            try:
                with open(absolute_path, 'wb') as f:
                    while True:
                        chunk = await response.content.read(8192)
                        if not chunk:
                            break
                        f.write(chunk)
                        file_hash.update(chunk)
            except BaseException as e:
                os.remove(absolute_path)
                logger.debug(f"Removed file {absolute_path}")
                if action == ACTION_REPLACE:
                    logger.debug(
                        f"Reverting temp file to new file: {absolute_path}")
                    shutil.move(temp_absolute_path, absolute_path)
                raise e

            if action == ACTION_REPLACE and cache.is_own_checksum_same(
                    absolute_path, file_hash.hexdigest()):
                logger.debug(f"own_checksum is same for {url}. "
                             f"Skipping processing")
                if "ETag" in response_headers:
                    cache.save_etag(absolute_path, response.headers["ETag"])
                elif domain not in FORCE_DOWNLOAD_BLACKLIST:
                    logger.warning(
                        f"url: {url} did not have an ETag and is not in the blacklist")

                cache.save_checksum(absolute_path, checksum)
                return

            if download_settings.highlight_difference and \
                    action == ACTION_REPLACE and \
                    file_extension and \
                    file_extension.lower() == "pdf":
                await _add_pdf_highlights(download_settings=download_settings,
                                          cancellable_pool=cancellable_pool,
                                          signal_handler=signal_handler,
                                          unique_key=unique_key,
                                          absolute_path=absolute_path,
                                          old_absolute_path=temp_absolute_path,
                                          out_path=diff_absolute_path)

            if action == ACTION_REPLACE and download_settings.keep_replaced_files:
                shutil.move(temp_absolute_path, old_absolute_path)

            cache.save_own_checksum(absolute_path, file_hash.hexdigest())

            if "ETag" in response_headers:
                cache.save_etag(absolute_path, response.headers["ETag"])
            elif domain not in FORCE_DOWNLOAD_BLACKLIST:
                logger.warning(
                    f"url: {url} did not have an ETag and is not in the blacklist")

            cache.save_checksum(absolute_path, checksum)

            if action == ACTION_REPLACE:
                signal_old_path, signal_diff_path = None, None
                if os.path.exists(old_absolute_path) and \
                        download_settings.keep_replaced_files:
                    signal_old_path = old_absolute_path
                if os.path.exists(diff_absolute_path) and \
                        download_settings.highlight_difference:
                    signal_diff_path = diff_absolute_path

                signal_handler.replaced_file(unique_key, absolute_path,
                                             signal_old_path, signal_diff_path)
            elif action == ACTION_NEW:
                signal_handler.added_new_file(unique_key, absolute_path)

            if action == ACTION_REPLACE:
                method_msg = "Replaced"
            elif action == ACTION_NEW:
                method_msg = "Added new"
            else:
                method_msg = "Unexpected action"

            start = {
                "name": f"{method_msg} file: '{{}}'",
                "var": file_name,
                "priority": 100,
                "cut": "back",
            }

            end = {
                "name": " in '{}'",
                "var": os.path.dirname(absolute_path),
                "priority": -100,
                "cut": "front",
            }

            logger.info(core.utils.fit_sections_to_console(start, end, margin=1))
    finally:
        if os.path.exists(temp_absolute_path):
            os.remove(temp_absolute_path)
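# Hedged usage sketch (not part of the original module): how download_if_not_exist
# might be driven. The download_settings object (with the attributes read above),
# the cancellable_pool and signal_handler instances, and the example URL, path and
# unique_key are assumptions for illustration only.
async def _example_fetch(download_settings, cancellable_pool, signal_handler):
    async with aiohttp.ClientSession() as session:
        await download_if_not_exist(session,
                                    path="lecture-1/slides.pdf",
                                    url="https://example.com/slides.pdf",
                                    download_settings=download_settings,
                                    cancellable_pool=cancellable_pool,
                                    signal_handler=signal_handler,
                                    unique_key="example-site")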
async def download(session, queue, base_path, url, password=None, file_name=None):
    domain = re.match(r"https?://([^.]*\.?)zoom\.us", url).group(1)

    agent_header = {
        "referer": f"https://{domain}zoom.us/",
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/74.0.3729.169 "
                       "Safari/537.36")
    }

    async with session.get(url, headers=agent_header) as response:
        html = await response.text()

    if password is not None:
        meet_id_regex = re.compile("<input[^>]*")
        for inp in meet_id_regex.findall(html):
            input_split = inp.split()
            if input_split[2] == 'id="meetId"':
                meet_id = input_split[3][7:-1]
                break

        data = {
            "id": meet_id,
            "passwd": password,
            "action": "viewdetailpage",
            "recaptcha": ""
        }

        check_url = f"https://{domain}zoom.us/rec/validate_meet_passwd"
        async with session.post(check_url, data=data,
                                headers=agent_header) as response:
            pass

        async with session.get(url, headers=agent_header) as response:
            html = await response.text()

    metadata = _get_page_meta(html, ("viewMp4Url", "topic"))
    if metadata is None:
        logger.warning(f"Zoom url: {url} has no video")
        return None

    vid_url = metadata.get("viewMp4Url", None)
    if vid_url is None:
        raise LoginError("Could not login")

    extension = get_extension(vid_url.split("?")[0].split("/")[-1])
    name = file_name or metadata.get("topic")

    # We need to disable the decoding of the url, because zoom is not
    # RFC-compliant (btw f**k zoom).
    await queue.put({
        "url": URL(vid_url, encoded=True),
        "path": safe_path_join(base_path, add_extension(name, extension)),
        "session_kwargs": dict(headers=agent_header)
    })
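# Hedged sketch of a queue consumer (illustration only, not the project's real
# worker): it assumes each queued item carries the "url", "path" and
# "session_kwargs" keys produced above, that download_if_not_exist from the
# downloader module is importable here, and that download_settings,
# cancellable_pool, signal_handler and the unique_key value are supplied by the
# caller.
async def _example_zoom_worker(session, queue, download_settings,
                               cancellable_pool, signal_handler):
    while True:
        item = await queue.get()
        try:
            await download_if_not_exist(session,
                                        path=item["path"],
                                        url=item["url"],
                                        download_settings=download_settings,
                                        cancellable_pool=cancellable_pool,
                                        session_kwargs=item.get("session_kwargs"),
                                        signal_handler=signal_handler,
                                        unique_key="zoom")
        finally:
            queue.task_done()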