def urlretrieve(url: str, filename: str, context: ssl.SSLContext, reporthook=None, cookies_path=None):
    """Download `url` to `filename` using the given SSL context and optional cookies.

    original source: https://github.com/python/cpython/blob/
    21bee0bd71e1ad270274499f9f58194ebb52e236/Lib/urllib/request.py#L229
    Because urlopen also supports context, I decided to adapt the download function.

    Args:
        url: URL to fetch.
        filename: Local destination path (required unless the URL uses file://).
        context: SSL context handed to `urlopen`.
        reporthook: Optional callable(blocknum, block_size, total_size); called
            once before the first read and once after every block is written.
        cookies_path: Optional path to a Mozilla-format cookie file whose
            cookies are attached to the request.

    Returns:
        Tuple of (filename, headers).

    Raises:
        RuntimeError: If no filename was given for a non-file:// URL.
        ContentTooShortError: If fewer bytes than Content-Length arrived.
    """
    url_parsed = urlparse.urlparse(url)
    request = urllib.request.Request(url=url, headers=RequestHelper.stdHeader)
    if cookies_path is not None:
        cookie_jar = MozillaCookieJar(cookies_path)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        cookie_jar.add_cookie_header(request)

    with contextlib.closing(urllib.request.urlopen(request, context=context)) as fp:
        headers = fp.info()

        # Just return the local path and the 'headers' for file://
        # URLs. No sense in performing a copy unless requested.
        if url_parsed.scheme == 'file' and not filename:
            return os.path.normpath(url_parsed.path), headers

        if not filename:
            raise RuntimeError('No filename specified!')

        # FIX (idiom): open the destination via `with open(...)` instead of the
        # original `tfp = open(...)` followed by a separate `with tfp:`.
        with open(filename, 'wb') as tfp:
            result = filename, headers

            # bytes received so far
            read = 0
            # 8 KiB per read (the original comment claimed "4kb", which was wrong)
            bs = 1024 * 8
            blocknum = 0
            # -1 when the server sent no Content-Length header
            size = int(headers.get('Content-Length', -1))

            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    if size >= 0 and read < size:
        raise ContentTooShortError('retrieval incomplete: got only %i out of %i bytes' % (read, size), result)

    return result
def start(self, blocking=None):
    '''
    Starts the download task. Will raise `RuntimeError` if it's
    the object's already downloading.

    .. warning::
        If you're using the non-blocking mode, Exceptions won't be raised.
        In that case, call `isSuccessful()` after the task is finished, to
        make sure the download succeeded. Call `get_errors()` to get the
        exceptions.

    :param blocking: If true, calling this function will block the thread
        until the download finished. Default is *True*.
    :type blocking: bool
    '''
    if not self.status == "ready":
        raise RuntimeError("cannot start (current status is {})".format(self.status))
    self.logger.info('Starting a new SmartDL operation.')

    # Remember/restore the blocking preference across recursive mirror retries.
    if blocking is None:
        blocking = self._start_func_blocking
    else:
        self._start_func_blocking = blocking

    if self.mirrors:
        self.logger.info('One URL and {} mirrors are loaded.'.format(len(self.mirrors)))
    else:
        self.logger.info('One URL is loaded.')

    # Skip the download entirely when the destination already matches the
    # expected hash.
    if self.verify_hash and os.path.exists(self.dest):
        if utils.get_file_hash(self.hash_algorithm, self.dest) == self.hash_code:
            self.logger.info("Destination '%s' already exists, and the hash matches. No need to download." % self.dest)
            self.status = 'finished'
            return

    self.logger.info("Downloading '{}' to '{}'...".format(self.url, self.dest))
    req = urllib.request.Request(self.url, **self.requestArgs)

    # set cookie if we passed in cookie file and update the requestArgs to
    # propagate it to the rest of the requests
    if self.cookie_file:
        try:
            cookie = MozillaCookieJar(self.cookie_file)
            cookie.load(ignore_expires=True, ignore_discard=True)
            cookie.add_cookie_header(req)
            self.requestArgs['headers']['Cookie'] = req.get_header("Cookie")
        except OSError:
            self.logger.error("Cookie file passed in is invalid, ignoring the cookie")
            raise

    try:
        urlObj = urllib.request.urlopen(req, timeout=self.timeout, context=self.context)
    except (urllib.error.HTTPError, urllib.error.URLError, socket.timeout) as e:
        self.errors.append(e)
        if self.mirrors:
            # Fall back to the next mirror and restart the whole operation.
            self.logger.info("{} Trying next mirror...".format(str(e)))
            self.url = self.mirrors.pop(0)
            self.logger.info('Using url "{}"'.format(self.url))
            self.start(blocking)
            return
        else:
            self.logger.warning(str(e))
            # BUG FIX: the exception was appended to self.errors a second time
            # here, so a single failure showed up twice in get_errors().
            self._failed = True
            self.status = "finished"
            raise

    try:
        self.filesize = int(urlObj.headers["Content-Length"])
        self.logger.info("Content-Length is {} ({}).".format(self.filesize, utils.sizeof_human(self.filesize)))
    except (IndexError, KeyError, TypeError):
        # Missing/unparsable Content-Length: chunk sizes fall back to one thread.
        self.logger.warning("Server did not send Content-Length. Filesize is unknown.")
        self.filesize = 0

    # One (start, end) byte range per worker thread.
    args = utils.calc_chunk_size(self.filesize, self.threads_count, self.minChunkFile)
    bytes_per_thread = args[0][1] - args[0][0] + 1
    if len(args) > 1:
        self.logger.info("Launching {} threads (downloads {}/thread).".format(len(args), utils.sizeof_human(bytes_per_thread)))
    else:
        self.logger.info("Launching 1 thread (downloads {}).".format(utils.sizeof_human(bytes_per_thread)))

    self.status = "downloading"

    # Each chunk downloads to "<dest>.000", "<dest>.001", ...; they are joined later.
    for i, arg in enumerate(args):
        req = self.pool.submit(
            download,
            self.url,
            self.dest + ".%.3d" % i,
            self.requestArgs,
            self.context,
            arg[0],
            arg[1],
            self.timeout,
            self.shared_var,
            self.thread_shared_cmds,
            self.logger,
        )

    # Joins the chunk files into self.dest once the pool drains.
    self.post_threadpool_thread = threading.Thread(
        target=post_threadpool_actions,
        args=(
            self.pool,
            [[(self.dest + ".%.3d" % i) for i in range(len(args))], self.dest],
            self.filesize,
            self,
        ),
    )
    self.post_threadpool_thread.daemon = True
    self.post_threadpool_thread.start()

    self.control_thread = ControlThread(self)

    if blocking:
        self.wait(raise_exceptions=True)
def try_download_link(
    self, add_token: bool = False, delete_if_successful: bool = False, use_cookies: bool = False
) -> bool:
    """This function should only be used for shortcut/URL files.
    It tests whether a URL refers to a file, that is not an HTML web page.
    Then downloads it. Otherwise an attempt will be made to download an HTML
    video from the website.

    Args:
        add_token (bool, optional): Adds the ws-token to the url. Defaults to False.
        delete_if_successful (bool, optional): Deletes the tmp file if download
            was successfull. Defaults to False.
        use_cookies (bool, optional): Adds the cookies to the requests. Defaults to False.

    Returns:
        bool: If it was successfull.

    Raises:
        ValueError: If `use_cookies` is set but no cookie file is configured.
    """
    urlToDownload = self.file.content_fileurl
    if add_token:
        urlToDownload = self._add_token_to_url(self.file.content_fileurl)

    cookies_path = None
    if use_cookies:
        cookies_path = self.options.get('cookies_path', None)
        if cookies_path is None:
            self.success = False
            raise ValueError('Moodle Cookies are missing.')

    isHTML = False
    new_filename = ""
    total_bytes_estimate = -1

    # Probe the URL with a normal request first to find out whether it is a
    # plain file (download directly) or an HTML page (hand it to youtube-dl).
    request = urllib.request.Request(url=urlToDownload, headers=RequestHelper.stdHeader)
    if use_cookies:
        cookie_jar = MozillaCookieJar(cookies_path)
        cookie_jar.load(ignore_discard=True, ignore_expires=True)
        cookie_jar.add_cookie_header(request)

    with contextlib.closing(urllib.request.urlopen(request, context=self.ssl_context)) as fp:
        headers = fp.info()

        content_type = headers.get_content_type()
        if content_type == 'text/html' or content_type == 'text/plain':
            isHTML = True

        # Prefer the server-provided filename, falling back to the URL path.
        url_parsed = urlparse.urlsplit(urlToDownload)
        new_filename = posixpath.basename(url_parsed.path)
        new_filename = headers.get_filename(new_filename)
        total_bytes_estimate = int(headers.get('Content-Length', -1))

    if isHTML:
        # Download to a random temp name; youtube-dl appends the real extension.
        tmp_filename = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
        tmp_file = str(Path(self.destination) / tmp_filename)

        ydl_opts = {
            'logger': self.YtLogger(self.thread_id),
            'progress_hooks': [self.yt_hook],
            'outtmpl': (tmp_file + '.%(ext)s'),
            'nocheckcertificate': True,
        }
        if use_cookies:
            ydl_opts.update({'cookiefile': cookies_path})

        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl_results = ydl.download([urlToDownload])
                if ydl_results == 1:
                    # youtube-dl reported failure; fall through to the plain
                    # download attempt below.
                    pass
                else:
                    if delete_if_successful:
                        # Deleting the shortcut file is best-effort only.
                        try:
                            os.remove(self.file.saved_to)
                        except Exception as e:
                            logging.warning(
                                'T%s - Could not delete %s after youtube-dl was successful. Error: %s',
                                self.thread_id,
                                self.file.saved_to,
                                e,
                            )
                            self.log_exception_extras(e)
                    self.move_tmp_file(tmp_file)
                    self.success = True
                    return True
            except Exception:
                # Any youtube-dl error means "not a downloadable video";
                # fall through to the plain download attempt below.
                pass

    # generate file extension for modules names
    new_name, new_extension = os.path.splitext(new_filename)
    if new_extension == '' and isHTML:
        new_extension = '.html'

    old_name, old_extension = os.path.splitext(self.filename)
    if old_extension != new_extension:
        self.filename = self.filename + new_extension

    if delete_if_successful:
        try:
            os.remove(self.file.saved_to)
        # BUG FIX: the exception was caught without `as e`, so the logging
        # call below crashed with NameError whenever os.remove failed.
        except Exception as e:
            logging.warning(
                'T%s - Could not delete %s before download is started. Error: %s',
                self.thread_id,
                self.file.saved_to,
                e,
            )

    self.set_path(True)

    if total_bytes_estimate != -1:
        self.thread_report[self.thread_id]['extra_totalsize'] = total_bytes_estimate

    self.urlretrieve(
        urlToDownload,
        self.file.saved_to,
        context=self.ssl_context,
        reporthook=self.add_progress,
        cookies_path=cookies_path,
    )

    self.file.time_stamp = int(time.time())
    self.success = True
    return True
class ClientHandler(urllib.request.BaseHandler):
    """urllib handler that injects default headers + cookies into requests and
    transparently decompresses gzip/deflate response bodies."""

    DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "Accept": "text/html, application/xhtml+xml, application/xml; q=0.9, image/webp, */*; q=0.8",
        "Accept-Encoding": "gzip, deflate"
    }

    def __init__(self):
        # BUG FIX: copy the class-level dict. The original aliased it, so
        # set_header() mutated DEFAULT_HEADERS and leaked headers into every
        # other instance.
        self.headers = dict(self.DEFAULT_HEADERS)
        self.cookies = MozillaCookieJar()

    def load_cookie(self, fpath):
        """Load cookies from a Mozilla-format cookie file."""
        self.cookies.load(fpath)

    def save_cookie(self, fpath):
        """Persist the current cookie jar to a Mozilla-format cookie file."""
        self.cookies.save(fpath)

    def set_header(self, name, value):
        """Set (or override) a header sent with every request."""
        self.headers[name] = value

    def set_cookie(self, cookie_string):
        """Parse a cookie string and add it to the jar; silently ignores
        strings that do not parse into a cookie."""
        cookie = self._make_cookie(cookie_string)
        if cookie:
            self.cookies.set_cookie(cookie)

    def _make_cookie(self, cookie_string):
        # Delegates parsing to the project's cookie factory.
        return HttpClient.ClientCookie().make(cookie_string)

    def _request_add_headers(self, request):
        # add_unredirected_header keeps the headers on redirect targets too.
        for name, value in list(self.headers.items()):
            request.add_header(name, value)
            request.add_unredirected_header(name, value)
        return self

    def _request_add_cookies(self, request):
        self.cookies.add_cookie_header(request)

    def _response_extract_cookies(self, request, response):
        self.cookies.extract_cookies(response, request)
        return self

    def _response_decompress_content(self, request, response):
        """Replace response.fp with a file object yielding the decoded body."""
        import io  # local import: keeps this fix self-contained

        encoding = response.headers.get("content-encoding", "")
        if encoding == "gzip":
            # BUG FIX: response.read() returns bytes on Python 3, so the
            # original StringIO wrapper broke; GzipFile needs a bytes buffer.
            response.fp = gzip.GzipFile(fileobj=io.BytesIO(response.read()))
        elif encoding == "deflate":
            # BUG FIX: the original assigned raw bytes to response.fp, which
            # must be a file-like object.
            # NOTE(review): assumes zlib-wrapped deflate; some servers send
            # raw deflate streams — confirm if those need handling.
            response.fp = io.BytesIO(zlib.decompress(response.read()))
        else:
            pass
        return self

    def http_request(self, request):
        self._request_add_headers(request)
        self._request_add_cookies(request)
        return request

    def http_response(self, request, response):
        self._response_extract_cookies(
            request, response)._response_decompress_content(request, response)
        return response

    https_request = http_request
    https_response = http_response