Exemple #1
0
    def urlretrieve(url: str, filename: str, context: ssl.SSLContext, reporthook=None, cookies_path=None):
        """
        Download ``url`` into ``filename`` through ``urlopen`` with an explicit SSL context.

        Adapted from CPython's ``urllib.request.urlretrieve``:
        https://github.com/python/cpython/blob/
        21bee0bd71e1ad270274499f9f58194ebb52e236/Lib/urllib/request.py#L229
        The stock function takes no ``context`` argument while ``urlopen`` does,
        hence this variant.

        :param url: address to fetch.
        :param filename: local path the body is written to. May only be empty for
            ``file://`` URLs, in which case nothing is copied.
        :param context: SSL context handed to ``urlopen``.
        :param reporthook: optional callable invoked as
            ``reporthook(block_number, block_size, total_size)`` once before the
            first read and once after every block written; ``total_size`` is -1
            when the server sent no ``Content-Length``.
        :param cookies_path: optional Mozilla/Netscape cookie file whose cookies
            are attached to the request.
        :return: tuple ``(local_path, headers)``.
        :raises RuntimeError: no ``filename`` given for a non-``file://`` URL.
        :raises ContentTooShortError: fewer bytes than ``Content-Length`` arrived.
        """
        parsed = urlparse.urlparse(url)

        request = urllib.request.Request(url=url, headers=RequestHelper.stdHeader)
        if cookies_path is not None:
            jar = MozillaCookieJar(cookies_path)
            jar.load(ignore_discard=True, ignore_expires=True)
            jar.add_cookie_header(request)

        # 8 KiB are requested per read
        chunk_size = 1024 * 8

        with contextlib.closing(urllib.request.urlopen(request, context=context)) as response:
            headers = response.info()

            if not filename:
                # For file:// URLs just hand back the local path and the headers;
                # there is no point in copying unless a target was requested.
                if parsed.scheme == 'file':
                    return os.path.normpath(parsed.path), headers
                raise RuntimeError('No filename specified!')

            with open(filename, 'wb') as target:
                result = filename, headers

                # size announced by the server, -1 when unknown
                expected = int(headers.get('Content-Length', -1))
                received = 0
                block_index = 0

                if reporthook:
                    reporthook(block_index, chunk_size, expected)

                chunk = response.read(chunk_size)
                while chunk:
                    received += len(chunk)
                    target.write(chunk)
                    block_index += 1
                    if reporthook:
                        reporthook(block_index, chunk_size, expected)
                    chunk = response.read(chunk_size)

        if expected >= 0 and received < expected:
            raise ContentTooShortError(
                'retrieval incomplete: got only %i out of %i bytes' % (received, expected), result
            )

        return result
Exemple #2
0
    def start(self, blocking=None):
        '''
        Starts the download task. Will raise `RuntimeError` if the object's status is not
        "ready" (for example because it is already downloading).

        .. warning::
            If you're using the non-blocking mode, Exceptions won't be raised. In that case, call
            `isSuccessful()` after the task is finished, to make sure the download succeeded. Call
            `get_errors()` to get the exceptions.

        :param blocking: If true, calling this function will block the thread until the download finished. If omitted, the value remembered from the previous call/configuration (`_start_func_blocking`) is used.
        :type blocking: bool
        :raises RuntimeError: if the current status is not "ready".
        :raises OSError: if the cookie file cannot be loaded.
        :raises urllib.error.URLError: if the URL cannot be opened and no mirror is left
            (`socket.timeout` is re-raised the same way).
        '''
        if self.status != "ready":
            raise RuntimeError("cannot start (current status is {})".format(
                self.status))
        self.logger.info('Starting a new SmartDL operation.')

        if blocking is None:
            blocking = self._start_func_blocking
        else:
            # remember the choice so a retry on a mirror uses the same mode
            self._start_func_blocking = blocking

        if self.mirrors:
            self.logger.info('One URL and {} mirrors are loaded.'.format(
                len(self.mirrors)))
        else:
            self.logger.info('One URL is loaded.')

        # Skip the download entirely when a verified copy is already on disk.
        if self.verify_hash and os.path.exists(self.dest):
            if utils.get_file_hash(self.hash_algorithm,
                                   self.dest) == self.hash_code:
                self.logger.info(
                    "Destination '%s' already exists, and the hash matches. No need to download."
                    % self.dest)
                self.status = 'finished'
                return

        self.logger.info("Downloading '{}' to '{}'...".format(
            self.url, self.dest))
        req = urllib.request.Request(self.url, **self.requestArgs)
        # set cookie if we passed in cookie file and update the requestArgs to propagate it to the rest of the requests
        if self.cookie_file:
            try:
                cookie = MozillaCookieJar(self.cookie_file)
                cookie.load(ignore_expires=True, ignore_discard=True)
                cookie.add_cookie_header(req)
            except OSError:
                # NOTE: despite the wording of the message the error is re-raised.
                self.logger.error(
                    "Cookie file passed in is invalid, ignoring the cookie")
                raise
            cookie_header = req.get_header("Cookie")
            # The jar may hold no cookie matching this URL; never propagate a
            # ``None`` header value to the worker requests.
            if cookie_header is not None:
                self.requestArgs.setdefault('headers',
                                            {})['Cookie'] = cookie_header
        try:
            urlObj = urllib.request.urlopen(req,
                                            timeout=self.timeout,
                                            context=self.context)
        except (urllib.error.HTTPError, urllib.error.URLError,
                socket.timeout) as e:
            # record the failure exactly once, whichever branch follows
            self.errors.append(e)
            if self.mirrors:
                self.logger.info("{} Trying next mirror...".format(str(e)))
                self.url = self.mirrors.pop(0)
                self.logger.info('Using url "{}"'.format(self.url))
                self.start(blocking)
                return
            self.logger.warning(str(e))
            self._failed = True
            self.status = "finished"
            raise

        # The probe response is only needed for its headers; close it so the
        # connection is not leaked while the worker threads open their own.
        try:
            try:
                self.filesize = int(urlObj.headers["Content-Length"])
                self.logger.info("Content-Length is {} ({}).".format(
                    self.filesize, utils.sizeof_human(self.filesize)))
            except (IndexError, KeyError, TypeError, ValueError):
                # header missing (None -> TypeError) or not a number (ValueError)
                self.logger.warning(
                    "Server did not send Content-Length. Filesize is unknown.")
                self.filesize = 0
        finally:
            urlObj.close()

        args = utils.calc_chunk_size(self.filesize, self.threads_count,
                                     self.minChunkFile)
        bytes_per_thread = args[0][1] - args[0][0] + 1
        if len(args) > 1:
            self.logger.info(
                "Launching {} threads (downloads {}/thread).".format(
                    len(args), utils.sizeof_human(bytes_per_thread)))
        else:
            self.logger.info("Launching 1 thread (downloads {}).".format(
                utils.sizeof_human(bytes_per_thread)))

        self.status = "downloading"

        # one numbered part file per chunk: <dest>.000, <dest>.001, ...
        part_paths = [self.dest + ".%.3d" % i for i in range(len(args))]
        for part_path, arg in zip(part_paths, args):
            self.pool.submit(download, self.url, part_path,
                             self.requestArgs, self.context, arg[0],
                             arg[1], self.timeout, self.shared_var,
                             self.thread_shared_cmds, self.logger)

        self.post_threadpool_thread = threading.Thread(
            target=post_threadpool_actions,
            args=(self.pool, [part_paths, self.dest], self.filesize, self))
        self.post_threadpool_thread.daemon = True
        self.post_threadpool_thread.start()

        self.control_thread = ControlThread(self)

        if blocking:
            self.wait(raise_exceptions=True)
Exemple #3
0
    def try_download_link(
        self, add_token: bool = False, delete_if_successful: bool = False, use_cookies: bool = False
    ) -> bool:
        """This function should only be used for shortcut/URL files.
        It tests whether a URL refers to a file, that is not an HTML web page.
        Then downloads it. Otherwise an attempt will be made to download an HTML video
        from the website; if that attempt fails the page itself is downloaded.

        Args:
            add_token (bool, optional): Adds the ws-token to the url. Defaults to False.
            delete_if_successful (bool, optional): Deletes the file at ``self.file.saved_to``
                once youtube-dl succeeded, or right before the plain download starts.
                Defaults to False.
            use_cookies (bool, optional): Adds the cookies to the requests. Defaults to False.

        Returns:
            bool: If it was successful.

        Raises:
            ValueError: If ``use_cookies`` is set but no ``cookies_path`` option is configured.
        """

        urlToDownload = self.file.content_fileurl
        if add_token:
            urlToDownload = self._add_token_to_url(self.file.content_fileurl)

        cookies_path = None
        if use_cookies:
            cookies_path = self.options.get('cookies_path', None)
            if cookies_path is None:
                self.success = False
                raise ValueError('Moodle Cookies are missing.')

        isHTML = False
        new_filename = ""
        total_bytes_estimate = -1
        request = urllib.request.Request(url=urlToDownload, headers=RequestHelper.stdHeader)
        if use_cookies:
            cookie_jar = MozillaCookieJar(cookies_path)
            cookie_jar.load(ignore_discard=True, ignore_expires=True)
            cookie_jar.add_cookie_header(request)

        # Probe request: only the response headers are inspected here.
        with contextlib.closing(urllib.request.urlopen(request, context=self.ssl_context)) as fp:
            headers = fp.info()

            isHTML = headers.get_content_type() in ('text/html', 'text/plain')

            # Prefer the filename announced by the server, fall back to the URL path.
            url_parsed = urlparse.urlsplit(urlToDownload)
            new_filename = posixpath.basename(url_parsed.path)
            new_filename = headers.get_filename(new_filename)
            total_bytes_estimate = int(headers.get('Content-Length', -1))

        if isHTML:
            tmp_filename = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
            tmp_file = str(Path(self.destination) / tmp_filename)
            ydl_opts = {
                'logger': self.YtLogger(self.thread_id),
                'progress_hooks': [self.yt_hook],
                'outtmpl': (tmp_file + '.%(ext)s'),
                'nocheckcertificate': True,
            }
            if use_cookies:
                ydl_opts.update({'cookiefile': cookies_path})

            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                try:
                    ydl_results = ydl.download([urlToDownload])
                    # A result of 1 is treated as failure: fall through to the plain download.
                    if ydl_results != 1:
                        if delete_if_successful:
                            try:
                                os.remove(self.file.saved_to)
                            except Exception as e:
                                logging.warning(
                                    'T%s - Could not delete %s after youtube-dl was successful. Error: %s',
                                    self.thread_id,
                                    self.file.saved_to,
                                    e,
                                )
                                self.log_exception_extras(e)
                        self.move_tmp_file(tmp_file)
                        self.success = True
                        return True
                except Exception as ydl_error:
                    # Best effort only: any youtube-dl problem falls back to the plain
                    # download below. The URL is not logged, it may carry the token.
                    logging.debug(
                        'T%s - youtube-dl attempt failed, falling back to a plain download. Error: %s',
                        self.thread_id,
                        ydl_error,
                    )

        # generate file extension for modules names
        new_extension = os.path.splitext(new_filename)[1]
        if new_extension == '' and isHTML:
            new_extension = '.html'

        old_extension = os.path.splitext(self.filename)[1]

        if old_extension != new_extension:
            self.filename = self.filename + new_extension

        if delete_if_successful:
            try:
                os.remove(self.file.saved_to)
            except Exception as e:
                # The exception must be bound here: the warning below formats it.
                logging.warning(
                    'T%s - Could not delete %s before download is started. Error: %s',
                    self.thread_id,
                    self.file.saved_to,
                    e,
                )

        self.set_path(True)

        if total_bytes_estimate != -1:
            self.thread_report[self.thread_id]['extra_totalsize'] = total_bytes_estimate

        self.urlretrieve(
            urlToDownload,
            self.file.saved_to,
            context=self.ssl_context,
            reporthook=self.add_progress,
            cookies_path=cookies_path,
        )

        self.file.time_stamp = int(time.time())

        self.success = True
        return True
Exemple #4
0
    class ClientHandler(urllib.request.BaseHandler):
        """urllib handler that decorates requests and post-processes responses.

        Outgoing HTTP(S) requests get the configured headers and the cookies
        from the jar; incoming responses feed their cookies back into the jar
        and have gzip/deflate bodies decoded.
        """

        # Headers every new handler starts with; each instance works on a copy.
        DEFAULT_HEADERS = {
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
            "Accept":
            "text/html, application/xhtml+xml, application/xml; q=0.9, image/webp, */*; q=0.8",
            "Accept-Encoding": "gzip, deflate"
        }

        def __init__(self):
            # Copy the defaults: set_header() must not leak into the class-level
            # dict and thereby into every other handler instance.
            self.headers = dict(self.DEFAULT_HEADERS)
            self.cookies = MozillaCookieJar()

        def load_cookie(self, fpath):
            """Load cookies from the cookie file at ``fpath`` into the jar."""
            self.cookies.load(fpath)

        def save_cookie(self, fpath):
            """Write the jar's cookies to the cookie file at ``fpath``."""
            self.cookies.save(fpath)

        def set_header(self, name, value):
            """Set (or override) a header sent with every request of this handler."""
            self.headers[name] = value

        def set_cookie(self, cookie_string):
            """Parse ``cookie_string`` and store the cookie; falsy parse results are ignored."""
            cookie = self._make_cookie(cookie_string)
            if cookie:
                self.cookies.set_cookie(cookie)

        def _make_cookie(self, cookie_string):
            """Build a cookie object from its string form via ``HttpClient.ClientCookie``."""
            return HttpClient.ClientCookie().make(cookie_string)

        def _request_add_headers(self, request):
            """Attach the handler's headers to ``request``; returns ``self`` for chaining."""
            for name, value in self.headers.items():
                # Registered both as a normal and as an unredirected header.
                request.add_header(name, value)
                request.add_unredirected_header(name, value)
            return self

        def _request_add_cookies(self, request):
            """Attach matching cookies from the jar to ``request``; returns ``self``."""
            self.cookies.add_cookie_header(request)
            return self

        def _response_extract_cookies(self, request, response):
            """Store cookies set by ``response`` in the jar; returns ``self`` for chaining."""
            self.cookies.extract_cookies(response, request)
            return self

        def _response_decompress_content(self, request, response):
            """Return a response whose body is decoded according to Content-Encoding.

            Responses without gzip/deflate encoding are returned unchanged.
            Otherwise the body is read, decoded and wrapped in a new ``addinfourl``
            response, because reassigning ``fp`` on an existing response object
            does not change what its ``read()`` returns on Python 3. The headers
            are passed through untouched.
            """
            # Local imports: these names are not guaranteed to be imported by the module.
            from io import BytesIO
            from urllib.response import addinfourl

            encoding = (response.headers.get("content-encoding", "") or "").strip().lower()
            if encoding not in ("gzip", "deflate"):
                return response

            raw = response.read()
            response.close()
            if not raw:
                # e.g. bodiless responses that still carry the header
                body = b""
            elif encoding == "gzip":
                body = gzip.decompress(raw)
            else:
                try:
                    body = zlib.decompress(raw)
                except zlib.error:
                    # Some servers send a raw deflate stream without the zlib wrapper.
                    body = zlib.decompress(raw, -zlib.MAX_WBITS)

            decoded = addinfourl(BytesIO(body), response.headers,
                                 getattr(response, "url", None),
                                 getattr(response, "code", None))
            # Later handlers (e.g. the HTTP error processor) read ``msg``.
            decoded.msg = getattr(response, "msg", "")
            return decoded

        def http_request(self, request):
            """Pre-process an outgoing request: add headers and cookies."""
            self._request_add_headers(request)
            self._request_add_cookies(request)
            return request

        def http_response(self, request, response):
            """Post-process a response: collect cookies, then decode the body."""
            self._response_extract_cookies(request, response)
            return self._response_decompress_content(request, response)

        https_request = http_request
        https_response = http_response