Example #1
0
    def urlretrieve(url: str, filename: str, context: ssl.SSLContext, reporthook=None, cookies_path=None):
        """Download `url` into `filename` over a connection built from `context`.

        Adapted from urllib.request.urlretrieve so that an ssl.SSLContext
        and a Mozilla-format cookie file can be supplied:
        https://github.com/python/cpython/blob/
        21bee0bd71e1ad270274499f9f58194ebb52e236/Lib/urllib/request.py#L229
        """
        parsed = urlparse.urlparse(url)

        request = urllib.request.Request(url=url, headers=RequestHelper.stdHeader)
        if cookies_path is not None:
            jar = MozillaCookieJar(cookies_path)
            jar.load(ignore_discard=True, ignore_expires=True)
            jar.add_cookie_header(request)

        with contextlib.closing(urllib.request.urlopen(request, context=context)) as response:
            headers = response.info()

            # file:// URLs without a target just report the local path —
            # no point copying the file onto itself.
            if parsed.scheme == 'file' and not filename:
                return os.path.normpath(parsed.path), headers

            if not filename:
                raise RuntimeError('No filename specified!')

            chunk_size = 1024 * 8
            bytes_read = 0
            chunk_index = 0
            # Content-Length may be absent; -1 disables the completeness check.
            expected = int(headers.get('Content-Length', -1))
            result = filename, headers

            with open(filename, 'wb') as out:
                if reporthook:
                    reporthook(chunk_index, chunk_size, expected)

                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    bytes_read += len(chunk)
                    out.write(chunk)
                    chunk_index += 1
                    if reporthook:
                        reporthook(chunk_index, chunk_size, expected)

        if expected >= 0 and bytes_read < expected:
            raise ContentTooShortError('retrieval incomplete: got only %i out of %i bytes' % (bytes_read, expected), result)

        return result
Example #2
0
    def retrieve(self, url, filename, reporthook=None, data=None):
        """Download *url* into *filename* via ``self.opener``.

        *reporthook*, when given, is called as (blocknum, blocksize, total)
        after each chunk.  Returns ``(filename, headers)``; raises
        ContentTooShortError if fewer bytes than Content-Length arrive.
        """
        fp = self.opener.open(url, data)
        try:
            headers = fp.info()
            result = filename, headers
            bs = 1024 * 8
            read = 0
            blocknum = 0
            # Bug fix: the expected size used to be computed only inside
            # `if reporthook:`, so the completeness check at the bottom
            # could never fire for calls without a reporthook.
            size = -1
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            with open(filename, 'wb') as tfp:
                while True:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
        finally:
            fp.close()
        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                _("retrieval incomplete: got only %i out of %i bytes") %
                (read, size), result)

        return result
Example #3
0
    def retrieve(self,
                 url,
                 filename=None,
                 filestream=None,
                 reporthook=None,
                 data=None):
        """Download *url* into *filename* (a path) or *filestream* (an open
        binary file object), transparently inflating gzip responses.

        Returns ``(filename, headers, initial bytes)`` — the first block is
        kept so callers can detect logon/redirect pages in the payload.
        Raises ValueError when neither destination is given (previously this
        crashed with an UnboundLocalError on ``tfp``).
        """
        if not filename and not filestream:
            raise ValueError("either filename or filestream is required")
        headers = None
        initialBytes = b''
        fp = self.opener.open(url, data, timeout=self.timeout)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                tfp = filestream
            try:
                result = filename, headers
                bs = 1024 * 8
                read = 0  # raw (wire) bytes, compared to Content-Length below
                blocknum = 0
                # Bug fix: size was previously read only when a reporthook
                # was supplied, so the completeness check never fired for
                # plain calls.
                size = -1
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                isGzipped = "gzip" in headers.get("content-encoding", "")
                if isGzipped:
                    # 16 + MAX_WBITS tells zlib to expect a gzip header
                    decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    # Bug fix: count raw bytes before decompression —
                    # Content-Length describes the compressed stream, not
                    # the inflated payload.
                    read += len(block)
                    if isGzipped:
                        block = decompressor.decompress(block)
                    tfp.write(block)
                    if blocknum == 0:
                        initialBytes = block
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
                if isGzipped:
                    # Bug fix: flush any data still buffered in the
                    # decompressor, otherwise the tail of the file is lost.
                    tfp.write(decompressor.flush())
            finally:
                if filename:
                    tfp.close()
        finally:
            if fp:
                fp.close()
        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                _("retrieval incomplete: got only %i out of %i bytes") %
                (read, size), result)

        if filestream:
            tfp.seek(0)
        return filename, headers, initialBytes
Example #4
0
    def retrieve(self,
                 url,
                 filename=None,
                 filestream=None,
                 reporthook=None,
                 data=None):
        """Download *url* into *filename* (a path) or *filestream* (an open
        binary file object).

        Returns ``(filename, headers, initial bytes)`` — the first block is
        kept so callers can detect logon/redirect pages in the payload.
        Raises ValueError when neither destination is given (previously this
        crashed with an UnboundLocalError on ``tfp``).
        """
        if not filename and not filestream:
            raise ValueError("either filename or filestream is required")
        headers = None
        initialBytes = b''
        fp = self.opener.open(url, data, timeout=self.timeout)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                tfp = filestream
            try:
                result = filename, headers
                bs = 1024 * 8
                read = 0
                blocknum = 0
                # Bug fix: size was previously read only when a reporthook
                # was supplied, so the completeness check below never fired
                # for plain calls.
                size = -1
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    if blocknum == 0:
                        initialBytes = block
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                if filename:
                    tfp.close()
        finally:
            if fp:
                fp.close()
        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                _("retrieval incomplete: got only %i out of %i bytes") %
                (read, size), result)

        if filestream:
            tfp.seek(0)
        return filename, headers, initialBytes
Example #5
0
def retrieve_file(url: URL) -> Tuple[TextIO, str]:
    """
    Fetch a file from a URL (handling SSL).

    This is based off `urllib.request.urlretrieve`.

    Returns a seekable temporary file (opened in binary mode, despite the
    TextIO annotation — TODO confirm and fix the annotation with callers)
    plus the response content type.

    Raises InvalidConfiguration for non-HTTP(S) schemes and
    ContentTooShortError when fewer bytes than Content-Length arrive.
    """
    if url.scheme not in ("http", "https"):
        raise InvalidConfiguration("Illegal scheme.")

    # NOTE(review): a bare SSLContext performs no certificate validation;
    # ssl.create_default_context() would verify peers — confirm intent
    # before changing, as it would reject self-signed servers.
    context = (ssl.SSLContext(
        protocol=ssl.PROTOCOL_TLSv1_2) if url.scheme == "https" else None)

    with contextlib.closing(
            urlopen(url,
                    context=context)  # nosec - There is a check above for SSL
    ) as response:
        block_size = 1024 * 8
        size = -1
        read = 0

        headers = response.info()
        if "Content-Length" in headers:
            size = int(headers["Content-Length"])

        if "Content-Type" in headers:
            content_type = headers["Content-Type"]
        else:
            content_type = content_type_from_url(url)

        tfp = tempfile.TemporaryFile()
        try:
            while True:
                block = response.read(block_size)
                if not block:
                    break
                read += len(block)
                tfp.write(block)

            # Seek to start
            tfp.seek(0)
        except Exception:
            # Bug fix: don't leak the temporary file handle when the
            # download aborts part-way through.
            tfp.close()
            raise

    if size >= 0 and read < size:
        tfp.close()

        raise ContentTooShortError(
            f"retrieval incomplete: got only {read} out of {size} bytes",
            headers)

    return tfp, content_type
Example #6
0
def url_retrieve_with_headers(url, filename=None, headers=None, reporthook=None):
    """Download *url* to *filename*, sending optional extra request headers.

    Modeled on urllib.request.urlretrieve; *reporthook* receives
    (block number, block size, total size) after each chunk.
    Returns ``(filename, response headers)``; for ``file://`` URLs with no
    filename the local path is returned without copying.
    """
    # urllib.parse.splittype() is deprecated (since Python 3.8);
    # urlparse() yields the same scheme/path split for our purposes.
    parsed = urllib.parse.urlparse(url)
    url_type, path = parsed.scheme, parsed.path
    opener = urllib.request.build_opener()
    if headers:
        opener.addheaders = list(headers.items())
    with contextlib.closing(opener.open(url)) as fp:
        headers = fp.info()

        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        if not filename:
            # Bug fix: open(None) below used to raise a confusing TypeError.
            raise ValueError("filename is required for non-file URLs")

        with open(filename, 'wb') as tfp:
            result = filename, headers
            bs = 1024*8
            size = -1
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])

            if reporthook:
                reporthook(blocknum, bs, size)

            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)

    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)

    return result
Example #7
0
def fetch_binary(src, dst, name):
    """Download tarball *src* into directory *dst* and extract member *name*.

    Skips the download entirely when the extracted target already exists.
    Returns the path of the extracted file.
    """
    tarball = os.path.join(dst, os.path.basename(src))
    target = os.path.join(dst, f'{name}')
    try:
        os.stat(target)  # short circuit to avoid redownloading tarballs
    except FileNotFoundError:
        pass
    else:
        return target

    try:
        urlretrieve(url=src, filename=tarball, reporthook=show_progress)
    except ContentTooShortError as e:
        # Bug fix: ContentTooShortError requires (message, content); the
        # old single-argument re-raise itself crashed with a TypeError.
        raise ContentTooShortError(f"failed to download {src}: {e}", e.content) from e

    with tarfile.open(tarball) as tar:
        tar.extract(member=name, path=dst)
        # the context manager closes the archive; explicit close() removed
    return target
Example #8
0
File: get.py Project: ye40/MiniUrl
def download_func(url,
                  filename=None,
                  ip=None,
                  headers=None,
                  reporthook=None,
                  data=None):
    """Download *url*, optionally through a proxy and with extra headers.

    Adapted from urllib.request.urlretrieve with two additions:
        1. an HTTP/HTTPS/SOCKS proxy and custom request headers
        2. a progress callback

    Args:
        url: link of the file to download
        filename: destination path; a NamedTemporaryFile is used when omitted
        ip: proxy address as an "ip:port" string
        headers: extra request headers (dict), default None
        reporthook: progress callback called as (downloaded, total, start_time)
        data: payload to send to the server
    Returns:
        (filename, response headers)
    """
    # urllib.parse.splittype() is deprecated; urlparse() gives the same split.
    parsed = urllib.parse.urlparse(url)
    url_type, path = parsed.scheme, parsed.path
    req_obj = Request(url)

    # Bug fix: this helper was previously defined only inside `if headers:`
    # but is also needed in the missing-Content-Length fallback below.
    def addheaders(headers, req_obj):
        for key, value in headers.items():
            req_obj.add_header(key, value)  # attach each header
        return req_obj

    # Bug fix: `req` used to be assigned only when headers were given,
    # so calls without headers crashed with a NameError at urlopen().
    req = req_obj
    if headers:
        req = addheaders(headers, req_obj)

    # install a proxy opener when a proxy address is supplied
    if ip:
        proxies = {"http": ip, "https": ip, "socks": ip}
        proxy_support = ProxyHandler(proxies)
        opener = build_opener(proxy_support)
        install_opener(opener)

    # perform the download via urlopen
    with contextlib.closing(urlopen(req, data)) as fp:
        headers = fp.info()

        # local file:// URLs are returned directly, without copying
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        # write to the given path, or to a temporary file otherwise
        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name
            url_tempfile.append(filename)
        with tfp:
            result = filename, headers
            bs = 1024 * 20  # chunk size
            size = -1  # total size
            downsize = 0  # bytes downloaded so far

            if "content-length" in headers:
                size = int(headers["Content-Length"])
            else:
                # Some responses omit Content-Length; retry the request with
                # Accept-Encoding disabled to obtain the total size.
                headers = {
                    'Accept-Encoding': 'None'
                }
                new_req = addheaders(headers, req_obj)
                size = int(urlopen(new_req, data).info()["Content-Length"])

            if reporthook:
                start_time = time.time()  # baseline for speed reporting
                reporthook(downsize, size, start_time)

            # read until the stream is exhausted
            while True:
                block = fp.read(bs)
                if not block:
                    break
                downsize += len(block)
                tfp.write(block)
                if reporthook:
                    reporthook(downsize, size, start_time)
    if size >= 0 and downsize < size:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes" %
            (downsize, size), result)

    return result
Example #9
0
def _urlretrieve(url,
                 fname=None,
                 dir_prefix=".",
                 headers=None,
                 content_disposition=False,
                 blocksize=1024 * 8,
                 timeout=None,
                 progressbar=True,
                 reporthook=None,
                 file_hash=None,
                 hash_algorithm="auto",
                 force_download=False):
    """
    A more advance version of urllib.request.urlretrieve with support of progress bars,
    automatic file name, cache and file hash

    Args:
        url: URL to download.
        fname: target file name; None (derive from response/URL) or a
            callable receiving the open response.
        dir_prefix: directory for relative file names ("~" is expanded).
        headers: extra request headers (dict).
        content_disposition: defer naming to the response headers.
        blocksize: read chunk size in bytes.
        timeout: socket timeout passed to urlopen.
        progressbar: show a tqdm progress bar.
        reporthook: urlretrieve-style callback (blocknum, blocksize, total).
        file_hash / hash_algorithm: expected hash used to validate cached files.
        force_download: re-download even when a valid cached file exists.

    Returns the final file path; raises ContentTooShortError when fewer
    bytes than Content-Length were received.
    """
    if headers is None:
        headers = {}

    dir_prefix = os.path.expanduser(dir_prefix)

    if fname is None and not content_disposition:
        fname = filename_from_url(url)

    # Check if file already exists before doing any request
    if fname is not None and os.path.exists(os.path.join(
            dir_prefix, fname)) and not force_download:
        if file_hash is not None and not validate_file(
                os.path.join(dir_prefix, fname), file_hash, hash_algorithm):
            _warn_about_different_hash(file_hash, hash_algorithm)
        else:
            return os.path.join(dir_prefix, fname)

    request = Request(url, headers=headers)

    with urlopen(request, timeout=timeout) as response:
        headers = response.info()

        if callable(fname):
            fname = fname(response)

        if fname is None:
            fname = headers.get_filename()

        if fname is None:
            fname = filename_from_url(url)

        if os.path.isabs(fname):
            file_path = fname
        else:
            os.makedirs(dir_prefix, exist_ok=True)
            file_path = os.path.join(dir_prefix, fname)

        _, extension = splitext(fname)

        if not extension:
            extension = guess_extension(headers.get_content_type() or "")
            if extension is not None:
                file_path += extension

        # Re-check the cache against the final (possibly extended) path.
        if os.path.exists(file_path) and not force_download:
            if file_hash is not None and not validate_file(
                    file_path, file_hash, hash_algorithm):
                _warn_about_different_hash(file_hash, hash_algorithm)
            else:
                return file_path

        content_length = int(headers.get("Content-Length", -1))

        blocknum = 0
        bytes_read = 0

        # download to a sibling ".download" file, renamed on success, so a
        # partial download never shadows a complete one
        download_file_path = file_path + ".download"

        with open(download_file_path,
                  "wb") as fp, tqdm(total=content_length,
                                    unit='B',
                                    unit_scale=True,
                                    miniters=1,
                                    unit_divisor=1024,
                                    desc="Downloading {}...".format(fname),
                                    disable=not progressbar) as pbar:
            while True:
                block = response.read(blocksize)
                if not block:
                    break

                fp.write(block)

                blocknum += 1
                bytes_read += len(block)

                if pbar is not None:
                    # Bug fix: advance by the bytes actually read; the last
                    # block is usually shorter than `blocksize`, which made
                    # the bar overshoot its total.
                    pbar.update(len(block))

                if reporthook is not None:
                    reporthook(blocknum, blocksize, content_length)

    if content_length >= 0 and bytes_read < content_length:
        error_msg = "retrieval incomplete: got only {} out of {} bytes".format(
            bytes_read, content_length)
        raise ContentTooShortError(error_msg, (download_file_path, headers))

    os.rename(download_file_path, file_path)

    return file_path
Example #10
0
 def http_open(self, req):
     """Provide http_open to raise exception."""
     # Always fails: simulates a truncated HTTP response so callers can
     # exercise their ContentTooShortError handling.  CALENDAR_DATA is
     # passed as the error's `content` — presumably the partial payload
     # the test expects; confirm against where CALENDAR_DATA is defined.
     raise ContentTooShortError("Expected 1000 bytes", CALENDAR_DATA)