Example #1
def manuallyDownloadFile(fileUrl, fileToSave):
    isDownOK = False
    downloadingFile = ''

    try:
        if fileUrl:
            # 1. find real address
            #print "fileUrl=",fileUrl;
            resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout'])
            #print "resp=",resp;
            realUrl = resp.geturl()
            # may differ from the original url if the request was redirected

            # if the url is invalid, the timeout keeps the call from hanging forever
            respHtml = getUrlRespHtml(realUrl,
                                      useGzip=False,
                                      timeout=gConst['defaultTimeout'])

            isDownOK = saveBinDataToFile(respHtml, fileToSave)
        else:
            print "Input download file url is NULL"
    except urllib.ContentTooShortError:
        isDownOK = False
    except Exception:
        isDownOK = False

    return isDownOK
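
A minimal usage sketch for this helper; getUrlRespHtml, saveBinDataToFile and gConst are assumed to be defined in the same Python 2 module, with urllib and urllib2 imported at its top:

# illustrative call only; the url and the local filename are placeholders
if manuallyDownloadFile("http://example.com/some.zip", "some.zip"):
    print "download succeeded"
else:
    print "download failed"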
Example #2
    def urlretrieve(url: str, filename: str,
                    context: ssl.SSLContext, reporthook=None):
        """
        original source:
        https://github.com/python/cpython/blob/
        21bee0bd71e1ad270274499f9f58194ebb52e236/Lib/urllib/request.py#L229

        Because urlopen also supports context,
        I decided to adapt the download function.
        """
        url_parsed = urlparse.urlparse(url)

        with contextlib.closing(urllib.request.urlopen(url,
                                                       context=context)) as fp:
            headers = fp.info()

            # Just return the local path and the "headers" for file://
            # URLs. No sense in performing a copy unless requested.
            if url_parsed.scheme == "file" and not filename:
                return os.path.normpath(url_parsed.path), headers

            if not filename:
                raise RuntimeError("No filename specified!")

            tfp = open(filename, 'wb')

            with tfp:
                result = filename, headers

                # total bytes read so far
                read = 0

                # read 8 KB at a time
                bs = 1024 * 8
                blocknum = 0

                # guess size
                size = -1
                if "content-length" in headers:
                    size = int(headers["Content-Length"])

                if reporthook:
                    reporthook(blocknum, bs, size)

                while True:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)

        if size >= 0 and read < size:
            raise urllib.error.ContentTooShortError(
                "retrieval incomplete: got only %i out of %i bytes"
                % (read, size), result)

        return result
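
A sketch of how this Python 3 variant might be driven, assuming the function and its imports (contextlib, os, urllib.request, plus a urlparse alias for urllib.parse) are in scope; the progress hook and URL below are illustrative:

import ssl

def progress(blocknum, bs, size):
    # size is -1 when the server sends no Content-Length header
    if size > 0:
        print("%d/%d bytes" % (min(blocknum * bs, size), size))

ctx = ssl.create_default_context()  # verifies certificates by default
filename, headers = urlretrieve("https://example.com/data.bin", "data.bin",
                                context=ctx, reporthook=progress)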
Example #3
def downloadFile(fileUrl, fileToSave, needReport=False):
    isDownOK = False
    downloadingFile = ''

    #---------------------------------------------------------------------------
    # note: totalFileSize may be -1 on older FTP servers that do not return a file size in response to a retrieval request
    def reportHook(copiedBlocks, blockSize, totalFileSize):
        #global downloadingFile
        if copiedBlocks == 0:  # first call: fires once when the network connection is established
            print 'Begin to download %s, total size=%d' % (downloadingFile,
                                                           totalFileSize)
        else:  # subsequent calls: fire once after each block is read
            print 'Downloaded bytes: %d' % (blockSize * copiedBlocks)
        return

    #---------------------------------------------------------------------------

    try:
        if fileUrl:
            downloadingFile = fileUrl
            if needReport:
                urllib.urlretrieve(fileUrl, fileToSave, reportHook)
            else:
                urllib.urlretrieve(fileUrl, fileToSave)
            isDownOK = True
        else:
            print "Input download file url is NULL"
    except urllib.ContentTooShortError:
        isDownOK = False
    except Exception:
        isDownOK = False

    return isDownOK
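
A hedged usage sketch (Python 2): with needReport=True, the nested reportHook above prints one line per 8 KB block:

if downloadFile("http://example.com/big.iso", "big.iso", needReport=True):
    print "saved big.iso"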
Example #4
def urlretrieve(url, response, timeouts=None):
    ''' download to temp file. mostly copied from urllib.py '''
    timeouts = timeouts or [300, 300, 300]  # default to wait 5 minutes 3 times

    # tmp filename
    import tempfile
    path = urlparse.urlsplit(url).path
    suffix = os.path.splitext(path)[1]
    (fd, tmp_filename) = tempfile.mkstemp(suffix)

    # write
    headers = response.headers.dict
    tmpfile = os.fdopen(fd, 'wb')
    try:
        block_size = 1024 * 8
        read = 0
        size = -1
        if "content-length" in headers:
            size = int(headers["content-length"])
        while True:
            last_exception = None
            for timeout in timeouts:
                # download block
                try:
                    block = response.read(block_size)
                    break
                # timeout
                except IOError as e:
                    last_exception = e
                    continue
                except httplib.HTTPException as e:
                    last_exception = e
                    continue
            # all retries failed
            else:
                raise last_exception

            if block == "":
                break
            read += len(block)
            tmpfile.write(block)
    finally:
        tmpfile.close()
    if size >= 0 and read < size:
        raise urllib.ContentTooShortError("retrieval incomplete: got only %i out of %i bytes" \
                                           % (read, size), (tmp_filename, headers))

    return tmp_filename
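
This variant expects an already-opened response object; a plausible Python 2 call with urllib2 is sketched below (response.headers.dict matches urllib2's rfc822-style header object, but treat the whole call as an assumption):

import urllib2

url = "http://example.com/archive.tar.gz"
resp = urllib2.urlopen(url, timeout=300)
tmp_path = urlretrieve(url, resp)  # retries each block up to 3 times
print "downloaded to", tmp_path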
Example #5
def manuallyDownloadFile(fileUrl, fileToSave, headerDict):
    isDownOK = False
    downloadingFile = ''

    try:
        if fileUrl:
            respHtml = getUrlRespHtml(fileUrl,
                                      headerDict=headerDict,
                                      useGzip=False,
                                      timeout=gConst['defaultTimeout'])
            if respHtml:
                isDownOK = saveBinDataToFile(respHtml, fileToSave)
        else:
            print "Input download file url is NULL"
    except urllib.ContentTooShortError:
        isDownOK = False
    except Exception:
        isDownOK = False

    return isDownOK
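
An illustrative call; headerDict lets the caller pass extra request headers (e.g. a Referer or Cookie) through to getUrlRespHtml:

headerDict = {"Referer": "http://example.com/gallery/"}
manuallyDownloadFile("http://example.com/img.jpg", "img.jpg", headerDict)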
Example #6
def download(url, local, **kwargs):
    if not local:
        raise ValueError('local filepath is empty')
    try:
        if not os.path.exists(os.path.dirname(local)):
            os.makedirs(os.path.dirname(local))
        res = Request(url, **kwargs)
        read_size = 0
        real_size = int(res.header['content-length'])
        with open(local, 'wb') as f:
            while True:
                block = res.response.read(1024*8)
                if not block:
                    break
                f.write(block)
                read_size += len(block)
        if read_size < real_size:
            raise urllib.ContentTooShortError(
                'retrieval incomplete: got only {} out of {} bytes'.format(read_size, real_size),
                None
                )
    except Exception:
        # re-raise unchanged; a bare raise preserves the original traceback
        raise
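
A sketch of a call; Request is assumed to be this codebase's own wrapper exposing .header (parsed response headers) and .response (a file-like body), as the snippet suggests, and urllib here is the Python 2 module:

try:
    download("http://example.com/pkg.tar.gz", "/tmp/downloads/pkg.tar.gz")
except urllib.ContentTooShortError:
    print "incomplete download; consider retrying"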
Example #7
def mirror_modis_dates_html(base_url, mirror_dir, use_wget=False):
    """
    Download all MODIS date listing pages to a local directory.
    Usually, a MODIS listing for a date should not change (only new dates
    should be added), so there should be no need to re-download.
    """
    ndownloads = 0
    dates_urls = collect_all_dates_pages(base_url)
    utils.mkdir_p(mirror_dir)
    for date, url in dates_urls:
        fname = os.path.join(mirror_dir, date + '.html')
        if not os.path.exists(fname):
            print 'Downloading ', fname
            if use_wget:
                subprocess.check_call('/usr/bin/wget %s -O %s' % (url, fname),
                                      shell=True)
            else:
                urllib.urlretrieve(url, fname)
            ndownloads += 1
            # The MODIS MOLT repository server doesn't return Content-Length
            # so urllib cannot tell if it downloaded the whole html or was
            # just disconnected, which could lead to incomplete HTML being
            # downloaded. So we check if the downloaded file ends with </html>
            with open(fname, 'r') as f:
                # seek 10 bytes from the end
                f.seek(-10, 2)
                line = f.read(10)
                if "</html>" not in line:
                    raise urllib.ContentTooShortError(
                        "Couldn't find </html> in downloaded file, probably a partial download", ""
                    )

            # Just avoid firing requests as fast as possible
            time.sleep(0.1)

    return ndownloads > 0
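
An illustrative invocation; the base URL is a placeholder for a real MODIS listing page:

base_url = "http://example.com/MOLT/MOD13Q1.006/"  # placeholder listing URL
if mirror_modis_dates_html(base_url, "modis_mirror"):
    print "new date pages were downloaded"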
Example #8
    def retrieve_resume(self, url, filename, reporthook=None, data=None):
        """Download files from an URL; return (headers, real_url)

        Resumes a download if the local filename exists and
        the server supports download resuming.
        """

        current_size = 0
        tfp = None
        if os.path.exists(filename):
            try:
                current_size = os.path.getsize(filename)
                tfp = open(filename, 'ab')
                # if the file exists, only download the remainder
                if current_size > 0:
                    self.addheader('Range', 'bytes=%s-' % (current_size))
            except:
                logger.warn('Cannot resume download: %s',
                            filename,
                            exc_info=True)
                tfp = None
                current_size = 0

        if tfp is None:
            tfp = open(filename, 'wb')

        # Fix a problem with bad URLs that are not encoded correctly (bug 549)
        url = url.decode('ascii', 'ignore')
        url = url.translate(self.ESCAPE_CHARS)
        url = url.encode('ascii')

        url = urllib.unwrap(urllib.toBytes(url))
        fp = self.open(url, data)
        headers = fp.info()

        if current_size > 0:
            # We told the server to resume - check whether it agreed
            # See RFC2616 (206 Partial Content + Section 14.16)
            # XXX check status code here, too...
            range = ContentRange.parse(headers.get('content-range', ''))
            if range is None or range.start != current_size:
                # Ok, that did not work. Reset the download
                # TODO: seek and truncate if content-range differs from request
                tfp.close()
                tfp = open(filename, 'wb')
                current_size = 0
                logger.warn('Cannot resume: Invalid Content-Range (RFC2616).')

        result = headers, fp.geturl()
        bs = 1024 * 8
        size = -1
        read = current_size
        blocknum = int(current_size / bs)
        if reporthook:
            if "content-length" in headers:
                size = int(
                    headers.getrawheader("Content-Length")) + current_size
            reporthook(blocknum, bs, size)
        while read < size or size == -1:
            if size == -1:
                block = fp.read(bs)
            else:
                block = fp.read(min(size - read, bs))
            if block == "":
                break
            read += len(block)
            tfp.write(block)
            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise urllib.ContentTooShortError(
                "retrieval incomplete: got only %i out "
                "of %i bytes" % (read, size), result)

        return result
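
retrieve_resume relies on self.addheader and self.open, so it is presumably a method of a urllib.FancyURLopener subclass; a hedged sketch of a resumable download, with a hypothetical class name:

opener = ResumingURLopener()  # hypothetical subclass defining retrieve_resume
headers, real_url = opener.retrieve_resume(
    "http://example.com/episode.mp3", "episode.mp3")
print "final URL:", real_url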
Example #9
  def retrieve(self, url, filename=None, reporthook=None, data=None):
    """ Retrieves data from the given url and returns a tuple of filename and headers

    Args:
      url (str): url of the data to be retrieved
      filename (str, optional): filename from the url to download
      reporthook (function, optional): callback invoked during the download, e.g. to keep a UI updated with the current state
      data (str, optional): POST data to send with the request

    Returns:
      result: (filename, headers)

    See Also:
        urllib.URLopener
    """
    self._canceled = False
    url = urllib.unwrap(urllib.toBytes(url))
    if self.tempcache and url in self.tempcache:
      return self.tempcache[url]
    type, url1 = urllib.splittype(url)
    if filename is None and (not type or type == 'file'):
      try:
        fp = self.open_local_file(url1)
        hdrs = fp.info()
        fp.close()
        return urllib.url2pathname(urllib.splithost(url1)[1]), hdrs
      except IOError:
        pass
    fp = self.open(url, data)
    try:
      headers = fp.info()
      if filename:
        tfp = open(filename, 'wb')
      else:
        import tempfile
        garbage, path = urllib.splittype(url)
        garbage, path = urllib.splithost(path or "")
        path, garbage = urllib.splitquery(path or "")
        path, garbage = urllib.splitattr(path or "")
        suffix = os.path.splitext(path)[1]
        (fd, filename) = tempfile.mkstemp(suffix)
        self.__tempfiles.append(filename)
        tfp = os.fdopen(fd, 'wb')
      try:
        result = filename, headers
        if self.tempcache is not None:
          self.tempcache[url] = result
        bs = 1024 * 8
        size = -1
        read = 0
        blocknum = 0
        if "content-length" in headers:
          size = int(headers["Content-Length"])
        if reporthook:
          reporthook(blocknum, bs, size)
        while not self._canceled:
          block = fp.read(bs)
          if block == "":
            break
          read += len(block)
          tfp.write(block)
          blocknum += 1
          if reporthook:
            reporthook(blocknum, bs, size)
      finally:
        tfp.close()
    finally:
      fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
      raise urllib.ContentTooShortError("retrieval incomplete: got only %i out "
                                 "of %i bytes" % (read, size), result)

    if self._canceled and os.path.exists(filename):
      os.remove(filename)
    return result
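
A sketch for this cancelable retrieve; _canceled is meant to be flipped from another thread (e.g. a UI cancel button), and the class name is hypothetical:

opener = CancelableOpener()  # hypothetical urllib.URLopener subclass
filename, headers = opener.retrieve("http://example.com/file.bin", "file.bin")
# another thread may set opener._canceled = True to stop the copy loop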
Example #10
class MyFancyUrlopener(urllib.FancyURLopener):
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = urllib.unwrap(urllib.toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = urllib.splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return urllib.url2pathname(urllib.splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            code = fp.code
            if filename:
                tfp = open(filename, 'wb')
            else:
                import tempfile
                garbage, path = urllib.splittype(url)
                garbage, path = urllib.splithost(path or "")
                path, garbage = urllib.splitquery(path or "")
                path, garbage = urllib.splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                # "self.__tempfiles" would be name-mangled inside this subclass,
                # so reference the base URLopener's private list explicitly
                self._URLopener__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers, code
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024 * 8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if block == "":
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise urllib.ContentTooShortError(
                "retrieval incomplete: got only %i out "
                "of %i bytes" % (read, size), result)

        return result
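
Unlike the stock FancyURLopener.retrieve, this override also returns the HTTP status code; an illustrative call:

opener = MyFancyUrlopener()
filename, headers, code = opener.retrieve("http://example.com/page.html",
                                          "page.html")
if code == 200:
    print "saved to", filename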
Example #11
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        # overridden method from urllib.URLopener
        self._cancelDownload = False
        url = urllib.unwrap(urllib.toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = urllib.splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return urllib.url2pathname(urllib.splithost(url1)[1]), hdrs
            except IOError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                import tempfile
                garbage, path = urllib.splittype(url)
                garbage, path = urllib.splithost(path or "")
                path, garbage = urllib.splitquery(path or "")
                path, garbage = urllib.splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024 * 8
                size = -1
                read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while not self._cancelDownload:
                    block = fp.read(bs)
                    if block == "":
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise urllib.ContentTooShortError(
                "retrieval incomplete: got only %i out "
                "of %i bytes" % (read, size), result)

        if self._cancelDownload and os.path.exists(filename):
            os.remove(filename)
            self.wasCanceled = True
        return result
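
A final hedged sketch: this override deletes the partial file and sets wasCanceled when _cancelDownload is flipped mid-transfer (the class name is hypothetical):

opener = CancelableURLopener()  # hypothetical subclass containing the method
result = opener.retrieve("http://example.com/video.mp4", "video.mp4")
# another thread may set opener._cancelDownload = True to abort the download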