def manuallyDownloadFile(fileUrl, fileToSave): isDownOK = False downloadingFile = '' try: if fileUrl: # 1. find real address #print "fileUrl=",fileUrl; resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']) #print "resp=",resp; realUrl = resp.geturl() # not same with original file url if redirect # if url is invalid, then add timeout can avoid dead respHtml = getUrlRespHtml(realUrl, useGzip=False, timeout=gConst['defaultTimeout']) isDownOK = saveBinDataToFile(respHtml, fileToSave) else: print "Input download file url is NULL" except urllib.ContentTooShortError(msg): isDownOK = False except: isDownOK = False return isDownOK
def urlretrieve(url: str, filename: str, context: ssl.SSLContext, reporthook=None):
    """Download *url* into *filename* using the given SSL *context*.

    original source: https://github.com/python/cpython/blob/
    21bee0bd71e1ad270274499f9f58194ebb52e236/Lib/urllib/request.py#L229
    Because urlopen also supports context, I decided to adapt the download
    function.

    Args:
        url        : the URL to fetch
        filename   : local path to write to; for file:// URLs it may be
                     empty, in which case the local path itself is returned
        context    : ssl.SSLContext passed through to urlopen
        reporthook : optional callable(blocknum, blocksize, totalsize),
                     called once before the first read and once per block
    Returns:
        (filename, headers) tuple.
    Raises:
        RuntimeError if filename is empty for a non-file URL;
        urllib.error.ContentTooShortError if fewer bytes than the
        Content-Length header promised were received.
    """
    # local imports keep this block self-contained regardless of how the
    # module-level imports alias urllib's submodules
    import urllib.parse
    from urllib.error import ContentTooShortError

    url_parsed = urllib.parse.urlparse(url)
    with contextlib.closing(urllib.request.urlopen(url, context=context)) as fp:
        headers = fp.info()
        # Just return the local path and the "headers" for file://
        # URLs. No sense in performing a copy unless requested.
        if url_parsed.scheme == "file" and not filename:
            return os.path.normpath(url_parsed.path), headers
        if not filename:
            raise RuntimeError("No filename specified!")
        size = -1   # guess size; Content-Length may be absent
        read = 0    # bytes read overall
        bs = 1024 * 8  # 8kb at once
        blocknum = 0
        with open(filename, 'wb') as tfp:
            result = filename, headers
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while True:
                block = fp.read(bs)
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
    if size >= 0 and read < size:
        # fix: was `urllib.ContentTooShortError`, which does not exist in
        # Python 3 (the class lives in urllib.error)
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (read, size), result)
    return result
def downloadFile(fileUrl, fileToSave, needReport=False): isDownOK = False downloadingFile = '' #--------------------------------------------------------------------------- # note: totalFileSize -> may be -1 on older FTP servers which do not return a file size in response to a retrieval request def reportHook(copiedBlocks, blockSize, totalFileSize): #global downloadingFile if copiedBlocks == 0: # 1st call : once on establishment of the network connection print 'Begin to download %s, total size=%d' % (downloadingFile, totalFileSize) else: # rest call : once after each block read thereafter print 'Downloaded bytes: %d' % (blockSize * copiedBlocks) return #--------------------------------------------------------------------------- try: if fileUrl: downloadingFile = fileUrl if needReport: urllib.urlretrieve(fileUrl, fileToSave, reportHook) else: urllib.urlretrieve(fileUrl, fileToSave) isDownOK = True else: print "Input download file url is NULL" except urllib.ContentTooShortError(msg): isDownOK = False except: isDownOK = False return isDownOK
def urlretrieve(url, response, timeouts=None):
    '''
    Download an already-opened *response* to a temp file; mostly copied
    from urllib.py.

    Args:
        url      : original URL (used only to pick a file suffix for the
                   temp file)
        response : an open response object with .read() and .headers
                   (py2 httplib-style; .headers.dict gives a plain dict)
        timeouts : list controlling how many retries each read gets;
                   defaults to 3 entries. NOTE(review): the `timeout`
                   value itself is never applied to the read -- only the
                   list length (retry count) matters. Confirm intent.
    Returns:
        path of the temp file holding the downloaded data.
    Raises:
        urllib.ContentTooShortError if fewer bytes than Content-Length
        were received; the last IOError/HTTPException if all retries of
        a single read fail.
        NOTE(review): on such a failure the temp file is closed but not
        deleted, so partial files accumulate in the temp dir.
    '''
    timeouts = timeouts or [300, 300, 300]  # default to wait 5 minutes 3 times
    # build a temp filename that keeps the URL's file extension
    import tempfile
    path = urlparse.urlsplit(url).path
    suffix = os.path.splitext(path)[1]
    (fd, tmp_filename) = tempfile.mkstemp(suffix)
    # write the body out in blocks
    headers = response.headers.dict
    tmpfile = os.fdopen(fd, 'wb')
    try:
        block_size = 1024 * 8
        read = 0
        size = -1  # -1 means "unknown" when no Content-Length header
        if "content-length" in headers:
            size = int(headers["content-length"])
        while True:
            last_exception = None
            for timeout in timeouts:
                # download one block; break out of the retry loop on success
                try:
                    block = response.read(block_size)
                    break
                # transient read failure -> retry
                except IOError as e:
                    last_exception = e
                    continue
                except httplib.HTTPException as e:
                    last_exception = e
                    continue
            # for/else: all retries failed, re-raise the last error
            else:
                raise last_exception
            if block == "":
                # EOF (py2: read() returns a str)
                break
            read += len(block)
            tmpfile.write(block)
    finally:
        tmpfile.close()
    # verify we got everything the server promised
    if size >= 0 and read < size:
        raise urllib.ContentTooShortError("retrieval incomplete: got only %i out of %i bytes" \
            % (read, size), (tmp_filename, headers))
    return tmp_filename
def manuallyDownloadFile(fileUrl, fileToSave, headerDict): isDownOK = False downloadingFile = '' try: if fileUrl: respHtml = getUrlRespHtml(fileUrl, headerDict=headerDict, useGzip=False, timeout=gConst['defaultTimeout']) if (respHtml): isDownOK = saveBinDataToFile(respHtml, fileToSave) else: print "Input download file url is NULL" except urllib.ContentTooShortError(msg): isDownOK = False except: isDownOK = False return isDownOK
def download(url, local, **kwargs):
    """Download *url* to the local path *local* in 8kb blocks.

    Args:
        url      : url to download
        local    : local file path to write; parent dirs are created as needed
        **kwargs : passed through to Request()
    Raises:
        ValueError if local is empty;
        urllib.ContentTooShortError if fewer bytes than Content-Length
        were received; any error from Request/IO propagates unchanged.
    """
    if not local:
        raise ValueError('local filepath is empty')
    # fix: os.makedirs('') raises when `local` is a bare filename with no
    # directory component, so only create the dir when there is one
    local_dir = os.path.dirname(local)
    if local_dir and not os.path.exists(local_dir):
        os.makedirs(local_dir)
    res = Request(url, **kwargs)
    read_size = 0
    real_size = int(res.header['content-length'])
    with open(local, 'wb') as f:
        while True:
            block = res.response.read(1024 * 8)
            if not block:
                break
            f.write(block)
            read_size += len(block)
    if read_size < real_size:
        # fix: was `.formate(...)` -- an AttributeError that masked the
        # intended ContentTooShortError
        raise urllib.ContentTooShortError(
            'retrieval incomplete: got only {} out of {} bytes'.format(read_size, real_size),
            None
        )
    # note: the original `except Exception, e: raise e` wrapper was removed --
    # it re-raised everything unchanged while destroying the traceback
def mirror_modis_dates_html(base_url, mirror_dir, use_wget=False): """ Download all MODIS date listing pages to a local directory. Usually, a MODIS listing for a date should not change (only new dates should be added), so there should be no need to re-download. """ ndownloads = 0 dates_urls = collect_all_dates_pages(base_url) utils.mkdir_p(mirror_dir) for date, url in dates_urls: fname = os.path.join(mirror_dir, date + '.html') if not os.path.exists(fname): print 'Downloading ', fname if use_wget: subprocess.check_call('/usr/bin/wget %s -O %s' % (url, fname), shell=True) else: urllib.urlretrieve(url, fname) ndownloads += 1 # The MODIS MOLT repository server doesn't return Content-Length # so urllib cannot tell if it downloaded the whole html or was # just disconnected, which could lead to incomplete HTML being # downloaded. So we check if the downloaded file ends with </html> with open(fname, 'r') as f: # seek 10 bytes from the end f.seek(-10, 2) line = f.read(10) if "</html>" not in line: raise urllib.ContentTooShortError( "Couldn't find </html> in downloaded file, probably a partial download", "" ) # Just avoid firing requests as fast as possible time.sleep(0.1) return ndownloads > 0
def retrieve_resume(self, url, filename, reporthook=None, data=None):
    """Download files from an URL; return (headers, real_url)

    Resumes a download if the local filename exists and the
    server supports download resuming (HTTP Range / 206 Partial
    Content). Falls back to a full re-download when the server
    ignores or mismatches the requested range.

    Args:
        url        : url to fetch (py2 str; re-encoded to plain ascii below)
        filename   : local target path; appended to when resuming
        reporthook : optional callable(blocknum, blocksize, totalsize)
        data       : optional POST data passed to self.open()
    Returns:
        (headers, real_url) tuple.
    Raises:
        urllib.ContentTooShortError when fewer bytes than Content-Length
        were received.
    """
    current_size = 0
    tfp = None
    if os.path.exists(filename):
        try:
            current_size = os.path.getsize(filename)
            # append mode: keep what we already have on disk
            tfp = open(filename, 'ab')
            # If the file exists, then only download the remainder
            if current_size > 0:
                self.addheader('Range', 'bytes=%s-' % (current_size))
        except:
            logger.warn('Cannot resume download: %s', filename, exc_info=True)
            tfp = None
            current_size = 0
    if tfp is None:
        # no usable partial file -> start from scratch
        tfp = open(filename, 'wb')
    # Fix a problem with bad URLs that are not encoded correctly (bug 549):
    # strip non-ascii, then escape via the class's translation table
    url = url.decode('ascii', 'ignore')
    url = url.translate(self.ESCAPE_CHARS)
    url = url.encode('ascii')
    url = urllib.unwrap(urllib.toBytes(url))
    fp = self.open(url, data)
    headers = fp.info()
    if current_size > 0:
        # We told the server to resume - see if she agrees
        # See RFC2616 (206 Partial Content + Section 14.16)
        # XXX check status code here, too...
        range = ContentRange.parse(headers.get('content-range', ''))
        if range is None or range.start != current_size:
            # Ok, that did not work. Reset the download
            # TODO: seek and truncate if content-range differs from request
            tfp.close()
            tfp = open(filename, 'wb')
            current_size = 0
            logger.warn('Cannot resume: Invalid Content-Range (RFC2616).')
    result = headers, fp.geturl()
    bs = 1024 * 8
    size = -1  # -1 means "total size unknown"
    read = current_size
    # continue the block count where the partial download left off
    blocknum = int(current_size / bs)
    if reporthook:
        # NOTE(review): size (and hence the loop bound below) is only
        # computed from Content-Length when a reporthook is given --
        # without one, size stays -1 and the loop runs to EOF. Confirm
        # this asymmetry is intentional.
        if "content-length" in headers:
            size = int(headers.getrawheader("Content-Length")) + current_size
        reporthook(blocknum, bs, size)
    while read < size or size == -1:
        if size == -1:
            block = fp.read(bs)
        else:
            # never read past the expected total
            block = fp.read(min(size - read, bs))
        if block == "":
            # EOF (py2: read() returns a str)
            break
        read += len(block)
        tfp.write(block)
        blocknum += 1
        if reporthook:
            reporthook(blocknum, bs, size)
    fp.close()
    tfp.close()
    del fp
    del tfp
    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise urllib.ContentTooShortError(
            "retrieval incomplete: got only %i out "
            "of %i bytes" % (read, size), result)
    return result
def retrieve(self, url, filename=None, reporthook=None, data=None):
    """
    Retrieves data from the given url and returns a tuple of filename
    and headers. A copy of urllib.URLopener.retrieve extended with
    cooperative cancellation via self._canceled.

    Args:
        url (str): url of the data to be retrieved
        filename (str, optional): local path to download to; when omitted,
            a temp file (suffix taken from the url path) is created and
            tracked in self.__tempfiles
        reporthook (function, optional): called as (blocknum, blocksize,
            totalsize) -- e.g. to keep an UI updated with current state
        data (optional): POST data passed to self.open()

    Returns:
        result: (filename, headers)

    See Also:
        urllib.URLopener
    """
    # reset the cancel flag; another thread may set it to abort the loop below
    self._canceled=False
    url = urllib.unwrap(urllib.toBytes(url))
    # serve from the temp cache when this url was already retrieved
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    type, url1 = urllib.splittype(url)
    # local files need no copy: return the path and its headers directly
    if filename is None and (not type or type == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            fp.close()
            return urllib.url2pathname(urllib.splithost(url1)[1]), hdrs
        except IOError:
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            # no filename given: build a temp file whose suffix matches the
            # url path (strip host, query and attributes first)
            import tempfile
            garbage, path = urllib.splittype(url)
            garbage, path = urllib.splithost(path or "")
            path, garbage = urllib.splitquery(path or "")
            path, garbage = urllib.splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024 * 8
            size = -1  # -1 means "total size unknown"
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            # copy loop; exits early when another thread sets _canceled
            while not self._canceled:
                block = fp.read(bs)
                if block == "":
                    # EOF (py2: read() returns a str)
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()
    # raise exception if actual size does not match content-length header
    # NOTE(review): a canceled download with a Content-Length also trips
    # this check (read < size) before the cleanup below runs -- confirm
    # that is the intended behavior
    if size >= 0 and read < size:
        raise urllib.ContentTooShortError("retrieval incomplete: got only %i out "
                                          "of %i bytes" % (read, size), result)
    # canceled: remove the partial file
    if self._canceled and os.path.exists(filename):
        os.remove(filename)
    return result
class MyFancyUrlopener(urllib.FancyURLopener):
    # FancyURLopener subclass whose retrieve() additionally exposes the
    # HTTP status code: result is (filename, headers, code) instead of the
    # stdlib's (filename, headers).
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers, code) for a local object
        or (tempfilename, headers, code) for a remote object.

        Copied from urllib.URLopener.retrieve with `code = fp.code` added
        to the result tuple.
        """
        url = urllib.unwrap(urllib.toBytes(url))
        # serve from the temp cache when this url was already retrieved
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = urllib.splittype(url)
        # local files need no copy: return the path and its headers directly
        # NOTE(review): this early-return path yields a 2-tuple (no code),
        # unlike the 3-tuple returned below -- confirm callers handle both
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return urllib.url2pathname(urllib.splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            code = fp.code
            if filename:
                tfp = open(filename, 'wb')
            else:
                # no filename given: build a temp file whose suffix matches
                # the url path (strip host, query and attributes first)
                import tempfile
                garbage, path = urllib.splittype(url)
                garbage, path = urllib.splithost(path or "")
                path, garbage = urllib.splitquery(path or "")
                path, garbage = urllib.splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers, code
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024 * 8
                size = -1  # -1 means "total size unknown"
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if block == "":
                        # EOF (py2: read() returns a str)
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()
        del fp
        del tfp
        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise urllib.ContentTooShortError(
                "retrieval incomplete: got only %i out "
                "of %i bytes" % (read, size), result)
        return result
def retrieve(self, url, filename=None, reporthook=None, data=None):
    # overridden method from urllib.URLopener
    # Copy of the stdlib retrieve() extended with cooperative cancellation:
    # another thread sets self._cancelDownload to stop the copy loop, after
    # which the partial file is removed and self.wasCanceled is set.
    # Returns (filename, headers); raises urllib.ContentTooShortError when
    # fewer bytes than Content-Length were received.
    self._cancelDownload = False
    url = urllib.unwrap(urllib.toBytes(url))
    # serve from the temp cache when this url was already retrieved
    if self.tempcache and url in self.tempcache:
        return self.tempcache[url]
    type, url1 = urllib.splittype(url)
    # local files need no copy: return the path and its headers directly
    if filename is None and (not type or type == 'file'):
        try:
            fp = self.open_local_file(url1)
            hdrs = fp.info()
            fp.close()
            return urllib.url2pathname(urllib.splithost(url1)[1]), hdrs
        except IOError:
            pass
    fp = self.open(url, data)
    try:
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            # no filename given: build a temp file whose suffix matches the
            # url path (strip host, query and attributes first)
            import tempfile
            garbage, path = urllib.splittype(url)
            garbage, path = urllib.splithost(path or "")
            path, garbage = urllib.splitquery(path or "")
            path, garbage = urllib.splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            if self.tempcache is not None:
                self.tempcache[url] = result
            bs = 1024 * 8
            size = -1  # -1 means "total size unknown"
            read = 0
            blocknum = 0
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            # copy loop; exits early when another thread sets _cancelDownload
            while not self._cancelDownload:
                block = fp.read(bs)
                if block == "":
                    # EOF (py2: read() returns a str)
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()
    # raise exception if actual size does not match content-length header
    # NOTE(review): a canceled download with a Content-Length also trips
    # this check before the cleanup below runs -- confirm intended
    if size >= 0 and read < size:
        raise urllib.ContentTooShortError(
            "retrieval incomplete: got only %i out "
            "of %i bytes" % (read, size), result)
    # canceled: remove the partial file and record the cancellation
    if self._cancelDownload and os.path.exists(filename):
        os.remove(filename)
        self.wasCanceled = True
    return result