def fs_write(fname, data, hdfs_client=None):
    """ Write to local fs or HDFS based on fname uri """
    if not fname.startswith('hdfs://'):
        # Local filesystem target: make sure the directory exists, then dump.
        local_path = strip_uri_proto(fname, 'fs://')
        utils.mkdir_p(os.path.dirname(local_path))
        with open(local_path, 'wb') as out:
            out.write(data)
        return

    hdfs_path = strip_uri_proto(fname, 'hdfs://')
    if hdfs_client is None:
        hdfs_client = terrahdfs.hdfs_client()
    # When writing fractions to HDFS, we might want to adapt the blocksize
    # to match the file size to avoid fractionning and avoid using too much
    # space
    # http://grokbase.com/t/cloudera/cdh-user/133z6yj74d/how-to-write-a-file-with-custom-block-size-to-hdfs
    # TODO: Not sure if this has a performance impact
    #
    # Round up to MB and add 2MB of margin (on-disk size is not exactly
    # equal to len(data) for some reason... block metadata ?); never go
    # below 1MB since fully masked fracs are pretty small.
    one_mb = 1024 * 1024
    blocksize_mb = int(np.ceil(len(data) / one_mb)) + 2
    blocksize = one_mb * max(1, blocksize_mb)
    with hdfs_client.write(hdfs_path, overwrite=True,
                           blocksize=blocksize) as writer:
        writer.write(data)
    # Verify write correctness: the whole file must fit in one block, so
    # filesize must be strictly smaller than the blocksize we requested.
    stat = hdfs_client.status(hdfs_path)
    assert stat['blockSize'] > stat['length'], \
        "blockSize <= length for  file %s" % hdfs_path
def write_frac(fname, data, hdfs_client=None):
    """ Serialize data with np.save, to HDFS or the local filesystem """
    if fname.startswith('hdfs://'):
        # Serialize to an in-memory buffer first, then hand the raw bytes
        # over to fs_write which knows how to talk to HDFS.
        buf = StringIO.StringIO()
        np.save(buf, data)
        rasterio.fs_write(fname, buf.getvalue(), hdfs_client)
        return
    # Local target (fs://): short-circuit fs_write/_fs_read and save
    # straight to the file.
    local_path = rasterio.strip_uri_proto(fname, 'fs://')
    utils.mkdir_p(os.path.dirname(local_path))
    with open(local_path, 'wb') as out:
        np.save(out, data)
def _do_download_aria2(files): # We download to a temporary loaction and copy to final destination once # finished try: tmpdir = tempfile.mkdtemp() print '==== Downloading to tmpdir=%s' % tmpdir # Create temporary filenames and aria2 download file download_fname = os.path.join(tmpdir, 'downloads.txt') tmp2fname = {} with open(download_fname, 'w') as f: for i in range(len(files)): url, dst_filename = files[i] tmpname = os.path.join(tmpdir, '%d' % i) tmp2fname[tmpname] = dst_filename # Read aria2 docs for the format of the input file. Basically, # an URI line can be followed by options line which MUST start # with one or more spaces # https://aria2.github.io/manual/en/html/aria2c.html # This is also relevant with some tips for how to write the # file https://github.com/tatsuhiro-t/aria2/issues/190 f.write('%s\n out=%s\n' % (url, tmpname)) del url, dst_filename assert config.MODIS_HTTP_USER != '',\ "You should set MODIS_HTTP_USER in your config file" # Download using aria2 cmd = [ '/usr/bin/env', 'aria2c', '--http-user=%s' % config.MODIS_HTTP_USER, '--http-pass=%s' % config.MODIS_HTTP_PASS, '-i %s' % download_fname ] # Set the cwd to / because aria2 interpret filenames as relative # even if they start with / subprocess.check_call(cmd, cwd='/') # Copy files to their final destination for tmpname, dstname in tmp2fname.items(): year_dir = os.path.join(os.path.dirname(dstname), os.pardir) utils.mkdir_p(year_dir) shutil.copyfile(tmpname, dstname) print 'Finished ', dstname finally: shutil.rmtree(tmpdir)
def download_url(url, dst_filename): """ Download url into dst_filename. To avoid having half-complete files lying around, this first downloads to a temporary location and move to dst_filename once the download is complete """ print 'Starting ', dst_filename # Ensure year directory exists year_dir = os.path.join(os.path.dirname(dst_filename), os.pardir) utils.mkdir_p(year_dir) with tempfile.NamedTemporaryFile() as f: subprocess.check_call('/usr/bin/wget %s -O %s' % (url, f.name), shell=True) shutil.copyfile(f.name, dst_filename) print 'Finished ', dst_filename
def mirror_modis_dates_html(base_url, mirror_dir, use_wget=False): """ Download all MODIS date listing pages to a local directory. Usually, a MODIS listing for a date should not change (only new dates should be added), so there should be no need to re-download. """ ndownloads = 0 dates_urls = collect_all_dates_pages(base_url) utils.mkdir_p(mirror_dir) for date, url in dates_urls: fname = os.path.join(mirror_dir, date + '.html') if not os.path.exists(fname): print 'Downloading ', fname if use_wget: subprocess.check_call('/usr/bin/wget %s -O %s' % (url, fname), shell=True) else: urllib.urlretrieve(url, fname) ndownloads += 1 # The MODIS MOLT repository server doesn't return Content-Length # so urllib cannot tell if it downloaded the whole html or was # just disconnected, which could lead to incomplete HTML being # downloaded. So we check if the downloaded file ends with </html> with open(fname, 'r') as f: # seek 10 bytes from the end f.seek(-10, 2) line = f.read(10) if "</html>" not in line: raise urllib.ContentTooShortError( "Couldn't find </html> in downloaded file, probably a partial download", "" ) # Just avoid firing requests as fast as possible time.sleep(0.1) return ndownloads > 0