Exemple #1
0
def fs_write(fname, data, hdfs_client=None):
    """
    Write `data` to HDFS or to the local filesystem, depending on `fname`.

    Args:
        fname: Destination URI. A name starting with 'hdfs://' is written
            through an HDFS client; any other name is treated as a local
            path after being passed through strip_uri_proto(fname, 'fs://').
        data: Payload to write. It must support len() and be accepted by
            the HDFS writer / by a file opened in 'wb' mode.
        hdfs_client: Optional HDFS client to reuse. When None and the target
            is on HDFS, one is obtained from terrahdfs.hdfs_client().

    Raises:
        AssertionError: if, after an HDFS write, the reported blockSize is
            not strictly greater than the reported file length.
    """
    if fname.startswith('hdfs://'):
        fname = strip_uri_proto(fname, 'hdfs://')
        if hdfs_client is None:
            hdfs_client = terrahdfs.hdfs_client()

        # When writing fractions to HDFS, we might want to adapt the
        # blocksize to match the file size to avoid fragmentation and avoid
        # using too much space
        # http://grokbase.com/t/cloudera/cdh-user/133z6yj74d/how-to-write-a-file-with-custom-block-size-to-hdfs
        # TODO: Not sure if this has a performance impact
        one_mb = 1024 * 1024
        # Round up to whole MB and add 2MB to have a margin (on-disk size is
        # not exactly equal to len(data) for some reason... block metadata ?)
        # Integer ceiling division is used on purpose: `len(data) / one_mb`
        # floors under Python 2, which silently defeated the rounding up.
        # The +2 margin also guarantees a blocksize of at least 2MB, so even
        # tiny (fully masked) fractions get a sane block size.
        blocksize_mb = (len(data) + one_mb - 1) // one_mb + 2
        blocksize = one_mb * blocksize_mb
        with hdfs_client.write(fname, overwrite=True,
                               blocksize=blocksize) as writer:
            writer.write(data)
        # Verify write correctness by requesting file status and check
        # that filesize < blocksize
        stat = hdfs_client.status(fname)
        assert stat['blockSize'] > stat['length'], "blockSize <= length for "\
            " file %s" % fname
    else:
        fname = strip_uri_proto(fname, 'fs://')
        # Make sure the destination directory exists before opening the file
        outdir = os.path.dirname(fname)
        utils.mkdir_p(outdir)
        with open(fname, 'wb') as f:
            f.write(data)
Exemple #2
0
def write_frac(fname, data, hdfs_client=None):
    """
    Serialize `data` with np.save and store the result at `fname`.

    Names starting with 'hdfs://' are serialized into an in-memory buffer
    and handed to rasterio.fs_write together with `hdfs_client`. Any other
    name is treated as a local path and written to directly.
    """
    if not fname.startswith('hdfs://'):
        # Local target: bypass rasterio.fs_write so that np.save can stream
        # straight into the destination file instead of going through an
        # intermediate buffer.
        local_path = rasterio.strip_uri_proto(fname, 'fs://')
        utils.mkdir_p(os.path.dirname(local_path))
        with open(local_path, 'wb') as out:
            np.save(out, data)
        return

    # HDFS target: np.save needs a file-like object, so serialize in memory
    # first and ship the resulting bytes in one go.
    serialized = StringIO.StringIO()
    np.save(serialized, data)
    rasterio.fs_write(fname, serialized.getvalue(), hdfs_client)
Exemple #3
0
def _do_download_aria2(files):
    """
    Download a batch of files with aria2c.

    Everything is first downloaded into a temporary directory and copied to
    its final destination once aria2c has finished. The temporary directory
    is removed afterwards, whether or not the download succeeded.

    Args:
        files: Iterable of (url, dst_filename) pairs.

    Raises:
        AssertionError: if config.MODIS_HTTP_USER is empty.
        subprocess.CalledProcessError: if aria2c exits with a non-zero
            status.
    """
    # Fail fast, before touching the disk, when credentials are missing
    assert config.MODIS_HTTP_USER != '',\
        "You should set MODIS_HTTP_USER in your config file"

    # Created outside the try block: if mkdtemp itself fails there is
    # nothing to clean up, and the finally clause must not reference an
    # unbound `tmpdir` (which would mask the real error with a NameError).
    tmpdir = tempfile.mkdtemp()
    try:
        print('==== Downloading to tmpdir=%s' % tmpdir)
        # Create temporary filenames and aria2 download file
        download_fname = os.path.join(tmpdir, 'downloads.txt')
        tmp2fname = {}
        with open(download_fname, 'w') as f:
            for i, (url, dst_filename) in enumerate(files):
                tmpname = os.path.join(tmpdir, '%d' % i)
                tmp2fname[tmpname] = dst_filename

                # Read aria2 docs for the format of the input file. Basically,
                # an URI line can be followed by options line which MUST start
                # with one or more spaces
                # https://aria2.github.io/manual/en/html/aria2c.html
                # This is also relevant with some tips for how to write the
                # file https://github.com/tatsuhiro-t/aria2/issues/190
                f.write('%s\n out=%s\n' % (url, tmpname))

        # Download using aria2.
        # NOTE(review): the HTTP password is passed on the command line and
        # is therefore visible in the process list while aria2c runs.
        cmd = [
            '/usr/bin/env',
            'aria2c',
            '--http-user=%s' % config.MODIS_HTTP_USER,
            '--http-pass=%s' % config.MODIS_HTTP_PASS,
            # Long form with '=': the option and its value travel in a single
            # argv element without a stray leading space in the file name
            # (which is what '-i %s' produced).
            '--input-file=%s' % download_fname,
        ]

        # Set the cwd to / because aria2 interpret filenames as relative
        # even if they start with /
        subprocess.check_call(cmd, cwd='/')

        # Copy files to their final destination
        for tmpname, dstname in tmp2fname.items():
            # Create the directory that will actually hold the file. An empty
            # dirname means "current directory", which needs no creation.
            dst_dir = os.path.dirname(dstname)
            if dst_dir:
                utils.mkdir_p(dst_dir)
            shutil.copyfile(tmpname, dstname)
            print('Finished  %s' % dstname)

    finally:
        shutil.rmtree(tmpdir)
Exemple #4
0
def download_url(url, dst_filename):
    """
    Download url into dst_filename using wget.

    To avoid having half-downloaded files lying around, this first downloads
    to a temporary file and copies it to dst_filename once the download is
    complete. The temporary file is deleted when the function returns.

    Args:
        url: The URL to fetch.
        dst_filename: Path of the final file. Its parent directory is
            created if needed.

    Raises:
        subprocess.CalledProcessError: if wget exits with a non-zero status.
    """
    print('Starting  %s' % dst_filename)
    # Ensure the destination directory exists. An empty dirname means
    # "current directory", which needs no creation.
    dst_dir = os.path.dirname(dst_filename)
    if dst_dir:
        utils.mkdir_p(dst_dir)
    with tempfile.NamedTemporaryFile() as f:
        # Pass an argument list and no shell: URLs routinely contain
        # characters such as '&', ';' or '?' that a shell would interpret,
        # which breaks the download and allows command injection.
        subprocess.check_call(['/usr/bin/wget', url, '-O', f.name])
        shutil.copyfile(f.name, dst_filename)
    print('Finished  %s' % dst_filename)
Exemple #5
0
def mirror_modis_dates_html(base_url, mirror_dir, use_wget=False):
    """
    Download all MODIS date listing pages to a local directory.

    Usually, a MODIS listing for a date should not change (only new dates
    should be added), so pages that already exist in mirror_dir are not
    downloaded again.

    Args:
        base_url: Passed to collect_all_dates_pages() to obtain the
            (date, url) pairs to mirror.
        mirror_dir: Directory receiving one '<date>.html' file per page. It
            is created if needed.
        use_wget: When true, download with /usr/bin/wget instead of
            urllib.urlretrieve.

    Returns:
        True if at least one page was downloaded, False otherwise.

    Raises:
        urllib.ContentTooShortError: if a downloaded page does not contain
            '</html>' in its last 10 bytes.
        subprocess.CalledProcessError: if wget exits with a non-zero status.
    """
    ndownloads = 0
    dates_urls = collect_all_dates_pages(base_url)
    utils.mkdir_p(mirror_dir)
    for date, url in dates_urls:
        fname = os.path.join(mirror_dir, date + '.html')
        if os.path.exists(fname):
            continue

        print('Downloading  %s' % fname)
        try:
            if use_wget:
                # Argument list and no shell: URLs may contain characters
                # ('&', ';', ...) that a shell would interpret.
                subprocess.check_call(['/usr/bin/wget', url, '-O', fname])
            else:
                urllib.urlretrieve(url, fname)

            # The MODIS MOLT repository server doesn't return Content-Length
            # so urllib cannot tell if it downloaded the whole html or was
            # just disconnected, which could lead to incomplete HTML being
            # downloaded. So we check if the downloaded file ends with </html>
            with open(fname, 'rb') as f:
                # Read (at most) the last 10 bytes. The offset is clamped so
                # that a file shorter than 10 bytes is reported as a partial
                # download instead of failing inside seek().
                f.seek(0, 2)
                f.seek(max(f.tell() - 10, 0))
                tail = f.read()
            if b"</html>" not in tail:
                raise urllib.ContentTooShortError(
                    "Couldn't find </html> in downloaded file, probably a partial download", ""
                )
        except BaseException:
            # Never leave a partial or unverified page behind: the existence
            # check above would otherwise skip it on every later run.
            if os.path.exists(fname):
                os.remove(fname)
            raise

        ndownloads += 1

        # Just avoid firing requests as fast as possible
        time.sleep(0.1)

    return ndownloads > 0