Example 1
    def __call__(self, task):
        # Determine the path etc here

        updated_kwargs = {
            i: self.target_kwargs[i]
            for i in self.target_kwargs if i != 'glob'
        }

        if 'glob' in self.target_kwargs:
            target_path = self.file_pattern.format(task=task)
            updated_glob = self.target_kwargs['glob'].format(
                task=task) + self.ext.format(task=task)
            updated_kwargs['glob'] = updated_glob
        else:
            target_path = (self.file_pattern.format(task=task) +
                           self.ext.format(task=task))

        path_sep = get_fs_token_paths(target_path)[0].sep
        if target_path[-1] != path_sep:
            if target_path[-1] == "/":
                target_path = target_path[:-1]
            target_path = target_path + path_sep
        fs, _, _ = get_fs_token_paths(target_path)

        return self.target_class(target_path, **updated_kwargs)
Example 2
    def __call__(self, task):
        # Use either ext to specify the extension or no ext; "ext" itself is never forwarded to the target.
        revised_kwargs = {
            i: self.target_kwargs[i]
            for i in self.target_kwargs if i != "ext"
        }

        # If glob is specified, append the extension to the glob rather than to the file pattern.
        if "glob" in self.target_kwargs:
            target_path = self.file_pattern.format(task=task)
            revised_glob = self.target_kwargs["glob"].format(
                task=task) + self.ext.format(task=task)
            revised_kwargs["glob"] = revised_glob
        else:
            target_path = self.file_pattern.format(
                task=task) + self.ext.format(task=task)

        # Note that these targets force you to specify directory datasets with an ending /; Dask (annoyingly) is
        # inconsistent on this, so you may find yourself manipulating paths inside ParquetTarget and CSVTarget
        # differently. The user of these targets should not need to worry about these details!
        path_sep = get_fs_token_paths(target_path)[0].sep
        if target_path[-1] != path_sep:
            if target_path[-1] == "/":
                target_path = target_path[:-1]
            target_path = target_path + path_sep
        fs, _, _ = get_fs_token_paths(target_path)

        if "{ext}" not in self.file_pattern and not "" == self.ext:
            target_path = target_path + self.ext

        return self.target_class(target_path, **revised_kwargs)
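
The trailing-separator handling above can be exercised on its own. A minimal sketch, assuming fsspec's get_fs_token_paths (older Dask exposed the same function from dask.bytes.core); the function name and path are illustrative:

from fsspec.core import get_fs_token_paths


def ensure_trailing_sep(target_path):
    # Infer the filesystem, then make sure directory-style paths end with
    # exactly one filesystem-specific separator, mirroring the logic above.
    fs, _, _ = get_fs_token_paths(target_path)
    if not target_path.endswith(fs.sep):
        if target_path.endswith("/"):
            target_path = target_path[:-1]
        target_path = target_path + fs.sep
    return target_path


print(ensure_trailing_sep("data/output"))  # "data/output/" on a POSIX local filesystem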
Example 3
def test_urlpath_expand_read():
    """Make sure * is expanded in file paths when reading."""
    # when reading, globs should be expanded to read files by mask
    with filetexts(csv_files, mode='b'):
        _, _, paths = get_fs_token_paths('.*.csv')
        assert len(paths) == 2
        _, _, paths = get_fs_token_paths(['.*.csv'])
        assert len(paths) == 2
Example 4
def test_urlpath_expand_write():
    """Make sure * is expanded in file paths when writing."""
    _, _, paths = get_fs_token_paths('prefix-*.csv', mode='wb', num=2)
    assert paths == ['prefix-0.csv', 'prefix-1.csv']
    _, _, paths = get_fs_token_paths(['prefix-*.csv'], mode='wb', num=2)
    assert paths == ['prefix-0.csv', 'prefix-1.csv']
    # we can read with multiple masks, but not write
    with pytest.raises(ValueError):
        _, _, paths = get_fs_token_paths(['prefix1-*.csv', 'prefix2-*.csv'], mode='wb', num=2)
Example 5
def test_urlpath_expand_write():
    """Make sure * is expanded in file paths when writing."""
    _, _, paths = get_fs_token_paths("prefix-*.csv", mode="wb", num=2)
    assert all(p.endswith(pa) for p, pa in zip(paths, ["prefix-0.csv", "prefix-1.csv"]))
    _, _, paths = get_fs_token_paths(["prefix-*.csv"], mode="wb", num=2)
    assert all(p.endswith(pa) for p, pa in zip(paths, ["prefix-0.csv", "prefix-1.csv"]))
    # we can read with multiple masks, but not write
    with pytest.raises(ValueError):
        _, _, paths = get_fs_token_paths(
            ["prefix1-*.csv", "prefix2-*.csv"], mode="wb", num=2
        )
Example 6
def test_urlpath_inference_errors():
    # Empty list
    with pytest.raises(ValueError) as err:
        get_fs_token_paths([])
    assert 'empty' in str(err)

    # Protocols differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(['s3://test/path.csv', '/other/path.csv'])
    assert 'same protocol and options' in str(err)

    # Options differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths([
            'hdfs://[email protected]/test/path.csv',
            'hdfs://[email protected]/other/path.csv'
        ])
    assert 'same protocol and options' in str(err)

    # Unknown type
    with pytest.raises(TypeError):
        get_fs_token_paths({
            'sets/are.csv', 'unordered/so/they.csv', 'should/not/be.csv',
            'allowed.csv'
        })
Example 7
def test_urlpath_inference_errors():
    # Empty list
    with pytest.raises(ValueError) as err:
        get_fs_token_paths([])
    assert "empty" in str(err)

    # Protocols differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"])
    assert "same protocol and options" in str(err)

    # Options differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(
            [
                "hdfs://[email protected]/test/path.csv",
                "hdfs://[email protected]/other/path.csv",
            ]
        )
    assert "same protocol and options" in str(err)

    # Unknown type
    with pytest.raises(TypeError):
        get_fs_token_paths(
            {"sets/are.csv", "unordered/so/they.csv", "should/not/be.csv" "allowed.csv"}
        )
Example 8
def test_urlpath_inference_errors():
    # Empty list
    with pytest.raises(ValueError, match="empty"):
        get_fs_token_paths([])

    # Protocols differ
    with pytest.raises(ValueError, match="the same protocol"):
        get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"])

    # Options differ
    with pytest.raises(ValueError, match="the same file-system options"):
        get_fs_token_paths(
            [
                "ftp://[email protected]/test/path.csv",
                "ftp://[email protected]/other/path.csv",
            ]
        )

    # Unknown type
    with pytest.raises(TypeError):
        get_fs_token_paths(
            {
                "sets/are.csv",
                "unordered/so/they.csv",
                "should/not/be.csv",
                "allowed.csv",
            }
        )
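
For contrast with the error cases above, a minimal sketch of a valid call and its return values (a local CSV glob is assumed; older Dask exposed the function from dask.bytes.core):

from fsspec.core import get_fs_token_paths

# A single urlpath, or a list of urlpaths sharing one protocol and one set of
# options, yields the filesystem, a deterministic token, and the expanded paths.
fs, token, paths = get_fs_token_paths("data/*.csv")
print(type(fs).__name__)  # e.g. LocalFileSystem
print(paths)              # the glob expanded to matching absolute paths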
Example 9
    def fs(self):
        """
        Filesystem from a urlpath and options.
        """
        fs, token, paths = get_fs_token_paths(
            self.path, storage_options=self.storage_options)
        return fs
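
A hypothetical container class using this property pattern (the class name and attributes below are illustrative assumptions, not taken from the source):

from fsspec.core import get_fs_token_paths


class PathHolder:
    """Illustrative holder for a urlpath and its storage options."""

    def __init__(self, path, storage_options=None):
        self.path = path
        self.storage_options = storage_options or {}

    @property
    def fs(self):
        # Only the filesystem object is needed; token and paths are discarded.
        fs, _token, _paths = get_fs_token_paths(
            self.path, storage_options=self.storage_options)
        return fs


holder = PathHolder("data/*.csv")
print(holder.fs.protocol)  # 'file' (or ('file', 'local') in newer fsspec) for a local path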
Example 10
def test_urlpath_inference_strips_protocol(tmpdir):
    tmpdir = str(tmpdir)
    paths = [os.path.join(tmpdir, 'test.%02d.csv' % i) for i in range(20)]

    for path in paths:
        with open(path, 'wb') as f:
            f.write(b'1,2,3\n' * 10)

    # globstring
    protocol = 'file:///' if sys.platform == 'win32' else 'file://'
    urlpath = protocol + os.path.join(tmpdir, 'test.*.csv')
    _, _, paths2 = get_fs_token_paths(urlpath)
    assert paths2 == paths

    # list of paths
    _, _, paths2 = get_fs_token_paths([protocol + p for p in paths])
    assert paths2 == paths
Example 11
def test_urlpath_inference_strips_protocol(tmpdir):
    tmpdir = str(tmpdir)
    paths = [os.path.join(tmpdir, "test.%02d.csv" % i) for i in range(20)]

    for path in paths:
        with open(path, "wb") as f:
            f.write(b"1,2,3\n" * 10)

    # globstring
    protocol = "file:///" if sys.platform == "win32" else "file://"
    urlpath = protocol + os.path.join(tmpdir, "test.*.csv")
    _, _, paths2 = get_fs_token_paths(urlpath)
    assert paths2 == paths

    # list of paths
    _, _, paths2 = get_fs_token_paths([protocol + p for p in paths])
    assert paths2 == paths
Example 12
def test_recursive_glob_expand():
    """Make sure * is expanded in file paths when reading."""
    with filetexts(
        {"sub1/afile.csv": b"", "sub1/sub2/another.csv": b"", "sub1/twofile.csv": b""},
        mode="b",
    ):
        _, _, paths = get_fs_token_paths(os.path.abspath("**/*.csv"))
        assert len(paths) == 3
Example 13
    def __call__(self, task):
        """ Implements the "output()" method of a Luigi Task.
        This method allows the descriptor to serve as the "output" of a Luigi Task.
        A Target (or subclass) is instantiated and returned. Both the target file path
        and the "file_pattern" template are evaluated here.
        Args:
            task: host class instance
        Returns:
            A Luigi Target (or subclass) instance.
        """

        # If there is a "glob" in target_kwargs, the extension is attached to the end of glob.
        # Otherwise, the extension is attached to the end of file_pattern.
        new_kwargs = {
            i: self.target_kwargs[i]
            for i in self.target_kwargs if i != 'glob'
        }

        if 'glob' in self.target_kwargs:
            target_path = self.file_pattern.format(task=task)

            new_glob = self.target_kwargs['glob'].format(
                task=task) + self.ext.format(task=task)
            new_kwargs['glob'] = new_glob
        else:
            target_path = (self.file_pattern.format(task=task) +
                           self.ext.format(task=task))

        # Make sure that the directory path ends with a system dependent separator.
        path_sep = get_fs_token_paths(target_path)[0].sep
        if target_path[-1] != path_sep:
            if target_path[-1] == "/":
                target_path = target_path[:-1]
            target_path = target_path + path_sep
        fs, _, _ = get_fs_token_paths(target_path)

        return self.target_class(target_path, **new_kwargs)
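
A simplified, hypothetical wiring of such a descriptor into a Luigi Task. The TargetDescriptor constructor, the Report task, and the path are illustrative assumptions; the real __call__ is the one shown above:

import luigi


class TargetDescriptor:
    """Illustrative descriptor storing the attributes the __call__ above relies on."""

    def __init__(self, target_class, file_pattern, ext="", **target_kwargs):
        self.target_class = target_class
        self.file_pattern = file_pattern
        self.ext = ext
        self.target_kwargs = target_kwargs

    def __call__(self, task):
        # Simplified version of the logic above: render the pattern from the
        # task instance and append the rendered extension.
        path = self.file_pattern.format(task=task) + self.ext.format(task=task)
        return self.target_class(path, **self.target_kwargs)


class Report(luigi.Task):
    day = luigi.Parameter()
    target = TargetDescriptor(luigi.LocalTarget, "reports/{task.day}", ext=".csv")

    def output(self):
        # The descriptor instantiates and returns the Target for this task.
        return self.target(self)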
Example 14
    def _get_schema(self):
        if self._pf is None:
            # copied from dask to allow remote
            soptions = self._kwargs.pop('storage_options', {})
            fs, fs_token, paths = get_fs_token_paths(self._urlpath,
                                                     mode='rb',
                                                     storage_options=soptions)

            if len(paths) > 1:
                pf = fp.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
            else:
                try:
                    pf = fp.ParquetFile(paths[0] + fs.sep + '_metadata',
                                        open_with=fs.open,
                                        sep=fs.sep)
                except Exception:
                    pf = fp.ParquetFile(paths[0],
                                        open_with=fs.open,
                                        sep=fs.sep)

            self._pf = pf
        pf = self._pf
        if self._df is not None:
            return base.Schema(datashape=None,
                               dtype=self._df._meta,
                               shape=(pf.count, len(self._df.columns)),
                               npartitions=self._df.npartitions,
                               extra_metadata=pf.key_value_metadata)
        columns = self._kwargs.get('columns', None)
        if columns:
            dtypes = {k: v for k, v in pf.dtypes.items() if k in columns}
        else:
            dtypes = pf.dtypes
        if 'filters' in self._kwargs:
            rgs = pf.filter_row_groups(self._kwargs['filters'])
            parts = len(rgs)
            count = sum(rg.num_rows for rg in rgs)
        else:
            parts = len(pf.row_groups)
            count = pf.count

        return base.Schema(
            datashape=None,
            dtype=dtypes,  # one of these is the index
            shape=(count, len(dtypes)),
            npartitions=parts,
            extra_metadata=pf.key_value_metadata)
Example 15
def read_parquet(path, storage_options=None):
    """
    Construct a SpatialPointsFrame from a spatially partitioned parquet
    file

    If the input parquet file does not contain compatible spatial metadata,
    then the resulting SpatialPointsFrame will have a .spatial property of
    None, and the spatial_query operation will be unavailable.

    Parameters
    ----------
    path: str
        Path to a spatially partitioned parquet file that was created
        using datashader.spatial.points.to_parquet

    storage_options : dict or None (default None)
        Key/value pairs to be passed on to the file-system backend, if any.

    Returns
    -------
    SpatialPointsFrame
        A spatially sorted Dask dataframe reconstructed from disk
    """
    _validate_fastparquet()

    # Read parquet file
    frame = dd.read_parquet(path, storage_options=storage_options)

    # Open parquet file
    fs, _, paths = get_fs_token_paths(path,
                                      mode="rb",
                                      storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)
    pf = fp.ParquetFile(path, open_with=fs.open)

    # Check for spatial points metadata
    if 'SpatialPointsFrame' in pf.key_value_metadata:
        # Load metadata
        props = json.loads(pf.key_value_metadata['SpatialPointsFrame'])
    else:
        props = None

    # Call DataFrame constructor with the internals of frame
    return SpatialPointsFrame(frame.dask, frame._name, frame._meta,
                              frame.divisions, props)
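
A usage sketch; the path is hypothetical and assumes the file was written by datashader.spatial.points.to_parquet:

frame = read_parquet('sorted_points.parquet')
# .spatial is populated only if the SpatialPointsFrame metadata round-tripped;
# otherwise it is None and the spatial_query operation is unavailable.
print(frame.spatial is not None)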
Example 16
def test_glob(hdfs):

    tree = {
        basedir: (["c", "c2"], ["a", "a1", "a2", "a3", "b1"]),
        basedir + "/c": (["d"], ["x1", "x2"]),
        basedir + "/c2": (["d"], ["x1", "x2"]),
        basedir + "/c/d": ([], ["x3"]),
    }

    hdfs, _, _ = get_fs_token_paths("hdfs:///")
    hdfs.makedirs(basedir + "/c/d")
    hdfs.makedirs(basedir + "/c2/d/")
    for fn in (posixpath.join(dirname, f)
               for (dirname, (_, fils)) in tree.items() for f in fils):
        with hdfs.open(fn, mode="wb") as f2:
            f2.write(b"000")

    assert set(hdfs.glob(basedir + "/a*")) == {
        basedir + p
        for p in ["/a", "/a1", "/a2", "/a3"]
    }

    assert set(hdfs.glob(basedir + "/c/*")) == {
        basedir + p
        for p in ["/c/x1", "/c/x2", "/c/d"]
    }

    assert set(hdfs.glob(basedir + "/*/x*")) == {
        basedir + p
        for p in ["/c/x1", "/c/x2", "/c2/x1", "/c2/x2"]
    }
    assert set(hdfs.glob(basedir + "/*/x1")) == {
        basedir + p
        for p in ["/c/x1", "/c2/x1"]
    }

    assert hdfs.find("/this-path-doesnt-exist") == []
    assert hdfs.find(basedir + "/missing/") == []
    assert hdfs.find(basedir + "/missing/x1") == []
    assert hdfs.glob(basedir + "/missing/*") == []
    assert hdfs.glob(basedir + "/*/missing") == []

    assert set(hdfs.glob(basedir + "/*")) == {
        basedir + p
        for p in ["/a", "/a1", "/a2", "/a3", "/b1", "/c", "/c2"]
    }
Example 17
def test_urlpath_inference_errors():
    # Empty list
    with pytest.raises(ValueError) as err:
        get_fs_token_paths([])
    assert 'empty' in str(err)

    # Protocols differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(['s3://test/path.csv', '/other/path.csv'])
    assert 'same protocol and options' in str(err)

    # Options differ
    with pytest.raises(ValueError) as err:
        get_fs_token_paths(['hdfs://[email protected]/test/path.csv',
                            'hdfs://[email protected]/other/path.csv'])
    assert 'same protocol and options' in str(err)

    # Unknown type
    with pytest.raises(TypeError):
        get_fs_token_paths({'sets/are.csv', 'unordered/so/they.csv',
                            'should/not/be.csv', 'allowed.csv'})
Example 18
def read_avro(urlpath,
              blocksize=100000000,
              storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths, OpenFile,
                                 tokenize)
    from dask.bag import from_delayed
    import_required(
        'fastavro', "fastavro is a required dependency for using "
        "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [
                dread(f, o, l, head, dask_key_name=key)
                for o, key, l in zip(offset, keys, length)
            ]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
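
A usage sketch (paths and storage options are hypothetical):

# Default: split each avro file into ~100 MB byte ranges, one per partition.
events = read_avro('s3://bucket/events/*.avro', storage_options={'anon': True})

# Compressed files cannot be split on sync markers, so disable chunking.
events_gz = read_avro('data/*.avro.gz', blocksize=None, compression='gzip')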
Example 19
def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    filters : None or list of tuple or list of lists of tuples
        If not None, specifies a filter predicate used to filter out stripes
        using statistics stored in the file metadata. Stripes that do not match
        the given filter predicate are not read. The
        predicate is expressed in disjunctive normal form (DNF) like
        `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical
        combinations of single column predicates. The innermost tuples each
        describe a single column predicate. The list of inner predicates is
        interpreted as a conjunction (AND), forming a more selective and
        multiple column predicate. Finally, the outermost list combines
        these filters as a disjunction (OR). Predicates may also be passed
        as a list of tuples. This form is interpreted as a single conjunction.
        To express OR in predicates, one must use the (preferred) notation of
        list of lists of tuples.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """

    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options
    )
    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files"
                )
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError(
                "Requested columns (%s) not in schema (%s)" % (ex, set(schema))
            )
    else:
        columns = list(schema)

    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripes=[0], columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in (
            range(n)
            if filters is None
            else cudf.io.orc._filter_stripes(filters, path)
        ):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
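
For example, the DNF predicate (year == 2019 AND month > 6) OR (year == 2020) would be written as below; the column names and path are hypothetical:

filters = [
    [("year", "=", 2019), ("month", ">", 6)],  # inner list: AND of single-column predicates
    [("year", "=", 2020)],                     # outer list: OR between the inner lists
]
df = read_orc("data/*.orc", columns=["year", "month", "value"], filters=filters)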
Example 20
def to_parquet(df,
               path,
               x,
               y,
               p=10,
               npartitions=None,
               shuffle=None,
               compression='default',
               storage_options=None):
    """
    Perform spatial partitioning on an input dataframe and write the
    result to a parquet file.  The resulting parquet file will contain
    the same columns as the input dataframe, but the dataframe's original
    index will be dropped.

    The resulting parquet file will contain all of the rows from the
    input dataframe, but they will be spatially sorted and partitioned
    along a 2D Hilbert curve (https://en.wikipedia.org/wiki/Hilbert_curve).

    The parquet file will also contain custom metadata that is needed to
    reconstruct the Hilbert curve distances on load.  This parquet file
    may then be used to construct SpatialPointsFrame instances using
    datashader.spatial.points.read_parquet.

    Parameters
    ----------
    df: pd.DataFrame or dd.DataFrame
        The input dataframe to partition
    path: str
        The path where the resulting parquet file should be written.
        See dask.dataframe.to_parquet for description of supported path
        specifications.
    x, y
        The column labels in df of the x and y coordinates of each row
    p: int (default 10)
        The Hilbert curve order parameter that determines the resolution
        of the 2D grid that data points are rounded to before computing
        their Hilbert distance. Points will be discretized into 2 ** p
        bins in each the x and y dimensions.

        This parameter should be increased if the partitions of the
        resulting parquet files are significantly unbalanced.

    npartitions: int or None (default None)
        The number of partitions for the resulting parquet file.  If None
        (the default) this is chosen to be the greater of 8 and
        len(df) // 2**23.

        In general, increasing the number of partitions will improve
        performance when processing small subsets of the overall parquet
        data set.  But this comes at the cost of some additional overhead
        when processing the entire data set.

    shuffle: str or None (default None)
        The dask.dataframe.DataFrame.set_index shuffle method. If None,
        a default is chosen based on the current scheduler.

    compression: str or None (default)
        The dask.dataframe.to_parquet compression method.

    storage_options : dict or None (default None)
        Key/value pairs to be passed on to the file-system backend, if any.
    """

    _validate_fastparquet()

    # Validate filename
    if (not isinstance(path, basestring)
            or not (path.endswith('.parquet') or path.endswith('.parq'))):
        raise ValueError("""\
filename must be a string ending with a .parquet or .parq extension""")

    # Remove any existing directory
    if os.path.exists(path):
        shutil.rmtree(path)

    # Normalize to dask dataframe
    if isinstance(df, pd.DataFrame):
        ddf = dd.from_pandas(df, npartitions=4)
    elif isinstance(df, dd.DataFrame):
        ddf = df
    else:
        raise ValueError("""
df must be a pandas or dask DataFrame instance.
Received value of type {typ}""".format(typ=type(df)))

    # Get number of rows
    nrows = len(df)

    # Compute npartitions if needed
    if npartitions is None:
        # Make partitions of ~8 million rows with a minimum of 8
        # partitions
        npartitions = max(nrows // 2**23, 8)

    # Compute data extents
    extents = ddf.map_partitions(_compute_extents, x, y).compute()

    x_range = (float(extents['x_min'].min()), float(extents['x_max'].max()))

    y_range = (float(extents['y_min'].min()), float(extents['y_max'].max()))

    # Compute distance of points along the Hilbert-curve
    ddf = ddf.assign(distance=ddf.map_partitions(_compute_distance,
                                                 x=x,
                                                 y=y,
                                                 p=p,
                                                 x_range=x_range,
                                                 y_range=y_range,
                                                 as_series=True))

    # Set index to distance. This will trigger an expensive shuffle
    # sort operation
    ddf = ddf.set_index('distance', npartitions=npartitions, shuffle=shuffle)

    # Get list of the distance divisions computed by dask
    distance_divisions = [int(d) for d in ddf.divisions]

    # Save properties as custom metadata in the parquet file
    props = dict(version='1.0',
                 x=x,
                 y=y,
                 p=p,
                 distance_divisions=distance_divisions,
                 x_range=x_range,
                 y_range=y_range,
                 nrows=nrows)

    # Drop distance index to save storage space
    ddf = ddf.reset_index(drop=True)

    # Save ddf to parquet
    dd.to_parquet(ddf,
                  path,
                  engine='fastparquet',
                  compression=compression,
                  storage_options=storage_options)

    # Open resulting parquet file
    fs, _, paths = get_fs_token_paths(path,
                                      mode="wb",
                                      storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)
    pf = fp.ParquetFile(path, open_with=fs.open)

    # Add a new property to the file metadata
    new_fmd = copy.copy(pf.fmd)
    new_kv = fp.parquet_thrift.KeyValue()
    new_kv.key = 'SpatialPointsFrame'
    new_kv.value = json.dumps(props)
    new_fmd.key_value_metadata.append(new_kv)

    # Overwrite file metadata
    fn = os.path.join(path, '_metadata')
    fp.writer.write_common_metadata(fn,
                                    new_fmd,
                                    no_row_groups=False,
                                    open_with=fs.open)

    fn = os.path.join(path, '_common_metadata')
    fp.writer.write_common_metadata(fn, new_fmd, open_with=fs.open)
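
A usage sketch; the column names, values, and output path are hypothetical:

import pandas as pd

# Three points with x/y coordinates; p=10 discretizes each axis into 2**10
# bins before the Hilbert distances are computed.
points = pd.DataFrame({'lon': [0.1, 0.5, 0.9], 'lat': [0.2, 0.4, 0.8]})
to_parquet(points, 'points.parquet', x='lon', y='lat', p=10)

# Round-trip through the companion reader shown earlier in this section.
frame = read_parquet('points.parquet')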
Example 21
def touch(path, storage_options=None, _dep=None):
    fs, token, paths = get_fs_token_paths(path,
                                          storage_options=storage_options)
    with fs.open(path, mode="wb"):
        pass
Example 22
File: orc.py Project: vuule/cudf
def to_orc(
    df,
    path,
    write_index=True,
    storage_options=None,
    compression=None,
    compute=True,
    **kwargs,
):
    """Write a dask_cudf dataframe to ORC file(s) (one file per partition).

    Parameters
    ----------
    df : dask_cudf.DataFrame
    path: string or pathlib.Path
        Destination directory for data.  Prepend with protocol like ``s3://``
        or ``hdfs://`` for remote data.
    write_index : boolean, optional
        Whether or not to write the index. Defaults to True.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.
    compression : string or dict, optional
    compute : bool, optional
        If True (default) then the result is computed immediately. If False
        then a ``dask.delayed`` object is returned for future computation.
    """

    from dask import delayed
    from dask import compute as dask_compute

    # TODO: Use upstream dask implementation once available
    #       (see: Dask Issue#5596)

    if hasattr(path, "name"):
        path = stringify_path(path)
    fs, _, _ = get_fs_token_paths(path,
                                  mode="wb",
                                  storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)

    if write_index:
        df = df.reset_index()
    else:
        # Not writing index - might as well drop it
        df = df.reset_index(drop=True)

    fs.mkdirs(path, exist_ok=True)

    # Use i_offset and df.npartitions to define file-name list
    filenames = ["part.%i.orc" % i for i in range(df.npartitions)]

    # write parts
    dwrite = delayed(write_orc_partition)
    parts = [
        dwrite(d, path, fs, filename, compression=compression)
        for d, filename in zip(df.to_delayed(), filenames)
    ]

    if compute:
        return dask_compute(*parts)

    return delayed(list)(parts)
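
A usage sketch, including the deferred write described by the compute parameter. The data and destination directories are hypothetical, and a GPU-enabled environment with cudf/dask_cudf is assumed:

import cudf
import dask_cudf

ddf = dask_cudf.from_cudf(cudf.DataFrame({"a": [1, 2, 3, 4]}), npartitions=2)

# Eager write: one ORC file per partition under the destination directory.
to_orc(ddf, "orc_out", compression="snappy")

# Deferred write: compute=False returns a dask.delayed object to run later.
writes = to_orc(ddf, "orc_out_lazy", compute=False)
writes.compute()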
Example 23
File: orc.py Project: vuule/cudf
def read_orc(path, columns=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """

    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(path,
                                             mode="rb",
                                             storage_options=storage_options)
    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files")
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError("Requested columns (%s) not in schema (%s)" %
                             (ex, set(schema)))
    else:
        columns = list(schema)

    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripe=0, columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in range(n):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
Example 24
def test_recursive_glob_expand():
    """Make sure * is expanded in file paths when reading."""
    with filetexts(csv_files, mode='b'):
        _, _, paths = get_fs_token_paths('**/.*.csv')
        assert len(paths) == 3
Example 25
def test_recursive_glob_expand():
    """Make sure * is expanded in file paths when reading."""
    with filetexts(csv_files, mode="b"):
        _, _, paths = get_fs_token_paths("**/.*.csv")
        assert len(paths) == 3
Example 26
    def fs(self):
        fs, token, paths = get_fs_token_paths(
            self.path, storage_options=self.storage_options)
        return fs
Example 27
def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [dread(f, o, l, head, dask_key_name=key)
                      for o, key, l in zip(offset, keys, length)]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)