Example #1
def test_add():
    df = pd.DataFrame(np.identity(12))
    df2 = df.copy()
    df2.index += 1

    sf1 = sp.SparseFrame(df)
    sf2 = sp.SparseFrame(df2)
    correct = sf1.add(sf2).todense()

    dsf = dsp.from_pandas(df, npartitions=4)
    dsf2 = dsp.from_pandas(df2, npartitions=4)

    res = dsf.add(dsf2).compute().todense()
    pdt.assert_frame_equal(res, correct)
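For reference, the snippets on this page assume imports along the following lines. This is a sketch inferred from the aliases in use; in particular, the module path for the dask support (`sparsity.dask`) and the pandas testing import are assumptions:

import os
import numpy as np
import pandas as pd
import pandas.testing as pdt   # pdt.assert_frame_equal (pandas.util.testing in older pandas)
import sparsity as sp          # sp.SparseFrame
import sparsity.dask as dsp    # dsp.from_pandas, dsp.read_npz
from scipy import sparse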
Example #2
def from_pandas(df, npartitions=None, chunksize=None, name=None):
    """
    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        The DataFrame/Series with which to construct a Dask DataFrame/Series
    npartitions : int, optional
        The number of partitions of the index to create. Note that depending on
        the size and index of the dataframe, the output may have fewer
        partitions than requested.
    chunksize : int, optional
        The size of the partitions of the index.
    name : string, optional
        An optional key name for the DataFrame. Defaults to hashing the
        input, which can take a long time for a large DataFrame, so pass
        an explicit name in that case.

    Returns
    -------
        dsf: sparsity.dask.SparseFrame
    """
    nrows = df.shape[0]

    if npartitions is None and chunksize is None:
        raise ValueError('either npartitions or chunksize must be given')

    if chunksize is None:
        chunksize = int(ceil(nrows / npartitions))
    else:
        npartitions = int(ceil(nrows / chunksize))

    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    divisions, locations = sorted_division_locations(df.index,
                                                     chunksize=chunksize)
    name = name or 'from_pandas-{}'.format(tokenize(df, npartitions))
    dsk = dict(
        ((name, i), sp.SparseFrame(df.iloc[start:stop]))
        for i, (start, stop) in enumerate(zip(locations[:-1], locations[1:])))
    meta = _make_meta(df)
    return SparseFrame(dsk, name, meta, divisions)
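A minimal usage sketch for from_pandas, relying only on behavior shown in these examples (imports as above):

# 8 rows split into 4 partitions of chunksize 2
df = pd.DataFrame(np.identity(8))
dsf = from_pandas(df, npartitions=4)
pdt.assert_frame_equal(dsf.compute().todense(), df)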
Example #3
def one_hot_encode(ddf, column, categories, index_col):
    """
    Sparse one hot encoding of dask.DataFrame

    Convert a dask.DataFrame into a series of SparseFrames by one-hot
    encoding a single column.

    Parameters
    ----------
    ddf: dask.DataFrame
        e.g. the clickstream
    column: str
        name of the column to one-hot encode
    categories: iterable
        possible category values
    index_col: str or iterable
        which column(s) to use as index

    Returns
    -------
        sparse_one_hot: sparsity.dask.SparseFrame
    """
    idx_meta = ddf._meta.reset_index().set_index(index_col).index[:0] \
        if index_col else ddf._meta.index
    meta = sp.SparseFrame(np.array([]), columns=categories, index=idx_meta)

    dsf = ddf.map_partitions(sparse_one_hot,
                             column=column,
                             categories=categories,
                             index_col=index_col,
                             meta=object)

    return SparseFrame(dsf.dask, dsf._name, meta, dsf.divisions)
Example #4
def _make_meta(inp):
    if isinstance(inp, sp.SparseFrame):
        # an empty slice of a SparseFrame already serves as meta
        return inp.iloc[:0]
    else:
        meta = dd_make_meta(inp)
        if isinstance(meta, pd.core.generic.NDFrame):
            # wrap pandas meta so downstream code always sees a SparseFrame
            return sp.SparseFrame(meta)
        return meta
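A quick sketch of the meta contract: the result is an empty prototype that preserves the column layout (only todense() from the other examples is used):

meta = _make_meta(sp.SparseFrame(np.identity(3)))
assert meta.todense().shape == (0, 3)   # zero rows, schema intact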
Example #5
def test_read_npz():
    sf = sp.SparseFrame(np.identity(100))
    with tmpdir() as tmp:
        sf.iloc[:25].to_npz(os.path.join(tmp, '1'))
        sf.iloc[25:50].to_npz(os.path.join(tmp, '2'))
        sf.iloc[50:75].to_npz(os.path.join(tmp, '3'))
        sf.iloc[75:].to_npz(os.path.join(tmp, '4'))

        dsf = dsp.read_npz(os.path.join(tmp, '*.npz'))
        sf = dsf.compute()
    assert np.all(sf.data.toarray() == np.identity(100))
Example #6
def read_npz(path, read_divisions=False, storage_options=None):
    """
    Read SparseFrame from npz archives

    Parameters
    ----------
    path: str
        path to load files from; may contain '*' to
        reference multiple files
    read_divisions: bool
        if the files are sorted, read the index of each file
        to obtain divisions. If the files are not sorted this
        will raise an error.
    storage_options: dict, optional
        options passed to the underlying file-system backend

    Returns
    -------
        dsf: sparsity.dask.SparseFrame
    """
    dsk = {}
    name = 'read_npz-{}'.format(tokenize(path))
    loader = None
    divisions = None
    # a metadata.npz archive next to the partitions (if present)
    # stores the divisions and the partition paths
    try:
        loader = _open_npz_archive(
            path.split('*')[0] + 'metadata.npz', storage_options)
        divisions = loader['divisions']
        _paths = loader['partitions']
    except FileNotFoundError:
        _paths = _sorted(list(glob(path)))
    finally:
        if loader:
            loader.close()
    archive = _open_npz_archive(_paths[0], storage_options)

    meta_idx, meta_cols = archive['frame_index'], archive['frame_columns']
    meta = sp.SparseFrame(np.empty(shape=(0, len(meta_cols))),
                          index=meta_idx[:0],
                          columns=meta_cols)

    for i, p in enumerate(_paths):
        dsk[name, i] = (sp.SparseFrame.read_npz, p, storage_options)

    if divisions is None and read_divisions:
        level = 0 if isinstance(meta_idx, pd.MultiIndex) else None
        divisions = _npz_read_divisions(_paths, level=level)
    elif divisions is None:
        divisions = [None] * (len(_paths) + 1)

    return SparseFrame(dsk, name, meta, divisions=divisions)
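A hypothetical usage sketch (the path is made up; partitions written as in test_read_npz above):

# scan each file's index to compute divisions when no metadata.npz exists
dsf = read_npz('/tmp/parts/*.npz', read_divisions=True)
sf = dsf.compute()   # one in-memory SparseFrame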
Example #7
    def __init__(self, dsk, name, meta, divisions=None):
        if isinstance(meta, SparseFrame):
            # TODO: remove this case once we subclass from dask._Frame
            meta = meta._meta
        if not isinstance(meta, sp.SparseFrame):
            meta = sp.SparseFrame(meta)

        self.dask = dsk
        self._name = name
        self._meta = make_meta(meta)

        # divisions must be provided despite the None default; use a
        # tuple of Nones (length npartitions + 1) when they are unknown
        self.divisions = tuple(divisions)
        self.ndim = 2

        self.loc = _LocIndexer(self)
Example #8
def test_distributed_join_shortcut(how):
    # `how` is presumably parametrized by pytest over the join types
    # ('left', 'inner', ...)
    left = pd.DataFrame(np.identity(10),
                        index=np.arange(10),
                        columns=list('ABCDEFGHIJ'))
    right = pd.DataFrame(np.identity(10),
                         index=np.arange(5, 15),
                         columns=list('KLMNOPQRST'))
    correct = left.join(right, how=how).fillna(0)

    d_left = dsp.from_pandas(left, npartitions=2)
    d_right = sp.SparseFrame(right)

    joined = d_left.join(d_right, how=how)

    res = joined.compute().todense()

    pdt.assert_frame_equal(correct, res)
Example #9
    def _construct_item_features(self, item_features, item_ids):
        """Create item features during predict."""
        # align feature names
        if self.indicator_setting in ['both', 'items']:
            item_indicator = sp.SparseFrame(self._item_indicator,
                                            index=self.iid_map.index)
            item_indicator = item_indicator.reindex(item_ids).data
        else:
            item_indicator = None

        if self.item_feature_names is None:
            return item_indicator

        item_feat_csr = item_features\
            .loc[:, self.item_feature_names]\
            .reindex(item_ids, axis=0)\
            .data
        if item_indicator is not None:
            item_feat_csr = sparse.hstack([item_feat_csr, item_indicator])
        return item_feat_csr
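For context, sparse.hstack here is scipy's column-wise concatenation of sparse matrices; a minimal illustration:

from scipy import sparse
a = sparse.csr_matrix(np.identity(2))
b = sparse.csr_matrix(np.ones((2, 1)))
assert sparse.hstack([a, b]).shape == (2, 3)   # columns are appended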
Example #10
def read_npz(path, sorted=False):
    """
    Read SparseFrame from npz archives

    Parameters
    ----------
    path: str
        path to load files from; may contain '*' to
        reference multiple files
    sorted: bool
        if the files are sorted, read the index of each file
        to obtain divisions

    Returns
    -------
        dsf: sparsity.dask.SparseFrame
    """
    dsk = {}
    name = 'read_npz-{}'.format(tokenize(path))
    _paths = _sorted(list(glob(path)))
    archive = np.load(_paths[0])

    meta_idx, meta_cols = archive['frame_index'], archive['frame_columns']
    meta = sp.SparseFrame(np.empty(shape=(0, len(meta_cols))),
                          index=meta_idx[:0],
                          columns=meta_cols)
    for i, p in enumerate(_paths):
        dsk[name, i] = (sp.SparseFrame.read_npz, p)

    if sorted:
        level = 0 if isinstance(meta_idx, pd.MultiIndex) else None
        divisions = _npz_read_divisions(_paths, level=level)
    else:
        divisions = [None] * (len(_paths) + 1)

    return SparseFrame(dsk, name, meta, divisions=divisions)
Example #11
def foo(sf, x, y):
    return sp.SparseFrame(sf.data * x * y,
                          index=sf.index,
                          columns=sf.columns)
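foo only touches the data/index/columns attributes seen throughout these examples, so it can be exercised directly (a sketch):

sf = sp.SparseFrame(np.identity(3))
scaled = foo(sf, 2, 1)   # scales the stored values by 2 * 1
assert (scaled.data.toarray() == sf.data.toarray() * 2).all()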
Example #12
def meta_nonempty_sparsity(x):
    # build a non-empty stand-in matching x's schema: dask's
    # _nonempty_index supplies placeholder index values, the data is all-zero
    idx = _nonempty_index(x.index)
    return sp.SparseFrame(sparse.csr_matrix((len(idx), len(x.columns))),
                          index=idx,
                          columns=x.columns)
Example #13
def one_hot_encode(ddf, column=None, categories=None, index_col=None,
                   order=None, prefixes=False, sep='_',
                   ignore_cat_order_mismatch=False):
    """
    Sparse one hot encoding of dask.DataFrame.

    Convert a dask.DataFrame into a series of SparseFrames by one-hot
    encoding specified columns.

    Parameters
    ----------
    ddf: dask.DataFrame
        e.g. the clickstream
    categories: dict
        Maps a column name to a specification of how to treat that column.
        The specification can be:
        - an iterable of possible category values;
        - ``None`` if the column is already of categorical dtype;
        - ``False`` if the column should not be one-hot encoded; it will be
          included in the result untouched.
        This argument decides which column(s) will be processed by this
        function. See the descriptions of `order` and
        `ignore_cat_order_mismatch`.

        By default, one-hot-encode all categorical columns and include
        all other columns untouched.
    index_col: str | iterable
        which column(s) to use as index
    order: iterable
        Specify order in which one-hot encoded columns should be aligned.
        Must have the same elements as keys of ``categories``.

        If `order = [col_name1, col_name2]`
        and `categories = {col_name1: ['A', 'B'], col_name2: ['C', 'D']}`,
        then the resulting SparseFrame will have columns
        `['A', 'B', 'C', 'D']`.

        If you don't specify ``order``, the output column order depends on
        the iteration order of the ``categories`` dictionary. You can pass
        ``categories`` as an OrderedDict instead of providing ``order``
        explicitly.
    prefixes: bool
        If False, column names will be the same as categories,
        so that new columns will be named like:
        [cat11, cat12, cat21, cat22, ...].

        If True, original column name followed by a separator will be added
        in front of each category name, so that new columns will be named like:
        [col1_cat11, col1_cat12, col2_cat21, col2_cat22, ...].
        See ``sep`` argument.
    sep: str
        Separator used when ``prefixes`` is True.
    column: DEPRECATED
        Kept only for backward compatibility.
    ignore_cat_order_mismatch: bool
        If a column being one-hot encoded is of categorical dtype, its
        categories are already predefined, so they don't need to be passed
        explicitly via the `categories` argument (see that argument's
        description). If they are passed anyway and differ from the ones
        defined in ``column.cat.categories``, a ValueError is raised.
        However, if only the order of the categories differs (but the sets
        of elements are the same), you may pass
        ``ignore_cat_order_mismatch=True`` to suppress this error. In that
        case the column's predefined categories are used.

    Returns
    -------
        sparse_one_hot: sparsity.dask.SparseFrame
    """
    idx_meta = ddf._meta.reset_index().set_index(index_col).index[:0] \
        if index_col else ddf._meta.index

    columns = sparse_one_hot(ddf._meta,
                             column=column,
                             categories=categories,
                             index_col=index_col,
                             order=order,
                             prefixes=prefixes,
                             sep=sep,
                             ignore_cat_order_mismatch=ignore_cat_order_mismatch
                             ).columns
    meta = sp.SparseFrame(np.empty(shape=(0, len(columns))), columns=columns,
                          index=idx_meta)

    dsf = ddf.map_partitions(sparse_one_hot,
                             column=column,
                             categories=categories,
                             index_col=index_col,
                             order=order,
                             prefixes=prefixes,
                             sep=sep,
                             ignore_cat_order_mismatch=ignore_cat_order_mismatch,
                             meta=object)

    return SparseFrame(dsf.dask, dsf._name, meta, dsf.divisions)
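A hypothetical end-to-end sketch (column and category names made up; dd is dask.dataframe):

import dask.dataframe as dd

df = pd.DataFrame({'color': ['a', 'b', 'a']})
ddf = dd.from_pandas(df, npartitions=1)
dsf = one_hot_encode(ddf, categories={'color': ['a', 'b']})
dense = dsf.compute().todense()   # columns ['a', 'b'], one row per input row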