Example #1
0
 def check_and_return(ddfs, dfs, join):
     """Concatenate with dask, compare against the in-memory result, return it.

     ``dfs`` are concatenated with ``concat`` to build the expected value and
     ``ddfs`` with ``dd.concat`` to build the result under test; the two are
     compared with ``assert_eq``.

     NOTE(review): ``divisions``, ``known`` and ``cat_index`` are not
     parameters; they are read from an enclosing scope that is not visible
     here -- confirm their meaning against the surrounding test.

     Returns
     -------
     The collection produced by ``dd.concat``.
     """
     expected = concat(dfs, join=join)
     result = dd.concat(ddfs, join=join, interleave_partitions=divisions)
     assert_eq(result, expected)
     if known:
         computed = compute_as_if_collection(
             dd.DataFrame, result.dask, result.__dask_keys__()
         )
         empties = [part.iloc[:0] for part in computed]
         for empty in empties:
             # The comparison result is intentionally discarded.
             result._meta == empty  # will error if schemas don't align
     if cat_index:
         assert has_known_categories(result.index) == known
     return result
Example #2
0
def _get_categories_agg(parts):
    """Merge per-part category information into de-duplicated results.

    Parameters
    ----------
    parts : sequence
        Each element is indexed as ``part[0]`` (a mapping whose values are
        collected per key) and ``part[1]`` (an index-like object, or ``None``).

    Returns
    -------
    tuple
        ``(categories, index)`` where ``categories`` maps each key to the
        concatenated, de-duplicated values gathered for it, and ``index`` is
        ``None`` when the first part's ``part[1]`` is ``None``; otherwise it
        is the first one with all the others appended, de-duplicated.
    """
    gathered = defaultdict(list)
    index_pieces = []
    for part in parts:
        for key, values in part[0].items():
            gathered[key].append(values)
        index_pieces.append(part[1])

    categories = {}
    for key, chunks in gathered.items():
        merged = methods.concat(chunks, ignore_index=True)
        categories[key] = merged.drop_duplicates()

    # Only the first part decides whether index information is present.
    first_index = index_pieces[0]
    if first_index is None:
        return categories, None
    combined_index = first_index.append(index_pieces[1:])
    return categories, combined_index.drop_duplicates()
Example #3
0
def _tail_timedelta(prevs, current, before):
    """Return the concatenated rows of each dataframe in ``prevs`` whose
    index is after the first observation in ``current`` - ``before``.

    Parameters
    ----------
    prevs : list of DataFrame objects
    current : DataFrame
    before : timedelta

    Returns
    -------
    overlapped : DataFrame
    """
    # The cutoff depends only on ``current`` and ``before``, so compute it
    # once rather than once per frame in ``prevs``.
    cutoff = current.index.min() - before
    return methods.concat([prev[prev.index > cutoff] for prev in prevs])
Example #4
0
def test_concat_datetimeindex():
    """Concatenate categorical frames that are indexed by a ``DatetimeIndex``.

    Regression test for https://github.com/dask/dask/issues/2932
    """
    def make_frame(value, stamp):
        # One-row frame with a categorical ``x`` column over the shared
        # categories and a nanosecond-resolution datetime index.
        index = pd.DatetimeIndex([stamp], dtype='datetime64[ns]')
        frame = pd.DataFrame({'x': [value]}, index=index)
        frame['x'] = frame.x.astype('category').cat.set_categories(['a', 'c'])
        return frame

    b2 = make_frame('a', '2015-03-24 00:00:16')
    b3 = make_frame('c', '2015-03-29 00:00:44')

    db2 = dd.from_pandas(b2, 1)
    db3 = dd.from_pandas(b3, 1)

    # Concatenating the empty slices must keep the datetime index dtype.
    empty_concat = concat([b2.iloc[:0], b3.iloc[:0]])
    assert empty_concat.index.dtype == '<M8[ns]'

    # The dask concatenation must match the pandas one.
    assert_eq(dd.concat([db2, db3]), pd.concat([b2, b3]))
Example #5
0
def _combined_parts(prev_part, current_part, next_part, before, after):
    """Concatenate a partition with whichever neighbouring parts are present.

    Parameters
    ----------
    prev_part, current_part, next_part : object or None
        Parts to concatenate, in this order; ``None`` entries are skipped.
    before, after : object
        When one of these is an ``Integral`` and the matching neighbour is
        not ``None``, the neighbour must have exactly that many rows.

    Returns
    -------
    CombinedOutput
        Wraps ``(combined, len(prev_part) or None, len(next_part) or None)``,
        with ``None`` standing in for the length of a missing neighbour.

    Raises
    ------
    NotImplementedError
        If a neighbour's row count differs from its integral window size.
    """
    msg = ("Partition size is less than overlapping "
           "window size. Try using ``df.repartition`` "
           "to increase the partition size.")

    # Same check for both neighbours: previous first, then next.
    for neighbour, width in ((prev_part, before), (next_part, after)):
        if neighbour is None or not isinstance(width, Integral):
            continue
        if neighbour.shape[0] != width:
            raise NotImplementedError(msg)

    available = []
    for piece in (prev_part, current_part, next_part):
        if piece is not None:
            available.append(piece)
    merged = methods.concat(available)

    prev_length = None if prev_part is None else len(prev_part)
    next_length = None if next_part is None else len(next_part)
    return CombinedOutput((merged, prev_length, next_length))
Example #6
0
def overlap_chunk(
    func, prev_part, current_part, next_part, before, after, args, kwargs
):
    """Apply ``func`` to a partition padded with its neighbours, then trim.

    The non-``None`` parts are concatenated in the order previous, current,
    next; ``func`` is called on the result, and the leading and trailing rows
    of its output that correspond to the neighbouring parts are sliced off.

    Parameters
    ----------
    func : callable
        Called as ``func(combined, *args, **kwargs)``. Its result must expose
        ``.shape`` and ``.iloc``.
    prev_part, next_part : object or None
        Neighbouring parts; ``None`` means there is nothing to prepend/append.
    current_part : object
        The part the result is wanted for.
    before, after : Integral or datetime.timedelta
        Size of the leading/trailing overlap. For a ``timedelta`` the number
        of rows to trim is taken from the length of the neighbouring part.
    args, kwargs : tuple, dict
        Extra positional and keyword arguments forwarded to ``func``.

    Returns
    -------
    The output of ``func`` with the overlap rows removed.

    Raises
    ------
    NotImplementedError
        If a neighbour's row count differs from its integral window size.
    """
    msg = (
        "Partition size is less than overlapping "
        "window size. Try using ``df.repartition`` "
        "to increase the partition size."
    )

    if prev_part is not None and isinstance(before, Integral):
        if prev_part.shape[0] != before:
            raise NotImplementedError(msg)

    if next_part is not None and isinstance(after, Integral):
        if next_part.shape[0] != after:
            raise NotImplementedError(msg)

    parts = [p for p in (prev_part, current_part, next_part) if p is not None]
    combined = methods.concat(parts)
    out = func(combined, *args, **kwargs)
    if prev_part is None:
        before = None
    if isinstance(before, datetime.timedelta):
        before = len(prev_part)

    # When ``func`` returns a whole multiple of its input rows, scale the
    # number of rows to trim by the same factor.
    expansion = None
    if combined.shape[0] != 0:
        expansion = out.shape[0] // combined.shape[0]
    if before and expansion:
        before *= expansion
    if next_part is None:
        return out.iloc[before:]
    if isinstance(after, datetime.timedelta):
        after = len(next_part)
    if after and expansion:
        after *= expansion
    # ``-0 == 0``, so ``iloc[before:-after]`` would select nothing when there
    # are no trailing rows to trim (e.g. an empty ``next_part``); use an
    # open-ended stop in that case.
    stop = -after if after else None
    return out.iloc[before:stop]