Example #1
0
    def to_dask(self, columns=None):
        """Build a dask collection with one load task per stored partition.

        Parameters
        ----------
        columns : optional
            What to load from each partition.  Falls back to
            ``self.columns`` when omitted.  A ``list`` yields a
            ``dd.DataFrame``; anything else yields a ``dd.Series``.

        Returns
        -------
        dd.DataFrame or dd.Series
            A collection whose graph holds one ``Castra.load_partition``
            task for every entry of ``self.partitions.values``.
        """
        import dask.dataframe as dd

        selected = self.columns if columns is None else columns

        # The graph name is derived from the path and its modification time,
        # so it only changes when one of those two values changes.
        fingerprint = str((self.path, os.path.getmtime(self.path)))
        name = 'from-castra-' + md5(fingerprint.encode()).hexdigest()

        divisions = [self.minimum]
        divisions.extend(self.partitions.index.tolist())
        if '.index' in self.categories:
            # Translate the divisions through the '.index' category entries.
            # NOTE(review): presumably the interior divisions are integer
            # codes into that sequence -- confirm against the writer side.
            labels = self.categories['.index']
            remapped = [labels[0]]
            remapped.extend(labels[code + 1] for code in divisions[1:-1])
            remapped.append(labels[-1])
            divisions = remapped

        dsk = {}
        for i, part in enumerate(self.partitions.values):
            dsk[(name, i)] = (Castra.load_partition, self, part, selected)

        collection = dd.DataFrame if isinstance(selected, list) else dd.Series
        return collection(dsk, name, selected, divisions)
Example #2
0
def test_loc_with_text_dates():
    """Check ``.loc`` slicing with date strings on a two-partition series."""
    first_half = tm.makeTimeSeries(10).iloc[:5]
    second_half = tm.makeTimeSeries(10).iloc[5:]

    graph = {('df', 0): first_half, ('df', 1): second_half}
    divisions = [
        first_half.index.min(),
        second_half.index.min(),
        second_half.index.max(),
    ]
    s = dd.Series(graph, 'df', first_half, divisions)

    # The year-level slice is expected to leave the divisions and the data
    # unchanged.
    years = slice('2000', '2010')
    assert s.loc[years].divisions == s.divisions
    assert_eq(s.loc[years], s)

    # The day-level slice is expected to select exactly three rows.
    selected = s.loc[slice('2000-01-03', '2000-01-05')].compute()
    assert len(selected) == 3
Example #3
0
def _construct_dask_df_with_divisions(df):
    """Return a new dask collection layered on top of ``df``.

    One task is added per partition of ``df``; each calls
    ``_add_to_index`` with the matching source partition key and the
    division at the same position.  The new tasks are merged with
    ``df.dask`` and wrapped in the same collection type as ``df``.

    Returns ``None`` when ``df`` is neither a ``dd.DataFrame`` nor a
    ``dd.Series``.
    """
    divisions = _get_divisions(df)
    source_name = df._name
    name = 'csv-index' + source_name

    dsk = {}
    for i in range(df.npartitions):
        dsk[(name, i)] = (_add_to_index, (source_name, i), divisions[i])

    from toolz import merge

    if isinstance(df, dd.DataFrame):
        graph = merge(dsk, df.dask)
        return dd.DataFrame(graph, name, df.columns, divisions)
    if isinstance(df, dd.Series):
        graph = merge(dsk, df.dask)
        return dd.Series(graph, name, df.name, divisions)
    return None
Example #4
0
 def to_dask(self, columns=None):
     """Build a dask collection with one load task per stored partition.

     ``columns`` falls back to ``self.columns`` when omitted.  A ``list``
     yields a ``dd.DataFrame``; anything else yields a ``dd.Series``.
     """
     selected = self.columns if columns is None else columns

     import dask.dataframe as dd

     # The graph name takes its suffix from dask's ``dd.core.tokens``
     # iterator.
     name = 'from-castra' + next(dd.core.tokens)

     dsk = {}
     for i, part in enumerate(self.partitions.values):
         dsk[(name, i)] = (Castra.load_partition, self, part, selected)

     divisions = [self.minimum]
     divisions.extend(self.partitions.index)

     collection = dd.DataFrame if isinstance(selected, list) else dd.Series
     return collection(dsk, name, selected, divisions)
Example #5
0
def _construct_dask_df_with_divisions(df):
    """Return a new dask collection layered on top of ``df``.

    One task is added per partition of ``df``; each calls
    ``_add_to_index`` with the matching source partition key and the
    division at the same position.  The new tasks are merged with
    ``df.dask`` and wrapped in the same collection type as ``df``.

    Returns ``None`` when ``df`` is neither a ``dd.DataFrame`` nor a
    ``dd.Series``.
    """
    divisions = _get_divisions(df)
    # pylint: disable=protected-access
    source_name = df._name
    # pylint: enable=protected-access
    name = 'csv-index' + source_name

    dsk = {}
    for i in range(df.npartitions):
        dsk[(name, i)] = (_add_to_index, (source_name, i), divisions[i])

    from toolz import merge  # pylint: disable=g-import-not-at-top

    if isinstance(df, dd.DataFrame):
        graph = merge(dsk, df.dask)
        return dd.DataFrame(graph, name, df.columns, divisions)
    if isinstance(df, dd.Series):
        graph = merge(dsk, df.dask)
        return dd.Series(graph, name, df.name, divisions)
    return None
def test_loc_with_text_dates():
    """Check ``.loc`` slicing with date strings on a two-partition series."""
    first_half = dd._compat.makeTimeSeries().iloc[:5]
    second_half = dd._compat.makeTimeSeries().iloc[5:]

    graph = {("df", 0): first_half, ("df", 1): second_half}
    divisions = [
        first_half.index.min(),
        second_half.index.min(),
        second_half.index.max(),
    ]
    s = dd.Series(graph, "df", first_half, divisions)

    # The year-level slice is expected to leave the divisions and the data
    # unchanged.
    years = slice("2000", "2010")
    assert s.loc[years].divisions == s.divisions
    assert_eq(s.loc[years], s)

    # The day-level slice is expected to select exactly three rows.
    selected = s.loc[slice("2000-01-03", "2000-01-05")].compute()
    assert len(selected) == 3
Example #7
0
    def to_dask(self, columns=None):
        """Build a dask collection with one load task per stored partition.

        Parameters
        ----------
        columns : optional
            What to load from each partition.  Falls back to
            ``self.columns`` when omitted.  A ``list`` yields a
            ``dd.DataFrame``; anything else yields a ``dd.Series``.

        Returns
        -------
        dd.DataFrame or dd.Series
            A collection whose divisions are ``self.minimum`` followed by
            the entries of ``self.partitions.index``.
        """
        import dask.dataframe as dd

        selected = self.columns if columns is None else columns

        # The graph name is derived from the path and its modification time,
        # so it only changes when one of those two values changes.
        fingerprint = str((self.path, os.path.getmtime(self.path)))
        name = 'from-castra-' + md5(fingerprint.encode()).hexdigest()

        dsk = {
            (name, i): (Castra.load_partition, self, part, selected)
            for i, part in enumerate(self.partitions.values)
        }

        divisions = [self.minimum]
        divisions.extend(self.partitions.index)

        collection = dd.DataFrame if isinstance(selected, list) else dd.Series
        return collection(dsk, name, selected, divisions)