def shard_df_on_index(df, divisions):
    """Split a DataFrame into consecutive pieces by cut points on its index.

    The frame is sorted by index and sliced at the positions where each
    value in ``divisions`` would be inserted (``searchsorted`` with its
    default side), so a division value is the first index value of the
    piece that follows it.  ``len(divisions) + 1`` pieces are yielded.

    Parameters
    ----------
    df : DataFrame
        Frame to split.
    divisions : sequence or iterator
        Cut points, expressed as index values.  An iterator is
        materialized into a list first.  When empty, ``df`` itself is
        yielded once, without sorting.

    Yields
    ------
    DataFrame
        The pieces, in index order.

    Examples
    --------
    >>> df = pd.DataFrame({'a': [0, 10, 20, 30, 40], 'b': [5, 4 ,3, 2, 1]})
    >>> df
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1

    >>> shards = list(shard_df_on_index(df, [2, 4]))
    >>> shards[0]
        a  b
    0   0  5
    1  10  4

    >>> shards[1]
        a  b
    2  20  3
    3  30  2

    >>> shards[2]
        a  b
    4  40  1

    >>> list(shard_df_on_index(df, []))[0]  # empty case
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1
    """
    from dask.dataframe.categorical import iscategorical

    if isinstance(divisions, Iterator):
        divisions = list(divisions)

    # Nothing to cut on: hand back the frame untouched and stop.
    if len(divisions) == 0:
        yield df
        return

    boundaries = np.array(divisions)
    ordered = df.sort_index()

    idx = ordered.index
    if iscategorical(idx.dtype):
        # Switch a categorical index to its ordered form before searching.
        idx = idx.as_ordered()
    cuts = idx.searchsorted(boundaries)

    # Head piece, then one piece per adjacent pair of cuts, then the tail.
    yield ordered.iloc[:cuts[0]]
    for start, stop in zip(cuts[:-1], cuts[1:]):
        yield ordered.iloc[start:stop]
    yield ordered.iloc[cuts[-1]:]
def shard_df_on_index(df, divisions):
    """Yield index-range shards of ``df`` delimited by ``divisions``.

    With no divisions, the input frame is yielded once, as-is.  Otherwise
    the frame is sorted by its index, each division is located in the
    sorted index with ``searchsorted``, and the rows between successive
    positions are yielded as separate frames: everything before the first
    division, each span between neighbouring divisions, and everything
    from the last division onward.

    Parameters
    ----------
    df : DataFrame
        The frame to shard.
    divisions : sequence or iterator
        Index values at which to split; an iterator is turned into a list
        before use.

    Yields
    ------
    DataFrame
        ``len(divisions) + 1`` shards (or just ``df`` when ``divisions``
        is empty).

    Examples
    --------
    >>> df = pd.DataFrame({'a': [0, 10, 20, 30, 40], 'b': [5, 4 ,3, 2, 1]})
    >>> df
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1

    >>> shards = list(shard_df_on_index(df, [2, 4]))
    >>> shards[0]
        a  b
    0   0  5
    1  10  4

    >>> shards[1]
        a  b
    2  20  3
    3  30  2

    >>> shards[2]
        a  b
    4  40  1

    >>> list(shard_df_on_index(df, []))[0]  # empty case
        a  b
    0   0  5
    1  10  4
    2  20  3
    3  30  2
    4  40  1
    """
    from dask.dataframe.categorical import iscategorical

    if isinstance(divisions, Iterator):
        divisions = list(divisions)

    if len(divisions):
        split_values = np.array(divisions)
        frame = df.sort_index()

        lookup = frame.index
        if iscategorical(lookup.dtype):
            # Use the ordered form of a categorical index for the search.
            lookup = lookup.as_ordered()
        positions = lookup.searchsorted(split_values)

        # Pad with None so the first and last slices are open-ended,
        # i.e. iloc[:p0] and iloc[p_last:].
        edges = [None]
        edges.extend(positions)
        edges.append(None)
        for lower, upper in zip(edges, edges[1:]):
            yield frame.iloc[lower:upper]
    else:
        yield df