def test_append2(): dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})} meta = make_meta({'a': 'i8', 'b': 'i8'}) ddf1 = dd.DataFrame(dsk, 'x', meta, [None, None]) dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}), ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}), ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})} ddf2 = dd.DataFrame(dsk, 'y', meta, [None, None]) dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60]}), ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10]})} meta = make_meta({'b': 'i8', 'c': 'i8'}) ddf3 = dd.DataFrame(dsk, 'y', meta, [None, None]) assert_eq(ddf1.append(ddf2), ddf1.compute().append(ddf2.compute())) assert_eq(ddf2.append(ddf1), ddf2.compute().append(ddf1.compute())) # Series + DataFrame with pytest.warns(None): # RuntimeWarning from pandas on comparing int and str assert_eq(ddf1.a.append(ddf2), ddf1.a.compute().append(ddf2.compute())) assert_eq(ddf2.a.append(ddf1), ddf2.a.compute().append(ddf1.compute())) # different columns assert_eq(ddf1.append(ddf3), ddf1.compute().append(ddf3.compute())) assert_eq(ddf3.append(ddf1), ddf3.compute().append(ddf1.compute())) # Series + DataFrame with pytest.warns(None): # RuntimeWarning from pandas on comparing int and str assert_eq(ddf1.a.append(ddf3), ddf1.a.compute().append(ddf3.compute())) assert_eq(ddf3.b.append(ddf1), ddf3.b.compute().append(ddf1.compute())) # Dask + pandas assert_eq(ddf1.append(ddf2.compute()), ddf1.compute().append(ddf2.compute())) assert_eq(ddf2.append(ddf1.compute()), ddf2.compute().append(ddf1.compute())) # Series + DataFrame with pytest.warns(None): # RuntimeWarning from pandas on comparing int and str assert_eq(ddf1.a.append(ddf2.compute()), ddf1.a.compute().append(ddf2.compute())) assert_eq(ddf2.a.append(ddf1.compute()), ddf2.a.compute().append(ddf1.compute())) # different columns assert_eq(ddf1.append(ddf3.compute()), ddf1.compute().append(ddf3.compute())) assert_eq(ddf3.append(ddf1.compute()), ddf3.compute().append(ddf1.compute())) # Series + DataFrame with pytest.warns(None): # RuntimeWarning from pandas on comparing int and str assert_eq(ddf1.a.append(ddf3.compute()), ddf1.a.compute().append(ddf3.compute())) assert_eq(ddf3.b.append(ddf1.compute()), ddf3.b.compute().append(ddf1.compute()))
def test_categorize(): dsk = { ('x', 0): pd.DataFrame({ 'a': ['Alice', 'Bob', 'Alice'], 'b': ['C', 'D', 'E'] }, index=[0, 1, 2]), ('x', 1): pd.DataFrame({ 'a': ['Bob', 'Charlie', 'Charlie'], 'b': ['A', 'A', 'B'] }, index=[3, 4, 5]) } meta = make_meta({'a': 'O', 'b': 'O'}, index=pd.Index([], 'i8')) d = dd.DataFrame(dsk, 'x', meta, [0, 3, 5]) full = d.compute() c = d.categorize('a') cfull = c.compute() assert cfull.dtypes['a'] == 'category' assert cfull.dtypes['b'] == 'O' assert list(cfull.a.astype('O')) == list(full.a) assert (d._get(c.dask, c._keys()[:1])[0].dtypes == cfull.dtypes).all() assert (d.categorize().compute().dtypes == 'category').all()
def test_unknown_categoricals(shuffle_method): ddf = dd.DataFrame( {("unknown", i): df for (i, df) in enumerate(frames)}, "unknown", make_meta( { "v": "object", "w": "category", "x": "i8", "y": "category", "z": "f8" }, parent_meta=frames[0], ), [None] * 4, ) # Compute df = ddf.compute() assert_eq(ddf.w.value_counts(), df.w.value_counts()) assert_eq(ddf.w.nunique(), df.w.nunique()) assert_eq(ddf.groupby(ddf.w).sum(), df.groupby(df.w).sum()) assert_eq(ddf.groupby(ddf.w).y.nunique(), df.groupby(df.w).y.nunique()) assert_eq(ddf.y.groupby(ddf.w).count(), df.y.groupby(df.w).count())
def from_pandas(df, npartitions=None, chunksize=None, name=None): """ Parameters ---------- df : pandas.DataFrame or pandas.Series The DataFrame/Series with which to construct a Dask DataFrame/Series npartitions : int, optional The number of partitions of the index to create. Note that depending on the size and index of the dataframe, the output may have fewer partitions than requested. chunksize : int, optional The size of the partitions of the index. name: string, optional An optional keyname for the dataframe. Define when dataframe large. Defaults to hashing the input. Hashing takes a lot of time on large df. """ nrows = df.shape[0] if chunksize is None: chunksize = int(ceil(nrows / npartitions)) else: npartitions = int(ceil(nrows / chunksize)) if not df.index.is_monotonic_increasing: df = df.sort_index() divisions, locations = sorted_division_locations(df.index, chunksize=chunksize) name = name or 'from_pandas-{}'.format(tokenize(df, npartitions)) dsk = dict( ((name, i), sp.SparseFrame(df.iloc[start:stop])) for i, (start, stop) in enumerate(zip(locations[:-1], locations[1:]))) meta = make_meta(df) return SparseFrame(dsk, name, meta, divisions)
def from_delayed(dfs, meta=None, divisions=None, prefix="from-delayed"): for df in dfs: if not isinstance(df, Delayed): raise TypeError("Expected Delayed object, got %s" % type(df).__name__) if meta is None: warnings.warn("`from_delayed` must compute meta. Pass `meta` argument " "to avoid computation.") meta = delayed(make_meta)(dfs[0]).compute() else: meta = make_meta(meta) name = prefix + "-" + tokenize(*dfs) dsk = merge(df.dask for df in dfs) dsk.update({(name, i): (lambda x: x, df.key) for (i, df) in enumerate(dfs)}) if divisions is None or divisions == "sorted": divs = [None] * (len(dfs) + 1) else: divs = tuple(divisions) if len(divs) != len(dfs) + 1: raise ValueError("divisions should be a tuple of len(dfs) + 1") sf = SparseFrame(dsk, name, meta, divisions=divs) return sf
def test_reduction_series_invalid_axis(): dsk = { ('x', 0): pd.DataFrame({ 'a': [1, 2, 3], 'b': [4, 5, 6] }, index=[0, 1, 3]), ('x', 1): pd.DataFrame({ 'a': [4, 5, 6], 'b': [3, 2, 1] }, index=[5, 6, 8]), ('x', 2): pd.DataFrame({ 'a': [7, 8, 9], 'b': [0, 0, 0] }, index=[9, 9, 9]) } meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() for axis in [1, 'columns']: for s in [ddf1.a, pdf1.a]: # both must behave the same pytest.raises(ValueError, lambda: s.sum(axis=axis)) pytest.raises(ValueError, lambda: s.prod(axis=axis)) pytest.raises(ValueError, lambda: s.min(axis=axis)) pytest.raises(ValueError, lambda: s.max(axis=axis)) # only count doesn't have axis keyword pytest.raises(TypeError, lambda: s.count(axis=axis)) pytest.raises(ValueError, lambda: s.std(axis=axis)) pytest.raises(ValueError, lambda: s.var(axis=axis)) pytest.raises(ValueError, lambda: s.sem(axis=axis)) pytest.raises(ValueError, lambda: s.mean(axis=axis))
def test_pivot_table_errors(): df = pd.DataFrame({ "A": np.random.choice(list("abc"), size=10), "B": np.random.randn(10), "C": pd.Categorical(np.random.choice(list("abc"), size=10)), }) ddf = dd.from_pandas(df, 2) msg = "'index' must be the name of an existing column" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index=["A"], columns="C", values="B") assert msg in str(err.value) msg = "'columns' must be the name of an existing column" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index="A", columns=["C"], values="B") assert msg in str(err.value) msg = "'values' must refer to an existing column or columns" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index="A", columns="C", values=[["B"]]) assert msg in str(err.value) msg = "aggfunc must be either 'mean', 'sum' or 'count'" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index="A", columns="C", values="B", aggfunc=["sum"]) assert msg in str(err.value) with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index="A", columns="C", values="B", aggfunc="xx") assert msg in str(err.value) # unknown categories ddf._meta = make_meta({ "A": object, "B": float, "C": "category" }, parent_meta=pd.DataFrame()) msg = "'columns' must have known categories" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index="A", columns="C", values=["B"]) assert msg in str(err.value) df = pd.DataFrame({ "A": np.random.choice(list("abc"), size=10), "B": np.random.randn(10), "C": np.random.choice(list("abc"), size=10), }) ddf = dd.from_pandas(df, 2) msg = "'columns' must be category dtype" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index="A", columns="C", values="B") assert msg in str(err.value)
def test_reductions_frame(): dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[0, 1, 3]), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}, index=[5, 6, 8]), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]}, index=[9, 9, 9])} meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() assert eq(ddf1.sum(), pdf1.sum()) assert eq(ddf1.min(), pdf1.min()) assert eq(ddf1.max(), pdf1.max()) assert eq(ddf1.count(), pdf1.count()) assert eq(ddf1.std(), pdf1.std()) assert eq(ddf1.var(), pdf1.var()) assert eq(ddf1.std(ddof=0), pdf1.std(ddof=0)) assert eq(ddf1.var(ddof=0), pdf1.var(ddof=0)) assert eq(ddf1.mean(), pdf1.mean()) for axis in [0, 1, 'index', 'columns']: assert eq(ddf1.sum(axis=axis), pdf1.sum(axis=axis)) assert eq(ddf1.min(axis=axis), pdf1.min(axis=axis)) assert eq(ddf1.max(axis=axis), pdf1.max(axis=axis)) assert eq(ddf1.count(axis=axis), pdf1.count(axis=axis)) assert eq(ddf1.std(axis=axis), pdf1.std(axis=axis)) assert eq(ddf1.var(axis=axis), pdf1.var(axis=axis)) assert eq(ddf1.std(axis=axis, ddof=0), pdf1.std(axis=axis, ddof=0)) assert eq(ddf1.var(axis=axis, ddof=0), pdf1.var(axis=axis, ddof=0)) assert eq(ddf1.mean(axis=axis), pdf1.mean(axis=axis)) assert raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute()) # axis=0 assert_dask_graph(ddf1.sum(), 'dataframe-sum') assert_dask_graph(ddf1.min(), 'dataframe-min') assert_dask_graph(ddf1.max(), 'dataframe-max') assert_dask_graph(ddf1.count(), 'dataframe-count') # std, var, mean consists from sum and count operations assert_dask_graph(ddf1.std(), 'dataframe-sum') assert_dask_graph(ddf1.std(), 'dataframe-count') assert_dask_graph(ddf1.var(), 'dataframe-sum') assert_dask_graph(ddf1.var(), 'dataframe-count') assert_dask_graph(ddf1.mean(), 'dataframe-sum') assert_dask_graph(ddf1.mean(), 'dataframe-count') # axis=1 assert_dask_graph(ddf1.sum(axis=1), 'dataframe-sum') assert_dask_graph(ddf1.min(axis=1), 'dataframe-min') assert_dask_graph(ddf1.max(axis=1), 'dataframe-max') assert_dask_graph(ddf1.count(axis=1), 'dataframe-count') assert_dask_graph(ddf1.std(axis=1), 'dataframe-std') assert_dask_graph(ddf1.var(axis=1), 'dataframe-var') assert_dask_graph(ddf1.mean(axis=1), 'dataframe-mean')
def test_pivot_table_errors(): df = pd.DataFrame({ 'A': np.random.choice(list('abc'), size=10), 'B': np.random.randn(10), 'C': pd.Categorical(np.random.choice(list('abc'), size=10)) }) ddf = dd.from_pandas(df, 2) msg = "'index' must be the name of an existing column" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index=['A'], columns='C', values='B') assert msg in str(err.value) msg = "'columns' must be the name of an existing column" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns=['C'], values='B') assert msg in str(err.value) msg = "'values' must be the name of an existing column" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values=['B']) assert msg in str(err.value) msg = "aggfunc must be either 'mean', 'sum' or 'count'" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc=['sum']) assert msg in str(err.value) with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc='xx') assert msg in str(err.value) # unknown categories ddf._meta = make_meta({'A': object, 'B': float, 'C': 'category'}) msg = "'columns' must have known categories" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values=['B']) assert msg in str(err.value) df = pd.DataFrame({ 'A': np.random.choice(list('abc'), size=10), 'B': np.random.randn(10), 'C': np.random.choice(list('abc'), size=10) }) ddf = dd.from_pandas(df, 2) msg = "'columns' must be category dtype" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values='B') assert msg in str(err.value)
def test_concat2(): dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]})} meta = make_meta({'a': 'i8', 'b': 'i8'}) a = dd.DataFrame(dsk, 'x', meta, [None, None]) dsk = {('y', 0): pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]}), ('y', 1): pd.DataFrame({'a': [40, 50, 60], 'b': [30, 20, 10]}), ('y', 2): pd.DataFrame({'a': [70, 80, 90], 'b': [0, 0, 0]})} b = dd.DataFrame(dsk, 'y', meta, [None, None]) dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60]}), ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10]})} meta = make_meta({'b': 'i8', 'c': 'i8'}) c = dd.DataFrame(dsk, 'y', meta, [None, None]) dsk = {('y', 0): pd.DataFrame({'b': [10, 20, 30], 'c': [40, 50, 60], 'd': [70, 80, 90]}), ('y', 1): pd.DataFrame({'b': [40, 50, 60], 'c': [30, 20, 10], 'd': [90, 80, 70]}, index=[3, 4, 5])} meta = make_meta({'b': 'i8', 'c': 'i8', 'd': 'i8'}, index=pd.Index([], 'i8')) d = dd.DataFrame(dsk, 'y', meta, [0, 3, 5]) cases = [[a, b], [a, c], [a, d]] assert dd.concat([a]) is a for case in cases: result = dd.concat(case) pdcase = [_c.compute() for _c in case] assert result.npartitions == case[0].npartitions + case[1].npartitions assert result.divisions == (None, ) * (result.npartitions + 1) assert_eq(pd.concat(pdcase), result) assert set(result.dask) == set(dd.concat(case).dask) result = dd.concat(case, join='inner') assert result.npartitions == case[0].npartitions + case[1].npartitions assert result.divisions == (None, ) * (result.npartitions + 1) assert_eq(pd.concat(pdcase, join='inner'), result) assert set(result.dask) == set(dd.concat(case, join='inner').dask)
def __init__(self, dsk, name, meta, divisions=None): if isinstance(meta, SparseFrame): # TODO: remove this case once we subclass from dask._Frame meta = meta._meta if not isinstance(meta, sp.SparseFrame): meta = sp.SparseFrame(meta) self.dask = dsk self._name = name self._meta = make_meta(meta) self.divisions = tuple(divisions) self.ndim = 2 self.loc = _LocIndexer(self)
def test_unknown_categoricals(): ddf = dd.DataFrame({('unknown', i): df for (i, df) in enumerate(frames)}, 'unknown', make_meta({'v': 'object', 'w': 'category', 'x': 'i8', 'y': 'category', 'z': 'f8'}), [None] * 4) # Compute df = ddf.compute() assert_eq(ddf.w.value_counts(), df.w.value_counts()) assert_eq(ddf.w.nunique(), df.w.nunique()) assert_eq(ddf.groupby(ddf.w).sum(), df.groupby(df.w).sum()) assert_eq(ddf.groupby(ddf.w).y.nunique(), df.groupby(df.w).y.nunique()) assert_eq(ddf.y.groupby(ddf.w).count(), df.y.groupby(df.w).count())
def calculate_meta(self): """ Since Elasticsearch is schemaless it is possible that to indices might have different mappings for the same "type" of documents. To get by this, merge all meta's together. During reading the Parsers will be responsible for handling mssing or different type of data. :return: Empty dataframe containing the expected schema :rtype: pandas.DataFrame """ meta = {} for index in self.indices.values(): meta.update(index.mapping) meta_df = make_meta(meta) return meta_df
def test_categorize(): dsk = {('x', 0): pd.DataFrame({'a': ['Alice', 'Bob', 'Alice'], 'b': ['C', 'D', 'E']}, index=[0, 1, 2]), ('x', 1): pd.DataFrame({'a': ['Bob', 'Charlie', 'Charlie'], 'b': ['A', 'A', 'B']}, index=[3, 4, 5])} meta = make_meta({'a': 'O', 'b': 'O'}, index=pd.Index([], 'i8')) d = dd.DataFrame(dsk, 'x', meta, [0, 3, 5]) full = d.compute() c = d.categorize('a') cfull = c.compute() assert cfull.dtypes['a'] == 'category' assert cfull.dtypes['b'] == 'O' assert list(cfull.a.astype('O')) == list(full.a) assert (d._get(c.dask, c._keys()[:1])[0].dtypes == cfull.dtypes).all() assert (d.categorize().compute().dtypes == 'category').all()
def test_pivot_table_errors(): df = pd.DataFrame({'A': np.random.choice(list('abc'), size=10), 'B': np.random.randn(10), 'C': pd.Categorical(np.random.choice(list('abc'), size=10))}) ddf = dd.from_pandas(df, 2) msg = "'index' must be the name of an existing column" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index=['A'], columns='C', values='B') assert msg in str(err.value) msg = "'columns' must be the name of an existing column" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns=['C'], values='B') assert msg in str(err.value) msg = "'values' must be the name of an existing column" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values=['B']) assert msg in str(err.value) msg = "aggfunc must be either 'mean', 'sum' or 'count'" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc=['sum']) assert msg in str(err.value) with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc='xx') assert msg in str(err.value) # unknown categories ddf._meta = make_meta({'A': object, 'B': float, 'C': 'category'}) msg = "'columns' must have known categories" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values=['B']) assert msg in str(err.value) df = pd.DataFrame({'A': np.random.choice(list('abc'), size=10), 'B': np.random.randn(10), 'C': np.random.choice(list('abc'), size=10)}) ddf = dd.from_pandas(df, 2) msg = "'columns' must be category dtype" with pytest.raises(ValueError) as err: dd.pivot_table(ddf, index='A', columns='C', values='B') assert msg in str(err.value)
def test_get_dummies_errors(): with pytest.raises(NotImplementedError): # not Categorical s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4]) ds = dd.from_pandas(s, 2) dd.get_dummies(ds) # unknown categories df = pd.DataFrame({'x': list('abcbc'), 'y': list('bcbcb')}) ddf = dd.from_pandas(df, npartitions=2) ddf._meta = make_meta({'x': 'category', 'y': 'category'}) with pytest.raises(NotImplementedError): dd.get_dummies(ddf) with pytest.raises(NotImplementedError): dd.get_dummies(ddf, columns=['x', 'y']) with pytest.raises(NotImplementedError): dd.get_dummies(ddf.x)
def test_get_dummies_errors(): with pytest.raises(NotImplementedError): # not Categorical s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4]) ds = dd.from_pandas(s, 2) dd.get_dummies(ds) # unknown categories df = pd.DataFrame({"x": list("abcbc"), "y": list("bcbcb")}) ddf = dd.from_pandas(df, npartitions=2) ddf._meta = make_meta({"x": "category", "y": "category"}) with pytest.raises(NotImplementedError): dd.get_dummies(ddf) with pytest.raises(NotImplementedError): dd.get_dummies(ddf, columns=["x", "y"]) with pytest.raises(NotImplementedError): dd.get_dummies(ddf.x)
def make_warehouse(df_tx, df_block, start_date, end_date, tab='poolminer'): logger.warning("df_tx columns in make_poolminer_warehose:%s", df_tx.columns.tolist()) logger.warning("df_block columns in make_poolminer_warehose:%s", df_block.columns.tolist()) #df_tx = df_tx[['block_timestamp','transaction_hash','from_addr','to_addr','value']] #df_block = df_block[['miner_address','block_number','transaction_hashes']] df_block = df_block.drop(['block_timestamp'], axis=1) try: key_params = ['block_tx_warehouse', tab] meta = make_meta({ 'block_timestamp': 'M8', 'block_number': 'i8', 'miner_address': 'object', 'transaction_hashes': 'object' }) df_block = df_block.map_partitions(explode_transaction_hashes) logger.warning('COLUMNS %s:', df_block.columns.tolist()) df_block.reset_index() # join block and transaction table df = df_block.merge(df_tx, how='left', left_on='transaction_hashes', right_on='transaction_hash') # do the merge\ df = df.drop(['transaction_hashes'], axis=1) values = { 'transaction_hash': 'unknown', 'value': 0, 'from_addr': 'unknown', 'to_addr': 'unknown', 'block_number': 0 } df = df.fillna(value=values) logger.warning(("merged columns", df.columns.tolist())) return df except Exception: logger.error("make poolminer warehouse", exc_info=True)
def test_reduction_series_invalid_axis(): dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[0, 1, 3]), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}, index=[5, 6, 8]), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]}, index=[9, 9, 9])} meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() for axis in [1, 'columns']: for s in [ddf1.a, pdf1.a]: # both must behave the same assert raises(ValueError, lambda: s.sum(axis=axis)) assert raises(ValueError, lambda: s.min(axis=axis)) assert raises(ValueError, lambda: s.max(axis=axis)) # only count doesn't have axis keyword assert raises(TypeError, lambda: s.count(axis=axis)) assert raises(ValueError, lambda: s.std(axis=axis)) assert raises(ValueError, lambda: s.var(axis=axis)) assert raises(ValueError, lambda: s.mean(axis=axis))
def __setup_index_repo(self, mocked_index_repo, index_name, no_of_shards=3, doc_per_shard=1000): index = Index(name='index_name') for shard in range(no_of_shards): shard = Shard(shard_id=shard, node=self.__node, no_of_docs=doc_per_shard, state='STARTED') index.add_shard(shard) mocked_index_repo().indices = {index.name: index} mocked_index_repo.get_documents_count.return_value = doc_per_shard mocked_index_repo().calculate_meta.return_value = make_meta({ 'col1': np.dtype(object), 'col2': np.dtype('float64') })
def test_reductions(): dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[0, 1, 3]), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}, index=[5, 6, 8]), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]}, index=[9, 9, 9])} meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3) nands1 = dd.from_pandas(nans1, 2) nans2 = pd.Series([1] + [np.nan] * 8) nands2 = dd.from_pandas(nans2, 2) nans3 = pd.Series([np.nan] * 9) nands3 = dd.from_pandas(nans3, 2) bools = pd.Series([True, False, True, False, True], dtype=bool) boolds = dd.from_pandas(bools, 2) for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a), (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']), (nands1, nans1), (nands2, nans2), (nands3, nans3), (boolds, bools)]: assert isinstance(dds, dd.Series) assert isinstance(pds, pd.Series) assert eq(dds.sum(), pds.sum()) assert eq(dds.min(), pds.min()) assert eq(dds.max(), pds.max()) assert eq(dds.count(), pds.count()) assert eq(dds.std(), pds.std()) assert eq(dds.var(), pds.var()) assert eq(dds.std(ddof=0), pds.std(ddof=0)) assert eq(dds.var(ddof=0), pds.var(ddof=0)) assert eq(dds.mean(), pds.mean()) assert eq(dds.nunique(), pds.nunique()) assert eq(dds.nbytes, pds.nbytes) assert eq(dds.sum(skipna=False), pds.sum(skipna=False)) assert eq(dds.min(skipna=False), pds.min(skipna=False)) assert eq(dds.max(skipna=False), pds.max(skipna=False)) assert eq(dds.std(skipna=False), pds.std(skipna=False)) assert eq(dds.var(skipna=False), pds.var(skipna=False)) assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0)) assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0)) assert eq(dds.mean(skipna=False), pds.mean(skipna=False)) assert_dask_graph(ddf1.b.sum(), 'series-sum') assert_dask_graph(ddf1.b.min(), 'series-min') assert_dask_graph(ddf1.b.max(), 'series-max') assert_dask_graph(ddf1.b.count(), 'series-count') assert_dask_graph(ddf1.b.std(), 'series-std') assert_dask_graph(ddf1.b.var(), 'series-var') assert_dask_graph(ddf1.b.std(ddof=0), 'series-std') assert_dask_graph(ddf1.b.var(ddof=0), 'series-var') assert_dask_graph(ddf1.b.mean(), 'series-mean') # nunique is performed using drop-duplicates assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates') eq(ddf1.index.min(), pdf1.index.min()) eq(ddf1.index.max(), pdf1.index.max())
def test_arithmetics(): dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[0, 1, 3]), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}, index=[5, 6, 8]), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]}, index=[9, 9, 9])} meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() pdf2 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8], 'b': [5, 6, 7, 8, 1, 2, 3, 4]}) pdf3 = pd.DataFrame({'a': [5, 6, 7, 8, 4, 3, 2, 1], 'b': [2, 4, 5, 3, 4, 2, 1, 0]}) ddf2 = dd.from_pandas(pdf2, 3) ddf3 = dd.from_pandas(pdf3, 2) dsk4 = {('y', 0): pd.DataFrame({'a': [3, 2, 1], 'b': [7, 8, 9]}, index=[0, 1, 3]), ('y', 1): pd.DataFrame({'a': [5, 2, 8], 'b': [4, 2, 3]}, index=[5, 6, 8]), ('y', 2): pd.DataFrame({'a': [1, 4, 10], 'b': [1, 0, 5]}, index=[9, 9, 9])} ddf4 = dd.DataFrame(dsk4, 'y', meta, [0, 4, 9, 9]) pdf4 = ddf4.compute() # Arithmetics cases = [(ddf1, ddf1, pdf1, pdf1), (ddf1, ddf1.repartition([0, 1, 3, 6, 9]), pdf1, pdf1), (ddf2, ddf3, pdf2, pdf3), (ddf2.repartition([0, 3, 6, 7]), ddf3.repartition([0, 7]), pdf2, pdf3), (ddf2.repartition([0, 7]), ddf3.repartition([0, 2, 4, 5, 7]), pdf2, pdf3), (ddf1, ddf4, pdf1, pdf4), (ddf1, ddf4.repartition([0, 9]), pdf1, pdf4), (ddf1.repartition([0, 3, 9]), ddf4.repartition([0, 5, 9]), pdf1, pdf4), # dask + pandas (ddf1, pdf4, pdf1, pdf4), (ddf2, pdf3, pdf2, pdf3)] for (l, r, el, er) in cases: check_series_arithmetics(l.a, r.b, el.a, er.b) check_frame_arithmetics(l, r, el, er) # different index, pandas raises ValueError in comparison ops pdf5 = pd.DataFrame({'a': [3, 2, 1, 5, 2, 8, 1, 4, 10], 'b': [7, 8, 9, 4, 2, 3, 1, 0, 5]}, index=[0, 1, 3, 5, 6, 8, 9, 9, 9]) ddf5 = dd.from_pandas(pdf5, 2) pdf6 = pd.DataFrame({'a': [3, 2, 1, 5, 2, 8, 1, 4, 10], 'b': [7, 8, 9, 5, 7, 8, 4, 2, 5]}, index=[0, 1, 2, 3, 4, 5, 6, 7, 9]) ddf6 = dd.from_pandas(pdf6, 4) pdf7 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8], 'b': [5, 6, 7, 8, 1, 2, 3, 4]}, index=list('aaabcdeh')) pdf8 = pd.DataFrame({'a': [5, 6, 7, 8, 4, 3, 2, 1], 'b': [2, 4, 5, 3, 4, 2, 1, 0]}, index=list('abcdefgh')) ddf7 = dd.from_pandas(pdf7, 3) ddf8 = dd.from_pandas(pdf8, 4) pdf9 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8], 'b': [5, 6, 7, 8, 1, 2, 3, 4], 'c': [5, 6, 7, 8, 1, 2, 3, 4]}, index=list('aaabcdeh')) pdf10 = pd.DataFrame({'b': [5, 6, 7, 8, 4, 3, 2, 1], 'c': [2, 4, 5, 3, 4, 2, 1, 0], 'd': [2, 4, 5, 3, 4, 2, 1, 0]}, index=list('abcdefgh')) ddf9 = dd.from_pandas(pdf9, 3) ddf10 = dd.from_pandas(pdf10, 4) # Arithmetics with different index cases = [(ddf5, ddf6, pdf5, pdf6), (ddf5.repartition([0, 9]), ddf6, pdf5, pdf6), (ddf5.repartition([0, 5, 9]), ddf6.repartition([0, 7, 9]), pdf5, pdf6), (ddf7, ddf8, pdf7, pdf8), (ddf7.repartition(['a', 'c', 'h']), ddf8.repartition(['a', 'h']), pdf7, pdf8), (ddf7.repartition(['a', 'b', 'e', 'h']), ddf8.repartition(['a', 'e', 'h']), pdf7, pdf8), (ddf9, ddf10, pdf9, pdf10), (ddf9.repartition(['a', 'c', 'h']), ddf10.repartition(['a', 'h']), pdf9, pdf10), # dask + pandas (ddf5, pdf6, pdf5, pdf6), (ddf7, pdf8, pdf7, pdf8), (ddf9, pdf10, pdf9, pdf10)] for (l, r, el, er) in cases: check_series_arithmetics(l.a, r.b, el.a, er.b, allow_comparison_ops=False) check_frame_arithmetics(l, r, el, er, allow_comparison_ops=False)
("x", 0): pd.DataFrame({ "a": [1, 2, 3], "b": [4, 5, 6] }, index=[0, 1, 3]), ("x", 1): pd.DataFrame({ "a": [4, 5, 6], "b": [3, 2, 1] }, index=[5, 6, 8]), ("x", 2): pd.DataFrame({ "a": [7, 8, 9], "b": [0, 0, 0] }, index=[9, 9, 9]), } meta = make_meta({ "a": "i8", "b": "i8" }, index=pd.Index([], "i8"), parent_meta=pd.DataFrame()) d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9]) full = d.compute() CHECK_FREQ = {} if PANDAS_GT_110: CHECK_FREQ["check_freq"] = False def test_loc(): assert d.loc[3:8].divisions[0] == 3 assert d.loc[3:8].divisions[-1] == 8 assert d.loc[5].divisions == (5, 5)
""" Function: Author: Du Fei Create Time: 2020/5/31 11:08 """ import numpy as np import dask.array as da from dask.dataframe.utils import make_meta if __name__ == '__main__': source_array = np.random.randint(0, 10, (2, 4)) index_array = np.asarray([[0, 0], [1, 0], [2, 1], [3, 2]]) b = np.apply_along_axis(lambda a: a[index_array], 1, source_array) print(b) source_array = da.from_array(source_array) # b = da.apply_along_axis(lambda a: a[index_array], 1, source_array) res = da.apply_along_axis(lambda a: a[index_array], 1, source_array, shape=make_meta(source_array).shape, dtype=make_meta(source_array).dtype).compute() print(res)
from dask.dataframe.shuffle import (shuffle, partitioning_index, rearrange_by_column, rearrange_by_divisions, maybe_buffered_partd) from dask.async import get_sync from dask.dataframe.utils import assert_eq, make_meta dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 7]}, index=[0, 1, 3]), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [2, 5, 8]}, index=[5, 6, 8]), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [3, 6, 9]}, index=[9, 9, 9])} meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) d = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) full = d.compute() shuffle_func = shuffle # conflicts with keyword argument @pytest.mark.parametrize('shuffle', ['disk', 'tasks']) def test_shuffle(shuffle): s = shuffle_func(d, d.b, shuffle=shuffle) assert isinstance(s, dd.DataFrame) assert s.npartitions == d.npartitions x = get_sync(s.dask, (s._name, 0)) y = get_sync(s.dask, (s._name, 1))
def test_make_meta(): df = pd.DataFrame({'a': [1, 2, 3], 'b': list('abc'), 'c': [1., 2., 3.]}, index=[10, 20, 30]) # Pandas dataframe meta = make_meta(df) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, type(df.index)) # Pandas series meta = make_meta(df.a) assert len(meta) == 0 assert meta.dtype == df.a.dtype assert isinstance(meta.index, type(df.index)) # Pandas index meta = make_meta(df.index) assert isinstance(meta, type(df.index)) assert len(meta) == 0 # Dask object ddf = dd.from_pandas(df, npartitions=2) assert make_meta(ddf) is ddf._meta # Dict meta = make_meta({'a': 'i8', 'b': 'O', 'c': 'f8'}) assert isinstance(meta, pd.DataFrame) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, pd.RangeIndex) # Iterable meta = make_meta([('a', 'i8'), ('c', 'f8'), ('b', 'O')]) assert (meta.columns == ['a', 'c', 'b']).all() assert len(meta) == 0 assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all() assert isinstance(meta.index, pd.RangeIndex) # Tuple meta = make_meta(('a', 'i8')) assert isinstance(meta, pd.Series) assert len(meta) == 0 assert meta.dtype == 'i8' assert meta.name == 'a' # With index meta = make_meta({'a': 'i8', 'b': 'i4'}, pd.Int64Index([1, 2], name='foo')) assert isinstance(meta.index, pd.Int64Index) assert len(meta.index) == 0 meta = make_meta(('a', 'i8'), pd.Int64Index([1, 2], name='foo')) assert isinstance(meta.index, pd.Int64Index) assert len(meta.index) == 0 # Categoricals meta = make_meta({'a': 'category'}) assert len(meta.a.cat.categories) == 1 assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES meta = make_meta(('a', 'category')) assert len(meta.cat.categories) == 1 assert meta.cat.categories[0] == UNKNOWN_CATEGORIES # Numpy scalar meta = make_meta(np.float64(1.0)) assert isinstance(meta, np.float64) # Python scalar meta = make_meta(1.0) assert isinstance(meta, np.float64) # Timestamp x = pd.Timestamp(2000, 1, 1) meta = make_meta(x) assert meta is x # Dtype expressions meta = make_meta('i8') assert isinstance(meta, np.int64) meta = make_meta(float) assert isinstance(meta, np.dtype(float).type) meta = make_meta(np.dtype('bool')) assert isinstance(meta, np.bool_) assert pytest.raises(TypeError, lambda: make_meta(None))
def test_reductions(split_every): dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=[0, 1, 3]), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]}, index=[5, 6, 8]), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]}, index=[9, 9, 9])} meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3) nands1 = dd.from_pandas(nans1, 2) nans2 = pd.Series([1] + [np.nan] * 8) nands2 = dd.from_pandas(nans2, 2) nans3 = pd.Series([np.nan] * 9) nands3 = dd.from_pandas(nans3, 2) bools = pd.Series([True, False, True, False, True], dtype=bool) boolds = dd.from_pandas(bools, 2) for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a), (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']), (nands1, nans1), (nands2, nans2), (nands3, nans3), (boolds, bools)]: assert isinstance(dds, dd.Series) assert isinstance(pds, pd.Series) assert_eq(dds.sum(split_every=split_every), pds.sum()) assert_eq(dds.prod(split_every=split_every), pds.prod()) assert_eq(dds.min(split_every=split_every), pds.min()) assert_eq(dds.max(split_every=split_every), pds.max()) assert_eq(dds.count(split_every=split_every), pds.count()) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.std(split_every=split_every), pds.std()) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.var(split_every=split_every), pds.var()) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.sem(split_every=split_every), pds.sem()) assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0)) assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0)) assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0)) assert_eq(dds.mean(split_every=split_every), pds.mean()) assert_eq(dds.nunique(split_every=split_every), pds.nunique()) assert_eq(dds.sum(skipna=False, split_every=split_every), pds.sum(skipna=False)) assert_eq(dds.prod(skipna=False, split_every=split_every), pds.prod(skipna=False)) assert_eq(dds.min(skipna=False, split_every=split_every), pds.min(skipna=False)) assert_eq(dds.max(skipna=False, split_every=split_every), pds.max(skipna=False)) assert_eq(dds.std(skipna=False, split_every=split_every), pds.std(skipna=False)) assert_eq(dds.var(skipna=False, split_every=split_every), pds.var(skipna=False)) assert_eq(dds.sem(skipna=False, split_every=split_every), pds.sem(skipna=False)) assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every), pds.std(skipna=False, ddof=0)) assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every), pds.var(skipna=False, ddof=0)) assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every), pds.sem(skipna=False, ddof=0)) assert_eq(dds.mean(skipna=False, split_every=split_every), pds.mean(skipna=False)) assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum') assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod') assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min') assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max') assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count') assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std') assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var') assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem') assert_dask_graph(ddf1.b.std(ddof=0, 
split_every=split_every), 'series-std') assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), 'series-var') assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), 'series-sem') assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean') # nunique is performed using drop-duplicates assert_dask_graph(ddf1.b.nunique(split_every=split_every), 'drop-duplicates') # testing index assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min()) assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max()) assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
def test_make_meta(): df = pd.DataFrame( {"a": [1, 2, 3], "b": list("abc"), "c": [1.0, 2.0, 3.0]}, index=[10, 20, 30] ) # Pandas dataframe meta = make_meta(df) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, type(df.index)) # Pandas series meta = make_meta(df.a) assert len(meta) == 0 assert meta.dtype == df.a.dtype assert isinstance(meta.index, type(df.index)) # Pandas index meta = make_meta(df.index) assert isinstance(meta, type(df.index)) assert len(meta) == 0 # Dask object ddf = dd.from_pandas(df, npartitions=2) assert make_meta(ddf) is ddf._meta # Dict meta = make_meta({"a": "i8", "b": "O", "c": "f8"}) assert isinstance(meta, pd.DataFrame) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, pd.RangeIndex) # Iterable meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")]) assert (meta.columns == ["a", "c", "b"]).all() assert len(meta) == 0 assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all() assert isinstance(meta.index, pd.RangeIndex) # Tuple meta = make_meta(("a", "i8")) assert isinstance(meta, pd.Series) assert len(meta) == 0 assert meta.dtype == "i8" assert meta.name == "a" # With index meta = make_meta({"a": "i8", "b": "i4"}, index=pd.Int64Index([1, 2], name="foo")) assert isinstance(meta.index, pd.Int64Index) assert len(meta.index) == 0 meta = make_meta(("a", "i8"), index=pd.Int64Index([1, 2], name="foo")) assert isinstance(meta.index, pd.Int64Index) assert len(meta.index) == 0 # Categoricals meta = make_meta({"a": "category"}) assert len(meta.a.cat.categories) == 1 assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES meta = make_meta(("a", "category")) assert len(meta.cat.categories) == 1 assert meta.cat.categories[0] == UNKNOWN_CATEGORIES # Numpy scalar meta = make_meta(np.float64(1.0)) assert isinstance(meta, np.float64) # Python scalar meta = make_meta(1.0) assert isinstance(meta, np.float64) # Timestamp x = pd.Timestamp(2000, 1, 1) meta = make_meta(x) assert meta is x # Dtype expressions meta = make_meta("i8") assert isinstance(meta, np.int64) meta = make_meta(float) assert isinstance(meta, np.dtype(float).type) meta = make_meta(np.dtype("bool")) assert isinstance(meta, np.bool_) assert pytest.raises(TypeError, lambda: make_meta(None))
def test_arithmetics(): dsk = { ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]), ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]), ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]), } meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8")) ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() pdf2 = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": [5, 6, 7, 8, 1, 2, 3, 4]}) pdf3 = pd.DataFrame({"a": [5, 6, 7, 8, 4, 3, 2, 1], "b": [2, 4, 5, 3, 4, 2, 1, 0]}) ddf2 = dd.from_pandas(pdf2, 3) ddf3 = dd.from_pandas(pdf3, 2) dsk4 = { ("y", 0): pd.DataFrame({"a": [3, 2, 1], "b": [7, 8, 9]}, index=[0, 1, 3]), ("y", 1): pd.DataFrame({"a": [5, 2, 8], "b": [4, 2, 3]}, index=[5, 6, 8]), ("y", 2): pd.DataFrame({"a": [1, 4, 10], "b": [1, 0, 5]}, index=[9, 9, 9]), } ddf4 = dd.DataFrame(dsk4, "y", meta, [0, 4, 9, 9]) pdf4 = ddf4.compute() # Arithmetics cases = [ (ddf1, ddf1, pdf1, pdf1), (ddf1, ddf1.repartition([0, 1, 3, 6, 9]), pdf1, pdf1), (ddf2, ddf3, pdf2, pdf3), (ddf2.repartition([0, 3, 6, 7]), ddf3.repartition([0, 7]), pdf2, pdf3), (ddf2.repartition([0, 7]), ddf3.repartition([0, 2, 4, 5, 7]), pdf2, pdf3), (ddf1, ddf4, pdf1, pdf4), (ddf1, ddf4.repartition([0, 9]), pdf1, pdf4), (ddf1.repartition([0, 3, 9]), ddf4.repartition([0, 5, 9]), pdf1, pdf4), # dask + pandas (ddf1, pdf4, pdf1, pdf4), (ddf2, pdf3, pdf2, pdf3), ] for (l, r, el, er) in cases: check_series_arithmetics(l.a, r.b, el.a, er.b) check_frame_arithmetics(l, r, el, er) # different index, pandas raises ValueError in comparison ops pdf5 = pd.DataFrame( {"a": [3, 2, 1, 5, 2, 8, 1, 4, 10], "b": [7, 8, 9, 4, 2, 3, 1, 0, 5]}, index=[0, 1, 3, 5, 6, 8, 9, 9, 9], ) ddf5 = dd.from_pandas(pdf5, 2) pdf6 = pd.DataFrame( {"a": [3, 2, 1, 5, 2, 8, 1, 4, 10], "b": [7, 8, 9, 5, 7, 8, 4, 2, 5]}, index=[0, 1, 2, 3, 4, 5, 6, 7, 9], ) ddf6 = dd.from_pandas(pdf6, 4) pdf7 = pd.DataFrame( {"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": [5, 6, 7, 8, 1, 2, 3, 4]}, index=list("aaabcdeh"), ) pdf8 = pd.DataFrame( {"a": [5, 6, 7, 8, 4, 3, 2, 1], "b": [2, 4, 5, 3, 4, 2, 1, 0]}, index=list("abcdefgh"), ) ddf7 = dd.from_pandas(pdf7, 3) ddf8 = dd.from_pandas(pdf8, 4) pdf9 = pd.DataFrame( { "a": [1, 2, 3, 4, 5, 6, 7, 8], "b": [5, 6, 7, 8, 1, 2, 3, 4], "c": [5, 6, 7, 8, 1, 2, 3, 4], }, index=list("aaabcdeh"), ) pdf10 = pd.DataFrame( { "b": [5, 6, 7, 8, 4, 3, 2, 1], "c": [2, 4, 5, 3, 4, 2, 1, 0], "d": [2, 4, 5, 3, 4, 2, 1, 0], }, index=list("abcdefgh"), ) ddf9 = dd.from_pandas(pdf9, 3) ddf10 = dd.from_pandas(pdf10, 4) # Arithmetics with different index cases = [ (ddf5, ddf6, pdf5, pdf6), (ddf5.repartition([0, 9]), ddf6, pdf5, pdf6), (ddf5.repartition([0, 5, 9]), ddf6.repartition([0, 7, 9]), pdf5, pdf6), (ddf7, ddf8, pdf7, pdf8), (ddf7.repartition(["a", "c", "h"]), ddf8.repartition(["a", "h"]), pdf7, pdf8), ( ddf7.repartition(["a", "b", "e", "h"]), ddf8.repartition(["a", "e", "h"]), pdf7, pdf8, ), (ddf9, ddf10, pdf9, pdf10), (ddf9.repartition(["a", "c", "h"]), ddf10.repartition(["a", "h"]), pdf9, pdf10), # dask + pandas (ddf5, pdf6, pdf5, pdf6), (ddf7, pdf8, pdf7, pdf8), (ddf9, pdf10, pdf9, pdf10), ] for (l, r, el, er) in cases: check_series_arithmetics(l.a, r.b, el.a, er.b, allow_comparison_ops=False) check_frame_arithmetics(l, r, el, er, allow_comparison_ops=False)
def test_reductions(split_every): dsk = { ("x", 0): pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [True, True, False]}, index=[0, 1, 3] ), ("x", 1): pd.DataFrame( {"a": [4, 5, 6], "b": [3, 2, 1], "c": [False, False, False]}, index=[5, 6, 8], ), ("x", 2): pd.DataFrame( { "a": [13094304034, 3489385935, 100006774], "b": [0, 0, 0], "c": [True, True, True], }, index=[9, 9, 9], ), } meta = make_meta({"a": "i8", "b": "i8", "c": "bool"}, index=pd.Index([], "i8")) ddf1 = dd.DataFrame(dsk, "x", meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3) nands1 = dd.from_pandas(nans1, 2) nans2 = pd.Series([1] + [np.nan] * 8) nands2 = dd.from_pandas(nans2, 2) nans3 = pd.Series([np.nan] * 9) nands3 = dd.from_pandas(nans3, 2) bools = pd.Series([True, False, True, False, True], dtype=bool) boolds = dd.from_pandas(bools, 2) for dds, pds in [ (ddf1.a, pdf1.a), (ddf1.b, pdf1.b), (ddf1.c, pdf1.c), (ddf1["a"], pdf1["a"]), (ddf1["b"], pdf1["b"]), (nands1, nans1), (nands2, nans2), (nands3, nans3), (boolds, bools), ]: assert isinstance(dds, dd.Series) assert isinstance(pds, pd.Series) assert_eq(dds.sum(split_every=split_every), pds.sum()) assert_eq(dds.prod(split_every=split_every), pds.prod()) assert_eq(dds.min(split_every=split_every), pds.min()) assert_eq(dds.max(split_every=split_every), pds.max()) assert_eq(dds.count(split_every=split_every), pds.count()) if scipy: # pandas uses unbiased skew, need to correct for that n = pds.shape[0] bias_factor = (n * (n - 1)) ** 0.5 / (n - 2) assert_eq(dds.skew(), pds.skew() / bias_factor) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.std(split_every=split_every), pds.std()) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.var(split_every=split_every), pds.var()) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.sem(split_every=split_every), pds.sem()) with warnings.catch_warnings(): # dask.dataframe should probably filter this, to match pandas, but # it seems quite difficult. 
warnings.simplefilter("ignore", RuntimeWarning) assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0)) assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0)) assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0)) assert_eq(dds.mean(split_every=split_every), pds.mean()) assert_eq(dds.nunique(split_every=split_every), pds.nunique()) assert_eq(dds.sum(skipna=False, split_every=split_every), pds.sum(skipna=False)) assert_eq( dds.prod(skipna=False, split_every=split_every), pds.prod(skipna=False) ) assert_eq(dds.min(skipna=False, split_every=split_every), pds.min(skipna=False)) assert_eq(dds.max(skipna=False, split_every=split_every), pds.max(skipna=False)) assert_eq(dds.std(skipna=False, split_every=split_every), pds.std(skipna=False)) assert_eq(dds.var(skipna=False, split_every=split_every), pds.var(skipna=False)) assert_eq(dds.sem(skipna=False, split_every=split_every), pds.sem(skipna=False)) assert_eq( dds.std(skipna=False, ddof=0, split_every=split_every), pds.std(skipna=False, ddof=0), ) assert_eq( dds.var(skipna=False, ddof=0, split_every=split_every), pds.var(skipna=False, ddof=0), ) assert_eq( dds.sem(skipna=False, ddof=0, split_every=split_every), pds.sem(skipna=False, ddof=0), ) assert_eq( dds.mean(skipna=False, split_every=split_every), pds.mean(skipna=False) ) assert_dask_graph(ddf1.b.sum(split_every=split_every), "series-sum") assert_dask_graph(ddf1.b.prod(split_every=split_every), "series-prod") assert_dask_graph(ddf1.b.min(split_every=split_every), "series-min") assert_dask_graph(ddf1.b.max(split_every=split_every), "series-max") assert_dask_graph(ddf1.b.count(split_every=split_every), "series-count") assert_dask_graph(ddf1.b.std(split_every=split_every), "series-std") assert_dask_graph(ddf1.b.var(split_every=split_every), "series-var") assert_dask_graph(ddf1.b.sem(split_every=split_every), "series-sem") assert_dask_graph(ddf1.b.std(ddof=0, split_every=split_every), "series-std") assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), "series-var") assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), "series-sem") assert_dask_graph(ddf1.b.mean(split_every=split_every), "series-mean") # nunique is performed using drop-duplicates assert_dask_graph(ddf1.b.nunique(split_every=split_every), "drop-duplicates") # testing index assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min()) assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max()) assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
def read_json( url_path, orient="records", lines=None, storage_options=None, blocksize=None, sample=2**20, encoding="utf-8", errors="strict", compression="infer", meta=None, engine=pd.read_json, include_path_column=False, path_converter=None, **kwargs, ): """Create a dataframe from a set of JSON files This utilises ``pandas.read_json()``, and most parameters are passed through - see its docstring. Differences: orient is 'records' by default, with lines=True; this is appropriate for line-delimited "JSON-lines" data, the kind of JSON output that is most common in big-data scenarios, and which can be chunked when reading (see ``read_json()``). All other options require blocksize=None, i.e., one partition per input file. Parameters ---------- url_path: str, list of str Location to read from. If a string, can include a glob character to find a set of file names. Supports protocol specifications such as ``"s3://"``. encoding, errors: The text encoding to implement, e.g., "utf-8" and how to respond to errors in the conversion (see ``str.encode()``). orient, lines, kwargs passed to pandas; if not specified, lines=True when orient='records', False otherwise. storage_options: dict Passed to backend file-system implementation blocksize: None or int If None, files are not blocked, and you get one partition per input file. If int, which can only be used for line-delimited JSON files, each partition will be approximately this size in bytes, to the nearest newline character. sample: int Number of bytes to pre-load, to provide an empty dataframe structure to any blocks without data. Only relevant when using blocksize. encoding, errors: Text conversion, ``see bytes.decode()`` compression : string or None String like 'gzip' or 'xz'. engine : function object, default ``pd.read_json`` The underlying function that dask will use to read JSON files. By default, this will be the pandas JSON reader (``pd.read_json``). include_path_column : bool or str, optional Include a column with the file path where each row in the dataframe originated. If ``True``, a new column is added to the dataframe called ``path``. If ``str``, sets new column name. Default is ``False``. path_converter : function or None, optional A function that takes one argument and returns a string. Used to convert paths in the ``path`` column, for instance, to strip a common prefix from all the paths. $META Returns ------- dask.DataFrame Examples -------- Load single file >>> dd.read_json('myfile.1.json') # doctest: +SKIP Load multiple files >>> dd.read_json('myfile.*.json') # doctest: +SKIP >>> dd.read_json(['myfile.1.json', 'myfile.2.json']) # doctest: +SKIP Load large line-delimited JSON files using partitions of approx 256MB size >> dd.read_json('data/file*.csv', blocksize=2**28) """ if lines is None: lines = orient == "records" if orient != "records" and lines: raise ValueError( "Line-delimited JSON is only available with" 'orient="records".' ) if blocksize and (orient != "records" or not lines): raise ValueError( "JSON file chunking only allowed for JSON-lines" "input (orient='records', lines=True)." 
) storage_options = storage_options or {} if include_path_column is True: include_path_column = "path" if path_converter is None: path_converter = lambda x: x if blocksize: b_out = read_bytes( url_path, b"\n", blocksize=blocksize, sample=sample, compression=compression, include_path=include_path_column, **storage_options, ) if include_path_column: first, chunks, paths = b_out first_path = path_converter(paths[0]) path_dtype = pd.CategoricalDtype(path_converter(p) for p in paths) flat_paths = flatten( [path_converter(p)] * len(chunk) for p, chunk in zip(paths, chunks) ) else: first, chunks = b_out first_path = None flat_paths = (None,) path_dtype = None flat_chunks = flatten(chunks) if meta is None: meta = read_json_chunk( first, encoding, errors, engine, include_path_column, first_path, path_dtype, kwargs, ) meta = make_meta(meta) parts = [ delayed(read_json_chunk)( chunk, encoding, errors, engine, include_path_column, path, path_dtype, kwargs, meta=meta, ) for chunk, path in zip_longest(flat_chunks, flat_paths) ] else: files = open_files( url_path, "rt", encoding=encoding, errors=errors, compression=compression, **storage_options, ) path_dtype = pd.CategoricalDtype(path_converter(f.path) for f in files) parts = [ delayed(read_json_file)( f, orient, lines, engine, include_path_column, path_converter(f.path), path_dtype, kwargs, ) for f in files ] return from_delayed(parts, meta=meta)
def test_arithmetics(): dsk = { ('x', 0): pd.DataFrame({ 'a': [1, 2, 3], 'b': [4, 5, 6] }, index=[0, 1, 3]), ('x', 1): pd.DataFrame({ 'a': [4, 5, 6], 'b': [3, 2, 1] }, index=[5, 6, 8]), ('x', 2): pd.DataFrame({ 'a': [7, 8, 9], 'b': [0, 0, 0] }, index=[9, 9, 9]) } meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() pdf2 = pd.DataFrame({ 'a': [1, 2, 3, 4, 5, 6, 7, 8], 'b': [5, 6, 7, 8, 1, 2, 3, 4] }) pdf3 = pd.DataFrame({ 'a': [5, 6, 7, 8, 4, 3, 2, 1], 'b': [2, 4, 5, 3, 4, 2, 1, 0] }) ddf2 = dd.from_pandas(pdf2, 3) ddf3 = dd.from_pandas(pdf3, 2) dsk4 = { ('y', 0): pd.DataFrame({ 'a': [3, 2, 1], 'b': [7, 8, 9] }, index=[0, 1, 3]), ('y', 1): pd.DataFrame({ 'a': [5, 2, 8], 'b': [4, 2, 3] }, index=[5, 6, 8]), ('y', 2): pd.DataFrame({ 'a': [1, 4, 10], 'b': [1, 0, 5] }, index=[9, 9, 9]) } ddf4 = dd.DataFrame(dsk4, 'y', meta, [0, 4, 9, 9]) pdf4 = ddf4.compute() # Arithmetics cases = [ (ddf1, ddf1, pdf1, pdf1), (ddf1, ddf1.repartition([0, 1, 3, 6, 9]), pdf1, pdf1), (ddf2, ddf3, pdf2, pdf3), (ddf2.repartition([0, 3, 6, 7]), ddf3.repartition([0, 7]), pdf2, pdf3), (ddf2.repartition([0, 7]), ddf3.repartition([0, 2, 4, 5, 7]), pdf2, pdf3), (ddf1, ddf4, pdf1, pdf4), (ddf1, ddf4.repartition([0, 9]), pdf1, pdf4), (ddf1.repartition([0, 3, 9]), ddf4.repartition([0, 5, 9]), pdf1, pdf4), # dask + pandas (ddf1, pdf4, pdf1, pdf4), (ddf2, pdf3, pdf2, pdf3) ] for (l, r, el, er) in cases: check_series_arithmetics(l.a, r.b, el.a, er.b) check_frame_arithmetics(l, r, el, er) # different index, pandas raises ValueError in comparison ops pdf5 = pd.DataFrame( { 'a': [3, 2, 1, 5, 2, 8, 1, 4, 10], 'b': [7, 8, 9, 4, 2, 3, 1, 0, 5] }, index=[0, 1, 3, 5, 6, 8, 9, 9, 9]) ddf5 = dd.from_pandas(pdf5, 2) pdf6 = pd.DataFrame( { 'a': [3, 2, 1, 5, 2, 8, 1, 4, 10], 'b': [7, 8, 9, 5, 7, 8, 4, 2, 5] }, index=[0, 1, 2, 3, 4, 5, 6, 7, 9]) ddf6 = dd.from_pandas(pdf6, 4) pdf7 = pd.DataFrame( { 'a': [1, 2, 3, 4, 5, 6, 7, 8], 'b': [5, 6, 7, 8, 1, 2, 3, 4] }, index=list('aaabcdeh')) pdf8 = pd.DataFrame( { 'a': [5, 6, 7, 8, 4, 3, 2, 1], 'b': [2, 4, 5, 3, 4, 2, 1, 0] }, index=list('abcdefgh')) ddf7 = dd.from_pandas(pdf7, 3) ddf8 = dd.from_pandas(pdf8, 4) pdf9 = pd.DataFrame( { 'a': [1, 2, 3, 4, 5, 6, 7, 8], 'b': [5, 6, 7, 8, 1, 2, 3, 4], 'c': [5, 6, 7, 8, 1, 2, 3, 4] }, index=list('aaabcdeh')) pdf10 = pd.DataFrame( { 'b': [5, 6, 7, 8, 4, 3, 2, 1], 'c': [2, 4, 5, 3, 4, 2, 1, 0], 'd': [2, 4, 5, 3, 4, 2, 1, 0] }, index=list('abcdefgh')) ddf9 = dd.from_pandas(pdf9, 3) ddf10 = dd.from_pandas(pdf10, 4) # Arithmetics with different index cases = [ (ddf5, ddf6, pdf5, pdf6), (ddf5.repartition([0, 9]), ddf6, pdf5, pdf6), (ddf5.repartition([0, 5, 9]), ddf6.repartition([0, 7, 9]), pdf5, pdf6), (ddf7, ddf8, pdf7, pdf8), (ddf7.repartition(['a', 'c', 'h']), ddf8.repartition(['a', 'h']), pdf7, pdf8), (ddf7.repartition(['a', 'b', 'e', 'h']), ddf8.repartition(['a', 'e', 'h']), pdf7, pdf8), (ddf9, ddf10, pdf9, pdf10), (ddf9.repartition(['a', 'c', 'h']), ddf10.repartition(['a', 'h']), pdf9, pdf10), # dask + pandas (ddf5, pdf6, pdf5, pdf6), (ddf7, pdf8, pdf7, pdf8), (ddf9, pdf10, pdf9, pdf10) ] for (l, r, el, er) in cases: check_series_arithmetics(l.a, r.b, el.a, er.b, allow_comparison_ops=False) check_frame_arithmetics(l, r, el, er, allow_comparison_ops=False)
def test_make_meta(): df = pd.DataFrame({ "a": [1, 2, 3], "b": list("abc"), "c": [1.0, 2.0, 3.0] }, index=[10, 20, 30]) # Pandas dataframe meta = make_meta(df) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, type(df.index)) # Pandas series meta = make_meta(df.a) assert len(meta) == 0 assert meta.dtype == df.a.dtype assert isinstance(meta.index, type(df.index)) # Pandas index meta = make_meta(df.index) assert isinstance(meta, type(df.index)) assert len(meta) == 0 # Dask object ddf = dd.from_pandas(df, npartitions=2) assert make_meta(ddf) is ddf._meta # Dict meta = make_meta({"a": "i8", "b": "O", "c": "f8"}) assert isinstance(meta, pd.DataFrame) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, pd.RangeIndex) # List meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")]) assert (meta.columns == ["a", "c", "b"]).all() assert len(meta) == 0 assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all() assert isinstance(meta.index, pd.RangeIndex) # Tuple meta = make_meta(("a", "i8")) assert isinstance(meta, pd.Series) assert len(meta) == 0 assert meta.dtype == "i8" assert meta.name == "a" # Iterable class CustomMetadata(Iterable): """Custom class iterator returning pandas types.""" def __init__(self, max=0): self.types = [("a", "i8"), ("c", "f8"), ("b", "O")] def __iter__(self): self.n = 0 return self def __next__(self): if self.n < len(self.types): ret = self.types[self.n] self.n += 1 return ret else: raise StopIteration meta = make_meta(CustomMetadata()) assert (meta.columns == ["a", "c", "b"]).all() assert len(meta) == 0 assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all() assert isinstance(meta.index, pd.RangeIndex) # With index idx = pd.Index([1, 2], name="foo") meta = make_meta( { "a": "i8", "b": "i4" }, index=idx, ) assert type(meta.index) is type(idx) assert meta.index.dtype == "int64" assert len(meta.index) == 0 meta = make_meta(("a", "i8"), index=idx) assert type(meta.index) is type(idx) assert meta.index.dtype == "int64" assert len(meta.index) == 0 # Categoricals meta = make_meta({"a": "category"}, parent_meta=df) assert len(meta.a.cat.categories) == 1 assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES meta = make_meta(("a", "category"), parent_meta=df) assert len(meta.cat.categories) == 1 assert meta.cat.categories[0] == UNKNOWN_CATEGORIES # Numpy scalar meta = make_meta(np.float64(1.0), parent_meta=df) assert isinstance(meta, np.float64) # Python scalar meta = make_meta(1.0, parent_meta=df) assert isinstance(meta, np.float64) # Timestamp x = pd.Timestamp(2000, 1, 1) meta = make_meta(x, parent_meta=df) assert meta is x # DatetimeTZDtype x = pd.DatetimeTZDtype(tz="UTC") meta = make_meta(x) assert meta == pd.Timestamp(1, tz=x.tz, unit=x.unit) # Dtype expressions meta = make_meta("i8", parent_meta=df) assert isinstance(meta, np.int64) meta = make_meta(float, parent_meta=df) assert isinstance(meta, np.dtype(float).type) meta = make_meta(np.dtype("bool"), parent_meta=df) assert isinstance(meta, np.bool_) assert pytest.raises(TypeError, lambda: make_meta(None))
dsk = { ("x", 0): pd.DataFrame({ "a": [1, 2, 3], "b": [4, 5, 6] }, index=[0, 1, 3]), ("x", 1): pd.DataFrame({ "a": [4, 5, 6], "b": [3, 2, 1] }, index=[5, 6, 8]), ("x", 2): pd.DataFrame({ "a": [7, 8, 9], "b": [0, 0, 0] }, index=[9, 9, 9]), } meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8")) d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9]) full = d.compute() CHECK_FREQ = {} if dd._compat.PANDAS_GT_110: CHECK_FREQ["check_freq"] = False def test_loc(): assert d.loc[3:8].divisions[0] == 3 assert d.loc[3:8].divisions[-1] == 8 assert d.loc[5].divisions == (5, 5) assert_eq(d.loc[5], full.loc[5:5]) assert_eq(d.loc[3:8], full.loc[3:8])
def map_overlap(func, df, before, after, *args, **kwargs): """Apply a function to each partition, sharing rows with adjacent partitions. Parameters ---------- func : function Function applied to each partition. df : dd.DataFrame, dd.Series before : int or timedelta The rows to prepend to partition ``i`` from the end of partition ``i - 1``. after : int or timedelta The rows to append to partition ``i`` from the beginning of partition ``i + 1``. args, kwargs : Arguments and keywords to pass to the function. The partition will be the first argument, and these will be passed *after*. See Also -------- dd.DataFrame.map_overlap """ if isinstance(before, datetime.timedelta) or isinstance(after, datetime.timedelta): if not is_datetime64_any_dtype(df.index._meta_nonempty.inferred_type): raise TypeError( "Must have a `DatetimeIndex` when using string offset " "for `before` and `after`" ) else: if not ( isinstance(before, Integral) and before >= 0 and isinstance(after, Integral) and after >= 0 ): raise ValueError("before and after must be positive integers") if "token" in kwargs: func_name = kwargs.pop("token") token = tokenize(df, before, after, *args, **kwargs) else: func_name = "overlap-" + funcname(func) token = tokenize(func, df, before, after, *args, **kwargs) if "meta" in kwargs: meta = kwargs.pop("meta") else: meta = _emulate(func, df, *args, **kwargs) meta = make_meta(meta, index=df._meta.index, parent_meta=df._meta) name = f"{func_name}-{token}" name_a = "overlap-prepend-" + tokenize(df, before) name_b = "overlap-append-" + tokenize(df, after) df_name = df._name dsk = {} timedelta_partition_message = ( "Partition size is less than specified window. " "Try using ``df.repartition`` to increase the partition size" ) if before and isinstance(before, Integral): prevs = [None] for i in range(df.npartitions - 1): key = (name_a, i) dsk[key] = (M.tail, (df_name, i), before) prevs.append(key) elif isinstance(before, datetime.timedelta): # Assumes monotonic (increasing?) index divs = pd.Series(df.divisions) deltas = divs.diff().iloc[1:-1] # In the first case window-size is larger than at least one partition, thus it is # necessary to calculate how many partitions must be used for each rolling task. # Otherwise, these calculations can be skipped (faster) if (before > deltas).any(): pt_z = divs[0] prevs = [None] for i in range(df.npartitions - 1): # Select all indexes of relevant partitions between the current partition and # the partition with the highest division outside the rolling window (before) pt_i = divs[i + 1] # lower-bound the search to the first division lb = max(pt_i - before, pt_z) first, j = divs[i], i while first > lb and j > 0: first = first - deltas[j] j = j - 1 key = (name_a, i) dsk[key] = ( _tail_timedelta, [(df_name, k) for k in range(j, i + 1)], (df_name, i + 1), before, ) prevs.append(key) else: prevs = [None] for i in range(df.npartitions - 1): key = (name_a, i) dsk[key] = ( _tail_timedelta, [(df_name, i)], (df_name, i + 1), before, ) prevs.append(key) else: prevs = [None] * df.npartitions if after and isinstance(after, Integral): nexts = [] for i in range(1, df.npartitions): key = (name_b, i) dsk[key] = (M.head, (df_name, i), after) nexts.append(key) nexts.append(None) elif isinstance(after, datetime.timedelta): # TODO: Do we have a use-case for this? 
# Pandas doesn't allow negative rolling windows deltas = pd.Series(df.divisions).diff().iloc[1:-1] if (after > deltas).any(): raise ValueError(timedelta_partition_message) nexts = [] for i in range(1, df.npartitions): key = (name_b, i) dsk[key] = (_head_timedelta, (df_name, i - 0), (df_name, i), after) nexts.append(key) nexts.append(None) else: nexts = [None] * df.npartitions for i, (prev, current, next) in enumerate(zip(prevs, df.__dask_keys__(), nexts)): dsk[(name, i)] = ( overlap_chunk, func, prev, current, next, before, after, args, kwargs, ) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df]) return df._constructor(graph, name, meta, df.divisions)
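# Usage sketch for map_overlap via the DataFrame method it backs
# (dd.DataFrame.map_overlap). The frame, function and partition count below
# are illustrative; before=1/after=1 share one row with each neighbouring
# partition so the shifts at partition edges see real data instead of NaN.
import pandas as pd
import dask.dataframe as dd

def centered_diff(part):
    # shift(1) needs the last row of the previous partition,
    # shift(-1) the first row of the next one
    return part.x.shift(-1) - part.x.shift(1)

pdf = pd.DataFrame({"x": range(10)})
ddf = dd.from_pandas(pdf, npartitions=3)
result = ddf.map_overlap(centered_diff, before=1, after=1)
print(result.compute())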
from dask.dataframe.shuffle import (shuffle, partitioning_index, rearrange_by_column, rearrange_by_divisions, maybe_buffered_partd, remove_nans) from dask.dataframe.utils import assert_eq, make_meta dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [1, 4, 7]}, index=[0, 1, 3]), ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [2, 5, 8]}, index=[5, 6, 8]), ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [3, 6, 9]}, index=[9, 9, 9])} meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) d = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) full = d.compute() shuffle_func = shuffle # conflicts with keyword argument @pytest.mark.parametrize('shuffle', ['disk', 'tasks']) def test_shuffle(shuffle): s = shuffle_func(d, d.b, shuffle=shuffle) assert isinstance(s, dd.DataFrame) assert s.npartitions == d.npartitions x = dask.get(s.dask, (s._name, 0)) y = dask.get(s.dask, (s._name, 1))
def test_make_meta(): df = pd.DataFrame({'a': [1, 2, 3], 'b': list('abc'), 'c': [1., 2., 3.]}, index=[10, 20, 30]) # Pandas dataframe meta = make_meta(df) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, type(df.index)) # Pandas series meta = make_meta(df.a) assert len(meta) == 0 assert meta.dtype == df.a.dtype assert isinstance(meta.index, type(df.index)) # Pandas index meta = make_meta(df.index) assert isinstance(meta, type(df.index)) assert len(meta) == 0 # Dask object ddf = dd.from_pandas(df, npartitions=2) assert make_meta(ddf) is ddf._meta # Dict meta = make_meta({'a': 'i8', 'b': 'O', 'c': 'f8'}) assert isinstance(meta, pd.DataFrame) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, pd.RangeIndex) # Iterable meta = make_meta([('a', 'i8'), ('c', 'f8'), ('b', 'O')]) assert (meta.columns == ['a', 'c', 'b']).all() assert len(meta) == 0 assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all() assert isinstance(meta.index, pd.RangeIndex) # Tuple meta = make_meta(('a', 'i8')) assert isinstance(meta, pd.Series) assert len(meta) == 0 assert meta.dtype == 'i8' assert meta.name == 'a' # With index meta = make_meta({'a': 'i8', 'b': 'i4'}, pd.Int64Index([1, 2], name='foo')) assert isinstance(meta.index, pd.Int64Index) assert len(meta.index) == 0 meta = make_meta(('a', 'i8'), pd.Int64Index([1, 2], name='foo')) assert isinstance(meta.index, pd.Int64Index) assert len(meta.index) == 0 # Numpy scalar meta = make_meta(np.float64(1.0)) assert isinstance(meta, np.ndarray) assert meta.shape == (0,) assert meta.dtype == 'f8' # Python scalar meta = make_meta(1.0) assert isinstance(meta, np.ndarray) assert meta.shape == (0,) assert meta.dtype == 'f8' # datetime meta = make_meta(pd.NaT) assert isinstance(meta, np.ndarray) assert meta.shape == (0,) assert meta.dtype == pd.Series(pd.NaT).dtype
def test_reductions_frame(split_every): dsk = { ('x', 0): pd.DataFrame({ 'a': [1, 2, 3], 'b': [4, 5, 6] }, index=[0, 1, 3]), ('x', 1): pd.DataFrame({ 'a': [4, 5, 6], 'b': [3, 2, 1] }, index=[5, 6, 8]), ('x', 2): pd.DataFrame({ 'a': [7, 8, 9], 'b': [0, 0, 0] }, index=[9, 9, 9]) } meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() assert_eq(ddf1.sum(split_every=split_every), pdf1.sum()) assert_eq(ddf1.prod(split_every=split_every), pdf1.prod()) assert_eq(ddf1.min(split_every=split_every), pdf1.min()) assert_eq(ddf1.max(split_every=split_every), pdf1.max()) assert_eq(ddf1.count(split_every=split_every), pdf1.count()) assert_eq(ddf1.std(split_every=split_every), pdf1.std()) assert_eq(ddf1.var(split_every=split_every), pdf1.var()) assert_eq(ddf1.sem(split_every=split_every), pdf1.sem()) assert_eq(ddf1.std(ddof=0, split_every=split_every), pdf1.std(ddof=0)) assert_eq(ddf1.var(ddof=0, split_every=split_every), pdf1.var(ddof=0)) assert_eq(ddf1.sem(ddof=0, split_every=split_every), pdf1.sem(ddof=0)) assert_eq(ddf1.mean(split_every=split_every), pdf1.mean()) for axis in [0, 1, 'index', 'columns']: assert_eq(ddf1.sum(axis=axis, split_every=split_every), pdf1.sum(axis=axis)) assert_eq(ddf1.prod(axis=axis, split_every=split_every), pdf1.prod(axis=axis)) assert_eq(ddf1.min(axis=axis, split_every=split_every), pdf1.min(axis=axis)) assert_eq(ddf1.max(axis=axis, split_every=split_every), pdf1.max(axis=axis)) assert_eq(ddf1.count(axis=axis, split_every=split_every), pdf1.count(axis=axis)) assert_eq(ddf1.std(axis=axis, split_every=split_every), pdf1.std(axis=axis)) assert_eq(ddf1.var(axis=axis, split_every=split_every), pdf1.var(axis=axis)) assert_eq(ddf1.sem(axis=axis, split_every=split_every), pdf1.sem(axis=axis)) assert_eq(ddf1.std(axis=axis, ddof=0, split_every=split_every), pdf1.std(axis=axis, ddof=0)) assert_eq(ddf1.var(axis=axis, ddof=0, split_every=split_every), pdf1.var(axis=axis, ddof=0)) assert_eq(ddf1.sem(axis=axis, ddof=0, split_every=split_every), pdf1.sem(axis=axis, ddof=0)) assert_eq(ddf1.mean(axis=axis, split_every=split_every), pdf1.mean(axis=axis)) pytest.raises(ValueError, lambda: ddf1.sum(axis='incorrect').compute()) # axis=0 assert_dask_graph(ddf1.sum(split_every=split_every), 'dataframe-sum') assert_dask_graph(ddf1.prod(split_every=split_every), 'dataframe-prod') assert_dask_graph(ddf1.min(split_every=split_every), 'dataframe-min') assert_dask_graph(ddf1.max(split_every=split_every), 'dataframe-max') assert_dask_graph(ddf1.count(split_every=split_every), 'dataframe-count') # std, var, sem, and mean consist of sum and count operations assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-sum') assert_dask_graph(ddf1.std(split_every=split_every), 'dataframe-count') assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-sum') assert_dask_graph(ddf1.var(split_every=split_every), 'dataframe-count') assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-sum') assert_dask_graph(ddf1.sem(split_every=split_every), 'dataframe-count') assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-sum') assert_dask_graph(ddf1.mean(split_every=split_every), 'dataframe-count') # axis=1 assert_dask_graph(ddf1.sum(axis=1, split_every=split_every), 'dataframe-sum') assert_dask_graph(ddf1.prod(axis=1, split_every=split_every), 'dataframe-prod') assert_dask_graph(ddf1.min(axis=1, split_every=split_every), 'dataframe-min') assert_dask_graph(ddf1.max(axis=1, split_every=split_every), 
'dataframe-max') assert_dask_graph(ddf1.count(axis=1, split_every=split_every), 'dataframe-count') assert_dask_graph(ddf1.std(axis=1, split_every=split_every), 'dataframe-std') assert_dask_graph(ddf1.var(axis=1, split_every=split_every), 'dataframe-var') assert_dask_graph(ddf1.sem(axis=1, split_every=split_every), 'dataframe-sem') assert_dask_graph(ddf1.mean(axis=1, split_every=split_every), 'dataframe-mean')
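# Small illustration (not from the test suite) of why the graph checks above
# look for 'dataframe-sum' and 'dataframe-count' inside std/var/sem/mean:
# a dask mean is assembled from a per-column sum and count.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1]}), npartitions=2
)
assert (ddf.mean().compute() == (ddf.sum() / ddf.count()).compute()).all()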
p = l type = p.find_previous_sibling('p', text=re_tags) if type: re=re_tags.search(type.text) # print(row['title'], url, re) types[re[1]].append(text) else: # type = p.find_previous_sibling('p', text=re_download) # if type: types['Zip'].append(text) for k, v in types.items(): row[k] = ' '.join(v) # print('end='+row['title']) except Exception as e: # print(e) row['error'] = "True" # pass pbar.update(1) return row hulk = pd.read_csv('hulkpop.csv', encoding='UTF-8', sep='\n', header=None, names=['url'] ) for t in ['title','author']+tags+['error']: hulk[t]='' pbar = tqdm(total=len(hulk), ncols=80) ddata = dd.from_pandas(hulk, npartitions=WORKERS) h3 = ddata.apply(getlinks, axis=1, meta=make_meta(hulk)) files=h3.to_csv('out/hulkpop-links-*.csv', encoding='UTF-8', index=False, line_terminator='\n')
def test_reductions(split_every): dsk = { ('x', 0): pd.DataFrame({ 'a': [1, 2, 3], 'b': [4, 5, 6] }, index=[0, 1, 3]), ('x', 1): pd.DataFrame({ 'a': [4, 5, 6], 'b': [3, 2, 1] }, index=[5, 6, 8]), ('x', 2): pd.DataFrame({ 'a': [7, 8, 9], 'b': [0, 0, 0] }, index=[9, 9, 9]) } meta = make_meta({'a': 'i8', 'b': 'i8'}, index=pd.Index([], 'i8')) ddf1 = dd.DataFrame(dsk, 'x', meta, [0, 4, 9, 9]) pdf1 = ddf1.compute() nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3) nands1 = dd.from_pandas(nans1, 2) nans2 = pd.Series([1] + [np.nan] * 8) nands2 = dd.from_pandas(nans2, 2) nans3 = pd.Series([np.nan] * 9) nands3 = dd.from_pandas(nans3, 2) bools = pd.Series([True, False, True, False, True], dtype=bool) boolds = dd.from_pandas(bools, 2) for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a), (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']), (nands1, nans1), (nands2, nans2), (nands3, nans3), (boolds, bools)]: assert isinstance(dds, dd.Series) assert isinstance(pds, pd.Series) assert_eq(dds.sum(split_every=split_every), pds.sum()) assert_eq(dds.prod(split_every=split_every), pds.prod()) assert_eq(dds.min(split_every=split_every), pds.min()) assert_eq(dds.max(split_every=split_every), pds.max()) assert_eq(dds.count(split_every=split_every), pds.count()) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.std(split_every=split_every), pds.std()) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.var(split_every=split_every), pds.var()) with pytest.warns(None): # runtime warnings; https://github.com/dask/dask/issues/2381 assert_eq(dds.sem(split_every=split_every), pds.sem()) assert_eq(dds.std(ddof=0, split_every=split_every), pds.std(ddof=0)) assert_eq(dds.var(ddof=0, split_every=split_every), pds.var(ddof=0)) assert_eq(dds.sem(ddof=0, split_every=split_every), pds.sem(ddof=0)) assert_eq(dds.mean(split_every=split_every), pds.mean()) assert_eq(dds.nunique(split_every=split_every), pds.nunique()) assert_eq(dds.sum(skipna=False, split_every=split_every), pds.sum(skipna=False)) assert_eq(dds.prod(skipna=False, split_every=split_every), pds.prod(skipna=False)) assert_eq(dds.min(skipna=False, split_every=split_every), pds.min(skipna=False)) assert_eq(dds.max(skipna=False, split_every=split_every), pds.max(skipna=False)) assert_eq(dds.std(skipna=False, split_every=split_every), pds.std(skipna=False)) assert_eq(dds.var(skipna=False, split_every=split_every), pds.var(skipna=False)) assert_eq(dds.sem(skipna=False, split_every=split_every), pds.sem(skipna=False)) assert_eq(dds.std(skipna=False, ddof=0, split_every=split_every), pds.std(skipna=False, ddof=0)) assert_eq(dds.var(skipna=False, ddof=0, split_every=split_every), pds.var(skipna=False, ddof=0)) assert_eq(dds.sem(skipna=False, ddof=0, split_every=split_every), pds.sem(skipna=False, ddof=0)) assert_eq(dds.mean(skipna=False, split_every=split_every), pds.mean(skipna=False)) assert_dask_graph(ddf1.b.sum(split_every=split_every), 'series-sum') assert_dask_graph(ddf1.b.prod(split_every=split_every), 'series-prod') assert_dask_graph(ddf1.b.min(split_every=split_every), 'series-min') assert_dask_graph(ddf1.b.max(split_every=split_every), 'series-max') assert_dask_graph(ddf1.b.count(split_every=split_every), 'series-count') assert_dask_graph(ddf1.b.std(split_every=split_every), 'series-std') assert_dask_graph(ddf1.b.var(split_every=split_every), 'series-var') assert_dask_graph(ddf1.b.sem(split_every=split_every), 'series-sem') assert_dask_graph(ddf1.b.std(ddof=0, 
split_every=split_every), 'series-std') assert_dask_graph(ddf1.b.var(ddof=0, split_every=split_every), 'series-var') assert_dask_graph(ddf1.b.sem(ddof=0, split_every=split_every), 'series-sem') assert_dask_graph(ddf1.b.mean(split_every=split_every), 'series-mean') # nunique is performed using drop-duplicates assert_dask_graph(ddf1.b.nunique(split_every=split_every), 'drop-duplicates') # testing index assert_eq(ddf1.index.min(split_every=split_every), pdf1.index.min()) assert_eq(ddf1.index.max(split_every=split_every), pdf1.index.max()) assert_eq(ddf1.index.count(split_every=split_every), pd.notnull(pdf1.index).sum())
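# Sketch of the relationship behind the 'drop-duplicates' graph check above:
# dask builds Series.nunique() from drop_duplicates followed by a count, so
# the two spellings agree here (small illustrative series, not from the test).
import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series([4, 5, 4, 6, 0, 0, 0]), npartitions=2)
assert s.nunique().compute() == s.drop_duplicates().count().compute() == 4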
COLUMN_NAME_NES = "NES" COLUMN_NAME_AUC = "AUC" COLUMN_NAME_CONTEXT = "Context" COLUMN_NAME_TARGET_GENES = "TargetGenes" COLUMN_NAME_RANK_AT_MAX = "RankAtMax" COLUMN_NAME_TYPE = "Type" # TODO: Should actually be a function depending on return_recovery_curves and rank_threshold DF_META_DATA = make_meta( { ('Enrichment', COLUMN_NAME_AUC): np.float64, ('Enrichment', COLUMN_NAME_NES): np.float64, ('Enrichment', COLUMN_NAME_MOTIF_SIMILARITY_QVALUE): np.float64, ('Enrichment', COLUMN_NAME_ORTHOLOGOUS_IDENTITY): np.float64, ('Enrichment', COLUMN_NAME_ANNOTATION): np.object, ('Enrichment', COLUMN_NAME_CONTEXT): np.object, ('Enrichment', COLUMN_NAME_TARGET_GENES): np.object, ('Enrichment', COLUMN_NAME_RANK_AT_MAX): np.int64, }, index=pd.MultiIndex.from_arrays([[], []], names=(COLUMN_NAME_TF, COLUMN_NAME_MOTIF_ID)), ) __all__ = ["module2features", "module2df", "modules2df", "df2regulons", "module2regulon", "modules2regulons"] LOGGER = logging.getLogger(__name__) def module2features_rcc4all_impl(
def test_make_meta(): df = pd.DataFrame({ 'a': [1, 2, 3], 'b': list('abc'), 'c': [1., 2., 3.] }, index=[10, 20, 30]) # Pandas dataframe meta = make_meta(df) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, type(df.index)) # Pandas series meta = make_meta(df.a) assert len(meta) == 0 assert meta.dtype == df.a.dtype assert isinstance(meta.index, type(df.index)) # Pandas index meta = make_meta(df.index) assert isinstance(meta, type(df.index)) assert len(meta) == 0 # Dask object ddf = dd.from_pandas(df, npartitions=2) assert make_meta(ddf) is ddf._meta # Dict meta = make_meta({'a': 'i8', 'b': 'O', 'c': 'f8'}) assert isinstance(meta, pd.DataFrame) assert len(meta) == 0 assert (meta.dtypes == df.dtypes).all() assert isinstance(meta.index, pd.RangeIndex) # Iterable meta = make_meta([('a', 'i8'), ('c', 'f8'), ('b', 'O')]) assert (meta.columns == ['a', 'c', 'b']).all() assert len(meta) == 0 assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all() assert isinstance(meta.index, pd.RangeIndex) # Tuple meta = make_meta(('a', 'i8')) assert isinstance(meta, pd.Series) assert len(meta) == 0 assert meta.dtype == 'i8' assert meta.name == 'a' # With index meta = make_meta({ 'a': 'i8', 'b': 'i4' }, index=pd.Int64Index([1, 2], name='foo')) assert isinstance(meta.index, pd.Int64Index) assert len(meta.index) == 0 meta = make_meta(('a', 'i8'), index=pd.Int64Index([1, 2], name='foo')) assert isinstance(meta.index, pd.Int64Index) assert len(meta.index) == 0 # Categoricals meta = make_meta({'a': 'category'}) assert len(meta.a.cat.categories) == 1 assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES meta = make_meta(('a', 'category')) assert len(meta.cat.categories) == 1 assert meta.cat.categories[0] == UNKNOWN_CATEGORIES # Numpy scalar meta = make_meta(np.float64(1.0)) assert isinstance(meta, np.float64) # Python scalar meta = make_meta(1.0) assert isinstance(meta, np.float64) # Timestamp x = pd.Timestamp(2000, 1, 1) meta = make_meta(x) assert meta is x # Dtype expressions meta = make_meta('i8') assert isinstance(meta, np.int64) meta = make_meta(float) assert isinstance(meta, np.dtype(float).type) meta = make_meta(np.dtype('bool')) assert isinstance(meta, np.bool_) assert pytest.raises(TypeError, lambda: make_meta(None))
import numpy as np import dask from dask.utils import raises import dask.dataframe as dd from dask.dataframe.core import _coerce_loc_index from dask.dataframe.utils import eq, make_meta dsk = { ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]), ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]), ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]), } meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8")) d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9]) full = d.compute() def test_loc(): assert d.loc[3:8].divisions[0] == 3 assert d.loc[3:8].divisions[-1] == 8 assert d.loc[5].divisions == (5, 5) assert eq(d.loc[5], full.loc[5:5]) assert eq(d.loc[3:8], full.loc[3:8]) assert eq(d.loc[:8], full.loc[:8]) assert eq(d.loc[3:], full.loc[3:])