def test_join_indexed_dataframe_to_indexed_dataframe():
    """Check index-aligned joins against pandas for every ``how``.

    For each join type the first and last divisions of the result are
    checked and the computed frame is compared with ``DataFrame.join``.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]},
                            index=[1, 2, 3, 4, 6, 7])
    left = dd.repartition(left_pdf, [1, 4, 7])

    right_pdf = pd.DataFrame({'y': list('abcdef')},
                             index=[1, 2, 4, 5, 6, 8])
    right = dd.repartition(right_pdf, [1, 2, 5, 8])

    # (how, expected first division, expected last division)
    expectations = [
        ('left', left.divisions[0], left.divisions[-1]),
        ('right', right.divisions[0], right.divisions[-1]),
        ('inner', 1, 7),
        ('outer', 1, 8),
    ]
    for how, first, last in expectations:
        joined = join_indexed_dataframes(left, right, how=how)
        assert joined.divisions[0] == first
        assert joined.divisions[-1] == last
        # DataFrame.join defaults to how='left', so spelling it out is
        # equivalent to the bare ``A.join(B)`` form.
        tm.assert_frame_equal(joined.compute(),
                              left_pdf.join(right_pdf, how=how))
def test_indexed_concat():
    """Concatenating index-partitioned frames matches ``pd.concat``.

    Both ``'inner'`` and ``'outer'`` joins are compared with pandas,
    ignoring row order. The graph keys must be identical across repeated
    calls with the same ``join`` and differ between the two join kinds.
    """
    A = pd.DataFrame({
        'x': [1, 2, 3, 4, 6, 7],
        'y': list('abcdef')
    }, index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({'x': [10, 20, 40, 50, 60, 80]},
                     index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    for how in ['inner', 'outer']:
        c = concat_indexed_dataframes([a, b], join=how)

        result = c.compute()
        # ``axis`` and ``join`` go by keyword: pandas >= 2.0 accepts only
        # ``objs`` positionally and raises TypeError otherwise.
        expected = pd.concat([A, B], axis=0, join=how)

        assert list(result.columns) == list(expected.columns)
        # Compare sorted (row values, index label) pairs so that row
        # order does not matter.
        assert sorted(zip(result.values.tolist(),
                          result.index.values.tolist())) == \
            sorted(zip(expected.values.tolist(),
                       expected.index.values.tolist()))

    # Same arguments -> same graph keys; different join -> different keys.
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) == \
        sorted(concat_indexed_dataframes([a, b], join='inner').dask)
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) != \
        sorted(concat_indexed_dataframes([a, b], join='outer').dask)
def test_hash_join():
    """hash_join agrees with ``pd.merge`` and names its output deterministically."""
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6],
                             'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6],
                              'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    def sorted_rows(frame):
        # Missing values are replaced by 100 so rows can be sorted and
        # compared as plain lists.
        return sorted(frame.fillna(100).values.tolist())

    for how in ['inner', 'left', 'right', 'outer']:
        joined = hash_join(left, 'y', right, 'y', how)

        result = joined.compute()
        expected = pd.merge(left_pdf, right_pdf, how=how, on='y')

        assert list(result.columns) == list(expected.columns)
        assert sorted_rows(result) == sorted_rows(expected)

    # Different columns and npartitions
    joined = hash_join(left, 'x', right, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3

    result = joined.compute()
    expected = pd.merge(left_pdf, right_pdf, how='outer', on=None,
                        left_on='x', right_on='z')

    assert list(result.columns) == list(expected.columns)
    assert sorted_rows(result) == sorted_rows(expected)

    # Identical arguments give identical names; a different ``how`` does not.
    first_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    second_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    assert first_inner == second_inner

    inner_name = hash_join(left, 'y', right, 'y', 'inner')._name
    outer_name = hash_join(left, 'y', right, 'y', 'outer')._name
    assert inner_name != outer_name
def test_hash_join(how):
    """hash_join with join type ``how`` agrees with ``pd.merge``.

    ``how`` is supplied from outside this function (its source is not
    visible in this block).
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6],
                             'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6],
                              'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    joined = hash_join(left, 'y', right, 'y', how)
    list_eq(joined.compute(),
            pd.merge(left_pdf, right_pdf, how=how, on='y'))

    # Different columns and npartitions
    joined = hash_join(left, 'x', right, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3
    list_eq(joined.compute(),
            pd.merge(left_pdf, right_pdf, how='outer', on=None,
                     left_on='x', right_on='z'))

    # Identical arguments give identical names; a different ``how`` does not.
    first_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    second_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    assert first_inner == second_inner

    inner_name = hash_join(left, 'y', right, 'y', 'inner')._name
    outer_name = hash_join(left, 'y', right, 'y', 'outer')._name
    assert inner_name != outer_name
def test_merge():
    """dd.merge matches pd.merge for index, column and default-key merges."""
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6],
                             'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6],
                              'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    on_index = {'left_index': True, 'right_index': True}

    list_eq(dd.merge(left, right, **on_index),
            pd.merge(left_pdf, right_pdf, **on_index))

    list_eq(dd.merge(left, right, on='y'),
            pd.merge(left_pdf, right_pdf, on='y'))

    list_eq(dd.merge(left, right, left_on='x', right_on='z'),
            pd.merge(left_pdf, right_pdf, left_on='x', right_on='z'))

    # Default keys, for every dask/pandas combination of the two inputs.
    input_pairs = [
        (left, right),
        (left, right_pdf),
        (left_pdf, right),
        (left_pdf, right_pdf),
    ]
    for lhs, rhs in input_pairs:
        list_eq(dd.merge(lhs, rhs), pd.merge(left_pdf, right_pdf))

    list_eq(dd.merge(left, right, **on_index),
            pd.merge(left_pdf, right_pdf, **on_index))
def test_join_indexed_dataframe_to_indexed_dataframe():
    """Index-aligned joins match pandas, with deterministic graph keys."""
    pdf_x = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]},
                         index=[1, 2, 3, 4, 6, 7])
    ddf_x = dd.repartition(pdf_x, [1, 4, 7])

    pdf_y = pd.DataFrame({'y': list('abcdef')},
                         index=[1, 2, 4, 5, 6, 8])
    ddf_y = dd.repartition(pdf_y, [1, 2, 5, 8])

    # Largest division boundary found in either input.
    widest = max(ddf_x.divisions + ddf_y.divisions)

    left_join = join_indexed_dataframes(ddf_x, ddf_y, how='left')
    assert left_join.divisions[0] == ddf_x.divisions[0]
    assert left_join.divisions[-1] == widest
    assert eq(left_join, pdf_x.join(pdf_y))

    right_join = join_indexed_dataframes(ddf_x, ddf_y, how='right')
    assert right_join.divisions[0] == ddf_y.divisions[0]
    assert right_join.divisions[-1] == ddf_y.divisions[-1]
    assert eq(right_join, pdf_x.join(pdf_y, how='right'))

    inner_join = join_indexed_dataframes(ddf_x, ddf_y, how='inner')
    assert inner_join.divisions[0] == 1
    assert inner_join.divisions[-1] == widest
    assert eq(inner_join.compute(), pdf_x.join(pdf_y, how='inner'))

    outer_join = join_indexed_dataframes(ddf_x, ddf_y, how='outer')
    assert outer_join.divisions[0] == 1
    assert outer_join.divisions[-1] == 8
    assert eq(outer_join.compute(), pdf_x.join(pdf_y, how='outer'))

    def graph_keys(how):
        # Sorted keys of the graph built for a fresh join of the inputs.
        return sorted(join_indexed_dataframes(ddf_x, ddf_y, how=how).dask)

    assert graph_keys('inner') == graph_keys('inner')
    assert graph_keys('inner') != graph_keys('outer')
def test_hash_join(how, shuffle):
    """hash_join agrees with ``pd.merge`` for ``how`` and the given ``shuffle``.

    ``how`` and ``shuffle`` are supplied from outside this function
    (their source is not visible in this block).
    """
    left_pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
                             "y": [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6],
                              "z": [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    joined = hash_join(left, "y", right, "y", how)
    list_eq(joined.compute(),
            pd.merge(left_pdf, right_pdf, how=how, on="y"))

    # Different columns and npartitions
    joined = hash_join(left, "x", right, "z", "outer",
                       npartitions=3, shuffle=shuffle)
    assert joined.npartitions == 3
    list_eq(joined.compute(),
            pd.merge(left_pdf, right_pdf, how="outer", on=None,
                     left_on="x", right_on="z"))

    # Identical arguments give identical names; a different ``how`` does not.
    first_inner = hash_join(left, "y", right, "y", "inner", shuffle=shuffle)._name
    second_inner = hash_join(left, "y", right, "y", "inner", shuffle=shuffle)._name
    assert first_inner == second_inner

    inner_name = hash_join(left, "y", right, "y", "inner", shuffle=shuffle)._name
    outer_name = hash_join(left, "y", right, "y", "outer", shuffle=shuffle)._name
    assert inner_name != outer_name
def test_join_indexed_dataframe_to_indexed_dataframe():
    """Index-aligned joins match pandas, with deterministic graph keys."""
    left_pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]},
                            index=[1, 2, 3, 4, 6, 7])
    left = dd.repartition(left_pdf, [1, 4, 7])

    right_pdf = pd.DataFrame({"y": list("abcdef")},
                             index=[1, 2, 4, 5, 6, 8])
    right = dd.repartition(right_pdf, [1, 2, 5, 8])

    # Largest division boundary found in either input.
    upper = max(left.divisions + right.divisions)

    # (how, first division, last division, compute before comparing?)
    cases = [
        ("left", left.divisions[0], upper, False),
        ("right", right.divisions[0], right.divisions[-1], False),
        ("inner", 1, upper, True),
        ("outer", 1, 8, True),
    ]
    for how, first, last, precompute in cases:
        joined = join_indexed_dataframes(left, right, how=how)
        assert joined.divisions[0] == first
        assert joined.divisions[-1] == last
        actual = joined.compute() if precompute else joined
        # DataFrame.join defaults to how="left", so the explicit keyword
        # is equivalent to the bare ``A.join(B)`` form.
        assert eq(actual, left_pdf.join(right_pdf, how=how))

    def graph_keys(how):
        # Sorted keys of the graph built for a fresh join of the inputs.
        return sorted(join_indexed_dataframes(left, right, how=how).dask)

    assert graph_keys("inner") == graph_keys("inner")
    assert graph_keys("inner") != graph_keys("outer")
def test_merge(how):
    """dd.merge matches pd.merge for join type ``how`` across key choices.

    Covers index/index, column/column, default-key and column/index
    merges, with and without custom ``suffixes``. ``how`` is supplied
    from outside this function (its source is not visible in this block).
    """
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    # Assert on the comparison result, as the other ``assert eq(...)``
    # call sites do; previously the return value was silently discarded.
    assert eq(dd.merge(a, b, left_index=True, right_index=True),
              pd.merge(A, B, left_index=True, right_index=True))

    result = dd.merge(a, b, on='y', how=how)
    list_eq(result, pd.merge(A, B, on='y', how=how))
    # A merge on a column is expected to leave every division unknown.
    assert all(d is None for d in result.divisions)

    list_eq(dd.merge(a, b, left_on='x', right_on='z', how=how),
            pd.merge(A, B, left_on='x', right_on='z', how=how))
    list_eq(
        dd.merge(a, b, left_on='x', right_on='z', how=how,
                 suffixes=('1', '2')),
        pd.merge(A, B, left_on='x', right_on='z', how=how,
                 suffixes=('1', '2')))

    # Default keys, for every dask/pandas combination of the two inputs.
    list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))

    list_eq(dd.merge(a, b, left_index=True, right_index=True, how=how),
            pd.merge(A, B, left_index=True, right_index=True, how=how))
    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how,
                 suffixes=('1', '2')),
        pd.merge(A, B, left_index=True, right_index=True, how=how,
                 suffixes=('1', '2')))

    list_eq(dd.merge(a, b, left_on='x', right_index=True, how=how),
            pd.merge(A, B, left_on='x', right_index=True, how=how))
    list_eq(
        dd.merge(a, b, left_on='x', right_index=True, how=how,
                 suffixes=('1', '2')),
        pd.merge(A, B, left_on='x', right_index=True, how=how,
                 suffixes=('1', '2')))
def test_repartition_on_pandas_dataframe():
    """dd.repartition turns pandas objects into dask collections.

    A DataFrame must become a ``dd.DataFrame`` and a Series a
    ``dd.Series``, each carrying the requested divisions and equal to
    its pandas source.
    """
    pdf = pd.DataFrame(
        {"x": [1, 2, 3, 4, 5, 6], "y": list("abdabd")},
        index=[10, 20, 30, 40, 50, 60],
    )

    for source, collection_type in ((pdf, dd.DataFrame), (pdf.y, dd.Series)):
        ddf = dd.repartition(source, divisions=[10, 20, 50, 60])
        assert isinstance(ddf, collection_type)
        assert ddf.divisions == (10, 20, 50, 60)
        assert eq(ddf, source)
def test_align_partitions():
    """align_partitions yields shared divisions and leaves the data unchanged."""
    pdf_a = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                         index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(pdf_a, [10, 40, 60])

    pdf_b = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                         index=[30, 70, 80, 100])
    b = dd.repartition(pdf_b, [30, 80, 100])

    (aa, bb), divisions, parts = align_partitions(a, b)
    for ddf in (a, b):
        assert isinstance(ddf, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(parts, list)
    assert len(divisions) == 1 + len(parts)
    # One [left key, right key] pair per aligned partition.
    assert parts == [[(aa._name, i), (bb._name, i)] for i in range(5)]

    def roundtrip(left_pdf, right_pdf):
        # Align the frames for several partition-count combinations and
        # check that each aligned result still equals its source.
        for nleft, nright in [(1, 1), (2, 2), (2, 3), (3, 2)]:
            lhs = dd.from_pandas(left_pdf, nleft)
            rhs = dd.from_pandas(right_pdf, nright)
            (lresult, rresult), _, _ = dd.multi.align_partitions(lhs, rhs)
            assert eq(lresult, left_pdf)
            assert eq(rresult, right_pdf)

    roundtrip(
        pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                      'b': [7, 6, 5, 4, 3, 2, 1]}),
        pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                      'd': [7, 6, 5, 4, 3, 2, 1]}),
    )

    # different index
    roundtrip(
        pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                      'b': [7, 6, 5, 4, 3, 2, 1]},
                     index=list('abcdefg')),
        pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                      'd': [7, 6, 5, 4, 3, 2, 1]},
                     index=list('fghijkl')),
    )
def test_repartition_on_pandas_dataframe():
    """dd.repartition accepts a pandas DataFrame or Series directly.

    The result must be the matching dask collection type, carry the
    requested divisions and compare equal to the pandas input.
    """
    pdf = pd.DataFrame(
        {'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
        index=[10, 20, 30, 40, 50, 60],
    )
    expected_divisions = (10, 20, 50, 60)

    frame = dd.repartition(pdf, divisions=[10, 20, 50, 60])
    assert isinstance(frame, dd.DataFrame)
    assert frame.divisions == expected_divisions
    assert eq(frame, pdf)

    series = dd.repartition(pdf.y, divisions=[10, 20, 50, 60])
    assert isinstance(series, dd.Series)
    assert series.divisions == expected_divisions
    assert eq(series, pdf.y)
def test_rearrange_by_column_with_narrow_divisions():
    """rearrange_by_divisions on 'x' with (0, 2, 5) keeps the frame's rows.

    The rearranged frame is compared with its input via ``list_eq``.
    """
    from dask.dataframe.tests.test_multi import list_eq

    pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6],
                        'y': [1, 1, 2, 2, 3, 4]})
    ddf = dd.repartition(pdf, [0, 4, 5])

    rearranged = rearrange_by_divisions(ddf, 'x', (0, 2, 5))
    list_eq(rearranged, ddf)
def test_merge(how, shuffle):
    """dd.merge matches pd.merge for ``how`` under the given ``shuffle``.

    Covers index/index, column/column, default-key and column/index
    merges, with and without custom ``suffixes``. ``how`` and ``shuffle``
    are supplied from outside this function (their source is not visible
    in this block).
    """
    A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    # Assert on the comparison result, as the other ``assert eq(...)``
    # call sites do; previously the return value was silently discarded.
    assert eq(
        dd.merge(a, b, left_index=True, right_index=True, shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True),
    )

    result = dd.merge(a, b, on="y", how=how)
    list_eq(result, pd.merge(A, B, on="y", how=how))
    # A merge on a column is expected to leave every division unknown.
    assert all(d is None for d in result.divisions)

    list_eq(
        dd.merge(a, b, left_on="x", right_on="z", how=how, shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_on="z", how=how),
    )
    list_eq(
        dd.merge(a, b, left_on="x", right_on="z", how=how,
                 suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_on="z", how=how,
                 suffixes=("1", "2")),
    )

    # Default keys, for every dask/pandas combination of the two inputs.
    list_eq(dd.merge(a, b, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(a, B, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, b, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, B, how=how, shuffle=shuffle), pd.merge(A, B, how=how))

    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how,
                 shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True, how=how),
    )
    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how,
                 suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True, how=how,
                 suffixes=("1", "2")),
    )

    list_eq(
        dd.merge(a, b, left_on="x", right_index=True, how=how,
                 shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_index=True, how=how),
    )
    list_eq(
        dd.merge(a, b, left_on="x", right_index=True, how=how,
                 suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_index=True, how=how,
                 suffixes=("1", "2")),
    )
def test_indexed_concat(join):
    """concat_indexed_dataframes matches ``pd.concat`` for the given ``join``.

    Also checks that graph keys are identical across repeated calls with
    the same ``join`` and differ between 'inner' and 'outer'. ``join`` is
    supplied from outside this function (its source is not visible in
    this block).
    """
    pdf_a = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7], 'y': list('abcdef')},
                         index=[1, 2, 3, 4, 6, 7])
    ddf_a = dd.repartition(pdf_a, [1, 4, 7])

    pdf_b = pd.DataFrame({'x': [10, 20, 40, 50, 60, 80]},
                         index=[1, 2, 4, 5, 6, 8])
    ddf_b = dd.repartition(pdf_b, [1, 2, 5, 8])

    actual = concat_indexed_dataframes([ddf_a, ddf_b], join=join)
    wanted = pd.concat([pdf_a, pdf_b], axis=0, join=join)
    assert eq(actual, wanted)

    def graph_keys(join_kind):
        # Sorted keys of the graph built for a fresh concatenation.
        fresh = concat_indexed_dataframes([ddf_a, ddf_b], join=join_kind)
        return sorted(fresh.dask)

    assert graph_keys(join) == graph_keys(join)
    assert graph_keys('inner') != graph_keys('outer')
def test_merge(how, shuffle):
    """dd.merge matches pd.merge for ``how`` under the given ``shuffle``.

    Covers index/index, column/column, default-key and column/index
    merges, with and without custom ``suffixes``. ``how`` and ``shuffle``
    are supplied from outside this function (their source is not visible
    in this block).
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6],
                             'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6],
                              'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    def compare(lhs, rhs, **keys):
        # Merge ``lhs``/``rhs`` through dask with the requested shuffle and
        # compare against the plain pandas merge of the source frames.
        list_eq(dd.merge(lhs, rhs, how=how, shuffle=shuffle, **keys),
                pd.merge(left_pdf, right_pdf, how=how, **keys))

    on_index = {'left_index': True, 'right_index': True}
    custom_suffixes = ('1', '2')

    assert_eq(dd.merge(left, right, how=how, shuffle=shuffle, **on_index),
              pd.merge(left_pdf, right_pdf, how=how, **on_index))

    result = dd.merge(left, right, on='y', how=how)
    list_eq(result, pd.merge(left_pdf, right_pdf, on='y', how=how))
    assert all(d is None for d in result.divisions)

    compare(left, right, left_on='x', right_on='z')
    compare(left, right, left_on='x', right_on='z',
            suffixes=custom_suffixes)

    # Default keys, for every dask/pandas combination of the two inputs.
    compare(left, right)
    compare(left, right_pdf)
    compare(left_pdf, right)
    compare(left_pdf, right_pdf)

    compare(left, right, **on_index)
    compare(left, right, suffixes=custom_suffixes, **on_index)

    compare(left, right, left_on='x', right_index=True)
    compare(left, right, left_on='x', right_index=True,
            suffixes=custom_suffixes)
def test_align_partitions():
    """align_partitions merges the divisions and reports partition keys.

    Entries of the returned list are ``None`` where the expected layout
    has no partition for that input.
    """
    pdf_a = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                         index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(pdf_a, [10, 40, 60])

    pdf_b = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                         index=[30, 70, 80, 100])
    b = dd.repartition(pdf_b, [30, 80, 100])

    (aa, bb), divisions, pieces = align_partitions(a, b)

    for ddf in (a, b):
        assert isinstance(ddf, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(pieces, list)
    assert len(divisions) == 1 + len(pieces)

    name_a, name_b = aa._name, bb._name
    expected = [
        [(name_a, 0), None],
        [(name_a, 1), (name_b, 0)],
        [(name_a, 2), (name_b, 1)],
        [None, (name_b, 2)],
        [None, (name_b, 3)],
    ]
    assert pieces == expected
def test_align_partitions():
    """align_partitions merges the divisions and reports partition keys.

    Entries of the returned list are ``None`` where the expected layout
    has no partition for that input.
    """
    first_pdf = pd.DataFrame(
        {'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
        index=[10, 20, 30, 40, 50, 60],
    )
    a = dd.repartition(first_pdf, [10, 40, 60])

    second_pdf = pd.DataFrame(
        {'x': [1, 2, 3, 4], 'y': list('abda')},
        index=[30, 70, 80, 100],
    )
    b = dd.repartition(second_pdf, [30, 80, 100])

    aligned, divisions, layout = align_partitions(a, b)
    aa, bb = aligned

    assert isinstance(a, dd.DataFrame)
    assert isinstance(b, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(layout, list)
    assert len(divisions) == 1 + len(layout)

    # Expected [left key, right key] per output partition.
    left_keys = [(aa._name, 0), (aa._name, 1), (aa._name, 2), None, None]
    right_keys = [None, (bb._name, 0), (bb._name, 1),
                  (bb._name, 2), (bb._name, 3)]
    assert layout == [[lk, rk] for lk, rk in zip(left_keys, right_keys)]
def test_indexed_concat():
    """Concatenating index-partitioned frames matches ``pd.concat``.

    Both ``'inner'`` and ``'outer'`` joins are compared with pandas,
    ignoring row order. The graph keys must be identical across repeated
    calls with the same ``join`` and differ between the two join kinds.
    """
    A = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7], 'y': list('abcdef')},
                     index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({'x': [10, 20, 40, 50, 60, 80]},
                     index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    for how in ['inner', 'outer']:
        c = concat_indexed_dataframes([a, b], join=how)

        result = c.compute()
        # ``axis`` and ``join`` go by keyword: pandas >= 2.0 accepts only
        # ``objs`` positionally and raises TypeError otherwise.
        expected = pd.concat([A, B], axis=0, join=how)

        assert list(result.columns) == list(expected.columns)
        # Compare sorted (row values, index label) pairs so that row
        # order does not matter.
        assert sorted(zip(result.values.tolist(),
                          result.index.values.tolist())) == \
            sorted(zip(expected.values.tolist(),
                       expected.index.values.tolist()))

    # Same arguments -> same graph keys; different join -> different keys.
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) == \
        sorted(concat_indexed_dataframes([a, b], join='inner').dask)
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) != \
        sorted(concat_indexed_dataframes([a, b], join='outer').dask)
def test_align_partitions():
    """align_partitions aligns frames (and a Scalar) without changing data."""
    pdf_a = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                         index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(pdf_a, [10, 40, 60])

    pdf_b = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                         index=[30, 70, 80, 100])
    b = dd.repartition(pdf_b, [30, 80, 100])

    s = dd.core.Scalar({('s', 0): 10}, 's')

    (aa, bb), divisions, parts = align_partitions(a, b)

    def _check(before_a, before_b, after_a, after_b):
        # Reads ``divisions`` and ``parts`` from the enclosing scope, so it
        # always validates the most recent align_partitions result.
        for frame in (before_a, before_b, after_a, after_b):
            assert isinstance(frame, dd.DataFrame)
        assert eq(before_a, after_a)
        assert eq(before_b, after_b)
        assert divisions == (10, 30, 40, 60, 80, 100)
        assert isinstance(parts, list)
        assert len(divisions) == 1 + len(parts)

    _check(a, b, aa, bb)
    assert parts == [[(aa._name, i), (bb._name, i)] for i in range(5)]

    (aa, ss, bb), divisions, parts = align_partitions(a, s, b)
    _check(a, b, aa, bb)
    # The expected layout has ``None`` in the Scalar's slot.
    assert parts == [[(aa._name, i), None, (bb._name, i)] for i in range(5)]
    assert eq(ss, 10)

    def roundtrip(left_pdf, right_pdf):
        # Align the frames for several partition-count combinations and
        # check that each aligned result still equals its source.
        for nleft, nright in [(1, 1), (2, 2), (2, 3), (3, 2)]:
            lhs = dd.from_pandas(left_pdf, nleft)
            rhs = dd.from_pandas(right_pdf, nright)
            (lresult, rresult), _, _ = dd.multi.align_partitions(lhs, rhs)
            assert eq(lresult, left_pdf)
            assert eq(rresult, right_pdf)

    roundtrip(
        pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                      'b': [7, 6, 5, 4, 3, 2, 1]}),
        pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                      'd': [7, 6, 5, 4, 3, 2, 1]}),
    )

    # different index
    roundtrip(
        pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                      'b': [7, 6, 5, 4, 3, 2, 1]},
                     index=list('abcdefg')),
        pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                      'd': [7, 6, 5, 4, 3, 2, 1]},
                     index=list('fghijkl')),
    )
def ddf_right(df_right):
    """Return ``df_right`` repartitioned on fixed integer divisions."""
    # Eleven boundaries give a frame with 10 partitions. Division 3 is
    # skipped so there is one mismatch with ddf_left.
    divisions = [bound for bound in range(12) if bound != 3]
    return dd.repartition(df_right, divisions)
def test_align_partitions():
    """align_partitions aligns frames (and a Scalar) without changing data."""
    left_source = pd.DataFrame(
        {"x": [1, 2, 3, 4, 5, 6], "y": list("abdabd")},
        index=[10, 20, 30, 40, 50, 60],
    )
    a = dd.repartition(left_source, [10, 40, 60])

    right_source = pd.DataFrame(
        {"x": [1, 2, 3, 4], "y": list("abda")},
        index=[30, 70, 80, 100],
    )
    b = dd.repartition(right_source, [30, 80, 100])

    s = dd.core.Scalar({("s", 0): 10}, "s", "i8")

    (aa, bb), divisions, layout = align_partitions(a, b)

    def _check(before_a, before_b, after_a, after_b):
        # Reads ``divisions`` and ``layout`` from the enclosing scope, so it
        # always validates the most recent align_partitions result.
        for frame in (before_a, before_b, after_a, after_b):
            assert isinstance(frame, dd.DataFrame)
        assert eq(before_a, after_a)
        assert eq(before_b, after_b)
        assert divisions == (10, 30, 40, 60, 80, 100)
        assert isinstance(layout, list)
        assert len(divisions) == 1 + len(layout)

    _check(a, b, aa, bb)
    assert layout == [[(aa._name, i), (bb._name, i)] for i in range(5)]

    (aa, ss, bb), divisions, layout = align_partitions(a, s, b)
    _check(a, b, aa, bb)
    # The expected layout has ``None`` in the Scalar's slot.
    assert layout == [[(aa._name, i), None, (bb._name, i)] for i in range(5)]
    assert eq(ss, 10)

    partition_counts = [(1, 1), (2, 2), (2, 3), (3, 2)]
    values_up = [1, 2, 3, 4, 5, 6, 7]
    values_down = [7, 6, 5, 4, 3, 2, 1]

    ldf = pd.DataFrame({"a": values_up, "b": values_down})
    rdf = pd.DataFrame({"c": values_up, "d": values_down})
    for nleft, nright in partition_counts:
        lhs = dd.from_pandas(ldf, nleft)
        rhs = dd.from_pandas(rdf, nright)
        (lresult, rresult), _, _ = align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)

    # different index
    ldf = pd.DataFrame({"a": values_up, "b": values_down},
                       index=list("abcdefg"))
    rdf = pd.DataFrame({"c": values_up, "d": values_down},
                       index=list("fghijkl"))
    for nleft, nright in partition_counts:
        lhs = dd.from_pandas(ldf, nleft)
        rhs = dd.from_pandas(rdf, nright)
        (lresult, rresult), _, _ = align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)