def test_align_partitions_unknown_divisions():
    """``align_partitions`` must raise ``ValueError`` when an input has unknown divisions.

    Two situations are exercised: one input with known divisions paired with
    one without, and two inputs that both lack known divisions.
    """
    pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]})

    # One known, one unknown
    with_divisions = dd.from_pandas(pdf, npartitions=2)
    without_divisions = dd.from_pandas(pdf, npartitions=2, sort=False)
    assert not without_divisions.known_divisions
    with pytest.raises(ValueError):
        align_partitions(with_divisions, without_divisions)

    # Both unknown
    shifted = dd.from_pandas(pdf + 1, npartitions=2, sort=False)
    plain = dd.from_pandas(pdf, npartitions=2, sort=False)
    for frame in (shifted, plain):
        assert not frame.known_divisions
    with pytest.raises(ValueError):
        align_partitions(shifted, plain)
def test_align_partitions():
    """Check ``align_partitions`` on two frames with partly overlapping divisions.

    The first part pins the merged divisions and the per-interval key layout.
    The second part aligns ``from_pandas`` frames split into various partition
    counts and checks (via ``eq``) that each result still matches its pandas
    source, both for a shared default index and for differing string indexes.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                            index=[10, 20, 30, 40, 50, 60])
    left = dd.repartition(left_pdf, [10, 40, 60])

    right_pdf = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                             index=[30, 70, 80, 100])
    right = dd.repartition(right_pdf, [30, 80, 100])

    (left_aligned, right_aligned), divisions, part_keys = align_partitions(left, right)

    assert isinstance(left, dd.DataFrame)
    assert isinstance(right, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(part_keys, list)
    assert len(divisions) == 1 + len(part_keys)

    # One [left key, right key] pair is expected per interval between divisions.
    expected_keys = [[(left_aligned._name, i), (right_aligned._name, i)]
                     for i in range(5)]
    assert part_keys == expected_keys

    def _assert_roundtrip(ldf, rdf):
        # Build every (left, right) dask pair up front, then align each pair.
        partition_counts = [(1, 1), (2, 2), (2, 3), (3, 2)]
        pairs = [(dd.from_pandas(ldf, nleft), dd.from_pandas(rdf, nright))
                 for nleft, nright in partition_counts]
        for lhs, rhs in pairs:
            (lresult, rresult), _div, _parts = dd.multi.align_partitions(lhs, rhs)
            assert eq(lresult, ldf)
            assert eq(rresult, rdf)

    left_columns = {'a': [1, 2, 3, 4, 5, 6, 7], 'b': [7, 6, 5, 4, 3, 2, 1]}
    right_columns = {'c': [1, 2, 3, 4, 5, 6, 7], 'd': [7, 6, 5, 4, 3, 2, 1]}

    _assert_roundtrip(pd.DataFrame(left_columns), pd.DataFrame(right_columns))

    # different index
    _assert_roundtrip(pd.DataFrame(left_columns, index=list('abcdefg')),
                      pd.DataFrame(right_columns, index=list('fghijkl')))
def test_align_partitions():
    """Check the divisions and key layout returned by ``align_partitions``.

    This variant expects a ``None`` placeholder for a frame in every interval
    where the expected layout lists no partition for it.
    """
    left = dd.repartition(
        pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                     index=[10, 20, 30, 40, 50, 60]),
        [10, 40, 60],
    )
    right = dd.repartition(
        pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                     index=[30, 70, 80, 100]),
        [30, 80, 100],
    )

    (left_aligned, right_aligned), divisions, part_keys = align_partitions(left, right)

    assert isinstance(left, dd.DataFrame)
    assert isinstance(right, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(part_keys, list)
    assert len(divisions) == 1 + len(part_keys)

    # Expected key per interval for each frame; None marks "no partition here".
    left_column = [(left_aligned._name, 0), (left_aligned._name, 1),
                   (left_aligned._name, 2), None, None]
    right_column = [None, (right_aligned._name, 0), (right_aligned._name, 1),
                    (right_aligned._name, 2), (right_aligned._name, 3)]
    expected_keys = [list(pair) for pair in zip(left_column, right_column)]
    assert part_keys == expected_keys
def test_align_partitions():
    """Exercise ``align_partitions`` with frames, a scalar, and mixed partition counts.

    Covers three things: aligning two repartitioned frames, aligning the same
    frames with a ``Scalar`` placed between them (its slot in every key row is
    expected to be ``None``), and aligning ``from_pandas`` frames split into
    differing numbers of partitions.
    """
    a = dd.repartition(
        pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                     index=[10, 20, 30, 40, 50, 60]),
        [10, 40, 60],
    )
    b = dd.repartition(
        pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                     index=[30, 70, 80, 100]),
        [30, 80, 100],
    )
    s = dd.core.Scalar({('s', 0): 10}, 's', 'i8')

    def _verify(aligned_a, aligned_b, divisions, keys):
        # Types of inputs and outputs, equality under ``eq``, then the
        # divisions / key-list shape shared by both alignment calls.
        assert isinstance(a, dd.DataFrame)
        assert isinstance(b, dd.DataFrame)
        assert isinstance(aligned_a, dd.DataFrame)
        assert isinstance(aligned_b, dd.DataFrame)
        assert eq(a, aligned_a)
        assert eq(b, aligned_b)
        assert divisions == (10, 30, 40, 60, 80, 100)
        assert isinstance(keys, list)
        assert len(divisions) == 1 + len(keys)

    # Two frames only.
    (aa, bb), divisions, L = align_partitions(a, b)
    _verify(aa, bb, divisions, L)
    assert L == [[(aa._name, i), (bb._name, i)] for i in range(5)]

    # Scalar in the middle: its column in the key rows is None throughout.
    (aa, ss, bb), divisions, L = align_partitions(a, s, b)
    _verify(aa, bb, divisions, L)
    assert L == [[(aa._name, i), None, (bb._name, i)] for i in range(5)]
    assert eq(ss, 10)

    def _assert_roundtrip(ldf, rdf):
        # Build every (left, right) dask pair up front, then align each pair.
        pairs = [(dd.from_pandas(ldf, nleft), dd.from_pandas(rdf, nright))
                 for nleft, nright in [(1, 1), (2, 2), (2, 3), (3, 2)]]
        for lhs, rhs in pairs:
            (lresult, rresult), _div, _parts = align_partitions(lhs, rhs)
            assert eq(lresult, ldf)
            assert eq(rresult, rdf)

    left_columns = {'a': [1, 2, 3, 4, 5, 6, 7], 'b': [7, 6, 5, 4, 3, 2, 1]}
    right_columns = {'c': [1, 2, 3, 4, 5, 6, 7], 'd': [7, 6, 5, 4, 3, 2, 1]}

    _assert_roundtrip(pd.DataFrame(left_columns), pd.DataFrame(right_columns))

    # different index
    _assert_roundtrip(pd.DataFrame(left_columns, index=list('abcdefg')),
                      pd.DataFrame(right_columns, index=list('fghijkl')))
def test_align_partitions():
    """Verify ``align_partitions`` output for frames, a frame/scalar/frame mix,
    and ``from_pandas`` inputs with unequal partition counts.

    For the repartitioned frames the test pins the merged divisions and the
    exact key rows; for the ``from_pandas`` inputs it only checks (via ``eq``)
    that each aligned result matches its pandas source.
    """
    expected_divisions = (10, 30, 40, 60, 80, 100)

    frame_a = pd.DataFrame(
        {"x": [1, 2, 3, 4, 5, 6], "y": list("abdabd")},
        index=[10, 20, 30, 40, 50, 60],
    )
    a = dd.repartition(frame_a, [10, 40, 60])
    frame_b = pd.DataFrame(
        {"x": [1, 2, 3, 4], "y": list("abda")},
        index=[30, 70, 80, 100],
    )
    b = dd.repartition(frame_b, [30, 80, 100])
    scalar = dd.core.Scalar({("s", 0): 10}, "s", "i8")

    def _common_checks(out_a, out_b, divs, rows):
        # Assertions that hold for both the two-argument and three-argument call.
        for obj in (a, b, out_a, out_b):
            assert isinstance(obj, dd.DataFrame)
        assert eq(a, out_a)
        assert eq(b, out_b)
        assert divs == expected_divisions
        assert isinstance(rows, list)
        assert len(divs) == 1 + len(rows)

    def _expected_rows(*names):
        # One row per interval; a name of None yields a None slot in each row.
        return [
            [None if name is None else (name, i) for name in names]
            for i in range(5)
        ]

    (aa, bb), divisions, L = align_partitions(a, b)
    _common_checks(aa, bb, divisions, L)
    assert L == _expected_rows(aa._name, bb._name)

    (aa, ss, bb), divisions, L = align_partitions(a, scalar, b)
    _common_checks(aa, bb, divisions, L)
    assert L == _expected_rows(aa._name, None, bb._name)
    assert eq(ss, 10)

    left_columns = {"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]}
    right_columns = {"c": [1, 2, 3, 4, 5, 6, 7], "d": [7, 6, 5, 4, 3, 2, 1]}
    cases = [
        # default integer index on both sides
        (pd.DataFrame(left_columns), pd.DataFrame(right_columns)),
        # different index
        (
            pd.DataFrame(left_columns, index=list("abcdefg")),
            pd.DataFrame(right_columns, index=list("fghijkl")),
        ),
    ]
    partition_counts = [(1, 1), (2, 2), (2, 3), (3, 2)]

    for ldf, rdf in cases:
        pairs = [
            (dd.from_pandas(ldf, nleft), dd.from_pandas(rdf, nright))
            for nleft, nright in partition_counts
        ]
        for lhs, rhs in pairs:
            (lresult, rresult), _div, _parts = align_partitions(lhs, rhs)
            assert eq(lresult, ldf)
            assert eq(rresult, rdf)