Exemple #1
0
def test_join_indexed_dataframe_to_indexed_dataframe():
    """Compare ``join_indexed_dataframes`` with ``DataFrame.join`` for each join type.

    For every join type the first and last divisions of the result are
    checked, and the computed frame must equal the pandas join.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 6, 7])
    left = dd.repartition(left_pdf, [1, 4, 7])

    right_pdf = pd.DataFrame({'y': list('abcdef')}, index=[1, 2, 4, 5, 6, 8])
    right = dd.repartition(right_pdf, [1, 2, 5, 8])

    # (join type, expected first division, expected last division)
    expectations = [
        ('left', left.divisions[0], left.divisions[-1]),
        ('right', right.divisions[0], right.divisions[-1]),
        ('inner', 1, 7),
        ('outer', 1, 8),
    ]
    for how, first_division, last_division in expectations:
        joined = join_indexed_dataframes(left, right, how=how)
        assert joined.divisions[0] == first_division
        assert joined.divisions[-1] == last_division
        # ``how='left'`` is the pandas default, so it is spelled out here.
        tm.assert_frame_equal(joined.compute(),
                              left_pdf.join(right_pdf, how=how))
Exemple #2
0
def test_indexed_concat():
    """Compare ``concat_indexed_dataframes`` with ``pd.concat`` for inner/outer joins.

    The computed result must have the same columns and the same
    (row values, index label) pairs as pandas, ignoring row order.
    Finally the graph keys are checked: two identical calls must produce
    the same keys, and an inner concat must differ from an outer one.
    """
    A = pd.DataFrame({
        'x': [1, 2, 3, 4, 6, 7],
        'y': list('abcdef')
    },
                     index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({'x': [10, 20, 40, 50, 60, 80]}, index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    for how in ['inner', 'outer']:
        c = concat_indexed_dataframes([a, b], join=how)

        result = c.compute()
        # ``axis`` and ``join`` are passed by keyword: pandas >= 2.0 accepts
        # only ``objs`` positionally.
        expected = pd.concat([A, B], axis=0, join=how)

        assert list(result.columns) == list(expected.columns)

        # Order-insensitive comparison of (row values, index label) pairs.
        assert sorted(zip(result.values.tolist(), result.index.values.tolist())) == \
               sorted(zip(expected.values.tolist(), expected.index.values.tolist()))

    # Graph keys must be deterministic and depend on the join type.
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) == \
           sorted(concat_indexed_dataframes([a, b], join='inner').dask)
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) != \
           sorted(concat_indexed_dataframes([a, b], join='outer').dask)
Exemple #3
0
def test_hash_join():
    """Compare ``hash_join`` with ``pd.merge`` for every join type.

    Rows are compared without regard to order, with missing values
    replaced by 100 so they sort and compare cleanly. Also covers joining
    on differently named columns with an explicit ``npartitions``, and
    that the result name is deterministic and depends on the join type.
    """

    def assert_same_rows(actual, wanted):
        # Same columns in the same order, same rows in any order.
        assert list(actual.columns) == list(wanted.columns)
        assert sorted(actual.fillna(100).values.tolist()) == \
               sorted(wanted.fillna(100).values.tolist())

    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    for how in ['inner', 'left', 'right', 'outer']:
        joined = hash_join(left, 'y', right, 'y', how)
        assert_same_rows(joined.compute(),
                         pd.merge(left_pdf, right_pdf, how=how, on='y'))

    # Different columns and npartitions
    joined = hash_join(left, 'x', right, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3
    assert_same_rows(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how='outer', left_on='x', right_on='z'))

    # Names: equal for identical calls, different across join types.
    first_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    second_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    assert first_inner == second_inner

    third_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    outer = hash_join(left, 'y', right, 'y', 'outer')._name
    assert third_inner != outer
Exemple #4
0
def test_hash_join(how):
    """Compare ``hash_join`` with ``pd.merge`` for the join type ``how``.

    Also covers an outer join on differently named columns with
    ``npartitions=3``, and checks that the result name is deterministic
    and depends on the join type.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    joined = hash_join(left, 'y', right, 'y', how)
    list_eq(joined.compute(), pd.merge(left_pdf, right_pdf, how=how, on='y'))

    # Different columns and npartitions
    joined = hash_join(left, 'x', right, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3
    list_eq(joined.compute(),
            pd.merge(left_pdf, right_pdf, how='outer',
                     left_on='x', right_on='z'))

    # Names: equal for identical calls, different across join types.
    first_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    second_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    assert first_inner == second_inner

    third_inner = hash_join(left, 'y', right, 'y', 'inner')._name
    outer = hash_join(left, 'y', right, 'y', 'outer')._name
    assert third_inner != outer
Exemple #5
0
def test_merge():
    """Compare ``dd.merge`` with ``pd.merge`` for several key specifications.

    Covers index-on-index merges, merging on a shared column, merging on
    differently named columns, the default key inference, and every mix
    of dask and pandas inputs.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    by_index = {'left_index': True, 'right_index': True}

    list_eq(dd.merge(left, right, **by_index),
            pd.merge(left_pdf, right_pdf, **by_index))

    list_eq(dd.merge(left, right, on='y'),
            pd.merge(left_pdf, right_pdf, on='y'))

    list_eq(dd.merge(left, right, left_on='x', right_on='z'),
            pd.merge(left_pdf, right_pdf, left_on='x', right_on='z'))

    # Default keys, for every mix of dask and pandas inputs.
    input_pairs = [
        (left, right),
        (left, right_pdf),
        (left_pdf, right),
        (left_pdf, right_pdf),
    ]
    for lhs, rhs in input_pairs:
        list_eq(dd.merge(lhs, rhs), pd.merge(left_pdf, right_pdf))

    # The index-on-index merge is checked a second time, as in the original.
    list_eq(dd.merge(left, right, **by_index),
            pd.merge(left_pdf, right_pdf, **by_index))
Exemple #6
0
def test_merge():
    """Compare ``dd.merge`` with ``pd.merge`` for several key specifications.

    The key specifications are driven from a list of keyword sets; the
    default-key merge is then repeated for mixed dask/pandas inputs.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    # Key specifications exercised with two dask inputs, in order.
    key_specs = [
        {'left_index': True, 'right_index': True},
        {'on': 'y'},
        {'left_on': 'x', 'right_on': 'z'},
        {},
    ]
    for spec in key_specs:
        list_eq(dd.merge(left, right, **spec),
                pd.merge(left_pdf, right_pdf, **spec))

    # Default keys with at least one pandas input.
    for lhs, rhs in [(left, right_pdf), (left_pdf, right), (left_pdf, right_pdf)]:
        list_eq(dd.merge(lhs, rhs), pd.merge(left_pdf, right_pdf))

    # The index-on-index merge is checked a second time, as in the original.
    list_eq(dd.merge(left, right, left_index=True, right_index=True),
            pd.merge(left_pdf, right_pdf, left_index=True, right_index=True))
Exemple #7
0
def test_hash_join():
    """Compare ``hash_join`` with ``pd.merge`` for every join type.

    Rows are compared order-insensitively after filling missing values
    with 100. Also covers differently named join columns with
    ``npartitions=3`` and the determinism of the result name.
    """
    pdf_a = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    ddf_a = dd.repartition(pdf_a, [0, 4, 5])

    pdf_b = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    ddf_b = dd.repartition(pdf_b, [0, 2, 5])

    def rows(frame):
        # Row values as a sorted list, with NaN replaced by 100.
        return sorted(frame.fillna(100).values.tolist())

    for how in ['inner', 'left', 'right', 'outer']:
        result = hash_join(ddf_a, 'y', ddf_b, 'y', how).compute()
        expected = pd.merge(pdf_a, pdf_b, how=how, on='y')

        assert list(result.columns) == list(expected.columns)
        assert rows(result) == rows(expected)

    # Different columns and npartitions
    joined = hash_join(ddf_a, 'x', ddf_b, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3

    result = joined.compute()
    expected = pd.merge(pdf_a, pdf_b, how='outer', left_on='x', right_on='z')
    assert list(result.columns) == list(expected.columns)
    assert rows(result) == rows(expected)

    # Names: equal for identical calls, different across join types.
    name_one = hash_join(ddf_a, 'y', ddf_b, 'y', 'inner')._name
    name_two = hash_join(ddf_a, 'y', ddf_b, 'y', 'inner')._name
    assert name_one == name_two

    name_inner = hash_join(ddf_a, 'y', ddf_b, 'y', 'inner')._name
    name_outer = hash_join(ddf_a, 'y', ddf_b, 'y', 'outer')._name
    assert name_inner != name_outer
Exemple #8
0
def test_join_indexed_dataframe_to_indexed_dataframe():
    """Compare ``join_indexed_dataframes`` with ``DataFrame.join`` for each join type.

    Checks the first and last divisions of every result and the joined
    values, then that graph keys are deterministic and depend on the
    join type.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]},
                            index=[1, 2, 3, 4, 6, 7])
    left = dd.repartition(left_pdf, [1, 4, 7])

    right_pdf = pd.DataFrame({'y': list('abcdef')},
                             index=[1, 2, 4, 5, 6, 8])
    right = dd.repartition(right_pdf, [1, 2, 5, 8])

    # Largest division found on either input.
    widest_end = max(left.divisions + right.divisions)

    joined = join_indexed_dataframes(left, right, how='left')
    assert joined.divisions[0] == left.divisions[0]
    assert joined.divisions[-1] == widest_end
    assert eq(joined, left_pdf.join(right_pdf))

    joined = join_indexed_dataframes(left, right, how='right')
    assert joined.divisions[0] == right.divisions[0]
    assert joined.divisions[-1] == right.divisions[-1]
    assert eq(joined, left_pdf.join(right_pdf, how='right'))

    joined = join_indexed_dataframes(left, right, how='inner')
    assert joined.divisions[0] == 1
    assert joined.divisions[-1] == widest_end
    assert eq(joined.compute(), left_pdf.join(right_pdf, how='inner'))

    joined = join_indexed_dataframes(left, right, how='outer')
    assert joined.divisions[0] == 1
    assert joined.divisions[-1] == 8
    assert eq(joined.compute(), left_pdf.join(right_pdf, how='outer'))

    # Graph keys: equal for identical calls, different across join types.
    keys_one = sorted(join_indexed_dataframes(left, right, how='inner').dask)
    keys_two = sorted(join_indexed_dataframes(left, right, how='inner').dask)
    assert keys_one == keys_two

    keys_inner = sorted(join_indexed_dataframes(left, right, how='inner').dask)
    keys_outer = sorted(join_indexed_dataframes(left, right, how='outer').dask)
    assert keys_inner != keys_outer
Exemple #9
0
def test_hash_join(how, shuffle):
    """Compare ``hash_join`` with ``pd.merge`` for ``how`` and ``shuffle``.

    The first join uses ``how`` without passing ``shuffle``; the remaining
    calls forward ``shuffle`` to ``hash_join``.
    """
    left_pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    joined = hash_join(left, "y", right, "y", how)
    list_eq(joined.compute(), pd.merge(left_pdf, right_pdf, how=how, on="y"))

    # Different columns and npartitions
    joined = hash_join(left, "x", right, "z", "outer", npartitions=3, shuffle=shuffle)
    assert joined.npartitions == 3
    list_eq(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how="outer", left_on="x", right_on="z"),
    )

    # Names: equal for identical calls, different across join types.
    first_inner = hash_join(left, "y", right, "y", "inner", shuffle=shuffle)._name
    second_inner = hash_join(left, "y", right, "y", "inner", shuffle=shuffle)._name
    assert first_inner == second_inner

    third_inner = hash_join(left, "y", right, "y", "inner", shuffle=shuffle)._name
    outer = hash_join(left, "y", right, "y", "outer", shuffle=shuffle)._name
    assert third_inner != outer
Exemple #10
0
def test_join_indexed_dataframe_to_indexed_dataframe():
    """Compare ``join_indexed_dataframes`` with ``DataFrame.join`` for each join type.

    Checks the boundary divisions and joined values of every result,
    followed by determinism of the graph keys.
    """
    pdf_x = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 6, 7])
    ddf_x = dd.repartition(pdf_x, [1, 4, 7])

    pdf_y = pd.DataFrame({"y": list("abcdef")}, index=[1, 2, 4, 5, 6, 8])
    ddf_y = dd.repartition(pdf_y, [1, 2, 5, 8])

    # Largest division found on either input.
    overall_max = max(ddf_x.divisions + ddf_y.divisions)

    out = join_indexed_dataframes(ddf_x, ddf_y, how="left")
    assert out.divisions[0] == ddf_x.divisions[0]
    assert out.divisions[-1] == overall_max
    assert eq(out, pdf_x.join(pdf_y))

    out = join_indexed_dataframes(ddf_x, ddf_y, how="right")
    assert out.divisions[0] == ddf_y.divisions[0]
    assert out.divisions[-1] == ddf_y.divisions[-1]
    assert eq(out, pdf_x.join(pdf_y, how="right"))

    out = join_indexed_dataframes(ddf_x, ddf_y, how="inner")
    assert out.divisions[0] == 1
    assert out.divisions[-1] == overall_max
    assert eq(out.compute(), pdf_x.join(pdf_y, how="inner"))

    out = join_indexed_dataframes(ddf_x, ddf_y, how="outer")
    assert out.divisions[0] == 1
    assert out.divisions[-1] == 8
    assert eq(out.compute(), pdf_x.join(pdf_y, how="outer"))

    # Graph keys: equal for identical calls, different across join types.
    inner_keys_a = sorted(join_indexed_dataframes(ddf_x, ddf_y, how="inner").dask)
    inner_keys_b = sorted(join_indexed_dataframes(ddf_x, ddf_y, how="inner").dask)
    assert inner_keys_a == inner_keys_b

    inner_keys_c = sorted(join_indexed_dataframes(ddf_x, ddf_y, how="inner").dask)
    outer_keys = sorted(join_indexed_dataframes(ddf_x, ddf_y, how="outer").dask)
    assert inner_keys_c != outer_keys
Exemple #11
0
def test_merge(how):
    """Compare ``dd.merge`` with ``pd.merge`` for the join type ``how``.

    Covers index-on-index merges, merging on a shared column, merging on
    differently named columns, default key inference with mixed
    dask/pandas inputs, and column-on-index merges, each with and without
    custom suffixes where applicable.
    """
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    # Assert on the comparison result, like every other ``eq`` call in this
    # file, so a falsy return cannot pass silently.
    assert eq(dd.merge(a, b, left_index=True, right_index=True),
              pd.merge(A, B, left_index=True, right_index=True))

    result = dd.merge(a, b, on='y', how=how)
    list_eq(result, pd.merge(A, B, on='y', how=how))
    # A merge on a column is expected to leave every division unset.
    assert all(d is None for d in result.divisions)

    list_eq(dd.merge(a, b, left_on='x', right_on='z', how=how),
            pd.merge(A, B, left_on='x', right_on='z', how=how))
    list_eq(
        dd.merge(a, b, left_on='x', right_on='z', how=how,
                 suffixes=('1', '2')),
        pd.merge(A, B, left_on='x', right_on='z', how=how,
                 suffixes=('1', '2')))

    # Default keys, for every mix of dask and pandas inputs.
    list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))

    # Index-on-index merges.
    list_eq(dd.merge(a, b, left_index=True, right_index=True, how=how),
            pd.merge(A, B, left_index=True, right_index=True, how=how))
    list_eq(
        dd.merge(a,
                 b,
                 left_index=True,
                 right_index=True,
                 how=how,
                 suffixes=('1', '2')),
        pd.merge(A,
                 B,
                 left_index=True,
                 right_index=True,
                 how=how,
                 suffixes=('1', '2')))

    # Left column against right index.
    list_eq(dd.merge(a, b, left_on='x', right_index=True, how=how),
            pd.merge(A, B, left_on='x', right_index=True, how=how))
    list_eq(
        dd.merge(a,
                 b,
                 left_on='x',
                 right_index=True,
                 how=how,
                 suffixes=('1', '2')),
        pd.merge(A,
                 B,
                 left_on='x',
                 right_index=True,
                 how=how,
                 suffixes=('1', '2')))
Exemple #12
0
def test_repartition_on_pandas_dataframe():
    """``dd.repartition`` on pandas objects yields the matching dask collection.

    A DataFrame input must produce a ``dd.DataFrame`` and a Series input a
    ``dd.Series``, each with the requested divisions and the original data.
    """
    wanted_divisions = (10, 20, 50, 60)
    pdf = pd.DataFrame(
        {"x": [1, 2, 3, 4, 5, 6], "y": list("abdabd")},
        index=[10, 20, 30, 40, 50, 60],
    )

    # (pandas input, expected dask collection type)
    cases = [(pdf, dd.DataFrame), (pdf.y, dd.Series)]
    for source, collection_type in cases:
        ddf = dd.repartition(source, divisions=list(wanted_divisions))
        assert isinstance(ddf, collection_type)
        assert ddf.divisions == wanted_divisions
        assert eq(ddf, source)
Exemple #13
0
def test_align_partitions():
    """Check ``align_partitions`` on frames with overlapping and disjoint indexes.

    The first part verifies the returned frames, the merged divisions and
    the per-partition key lists for two explicitly repartitioned frames.
    The loops then verify that aligning frames built with various
    partition counts leaves their contents unchanged, first with the
    default index and then with different string indexes.
    """
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                     index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(A, [10, 40, 60])

    B = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                     index=[30, 70, 80, 100])
    b = dd.repartition(B, [30, 80, 100])

    (aa, bb), divisions, L = align_partitions(a, b)
    assert isinstance(a, dd.DataFrame)
    assert isinstance(b, dd.DataFrame)
    # Also check the aligned outputs; previously only the inputs were
    # type-checked, which says nothing about what align_partitions returns.
    assert isinstance(aa, dd.DataFrame)
    assert isinstance(bb, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(L, list)
    # One entry in L per interval between consecutive divisions.
    assert len(divisions) == 1 + len(L)
    assert L == [[(aa._name, 0), (bb._name, 0)],
                 [(aa._name, 1), (bb._name, 1)],
                 [(aa._name, 2), (bb._name, 2)],
                 [(aa._name, 3), (bb._name, 3)],
                 [(aa._name, 4), (bb._name, 4)]]

    ldf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]})
    rdf = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                        'd': [7, 6, 5, 4, 3, 2, 1]})

    # Equal and unequal partition counts on the two sides.
    for lhs, rhs in [(dd.from_pandas(ldf, 1), dd.from_pandas(rdf, 1)),
                     (dd.from_pandas(ldf, 2), dd.from_pandas(rdf, 2)),
                     (dd.from_pandas(ldf, 2), dd.from_pandas(rdf, 3)),
                     (dd.from_pandas(ldf, 3), dd.from_pandas(rdf, 2))]:
        (lresult, rresult), div, parts = dd.multi.align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)

    # different index
    ldf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]},
                       index=list('abcdefg'))
    rdf = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                        'd': [7, 6, 5, 4, 3, 2, 1]},
                       index=list('fghijkl'))

    for lhs, rhs in [(dd.from_pandas(ldf, 1), dd.from_pandas(rdf, 1)),
                     (dd.from_pandas(ldf, 2), dd.from_pandas(rdf, 2)),
                     (dd.from_pandas(ldf, 2), dd.from_pandas(rdf, 3)),
                     (dd.from_pandas(ldf, 3), dd.from_pandas(rdf, 2))]:
        (lresult, rresult), div, parts = dd.multi.align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)
Exemple #14
0
def test_repartition_on_pandas_dataframe():
    """``dd.repartition`` accepts pandas objects and returns dask collections.

    Checks the resulting type, divisions and contents for both a
    DataFrame and one of its columns.
    """
    frame = pd.DataFrame(
        {'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
        index=[10, 20, 30, 40, 50, 60],
    )

    # DataFrame input
    dask_frame = dd.repartition(frame, divisions=[10, 20, 50, 60])
    assert isinstance(dask_frame, dd.DataFrame)
    assert dask_frame.divisions == (10, 20, 50, 60)
    assert eq(dask_frame, frame)

    # Series input
    dask_series = dd.repartition(frame.y, divisions=[10, 20, 50, 60])
    assert isinstance(dask_series, dd.Series)
    assert dask_series.divisions == (10, 20, 50, 60)
    assert eq(dask_series, frame.y)
def test_rearrange_by_column_with_narrow_divisions():
    """Rearranging on column 'x' must keep every row.

    The requested divisions (0, 2, 5) end at 5 although column 'x' holds
    values up to 6; the rearranged frame must still hold the same rows
    as the input.
    """
    from dask.dataframe.tests.test_multi import list_eq

    pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    ddf = dd.repartition(pdf, [0, 4, 5])

    rearranged = rearrange_by_divisions(ddf, 'x', (0, 2, 5))
    list_eq(rearranged, ddf)
Exemple #16
0
def test_rearrange_by_column_with_narrow_divisions():
    """Check ``rearrange_by_divisions`` with divisions narrower than the data.

    Column 'x' ranges from 1 to 6 while the target divisions are
    (0, 2, 5); the result is compared with the input via ``list_eq``.
    """
    from dask.dataframe.tests.test_multi import list_eq

    source_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6],
                               'y': [1, 1, 2, 2, 3, 4]})
    source = dd.repartition(source_pdf, [0, 4, 5])

    target_divisions = (0, 2, 5)
    shuffled = rearrange_by_divisions(source, 'x', target_divisions)
    list_eq(shuffled, source)
Exemple #17
0
def test_merge(how, shuffle):
    """Compare ``dd.merge`` with ``pd.merge`` for ``how`` and ``shuffle``.

    Covers index-on-index merges, merging on a shared column, merging on
    differently named columns, default key inference with mixed
    dask/pandas inputs, and column-on-index merges, each with and without
    custom suffixes where applicable.
    """
    A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    # Assert on the comparison result, like every other ``eq`` call in this
    # file, so a falsy return cannot pass silently.
    assert eq(
        dd.merge(a, b, left_index=True, right_index=True, shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True),
    )

    # This merge is called without ``shuffle``, as in the original test.
    result = dd.merge(a, b, on="y", how=how)
    list_eq(result, pd.merge(A, B, on="y", how=how))
    # A merge on a column is expected to leave every division unset.
    assert all(d is None for d in result.divisions)

    list_eq(
        dd.merge(a, b, left_on="x", right_on="z", how=how, shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_on="z", how=how),
    )
    list_eq(
        dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
    )

    # Default keys, for every mix of dask and pandas inputs.
    list_eq(dd.merge(a, b, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(a, B, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, b, how=how, shuffle=shuffle), pd.merge(A, B, how=how))
    list_eq(dd.merge(A, B, how=how, shuffle=shuffle), pd.merge(A, B, how=how))

    # Index-on-index merges.
    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how, shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True, how=how),
    )
    list_eq(
        dd.merge(a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")),
    )

    # Left column against right index.
    list_eq(
        dd.merge(a, b, left_on="x", right_index=True, how=how, shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_index=True, how=how),
    )
    list_eq(
        dd.merge(a, b, left_on="x", right_index=True, how=how, suffixes=("1", "2"), shuffle=shuffle),
        pd.merge(A, B, left_on="x", right_index=True, how=how, suffixes=("1", "2")),
    )
Exemple #18
0
def test_indexed_concat(join):
    """Compare ``concat_indexed_dataframes`` with ``pd.concat`` for ``join``.

    Also checks that graph keys are equal for identical calls and differ
    between the inner and outer variants.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7], 'y': list('abcdef')},
                            index=[1, 2, 3, 4, 6, 7])
    left = dd.repartition(left_pdf, [1, 4, 7])

    right_pdf = pd.DataFrame({'x': [10, 20, 40, 50, 60, 80]},
                             index=[1, 2, 4, 5, 6, 8])
    right = dd.repartition(right_pdf, [1, 2, 5, 8])

    assert eq(concat_indexed_dataframes([left, right], join=join),
              pd.concat([left_pdf, right_pdf], axis=0, join=join))

    # Graph keys: equal for identical calls.
    keys_one = sorted(concat_indexed_dataframes([left, right], join=join).dask)
    keys_two = sorted(concat_indexed_dataframes([left, right], join=join).dask)
    assert keys_one == keys_two

    # Graph keys: different between join types.
    keys_inner = sorted(concat_indexed_dataframes([left, right], join='inner').dask)
    keys_outer = sorted(concat_indexed_dataframes([left, right], join='outer').dask)
    assert keys_inner != keys_outer
Exemple #19
0
def test_merge(how, shuffle):
    """Compare ``dd.merge`` with ``pd.merge`` for ``how`` and ``shuffle``.

    Each key specification is checked with and without custom suffixes;
    the default key inference is checked for every mix of dask and pandas
    inputs.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    left = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    right = dd.repartition(right_pdf, [0, 2, 5])

    def check_merge(lhs, rhs, **merge_kwargs):
        # dask merge (with ``shuffle``) must match pandas on the raw frames.
        list_eq(dd.merge(lhs, rhs, shuffle=shuffle, **merge_kwargs),
                pd.merge(left_pdf, right_pdf, **merge_kwargs))

    # Without and with custom suffixes, in that order.
    suffix_variants = ({}, {'suffixes': ('1', '2')})

    assert_eq(dd.merge(left, right, left_index=True, right_index=True,
                       how=how, shuffle=shuffle),
              pd.merge(left_pdf, right_pdf, left_index=True, right_index=True,
                       how=how))

    # This merge is called without ``shuffle``, as in the original test.
    merged_on_y = dd.merge(left, right, on='y', how=how)
    list_eq(merged_on_y, pd.merge(left_pdf, right_pdf, on='y', how=how))
    assert all(d is None for d in merged_on_y.divisions)

    for extra in suffix_variants:
        check_merge(left, right, left_on='x', right_on='z', how=how, **extra)

    # Default keys, for every mix of dask and pandas inputs.
    for lhs, rhs in [(left, right), (left, right_pdf),
                     (left_pdf, right), (left_pdf, right_pdf)]:
        check_merge(lhs, rhs, how=how)

    for extra in suffix_variants:
        check_merge(left, right, left_index=True, right_index=True, how=how,
                    **extra)

    for extra in suffix_variants:
        check_merge(left, right, left_on='x', right_index=True, how=how,
                    **extra)
Exemple #20
0
def test_align_partitions():
    """Check the divisions and partition keys returned by ``align_partitions``.

    The two inputs only partly overlap, so the expected key list holds
    ``None`` wherever one side has no partition for an interval.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                            index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(left_pdf, [10, 40, 60])

    right_pdf = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                             index=[30, 70, 80, 100])
    b = dd.repartition(right_pdf, [30, 80, 100])

    aligned, divisions, parts = align_partitions(a, b)
    aa, bb = aligned

    for frame in (a, b):
        assert isinstance(frame, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(parts, list)
    # One entry per interval between consecutive divisions.
    assert len(divisions) == 1 + len(parts)

    expected_parts = [
        [(aa._name, 0), None],
        [(aa._name, 1), (bb._name, 0)],
        [(aa._name, 2), (bb._name, 1)],
        [None, (bb._name, 2)],
        [None, (bb._name, 3)],
    ]
    assert parts == expected_parts
Exemple #21
0
def test_align_partitions():
    """Verify ``align_partitions`` output for two partly overlapping frames.

    Checks the merged divisions and the list of per-interval partition
    keys, where ``None`` marks a side with no partition for that interval.
    """
    pdf_a = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                         index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(pdf_a, [10, 40, 60])

    pdf_b = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                         index=[30, 70, 80, 100])
    b = dd.repartition(pdf_b, [30, 80, 100])

    (aa, bb), divisions, keys = align_partitions(a, b)

    assert isinstance(a, dd.DataFrame)
    assert isinstance(b, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(keys, list)
    # One entry per interval between consecutive divisions.
    assert len(divisions) == 1 + len(keys)

    # Left side covers the first three intervals, right side the last four.
    left_keys = [(aa._name, 0), (aa._name, 1), (aa._name, 2), None, None]
    right_keys = [None, (bb._name, 0), (bb._name, 1), (bb._name, 2),
                  (bb._name, 3)]
    assert keys == [[lkey, rkey] for lkey, rkey in zip(left_keys, right_keys)]
Exemple #22
0
def test_indexed_concat():
    """Compare ``concat_indexed_dataframes`` with ``pd.concat`` for inner/outer joins.

    The computed result must have the same columns and the same
    (row values, index label) pairs as pandas, ignoring row order.
    Finally the graph keys are checked: two identical calls must produce
    the same keys, and an inner concat must differ from an outer one.
    """
    A = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7], 'y': list('abcdef')},
                     index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({'x': [10, 20, 40, 50, 60, 80]},
                     index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    for how in ['inner', 'outer']:
        c = concat_indexed_dataframes([a, b], join=how)

        result = c.compute()
        # ``axis`` and ``join`` are passed by keyword: pandas >= 2.0 accepts
        # only ``objs`` positionally.
        expected = pd.concat([A, B], axis=0, join=how)

        assert list(result.columns) == list(expected.columns)

        # Order-insensitive comparison of (row values, index label) pairs.
        assert sorted(zip(result.values.tolist(), result.index.values.tolist())) == \
               sorted(zip(expected.values.tolist(), expected.index.values.tolist()))

    # Graph keys must be deterministic and depend on the join type.
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) == \
           sorted(concat_indexed_dataframes([a, b], join='inner').dask)
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) != \
           sorted(concat_indexed_dataframes([a, b], join='outer').dask)
Exemple #23
0
def test_align_partitions():
    """Check ``align_partitions`` with frames, a scalar, and varied partitioning.

    First two frames are aligned and the returned frames, divisions and
    partition keys verified; then a ``Scalar`` is aligned between them,
    which must appear as ``None`` in every key row. The loops finally
    check that aligning frames built with several partition counts keeps
    their contents, for a default index and for different string indexes.
    """
    pdf_a = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                         index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(pdf_a, [10, 40, 60])

    pdf_b = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                         index=[30, 70, 80, 100])
    b = dd.repartition(pdf_b, [30, 80, 100])

    s = dd.core.Scalar({('s', 0): 10}, 's')

    def _check(orig_a, orig_b, new_a, new_b):
        # Reads ``divisions`` and ``keys`` from the enclosing scope at call
        # time, so it always sees the most recent alignment result.
        for frame in (orig_a, orig_b, new_a, new_b):
            assert isinstance(frame, dd.DataFrame)
        assert eq(orig_a, new_a)
        assert eq(orig_b, new_b)
        assert divisions == (10, 30, 40, 60, 80, 100)
        assert isinstance(keys, list)
        assert len(divisions) == 1 + len(keys)

    (aa, bb), divisions, keys = align_partitions(a, b)
    _check(a, b, aa, bb)
    assert keys == [[(aa._name, i), (bb._name, i)] for i in range(5)]

    (aa, ss, bb), divisions, keys = align_partitions(a, s, b)
    _check(a, b, aa, bb)
    # The scalar contributes ``None`` to every row.
    assert keys == [[(aa._name, i), None, (bb._name, i)] for i in range(5)]
    assert eq(ss, 10)

    # (left npartitions, right npartitions) combinations to align.
    partition_counts = [(1, 1), (2, 2), (2, 3), (3, 2)]

    ldf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]})
    rdf = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                        'd': [7, 6, 5, 4, 3, 2, 1]})

    for lhs, rhs in [(dd.from_pandas(ldf, nleft), dd.from_pandas(rdf, nright))
                     for nleft, nright in partition_counts]:
        (lresult, rresult), div, parts = dd.multi.align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)

    # different index
    ldf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]},
                       index=list('abcdefg'))
    rdf = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                        'd': [7, 6, 5, 4, 3, 2, 1]},
                       index=list('fghijkl'))

    for lhs, rhs in [(dd.from_pandas(ldf, nleft), dd.from_pandas(rdf, nright))
                     for nleft, nright in partition_counts]:
        (lresult, rresult), div, parts = dd.multi.align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)
def ddf_right(df_right):
    """Return ``df_right`` repartitioned into 10 partitions.

    The divisions deliberately skip 3; per the original comment this is
    meant to produce one mismatch with ``ddf_left`` (not shown here).
    """
    # 11 boundaries -> 10 partitions; note there is no boundary at 3.
    divisions = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11]
    return dd.repartition(df_right, divisions)
Exemple #25
0
def ddf_right(df_right):
    """Repartition ``df_right`` along ten partitions whose divisions omit 3.

    The omission is intentional: the original comment says it creates a
    single mismatch with ``ddf_left`` (defined outside this snippet).
    """
    boundaries = [0, 1, 2]
    # Division 3 is skipped on purpose.
    boundaries += [4, 5, 6, 7, 8, 9, 10, 11]
    return dd.repartition(df_right, boundaries)
Exemple #26
0
def test_align_partitions():
    """Check ``align_partitions`` with frames, a scalar, and varied partitioning.

    Verifies the returned frames, divisions and partition keys for two
    frames, then for the same frames with a ``Scalar`` between them (the
    scalar shows up as ``None`` in each key row). Finally checks that
    aligning frames created with several partition counts preserves their
    contents, with a default index and with different string indexes.
    """
    left_pdf = pd.DataFrame(
        {"x": [1, 2, 3, 4, 5, 6], "y": list("abdabd")},
        index=[10, 20, 30, 40, 50, 60],
    )
    a = dd.repartition(left_pdf, [10, 40, 60])

    right_pdf = pd.DataFrame(
        {"x": [1, 2, 3, 4], "y": list("abda")},
        index=[30, 70, 80, 100],
    )
    b = dd.repartition(right_pdf, [30, 80, 100])

    s = dd.core.Scalar({("s", 0): 10}, "s", "i8")

    (aa, bb), divisions, keys = align_partitions(a, b)

    def _check(before_a, before_b, after_a, after_b):
        # ``divisions`` and ``keys`` come from the enclosing scope and are
        # looked up at call time, i.e. from the latest alignment.
        assert isinstance(before_a, dd.DataFrame)
        assert isinstance(before_b, dd.DataFrame)
        assert isinstance(after_a, dd.DataFrame)
        assert isinstance(after_b, dd.DataFrame)
        assert eq(before_a, after_a)
        assert eq(before_b, after_b)
        assert divisions == (10, 30, 40, 60, 80, 100)
        assert isinstance(keys, list)
        assert len(divisions) == 1 + len(keys)

    _check(a, b, aa, bb)
    expected_keys = []
    for i in range(5):
        expected_keys.append([(aa._name, i), (bb._name, i)])
    assert keys == expected_keys

    (aa, ss, bb), divisions, keys = align_partitions(a, s, b)
    _check(a, b, aa, bb)
    # The scalar contributes ``None`` to every row.
    expected_keys = []
    for i in range(5):
        expected_keys.append([(aa._name, i), None, (bb._name, i)])
    assert keys == expected_keys
    assert eq(ss, 10)

    def frame_pairs(left_frame, right_frame):
        # Dask versions of both frames for each partition-count combination.
        return [
            (dd.from_pandas(left_frame, 1), dd.from_pandas(right_frame, 1)),
            (dd.from_pandas(left_frame, 2), dd.from_pandas(right_frame, 2)),
            (dd.from_pandas(left_frame, 2), dd.from_pandas(right_frame, 3)),
            (dd.from_pandas(left_frame, 3), dd.from_pandas(right_frame, 2)),
        ]

    ldf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]})
    rdf = pd.DataFrame({"c": [1, 2, 3, 4, 5, 6, 7], "d": [7, 6, 5, 4, 3, 2, 1]})

    for lhs, rhs in frame_pairs(ldf, rdf):
        (lresult, rresult), div, parts = align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)

    # different index
    ldf = pd.DataFrame(
        {"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]},
        index=list("abcdefg"),
    )
    rdf = pd.DataFrame(
        {"c": [1, 2, 3, 4, 5, 6, 7], "d": [7, 6, 5, 4, 3, 2, 1]},
        index=list("fghijkl"),
    )

    for lhs, rhs in frame_pairs(ldf, rdf):
        (lresult, rresult), div, parts = align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)