Esempio n. 1
0
def test_hash_join():
    """Compare ``hash_join`` with ``pd.merge`` for each join type.

    Two small frames are split into partitions with ``dd.repartition``,
    joined on the shared ``'y'`` column for every ``how`` value, and the
    computed result is checked against the pandas merge.  A final outer
    join on differently named columns also checks that ``npartitions``
    is honoured.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    left_ddf = dd.repartition(left_pdf, [0, 4, 5])
    right_ddf = dd.repartition(right_pdf, [0, 2, 5])

    def sorted_rows(frame):
        # Missing values are replaced by 100 (a value absent from the
        # inputs) so the rows can be sorted and compared with ``==``.
        return sorted(frame.fillna(100).values.tolist())

    def check_same(actual, wanted):
        # Column order must match; row order is ignored.
        assert list(actual.columns) == list(wanted.columns)
        assert sorted_rows(actual) == sorted_rows(wanted)

    for how in ('inner', 'left', 'right', 'outer'):
        joined = hash_join(left_ddf, 'y', right_ddf, 'y', how)
        check_same(joined.compute(),
                   pd.merge(left_pdf, right_pdf, how=how, on='y'))

    # Different columns and npartitions
    joined = hash_join(left_ddf, 'x', right_ddf, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3

    check_same(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how='outer', left_on='x', right_on='z'),
    )
Esempio n. 2
0
def test_hash_join(how):
    """Compare ``hash_join`` with ``pd.merge`` for the join type ``how``.

    ``how`` is passed straight to both ``hash_join`` and ``pd.merge``.
    The test also covers an outer join on differently named columns with
    an explicit ``npartitions``, and checks that ``_name`` is equal for
    identical calls and different when the join type changes.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    left_ddf = dd.repartition(left_pdf, [0, 4, 5])
    right_ddf = dd.repartition(right_pdf, [0, 2, 5])

    joined = hash_join(left_ddf, 'y', right_ddf, 'y', how)
    list_eq(joined.compute(), pd.merge(left_pdf, right_pdf, how=how, on='y'))

    # Different columns and npartitions
    joined = hash_join(left_ddf, 'x', right_ddf, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3

    list_eq(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how='outer', left_on='x', right_on='z'),
    )

    def join_name(kind):
        # Build a fresh join on 'y' each time and report its ``_name``.
        return hash_join(left_ddf, 'y', right_ddf, 'y', kind)._name

    # Same arguments give the same name; a different join type does not.
    assert join_name('inner') == join_name('inner')
    assert join_name('inner') != join_name('outer')
Esempio n. 3
0
def test_hash_join(how, shuffle):
    """Compare ``hash_join`` with ``pd.merge`` for one join type and shuffle.

    Parameters
    ----------
    how
        Join type, passed to both ``hash_join`` and ``pd.merge``.
    shuffle
        Value forwarded to the ``shuffle`` keyword of every ``hash_join``
        call in this test.
        NOTE(review): presumably supplied by a pytest fixture or
        parametrization that is not visible here -- confirm.

    The test checks the join on the shared ``"y"`` column, an outer join
    on differently named columns with an explicit ``npartitions``, and
    that ``_name`` is equal for identical calls and different when the
    join type changes.
    """
    A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    # Forward ``shuffle`` here too, like every other call below, so the
    # main comparison runs with the requested shuffle value rather than
    # whatever ``hash_join`` uses by default.
    c = hash_join(a, "y", b, "y", how, shuffle=shuffle)

    result = c.compute()
    expected = pd.merge(A, B, how=how, on="y")
    list_eq(result, expected)

    # Different columns and npartitions
    c = hash_join(a, "x", b, "z", "outer", npartitions=3, shuffle=shuffle)
    assert c.npartitions == 3

    result = c.compute()
    expected = pd.merge(A, B, how="outer", left_on="x", right_on="z")

    list_eq(result, expected)

    # Same arguments give the same name; a different join type does not.
    assert (
        hash_join(a, "y", b, "y", "inner", shuffle=shuffle)._name
        == hash_join(a, "y", b, "y", "inner", shuffle=shuffle)._name
    )
    assert (
        hash_join(a, "y", b, "y", "inner", shuffle=shuffle)._name
        != hash_join(a, "y", b, "y", "outer", shuffle=shuffle)._name
    )
Esempio n. 4
0
def test_hash_join():
    """Compare ``hash_join`` with ``pd.merge`` and check ``_name`` values.

    Every join type is run on the shared ``'y'`` column and compared with
    the pandas merge.  An outer join on differently named columns checks
    that ``npartitions`` is honoured.  Finally, ``_name`` must be equal
    for identical calls and different when the join type changes.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    left_ddf = dd.repartition(left_pdf, [0, 4, 5])
    right_ddf = dd.repartition(right_pdf, [0, 2, 5])

    def sorted_rows(frame):
        # Missing values are replaced by 100 (a value absent from the
        # inputs) so the rows can be sorted and compared with ``==``.
        return sorted(frame.fillna(100).values.tolist())

    def check_same(actual, wanted):
        # Column order must match; row order is ignored.
        assert list(actual.columns) == list(wanted.columns)
        assert sorted_rows(actual) == sorted_rows(wanted)

    for how in ('inner', 'left', 'right', 'outer'):
        joined = hash_join(left_ddf, 'y', right_ddf, 'y', how)
        check_same(joined.compute(),
                   pd.merge(left_pdf, right_pdf, how=how, on='y'))

    # Different columns and npartitions
    joined = hash_join(left_ddf, 'x', right_ddf, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3

    check_same(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how='outer', left_on='x', right_on='z'),
    )

    def join_name(kind):
        # Build a fresh join on 'y' each time and report its ``_name``.
        return hash_join(left_ddf, 'y', right_ddf, 'y', kind)._name

    # Same arguments give the same name; a different join type does not.
    assert join_name('inner') == join_name('inner')
    assert join_name('inner') != join_name('outer')