def test_hash_join():
    """Check ``hash_join`` results against ``pd.merge`` on small frames.

    Two pandas frames are repartitioned into dask frames and joined on the
    shared ``'y'`` column with each of the four join types; a final outer
    join uses different key columns (``'x'`` / ``'z'``) and an explicit
    ``npartitions``.  Column order must match and the rows must be equal
    regardless of row order.
    """

    def _sorted_rows(frame):
        # NaN never compares equal to itself, so missing values are replaced
        # by a sentinel before the rows are sorted and compared.
        return sorted(frame.fillna(100).values.tolist())

    def _assert_same(actual, wanted):
        assert list(actual.columns) == list(wanted.columns)
        assert _sorted_rows(actual) == _sorted_rows(wanted)

    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    left_ddf = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    right_ddf = dd.repartition(right_pdf, [0, 2, 5])

    for how in ('inner', 'left', 'right', 'outer'):
        joined = hash_join(left_ddf, 'y', right_ddf, 'y', how)
        _assert_same(
            joined.compute(),
            pd.merge(left_pdf, right_pdf, how=how, on='y'),
        )

    # Different columns and npartitions
    joined = hash_join(left_ddf, 'x', right_ddf, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3
    _assert_same(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how='outer', left_on='x', right_on='z'),
    )
def test_hash_join(how):
    """Check ``hash_join`` against ``pd.merge`` for the join type ``how``.

    ``how`` is presumably injected by a pytest parametrization or fixture
    that is not visible in this block -- confirm against the decorator.

    Covers three things:
    * a join on the shared ``'y'`` column using ``how``;
    * an outer join on different key columns with ``npartitions=3``;
    * graph names: identical arguments give identical ``_name`` values,
      while a different join type gives a different one.

    Results are compared with the ``list_eq`` helper defined elsewhere.
    """
    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    left_ddf = dd.repartition(left_pdf, [0, 4, 5])
    right_ddf = dd.repartition(right_pdf, [0, 2, 5])

    joined = hash_join(left_ddf, 'y', right_ddf, 'y', how)
    list_eq(joined.compute(), pd.merge(left_pdf, right_pdf, how=how, on='y'))

    # Different columns and npartitions
    joined = hash_join(left_ddf, 'x', right_ddf, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3
    list_eq(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how='outer', left_on='x', right_on='z'),
    )

    # Graph names must be reproducible and must depend on the join type.
    inner_name = hash_join(left_ddf, 'y', right_ddf, 'y', 'inner')._name
    inner_name_again = hash_join(left_ddf, 'y', right_ddf, 'y', 'inner')._name
    outer_name = hash_join(left_ddf, 'y', right_ddf, 'y', 'outer')._name
    assert inner_name == inner_name_again
    assert inner_name != outer_name
def test_hash_join(how, shuffle):
    """Check ``hash_join`` against ``pd.merge`` for ``how`` and ``shuffle``.

    ``how`` and ``shuffle`` are presumably supplied by pytest
    parametrization or fixtures that are not visible in this block --
    confirm against the decorators / conftest.

    Covers three things:
    * a join on the shared ``"y"`` column using ``how``;
    * an outer join on different key columns with ``npartitions=3`` and
      the given ``shuffle``;
    * graph names: identical arguments give identical ``_name`` values,
      while a different join type gives a different one.

    Results are compared with the ``list_eq`` helper defined elsewhere.
    """
    left_pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
    right_pdf = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
    left_ddf = dd.repartition(left_pdf, [0, 4, 5])
    right_ddf = dd.repartition(right_pdf, [0, 2, 5])

    # NOTE(review): ``shuffle`` is not forwarded to this first join, unlike
    # the calls further down -- confirm that this is intentional.
    joined = hash_join(left_ddf, "y", right_ddf, "y", how)
    list_eq(joined.compute(), pd.merge(left_pdf, right_pdf, how=how, on="y"))

    # Different columns and npartitions
    joined = hash_join(
        left_ddf, "x", right_ddf, "z", "outer", npartitions=3, shuffle=shuffle
    )
    assert joined.npartitions == 3
    list_eq(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how="outer", left_on="x", right_on="z"),
    )

    # Graph names must be reproducible and must depend on the join type.
    inner_name = hash_join(
        left_ddf, "y", right_ddf, "y", "inner", shuffle=shuffle
    )._name
    inner_name_again = hash_join(
        left_ddf, "y", right_ddf, "y", "inner", shuffle=shuffle
    )._name
    outer_name = hash_join(
        left_ddf, "y", right_ddf, "y", "outer", shuffle=shuffle
    )._name
    assert inner_name == inner_name_again
    assert inner_name != outer_name
def test_hash_join():
    """Check ``hash_join`` results and graph naming on small frames.

    Two pandas frames are repartitioned into dask frames and joined on the
    shared ``'y'`` column with each of the four join types, then once more
    as an outer join on different key columns (``'x'`` / ``'z'``) with an
    explicit ``npartitions``.  Each result is compared with ``pd.merge``:
    column order must match and rows must be equal regardless of row
    order.  Finally the ``_name`` of the join is checked to be the same for
    identical arguments and different for a different join type.
    """

    def _sorted_rows(frame):
        # NaN never compares equal to itself, so missing values are replaced
        # by a sentinel before the rows are sorted and compared.
        return sorted(frame.fillna(100).values.tolist())

    def _assert_same(actual, wanted):
        assert list(actual.columns) == list(wanted.columns)
        assert _sorted_rows(actual) == _sorted_rows(wanted)

    left_pdf = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    left_ddf = dd.repartition(left_pdf, [0, 4, 5])

    right_pdf = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    right_ddf = dd.repartition(right_pdf, [0, 2, 5])

    for how in ('inner', 'left', 'right', 'outer'):
        joined = hash_join(left_ddf, 'y', right_ddf, 'y', how)
        _assert_same(
            joined.compute(),
            pd.merge(left_pdf, right_pdf, how=how, on='y'),
        )

    # Different columns and npartitions
    joined = hash_join(left_ddf, 'x', right_ddf, 'z', 'outer', npartitions=3)
    assert joined.npartitions == 3
    _assert_same(
        joined.compute(),
        pd.merge(left_pdf, right_pdf, how='outer', left_on='x', right_on='z'),
    )

    # Graph names must be reproducible and must depend on the join type.
    inner_name = hash_join(left_ddf, 'y', right_ddf, 'y', 'inner')._name
    inner_name_again = hash_join(left_ddf, 'y', right_ddf, 'y', 'inner')._name
    outer_name = hash_join(left_ddf, 'y', right_ddf, 'y', 'outer')._name
    assert inner_name == inner_name_again
    assert inner_name != outer_name