Esempio n. 1
0
def test_string_join_non_key(str_data, num_cols, how, how_raise):
    """Self-merge frames carrying string non-key columns and compare to pandas.

    ``num_cols`` string columns (named ``0 .. num_cols - 1``) are filled with
    ``str_data`` and an integer column ``"a"`` is used as the join key.  Each
    frame is merged with a copy of itself using join type ``how``, and the
    cuDF result must equal the pandas result.

    ``how_raise`` is forwarded to ``raise_builder`` together with
    ``NotImplementedError``; presumably it marks join types for which that
    error is expected -- confirm against ``raise_builder``.
    """
    # One key value per string row (at most five rows are supported here).
    key_values = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for col in range(num_cols):
        pdf[col] = pd.Series(str_data, dtype="str")
        gdf[col] = Series(str_data, dtype="str")
    pdf["a"] = key_values
    gdf["a"] = key_values

    pdf_other = pdf.copy()
    gdf_other = gdf.copy()

    with raise_builder([how_raise], NotImplementedError):
        expect = pdf.merge(pdf_other, on=["a"], how=how)
        got = gdf.merge(gdf_other, on=["a"], how=how)

        # For empty results, normalise the pandas index and align the cuDF
        # column order with pandas before comparing.
        if not (len(expect) or len(got)):
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

        assert_eq(expect, got)
Esempio n. 2
0
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-join on disjoint integer keys with string value columns.

    The left frame has keys 1..5 and the right frame keys drawn from 6..10,
    so no key matches.  The right-hand string column is built from
    ``str_data_nulls``.  The cuDF result must equal the pandas result.
    """
    left_vals = ["a", "b", "c", "d", "e"]
    left_keys = [1, 2, 3, 4, 5]

    # One right-hand key per entry of the fixture data.
    right_keys = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    # pandas reference frames
    pdf = pd.DataFrame()
    pdf["vals"] = pd.Series(left_vals, dtype="str")
    pdf["key"] = left_keys

    pdf2 = pd.DataFrame()
    pdf2["vals"] = pd.Series(str_data_nulls, dtype="str")
    pdf2["key"] = pd.Series(right_keys, dtype="int64")

    # cuDF frames under test
    gdf = DataFrame()
    gdf["vals"] = Series(left_vals, dtype="str")
    gdf["key"] = left_keys

    gdf2 = DataFrame()
    gdf2["vals"] = Series(str_data_nulls, dtype="str")
    gdf2["key"] = Series(right_keys, dtype="int64")

    expect = pdf.merge(pdf2, on="key", how="left")
    got = gdf.merge(gdf2, on="key", how="left")

    # For empty results, normalise the pandas index and align the cuDF
    # column order with pandas before comparing.
    if not (len(expect) or len(got)):
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
Esempio n. 3
0
def test_string_join_non_key_nulls(str_data_nulls):
    """Check a left merge whose string value columns are not join keys.

    Left keys are 1..5 and right keys come from 6..10, so the key sets never
    overlap.  ``str_data_nulls`` supplies the right-hand ``'vals'`` column.
    The cuDF merge result is compared against the pandas merge result.
    """
    strings = ['a', 'b', 'c', 'd', 'e']
    keys = [1, 2, 3, 4, 5]
    # Trim the right-hand keys to the fixture length.
    keys_for_nulls = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    pdf, gdf = pd.DataFrame(), DataFrame()
    pdf['vals'] = pd.Series(strings, dtype='str')
    gdf['vals'] = Series(strings, dtype='str')
    pdf['key'] = keys
    gdf['key'] = keys

    pdf2, gdf2 = pd.DataFrame(), DataFrame()
    pdf2['vals'] = pd.Series(str_data_nulls, dtype='str')
    gdf2['vals'] = Series(str_data_nulls, dtype='str')
    pdf2['key'] = pd.Series(keys_for_nulls, dtype='int64')
    gdf2['key'] = Series(keys_for_nulls, dtype='int64')

    merge_kwargs = {'on': 'key', 'how': 'left'}
    expect = pdf.merge(pdf2, **merge_kwargs)
    got = gdf.merge(gdf2, **merge_kwargs)

    # When both results are empty, reset the pandas index and reorder the
    # cuDF columns to the pandas order before comparing.
    if len(expect) == 0:
        if len(got) == 0:
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

    assert_eq(expect, got)
Esempio n. 4
0
def test_dataframe_empty_merge():
    """Left-merging two empty frames on 'a' gives empty columns a, b and c."""
    left = DataFrame([('a', []), ('b', [])])
    right = DataFrame([('a', []), ('c', [])])

    merged = left.merge(right, on=['a'], how='left')
    wanted = DataFrame([('a', []), ('b', []), ('c', [])])

    assert_eq(wanted, merged)
Esempio n. 5
0
def test_dataframe_empty_merge():
    """Merge two zero-row frames sharing column "a" and check the result.

    The expected frame is empty and has the columns "a", "b" and "c".
    """
    empty = []
    lhs = DataFrame([("a", empty), ("b", empty)])
    rhs = DataFrame([("a", empty), ("c", empty)])

    expected = DataFrame([("a", []), ("b", []), ("c", [])])
    result = lhs.merge(rhs, how="left", on=["a"])

    assert_eq(expected, result)
Esempio n. 6
0
def test_dataframe_merge_on(on):
    """Left-merge two random frames on ``on`` and compare cuDF with pandas.

    Both the ``DataFrame.merge`` method and the ``cudf.merge`` function are
    exercised; each must agree with the pandas result, ignoring row and
    column order.

    Parameters
    ----------
    on
        Join key(s) passed unchanged to every ``merge`` call.  Supplied by
        the caller (presumably a pytest parametrization not visible here).
    """
    np.random.seed(0)

    # Make cuDF
    df_left = DataFrame()
    nelem = 500
    df_left['key1'] = np.random.randint(0, 40, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['left_val'] = np.arange(nelem)

    df_right = DataFrame()
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['right_val'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result (from pandas)
    pddf_joined = pddf_left.merge(pddf_right, on=on, how='left')
    joined_columns = list(pddf_joined.columns)

    # Test (from cuDF; doesn't check for ordering)
    join_result = df_left.merge(df_right, on=on, how='left')
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how='left')

    # Cast right-hand columns to float64 and fill nulls with NaN so they can
    # be compared with the pandas result.
    for result in (join_result, join_result_cudf):
        result['right_val'] = (
            result['right_val'].astype(np.float64).fillna(np.nan))
        for col in joined_columns:
            if '_y' in col:
                result[col] = result[col].astype(np.float64).fillna(np.nan)

    # Test dataframe equality (ignore order of rows and columns)
    cdf_result = (join_result.to_pandas()
                  .sort_values(joined_columns)
                  .reset_index(drop=True))

    pdf_result = (pddf_joined.sort_values(joined_columns)
                  .reset_index(drop=True))

    # pd.util.testing is deprecated since pandas 1.0 and removed in 2.0;
    # pd.testing is the public location of assert_frame_equal.
    pd.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True)

    merge_func_result_cdf = (join_result_cudf.to_pandas()
                             .sort_values(joined_columns)
                             .reset_index(drop=True))

    pd.testing.assert_frame_equal(merge_func_result_cdf,
                                  cdf_result,
                                  check_like=True)
Esempio n. 7
0
def test_dataframe_merge_no_common_column():
    """Merging without ``on`` must raise ValueError if no columns are shared.

    The left frame has columns key1/key2/left_val and the right frame has
    key3/key4/right_val, so there is no common column name to join on.
    """
    np.random.seed(0)
    size = 500

    # Left cuDF frame
    left = DataFrame()
    left['key1'] = np.random.randint(0, 40, size)
    left['key2'] = np.random.randint(0, 50, size)
    left['left_val'] = np.arange(size)

    # Right cuDF frame, with entirely different column names
    right = DataFrame()
    right['key3'] = np.random.randint(0, 30, size)
    right['key4'] = np.random.randint(0, 50, size)
    right['right_val'] = np.arange(size)

    with pytest.raises(ValueError) as excinfo:
        left.merge(right, how='left')
    excinfo.match('No common columns to perform merge on')
Esempio n. 8
0
def test_dataframe_merge_on_unknown_column():
    """Merging on a column present in neither frame must raise KeyError.

    The raised error is also required to mention the offending key name.
    """
    np.random.seed(0)
    size = 500

    # Left cuDF frame
    left = DataFrame()
    left['key1'] = np.random.randint(0, 40, size)
    left['key2'] = np.random.randint(0, 50, size)
    left['left_val'] = np.arange(size)

    # Right cuDF frame
    right = DataFrame()
    right['key1'] = np.random.randint(0, 30, size)
    right['key2'] = np.random.randint(0, 50, size)
    right['right_val'] = np.arange(size)

    with pytest.raises(KeyError) as excinfo:
        left.merge(right, on='bad_key', how='left')
    excinfo.match('bad_key')
Esempio n. 9
0
def test_dataframe_merge_no_common_column():
    """Expect ValueError from a key-less merge of frames with disjoint columns.

    Column names are key1/key2/left_val on the left and key3/key4/right_val
    on the right, so the merge has nothing to infer a join key from.
    """
    np.random.seed(0)
    n_rows = 500

    # Draw the random columns in a fixed order so the seeded data is stable.
    left_key1 = np.random.randint(0, 40, n_rows)
    left_key2 = np.random.randint(0, 50, n_rows)
    right_key3 = np.random.randint(0, 30, n_rows)
    right_key4 = np.random.randint(0, 50, n_rows)

    lhs = DataFrame()
    lhs["key1"] = left_key1
    lhs["key2"] = left_key2
    lhs["left_val"] = np.arange(n_rows)

    rhs = DataFrame()
    rhs["key3"] = right_key3
    rhs["key4"] = right_key4
    rhs["right_val"] = np.arange(n_rows)

    with pytest.raises(ValueError) as err:
        lhs.merge(rhs, how="left")
    err.match("No common columns to perform merge on")
Esempio n. 10
0
def test_dataframe_merge_on_unknown_column():
    """Expect KeyError naming the key when ``on`` refers to a missing column.

    Both frames have key1 and key2, but the merge is requested on "bad_key".
    """
    np.random.seed(0)
    n_rows = 500

    # Draw the random columns in a fixed order so the seeded data is stable.
    left_key1 = np.random.randint(0, 40, n_rows)
    left_key2 = np.random.randint(0, 50, n_rows)
    right_key1 = np.random.randint(0, 30, n_rows)
    right_key2 = np.random.randint(0, 50, n_rows)

    lhs = DataFrame()
    lhs["key1"] = left_key1
    lhs["key2"] = left_key2
    lhs["left_val"] = np.arange(n_rows)

    rhs = DataFrame()
    rhs["key1"] = right_key1
    rhs["key2"] = right_key2
    rhs["right_val"] = np.arange(n_rows)

    with pytest.raises(KeyError) as err:
        lhs.merge(rhs, on="bad_key", how="left")
    err.match("bad_key")
Esempio n. 11
0
def test_dataframe_merge_order():
    """Left hash-merge on ['id', 'a'] must give the same frame as pandas.

    Every column of the right frame is a join key, and none of its
    ('id', 'a') pairs occur in the left frame.  Judging by the test name,
    this guards the column order of the result -- confirm with the author.
    """
    # pandas reference
    df1 = pd.DataFrame()
    df1['id'] = [10, 11]
    df1['timestamp'] = [1, 2]
    df1['a'] = [3, 4]

    df2 = pd.DataFrame()
    df2['id'] = [4, 5]
    df2['a'] = [7, 8]

    df = df1.merge(df2, how='left', on=['id', 'a'])

    # cuDF frames holding the same data
    gdf1 = DataFrame()
    gdf1['id'] = [10, 11]
    gdf1['timestamp'] = [1, 2]
    gdf1['a'] = [3, 4]

    gdf2 = DataFrame()
    gdf2['id'] = [4, 5]
    gdf2['a'] = [7, 8]

    gdf = gdf1.merge(gdf2, how='left', on=['id', 'a'], method='hash')

    assert_eq(gdf, df)
Esempio n. 12
0
def test_dataframe_merge_order():
    """Compare a cuDF left hash-merge on ["id", "a"] with the pandas merge.

    The left frames hold id/timestamp/a and the right frames hold id/a, with
    no matching (id, a) pairs between them.
    """
    join_keys = ["id", "a"]

    # cuDF side
    gdf_left = DataFrame()
    gdf_left["id"] = [10, 11]
    gdf_left["timestamp"] = [1, 2]
    gdf_left["a"] = [3, 4]

    gdf_right = DataFrame()
    gdf_right["id"] = [4, 5]
    gdf_right["a"] = [7, 8]

    got = gdf_left.merge(gdf_right, how="left", on=join_keys, method="hash")

    # pandas side, same data
    pdf_left = pd.DataFrame()
    pdf_left["id"] = [10, 11]
    pdf_left["timestamp"] = [1, 2]
    pdf_left["a"] = [3, 4]

    pdf_right = pd.DataFrame()
    pdf_right["id"] = [4, 5]
    pdf_right["a"] = [7, 8]

    expect = pdf_left.merge(pdf_right, how="left", on=join_keys)

    assert_eq(got, expect)
Esempio n. 13
0
def test_dataframe_multi_column_join():
    """Left-join two random frames on ['key1', 'key2'] and compare to pandas.

    Both frames also carry a non-key column ``val1``.  The pandas reference
    is produced with ``sort=True``; the cuDF result is sorted by all columns
    and re-indexed before the comparison, so row order is not checked.
    """
    np.random.seed(0)

    # Make GDF
    df_left = DataFrame()
    nelem = 500
    df_left['key1'] = np.random.randint(0, 30, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['val1'] = np.arange(nelem)

    df_right = DataFrame()
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['val1'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result
    pddf_joined = pddf_left.merge(pddf_right,
                                  on=['key1', 'key2'],
                                  how='left',
                                  sort=True)
    joined_columns = list(pddf_joined.columns)

    # Test (doesn't check for ordering)
    join_result = df_left.merge(df_right, on=['key1', 'key2'], how='left')

    # Cast right-hand ('_y') columns to float64 and fill nulls with NaN so
    # they can be compared with the pandas result.
    for col in joined_columns:
        if '_y' in col:
            join_result[col] = (
                join_result[col].astype(np.float64).fillna(np.nan))

    got = (join_result.to_pandas()
           .sort_values(joined_columns)
           .reset_index(drop=True))

    # pd.util.testing is deprecated since pandas 1.0 and removed in 2.0;
    # pd.testing is the public location of assert_frame_equal.
    pd.testing.assert_frame_equal(got, pddf_joined)