Example #1
0
def test_dataframe_join_cats():
    """Join two frames on a categorical index and compare with pandas.

    Both sides are indexed by a categorical column 'a' (categories 'a', 'b',
    'c'). The joined result is compared against the pandas join of the
    converted frames, followed by a few rough sanity checks.
    """
    ldf = DataFrame()
    ldf['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    ldf['b'] = bb = np.arange(len(ldf))
    lhs = ldf.set_index('a')

    rdf = DataFrame()
    rdf['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rdf['c'] = cc = np.arange(len(rdf))
    rhs = rdf.set_index('a')

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining, so the index is
    # dropped on both sides and only the column data is compared.
    # pd.testing is the public home of assert_frame_equal; the old
    # pd.util.testing path is deprecated and removed in recent pandas.
    pd.testing.assert_frame_equal(
        got.sort_values(by='b').to_pandas().sort_index().reset_index(
            drop=True), expect.reset_index(drop=True))

    # Just do some rough checking here: each assertion below only requires
    # a non-empty overlap with the input values.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
Example #2
0
def test_dataframe_join_suffix():
    """Check suffix handling of a sorted join against the pandas result.

    Joining without suffixes must raise ``ValueError`` complaining about
    overlapping columns; with suffixes, the column names, index values and
    every column are compared against pandas.
    """
    np.random.seed(0)

    df = DataFrame()
    for name in ('a', 'b', 'c'):
        df[name] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')

    # Without suffixes the join is expected to be rejected.
    with pytest.raises(ValueError) as excinfo:
        left.join(right)
    excinfo.match("there are overlapping columns but lsuffix"
                  " and rsuffix are not defined")

    suffixes = dict(lsuffix='_left', rsuffix='_right')
    got = left.join(right, sort=True, **suffixes)

    # Build the reference result with pandas.
    pddf = df.to_pandas()
    pd_left = pddf.set_index('a')
    pd_right = pddf.set_index('c')
    expect = pd_left.join(pd_right, **suffixes)

    # Compare column names, index values, then each column in turn.
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for col in expect.columns:
        _check_series(expect[col], got[col])
Example #3
0
def test_dataframe_join_cats():
    """Roughly check a join of two frames indexed by a categorical column."""
    left_frame = DataFrame()
    left_frame['a'] = pd.Categorical(list('aababcabbc'),
                                     categories=list('abc'))
    left_vals = np.arange(len(left_frame))
    left_frame['b'] = left_vals
    lhs = left_frame.set_index('a')

    right_frame = DataFrame()
    right_frame['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    right_vals = np.arange(len(right_frame))
    right_frame['c'] = right_vals
    rhs = right_frame.set_index('a')

    joined = lhs.join(rhs)

    # Only rough checks are made here.
    # Note: pandas fails to join on categorical index.
    assert list(joined.columns) == ['b', 'c']
    assert len(joined) > 0
    # Each remaining check only requires a non-empty overlap with the inputs.
    assert set(joined.index.values).intersection('abc')
    assert set(joined['b']).intersection(left_vals)
    assert set(joined['c']).intersection(right_vals)
Example #4
0
def test_dataframe_join_cats():
    """Smoke-test joining two frames that share a categorical index 'a'."""
    # Left side: ten rows keyed by category, with a running counter in 'b'.
    left_src = DataFrame()
    left_src['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    b_values = np.arange(len(left_src))
    left_src['b'] = b_values
    lhs = left_src.set_index('a')

    # Right side: five rows keyed by category, with a running counter in 'c'.
    right_src = DataFrame()
    right_src['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    c_values = np.arange(len(right_src))
    right_src['c'] = c_values
    rhs = right_src.set_index('a')

    result = lhs.join(rhs)

    # Just do some rough checking here.
    # Note: pandas fails to join on categorical index.
    assert list(result.columns) == ['b', 'c']
    assert len(result) > 0
    index_overlap = set(result.index.values) & set('abc')
    assert index_overlap
    b_overlap = set(result['b']) & set(b_values)
    assert b_overlap
    c_overlap = set(result['c']) & set(c_values)
    assert c_overlap
Example #5
0
def test_df_cat_sort_index():
    """Sorting by a categorical index must agree with the pandas result."""
    gdf = DataFrame()
    gdf['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    gdf['b'] = np.arange(len(gdf))

    # Same operation on both sides: index by 'a', then sort by that index.
    indexed = gdf.set_index('a')
    actual = indexed.sort_index()

    pdf = gdf.to_pandas()
    expected = pdf.set_index('a').sort_index()

    assert list(expected.columns) == list(actual.columns)
    assert list(expected.index.values) == list(actual.index.values)
    np.testing.assert_array_equal(expected.index.values, actual.index.values)
    np.testing.assert_array_equal(expected['b'].values,
                                  actual['b'].to_array())
Example #6
0
def test_df_set_index_from_series():
    """set_index(Series) keeps all columns and supports label slicing."""
    frame = DataFrame()
    frame['a'] = list(range(10))
    frame['b'] = list(range(0, 20, 2))

    # Check set_index(Series): the source column stays in the frame.
    index_source = frame['b']
    reindexed = frame.set_index(index_source)
    assert list(reindexed.columns) == ['a', 'b']

    # Labels 2..6 on the even-valued index select 2, 4 and 6.
    window = reindexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]
Example #7
0
def test_nonmatching_index_setitem(nrows):
    """Assigning a plain array to a frame with a custom index keeps values.

    The new column 'c' must have the same length as the assigned array and
    equal a Series of those values carrying the frame's index.
    """
    np.random.seed(0)
    upper = 2147483647

    gdf = DataFrame()
    # Draw order matters for reproducibility: 'a' first, then 'b'.
    for col in ('a', 'b'):
        gdf[col] = np.random.randint(upper, size=nrows)
    gdf = gdf.set_index('b')

    test_values = np.random.randint(upper, size=nrows)
    gdf['c'] = test_values
    assert len(test_values) == len(gdf['c'])

    actual = gdf['c'].to_pandas()
    expected = Series(test_values).set_index(gdf._index).to_pandas()
    assert actual.equals(expected)
Example #8
0
def test_df_set_index_from_name():
    """set_index(column_name) drops that column and supports label slicing."""
    frame = DataFrame()
    frame['a'] = list(range(10))
    frame['b'] = list(range(0, 20, 2))

    # Check set_index(column_name)
    reindexed = frame.set_index('b')
    print(reindexed)
    # 'b' became the index, so only 'a' remains as a column.
    remaining_columns = list(reindexed.columns)
    assert remaining_columns == ['a']

    # Labels 2..6 on the even-valued index select 2, 4 and 6.
    window = reindexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]
Example #9
0
def test_dataframe_join_suffix():
    """Check suffix handling of a join against the pandas result.

    A join without suffixes must raise ``ValueError`` about overlapping
    columns; with suffixes, the result is compared against pandas.
    """
    np.random.seed(0)

    df = DataFrame()
    for column in ('a', 'b', 'c'):
        df[column] = np.random.randint(0, 5, 5)

    left = df.set_index('a')
    right = df.set_index('c')

    # Joining without suffixes has to fail with the expected message.
    with pytest.raises(ValueError) as excinfo:
        left.join(right)
    excinfo.match("there are overlapping columns but lsuffix"
                  " and rsuffix are not defined")

    join_kwargs = {'lsuffix': '_left', 'rsuffix': '_right'}
    got = left.join(right, **join_kwargs)

    # Reference result from pandas using the same suffixes.
    pddf = df.to_pandas()
    expect = pddf.set_index('a').join(pddf.set_index('c'), **join_kwargs)

    # Column names, index values and per-column contents must all agree.
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for column in expect.columns:
        _check_series(expect[column], got[column])