예제 #1
0
def test_normalize_dataframe():
    """Check the frames produced by ``normalize.normalize_dataframe``.

    A ten-row roster table keyed on ``['team', 'jersey_num']`` is wrapped
    in a ``DepDF`` and normalized.  The test expects exactly three
    resulting frames, compared in order against hand-written tables.
    """
    # Columns shared by the input table and the first expected frame.
    teams = [
        'Red', 'Red', 'Red', 'Orange', 'Orange', 'Yellow', 'Yellow',
        'Green', 'Green', 'Blue'
    ]
    jersey_nums = [1, 2, 3, 1, 2, 1, 5, 8, 2, 2]
    player_names = ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']

    roster = pd.DataFrame({
        'team': teams,
        'jersey_num': jersey_nums,
        'player_name': player_names,
        'city': [
            'boston', 'boston', 'boston', 'chicago', 'chicago', 'honolulu',
            'honolulu', 'boston', 'boston', 'austin'
        ],
        'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX']
    })

    # Maps each column to the column groups listed as its determinants;
    # the second argument is the primary key.
    dependencies = classes.Dependencies(
        {
            'team': [['player_name', 'jersey_num']],
            'jersey_num': [['player_name', 'team']],
            'player_name': [['team', 'jersey_num']],
            'city': [['team'], ['state'], ['player_name', 'jersey_num']],
            'state': [['team'], ['player_name', 'jersey_num'], ['city']]
        }, ['team', 'jersey_num'])

    dep_frame = normalize.DepDF(dependencies, roster,
                                dependencies.get_prim_key())
    normalize.normalize_dataframe(dep_frame)
    new_dfs = dep_frame.return_dfs()

    assert len(new_dfs) == 3

    # (raw expected table, key columns) for each resulting frame, in order.
    # The raw tables deliberately contain repeated rows ('Blue'/'austin');
    # they are passed through drop_primary_dups before comparison.
    expectations = [
        (
            {
                'team': teams,
                'jersey_num': jersey_nums,
                'player_name': player_names
            },
            ['team', 'jersey_num'],
        ),
        (
            {
                'team': ['Red', 'Orange', 'Yellow', 'Green', 'Blue', 'Blue'],
                'city': [
                    'boston', 'chicago', 'honolulu', 'boston', 'austin',
                    'austin'
                ]
            },
            ['team'],
        ),
        (
            {
                'city': ['boston', 'chicago', 'honolulu', 'austin', 'austin'],
                'state': ['MA', 'IL', 'HI', 'TX', 'TX']
            },
            ['city'],
        ),
    ]

    for actual, (raw_table, key_cols) in zip(new_dfs, expectations):
        expected = normalize.drop_primary_dups(pd.DataFrame(raw_table),
                                               key_cols)
        assert actual.equals(expected)
예제 #2
0
def test_make_indexes():
    """Check the linking column added by ``normalize.make_indexes``.

    After normalizing a ten-row table keyed on ``id`` and calling
    ``make_indexes``, the first column of the second frame is used as the
    link: for each (month, hemisphere) pair, the value found in the second
    frame must equal the value stored in the first frame at the matching
    row labels.  The test also expects a column named
    ``'hemisphere_month'`` in both frames.
    """
    seasons = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        "month": [
            'dec', 'dec', 'jul', 'jul', 'dec', 'jul', 'jul', 'jul', 'dec',
            'jul'
        ],
        "hemisphere": ['N', 'N', 'N', 'N', 'S', 'S', 'S', 'S', 'S', 'N'],
        "is_winter": [
            True, True, False, False, False, True, True, True, False, False
        ]
    })

    dependencies = classes.Dependencies(
        {
            'id': [],
            'month': [['id'], ['hemisphere', 'is_winter']],
            'hemisphere': [['month', 'is_winter'], ['id']],
            'is_winter': [['month', 'hemisphere'], ['id']]
        }, ['id'])

    dep_frame = normalize.DepDF(dependencies, seasons,
                                dependencies.get_prim_key())
    normalize.normalize_dataframe(dep_frame)
    normalize.make_indexes(dep_frame)
    new_dfs = dep_frame.return_dfs()

    parent, child = new_dfs[0], new_dfs[1]
    link_col = child.columns[0]

    # (month, hemisphere, row labels in the first frame sharing that pair)
    groups = [
        ('dec', 'N', (0, 1)),
        ('jul', 'N', (2, 3, 9)),
        ('dec', 'S', (4, 8)),
        ('jul', 'S', (5, 6, 7)),
    ]

    for month, hemisphere, row_labels in groups:
        selector = (child['month'] == month) & (
            child['hemisphere'] == hemisphere)
        link_value = child[selector][link_col].iloc[0]
        for label in row_labels:
            assert parent[link_col][label] == link_value

    # Make sure new column names are sorted
    for frame in (parent, child):
        assert 'hemisphere_month' in frame.columns