def test_normalize_dataframe():
    """Check the tables that ``normalize.normalize_dataframe`` splits out.

    A roster table keyed on ``(team, jersey_num)`` is wrapped in a
    ``normalize.DepDF`` together with its declared dependencies and then
    normalized.  The test expects exactly three resulting tables, in this
    order: the player table keyed on ``(team, jersey_num)``, a team -> city
    table keyed on ``team``, and a city -> state table keyed on ``city``.

    Each expected table is passed through ``normalize.drop_primary_dups``
    with its key before comparison, so the duplicated rows in the expected
    literals below are intentional input to that helper.
    """
    teams = [
        'Red', 'Red', 'Red', 'Orange', 'Orange',
        'Yellow', 'Yellow', 'Green', 'Green', 'Blue',
    ]
    jersey_nums = [1, 2, 3, 1, 2, 1, 5, 8, 2, 2]
    player_names = ['A', 'B', 'C', 'D', 'A', 'E', 'B', 'A', 'G', 'H']

    roster = pd.DataFrame({
        'team': teams,
        'jersey_num': jersey_nums,
        'player_name': player_names,
        'city': [
            'boston', 'boston', 'boston', 'chicago', 'chicago',
            'honolulu', 'honolulu', 'boston', 'boston', 'austin',
        ],
        'state': ['MA', 'MA', 'MA', 'IL', 'IL', 'HI', 'HI', 'MA', 'MA', 'TX'],
    })

    dependencies = classes.Dependencies(
        {
            'team': [['player_name', 'jersey_num']],
            'jersey_num': [['player_name', 'team']],
            'player_name': [['team', 'jersey_num']],
            'city': [['team'], ['state'], ['player_name', 'jersey_num']],
            'state': [['team'], ['player_name', 'jersey_num'], ['city']],
        },
        ['team', 'jersey_num'])

    dep_frame = normalize.DepDF(
        dependencies, roster, dependencies.get_prim_key())
    normalize.normalize_dataframe(dep_frame)
    tables = dep_frame.return_dfs()

    assert len(tables) == 3

    # (expected column data, primary key) for each output table, in the
    # order the tables are returned.  Column order inside each dict matters
    # because DataFrame.equals compares column labels positionally.
    expected_tables = [
        (
            {
                'team': teams,
                'jersey_num': jersey_nums,
                'player_name': player_names,
            },
            ['team', 'jersey_num'],
        ),
        (
            {
                'team': ['Red', 'Orange', 'Yellow', 'Green', 'Blue', 'Blue'],
                'city': [
                    'boston', 'chicago', 'honolulu',
                    'boston', 'austin', 'austin',
                ],
            },
            ['team'],
        ),
        (
            {
                'city': ['boston', 'chicago', 'honolulu', 'austin', 'austin'],
                'state': ['MA', 'IL', 'HI', 'TX', 'TX'],
            },
            ['city'],
        ),
    ]

    for table, (columns, prim_key) in zip(tables, expected_tables):
        wanted = normalize.drop_primary_dups(pd.DataFrame(columns), prim_key)
        assert table.equals(wanted)
def test_make_indexes():
    """Check the key column that ``normalize.make_indexes`` links tables by.

    A ten-row table keyed on ``id`` is normalized and then passed to
    ``normalize.make_indexes``.  For every (month, hemisphere) pair, the
    value found in the first column of the second returned table must
    match the value stored under that same column name in the first
    returned table, for each row that originally held that pair.

    The test finally checks that both tables carry a column named
    ``hemisphere_month``, i.e. the combined name lists its parts in
    sorted order.
    """
    observations = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        "month": [
            'dec', 'dec', 'jul', 'jul', 'dec',
            'jul', 'jul', 'jul', 'dec', 'jul',
        ],
        "hemisphere": ['N', 'N', 'N', 'N', 'S', 'S', 'S', 'S', 'S', 'N'],
        "is_winter": [
            True, True, False, False, False,
            True, True, True, False, False,
        ],
    })

    dependencies = classes.Dependencies(
        {
            'id': [],
            'month': [['id'], ['hemisphere', 'is_winter']],
            'hemisphere': [['month', 'is_winter'], ['id']],
            'is_winter': [['month', 'hemisphere'], ['id']],
        },
        ['id'])

    dep_frame = normalize.DepDF(
        dependencies, observations, dependencies.get_prim_key())
    normalize.normalize_dataframe(dep_frame)
    normalize.make_indexes(dep_frame)

    tables = dep_frame.return_dfs()
    parent, child = tables[0], tables[1]

    # The first column of the second table is the one the first table is
    # expected to reference.
    key_col = child.columns[0]

    # (month, hemisphere, labels of the rows in `parent` holding that pair)
    cases = [
        ('dec', 'N', (0, 1)),
        ('jul', 'N', (2, 3, 9)),
        ('dec', 'S', (4, 8)),
        ('jul', 'S', (5, 6, 7)),
    ]
    for month, hemisphere, row_labels in cases:
        selector = (
            (child['month'] == month) & (child['hemisphere'] == hemisphere))
        expected_key = child[selector][key_col].iloc[0]
        for label in row_labels:
            assert parent[key_col][label] == expected_key

    # Make sure new column names are sorted
    assert 'hemisphere_month' in parent.columns
    assert 'hemisphere_month' in child.columns