Ejemplo n.º 1
0
def import_data_from_test_csv(db_session, db_uri):
    import pandas as pd
    from sqlalchemy import create_engine
    from app.models import City, Venue, Team, Official

    # read raw matches csv files
    df = pd.read_csv('test_files/matches.csv', parse_dates=['date'])

    # save cities and venues
    city_venue_df = df[['city', 'venue']].drop_duplicates()
    zip_city_venue = zip(city_venue_df['city'], city_venue_df['venue'])
    for cv in zip_city_venue:
        city = City()
        city.name = cv[0]
        db_session.add(city)
        db_session.commit()

        venue = Venue()
        venue.city_id = city.id
        venue.name = cv[1]
        db_session.add(venue)
        db_session.commit()

    # save teams
    team_set1 = set(df['team1'].drop_duplicates())
    team_set2 = set(df['team2'].drop_duplicates())
    all_teams = team_set1.union(team_set2)
    for team in all_teams:
        team_model = Team()
        team_model.name = team
        db_session.add(team_model)

    # save umpires
    umpire_set1 = set(df['umpire1'].drop_duplicates())
    umpire_set2 = set(df['umpire2'].drop_duplicates())
    all_umpires = umpire_set1.union(umpire_set2)
    for umpire in all_umpires:
        umpire_model = Official()
        umpire_model.name = umpire
        db_session.add(umpire_model)

    db_session.commit()

    # create a dict with key and name mappings for
    # cities, venues, teams and officials
    cities = db_session.query(City).all()
    cities_dict = {city.name: city.id for city in cities}
    venues = db_session.query(Venue).all()
    venues_dict = {venue.name: venue.id for venue in venues}
    teams = db_session.query(Team).all()
    teams_dict = {team.name: team.id for team in teams}
    officials = db_session.query(Official).all()
    officials_dict = {official.name: official.id for official in officials}

    # cities = df_matches['city'].drop_duplicates()
    # cities_dict = {x: y for y,x in enumerate(cities)}
    db_session.close()

    # change city, venue, team1, team2, umpire1, umpire2
    # names to their corresponding 'ids' from database
    df['city'] = df['city'].apply(lambda x: cities_dict[x])
    df['venue'] = df['venue'].apply(lambda x: venues_dict[x])
    df['team1'] = df['team1'].apply(lambda x: teams_dict[x])
    df['team2'] = df['team2'].apply(lambda x: teams_dict[x])
    df['toss_winner'] = df['toss_winner'].apply(lambda x: teams_dict[x])
    df['winner'] = df['winner'].apply(lambda x: teams_dict[x])
    df['umpire1'] = df['umpire1'].apply(lambda x: officials_dict[x])
    df['umpire2'] = df['umpire2'].apply(lambda x: officials_dict[x])

    # create sqlalchemy engine
    engine = create_engine(db_uri)

    # Insert dataframe into 'Matches' table
    df.to_sql('Matches', con=engine, if_exists='replace', index=False)
Ejemplo n.º 2
0
def load_csv_data_into_tables(db_session):
    from lib.src.d02_intermediate.clean_data import clean_null_values_in_csv
    from lib.src.d03_processing.get_cleaned_data import get_dataframe

    # clean the raw 'matches.csv' file
    clean_null_values_in_csv()

    # get pandas dataframe from the cleaned up csv
    df = get_dataframe()

    # save cities and venues
    city_venue_df = df[['city', 'venue']].drop_duplicates()
    zip_city_venue = zip(city_venue_df['city'], city_venue_df['venue'])
    for cv in zip_city_venue:
        city = City()
        city.name = cv[0]
        db_session.add(city)
        db_session.commit()

        venue = Venue()
        venue.city_id = city.id
        venue.name = cv[1]
        db_session.add(venue)
        # db_session.commit()

    # save teams
    teams_df = df['team1'].drop_duplicates()
    teams_list = [team for team in teams_df]
    for team in teams_list:
        team_model = Team()
        team_model.name = team
        db_session.add(team_model)

    # save umpires
    umpire_set1 = set(df['umpire1'].drop_duplicates())
    umpire_set2 = set(df['umpire2'].drop_duplicates())
    all_umpires = umpire_set1.union(umpire_set2)
    for umpire in all_umpires:
        umpire_model = Official()
        umpire_model.name = umpire
        db_session.add(umpire_model)

    db_session.commit()

    # create a dict with key and name mappings for
    # cities, venues, teams and officials
    cities = db_session.query(City).all()
    cities_dict = {city.name: city.id for city in cities}
    venues = db_session.query(Venue).all()
    venues_dict = {venue.name: venue.id for venue in venues}
    teams = db_session.query(Team).all()
    teams_dict = {team.name: team.id for team in teams}
    officials = db_session.query(Official).all()
    officials_dict = {official.name: official.id for official in officials}

    # cities = df_matches['city'].drop_duplicates()
    # cities_dict = {x: y for y,x in enumerate(cities)}
    # db_session.close()

    # change city, venue, team1, team2, umpire1, umpire2
    # names to their corresponding 'ids' from database
    df['city'] = df['city'].apply(lambda x: cities_dict[x])
    df['venue'] = df['venue'].apply(lambda x: venues_dict[x])
    df['team1'] = df['team1'].apply(lambda x: teams_dict[x])
    df['team2'] = df['team2'].apply(lambda x: teams_dict[x])
    df['toss_winner'] = df['toss_winner'].apply(lambda x: teams_dict[x])
    df['winner'] = df['winner'].apply(lambda x: teams_dict[x])
    df['umpire1'] = df['umpire1'].apply(lambda x: officials_dict[x])
    df['umpire2'] = df['umpire2'].apply(lambda x: officials_dict[x])

    # create sqlalchemy engine
    engine = db_session.get_bind()

    # Insert dataframe into 'Matches' table
    df.to_sql('Matches', con=engine, if_exists='replace', index=False)