    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Read in Overage Data
    csv_name = "feburaryaddedutc"
    trips_df = pd.read_csv(os.path.join("data", csv_name + ".csv"), index_col=[0], dtype=str)
    # Drop duplicate records by Operator and TripID
    trips_df = trips_df.drop_duplicates(["Operator", "TripID"])
    print(len(trips_df['TripID']))
    # Fill missing with 0
    trips_df.fillna(0, inplace=True)
    # Create a new trip_id that includes the first letter of the operator name
    trips_df['UniqueTripID'] = trips_df['Operator'].str[0] + trips_df['TripID']
    # Drop records without an Operator
    trips_df = trips_df[trips_df['Operator'] != 0]
    # Create clean version of operator
    trips_df['OperatorClean'] = trips_df['Operator'].map(lambda x: x.split(" ")[0].lower().replace('limebike', 'lime'))
    # Set enddate equal to startdate if missing
    trips_df['EndDate'] = np.where(trips_df['EndDate'] == 0, trips_df['StartDate'], trips_df['EndDate'])
    trips_df['endposct'] = np.where(trips_df['endposct'] == 0, trips_df['posct'], trips_df['endposct'])
    trips_df['endutc'] = np.where(trips_df['endutc'] == 0, trips_df['startutc'], trips_df['endutc'])
    # Output final dataframe
    outname = csv_name + "pipe_delimited"
    trips_df.to_csv(os.path.join("data", outname + ".csv"), index=False, sep='|')
    # Create Database
    create_dockless_trips_org(cur)
    # Load to Database
    uf.aws_load(outname, "dockless_trips_org", cur)
    # Commit changes to database
    conn.commit()
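
Every example on this page leans on the same shared helpers imported as uf: set_env_path() presumably loads the database credentials into the environment, aws_connect() opens the PostgreSQL connection and returns (conn, cur), and aws_load() bulk-loads the pipe-delimited CSV into the named table. Their real implementations are not shown here; the code below is only a plausible sketch, with the environment-variable names and file locations assumed.

import os
import psycopg2


def aws_connect():
    # Hypothetical sketch: the credential env-var names are assumptions, not the project's.
    conn = psycopg2.connect(host=os.environ["AWS_HOST"],
                            dbname=os.environ["AWS_DB"],
                            user=os.environ["AWS_USER"],
                            password=os.environ["AWS_PW"])
    return conn, conn.cursor()


def aws_load(outname, table, cur):
    # Hypothetical sketch: assumes the pipe-delimited CSV written by the calling script
    # lives in data/ (a few examples write to the working directory instead).
    with open(os.path.join("data", outname + ".csv"), "r") as f:
        cur.copy_expert("COPY " + table +
                        " FROM STDIN WITH (FORMAT csv, HEADER true, DELIMITER '|')", f)

The remaining examples assume these helpers are importable as uf.
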
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Loop through all CSVs in cabi trip data folder
    cabi_trip_dir = r'./data/Cabi_Trips_MemberType'

    # Convert trip data from CSV to dataframe
    combined_df = trips_to_df(cabi_trip_dir)

    # Sort by StartDate and Add trip_id
    combined_df.sort_values(['Startdate'], inplace=True)
    combined_df.reset_index(inplace=True)
    combined_df.drop(['index'], axis=1, inplace=True)

    combined_df['trip_id'] = combined_df.index + 1

    # Output dataframe as CSV
    outname = "Cabi_Trips_MemberType"
    combined_df.to_csv(os.path.join("data", outname + ".csv"),
                       index=False,
                       sep='|')

    # Create Table in AWS
    create_cabi_trips_membertype(cur)

    # Load to Database
    uf.aws_load(outname, "cabi_trips_membertype", cur)

    # Commit changes to database
    conn.commit()
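
trips_to_df() is defined elsewhere in the project. A minimal sketch, under the assumption that it simply reads every CSV in the given folder and stacks them:

import glob
import os
import pandas as pd


def trips_to_df(trip_dir):
    # Hypothetical sketch: the real helper may also rename or clean columns.
    frames = [pd.read_csv(f) for f in glob.glob(os.path.join(trip_dir, "*.csv"))]
    return pd.concat(frames, axis=0, ignore_index=True)
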
Example #3
    conn, cur = uf.aws_connect()

    # Gather Outage data
    start_date = datetime.datetime(2018, 4, 1)
    end_date = datetime.datetime(2018, 4, 30)
    dates = date_list(start_date, end_date)
    df_list = gather_out_data(dates)

    # Combine daily dataframes
    combined_df = pd.concat(df_list, axis=0)

    # Drop any fields that do not have numeric terminal number
    combined_df = combined_df[combined_df['Terminal Number'].astype(str).str.isdigit()]

    # Add outage_id continuing from last record in AWS table
    out_id_df = pd.read_sql("""SELECT outage_id from cabi_out_hist order by outage_id desc LIMIT 1 """, con=conn)
    last_outage_id = out_id_df['outage_id'].iloc[0]
    combined_df.reset_index(inplace=True)
    combined_df['outage_id'] = combined_df.index + 1 + last_outage_id
    # Drop unneeded fields
    combined_df.drop(['index', 'Station Name'], axis=1, inplace=True)
    # Output dataframe as CSV
    outname = "CaBi_Tracker_Outage_History_From_" + start_date.strftime('%Y-%m-%d') + "_To_" + end_date.strftime('%Y-%m-%d')
    combined_df.to_csv(os.path.join("data", outname + ".csv"), index=False, sep='|')
    # Create Database
    create_cabi_out_hist(cur)
    # Load to Database
    uf.aws_load(outname, "cabi_out_hist", cur)
    # Commit changes to database
    conn.commit()
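
date_list() and gather_out_data() are also project helpers. The date helper is presumably just an inclusive day range; a sketch:

import datetime


def date_list(start_date, end_date):
    # Hypothetical sketch: one datetime per day from start_date through end_date.
    days = (end_date - start_date).days
    return [start_date + datetime.timedelta(days=i) for i in range(days + 1)]
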
Example #4
    summary_loc = os.path.join("data", "Summary data.xlsx")
    summary_df = pd.read_excel(summary_loc, sheet_name="Summary")

    # Lowercase Operator and replace limebike with lime
    summary_df['Operator'] = summary_df['Operator'].str.lower().str.replace(
        'limebike', 'lime').str.split(' ').str[0]

    # Standardize month
    summary_df['Month'] = summary_df['Month'].apply(
        lambda x: convert_month_toDT(x, cur))

    # Keep only Trips and Bikes
    keep_cols = ['Operator', 'Month', 'TotalTrips', 'TotalBikes']
    summary_df = summary_df[keep_cols]

    # Drop records missing TotalBikes
    summary_df = summary_df[summary_df['TotalBikes'].notnull()]
    # Output dataframe as CSV
    outname = "dockless_summary"
    summary_df.to_csv(os.path.join("data", outname + ".csv"),
                      index=False,
                      sep='|')

    # Create Table in AWS
    create_dockless_summary(cur)
    # Load to Database
    uf.aws_load(outname, "dockless_summary", cur)

    # Commit changes to database
    conn.commit()
def create_cabi_system(cur):
    # This script creates the CaBi System AWS table
    cur.execute("""
    DROP TABLE cabi_system;
    CREATE TABLE cabi_system(
        region_id serial PRIMARY KEY,
        name varchar(100),
        code text
    )
            """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Pull CaBi System Regions
    regions_df = pull_region_info()
    # Merge on region code
    regions_df = regions_df.merge(region_code(), on='name', how='left')
    regions_df = regions_df[['region_id', 'name', 'code']]
    # Output dataframe as CSV
    outname = "CaBi_System"
    regions_df.to_csv(outname + ".csv", index=False, sep='|')
    # Create Table
    create_cabi_system(cur)
    # Load to Database
    uf.aws_load(outname, "cabi_system", cur)
    # Commit changes to database
    conn.commit()
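
pull_region_info() most likely reads Capital Bikeshare's public GBFS feed; the URL and field names below follow the standard GBFS system_regions layout and are an assumption, not taken from this example.

import pandas as pd
import requests

# Assumed GBFS endpoint; the project's actual source for region info is not shown.
GBFS_REGIONS_URL = "https://gbfs.capitalbikeshare.com/gbfs/en/system_regions.json"


def pull_region_info():
    regions = requests.get(GBFS_REGIONS_URL).json()["data"]["regions"]
    return pd.DataFrame(regions)  # columns include region_id and name
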
Example #6
        polygon geography);
            """)


if __name__ == '__main__':
    # Set environmental variable
    uf.set_env_path()
    # Connect to AWS
    conn, cur = uf.aws_connect()
    # Download DC ANCs and Neighborhood Clusters From Open Data DC Website as JSONs
    json_id_dict = {
        'ngh': "f6c703ebe2534fc3800609a07bad8f5b_17",
        'anc': "fcfbf29074e549d8aff9b9c708179291_1"
    }
    for json_name, json_id in json_id_dict.items():
        combined_df = extract_json(json_id)
        # Output dataframe as CSV
        outname = "json_name"
        combined_df.to_csv(os.path.join("data", outname + ".csv"),
                           index=False,
                           sep='|')
        # Create Database
        if json_name == 'ngh':
            create_ngh(cur)
        else:
            create_anc(cur)
        # Load to Database
        uf.aws_load(outname, json_name, cur)
        # Commit changes to database
        conn.commit()
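
extract_json() is not shown either. Given that the dataset ids look like Open Data DC (ArcGIS Hub) identifiers, a sketch that flattens the feature properties could look like the following; the download URL pattern is an assumption, and the real helper evidently also keeps each feature's polygon, given the polygon geography column created above.

import pandas as pd
import requests


def extract_json(json_id):
    # Hypothetical sketch: assumed ArcGIS open-data GeoJSON URL pattern.
    url = "https://opendata.arcgis.com/datasets/{}.geojson".format(json_id)
    features = requests.get(url).json()["features"]
    return pd.DataFrame([f["properties"] for f in features])
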
        name varchar(200),
        region_id integer,
        rental_methods varchar(200),
        rental_url varchar(200),
        short_name varchar(20),
        station_id integer PRIMARY KEY)
            """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Pull CaBi System Regions
    station_df = pull_station_info()
    # Default any missing region ids to DC (only example as of 2/21 is new station at Anacostia Park)
    station_df['region_id'] = np.where(station_df['region_id'].isnull(), 42,
                                       station_df['region_id'])
    station_df['region_id'] = station_df['region_id'].astype(int)
    # Output dataframe as CSV
    outname = "CaBi_Stations_Temp"
    station_df.to_csv(os.path.join("data", outname + ".csv"),
                      index=False,
                      sep='|')
    # Create Database
    create_cabi_stations_temp(cur)
    # Load to Database
    uf.aws_load(outname, "cabi_stations_temp", cur)
    # Commit changes to database
    conn.commit()
Example #8
    final_dfs = [second_df, cabi_df, dless_df]
    df_final = reduce(
        lambda left, right: pd.merge(
            left, right, how="left", left_index=True, right_index=True),
        final_dfs)
    df_final.drop_duplicates(inplace=True)

    # Define numeric columns for create table statement prior to resetting index to bring back date field
    numeric_cols = ", ".join([col + " numeric" for col in df_final.columns])
    df_final.fillna(0, inplace=True)
    df_final.reset_index(inplace=True)

    # Output final DataFrame
    outname = "final_db_pipe_delimited"
    df_final.to_csv(outname + ".csv", index=False, sep='|')

    # CREATE TABLE on AWS with and without timestamp
    TIMESTR = time.strftime("%Y%m%d_%H%M%S")
    db_name = "final_db_" + TIMESTR
    cur.execute(
        sql.SQL("CREATE TABLE {0}(date date PRIMARY KEY," + numeric_cols +
                ")").format(sql.Identifier(db_name)))
    cur.execute(
        "DROP TABLE final_db; CREATE TABLE final_db(date date PRIMARY KEY," +
        numeric_cols + ")")
    # Load to Database
    uf.aws_load(outname, db_name, cur)
    uf.aws_load(outname, 'final_db', cur)
    # Commit changes to database
    conn.commit()
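
The timestamped table name goes through psycopg2.sql so it is quoted as an identifier rather than spliced in as raw text; only the column list, which is built from the dataframe's own column names, is plain string concatenation. A standalone illustration of the same pattern, with made-up names:

from psycopg2 import sql

table_name = "final_db_20180401_120000"  # made-up example name
stmt = sql.SQL("CREATE TABLE {}(date date PRIMARY KEY, cabi_trips numeric)").format(
    sql.Identifier(table_name))
# cur.execute(stmt) would run it with the table name safely double-quoted.
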

def create_jump_price(cur):
    # This script creates the jump_price AWS table
    cur.execute("""
    DROP TABLE jump_price;
    CREATE TABLE jump_price(
        min_seconds integer PRIMARY KEY,
        max_seconds integer,
        cost numeric
        )
            """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Read in Overage Data
    csv_name = "JumpTripCosts"
    overage_df = pd.read_csv(csv_name + ".csv")
    # Output final dataframe as CSV
    outname = csv_name + "pipe_delimited"
    overage_df.to_csv(outname + ".csv", index=False, sep='|')
    # Create Table
    create_jump_price(cur)
    # Load to Database
    uf.aws_load(outname, "jump_price", cur)
    # Commit changes to database
    conn.commit()
    conn, cur = uf.aws_connect()

    # Load Ofo user from report provided by DDOT
    user_report_loc = os.path.join("data", "User ids.xlsx")
    ofo_df = pd.read_excel(user_report_loc, sheet_name="ofo User IDs")
    # Drop second row since it's a second header
    ofo_df = ofo_df[ofo_df['October'] != 'UserID']
    ofo_df_stack = ofo_df.stack()
    ofo_df_stack = ofo_df_stack.reset_index()
    # Define Month in Datetime
    ofo_df_stack['usage_month'] = ofo_df_stack['level_1'].apply(
        convert_month_toDT)
    ofo_df_stack.drop(['level_0', 'level_1'], axis=1, inplace=True)
    ofo_df_stack.columns = ['user_id', 'usage_month']
    # Count trips by User and month
    ofo_count_df = ofo_df_stack.groupby(['user_id', 'usage_month']).size()
    ofo_count_df = ofo_count_df.reset_index()
    ofo_count_df.columns = ['user_id', 'usage_month', 'trips']
    ofo_count_df.sort_values(['user_id', 'usage_month'], inplace=True)
    # Output dataframe as CSV
    outname = "ofo_users"
    ofo_count_df.to_csv(os.path.join("data", outname + ".csv"),
                        index=False,
                        sep='|')
    # Create Table in AWS
    create_ofo_users(cur)
    # Load to Database
    uf.aws_load(outname, "ofo_users", cur)
    # Commit changes to database
    conn.commit()
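
convert_month_toDT() here turns the report's month-name column headers into usage_month dates. Its implementation is not shown; a rough sketch, with the year assignment purely assumed from the 2017-2018 dockless pilot period:

import datetime


def convert_month_toDT(month_name, pilot_start_year=2017):
    # Hypothetical sketch: maps a month-name header such as "October" to a
    # first-of-month date. The September cutoff and years are assumptions.
    month = datetime.datetime.strptime(month_name, "%B").month
    year = pilot_start_year if month >= 9 else pilot_start_year + 1
    return datetime.datetime(year, month, 1)
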
Example #11
    results_df = pd.concat([results_df, first_record_df], axis=0)
    results_df = results_df[:-1]
    # Fill missing data with zeros
    results_df.fillna(0, inplace=True)
    # Convert precipIntensityMaxTime to integer
    results_df['precipintensitymaxtime'] = results_df[
        'precipintensitymaxtime'].astype('int')
    # Drop weather fields new to 2018
    drop_cols = [
        'ozone', 'uvindex', 'uvindextime', 'windgust', 'windgusttime', 'time'
    ]
    for drop_col in drop_cols:
        if drop_col in results_df.columns:
            results_df = results_df.drop([drop_col], axis=1)
    # Reorder columns based on current table
    results_df['day_time'] = results_df['day_time'].astype(int)
    results_df = results_df[first_record_df.columns]
    # Output final dataframe
    TIMESTR = time.strftime("%Y%m%d_%H%M%S")
    outname = "Dark_Sky_From_" + start_date.strftime(
        '%Y-%m-%d') + "_To_" + end_date.strftime('%Y-%m-%d')
    results_df.to_csv(os.path.join("data", outname + ".csv"),
                      index=False,
                      sep='|')
    # Create Table
    create_dark_sky_raw(cur)
    # Load to Database
    uf.aws_load(outname, "dark_sky_raw", cur)
    # Commit changes to database
    conn.commit()
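
The weather fields above (precipintensitymaxtime, ozone, uvindex, and so on) match the Dark Sky daily data block. That API has since been retired, and the gathering code is not shown; a sketch of one Time Machine request, with the key, coordinates, and helper name all assumed:

import requests

DARK_SKY_KEY = "YOUR_KEY"            # placeholder
DC_LAT, DC_LON = 38.9072, -77.0369   # approximate Washington, DC


def pull_dark_sky_day(day):
    # Hypothetical sketch: one daily summary from the (now retired) Time Machine endpoint.
    url = "https://api.darksky.net/forecast/{}/{},{},{}".format(
        DARK_SKY_KEY, DC_LAT, DC_LON, int(day.timestamp()))
    return requests.get(url).json()["daily"]["data"][0]
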
if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Load raw CSV as Dataframe
    raw_df = pd.read_csv(os.path.join("data", "bikes_per_day.csv"))

    # Lowercase provider and replace limebike with lime
    raw_df['provider'] = raw_df['provider'].str.lower().str.replace(
        'limebike', 'lime')

    # Replace data outliers with reasonable number for Jump
    jump_outlier_mask = (raw_df['ride_date'].isin(
        ['2018-02-22', '2018-02-23'])) & (raw_df['provider'] == 'jump')
    raw_df.loc[jump_outlier_mask, 'daily_bikes_available'] = 71
    print(raw_df.tail(20))

    # Output dataframe as CSV
    outname = "dockless_bikes_api"
    raw_df.to_csv(os.path.join("data", outname + ".csv"), index=False, sep='|')

    # Create Table in AWS
    create_dockless_bikes_api(cur)
    # Load to Database
    uf.aws_load(outname, "dockless_bikes_api", cur)

    # Commit changes to database
    conn.commit()
def create_cabi_price(cur):
    # This script creates the CaBi Price AWS table
    cur.execute("""
    DROP TABLE cabi_price;
    CREATE TABLE cabi_price(
        min_seconds integer PRIMARY KEY,
        max_seconds integer,
        casual_cost numeric,
        member_cost numeric
        )
    """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Read in Overage Data
    csv_name = "CaBiOverageCosts"
    overage_df = pd.read_csv(csv_name + ".csv")
    # Output final dataframe
    outname = csv_name + "pipe_delimited"
    overage_df.to_csv(outname + ".csv", index=False, sep='|')
    # Create Database
    create_cabi_price(cur)
    # Load to Database
    uf.aws_load(outname, "cabi_price", cur)
    # Commit changes to database
    conn.commit()
Example #14
    df_dc_month = pd.concat([df_dc_month, year_month_1718], axis=0)

    # Fill down growth rates for population and bike
    df_dc_month['grow_rate'] = df_dc_month['grow_rate'].fillna(method='ffill')
    df_dc_month['pct_bike'] = df_dc_month['pct_bike'].fillna(method='ffill')
    df_dc_month = df_dc_month.reset_index()

    # Apply growth rate to population figures
    for x in range(11):
        building_month()
    df_dc_month['citypop'] = df_dc_month['citypop'].fillna(value=0)
    for x in df_dc_month.index:
        apply_rate()

    # Keep final fields and drop any additional records
    df_dc_month['day'] = 1
    df_dc_month['pop_date'] = pd.to_datetime(
        df_dc_month[['year', 'month', 'day']])
    df_dc_month = df_dc_month[['pop_date', 'citypop', 'grow_rate', 'pct_bike']]
    df_dc_month = df_dc_month.dropna(how='any')

    # Output dataframe as CSV
    outname = "dc_pop"
    df_dc_month.to_csv(outname + ".csv", index=False, sep='|')
    # Create Table
    create_dc_pop(cur)
    # Load to Database
    uf.aws_load(outname, "dc_pop", cur)
    # Commit changes to database
    conn.commit()
Example #15
                'Local Purchase Month', 'Total Number Bikes Purchased'
            ]
            df = df[keep_cols]
            df.columns = ['Local Purchase Month', membership_type]
        # set month as index
        df.set_index(['Local Purchase Month'], inplace=True)
        # Append dataframe to list
        df_list.append(df)

    # Combine Dataframe
    combined_df = pd.concat(df_list, axis=1)
    combined_df.reset_index(inplace=True)
    combined_df['Local Purchase Month'] = combined_df[
        'Local Purchase Month'].astype('datetime64[ns]')
    # Fill missing with zeros
    combined_df = combined_df.fillna(0)
    # Output dataframe as CSV
    outname = "cabi_membership"
    combined_df.to_csv(os.path.join("data", outname + ".csv"),
                       index=False,
                       sep='|')

    # Create Table in AWS
    create_cabi_membership(cur)

    # Load to Database
    uf.aws_load(outname, "cabi_membership", cur)

    # Commit changes to database
    conn.commit()
Example #16
    DROP TABLE IF EXISTS """ + ml_name + """;
    CREATE TABLE """ + ml_name + """(
        date date,
        dless numeric,
        error numeric,
        neg_error numeric,
        t date,
        yhat numeric,
        ytest numeric,
        dless_impact numeric);
    """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    for ml_file in os.listdir(os.path.join("data", "ml")):
        # Read in Overage Data
        csv_name = ml_file.replace(".csv", "")
        ml_df = pd.read_csv(os.path.join("data", "ml", csv_name + ".csv"))
        # Output final dataframe
        outname = csv_name + "pipe_delimited"
        ml_df.to_csv(os.path.join("data", outname + ".csv"), index=False, sep='|')
        # Create Database
        create_ml(cur, ml_name=csv_name)
        # Load to Database
        uf.aws_load(outname, csv_name, cur)
        # Commit changes to database
        conn.commit()
    # This script creates the dockless_price AWS table
    cur.execute("""
    DROP TABLE dockless_price;
    CREATE TABLE dockless_price(
        min_seconds integer PRIMARY KEY,
        max_seconds integer,
        limebike numeric,
        spin numeric,
        ofo numeric,
        mobike numeric
        )
            """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Read in Overage Data
    csv_name = "DocklessTripCosts"
    overage_df = pd.read_csv(csv_name + ".csv")
    # Output final dataframe
    outname = csv_name + "pipe_delimited"
    overage_df.to_csv(outname + ".csv", index=False, sep='|')
    # Create Table
    create_dockless_price(cur)
    # Load to Database
    uf.aws_load(outname, "dockless_price", cur)
    # Commit changes to database
    conn.commit()
if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Connect to Google Calendar
    CAL = gcal_authorization()
    # Pull all WABA Events
    results_df = pull_waba_events(CAL)
    # Limit calendar events to those that are significant based on subjective filtering
    sign_events_df = limit_sign_events(results_df)
    # Combine start date and start date time
    sign_events_df['final_date'] = combine_dates()
    # Set final date as index and keep only summary and id
    sign_events_df = sign_events_df[['id', 'final_date', 'summary']]
    sign_events_df.sort_values(['final_date'], inplace=True)
    # Output full WABA calendar
    TIMESTR = time.strftime("%Y%m%d_%H%M%S")
    filename = "WABA_Calendar_" + TIMESTR + ".csv"
    filepath = os.path.join(filename)
    results_df.to_csv(filepath, index=True)
    # Output significant events
    outname = "WABA_Significant_Events" + TIMESTR
    sign_events_df.to_csv(outname + ".csv", index=False, sep='|')
    # Create Database
    create_bike_events(cur)
    # Load to Database
    uf.aws_load(outname, "bike_events", cur)
    # Commit changes to database
    conn.commit()
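
gcal_authorization() and pull_waba_events() wrap the Google Calendar API. A sketch of what they might look like; the calendar id, credential handling, and returned columns are assumptions, and the real gcal_authorization() apparently takes no arguments and handles auth itself.

import pandas as pd
from googleapiclient.discovery import build

WABA_CALENDAR_ID = "example_waba_calendar_id@group.calendar.google.com"  # placeholder


def gcal_authorization(creds):
    # Hypothetical sketch: build a Calendar client from already-obtained credentials.
    return build("calendar", "v3", credentials=creds)


def pull_waba_events(cal):
    # Hypothetical sketch: page through every event on the WABA calendar.
    events = []
    request = cal.events().list(calendarId=WABA_CALENDAR_ID)
    while request is not None:
        resp = request.execute()
        events.extend(resp.get("items", []))
        request = cal.events().list_next(request, resp)
    return pd.DataFrame(events)
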
if __name__ == "__main__":
    # Pull CaBi Trips Data from AWS S3 Store
    links = get_links()
    download_links(links)
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Loop through all CSVs in cabi trip data folder
    cabi_trip_dir = '../cabi_trip_data'
    # Convert trip data from CSV to dataframe
    combined_df = trips_to_df(cabi_trip_dir)
    # Add trip_id continuing from last record in AWS table
    trip_id_df = pd.read_sql(
        """SELECT trip_id from cabi_trips order by outage_id desc LIMIT 1 """,
        con=conn)
    last_trip_id = trip_id_df['trip_id'].iloc[0]
    combined_df.reset_index(inplace=True)
    combined_df['trip_id'] = combined_df.index + 1 + last_trip_id
    # Drop unneeded fields
    combined_df.drop(['index'], axis=1, inplace=True)
    # Output dataframe as CSV
    outname = "CaBi_Trip_Data"
    combined_df.to_csv(outname + ".csv", index=False, sep='|')
    # Create Table
    create_cabi_trips(cur)
    # Load to Database
    uf.aws_load(outname, "cabi_trips", cur)
    # Commit changes to database
    conn.commit()
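
get_links() and download_links() pull the public Capital Bikeshare trip archives from S3. The bucket is real, but the parsing details and target folder below are assumptions, and the archives would still need to be unzipped before trips_to_df() could read them.

import os
import requests
from bs4 import BeautifulSoup

CABI_BUCKET = "https://s3.amazonaws.com/capitalbikeshare-data/"


def get_links():
    # Hypothetical sketch: list the zip archives in the public trip-data bucket listing.
    listing = BeautifulSoup(requests.get(CABI_BUCKET).text, "xml")
    return [key.text for key in listing.find_all("Key") if key.text.endswith(".zip")]


def download_links(links, out_dir="../cabi_trip_data"):
    # Hypothetical sketch: save each archive into the folder the example reads from.
    os.makedirs(out_dir, exist_ok=True)
    for link in links:
        with open(os.path.join(out_dir, link), "wb") as f:
            f.write(requests.get(CABI_BUCKET + link).content)
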
        # Parse the page HTML with BeautifulSoup
        soup = BeautifulSoup(r.text, "lxml")
        # Extract schedule table from HTML
        schedule = soup.find('div', attrs={'class': 'overthrow table_container'})
        # Create dataframe
        schedule_df = pd.read_html(str(schedule))[0]
        # Drop extra header rows embedded in dataframe
        schedule_df = schedule_df[schedule_df['Gm#'] != 'Gm#']
        # Format date field
        schedule_df['Date'] = format_date_field(schedule_df['Date'])
        # Append games from current iteration (season) to list
        schedule_df_list.append(schedule_df)
    # Concat all the dataframe into one big schedule
    nats_sched = pd.concat(schedule_df_list, axis=0)
    nats_sched.rename(columns={'Unnamed: 2': 'BoxScore', 'Unnamed: 4': 'Home_Away'}, inplace=True)
    # Keep only games played
    nats_sched = nats_sched[nats_sched['BoxScore'] == 'boxscore']
    # Fill in zeros for integer columns
    integer_columns = ['R', 'RA', 'Inn', 'Rank', 'Attendance']
    nats_sched[integer_columns] = nats_sched[integer_columns].fillna(value=0)

    # Output dataframe as CSV
    outname = "nats_attendance"
    nats_sched.to_csv(os.path.join("data", outname + ".csv"), index=False, sep='|')
    # Create Table in AWS
    create_nats_attendance(cur)
    # Load to Database
    uf.aws_load(outname, "nats_attendance", cur)
    # Commit changes to database
    conn.commit()
Example #21
        tranwork integer,
        carpool integer,
        riders integer,
        trantime integer,
        qtrantim integer,
        qtranwor integer)
    """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Load CSV from IPUMS download as Dataframe
    csv_name = 'usa_00007'
    acs_df = pd.read_csv(csv_name + ".csv")
    # Set Primary Key
    acs_df.reset_index(inplace=True)
    # Fill all na with zeros
    acs_df.fillna(0, inplace=True)
    # Output dataframe as CSV
    outname = csv_name + "_pipe_delimited"
    acs_df.to_csv(outname + ".csv", index=False, sep='|')
    # Create Database
    create_acs(cur)
    # Load to Database
    uf.aws_load(outname, "acs", cur)
    # Commit changes to database
    conn.commit()

def create_metro_hours(cur):
    # This script creates the metro hours AWS table
    cur.execute("""
    DROP TABLE IF EXISTS metro_hours;
    CREATE TABLE metro_hours (
        day_of_week integer,
        op_status text,
        start_time time,
        end_time time)
    """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()
    # Load metro hours CSV as Dataframe
    csv_name = 'metro_hours'
    metro_hours_df = pd.read_csv(os.path.join("data", csv_name) + ".csv")
    # Output dataframe as CSV
    outname = csv_name + "_pipe_delimited"
    metro_hours_df.to_csv(os.path.join("data", outname) + ".csv", index=False, sep='|')
    # Create Database
    create_metro_hours(cur)
    # Load to Database
    uf.aws_load(outname, "metro_hours", cur)
    # Commit changes to database
    conn.commit()