Ejemplo n.º 1
0
 def merge_airport_stations(data, origin_or_dest):
     """Left-merge airport-to-station lookup columns onto ``data``.

     Takes the 'iata', 'station_identifier' and 'airport_to_station'
     columns from the module-level ``airport_stations`` frame, renames
     them so they are keyed and prefixed by ``origin_or_dest``, and merges
     them onto ``data`` with ``fp.merge('m:1', ..., how='left')``.

     Args:
         data: frame containing a column named ``origin_or_dest``.
         origin_or_dest: name of the merge-key column in ``data``
             (e.g. 'Origin' or 'Dest'); also used as the prefix for the
             added '<prefix>_station_identifier' and
             '<prefix>_airport_to_station' columns.

     Returns:
         The result of ``fp.merge``.
         NOTE(review): 'm:1' presumably requests a many-to-one merge
         check inside ``fp`` -- confirm against that helper.
     """
     rename_dict = {
         'iata': origin_or_dest,
         'station_identifier': origin_or_dest + '_' + 'station_identifier',
         'airport_to_station': origin_or_dest + '_' + 'airport_to_station'
     }
     # Select with a real list rather than ``.keys()``: on Python 3 that is
     # an unhashable view object, which is not a reliable column selector.
     # On Python 2 this yields the same list as ``.keys()`` did.
     df = airport_stations[list(rename_dict)].rename(columns=rename_dict)
     return fp.merge('m:1', data, df, on=origin_or_dest, how='left')
Ejemplo n.º 2
0
    def merge_weather(data, origin_or_dest):
        """Left-merge daily weather readings onto ``data`` for one airport role.

        Rows of ``data`` are matched to the ``weather`` frame on
        ('FlightDate', '<origin_or_dest>_station_identifier') against
        ('date', 'station_identifier'). The PRCP, SNOW, SNWD, TMAX and TMIN
        columns are then renamed with ``origin_or_dest`` prepended (no
        separator), and the weather-side key columns are dropped.

        Args:
            data: frame with 'FlightDate' and
                '<origin_or_dest>_station_identifier' columns.
            origin_or_dest: prefix identifying which station column to join
                on and how to name the resulting weather columns.

        Returns:
            The merged frame without the 'date' and 'station_identifier'
            columns.
        """
        station_col = '{}_station_identifier'.format(origin_or_dest)
        merged = fp.merge('m:1', data, weather,
                          left_on=['FlightDate', station_col],
                          right_on=['date', 'station_identifier'],
                          how='left')

        # Prefix every weather measurement so that readings merged for
        # different values of origin_or_dest do not share column names.
        measurements = ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN')
        prefixed = {name: '{}{}'.format(origin_or_dest, name)
                    for name in measurements}
        merged.rename(columns=prefixed, inplace=True)

        # Remove the weather-side join key columns from the result.
        return merged.drop(['date', 'station_identifier'], axis=1)
Ejemplo n.º 3
0
        rename_dict = {
            'iata': origin_or_dest,
            'station_identifier': origin_or_dest + '_' + 'station_identifier',
            'airport_to_station': origin_or_dest + '_' + 'airport_to_station'
        }
        df = airport_stations[rename_dict.keys()].rename(columns=rename_dict)
        data = fp.merge('m:1', data, df, on=origin_or_dest, how='left')
        return data

    data = store['On_Time_On_Time_Performance']
    data = merge_airport_stations(data, 'Origin')
    data = merge_airport_stations(data, 'Dest')

    # merge in carrier names to flights dataset
    carriers = get_carriers()
    data = fp.merge('m:1', data, carriers, on='UniqueCarrier', how='left')

    # merge in plane level info
    plane_info = pd.read_csv(output_dir + 'plane_info.csv')
    plane_info = plane_info[[
        'Engine Model',	'MFR Year',	'Manufacturer Name', 'Model', 'TailNum'
    ]]
    data = fp.merge('m:1', data, plane_info, on='TailNum', how='left')

    # create flight date

    # this is slow but will run, the below commented out .apply was breaking
    FlightDate = []
    for i, row in data.iterrows():
        print '{} of {}'.format(i, len(data))
        year=int(row['Year'])
    # check duplicate links in bios data - keep those with bio_image
    idx = df.link.duplicated(keep=False)
    dups = df[idx].sort_values('link')
    dups['drop'] = dups['bio_image'].isnull()

    df = df.join(dups['drop'])
    df = df[df['drop'] != True]
    df.drop('drop', axis=1, inplace=True)

    return df


if __name__ == '__main__':

    # One-time setup step: uncomment to load the static JSON into mongo.
    # It only needs to run once.
    # insert_static_json_into_mongo()

    database = 'nobel_prize'

    # Pull the raw collections into dataframes.
    df_bios = mongo_to_dataframe(database, 'mini_bios')
    df_winners = mongo_to_dataframe(database, 'winners')

    # Clean each dataset; clean_winners hands back two frames.
    df_winners, df_winners_born_in = clean_winners(df_winners)
    df_bios = clean_bios(df_bios)

    # Attach the bios to the winners via their shared 'link' column.
    df_winners_all = fp.merge("m:1", df_winners, df_bios,
                              on='link', how='left')

    # Write every cleaned frame back to mongo, in this order.
    outputs = (
        (df_winners, 'winners_clean'),
        (df_winners_born_in, 'winners_born_in_clean'),
        (df_winners_all, 'winners_all_clean'),
    )
    for frame, collection in outputs:
        dataframe_to_mongo(frame, database, collection)