def merge_airport_stations(data, origin_or_dest):
    """Left-join the airport/station lookup columns onto ``data``.

    Reads the module-level ``airport_stations`` frame, keeps its ``iata``,
    ``station_identifier`` and ``airport_to_station`` columns, and renames
    them so that ``iata`` becomes ``origin_or_dest`` and the other two are
    prefixed with ``origin_or_dest + '_'``. The renamed frame is then merged
    onto ``data`` through ``fp.merge`` with ``how='left'``.

    Args:
        data: Frame that has a column named by ``origin_or_dest``.
        origin_or_dest: Name of the join column in ``data``; also used as
            the prefix for the two added columns (callers in this file pass
            ``'Origin'`` and ``'Dest'``).

    Returns:
        Whatever ``fp.merge`` returns for the left join (presumably ``data``
        plus the two prefixed columns -- confirm against ``fp.merge``).
    """
    rename_dict = {
        'iata': origin_or_dest,
        'station_identifier': origin_or_dest + '_station_identifier',
        'airport_to_station': origin_or_dest + '_airport_to_station',
    }
    # Select with a real list of labels: on Python 3 ``dict.keys()`` is a
    # view object, which is not reliably accepted as a column selector by
    # every pandas release. ``list(rename_dict)`` yields the same labels in
    # the same order as ``.keys()`` did.
    df = airport_stations[list(rename_dict)].rename(columns=rename_dict)
    # NOTE(review): 'm:1' looks like a many-to-one relationship flag for the
    # project's merge helper -- confirm against ``fp.merge``.
    return fp.merge('m:1', data, df, on=origin_or_dest, how='left')
def merge_weather(data, origin_or_dest):
    """Left-join the module-level ``weather`` frame onto ``data``.

    Rows are matched on ``FlightDate`` and
    ``'<origin_or_dest>_station_identifier'`` in ``data`` against ``date``
    and ``station_identifier`` in ``weather``. After the merge the PRCP,
    SNOW, SNWD, TMAX and TMIN columns are renamed with ``origin_or_dest``
    prepended directly (no separator), and the right-hand key columns
    ``date`` and ``station_identifier`` are dropped.

    Args:
        data: Frame holding ``FlightDate`` and the prefixed station column.
        origin_or_dest: Prefix identifying which station column to join on
            and how to label the weather columns.

    Returns:
        The merged frame with renamed weather columns and without the
        ``date`` / ``station_identifier`` key columns.
    """
    station_key = '{}_station_identifier'.format(origin_or_dest)
    merged = fp.merge(
        'm:1',
        data,
        weather,
        left_on=['FlightDate', station_key],
        right_on=['date', 'station_identifier'],
        how='left',
    )

    # Prefix each weather measure with the origin/dest label.
    prefixed = {
        measure: '{}{}'.format(origin_or_dest, measure)
        for measure in ('PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN')
    }
    merged.rename(columns=prefixed, inplace=True)

    # The right-hand join keys duplicate information already in ``data``.
    return merged.drop(axis=1, labels=['date', 'station_identifier'])
rename_dict = { 'iata': origin_or_dest, 'station_identifier': origin_or_dest + '_' + 'station_identifier', 'airport_to_station': origin_or_dest + '_' + 'airport_to_station' } df = airport_stations[rename_dict.keys()].rename(columns=rename_dict) data = fp.merge('m:1', data, df, on=origin_or_dest, how='left') return data data = store['On_Time_On_Time_Performance'] data = merge_airport_stations(data, 'Origin') data = merge_airport_stations(data, 'Dest') # merge in carrier names to flights dataset carriers = get_carriers() data = fp.merge('m:1', data, carriers, on='UniqueCarrier', how='left') # merge in plane level info plane_info = pd.read_csv(output_dir + 'plane_info.csv') plane_info = plane_info[[ 'Engine Model', 'MFR Year', 'Manufacturer Name', 'Model', 'TailNum' ]] data = fp.merge('m:1', data, plane_info, on='TailNum', how='left') # create flight date # this is slow but will run, the below commented out .apply was breaking FlightDate = [] for i, row in data.iterrows(): print '{} of {}'.format(i, len(data)) year=int(row['Year'])
# check duplicate links in bios data - keep those with bio_image idx = df.link.duplicated(keep=False) dups = df[idx].sort_values('link') dups['drop'] = dups['bio_image'].isnull() df = df.join(dups['drop']) df = df[df['drop'] != True] df.drop('drop', axis=1, inplace=True) return df if __name__ == '__main__': # only need to run this once #insert_static_json_into_mongo() df_bios = mongo_to_dataframe('nobel_prize', 'mini_bios') df_winners = mongo_to_dataframe('nobel_prize', 'winners') # Clean Data df_winners, df_winners_born_in = clean_winners(df_winners) df_bios = clean_bios(df_bios) # create merged dataset df_winners_all = fp.merge("m:1", df_winners, df_bios,on='link', how='left') # save to mongo dataframe_to_mongo(df_winners, 'nobel_prize', 'winners_clean') dataframe_to_mongo(df_winners_born_in, 'nobel_prize', 'winners_born_in_clean') dataframe_to_mongo(df_winners_all, 'nobel_prize', 'winners_all_clean')