def df_stop_frequency(direction, for_lines=None, fname="default",
                      write_df_root="stopFreq", dt=120):
    """Count observations per stop in fixed-width time bins and write a CSV.

    The data is loaded with ``load_df(fname)`` and reduced with
    ``filter_stops`` to the stops returned by
    ``_get_stops(direction=direction, set_range="all")`` and to ``for_lines``.
    Every remaining row adds one to the cell addressed by its time bin
    and its 'stop' value.

    Args:
        direction: Direction label, forwarded unchanged to ``_get_stops``.
            Only its first letter is inspected here (case-insensitively),
            and only to print a warning when it is neither "s" nor "n".
        for_lines: Lines forwarded to ``filter_stops``. Defaults to
            ``['4', '5', '6']``.
        fname: Argument forwarded to ``load_df``.
        write_df_root: Output path without extension; ".csv" is appended.
        dt: Bin width, in the units of the 'timestamp' column.

    Returns:
        DataFrame indexed by bin start (``int(t_min + n * dt)``) with one
        column per distinct stop, holding float counts.

    Raises:
        ValueError: If no rows remain after filtering.
    """
    # A fresh list per call, so the default can never be shared or mutated.
    if for_lines is None:
        for_lines = ['4', '5', '6']

    D = load_df(fname)
    wanted_stops = _get_stops(direction=direction, set_range="all")
    D = filter_stops(D, wanted_stops, for_lines)

    # Slicing (rather than indexing) keeps an empty string from raising.
    # The message text is historical: nothing in this function depends on
    # the direction beyond the _get_stops call above.
    if direction.lower()[:1] not in ("s", "n"):
        print("Specified direction %s not recognized; forcing NORTH"
              % (direction,))

    times = D['timestamp'].unique()
    if len(times) == 0:
        raise ValueError(
            "no rows left after filtering stops for lines %s" % (for_lines,))
    t_max = times.max()
    t_min = times.min()
    print("%s %s" % (t_min, t_max))

    nbins = int((t_max - t_min) / dt) + 1
    bin_starts = [int(t_min + n * dt) for n in range(nbins)]

    stops = D['stop'].unique()
    print("Filtered DF contains stops %s" % (stops,))

    df_freq = pd.DataFrame(0., index=bin_starts, columns=stops)

    # Walk the column values directly instead of D.loc[i, ...]: label-based
    # lookups return a Series when the filtered index has duplicate labels.
    t_origin = int(t_min)
    for timestamp, stop in zip(D['timestamp'].values, D['stop'].values):
        t_bin = int((timestamp - t_min) / dt) * dt + t_origin
        df_freq.loc[t_bin, stop] += 1

    df_freq.to_csv(write_df_root + ".csv")
    return df_freq
def df_trips_by_column(D, direction="N", for_lines=None, fname="default",
                       write_df_root="tripData"):
    """Compile per-trip stop times and trip durations and write them to CSV.

    Each row of ``D`` is tagged with a time-of-day reference ('tref', from
    ``get_TOD_reference``) and a trip identifier ('long_id', built as
    ``id + "::" + tref``). The rows are then reduced with ``filter_stops``
    and summarised per trip.

    Args:
        D: DataFrame with 'timestamp', 'id' and 'stop' columns. The 'tref'
            and 'long_id' columns are added to this object in place before
            filtering, so the caller's frame is modified.
        direction: Direction label, forwarded to ``_get_stops`` and
            ``get_stop_dict``. A first letter of "s" (any case) selects
            endpoint "S" / origin "N"; anything else selects endpoint "N" /
            origin "S", with a printed warning if the letter is not "n".
        for_lines: Lines forwarded to ``filter_stops``. Defaults to
            ``['4', '5', '6']``.
        fname: Unused; kept so existing callers keep working.
        write_df_root: Prefix for the three output files
            (``_verbose.csv``, ``_trip_times.csv``, ``_stop_counts.csv``).

    Returns:
        Tuple ``(tripCol, tripTimes, stopCounts)``:
        * tripCol: one column per trip; rows 'line', 'trip_time', 'tref',
          then one row per stop holding the latest timestamp seen for that
          trip at that stop.
        * tripTimes: one row per trip with 'line', 'trip_time' and
          'time_of_day' (``TOD_value(...) / 3600.`` -- presumably hours;
          confirm against ``TOD_value``).
        * stopCounts: one row per stop, one column per trip; number of rows
          recorded for that trip at that stop.
    """
    # A fresh list per call, so the default can never be shared or mutated.
    if for_lines is None:
        for_lines = ['4', '5', '6']

    # Build a unique trip id from the raw id plus the time-of-day reference.
    # This would be better done once as a pre-processing step.
    D['tref'] = D['timestamp'].map(get_TOD_reference)
    D['long_id'] = D['id'] + "::" + D['tref'].astype('string')

    stops = _get_stops(direction=direction, set_range="all")
    D = filter_stops(D, stops, for_lines)

    # Slicing (rather than indexing) keeps an empty string from raising.
    heading = direction.lower()[:1]
    if heading == "s":
        endpoint, origin = "S", "N"
    else:
        endpoint, origin = "N", "S"
        if heading != "n":
            print("Specified direction %s not recognized; forcing NORTH"
                  % (direction,))

    station_codes = get_stop_dict(direction)

    trips = D['long_id'].unique()
    stops = D['stop'].unique()
    tripCol = pd.DataFrame(index=['line', 'trip_time', 'tref'] + list(stops),
                           columns=trips)
    tripTimes = pd.DataFrame(index=trips,
                             columns=['line', 'trip_time', 'time_of_day'])
    stopCounts = pd.DataFrame(index=list(stops), columns=trips)

    # groupby splits the frame once instead of re-scanning every row for
    # each trip; sort=False keeps groups in order of first appearance.
    for trip, D_trip in D.groupby('long_id', sort=False):
        line = get_line(trip)
        tripCol.loc['line', trip] = line
        tripCol.loc['tref', trip] = D_trip['tref'].values[0]
        tripTimes.loc[trip, 'line'] = line

        for stop, stamps in D_trip.groupby('stop', sort=False)['timestamp']:
            tripCol.loc[stop, trip] = stamps.max()
            stopCounts.loc[stop, trip] = len(stamps)

        # Trip duration is the last timestamp at the endpoint stop minus the
        # last timestamp at the origin stop; NaN if the trip missed either.
        endpoint_code = station_codes[stop_for(line, endpoint)]
        origin_code = station_codes[stop_for(line, origin)]
        origin_time = tripCol.loc[origin_code, trip]
        trip_time = tripCol.loc[endpoint_code, trip] - origin_time

        tripCol.loc['trip_time', trip] = trip_time
        tripTimes.loc[trip, 'trip_time'] = trip_time
        tripTimes.loc[trip, 'time_of_day'] = \
            TOD_value(origin_time, tripCol.loc['tref', trip]) / 3600.

    print("created DataFrames with data compiled by trip; "
          "writing to files with root %s" % (write_df_root,))
    tripCol.to_csv(write_df_root + "_verbose.csv")
    tripTimes.to_csv(write_df_root + "_trip_times.csv")
    stopCounts.to_csv(write_df_root + "_stop_counts.csv")
    return tripCol, tripTimes, stopCounts