def load_schedule(tdate, dpath):
    """Return the scheduled stop arrivals for services active on ``tdate``.

    Trips and stop times are loaded through the project's ``gtfs`` module,
    restricted to trips whose ``service_id`` is reported active by
    ``gtfs.TransitCalendar``, and annotated with scheduled headways.

    Parameters
    ----------
    tdate
        Schedule date; forwarded unchanged to the ``gtfs`` loaders,
        ``gtfs.TransitCalendar`` and ``get_service_ids``.
    dpath
        Data location; forwarded unchanged to the ``gtfs`` loaders and
        ``gtfs.TransitCalendar``.

    Returns
    -------
    DataFrame indexed by ``(trip_id, stop_id)`` holding the stop-times
    columns plus ``sched_hour``, ``sched_arrival_time``, ``route_id`` and
    ``sched_headway``.

    Raises
    ------
    ValueError
        From ``set_index(verify_integrity=True)`` when a
        ``(trip_id, stop_id)`` pair occurs more than once.
    """
    trips = gtfs.load_trips(tdate, dpath)
    # The loader returns two items; only the stop-times table is used here.
    stop_times, _tz_sched = gtfs.load_stop_times(tdate, dpath)

    calendar = gtfs.TransitCalendar(tdate, dpath)
    active_services = calendar.get_service_ids(tdate)
    active_trips = trips.service_id.isin(active_services)

    # Boolean selection by trip; presumably `trips` is indexed by trip_id so
    # the mask lines up with the re-indexed stop times -- TODO confirm.
    active_stops = stop_times.reset_index().set_index('trip_id').loc[active_trips]
    # First two characters of the arrival_time string, as an integer.
    active_stops['sched_hour'] = active_stops.arrival_time.str[:2].astype(int)
    active_stops['sched_arrival_time'] = active_stops.arrival_time.apply(
        ttools.parseTime)

    sched_times = active_stops.join(trips['route_id'], how='left').reset_index()

    # DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20;
    # sort_values is its replacement.  Fall back only on very old pandas.
    sort_keys = ['route_id', 'sched_arrival_time']
    if hasattr(sched_times, 'sort_values'):
        sched_times = sched_times.sort_values(sort_keys)
    else:
        sched_times = sched_times.sort(sort_keys)

    # Dividing by a one-second timedelta expresses the gap between
    # consecutive arrivals at the same (route, stop) as a count of seconds
    # (assumes ttools.parseTime yields datetime/timedelta values).
    sec = ttools.datetime.timedelta(seconds=1)
    sched_times['sched_headway'] = sched_times.groupby(
        ['route_id', 'stop_id'])['sched_arrival_time'].diff() / sec

    sched_times.set_index(['trip_id', 'stop_id'], inplace=True,
                          verify_integrity=True)
    return sched_times
def load_schedule(tdate, dpath):
    """Build the per-stop schedule table for the services running on ``tdate``.

    Uses the project's ``gtfs`` module to load trips and stop times, keeps
    only stop times belonging to trips whose ``service_id`` is active
    according to ``gtfs.TransitCalendar``, and adds derived columns.

    Parameters
    ----------
    tdate
        Schedule date, passed through as-is to ``gtfs.load_trips``,
        ``gtfs.load_stop_times``, ``gtfs.TransitCalendar`` and
        ``get_service_ids``.
    dpath
        Data location, passed through as-is to the same ``gtfs`` calls.

    Returns
    -------
    DataFrame indexed by ``(trip_id, stop_id)`` with the original
    stop-times columns and the added ``sched_hour``,
    ``sched_arrival_time``, ``route_id`` and ``sched_headway`` columns.

    Raises
    ------
    ValueError
        Raised by ``set_index(verify_integrity=True)`` if the
        ``(trip_id, stop_id)`` index would contain duplicates.
    """
    trips = gtfs.load_trips(tdate, dpath)
    # Second element of the loader's result is not needed in this function.
    stop_times, _tz_sched = gtfs.load_stop_times(tdate, dpath)
    tcal = gtfs.TransitCalendar(tdate, dpath)

    active_services = tcal.get_service_ids(tdate)
    active_trips = trips.service_id.isin(active_services)

    # NOTE(review): the mask comes from `trips`; this relies on `trips` being
    # indexed by trip_id so it aligns with the stop times -- confirm.
    by_trip = stop_times.reset_index().set_index('trip_id')
    active_stops = by_trip.loc[active_trips]

    # Leading two characters of arrival_time, converted to int.
    active_stops['sched_hour'] = active_stops.arrival_time.str[:2].astype(int)
    active_stops['sched_arrival_time'] = active_stops.arrival_time.apply(
        ttools.parseTime)

    sched_times = active_stops.join(trips['route_id'], how='left')
    sched_times = sched_times.reset_index()

    # `DataFrame.sort` no longer exists in pandas >= 0.20 (deprecated since
    # 0.17); prefer `sort_values` and keep `sort` only as a legacy fallback.
    order_by = ['route_id', 'sched_arrival_time']
    if hasattr(sched_times, 'sort_values'):
        sched_times = sched_times.sort_values(order_by)
    else:
        sched_times = sched_times.sort(order_by)

    # Headway = difference between consecutive scheduled arrivals within each
    # (route_id, stop_id) group, divided by a one-second timedelta.
    sec = ttools.datetime.timedelta(seconds=1)
    arrivals_by_stop = sched_times.groupby(
        ['route_id', 'stop_id'])['sched_arrival_time']
    sched_times['sched_headway'] = arrivals_by_stop.diff() / sec

    sched_times.set_index(['trip_id', 'stop_id'], inplace=True,
                          verify_integrity=True)
    return sched_times
scheduled arrival time at one stop for one example trip.
"""
import os
import pandas as pd
# these two modules are homemade
import gtfs
import arrivals
# NOTE(review): `time` is not referenced in the visible part of this script.
import time

# Work from the shared project directory so the relative 'gtfs/' and CSV
# paths below resolve.
os.chdir('/gpfs2/projects/project-bus_capstone_2016/workspace/share')

# get all the schedule data. (subset can be created later)
trips = gtfs.load_trips('gtfs/')
stops = gtfs.load_stops('gtfs/')
stop_times, tz_sched = gtfs.load_stop_times('gtfs/')
print 'Finished loading GTFS data.'

# get the sample of parsed AVL data. Beware, takes a few minutes.
bustime = pd.read_csv('newdata_parsed.csv')#,parse_dates=dt_columns)
# Keep only the records of three specific trips (matched on the Trip column).
qstr = ('Trip == "MTA NYCT_MV_B6-Weekday-SDon-038500_M5_203" or '
        'Trip == "MTA NYCT_MV_B6-Weekday-SDon-036500_M5_202" or '
        'Trip == "MTA NYCT_MV_B6-Weekday-SDon-040000_M5_204"')
bustime = bustime.query(qstr)
# One record per (vehicle, timestamp); required for the unique index below.
bustime.drop_duplicates(['vehicleID','RecordedAtTime'],inplace=True)
# verify_integrity=True makes set_index raise if the five-level index is
# not unique.
bustime.set_index(['Line','Trip','TripDate','vehicleID','RecordedAtTime'],
                  inplace=True,drop=True,verify_integrity=True)

# for now, use a truncated data set. just get data for one line (M5).
tripDateLookup = "2016-06-13"
lineLookup = "MTA NYCT_M5"
# For each trip id # For each record (different time stamps?) # Get position import os import pandas as pd import numpy as np import matplotlib.pylab as plt import gtfs import arrivals os.chdir('/green-projects/project-bus_capstone_2016/workspace/share') trips = gtfs.load_trips('gtfs/') stops = gtfs.load_stops('gtfs/') stop_times = gtfs.load_stop_times('gtfs/') print 'Finished loading GTFS data.'
import gtfs #homemade module os.chdir('/gpfs2/projects/project-bus_capstone_2016/workspace/share') schedule_samples = ['2015-01-04','2015-04-05','2015-06-27','2015-07-06','2015-09-05','2015-09-15','2015-10-12'] for i in range(len(schedule_samples)): print 'Beginning season of ' + schedule_samples[i] if i == len(schedule_samples)-1: break base = datetime.strptime(schedule_samples[i], '%Y-%m-%d') numdays = datetime.strptime(schedule_samples[i+1], '%Y-%m-%d') - base date_list = [base + ttools.datetime.timedelta(days=(x-1)) for x in range(0, numdays.days)] ss = schedule_samples[i] # get all the schedule data. (subset can be created later) trips = gtfs.load_trips(ss,'gtfs/') stop_times, tz_sched = gtfs.load_stop_times(ss,'gtfs/') stop_times['arrival_time'] = pd.to_timedelta(stop_times['arrival_time']) print 'Finished loading season schedule' # pd.DataFrame(columns=['date','count','mean','std','min','25%','50%','75%','max']).to_csv(ss+'_schedules.csv',index=False) pd.DataFrame(columns=['route_id','measure_name','measure','schedule_date']).to_csv(ss+'_schedules.csv') for dd in date_list: ds = datetime.strftime(dd,'%Y-%m-%d') try: tcal=gtfs.TransitCalendar(ds) day_services = tcal.get_service_ids(ds) day_trips = trips.service_id.isin(day_services) day_stops = stop_times.reset_index(level=1).loc[day_trips] day_stops.set_index('stop_id',append=True,inplace=True) trip_durations = day_stops.groupby(level=(0))['arrival_time'].max()- day_stops.groupby(level=(0))['arrival_time'].min() trip_durations = pd.DataFrame(trip_durations).join(trips['route_id'],how='left').set_index('route_id',append=True)
def first_ping_index(row):
    """Return the smallest index of the points near a stop, or None.

    `row` is a row from stop_times.  The trip id is read from the first
    element of the row's index label, and the query point is built from the
    first two items of the row's first field.  The tree is looked up in the
    module-level `trees` object (not defined in the visible code;
    presumably KDTree objects keyed by trip and date -- TODO confirm).

    Returns ``min(nearby)`` over the indexes that
    ``tree.query_ball_point(..., r=0.001)`` yields, or ``None`` when it
    yields nothing.
    """
    # for a row from stop_times, look up nearby points from the KDTree
    trip_id = row.name[0]  # trip_id is contained in the row index
    # NOTE(review): the date is hard-coded here and duplicates the value of
    # `tripDateLookup` assigned further down in this script.
    tree = trees.xs((trip_id, "2016-06-13"), level=(1, 2)).values[0]
    nearby = tree.query_ball_point([row[0][0], row[0][1]], r=0.001)
    if len(nearby) == 0:  # sometimes there are no nearby points
        return None
    else:
        return min(nearby)


# get all the schedule data. (subset can be created later)
trips = gtfs.load_trips("gtfs/")
stops = gtfs.load_stops("gtfs/")
stop_times, tz_sched = gtfs.load_stop_times("gtfs/")
print "Finished loading GTFS data."

# get the sample of parsed AVL data. Beware, large files take more time.
bustime = pd.read_csv("newdata_parsed.csv")  # ,parse_dates=dt_columns)
# One record per (vehicle, timestamp); required for the unique index below.
bustime.drop_duplicates(["vehicleID", "RecordedAtTime"], inplace=True)
# Strip the "MTA NYCT_" prefix from the Trip values.
bustime["Trip"] = bustime["Trip"].str.replace("MTA NYCT_", "")
# verify_integrity=True makes set_index raise if the index is not unique.
bustime.set_index(
    ["Line", "Trip", "TripDate", "vehicleID", "RecordedAtTime"], inplace=True, drop=True, verify_integrity=True
)

# for demonstration, use a subset. Just get data for one line (M5) on one day.
tripDateLookup = "2016-06-13"
lineLookup = "MTA NYCT_M5"
# Select on index levels 0 (Line) and 2 (TripDate), keeping those levels.
bustime = bustime.xs((lineLookup, tripDateLookup), level=(0, 2), drop_level=False)
# note that the AVL dataframe must be sorted by timestamp, since iloc[]
how='left') masker = filtered.apply(valid_stop, axis=1) filtered.drop('stop_id', axis=1, inplace=True) return filtered[masker] if __name__ == '__main__': infile = sys.argv[1] sched_date = sys.argv[2] gtfspath = sys.argv[3] outfile = sys.argv[1][:-4] + '_cleaned.csv' # get the sample of parsed AVL data. Beware, large files take more time. bustime = pd.read_csv(infile, header=None) bustime.columns = [ 'ROUTE_ID', 'latitude', 'longitude', 'recorded_time', 'vehicle_id', 'TRIP_ID', 'trip_date', 'SHAPE_ID', 'STOP_ID', 'distance_stop', 'distance_shape', 'status', 'destination' ] bustime.drop_duplicates(['vehicle_id', 'recorded_time'], inplace=True) bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTA NYCT_', '') bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTABC_', '') bustime['STOP_ID'] = bustime['STOP_ID'].str.replace('MTA_', '') print 'Finished loading Bus Time data.' stop_times = gtfs.load_stop_times(sched_date, 'gtfs/')[0] print 'Finished loading GTFS data.' filtered = filter_invalid_stops(bustime, stop_times) filtered.to_csv(outfile, index=False)
valid_stops = st.groupby(level=0)['stop_id'].apply(list) filtered = avl_df.merge(pd.DataFrame(valid_stops),left_on='TRIP_ID', right_index=True,how='left') masker = filtered.apply(valid_stop,axis=1) filtered.drop('stop_id',axis=1,inplace=True) return filtered[masker] if __name__=='__main__': infile = sys.argv[1] sched_date = sys.argv[2] gtfspath = sys.argv[3] outfile = sys.argv[1][:-4]+'_cleaned.csv' # get the sample of parsed AVL data. Beware, large files take more time. bustime = pd.read_csv(infile,header=None) bustime.columns = ['ROUTE_ID','latitude','longitude','recorded_time', 'vehicle_id','TRIP_ID','trip_date','SHAPE_ID', 'STOP_ID','distance_stop','distance_shape','status', 'destination'] bustime.drop_duplicates(['vehicle_id','recorded_time'],inplace=True) bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTA NYCT_','') bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTABC_','') bustime['STOP_ID'] = bustime['STOP_ID'].str.replace('MTA_','') print 'Finished loading Bus Time data.' stop_times = gtfs.load_stop_times(sched_date,'gtfs/')[0] print 'Finished loading GTFS data.' filtered = filter_invalid_stops(bustime,stop_times) filtered.to_csv(outfile,index=False)