    def __init__(self, sfmuni_file, gtfs_outfile, trip_outfile, ts_outfile,
                 daily_trip_outfile, daily_ts_outfile,
                 dow=[1, 2, 3], startDate='1900-01-01', endDate='2100-01-01',
                 startingTripCount=1, startingTsCount=0):
        """
        Constructor.
        """
        # set the relevant files
        self.trip_outfile = trip_outfile
        self.ts_outfile = ts_outfile

        # open the data stores
        self.sfmuni_store = pd.HDFStore(sfmuni_file)
        self.gtfs_store = pd.HDFStore(gtfs_outfile)

        # which days of week to run for
        self.dow = dow

        # helper for creating data aggregations
        self.aggregator = SFMuniDataAggregator(
            daily_trip_outfile=daily_trip_outfile,
            daily_ts_outfile=daily_ts_outfile)

        # count the trips and trip-stops to ensure a unique index
        self.tripCount = startingTripCount
        self.tsCount = startingTsCount

        # get the list of all observed dates
        observedDates = self.sfmuni_store.select_column('sample', 'DATE').unique()
        self.dateList = []
        for d in sorted(observedDates):
            date = pd.Timestamp(d)
            if (date >= pd.Timestamp(startDate)
                    and date <= pd.Timestamp(endDate)):
                self.dateList.append(date)

        print('SFMuniDataExpander set up for ', len(self.dateList),
              ' observed dates between ', self.dateList[0],
              ' and ', self.dateList[-1])
    def createMonthlySystemTotals(self, outfile, inkey, outkey):
        """
        Converts from the daily system totals to the monthly system totals.
        """
        print('Calculating monthly totals')

        outstore = pd.HDFStore(outfile)
        if '/' + outkey in outstore.keys():
            outstore.remove(outkey)

        # determine the system totals, grouped by month
        df = outstore.get(inkey)
        aggregator = SFMuniDataAggregator()

        AGGREGATION_RULES = [
            ['TRIPS',       'TRIPS',       'mean', 'system', 'int64',   0],
            ['STOPS',       'STOPS',       'mean', 'system', 'int64',   0],
            ['TRIP_STOPS',  'TRIP_STOPS',  'mean', 'system', 'int64',   0],
            ['FARE',        'FARE',        'mean', 'system', 'float64', 0],
            ['HEADWAY_S',   'HEADWAY_S',   'mean', 'system', 'float64', 0],
            ['SERVMILES_S', 'SERVMILES_S', 'mean', 'system', 'float64', 0],
            ['DWELL_S',     'DWELL_S',     'mean', 'system', 'float64', 0],
            ['RUNTIME_S',   'RUNTIME_S',   'mean', 'system', 'float64', 0],
            ['TOTTIME_S',   'TOTTIME_S',   'mean', 'system', 'float64', 0],
            ['RUNSPEED_S',  'RUNSPEED_S',  'mean', 'system', 'float64', 0],
            ['TOTSPEED_S',  'TOTSPEED_S',  'mean', 'system', 'float64', 0]
        ]

        aggdf, stringLengths = aggregator.aggregateTransitRecords(
            df,
            groupby=['MONTH', 'DOW', 'AGENCY_ID', 'ROUTE_TYPE'],
            columnSpecs=AGGREGATION_RULES)

        # write the data
        outstore.append(outkey, aggdf, data_columns=True,
                        min_itemsize=stringLengths)
        outstore.close()
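    # Illustrative usage (a sketch, not taken from the original pipeline;
    # 'helper' stands for an instance of the class these methods belong to,
    # and the HDF file name and keys are assumed for the example):
    #
    #   helper.createMonthlySystemTotals('gtfs.h5',
    #                                    inkey='systemDayTotals',
    #                                    outkey='systemMonthTotals')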
    def getAggDf(self, instore, inkey):
        # determine the system totals, grouped by schedule dates
        detailed_df = instore.get(inkey)
        aggregator = SFMuniDataAggregator()

        AGGREGATION_RULES = [
            ['TRIPS',       'TRIP_ID',     aggregator.countUnique, 'system', 'int64',   0],
            ['STOPS',       'STOP_ID',     aggregator.countUnique, 'system', 'int64',   0],
            ['TRIP_STOPS',  'TRIP_STOPS',  'sum',  'system', 'int64',   0],
            ['FARE',        'FARE',        'mean', 'system', 'float64', 0],
            ['HEADWAY_S',   'HEADWAY_S',   'mean', 'system', 'float64', 0],
            ['SERVMILES_S', 'SERVMILES_S', 'sum',  'system', 'float64', 0],
            ['DWELL_S',     'DWELL_S',     'sum',  'system', 'float64', 0],
            ['RUNTIME_S',   'RUNTIME_S',   'sum',  'system', 'float64', 0],
            ['TOTTIME_S',   'TOTTIME_S',   'sum',  'system', 'float64', 0],
            ['RUNSPEED_S',  'RUNSPEED_S',  'mean', 'system', 'float64', 0],
            ['TOTSPEED_S',  'TOTSPEED_S',  'mean', 'system', 'float64', 0]
        ]

        aggdf, stringLengths = aggregator.aggregateTransitRecords(
            detailed_df,
            groupby=['SCHED_DATES', 'DOW', 'SERVICE_ID', 'AGENCY_ID', 'ROUTE_TYPE'],
            columnSpecs=AGGREGATION_RULES)

        return aggdf, stringLengths
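    # Note: countUnique above is a method of SFMuniDataAggregator defined
    # elsewhere in the package.  Judging from its use as an aggregation
    # function (e.g. producing TRIPS from TRIP_ID), it presumably behaves
    # like a sketch along these lines:
    #
    #   def countUnique(self, series):
    #       return len(series.unique())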
    def createDailySystemTotals(self, infiles, outfile, inkey, outkey):
        """
        Converts from the detailed schedule information to the daily system totals.
        """
        outstore = pd.HDFStore(outfile)
        if '/' + outkey in outstore.keys():
            outstore.remove(outkey)

        # determine the system totals, grouped by schedule dates
        detailed_df = outstore.get(inkey)
        aggregator = SFMuniDataAggregator()

        AGGREGATION_RULES = [
            ['TRIPS',       'TRIP_ID',     aggregator.countUnique, 'system', 'int64',   0],
            ['STOPS',       'STOP_ID',     aggregator.countUnique, 'system', 'int64',   0],
            ['TRIP_STOPS',  'TRIP_STOPS',  'sum',  'system', 'int64',   0],
            ['FARE',        'FARE',        'mean', 'system', 'float64', 0],
            ['HEADWAY_S',   'HEADWAY_S',   'mean', 'system', 'float64', 0],
            ['SERVMILES_S', 'SERVMILES_S', 'sum',  'system', 'float64', 0],
            ['DWELL_S',     'DWELL_S',     'sum',  'system', 'float64', 0],
            ['RUNTIME_S',   'RUNTIME_S',   'sum',  'system', 'float64', 0],
            ['TOTTIME_S',   'TOTTIME_S',   'sum',  'system', 'float64', 0],
            ['RUNSPEED_S',  'RUNSPEED_S',  'mean', 'system', 'float64', 0],
            ['TOTSPEED_S',  'TOTSPEED_S',  'mean', 'system', 'float64', 0]
        ]

        aggdf, stringLengths = aggregator.aggregateTransitRecords(
            detailed_df,
            groupby=['SCHED_DATES', 'DOW', 'SERVICE_ID', 'AGENCY_ID', 'ROUTE_TYPE'],
            columnSpecs=AGGREGATION_RULES)

        # use the GTFS files to determine the service in operation for each date
        for infile in infiles:
            print('\n\nReading ', infile)
            self.establishTransitFeed(infile)

            # loop through each date, and add the appropriate service to the database
            gtfsDateRange = self.schedule.GetDateRange()
            dateRangeString = str(gtfsDateRange[0]) + '-' + str(gtfsDateRange[1])
            gtfsStartDate = pd.to_datetime(gtfsDateRange[0], format='%Y%m%d')
            gtfsEndDate = pd.to_datetime(gtfsDateRange[1], format='%Y%m%d')

            # note that the last date is not included, hence the +1 increment
            servicePeriodsEachDate = self.schedule.GetServicePeriodsActiveEachDate(
                gtfsStartDate, gtfsEndDate + pd.DateOffset(days=1))

            for date, servicePeriodsForDate in servicePeriodsEachDate:
                print(' Processing ', date)

                # current month
                month = ((pd.to_datetime(date)).to_period('M')).to_timestamp()

                # figure out the day of week based on the schedule in operation
                dow = 1
                for period in servicePeriodsForDate:
                    servIdString = str(period.service_id).strip().upper()
                    if servIdString == 'SAT' or servIdString == '2':
                        dow = 2
                    if servIdString == 'SUN' or servIdString == '3':
                        dow = 3

                # select and append the appropriate aggregated records for this date
                for period in servicePeriodsForDate:
                    servIdString = str(period.service_id).strip().upper()
                    records = aggdf[(aggdf['SCHED_DATES'] == dateRangeString)
                                    & (aggdf['SERVICE_ID'] == servIdString)].copy()
                    records['DOW'] = dow
                    records['DATE'] = date
                    records['MONTH'] = month

                    # write the data
                    outstore.append(outkey, records, data_columns=True,
                                    min_itemsize=stringLengths)

        outstore.close()
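    # The day-of-week coding used above (and in the dow= constructor argument
    # further down) appears to follow the GTFS service_ids in this feed:
    # 1 = weekday, 2 = Saturday, 3 = Sunday.  A standalone sketch of the same
    # mapping, for reference only:
    #
    #   def serviceIdToDow(service_id):
    #       s = str(service_id).strip().upper()
    #       if s in ('SAT', '2'):
    #           return 2
    #       if s in ('SUN', '3'):
    #           return 3
    #       return 1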
class SFMuniDataExpander():
    """
    Methods for expanding SFMuniData to the GTFS data and weighting it.
    """

    # specifies how to read in each column from raw input files
    #   columnName, stringLength, index(0/1), source('gtfs', 'avl', 'join' or 'calculated')
    COLUMNS = [
        ['MONTH',              0, 0, 'gtfs'],        # Calendar attributes
        ['DATE',               0, 1, 'gtfs'],
        ['DOW',                0, 1, 'gtfs'],
        ['TOD',               10, 1, 'gtfs'],
        ['AGENCY_ID',         10, 0, 'join'],        # for matching to AVL data
        ['ROUTE_SHORT_NAME',  32, 1, 'join'],
        ['ROUTE_LONG_NAME',   32, 1, 'gtfs'],        # can have case/spelling differences on long name
        ['DIR',                0, 1, 'join'],
        ['TRIP',               0, 1, 'join'],
        ['SEQ',                0, 1, 'join'],
        ['TRIP_STOPS',         0, 0, 'gtfs'],        # total number of trip-stops
        ['OBSERVED',           0, 0, 'gtfs'],        # observed in AVL data?
        ['ROUTE_TYPE',         0, 0, 'gtfs'],        # route/trip attributes
        ['TRIP_HEADSIGN',     64, 0, 'gtfs'],
        ['HEADWAY_S',          0, 0, 'gtfs'],
        ['FARE',               0, 0, 'gtfs'],
        ['PATTCODE',          10, 0, 'avl'],
        ['STOPNAME',          32, 0, 'gtfs'],        # stop attributes
        ['STOPNAME_AVL',      32, 0, 'avl'],
        ['STOP_LAT',           0, 0, 'gtfs'],
        ['STOP_LON',           0, 0, 'gtfs'],
        ['SOL',                0, 0, 'gtfs'],
        ['EOL',                0, 0, 'gtfs'],
        ['TIMEPOINT',          0, 0, 'avl'],
        ['ARRIVAL_TIME_S',     0, 0, 'gtfs'],        # times
        ['ARRIVAL_TIME',       0, 0, 'avl'],
        ['ARRIVAL_TIME_DEV',   0, 0, 'calculated'],
        ['DEPARTURE_TIME_S',   0, 0, 'gtfs'],
        ['DEPARTURE_TIME',     0, 0, 'avl'],
        ['DEPARTURE_TIME_DEV', 0, 0, 'calculated'],
        ['DWELL_S',            0, 0, 'gtfs'],
        ['DWELL',              0, 0, 'avl'],
        ['RUNTIME_S',          0, 0, 'gtfs'],
        ['RUNTIME',            0, 0, 'avl'],
        ['TOTTIME_S',          0, 0, 'gtfs'],
        ['TOTTIME',            0, 0, 'avl'],
        ['SERVMILES_S',        0, 0, 'gtfs'],
        ['SERVMILES',          0, 0, 'avl'],         # Distances and speeds
        ['RUNSPEED_S',         0, 0, 'gtfs'],
        ['RUNSPEED',           0, 0, 'calculated'],
        ['TOTSPEED_S',         0, 0, 'gtfs'],
        ['TOTSPEED',           0, 0, 'calculated'],
        ['ONTIME5',            0, 0, 'calculated'],
        ['ON',                 0, 0, 'avl'],         # ridership
        ['OFF',                0, 0, 'avl'],
        ['LOAD_ARR',           0, 0, 'avl'],
        ['LOAD_DEP',           0, 0, 'avl'],
        ['PASSMILES',          0, 0, 'calculated'],
        ['PASSHOURS',          0, 0, 'calculated'],
        ['WAITHOURS',          0, 0, 'calculated'],
        ['FULLFARE_REV',       0, 0, 'calculated'],  # revenue if all passengers paid full fare
        ['PASSDELAY_DEP',      0, 0, 'calculated'],
        ['PASSDELAY_ARR',      0, 0, 'calculated'],
        ['RDBRDNGS',           0, 0, 'avl'],
        ['CAPACITY',           0, 0, 'avl'],
        ['DOORCYCLES',         0, 0, 'avl'],
        ['WHEELCHAIR',         0, 0, 'avl'],
        ['BIKERACK',           0, 0, 'avl'],
        ['VC',                 0, 0, 'calculated'],  # crowding
        ['CROWDED',            0, 0, 'calculated'],
        ['CROWDHOURS',         0, 0, 'calculated'],
        ['ROUTE_ID',           0, 0, 'gtfs'],        # additional IDs
        ['ROUTE_AVL',          0, 0, 'avl'],
        ['TRIP_ID',            0, 0, 'gtfs'],
        ['STOP_ID',            0, 0, 'gtfs'],
        ['STOP_AVL',           0, 0, 'avl'],
        ['BLOCK_ID',           0, 0, 'gtfs'],
        ['SHAPE_ID',           0, 0, 'gtfs'],
        ['SHAPE_DIST',         0, 0, 'gtfs'],
        ['VEHNO',              0, 0, 'avl'],
        ['SCHED_DATES',       20, 0, 'gtfs']         # range of this GTFS schedule
    ]

    def __init__(self, sfmuni_file, trip_outfile, ts_outfile,
                 daily_trip_outfile, daily_ts_outfile,
                 dow=[1, 2, 3], startDate='1900-01-01', endDate='2100-01-01',
                 startingTripCount=1, startingTsCount=0):
        """
        Constructor.
        """
        # set the relevant files
        self.trip_outfile = trip_outfile
        self.ts_outfile = ts_outfile

        # open the data stores
        self.sfmuni_store = pd.HDFStore(sfmuni_file)

        # which days of week to run for
        self.dow = dow

        # helper for creating data aggregations
        self.aggregator = SFMuniDataAggregator(
            daily_trip_outfile=daily_trip_outfile,
            daily_ts_outfile=daily_ts_outfile)

        # count the trips and trip-stops to ensure a unique index
        self.tripCount = startingTripCount
        self.tsCount = startingTsCount

        # get the list of all observed dates
        observedDates = self.sfmuni_store.select_column('sample', 'DATE').unique()
        self.dateList = []
        for d in sorted(observedDates):
            date = pd.Timestamp(d)
            if (date >= pd.Timestamp(startDate)
                    and date <= pd.Timestamp(endDate)):
                self.dateList.append(date)

        print('SFMuniDataExpander set up for ', len(self.dateList),
              ' observed dates between ', self.dateList[0],
              ' and ', self.dateList[-1])

    def closeStores(self):
        """
        Closes all datastores.
        """
        self.sfmuni_store.close()
        self.aggregator.close()

    def expandAndWeight(self, gtfs_file):
        """
        Reads GTFS, cleans it, processes it, and writes it to an HDF5 file.
        This will be done for every individual day, so you get a list of
        every bus that runs.

        infile  - in GTFS format
        outfile - output file name in h5 format, same as AVL/APC format
        """
        print(datetime.datetime.now(), 'Converting raw data in file: ', gtfs_file)

        # establish the feed, reading only the bus routes
        gtfsHelper = GTFSHelper()
        gtfsHelper.establishTransitFeed(gtfs_file)

        # create dictionary with one dataframe for each service period
        dataframes = {}
        servicePeriods = gtfsHelper.schedule.GetServicePeriodList()
        for period in servicePeriods:
            if int(period.service_id) in self.dow:
                dataframes[period.service_id] = gtfsHelper.getGTFSDataFrame(
                    period, route_types=[3])

        # loop through each date, and add the appropriate service to the database
        gtfsDateRange = gtfsHelper.schedule.GetDateRange()
        gtfsStartDate = pd.to_datetime(gtfsDateRange[0], format='%Y%m%d')
        gtfsEndDate = pd.to_datetime(gtfsDateRange[1], format='%Y%m%d')

        # note that the last date is not included, hence the +1 increment
        servicePeriodsEachDate = gtfsHelper.schedule.GetServicePeriodsActiveEachDate(
            gtfsStartDate, gtfsEndDate + pd.DateOffset(days=1))

        print('Writing data for periods from ', gtfsStartDate, ' to ', gtfsEndDate)

        for date, servicePeriodsForDate in servicePeriodsEachDate:
            if pd.Timestamp(date) in self.dateList:
                print(datetime.datetime.now(), ' Processing ', date)

                # use a separate file for each year
                # and write a separate table for each month and DOW
                # format of the table name is mYYYYMMDDdX, where X is the day of week
                month = ((pd.to_datetime(date)).to_period('M')).to_timestamp()
                trip_outstore = pd.HDFStore(getOutfile(self.trip_outfile, month))
                ts_outstore = pd.HDFStore(getOutfile(self.ts_outfile, month))

                for period in servicePeriodsForDate:
                    if int(period.service_id) in self.dow:
                        outkey = getOutkey(month=month,
                                           dow=period.service_id,
                                           prefix='m')

                        # get the corresponding MUNI data for this date
                        sfmuni = self.getSFMuniData(date)

                        # get the corresponding GTFS dataframe
                        df = dataframes[period.service_id]

                        # update the dates
                        df['ARRIVAL_TIME_S'] = date + (df['ARRIVAL_TIME_S'] - df['DATE'])
                        df['DEPARTURE_TIME_S'] = date + (df['DEPARTURE_TIME_S'] - df['DATE'])
                        df['DATE'] = date
                        df['MONTH'] = month

                        # join the sfmuni data
                        joined = self.joinSFMuniData(df, sfmuni)

                        # aggregate from trip-stops to trips
                        trips = self.aggregator.aggregateToTrips(joined)

                        # set a unique trip index
                        trips.index = self.tripCount + pd.Series(range(0, len(trips)))
                        self.tripCount += len(trips)

                        # weight the trips
                        trips = self.weightTrips(trips)

                        # write the trips
                        stringLengths = self.getStringLengths(trips.columns)
                        trip_outstore.append(outkey, trips, data_columns=True,
                                             min_itemsize=stringLengths)

                        # add weights to trip-stop df
                        mergeFields = ['DATE', 'TOD', 'AGENCY_ID',
                                       'ROUTE_SHORT_NAME', 'DIR', 'TRIP']
                        weightFields = ['PATTERN', 'TRIP_WEIGHT', 'TOD_WEIGHT',
                                        'DAY_WEIGHT', 'SYSTEM_WEIGHT']
                        tripWeights = trips[mergeFields + weightFields]
                        ts = pd.merge(joined, tripWeights, how='left',
                                      on=mergeFields, sort=True)

                        # set a unique trip-stop index
                        ts.index = self.tsCount + pd.Series(range(0, len(ts)))
                        self.tsCount += len(ts)

                        # write the trip-stops
                        stringLengths = self.getStringLengths(ts.columns)
                        ts_outstore.append(outkey, ts, data_columns=True,
                                           min_itemsize=stringLengths)

                        # aggregate to TOD and daily totals, and write those
                        self.aggregator.aggregateTripsToDays(trips)
                        self.aggregator.aggregateTripStopsToDays(ts)

                trip_outstore.close()
                ts_outstore.close()

    def getSFMuniData(self, date):
        """
        Returns a dataframe with the observed SFMuni records
        and some processing of those.
        """
        sfmuni = self.sfmuni_store.select('sample', where='DATE==Timestamp(date)')
        sfmuni.index = pd.Series(range(0, len(sfmuni)))

        # drop duplicates, which would get double-counted
        sfmuni = sfmuni.drop_duplicates(subset=[
            'AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR', 'PATTCODE', 'TRIP', 'SEQ'])

        # update the TRIP id in case there are multiple trips with different
        # patterns leaving a different stop at the same time
        groupby = ['AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR', 'PATTCODE', 'TRIP']
        sfmuni = sfmuni.groupby(groupby, as_index=False).apply(updateTripId)

        # calculate observed RUNTIME
        # happens here because the values in the AVL data look screwy
        groupby = ['AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR', 'TRIP']
        sfmuni = sfmuni.groupby(groupby, as_index=False).apply(calculateRuntime)
        sfmuni['TOTTIME'] = sfmuni['RUNTIME'] + sfmuni['DWELL']

        # speed
        speedInput = pd.Series(list(zip(sfmuni['SERVMILES'], sfmuni['RUNTIME'])),
                               index=sfmuni.index)
        sfmuni['RUNSPEED'] = speedInput.apply(updateSpeeds)

        speedInput = pd.Series(list(zip(sfmuni['SERVMILES'], sfmuni['TOTTIME'])),
                               index=sfmuni.index)
        sfmuni['TOTSPEED'] = speedInput.apply(updateSpeeds)

        return sfmuni

    def joinSFMuniData(self, gtfs, sfmuni):
        """
        Left join from GTFS to SFMuni sample.
        gtfs_file      - HDF file containing processed GTFS data
        sfmuni_file    - HDF file containing processed SFMuni data, just for sampled routes
        joined_outfile - HDF file containing merged GTFS and SFMuni data
        """
        # convert column specs
        colnames = []
        indexColumns = []
        joinFields = []
        sources = {}
        for col in self.COLUMNS:
            name = col[0]
            index = col[2]
            source = col[3]

            colnames.append(name)
            sources[name] = source
            if index == 1:
                indexColumns.append(name)
            if source == 'join':
                joinFields.append(name)

        sfmuni['OBSERVED'] = 1

        # join
        try:
            joined = pd.merge(gtfs, sfmuni, how='left', on=joinFields,
                              suffixes=('', '_AVL'), sort=True)
        except KeyError:
            print(joinFields)
            print(gtfs.info())
            print(gtfs.head())
            print(sfmuni.info())
            print(sfmuni.head())
            raise

        # calculate other derived fields
        # observations
        joined['OBSERVED'] = np.where(joined['OBSERVED_AVL'] == 1, 1, 0)

        # normalize to consistent measure of service miles
        joined['SERVMILES'] = joined['SERVMILES_S']

        # schedule deviation
        arrTime = pd.Series(list(zip(joined['ARRIVAL_TIME'], joined['ARRIVAL_TIME_S'])),
                            index=joined.index)
        depTime = pd.Series(list(zip(joined['DEPARTURE_TIME'], joined['DEPARTURE_TIME_S'])),
                            index=joined.index)
        joined['ARRIVAL_TIME_DEV'] = arrTime.apply(getScheduleDeviation)
        joined['DEPARTURE_TIME_DEV'] = depTime.apply(getScheduleDeviation)

        # on-time defined consistent with TCRP 165
        joined['ONTIME5'] = np.where((joined['DEPARTURE_TIME_DEV'] > -1.0) &
                                     (joined['ARRIVAL_TIME_DEV'] < 5.0), 1, 0)
        joined['ONTIME5'] = joined['ONTIME5'].mask(joined['OBSERVED'] == 0,
                                                   other=np.nan)

        # passenger miles traveled
        joined['PASSMILES'] = joined['LOAD_ARR'] * joined['SERVMILES']

        # passenger hours -- scheduled time
        joined['PASSHOURS'] = (joined['LOAD_ARR'] * joined['RUNTIME']
                               + joined['LOAD_DEP'] * joined['DWELL']).values / 60.0

        # passenger hours of waiting time -- scheduled time
        joined['WAITHOURS'] = (joined['ON'] * 0.5 * joined['HEADWAY_S']).values / 60.0

        # fare paid, if each boarding pays full fare
        joined['FULLFARE_REV'] = (joined['ON'] * joined['FARE'])

        # passenger hours of delay at departure
        joined['PASSDELAY_DEP'] = np.where(joined['DEPARTURE_TIME_DEV'] > 0,
                                           joined['ON'] * joined['DEPARTURE_TIME_DEV'], 0)
        joined['PASSDELAY_DEP'] = joined['PASSDELAY_DEP'].mask(joined['OBSERVED'] == 0,
                                                               other=np.nan)

        # passenger hours of delay at arrival
        joined['PASSDELAY_ARR'] = np.where(joined['ARRIVAL_TIME_DEV'] > 0,
                                           joined['ON'] * joined['ARRIVAL_TIME_DEV'], 0)
        joined['PASSDELAY_ARR'] = joined['PASSDELAY_ARR'].mask(joined['OBSERVED'] == 0,
                                                               other=np.nan)

        # volume-capacity ratio
        joined['VC'] = joined['LOAD_ARR'].values / joined['CAPACITY'].values

        # crowded if VC > 0.85
        # the capacity is the 'crush' load, so we are defining
        # crowding as 85% of that capacity.  In TCRP 165, this
        # corresponds approximately to the range of 125-150% of
        # the seated load, which is the maximum design load for
        # peak of the peak conditions.
        joined['CROWDED'] = np.where(joined['VC'] > 0.85, 1.0, 0.0)
        joined['CROWDED'] = joined['CROWDED'].mask(joined['OBSERVED'] == 0,
                                                   other=np.nan)
        joined['CROWDHOURS'] = (joined['CROWDED'] * (joined['LOAD_ARR'] * joined['RUNTIME']
                                + joined['LOAD_DEP'] * joined['DWELL'])).values / 60.0

        # keep only relevant columns, sorted
        joined.sort_values(by=indexColumns, inplace=True)
        joined = joined[colnames]

        return joined

    def weightTrips(self, trips):
        """
        Adds a series of weight columns to the trip df based on the
        ratio of total to observed trips.
        """
        # start with all observations weighted equally
        trips['TRIPS'] = 1
        trips['TRIP_WEIGHT'] = trips['OBSERVED'].mask(trips['OBSERVED'] == 0,
                                                      other=np.nan)

        # add the weight columns, specific to the level of aggregation
        # the weights build upon the lower-level weights, so we scale
        # the lower-level weights up uniformly within the group

        # routes
        trips['TOD_WEIGHT'] = calcWeights(
            trips,
            groupby=['DATE', 'TOD', 'AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR'],
            oldWeight='TRIP_WEIGHT')

        trips['DAY_WEIGHT'] = calcWeights(
            trips,
            groupby=['DATE', 'AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR'],
            oldWeight='TOD_WEIGHT')

        # system
        trips['SYSTEM_WEIGHT'] = calcWeights(
            trips,
            groupby=['DATE', 'TOD', 'AGENCY_ID'],
            oldWeight='DAY_WEIGHT')

        return trips

    def getStringLengths(self, usedColumns):
        """
        Gets the max string length for the columns that are in use.
        """
        # convert column specs
        stringLengths = {}
        for col in self.COLUMNS:
            name = col[0]
            if name in usedColumns:
                stringLength = col[1]
                if (stringLength > 0):
                    stringLengths[name] = stringLength

        return stringLengths
    sfmuniExpander = SFMuniDataExpander(sfmuni_file=CLEANED_OUTFILE,
                                        trip_outfile=EXPANDED_TRIP_OUTFILE,
                                        ts_outfile=EXPANDED_TS_OUTFILE,
                                        daily_trip_outfile=DAILY_TRIP_OUTFILE,
                                        daily_ts_outfile=DAILY_TS_OUTFILE,
                                        dow=[1],
                                        startDate='2000-01-01')

    for infile in RAW_GTFS_FILES:
        sfmuniExpander.expandAndWeight(infile)

    print('Finished expanding to GTFS in ',
          (datetime.datetime.now() - startTime))

# aggregate to monthly totals
if 'aggregate' in STEPS_TO_RUN:
    startTime = datetime.datetime.now()

    aggregator = SFMuniDataAggregator()
    aggregator.aggregateTripsToMonths(DAILY_TRIP_OUTFILE, MONTHLY_TRIP_OUTFILE)
    aggregator.aggregateTripStopsToMonths(DAILY_TS_OUTFILE, MONTHLY_TS_OUTFILE)

    print('Finished aggregations in ',
          (datetime.datetime.now() - startTime))

# process GTFS schedule data
if 'gtfs' in STEPS_TO_RUN:
    startTime = datetime.datetime.now()

    gtfsHelper = GTFSHelper()
    gtfsHelper.processFiles(RAW_GTFS_FILES, GTFS_OUTFILE, 'sfmuni')
    gtfsHelper.processFiles(BART_GTFS_FILES, GTFS_OUTFILE, 'bart')

    print('Finished processing GTFS data ',
          (datetime.datetime.now() - startTime))
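# --- Illustrative sketch, not part of the original code ----------------------
# calcWeights() is called by SFMuniDataExpander.weightTrips() above but is
# defined elsewhere in the package.  A minimal version consistent with that
# docstring ("ratio of total to observed trips", scaling the lower-level
# weights up uniformly within each group) might look roughly like the function
# below; the name is changed so it cannot shadow the real implementation.
def calcWeightsSketch(df, groupby, oldWeight):
    """Scale existing weights so each group's weighted total equals its trip count."""
    def scale(group):
        observed = group[oldWeight].sum()   # weighted sum over observed trips
        total = group['TRIPS'].sum()        # all trips in the group, observed or not
        if observed > 0:
            return group[oldWeight] * (total / observed)
        return group[oldWeight]             # nothing observed: weights stay missing
    return df.groupby(groupby, group_keys=False).apply(scale)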