Code Example #1
    def __init__(self,
                 sfmuni_file,
                 gtfs_outfile,
                 trip_outfile,
                 ts_outfile,
                 daily_trip_outfile,
                 daily_ts_outfile,
                 dow=[1, 2, 3],
                 startDate='1900-01-01',
                 endDate='2100-01-01',
                 startingTripCount=1,
                 startingTsCount=0):
        """
        Constructor.                 
        """

        # set the relevant files
        self.trip_outfile = trip_outfile
        self.ts_outfile = ts_outfile

        # open the data stores
        self.sfmuni_store = pd.HDFStore(sfmuni_file)
        self.gtfs_store = pd.HDFStore(gtfs_outfile)

        # which days of week to run for
        self.dow = dow

        # helper for creating data aggregations
        self.aggregator = SFMuniDataAggregator(
            daily_trip_outfile=daily_trip_outfile,
            daily_ts_outfile=daily_ts_outfile)

        # count the trips and trip-stops to ensure a unique index
        self.tripCount = startingTripCount
        self.tsCount = startingTsCount

        # get the list of all observed dates
        observedDates = self.sfmuni_store.select_column('sample',
                                                        'DATE').unique()

        self.dateList = []
        for d in sorted(observedDates):
            date = pd.Timestamp(d)
            if (date >= pd.Timestamp(startDate)
                    and date <= pd.Timestamp(endDate)):
                self.dateList.append(date)

        print('SFMuniDataExpander set up for ', len(self.dateList),
              ' observed dates between ', self.dateList[0], ' and ',
              self.dateList[len(self.dateList) - 1])
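For context, a minimal usage sketch of the constructor above; every file name here is a hypothetical placeholder, not taken from the project:

    # Hypothetical usage sketch; all file names are placeholders.
    expander = SFMuniDataExpander(
        sfmuni_file='sfmuni_cleaned.h5',
        gtfs_outfile='gtfs.h5',
        trip_outfile='expanded_trips.h5',
        ts_outfile='expanded_trip_stops.h5',
        daily_trip_outfile='daily_trips.h5',
        daily_ts_outfile='daily_trip_stops.h5',
        dow=[1],  # weekdays only
        startDate='2012-01-01',
        endDate='2012-12-31')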
Code Example #2
File: GTFSHelper.py  Project: UCL/sfdata_wrangler
    def createMonthlySystemTotals(self, outfile, inkey, outkey):
        """
        Converts from the detailed schedule information to the 
        daily system totals.
        
        """

        print('Calculating monthly totals')

        outstore = pd.HDFStore(outfile)
        if '/' + outkey in outstore.keys():
            outstore.remove(outkey)

        # determine the system totals, grouped by schedule dates
        df = outstore.get(inkey)
        aggregator = SFMuniDataAggregator()
        AGGREGATION_RULES = [
            ['TRIPS', 'TRIPS', 'mean', 'system', 'int64', 0],
            ['STOPS', 'STOPS', 'mean', 'system', 'int64', 0],
            ['TRIP_STOPS', 'TRIP_STOPS', 'mean', 'system', 'int64', 0],
            ['FARE', 'FARE', 'mean', 'system', 'float64', 0],
            ['HEADWAY_S', 'HEADWAY_S', 'mean', 'system', 'float64', 0],
            ['SERVMILES_S', 'SERVMILES_S', 'mean', 'system', 'float64', 0],
            ['DWELL_S', 'DWELL_S', 'mean', 'system', 'float64', 0],
            ['RUNTIME_S', 'RUNTIME_S', 'mean', 'system', 'float64', 0],
            ['TOTTIME_S', 'TOTTIME_S', 'mean', 'system', 'float64', 0],
            ['RUNSPEED_S', 'RUNSPEED_S', 'mean', 'system', 'float64', 0],
            ['TOTSPEED_S', 'TOTSPEED_S', 'mean', 'system', 'float64', 0]
        ]
        aggdf, stringLengths = aggregator.aggregateTransitRecords(
            df,
            groupby=['MONTH', 'DOW', 'AGENCY_ID', 'ROUTE_TYPE'],
            columnSpecs=AGGREGATION_RULES)

        # write the data
        outstore.append(outkey,
                        aggdf,
                        data_columns=True,
                        min_itemsize=stringLengths)

        outstore.close()
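A brief usage sketch: per the file attribution, this method lives on GTFSHelper. The file and key names below are hypothetical placeholders:

    # Hypothetical file and key names; the real ones come from the
    # calling script.
    helper = GTFSHelper()
    helper.createMonthlySystemTotals('gtfs.h5',
                                     inkey='system_day',
                                     outkey='system_month')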
Code Example #3
File: GTFSHelper.py  Project: UCL/sfdata_wrangler
    def getAggDf(self, instore, inkey):

        # determine the system totals, grouped by schedule dates
        detailed_df = instore.get(inkey)
        aggregator = SFMuniDataAggregator()
        AGGREGATION_RULES = [
            ['TRIPS', 'TRIP_ID', aggregator.countUnique, 'system', 'int64', 0],
            ['STOPS', 'STOP_ID', aggregator.countUnique, 'system', 'int64', 0],
            ['TRIP_STOPS', 'TRIP_STOPS', 'sum', 'system', 'int64', 0],
            ['FARE', 'FARE', 'mean', 'system', 'float64', 0],
            ['HEADWAY_S', 'HEADWAY_S', 'mean', 'system', 'float64', 0],
            ['SERVMILES_S', 'SERVMILES_S', 'sum', 'system', 'float64', 0],
            ['DWELL_S', 'DWELL_S', 'sum', 'system', 'float64', 0],
            ['RUNTIME_S', 'RUNTIME_S', 'sum', 'system', 'float64', 0],
            ['TOTTIME_S', 'TOTTIME_S', 'sum', 'system', 'float64', 0],
            ['RUNSPEED_S', 'RUNSPEED_S', 'mean', 'system', 'float64', 0],
            ['TOTSPEED_S', 'TOTSPEED_S', 'mean', 'system', 'float64', 0]
        ]
        aggdf, stringLengths = aggregator.aggregateTransitRecords(
            detailed_df,
            groupby=[
                'SCHED_DATES', 'DOW', 'SERVICE_ID', 'AGENCY_ID', 'ROUTE_TYPE'
            ],
            columnSpecs=AGGREGATION_RULES)

        return aggdf, stringLengths
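Each entry in AGGREGATION_RULES appears to follow the pattern [output column, input column, aggregation function, level, dtype, string length]. A rough pandas equivalent of how aggregateTransitRecords might consume such a spec is sketched below; this is an assumption for illustration, not the project's actual implementation:

    import pandas as pd

    def apply_rules(df, groupby, rules):
        # rules: [outName, inName, aggFunc, level, dtype, stringLength]
        spec = {out: (col, func) for out, col, func, _lvl, _dt, _sl in rules}
        agg = df.groupby(groupby).agg(**spec)
        # Cast each output to its declared dtype; the real implementation
        # presumably also handles NaN before any cast to int64.
        for out, _col, _func, _lvl, dtype, _sl in rules:
            agg[out] = agg[out].astype(dtype)
        return agg.reset_index()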
Code Example #4
    def createDailySystemTotals(self, infiles, outfile, inkey, outkey):
        """
        Converts from the detailed schedule information to the 
        daily system totals.
        
        """

        outstore = pd.HDFStore(outfile)
        if '/' + outkey in outstore.keys():
            outstore.remove(outkey)

        # determine the system totals, grouped by schedule dates
        detailed_df = outstore.get(inkey)
        aggregator = SFMuniDataAggregator()
        AGGREGATION_RULES = [
            ['TRIPS', 'TRIP_ID', aggregator.countUnique, 'system', 'int64', 0],
            ['STOPS', 'STOP_ID', aggregator.countUnique, 'system', 'int64', 0],
            ['TRIP_STOPS', 'TRIP_STOPS', 'sum', 'system', 'int64', 0],
            ['FARE', 'FARE', 'mean', 'system', 'float64', 0],
            ['HEADWAY_S', 'HEADWAY_S', 'mean', 'system', 'float64', 0],
            ['SERVMILES_S', 'SERVMILES_S', 'sum', 'system', 'float64', 0],
            ['DWELL_S', 'DWELL_S', 'sum', 'system', 'float64', 0],
            ['RUNTIME_S', 'RUNTIME_S', 'sum', 'system', 'float64', 0],
            ['TOTTIME_S', 'TOTTIME_S', 'sum', 'system', 'float64', 0],
            ['RUNSPEED_S', 'RUNSPEED_S', 'mean', 'system', 'float64', 0],
            ['TOTSPEED_S', 'TOTSPEED_S', 'mean', 'system', 'float64', 0]
        ]
        aggdf, stringLengths = aggregator.aggregateTransitRecords(
            detailed_df,
            groupby=[
                'SCHED_DATES', 'DOW', 'SERVICE_ID', 'AGENCY_ID', 'ROUTE_TYPE'
            ],
            columnSpecs=AGGREGATION_RULES)

        # use the GTFS files to determine the service in operation for each date
        for infile in infiles:
            print('\n\nReading ', infile)

            self.establishTransitFeed(infile)

            # loop through each date, and add the appropriate service to the database
            gtfsDateRange = self.schedule.GetDateRange()
            dateRangeString = str(gtfsDateRange[0]) + '-' + str(
                gtfsDateRange[1])
            gtfsStartDate = pd.to_datetime(gtfsDateRange[0], format='%Y%m%d')
            gtfsEndDate = pd.to_datetime(gtfsDateRange[1], format='%Y%m%d')

            # note that the last date is not included, hence the +1 increment
            servicePeriodsEachDate = self.schedule.GetServicePeriodsActiveEachDate(
                gtfsStartDate, gtfsEndDate + pd.DateOffset(days=1))

            for date, servicePeriodsForDate in servicePeriodsEachDate:
                print(' Processing ', date)

                # current month
                month = pd.to_datetime(date).to_period('M').to_timestamp()

                # figure out the day of week based on the schedule in operation
                dow = 1
                for period in servicePeriodsForDate:
                    servIdString = str(period.service_id).strip().upper()
                    if servIdString == 'SAT' or servIdString == '2':
                        dow = 2
                    if servIdString == 'SUN' or servIdString == '3':
                        dow = 3

                # select and append the appropriate aggregated records for this date
                for period in servicePeriodsForDate:

                    servIdString = str(period.service_id).strip().upper()

                    # take a copy so the column assignments below do not
                    # operate on a view of aggdf
                    records = aggdf[(aggdf['SCHED_DATES'] == dateRangeString) &
                                    (aggdf['SERVICE_ID'] == servIdString)].copy()

                    records['DOW'] = dow
                    records['DATE'] = date
                    records['MONTH'] = month

                    # write the data
                    outstore.append(outkey,
                                    records,
                                    data_columns=True,
                                    min_itemsize=stringLengths)

        outstore.close()
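The day-of-week resolution inside the date loop above can be read as a small standalone helper; this sketch (a hypothetical function, not in the project) restates the same logic:

    def resolveDow(servicePeriodsForDate):
        # Weekday (1) is the default; Saturday (2) and Sunday (3)
        # schedules override it, matching the loop above.
        dow = 1
        for period in servicePeriodsForDate:
            servId = str(period.service_id).strip().upper()
            if servId in ('SAT', '2'):
                dow = 2
            elif servId in ('SUN', '3'):
                dow = 3
        return dow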
Code Example #5
import datetime

import numpy as np
import pandas as pd

# Project-internal helpers assumed to be imported from sfdata_wrangler:
# SFMuniDataAggregator, GTFSHelper, getOutfile, getOutkey, updateTripId,
# calculateRuntime, updateSpeeds, getScheduleDeviation, calcWeights


class SFMuniDataExpander():
    """ 
    Methods for expanding SFMuniData to the GTFS data and weighting it.  
    
    """

    # specifies how to read in each column from raw input files
    #  columnName,       stringLength, index(0/1), source('gtfs', 'avl', 'join' or 'calculated')
    COLUMNS = [
        ['MONTH', 0, 0, 'gtfs'],  # Calendar attributes
        ['DATE', 0, 1, 'gtfs'],
        ['DOW', 0, 1, 'gtfs'],
        ['TOD', 10, 1, 'gtfs'],
        ['AGENCY_ID', 10, 0, 'join'],  # for matching to AVL data
        ['ROUTE_SHORT_NAME', 32, 1, 'join'],
        ['ROUTE_LONG_NAME', 32, 1,
         'gtfs'],  # can have case/spelling differences on long name
        ['DIR', 0, 1, 'join'],
        ['TRIP', 0, 1, 'join'],
        ['SEQ', 0, 1, 'join'],
        ['TRIP_STOPS', 0, 0, 'gtfs'],  # total number of trip-stops
        ['OBSERVED', 0, 0, 'gtfs'],  # observed in AVL data?
        ['ROUTE_TYPE', 0, 0, 'gtfs'],  # route/trip attributes 
        ['TRIP_HEADSIGN', 64, 0, 'gtfs'],
        ['HEADWAY_S', 0, 0, 'gtfs'],
        ['FARE', 0, 0, 'gtfs'],
        ['PATTCODE', 10, 0, 'avl'],
        ['STOPNAME', 32, 0, 'gtfs'],  # stop attributes
        ['STOPNAME_AVL', 32, 0, 'avl'],
        ['STOP_LAT', 0, 0, 'gtfs'],
        ['STOP_LON', 0, 0, 'gtfs'],
        ['SOL', 0, 0, 'gtfs'],
        ['EOL', 0, 0, 'gtfs'],
        ['TIMEPOINT', 0, 0, 'avl'],
        ['ARRIVAL_TIME_S', 0, 0, 'gtfs'],  # times
        ['ARRIVAL_TIME', 0, 0, 'avl'],
        ['ARRIVAL_TIME_DEV', 0, 0, 'calculated'],
        ['DEPARTURE_TIME_S', 0, 0, 'gtfs'],
        ['DEPARTURE_TIME', 0, 0, 'avl'],
        ['DEPARTURE_TIME_DEV', 0, 0, 'calculated'],
        ['DWELL_S', 0, 0, 'gtfs'],
        ['DWELL', 0, 0, 'avl'],
        ['RUNTIME_S', 0, 0, 'gtfs'],
        ['RUNTIME', 0, 0, 'avl'],
        ['TOTTIME_S', 0, 0, 'gtfs'],
        ['TOTTIME', 0, 0, 'avl'],
        ['SERVMILES_S', 0, 0, 'gtfs'],
        ['SERVMILES', 0, 0, 'avl'],  # Distances and speeds
        ['RUNSPEED_S', 0, 0, 'gtfs'],
        ['RUNSPEED', 0, 0, 'calculated'],
        ['TOTSPEED_S', 0, 0, 'gtfs'],
        ['TOTSPEED', 0, 0, 'calculated'],
        ['ONTIME5', 0, 0, 'calculated'],
        ['ON', 0, 0, 'avl'],  # ridership
        ['OFF', 0, 0, 'avl'],
        ['LOAD_ARR', 0, 0, 'avl'],
        ['LOAD_DEP', 0, 0, 'avl'],
        ['PASSMILES', 0, 0, 'calculated'],
        ['PASSHOURS', 0, 0, 'calculated'],
        ['WAITHOURS', 0, 0, 'calculated'],
        ['FULLFARE_REV', 0, 0,
         'calculated'],  # revenue if all passengers paid full fare
        ['PASSDELAY_DEP', 0, 0, 'calculated'],
        ['PASSDELAY_ARR', 0, 0, 'calculated'],
        ['RDBRDNGS', 0, 0, 'avl'],
        ['CAPACITY', 0, 0, 'avl'],
        ['DOORCYCLES', 0, 0, 'avl'],
        ['WHEELCHAIR', 0, 0, 'avl'],
        ['BIKERACK', 0, 0, 'avl'],
        ['VC', 0, 0, 'calculated'],  # crowding
        ['CROWDED', 0, 0, 'calculated'],
        ['CROWDHOURS', 0, 0, 'calculated'],
        ['ROUTE_ID', 0, 0, 'gtfs'],  # additional IDs 
        ['ROUTE_AVL', 0, 0, 'avl'],
        ['TRIP_ID', 0, 0, 'gtfs'],
        ['STOP_ID', 0, 0, 'gtfs'],
        ['STOP_AVL', 0, 0, 'avl'],
        ['BLOCK_ID', 0, 0, 'gtfs'],
        ['SHAPE_ID', 0, 0, 'gtfs'],
        ['SHAPE_DIST', 0, 0, 'gtfs'],
        ['VEHNO', 0, 0, 'avl'],
        ['SCHED_DATES', 20, 0, 'gtfs']  # range of this GTFS schedule
    ]

    def __init__(self,
                 sfmuni_file,
                 trip_outfile,
                 ts_outfile,
                 daily_trip_outfile,
                 daily_ts_outfile,
                 dow=[1, 2, 3],
                 startDate='1900-01-01',
                 endDate='2100-01-01',
                 startingTripCount=1,
                 startingTsCount=0):
        """
        Constructor.                 
        """

        # set the relevant files
        self.trip_outfile = trip_outfile
        self.ts_outfile = ts_outfile

        # open the data stores
        self.sfmuni_store = pd.HDFStore(sfmuni_file)

        # which days of week to run for
        self.dow = dow

        # helper for creating data aggregations
        self.aggregator = SFMuniDataAggregator(
            daily_trip_outfile=daily_trip_outfile,
            daily_ts_outfile=daily_ts_outfile)

        # count the trips and trip-stops to ensure a unique index
        self.tripCount = startingTripCount
        self.tsCount = startingTsCount

        # get the list of all observed dates
        observedDates = self.sfmuni_store.select_column('sample',
                                                        'DATE').unique()

        self.dateList = []
        for d in sorted(observedDates):
            date = pd.Timestamp(d)
            if (date >= pd.Timestamp(startDate)
                    and date <= pd.Timestamp(endDate)):
                self.dateList.append(date)

        print('SFMuniDataExpander set up for ', len(self.dateList),
              ' observed dates between ', self.dateList[0], ' and ',
              self.dateList[len(self.dateList) - 1])

    def closeStores(self):
        """
        Closes all datastores. 
        """
        self.sfmuni_store.close()
        self.aggregator.close()

    def expandAndWeight(self, gtfs_file):
        """
        Reads the GTFS feed, cleans it, processes it, and writes it to the
        HDF5 output stores set in the constructor. This is done for every
        individual day, so you get a record of every bus that runs.

        gtfs_file - input file in GTFS format; outputs use the h5 format,
                    matching the AVL/APC format
        """

        print(datetime.datetime.now(), 'Converting raw data in file: ',
              gtfs_file)

        # establish the feed, reading only the bus routes
        gtfsHelper = GTFSHelper()
        gtfsHelper.establishTransitFeed(gtfs_file)

        # create dictionary with one dataframe for each service period
        dataframes = {}
        servicePeriods = gtfsHelper.schedule.GetServicePeriodList()
        for period in servicePeriods:
            if int(period.service_id) in self.dow:
                dataframes[period.service_id] = gtfsHelper.getGTFSDataFrame(
                    period, route_types=[3])

        # loop through each date, and add the appropriate service to the database
        gtfsDateRange = gtfsHelper.schedule.GetDateRange()
        gtfsStartDate = pd.to_datetime(gtfsDateRange[0], format='%Y%m%d')
        gtfsEndDate = pd.to_datetime(gtfsDateRange[1], format='%Y%m%d')

        # note that the last date is not included, hence the +1 increment
        servicePeriodsEachDate = gtfsHelper.schedule.GetServicePeriodsActiveEachDate(
            gtfsStartDate, gtfsEndDate + pd.DateOffset(days=1))

        print('Writing data for periods from ', gtfsStartDate, ' to ',
              gtfsEndDate)
        for date, servicePeriodsForDate in servicePeriodsEachDate:

            if pd.Timestamp(date) in self.dateList:
                print(datetime.datetime.now(), ' Processing ', date)

                # use a separate file for each year
                # and write a separate table for each month and DOW
                # format of the table name is mYYYYMMDDdX, where X is the day of week
                month = pd.to_datetime(date).to_period('M').to_timestamp()
                trip_outstore = pd.HDFStore(
                    getOutfile(self.trip_outfile, month))
                ts_outstore = pd.HDFStore(getOutfile(self.ts_outfile, month))

                for period in servicePeriodsForDate:
                    if int(period.service_id) in self.dow:

                        outkey = getOutkey(month=month,
                                           dow=period.service_id,
                                           prefix='m')

                        # get the corresponding MUNI data for this date
                        sfmuni = self.getSFMuniData(date)

                        # get the corresponding GTFS dataframe
                        df = dataframes[period.service_id]

                        # update the dates
                        df['ARRIVAL_TIME_S'] = date + (df['ARRIVAL_TIME_S'] -
                                                       df['DATE'])
                        df['DEPARTURE_TIME_S'] = date + (
                            df['DEPARTURE_TIME_S'] - df['DATE'])

                        df['DATE'] = date
                        df['MONTH'] = month

                        # join the sfmuni data
                        joined = self.joinSFMuniData(df, sfmuni)

                        # aggregate from trip-stops to trips
                        trips = self.aggregator.aggregateToTrips(joined)

                        # set a unique trip index
                        trips.index = self.tripCount + pd.Series(
                            range(0, len(trips)))
                        self.tripCount += len(trips)

                        # weight the trips
                        trips = self.weightTrips(trips)

                        # write the trips
                        stringLengths = self.getStringLengths(trips.columns)
                        trip_outstore.append(outkey,
                                             trips,
                                             data_columns=True,
                                             min_itemsize=stringLengths)

                        # add weights to trip-stop df
                        mergeFields = [
                            'DATE', 'TOD', 'AGENCY_ID', 'ROUTE_SHORT_NAME',
                            'DIR', 'TRIP'
                        ]
                        weightFields = [
                            'PATTERN', 'TRIP_WEIGHT', 'TOD_WEIGHT',
                            'DAY_WEIGHT', 'SYSTEM_WEIGHT'
                        ]
                        tripWeights = trips[mergeFields + weightFields]
                        ts = pd.merge(joined,
                                      tripWeights,
                                      how='left',
                                      on=mergeFields,
                                      sort=True)

                        # set a unique trip-stop index
                        ts.index = self.tsCount + pd.Series(range(0, len(ts)))
                        self.tsCount += len(ts)

                        # write the trip-stops
                        stringLengths = self.getStringLengths(ts.columns)
                        ts_outstore.append(outkey,
                                           ts,
                                           data_columns=True,
                                           min_itemsize=stringLengths)

                        # aggregate to TOD and daily totals, and write those
                        self.aggregator.aggregateTripsToDays(trips)
                        self.aggregator.aggregateTripStopsToDays(ts)

                trip_outstore.close()
                ts_outstore.close()

    def getSFMuniData(self, date):
        """
        Returns a dataframe with the observed SFMuni records
        and some processing of those
        """

        sfmuni = self.sfmuni_store.select('sample',
                                          where='DATE==Timestamp(date)')
        sfmuni.index = pd.Series(range(0, len(sfmuni)))

        # drop duplicates, which would get double-counted
        sfmuni = sfmuni.drop_duplicates(subset=[
            'AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR', 'PATTCODE', 'TRIP', 'SEQ'
        ])

        # update the TRIP id in case there are multiple trips with different
        # patterns leaving a different stop at the same time
        groupby = ['AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR', 'PATTCODE', 'TRIP']
        sfmuni = sfmuni.groupby(groupby, as_index=False).apply(updateTripId)

        # calculate observed RUNTIME
        # happens here because the values in the AVL data look screwy.
        groupby = ['AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR', 'TRIP']
        sfmuni = sfmuni.groupby(groupby,
                                as_index=False).apply(calculateRuntime)
        sfmuni['TOTTIME'] = sfmuni['RUNTIME'] + sfmuni['DWELL']

        # speed
        speedInput = pd.Series(zip(sfmuni['SERVMILES'], sfmuni['RUNTIME']),
                               index=sfmuni.index)
        sfmuni['RUNSPEED'] = speedInput.apply(updateSpeeds)
        speedInput = pd.Series(zip(sfmuni['SERVMILES'], sfmuni['TOTTIME']),
                               index=sfmuni.index)
        sfmuni['TOTSPEED'] = speedInput.apply(updateSpeeds)

        return sfmuni

    def joinSFMuniData(self, gtfs, sfmuni):
        """
        Left join from GTFS to SFMuni sample.        
        
        gtfs_file - HDF file containing processed GTFS data      
        sfmuni_file - HDF file containing processed, just for sampled routes
        joined_outfile - HDF file containing merged GTFS and SFMuni data     
        """

        # convert column specs
        colnames = []
        indexColumns = []
        joinFields = []
        sources = {}
        for col in self.COLUMNS:
            name = col[0]
            index = col[2]
            source = col[3]

            colnames.append(name)
            sources[name] = source
            if index == 1:
                indexColumns.append(name)
            if source == 'join':
                joinFields.append(name)

        sfmuni['OBSERVED'] = 1

        # join
        try:
            joined = pd.merge(gtfs,
                              sfmuni,
                              how='left',
                              on=joinFields,
                              suffixes=('', '_AVL'),
                              sort=True)
        except KeyError:
            print(joinFields)
            gtfs.info()
            print(gtfs.head())
            sfmuni.info()
            print(sfmuni.head())
            raise

        # calculate other derived fields
        # observations
        joined['OBSERVED'] = np.where(joined['OBSERVED_AVL'] == 1, 1, 0)

        # normalize to consistent measure of service miles
        joined['SERVMILES'] = joined['SERVMILES_S']

        # schedule deviation
        arrTime = pd.Series(zip(joined['ARRIVAL_TIME'],
                                joined['ARRIVAL_TIME_S']),
                            index=joined.index)
        depTime = pd.Series(zip(joined['DEPARTURE_TIME'],
                                joined['DEPARTURE_TIME_S']),
                            index=joined.index)
        joined['ARRIVAL_TIME_DEV'] = arrTime.apply(getScheduleDeviation)
        joined['DEPARTURE_TIME_DEV'] = depTime.apply(getScheduleDeviation)

        # ontime defined consistent with TCRP 165
        joined['ONTIME5'] = np.where((joined['DEPARTURE_TIME_DEV'] > -1.0) &
                                     (joined['ARRIVAL_TIME_DEV'] < 5.0), 1, 0)
        joined['ONTIME5'] = joined['ONTIME5'].mask(joined['OBSERVED'] == 0,
                                                   other=np.nan)

        # passenger miles traveled
        joined['PASSMILES'] = joined['LOAD_ARR'] * joined['SERVMILES']

        # passenger hours -- scheduled time
        joined['PASSHOURS'] = (
            joined['LOAD_ARR'] * joined['RUNTIME'] +
            joined['LOAD_DEP'] * joined['DWELL']).values / 60.0

        # passenger hours of waiting time -- scheduled time
        joined['WAITHOURS'] = (joined['ON'] * 0.5 *
                               joined['HEADWAY_S']).values / 60.0

        # fare paid, if each boarding pays full fare
        joined['FULLFARE_REV'] = (joined['ON'] * joined['FARE'])

        # passenger hours of delay at departure
        joined['PASSDELAY_DEP'] = np.where(
            joined['DEPARTURE_TIME_DEV'] > 0,
            joined['ON'] * joined['DEPARTURE_TIME_DEV'], 0)
        joined['PASSDELAY_DEP'] = joined['PASSDELAY_DEP'].mask(
            joined['OBSERVED'] == 0, other=np.nan)

        # passenger hours of delay at arrival
        joined['PASSDELAY_ARR'] = np.where(
            joined['ARRIVAL_TIME_DEV'] > 0,
            joined['ON'] * joined['ARRIVAL_TIME_DEV'], 0)
        joined['PASSDELAY_ARR'] = joined['PASSDELAY_ARR'].mask(
            joined['OBSERVED'] == 0, other=np.nan)

        # volume-capacity ratio
        joined['VC'] = (joined['LOAD_ARR']).values / (
            joined['CAPACITY']).values

        # crowded if VC > 0.85
        # the capacity is the 'crush' load, so we are defining
        # crowding as 85% of that capacity.  In TCRP 165, this
        # corresponds approximately to the range of 125-150% of
        # the seated load, which is the maximum design load for
        # peak of the peak conditions.
        joined['CROWDED'] = np.where(joined['VC'] > 0.85, 1.0, 0.0)
        joined['CROWDED'] = joined['CROWDED'].mask(joined['OBSERVED'] == 0,
                                                   other=np.nan)

        joined['CROWDHOURS'] = (
            joined['CROWDED'] *
            (joined['LOAD_ARR'] * joined['RUNTIME'] +
             joined['LOAD_DEP'] * joined['DWELL'])).values / 60.0

        # keep only relevant columns, sorted
        joined.sort_values(by=indexColumns, inplace=True)
        joined = joined[colnames]

        return joined

    def weightTrips(self, trips):
        """
        Adds a series of weight columns to the trip df based on the ratio
        of total to observed trips.        
        """

        # start with all observations weighted equally
        trips['TRIPS'] = 1
        trips['TRIP_WEIGHT'] = trips['OBSERVED'].mask(trips['OBSERVED'] == 0,
                                                      other=np.nan)

        # add the weight columns, specific to the level of aggregation
        # the weights build upon the lower-level weights, so we scale
        # the low-weights up uniformly within the group.

        # routes
        trips['TOD_WEIGHT'] = calcWeights(
            trips,
            groupby=['DATE', 'TOD', 'AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR'],
            oldWeight='TRIP_WEIGHT')

        trips['DAY_WEIGHT'] = calcWeights(
            trips,
            groupby=['DATE', 'AGENCY_ID', 'ROUTE_SHORT_NAME', 'DIR'],
            oldWeight='TOD_WEIGHT')

        # system
        trips['SYSTEM_WEIGHT'] = calcWeights(
            trips,
            groupby=['DATE', 'TOD', 'AGENCY_ID'],
            oldWeight='DAY_WEIGHT')

        return trips

    def getStringLengths(self, usedColumns):
        """
        gets the max string length for the columns that are in use
        """

        # convert column specs
        stringLengths = {}
        for col in self.COLUMNS:
            name = col[0]
            if name in usedColumns:
                stringLength = col[1]
                if (stringLength > 0):
                    stringLengths[name] = stringLength

        return stringLengths
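calcWeights is a project-internal helper, but the comments in weightTrips ("the weights build upon the lower-level weights, so we scale the low-weights up uniformly within the group") suggest its behavior. A plausible sketch under that assumption, not the project's verified implementation:

    import numpy as np

    def calcWeights(df, groupby, oldWeight):
        # Assumed behavior: within each group, scale the existing weights
        # so that weighted observed trips sum to the group's total trips.
        grouped = df.groupby(groupby)
        total = grouped['TRIPS'].transform('sum')
        observed = grouped[oldWeight].transform('sum')
        # Groups with no observations keep undefined (NaN) weights.
        return df[oldWeight] * (total / observed).replace(np.inf, np.nan)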
Code Example #6
            sfmuni_file=CLEANED_OUTFILE,
            trip_outfile=EXPANDED_TRIP_OUTFILE,
            ts_outfile=EXPANDED_TS_OUTFILE,
            daily_trip_outfile=DAILY_TRIP_OUTFILE,
            daily_ts_outfile=DAILY_TS_OUTFILE,
            dow=[1],
            startDate='2000-01-01')
        for infile in RAW_GTFS_FILES:
            sfmuniExpander.expandAndWeight(infile)
        print('Finished expanding to GTFS in ',
              (datetime.datetime.now() - startTime))

    # aggregate to monthly totals
    if 'aggregate' in STEPS_TO_RUN:
        startTime = datetime.datetime.now()
        aggregator = SFMuniDataAggregator()
        aggregator.aggregateTripsToMonths(DAILY_TRIP_OUTFILE,
                                          MONTHLY_TRIP_OUTFILE)
        aggregator.aggregateTripStopsToMonths(DAILY_TS_OUTFILE,
                                              MONTHLY_TS_OUTFILE)
        print('Finished aggregations in ',
              (datetime.datetime.now() - startTime))

    # process GTFS schedule data.
    if 'gtfs' in STEPS_TO_RUN:
        startTime = datetime.datetime.now()
        gtfsHelper = GTFSHelper()
        gtfsHelper.processFiles(RAW_GTFS_FILES, GTFS_OUTFILE, 'sfmuni')
        gtfsHelper.processFiles(BART_GTFS_FILES, GTFS_OUTFILE, 'bart')
        print('Finished processing GTFS data ',
              (datetime.datetime.now() - startTime))