Example 1
def batch_stats(folder=GTFS_FEEDS_PATH, output_folder=OUTPUT_DIR):
    for file in os.listdir(folder):
        # feed files are named YYYY-mm-dd.zip, so the stem is the feed date
        date_str = file.split('.')[0]
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        feed = gu.get_partridge_feed_by_date(folder + file, date)
        zones = gu.get_zones_df(LOCAL_TARIFF_PATH)
        ts = compute_trip_stats_partridge(feed, zones)
        ts.to_pickle(output_folder + date_str + '_trip_stats.pkl.gz', compression='gzip')
        rs = compute_route_stats_base_partridge(ts)
        rs.to_pickle(output_folder + date_str + '_route_stats.pkl.gz', compression='gzip')
Example 2
def batch_stats(folder=GTFS_FEEDS_PATH, output_folder=OUTPUT_DIR):
    for file in os.listdir(folder):
        date_str = file.split('.')[0]
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        feed = gu.get_partridge_feed_by_date(join(folder, file), date)
        zones = gu.get_zones_df(LOCAL_TARIFF_PATH)
        ts = compute_trip_stats_partridge(feed, zones)
        ts.to_pickle(join(output_folder, date_str + '_trip_stats.pkl.gz'),
                     compression='gzip')
        rs = compute_route_stats_base_partridge(ts)
        rs.to_pickle(join(output_folder, date_str + '_route_stats.pkl.gz'),
                     compression='gzip')
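
A minimal invocation sketch for either variant above. The directory paths below are illustrative assumptions, not the project's real configuration; the actual defaults come from the module constants GTFS_FEEDS_PATH and OUTPUT_DIR, and the feeds folder is expected to hold files named YYYY-mm-dd.zip:

# Illustrative paths only; the real values come from the module's config.
# For each feed date this writes <date>_trip_stats.pkl.gz and
# <date>_route_stats.pkl.gz under the output folder.
batch_stats(folder='/data/gtfs_feeds/', output_folder='/data/gtfs_stats_output/')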
Example 3
def handle_gtfs_date(date_str, file, bucket, output_folder=OUTPUT_DIR,
                     gtfs_folder=GTFS_FEEDS_PATH, logger=None):
    """
    Handle a single date for a single GTFS file. Download the feed if
    necessary, then compute and save the stats files (currently trip_stats
    and route_stats).
    :param date_str: date string in %Y-%m-%d format
    :type date_str: str
    :param file: GTFS file name (currently only YYYY-mm-dd.zip)
    :type file: str
    :param bucket: s3 boto bucket object
    :type bucket: boto3.resources.factory.s3.Bucket
    :param output_folder: local path to write output files to
    :type output_folder: str
    :param gtfs_folder: local path containing GTFS feeds
    :type gtfs_folder: str
    :param logger: logger to write to
    :type logger: logging.Logger
    """
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()

    downloaded = False

    trip_stats_output_path = output_folder + date_str + '_trip_stats.pkl.gz'
    if os.path.exists(trip_stats_output_path):
        logger.info(f'found trip stats result DF gzipped pickle "{trip_stats_output_path}"')
        ts = pd.read_pickle(trip_stats_output_path, compression='gzip')
    else:
        downloaded = get_gtfs_file(file, gtfs_folder, bucket, logger)

        if WRITE_FILTERED_FEED:
            filtered_out_path = FILTERED_FEEDS_PATH + date_str + '.zip'
            logger.info(f'writing filtered gtfs feed for file "{gtfs_folder+file}" with date "{date}" in path '
                        f'{filtered_out_path}')
            gu.write_filtered_feed_by_date(gtfs_folder + file, date, filtered_out_path)
            logger.info(f'reading filtered feed for file from path {filtered_out_path}')
            feed = ptg_feed(filtered_out_path)

        else:
            logger.info(f'creating daily partridge feed for file "{gtfs_folder+file}" with date "{date}"')
            try:
                feed = gu.get_partridge_feed_by_date(gtfs_folder + file, date)
            except BadZipFile:
                logger.error('Bad local zip file', exc_info=True)
                downloaded = get_gtfs_file(file, gtfs_folder, bucket, logger, force=True)
                feed = gu.get_partridge_feed_by_date(gtfs_folder + file, date)

        logger.debug(f'finished creating daily partridge feed for file "{gtfs_folder+file}" with date "{date}"')

        # TODO: add changing zones from archive
        logger.info(f'creating zones DF from "{LOCAL_TARIFF_PATH}"')
        zones = gu.get_zones_df(LOCAL_TARIFF_PATH)

        logger.info(
            f'starting compute_trip_stats_partridge for file "{gtfs_folder+file}" with date "{date}" and zones '
            f'"{LOCAL_TARIFF_PATH}"')
        ts = compute_trip_stats_partridge(feed, zones)
        logger.debug(
            f'finished compute_trip_stats_partridge for file "{gtfs_folder+file}" with date "{date}" and zones '
            f'"{LOCAL_TARIFF_PATH}"')
        # TODO: log this
        ts['date'] = date_str
        ts['date'] = pd.Categorical(ts.date)

        logger.info(f'saving trip stats result DF to gzipped pickle "{trip_stats_output_path}"')
        ts.to_pickle(trip_stats_output_path, compression='gzip')

    # TODO: log more stats
    logger.debug(
        f'ts.shape={ts.shape}, dc_trip_id={ts.trip_id.nunique()}, dc_route_id={ts.route_id.nunique()}, '
        f'num_start_zones={ts.start_zone.nunique()}, num_agency={ts.agency_name.nunique()}')

    logger.info('starting compute_route_stats_base_partridge from trip stats result')
    rs = compute_route_stats_base_partridge(ts)
    logger.debug('finished compute_route_stats_base_partridge from trip stats result')
    # TODO: log this
    rs['date'] = date_str
    rs['date'] = pd.Categorical(rs.date)

    # TODO: log more stats
    logger.debug(
        f'rs.shape={rs.shape}, num_trips_sum={rs.num_trips.sum()}, dc_route_id={rs.route_id.nunique()}, '
        f'num_start_zones={rs.start_zone.nunique()}, num_agency={rs.agency_name.nunique()}')

    route_stats_output_path = output_folder + date_str + '_route_stats.pkl.gz'
    logger.info(f'saving route stats result DF to gzipped pickle "{route_stats_output_path}"')
    rs.to_pickle(route_stats_output_path, compression='gzip')

    return downloaded
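
A minimal caller sketch for handle_gtfs_date, assuming boto3 credentials are already configured; the bucket name and date below are illustrative assumptions, and only the parameter types come from the docstring above:

import logging

import boto3

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('gtfs_stats')

# Assumed bucket name; the function only needs a boto3 Bucket object.
bucket = boto3.resource('s3').Bucket('example-gtfs-archive')

# Returns True if the GTFS file had to be fetched from S3.
downloaded = handle_gtfs_date('2018-05-01', '2018-05-01.zip', bucket, logger=logger)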
Example 4
def handle_gtfs_date(date_str, file, bucket, output_folder=OUTPUT_DIR,
                     gtfs_folder=GTFS_FEEDS_PATH, logger=None):
    """
    Handle a single date for a single GTFS file. Download the feed if
    necessary, then compute and save the stats files (currently trip_stats
    and route_stats).
    :param date_str: date string in %Y-%m-%d format
    :type date_str: str
    :param file: GTFS file name (currently only YYYY-mm-dd.zip)
    :type file: str
    :param bucket: s3 boto bucket object
    :type bucket: boto3.resources.factory.s3.Bucket
    :param output_folder: local path to write output files to
    :type output_folder: str
    :param gtfs_folder: local path containing GTFS feeds
    :type gtfs_folder: str
    :param logger: logger to write to
    :type logger: logging.Logger
    """
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()

    if logger is None:
        # no logger argument was passed; fall back to a module-level logger
        logger = logging.getLogger(__name__)

    downloaded = False

    trip_stats_output_path = join(output_folder,
                                  date_str + '_trip_stats.pkl.gz')
    if os.path.exists(trip_stats_output_path):
        logger.info(f'found trip stats result DF gzipped pickle "{trip_stats_output_path}"')
        ts = pd.read_pickle(trip_stats_output_path, compression='gzip')
    else:
        downloaded = get_gtfs_file(file, gtfs_folder, bucket, logger)

        if WRITE_FILTERED_FEED:
            filtered_out_path = join(FILTERED_FEEDS_PATH, date_str + '.zip')
            logger.info(f'writing filtered gtfs feed for file "{join(gtfs_folder, file)}" with date "{date}" in path '
                        f'{filtered_out_path}')
            gu.write_filtered_feed_by_date(join(gtfs_folder, file), date, filtered_out_path)
            logger.info(f'reading filtered feed for file from path {filtered_out_path}')
            feed = ptg_feed(filtered_out_path)
        else:
            logger.info(f'creating daily partridge feed for file "{join(gtfs_folder, file)}" with date "{date}"')
            try:
                feed = gu.get_partridge_feed_by_date(join(gtfs_folder, file), date)
            except BadZipFile:
                logger.error('Bad local zip file', exc_info=True)
                downloaded = get_gtfs_file(file, gtfs_folder, bucket, logger, force=True)
                feed = gu.get_partridge_feed_by_date(join(gtfs_folder, file), date)

        logger.debug(f'finished creating daily partridge feed for file "{join(gtfs_folder, file)}" with date "{date}"')

        # TODO: use Tariff.zip from s3
        tariff_path_to_use = get_closest_archive_path(date, 'Tariff.zip')
        logger.info(f'creating zones DF from "{tariff_path_to_use}"')
        zones = gu.get_zones_df(tariff_path_to_use)

        logger.info(
            f'starting compute_trip_stats_partridge for file "{join(gtfs_folder, file)}" with date "{date}" and zones '
            f'"{tariff_path_to_use}"')
        ts = compute_trip_stats_partridge(feed, zones)
        logger.debug(
            f'finished compute_trip_stats_partridge for file "{join(gtfs_folder, file)}" with date "{date}" and zones '
            f'"{tariff_path_to_use}"')
        # TODO: log this
        ts['date'] = date_str
        ts['date'] = pd.Categorical(ts.date)

        logger.info(f'saving trip stats result DF to gzipped pickle "{trip_stats_output_path}"')
        ts.to_pickle(trip_stats_output_path, compression='gzip')

    # TODO: log more stats
    logger.debug(
        f'ts.shape={ts.shape}, dc_trip_id={ts.trip_id.nunique()}, dc_route_id={ts.route_id.nunique()}, '
        f'num_start_zones={ts.start_zone.nunique()}, num_agency={ts.agency_name.nunique()}')

    logger.info('starting compute_route_stats_base_partridge from trip stats result')
    rs = compute_route_stats_base_partridge(ts)
    logger.debug('finished compute_route_stats_base_partridge from trip stats result')
    # TODO: log this
    rs['date'] = date_str
    rs['date'] = pd.Categorical(rs.date)

    # TODO: log more stats
    logger.debug(
        f'rs.shape={rs.shape}, num_trips_sum={rs.num_trips.sum()}, dc_route_id={rs.route_id.nunique()}, '
        f'num_start_zones={rs.start_zone.nunique()}, num_agency={rs.agency_name.nunique()}')

    route_stats_output_path = join(output_folder, date_str + '_route_stats.pkl.gz')
    logger.info(f'saving route stats result DF to gzipped pickle "{route_stats_output_path}"')
    rs.to_pickle(route_stats_output_path, compression='gzip')

    return downloaded
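
Putting the pieces together, here is a hedged sketch of a batch driver in the spirit of batch_stats above, looping handle_gtfs_date over the local feed files and counting S3 downloads; the bucket name is an assumption:

import logging
import os

import boto3

logger = logging.getLogger('gtfs_stats')
bucket = boto3.resource('s3').Bucket('example-gtfs-archive')  # assumed name

download_count = 0
for file in os.listdir(GTFS_FEEDS_PATH):
    date_str = file.split('.')[0]  # YYYY-mm-dd.zip -> YYYY-mm-dd
    if handle_gtfs_date(date_str, file, bucket, logger=logger):
        download_count += 1
logger.info(f'finished batch run, downloaded {download_count} feeds from s3')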