Example 1
def run(url, report, destination=None, download=True):
    """Download IL data, extract, load, and compute some simple stats"""
    logger.info('*** IL Data Analysis Started ***')
    destination = download_and_unzip_data(url, destination)
    csv_path = get_datafile_path(url, destination)
    stops = load_csv(csv_path)
    analyze(stops, report)
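load_csv is not shown in this snippet. A minimal sketch, assuming it simply wraps pandas.read_csv (the real helper may use explicit dtypes or chunked reads), might look like this:

import pandas as pd

def load_csv(csv_path):
    # Hypothetical helper: read the extracted CSV into a DataFrame.
    return pd.read_csv(csv_path, low_memory=False)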
Example 2
def run(url, report, destination=None, download=True):
    """Download MD data, extract, load, and compute some simple stats"""
    logger.info('*** MD Data Analysis Started ***')
    destination = download_and_unzip_data(url, destination)
    xls_path = get_datafile_path(url, destination)
    stops = load_xls(xls_path)
    stops = process_raw_data(stops, to_drop=())
    analyze(stops, report)
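process_raw_data is also not shown; the to_drop argument suggests columns to discard. A hedged sketch, assuming stops is a pandas DataFrame (the real cleanup likely does more):

def process_raw_data(stops, to_drop=()):
    # Hypothetical sketch: drop unwanted columns and normalize column names.
    stops = stops.drop(columns=list(to_drop), errors='ignore')
    stops.columns = [col.strip().lower() for col in stops.columns]
    return stops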
Example 3
def run(url, destination=None, download=True):
    """Download MD data, extract, convert to CSV, and scan for issues"""
    logger.info('*** MD Data Scan Started ***')
    destination = download_and_unzip_data(url, destination)
    # Convert to CSV
    xls_path = get_datafile_path(url, destination)
    csv_path = get_csv_path(url, destination)
    if not os.path.exists(csv_path):
        xls_to_csv(xls_path, csv_path)
    else:
        logger.info("{} exists, skipping XLS->CSV conversion".format(csv_path))
    csv_count = line_count(csv_path)
    logger.debug('Rows: {}'.format(csv_count))
    scan([csv_path])
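line_count only feeds a debug log here. A plausible implementation, assuming it counts newline-delimited rows (header included):

def line_count(path):
    # Hypothetical helper: count the lines in a text file.
    with open(path) as f:
        return sum(1 for _ in f)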
Example 4
    def test_download_and_unzip_data(self):
        """
        create a temporary directory then create a zip in it, then
        check that it is downloaded properly
        """

        orig_destination = tempfile.TemporaryDirectory()
        zip_path = os.path.join(orig_destination.name, 'foo.zip')
        self.make_test_zip(zip_path)
        url = 'http://example.com/foo.zip'  # must have the same basename as the zip created above
        destination = download_and_unzip_data(url, orig_destination.name)
        self.assertEqual(orig_destination.name, destination)
        self.assertEqual({'foo.zip', 'file1.txt', 'file2.txt', 'file3.txt'},
                         set(os.listdir(orig_destination.name)))
        orig_destination.cleanup()
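make_test_zip is referenced but not shown. A minimal sketch consistent with the directory listing asserted above (it would live on the same TestCase; file contents are placeholders):

import zipfile

def make_test_zip(self, zip_path):
    # Hypothetical helper: build a zip at zip_path containing the three
    # text files the assertion above expects to find after extraction.
    with zipfile.ZipFile(zip_path, 'w') as zf:
        for name in ('file1.txt', 'file2.txt', 'file3.txt'):
            zf.writestr(name, 'placeholder contents for {}\n'.format(name))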
Example 5
def run(url, destination=None, download=True):
    """Download IL data, extract, and load into PostgreSQL"""
    logger.info('*** IL Data Import Started ***')
    destination = download_and_unzip_data(url, destination)
    # Process the raw CSV into a cleaned-up CSV
    raw_csv_path = get_datafile_path(url, destination)
    processed_csv_path = get_csv_path(url, destination)
    if not os.path.exists(processed_csv_path):
        raw_to_processed(raw_csv_path, processed_csv_path)
    else:
        logger.info("{} exists, skipping cleanup".format(processed_csv_path))
    csv_count = line_count(processed_csv_path)
    logger.debug('Rows: {}'.format(csv_count))
    # drop constraints/indexes
    drop_constraints_and_indexes(connections['traffic_stops_il'].cursor())
    # use COPY to load CSV file as quickly as possible
    copy_from(processed_csv_path)
    # Clear the query cache
    flush_memcached()
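copy_from is not shown. A hedged sketch of the bulk-load step, assuming a psycopg2-backed Django connection and a placeholder table name (not the project's real schema):

from django.db import connections

def copy_from(csv_path):
    # Hypothetical sketch: stream the processed CSV into PostgreSQL via COPY.
    # 'traffic_stops' is a placeholder table name.
    sql = "COPY traffic_stops FROM STDIN WITH CSV HEADER"
    with open(csv_path) as f, connections['traffic_stops_il'].cursor() as cursor:
        cursor.copy_expert(sql, f)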
Example 6
def run(url, destination=None, download=True):
    """Download MD data, extract, convert to CSV, and load into PostgreSQL"""
    logger.info('*** MD Data Import Started ***')
    destination = download_and_unzip_data(url, destination)
    # Convert to CSV
    xls_path = get_datafile_path(url, destination)
    csv_path = get_csv_path(url, destination)
    if not os.path.exists(csv_path):
        xls_to_csv(xls_path, csv_path)
    else:
        logger.info("{} exists, skipping XLS->CSV conversion".format(csv_path))
    csv_count = line_count(csv_path)
    logger.debug('Rows: {}'.format(csv_count))
    # drop constraints/indexes
    drop_constraints_and_indexes(connections['traffic_stops_md'].cursor())
    # use COPY to load CSV files as quickly as possible
    copy_from(csv_path)
    # Clear the query cache
    flush_memcached()
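xls_to_csv is not shown. A minimal sketch, assuming the legacy .xls format and the xlrd library, dumping the first worksheet to CSV:

import csv
import xlrd

def xls_to_csv(xls_path, csv_path):
    # Hypothetical helper: write every row of the first sheet to csv_path.
    book = xlrd.open_workbook(xls_path)
    sheet = book.sheet_by_index(0)
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        for row_idx in range(sheet.nrows):
            writer.writerow(sheet.row_values(row_idx))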
Example 7
def run(url,
        destination=None,
        zip_path=None,
        min_stop_id=None,
        max_stop_id=None,
        prime_cache=True):
    """
    Download NC data, extract, convert to CSV, and load into PostgreSQL

    :param url: if not None, zip will be downloaded from this URL; this can
      either be a URL supported by the requests library OR the special URL
      MAGIC_NC_FTP_URL, in which case the zip will be downloaded from the state
      of North Carolina server.
    :param destination: directory for unpacking zip and creating other
      files; pass None to create a temporary directory
    :param zip_path: path to previously-downloaded zip
    :param prime_cache: whether or not to prime the query cache for "big"
      NC agencies after import
    :param max_stop_id: only process stops with ids <= this value; this is to
      save time for developers by reducing the amount of data to import
    :param min_stop_id: only process stops with ids >= this value; this is to
      save time for developers by reducing the amount of data to import
    """
    if not url and not destination:
        raise ValueError(
            "destination must be provided when no URL is provided")

    if (min_stop_id is None) != (max_stop_id is None):
        raise ValueError(
            "provide neither or both of min_stop_id and max_stop_id")

    if max_stop_id is not None and min_stop_id > max_stop_id:
        raise ValueError("min_stop_id cannot be larger than max_stop_id")

    logger.info("*** NC Data Import Started ***")

    if url:
        if url == MAGIC_NC_FTP_URL:
            destination = nc_download_and_unzip_data(destination)
        else:
            destination = download_and_unzip_data(url, destination)
    else:
        unzip_data(destination, zip_path=zip_path)

    if max_stop_id is not None:
        truncate_input_data(destination, min_stop_id, max_stop_id)
        override_start_date = None
    else:
        # When processing entire dataset, pretend we don't have data from
        # 2000-2001 since so few agencies reported then.
        override_start_date = "Jan 01, 2002"

    # convert data files to CSV for database importing
    logger.info("Converting to CSV")
    convert_to_csv(destination)

    # find any new NC agencies and add to a copy of NC_agencies.csv
    logger.info("Looking for new NC agencies in Stops.csv")
    nc_agency_csv = update_nc_agencies(
        os.path.join(os.path.dirname(__file__), "NC_agencies.csv"),
        destination)

    # use COPY to load CSV files as quickly as possible
    copy_from(destination, nc_agency_csv)
    logger.info("NC Data Import Complete")

    # Clear the query cache to get rid of NC queries made on old data
    cache.clear()

    # fix landing page data
    facts = compute_dataset_facts(Agency,
                                  Stop,
                                  settings.NC_KEY,
                                  Search=Search,
                                  override_start_date=override_start_date)
    logger.info("NC dataset facts: %r", facts)

    # prime the query cache for large NC agencies
    if prime_cache:
        prime_cache_run()
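The docstring describes a developer shortcut: bounding stop ids to cut down the import. A hedged usage sketch (the paths and id bounds below are placeholders, not values from the project):

# Full import straight from the North Carolina server.
run(MAGIC_NC_FTP_URL, prime_cache=False)

# Faster developer run: reuse a previously downloaded zip and only
# process a slice of stop ids.
run(None,
    destination='/tmp/nc-data',
    zip_path='/tmp/nc-data/nc.zip',
    min_stop_id=1,
    max_stop_id=100000)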
Example 8
def run(url, destination=None, zip_path=None, min_stop_id=None,
        max_stop_id=None, prime_cache=True):
    """
    Download NC data, extract, convert to CSV, and load into PostgreSQL

    :param url: if not None, zip will be downloaded from this URL; this can
      either be a URL supported by the requests library OR the special URL
      MAGIC_NC_FTP_URL, in which case the zip will be downloaded from the state
      of North Carolina server.
    :param destination: directory for unpacking zip and creating other
      files; pass None to create a temporary directory
    :param zip_path: path to previously-downloaded zip
    :param prime_cache: whether or not to prime the query cache for "big"
      NC agencies after import
    :param max_stop_id: only process stops with ids <= this value; this is to
      save time for developers by reducing the amount of data to import
    :param min_stop_id: only process stops with ids >= this value; this is to
      save time for developers by reducing the amount of data to import
    """
    if not url and not destination:
        raise ValueError('destination must be provided when no URL is provided')

    if (min_stop_id is None) != (max_stop_id is None):
        raise ValueError('provide neither or both of min_stop_id and max_stop_id')

    if max_stop_id is not None and min_stop_id > max_stop_id:
        raise ValueError('min_stop_id cannot be larger than max_stop_id')

    logger.info('*** NC Data Import Started ***')

    if url:
        if url == MAGIC_NC_FTP_URL:
            destination = nc_download_and_unzip_data(destination)
        else:
            destination = download_and_unzip_data(url, destination)
    else:
        unzip_data(destination, zip_path=zip_path)

    if max_stop_id is not None:
        truncate_input_data(destination, min_stop_id, max_stop_id)
        override_start_date = None
    else:
        # When processing entire dataset, pretend we don't have data from
        # 2000-2001 since so few agencies reported then.
        override_start_date = 'Jan 01, 2002'

    # convert data files to CSV for database importing
    convert_to_csv(destination)

    # find any new NC agencies and add to a copy of NC_agencies.csv
    nc_agency_csv = update_nc_agencies(
        os.path.join(os.path.dirname(__file__), 'NC_agencies.csv'),
        destination
    )

    # drop constraints/indexes
    drop_constraints_and_indexes(connections['traffic_stops_nc'].cursor())
    # use COPY to load CSV files as quickly as possible
    copy_from(destination, nc_agency_csv)
    logger.info("NC Data Import Complete")

    # Clear the query cache to get rid of NC queries made on old data
    flush_memcached()

    # fix landing page data
    facts = compute_dataset_facts(
        Agency, Stop, settings.NC_KEY, Search=Search,
        override_start_date=override_start_date
    )
    logger.info('NC dataset facts: %r', facts)

    # prime the query cache for large NC agencies
    if prime_cache:
        prime_cache_run()
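flush_memcached is not shown here, but the previous NC variant calls cache.clear() directly, so a minimal implementation may simply wrap Django's cache API:

from django.core.cache import cache

def flush_memcached():
    # Hypothetical sketch: clear the configured Django cache backend,
    # mirroring the cache.clear() call in the variant above.
    cache.clear()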