Code Example #1
def nc_download_and_unzip_data(destination, prefix="state-"):
    """Download and unzip data into destination directory"""
    # make sure destination exists or create a temporary directory
    if not destination:
        destination = tempfile.mkdtemp(prefix=prefix)
        logger.debug("Created temp directory {}".format(destination))
    else:
        if not os.path.exists(destination):
            os.makedirs(destination)
            logger.info("Created {}".format(destination))
    zip_basename = date.today().strftime("NC_STOPS_Extract_%Y_%m_%d.zip")
    zip_filename = os.path.join(destination, zip_basename)
    # don't re-download data if raw data file already exists
    if os.path.exists(zip_filename):
        logger.debug("{} exists, skipping download".format(zip_filename))
    else:
        logger.debug("Downloading data to {}".format(zip_filename))
        nc_data_site = settings.NC_FTP_HOST
        nc_data_file = "STOPS_Extract.zip"
        nc_data_directory = "/TSTOPextract"
        ftps = ftps_connect(nc_data_site)
        ftps.cwd(nc_data_directory)
        logger.debug("Files available at %s:", nc_data_site)
        listing = ftps.retrlines("LIST", show_ftp_listing)
        line = listing.split("\n")[0]
        if not line.startswith("226 "):  # server's "Transfer complete" message
            raise ValueError("Expected 226 response from ftp server, got %r" % listing)
        logger.info('Downloading "%s"...', nc_data_file)
        with open(zip_filename, "wb") as f:
            ftps.retrbinary("RETR %s" % nc_data_file, f.write)
        logger.info('File written to "%s"' % zip_filename)

    unzip_data(destination, zip_path=zip_filename)
    return destination
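A minimal usage sketch (not from the original source). The import path is hypothetical, and the function assumes settings.NC_FTP_HOST and the ftps_connect() helper are configured elsewhere in the project:

from nc.data.importer import nc_download_and_unzip_data  # hypothetical import path

# Reuse a fixed working directory so a re-run skips the download step
data_dir = nc_download_and_unzip_data("/tmp/nc-stops")

# Or pass a falsy destination to let the function create a temp directory
temp_dir = nc_download_and_unzip_data(None)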
Code Example #2
def nc_download_and_unzip_data(destination, prefix='state-'):
    """Download and unzip data into destination directory"""
    # make sure destination exists or create a temporary directory
    if not destination:
        destination = tempfile.mkdtemp(prefix=prefix)
        logger.debug("Created temp directory {}".format(destination))
    else:
        if not os.path.exists(destination):
            os.makedirs(destination)
            logger.info("Created {}".format(destination))
    zip_basename = date.today().strftime('NC_STOPS_Extract_%Y_%m_%d.zip')
    zip_filename = os.path.join(destination, zip_basename)
    # don't re-download data if raw data file already exists
    if os.path.exists(zip_filename):
        logger.debug("{} exists, skipping download".format(zip_filename))
    else:
        logger.debug("Downloading data to {}".format(zip_filename))
        nc_data_site = 'sbi1.jus.state.nc.us'
        nc_data_user = os.environ.get('NC_FTP_USER')
        nc_data_password = os.environ.get('NC_FTP_PASSWORD')
        nc_data_file = 'STOPS_Extract.zip'
        nc_data_directory = '/TSTOPextract'

        # Note: NC documents show FileZilla set up to use explicit FTP over TLS
        #       if available (like FTP_TLS), but the server doesn't currently
        #       support it.
        ftp = FTP(nc_data_site)
        ftp.login(nc_data_user, nc_data_password)
        ftp.cwd(nc_data_directory)
        logger.debug('Files available at %s:', nc_data_site)
        listing = ftp.retrlines('LIST', show_ftp_listing)
        line = listing.split('\n')[0]
        if not line.startswith('226 '):  # server's "Transfer complete" message
            raise ValueError('Expected 226 response from ftp server, got %r' %
                             listing)
        logger.info('Downloading "%s"...', nc_data_file)
        with open(zip_filename, 'wb') as f:
            ftp.retrbinary('RETR %s' % nc_data_file, f.write)
        logger.info('File written to "%s"' % zip_filename)

    unzip_data(destination, zip_path=zip_filename)
    return destination
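Example #2 falls back to plain ftplib.FTP with credentials from environment variables because, per the in-code comment, the state server does not currently support FTP over TLS. Example #1 instead calls an ftps_connect() helper that is not shown in the excerpt; a minimal sketch of what such a helper could look like, assuming ftplib.FTP_TLS and the same environment variables (the real helper may read Django settings instead):

import os
from ftplib import FTP_TLS

def ftps_connect(host):
    """Connect over explicit FTP-over-TLS and encrypt the data channel."""
    ftps = FTP_TLS(host)
    # NC_FTP_USER / NC_FTP_PASSWORD mirror Example #2; this is an assumption
    ftps.login(os.environ.get("NC_FTP_USER"), os.environ.get("NC_FTP_PASSWORD"))
    ftps.prot_p()  # required so LIST/RETR run over the encrypted channel
    return ftps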
Code Example #3
def test_unzip_data(self):
    """
    test of download_and_unzip_data() above hits a lot of the main
    path of unzip_data(); this tests a few special scenarios
    """
    with self.assertRaises(ValueError):
        unzip_data(None)
    with self.assertRaises(ValueError):
        unzip_data("something", url=None, zip_path=None)
    with self.assertRaises(ValueError):
        unzip_data("something", url="http://example.com/foo.zip", zip_path="/tmp/foo.zip")
Code Example #4
def test_unzip_data(self):
    """
    test of download_and_unzip_data() above hits a lot of the main
    path of unzip_data(); this tests a few special scenarios
    """
    with self.assertRaises(ValueError):
        unzip_data(None)
    with self.assertRaises(ValueError):
        unzip_data('something', url=None, zip_path=None)
    with self.assertRaises(ValueError):
        unzip_data('something', url='http://example.com/foo.zip', zip_path='/tmp/foo.zip')
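The tests above only exercise the argument validation in unzip_data(): a destination is required, and exactly one of url and zip_path must be supplied. A sketch of guard clauses that would satisfy these assertions (the actual download and extraction logic is not reproduced here):

def unzip_data(destination, url=None, zip_path=None):
    if not destination:
        raise ValueError("destination directory must be provided")
    if (url is None) == (zip_path is None):
        # both missing or both given
        raise ValueError("provide exactly one of url and zip_path")
    ...  # download url if given, then extract the zip into destination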
Code Example #5
def run(url,
        destination=None,
        zip_path=None,
        min_stop_id=None,
        max_stop_id=None,
        prime_cache=True):
    """
    Download NC data, extract, convert to CSV, and load into PostgreSQL

    :param url: if not None, zip will be downloaded from this URL; this can
      either be a URL supported by the requests library OR the special URL
      MAGIC_NC_FTP_URL, in which case the zip will be downloaded from the state
      of North Carolina server.
    :param destination: directory for unpacking zip and creating other
      files; pass None to create a temporary directory
    :param zip_path: path to previously-downloaded zip
    :param prime_cache: whether or not to prime the query cache for "big"
      NC agencies after import
    :param max_stop_id: only process stops with ids <= this value; this is to
      save time for developers by reducing the amount of data to import
    :param min_stop_id: only process stops with ids >= this value; this is to
      save time for developers by reducing the amount of data to import
    """
    if not url and not destination:
        raise ValueError(
            "destination must be provided when no URL is provided")

    if (min_stop_id is None) != (max_stop_id is None):
        raise ValueError(
            "provide neither or both of min_stop_id and max_stop_id")

    if max_stop_id is not None and min_stop_id > max_stop_id:
        raise ValueError("min_stop_id cannot be larger than max_stop_id")

    logger.info("*** NC Data Import Started ***")

    if url:
        if url == MAGIC_NC_FTP_URL:
            destination = nc_download_and_unzip_data(destination)
        else:
            destination = download_and_unzip_data(url, destination)
    else:
        unzip_data(destination, zip_path=zip_path)

    if max_stop_id is not None:
        truncate_input_data(destination, min_stop_id, max_stop_id)
        override_start_date = None
    else:
        # When processing entire dataset, pretend we don't have data from
        # 2000-2001 since so few agencies reported then.
        override_start_date = "Jan 01, 2002"

    # convert data files to CSV for database importing
    logger.info("Converting to CSV")
    convert_to_csv(destination)

    # find any new NC agencies and add to a copy of NC_agencies.csv
    logger.info("Looking for new NC agencies in Stops.csv")
    nc_agency_csv = update_nc_agencies(
        os.path.join(os.path.dirname(__file__), "NC_agencies.csv"),
        destination)

    # use COPY to load CSV files as quickly as possible
    copy_from(destination, nc_agency_csv)
    logger.info("NC Data Import Complete")

    # Clear the query cache to get rid of NC queries made on old data
    cache.clear()

    # fix landing page data
    facts = compute_dataset_facts(Agency,
                                  Stop,
                                  settings.NC_KEY,
                                  Search=Search,
                                  override_start_date=override_start_date)
    logger.info("NC dataset facts: %r", facts)

    # prime the query cache for large NC agencies
    if prime_cache:
        prime_cache_run()
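Two hedged usage sketches (paths, ids, and the zip filename are illustrative, not from the source). MAGIC_NC_FTP_URL triggers the FTP download path shown in Examples #1 and #2; passing url=None skips the download and extracts a previously downloaded zip from zip_path instead:

# Full import straight from the NC server (FTP settings assumed configured)
run(MAGIC_NC_FTP_URL)

# Developer-sized import from a local zip: only stops 1..100000 are loaded,
# and cache priming is skipped to save time
run(url=None,
    destination="/tmp/nc-stops",
    zip_path="/tmp/nc-stops/NC_STOPS_Extract.zip",
    min_stop_id=1,
    max_stop_id=100000,
    prime_cache=False)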
Code Example #6
def run(url, destination=None, zip_path=None, min_stop_id=None,
        max_stop_id=None, prime_cache=True):
    """
    Download NC data, extract, convert to CSV, and load into PostgreSQL

    :param url: if not None, zip will be downloaded from this URL; this can
      either be a URL supported by the requests library OR the special URL
      MAGIC_NC_FTP_URL, in which case the zip will be downloaded from the state
      of North Carolina server.
    :param destination: directory for unpacking zip and creating other
      files; pass None to create a temporary directory
    :param zip_path: path to previously-downloaded zip
    :param prime_cache: whether or not to prime the query cache for "big"
      NC agencies after import
    :param max_stop_id: only process stops with ids <= this value; this is to
      save time for developers by reducing the amount of data to import
    :param min_stop_id: only process stops with ids >= this value; this is to
      save time for developers by reducing the amount of data to import
    """
    if not url and not destination:
        raise ValueError('destination must be provided when no URL is provided')

    if (min_stop_id is None) != (max_stop_id is None):
        raise ValueError('provide neither or both of min_stop_id and max_stop_id')

    if max_stop_id is not None and min_stop_id > max_stop_id:
        raise ValueError('min_stop_id cannot be larger than max_stop_id')

    logger.info('*** NC Data Import Started ***')

    if url:
        if url == MAGIC_NC_FTP_URL:
            destination = nc_download_and_unzip_data(destination)
        else:
            destination = download_and_unzip_data(url, destination)
    else:
        unzip_data(destination, zip_path=zip_path)

    if max_stop_id is not None:
        truncate_input_data(destination, min_stop_id, max_stop_id)
        override_start_date = None
    else:
        # When processing entire dataset, pretend we don't have data from
        # 2000-2001 since so few agencies reported then.
        override_start_date = 'Jan 01, 2002'

    # convert data files to CSV for database importing
    convert_to_csv(destination)

    # find any new NC agencies and add to a copy of NC_agencies.csv
    nc_agency_csv = update_nc_agencies(
        os.path.join(os.path.dirname(__file__), 'NC_agencies.csv'),
        destination
    )

    # drop constraints/indexes
    drop_constraints_and_indexes(connections['traffic_stops_nc'].cursor())
    # use COPY to load CSV files as quickly as possible
    copy_from(destination, nc_agency_csv)
    logger.info("NC Data Import Complete")

    # Clear the query cache to get rid of NC queries made on old data
    flush_memcached()

    # fix landing page data
    facts = compute_dataset_facts(
        Agency, Stop, settings.NC_KEY, Search=Search,
        override_start_date=override_start_date
    )
    logger.info('NC dataset facts: %r', facts)

    # prime the query cache for large NC agencies
    if prime_cache:
        prime_cache_run()
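Example #6 differs from Example #5 mainly in that it drops constraints and indexes before the COPY to speed up bulk loading, and clears the cache through a flush_memcached() helper rather than calling cache.clear() directly. That helper is not shown here; assuming the project's Django cache backend is memcached, a minimal version could simply delegate to the cache API, as Example #5 does inline:

from django.core.cache import cache

def flush_memcached():
    # Drop cached query results computed from the previous NC dataset
    cache.clear()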