def nc_download_and_unzip_data(destination, prefix="state-"):
    """Download and unzip data into destination directory"""
    # make sure destination exists or create a temporary directory
    if not destination:
        destination = tempfile.mkdtemp(prefix=prefix)
        logger.debug("Created temp directory {}".format(destination))
    else:
        if not os.path.exists(destination):
            os.makedirs(destination)
            logger.info("Created {}".format(destination))
    zip_basename = date.today().strftime("NC_STOPS_Extract_%Y_%m_%d.zip")
    zip_filename = os.path.join(destination, zip_basename)
    # don't re-download data if raw data file already exists
    if os.path.exists(zip_filename):
        logger.debug("{} exists, skipping download".format(zip_filename))
    else:
        logger.debug("Downloading data to {}".format(zip_filename))
        nc_data_site = settings.NC_FTP_HOST
        nc_data_file = "STOPS_Extract.zip"
        nc_data_directory = "/TSTOPextract"
        ftps = ftps_connect(nc_data_site)
        ftps.cwd(nc_data_directory)
        logger.debug("Files available at %s:", nc_data_site)
        listing = ftps.retrlines("LIST", show_ftp_listing)
        line = listing.split("\n")[0]
        if not line.startswith("226 "):  # server's "Transfer complete" message
            raise ValueError("Expected 226 response from ftp server, got %r" % listing)
        logger.info('Downloading "%s"...', nc_data_file)
        with open(zip_filename, "wb") as f:
            ftps.retrbinary("RETR %s" % nc_data_file, f.write)
        logger.info('File written to "%s"', zip_filename)
    unzip_data(destination, zip_path=zip_filename)
    return destination
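
# The ftps_connect() helper used above isn't shown in this excerpt. A minimal
# sketch, assuming explicit FTP over TLS via ftplib and credentials in Django
# settings (the setting names NC_FTP_USER / NC_FTP_PASSWORD are illustrative,
# not confirmed by this excerpt):

from ftplib import FTP_TLS

def ftps_connect(host):
    """Open an explicit FTP-over-TLS connection and secure the data channel."""
    ftps = FTP_TLS(host)
    ftps.login(settings.NC_FTP_USER, settings.NC_FTP_PASSWORD)
    ftps.prot_p()  # encrypt the data connection, not just the control channel
    return ftps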
def nc_download_and_unzip_data(destination, prefix='state-'):
    """Download and unzip data into destination directory"""
    # make sure destination exists or create a temporary directory
    if not destination:
        destination = tempfile.mkdtemp(prefix=prefix)
        logger.debug("Created temp directory {}".format(destination))
    else:
        if not os.path.exists(destination):
            os.makedirs(destination)
            logger.info("Created {}".format(destination))
    zip_basename = date.today().strftime('NC_STOPS_Extract_%Y_%m_%d.zip')
    zip_filename = os.path.join(destination, zip_basename)
    # don't re-download data if raw data file already exists
    if os.path.exists(zip_filename):
        logger.debug("{} exists, skipping download".format(zip_filename))
    else:
        logger.debug("Downloading data to {}".format(zip_filename))
        nc_data_site = 'sbi1.jus.state.nc.us'
        nc_data_user = os.environ.get('NC_FTP_USER')
        nc_data_password = os.environ.get('NC_FTP_PASSWORD')
        nc_data_file = 'STOPS_Extract.zip'
        nc_data_directory = '/TSTOPextract'
        # Note: NC documents show FileZilla set up to use explicit FTP over TLS
        # if available (like FTP_TLS), but the server doesn't currently
        # support it.
        ftp = FTP(nc_data_site)
        ftp.login(nc_data_user, nc_data_password)
        ftp.cwd(nc_data_directory)
        logger.debug('Files available at %s:', nc_data_site)
        listing = ftp.retrlines('LIST', show_ftp_listing)
        line = listing.split('\n')[0]
        if not line.startswith('226 '):  # server's "Transfer complete" message
            raise ValueError('Expected 226 response from ftp server, got %r' % listing)
        logger.info('Downloading "%s"...', nc_data_file)
        with open(zip_filename, 'wb') as f:
            ftp.retrbinary('RETR %s' % nc_data_file, f.write)
        logger.info('File written to "%s"', zip_filename)
    unzip_data(destination, zip_path=zip_filename)
    return destination
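
# Both versions pass show_ftp_listing to retrlines(). ftplib invokes the
# callback once per line of the LIST output and returns the server's final
# status line (e.g. '226 Transfer complete.'), which is what the
# startswith('226 ') check above inspects. A plausible one-line
# implementation (a sketch, not the project's confirmed code):

def show_ftp_listing(line):
    logger.debug(line)  # echo each directory entry into the debug log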
def test_unzip_data(self):
    """
    test of download_and_unzip_data() above hits a lot of the main path of
    unzip_data(); this tests a few special scenarios
    """
    with self.assertRaises(ValueError):
        unzip_data(None)
    with self.assertRaises(ValueError):
        unzip_data("something", url=None, zip_path=None)
    with self.assertRaises(ValueError):
        unzip_data("something", url="http://example.com/foo.zip",
                   zip_path="/tmp/foo.zip")
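
# The argument contract these assertions pin down: unzip_data() needs a
# destination and exactly one of url/zip_path. A minimal sketch of that
# validation, assuming a zipfile-based extraction (illustrative, not the
# project's actual body; download_zip is a hypothetical helper):

import zipfile

def unzip_data(destination, url=None, zip_path=None):
    if not destination:
        raise ValueError("destination is required")
    if (url is None) == (zip_path is None):
        raise ValueError("provide exactly one of url and zip_path")
    if url is not None:
        zip_path = download_zip(url)  # hypothetical helper for the URL case
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(destination)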
def run(url, destination=None, zip_path=None, min_stop_id=None,
        max_stop_id=None, prime_cache=True):
    """
    Download NC data, extract, convert to CSV, and load into PostgreSQL

    :param url: if not None, zip will be downloaded from this URL; this can
       either be a URL supported by the requests library OR the special URL
       MAGIC_NC_FTP_URL, in which case the zip will be downloaded from the
       state of North Carolina server.
    :param destination: directory for unpacking zip and creating other
       files; pass None to create a temporary file
    :param zip_path: path to previously-downloaded zip
    :param prime_cache: whether or not to prime the query cache for "big"
       NC agencies after import
    :param max_stop_id: only process stops with ids <= this value; this is
       to save time for developers by reducing the amount of data to import
    :param min_stop_id: only process stops with ids >= this value; this is
       to save time for developers by reducing the amount of data to import
    """
    if not url and not destination:
        raise ValueError(
            "destination must be provided when no URL is provided")
    if (min_stop_id is None) != (max_stop_id is None):
        raise ValueError(
            "provide neither or both of min_stop_id and max_stop_id")
    if max_stop_id is not None and min_stop_id > max_stop_id:
        raise ValueError("min_stop_id cannot be larger than max_stop_id")

    logger.info("*** NC Data Import Started ***")

    if url:
        if url == MAGIC_NC_FTP_URL:
            destination = nc_download_and_unzip_data(destination)
        else:
            destination = download_and_unzip_data(url, destination)
    else:
        unzip_data(destination, zip_path=zip_path)

    if max_stop_id is not None:
        truncate_input_data(destination, min_stop_id, max_stop_id)
        override_start_date = None
    else:
        # When processing entire dataset, pretend we don't have data from
        # 2000-2001 since so few agencies reported then.
        override_start_date = "Jan 01, 2002"

    # convert data files to CSV for database importing
    logger.info("Converting to CSV")
    convert_to_csv(destination)

    # find any new NC agencies and add to a copy of NC_agencies.csv
    logger.info("Looking for new NC agencies in Stops.csv")
    nc_agency_csv = update_nc_agencies(
        os.path.join(os.path.dirname(__file__), "NC_agencies.csv"),
        destination)

    # use COPY to load CSV files as quickly as possible
    copy_from(destination, nc_agency_csv)
    logger.info("NC Data Import Complete")

    # Clear the query cache to get rid of NC queries made on old data
    cache.clear()

    # fix landing page data
    facts = compute_dataset_facts(Agency, Stop, settings.NC_KEY,
                                  Search=Search,
                                  override_start_date=override_start_date)
    logger.info("NC dataset facts: %r", facts)

    # prime the query cache for large NC agencies
    if prime_cache:
        prime_cache_run()
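
# A hedged usage example for run(): a developer importing only a slice of the
# stop data to keep a local import fast (the id bounds and destination path
# are illustrative, not values from the project):

run(MAGIC_NC_FTP_URL, destination="/tmp/nc-import",
    min_stop_id=1, max_stop_id=100000, prime_cache=False)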
def run(url, destination=None, zip_path=None, min_stop_id=None,
        max_stop_id=None, prime_cache=True):
    """
    Download NC data, extract, convert to CSV, and load into PostgreSQL

    :param url: if not None, zip will be downloaded from this URL; this can
       either be a URL supported by the requests library OR the special URL
       MAGIC_NC_FTP_URL, in which case the zip will be downloaded from the
       state of North Carolina server.
    :param destination: directory for unpacking zip and creating other
       files; pass None to create a temporary file
    :param zip_path: path to previously-downloaded zip
    :param prime_cache: whether or not to prime the query cache for "big"
       NC agencies after import
    :param max_stop_id: only process stops with ids <= this value; this is
       to save time for developers by reducing the amount of data to import
    :param min_stop_id: only process stops with ids >= this value; this is
       to save time for developers by reducing the amount of data to import
    """
    if not url and not destination:
        raise ValueError('destination must be provided when no URL is provided')
    if (min_stop_id is None) != (max_stop_id is None):
        raise ValueError('provide neither or both of min_stop_id and max_stop_id')
    if max_stop_id is not None and min_stop_id > max_stop_id:
        raise ValueError('min_stop_id cannot be larger than max_stop_id')

    logger.info('*** NC Data Import Started ***')

    if url:
        if url == MAGIC_NC_FTP_URL:
            destination = nc_download_and_unzip_data(destination)
        else:
            destination = download_and_unzip_data(url, destination)
    else:
        unzip_data(destination, zip_path=zip_path)

    if max_stop_id is not None:
        truncate_input_data(destination, min_stop_id, max_stop_id)
        override_start_date = None
    else:
        # When processing entire dataset, pretend we don't have data from
        # 2000-2001 since so few agencies reported then.
        override_start_date = 'Jan 01, 2002'

    # convert data files to CSV for database importing
    convert_to_csv(destination)

    # find any new NC agencies and add to a copy of NC_agencies.csv
    nc_agency_csv = update_nc_agencies(
        os.path.join(os.path.dirname(__file__), 'NC_agencies.csv'),
        destination
    )

    # drop constraints/indexes
    drop_constraints_and_indexes(connections['traffic_stops_nc'].cursor())

    # use COPY to load CSV files as quickly as possible
    copy_from(destination, nc_agency_csv)
    logger.info("NC Data Import Complete")

    # Clear the query cache to get rid of NC queries made on old data
    flush_memcached()

    # fix landing page data
    facts = compute_dataset_facts(
        Agency, Stop, settings.NC_KEY, Search=Search,
        override_start_date=override_start_date
    )
    logger.info('NC dataset facts: %r', facts)

    # prime the query cache for large NC agencies
    if prime_cache:
        prime_cache_run()
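
# This revision swaps cache.clear() for flush_memcached() and drops
# constraints/indexes before the bulk COPY, so PostgreSQL isn't doing
# incremental index maintenance during the load. Minimal sketches of those
# two helpers under that assumption (shapes assumed; the index name below is
# made up for illustration):

from django.core.cache import cache

def flush_memcached():
    cache.clear()  # Django cache API; works for the memcached backend

def drop_constraints_and_indexes(cursor):
    # the real helper would drop each NC table's indexes and constraints;
    # 'nc_stop_agency_id_idx' is a hypothetical example name
    cursor.execute('DROP INDEX IF EXISTS nc_stop_agency_id_idx')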