def mirror_old_style_records_etl():
    """Ensure the old-style records exist for all present new-style ones.

    Walks every record path in the RLID repository, derives the matching
    old-style path, and mirrors the document there when not already present.
    Tallies each outcome state and logs a summary count at the end.
    """
    start_time = datetime.datetime.now()
    LOG.info("Start: Mirror record documents in old-style naming.")
    share_credential = credential.UNCPathCredential(
        path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE
    )
    state_count = Counter()
    with share_credential:
        for record_path in rlid_record_paths():
            record_name = os.path.basename(record_path)
            record_id, extension = os.path.splitext(record_name)
            old_style_path = rlid_record_path_old(record_id, extension)
            # A falsy old-style path means the document has no database entry.
            if not old_style_path:
                state = "not in database"
            elif os.path.exists(old_style_path):
                state = "already mirrored"
            elif place_record_old(record_path):
                state = "mirrored"
            else:
                state = "failed to mirror"
                LOG.warning("%r failed to mirror to %r.", record_name, old_style_path)
            state_count[state] += 1
    document.log_state_counts(state_count, documents_type="records")
    LOG.info("End: Mirror.")
    elapsed(start_time, LOG)
def petition_documents_map():
    """Return mapping of petition ID to list of document attribute dictionaries.

    The Access database has infuriatingly-named objects, so we need to munge
    that rather than just joining values from the database table directly.

    Returns:
        dict
    """
    # Trim & upcase in SQL so the Python side only has to group rows.
    sql = """
        select
            ltrim(rtrim([id_num])) as petition_id,
            replace(
                replace(ucase(ltrim(rtrim([tiffname]))), '.PDF', ''), '.TIF', ''
            ) as document_name,
            ltrim(rtrim([image type])) as document_type
        from [image path]
        where [id_num] is not null and [tiffname] is not null
        group by [id_num], [tiffname], [image type];
    """
    odbc_conn = pyodbc.connect(database.access_odbc_string(PATH["petition_database"]))
    unc_conn = credential.UNCPathCredential(
        path.EUGENE_DATABASE_SHARE, **credential.CEDOM100
    )
    petition_documents = defaultdict(list)
    with odbc_conn, unc_conn:
        for row in odbc_conn.execute(sql):
            petition_id, doc_name, doc_type = row
            petition_documents[petition_id].append(
                {"document_name": doc_name, "document_type": doc_type}
            )
        return petition_documents
def plat_maps_update():
    """Run update for RLID plat map repository.

    Copies plat map documents from the Eugene source share into the RLID
    repository, flattening the source tree and creating PDF copies of images.
    """
    root_path = REPO_PATH["plat"]
    source_root_path = os.path.join(path.EUGENE_IMAGES_SHARE, "PLAT")
    conn = credential.UNCPathCredential(
        path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE
    )
    source_conn = credential.UNCPathCredential(source_root_path, **credential.CEDOM100)
    with conn, source_conn:
        # Currently only Eugene provides plat maps for RLID.
        document.update_repository(
            root_path,
            source_root_path,
            # Bug fix: ".jpeg" previously lacked its leading dot ("jpeg"), so
            # *.jpeg files never matched the extension filter.
            file_extensions=[".jpg", ".jpeg", ".pdf", ".tif", ".tiff"],
            flatten_tree=True,
            create_pdf_copies=True,
        )
def deeds_records_update():
    """Run update for deeds & records documents RLID repository.

    Pipeline: move drop-files to staging, extract record archives, delete
    stray log/reference files, convert record images to PDFs, then place the
    PDFs into the RLID repository (both old- & new-style locations).
    """
    start_time = datetime.datetime.now()
    # Logfile is per-year: repeated runs in the same year append to one file.
    PATH["logfile"] = os.path.join(
        PATH["staging"], "Deeds_Records_Update_{}.log".format(start_time.year))
    conn = credential.UNCPathCredential(PATH["staging"], **credential.RLID_DATA_SHARE)
    with conn:
        # Attach logfile handler for staging logfile.
        # NOTE(review): the handler is never removed from LOG, so calling this
        # function more than once per process duplicates log output — confirm
        # this only ever runs as a one-shot script.
        logfile = logging.FileHandler(PATH["logfile"])
        logfile.setLevel(logging.INFO)
        logfile.setFormatter(LOGFILE_FORMATTER)
        LOG.addHandler(logfile)
        LOG.info("START SCRIPT: Update RLID deeds & records repository.")
        LOG.info(
            "Start: Move deeds & records drop-files to staging directory.")
        drop_extensions = [".exe", ".pdf", ".zip"
                           ] + document.IMAGE_FILE_EXTENSIONS
        for file_name in os.listdir(PATH["drop"]):
            file_path = os.path.join(PATH["drop"], file_name)
            file_extension = os.path.splitext(file_name)[-1].lower()
            # Only move regular files with a recognized drop extension.
            if all(
                    [os.path.isfile(file_path), file_extension in drop_extensions]):
                move_path = os.path.join(PATH["staging"], file_name)
                shutil.move(file_path, move_path)
                LOG.info("Moved %r to %r.", file_path, move_path)
        LOG.info("End: Move.")
        LOG.info("Start: Extract record archives.")
        count = Counter()
        for file_path in path.folder_file_paths(PATH["staging"]):
            if os.path.splitext(file_path)[-1].lower() in [".exe", ".zip"]:
                # extract_records returns a state key; tally it.
                count[extract_records(file_path, archive_original=True)] += 1
        document.log_state_counts(count, documents_type="archives")
        # D&R archives include a few log & reference files; delete if present.
        for file_path in path.folder_file_paths(PATH["staging"]):
            for pattern in ["_logfile", "_xreffile"]:
                if pattern.lower() in file_path.lower():
                    os.remove(file_path)
        LOG.info("Start: Replace record images with PDFs.")
        count = Counter()
        for file_path in path.folder_file_paths(PATH["staging"]):
            if (os.path.splitext(file_path)[-1].lower()
                    in document.IMAGE_FILE_EXTENSIONS):
                count[convert_image(file_path, delete_original=True)] += 1
        document.log_state_counts(count, documents_type="images")
        LOG.info("Start: Place record PDFs in RLID repository.")
        count = Counter()
        for file_path in path.folder_file_paths(PATH["staging"]):
            if os.path.splitext(file_path)[-1].lower() == ".pdf":
                # Place old-style first; only delete the staging original once
                # the old-style placement succeeded.
                old_state = place_record_old(file_path)
                new_state = place_record(
                    file_path, delete_original=(old_state == "placed"))
                count.update([old_state, new_state])
        document.log_state_counts(count, documents_type="records")
    elapsed(start_time, LOG)
    LOG.info("END SCRIPT")
def weekly_datasets_etl():
    """Run ETL for map server datasets with weekly update cycle.

    This script should only be used for updating geodatabase datasets & other
    managed data stores. Purely file-based formats like shapefiles are best
    updated via `file_datasets_etl`, for reasons related to locking
    mechanisms.
    """
    share_credential = credential.UNCPathCredential(
        DATA_PATH, **credential.CPA_MAP_SERVER
    )
    with share_credential:
        for dataset_kwargs in DATASET_KWARGS_WEEKLY:
            # Entries without a source path are placeholders; skip them.
            if not dataset_kwargs.get("source_path"):
                continue
            transform.etl_dataset(**dataset_kwargs)
def locators_etl():
    """Run ETL for map server locators/geocoders.

    Need to shut down service before rebuilding.
    """
    share_credential = credential.UNCPathCredential(
        DATA_PATH, **credential.CPA_MAP_SERVER
    )
    with share_credential:
        # One token covers all stop/start service calls below.
        token = arcetl.services.generate_token(SERVER_URL, **credential.RLID_MAPS)
        for locator_kwargs in LOCATOR_KWARGS:
            arcetl.services.toggle_service(
                token=token, stop_service=True, **locator_kwargs
            )
            arcetl.workspace.build_locator(**locator_kwargs)
            arcetl.services.toggle_service(
                token=token, start_service=True, **locator_kwargs
            )
def rlidgeo_datasets_etl():
    """Run ETL for map server datasets in the RLIDGeo replica geodatabase."""
    share_credential = credential.UNCPathCredential(
        DATA_PATH, **credential.CPA_MAP_SERVER
    )
    with share_credential:
        for dataset_name in arcetl.workspace.dataset_names(database.RLIDGEO.path):
            lowered_name = dataset_name.lower()
            ignored = any(
                pattern.lower() in lowered_name
                for pattern in IGNORE_PATTERNS_RLIDGEO_SNAPSHOT
            )
            if ignored:
                LOG.warning("%s matches ignore-pattern: Skipping.", dataset_name)
                continue
            # Output name keeps only the last dot-delimited part, dropping any
            # database/schema qualification from the source name.
            transform.etl_dataset(
                source_path=os.path.join(database.RLIDGEO.path, dataset_name),
                output_path=os.path.join(
                    DATA_PATH, "RLIDGeo.gdb", dataset_name.split(".")[-1]
                ),
            )
def file_datasets_etl():
    """Run ETL for map server file-based datasets.

    This script should only be used for updating shapefiles & other purely
    file-based datasets. Managed data store formats like geodatabases are best
    updated via `etl_gimap_dataset`, for reasons related to locking
    mechanisms. Essentially, the file-based formats will not append-load on
    shapefiles locked by a service. So we pre-load them to a staging copy,
    where a server-side batch script can clear the locks & wholly replace the
    files.
    """
    staging_credential = credential.UNCPathCredential(
        STAGING_PATH, **credential.CPA_MAP_SERVER
    )
    with staging_credential:
        for dataset_kwargs in DATASET_KWARGS_FILE:
            # Entries without a source path are placeholders; skip them.
            if not dataset_kwargs.get("source_path"):
                continue
            transform.etl_dataset(**dataset_kwargs)
def service_datasets_monthly_etl():
    """Run ETL for GIMap datasets with monthly update cycle.

    This script should only be used for updating geodatabase datasets & other
    managed data stores. Purely file-based formats like shapefiles are best
    updated in another manner, for reasons related to locking mechanisms.

    Doc fix: the summary previously said "weekly update cycle", but this
    function iterates `KWARGS_MONTHLY_DATASETS` — the cycle is monthly.
    """
    conn = credential.UNCPathCredential(path.RLID_MAPS_DATA_SHARE,
                                        **credential.CPA_MAP_SERVER)
    with conn:
        # Sorted for deterministic processing order & readable logs.
        for gdb_relpath in sorted(KWARGS_MONTHLY_DATASETS):
            LOG.info("Update datasets in %s", gdb_relpath)
            gdb_path = os.path.join(DATA_PATH, gdb_relpath)
            for kwargs in KWARGS_MONTHLY_DATASETS[gdb_relpath]:
                # Derive the output path from the geodatabase + dataset name.
                # (Quotes normalized to double, matching the file's style.)
                kwargs["output_path"] = os.path.join(gdb_path,
                                                     kwargs["output_name"])
                transform.etl_dataset(**kwargs)
def plat_documents_map():
    """Return mapping of plat document ID to list of document attribute
    dictionaries.

    The Access database has infuriatingly-named objects, so we need to munge
    that rather than just joining values from the database table directly.

    Returns:
        dict
    """
    # Trim/upcase & strip image extensions in SQL; group to de-duplicate rows.
    sql = """
        select
            ltrim(rtrim([plat number])) as document_number,
            ltrim(rtrim([subdivision name])) as plat_name,
            replace(
                replace(ucase(ltrim(rtrim([image name]))), '.PDF', ''), '.TIF', ''
            ) as document_name,
            ucase(ltrim(rtrim([Description]))) as document_type
        from [image path]
        where [plat number] is not null
            and [plat number] <> 0
            and [image name] is not null
        group by [subdivision name], [plat number], [image name], [description];
    """
    odbc_conn = pyodbc.connect(database.access_odbc_string(PATH["plat_database"]))
    unc_conn = credential.UNCPathCredential(
        path.EUGENE_DATABASE_SHARE, **credential.CEDOM100
    )
    plat_documents = defaultdict(list)
    with odbc_conn, unc_conn:
        for row in odbc_conn.execute(sql):
            document_number, plat_name, document_name, document_type = row
            # Keys are integer plat numbers (source column is text).
            plat_documents[int(document_number)].append(
                {
                    "plat_name": plat_name,
                    "document_name": document_name,
                    "document_type": document_type,
                }
            )
        return plat_documents
def lcso_cad_datasets_etl():
    """Run ETL for LSCO CAD delivery datasets.

    Loads each delivery dataset as a shapefile, zips the deliverables
    directory to the download share, then emails the download link.
    """
    for dataset_name, dataset_kwargs in DATASET_KWARGS.items():
        dataset_kwargs["output_path"] = os.path.join(
            DELIVERABLES_PATH, dataset_name + ".shp"
        )
        transform.etl_dataset(**dataset_kwargs)
    zip_name = "LCSO_CAD_{}.zip".format(datestamp())
    zip_path = os.path.join(path.RLID_MAPS_WWW_SHARE, "Download", zip_name)
    www_credential = credential.UNCPathCredential(
        path.RLID_MAPS_WWW_SHARE, **credential.CPA_MAP_SERVER
    )
    with www_credential:
        # Exclude working files (locks) & prior archives from the zip.
        path.archive_directory(
            directory_path=DELIVERABLES_PATH,
            archive_path=zip_path,
            directory_as_base=False,
            archive_exclude_patterns=[".lock", ".zip"],
        )
    send_links_email(urls=[url.RLID_MAPS + "Download/" + zip_name], **MESSAGE_KWARGS)
def property_cards_staging_update():
    """Run update for RLID assessor property card staging repository.

    Copies any changed property card documents from the Lane County share into
    the staging repository, tallying update results.
    """
    LOG.info("Start: Update assessor property card staging repository.")
    start_time = datetime.datetime.now()
    source_paths = document.repository_file_paths(path.LANE_PROPERTY_CARDS)
    staging_credential = credential.UNCPathCredential(
        path.RLID_DATA_STAGING_SHARE, **credential.RLID_DATA_SHARE
    )
    with staging_credential:
        update_count = Counter()
        for source_path in source_paths:
            staging_path = os.path.join(
                REPO_PATH["property-card-staging"], os.path.basename(source_path)
            )
            # Only touch documents whose content differs from staging.
            if not document.changed(staging_path, source_path):
                continue
            update_count[document.update_document(source_path, staging_path)] += 1
    LOG.info("End: Update.")
    document.log_state_counts(update_count, documents_type="property cards (staging)")
    elapsed(start_time, LOG)
def tax_maps_staging_update():
    """Run update for RLID tax map staging repository.

    Copies any changed tax map documents from the Lane County image share into
    the staging repository, preserving the one-deep bin structure.
    """
    LOG.info("Start: Update tax map staging repository.")
    start_time = datetime.datetime.now()
    staging_credential = credential.UNCPathCredential(
        path.RLID_DATA_STAGING_SHARE, **credential.RLID_DATA_SHARE
    )
    with staging_credential:
        update_count = Counter()
        for source_path in document.repository_file_paths(path.LANE_TAX_MAP_IMAGES):
            # Tax maps have a one-deep bin: keep the immediate parent folder.
            bin_name = os.path.split(os.path.dirname(source_path))[-1]
            staging_path = os.path.join(
                REPO_PATH["tax-map-staging"], bin_name, os.path.basename(source_path)
            )
            if not document.changed(staging_path, source_path):
                continue
            update_count[document.update_document(source_path, staging_path)] += 1
    document.log_state_counts(update_count, documents_type="tax maps (staging)")
    elapsed(start_time, LOG)
    LOG.info("End: Update.")
def property_cards_update():
    """Run update for assessor property card RLID production repository.

    Promotes changed property card PDFs from the staging repository into the
    RLID production repository, tallying update results.
    """
    LOG.info("Start: Update RLID assessor property card repository.")
    start_time = datetime.datetime.now()
    staging_paths = document.repository_file_paths(
        REPO_PATH["property-card-staging"], file_extensions=[".pdf"]
    )
    share_credential = credential.UNCPathCredential(
        path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE
    )
    with share_credential:
        update_count = Counter()
        for staging_path in staging_paths:
            rlid_path = rlid_document_path(
                os.path.basename(staging_path), document_type="property-card"
            )
            # Only touch documents whose content differs from production.
            if not document.changed(rlid_path, staging_path):
                continue
            update_count[document.update_document(staging_path, rlid_path)] += 1
    LOG.info("End: Update.")
    document.log_state_counts(update_count, documents_type="property cards")
    elapsed(start_time, LOG)
def tillamook_delivery_etl():
    """Run ETL for Tillamook delivery.

    Loads all Tillamook datasets into a deliverable geodatabase, zips it to
    the download share, then sends the delivery message with the link.
    """
    name = "Tillamook"
    gdb_path = os.path.join(PATH["tillamook_deliverables"], name + ".gdb")
    all_dataset_kwargs = chain(
        TILLAMOOK_DATASET_KWARGS.items(), TILLAMOOK_GIS_DATASET_KWARGS.items()
    )
    for dataset_name, dataset_kwargs in all_dataset_kwargs:
        dataset_kwargs["output_path"] = os.path.join(gdb_path, dataset_name)
        transform.etl_dataset(**dataset_kwargs)
    zip_name = "{}_{}.zip".format(name, datestamp())
    zip_path = os.path.join(path.RLID_MAPS_WWW_SHARE, "Download", zip_name)
    www_credential = credential.UNCPathCredential(
        path.RLID_MAPS_WWW_SHARE, **credential.CPA_MAP_SERVER
    )
    with www_credential:
        # Exclude working lock-files; keep the geodatabase as the zip root.
        path.archive_directory(
            directory_path=gdb_path,
            archive_path=zip_path,
            directory_as_base=True,
            archive_exclude_patterns=[".lock"],
        )
    zip_url = url.RLID_MAPS + "Download/" + zip_name
    send_message_tillamook(
        zip_url, metadata_where_sql="in_tillamook = 1", **TILLAMOOK_MESSAGE_KWARGS
    )
def tax_maps_update():
    """Run update for RLID tax map repository.

    Promotes tax maps from staging into the RLID repository for every file
    released since the repository's recorded currency date, archiving previous
    versions, then advances the currency date if anything was updated.
    """
    start_time = datetime.datetime.now()
    conn = credential.UNCPathCredential(
        path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE
    )
    with conn:
        # Attach logfile handler for repository update logfile.
        # NOTE(review): the handler is never removed from LOG, so calling this
        # function more than once per process duplicates log output — confirm
        # this only ever runs as a one-shot script.
        logfile = logging.FileHandler(
            os.path.join(
                REPO_PATH["tax-map"], "Tax_Map_Update_{}.log".format(start_time.year)
            )
        )
        logfile.setLevel(logging.INFO)
        logfile.setFormatter(LOGFILE_FORMATTER)
        LOG.addHandler(logfile)
        LOG.info("START SCRIPT: Update RLID tax map repository from staging.")
        # Only consider files released since the recorded currency datetime.
        file_name_release_date = tax_map_file_name_release_map(
            start_datetime=rlid_data_currency("Tax Maps")
        )
        count = Counter()
        # Iterate through path/date map, adding, archiving & updating.
        for file_name, release_date in file_name_release_date.items():
            rlid_path = rlid_document_path(file_name, document_type="tax-map")
            staging_path = rlid_document_path(
                file_name, document_type="tax-map-staging"
            )
            result_key = update_tax_map(
                staging_path, rlid_path, release_date, archive_previous=True
            )
            count[result_key] += 1
        document.log_state_counts(count, documents_type="tax maps")
        # Finally, update tax map repository currency date (if we placed any).
        if count["updated"]:
            rlid_data_currency_setter("Tax Maps", max(file_name_release_date.values()))
    elapsed(start_time, LOG)
    LOG.info("END SCRIPT: Update")
def missing_in_rlid_etl():
    """Run ETL for log of deeds & records documents missing in RLID.

    Writes a CSV of document IDs whose expected repository files do not exist.
    """
    start_time = datetime.datetime.now()
    LOG.info(
        "Start: Compile table of deeds & records listed in Lane County records system,"
        " but not present in RLID repository.")
    staging_credential = credential.UNCPathCredential(
        PATH["staging"], **credential.RLID_DATA_SHARE
    )
    csv_path = os.path.join(PATH["staging"], "Missing_in_RLID.csv")
    check_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    missing_count = 0
    with staging_credential:
        # NOTE(review): "wb" mode matches the Python 2 csv idiom used
        # throughout this file — revisit if porting to Python 3.
        with open(csv_path, "wb") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(["document_id", "document_path", "check_time"])
            for record_path in rlid_record_paths():
                if os.path.exists(record_path):
                    continue
                record_id = os.path.splitext(os.path.basename(record_path))[0]
                csvwriter.writerow((record_id, record_path, check_time))
                missing_count += 1
        LOG.info("Found %s missing documents.", missing_count)
    LOG.info("End: Compile.")
    elapsed(start_time, LOG)
def tax_maps_not_in_source_etl():
    """Run ETL for log of tax map documents in RLID but not source repository.

    We used to have an automatic check & retire for RLID tax maps that were no
    longer in the source repository. This pretty much retired the entire
    taxmap repository the night of 2015-05-07. This was because there appear
    to be times when the source repository is not reachable, and/or reports
    nothing in the source. For now, we will just log potential orphans.

    If you do need to "retire" a tax map no longer in use:

    1. Make an archive copy of the document with this function call:
        ```
        archive_tax_map(
            tax_map_path, archive_date=datetime.datetime.now(), is_replaced=False
        )
        ```
    2. Move the document file to the `RetiredNoReplacement` subfolder.
    3. Execute the following SQL statement:
        ```
        if exists (
            select 1 from RLID.dbo.Taxmap_Retired where image_filename = {file-name}
        )
        begin;
            update RLID.dbo.Taxmap_Retired
            set date_retired = {same-date-as-archive-above}
            where image_filename = {file-name};
        end;
        else
        begin;
            insert into RLID.dbo.Taxmap_Retired(image_filename, date_retired)
            values ({file-name}, {same-date-as-archive-above});
        end;
        delete from RLID.dbo.Taxmap_Image where image_filename = {file-name};
        ```
    """
    start_time = datetime.datetime.now()
    # Bug fix: log message previously read "repositoryare" — the implicit
    # string concatenation was missing a space between "repository" & "are".
    LOG.info(
        "Start: Compile table of tax maps not mirrored between the Lane County & RLID"
        " repositories.\nAny tax maps in RLID not mirrored in the county repository"
        " are likely tax maps that no longer exist, and should be researched (and"
        " perhaps retired)."
    )
    conn = credential.UNCPathCredential(
        path.RLID_DATA_SHARE, **credential.RLID_DATA_SHARE
    )
    with conn:
        check_time = start_time.strftime("%Y-%m-%d %H:%M")
        # Normalized PDF file names present in each repository. The county
        # staging repository is binned one-deep, hence the os.walk.
        file_names = {
            "County": {
                fixed_file_name(name)
                for _, _, filenames in os.walk(REPO_PATH["tax-map-staging"])
                for name in filenames
                if name.lower().endswith(".pdf")
            },
            "RLID": {
                fixed_file_name(name)
                for name in os.listdir(REPO_PATH["tax-map"])
                if name.lower().endswith(".pdf")
            },
        }
        # Check both directions: in County not RLID, & in RLID not County.
        for repo, other in permutations(["County", "RLID"]):
            LOG.info("Checking %s repository for tax maps not mirrored.", repo)
            unmirrored_file_names = sorted(file_names[repo] - file_names[other])
            csv_path = os.path.join(
                REPO_PATH["tax-map"], "In_{}_Not_{}.csv".format(repo, other)
            )
            # NOTE(review): "wb" mode matches the Python 2 csv idiom used
            # throughout this file — revisit if porting to Python 3.
            csv_file = open(csv_path, "wb")
            with csv_file:
                csv_ = csv.writer(csv_file)
                csv_.writerow(("file_name", "check_time"))
                for file_name in unmirrored_file_names:
                    csv_.writerow((file_name, check_time))
            LOG.info(
                "Found %s tax maps in %s repository not mirrored in %s.",
                len(unmirrored_file_names),
                repo,
                other,
            )
    LOG.info("End: Compile.")
    elapsed(start_time, LOG)