def __init__(self, basepath): 
    """Constructor sets the authority and riis files and headers expected for BISON-RIIS processing. 

    Args: 
        basepath (str): Path to the base of the input data, used to construct 
            full filenames from basepath and relative path constants. 
    """ 
    # Delegate all setup to the NNSL base class; this subclass adds no state
    # of its own here.  NOTE(review): explicit NNSL.__init__ call (rather than
    # super()) assumes direct single inheritance from NNSL -- confirm against
    # the class header, which is outside this chunk.
    NNSL.__init__(self, basepath)
def __init__(self, base_path, test_fname=None, logger=None): 
    """Constructor sets the authority and species files and headers expected for BISON-RIIS processing. 

    Args: 
        base_path (str): base file path for project execution 
        test_fname (str): RIIS file with fewer records for testing 
        logger (object): logger for writing messages to file and console 
    """ 
    # All real initialization happens in the NNSL base class; the optional
    # test filename and logger are forwarded unchanged.
    NNSL.__init__(self, base_path, test_fname=test_fname, logger=logger)
def __init__(self, gbif_occ_filename, nnsl=None, logger=None): 
    """Constructor. 

    Args: 
        gbif_occ_filename (str): full path of CSV occurrence file to annotate 
        nnsl (bison.common.riis.NNSL): object containing USGS RIIS data for 
            annotating records 
        logger (object): logger for saving relevant processing messages 
    """ 
    # Outputs, the default logger, and the default RIIS file all live in the
    # same directory as the input occurrence CSV.
    datapath, _ = os.path.split(gbif_occ_filename) 
    self._datapath = datapath 
    self._csvfile = gbif_occ_filename 
    if logger is None: 
        logger = get_logger(datapath) 
    self._log = logger 
    if nnsl is not None: 
        self.nnsl = nnsl 
    else: 
        # No RIIS data supplied: read the resolved RIIS records from the
        # default filename in the input data directory.
        riis_filename = os.path.join(datapath, RIIS_SPECIES.FNAME) 
        self.nnsl = NNSL(riis_filename, logger=logger) 
        self.nnsl.read_riis(read_resolved=True) 
    # Must georeference points to add new, consistent state and county fields 
    self._geo_county = GeoResolver(US_CENSUS_COUNTY.FILE, 
                                   US_CENSUS_COUNTY.CENSUS_BISON_MAP, 
                                   self._log) 
    # Input reader 
    self._dwcdata = DwcData(self._csvfile, logger=logger) 
    # Output writer (created later, when annotation starts) 
    self._csv_writer = None 
    # Continental-US state names and abbreviations: everything except AK/HI.
    self._conus_states = [] 
    for k, v in US_STATES.items(): 
        if k not in ("Alaska", "Hawaii"): 
            self._conus_states.extend([k, v]) 
    self._all_states = self._conus_states.copy() 
    self._all_states.extend(["Alaska", "Hawaii", "AK", "HI"]) 
    # Test DwC record contents: counters/tallies for state matching results.
    self.good_locations = {} 
    self.bad_locations = {} 
    self.missing_states = 0 
    self.matched_states = 0 
    self.mismatched_states = 0
def annotate_occurrence_files(input_filenames, logger):
    """Annotate GBIF records with census state and county, and RIIS key and assessment.

    Args:
        input_filenames (list): list of full filenames containing GBIF data
            for annotation.
        logger (object): logger for saving relevant processing messages

    Returns:
        annotated_filenames: full filenames for GBIF data annotated with
            state, county, RIIS assessment, and RIIS key.
    """
    annotated_filenames = []
    if not input_filenames:
        return annotated_filenames
    # BUG FIX: `riis_filename` was referenced without ever being assigned,
    # raising NameError on every call.  Construct it the same way
    # Annotator.__init__ does: the default RIIS filename in the directory
    # that contains the occurrence data.
    datapath, _ = os.path.split(input_filenames[0])
    riis_filename = os.path.join(datapath, RIIS_SPECIES.FNAME)
    # Read the RIIS data once, then share it across all files.
    nnsl = NNSL(riis_filename, logger=logger)
    nnsl.read_riis(read_resolved=True)
    for csv_filename in input_filenames:
        ant = Annotator(csv_filename, nnsl=nnsl, logger=logger)
        annotated_dwc_fname = ant.annotate_dwca_records()
        annotated_filenames.append(annotated_dwc_fname)
    return annotated_filenames
def resolve_riis_taxa(riis_filename, logger):
    """Resolve and write GBIF accepted names and taxonKeys in RIIS records.

    Args:
        riis_filename (str): full filename for RIIS data records.
        logger (object): logger for saving relevant processing messages

    Returns:
        str: full output filename for RIIS data records with updated taxa
            and taxonKeys from GBIF.
    """
    riis_data = NNSL(riis_filename, logger=logger)
    # Update species data with GBIF-accepted taxa, then persist the result.
    riis_data.resolve_riis_to_gbif_taxa()
    written = riis_data.write_resolved_riis()
    # A record-count mismatch is worth noting, but is not treated as fatal.
    if written != RIIS_SPECIES.DATA_COUNT:
        logger.debug(
            f"Resolved {written} RIIS records, expecting {RIIS_SPECIES.DATA_COUNT}"
        )
    return riis_data.gbif_resolved_riis_fname
def test_missing_resolved_records(self, is_test=True):
    """Read original and updated RIIS records; report records absent from the updated file.

    Args:
        is_test (bool): True if testing smaller test data file.
    """
    logit(self._log, "*** test_missing_resolved_records ***")
    # Re-read original data
    self.read_riis(read_resolved=False)
    # Read the resolved data, optionally from the smaller test file.
    test_fname = RIIS_SPECIES.TEST_FNAME if is_test else None
    resolved_nnsl = NNSL(DATA_PATH, test_fname=test_fname)
    resolved_nnsl.read_riis(read_resolved=True)
    # Any original occurrenceID absent from the resolved data is missing.
    resolved_ids = resolved_nnsl.nnsl_by_id
    for occid in self.nnsl_by_id:
        if occid not in resolved_ids:
            logit(self._log, "Missing record {}".format(occid))
def test_resolution_output(self, is_test=True):
    """Record changed GBIF taxonomic resolutions and write updated records.

    Args:
        is_test (bool): True if testing smaller test data file.
    """
    logit(self._log, "*** test_resolution_output ***")
    # Re-read original data
    self.read_riis(read_resolved=False)
    # Read the resolved data, optionally from the smaller test file.
    test_fname = RIIS_SPECIES.TEST_FNAME if is_test else None
    resolved_nnsl = NNSL(DATA_PATH, test_fname=test_fname)
    resolved_nnsl.read_riis(read_resolved=True)
    orig_rec_count = 0
    res_rec_count = 0
    resolved_ids = resolved_nnsl.nnsl_by_id
    # Every original occurrenceID should appear in the resolved dictionary;
    # count both sides and log any that failed to resolve.
    for occid in self.nnsl_by_id:
        orig_rec_count += 1
        if occid in resolved_ids:
            res_rec_count += 1
        else:
            logit(
                self._log,
                "Failed to find occurrenceID {} in resolved dictionary".
                format(occid))
    if orig_rec_count != res_rec_count:
        logit(
            self._log,
            "Original records {}, updated records {}".format(
                orig_rec_count, res_rec_count))
        # NOTE(review): these print statements continue a method whose `def`
        # line is outside this chunk; they summarize the state-matching
        # counters and the good/bad location tallies accumulated on the
        # instance during annotation testing.
        print(f" Matched states: {self.matched_states}")
        print(f" Mis-matched states: {self.mismatched_states}")
        print(f" Missing states: {self.missing_states}")
        print(" Good states: ")
        for st, counties in self.good_locations.items():
            print(f" {st}: {counties}")
        print(" Bad states: ")
        for st, counties in self.bad_locations.items():
            print(f" {st}: {counties}")


# .............................................................................
if __name__ == "__main__":
    # Test the taxonkey contents in GBIF simple CSV download file
    logger = get_logger(DATA_PATH, logname="test_annotate")
    # NOTE(review): nnsl_data is created but never passed to TestAnnotator
    # below -- confirm whether it should be supplied so each chunk avoids
    # re-reading RIIS data.
    nnsl_data = NNSL(DATA_PATH, logger=logger)
    big_gbif_fname = os.path.join(DATA_PATH, GBIF.TEST_DATA)
    # Split the large download into chunks and annotate each separately.
    chunk_fnames = chunk_files(big_gbif_fname)
    for fname in chunk_fnames:
        tst = TestAnnotator(fname, do_resolve=False, logger=logger)
        tst.test_annotate_records()

# Dead module-level string: an interactive snippet kept for copy/paste
# debugging in a REPL; it has no runtime effect.
"""
from test.test_annotate import *
outpath = "/tmp"
logname = "test_annotate"
csvfile = GBIF.TEST_DATA
logger = get_logger(DATA_PATH, logname=logname)
"""
class Annotator():
    """Class for adding USGS RIIS info to GBIF occurrences."""

    def __init__(self, gbif_occ_filename, nnsl=None, logger=None):
        """Constructor.

        Args:
            gbif_occ_filename (str): full path of CSV occurrence file to annotate
            nnsl (bison.common.riis.NNSL): object containing USGS RIIS data for
                annotating records
            logger (object): logger for saving relevant processing messages
        """
        # Outputs, the default logger, and the default RIIS file all live in
        # the same directory as the input occurrence CSV.
        datapath, _ = os.path.split(gbif_occ_filename)
        self._datapath = datapath
        self._csvfile = gbif_occ_filename
        if logger is None:
            logger = get_logger(datapath)
        self._log = logger
        if nnsl is not None:
            self.nnsl = nnsl
        else:
            # No RIIS data supplied: read the resolved RIIS records from the
            # default filename in the input data directory.
            riis_filename = os.path.join(datapath, RIIS_SPECIES.FNAME)
            self.nnsl = NNSL(riis_filename, logger=logger)
            self.nnsl.read_riis(read_resolved=True)
        # Must georeference points to add new, consistent state and county fields
        self._geo_county = GeoResolver(US_CENSUS_COUNTY.FILE,
                                       US_CENSUS_COUNTY.CENSUS_BISON_MAP,
                                       self._log)
        # Input reader
        self._dwcdata = DwcData(self._csvfile, logger=logger)
        # Output writer and its underlying file handle; opened in
        # _open_input_output.  BUG FIX: _outf is now initialized here so that
        # close() and is_open are safe to call before annotation starts.
        self._csv_writer = None
        self._outf = None
        # Continental-US state names and abbreviations: everything but AK/HI.
        self._conus_states = []
        for k, v in US_STATES.items():
            if k not in ("Alaska", "Hawaii"):
                self._conus_states.extend([k, v])
        self._all_states = self._conus_states.copy()
        self._all_states.extend(["Alaska", "Hawaii", "AK", "HI"])
        # Test DwC record contents
        self.good_locations = {}
        self.bad_locations = {}
        self.missing_states = 0
        self.matched_states = 0
        self.mismatched_states = 0

    # ...............................................
    @classmethod
    def construct_annotated_name(cls, csvfile):
        """Construct a full filename for the annotated version of csvfile.

        Args:
            csvfile (str): full filename used to construct an annotated
                filename for this data.

        Returns:
            outfname: output filename derived from the input GBIF DWC filename
        """
        pth, basefilename = os.path.split(csvfile)
        basename, ext = os.path.splitext(basefilename)
        # Strip a "_raw" marker (and anything after it) from the basename
        # before appending "_annotated".
        try:
            rawidx = basename.index("_raw")
            basename = basename[:rawidx]
        except ValueError:
            pass
        newbasefilename = f"{basename}_annotated{ext}"
        outfname = os.path.join(pth, newbasefilename)
        return outfname

    # ...............................................
    def _open_input_output(self):
        """Open the DwcData for reading and the csv_writer for writing.

        Also reads the first record and writes the header.

        Returns:
            outfname: full filename of the output file

        Raises:
            Exception: on failure to open the DwcData csvreader.
            Exception: on failure to open the csv_writer.
        """
        outfname = self.construct_annotated_name(self._csvfile)
        try:
            self._dwcdata.open()
        except Exception:
            raise
        # Output columns = input columns plus the four annotation fields.
        header = self._dwcdata.fieldnames
        header.extend([
            NEW_RIIS_KEY_FLD, NEW_RIIS_ASSESSMENT_FLD, NEW_RESOLVED_COUNTY,
            NEW_RESOLVED_STATE
        ])
        try:
            self._csv_writer, self._outf = get_csv_dict_writer(
                outfname,
                header,
                GBIF.DWCA_DELIMITER,
                fmode="w",
                encoding=ENCODING,
                overwrite=True)
        except Exception:
            raise Exception(
                f"Failed to open file or csv_writer for {outfname}")
        return outfname

    # ...............................................
    def close(self):
        """Close input datafiles and output file."""
        self._dwcdata.close()
        # _outf is None until _open_input_output succeeds.
        if self._outf is not None:
            self._outf.close()
        self._csv_writer = None

    # ...............................................
    @property
    def is_open(self):
        """Return true if the output file is open.

        Returns:
            :type bool, True if the output CSV file is open, False otherwise
        """
        # BUG FIX: previously tested self._inf, which is never assigned
        # anywhere in this class (input reading is wrapped by DwcData), so
        # every access raised AttributeError.
        if self._outf is not None and not self._outf.closed:
            return True
        # NOTE(review): input-side openness is tracked inside DwcData;
        # surface it here if DwcData exposes a public open indicator.
        return False

    # ...............................................
    def annotate_dwca_records(self):
        """Resolve and append state, county, RIIS assessment, and RIIS key to GBIF DWC occurrence records.

        Returns:
            annotated_dwc_fname: full filename of the GBIF DWC records with
                appended fields.

        Raises:
            Exception: on failure to open input or output data.
            Exception: on unexpected failure to read or write data.
        """
        # gbifIDs of records under investigation; logged when encountered.
        trouble = "1698055779"
        trouble_next = "1698058398"
        try:
            # Open the original DwC data file for read, and the annotated file for write.
            annotated_dwc_fname = self._open_input_output()
        except Exception:
            raise
        else:
            self._log.info(
                f"Annotating {self._csvfile} to create {annotated_dwc_fname}")
        try:
            # iterate over DwC records
            dwcrec = self._dwcdata.get_record()
            while dwcrec is not None:
                gbif_id = dwcrec[GBIF.ID_FLD]
                if (self._dwcdata.recno % LOG.INTERVAL) == 0:
                    self._log.info(
                        f"*** Record number {self._dwcdata.recno}, gbifID: {gbif_id} ***"
                    )
                # Debug: examine data
                if gbif_id == trouble:
                    self._log.debug(f"Found troubled gbifID {trouble}")
                if gbif_id == trouble_next:
                    self._log.debug("Not so troubling")
                if EXTRA_CSV_FIELD in dwcrec.keys():
                    self._log.debug(
                        f"Extra fields detected: possible bad read for record {gbif_id}"
                    )
                # Initialize new fields
                county = state = riis_assessment = riis_key = None
                # Find county and state for these coords
                try:
                    county, state = self._find_county_state(
                        dwcrec[GBIF.LON_FLD],
                        dwcrec[GBIF.LAT_FLD],
                        buffer_vals=POINT_BUFFER_RANGE)
                except ValueError as e:
                    self._log.error(f"Record gbifID: {gbif_id}: {e}")
                except GeoException as e:
                    self._log.error(f"Record gbifID: {gbif_id}: {e}")
                # RIIS determinations are regional: Alaska, Hawaii, or the
                # lower 48 states ("L48").
                if state in ("AK", "HI"):
                    region = state
                else:
                    region = "L48"
                # Find the RIIS assessment for this acceptedTaxonKey + region.
                taxkey = dwcrec[GBIF.ACC_TAXON_FLD]
                riis_assessment, riis_key = \
                    self.nnsl.get_assessment_for_gbif_taxonkey_region(
                        taxkey, region)
                # Add county, state and RIIS assessment to record
                dwcrec[NEW_RESOLVED_COUNTY] = county
                dwcrec[NEW_RESOLVED_STATE] = state
                dwcrec[NEW_RIIS_ASSESSMENT_FLD] = riis_assessment
                dwcrec[NEW_RIIS_KEY_FLD] = riis_key
                try:
                    self._csv_writer.writerow(dwcrec)
                except ValueError as e:
                    self._log.error(
                        f"ValueError {e} on record with gbifID {gbif_id}")
                except Exception as e:
                    self._log.error(
                        f"Unknown error {e} record with gbifID {gbif_id}")
                dwcrec = self._dwcdata.get_record()
        except Exception as e:
            raise Exception(
                f"Unexpected error {e} reading {self._dwcdata.input_file} or writing {annotated_dwc_fname}"
            )
        return annotated_dwc_fname

    # ...............................................
    def _find_county_state(self, lon, lat, buffer_vals):
        """Intersect a point with census county boundaries.

        Args:
            lon: longitude of the occurrence point (may be None)
            lat: latitude of the occurrence point (may be None)
            buffer_vals: buffer distances to retry the intersection with

        Returns:
            county, state: resolved values, or (None, None) when either
                coordinate is missing.

        Raises:
            ValueError: on bad coordinate values.
            GeoException: on geospatial intersection failure.
        """
        county = state = None
        if None not in (lon, lat):
            # Intersect coordinates with county boundaries for state and county values
            try:
                fldvals, ogr_seconds = self._geo_county.find_enclosing_polygon(
                    lon, lat, buffer_vals=buffer_vals)
            except ValueError:
                raise
            except GeoException:
                raise
            # Flag unusually slow OGR intersections for investigation.
            if ogr_seconds > 0.75:
                self._log.debug(
                    f"Rec {self._dwcdata.recno}; intersect point {lon}, {lat}; OGR time {ogr_seconds}"
                )
            county = fldvals[NEW_RESOLVED_COUNTY]
            state = fldvals[NEW_RESOLVED_STATE]
        return county, state

    # ...............................................
    def _get_riis_species(self):
        """Read the resolved RIIS species data from the input data directory.

        Returns:
            nnsl: NNSL object populated with resolved RIIS records.
        """
        riis_filename = os.path.join(self._datapath, RIIS_SPECIES.FNAME)
        nnsl = NNSL(riis_filename, logger=self._log)
        nnsl.read_riis(read_resolved=True)
        return nnsl
import argparse

from bison.common.riis import NNSL


# ...............................................
if __name__ == "__main__":
    # Default locations for local development runs; positional args below
    # override them on the command line.
    DEFAULT_BISON_PATH = "/home/astewart/git/bison"
    DEFAULT_GBIF_FNAME = "/tank/bison/2022/gbif_2022_01_0-100.csv"
    parser = argparse.ArgumentParser(
        description=
        'Annotate GBIF records with BISON RIIS determinations and aggregate results.'
    )
    parser.add_argument('bison_path', type=str, default=DEFAULT_BISON_PATH,
                        help='The base path for BISON input data and outputs.')
    parser.add_argument(
        'gbif_fname', type=str, default=DEFAULT_GBIF_FNAME,
        help='The full path to GBIF input species occurrence data.')
    args = parser.parse_args()
    # NOTE(review): args.gbif_fname is parsed but never used below -- the
    # GBIF-annotation step appears unfinished (see trailing comment).
    bison = NNSL(args.bison_path)
    # NOTE(review): read_species/resolve_gbif_species/write_species do not
    # match the NNSL API used elsewhere in this project
    # (read_riis / resolve_riis_to_gbif_taxa / write_resolved_riis) --
    # confirm these methods exist on NNSL before running.
    bison.read_species()
    # Update species data
    bison.resolve_gbif_species()
    bison.write_species()

    # Step through GBIF data and annotate with RIIS Ids