Example #1
    def __init__(self, basepath):
        """Constructor sets the authority and riis files and headers expected for BISON-RIIS processing.

        Args:
            basepath (str): Path to the base of the input data, used to construct full
                filenames from basepath and relative path constants.
        """
        NNSL.__init__(self, basepath)
Example #2
    def __init__(self, base_path, test_fname=None, logger=None):
        """Constructor sets the authority and species files and headers expected for BISON-RIIS processing.

        Args:
            base_path (str): base file path for project execution
            test_fname (str): RIIS file with fewer records for testing
            logger (object): logger for writing messages to file and console
        """
        NNSL.__init__(self, base_path, test_fname=test_fname, logger=logger)
Example #3
    def __init__(self, gbif_occ_filename, nnsl=None, logger=None):
        """Constructor.

        Args:
            gbif_occ_filename (str): full path of CSV occurrence file to annotate
            nnsl (bison.common.riis.NNSL): object containing USGS RIIS data for annotating records
            logger (object): logger for saving relevant processing messages
        """
        datapath, _ = os.path.split(gbif_occ_filename)
        self._datapath = datapath
        self._csvfile = gbif_occ_filename

        if logger is None:
            logger = get_logger(datapath)
        self._log = logger

        if nnsl is not None:
            self.nnsl = nnsl
        else:
            riis_filename = os.path.join(datapath, RIIS_SPECIES.FNAME)
            self.nnsl = NNSL(riis_filename, logger=logger)
            self.nnsl.read_riis(read_resolved=True)

        # Must georeference points to add new, consistent state and county fields
        self._geo_county = GeoResolver(US_CENSUS_COUNTY.FILE,
                                       US_CENSUS_COUNTY.CENSUS_BISON_MAP,
                                       self._log)

        # Input reader
        self._dwcdata = DwcData(self._csvfile, logger=logger)
        # Output writer
        self._csv_writer = None

        self._conus_states = []
        for k, v in US_STATES.items():
            if k not in ("Alaska", "Hawaii"):
                self._conus_states.extend([k, v])
        self._all_states = self._conus_states.copy()
        self._all_states.extend(["Alaska", "Hawaii", "AK", "HI"])

        # Test DwC record contents
        self.good_locations = {}
        self.bad_locations = {}
        self.missing_states = 0
        self.matched_states = 0
        self.mismatched_states = 0
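
A minimal usage sketch for the constructor above (the Annotator class defined in Example #9). The file paths are hypothetical placeholders; only the constructor signature and the NNSL import come from these examples.

# Hypothetical paths; Annotator is the class whose constructor is shown above.
from bison.common.riis import NNSL

riis_filename = "/tmp/bison/riis_master_list.csv"   # hypothetical RIIS file
gbif_filename = "/tmp/bison/gbif_chunk_1_raw.csv"   # hypothetical GBIF occurrence chunk

# Option 1: let the constructor locate and read the RIIS file in the same directory
# as the occurrence data.
annotator = Annotator(gbif_filename)

# Option 2: read and resolve the RIIS data once, then share it across several Annotator
# instances so each one skips its own read_riis call.
nnsl = NNSL(riis_filename)
nnsl.read_riis(read_resolved=True)
annotator = Annotator(gbif_filename, nnsl=nnsl)
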
Example #4
def annotate_occurrence_files(riis_filename, input_filenames, logger):
    """Annotate GBIF records with census state and county, and RIIS key and assessment.

    Args:
        riis_filename (str): full filename of the RIIS data records used for annotation.
        input_filenames (list): list of full filenames containing GBIF data for annotation.
        logger (object): logger for saving relevant processing messages

    Returns:
        annotated_filenames: full filenames for GBIF data annotated with state, county, RIIS assessment, and RIIS key.
    """
    annotated_filenames = []
    nnsl = NNSL(riis_filename, logger=logger)
    nnsl.read_riis(read_resolved=True)
    for csv_filename in input_filenames:
        ant = Annotator(csv_filename, nnsl=nnsl, logger=logger)
        annotated_dwc_fname = ant.annotate_dwca_records()
        annotated_filenames.append(annotated_dwc_fname)
    return annotated_filenames
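
A hedged usage sketch for annotate_occurrence_files; the paths below are hypothetical placeholders, while get_logger and NNSL follow the same calls used elsewhere in these examples.

# Hypothetical paths; the call pattern follows the function defined above.
logger = get_logger("/tmp/bison", logname="annotate_run")   # hypothetical log location and name
riis_filename = "/tmp/bison/riis_master_list.csv"           # hypothetical RIIS file
input_filenames = [
    "/tmp/bison/gbif_chunk_1_raw.csv",                      # hypothetical GBIF chunks
    "/tmp/bison/gbif_chunk_2_raw.csv",
]
annotated = annotate_occurrence_files(riis_filename, input_filenames, logger)
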
Example #5
def resolve_riis_taxa(riis_filename, logger):
    """Resolve and write GBIF accepted names and taxonKeys in RIIS records.

    Args:
        riis_filename (str): full filename for RIIS data records.
        logger (object): logger for saving relevant processing messages

    Returns:
        resolved_riis_filename: full output filename for RIIS data records with updated taxa and taxonKeys from GBIF.
    """
    nnsl = NNSL(riis_filename, logger=logger)
    # Update species data
    nnsl.resolve_riis_to_gbif_taxa()
    count = nnsl.write_resolved_riis()
    if count != RIIS_SPECIES.DATA_COUNT:
        logger.debug(
            f"Resolved {count} RIIS records, expecting {RIIS_SPECIES.DATA_COUNT}"
        )
    resolved_riis_filename = nnsl.gbif_resolved_riis_fname
    return resolved_riis_filename
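
A short usage sketch for resolve_riis_taxa; the path and log name are hypothetical placeholders. The returned filename points at the GBIF-resolved copy of the RIIS data that the annotation step later reads with read_resolved=True.

# Hypothetical paths; the call pattern follows the function defined above.
logger = get_logger("/tmp/bison", logname="resolve_riis")   # hypothetical
riis_filename = "/tmp/bison/riis_master_list.csv"           # hypothetical
resolved_fname = resolve_riis_taxa(riis_filename, logger)
logger.info(f"Resolved RIIS records written to {resolved_fname}")
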
Example #6
    def test_missing_resolved_records(self, is_test=True):
        """Read the original and updated RIIS records and find missing records in the updated file.

        Args:
            is_test (bool): True if testing smaller test data file.
        """
        logit(self._log, "*** test_missing_resolved_records ***")
        # Re-read original data
        self.read_riis(read_resolved=False)

        # resolved data
        test_fname = None
        if is_test:
            test_fname = RIIS_SPECIES.TEST_FNAME
        resolved_nnsl = NNSL(DATA_PATH, test_fname=test_fname)
        resolved_nnsl.read_riis(read_resolved=True)

        # Verify that every original record is present in the resolved data
        for occid in self.nnsl_by_id.keys():
            if occid not in resolved_nnsl.nnsl_by_id:
                logit(self._log, "Missing record {}".format(occid))
Example #7
    def test_resolution_output(self, is_test=True):
        """Record changed GBIF taxonomic resolutions and write updated records.

        Args:
            is_test (bool): True if testing smaller test data file.
        """
        logit(self._log, "*** test_resolution_output ***")
        # Re-read original data
        self.read_riis(read_resolved=False)

        # resolved data
        test_fname = None
        if is_test:
            test_fname = RIIS_SPECIES.TEST_FNAME
        resolved_nnsl = NNSL(DATA_PATH, test_fname=test_fname)
        resolved_nnsl.read_riis(read_resolved=True)

        orig_rec_count = 0
        res_rec_count = 0
        # Count each original record and check whether it appears in the resolved data
        for occid in self.nnsl_by_id.keys():
            orig_rec_count += 1
            if occid in resolved_nnsl.nnsl_by_id:
                res_rec_count += 1
            else:
                logit(
                    self._log,
                    "Failed to find occurrenceID {} in resolved dictionary".format(occid))

        if orig_rec_count != res_rec_count:
            logit(
                self._log, "Original records {}, updated records {}".format(
                    orig_rec_count, res_rec_count))
Example #8
        print(f"   Matched states: {self.matched_states}")
        print(f"   Mis-matched states: {self.mismatched_states}")
        print(f"   Missing states: {self.missing_states}")
        print("   Good states: ")
        for st, counties in self.good_locations.items():
            print(f"  {st}: {counties}")
        print("   Bad states: ")
        for st, counties in self.bad_locations.items():
            print(f"  {st}: {counties}")


# .............................................................................
if __name__ == "__main__":
    # Test the taxonkey contents in GBIF simple CSV download file
    logger = get_logger(DATA_PATH, logname="test_annotate")
    nnsl_data = NNSL(DATA_PATH, logger=logger)
    big_gbif_fname = os.path.join(DATA_PATH, GBIF.TEST_DATA)

    chunk_fnames = chunk_files(big_gbif_fname)
    for fname in chunk_fnames:
        tst = TestAnnotator(fname, do_resolve=False, logger=logger)
        tst.test_annotate_records()
"""
from test.test_annotate import *

outpath = "/tmp"
logname = "test_annotate"
csvfile = GBIF.TEST_DATA
logger = get_logger(DATA_PATH, logname=logname)

"""
Example #9
class Annotator():
    """Class for adding USGS RIIS info to GBIF occurrences."""
    def __init__(self, gbif_occ_filename, nnsl=None, logger=None):
        """Constructor.

        Args:
            gbif_occ_filename (str): full path of CSV occurrence file to annotate
            nnsl (bison.common.riis.NNSL): object containing USGS RIIS data for annotating records
            logger (object): logger for saving relevant processing messages
        """
        datapath, _ = os.path.split(gbif_occ_filename)
        self._datapath = datapath
        self._csvfile = gbif_occ_filename

        if logger is None:
            logger = get_logger(datapath)
        self._log = logger

        if nnsl is not None:
            self.nnsl = nnsl
        else:
            riis_filename = os.path.join(datapath, RIIS_SPECIES.FNAME)
            self.nnsl = NNSL(riis_filename, logger=logger)
            self.nnsl.read_riis(read_resolved=True)

        # Must georeference points to add new, consistent state and county fields
        self._geo_county = GeoResolver(US_CENSUS_COUNTY.FILE,
                                       US_CENSUS_COUNTY.CENSUS_BISON_MAP,
                                       self._log)

        # Input reader
        self._dwcdata = DwcData(self._csvfile, logger=logger)
        # Output writer
        self._csv_writer = None

        self._conus_states = []
        for k, v in US_STATES.items():
            if k not in ("Alaska", "Hawaii"):
                self._conus_states.extend([k, v])
        self._all_states = self._conus_states.copy()
        self._all_states.extend(["Alaska", "Hawaii", "AK", "HI"])

        # Test DwC record contents
        self.good_locations = {}
        self.bad_locations = {}
        self.missing_states = 0
        self.matched_states = 0
        self.mismatched_states = 0

    # ...............................................
    @classmethod
    def construct_annotated_name(cls, csvfile):
        """Construct a full filename for the annotated version of csvfile.

        Args:
            csvfile (str): full filename used to construct an annotated filename for this data.

        Returns:
            outfname: output filename derived from the input GBIF DWC filename
        """
        pth, basefilename = os.path.split(csvfile)
        basename, ext = os.path.splitext(basefilename)
        try:
            rawidx = basename.index("_raw")
            basename = basename[:rawidx]
        except ValueError:
            pass
        newbasefilename = f"{basename}_annotated{ext}"
        outfname = os.path.join(pth, newbasefilename)
        return outfname
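
    # Illustration (not from the original source): this classmethod keeps the directory,
    # truncates the basename at the first "_raw" marker if one is present, and appends
    # "_annotated". For example (hypothetical path):
    #     Annotator.construct_annotated_name("/tmp/bison/gbif_chunk_1_raw.csv")
    #     returns "/tmp/bison/gbif_chunk_1_annotated.csv"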

    # ...............................................
    def _open_input_output(self):
        """Open the DwcData for reading and the csv_writer for writing.

        Also reads the first record and writes the header.

        Returns:
            outfname: full filename of the output file

        Raises:
            Exception: on failure to open the DwcData csvreader.
            Exception: on failure to open the csv_writer.
        """
        outfname = self.construct_annotated_name(self._csvfile)
        try:
            self._dwcdata.open()
        except Exception:
            raise

        header = self._dwcdata.fieldnames
        header.extend([
            NEW_RIIS_KEY_FLD, NEW_RIIS_ASSESSMENT_FLD, NEW_RESOLVED_COUNTY,
            NEW_RESOLVED_STATE
        ])

        try:
            self._csv_writer, self._outf = get_csv_dict_writer(
                outfname,
                header,
                GBIF.DWCA_DELIMITER,
                fmode="w",
                encoding=ENCODING,
                overwrite=True)
        except Exception as err:
            raise Exception(
                f"Failed to open file or csv_writer for {outfname}") from err

        return outfname

    # ...............................................
    def close(self):
        """Close input datafiles and output file."""
        self._dwcdata.close()
        try:
            self._outf.close()
            self._csv_writer = None
        except AttributeError:
            pass

    # ...............................................
    @property
    def is_open(self):
        """Return True if the output file is open.

        Returns:
            bool: True if the output CSV file is open, False if it is closed.
        """
        # Input reading is managed by self._dwcdata; only the output file handle is checked here.
        outf = getattr(self, "_outf", None)
        return outf is not None and not outf.closed

    # # ...............................................
    # def assess_occurrence(self, dwcrec, county, state, iis_reclist):
    #     """Find RIIS assessment matching the acceptedTaxonKey and state in this record.
    #
    #     Args:
    #         dwcrec (dict): dictionary of original DwC specimen occurrence record
    #         county (str): county returned from geospatial intersection of point with US boundaries
    #         state (str): state returned from geospatial intersection of point with US boundaries
    #         iis_reclist (list of dict): list of RIIS records with acceptedTaxonKey matching the
    #             acceptedTaxonKey for this occurrence
    #
    #     Returns:
    #         riis_assessment: Determination of "introduced" or "invasive" for this
    #             record with species in this location.
    #         riis_id: locally unique RIIS occurrenceID identifying this determination
    #             for this species in this location.
    #     """
    #     riis_assessment = None
    #     riis_key = None
    #     for iisrec in iis_reclist:
    #         # Double check NNSL dict key == RIIS resolved key == occurrence accepted key
    #         if dwcrec[GBIF.ACC_TAXON_FLD] != iisrec.gbif_taxon_key:
    #             self._log.debug("WTF is happening?!?")
    #
    #         # Look for AK or HI
    #         if ((state == "AK" and iisrec.locality == "AK")
    #                 or (state == "HI" and iisrec.locality == "HI")):
    #             riis_assessment = iisrec.assessment.lower()
    #             riis_key = iisrec.occurrence_id
    #
    #         # Not AK or HI, is it L48?
    #         elif state in self._conus_states and iisrec.locality == "L48":
    #             riis_assessment = iisrec.assessment.lower()
    #             riis_key = iisrec.occurrence_id
    #
    #     return riis_assessment, riis_key

    # ...............................................
    def annotate_dwca_records(self):
        """Resolve and append state, county, RIIS assessment, and RIIS key to GBIF DWC occurrence records.

        Returns:
            self.annotated_dwc_fname: full filename of the GBIF DWC records with appended fields.

        Raises:
            Exception: on failure to open input or output data.
            Exception: on unexpected failure to read or write data.
        """
        trouble = "1698055779"
        trouble_next = "1698058398"
        try:
            # Open the original DwC data file for read, and the annotated file for write.
            annotated_dwc_fname = self._open_input_output()
        except Exception:
            raise
        else:
            self._log.info(
                f"Annotating {self._csvfile} to create {annotated_dwc_fname}")
            try:
                # iterate over DwC records
                dwcrec = self._dwcdata.get_record()
                while dwcrec is not None:
                    gbif_id = dwcrec[GBIF.ID_FLD]
                    if (self._dwcdata.recno % LOG.INTERVAL) == 0:
                        self._log.info(
                            f"*** Record number {self._dwcdata.recno}, gbifID: {gbif_id} ***"
                        )

                    # Debug: examine data
                    if gbif_id == trouble:
                        self._log.debug(f"Found troubled gbifID {trouble}")
                    if gbif_id == trouble_next:
                        self._log.debug("Not so troubling")
                    if EXTRA_CSV_FIELD in dwcrec.keys():
                        self._log.debug(
                            f"Extra fields detected: possible bad read for record {gbif_id}"
                        )

                    # Initialize new fields
                    county = state = riis_assessment = riis_key = None

                    # Find county and state for these coords
                    try:
                        county, state = self._find_county_state(
                            dwcrec[GBIF.LON_FLD],
                            dwcrec[GBIF.LAT_FLD],
                            buffer_vals=POINT_BUFFER_RANGE)
                    except ValueError as e:
                        self._log.error(f"Record gbifID: {gbif_id}: {e}")
                    except GeoException as e:
                        self._log.error(f"Record gbifID: {gbif_id}: {e}")

                    if state in ("AK", "HI"):
                        region = state
                    else:
                        region = "L48"

                    # Find RIIS records for this acceptedTaxonKey
                    taxkey = dwcrec[GBIF.ACC_TAXON_FLD]
                    # try:
                    #     iis_reclist = self.nnsl.by_gbif_taxkey[taxkey]
                    # except Exception:
                    #     iis_reclist = []
                    riis_assessment, riis_key = self.nnsl.get_assessment_for_gbif_taxonkey_region(
                        taxkey, region)

                    # if county and state and iis_reclist:
                    #     riis_assessment, riis_key = self.assess_occurrence(
                    #         dwcrec, county, state, iis_reclist)

                    # Add county, state and RIIS assessment to record
                    dwcrec[NEW_RESOLVED_COUNTY] = county
                    dwcrec[NEW_RESOLVED_STATE] = state
                    dwcrec[NEW_RIIS_ASSESSMENT_FLD] = riis_assessment
                    dwcrec[NEW_RIIS_KEY_FLD] = riis_key

                    try:
                        self._csv_writer.writerow(dwcrec)
                    except ValueError as e:
                        self._log.error(
                            f"ValueError {e} on record with gbifID {gbif_id}")
                    except Exception as e:
                        self._log.error(
                            f"Unknown error {e} record with gbifID {gbif_id}")

                    dwcrec = self._dwcdata.get_record()
            except Exception as e:
                raise Exception(
                    f"Unexpected error {e} reading {self._dwcdata.input_file} or writing {annotated_dwc_fname}"
                ) from e

        return annotated_dwc_fname

    # ...............................................
    def _find_county_state(self, lon, lat, buffer_vals):
        county = state = None
        if None not in (lon, lat):
            # Intersect coordinates with county boundaries for state and county values
            try:
                fldvals, ogr_seconds = self._geo_county.find_enclosing_polygon(
                    lon, lat, buffer_vals=buffer_vals)
            except ValueError:
                raise
            except GeoException:
                raise
            if ogr_seconds > 0.75:
                self._log.debug(
                    f"Rec {self._dwcdata.recno}; intersect point {lon}, {lat}; OGR time {ogr_seconds}"
                )
            county = fldvals[NEW_RESOLVED_COUNTY]
            state = fldvals[NEW_RESOLVED_STATE]
        return county, state
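
A minimal end-to-end sketch for the Annotator class above, assuming the bison package imports used elsewhere in these examples; the file paths are hypothetical placeholders. The try/finally pairs with the close() method so the output file handle is released even if annotation fails.

# Hypothetical paths; Annotator is the class defined above.
from bison.common.riis import NNSL

riis_filename = "/tmp/bison/riis_master_list.csv"   # hypothetical resolved RIIS data
gbif_filename = "/tmp/bison/gbif_chunk_1_raw.csv"   # hypothetical GBIF occurrence chunk

nnsl = NNSL(riis_filename)
nnsl.read_riis(read_resolved=True)

annotator = Annotator(gbif_filename, nnsl=nnsl)
try:
    annotated_fname = annotator.annotate_dwca_records()
finally:
    annotator.close()
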
Example #10
    def _get_riis_species(self):
        """Read the resolved USGS RIIS species data used to annotate occurrence records."""
        riis_filename = os.path.join(self._datapath, RIIS_SPECIES.FNAME)
        nnsl = NNSL(riis_filename, logger=self._log)
        nnsl.read_riis(read_resolved=True)
        return nnsl
Example #11
import argparse

from bison.common.riis import NNSL

# ...............................................
if __name__ == "__main__":
    DEFAULT_BISON_PATH = "/home/astewart/git/bison"
    DEFAULT_GBIF_FNAME = "/tank/bison/2022/gbif_2022_01_0-100.csv"

    parser = argparse.ArgumentParser(
        description=(
            "Annotate GBIF records with BISON RIIS determinations and "
            "aggregate results."))
    # nargs="?" lets the positional arguments fall back to the defaults above.
    parser.add_argument(
        "bison_path",
        type=str,
        nargs="?",
        default=DEFAULT_BISON_PATH,
        help="The base path for BISON input data and outputs.")
    parser.add_argument(
        "gbif_fname",
        type=str,
        nargs="?",
        default=DEFAULT_GBIF_FNAME,
        help="The full path to GBIF input species occurrence data.")
    args = parser.parse_args()

    bison = NNSL(args.bison_path)
    bison.read_species()
    # Update species data
    bison.resolve_gbif_species()
    bison.write_species()
    # Step through GBIF data and annotate with RIIS Ids
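
    # Sketch of the final step named in the comment above (not from the original source):
    # annotate the GBIF occurrence file with the Annotator class shown in Example #9,
    # under the assumption that the NNSL object read here is compatible with the nnsl
    # argument Annotator expects; the import path below is also an assumption.
    from bison.process.annotate import Annotator  # module path is an assumption
    annotator = Annotator(args.gbif_fname, nnsl=bison)
    annotated_fname = annotator.annotate_dwca_records()
    annotator.close()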