def test_duplicate_name_localities(self):
    """Report scientific names that have more than one record for a locality.

    Every pair of records filed under the same scientific name is compared
    with ``is_duplicate_locality``.  Each matching pair produces one message
    naming the locality and both line numbers.  Nothing is asserted; the
    findings are only written to the log through ``_print_errors``.
    """
    logit(self._log, "*** test_duplicate_name_localities ***")
    if self.nnsl_by_species is None:
        self.read_riis(read_resolved=False)

    duplicates = []
    for sciname, reclist in self.nnsl_by_species.items():
        total = len(reclist)
        # Visit each unordered pair (first < second) exactly once.
        for first in range(total):
            for second in range(first + 1, total):
                rec_a = reclist[first]
                rec_b = reclist[second]
                if not rec_a.is_duplicate_locality(rec_b):
                    continue
                duplicates.append(
                    'Sciname {} has {} on line {} and line {}'.format(
                        sciname,
                        rec_a.data[RIIS_SPECIES.LOCALITY_FLD],
                        rec_a.data[LINENO_FLD],
                        rec_b.data[LINENO_FLD]))
    self._print_errors("Duplicate Name-Locality records", duplicates)
def test_gbif_resolution_inconsistency(self):
    """Report scientific names whose records disagree on the GBIF taxonKey.

    Every pair of records filed under the same scientific name is compared
    with ``is_gbif_match``.  For each pair that does not match, one message
    is collected with the taxon authority, GBIF key and line number of both
    records.  Nothing is asserted; the findings are only written to the log
    through ``_print_errors``.
    """
    logit(self._log, "*** test_gbif_resolution_inconsistency ***")
    if self.nnsl_by_species is None:
        self.read_riis(read_resolved=False)

    conflicts = []
    for sciname, reclist in self.nnsl_by_species.items():
        total = len(reclist)
        # Visit each unordered pair (first < second) exactly once.
        for first in range(total):
            for second in range(first + 1, total):
                rec_a = reclist[first]
                rec_b = reclist[second]
                if rec_a.is_gbif_match(rec_b):
                    continue
                authority_a = rec_a.data[RIIS_SPECIES.TAXON_AUTHORITY_FLD]
                authority_b = rec_b.data[RIIS_SPECIES.TAXON_AUTHORITY_FLD]
                first_half = 'Sciname {} has record1 taxon authority {}, with GBIF key {} (line {})'.format(
                    sciname, authority_a,
                    rec_a.data[RIIS_SPECIES.GBIF_KEY],
                    rec_a.data[LINENO_FLD])
                second_half = ' and record2 taxon authority {}, with GBIF key {} (line {})'.format(
                    authority_b,
                    rec_b.data[RIIS_SPECIES.GBIF_KEY],
                    rec_b.data[LINENO_FLD])
                conflicts.append(first_half + second_half)
    self._print_errors("GBIF taxonKey conflicts", conflicts)
def test_taxonomy_keys(self):
    """Fail if any records were collected in ``self.bad_species``.

    Reads the RIIS data first when ``bad_species`` has not been populated,
    logs every entry, then asserts that the collection is empty.

    NOTE(review): per the original description, ``bad_species`` holds records
    with non-integer GBIF taxonKeys or ITIS TSNs; it is filled outside this
    method, so confirm against ``read_riis``.
    """
    logit(self._log, "*** test_taxonomy_keys ***")
    if self.bad_species is None:
        self.read_riis(read_resolved=False)

    # Log each offending entry before the assertion so failures are traceable.
    for bad_key, bad_value in self.bad_species.items():
        logit(self._log, "{} {}".format(bad_key, bad_value))

    assert len(self.bad_species) == 0
def find_gbif_record(self, gbifid):
    """Find a GBIF occurrence record identified by provided gbifID.

    Opens the reader if needed, then reads records one at a time until one
    with a matching ``GBIF.ID_FLD`` value is found or ``self.dwcrec`` becomes
    None.  If the records run out without a match, the failure is logged and
    the reader is closed.  Any exception raised while reading is logged, not
    propagated.

    Args:
        gbifid: local GBIF identifier for finding a record in a large file.

    Returns:
        self.dwcrec: the current record when the search stops; this is the
            matching record on success and None when no record was found.
    """
    if self._csv_reader is None:
        self.open()
    found = False
    try:
        while self.dwcrec is not None and not found:
            # Get interpreted record
            self.get_record()
            # The loop condition treats a None record as "no more data", so
            # never subscript it; otherwise the resulting TypeError would be
            # swallowed below and the not-found handling would be skipped.
            if (self.dwcrec is not None
                    and self.dwcrec[GBIF.ID_FLD] == gbifid):
                found = True
            # Where are we
            if (self.recno % LOG.INTERVAL) == 0:
                logit(self._log, '*** Record number {} ***'.format(self.recno))
        if self.dwcrec is None and not found:
            logit(self._log, 'Failed to find {}'.format(gbifid))
            self.close()
    except Exception as e:
        # Best effort: report the failure and return whatever is current.
        logit(
            self._log,
            'Failed on line {}, exception {}'.format(self.recno, e))
    return self.dwcrec
def test_resolve_gbif(self):
    """Record changed GBIF taxonomic resolutions and write updated records.

    Re-reads the original RIIS data, resolves it through
    ``resolve_riis_to_gbif_taxa``, logs how many records were resolved, and
    then checks the first record of every species for a new GBIF key that
    conflicts with the old one.  Conflicts are logged via ``_print_errors``
    once all species have been checked.
    """
    logit(self._log, "*** test_resolve_gbif ***")
    err_msgs = []
    self.read_riis(read_resolved=False)

    # Update species data; only the record count is reported.
    _, rec_count = self.resolve_riis_to_gbif_taxa()
    logit(
        self._log, "Resolved {} of expected {} records".format(
            rec_count, RIIS_SPECIES.DATA_COUNT))

    # Find mismatches
    for key, reclist in self.nnsl_by_species.items():
        rec1 = reclist[0]
        try:
            # Probe only: a KeyError means resolution never added the field.
            rec1.data[RIIS_SPECIES.NEW_GBIF_KEY_FLD]
        except KeyError:
            logit(
                self._log, 'Failed to add field {} to {} records'.format(
                    RIIS_SPECIES.NEW_GBIF_KEY_FLD, rec1.name))
        else:
            if not rec1.consistent_gbif_resolution():
                msg = "Record {} old GBIF taxonKey {} / {} conflicts with new GBIF taxonKey {} / {}".format(
                    key,
                    rec1.data[RIIS_SPECIES.GBIF_KEY],
                    rec1.data[RIIS_SPECIES.SCINAME_FLD],
                    rec1.data[RIIS_SPECIES.NEW_GBIF_KEY_FLD],
                    rec1.data[RIIS_SPECIES.NEW_GBIF_SCINAME_FLD])
                err_msgs.append(msg)

    # Report after the messages have been collected; reporting before the
    # loop always saw an empty list and therefore never logged anything.
    self._print_errors("Re-resolve to accepted GBIF taxon", err_msgs)
def test_missing_taxon_authority_resolution(self):
    """Report records whose declared taxon authority has no usable key.

    A record is reported when its taxon authority is "Accepted GBIF" but its
    GBIF key is not positive, or when its authority is "Accepted ITIS" but
    its ITIS key is not positive.  Nothing is asserted; the findings are only
    written to the log through ``_print_errors``.
    """
    logit(self._log, "*** test_missing_taxon_authority_resolution ***")
    err_msgs = []
    if self.nnsl_by_species is None:
        self.read_riis(read_resolved=False)
    for sciname, reclist in self.nnsl_by_species.items():
        for rec in reclist:
            auth = rec.data[RIIS_SPECIES.TAXON_AUTHORITY_FLD]
            gbif_key = rec.data[RIIS_SPECIES.GBIF_KEY]
            itis_key = rec.data[RIIS_SPECIES.ITIS_KEY]
            if auth == "Accepted GBIF" and gbif_key <= 0:
                err_msgs.append(
                    'Sciname {} has GBIF authority with key {} (line {})'.
                    format(sciname, gbif_key, rec.data[LINENO_FLD]))
            elif auth == "Accepted ITIS" and itis_key <= 0:
                # Report the ITIS key that failed the check, not the GBIF key.
                err_msgs.append(
                    'Sciname {} has ITIS authority with key {} (line {})'.
                    format(sciname, itis_key, rec.data[LINENO_FLD]))
    self._print_errors("Missing authority resolution", err_msgs)
def test_missing_resolved_records(self, is_test=True):
    """Log every original RIIS record that is absent from the resolved data.

    The original records are re-read into this object, the resolved records
    are read into a separate ``NNSL`` instance, and each original identifier
    that has no entry in the resolved ``nnsl_by_id`` is logged.

    Args:
        is_test (bool): True if testing smaller test data file.
    """
    logit(self._log, "*** test_missing_resolved_records ***")

    # Original data, freshly read.
    self.read_riis(read_resolved=False)

    # Resolved data, optionally from the smaller test file.
    fname = RIIS_SPECIES.TEST_FNAME if is_test else None
    resolved_nnsl = NNSL(DATA_PATH, test_fname=fname)
    resolved_nnsl.read_riis(read_resolved=True)

    resolved_by_id = resolved_nnsl.nnsl_by_id
    for occid in self.nnsl_by_id:
        try:
            resolved_by_id[occid]
        except KeyError:
            logit(self._log, "Missing record {}".format(occid))
def test_resolution_output(self, is_test=True):
    """Compare the original RIIS records against the resolved output.

    The original records are re-read into this object and the resolved
    records into a separate ``NNSL`` instance.  Each original identifier
    missing from the resolved ``nnsl_by_id`` is logged, and a summary line is
    logged when the number found differs from the number of originals.

    Args:
        is_test (bool): True if testing smaller test data file.
    """
    logit(self._log, "*** test_resolution_output ***")

    # Original data, freshly read.
    self.read_riis(read_resolved=False)

    # Resolved data, optionally from the smaller test file.
    fname = RIIS_SPECIES.TEST_FNAME if is_test else None
    resolved_nnsl = NNSL(DATA_PATH, test_fname=fname)
    resolved_nnsl.read_riis(read_resolved=True)

    original_total = 0
    resolved_total = 0
    resolved_by_id = resolved_nnsl.nnsl_by_id
    for occid in self.nnsl_by_id:
        original_total += 1
        try:
            resolved_by_id[occid]
        except KeyError:
            logit(
                self._log,
                "Failed to find occurrenceID {} in resolved dictionary".
                format(occid))
            continue
        resolved_total += 1

    if original_total != resolved_total:
        logit(
            self._log, "Original records {}, updated records {}".format(
                original_total, resolved_total))
def _print_errors(self, header, msgs):
    """Log a titled group of messages; do nothing when there are none.

    Args:
        header: title written between ``---`` markers above the messages.
        msgs: messages to log, one log call per message.
    """
    if not msgs:
        return
    logit(self._log, ERR_SEPARATOR)
    logit(self._log, "--- {} ---".format(header))
    for line in msgs:
        logit(self._log, line)