return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source, taxonomy_string, match) # ...def get_preferred_taxonomic_match() #%% Initialization initialize_taxonomy_lookup() #%% Test single-query lookup if False: #%% matches = get_taxonomic_info('lion') print_taxonomy_matches(matches) #%% Read the input data df = pd.read_excel(species_by_dataset_file) #%% Run all our taxonomic lookups # i_row = 0; row = df.iloc[i_row] # query = 'lion' output_rows = [] for i_row, row in df.iterrows():
def _select_preferred_taxonomy(candidates: list):
    """
    Choose one taxonomy from a non-empty list of matches that all came from
    the same source.

    When there is more than one candidate, the first one whose taxonomy
    contains a level of rank 'phylum' named 'chordata' wins: most of the names
    that aren't what we want are esoteric insects, like a moth called
    "cheetah".  If no candidate is a chordate (or there is only one
    candidate), the first candidate is used.

    Args:
        candidates: non-empty list of match dicts; each is read via its
            'taxonomy' key, whose entries are indexed as
            (taxonID, taxonLevel, scientific, [list of common]).

    Returns:
        The 'taxonomy' value of the chosen candidate.
    """

    chosen = candidates[0]

    if len(candidates) > 1:
        for candidate in candidates:
            is_chordate = any(
                level[1] == 'phylum' and level[2] == 'chordata'
                for level in candidate['taxonomy'])
            if is_chordate:
                chosen = candidate
                break

    return chosen['taxonomy']


def _choose_common_name(query: str, common_names) -> str:
    """
    Choose the common name to report for a taxon.

    Args:
        query: the string that was originally looked up.
        common_names: the common names listed for the matched taxon.

    Returns:
        [query] itself when several common names are available and the query
        is one of them; otherwise the first listed common name; or '' when no
        common names are listed.
    """

    # With multiple common names, default to returning the query
    if len(common_names) > 1 and query in common_names:
        return query
    if len(common_names) > 0:
        return common_names[0]
    return ''


def get_preferred_taxonomic_match(query: str) -> TaxonomicMatch:
    """
    Wrapper for species_lookup.py, but expressing a variety of heuristics and
    preferences that are specific to our scenario.

    Looks up [query] with get_taxonomic_info() and reduces the results to a
    single match:

    * iNat matches are preferred; GBIF matches are only considered when there
      are no iNat matches at all.
    * Within the chosen source, a chordate match is preferred over the first
      match (see _select_preferred_taxonomy).
    * The reported common name is chosen by _choose_common_name.

    Args:
        query: the name to look up, e.g. 'lion'.

    Returns:
        TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
        taxonomy_string, match), where [match] is the selected taxonomy and
        [taxonomy_string] is str(match).  If neither source produced a match,
        every field is the empty string.

    Raises:
        AssertionError: if the selected taxon has an empty scientific name.
    """

    # query = 'person'
    matches = get_taxonomic_info(query)

    inat_matches = [m for m in matches if m['source'] == 'inat']
    gbif_matches = [m for m in matches if m['source'] == 'gbif']

    # Prefer iNat matches; they're considerably less quirky.  Both sources
    # are currently handled identically once the source has been chosen.
    if inat_matches:
        source, candidates = 'inat', inat_matches
    elif gbif_matches:
        source, candidates = 'gbif', gbif_matches
    else:
        # No match in either taxonomy
        return TaxonomicMatch('', '', '', '', '', '')

    taxonomy = _select_preferred_taxonomy(candidates)

    # Each entry is (taxonID, taxonLevel, scientific, [list of common]); the
    # first entry is treated as the lowest (most specific) level.
    lowest_level = taxonomy[0]
    taxonomic_level = lowest_level[1]
    scientific_name = lowest_level[2]
    assert len(scientific_name) > 0
    common_name = _choose_common_name(query, lowest_level[3])

    taxonomy_string = str(taxonomy)

    return TaxonomicMatch(scientific_name, common_name, taxonomic_level,
                          source, taxonomy_string, taxonomy)
return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source, taxonomy_string, match) # ...def get_preferred_taxonomic_match() #%% Initialization initialize_taxonomy_lookup() #%% Test single-query lookup if False: #%% matches = get_taxonomic_info('equus quagga') print_taxonomy_matches(matches) #%% q = 'equus quagga' # q = "grevy's zebra" taxonomy_preference = 'gbif' m = get_preferred_taxonomic_match(q) print(m.source) print(m.taxonomy_string) import clipboard clipboard.copy(m.taxonomy_string) #%% Read the input data df = pd.read_excel(species_by_dataset_file)