def collect_all_species(fname="../data/tfidf/data_gbif.json"):
    """Page through GBIF's name-suggest API and dump every record to *fname*.

    Keeps fetching pages (incrementing ``offset``) until an empty page is
    returned, counts how often each record field appears, prints a small
    summary, and writes all records as one JSON array.
    """
    started = time.time()
    field_counts = {}          # field name -> number of records carrying it
    species_total = 0
    records = []
    page = 0
    fetched = 1                # primed so the loop runs at least once
    while fetched > 0:
        batch = species.name_suggest(offset=page)
        page += 1
        fetched = len(batch)   # an empty page terminates the loop
        for record in batch:
            species_total += 1
            records.append(record)
            for field in record:
                field_counts[field] = field_counts.get(field, 0) + 1
    printer = pprint.PrettyPrinter(2)
    print('nb species', species_total)
    print('Dict:', len(field_counts))
    printer.pprint(field_counts)
    with open(fname, "w", encoding='utf-8') as jsonfile:
        json.dump(records, jsonfile, ensure_ascii=False)
    print(time.time() - started)
def test_species():
    """Smoke-test species.name_suggest: print record sizes and count
    how many 'vespa' suggestions carry a 'species' field."""
    for record in species.name_suggest(q='Puma concolor'):
        print(len(record))
    for record in species.name_suggest():
        print(len(record))
    species_hits = 0
    for page in range(10):
        batch = species.name_suggest(q='vespa', offset=page)
        print(len(batch))
        for doc in batch:
            if 'species' in doc:
                species_hits += 1
                print(doc['species'])
    print(species_hits)
def gbif_name_suggest(**kwargs):
    """Wrapper for pygbif ``species.name_suggest`` returning one best record.

    Records are bucketed by ``status`` — ACCEPTED, SYNONYM, anything else —
    and within each bucket the record with the smallest GBIF ``key`` wins
    (lower keys are older entries).  Preference on return: ACCEPTED, then
    SYNONYM, then the best of the rest.  Returns ``None`` for no results.

    Bug fixed: the status checks were separate ``if`` statements with the
    final ``else`` attached to the SYNONYM test, so an ACCEPTED record was
    *also* recorded as ``other_data``; the chain is now mutually exclusive.
    """
    response = species.name_suggest(**kwargs)
    if len(response) == 0:
        return None

    accepted_data = None
    synonym_data = None
    other_data = None
    for result in response:
        rank = result.get('rank', '')
        # RANK_KEYS is a module-level collection of rank names (defined
        # elsewhere in this file); a known rank maps to e.g. 'speciesKey'.
        if rank.lower() in RANK_KEYS:
            rank_key = rank.lower() + 'Key'
        else:
            rank_key = 'key'
        # Only consider records that expose a usable key and a status.
        key_found = ('nubKey' in result or rank_key in result)
        if key_found and 'status' in result:
            status = result['status']
            if status == 'ACCEPTED':
                if accepted_data is None or result['key'] < accepted_data['key']:
                    accepted_data = result
            elif status == 'SYNONYM':
                if synonym_data is None or result['key'] < synonym_data['key']:
                    synonym_data = result
            else:
                if other_data is None or result['key'] < other_data['key']:
                    other_data = result
    if accepted_data:
        return accepted_data
    if synonym_data:
        return synonym_data
    return other_data
def taxon_info(taxon):
    """Return GBIF taxonKey suggestions for *taxon* via pygbif.species."""
    return species.name_suggest(taxon)
def search_species(entry):
    """Search GBIF for *entry*, print a report per matching species, and
    return ``{canonical_name: [vernacular_name, wiki_summary?]}``.

    For every suggest hit with at least one occurrence this prints the
    scientific name, an English vernacular name, a two-sentence Wikipedia
    summary (when available), the GBIF key/page/count, and the top three
    countries by occurrence.  Talks to the network (GBIF + Wikipedia).
    """
    # *entry* stands in for interactive input during testing.
    name_input = entry
    results = {}
    # Ask GBIF for up to 25 species-rank suggestions.
    suggest = species.name_suggest(q=name_input, rank='SPECIES', limit=25)
    # NOTE(review): assumes name_suggest returns {'data': {'results': [...]}};
    # recent pygbif returns a plain list — confirm the pygbif version in use.
    suggest_data = suggest['data']['results']
    data = suggest_data
    for o in data:
        # GBIF taxon key for this suggestion.
        key = o['key']
        # Total occurrence count, and a page of occurrence records, for the key.
        occurs = occurrences.count(taxonKey=key)
        occur_search = occurrences.search(taxonKey=key)
        # Skip species never observed.
        if occurs > 0:
            try:
                canon_name = o['canonicalName']
            except:  # noqa: E722 — record lacks a canonical name; skip it
                continue
            vern_name = ''
            # All vernacular (common) names GBIF knows for this taxon.
            names = o['vernacularNames']
            summary = ''
            try:
                print('Scientific name: ' + canon_name)
                print('Vernacular names: ')
                match_found = False
                # First pass: prefer an English vernacular name that
                # contains the user's search string.
                for name in names:
                    vern_name = name['vernacularName']
                    language = (name['language'])
                    # Change 'eng' to select a different language.
                    if language == 'eng':
                        if name_input in vern_name and match_found is False:
                            match_found = True
                            print(vern_name)
                # Second pass (no exact match): print each distinct
                # English vernacular name instead.
                if match_found is False:
                    name_store = []
                    for name in names:
                        vern_name = name['vernacularName']
                        language = (name['language'])
                        if vern_name not in name_store:
                            name_store.append(vern_name)
                            if language == 'eng':
                                print(vern_name)
                # Record whichever vernacular name the loops left in
                # vern_name (the last one seen, not necessarily English).
                results.setdefault(canon_name, []).append(vern_name)
                try:
                    # Two-sentence Wikipedia summary keyed on the
                    # canonical name (usually resolves correctly).
                    summary = wikipedia.summary(canon_name, sentences=2)
                    print(summary)
                    results.setdefault(canon_name, []).append(summary)
                except:  # noqa: E722 — no/ambiguous Wikipedia page
                    print("No description found")
                print('GBIF Key: ' + str(key))
                print('GBIF Species Page: ' + 'http://www.gbif.org/species/' + str(key))
                print('Count: ' + str(occurs))
                # Tally occurrence records by country for this species.
                occur_data = occur_search['results']
                countries = {}
                for occur in occur_data:
                    try:
                        country = occur['country']
                        if country not in countries:
                            countries[country] = 1
                        else:
                            countries[country] += 1
                    except:  # noqa: E722 — record lacks a country field
                        continue
                # Most-observed countries first.
                sorted_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)
                if countries != {}:
                    print('Top 3 Countries Observed: ')
                    for country in sorted_countries[:3]:
                        print(str(country))
                print('\n')
            except:  # noqa: E722 — any failure skips this species entirely
                continue
    print(results)
    return results
def test_name_suggest():
    "species.name_suggest - basic test"
    res = species.name_suggest(q="Puma concolor")
    assert type(res) is list
    assert all(bool(re.search("Puma concolor", z["canonicalName"])) for z in res)
def test_name_suggest_paging():
    "species.name_suggest - paging"
    page = species.name_suggest(q="Aso", limit=3)
    assert type(page) is list
    assert len(page) == 3
def search_taxa(q):
    """Return GBIF name-suggest results for *q* as a pandas DataFrame."""
    suggestions = species.name_suggest(q)
    return pd.DataFrame.from_dict(suggestions)