def object_hook(self, s):
    # a plain object: rebuild a single Taxa from its attributes
    if "taxa_list" not in s:
        taxa = Taxa.Taxa()
        for k in s.keys():
            taxa.__setattr__(k, s[k])
        return taxa
    # a container object: rebuild every entry into its own Taxa
    else:
        taxa_list = []
        for t in s["taxa"]:
            taxa = Taxa.Taxa()
            for k in t.keys():
                taxa.__setattr__(k, t[k])
            taxa_list.append(taxa)
        return taxa_list
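
# A minimal sketch of how a hook like object_hook is typically wired into the
# json module (an assumption for illustration; the original project may attach
# it differently, e.g. through a JSONDecoder subclass). json.load calls the hook
# for every decoded JSON object, so dicts are turned back into Taxa instances.
#
#   decoder = TaxaDecoder()          # hypothetical class owning object_hook
#   with open("taxa_list.json") as f:
#       taxa = json.load(f, object_hook=decoder.object_hook)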
def generate_lists(base_folder, prefix, save_lists=False):
    filename = os.path.join(base_folder,
                            prefix + "_species_list_webpage.pickle")
    s = request_handler.get_soup(species_check_page_url, filename)

    genus_list = []
    species_list = []

    # find all the listed items
    print("Gathering species from chrysis.net...")
    for li in s.find_all("li"):
        # find the links inside the listed items
        ref = li.find_all("a")

        # select the right link
        if len(ref) == 2:
            name = ref[1]
        elif len(ref) == 1:
            name = ref[0]
        else:
            continue

        # split the name
        name_parts = name.text.split(" ")

        # "%s %s %s %s, %year" is the expected format for a name with an author
        if len(name_parts) > 3 and name.text.find(",") != -1:
            genus = name_parts[0]
            specie = name_parts[1]

            if len(name_parts[2]) > 0:
                # add the subspecie
                specie += " " + name_parts[2]
                author = "".join(part + " " for part in name_parts[3:])
            else:
                author = "".join(part + " " for part in name_parts[2:])

            author = author[:-1]
            author = author.replace("[E]", "")

            species_list.append(
                Taxa.Taxa(genus + " " + specie, author, None, None))
        else:
            continue

    return genus_list, species_list
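
# Stand-alone trace (the sample entry is invented) of how the splitting above is
# meant to behave for a name that carries a subspecies and an author:
#
#   "Chrysis ignita ignita Linnaeus, 1758"
#       name_parts -> ['Chrysis', 'ignita', 'ignita', 'Linnaeus,', '1758']
#       genus  -> "Chrysis"
#       specie -> "ignita ignita"   (subspecies appended)
#       author -> "Linnaeus, 1758"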
def generate_lists(family_name, fileinfo, load_lists=True):
    '''Function that generates both the genera and species lists'''

    print("Generating taxa list from Encyclopedia of Life (EOL)...")
    print("Input name: ", family_name)

    # perform a query to the website
    params = {"q": family_name.lower()}

    # other parameters available for the query on EOL
    # params["filter_by_hierarchy_entry_id"] = "any hierarchy id"
    # params["page"] = 1
    # params["exact"] = False
    # params["filter_by_taxon_concept_id"] = "any taxon_id"
    # params["filter_by_string"] = "???"
    # params["cache_ttl"] = "n seconds in cache"

    path = fileinfo.cache_filename("eol_query")
    req = RequestsHandler.Request(eol_api, path, params)
    req.load()
    json_data = req.response.json()

    # show the results
    print("Possible matches: ", len(json_data["results"]))
    for result in json_data["results"]:
        print(" -", result["title"])

    # select the first result of the query
    result_link = json_data["results"][0]["link"]

    # create the reference link
    family_page = result_link + "/names"
    print("Downloading data from:", family_page)

    # soup the link
    path = fileinfo.cache_filename("eol_webpage")
    s = RequestsHandler.get_soup(family_page, path)

    # select the section corresponding to the hierarchical tree
    samples = s.select(
        "body > div.l-basic-main > div.l-content > div > div.ui.segments > div:nth-child(1)"
    )

    # gather the names encapsulated in the divs
    divs = samples[0].find_all("div")
    names = divs[0].find_all("a")

    # create the genus list based on the detection of the next family:
    # family1 == family_name -- start
    #   genus1
    #   genus2
    # family2 (ending in -idae) -- stop
    genus_list = []
    collect_genus = False

    fam_taxa = Taxa.Taxa()
    fam_taxa.family = family_name
    fam_taxa.source = Taxa.Taxa.source_eol
    fam_taxa.rank = Taxa.Taxa.rank_family

    for name in names:
        if collect_genus:
            # match the family ending (-idae)
            if name.text.endswith("idae"):
                collect_genus = False
            else:
                genus = Taxa.Taxa()
                genus.copy_taxa(fam_taxa)
                genus.rank = Taxa.Taxa.rank_genus
                genus.genus = name.text
                genus.links.append(eol_main + name.get("href"))
                genus_list.append(genus)

        if name.text == family_name:
            collect_genus = True

    pbar = ProgressBar.ProgressBar(len(genus_list))
    species_list = []

    for i, taxa in enumerate(genus_list):
        # open the website and look for the species list
        filename = fileinfo.cache_filename(taxa.genus)
        s = RequestsHandler.get_soup(taxa.links[0], filename)

        div = s.find("div", {"class": "page-children"})
        links = div.find_all("a")

        # once all the links in the section are found, gather the information
        for link in links:
            # the html link element contains the species name and authors
            parts = link.text.split(" ")
            if len(parts) < 3:
                # print(f"Name is not rightfully formatted: {link.text}")
                continue

            n_specie = 1
            # this is a subgenus
            if parts[n_specie].startswith("("):
                n_specie = 2

            # the element after the genus (or subgenus) is the species epithet
            specie_name = parts[n_specie]

            # the rest is the author name
            author_name = "".join(p + " " for p in parts[n_specie + 1:])[:-1]

            # add the comma before the year
            author_name = re.sub(r" (\d\d\d\d)", r", \1", author_name)

            # filter out common names in the author, e.g.
            # '(Linne, 1758) (European mantis)'
            # if the name has a final parenthesis that is not preceded by a year
            if author_name and author_name[-1] == ")":
                if not author_name[-2].isdigit():
                    # look for where the parenthesis starts
                    reverse_name = author_name[::-1]
                    par_position = 0
                    while par_position < len(
                            author_name) and reverse_name[par_position] != "(":
                        par_position += 1

                    # slice the string one character before the parenthesis
                    slice_position = len(author_name) - par_position - 2
                    author_name = author_name[:slice_position]

            specie_link = eol_main + link.get("href")

            specie = Taxa.Taxa()
            specie.copy_taxa(taxa)
            specie.rank = Taxa.Taxa.rank_specie
            specie.specie = specie_name
            specie.author = author_name
            specie.links.append(specie_link)

            species_list.append(specie)

        pbar.draw_bar(i)

    print("Genus retrieved:", len(genus_list),
          "Species retrieved:", len(species_list))

    return genus_list, species_list
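
# Self-contained sketch of the author clean-up performed above (the sample string
# is invented and _clean_author_example is not part of the original module): it
# inserts the comma before the year and strips a trailing common name in
# parentheses, relying on the module-level "re" import.
def _clean_author_example(author_name):
    # add the comma before the year, e.g. "(Linne 1758)" -> "(Linne, 1758)"
    author_name = re.sub(r" (\d{4})", r", \1", author_name)
    # drop a trailing parenthetical that does not end with a year
    if author_name.endswith(")") and not author_name[-2].isdigit():
        author_name = author_name[:author_name.rfind("(")].rstrip()
    return author_name

# _clean_author_example("(Linne 1758) (European mantis)") -> "(Linne, 1758)"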
def generate_lists(family_name, fileinfo, load_lists=True):
    '''Function that arranges the genera and species in a list. The result
    could be translated into a tree, but that is difficult. The function
    returns a list of Taxa with name, author and reference link'''

    logger.main_log("Generating taxa list from NBN Atlas...")
    logger.log_short_report("Input name: " + family_name)

    api_url = "https://species-ws.nbnatlas.org/search?"
    param = {"q": family_name, "fq": "idxtype:TAXON"}

    req = RequestsHandler.Request(api_url,
                                  fileinfo.cache_filename("family_search"),
                                  param)
    req.load()
    search_json = req.get_json()
    search_results = search_json["searchResults"]["results"]

    # display the possible matches
    logger.log_short_report(f"Possible matches: {len(search_results)}")
    for result in search_results:
        logger.log_short_report(f" -{result['name']} ({result['guid']})")

    # pick the first result
    family_guid = search_results[0]["guid"]

    # webpage corresponding to the family
    family_url = "https://species.nbnatlas.org/species/" + family_guid
    filename = fileinfo.cache_filename("family")
    child_taxa = gather_child_taxa(family_url, filename)

    # start gathering the species
    genus_list = []
    species_list = []

    pwheel = ProgressBar.ProgressWheel()

    # search for the child taxa
    for dt, dd in child_taxa:
        # template for the genera that will be collected below
        fam_taxa = Taxa.Taxa()
        fam_taxa.family = family_name
        fam_taxa.rank = Taxa.Taxa.rank_genus
        fam_taxa.source = "n"

        pwheel.draw_symbol()
        element = NBNElement(dt, dd, fileinfo)

        # if it is a subfamily
        if element.get_rank() == "subfamily":
            subfam = element.gather_child_elements()

            for subf in subfam:
                subfam_taxa = Taxa.Taxa()
                subfam_taxa.copy_taxa(fam_taxa)
                subfam_taxa.subfamily = element.get_name()

                if subf.get_rank() == "tribe":
                    genuses = subf.gather_child_elements()

                    # a genus with a tribe and a subfamily
                    for genus in genuses:
                        if genus.get_rank() == "genus":
                            taxa = Taxa.Taxa()
                            taxa.copy_taxa(subfam_taxa)
                            taxa.author = genus.get_author()
                            taxa.genus = genus.get_name()
                            taxa.links.append(genus.get_link())
                            taxa.tribe = subf.get_name()
                            genus_list.append(taxa)

                # a genus with a subfamily but without a tribe
                if subf.get_rank() == "genus":
                    taxa = Taxa.Taxa()
                    taxa.copy_taxa(subfam_taxa)
                    taxa.author = subf.get_author()
                    taxa.genus = subf.get_name()
                    taxa.links.append(subf.get_link())
                    genus_list.append(taxa)

        # a genus without a subfamily
        if element.get_rank() == "genus":
            taxa = Taxa.Taxa()
            taxa.copy_taxa(fam_taxa)
            taxa.author = element.get_author()
            taxa.genus = element.get_name()
            taxa.links.append(element.get_link())
            genus_list.append(taxa)

    for genus in genus_list:
        pwheel.draw_symbol()

        filename = fileinfo.cache_filename(f"{genus.genus}_webpage")
        html_elements = gather_child_taxa(genus.links[0], filename)

        for dt, dd in html_elements:
            specie = NBNElement(dt, dd, fileinfo)

            taxa = Taxa.Taxa()
            taxa.copy_taxa(genus)
            taxa.author = specie.get_author()
            taxa.specie = specie.get_name().replace(genus.genus, "").strip()
            taxa.rank = Taxa.Taxa.rank_specie
            taxa.links.append(specie.get_link())

            species_list.append(taxa)

            # look for eventual subspecies
            # it is possible that the name of the specie is separated by a "/",
            # like tritici/obelisca, which would break the file name
            filename = fileinfo.cache_filename(taxa.specie.replace("/", "_"))
            soup = RequestsHandler.get_soup(taxa.links[0], filename)

            children = soup.find("section", id="classification")
            print(taxa.links[0])

            if children:
                dts = children.find_all("dt")
                dds = children.find_all("dd")
                html_parts = zip(dts, dds)

                # find the subspecies and append the eventual name and author
                for dth, ddh in html_parts:
                    if dth.text == "subspecies":
                        staxa = Taxa.Taxa()
                        staxa.copy_taxonomy(taxa)

                        subspecie_element = NBNElement(dth, ddh, fileinfo)
                        staxa.author = subspecie_element.get_author()

                        name_text = subspecie_element.get_name()
                        # clean up the "subsp." mention that sometimes appears
                        name_text = name_text.replace("subsp. ", "")

                        # select the subspecie name
                        staxa.subspecie = name_text.split(" ")[2]
                        staxa.rank = Taxa.Taxa.rank_subspecie
                        staxa.links.append(specie.get_link())
                        staxa.source = "n"

                        species_list.append(staxa)

    pwheel.end()

    logger.log_short_report(
        f"Genus retrieved: {len(genus_list)} Species retrieved: {len(species_list)}"
    )

    return genus_list, species_list
def generate_lists(family_name, fileinfo, load_lists=True):
    logger.main_log("Gathering data from BOLD Databases...")
    logger.log_short_report("Input name: " + family_name)

    # use the search API to search for the name
    param = {"taxName": family_name}
    req = RequestsHandler.Request(taxon_search_api_url,
                                  fileinfo.cache_filename("test"), param)
    res_json = req.get_json()

    logger.log_short_report("Possible matches: " +
                            str(res_json["total_matched_names"]))

    if not res_json["top_matched_names"]:
        raise ParseBoldException("No candidates found")

    for match in res_json["top_matched_names"]:
        tax_match = match["taxon"]
        logger.log_short_report(f" - {tax_match} (id: {match['taxid']})")

    # get the tax id from the search: pick the first match
    family = res_json["top_matched_names"][0]

    # use the id to get the information about the taxon
    param = {"taxId": family["taxid"], "dataTypes": "basic"}
    req = RequestsHandler.Request(taxonid_api_url,
                                  fileinfo.cache_filename("taxid_info"), param)
    res_json = req.get_json()

    # keys: dict_keys(['taxid', 'taxon', 'tax_rank', 'tax_division',
    # 'parentid', 'parentname', 'taxonrep', 'stats', 'country',
    # 'sitemap', 'images', 'sequencinglabs', 'depositry',
    # 'wikipedia_summary', 'wikipedia_link'])

    if res_json["tax_rank"] != "family":
        raise Exception("BOLD_downloader: the selected result is not a family")

    family_taxa = Taxa.Taxa()
    family_taxa.rank = Taxa.Taxa.rank_family
    family_taxa.family = res_json["taxon"]
    family_id = res_json["taxid"]

    logger.log_report("Retrieving subtaxa...")

    # use the retrieved information to scavenge the sub taxa
    taxa_list = generate_children_list(family_id, fileinfo, family_taxa)

    # print("Gathering specimens...")
    #
    # # use the specimen database to find the authors
    # specimens = specimen_list(family_name, fileinfo)
    #
    # print("Composing the list...")
    #
    # # assign the authors from the specimen database
    # for taxa in taxa_list:
    #     for specimen in specimens:
    #         if taxa.specie == specimen.specie and taxa.genus == specimen.genus:
    #             if taxa.author == None:
    #                 taxa.author = specimen.author

    # divide species and genus
    species_list = list(filter(lambda t: t.rank == t.rank_specie, taxa_list))
    genus_list = list(filter(lambda t: t.rank == t.rank_genus, taxa_list))

    logger.log_short_report(
        f"Genus retrieved: {len(genus_list)} Species retrieved: {len(species_list)}"
    )

    return genus_list, species_list
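
# Hedged sketch (uses only the response keys already seen above, but it is not
# part of the original module): instead of always taking the first entry of
# "top_matched_names", one could prefer an exact, case-insensitive name match.
def _pick_match_example(res_json, family_name):
    matches = res_json.get("top_matched_names", [])
    for match in matches:
        if match["taxon"].lower() == family_name.lower():
            return match
    return matches[0] if matches else None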
def get_children(taxid, fileinfo, parent_taxa=None):
    '''The function scans the webpage with the given taxid and finds the
    subtaxa associated with it'''

    # construct the request
    param = {"taxid": taxid}
    filename = fileinfo.cache_filename(f"taxid_search_{taxid}")
    req = RequestsHandler.Request(taxon_search_url, filename, param)

    # get the soup
    soup = req.get_soup()

    # find the sections containing the sub taxa; if there are fewer than seven
    # sections the page doesn't have sub taxa
    sections = soup.find_all("div", {"class": "col-md-6"})
    if len(sections) <= 6:
        return None
    else:
        subtaxa = sections[6]

    # find the taxons in the section; the subtaxa can have multiple ranks,
    # e.g. a family can have both genera and subfamilies associated with it
    taxon_ranks = subtaxa.find_all("lh")
    taxons = subtaxa.find_all("ol")

    # analyze the taxons
    taxa_list = []
    for rank, tax in zip(taxon_ranks, taxons):

        # dummy taxa holding the information that will be equal in all the
        # child taxa, like the source and the parent taxonomy
        retrived_taxa = Taxa.Taxa()
        retrived_taxa.source = Taxa.Taxa.source_bold
        if parent_taxa:
            retrived_taxa.copy_taxonomy(parent_taxa)

        # format the rank: "Subfamilies (7)" -> "subfamilies"
        frank = rank.text.split(" ")[0].lower()

        # convert into a Taxa rank type
        taxa_ranks = {
            "subfamilies": Taxa.Taxa.rank_subfamily,
            "tribes": Taxa.Taxa.rank_tribe,
            "genera": Taxa.Taxa.rank_genus,
            "species": Taxa.Taxa.rank_specie,
            "subspecies": Taxa.Taxa.rank_subspecie
        }

        try:
            retrived_taxa.rank = taxa_ranks[frank]
        except KeyError:
            raise Exception("BOLD_downloader: Taxon rank not present")

        if retrived_taxa.rank == Taxa.Taxa.rank_specie or \
           retrived_taxa.rank == Taxa.Taxa.rank_subspecie:
            names = re.sub(r"\[\d+\]", ", ", tax.text)
            logger.main_log("Skipped because species: " + names)
            continue

        # get the names associated with the rank
        taxon_names = tax.find_all("a")

        for name in taxon_names:
            taxa = Taxa.Taxa()
            taxa.copy_taxa(retrived_taxa)

            # parse the name
            if taxa.rank == Taxa.Taxa.rank_specie:
                text = name.text.split(" ")[1].strip()
            elif taxa.rank == Taxa.Taxa.rank_subspecie:
                text = name.text.split(" ")[2].strip()
            else:
                text = name.text.split(" ")[0].strip()

            # the "sp." is marked under species but has no designated name,
            # only a code like CB-12; some entries are probably provisional
            # names that contain codes and symbols
            if text == "sp." or text == "cf." or "-" in text or "_" in text:
                continue
            if text.find("sp.") != -1:
                logger.log_report("sp. in text, text: " + name.text)
                continue
            if text.find("nr.") != -1:
                logger.log_report("nr. in text, text: " + name.text)
                continue

            # skip names containing digits
            if any(c.isdigit() for c in text):
                continue

            # assign the name
            if taxa.rank == Taxa.Taxa.rank_subfamily:
                taxa.subfamily = text
            elif taxa.rank == Taxa.Taxa.rank_tribe:
                taxa.tribe = text
            elif taxa.rank == Taxa.Taxa.rank_genus:
                taxa.genus = text
            elif taxa.rank == Taxa.Taxa.rank_specie:
                taxa.specie = text
            elif taxa.rank == Taxa.Taxa.rank_subspecie:
                taxa.subspecie = text
            else:
                raise Exception("Taxon rank not present")

            # assign the relative link, used later to find the sub taxa
            link = main_url + name.get("href")
            taxa.links.append(link)

            taxa_list.append(taxa)

    return taxa_list
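
# Hedged sketch of how a recursive walk over get_children could look; the real
# generate_children_list used by generate_lists may differ. The child taxid is
# assumed (hypothetically) to be readable from the stored link, e.g. a link of
# the form ".../Taxbrowser_Taxonpage?taxid=12345".
def _walk_children_example(taxid, fileinfo, parent_taxa=None):
    collected = []
    children = get_children(taxid, fileinfo, parent_taxa)
    if not children:
        return collected
    for child in children:
        collected.append(child)
        child_id = child.links[-1].split("taxid=")[-1]
        collected.extend(_walk_children_example(child_id, fileinfo, child))
    return collected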
def specimen_list(family_name, fileinfo):
    '''This function produces the list of specimens present in the database;
    it filters out all the specimens that have the same taxonomic
    designation'''

    # request all the specimens that belong to family_name
    param = {"taxon": family_name, "format": "json"}

    # in case the parameters contain a space, encode it as "name%20name"
    # instead of "name+name". There shouldn't be spaces in a family_name,
    # yet this is kept for legacy purposes
    param = urllib.parse.urlencode(param, quote_via=urllib.parse.quote)

    filename = fileinfo.cache_filename("specimen_api")

    # perform the request
    req = RequestsHandler.Request(specimen_api_url, filename, param)
    res_json = req.get_json()

    # select the respective records from the response
    records = res_json["bold_records"]["records"]

    # these are the possible dict keys of a record; if a response contains
    # more keys the program will raise an error
    tax_keys = [
        'identification_provided_by', 'identification_method', 'phylum',
        'class', 'order', 'family', 'subfamily', 'genus', 'species',
        'subspecies'
    ]

    # convert the records into taxa
    taxa_list = list()

    for record in records.values():
        # check that the program handles all the taxon ranks in the record
        tax_list = list(record["taxonomy"].keys())
        for word in tax_list:
            if word in tax_keys:
                # print(".", end="")
                continue
            else:
                raise Exception("BOLD_downloader: the key is not present: " +
                                word)

        # create the record class which will parse the JSON
        rec = Record(record)

        # we are only interested in genus + specie, not family or subfamily,
        # which anyway carry no author or information tied to the genus
        if rec.genus is None and rec.specie is None:
            continue

        # there are species that aren't determined yet
        if rec.specie == "n. sp.":
            continue

        # transform the record into a taxa
        taxa = Taxa.Taxa()
        taxa.family = rec.family
        taxa.subfamily = rec.subfamily
        taxa.tribe = rec.tribe
        taxa.genus = rec.genus
        taxa.specie = rec.specie
        taxa.author = rec.author
        taxa.rank = rec.rank

        # if the taxa is already present don't add it,
        # otherwise append it to the list
        for existing_taxa in taxa_list:
            if existing_taxa.is_equal(taxa):
                break
        else:
            taxa_list.append(taxa)

    return taxa_list
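
# Aside (sketch only, not part of the original): the for/else check above is
# quadratic in the number of records; keying a set on the fields compared by
# is_equal gives the same filtering in linear time. The key below assumes
# (genus, specie, author) is what makes two records "equal".
def _dedup_example(records):
    seen = set()
    unique = []
    for rec in records:
        key = (rec.genus, rec.specie, rec.author)
        if key not in seen:
            seen.add(key)
            unique.append(rec)
    return unique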
if __name__ == "__main__":
    import FileInfo

    base_folder = "./Tests/test_BOLD"
    fileinfo = FileInfo.FileInfo(base_folder, "bold", "Mycetophilidae")

    # gather the lists
    genus_list, species_list = generate_lists(fileinfo.family_name, fileinfo)
    taxa_list = species_list + genus_list

    Taxa.save_taxa_list(taxa_list, fileinfo.pickle_filename("taxa_list"))

    # load the list back
    taxa_list = Taxa.load_taxa_list(fileinfo.pickle_filename("taxa_list"))
    Taxa.construct_associations(taxa_list)