def object_hook(self, s):
    # a plain object: rebuild a single Taxa from its attributes
    if "taxa_list" not in s:
        taxa = Taxa.Taxa()
        for k in s.keys():
            taxa.__setattr__(k, s[k])
        return taxa
    # a container object: rebuild every entry into its own Taxa
    else:
        taxa_list = []
        for t in s["taxa"]:
            taxa = Taxa.Taxa()
            for k in t.keys():
                taxa.__setattr__(k, t[k])
            taxa_list.append(taxa)
        return taxa_list
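
# A minimal sketch of how a hook like object_hook is typically wired into the
# json module (an assumption for illustration; the original project may attach
# it differently, e.g. through a JSONDecoder subclass). json.load calls the hook
# for every decoded JSON object, so dicts are turned back into Taxa instances.
#
#   decoder = TaxaDecoder()          # hypothetical class owning object_hook
#   with open("taxa_list.json") as f:
#       taxa = json.load(f, object_hook=decoder.object_hook)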
def generate_lists(base_folder, prefix, save_lists=False):
    filename = os.path.join(base_folder,
                            prefix + "_species_list_webpage.pickle")
    s = request_handler.get_soup(species_check_page_url, filename)

    genus_list = []
    species_list = []

    # find all the listed items
    print("Gathering species from chrysis.net...")
    for li in s.find_all("li"):
        # find the links inside the listed items
        ref = li.find_all("a")

        # select the right link
        if len(ref) == 2:
            name = ref[1]
        elif len(ref) == 1:
            name = ref[0]
        else:
            continue

        # split the name
        name_parts = name.text.split(" ")

        # "%s %s %s %s, %year" is the expected format for a name with an author
        if len(name_parts) > 3 and name.text.find(",") != -1:
            genus = name_parts[0]
            specie = name_parts[1]

            if len(name_parts[2]) > 0:
                # add the subspecie
                specie += " " + name_parts[2]
                author = "".join(part + " " for part in name_parts[3:])
            else:
                author = "".join(part + " " for part in name_parts[2:])

            author = author[:-1]
            author = author.replace("[E]", "")

            species_list.append(
                Taxa.Taxa(genus + " " + specie, author, None, None))
        else:
            continue

    return genus_list, species_list
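
# Stand-alone trace (the sample entry is invented) of how the splitting above is
# meant to behave for a name that carries a subspecies and an author:
#
#   "Chrysis ignita ignita Linnaeus, 1758"
#       name_parts -> ['Chrysis', 'ignita', 'ignita', 'Linnaeus,', '1758']
#       genus  -> "Chrysis"
#       specie -> "ignita ignita"   (subspecies appended)
#       author -> "Linnaeus, 1758"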
def generate_lists(family_name, fileinfo, load_lists=True):
    '''Function that generates both the genera and species lists'''

    print("Generating taxa list from Encyclopedia of Life (EOL)...")
    print("Input name: ", family_name)

    # perform a query to the website
    params = {"q": family_name.lower()}

    # other parameters available for the query on EOL
    # params["filter_by_hierarchy_entry_id"] = "any hierarchy id"
    # params["page"] = 1
    # params["exact"] = False
    # params["filter_by_taxon_concept_id"] = "any taxon_id"
    # params["filter_by_string"] = "???"
    # params["cache_ttl"] = "n seconds in cache"

    path = fileinfo.cache_filename("eol_query")
    req = RequestsHandler.Request(eol_api, path, params)
    req.load()
    json_data = req.response.json()

    # show the results
    print("Possible matches: ", len(json_data["results"]))
    for result in json_data["results"]:
        print(" -", result["title"])

    # select the first result of the query
    result_link = json_data["results"][0]["link"]

    # create the reference link
    family_page = result_link + "/names"
    print("Downloading data from:", family_page)

    # soup the link
    path = fileinfo.cache_filename("eol_webpage")
    s = RequestsHandler.get_soup(family_page, path)

    # select the section corresponding to the hierarchical tree
    samples = s.select(
        "body > div.l-basic-main > div.l-content > div > div.ui.segments > div:nth-child(1)"
    )

    # gather the names encapsulated in the divs
    divs = samples[0].find_all("div")
    names = divs[0].find_all("a")

    # create the genus list based on the detection of the next family:
    # family1 == family_name -- start
    #   genus1
    #   genus2
    # family2 (ending in -idae) -- stop
    genus_list = []
    collect_genus = False

    fam_taxa = Taxa.Taxa()
    fam_taxa.family = family_name
    fam_taxa.source = Taxa.Taxa.source_eol
    fam_taxa.rank = Taxa.Taxa.rank_family

    for name in names:
        if collect_genus:
            # match the family ending (-idae)
            if name.text.endswith("idae"):
                collect_genus = False
            else:
                genus = Taxa.Taxa()
                genus.copy_taxa(fam_taxa)
                genus.rank = Taxa.Taxa.rank_genus
                genus.genus = name.text
                genus.links.append(eol_main + name.get("href"))
                genus_list.append(genus)

        if name.text == family_name:
            collect_genus = True

    pbar = ProgressBar.ProgressBar(len(genus_list))
    species_list = []

    for i, taxa in enumerate(genus_list):
        # open the website and look for the species list
        filename = fileinfo.cache_filename(taxa.genus)
        s = RequestsHandler.get_soup(taxa.links[0], filename)

        div = s.find("div", {"class": "page-children"})
        links = div.find_all("a")

        # once all the links in the section are found, gather the information
        for link in links:
            # the html link element contains the species name and authors
            parts = link.text.split(" ")
            if len(parts) < 3:
                # print(f"Name is not rightfully formatted: {link.text}")
                continue

            n_specie = 1
            # this is a subgenus
            if parts[n_specie].startswith("("):
                n_specie = 2

            # the element after the genus (or subgenus) is the species epithet
            specie_name = parts[n_specie]

            # the rest is the author name
            author_name = "".join(p + " " for p in parts[n_specie + 1:])[:-1]

            # add the comma before the year
            author_name = re.sub(r" (\d\d\d\d)", r", \1", author_name)

            # filter out common names in the author, e.g.
            # '(Linne, 1758) (European mantis)'
            # if the name has a final parenthesis that is not preceded by a year
            if author_name and author_name[-1] == ")":
                if not author_name[-2].isdigit():
                    # look for where the parenthesis starts
                    reverse_name = author_name[::-1]
                    par_position = 0
                    while par_position < len(
                            author_name) and reverse_name[par_position] != "(":
                        par_position += 1

                    # slice the string one character before the parenthesis
                    slice_position = len(author_name) - par_position - 2
                    author_name = author_name[:slice_position]

            specie_link = eol_main + link.get("href")

            specie = Taxa.Taxa()
            specie.copy_taxa(taxa)
            specie.rank = Taxa.Taxa.rank_specie
            specie.specie = specie_name
            specie.author = author_name
            specie.links.append(specie_link)

            species_list.append(specie)

        pbar.draw_bar(i)

    print("Genus retrieved:", len(genus_list),
          "Species retrieved:", len(species_list))

    return genus_list, species_list
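
# Self-contained sketch of the author clean-up performed above (the sample string
# is invented and _clean_author_example is not part of the original module): it
# inserts the comma before the year and strips a trailing common name in
# parentheses, relying on the module-level "re" import.
def _clean_author_example(author_name):
    # add the comma before the year, e.g. "(Linne 1758)" -> "(Linne, 1758)"
    author_name = re.sub(r" (\d{4})", r", \1", author_name)
    # drop a trailing parenthetical that does not end with a year
    if author_name.endswith(")") and not author_name[-2].isdigit():
        author_name = author_name[:author_name.rfind("(")].rstrip()
    return author_name

# _clean_author_example("(Linne 1758) (European mantis)") -> "(Linne, 1758)"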
def generate_lists(family_name, fileinfo, load_lists=True):
    '''Function that arranges the genera and species in a list. The result
    could be translated into a tree, but that is difficult. The function
    returns a list of Taxa with name, author and reference link'''

    logger.main_log("Generating taxa list from NBN Atlas...")
    logger.log_short_report("Input name: " + family_name)

    api_url = "https://species-ws.nbnatlas.org/search?"
    param = {"q": family_name, "fq": "idxtype:TAXON"}

    req = RequestsHandler.Request(api_url,
                                  fileinfo.cache_filename("family_search"),
                                  param)
    req.load()
    search_json = req.get_json()
    search_results = search_json["searchResults"]["results"]

    # display the possible matches
    logger.log_short_report(f"Possible matches: {len(search_results)}")
    for result in search_results:
        logger.log_short_report(f" -{result['name']} ({result['guid']})")

    # pick the first result
    family_guid = search_results[0]["guid"]

    # webpage corresponding to the family
    family_url = "https://species.nbnatlas.org/species/" + family_guid
    filename = fileinfo.cache_filename("family")
    child_taxa = gather_child_taxa(family_url, filename)

    # start gathering the species
    genus_list = []
    species_list = []

    pwheel = ProgressBar.ProgressWheel()

    # search for the child taxa
    for dt, dd in child_taxa:
        # template for the genera that will be collected below
        fam_taxa = Taxa.Taxa()
        fam_taxa.family = family_name
        fam_taxa.rank = Taxa.Taxa.rank_genus
        fam_taxa.source = "n"

        pwheel.draw_symbol()
        element = NBNElement(dt, dd, fileinfo)

        # if it is a subfamily
        if element.get_rank() == "subfamily":
            subfam = element.gather_child_elements()

            for subf in subfam:
                subfam_taxa = Taxa.Taxa()
                subfam_taxa.copy_taxa(fam_taxa)
                subfam_taxa.subfamily = element.get_name()

                if subf.get_rank() == "tribe":
                    genuses = subf.gather_child_elements()

                    # a genus with a tribe and a subfamily
                    for genus in genuses:
                        if genus.get_rank() == "genus":
                            taxa = Taxa.Taxa()
                            taxa.copy_taxa(subfam_taxa)
                            taxa.author = genus.get_author()
                            taxa.genus = genus.get_name()
                            taxa.links.append(genus.get_link())
                            taxa.tribe = subf.get_name()
                            genus_list.append(taxa)

                # a genus with a subfamily but without a tribe
                if subf.get_rank() == "genus":
                    taxa = Taxa.Taxa()
                    taxa.copy_taxa(subfam_taxa)
                    taxa.author = subf.get_author()
                    taxa.genus = subf.get_name()
                    taxa.links.append(subf.get_link())
                    genus_list.append(taxa)

        # a genus without a subfamily
        if element.get_rank() == "genus":
            taxa = Taxa.Taxa()
            taxa.copy_taxa(fam_taxa)
            taxa.author = element.get_author()
            taxa.genus = element.get_name()
            taxa.links.append(element.get_link())
            genus_list.append(taxa)

    for genus in genus_list:
        pwheel.draw_symbol()

        filename = fileinfo.cache_filename(f"{genus.genus}_webpage")
        html_elements = gather_child_taxa(genus.links[0], filename)

        for dt, dd in html_elements:
            specie = NBNElement(dt, dd, fileinfo)

            taxa = Taxa.Taxa()
            taxa.copy_taxa(genus)
            taxa.author = specie.get_author()
            taxa.specie = specie.get_name().replace(genus.genus, "").strip()
            taxa.rank = Taxa.Taxa.rank_specie
            taxa.links.append(specie.get_link())

            species_list.append(taxa)

            # look for eventual subspecies
            # it is possible that the name of the specie is separated by a "/",
            # like tritici/obelisca, which would break the file name
            filename = fileinfo.cache_filename(taxa.specie.replace("/", "_"))
            soup = RequestsHandler.get_soup(taxa.links[0], filename)

            children = soup.find("section", id="classification")
            print(taxa.links[0])

            if children:
                dts = children.find_all("dt")
                dds = children.find_all("dd")
                html_parts = zip(dts, dds)

                # find the subspecies and append the eventual name and author
                for dth, ddh in html_parts:
                    if dth.text == "subspecies":
                        staxa = Taxa.Taxa()
                        staxa.copy_taxonomy(taxa)

                        subspecie_element = NBNElement(dth, ddh, fileinfo)
                        staxa.author = subspecie_element.get_author()

                        name_text = subspecie_element.get_name()
                        # clean up the "subsp." mention that sometimes appears
                        name_text = name_text.replace("subsp. ", "")

                        # select the subspecie name
                        staxa.subspecie = name_text.split(" ")[2]
                        staxa.rank = Taxa.Taxa.rank_subspecie
                        staxa.links.append(specie.get_link())
                        staxa.source = "n"

                        species_list.append(staxa)

    pwheel.end()

    logger.log_short_report(
        f"Genus retrieved: {len(genus_list)} Species retrieved: {len(species_list)}"
    )

    return genus_list, species_list
def generate_lists(family_name, fileinfo, load_lists=True):
    logger.main_log("Gathering data from BOLD Databases...")
    logger.log_short_report("Input name: " + family_name)

    # use the search API to search for the name
    param = {"taxName": family_name}
    req = RequestsHandler.Request(taxon_search_api_url,
                                  fileinfo.cache_filename("test"), param)
    res_json = req.get_json()

    logger.log_short_report("Possible matches: " +
                            str(res_json["total_matched_names"]))

    if not res_json["top_matched_names"]:
        raise ParseBoldException("No candidates found")

    for match in res_json["top_matched_names"]:
        tax_match = match["taxon"]
        logger.log_short_report(f" - {tax_match} (id: {match['taxid']})")

    # get the tax id from the search: pick the first match
    family = res_json["top_matched_names"][0]

    # use the id to get the information about the taxon
    param = {"taxId": family["taxid"], "dataTypes": "basic"}
    req = RequestsHandler.Request(taxonid_api_url,
                                  fileinfo.cache_filename("taxid_info"), param)
    res_json = req.get_json()

    # keys: dict_keys(['taxid', 'taxon', 'tax_rank', 'tax_division',
    # 'parentid', 'parentname', 'taxonrep', 'stats', 'country',
    # 'sitemap', 'images', 'sequencinglabs', 'depositry',
    # 'wikipedia_summary', 'wikipedia_link'])

    if res_json["tax_rank"] != "family":
        raise Exception("BOLD_downloader: the selected result is not a family")

    family_taxa = Taxa.Taxa()
    family_taxa.rank = Taxa.Taxa.rank_family
    family_taxa.family = res_json["taxon"]
    family_id = res_json["taxid"]

    logger.log_report("Retrieving subtaxa...")

    # use the retrieved information to scavenge the sub taxa
    taxa_list = generate_children_list(family_id, fileinfo, family_taxa)

    # print("Gathering specimens...")
    #
    # # use the specimen database to find the authors
    # specimens = specimen_list(family_name, fileinfo)
    #
    # print("Composing the list...")
    #
    # # assign the authors from the specimen database
    # for taxa in taxa_list:
    #     for specimen in specimens:
    #         if taxa.specie == specimen.specie and taxa.genus == specimen.genus:
    #             if taxa.author == None:
    #                 taxa.author = specimen.author

    # divide species and genus
    species_list = list(filter(lambda t: t.rank == t.rank_specie, taxa_list))
    genus_list = list(filter(lambda t: t.rank == t.rank_genus, taxa_list))

    logger.log_short_report(
        f"Genus retrieved: {len(genus_list)} Species retrieved: {len(species_list)}"
    )

    return genus_list, species_list
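
# Hedged sketch (uses only the response keys already seen above, but it is not
# part of the original module): instead of always taking the first entry of
# "top_matched_names", one could prefer an exact, case-insensitive name match.
def _pick_match_example(res_json, family_name):
    matches = res_json.get("top_matched_names", [])
    for match in matches:
        if match["taxon"].lower() == family_name.lower():
            return match
    return matches[0] if matches else None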
def get_children(taxid, fileinfo, parent_taxa=None):
    '''The function scans the webpage with the given taxid and finds the
    subtaxa associated with it'''

    # construct the request
    param = {"taxid": taxid}
    filename = fileinfo.cache_filename(f"taxid_search_{taxid}")
    req = RequestsHandler.Request(taxon_search_url, filename, param)

    # get the soup
    soup = req.get_soup()

    # find the sections containing the sub taxa; if there are fewer than seven
    # sections the page doesn't have sub taxa
    sections = soup.find_all("div", {"class": "col-md-6"})
    if len(sections) <= 6:
        return None
    else:
        subtaxa = sections[6]

    # find the taxons in the section; the subtaxa can have multiple ranks,
    # e.g. a family can have both genera and subfamilies associated with it
    taxon_ranks = subtaxa.find_all("lh")
    taxons = subtaxa.find_all("ol")

    # analyze the taxons
    taxa_list = []
    for rank, tax in zip(taxon_ranks, taxons):

        # dummy taxa holding the information that will be equal in all the
        # child taxa, like the source and the parent taxonomy
        retrived_taxa = Taxa.Taxa()
        retrived_taxa.source = Taxa.Taxa.source_bold
        if parent_taxa:
            retrived_taxa.copy_taxonomy(parent_taxa)

        # format the rank: "Subfamilies (7)" -> "subfamilies"
        frank = rank.text.split(" ")[0].lower()

        # convert into a Taxa rank type
        taxa_ranks = {
            "subfamilies": Taxa.Taxa.rank_subfamily,
            "tribes": Taxa.Taxa.rank_tribe,
            "genera": Taxa.Taxa.rank_genus,
            "species": Taxa.Taxa.rank_specie,
            "subspecies": Taxa.Taxa.rank_subspecie
        }

        try:
            retrived_taxa.rank = taxa_ranks[frank]
        except KeyError:
            raise Exception("BOLD_downloader: Taxon rank not present")

        if retrived_taxa.rank == Taxa.Taxa.rank_specie or \
           retrived_taxa.rank == Taxa.Taxa.rank_subspecie:
            names = re.sub(r"\[\d+\]", ", ", tax.text)
            logger.main_log("Skipped because species: " + names)
            continue

        # get the names associated with the rank
        taxon_names = tax.find_all("a")

        for name in taxon_names:
            taxa = Taxa.Taxa()
            taxa.copy_taxa(retrived_taxa)

            # parse the name
            if taxa.rank == Taxa.Taxa.rank_specie:
                text = name.text.split(" ")[1].strip()
            elif taxa.rank == Taxa.Taxa.rank_subspecie:
                text = name.text.split(" ")[2].strip()
            else:
                text = name.text.split(" ")[0].strip()

            # the "sp." is marked under species but has no designated name,
            # only a code like CB-12; some entries are probably provisional
            # names that contain codes and symbols
            if text == "sp." or text == "cf." or "-" in text or "_" in text:
                continue
            if text.find("sp.") != -1:
                logger.log_report("sp. in text, text: " + name.text)
                continue
            if text.find("nr.") != -1:
                logger.log_report("nr. in text, text: " + name.text)
                continue

            # skip names containing digits
            if any(c.isdigit() for c in text):
                continue

            # assign the name
            if taxa.rank == Taxa.Taxa.rank_subfamily:
                taxa.subfamily = text
            elif taxa.rank == Taxa.Taxa.rank_tribe:
                taxa.tribe = text
            elif taxa.rank == Taxa.Taxa.rank_genus:
                taxa.genus = text
            elif taxa.rank == Taxa.Taxa.rank_specie:
                taxa.specie = text
            elif taxa.rank == Taxa.Taxa.rank_subspecie:
                taxa.subspecie = text
            else:
                raise Exception("Taxon rank not present")

            # assign the relative link, used later to find the sub taxa
            link = main_url + name.get("href")
            taxa.links.append(link)

            taxa_list.append(taxa)

    return taxa_list
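
# Hedged sketch of how a recursive walk over get_children could look; the real
# generate_children_list used by generate_lists may differ. The child taxid is
# assumed (hypothetically) to be readable from the stored link, e.g. a link of
# the form ".../Taxbrowser_Taxonpage?taxid=12345".
def _walk_children_example(taxid, fileinfo, parent_taxa=None):
    collected = []
    children = get_children(taxid, fileinfo, parent_taxa)
    if not children:
        return collected
    for child in children:
        collected.append(child)
        child_id = child.links[-1].split("taxid=")[-1]
        collected.extend(_walk_children_example(child_id, fileinfo, child))
    return collected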
def specimen_list(family_name, fileinfo):
    '''This function produces the list of specimens present in the database;
    it filters out all the specimens that have the same taxonomic
    designation'''

    # request all the specimens that belong to family_name
    param = {"taxon": family_name, "format": "json"}

    # in case the parameters contain a space, encode it as "name%20name"
    # instead of "name+name". There shouldn't be spaces in a family_name,
    # yet this is kept for legacy purposes
    param = urllib.parse.urlencode(param, quote_via=urllib.parse.quote)

    filename = fileinfo.cache_filename("specimen_api")

    # perform the request
    req = RequestsHandler.Request(specimen_api_url, filename, param)
    res_json = req.get_json()

    # select the respective records from the response
    records = res_json["bold_records"]["records"]

    # these are the possible dict keys of a record; if a response contains
    # more keys the program will raise an error
    tax_keys = [
        'identification_provided_by', 'identification_method', 'phylum',
        'class', 'order', 'family', 'subfamily', 'genus', 'species',
        'subspecies'
    ]

    # convert the records into taxa
    taxa_list = list()

    for record in records.values():
        # check that the program handles all the taxon ranks in the record
        tax_list = list(record["taxonomy"].keys())
        for word in tax_list:
            if word in tax_keys:
                # print(".", end="")
                continue
            else:
                raise Exception("BOLD_downloader: the key is not present: " +
                                word)

        # create the record class which will parse the JSON
        rec = Record(record)

        # we are only interested in genus + specie, not family or subfamily,
        # which anyway carry no author or information tied to the genus
        if rec.genus is None and rec.specie is None:
            continue

        # there are species that aren't determined yet
        if rec.specie == "n. sp.":
            continue

        # transform the record into a taxa
        taxa = Taxa.Taxa()
        taxa.family = rec.family
        taxa.subfamily = rec.subfamily
        taxa.tribe = rec.tribe
        taxa.genus = rec.genus
        taxa.specie = rec.specie
        taxa.author = rec.author
        taxa.rank = rec.rank

        # if the taxa is already present don't add it,
        # otherwise append it to the list
        for existing_taxa in taxa_list:
            if existing_taxa.is_equal(taxa):
                break
        else:
            taxa_list.append(taxa)

    return taxa_list
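
# Aside (sketch only, not part of the original): the for/else check above is
# quadratic in the number of records; keying a set on the fields compared by
# is_equal gives the same filtering in linear time. The key below assumes
# (genus, specie, author) is what makes two records "equal".
def _dedup_example(records):
    seen = set()
    unique = []
    for rec in records:
        key = (rec.genus, rec.specie, rec.author)
        if key not in seen:
            seen.add(key)
            unique.append(rec)
    return unique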
if __name__ == "__main__":
    import FileInfo

    base_folder = "./Tests/test_BOLD"
    fileinfo = FileInfo.FileInfo(base_folder, "bold", "Mycetophilidae")

    # gather the lists
    genus_list, species_list = generate_lists(fileinfo.family_name, fileinfo)
    taxa_list = species_list + genus_list

    Taxa.save_taxa_list(taxa_list, fileinfo.pickle_filename("taxa_list"))

    # load the list back
    taxa_list = Taxa.load_taxa_list(fileinfo.pickle_filename("taxa_list"))
    Taxa.construct_associations(taxa_list)