Esempio n. 1
0
    def ncbi_place(self, state):
        """
        Locate reference species that share at least one taxid with the
        requested taxids.

        The requested taxids are read from ``self.state["taxids"]``; any
        entry containing a comma is split on it. Every node of the table
        returned by ``load_tax_info`` whose associated values overlap with
        the requested taxids is kept, provided it is also a key of the table
        returned by ``load_SCMGs``.

        :param state: dict providing ``state["dbinfo"]["files"]["taxinfo"]``
                      and ``state["dbinfo"]["files"]["scmgs"]``
        :return: dict with "placements" (list of ``{"n": node}`` dicts) and
                 "genomes" (set of the selected nodes)
        """
        # NOTE(review): taxids come from self.state while the file locations
        # come from the ``state`` argument -- confirm this is intended.
        requested = set()
        for raw in self.state["taxids"]:
            # only comma separated entries are split up
            pieces = raw.split(",") if "," in raw else [raw]
            requested.update(str(piece) for piece in pieces)

        files = state["dbinfo"]["files"]
        tax_table = load_tax_info(files["taxinfo"])
        # keep every node whose values share something with the request
        matching = {
            node
            for node, values in tax_table.items()
            if requested.intersection(values)
        }

        # restrict to nodes that also have an entry in the SCMG table
        scmgs = load_SCMGs(files["scmgs"])
        matching = matching.intersection(set(scmgs.keys()))

        message = "Located {} species corresponding to the provided taxids"
        logging.info(message.format(len(matching)))
        return {
            "placements": [{"n": node} for node in matching],
            "genomes": matching,
        }
Esempio n. 2
0
 def test_scmg_loading_gz(self):
     # The gzipped fixture must load into exactly this mapping of sets.
     loaded = load_SCMGs(TESTDATA_SCMG_GZ)
     wanted = {
         "A": {"1", "2"},
         "B": {"1", "2", "3"},
         "C": {"1"},
     }
     self.assertEqual(wanted, loaded)
Esempio n. 3
0
def hard_set_computation(set_path,
                         genomes,
                         prevalence=98,
                         atmost=500,
                         set_size=20):
    """
    Compute a marker set shared across the given genomes.

    The prevalence threshold starts at 100 and is lowered in steps of 0.5
    for as long as it stays at or above ``prevalence``. At each step
    ``percentage_sets`` is called on the per-genome entries; the first
    result holding at least ``set_size`` elements is returned.

    :param set_path: path handed to ``load_SCMGs``
    :param genomes: sized collection of genome identifiers, used as keys
                    into the mapping returned by ``load_SCMGs``
    :param prevalence: lowest prevalence threshold that is tried
    :param atmost: forwarded unchanged to ``percentage_sets``
    :param set_size: minimal size a result needs to be accepted
    :return: the accepted result of ``percentage_sets`` or None if no
             threshold produced a large enough result
    """
    scmg = load_SCMGs(set_path)

    # The per-genome entries do not depend on the prevalence threshold, so
    # they are collected once. This also reports a missing genome a single
    # time instead of once per threshold step.
    # NOTE(review): assumes percentage_sets does not modify the list it is
    # given -- confirm against its implementation.
    sets = []
    for genome in genomes:
        try:
            sets.append(scmg[genome])
        except KeyError:
            logging.warning(
                "Database missing markes for '{}'. This should not be the case. Make sure the database is not corrupted"
                .format(genome))

    biggest = 0
    set_prevalence = 100
    while set_prevalence >= prevalence:
        logging.debug(
            "Searching for Marker set at {} prevalence across {} genomes".
            format(set_prevalence, len(genomes)))
        s = percentage_sets(sets, set_prevalence, atmost)
        biggest = max(biggest, len(s))
        if len(s) >= set_size:
            logging.debug("Largest set we found had {} SCMGs".format(biggest))
            logging.debug("Found set of size {} with prevalence {}".format(
                len(s), set_prevalence))
            return s
        set_prevalence -= 0.5

    logging.debug("Largest set we found had {} SCMGs".format(biggest))
    return None
Esempio n. 4
0
    def __init__(
        self,
        tree_v,
        placement,
        setp,
        set_species=5,
        set_size=50,
        set_prevalence=98,
        set_atmost=500,
        dynamic_root=False,
        set_selection="lm",
        use_ncbi=False,
        training=False,
        taxinfo=None,
    ):
        """
        Build the tree, collect the placements and search for a marker set.

        The result of the search is stored in ``self.marker_set``. With
        ``dynamic_root`` the LCA of all placements is stored in ``self.lca``
        and the tree is rerooted on the node farthest from it.

        :param tree_v: value handed to ``Tree``
        :param placement: dict whose "placements" entry is a list of dicts;
                          the "n" value of each is either a str or a list
        :param setp: path handed to ``load_SCMGs``
        :param set_species: forwarded as ``set_species`` to the set search
        :param set_size: forwarded as ``min_set_size`` to the set search
        :param set_prevalence: forwarded as ``min_prevalence``
        :param set_atmost: forwarded as ``set_atmost`` to the set search
        :param dynamic_root: reroot the tree before searching
        :param set_selection: forwarded as ``sort_using`` (non NCBI search)
        :param use_ncbi: use ``_find_best_ncbi_set`` instead of
                         ``_find_best_set``
        :param training: forwarded to ``_find_best_set``; the summary log
                         line is only written when this is False
        :param taxinfo: path handed to ``load_tax_info``
        """
        self.t = Tree(tree_v)

        # flatten the "n" entries of all placements into one list; only
        # exact list and str values are picked up
        places = []
        for entry in placement["placements"]:
            node = entry["n"]
            kind = type(node)
            if kind is list:
                places.extend(node)
            elif kind is str:
                places.append(node)

        # root the tree to get best clade patterns
        if dynamic_root:
            logging.debug("Will use most distant entry to LCA as outgroup")
            self.lca = self.LCA(places)
            farthest = self.lca.get_farthest_node()
            self.t.set_outgroup(farthest[0])

        # load in all marker genes
        scmg = load_SCMGs(setp)
        self.known_leafes = set(load_tax_info(taxinfo).keys())

        logging.debug(
            "Starting to look for scmg set, selection based on {}".format(
                set_selection))

        # keyword arguments both search strategies have in common
        shared = {
            "min_set_size": set_size,
            "set_atmost": set_atmost,
            "set_species": set_species,
            "min_prevalence": set_prevalence,
        }
        if use_ncbi:
            logging.debug("Will use NCBI tree instead of eukcc tree")
            self.marker_set = self._find_best_ncbi_set(
                places, scmg, taxinfo=taxinfo, **shared)
        else:
            self.marker_set = self._find_best_set(
                places,
                scmg,
                training=training,
                sort_using=set_selection,
                **shared)

        # the summary is only written outside of training and when a set
        # was actually found
        if training is not False or self.marker_set is None:
            return

        found = self.marker_set
        logging.debug(
            "Defined SCMG set with {} marker genes with a single copy prevalence of {} percent covering {} related genomes supported by {}/{} placements"
            .format(
                len(found.profiles),
                found.prevalence,
                len(found.leafes),
                len(found.covered),
                len(found.all_places),
            ))
Esempio n. 5
0
 def test_scmg_loading_csv(self):
     # The plain and the gzipped fixture must load to the same result.
     from_plain = load_SCMGs(TESTDATA_SCMG)
     from_gzip = load_SCMGs(TESTDATA_SCMG_GZ)
     self.assertEqual(from_plain, from_gzip)
Esempio n. 6
0
 def test_scmg_missing(self):
     # Loading from a path that does not exist has to raise.
     missing_path = "adfhjdshjskfjf"
     self.assertRaises(FileNotFoundError, load_SCMGs, missing_path)