Esempio n. 1
0
    def __init__(self, newick, constraint_dict, constraint_exact):
        """
        Set up the tree and constraint options, then generate the tests.

        Parameters
        ----------
        newick :
            Passed as-is to ``toytree.tree()`` to build the tree to traverse.
        constraint_dict : dict or None
            When truthy, its entries override the default empty constraints
            stored under the keys "p1", "p2", "p3" and "p4".
        constraint_exact : bool or list
            A single bool is expanded to a list of four copies. A list must
            have one entry per key of the constraint dict.

        Raises
        ------
        IPyradError
            If the parsed tree is not rooted.
        Exception
            If ``constraint_exact`` is a list of the wrong length.
        """
        # containers for the four-taxon splits
        self.testset = set()
        self.hold = [0, 0, 0, 0]

        # parse the tree; an unrooted tree cannot be traversed for tests
        self.tree = toytree.tree(newick)
        if not self.tree.is_rooted():
            raise IPyradError(
                "generate_tests_from_tree(): tree must be rooted and resolved")

        # start from empty constraints and overlay whatever the user gave
        self.cdict = OrderedDict(
            [("p1", []), ("p2", []), ("p3", []), ("p4", [])])
        if constraint_dict:
            self.cdict.update(constraint_dict)

        # a lone bool applies to all four positions
        exact = constraint_exact
        if isinstance(exact, bool):
            exact = [exact] * 4
        self.xdict = exact

        # only lists are length-checked; other types are stored untouched
        if isinstance(exact, list) and len(exact) != len(self.cdict):
            raise Exception(
                "constraint_exact must be bool or list of bools length N")

        # build the tests
        self.loop()
Esempio n. 2
0
    def _run(self, force=False, ipyclient=None):
        """
        Submit one D-statistic job per test to a load-balanced view.

        Parameters
        ----------
        force : bool
            Not used in the visible part of this method.
        ipyclient :
            Client object providing ``load_balanced_view()`` and ``len()``
            (presumably an ipyparallel Client -- TODO confirm). The default
            of None cannot work: the first statement calls a method on it.

        Raises
        ------
        IPyradError
            If ``self.tests`` is empty.

        NOTE(review): ``itests``, ``imdict``, ``loci`` and ``dstat`` are used
        below but are not assigned anywhere in the visible body. Unless they
        are module-level names, the first ``next(itests)`` raises NameError.
        The snippet also ends right after submission, so result collection
        is not shown here.
        """

        # load balancer
        lbview = ipyclient.load_balanced_view()

        # check that tests are OK; a single test dict is wrapped in a list
        if not self.tests:
            raise IPyradError("no tests found")
        if isinstance(self.tests, dict):
            self.tests = [self.tests]

        # check that mindict is OK: an int is applied to every key of
        # self.imap, and a falsy value (including 0) becomes 1 for every key
        if isinstance(self.minmap, int):
            self.minmap = {i: self.minmap for i in self.imap}
        if not self.minmap:
            self.minmap = {i: 1 for i in self.imap}

        # send jobs to the client (but not all at once b/c njobs can be huge):
        # at most len(ipyclient) jobs are submitted by this loop
        rasyncs = {}
        idx = 0
        for i in range(len(ipyclient)):

            # take the next test; when the tests run out before the loop
            # ends, the remaining iterations are skipped one by one
            try:
                test = next(itests)
                mindict = next(imdict)
            except StopIteration:
                continue

            # job handles are keyed by submission order
            rasyncs[idx] = lbview.apply(
                dstat, *[loci, test, mindict, self.params.nboots])
            idx += 1
Esempio n. 3
0
def tree2tests(newick, constraint_dict, constraint_exact):
    """
    Enumerate the four-taxon tests (p1, p2, p3, p4) found in a rooted tree.

    Every node of the tree is taken in turn as the top of a test. One of
    its children supplies candidate p4 nodes and the sister of that child
    supplies p3, p2 and p1 by repeating the same parent/sister descent.
    A candidate is accepted for a position only if ``test_constraint``
    approves it for that position.

    Parameters
    ----------
    newick :
        Passed as-is to ``toytree.tree()``.
    constraint_dict : dict or None
        When truthy, overrides the default empty constraints stored under
        "p1" .. "p4".
    constraint_exact : bool or list
        A single bool is expanded to four copies. A list must have one
        entry per constraint key; entry 0 is used for p1 and entry 3 for p4.

    Returns
    -------
    list of dict
        One dict per unique combination of node names, with keys 'p4',
        'p3', 'p2', 'p1' holding the leaf names below the chosen nodes.

    Raises
    ------
    IPyradError
        If the tree is not rooted.
    Exception
        If ``constraint_exact`` is a list of the wrong length.
    """
    tre = toytree.tree(newick)
    if not tre.is_rooted():
        raise IPyradError(
            "Input tree must be rooted to use generate_tests_from_tree()")

    # node-name tuples already emitted, used to drop duplicate tests
    seen = set()

    # empty constraints unless the caller supplies some
    cdict = OrderedDict([("p1", []), ("p2", []), ("p3", []), ("p4", [])])
    if constraint_dict:
        cdict.update(constraint_dict)

    # one exactness flag per position
    if isinstance(constraint_exact, bool):
        constraint_exact = [constraint_exact] * 4
    if isinstance(constraint_exact, list) and (
            len(constraint_exact) != len(cdict)):
        raise Exception(
            "constraint_exact must be bool or list of bools of length N")

    found = []

    # Each level below follows the same recipe: pick a child as the parent
    # of one position, test every node under it, and continue the descent
    # in the first sister of that child.
    for topnode in tre.treenode.traverse():
        for oparent in topnode.children:
            for onode in oparent.traverse("levelorder"):
                if not test_constraint(
                        onode, cdict, "p4", constraint_exact[3]):
                    continue

                # the clade holding p1, p2 and p3 is sister to the p4 parent
                p123parent = oparent.get_sisters()[0]
                for p123node in p123parent.traverse("levelorder"):
                    for p3parent in p123node.children:
                        for p3node in p3parent.traverse("levelorder"):
                            if not test_constraint(
                                    p3node, cdict, "p3",
                                    constraint_exact[2]):
                                continue

                            # the p1+p2 clade is sister to the p3 parent
                            p12parent = p3parent.get_sisters()[0]
                            for p12node in p12parent.traverse("levelorder"):
                                for p2parent in p12node.children:
                                    for p2node in p2parent.traverse(
                                            "levelorder"):
                                        if not test_constraint(
                                                p2node, cdict, "p2",
                                                constraint_exact[1]):
                                            continue

                                        # p1 comes from the sister of the
                                        # p2 parent
                                        p1parent = p2parent.get_sisters()[0]
                                        for p1node in p1parent.traverse(
                                                "levelorder"):
                                            if not test_constraint(
                                                    p1node, cdict, "p1",
                                                    constraint_exact[0]):
                                                continue

                                            label = (
                                                onode.name,
                                                p3node.name,
                                                p2node.name,
                                                p1node.name,
                                            )
                                            entry = {
                                                'p4': onode.get_leaf_names(),
                                                'p3': p3node.get_leaf_names(),
                                                'p2': p2node.get_leaf_names(),
                                                'p1': p1node.get_leaf_names(),
                                            }
                                            if label not in seen:
                                                found.append(entry)
                                                seen.add(label)
    return found
Esempio n. 4
0
def _loci_to_arr(loci, taxdict, mindict):
    """
    Build a per-site frequency array from loci for the taxa of one test.

    Parameters
    ----------
    loci : sequence of str
        One string per locus, made of newline-separated lines whose first
        whitespace-delimited field is a sample name and whose second is
        its sequence. The last line of every locus is discarded.
    taxdict : dict
        Maps 'p1'..'p4' (4-taxon test) or 'p1'..'p5' (5-taxon test) to the
        sample names in each taxon. The highest-sorting key is the
        outgroup and supplies the reference (ancestral) state.
    mindict : int, dict or anything else
        Minimum number of samples that must be present per taxon for a
        locus to be kept. An int applies to all taxa, a dict is read per
        taxon, and any other value means 1 per taxon.

    Returns
    -------
    (arr, keep)
        ``keep`` is a boolean array with one entry per input locus,
        True where coverage was sufficient. ``arr`` holds the kept loci
        only, with 4 rows per locus (6 for a 5-taxon test: p1, p2, p3, p4,
        p3+p4 combined, outgroup), after being passed through
        ``masknulls``.

    Raises
    ------
    IPyradError
        If a key of ``taxdict`` is not one of 'p1'..'p5'.
    ValueError
        If ``loci`` is empty.
    """

    ## the longest first line sets the site dimension (computed once)
    nloci = len(loci)
    maxlen = max(len(locus.split("\n")[0]) for locus in loci)

    ## allocate the mask and the array directly at the needed size:
    ## six rows for 5-taxon tests, four rows otherwise
    nrows = 6 if len(taxdict) == 5 else 4
    keep = np.zeros(nloci, dtype=np.bool_)
    arr = np.zeros((nloci, nrows, maxlen), dtype=np.float64)

    ## expand mindict to one minimum per taxon
    if isinstance(mindict, int):
        mindict = {i: mindict for i in taxdict}
    elif isinstance(mindict, dict):
        mindict = {i: mindict[i] for i in taxdict}
    else:
        mindict = {i: 1 for i in taxdict}

    ## raise error if names are not 'p[int]'
    allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
    if any(i not in allowed_names for i in taxdict):
        raise IPyradError(\
            "keys in taxdict must be named 'p1' through 'p4' or 'p5'")

    ## every key starts with 'p' after the check above; last one is outgroup
    keys = sorted(taxdict)
    outg = keys[-1]

    def _member_idx(names, members):
        "Row indices of the samples in this locus that belong to members."
        return np.where([name in members for name in names])[0]

    def _freq(seqs, idx, ancestral):
        "Frequency row of the selected samples relative to ancestral."
        ## NOTE(review): .view(np.uint8) assumes one byte per character
        ## in seqs -- confirm the dtype produced for this input.
        return _reffreq2(ancestral, seqs[idx].view(np.uint8), GETCONS2)

    for loc, locus in enumerate(loci):

        ## parse the locus (its last line is not a sample line)
        lines = locus.split("\n")[:-1]
        names = [line.split()[0] for line in lines]
        seqs = np.array([list(line.split()[1]) for line in lines])

        ## skip loci where any taxon has too few samples present
        ## (coverage is still checked per site later on)
        covered = all(
            sum(j in names for j in taxdict[tax]) >= mindict[tax]
            for tax in taxdict)
        if not covered:
            continue
        keep[loc] = True

        ## the outgroup defines the ancestral state and fills the last row
        refseq = seqs[_member_idx(names, taxdict[outg])].view(np.uint8)
        ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])
        iseq = _reffreq2(ancestral, refseq, GETCONS2)
        arr[loc, -1, :iseq.shape[1]] = iseq

        if len(taxdict) == 4:
            ## rows 0..2 are p1..p3 in sorted key order
            for tidx, key in enumerate(keys[:-1]):
                iseq = _freq(
                    seqs, _member_idx(names, taxdict[key]), ancestral)
                arr[loc, tidx, :iseq.shape[1]] = iseq

        else:
            ## the outgroup row is already filled above; the original
            ## recomputed and rewrote the identical values here

            ## rows 0 and 1: p1 and p2
            for row, key in enumerate(('p1', 'p2')):
                iseq = _freq(
                    seqs, _member_idx(names, taxdict[key]), ancestral)
                arr[loc, row, :iseq.shape[1]] = iseq

            ## rows 2 and 3: p3 with p4 masked, and p4 with p3 masked.
            ## Both masks are taken before either array is modified.
            nidx = _member_idx(names, taxdict['p3'])
            nidy = _member_idx(names, taxdict['p4'])
            xseq = _freq(seqs, nidx, ancestral)
            yseq = _freq(seqs, nidy, ancestral)
            mask3 = xseq != 0
            mask4 = yseq != 0
            xseq[mask4] = 0
            yseq[mask3] = 0
            arr[loc, 2, :xseq.shape[1]] = xseq
            arr[loc, 3, :yseq.shape[1]] = yseq

            ## row 4: p3 and p4 samples pooled
            iseq = _freq(seqs, nidx.tolist() + nidy.tolist(), ancestral)
            arr[loc, 4, :iseq.shape[1]] = iseq

    ## size-down array to the number of loci that have taxa for the test
    arr = arr[keep, :, :]

    ## size-down sites (masknulls is defined elsewhere in the module)
    arr = masknulls(arr)

    return arr, keep
Esempio n. 5
0
    def plot(self,
             show_test_labels=True,
             use_edge_lengths=False,
             collapse_outgroup=False,
             pct_tree_x=0.5,
             pct_tree_y=0.2,
             subset_tests=None,
             prune_tree_to_tests=False,
             *args,
             **kwargs):
        """
        Draw a multi-panel figure with tree, tests, and results.

        Parameters
        ----------
        show_test_labels, use_edge_lengths, collapse_outgroup,
        pct_tree_x, pct_tree_y :
            Forwarded unchanged to ``baba_panel_plot``.
        subset_tests : iterable of int or None
            When given, only these entries of ``self.tests`` are drawn and
            ``self.results_boots`` is indexed with the same object.
        prune_tree_to_tests : bool
            When true, tips whose label is not among the taxa of the first
            row of ``self.taxon_table`` are dropped from the tree, which is
            then ladderized.
        *args, **kwargs :
            Forwarded to ``baba_panel_plot`` (presumably where options such
            as height and width go -- TODO confirm).

        Returns
        -------
        (canvas, axes, panel) as returned by ``baba_panel_plot``.

        Raises
        ------
        IPyradError
            If ``self.newick`` or ``self.tests`` is empty.

        Note: a single test stored as a dict is replaced on the instance by
        a one-element list.
        """

        ## a tree and at least one test are both required
        if not self.newick:
            raise IPyradError("baba plot requires a newick treefile")
        if not self.tests:
            raise IPyradError("baba plot must have a .tests attribute")

        ## normalize a lone test dict to a list of tests
        if isinstance(self.tests, dict):
            self.tests = [self.tests]

        ## parse the tree afresh for this figure
        ttree = toytree.tree(self.newick)

        ## choose what to draw: everything, or the requested subset
        if subset_tests is None:
            tests = self.tests
            boots = self.results_boots
        else:
            tests = [self.tests[i] for i in subset_tests]
            boots = self.results_boots[subset_tests]

        ## optionally drop tips that are not part of the tests
        if prune_tree_to_tests:
            wanted = set(itertools.chain(*self.taxon_table.values[0]))
            unwanted = [
                tip for tip in ttree.get_tip_labels() if tip not in wanted
            ]
            ttree = ttree.drop_tips(unwanted)
            ttree.tree.ladderize()

        ## draw; extra positional arguments are passed ahead of the keywords
        canvas, axes, panel = baba_panel_plot(
            *args,
            ttree=ttree,
            tests=tests,
            boots=boots,
            show_test_labels=show_test_labels,
            use_edge_lengths=use_edge_lengths,
            collapse_outgroup=collapse_outgroup,
            pct_tree_x=pct_tree_x,
            pct_tree_y=pct_tree_y,
            **kwargs)
        return canvas, axes, panel
Esempio n. 6
0
def batch(baba, ipyclient=None):
    """
    Run every test in ``baba.tests`` on the engines of ``ipyclient``.

    At most ``len(ipyclient)`` jobs are in flight at a time: the queue is
    primed with that many tests and one new test is submitted each time a
    job finishes.

    Parameters
    ----------
    baba :
        Object providing ``data`` (path to a loci file, or a generator of
        simulated data), ``tests`` (one test dict or a list of them),
        ``params.mincov`` and ``params.nboots``.
    ipyclient :
        Client providing ``load_balanced_view()``, ``len()`` and
        ``abort()`` (presumably an ipyparallel Client -- TODO confirm).
        Required whenever there is at least one test.

    Returns
    -------
    None
        When there are no tests (a message is printed).
    (DataFrame, ndarray)
        For 4-taxon tests: one result row per test, plus the bootstrap
        array of shape (ntests, nboots).
    (DataFrame, None)
        For other tests: the per-test result frames stacked under a
        two-level (test index, 'p3'/'p4'/'shared') index.

    Raises
    ------
    IPyradError
        If any job reports failure.
    KeyboardInterrupt
        Re-raised after asking the client to abort outstanding jobs.
    """
    # parse args
    handle = baba.data
    taxdicts = baba.tests
    mindicts = baba.params.mincov
    nboots = baba.params.nboots

    ## a single test is wrapped so that taxdicts is always a list
    if isinstance(taxdicts, dict):
        taxdicts = [taxdicts]

    ## nothing to run: stop before touching the data file or the client
    if not taxdicts:
        print("no tests found")
        return

    ## a generator of simulations is made reusable; a path is made absolute
    sims = isinstance(handle, types.GeneratorType)
    if sims:
        handle = list(handle)
    else:
        handle = os.path.realpath(handle)

    ## view used for every submission (it was never created before, so
    ## each lbview.apply() call raised NameError)
    lbview = ipyclient.load_balanced_view()

    ## arrays to hold results (len(taxdicts), nboots)
    tot = len(taxdicts)
    resarr = np.zeros((tot, 7), dtype=np.float64)
    bootsarr = np.zeros((tot, nboots), dtype=np.float64)
    paneldict = {}

    ## parse the loci file here just once rather than on every engine
    start = time.time()
    loci = None
    if not sims:
        with open(handle, 'r') as infile:
            loci = infile.read().strip().split("|\n")

    asyncs = {}

    def _submit(job, test):
        "Send one test to the cluster, stored under its job index."
        if sims:
            ## simulated input is converted but cannot be analysed yet
            _msp_to_arr(handle, test)
            print("not yet implemented")
        else:
            ## the same mincov setting is used for every test
            asyncs[job] = lbview.apply(
                dstat, *[loci, test, mindicts, nboots])

    ## prime the queue with up to one job per engine
    itests = iter(taxdicts)
    idx = 0
    for _ in range(len(ipyclient)):
        try:
            test = next(itests)
        except StopIteration:
            break
        _submit(idx, test)
        idx += 1

    ## block until finished, printing progress
    finished = 0
    try:
        while 1:
            ready = [job for (job, res) in asyncs.items() if res.ready()]
            for job in ready:
                rasync = asyncs[job]

                ## check for failures
                if not rasync.successful():
                    raise IPyradError(
                        " error: {}: {}".format(job, rasync.exception()))

                _res, _bot = rasync.result()

                ## a single-row result is a D4 test; .values replaces
                ## DataFrame.as_matrix(), which newer pandas no longer has
                if _res.shape[0] == 1:
                    resarr[job] = _res.T.values[:, 0]
                    bootsarr[job] = _bot

                ## anything else is stored whole as a D5 result
                else:
                    paneldict[job] = _res.T

                ## remove old job
                del asyncs[job]
                finished += 1

                ## submit next job if there is one
                try:
                    test = next(itests)
                except StopIteration:
                    continue
                _submit(idx, test)
                idx += 1

            ## report progress and stop once nothing is outstanding
            elap = datetime.timedelta(seconds=int(time.time()-start))
            printstr = " calculating D-stats  | {} | "
            progressbar(tot, finished, printstr.format(elap), spacer="")
            time.sleep(0.1)
            if not asyncs:
                print("")
                break

    except KeyboardInterrupt:
        ## cancel all jobs (ipy & multiproc modes) and then raise error.
        ## The abort is best-effort: a failure here must not hide the
        ## interrupt itself.
        try:
            ipyclient.abort()
        except Exception:
            pass
        raise

    ## The shape of the last test decides the output format. This is the
    ## test the submission loop ends on, read from the list so that it is
    ## defined even when nothing was submitted.
    if len(taxdicts[-1]) == 4:
        ## rows are already in test order, so no re-sorting is needed
        resarr = pd.DataFrame(
            resarr,
            index=range(tot),
            columns=[
                "dstat", "bootmean", "bootstd", "Z", "ABBA", "BABA", "nloci"],
        )
        return resarr, bootsarr

    ## order results dfs by job index
    listres = [paneldict[key] for key in range(len(paneldict))]

    ## make into a multi-index dataframe: three rows per test
    ntests = len(paneldict)
    multi_index = [
        np.array([[i] * 3 for i in range(ntests)]).flatten(),
        np.array(['p3', 'p4', 'shared'] * ntests),
    ]
    resarr = pd.DataFrame(
        data=pd.concat(listres).values,
        index=multi_index,
        columns=listres[0].columns,
    )
    return resarr, None