def __init__(self, newick, constraint_dict, constraint_exact):
    """Build the set of four-taxon test splits implied by a rooted tree.

    Parameters are stored on the instance and `self.loop()` is invoked at
    the end to perform the traversal that fills `self.testset`.
    """
    # accumulators for four-taxon splits discovered during traversal
    self.testset = set()
    self.hold = [0, 0, 0, 0]

    # parse the input; traversal requires a rooted, resolved topology
    self.tree = toytree.tree(newick)
    if not self.tree.is_rooted():
        raise IPyradError(
            "generate_tests_from_tree(): tree must be rooted and resolved")

    # one (possibly empty) constraint list per test position
    self.cdict = OrderedDict((key, []) for key in ("p1", "p2", "p3", "p4"))
    if constraint_dict:
        self.cdict.update(constraint_dict)

    # normalize the exactness flag: a lone bool applies to every position
    self.xdict = constraint_exact
    if isinstance(self.xdict, bool):
        self.xdict = [self.xdict] * 4
    if isinstance(self.xdict, list) and len(self.xdict) != len(self.cdict):
        raise Exception(
            "constraint_exact must be bool or list of bools length N")

    # run the traversal to populate the test set
    self.loop()
def _run(self, force=False, ipyclient=None): "Function to distribute jobs to ipyclient" # load balancer lbview = ipyclient.load_balanced_view() # check that tests are OK if not self.tests: raise IPyradError("no tests found") if isinstance(self.tests, dict): self.tests = [self.tests] # check that mindict is OK if isinstance(self.minmap, int): self.minmap = {i: self.minmap for i in self.imap} if not self.minmap: self.minmap = {i: 1 for i in self.imap} # send jobs to the client (but not all at once b/c njobs can be huge) rasyncs = {} idx = 0 for i in range(len(ipyclient)): # next entries unless fewer than len ipyclient, skip try: test = next(itests) mindict = next(imdict) except StopIteration: continue rasyncs[idx] = lbview.apply( dstat, *[loci, test, mindict, self.params.nboots]) idx += 1
def tree2tests(newick, constraint_dict, constraint_exact):
    """
    Return a list of dicts for all possible four-taxon splits in a tree,
    each dict mapping 'p1'..'p4' to lists of tip (leaf) names.

    Assumes the user has entered a rooted tree. Skips polytomies.

    Parameters
    ----------
    newick : str or tree
        Input passed to toytree.tree(); must be rooted.
    constraint_dict : dict or None
        Optional mapping of 'p1'..'p4' to name lists that constrain which
        clades may occupy each position (checked via test_constraint).
    constraint_exact : bool or list of 4 bools
        Whether each position must match its constraint exactly; a single
        bool is broadcast to all four positions.
    """
    # make tree
    tree = toytree.tree(newick)
    if not tree.is_rooted():
        raise IPyradError(
            "Input tree must be rooted to use generate_tests_from_tree()")

    # store results: set of (p4, p3, p2, p1) node-name tuples used to
    # de-duplicate tests discovered by different traversal paths
    testset = set()

    # constraints fill in empty (empty list = unconstrained position)
    cdict = OrderedDict((i, []) for i in ["p1", "p2", "p3", "p4"])
    if constraint_dict:
        cdict.update(constraint_dict)

    # expand constraint_exact to a list of 4 if a single bool was given
    if isinstance(constraint_exact, bool):
        constraint_exact = [constraint_exact] * 4
    if isinstance(constraint_exact, list):
        if len(constraint_exact) != len(cdict):
            raise Exception(
                "constraint_exact must be bool or list of bools of length N")

    # traverse root to tips. Treat the left as outgroup, then the right.
    tests = []

    # topnode must have children. All traversals use default "levelorder".
    # Nesting: choose p4 (outgroup side), then p3 from its sister clade,
    # then p2 from p3's sister, then p1 from p2's sister — every candidate
    # at each level is a node whose leaves form the taxon set.
    for topnode in tree.treenode.traverse():
        for oparent in topnode.children:
            for onode in oparent.traverse("levelorder"):
                if test_constraint(onode, cdict, "p4", constraint_exact[3]):

                    ## p123 parent is sister to oparent
                    p123parent = oparent.get_sisters()[0]
                    for p123node in p123parent.traverse("levelorder"):
                        for p3parent in p123node.children:
                            for p3node in p3parent.traverse("levelorder"):
                                if test_constraint(p3node, cdict, "p3", constraint_exact[2]):

                                    ## p12 parent is sister to p3parent
                                    p12parent = p3parent.get_sisters()[0]
                                    for p12node in p12parent.traverse("levelorder"):
                                        for p2parent in p12node.children:
                                            for p2node in p2parent.traverse("levelorder"):
                                                if test_constraint(p2node, cdict, "p2", constraint_exact[1]):

                                                    ## p1 parent is sister to p2parent
                                                    p1parent = p2parent.get_sisters()[0]
                                                    for p1node in p1parent.traverse("levelorder"):
                                                        if test_constraint(p1node, cdict, "p1", constraint_exact[0]):
                                                            # identify this split by its four node names
                                                            # so duplicates are skipped below
                                                            x = (onode.name, p3node.name, p2node.name, p1node.name)
                                                            test = {}
                                                            test['p4'] = onode.get_leaf_names()
                                                            test['p3'] = p3node.get_leaf_names()
                                                            test['p2'] = p2node.get_leaf_names()
                                                            test['p1'] = p1node.get_leaf_names()
                                                            if x not in testset:
                                                                tests.append(test)
                                                                testset.add(x)
    return tests
def _loci_to_arr(loci, taxdict, mindict):
    """
    Return a site-frequency array and keep-mask from parsed loci.

    Parameters
    ----------
    loci : list of str
        Each element is one locus: newline-separated "name sequence" rows;
        the final line of each locus is dropped during parsing.
    taxdict : dict
        Maps 'p1'..'p4' (optionally 'p5') to lists of sample names.
    mindict : int, dict, or falsy
        Minimum samples required per taxon: an int broadcasts to all taxa,
        a dict is subset to taxdict's keys, and a falsy value defaults to 1.

    Returns
    -------
    (arr, keep) : tuple of ndarray
        arr has shape (nkept, 4, maxlen) for 4-taxon tests or
        (nkept, 6, maxlen) for 5-taxon tests (extra rows for each p3 and
        the fused p3 ancestor); keep is a boolean mask over the input loci
        marking which passed the coverage filter.
    """
    ## make the output array and a mask array to remove loci without cov.
    ## maxlen is the longest sequence among the first rows of all loci.
    ## (fix: maxlen was previously computed twice identically)
    nloci = len(loci)
    maxlen = np.max(np.array([len(locus.split("\n")[0]) for locus in loci]))
    keep = np.zeros(nloci, dtype=np.bool_)
    arr = np.zeros((nloci, 4, maxlen), dtype=np.float64)

    ## six rows b/c one for each p3, and for the fused p3 ancestor
    if len(taxdict) == 5:
        arr = np.zeros((nloci, 6, maxlen), dtype=np.float64)

    ## if not mindict, make one that requires 1 in each taxon
    if isinstance(mindict, int):
        mindict = {i: mindict for i in taxdict}
    elif isinstance(mindict, dict):
        mindict = {i: mindict[i] for i in taxdict}
    else:
        mindict = {i: 1 for i in taxdict}

    ## raise error if names are not 'p[int]'
    allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
    if any(i not in allowed_names for i in taxdict):
        raise IPyradError(
            "keys in taxdict must be named 'p1' through 'p4' or 'p5'")

    ## parse key names; the highest-numbered key is the outgroup
    keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])
    outg = keys[-1]

    ## grab seqs just for the good guys
    for loc in range(nloci):
        ## parse the locus into names and a 2-D character array
        lines = loci[loc].split("\n")[:-1]
        names = [i.split()[0] for i in lines]
        seqs = np.array([list(i.split()[1]) for i in lines])

        ## check that names cover the taxdict (still need to check by site)
        covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax]
                for tax in taxdict]

        ## keep locus only if every taxon meets its minimum coverage
        if all(covs):
            keep[loc] = True

            ## get the refseq: infer ancestral states from the outgroup rows
            refidx = np.where([i in taxdict[outg] for i in names])[0]
            refseq = seqs[refidx].view(np.uint8)
            ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])

            ## freq of ref in outgroup, stored in the last row
            iseq = _reffreq2(ancestral, refseq, GETCONS2)
            arr[loc, -1, :iseq.shape[1]] = iseq

            ## enter 4-taxon freqs
            if len(taxdict) == 4:
                for tidx, key in enumerate(keys[:-1]):
                    ## get idx of names in test tax
                    nidx = np.where([i in taxdict[key] for i in names])[0]
                    sidx = seqs[nidx].view(np.uint8)

                    ## get freq of sidx
                    iseq = _reffreq2(ancestral, sidx, GETCONS2)

                    ## fill it in
                    arr[loc, tidx, :iseq.shape[1]] = iseq

            else:
                ## enter p5; and fill it in (last row, same as outgroup above)
                iseq = _reffreq2(ancestral, refseq, GETCONS2)
                arr[loc, -1, :iseq.shape[1]] = iseq

                ## enter p1
                nidx = np.where([i in taxdict['p1'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 0, :iseq.shape[1]] = iseq

                ## enter p2
                nidx = np.where([i in taxdict['p2'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 1, :iseq.shape[1]] = iseq

                ## enter p3 with p4 masked, and p4 with p3 masked: sites
                ## where the other taxon is derived are zeroed out
                nidx = np.where([i in taxdict['p3'] for i in names])[0]
                nidy = np.where([i in taxdict['p4'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                sidy = seqs[nidy].view(np.uint8)
                xseq = _reffreq2(ancestral, sidx, GETCONS2)
                yseq = _reffreq2(ancestral, sidy, GETCONS2)
                mask3 = xseq != 0
                mask4 = yseq != 0
                xseq[mask4] = 0
                yseq[mask3] = 0
                arr[loc, 2, :xseq.shape[1]] = xseq
                arr[loc, 3, :yseq.shape[1]] = yseq

                ## enter p34: the fused p3+p4 ancestor (pooled samples)
                nidx = nidx.tolist() + nidy.tolist()
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 4, :iseq.shape[1]] = iseq

    ## size-down array to the number of loci that have taxa for the test
    arr = arr[keep, :, :]

    ## size-down sites to remove all-null columns
    arr = masknulls(arr)

    return arr, keep
def plot(self, show_test_labels=True, use_edge_lengths=False, collapse_outgroup=False, pct_tree_x=0.5, pct_tree_y=0.2, subset_tests=None, prune_tree_to_tests=False, *args, **kwargs): """ Draw a multi-panel figure with tree, tests, and results Parameters: ----------- height: int ... width: int ... show_test_labels: bool ... use_edge_lengths: bool ... collapse_outgroups: bool ... pct_tree_x: float ... pct_tree_y: float ... subset_tests: list ... """ ## check for attributes if not self.newick: raise IPyradError("baba plot requires a newick treefile") if not self.tests: raise IPyradError("baba plot must have a .tests attribute") ## ensure tests is a list if isinstance(self.tests, dict): self.tests = [self.tests] # re-decompose the tree ttree = toytree.tree(self.newick) # subset test to show fewer if subset_tests is not None: #tests = self.tests[subset_tests] tests = [self.tests[i] for i in subset_tests] boots = self.results_boots[subset_tests] else: tests = self.tests boots = self.results_boots ## if prune tree if prune_tree_to_tests: alltesttaxa = set(itertools.chain(*self.taxon_table.values[0])) ttree = ttree.drop_tips( [i for i in ttree.get_tip_labels() if i not in alltesttaxa]) ttree.tree.ladderize() ## make the plot canvas, axes, panel = baba_panel_plot( ttree=ttree, tests=tests, boots=boots, show_test_labels=show_test_labels, use_edge_lengths=use_edge_lengths, collapse_outgroup=collapse_outgroup, pct_tree_x=pct_tree_x, pct_tree_y=pct_tree_y, *args, **kwargs) return canvas, axes, panel
def batch(baba, ipyclient=None):
    """
    Distribute D-statistic jobs to the parallel client and collect results.

    Parameters
    ----------
    baba : object
        Object holding .data (loci file path or ms generator), .tests
        (one taxdict or a list of them), and .params (.mincov, .nboots).
    ipyclient : ipyparallel.Client
        Client whose engines run the `dstat` jobs.

    Returns
    -------
    (resarr, bootsarr) : (DataFrame, ndarray) for 4-taxon tests, or
    (resarr, None) with a multi-index DataFrame for 5-taxon tests.
    Returns None (after printing) if no tests are found.
    """
    # parse args
    handle = baba.data
    taxdicts = baba.tests
    mindicts = baba.params.mincov
    nboots = baba.params.nboots

    ## load-balanced view over engines
    ## (fix: lbview was used below but never defined)
    lbview = ipyclient.load_balanced_view()

    ## if ms generator make into reusable list
    sims = 0
    if isinstance(handle, types.GeneratorType):
        handle = list(handle)
        sims = 1
    else:
        ## expand locifile path to full path
        handle = os.path.realpath(handle)

    ## a single taxdict is treated as one test
    names = []
    if isinstance(taxdicts, dict):
        taxdicts = [taxdicts]

    ## arrays to hold results: 7 summary stats per test + bootstraps
    tot = len(taxdicts)
    resarr = np.zeros((tot, 7), dtype=np.float64)
    bootsarr = np.zeros((tot, nboots), dtype=np.float64)
    paneldict = {}

    ## submit jobs to run on the cluster queue
    start = time.time()
    asyncs = {}
    idx = 0

    ## prepare data before sending to engines:
    ## if it's a str (locifile) then parse it here just once.
    if isinstance(handle, str):
        with open(handle, 'r') as infile:
            loci = infile.read().strip().split("|\n")
    if isinstance(handle, list):
        pass  # sims()

    ## iterate over tests (repeats mindicts if fewer than taxdicts)
    if not taxdicts:
        print("no tests found")
        return
    else:
        itests = iter(taxdicts)
        imdict = itertools.cycle([mindicts])

        ## submit an initial wave of at most one job per engine
        for i in range(len(ipyclient)):

            ## next entries unless fewer than len ipyclient, skip
            try:
                test = next(itests)
                mindict = next(imdict)
            except StopIteration:
                continue

            ## if it's sim data then convert to an array
            if sims:
                loci = _msp_to_arr(handle, test)
                args = (loci, test, mindict, nboots)
                print("not yet implemented")
                #asyncs[idx] = lbview.apply_async(dstat, *args)
            else:
                args = [loci, test, mindict, nboots]
                asyncs[idx] = lbview.apply(dstat, *args)
            idx += 1

    ## block until finished, print progress if requested.
    finished = 0
    try:
        while 1:
            keys = [i for (i, j) in asyncs.items() if j.ready()]
            ## check for failures
            for job in keys:
                if not asyncs[job].successful():
                    raise IPyradError(
                        " error: {}: {}".format(job, asyncs[job].exception()))
                ## enter results for successful jobs
                else:
                    _res, _bot = asyncs[job].result()

                    ## store D4 results
                    ## (fix: DataFrame.as_matrix() was removed in pandas
                    ## 1.0; .values is the equivalent accessor)
                    if _res.shape[0] == 1:
                        resarr[job] = _res.T.values[:, 0]
                        bootsarr[job] = _bot

                    ## or store D5 results
                    else:
                        paneldict[job] = _res.T

                    ## remove old job
                    del asyncs[job]
                    finished += 1

                    ## submit next job if there is one.
                    try:
                        test = next(itests)
                        mindict = next(imdict)
                        if sims:
                            loci = _msp_to_arr(handle, test)
                            args = (loci, test, mindict, nboots)
                            print("not yet implemented")
                            #asyncs[idx] = lbview.apply_async(dstat, *args)
                        else:
                            args = [loci, test, mindict, nboots]
                            asyncs[idx] = lbview.apply(dstat, *args)
                        idx += 1
                    except StopIteration:
                        pass

            ## print progress and break once every job has been reaped
            elap = datetime.timedelta(seconds=int(time.time()-start))
            printstr = " calculating D-stats | {} | "
            progressbar(tot, finished, printstr.format(elap), spacer="")
            time.sleep(0.1)
            if not asyncs:
                print("")
                break

    except KeyboardInterrupt as inst:
        ## cancel all jobs (ipy & multiproc modes) and then raise error
        try:
            ipyclient.abort()
        except Exception:
            pass
        raise inst

    ## dress up resarr as a Pandas DataFrame if 4-part test
    if len(test) == 4:
        if not names:
            names = range(len(taxdicts))
        resarr = pd.DataFrame(resarr,
            index=names,
            columns=["dstat", "bootmean", "bootstd", "Z", "ABBA",
                     "BABA", "nloci"])

        ## sort results and bootsarr to match if test names were supplied
        resarr = resarr.sort_index()
        order = [list(resarr.index).index(i) for i in names]
        bootsarr = bootsarr[order]
        return resarr, bootsarr
    else:
        ## order results dfs by job index
        listres = []
        for key in range(len(paneldict)):
            listres.append(paneldict[key])

        ## make into a multi-index dataframe: (test index, row label)
        ntests = len(paneldict)
        multi_index = [
            np.array([[i] * 3 for i in range(ntests)]).flatten(),
            np.array(['p3', 'p4', 'shared'] * ntests),
        ]
        resarr = pd.DataFrame(
            data=pd.concat(listres).values,
            index=multi_index,
            columns=listres[0].columns,
        )
        return resarr, None