def _read_until(frec, line, found, what):
    """Return the first line, starting with *line* itself, for which found(line) is true.

    Further lines are read from *frec* as needed. Reaching the end of the file
    first raises ValueError instead of looping forever on the empty strings
    that readline() returns at EOF.
    """
    while not found(line):
        line = frec.readline()
        if not line:
            raise ValueError("unexpected end of file '%s' while looking for %s" % (frec.name, what))
    return line


def parseALERecFile(nfrec, reftreelen=None, restrictclade=None, skipEventFreq=False, skipLines=False, nsample=None, returnDict=False):
    """Parse a reconciliation output file: species tree, reconciled gene trees, event table.

    Parameters:
      nfrec         -- path of the reconciliation file to read.
      reftreelen    -- optional reference tree; when given it must have the same
                       topology as the species tree found in the file, and its
                       branch lengths are copied onto that species tree.
      restrictclade -- optional comma-separated node labels; the species tree is
                       restricted to the labels returned by get_children_labels()
                       on each of these nodes.
      skipEventFreq -- when true, the trailing per-node table is not parsed.
      skipLines     -- when true, the raw text of the gene tree lines is not kept.
      nsample       -- optional collection of 0-based indexes of the reconciled gene
                       trees to keep; None or empty keeps them all.
      returnDict    -- return a dict rather than a list.

    Returns [spetree, subspetree, lrecgt, recgtlines, restrictlabs, dnodeevt], or a
    dict with those same names as keys when returnDict is true.

    Raises:
      IndexError -- reftreelen does not have the topology of the file's species tree.
      ValueError -- the file ends before an expected section is found.
    """
    lrecgt = []
    recgtlines = []
    restrictlabs = []
    dnodeevt = {}
    # the context manager closes the file even when parsing fails half-way
    with open(nfrec, 'r') as frec:
        line = _read_until(frec, '', lambda l: l.startswith('S:\t'), "the species tree line ('S:')")
        # extract node labels from reconciled species tree
        spetree = tree2.AnnotatedNode(nwk=line.strip('\n').split('\t')[1], namesAsNum=True)
        spetree.complete_node_ids()
        if reftreelen:
            if not spetree.hasSameTopology(reftreelen):
                raise IndexError("reference tree from $2 has not the same topology as that extracted from reconciliation output file $1")
            for node in spetree:
                # extract branch length from topologically identical tree from $2
                matchclade = reftreelen.map_to_node(node.get_leaf_labels())
                node.set_lg(matchclade.lg())
        if restrictclade:
            for restrictnodelab in restrictclade.split(','):
                restrictlabs += spetree[restrictnodelab].get_children_labels()
            subspetree = spetree.restrictToLeaves(restrictlabs, force=True)
        else:
            subspetree = spetree
        line = _read_until(frec, line, lambda l: l.endswith('reconciled G-s:\n'), "the 'reconciled G-s:' header")
        for _ in range(2):
            line = frec.readline()  # skips 2 lines
        # extract reconciled gene tree(s); the block ends at the next '#' line
        k = 0
        while not line.startswith('#'):
            if not line:
                raise ValueError("unexpected end of file '%s' while reading reconciled gene trees" % nfrec)
            if (not nsample) or (k in nsample):
                if not skipLines:
                    recgtlines.append(line)
                rectree = tree2.AnnotatedNode(nwk=line.strip('\n'), namesAsNum=True)
                rectree.complete_node_ids()
                lrecgt.append(rectree)
            line = frec.readline()
            k += 1
        if not skipEventFreq:
            for _ in range(3):
                line = frec.readline()  # skips 3 lines
            # extract node-wise event frequency / copy number info:
            # second tab-separated field is the key, the remaining fields are floats
            for line in frec:
                if line == '\n':
                    continue
                lsp = line.strip('\n').split('\t')
                dnodeevt[lsp[1]] = [float(s) for s in lsp[2:]]
    if returnDict:
        return {'spetree':spetree, 'subspetree':subspetree, 'lrecgt':lrecgt, 'recgtlines':recgtlines, 'restrictlabs':restrictlabs, 'dnodeevt':dnodeevt}
    else:
        return [spetree, subspetree, lrecgt, recgtlines, restrictlabs, dnodeevt]
def loadRefPopTree(nfrefspetree, nfpop):
    """Load the reference species tree and annotate it using the definitions in nfpop.

    Lines of nfpop starting with '#' are ignored. Every other line yields a pair made
    of its first tab-separated field and the tuple of whitespace-separated tokens of
    its second field (presumably a population name and its member species -- confirm
    against annotatePopulationInSpeciesTree).

    The annotated tree is also written next to the input tree, with the text after
    the last '.' of nfrefspetree replaced by '_internalPopulations.nwk'.

    Returns the tuple (refspetree, dspe2pop).
    """
    # annotate reference species tree with ancestral population node labels
    with open(nfpop, 'r') as fpop:
        poprows = [row.rstrip('\n').split('\t') for row in fpop if not row.startswith('#')]
    lnamepops = [(cols[0], tuple(cols[1].split())) for cols in poprows]

    refspetree = tree2.AnnotatedNode(file=nfrefspetree)
    refspetree.complete_internal_labels(order=0, ffel=True)
    refspetree.complete_node_ids(force=True)
    # the tree is modified in place (returnCopy=False); the return value is not used
    annotatePopulationInSpeciesTree(refspetree, lnamepops, returnCopy=False, returnAncNodes=False)
    dspe2pop = getdspe2pop(lnamepops)

    stem = nfrefspetree.rsplit('.', 1)[0]
    nfrefspetreeout = stem+'_internalPopulations.nwk'
    refspetree.write_newick(nfrefspetreeout, ignoreBS=True)
    return (refspetree, dspe2pop)
return [field.strip('" ') for field in line.rstrip('\n').split('\t')] if len(sys.argv)<3: print "Usage: %s /path/to/lineage_module_event_table /path/to/reference_tree /path/to/output_folder [/path/to/reconciliation_folder]" sys.exit(2) nflnflineagecommevents = sys.argv[1] nfrefspetree = sys.argv[2] dirout = sys.argv[3] # optional argument to give context of gene family occurrence to gene lineage if len(sys.argv)>4: dirrec = sys.argv[4] else: dirrec = None refspetree = tree2.AnnotatedNode(file=nfrefspetree) with open(nflnflineagecommevents, 'r') as flnflineagecommevents: lnflineagecommevents = [line.rstrip('\n') for line in flnflineagecommevents] dfamspetree = {} for nflineagecommevents in lnflineagecommevents: flineagecommevents = open(nflineagecommevents, 'r') lineagecomm = os.path.basename(nflineagecommevents).rsplit('.', 1)[0] dirlineageout = os.path.join(dirout, lineagecomm) if not os.path.isdir(dirlineageout): os.mkdir(dirlineageout) curfamily = None curlineage = None curspetree = None dnodefreq = {} ltrans = []
def _find_alignment(diraln, bngt, aliformatin):
    """Return the first file of diraln matching the gene tree base name bngt.

    The pattern built from the first three characters of aliformatin is tried
    first (when aliformatin is set), then the generic '*aln*' pattern.
    Raises IndexError, naming the patterns, when nothing matches.
    """
    globalns = []
    if aliformatin:
        globalns.append("%s/%s*%s*" % (diraln, bngt, aliformatin[:3]))
    globalns.append("%s/%s*aln*" % (diraln, bngt))
    for globaln in globalns:
        lnfaln = glob.glob(globaln)
        if lnfaln:
            return lnfaln[0]
    raise IndexError("cannot find alignment file matching pattern(s): %s" % ', '.join(repr(g) for g in globalns))


def _load_identical_seqs(diridentseq, bngt):
    """Parse the (reference, redundant) pairs of identical sequences for gene tree bngt.

    Returns a dict mapping each reference label to the list of its redundant labels.
    Raises OSError when no file of diridentseq matches '*<bngt>*'.
    """
    globidseq = "%s/*%s*" % (diridentseq, bngt)
    gnfidseq = glob.glob(globidseq)
    if not gnfidseq:
        raise OSError("cannot find file matching pattern: '%s'" % globidseq)
    didseq = {}
    with open(gnfidseq[0], 'r') as fidseq:
        for line in fidseq:
            refidseq, redidseq = line.rstrip('\n').split('\t')
            didseq.setdefault(refidseq, []).append(redidseq)
    return didseq


def main(nfgenetree, diraln, dirout, outtag, mkdircons=True, **kw):
    """Mark the unresolved clades of one gene tree and write the derived outputs.

    Parameters:
      nfgenetree -- path of the gene tree file; for names starting with 'RAxML_' the
                    second dot-separated field of the base name identifies the tree,
                    otherwise the first one.
      diraln     -- folder searched for the matching alignment.
      dirout     -- root output folder.
      outtag     -- not used within this function body.
      mkdircons  -- when True, MrBayes constraint files go into a sub-folder named
                    after the alignment.
      **kw       -- options; 'aliformatin', 'diridentseq', 'isparallel' and
                    'format_color_tree' are read here, and the whole dict is forwarded
                    to mark_unresolved_clades() and
                    restrict_alignment_representative_leaves().

    Relies on the module-level names verbose, supressout, doutdext,
    currentmaxreccursdepths, adddepth and rmdepth.

    Raises:
      IndexError -- no alignment matches the gene tree name.
      OSError    -- 'diridentseq' is set but holds no file matching the gene tree name.
      ValueError -- the coloured-tree output format is neither xml/phyloxml nor nex/nexus.
    """
    aliformatin = kw.get('aliformatin')
    diridentseq = kw.get('diridentseq')
    isparallel = kw.get('isparallel')
    print(nfgenetree)
    bnspl = os.path.basename(nfgenetree).split('.')
    if bnspl[0].startswith('RAxML_'):
        bngt = bnspl[1]
    else:
        bngt = bnspl[0]
    nfaln = _find_alignment(diraln, bngt, aliformatin)
    print(nfaln)
    bnfaln = os.path.basename(nfaln).split('.')[0]
    if bnspl[0].startswith('RAxML_rootedTree'):
        # tree is already rooted, but the branch supports are stored in the comments
        genetree = tree2.AnnotatedNode(file=nfgenetree, keep_comments=True)
        for n in genetree:
            if str(n.comment()).isdigit():
                n.set_bs(float(n.comment()))
    else:
        # tree is unrooted
        genetree = tree2.read_check_newick(nfgenetree, treeclass='AnnotatedNode')
        # tree is interpreted here as trifurcated at the root ; root it.
        genetree.resolveNode(outgroups='subroot')

    # there will be deepcopy operation on the tree, either to save its state before pruning (pop) below or in select_clades.
    # a deepcopy operation on a recursive tree2.Node object induces a cycle of ~7 function calls per nested node
    # knowing that there are (2*n)-1 nodes in a tree (n being the number of tree leaves),
    # one should set the recursion limit >> 7*2*n ; on sequential calls, set it to 10*2*n to be on the safe side with overheads of higher level function calls.
    if not isparallel:
        adddepth(currentmaxreccursdepths, bngt, 10 * 2 * genetree.nb_leaves())
    try:
        # deal with potential information on sets of identical sequences
        if diridentseq:
            didseq = _load_identical_seqs(diridentseq, bngt)
        else:
            didseq = {}
        if didseq:
            # remove any redundant sequence from the gene tree before processing
            cleangenetree = copy.deepcopy(genetree)
            gtleaves = set(cleangenetree.get_leaf_labels())
            for refidseq, redidseqs in didseq.items():
                for redidseq in redidseqs:
                    if redidseq in gtleaves:
                        cleangenetree.pop(redidseq)
                        gtleaves.remove(redidseq)
        else:
            cleangenetree = genetree

        # detect unresolved clades
        constraintswithsingles = mark_unresolved_clades(cleangenetree, **kw)
        if verbose:
            print('constraintswithsingles = %s' % (constraintswithsingles,))
        # add the identical sequence map to the constrained clade definitions
        newconstraintsfromidseqs = []
        for refidseq, redidseqs in didseq.items():
            # scan for an existing clade that would contain the reference
            for c in constraintswithsingles:
                if refidseq in c:
                    c += redidseqs
                    break
            else:
                # no clade holds the reference: the identical sequences form their own clade
                newconstraintsfromidseqs.append([refidseq] + redidseqs)
        if verbose:
            print('newconstraintsfromidseqs = %s' % (newconstraintsfromidseqs,))
        # for reporting, filter out constraint clades that are just made of one leaf
        # (NB: these are useful for proper definition of other constraint clades, when nested, non-inclusive clades are allowed)
        constraints = [c for c in constraintswithsingles + newconstraintsfromidseqs if len(c) > 1]
        if verbose:
            print('constraints = %s' % (constraints,))

        # write out subalignments and the main alignment with collapsed clades
        loutgroups = restrict_alignment_representative_leaves(constraints, genetree, nfaln, dirout, radout=bnfaln, selectRepr=0, didseq=didseq, **kw)

        if 'mbc' not in supressout:
            mbcoutd, mbcext = doutdext['mbc']
            if mkdircons is True:
                mbcoutd = os.path.join(mbcoutd, bnfaln)
            dout = os.path.join(dirout, mbcoutd)
            if not os.path.isdir(dout):
                try:
                    os.makedirs(dout)
                except OSError:
                    # another worker may have created it in the meantime
                    if not os.path.isdir(dout):
                        raise
            for i, constraint in enumerate(constraints):
                cladename = "clade%d" % i
                # write out MrBayes clade constraint for the sub-alignment, in order to compute subalignment samples and/or ancestral sequence
                write_out_MrBayes_clade_constraints([constraint], loutgroups[i],
                                                    os.path.join(dout, bnfaln + '-' + cladename + '.' + mbcext),
                                                    ilist=[i], verbose=verbose)

        if 'cgt' not in supressout:
            fmtcoltree = kw.get('format_color_tree')
            # a missing option must end in the ValueError below, not in an AttributeError on None
            fmt = (fmtcoltree or '').lower()
            cgtoutd, cgtext = doutdext['cgt']
            colour_tree_with_constrained_clades(genetree, constraints, force=True)
            genetree.complete_internal_labels(order=0, ffel=True)
            if fmt in ['xml', 'phyloxml']:
                genetree.write_phyloXML(os.path.join(dirout, cgtoutd, bnfaln + '-%s.xml' % cgtext), ignoreBS=True)
            elif fmt in ['nex', 'nexus']:
                genetree.write_nexus(os.path.join(dirout, cgtoutd, bnfaln + '-%s.nex' % cgtext), ignoreBS=True)
            else:
                raise ValueError("specified format '%s' for output coloured-branch tree is not valid; please select among '[phylo]xml' or 'nex[us]'" % fmtcoltree)
    finally:
        # done risking going over recursion limit; undo the registration even on failure
        if not isparallel:
            rmdepth(currentmaxreccursdepths, bngt)
def _graft_outgroup(reftree, outlabel):
    """Return a new root joining reftree with a single leaf labelled outlabel.

    Branch lengths are scaled on reftree.max_leaf_distance(): three times that
    distance for the new leaf and twice for the branch leading to reftree.
    """
    maxd = reftree.max_leaf_distance()
    outgroup = tree2.AnnotatedNode(lleaves=[outlabel])
    outgroup.get_children()[0].set_lg(maxd * 3)
    outgroup.link_child(reftree, newlen=maxd * 2)
    outgroup.complete_internal_labels(prefix='')
    return outgroup


def main(nfrec, nfreftree, nfgenetree, maxrecgt=1, recformat='tera', sgsep='_', phylofact=1000.0, restrictclade=None, verbose=False, **kw):
    """Draw each reconciliation of nfrec onto the species tree, one SVG file each.

    Parameters:
      nfrec         -- reconciliation file; also the prefix of the output file names.
      nfreftree     -- species tree file.
      nfgenetree    -- gene tree file; read again without branch lengths when the
                       first attempt raises ValueError.
      maxrecgt      -- not used within this function body.
      recformat     -- 'tera' or 'mowgli'; selects which label the outgroup /
                       dead-lineage branch must carry (deadlabnum or outtaxlab).
      sgsep, verbose, **kw -- forwarded to parseTERARecFile(); kw['noDeadStories']
                       is also read here.
      phylofact     -- forwarded to writeSvgTree().
      restrictclade -- when set, the tree that is drawn is
                       reftree.restrictToLeaves(restrictclade).

    Relies on the module-level names deadlabnum and outtaxlab.
    """
    try:
        genetree = tree2.Node(file=nfgenetree, namesAsNum=True)
    except ValueError:
        genetree = tree2.Node(file=nfgenetree, namesAsNum=True, branch_lengths=False)
    reftree = tree2.AnnotatedNode(file=nfreftree, namesAsNum=True)
    if restrictclade:
        st = reftree.restrictToLeaves(restrictclade)
    else:
        st = reftree
    # check presence of outgroup/dead lineage branch if necessary
    # NOTE(review): when an outgroup is grafted, only `reftree` is rebound; `st`,
    # the tree drawn below, still refers to the tree without it -- confirm this
    # is intended.
    if recformat == 'tera':
        if not (kw.get('noDeadStories') or (deadlabnum in st.get_leaf_labels())):
            if outtaxlab in st.get_leaf_labels():
                # must adapt mowgli-compliant species tree
                st[outtaxlab].edit_label(deadlabnum)
            else:
                reftree = _graft_outgroup(reftree, deadlabnum)
    elif recformat == 'mowgli':
        if outtaxlab not in st.get_leaf_labels():
            if deadlabnum in st.get_leaf_labels():
                # must adapt tera-compliant species tree
                st[deadlabnum].edit_label(outtaxlab)
            else:
                # the helper computes the scaling distance itself, which this
                # branch previously used without ever assigning it
                reftree = _graft_outgroup(reftree, outtaxlab)

    recs = parseTERARecFile(nfrec, genetree=genetree, recformat=recformat, sgsep=sgsep, verbose=verbose, **kw)
    for i, rec in enumerate(recs):
        dnodefreq, dlevt = rec
        # write SVG species tree
        tag = '_no_dead' if kw.get('noDeadStories') else ''
        nfoutspe = '%s_%d_maprec2spetree%s.svg' % (nfrec, i, tag)
        # frequencies of the leaf nodes only, passed as counts
        lleaffreq = [(lab, f) for lab, f in dnodefreq.items() if st[lab].is_leaf()]
        st.writeSvgTree(nfoutspe, padleaves=True, supports=False, phylofact=phylofact, branchwidths=dnodefreq, textorbit=5,
                        treetype='species', transfers=dlevt['T'], duplications=dlevt['D'], losses=dlevt['L'], counts=lleaffreq,
                        transferwidth='freq', modstyle="stroke-width:1; ", padstyle="stroke:red; stroke-width:0.5; stroke-dasharray:1,1; ")
        print(os.path.basename(nfoutspe))