def _readLineUntil(frec, found, what):
	"""Read lines from `frec` until `found(line)` is true and return that line.

	Raises ValueError when the end of the file is reached first: readline()
	returns '' indefinitely at EOF, so an unguarded search loop would never end.
	"""
	while True:
		line = frec.readline()
		if not line:
			raise ValueError("unexpected end of file '%s' while looking for %s" % (getattr(frec, 'name', '?'), what))
		if found(line):
			return line

def parseALERecFile(nfrec, reftreelen=None, restrictclade=None, skipEventFreq=False, skipLines=False, nsample=None, returnDict=False):
	"""Parse a reconciliation output file into species tree, gene trees and event table.

	Arguments:
	  nfrec         -- path to the reconciliation file.
	  reftreelen    -- optional tree with the same topology as the species tree
	                   found in the file; its branch lengths are copied onto it.
	  restrictclade -- optional comma-separated node labels of the species tree;
	                   the labels returned by get_children_labels() for each of
	                   them are used to build the restricted species tree.
	  skipEventFreq -- if true, the trailing node-wise event table is not read.
	  skipLines     -- if true, the raw text lines of the gene trees are not stored.
	  nsample       -- optional collection of 0-based indexes of the lines of the
	                   gene tree section to keep; empty or None keeps them all.
	  returnDict    -- return a dict instead of a list.

	Returns [spetree, subspetree, lrecgt, recgtlines, restrictlabs, dnodeevt],
	or a dict with these same names as keys when `returnDict` is true.
	`dnodeevt` maps the second tab-separated field of each table row to the
	list of floats parsed from the remaining fields.

	Raises:
	  ValueError -- the file ends before the 'S:' line or the
	                'reconciled G-s:' header is found.
	  IndexError -- `reftreelen` does not have the topology of the species tree.
	"""
	lrecgt = []
	restrictlabs = []
	recgtlines = []
	dnodeevt = {}
	# the context manager closes the file even when parsing fails midway
	with open(nfrec, 'r') as frec:
		line = _readLineUntil(frec, lambda l: l.startswith('S:\t'), "the species tree line (starting with 'S:<tab>')")
		# extract node labels from reconciled species tree
		spetree = tree2.AnnotatedNode(nwk=line.strip('\n').split('\t')[1], namesAsNum=True)
		spetree.complete_node_ids()
		if reftreelen:
			if not spetree.hasSameTopology(reftreelen):
				raise IndexError("reference tree from $2 has not the same topology as that extracted from reconciliation output file $1")
			for node in spetree:
				# extract branch length from topologically identical tree from $2
				matchclade = reftreelen.map_to_node(node.get_leaf_labels())
				node.set_lg(matchclade.lg())
		if restrictclade:
			for restrictnodelab in restrictclade.split(','):
				restrictlabs += spetree[restrictnodelab].get_children_labels()
			subspetree = spetree.restrictToLeaves(restrictlabs, force=True)
		else:
			subspetree = spetree
		_readLineUntil(frec, lambda l: l.endswith('reconciled G-s:\n'), "the 'reconciled G-s:' header")
		# the line following the header is dropped; the next one is the first gene tree
		frec.readline()
		line = frec.readline()
		# extract reconciled gene tree(s); the section ends at the first line
		# starting with '#', or at the end of the file
		k = 0
		while line and not line.startswith('#'):
			if (not nsample) or (k in nsample):
				if not skipLines: recgtlines.append(line)
				rectree = tree2.AnnotatedNode(nwk=line.strip('\n'), namesAsNum=True)
				rectree.complete_node_ids()
				lrecgt.append(rectree)
			line = frec.readline()
			k += 1
		if not skipEventFreq:
			for i in range(3): frec.readline() # skips 3 lines
			# extract node-wise event frequency / copy number info
			for line in frec:
				if line=='\n': continue
				lsp = line.strip('\n').split('\t')
				dnodeevt[lsp[1]] = [float(s) for s in lsp[2:]]
	if returnDict:
		return {'spetree':spetree, 'subspetree':subspetree, 'lrecgt':lrecgt, 'recgtlines':recgtlines, 'restrictlabs':restrictlabs, 'dnodeevt':dnodeevt}
	else:
		return [spetree, subspetree, lrecgt, recgtlines, restrictlabs, dnodeevt]
Exemple #2
0
def loadRefPopTree(nfrefspetree, nfpop):
	"""Load the reference species tree and annotate it with population labels.

	Arguments:
	  nfrefspetree -- path of the reference species tree file.
	  nfpop        -- path of a tab-separated file; lines starting with '#' are
	                  ignored. The first field of a line is kept as a name and
	                  the second field is split on whitespace into a tuple
	                  (presumably population name and member species -- confirm
	                  against annotatePopulationInSpeciesTree()).

	Side effect: the annotated tree is written next to the input tree, in a
	file named after `nfrefspetree` with its last extension replaced by
	'_internalPopulations.nwk'.

	Returns the tuple (refspetree, dspe2pop), where dspe2pop is the value
	returned by getdspe2pop() for the parsed (name, members) list.
	"""
	lnamepops = []
	with open(nfpop, 'r') as fpop:
		for line in fpop:
			if line.startswith('#'): continue
			lsp = line.rstrip('\n').split('\t')
			lnamepops.append((lsp[0], tuple(lsp[1].split())))
	refspetree = tree2.AnnotatedNode(file=nfrefspetree)
	refspetree.complete_internal_labels(order=0, ffel=True)
	refspetree.complete_node_ids(force=True)
	# the tree is annotated in place (returnCopy=False), so the return value is not needed
	annotatePopulationInSpeciesTree(refspetree, lnamepops, returnCopy=False, returnAncNodes=False)
	dspe2pop = getdspe2pop(lnamepops)
	nfrefspetreeout = nfrefspetree.rsplit('.', 1)[0]+'_internalPopulations.nwk'
	refspetree.write_newick(nfrefspetreeout, ignoreBS=True)
	return (refspetree, dspe2pop)

# three positional arguments are mandatory (sys.argv[1] to sys.argv[3]),
# so anything shorter than 4 items (script name included) is a usage error
if len(sys.argv)<4:
	print("Usage: %s /path/to/lineage_module_event_table /path/to/reference_tree /path/to/output_folder [/path/to/reconciliation_folder]" % sys.argv[0])
	sys.exit(2)

nflnflineagecommevents = sys.argv[1]
nfrefspetree = sys.argv[2]
dirout = sys.argv[3]
# optional argument to give context of gene family occurrence to gene lineage
if len(sys.argv)>4:
	dirrec = sys.argv[4]
else:
	dirrec = None

refspetree = tree2.AnnotatedNode(file=nfrefspetree)

# the first argument is a list file: one path of a lineage event table per line
with open(nflnflineagecommevents, 'r') as flnflineagecommevents:
	lnflineagecommevents = [line.rstrip('\n') for line in flnflineagecommevents]

# NOTE(review): presumably a cache of species trees keyed by gene family; it is not used in the visible part of the loop -- confirm
dfamspetree = {}
# process each lineage event table listed in the input list file
for nflineagecommevents in lnflineagecommevents:
	# NOTE(review): the handle is opened without a context manager and no close() is visible in this excerpt
	flineagecommevents = open(nflineagecommevents, 'r')
	# the table's base name, stripped of its last extension, names the per-table output folder
	lineagecomm = os.path.basename(nflineagecommevents).rsplit('.', 1)[0]
	dirlineageout = os.path.join(dirout, lineagecomm)
	if not os.path.isdir(dirlineageout): os.mkdir(dirlineageout)
	# per-table state, reset for every input table
	curfamily = None
	curlineage = None
	curspetree = None
	dnodefreq = {}
	ltrans = []
Exemple #4
0
def main(nfgenetree, diraln, dirout, outtag, mkdircons=True, **kw):
    """Collapse unresolved clades of a gene tree and write the derived outputs.

    Arguments:
      nfgenetree -- path of the gene tree file; when its base name starts with
                    'RAxML_', the second dot-separated field is used as the
                    gene tree identifier, otherwise the first one.
      diraln     -- folder searched (with glob) for the matching alignment.
      dirout     -- root folder for the outputs.
      outtag     -- not referenced in this function body.
      mkdircons  -- when True, MrBayes constraint files go into a sub-folder
                    named after the alignment, created if missing.
      **kw       -- options; this function reads 'aliformatin', 'diridentseq',
                    'isparallel' and 'format_color_tree', and forwards the
                    whole dict to mark_unresolved_clades() and
                    restrict_alignment_representative_leaves().

    Relies on the module-level names `verbose`, `supressout`, `doutdext` and
    `currentmaxreccursdepths`.

    Raises:
      IndexError -- no alignment file matches either glob pattern.
      OSError    -- 'diridentseq' is set but no file matches the gene tree id.
      ValueError -- 'format_color_tree' is not one of the supported formats.
    """
    aliformatin = kw.get('aliformatin')
    diridentseq = kw.get('diridentseq')
    isparallel = kw.get('isparallel')
    print nfgenetree
    bnspl = os.path.basename(nfgenetree).split('.')
    if bnspl[0].startswith('RAxML_'): bngt = bnspl[1]
    else: bngt = bnspl[0]
    # look for the alignment by gene tree id and the first 3 letters of the
    # alignment format; fall back on any file name containing 'aln'
    globaln = "%s/%s*%s*" % (diraln, bngt, aliformatin[:3])
    try:
        nfaln = glob.glob(globaln)[0]
    except IndexError:
        globaln = "%s/%s*aln*" % (diraln, bngt)
        nfaln = glob.glob(globaln)[0]
    print nfaln
    bnfaln = os.path.basename(nfaln).split('.')[0]
    if bnspl[0].startswith('RAxML_rootedTree'):
        # tree is already rooted, but the branch supports are stored in the comments
        genetree = tree2.AnnotatedNode(file=nfgenetree, keep_comments=True)
        for n in genetree:
            if str(n.comment()).isdigit():
                n.set_bs(float(n.comment()))
    else:
        # tree is unrooted
        genetree = tree2.read_check_newick(nfgenetree,
                                           treeclass='AnnotatedNode')
        # tree is interpreted here as trifurcated at the root ; root it.
        genetree.resolveNode(outgroups='subroot')
    # there will be deepcopy operation on the tree, either to save its state before pruning (pop) below or in select_clades.
    # a deepcopy operation on a recursive tree2.Node object induces a cycle of ~7 function calls per nested node
    # knowing that there are (2*n)-1 nodes in a tree (n being the number of tree leaves),
    # one should set the recursion limit >> 7*2*n ; on sequential calls, set it to 10*2*n to be on the safe side with overheads of higher level function calls.
    if not isparallel:
        adddepth(currentmaxreccursdepths, bngt, 10 * 2 * genetree.nb_leaves())
    # deal with potential information on sets of identical sequences
    # didseq maps a reference sequence to the list of sequences identical to it
    didseq = {}
    if diridentseq:
        # parse pairs of (reference, redundant) sequences that are identical
        globidseq = "%s/*%s*" % (diridentseq, bngt)
        gnfidseq = glob.glob(globidseq)
        if not gnfidseq:
            raise OSError, "cannot find file matching pattern: '%s'" % globidseq
        nfidseq = gnfidseq[0]
        with open(nfidseq, 'r') as fidseq:
            for line in fidseq:
                # each line must hold exactly two tab-separated fields
                refidseq, redidseq = line.rstrip('\n').split('\t')
                didseq.setdefault(refidseq, []).append(redidseq)
        if didseq:
            # remove any redundant sequence from the gene tree before processing
            # (done on a deep copy: the full genetree is still used further down)
            cleangenetree = copy.deepcopy(genetree)
            gtleaves = set(cleangenetree.get_leaf_labels())
            for refidseq, redidseqs in didseq.iteritems():
                for redidseq in redidseqs:
                    if redidseq in gtleaves:
                        cleangenetree.pop(redidseq)
                        gtleaves.remove(redidseq)
        else:
            cleangenetree = genetree
    else:
        cleangenetree = genetree
    # detect unresolved clades
    constraintswithsingles = mark_unresolved_clades(
        cleangenetree, **kw)  #, pruneSelected=True, inclusive=True
    if verbose: print 'constraintswithsingles =', constraintswithsingles
    # add the identical sequence map to the constrained clades definitions
    newconstraintsfromidseqs = []
    for refidseq, redidseqs in didseq.iteritems():
        # scan for existing clade that would contain the reference
        for c in constraintswithsingles:
            if refidseq in c:
                # extends the existing constraint in place with the redundant sequences
                c += redidseqs
                break
        else:
            # no existing clade holds the reference: the identical sequences form a new constraint
            newconstraintsfromidseqs.append([refidseq] + redidseqs)
    if verbose: print 'newconstraintsfromidseqs =', newconstraintsfromidseqs
    # for reporting, filter out constraint clades that are just made of one leaf (NB: these are useful for proper definition of other constraint clades, when nested, non-inclusive clades are allowed)
    constraints = [
        c for c in constraintswithsingles + newconstraintsfromidseqs
        if len(c) > 1
    ]
    if verbose: print 'constraints =', constraints
    # write out subalignments and the main alignment with collapsed clades
    # (the full genetree is passed here, not the pruned cleangenetree)
    loutgroups = restrict_alignment_representative_leaves(constraints,
                                                          genetree,
                                                          nfaln,
                                                          dirout,
                                                          radout=bnfaln,
                                                          selectRepr=0,
                                                          didseq=didseq,
                                                          **kw)
    # 'mbc' output: one MrBayes clade constraint file per constraint
    if not 'mbc' in supressout:
        mbcoutd, mbcext = doutdext['mbc']
        if mkdircons is True:
            mbcoutd = os.path.join(mbcoutd, bnfaln)
            dout = os.path.join(dirout, mbcoutd)
            if not os.path.isdir(dout):
                os.mkdir(dout)
        for i, constraint in enumerate(constraints):
            cladename = "clade%d" % i
            # write out MrBayes clade constraint for the sub-alignment, in order to compute subalignment samples and/or ancestral sequence
            write_out_MrBayes_clade_constraints(
                [constraint],
                loutgroups[i],
                os.path.join(dirout, mbcoutd,
                             bnfaln + '-' + cladename + '.' + mbcext),
                ilist=[i],
                verbose=verbose)
    # 'cgt' output: the gene tree coloured by constrained clade
    if not 'cgt' in supressout:
        fmtcoltree = kw.get('format_color_tree')
        cgtoutd, cgtext = doutdext['cgt']
        colour_tree_with_constrained_clades(genetree, constraints, force=True)
        genetree.complete_internal_labels(order=0, ffel=True)
        # NOTE(review): fmtcoltree is None when the option is absent, in which case .lower() raises AttributeError before the ValueError below can be reached
        if fmtcoltree.lower() in ['xml', 'phyloxml']:
            genetree.write_phyloXML(os.path.join(dirout, cgtoutd,
                                                 bnfaln + '-%s.xml' % cgtext),
                                    ignoreBS=True)
        elif fmtcoltree.lower() in ['nex', 'nexus']:
            genetree.write_nexus(os.path.join(dirout, cgtoutd,
                                              bnfaln + '-%s.nex' % cgtext),
                                 ignoreBS=True)
        else:
            raise ValueError, "specified format '%s' for output coloured-branch tree is not valid; please select among '[phylo]xml' or 'nex[us]'" % fmtcoltree
    # done risking going over recursion limit
    # NOTE(review): not reached when an exception is raised above, so the entry added by adddepth() then stays registered
    if not isparallel: rmdepth(currentmaxreccursdepths, bngt)
Exemple #5
0
def _graft_outgroup_leaf(reftree, leaflabel):
    """Return a new root joining `reftree` to a single outgroup leaf `leaflabel`.

    Branch lengths are scaled on the maximum leaf distance of `reftree`: the
    outgroup leaf branch is 3 times that distance and the branch leading to
    `reftree` is twice that distance.
    """
    maxd = reftree.max_leaf_distance()
    outgroup = tree2.AnnotatedNode(lleaves=[leaflabel])
    outgroup.get_children()[0].set_lg(maxd * 3)
    outgroup.link_child(reftree, newlen=maxd * 2)
    outgroup.complete_internal_labels(prefix='')
    return outgroup


def main(nfrec,
         nfreftree,
         nfgenetree,
         maxrecgt=1,
         recformat='tera',
         sgsep='_',
         phylofact=1000.0,
         restrictclade=None,
         verbose=False,
         **kw):
    """Draw the events of each parsed reconciliation on the species tree as SVG.

    Arguments:
      nfrec         -- path of the reconciliation file, also used as the prefix
                       of the output SVG file names.
      nfreftree     -- path of the reference species tree file.
      nfgenetree    -- path of the gene tree file; it is read without branch
                       lengths if the first attempt raises ValueError.
      maxrecgt      -- not referenced in this function body.
      recformat     -- 'tera' or 'mowgli'; selects which label is expected for
                       the dead/unsampled lineage (the module-level
                       `deadlabnum` or `outtaxlab`, respectively).
      sgsep, verbose -- forwarded to parseTERARecFile().
      phylofact     -- forwarded to writeSvgTree().
      restrictclade -- when set, the species tree is restricted to these leaves.
      **kw          -- forwarded to parseTERARecFile(); 'noDeadStories' is
                       also read here.

    For the i-th reconciliation, writes
    '<nfrec>_<i>_maprec2spetree[_no_dead].svg' and prints its base name.
    """
    try:
        genetree = tree2.Node(file=nfgenetree, namesAsNum=True)
    except ValueError:
        genetree = tree2.Node(file=nfgenetree,
                              namesAsNum=True,
                              branch_lengths=False)
    reftree = tree2.AnnotatedNode(file=nfreftree, namesAsNum=True)
    if restrictclade: st = reftree.restrictToLeaves(restrictclade)
    else: st = reftree
    # check presence of outgroup/dead lineage branch if necessary
    # NOTE(review): when an outgroup is grafted below, only `reftree` is
    # rebound to the new root while `st` keeps pointing to the former tree --
    # confirm this is what the drawing step expects
    if recformat == 'tera':
        if not (kw.get('noDeadStories') or
                (deadlabnum in st.get_leaf_labels())):
            if (outtaxlab in st.get_leaf_labels()):
                # must adapt mowgli-compliant species tree
                st[outtaxlab].edit_label(deadlabnum)
            else:
                # no dead lineage branch at all: add one rather than failing
                reftree = _graft_outgroup_leaf(reftree, deadlabnum)
    elif recformat == 'mowgli':
        if not (outtaxlab in st.get_leaf_labels()):
            if (deadlabnum in st.get_leaf_labels()):
                # must adapt tera-compliant species tree
                st[deadlabnum].edit_label(outtaxlab)
            else:
                # the helper computes the maximum leaf distance itself; this
                # branch previously used `maxd` without ever assigning it
                reftree = _graft_outgroup_leaf(reftree, outtaxlab)
    # the file name tag does not depend on the reconciliation being drawn
    tag = '_no_dead' if kw.get('noDeadStories') else ''
    for i, rec in enumerate(
            parseTERARecFile(nfrec,
                             genetree=genetree,
                             recformat=recformat,
                             sgsep=sgsep,
                             verbose=verbose,
                             **kw)):
        dnodefreq, dlevt = rec
        # write SVG species tree
        nfoutspe = '%s_%d_maprec2spetree%s.svg' % (nfrec, i, tag)
        # frequencies of the leaf nodes only, passed as `counts`
        lleaffreq = [(lab, f) for lab, f in dnodefreq.items()
                     if st[lab].is_leaf()]
        st.writeSvgTree(nfoutspe,
                        padleaves=True,
                        supports=False,
                        phylofact=phylofact,
                        branchwidths=dnodefreq,
                        textorbit=5,
                        treetype='species',
                        transfers=dlevt['T'],
                        duplications=dlevt['D'],
                        losses=dlevt['L'],
                        counts=lleaffreq,
                        transferwidth='freq',
                        modstyle="stroke-width:1; ",
                        padstyle="stroke:red; stroke-width:0.5; stroke-dasharray:1,1; ")
        # transfercolor='green',
        print(os.path.basename(nfoutspe))