def validate_tree(tree_path, msa_path):
    """Load a tree and an MSA, and pair every tree leaf with an MSA row.

    The tree is read from ``tree_path`` as Newick (``format=0``) and the
    alignment from ``msa_path`` as FASTA.  For each leaf, the alignment is
    scanned in order and the first row whose ``description`` contains the
    leaf name (substring test) is taken as that leaf's row.

    Returns:
        A ``(tree, msa, leaf_information)`` tuple, where
        ``leaf_information`` maps each leaf name to its matched MSA row.

    Raises:
        Exception: if a leaf name matches no MSA row, or if a leaf name
            that was already linked to a row is matched again.
    """
    tree = TreeNode(newick=tree_path, format=0)
    msa = AlignIO.read(msa_path, 'fasta')

    # leaf name -> MSA row matched for that leaf
    leaf_information = {}

    for leaf in tree.get_leaves():
        accession = leaf.name

        # Scan for the first row mentioning this accession; the ``else``
        # branch only runs when the scan finishes without a ``break``.
        for candidate_row in msa:
            if accession in candidate_row.description:
                break
        else:
            raise Exception("%s is in the tree but not found in the MSA" % (accession))

        # The scan stops at the first hit, so reaching an accession that is
        # already recorded means an earlier leaf carried the same name.
        if accession in leaf_information:
            raise Exception("%s is found in the tree/MSA twice, accessions must be unique" % (accession))

        leaf_information[accession] = candidate_row

    return (tree, msa, leaf_information)
elif arg.startswith("--paralogs="): pz = arg.split("=") paralogsStr = pz[1] elif not arg.startswith('-'): if graphfile1 == '': graphfile1 = arg elif graphfile2 == '': graphfile2 = arg # Map each species name to a leaf. This avoids using search_nodes, which is slow # speciesTree is used in every mode, so we build it right here right now speciesTree = None if speciesTreeStr != '': speciesTree = TreeNode(speciesTreeStr) speciesLeavesList = speciesTree.get_leaves() speciesLeaves = {} for leaf in speciesLeavesList: speciesLeaves[leaf.name] = leaf class ConstraintGraph: """ Given a set of genes, and and two set of tuples that represent orthology and paralogy relationships between genes, builds a graph of orthologies, and a graph of paralogies. It then becomes possible to build a DS-tree that satisifies all these constraints (or detect that it can't be done). """ def __init__(self, genes, orthology_relations, paralogy_relations, speciesTree = None, geneSpeciesMapping = None, treelessGeneSpeciesMapping = None): """Constructor. The 2 graphs are built here, as adjacency lists (self.orthologs and self.paralogs, two dicts with key = gene-string, value = neighbors as a set of gene-strings).