def testMoveRoot(self): for test in range(0, self.testNo): binaryTree = getRandomTree() binaryTree_depthFirstNumbers(binaryTree) node = getRandomNode(binaryTree) print("before", printBinaryTree(binaryTree, True), printBinaryTree(node, True)) remodTree = moveRoot(binaryTree, node.traversalID.mid) print("test", test, printBinaryTree(binaryTree, True), printBinaryTree(node, True), printBinaryTree(remodTree, True)) binaryTree_depthFirstNumbers(remodTree)
def testNewickTreeParser(self): if self.testNo > 0: d = '((human,baboon),chimp);' e = newickTreeParser(d) f = printBinaryTree(e, False) print(d, f) assert d == f
def moveRoot(root, branch):
    """
    Removes the old root and places the new root at the mid point along
    the branch whose traversalID.mid equals `branch`.

    The re-rooted tree is assembled as a Newick string and parsed again,
    so the returned tree is built by bioio.newickTreeParser rather than
    by relinking the nodes of `root`.  The distance of the node on the
    chosen branch is halved while it is printed and then put back.
    """
    from sonLib import bioio

    def render(node):
        # Newick text for the subtree, minus the trailing ';'.
        return bioio.printBinaryTree(node, True)[:-1]

    if root.traversalID.mid == branch:
        # Already rooted on that branch: hand back a print/parse copy.
        return bioio.newickTreeParser(bioio.printBinaryTree(root, True))

    node = root
    upper = None  # Newick text of everything already passed on the way down
    while True:
        if node.traversalID.mid == branch:
            fullLength = node.distance
            node.distance /= 2
            newick = '(' + render(node) + ',(' + upper + \
                ('):%s' % node.distance) + ');'
            node.distance = fullLength
            break
        if not node.internal:
            # Reached a leaf without meeting the branch.
            newick = render(node)
            break
        if branch < node.traversalID.mid:
            sibling, child = node.right, node.left
        else:
            assert branch > node.traversalID.mid
            sibling, child = node.left, node.right
        # The sibling subtree joins whatever has been collected so far.
        if upper is not None:
            upper = '(' + render(sibling) + ',' + upper + ')'
        else:
            upper = render(sibling)
        node = child
    return bioio.newickTreeParser(newick)
def fn(tree, seq):
    """Descend from `tree` towards `branch`, building the re-rooted Newick.

    `seq` is the Newick text gathered so far for the part of the tree that
    is not below `tree`.  `branch`, `fn2` and `bioio` are taken from the
    enclosing scope.  Returns the finished Newick string (ending in ';')
    once the node whose traversalID.mid equals `branch` is reached; a leaf
    that does not match yields its own Newick text without the ';'.
    """
    if tree.traversalID.mid == branch:
        savedDistance = tree.distance
        # Halve the branch only while printing; the new root sits mid-branch.
        tree.distance /= 2
        lowerPart = bioio.printBinaryTree(tree, True)[:-1]
        result = '(' + lowerPart + ',(' + seq + ('):%s' % tree.distance) + ');'
        tree.distance = savedDistance
        return result
    if not tree.internal:
        return bioio.printBinaryTree(tree, True)[:-1]
    if branch < tree.traversalID.mid:
        carried = fn2(tree.right, seq)
        return fn(tree.left, carried)
    assert branch > tree.traversalID.mid
    carried = fn2(tree.left, seq)
    return fn(tree.right, carried)
def testNewickTreeParser_UnaryNodes(self): #tests with unary nodes for test in range(0, self.testNo): tree = getRandomTreeString() logger.debug("tree to try\t", tree) tree2 = newickTreeParser(tree, reportUnaryNodes=True) tree3 = printBinaryTree(tree2, True) logger.debug("tree found\t", tree3) assert tree == tree3
def testRemodelTreeRemovingRoot(self): for test in range(0, self.testNo): binaryTree = getRandomTree() binaryTree_depthFirstNumbers(binaryTree) node = getRandomLeafNode(binaryTree) remodTree = remodelTreeRemovingRoot(binaryTree, node.traversalID.mid) print("test", test, printBinaryTree(binaryTree, True), printBinaryTree(node, True), printBinaryTree(remodTree, True)) binaryTree_depthFirstNumbers(remodTree) distances = mapTraversalIDsBetweenTrees(binaryTree, remodTree) d = getDistancesBetweenLeaves(binaryTree) d2 = getDistancesBetweenLeaves(remodTree) print(d) print(d2) for key in d2: assert close(d2[key], d[key], 0.0001)
def testCalculateDupsAndLossesByReconcilingTrees_Examples(self):
    """Reconcile fixed gene trees against a fixed species tree.

    Each case is (gene tree Newick, expected duplications, expected
    losses).  The counts returned by
    calculateDupsAndLossesByReconcilingTrees must match both numbers.
    """
    speciesNewick = '(((((((((((((human:0.006969,chimp:0.009727):0.025291,((baboon:0.008968):0.011019):0.024581):0.023649):0.066673):0.018405,((rat:0.081244,mouse:0.072818):0.238435):0.021892):0.02326,(((cow:0.164728,(cat:0.109852,dog:0.107805):0.049576):0.004663):0.010883):0.033242):0.028346):0.016015):0.226853):0.063898):0.126639):0.119814):0.16696);'
    speciesTree = newickTreeParser(speciesNewick)
    binaryTree_depthFirstNumbers(speciesTree)

    cases = [
        ('((human,baboon),chimp);', 1, 3),
        ('((human,chimp),baboon);', 0, 0),
        ('((human,(human, chimp)),baboon);', 1, 1),
        ('((human,(human, chimp)),(chimp, baboon));', 2, 3),
        ('(dog,cat);', 0, 0),
        ('((dog,cat), cow);', 0, 0),
        ('(cow,(dog,cat));', 0, 0),
        ('(cow,(cat,dog));', 0, 0),
        ('((cow,dog),(dog,cow));', 1, 2),
        ('((cow,(cow,cow)),(dog,cat));', 2, 0),
        ('((cow,(cow,cow)),(dog,((cat,cat),cat)));', 4, 0),
    ]

    print("")
    for geneNewick, expectedDups, expectedLosses in cases:
        geneTree = newickTreeParser(geneNewick)
        binaryTree_depthFirstNumbers(geneTree)
        print(printBinaryTree(geneTree, True),
              printBinaryTree(speciesTree, True))
        # Leaf names are passed through unchanged as the process IDs.
        foundDups, foundLosses = calculateDupsAndLossesByReconcilingTrees(
            speciesTree, geneTree, processID=lambda x: x)
        print(geneNewick, "dups", expectedDups, foundDups,
              "losses", expectedLosses, foundLosses)
        assert expectedDups == foundDups
        assert expectedLosses == foundLosses
def testRandom(self): """Makes random sequences and tests that Ortheus can align them and produce a valid output. """ outputFile = getTempFile() self.tempFiles.append(outputFile) MAX_SEQS = 20 for i in xrange(MAX_SEQS): self.tempFiles.append(getTempFile()) for test in xrange(0, self.testNo): print "test no : %i " % test #seqNo binaryTree = randomTree() middleSeq = getRandomSequence(250)[1] seqs = [] getTreeSeqs(binaryTree, middleSeq, seqs) if len(seqs) <= MAX_SEQS and len(seqs) > 2: seqFiles = [] for i in xrange(0, len(seqs)): seqFiles.append(self.tempFiles[1 + i]) fileHandle = open(seqFiles[i], 'w') fastaWrite(fileHandle, "%i" % i, seqs[i]) fileHandle.close() print "Have seq files ", seqFiles treeString = printBinaryTree(binaryTree, True) print "For tree ", treeString #align seqs and check no failure command = "ortheus_core -a %s -b '%s' -d %s -e" % ( " ".join(seqFiles), treeString, outputFile) print "command to call", command system(command) #check alignment is complete alignment = [i[:] for i in fastaAlignmentRead(outputFile)] #print "alignment", alignment checkAlignment(alignment, seqs) print "test no is finished : %i " % test
def testCalculateProbableRootOfGeneTree_Examples(self):
    """Run calculateProbableRootOfGeneTree on fixed example gene trees.

    Each case pairs an input gene tree with the rooting it is expected to
    produce.  The expected tree is parsed and its string is printed next
    to the computed result, but the two are not compared here, so the
    test only fails if a call raises.
    """
    speciesNewick = '(((((((((((((human:0.006969,chimp:0.009727):0.025291,((baboon:0.008968):0.011019):0.024581):0.023649):0.066673):0.018405,((rat:0.081244,mouse:0.072818):0.238435):0.021892):0.02326,(((cow:0.164728,(cat:0.109852,dog:0.107805):0.049576):0.004663):0.010883):0.033242):0.028346):0.016015):0.226853):0.063898):0.126639):0.119814):0.16696);'
    speciesTree = newickTreeParser(speciesNewick)
    binaryTree_depthFirstNumbers(speciesTree)

    # (gene tree, expected rooted gene tree)
    cases = [
        ('((human,baboon),chimp);',
         '((human,chimp),baboon);'),
        ('((human,chimp),baboon);',
         '((human,chimp),baboon);'),
        ('((((human,chimp),baboon),((dog,cat),cow)),(mouse,rat));',
         '((((human,chimp),baboon),(mouse,rat)),((dog,cat),cow));'),
        ('((((human,chimp),baboon),(mouse,rat)),((dog,cat),cow));',
         '((((human,chimp),baboon),(mouse,rat)),((dog,cat),cow));'),
        ('((((human,(chimp, chimp)),baboon),((dog,cat),cow)),(mouse,rat));',
         '((((human,(chimp,chimp)),baboon),(mouse,rat)),((dog,cat),cow));'),
    ]

    for geneNewick, expectedNewick in cases:
        geneTree = newickTreeParser(geneNewick)
        # Parsed only to confirm the expected string is well formed.
        newickTreeParser(expectedNewick)
        binaryTree_depthFirstNumbers(geneTree)
        foundTree, dupCount, lossCount = calculateProbableRootOfGeneTree(
            speciesTree, geneTree)
        print("rootedGeneTree", expectedNewick, dupCount, lossCount,
              printBinaryTree(foundTree, False))
def testRandom(self): """Makes random sequences and tests that Ortheus can align them and produce a valid output. """ outputFile = getTempFile() self.tempFiles.append(outputFile) MAX_SEQS = 20 for i in xrange(MAX_SEQS): self.tempFiles.append(getTempFile()) for test in xrange(0, self.testNo): print "test no : %i " % test # seqNo binaryTree = randomTree() middleSeq = getRandomSequence(250)[1] seqs = [] getTreeSeqs(binaryTree, middleSeq, seqs) if len(seqs) <= MAX_SEQS and len(seqs) > 2: seqFiles = [] for i in xrange(0, len(seqs)): seqFiles.append(self.tempFiles[1 + i]) fileHandle = open(seqFiles[i], "w") fastaWrite(fileHandle, "%i" % i, seqs[i]) fileHandle.close() print "Have seq files ", seqFiles treeString = printBinaryTree(binaryTree, True) print "For tree ", treeString # align seqs and check no failure command = "ortheus_core -a %s -b '%s' -d %s -e" % (" ".join(seqFiles), treeString, outputFile) print "command to call", command system(command) # check alignment is complete alignment = [i[:] for i in fastaAlignmentRead(outputFile)] # print "alignment", alignment checkAlignment(alignment, seqs) print "test no is finished : %i " % test
import sys import xml.etree.ElementTree as ET from sonLib.tree import BinaryTree from sonLib.tree import njI from sonLib.tree import upgmaI from sonLib.tree import DistancePair from sonLib.bioio import printBinaryTree l = {} def fn(eventName): if not l.has_key(eventName): l[eventName] = BinaryTree(0.0, False, None, None, eventName) return l[eventName] distancePairs = [ DistancePair(float(i.attrib["indelsPerBase"]), fn(i.attrib["eventName1"]), 1, fn(i.attrib["eventName2"]), 1) for i in ET.parse(sys.argv[1]).getroot().findall("indelDistanceForEvents") if i.attrib["eventName1"] != i.attrib["eventName2"] ] print len(distancePairs), l print "NJ", printBinaryTree(njI(distancePairs, len(l.keys())), includeDistances=True) print "UPGMA", printBinaryTree(upgmaI(distancePairs, len(l.keys())), includeDistances=True)
def getCactusInputs_random(regionNumber=0, tempDir=None,
                           sequenceNumber=None,
                           avgSequenceLength=None,
                           treeLeafNumber=None):
    """Gets a random set of sequence files and a species tree relating them.

    One temporary directory is created under tempDir for every leaf of a
    random binary tree.  Random sequences are then written to FASTA files
    placed in randomly chosen directories, until at least sequenceNumber
    sequences have been written and every directory has received a file.

    Arguments left as None are drawn at random: sequenceNumber from
    range(30), avgSequenceLength from range(1, 3000) and treeLeafNumber
    from range(2, 4).  regionNumber is accepted but not used in this
    function.

    Returns a (sequenceDirs, newickTreeString) tuple.
    """
    # random.choice works on a range directly; no list copy is needed.
    if sequenceNumber is None:
        sequenceNumber = random.choice(range(30))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(range(1, 3000))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(range(2, 4))

    # Make tree
    binaryTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(binaryTree, includeDistances=True)
    newickTreeLeafNames = []

    def fn(tree):
        # Collect the leaf iDs of the tree, left subtree first.
        if tree.internal:
            fn(tree.left)
            fn(tree.right)
        else:
            newickTreeLeafNames.append(tree.iD)
    fn(binaryTree)
    logger.info("Made random binary tree: %s" % newickTreeString)

    # One directory per leaf.
    sequenceDirs = [getTempDirectory(rootDir=tempDir)
                    for _ in newickTreeLeafNames]
    logger.info("Made a set of random directories: %s" % " ".join(sequenceDirs))

    # Random sequences and species labelling
    fileHandle = None
    parentSequence = getRandomSequence(
        length=random.choice(range(1, 2 * avgSequenceLength)))[1]
    emptySequenceDirs = set(sequenceDirs)
    written = 0
    try:
        while written < sequenceNumber or emptySequenceDirs:
            if fileHandle is None:
                # Randomly choose the files to be attached or not
                if random.random() > 0.5:
                    suffix = ".fa.complete"
                else:
                    suffix = ".fa"
                sequenceDir = random.choice(sequenceDirs)
                emptySequenceDirs.discard(sequenceDir)
                sequenceFile = getTempFile(rootDir=sequenceDir, suffix=suffix)
                fileHandle = open(sequenceFile, 'w')
            if random.random() > 0.8:
                # Get a new root sequence
                parentSequence = getRandomSequence(
                    length=random.choice(range(1, 2 * avgSequenceLength)))[1]
            sequence = mutateSequence(parentSequence,
                                      distance=random.random() * 0.25)
            name = getRandomAlphaNumericString(15)
            if random.random() > 0.5:
                sequence = reverseComplement(sequence)
            fastaWrite(fileHandle, name, sequence)
            if random.random() > 0.5:
                # Finish this file; the next sequence starts a new one.
                fileHandle.close()
                fileHandle = None
            written += 1
    finally:
        # Close the last file, also when an error interrupted the loop.
        if fileHandle is not None:
            fileHandle.close()
    # NOTE(review): this logs the requested count; the loop can write more
    # sequences than that while it is still filling empty directories.
    logger.info("Made %s sequences in %s directories" % (sequenceNumber, len(sequenceDirs)))
    return sequenceDirs, newickTreeString
def fn2(tree, seq):
    """Join the Newick text of `tree` onto the text gathered so far.

    Returns the printed subtree (trailing ';' removed) on its own when
    `seq` is None, otherwise '(<subtree>,<seq>)'.  `bioio` comes from the
    enclosing scope.
    """
    subtree = bioio.printBinaryTree(tree, True)[:-1]
    if seq is None:
        return subtree
    return '(' + subtree + ',' + seq + ')'
def fn3(bT):
    """Build the Newick fragment for `bT` and the nodes reached via hash.

    `hash`, `root`, `fn2` and `bioio` are taken from the enclosing scope.
    NOTE(review): `hash` looks like a mapping from a node to its parent
    (and shadows the builtin) - confirm against the enclosing function.
    The recursion follows hash[bT] until it equals `root`; every level
    appends ':' and the distance of its own `bT`.
    """
    parent = hash[bT]
    atRoot = parent == root
    pieces = ['(', bioio.printBinaryTree(fn2(parent, bT), bT, True)[:-1]]
    if not atRoot:
        # Below the root the parent's own fragment is added as a sibling.
        pieces.append(',')
        pieces.append(fn3(parent))
    pieces.append(')')
    return ''.join(pieces) + ":" + str(bT.distance)
def getCactusInputs_random(regionNumber=0, tempDir=None,
                           sequenceNumber=None,
                           avgSequenceLength=None,
                           treeLeafNumber=None):
    """Gets a random set of sequence files and a species tree relating them.

    One temporary directory is created under tempDir for every leaf of a
    random binary tree.  Random sequences are then written to FASTA files
    placed in randomly chosen directories, until at least sequenceNumber
    sequences have been written and every directory has received a file.

    Arguments left as None are drawn at random: sequenceNumber from
    xrange(30), avgSequenceLength from xrange(1, 3000) and treeLeafNumber
    from xrange(2, 4).  regionNumber is accepted but not used in this
    function.

    Returns a (sequenceDirs, newickTreeString) tuple.
    """
    if sequenceNumber is None:
        sequenceNumber = random.choice(xrange(30))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(xrange(1, 3000))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(xrange(2, 4))

    # Make tree
    binaryTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(binaryTree, includeDistances=True)
    newickTreeLeafNames = []

    def fn(tree):
        # Collect the leaf iDs of the tree, left subtree first.
        if tree.internal:
            fn(tree.left)
            fn(tree.right)
        else:
            newickTreeLeafNames.append(tree.iD)
    fn(binaryTree)
    logger.info("Made random binary tree: %s" % newickTreeString)

    # One directory per leaf.
    sequenceDirs = [getTempDirectory(rootDir=tempDir)
                    for _ in newickTreeLeafNames]
    logger.info("Made a set of random directories: %s" % " ".join(sequenceDirs))

    # Random sequences and species labelling
    fileHandle = None
    parentSequence = getRandomSequence(length=random.choice(xrange(1, 2*avgSequenceLength)))[1]
    emptySequenceDirs = set(sequenceDirs)
    written = 0
    try:
        while written < sequenceNumber or emptySequenceDirs:
            if fileHandle is None:
                # Randomly choose the files to be attached or not
                if random.random() > 0.5:
                    suffix = ".fa.complete"
                else:
                    suffix = ".fa"
                sequenceDir = random.choice(sequenceDirs)
                emptySequenceDirs.discard(sequenceDir)
                sequenceFile = getTempFile(rootDir=sequenceDir, suffix=suffix)
                fileHandle = open(sequenceFile, 'w')
            if random.random() > 0.8:
                # Get a new root sequence
                parentSequence = getRandomSequence(length=random.choice(xrange(1, 2*avgSequenceLength)))[1]
            sequence = mutateSequence(parentSequence, distance=random.random()*0.5)
            name = getRandomAlphaNumericString(15)
            if random.random() > 0.5:
                sequence = reverseComplement(sequence)
            fastaWrite(fileHandle, name, sequence)
            if random.random() > 0.5:
                # Finish this file; the next sequence starts a new one.
                fileHandle.close()
                fileHandle = None
            written += 1
    finally:
        # Close the last file, also when an error interrupted the loop.
        if fileHandle is not None:
            fileHandle.close()
    # NOTE(review): this logs the requested count; the loop can write more
    # sequences than that while it is still filling empty directories.
    logger.info("Made %s sequences in %s directories" % (sequenceNumber, len(sequenceDirs)))
    return sequenceDirs, newickTreeString
import xml.etree.ElementTree as ET from sonLib.tree import BinaryTree from sonLib.tree import njI from sonLib.tree import upgmaI from sonLib.tree import DistancePair from sonLib.bioio import printBinaryTree l = {} def fn(eventName): if not l.has_key(eventName): l[eventName] = BinaryTree(0.0, False, None, None, eventName) return l[eventName] distancePairs = [ DistancePair(float(i.attrib["substitutionRate"]), fn(i.attrib["eventName1"]), 1, fn(i.attrib["eventName2"]), 1) for i in ET.parse(sys.argv[1]).getroot().findall("distancesForSamples") ] distancePairs += [ DistancePair(i.distance, i.leaf2, 1, i.leaf1, 1) for i in distancePairs ] print len(distancePairs), l print "NJ", printBinaryTree(njI(distancePairs, len(l.keys())), includeDistances=True) print "UPGMA", printBinaryTree(upgmaI(distancePairs, len(l.keys())), includeDistances=True)