def stitchReconstruct(seqNo, inputSeqFiles, treeString, outputFile, outputScoreFile, inputAlignmentFile, alignerArgs):
    """Reconstruct an alignment chunk by chunk and stitch the chunks together.

    The input alignment is consumed in chunks (getNextAlignmentChunk). For
    every chunk the external reconstruction command is run through the shell;
    if it returns a non-zero status it is re-run once with the cautious
    arguments. The output of each chunk is appended to the files opened by
    getOpenSeqFiles, and at the end those files are concatenated into
    outputFile.

    Arguments:
    seqNo -- number of leaf sequences; the parsed tree must number exactly
        seqNo * 2 - 1 nodes (asserted).
    inputSeqFiles -- not referenced by this function; kept so that the
        signature stays unchanged for callers.
    treeString -- Newick tree, parsed here and also passed to the external
        command with the -b option.
    outputFile -- file that receives the concatenated result.
    outputScoreFile -- file to which the score file of every chunk is
        appended (appendScore).
    inputAlignmentFile -- multi-fasta alignment read with multiFastaRead.
    alignerArgs -- settings object; the attributes read are
        RECONSTRUCTION_PREFIX, FAST_SETTING, RECONSTRUCTION_ARGS_FAST /
        RECONSTRUCTION_ARGS, CAUTIOUS_ARGS, ALIGNMENT_CHUNK_MAX_COLUMN_SIZE,
        VITERBI_ALIGNMENT_COLUMN_GAP and EXPECTED_CHARACTER_FREQUENCIES.

    Returns None. Calls sys.exit(1) when the cautious retry also fails.
    """
    startTime = time.time() #epoch time in seconds
    logger.info("Starting Stitcher")
    reconstructionPrefix = alignerArgs.RECONSTRUCTION_PREFIX
    if alignerArgs.FAST_SETTING:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS_FAST
    else:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS
    cautiousArgs = alignerArgs.CAUTIOUS_ARGS
    alignmentChunkMaxSeqSize = alignerArgs.ALIGNMENT_CHUNK_MAX_COLUMN_SIZE
    viterbiAlignmentColumnGap = alignerArgs.VITERBI_ALIGNMENT_COLUMN_GAP

    #parse tree
    binaryTree = newickTreeParser(treeString)
    binaryTree_depthFirstNumbers(binaryTree)
    logger.info("Newick tree read : %s ", printBinaryTree(binaryTree, True))
    labels = binaryTree_nodeNames(binaryTree)
    #every second label, starting at index 0, is used as a leaf label
    leafLabels = [labels[i] for i in xrange(0, len(labels), 2)]

    #load alignment iterator
    alignmentReader = multiFastaRead(inputAlignmentFile, lambda x : x)

    #number of sequences, including ancestors
    nodeNumber = binaryTree.traversalID.midEnd
    assert nodeNumber == seqNo * 2 - 1

    #create output files
    outputFiles, outputIterators = getOpenSeqFiles(nodeNumber, getTempFile)

    #while has chunk
    previousAlignment = []
    alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(previousAlignment, alignmentReader,
                                                              alignmentChunkMaxSeqSize, seqNo, leafLabels)
    tempTreeStatesFile = getTempFile()
    loopOptions = " " #replaced by " -t <tree states file>" after the first chunk
    logger.info("Starting main loop")
    characterFrequenciesString = " ".join([str(i) for i in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES])
    while alignmentSeqs is not None:
        if end:
            #once the chunk is flagged as the end the column gap is set to zero
            viterbiAlignmentColumnGap = 0
        tempAncestorFile = getTempFile()
        tempScoreFile = getTempFile()
        #NOTE(review): the command line is built by string interpolation and run
        #through the shell by os.system; treeString and the file names are not
        #escaped -- confirm they only ever come from trusted input.
        command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -n %s -x %s " % \
                  (reconstructionPrefix, treeString, alignmentFile,
                   " ".join(alignmentSeqs), tempTreeStatesFile,
                   viterbiAlignmentColumnGap, loopOptions, reconstructionArgs,
                   tempAncestorFile, characterFrequenciesString, tempScoreFile)
        logger.info("Calling Ortheus with : %s", command)
        exitValue = os.system(command)
        if exitValue != 0:
            #any non-zero status triggers the retry (an earlier check that
            #exited on statuses other than 73 had been disabled)
            logger.info("Something went wrong calling Ortheus : %i ", exitValue)
            logger.info("Going to retry with caution settings")
            #NOTE(review): unlike the first attempt the retry passes no -n
            #(character frequencies) option -- confirm this is intended.
            command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -x %s" % \
                      (reconstructionPrefix, treeString, alignmentFile,
                       " ".join(alignmentSeqs), tempTreeStatesFile,
                       viterbiAlignmentColumnGap, loopOptions, cautiousArgs,
                       tempAncestorFile, tempScoreFile)
            logger.info("Calling Ortheus with : %s", command)
            if os.system(command):
                logger.info("Already tried caution, so have to go")
                sys.exit(1)
        logger.info("Completed reconstruction of chunk")

        appendScore(tempScoreFile, outputScoreFile)
        os.remove(tempScoreFile)

        #later chunks are given the tree states file with -t
        loopOptions = " -t " + tempTreeStatesFile
        tempAncestorFastaOffsets = getMultiFastaOffsets(tempAncestorFile)
        #the ancestor file is read twice, each time through a fresh reader
        previousAlignment = removeFromLeft(multiFastaRead(tempAncestorFile, lambda x : x, tempAncestorFastaOffsets),
                                           previousAlignment, nodeNumber, seqNo)
        appendToAlignment(multiFastaRead(tempAncestorFile, lambda x : x, tempAncestorFastaOffsets),
                          outputIterators, nodeNumber)
        logger.info("Added reconstructed chunk to complete alignment")

        os.remove(tempAncestorFile)
        removeSeqFiles(alignmentSeqs, seqNo)
        os.remove(alignmentFile)
        logger.info("Cleaned up at end of loop")
        alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(previousAlignment, alignmentReader,
                                                                  alignmentChunkMaxSeqSize, seqNo, leafLabels)
    logger.info("Finished main loop")

    #load into single output file
    closeSeqIterators(outputIterators, nodeNumber)
    concatanateSeqFiles(outputFiles, outputFile, nodeNumber, labels)
    logger.info("Written out alignment to single file")

    #clean up
    os.remove(tempTreeStatesFile)
    removeSeqFiles(outputFiles, nodeNumber)
    logger.info("Cleaned up final files")
    logger.info("Finished, total time taken for stitcher: %s (seconds)", time.time()-startTime)
def nestAlign(binaryTree, leafSeqFiles, outputFile, outputScoreFile, alignerArgs):
    """Align a tree in nested steps: chosen subtrees first, then the whole tree.

    The tree is numbered depth first, node costs are computed
    (calculateTreeNodeCosts) and calculatePath picks the sequence of subtrees
    to align, bounded by alignerArgs.MAX_NODE_NO. Each chosen subtree is
    aligned with makeAlignment into a temporary file; for every internal
    child of the subtree root, the child's alignment and the child's own
    sequence are extracted into temporary files and recorded under the
    child's traversalID.mid. Finally the full tree is aligned into
    outputFile and mergeTogetherAllAlignments combines the recorded
    alignments.

    Arguments:
    binaryTree -- tree to align; its internal IDs are removed on entry and
        again before returning, and child iDs are set while looping.
    leafSeqFiles -- sequence files of the leaves; file i is stored at
        index i * 2 of the per-node sequence file list.
    outputFile -- file receiving the final alignment.
    outputScoreFile -- passed through to every makeAlignment call.
    alignerArgs -- settings object; MAX_NODE_NO is read here and the object
        is passed on to makeAlignment.

    Returns None.
    """
    #NOTE(review): nestAlign is defined a second time further down in this file;
    #at import time the later definition replaces this one -- confirm which copy
    #should be kept and delete the other.
    logger.info("Starting Nester")
    maxNodeNo = alignerArgs.MAX_NODE_NO
    removeInternalIDs(binaryTree)
    logger.info("Binary tree : %s ", printBinaryTree(binaryTree, True, False))
    binaryTree_depthFirstNumbers(binaryTree)
    nodeNo = binaryTree.traversalID.midEnd
    logger.info("Labelled tree with numbers ")
    seqNo = len(leafSeqFiles)
    logger.info(" Sequence files : %s", " ".join(leafSeqFiles))
    #assert seqNo*2 - 1 == nodeNo
    logger.info("Output file %s ", outputFile)
    labels = binaryTree_nodeNames(binaryTree)

    costs = calculateTreeNodeCosts(binaryTree)
    logger.info("Calculated node costs")
    for node in xrange(0, nodeNo):
        logger.info("Node : %s , reconstruction value : %f , %f",
                    labels[node], costs[node], 1.0 - costs[node])
    pathCost, treePath = calculatePath(binaryTree, costs, maxNodeNo)
    logger.info(" Calculated nested path. Cost : %f , Path : %s",
                pathCost, " ".join([labels[i.traversalID.mid] for i in treePath]))

    #both lists are indexed by traversalID.mid; leaf files go to the even slots
    alignmentFiles = [None] * nodeNo
    seqFiles = [None] * nodeNo
    for i, leafSeqFile in enumerate(leafSeqFiles):
        seqFiles[i*2] = leafSeqFile

    logger.debug("About to start main nested loop")
    for subTree in treePath:
        assert subTree != binaryTree
        logger.info("Chosen sub tree to align : %s ", printBinaryTree(subTree, True, False))
        alignmentFile = getTempFile()
        startTime = time.time()
        makeAlignment(subTree, seqFiles, alignmentFile, outputScoreFile, alignerArgs)
        logger.info("Made alignment of subtree, time taken : %s (seconds)", time.time()-startTime)

        #get the two ancestors
        #numbering local to the subtree, computed without relabelling the tree
        subTreeTraversalIDs = binaryTree_depthFirstNumbers(subTree, labelTree=False, dontStopAtID=False)
        if subTree.left.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childXAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, 0, subTreeTraversalIDs[subTree].mid-offset, childXAlignmentFile)
            alignmentFiles[subTree.left.traversalID.mid] = childXAlignmentFile
            logger.info("Extracted alignment of left child : %s ", printBinaryTree(subTree.left, True, False))

            #the left child starts where the subtree starts, so the offset is reused
            assert offset == subTreeTraversalIDs[subTree.left].midStart
            childXSeqFile = getTempFile()
            leftMid = subTreeTraversalIDs[subTree.left].mid - offset
            extractSubAlignment(childXAlignmentFile, leftMid, leftMid + 1, childXSeqFile)
            seqFiles[subTree.left.traversalID.mid] = childXSeqFile
            logger.info("Extracted sequence of left child : %s ", printBinaryTree(subTree.left, True, False))
        if subTree.right.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childYAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, subTreeTraversalIDs[subTree].mid + 1 - offset,
                                subTreeTraversalIDs[subTree].midEnd - offset, childYAlignmentFile)
            alignmentFiles[subTree.right.traversalID.mid] = childYAlignmentFile
            logger.info("Extracted alignment of right child : %s ", printBinaryTree(subTree.right, True, False))

            #positions inside the extracted file are relative to the right child's own start
            offset = subTreeTraversalIDs[subTree.right].midStart
            childYSeqFile = getTempFile()
            rightMid = subTreeTraversalIDs[subTree.right].mid - offset
            extractSubAlignment(childYAlignmentFile, rightMid, rightMid + 1, childYSeqFile)
            seqFiles[subTree.right.traversalID.mid] = childYSeqFile
            logger.info("Extracted sequence of right child : %s ", printBinaryTree(subTree.right, True, False))

        subTree.left.iD = labels[subTree.left.traversalID.mid] #labels tree, so we only print relevant bits
        subTree.right.iD = labels[subTree.right.traversalID.mid]
        os.remove(alignmentFile)
        logger.info("Finished loop and reduced tree to : %s ", printBinaryTree(subTree, True, False))

    startTime = time.time()
    makeAlignment(binaryTree, seqFiles, outputFile, outputScoreFile, alignerArgs)
    logger.info("Finished final nested alignment, time taken : %s (seconds)", time.time()-startTime)
    alignmentFiles[binaryTree.traversalID.mid] = outputFile
    mergeTogetherAllAlignments(binaryTree, alignmentFiles, labels, [0])
    logger.info("Merged together all alignments")

    #only the odd slots are removed; the even slots hold the caller's leaf files
    #NOTE(review): the temporary files recorded in alignmentFiles are not removed
    #here -- presumably mergeTogetherAllAlignments disposes of them; confirm.
    for i in xrange(1, nodeNo, 2):
        if seqFiles[i] is not None:
            os.remove(seqFiles[i])
    removeInternalIDs(binaryTree)
    logger.info("Have cleaned up, and am returning")
def nestAlign(binaryTree, leafSeqFiles, outputFile, outputScoreFile, alignerArgs):
    """Align a tree in nested steps: chosen subtrees first, then the whole tree.

    The tree is numbered depth first, node costs are computed
    (calculateTreeNodeCosts) and calculatePath picks the sequence of subtrees
    to align, bounded by alignerArgs.MAX_NODE_NO. Each chosen subtree is
    aligned with makeAlignment into a temporary file; for every internal
    child of the subtree root, the child's alignment and the child's own
    sequence are extracted into temporary files and recorded under the
    child's traversalID.mid. Finally the full tree is aligned into
    outputFile and mergeTogetherAllAlignments combines the recorded
    alignments.

    Arguments:
    binaryTree -- tree to align; its internal IDs are removed on entry and
        again before returning, and child iDs are set while looping.
    leafSeqFiles -- sequence files of the leaves; file i is stored at
        index i * 2 of the per-node sequence file list.
    outputFile -- file receiving the final alignment.
    outputScoreFile -- passed through to every makeAlignment call.
    alignerArgs -- settings object; MAX_NODE_NO is read here and the object
        is passed on to makeAlignment.

    Returns None.
    """
    #NOTE(review): an earlier definition of nestAlign exists above in this file;
    #this later definition is the one that takes effect at import time --
    #confirm and delete the redundant copy.
    logger.info("Starting Nester")
    maxNodeNo = alignerArgs.MAX_NODE_NO
    removeInternalIDs(binaryTree)
    logger.info("Binary tree : %s ", printBinaryTree(binaryTree, True, False))
    binaryTree_depthFirstNumbers(binaryTree)
    nodeNo = binaryTree.traversalID.midEnd
    logger.info("Labelled tree with numbers ")
    seqNo = len(leafSeqFiles)
    logger.info(" Sequence files : %s", " ".join(leafSeqFiles))
    #assert seqNo*2 - 1 == nodeNo
    logger.info("Output file %s ", outputFile)
    labels = binaryTree_nodeNames(binaryTree)

    costs = calculateTreeNodeCosts(binaryTree)
    logger.info("Calculated node costs")
    for node in xrange(0, nodeNo):
        logger.info("Node : %s , reconstruction value : %f , %f",
                    labels[node], costs[node], 1.0 - costs[node])
    pathCost, treePath = calculatePath(binaryTree, costs, maxNodeNo)
    pathLabels = " ".join([labels[i.traversalID.mid] for i in treePath])
    logger.info(" Calculated nested path. Cost : %f , Path : %s", pathCost, pathLabels)

    #both lists are indexed by traversalID.mid; leaf files go to the even slots
    alignmentFiles = [None] * nodeNo
    seqFiles = [None] * nodeNo
    for i, leafSeqFile in enumerate(leafSeqFiles):
        seqFiles[i * 2] = leafSeqFile

    logger.debug("About to start main nested loop")
    for subTree in treePath:
        assert subTree != binaryTree
        logger.info("Chosen sub tree to align : %s ", printBinaryTree(subTree, True, False))
        alignmentFile = getTempFile()
        startTime = time.time()
        makeAlignment(subTree, seqFiles, alignmentFile, outputScoreFile, alignerArgs)
        logger.info("Made alignment of subtree, time taken : %s (seconds)", time.time() - startTime)

        #get the two ancestors
        #numbering local to the subtree, computed without relabelling the tree
        subTreeTraversalIDs = binaryTree_depthFirstNumbers(subTree, labelTree=False, dontStopAtID=False)
        if subTree.left.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childXAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, 0, subTreeTraversalIDs[subTree].mid - offset, childXAlignmentFile)
            alignmentFiles[subTree.left.traversalID.mid] = childXAlignmentFile
            logger.info("Extracted alignment of left child : %s ", printBinaryTree(subTree.left, True, False))

            #the left child starts where the subtree starts, so the offset is reused
            assert offset == subTreeTraversalIDs[subTree.left].midStart
            childXSeqFile = getTempFile()
            leftMid = subTreeTraversalIDs[subTree.left].mid - offset
            extractSubAlignment(childXAlignmentFile, leftMid, leftMid + 1, childXSeqFile)
            seqFiles[subTree.left.traversalID.mid] = childXSeqFile
            logger.info("Extracted sequence of left child : %s ", printBinaryTree(subTree.left, True, False))
        if subTree.right.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childYAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, subTreeTraversalIDs[subTree].mid + 1 - offset,
                                subTreeTraversalIDs[subTree].midEnd - offset, childYAlignmentFile)
            alignmentFiles[subTree.right.traversalID.mid] = childYAlignmentFile
            logger.info("Extracted alignment of right child : %s ", printBinaryTree(subTree.right, True, False))

            #positions inside the extracted file are relative to the right child's own start
            offset = subTreeTraversalIDs[subTree.right].midStart
            childYSeqFile = getTempFile()
            rightMid = subTreeTraversalIDs[subTree.right].mid - offset
            extractSubAlignment(childYAlignmentFile, rightMid, rightMid + 1, childYSeqFile)
            seqFiles[subTree.right.traversalID.mid] = childYSeqFile
            logger.info("Extracted sequence of right child : %s ", printBinaryTree(subTree.right, True, False))

        subTree.left.iD = labels[subTree.left.traversalID.mid] #labels tree, so we only print relevant bits
        subTree.right.iD = labels[subTree.right.traversalID.mid]
        os.remove(alignmentFile)
        logger.info("Finished loop and reduced tree to : %s ", printBinaryTree(subTree, True, False))

    startTime = time.time()
    makeAlignment(binaryTree, seqFiles, outputFile, outputScoreFile, alignerArgs)
    logger.info("Finished final nested alignment, time taken : %s (seconds)", time.time() - startTime)
    alignmentFiles[binaryTree.traversalID.mid] = outputFile
    mergeTogetherAllAlignments(binaryTree, alignmentFiles, labels, [0])
    logger.info("Merged together all alignments")

    #only the odd slots are removed; the even slots hold the caller's leaf files
    #NOTE(review): the temporary files recorded in alignmentFiles are not removed
    #here -- presumably mergeTogetherAllAlignments disposes of them; confirm.
    for i in xrange(1, nodeNo, 2):
        if seqFiles[i] is not None:
            os.remove(seqFiles[i])
    removeInternalIDs(binaryTree)
    logger.info("Have cleaned up, and am returning")