def mergeAlignments(daddyAlignmentFile, childAlignmentFile, childSeq, childSeq2,
                    daddySeqNo, childSeqNo, newAlignmentFile, nodeLabels):
    """Merge a child alignment into a parent ("daddy") alignment, column by column.

    Row ``childSeq`` of the daddy alignment and row ``childSeq2`` of the child
    alignment are used to keep the two alignments in step. Each merged column
    is handed to ``outputMergedColumn`` as three pieces: the daddy rows before
    ``childSeq``, the whole child column, and the daddy rows after ``childSeq``
    (the daddy's own ``childSeq`` row is not passed on).

    NOTE: this module defines ``mergeAlignments`` a second time further down;
    the later definition is the one bound at import time.

    Parameters:
        daddyAlignmentFile: alignment read with ``multiFastaRead`` as the parent.
        childAlignmentFile: alignment read with ``multiFastaRead`` as the child.
        childSeq: row index in the daddy alignment that links to the child.
        childSeq2: row index in the child alignment that links to the daddy.
        daddySeqNo: number of rows in the daddy alignment.
        childSeqNo: number of rows in the child alignment.
        newAlignmentFile: destination passed to ``concatanateSeqFiles``.
        nodeLabels: labels; the first ``daddySeqNo - 1 + childSeqNo`` are used.
    """
    childGapColumn = ['-'] * childSeqNo
    topDaddyGapColumn = ['-'] * childSeq
    bottomDaddyGapColumn = ['-'] * (daddySeqNo - (childSeq + 1))
    totalSeqNo = daddySeqNo - 1 + childSeqNo
    outputFiles, outputIters = getOpenSeqFiles(totalSeqNo, getTempFile)
    childIter = multiFastaRead(childAlignmentFile)
    # Builtin next() with a default yields None once the child alignment is
    # exhausted and, unlike iterator.next(), is not specific to Python 2.
    childColumn = next(childIter, None)
    for daddyColumn in multiFastaRead(daddyAlignmentFile):
        if daddyColumn[childSeq] != '-':
            # Flush child-only columns (gap in the linking row) first, padded
            # with gaps in every daddy row.
            # Assumes the child alignment still has a column left here; an
            # exhausted child (None) would fail on the subscript below.
            while childColumn[childSeq2] == '-':
                outputMergedColumn(outputIters, topDaddyGapColumn, childColumn,
                                   bottomDaddyGapColumn)
                childColumn = next(childIter, None)
            # Both linking rows hold a residue: emit the joined column.
            outputMergedColumn(outputIters, daddyColumn[:childSeq], childColumn,
                               daddyColumn[childSeq + 1:])
            childColumn = next(childIter, None)
        else:
            # Daddy-only column: pad the child rows with gaps.
            outputMergedColumn(outputIters, daddyColumn[:childSeq], childGapColumn,
                               daddyColumn[childSeq + 1:])
    # Any child columns left after the daddy alignment ends are child-only.
    while childColumn is not None:
        outputMergedColumn(outputIters, topDaddyGapColumn, childColumn,
                           bottomDaddyGapColumn)
        childColumn = next(childIter, None)
    closeSeqIterators(outputIters, totalSeqNo)
    concatanateSeqFiles(outputFiles, newAlignmentFile, totalSeqNo,
                        nodeLabels[0:totalSeqNo])
    removeSeqFiles(outputFiles, totalSeqNo)
def countGaplessColumns(alignment):
    """Count the columns of an alignment that contain no gap character.

    Parameters:
        alignment: alignment handed to ``multiFastaRead`` for column iteration.

    Returns:
        A ``(gapless, total)`` tuple: the number of columns without ``'-'``
        and the number of columns read.
    """
    gaplessCount = 0
    columnCount = 0
    for column in multiFastaRead(alignment):
        columnCount += 1
        if '-' in column:
            continue
        gaplessCount += 1
    return gaplessCount, columnCount
def getGaplessAlignment(alignment, seqNo):
    """Write the gap-free columns of an alignment to a new temporary file.

    NOTE: this module defines ``getGaplessAlignment`` a second time further
    down; the later definition is the one bound at import time.

    Parameters:
        alignment: alignment handed to ``multiFastaRead`` for column iteration.
        seqNo: number of rows to copy from each kept column.

    Returns:
        The temporary file (from ``getTempFile``) that receives the result.
        Rows are labelled "0" .. str(seqNo - 1).
    """
    gaplessFile = getTempFile()
    seqFiles, seqWriters = getOpenSeqFiles(seqNo, getTempFile)
    for column in multiFastaRead(alignment):
        # Skip any column that has a gap in at least one row.
        if '-' in column:
            continue
        for row in xrange(seqNo):
            seqWriters[row].write(column[row])
    closeSeqIterators(seqWriters, seqNo)
    rowLabels = [str(row) for row in xrange(seqNo)]
    concatanateSeqFiles(seqFiles, gaplessFile, seqNo, rowLabels)
    removeSeqFiles(seqFiles, seqNo)
    return gaplessFile
def extractSubAlignment(alignmentFile, startSeq, endSeq, newAlignmentFile):
    """Extract rows ``startSeq`` .. ``endSeq - 1`` of an alignment.

    Columns in which every selected row is a gap are dropped; all other
    columns are copied, restricted to the selected rows.

    NOTE: this module defines ``extractSubAlignment`` a second time further
    down; the later definition is the one bound at import time.

    Parameters:
        alignmentFile: alignment handed to ``multiFastaRead``.
        startSeq: index of the first row to keep (inclusive).
        endSeq: index one past the last row to keep (exclusive).
        newAlignmentFile: destination passed to ``concatanateSeqFiles``; rows
            are labelled with their original indices as strings.
    """
    rowCount = endSeq - startSeq
    seqFiles, seqWriters = getOpenSeqFiles(rowCount, getTempFile)
    for column in multiFastaRead(alignmentFile):
        # Keep the column only if at least one selected row holds a residue.
        if any(column[row] != '-' for row in xrange(startSeq, endSeq)):
            for row in xrange(startSeq, endSeq):
                seqWriters[row - startSeq].write(column[row])
    closeSeqIterators(seqWriters, rowCount)
    rowLabels = [str(row) for row in xrange(startSeq, endSeq)]
    concatanateSeqFiles(seqFiles, newAlignmentFile, rowCount, rowLabels)
    removeSeqFiles(seqFiles, rowCount)
def getGaplessAlignment(alignment, seqNo):
    """Copy only the gap-free columns of an alignment into a temporary file.

    NOTE: an earlier definition of ``getGaplessAlignment`` exists above in
    this module; this later one replaces it at import time.

    Parameters:
        alignment: alignment handed to ``multiFastaRead`` for column iteration.
        seqNo: number of rows written for each kept column.

    Returns:
        The temporary file obtained from ``getTempFile`` that holds the
        filtered alignment, with rows labelled "0" .. str(seqNo - 1).
    """
    resultFile = getTempFile()
    rowFiles, rowWriters = getOpenSeqFiles(seqNo, getTempFile)
    for column in multiFastaRead(alignment):
        hasGap = '-' in column
        if not hasGap:
            for index in xrange(seqNo):
                rowWriters[index].write(column[index])
    closeSeqIterators(rowWriters, seqNo)
    labels = [str(index) for index in xrange(seqNo)]
    concatanateSeqFiles(rowFiles, resultFile, seqNo, labels)
    removeSeqFiles(rowFiles, seqNo)
    return resultFile
def mergeAlignments(daddyAlignmentFile, childAlignmentFile, childSeq, childSeq2,
                    daddySeqNo, childSeqNo, newAlignmentFile, nodeLabels):
    """Merge a child alignment into a parent ("daddy") alignment, column by column.

    The daddy row ``childSeq`` and the child row ``childSeq2`` act as the
    link between the two alignments. Every output column is passed to
    ``outputMergedColumn`` in three parts: daddy rows above ``childSeq``, the
    child column, and daddy rows below ``childSeq`` (the daddy's ``childSeq``
    row itself is left out).

    NOTE: an earlier definition of ``mergeAlignments`` exists above in this
    module; this later one replaces it at import time.

    Parameters:
        daddyAlignmentFile: alignment read with ``multiFastaRead`` as the parent.
        childAlignmentFile: alignment read with ``multiFastaRead`` as the child.
        childSeq: row index in the daddy alignment that links to the child.
        childSeq2: row index in the child alignment that links to the daddy.
        daddySeqNo: number of rows in the daddy alignment.
        childSeqNo: number of rows in the child alignment.
        newAlignmentFile: destination passed to ``concatanateSeqFiles``.
        nodeLabels: labels; the first ``daddySeqNo - 1 + childSeqNo`` are used.
    """
    childGapColumn = ['-'] * childSeqNo
    topDaddyGapColumn = ['-'] * childSeq
    bottomDaddyGapColumn = ['-'] * (daddySeqNo - (childSeq + 1))
    totalSeqNo = daddySeqNo - 1 + childSeqNo
    outputFiles, outputIters = getOpenSeqFiles(totalSeqNo, getTempFile)
    childIter = multiFastaRead(childAlignmentFile)
    # next(iterator, None) returns None at exhaustion; it replaces the
    # Python-2-only iterator.next() call wrapped in try/except StopIteration.
    childColumn = next(childIter, None)
    for daddyColumn in multiFastaRead(daddyAlignmentFile):
        if daddyColumn[childSeq] != '-':
            # Emit child-only columns (gap in the child's linking row) with
            # gaps in all daddy rows until the child catches up.
            # Assumes a child column is still available at this point; None
            # (child exhausted) would fail on the subscript below.
            while childColumn[childSeq2] == '-':
                outputMergedColumn(outputIters, topDaddyGapColumn, childColumn,
                                   bottomDaddyGapColumn)
                childColumn = next(childIter, None)
            # Linking rows both hold a residue: join daddy and child columns.
            outputMergedColumn(outputIters, daddyColumn[:childSeq], childColumn,
                               daddyColumn[childSeq + 1:])
            childColumn = next(childIter, None)
        else:
            # Daddy-only column: the child rows are all gaps.
            outputMergedColumn(outputIters, daddyColumn[:childSeq], childGapColumn,
                               daddyColumn[childSeq + 1:])
    # Remaining child columns have no daddy counterpart.
    while childColumn is not None:
        outputMergedColumn(outputIters, topDaddyGapColumn, childColumn,
                           bottomDaddyGapColumn)
        childColumn = next(childIter, None)
    closeSeqIterators(outputIters, totalSeqNo)
    concatanateSeqFiles(outputFiles, newAlignmentFile, totalSeqNo,
                        nodeLabels[0:totalSeqNo])
    removeSeqFiles(outputFiles, totalSeqNo)
def extractSubAlignment(alignmentFile, startSeq, endSeq, newAlignmentFile):
    """Write the sub-alignment made of rows ``startSeq`` .. ``endSeq - 1``.

    A column is copied (restricted to the selected rows) when at least one of
    those rows is not a gap; columns that are all gaps in the selection are
    skipped.

    NOTE: an earlier definition of ``extractSubAlignment`` exists above in
    this module; this later one replaces it at import time.

    Parameters:
        alignmentFile: alignment handed to ``multiFastaRead``.
        startSeq: index of the first row to keep (inclusive).
        endSeq: index one past the last row to keep (exclusive).
        newAlignmentFile: destination passed to ``concatanateSeqFiles``; rows
            are labelled with their original indices as strings.
    """
    seqNo = endSeq - startSeq
    subFiles, subWriters = getOpenSeqFiles(seqNo, getTempFile)
    for column in multiFastaRead(alignmentFile):
        keepColumn = any(column[index] != '-'
                         for index in xrange(startSeq, endSeq))
        if not keepColumn:
            continue
        for index in xrange(startSeq, endSeq):
            subWriters[index - startSeq].write(column[index])
    closeSeqIterators(subWriters, seqNo)
    labels = [str(index) for index in xrange(startSeq, endSeq)]
    concatanateSeqFiles(subFiles, newAlignmentFile, seqNo, labels)
    removeSeqFiles(subFiles, seqNo)
def stitchReconstruct(seqNo, inputSeqFiles, treeString, outputFile, outputScoreFile,
                      inputAlignmentFile, alignerArgs):
    """Reconstruct an alignment chunk by chunk with an external program.

    The input alignment is consumed in chunks via ``getNextAlignmentChunk``.
    For every chunk a shell command is built from ``alignerArgs`` and run
    with ``os.system``; on a non-zero exit status it is retried once with the
    cautious argument set, and the process exits with status 1 if the retry
    also fails. Each chunk's result is appended to per-node output files,
    which are concatenated into ``outputFile`` at the end.

    Parameters:
        seqNo: number of leaf sequences; the tree must have
            ``seqNo * 2 - 1`` nodes (asserted).
        inputSeqFiles: not referenced in this function body.
        treeString: Newick tree string, parsed here and passed to the command.
        outputFile: destination of the final concatenated alignment.
        outputScoreFile: file to which each chunk's score file is appended.
        inputAlignmentFile: alignment read with ``multiFastaRead``.
        alignerArgs: object supplying RECONSTRUCTION_PREFIX, FAST_SETTING,
            RECONSTRUCTION_ARGS / RECONSTRUCTION_ARGS_FAST, CAUTIOUS_ARGS,
            ALIGNMENT_CHUNK_MAX_COLUMN_SIZE, VITERBI_ALIGNMENT_COLUMN_GAP and
            EXPECTED_CHARACTER_FREQUENCIES.
    """
    startTime = time.time()  # epoch time in seconds
    logger.info("Starting Stitcher")

    reconstructionPrefix = alignerArgs.RECONSTRUCTION_PREFIX
    reconstructionArgs = (alignerArgs.RECONSTRUCTION_ARGS_FAST
                          if alignerArgs.FAST_SETTING
                          else alignerArgs.RECONSTRUCTION_ARGS)
    cautiousArgs = alignerArgs.CAUTIOUS_ARGS
    alignmentChunkMaxSeqSize = alignerArgs.ALIGNMENT_CHUNK_MAX_COLUMN_SIZE
    viterbiAlignmentColumnGap = alignerArgs.VITERBI_ALIGNMENT_COLUMN_GAP

    # Parse the tree and collect node labels; every second label (even
    # index) is taken as a leaf label.
    binaryTree = newickTreeParser(treeString)
    binaryTree_depthFirstNumbers(binaryTree)
    logger.info("Newick tree read : %s " % printBinaryTree(binaryTree, True))
    labels = binaryTree_nodeNames(binaryTree)
    leafLabels = [labels[i] for i in xrange(0, len(labels), 2)]

    # Column iterator over the input alignment.
    alignmentReader = multiFastaRead(inputAlignmentFile, lambda x : x)

    # Number of sequences, including ancestors.
    nodeNumber = binaryTree.traversalID.midEnd
    assert nodeNumber == seqNo * 2 - 1

    # One output file per node.
    outputFiles, outputIterators = getOpenSeqFiles(nodeNumber, getTempFile)

    previousAlignment = []
    alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
        previousAlignment, alignmentReader, alignmentChunkMaxSeqSize, seqNo, leafLabels)
    tempTreeStatesFile = getTempFile()
    loopOptions = " "
    logger.info("Starting main loop")
    characterFrequenciesString = " ".join(
        map(str, alignerArgs.EXPECTED_CHARACTER_FREQUENCIES))

    while alignmentSeqs is not None:
        if end:
            viterbiAlignmentColumnGap = 0
        tempAncestorFile = getTempFile()
        tempScoreFile = getTempFile()

        # Arguments common to the normal and the cautious invocation.
        sharedArgs = (reconstructionPrefix, treeString, alignmentFile,
                      " ".join(alignmentSeqs), tempTreeStatesFile,
                      viterbiAlignmentColumnGap, loopOptions)
        command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -n %s -x %s " % (
            sharedArgs + (reconstructionArgs, tempAncestorFile,
                          characterFrequenciesString, tempScoreFile))
        logger.info("Calling Ortheus with : \n%s", command)
        exitValue = os.system(command)
        if exitValue != 0:
            # Any non-zero exit status triggers a single cautious retry.
            logger.info("Something went wrong calling Ortheus : %i ", exitValue)
            logger.info("Going to retry with caution settings")
            command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -x %s" % (
                sharedArgs + (cautiousArgs, tempAncestorFile, tempScoreFile))
            logger.info("Calling Ortheus with : %s", command)
            if os.system(command):
                logger.info("Already tried caution, so have to go")
                sys.exit(1)
        logger.info("Completed reconstruction of chunk")

        appendScore(tempScoreFile, outputScoreFile)
        os.remove(tempScoreFile)
        # From the second chunk on, pass the tree states file with -t.
        loopOptions = " -t " + tempTreeStatesFile

        # The chunk result is read twice from the same offsets: once to
        # update previousAlignment, once to extend the output.
        tempAncestorFastaOffsets = getMultiFastaOffsets(tempAncestorFile)
        previousAlignment = removeFromLeft(
            multiFastaRead(tempAncestorFile, lambda x : x, tempAncestorFastaOffsets),
            previousAlignment, nodeNumber, seqNo)
        appendToAlignment(
            multiFastaRead(tempAncestorFile, lambda x : x, tempAncestorFastaOffsets),
            outputIterators, nodeNumber)
        logger.info("Added reconstructed chunk to complete alignment")

        os.remove(tempAncestorFile)
        removeSeqFiles(alignmentSeqs, seqNo)
        os.remove(alignmentFile)
        logger.info("Cleaned up at end of loop")

        alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
            previousAlignment, alignmentReader, alignmentChunkMaxSeqSize, seqNo, leafLabels)
    logger.info("Finished main loop")

    # Load everything into the single output file.
    closeSeqIterators(outputIterators, nodeNumber)
    concatanateSeqFiles(outputFiles, outputFile, nodeNumber, labels)
    logger.info("Written out alignment to single file")

    # Clean up.
    os.remove(tempTreeStatesFile)
    removeSeqFiles(outputFiles, nodeNumber)
    logger.info("Cleaned up final files")
    logger.info("Finished, total time taken for stitcher: %s (seconds)" % (time.time()-startTime))