Example #1
0
def mergeAlignments(daddyAlignmentFile, childAlignmentFile, childSeq, childSeq2, daddySeqNo, childSeqNo, newAlignmentFile, nodeLabels):
    """Splice a child alignment into a parent ("daddy") alignment.

    Row ``childSeq`` of the daddy alignment and row ``childSeq2`` of the child
    alignment are used to keep the two alignments in step: a daddy column that
    is non-gap in row ``childSeq`` is paired with the next child column that is
    non-gap in row ``childSeq2``. In the output the daddy's row ``childSeq`` is
    replaced by all rows of the child, giving
    ``daddySeqNo - 1 + childSeqNo`` rows in total.

    Args:
        daddyAlignmentFile: parent alignment, read with multiFastaRead.
        childAlignmentFile: child alignment, read with multiFastaRead.
        childSeq: index of the shared row within a daddy column.
        childSeq2: index of the shared row within a child column.
        daddySeqNo: number of rows in the daddy alignment.
        childSeqNo: number of rows in the child alignment.
        newAlignmentFile: destination handed to concatanateSeqFiles.
        nodeLabels: row labels; the first ``totalSeqNo`` entries are used.

    Raises:
        TypeError: if the child alignment is exhausted while the daddy still
            has a non-gap entry in row ``childSeq`` (the ``None`` sentinel is
            then indexed).
    """
    childGapColumn = ['-'] * childSeqNo
    topDaddyGapColumn = ['-'] * childSeq
    bottomDaddyGapColumn = ['-'] * (daddySeqNo - (childSeq + 1))

    totalSeqNo = daddySeqNo - 1 + childSeqNo
    outputFiles, outputIters = getOpenSeqFiles(totalSeqNo, getTempFile)

    childIter = multiFastaRead(childAlignmentFile)

    def nextChild():
        # Builtin next() with a default works on both Python 2.6+ and 3 and
        # yields None once the child alignment has no more columns.
        return next(childIter, None)

    childColumn = nextChild()
    for daddyColumn in multiFastaRead(daddyAlignmentFile):
        if daddyColumn[childSeq] != '-':
            # Child columns that are gapped in the shared row have no daddy
            # counterpart: emit them padded with daddy gaps on both sides.
            while childColumn[childSeq2] == '-':
                outputMergedColumn(outputIters, topDaddyGapColumn, childColumn, bottomDaddyGapColumn)
                childColumn = nextChild()
            # Both alignments have a residue in the shared row: emit the
            # daddy rows around the full child column.
            outputMergedColumn(outputIters, daddyColumn[:childSeq], childColumn, daddyColumn[childSeq + 1:])
            childColumn = nextChild()
        else:
            # Daddy column is gapped in the shared row: the child contributes
            # an all-gap column.
            outputMergedColumn(outputIters, daddyColumn[:childSeq], childGapColumn, daddyColumn[childSeq + 1:])
    # Flush any child columns left after the daddy alignment ends.
    while childColumn is not None:
        outputMergedColumn(outputIters, topDaddyGapColumn, childColumn, bottomDaddyGapColumn)
        childColumn = nextChild()
    closeSeqIterators(outputIters, totalSeqNo)
    concatanateSeqFiles(outputFiles, newAlignmentFile, totalSeqNo, nodeLabels[0:totalSeqNo])
    removeSeqFiles(outputFiles, totalSeqNo)
Example #2
0
def countGaplessColumns(alignment):
    """Count the columns of an alignment and how many of them contain no gap.

    Every item yielded by multiFastaRead(alignment) is treated as one column;
    a column is gapless when '-' is not among its entries.

    Returns:
        A ``(gaplessColumns, totalColumns)`` tuple.
    """
    columnsSeen = 0
    columnsWithGap = 0
    for col in multiFastaRead(alignment):
        columnsSeen += 1
        if '-' in col:
            columnsWithGap += 1
    # Gapless columns are simply those not counted as gapped.
    return columnsSeen - columnsWithGap, columnsSeen
Example #3
0
def countGaplessColumns(alignment):
    """Return ``(gapless, total)`` column counts for an alignment.

    Columns come from multiFastaRead(alignment); a column counts as gapless
    when it does not contain '-'.
    """
    gaplessCount = 0
    columnCount = 0
    # enumerate(..., 1) leaves columnCount equal to the number of columns
    # read; it stays 0 when the alignment yields nothing.
    for columnCount, col in enumerate(multiFastaRead(alignment), 1):
        gaplessCount += '-' not in col
    return gaplessCount, columnCount
Example #4
0
def getGaplessAlignment(alignment, seqNo):
    """Write the gap-free columns of an alignment to a new temporary file.

    A column is kept only when it contains no '-' at all; for each kept
    column the first ``seqNo`` entries are written, one per output sequence.
    The output sequences are labelled "0" .. str(seqNo - 1).

    Args:
        alignment: alignment source, read with multiFastaRead.
        seqNo: number of sequences (rows) to write.

    Returns:
        The temporary file (from getTempFile) holding the gapless alignment.
    """
    outputAlignment = getTempFile()
    outputFiles, outputIters = getOpenSeqFiles(seqNo, getTempFile)
    # range() works on both Python 2 and 3; build it once rather than once
    # per column.
    seqIndices = range(seqNo)
    for column in multiFastaRead(alignment):
        if '-' in column:
            continue
        for i in seqIndices:
            outputIters[i].write(column[i])
    closeSeqIterators(outputIters, seqNo)
    concatanateSeqFiles(outputFiles, outputAlignment, seqNo, [str(i) for i in seqIndices])
    removeSeqFiles(outputFiles, seqNo)
    return outputAlignment
Example #5
0
def extractSubAlignment(alignmentFile, startSeq, endSeq, newAlignmentFile):
    """Extract rows ``startSeq`` .. ``endSeq - 1`` of an alignment.

    Columns in which every selected row is a gap ('-') are dropped; all other
    columns are written restricted to the selected rows. The output sequences
    are labelled with their original row indices as strings.

    Args:
        alignmentFile: alignment source, read with multiFastaRead.
        startSeq: index of the first row to keep (inclusive).
        endSeq: index one past the last row to keep (exclusive).
        newAlignmentFile: destination handed to concatanateSeqFiles.
    """
    seqNo = endSeq - startSeq
    outputFiles, outputIters = getOpenSeqFiles(seqNo, getTempFile)
    # range() works on both Python 2 and 3; build it once and reuse it.
    rows = range(startSeq, endSeq)
    for column in multiFastaRead(alignmentFile):
        # Keep the column as soon as one selected row holds a residue.
        if any(column[i] != '-' for i in rows):
            for j in rows:
                outputIters[j - startSeq].write(column[j])
    closeSeqIterators(outputIters, seqNo)
    concatanateSeqFiles(outputFiles, newAlignmentFile, seqNo, [str(i) for i in rows])
    removeSeqFiles(outputFiles, seqNo)
Example #6
0
def getGaplessAlignment(alignment, seqNo):
    """Copy only the gap-free columns of an alignment into a temporary file.

    Columns containing '-' anywhere are skipped. For every remaining column,
    entries 0 .. seqNo - 1 are appended to the matching per-sequence output,
    and the sequences are finally concatenated under the labels
    "0" .. str(seqNo - 1).

    Args:
        alignment: alignment source, read with multiFastaRead.
        seqNo: number of sequences (rows) to write.

    Returns:
        The temporary file (from getTempFile) that received the result.
    """
    outputAlignment = getTempFile()
    outputFiles, outputIters = getOpenSeqFiles(seqNo, getTempFile)
    # A single range() replaces the Python 2-only xrange() that was rebuilt
    # for every column.
    seqIndices = range(seqNo)
    for column in multiFastaRead(alignment):
        if '-' not in column:
            for i in seqIndices:
                outputIters[i].write(column[i])
    closeSeqIterators(outputIters, seqNo)
    concatanateSeqFiles(outputFiles, outputAlignment, seqNo,
                        [str(i) for i in seqIndices])
    removeSeqFiles(outputFiles, seqNo)
    return outputAlignment
Example #7
0
def mergeAlignments(daddyAlignmentFile, childAlignmentFile, childSeq,
                    childSeq2, daddySeqNo, childSeqNo, newAlignmentFile,
                    nodeLabels):
    """Merge a child alignment into its parent ("daddy") alignment.

    The daddy's row ``childSeq`` and the child's row ``childSeq2`` drive the
    merge: each daddy column with a residue in row ``childSeq`` is matched to
    the next child column with a residue in row ``childSeq2``. The output
    drops the daddy's row ``childSeq`` and inserts every child row in its
    place, so it has ``daddySeqNo - 1 + childSeqNo`` rows.

    Args:
        daddyAlignmentFile: parent alignment, read with multiFastaRead.
        childAlignmentFile: child alignment, read with multiFastaRead.
        childSeq: index of the shared row within a daddy column.
        childSeq2: index of the shared row within a child column.
        daddySeqNo: number of rows in the daddy alignment.
        childSeqNo: number of rows in the child alignment.
        newAlignmentFile: destination handed to concatanateSeqFiles.
        nodeLabels: row labels; only the first ``totalSeqNo`` are used.

    Raises:
        TypeError: if the child alignment runs out of columns while the daddy
            still has a residue in row ``childSeq`` (``None`` gets indexed).
    """
    childGapColumn = ['-'] * childSeqNo
    topDaddyGapColumn = ['-'] * childSeq
    bottomDaddyGapColumn = ['-'] * (daddySeqNo - (childSeq + 1))

    totalSeqNo = daddySeqNo - 1 + childSeqNo
    outputFiles, outputIters = getOpenSeqFiles(totalSeqNo, getTempFile)

    childIter = multiFastaRead(childAlignmentFile)

    def nextChild():
        # next() with a default is portable across Python 2.6+ and 3, and
        # returns None when the child alignment is exhausted.
        return next(childIter, None)

    childColumn = nextChild()
    for daddyColumn in multiFastaRead(daddyAlignmentFile):
        if daddyColumn[childSeq] != '-':
            # Emit child-only columns (gap in the shared row) first, padded
            # with gaps for every daddy row.
            while childColumn[childSeq2] == '-':
                outputMergedColumn(outputIters, topDaddyGapColumn, childColumn,
                                   bottomDaddyGapColumn)
                childColumn = nextChild()
            # Matched column: daddy rows above and below surround the child.
            outputMergedColumn(outputIters, daddyColumn[:childSeq],
                               childColumn, daddyColumn[childSeq + 1:])
            childColumn = nextChild()
        else:
            # Daddy-only column: the child rows are all gaps.
            outputMergedColumn(outputIters, daddyColumn[:childSeq],
                               childGapColumn, daddyColumn[childSeq + 1:])
    # Trailing child columns with no remaining daddy column.
    while childColumn is not None:
        outputMergedColumn(outputIters, topDaddyGapColumn, childColumn,
                           bottomDaddyGapColumn)
        childColumn = nextChild()
    closeSeqIterators(outputIters, totalSeqNo)
    concatanateSeqFiles(outputFiles, newAlignmentFile, totalSeqNo,
                        nodeLabels[0:totalSeqNo])
    removeSeqFiles(outputFiles, totalSeqNo)
Example #8
0
def extractSubAlignment(alignmentFile, startSeq, endSeq, newAlignmentFile):
    """Write the sub-alignment made of rows ``startSeq`` .. ``endSeq - 1``.

    A column is written (restricted to the selected rows) only if at least
    one of those rows is not a gap ('-'); columns that are all-gap within the
    selection are skipped. Output sequences are labelled with the string form
    of their original row index.

    Args:
        alignmentFile: alignment source, read with multiFastaRead.
        startSeq: index of the first row to keep (inclusive).
        endSeq: index one past the last row to keep (exclusive).
        newAlignmentFile: destination handed to concatanateSeqFiles.
    """
    seqNo = endSeq - startSeq
    outputFiles, outputIters = getOpenSeqFiles(seqNo, getTempFile)
    # One range() shared by every loop; replaces the Python 2-only xrange().
    rows = range(startSeq, endSeq)
    for column in multiFastaRead(alignmentFile):
        # any() stops at the first selected row that holds a residue.
        if any(column[i] != '-' for i in rows):
            for j in rows:
                outputIters[j - startSeq].write(column[j])
    closeSeqIterators(outputIters, seqNo)
    concatanateSeqFiles(outputFiles, newAlignmentFile, seqNo,
                        [str(i) for i in rows])
    removeSeqFiles(outputFiles, seqNo)
Example #9
0
def stitchReconstruct(seqNo, inputSeqFiles, treeString, outputFile, outputScoreFile, inputAlignmentFile, alignerArgs):
    """Reconstruct an alignment with ancestors chunk by chunk and stitch it.

    The input alignment is consumed in chunks (getNextAlignmentChunk). For
    each chunk the external reconstruction program (referred to as Ortheus in
    the log messages) is run through the shell; its output is appended to
    per-node sequence files and its score to ``outputScoreFile``. When the
    chunks are exhausted the per-node files are concatenated into
    ``outputFile``.

    Args:
        seqNo: number of leaf sequences; the tree must have
            ``seqNo * 2 - 1`` nodes.
        inputSeqFiles: not used by this function.
        treeString: Newick tree; parsed here and passed to the program (-b).
        outputFile: destination of the final stitched alignment.
        outputScoreFile: file to which each chunk's score is appended.
        inputAlignmentFile: input alignment, read with multiFastaRead.
        alignerArgs: settings object; reads RECONSTRUCTION_PREFIX,
            FAST_SETTING, RECONSTRUCTION_ARGS / RECONSTRUCTION_ARGS_FAST,
            CAUTIOUS_ARGS, ALIGNMENT_CHUNK_MAX_COLUMN_SIZE,
            VITERBI_ALIGNMENT_COLUMN_GAP and EXPECTED_CHARACTER_FREQUENCIES.

    Raises:
        SystemExit: (status 1) if the program fails with both the normal and
            the cautious arguments.
    """
    startTime = time.time()  # epoch time in seconds

    logger.info("Starting Stitcher")
    reconstructionPrefix = alignerArgs.RECONSTRUCTION_PREFIX
    if alignerArgs.FAST_SETTING:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS_FAST
    else:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS
    cautiousArgs = alignerArgs.CAUTIOUS_ARGS
    alignmentChunkMaxSeqSize = alignerArgs.ALIGNMENT_CHUNK_MAX_COLUMN_SIZE
    viterbiAlignmentColumnGap = alignerArgs.VITERBI_ALIGNMENT_COLUMN_GAP
    # parse tree
    binaryTree = newickTreeParser(treeString)
    binaryTree_depthFirstNumbers(binaryTree)
    logger.info("Newick tree read : %s ", printBinaryTree(binaryTree, True))
    labels = binaryTree_nodeNames(binaryTree)
    # Every second label (even positions) is taken as a leaf label.
    leafLabels = [labels[i] for i in range(0, len(labels), 2)]
    # load alignment iterator
    alignmentReader = multiFastaRead(inputAlignmentFile, lambda x: x)
    # number of sequences, including ancestors
    nodeNumber = binaryTree.traversalID.midEnd
    assert nodeNumber == seqNo * 2 - 1
    # create output files
    outputFiles, outputIterators = getOpenSeqFiles(nodeNumber, getTempFile)
    # while has chunk
    previousAlignment = []
    alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
        previousAlignment, alignmentReader, alignmentChunkMaxSeqSize, seqNo,
        leafLabels)
    tempTreeStatesFile = getTempFile()
    # No extra options on the first chunk; later chunks pass the tree-states
    # file written by the previous run via -t.
    loopOptions = " "
    logger.info("Starting main loop")
    characterFrequenciesString = " ".join(
        [str(i) for i in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES])
    while alignmentSeqs is not None:
        if end:
            # Last chunk: no column gap is held back.
            viterbiAlignmentColumnGap = 0
        tempAncestorFile = getTempFile()
        tempScoreFile = getTempFile()
        joinedAlignmentSeqs = " ".join(alignmentSeqs)
        # NOTE(review): the command is a shell string run via os.system, so
        # treeString and the file names are interpreted by the shell. Confirm
        # that these values are always trusted.
        command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -n %s -x %s " % (
            reconstructionPrefix, treeString, alignmentFile,
            joinedAlignmentSeqs, tempTreeStatesFile,
            viterbiAlignmentColumnGap, loopOptions, reconstructionArgs,
            tempAncestorFile, characterFrequenciesString, tempScoreFile)
        logger.info("Calling Ortheus with : %s", command)
        exitValue = os.system(command)
        if exitValue != 0:
            # Any non-zero status triggers a single retry with the cautious
            # arguments; there is no filtering on the specific exit code.
            logger.info("Something went wrong calling Ortheus : %i ", exitValue)
            logger.info("Going to retry with caution settings")
            # NOTE(review): unlike the first attempt, the retry does not pass
            # -n with the character frequencies. Confirm that is intended.
            command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -x %s" % (
                reconstructionPrefix, treeString, alignmentFile,
                joinedAlignmentSeqs, tempTreeStatesFile,
                viterbiAlignmentColumnGap, loopOptions, cautiousArgs,
                tempAncestorFile, tempScoreFile)
            logger.info("Calling Ortheus with : %s", command)
            if os.system(command):
                logger.info("Already tried caution, so have to go")
                sys.exit(1)
        logger.info("Completed reconstruction of chunk")
        appendScore(tempScoreFile, outputScoreFile)
        os.remove(tempScoreFile)
        loopOptions = " -t " + tempTreeStatesFile
        # The ancestor file is read twice, using the same precomputed offsets.
        tempAncestorFastaOffsets = getMultiFastaOffsets(tempAncestorFile)
        previousAlignment = removeFromLeft(
            multiFastaRead(tempAncestorFile, lambda x: x, tempAncestorFastaOffsets),
            previousAlignment, nodeNumber, seqNo)
        appendToAlignment(
            multiFastaRead(tempAncestorFile, lambda x: x, tempAncestorFastaOffsets),
            outputIterators, nodeNumber)
        logger.info("Added reconstructed chunk to complete alignment")
        os.remove(tempAncestorFile)
        removeSeqFiles(alignmentSeqs, seqNo)
        os.remove(alignmentFile)
        logger.info("Cleaned up at end of loop")
        alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
            previousAlignment, alignmentReader, alignmentChunkMaxSeqSize,
            seqNo, leafLabels)
    logger.info("Finished main loop")
    # load into single output file
    closeSeqIterators(outputIterators, nodeNumber)
    concatanateSeqFiles(outputFiles, outputFile, nodeNumber, labels)
    logger.info("Written out alignment to single file")
    # clean up
    os.remove(tempTreeStatesFile)
    removeSeqFiles(outputFiles, nodeNumber)
    logger.info("Cleaned up final files")
    logger.info("Finished, total time taken for stitcher: %s (seconds)",
                time.time() - startTime)