Beispiel #1
0
def stitchReconstruct(seqNo, inputSeqFiles, treeString, outputFile, outputScoreFile, inputAlignmentFile, alignerArgs):
    """Reconstruct an alignment chunk by chunk and stitch the chunks together.

    The input alignment is handed out in chunks by getNextAlignmentChunk. For
    each chunk the external reconstruction command (called "Ortheus" in the
    log messages) is run through the shell. Its score file is appended to
    outputScoreFile and its ancestor output is appended to one temporary file
    per tree node. Once no chunk is left, the per-node files are concatenated
    into outputFile and the temporary files are removed.

    Parameters:
        seqNo: number of leaf sequences; the parsed tree must contain
            seqNo * 2 - 1 nodes (asserted).
        inputSeqFiles: not used by this function.
        treeString: newick tree string; parsed here and also passed to the
            external command with -b.
        outputFile: file the final stitched alignment is written to.
        outputScoreFile: file each chunk's score file is appended to.
        inputAlignmentFile: multi-fasta alignment that is read chunk by chunk.
        alignerArgs: settings object. Reads RECONSTRUCTION_PREFIX,
            FAST_SETTING, RECONSTRUCTION_ARGS / RECONSTRUCTION_ARGS_FAST,
            CAUTIOUS_ARGS, ALIGNMENT_CHUNK_MAX_COLUMN_SIZE,
            VITERBI_ALIGNMENT_COLUMN_GAP and EXPECTED_CHARACTER_FREQUENCIES.

    Exits the process with status 1 (sys.exit) when the external command
    fails both with the normal and with the cautious arguments.
    """
    startTime = time.time()  # epoch time in seconds

    logger.info("Starting Stitcher")
    reconstructionPrefix = alignerArgs.RECONSTRUCTION_PREFIX
    if alignerArgs.FAST_SETTING:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS_FAST
    else:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS
    cautiousArgs = alignerArgs.CAUTIOUS_ARGS
    alignmentChunkMaxSeqSize = alignerArgs.ALIGNMENT_CHUNK_MAX_COLUMN_SIZE
    viterbiAlignmentColumnGap = alignerArgs.VITERBI_ALIGNMENT_COLUMN_GAP

    # parse tree
    binaryTree = newickTreeParser(treeString)
    binaryTree_depthFirstNumbers(binaryTree)
    logger.info("Newick tree read : %s ", printBinaryTree(binaryTree, True))
    labels = binaryTree_nodeNames(binaryTree)
    # every second label (even index) is used as a leaf label
    leafLabels = [labels[i] for i in range(0, len(labels), 2)]

    # load alignment iterator
    alignmentReader = multiFastaRead(inputAlignmentFile, lambda x: x)

    # number of sequences, including ancestors
    nodeNumber = binaryTree.traversalID.midEnd
    assert nodeNumber == seqNo * 2 - 1

    # create output files, one per node
    outputFiles, outputIterators = getOpenSeqFiles(nodeNumber, getTempFile)

    previousAlignment = []
    alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
        previousAlignment, alignmentReader, alignmentChunkMaxSeqSize, seqNo, leafLabels)
    tempTreeStatesFile = getTempFile()
    # extra command options; from the second chunk on this carries "-t <tree states file>"
    loopOptions = " "
    logger.info("Starting main loop")
    characterFrequenciesString = " ".join(
        str(frequency) for frequency in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES)

    # getNextAlignmentChunk signals "no more chunks" with alignmentSeqs set to None
    while alignmentSeqs is not None:
        if end:
            # last chunk: the column gap is set to 0
            viterbiAlignmentColumnGap = 0
        tempAncestorFile = getTempFile()
        tempScoreFile = getTempFile()
        joinedAlignmentSeqs = " ".join(alignmentSeqs)
        command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -n %s -x %s " % (
            reconstructionPrefix, treeString, alignmentFile,
            joinedAlignmentSeqs, tempTreeStatesFile,
            viterbiAlignmentColumnGap, loopOptions, reconstructionArgs,
            tempAncestorFile, characterFrequenciesString, tempScoreFile)
        logger.info("Calling Ortheus with : %s", command)
        exitValue = os.system(command)
        if exitValue != 0:
            # Any non-zero status triggers one retry with the cautious
            # arguments; the status value itself is not inspected further.
            logger.warning("Something went wrong calling Ortheus : %i ", exitValue)
            logger.info("Going to retry with caution settings")
            # NOTE(review): the cautious command does not pass -n (character
            # frequencies); kept exactly as before -- confirm this is intended.
            command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -x %s" % (
                reconstructionPrefix, treeString, alignmentFile,
                joinedAlignmentSeqs, tempTreeStatesFile,
                viterbiAlignmentColumnGap, loopOptions, cautiousArgs,
                tempAncestorFile, tempScoreFile)
            logger.info("Calling Ortheus with : %s", command)
            if os.system(command):
                # reported as an error so the reason for the exit is visible
                # even when INFO messages are filtered out
                logger.error("Already tried caution, so have to go")
                sys.exit(1)
        logger.info("Completed reconstruction of chunk")

        appendScore(tempScoreFile, outputScoreFile)
        os.remove(tempScoreFile)

        # following chunks are given the tree states file written for this one
        loopOptions = " -t " + tempTreeStatesFile

        # the ancestor file is read twice, so its offsets are computed once
        tempAncestorFastaOffsets = getMultiFastaOffsets(tempAncestorFile)
        previousAlignment = removeFromLeft(
            multiFastaRead(tempAncestorFile, lambda x: x, tempAncestorFastaOffsets),
            previousAlignment, nodeNumber, seqNo)
        appendToAlignment(
            multiFastaRead(tempAncestorFile, lambda x: x, tempAncestorFastaOffsets),
            outputIterators, nodeNumber)
        logger.info("Added reconstructed chunk to complete alignment")

        os.remove(tempAncestorFile)
        removeSeqFiles(alignmentSeqs, seqNo)
        os.remove(alignmentFile)
        logger.info("Cleaned up at end of loop")

        alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
            previousAlignment, alignmentReader, alignmentChunkMaxSeqSize, seqNo, leafLabels)
    logger.info("Finished main loop")

    # load into single output file
    closeSeqIterators(outputIterators, nodeNumber)
    concatanateSeqFiles(outputFiles, outputFile, nodeNumber, labels)
    logger.info("Written out alignment to single file")

    # clean up
    os.remove(tempTreeStatesFile)
    removeSeqFiles(outputFiles, nodeNumber)
    logger.info("Cleaned up final files")
    logger.info("Finished, total time taken for stitcher: %s (seconds)", time.time() - startTime)
Beispiel #2
0
def nestAlign(binaryTree, leafSeqFiles, outputFile, outputScoreFile, alignerArgs):
    """Align a tree in nested steps and merge the partial alignments.

    calculatePath selects a list of sub trees (given the node costs and
    alignerArgs.MAX_NODE_NO). Each sub tree is aligned with makeAlignment into
    a temporary file. For every internal child of the sub tree, the child's
    part of that alignment and the child's own sequence are extracted into
    temporary files; the sequence file is stored in seqFiles, which is passed
    to the following makeAlignment calls. Finally the whole tree is aligned
    into outputFile and mergeTogetherAllAlignments combines the stored
    alignments.

    Parameters:
        binaryTree: tree to align; it is numbered here with
            binaryTree_depthFirstNumbers and its internal ids are removed on
            entry and again before returning.
        leafSeqFiles: sequence files of the leaves; entry i is stored at
            index i * 2 of the per-node file list.
        outputFile: file receiving the final alignment.
        outputScoreFile: passed through to every makeAlignment call.
        alignerArgs: settings object; MAX_NODE_NO is read here, the object is
            also passed on to makeAlignment.
    """
    logger.info("Starting Nester")
    maxNodeNo = alignerArgs.MAX_NODE_NO

    removeInternalIDs(binaryTree)

    logger.info("Binary tree : %s ", printBinaryTree(binaryTree, True, False))
    binaryTree_depthFirstNumbers(binaryTree)
    nodeNo = binaryTree.traversalID.midEnd
    logger.info("Labelled tree with numbers ")

    logger.info(" Sequence files : %s", " ".join(leafSeqFiles))
    logger.info("Output file %s ", outputFile)

    labels = binaryTree_nodeNames(binaryTree)
    costs = calculateTreeNodeCosts(binaryTree)
    logger.info("Calculated node costs")
    for node in range(nodeNo):
        logger.info("Node : %s , reconstruction value : %f , %f",
                    labels[node], costs[node], 1.0 - costs[node])
    pathCost, treePath = calculatePath(binaryTree, costs, maxNodeNo)
    logger.info(" Calculated nested path. Cost : %f , Path : %s",
                pathCost, " ".join(labels[i.traversalID.mid] for i in treePath))

    # Per-node bookkeeping, indexed by traversal id.
    alignmentFiles = [None] * nodeNo
    seqFiles = [None] * nodeNo
    # leaf i is stored at the even index i * 2
    for i, leafSeqFile in enumerate(leafSeqFiles):
        seqFiles[i * 2] = leafSeqFile

    logger.debug("About to start main nested loop")
    for subTree in treePath:
        assert subTree != binaryTree
        logger.info("Chosen sub tree to align : %s ", printBinaryTree(subTree, True, False))
        alignmentFile = getTempFile()
        startTime = time.time()
        makeAlignment(subTree, seqFiles, alignmentFile, outputScoreFile, alignerArgs)
        logger.info("Made alignment of subtree, time taken : %s (seconds)", time.time() - startTime)

        # get the two ancestors
        # traversal ids computed without relabelling the tree (labelTree=False);
        # the rows of alignmentFile are addressed relative to midStart below
        subTreeTraversalIDs = binaryTree_depthFirstNumbers(subTree, labelTree=False, dontStopAtID=False)

        if subTree.left.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childXAlignmentFile = getTempFile()
            # rows before the sub tree root
            extractSubAlignment(alignmentFile, 0,
                                subTreeTraversalIDs[subTree].mid - offset,
                                childXAlignmentFile)
            alignmentFiles[subTree.left.traversalID.mid] = childXAlignmentFile
            logger.info("Extracted alignment of left child : %s ", printBinaryTree(subTree.left, True, False))

            # the left child starts where the sub tree starts, so the same offset applies
            assert offset == subTreeTraversalIDs[subTree.left].midStart
            childXSeqFile = getTempFile()
            # single row: the left child's own sequence
            extractSubAlignment(childXAlignmentFile,
                                subTreeTraversalIDs[subTree.left].mid - offset,
                                subTreeTraversalIDs[subTree.left].mid - offset + 1,
                                childXSeqFile)
            seqFiles[subTree.left.traversalID.mid] = childXSeqFile
            logger.info("Extracted sequence of left child : %s ", printBinaryTree(subTree.left, True, False))

        if subTree.right.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childYAlignmentFile = getTempFile()
            # rows after the sub tree root
            extractSubAlignment(alignmentFile,
                                subTreeTraversalIDs[subTree].mid + 1 - offset,
                                subTreeTraversalIDs[subTree].midEnd - offset,
                                childYAlignmentFile)
            alignmentFiles[subTree.right.traversalID.mid] = childYAlignmentFile
            logger.info("Extracted alignment of right child : %s ", printBinaryTree(subTree.right, True, False))

            # rows of the child alignment are relative to the right child's own start
            offset = subTreeTraversalIDs[subTree.right].midStart
            childYSeqFile = getTempFile()
            # single row: the right child's own sequence
            extractSubAlignment(childYAlignmentFile,
                                subTreeTraversalIDs[subTree.right].mid - offset,
                                subTreeTraversalIDs[subTree.right].mid - offset + 1,
                                childYSeqFile)
            seqFiles[subTree.right.traversalID.mid] = childYSeqFile
            logger.info("Extracted sequence of right child : %s ", printBinaryTree(subTree.right, True, False))

        # labels tree, so we only print relevant bits
        subTree.left.iD = labels[subTree.left.traversalID.mid]
        subTree.right.iD = labels[subTree.right.traversalID.mid]
        os.remove(alignmentFile)
        logger.info("Finished loop and reduced tree to : %s ", printBinaryTree(subTree, True, False))

    startTime = time.time()
    makeAlignment(binaryTree, seqFiles, outputFile, outputScoreFile, alignerArgs)
    logger.info("Finished final nested alignment, time taken : %s (seconds)", time.time() - startTime)
    alignmentFiles[binaryTree.traversalID.mid] = outputFile
    mergeTogetherAllAlignments(binaryTree, alignmentFiles, labels, [0])
    logger.info("Merged together all alignments")

    # Only odd indices are cleaned up: the caller's leaf files were stored at
    # the even indices and must not be deleted here.
    for i in range(1, nodeNo, 2):
        if seqFiles[i] is not None:
            os.remove(seqFiles[i])
    removeInternalIDs(binaryTree)
    logger.info("Have cleaned up, and am returning")
Beispiel #3
0
def nestAlign(binaryTree, leafSeqFiles, outputFile, outputScoreFile,
              alignerArgs):
    """Align a tree in nested steps and merge the partial alignments.

    calculatePath selects a list of sub trees (given the node costs and
    alignerArgs.MAX_NODE_NO). Each sub tree is aligned with makeAlignment into
    a temporary file. For every internal child of the sub tree, the child's
    part of that alignment and the child's own sequence are extracted into
    temporary files; the sequence file is stored in seqFiles, which is passed
    to the following makeAlignment calls. Finally the whole tree is aligned
    into outputFile and mergeTogetherAllAlignments combines the stored
    alignments.

    Parameters:
        binaryTree: tree to align; it is numbered here with
            binaryTree_depthFirstNumbers and its internal ids are removed on
            entry and again before returning.
        leafSeqFiles: sequence files of the leaves; entry i is stored at
            index i * 2 of the per-node file list.
        outputFile: file receiving the final alignment.
        outputScoreFile: passed through to every makeAlignment call.
        alignerArgs: settings object; MAX_NODE_NO is read here, the object is
            also passed on to makeAlignment.
    """
    logger.info("Starting Nester")
    maxNodeNo = alignerArgs.MAX_NODE_NO

    removeInternalIDs(binaryTree)

    logger.info("Binary tree : %s ", printBinaryTree(binaryTree, True, False))
    binaryTree_depthFirstNumbers(binaryTree)
    nodeNo = binaryTree.traversalID.midEnd
    logger.info("Labelled tree with numbers ")

    logger.info(" Sequence files : %s", " ".join(leafSeqFiles))
    logger.info("Output file %s ", outputFile)

    labels = binaryTree_nodeNames(binaryTree)
    costs = calculateTreeNodeCosts(binaryTree)
    logger.info("Calculated node costs")
    for node in range(nodeNo):
        logger.info("Node : %s , reconstruction value : %f , %f",
                    labels[node], costs[node], 1.0 - costs[node])
    pathCost, treePath = calculatePath(binaryTree, costs, maxNodeNo)
    logger.info(
        " Calculated nested path. Cost : %f , Path : %s", pathCost,
        " ".join(labels[i.traversalID.mid] for i in treePath))

    # Per-node bookkeeping, indexed by traversal id.
    alignmentFiles = [None] * nodeNo
    seqFiles = [None] * nodeNo
    # leaf i is stored at the even index i * 2
    for i, leafSeqFile in enumerate(leafSeqFiles):
        seqFiles[i * 2] = leafSeqFile

    logger.debug("About to start main nested loop")
    for subTree in treePath:
        assert subTree != binaryTree
        logger.info("Chosen sub tree to align : %s ",
                    printBinaryTree(subTree, True, False))
        alignmentFile = getTempFile()
        startTime = time.time()
        makeAlignment(subTree, seqFiles, alignmentFile, outputScoreFile,
                      alignerArgs)
        logger.info("Made alignment of subtree, time taken : %s (seconds)",
                    time.time() - startTime)

        # get the two ancestors
        # traversal ids computed without relabelling the tree (labelTree=False);
        # the rows of alignmentFile are addressed relative to midStart below
        subTreeTraversalIDs = binaryTree_depthFirstNumbers(subTree,
                                                           labelTree=False,
                                                           dontStopAtID=False)

        if subTree.left.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childXAlignmentFile = getTempFile()
            # rows before the sub tree root
            extractSubAlignment(alignmentFile, 0,
                                subTreeTraversalIDs[subTree].mid - offset,
                                childXAlignmentFile)
            alignmentFiles[subTree.left.traversalID.mid] = childXAlignmentFile
            logger.info("Extracted alignment of left child : %s ",
                        printBinaryTree(subTree.left, True, False))

            # the left child starts where the sub tree starts, so the same
            # offset applies
            assert offset == subTreeTraversalIDs[subTree.left].midStart
            childXSeqFile = getTempFile()
            # single row: the left child's own sequence
            extractSubAlignment(
                childXAlignmentFile,
                subTreeTraversalIDs[subTree.left].mid - offset,
                subTreeTraversalIDs[subTree.left].mid - offset + 1,
                childXSeqFile)
            seqFiles[subTree.left.traversalID.mid] = childXSeqFile
            logger.info("Extracted sequence of left child : %s ",
                        printBinaryTree(subTree.left, True, False))

        if subTree.right.internal:
            offset = subTreeTraversalIDs[subTree].midStart
            childYAlignmentFile = getTempFile()
            # rows after the sub tree root
            extractSubAlignment(alignmentFile,
                                subTreeTraversalIDs[subTree].mid + 1 - offset,
                                subTreeTraversalIDs[subTree].midEnd - offset,
                                childYAlignmentFile)
            alignmentFiles[subTree.right.traversalID.mid] = childYAlignmentFile
            logger.info("Extracted alignment of right child : %s ",
                        printBinaryTree(subTree.right, True, False))

            # rows of the child alignment are relative to the right child's
            # own start
            offset = subTreeTraversalIDs[subTree.right].midStart
            childYSeqFile = getTempFile()
            # single row: the right child's own sequence
            extractSubAlignment(
                childYAlignmentFile,
                subTreeTraversalIDs[subTree.right].mid - offset,
                subTreeTraversalIDs[subTree.right].mid - offset + 1,
                childYSeqFile)
            seqFiles[subTree.right.traversalID.mid] = childYSeqFile
            logger.info("Extracted sequence of right child : %s ",
                        printBinaryTree(subTree.right, True, False))

        # labels tree, so we only print relevant bits
        subTree.left.iD = labels[subTree.left.traversalID.mid]
        subTree.right.iD = labels[subTree.right.traversalID.mid]
        os.remove(alignmentFile)
        logger.info("Finished loop and reduced tree to : %s ",
                    printBinaryTree(subTree, True, False))

    startTime = time.time()
    makeAlignment(binaryTree, seqFiles, outputFile, outputScoreFile,
                  alignerArgs)
    logger.info("Finished final nested alignment, time taken : %s (seconds)",
                time.time() - startTime)
    alignmentFiles[binaryTree.traversalID.mid] = outputFile
    mergeTogetherAllAlignments(binaryTree, alignmentFiles, labels, [0])
    logger.info("Merged together all alignments")

    # Only odd indices are cleaned up: the caller's leaf files were stored at
    # the even indices and must not be deleted here.
    for i in range(1, nodeNo, 2):
        if seqFiles[i] is not None:
            os.remove(seqFiles[i])
    removeInternalIDs(binaryTree)
    logger.info("Have cleaned up, and am returning")