Ejemplo n.º 1
0
def calculateSemphyTreeEstimate(alignmentFile, treeArgs, seqNo):
    """Estimate a tree for an alignment by running the external Semphy program.

    Args:
        alignmentFile: alignment handed to formatForSemphy, whose result is
            what Semphy actually reads.
        treeArgs: settings object supplying SEMPHY_PATH, SEMPHY_ARGS_PAIRS,
            SEMPHY_ARGS_TREE and EXPECTED_CHARACTER_FREQUENCIES.
        seqNo: number of sequences; exactly 2 selects SEMPHY_ARGS_PAIRS,
            any other value selects SEMPHY_ARGS_TREE.

    Returns:
        The tree parsed from the first line of Semphy's output file, after
        binaryTree_depthFirstNumbers and correctTreeDistances have been
        applied to it.

    Exits the process with status 1 if the Semphy command reports a
    non-zero status. In that case the two temporary files are left in place.
    """
    semphyArgs = (treeArgs.SEMPHY_ARGS_PAIRS if seqNo == 2
                  else treeArgs.SEMPHY_ARGS_TREE)
    semphyAlignmentFile = formatForSemphy(alignmentFile)
    outputTreeFile = getTempFile()
    # Only three frequencies are passed on; the last expected frequency is
    # dropped by the [:-1] slice.
    characterFrequencies = " --ACGprob=%f,%f,%f" % tuple(
        treeArgs.EXPECTED_CHARACTER_FREQUENCIES[:-1])
    command = "%s --treeoutputfile=%s %s %s --sequence=%s " % (
        treeArgs.SEMPHY_PATH, outputTreeFile, semphyArgs,
        characterFrequencies, semphyAlignmentFile)
    logger.info("Calling Semphy with %s ", command)
    # close() on an os.popen pipe returns None on success and the exit
    # status otherwise, so a truthy value means the command failed.
    if os.popen(command).close():
        logger.info("tree building failed, so must exit")
        sys.exit(1)
    # The with-block guarantees the handle is closed even if the output
    # file is empty and indexing the first line raises.
    with open(outputTreeFile, 'r') as fileHandle:
        treeString = fileHandle.readlines()[0]
    binaryTree = newickTreeParser(treeString, False)
    binaryTree_depthFirstNumbers(binaryTree)
    # clean up the temporary files
    os.remove(semphyAlignmentFile)
    os.remove(outputTreeFile)
    correctTreeDistances(binaryTree)
    return binaryTree
Ejemplo n.º 2
0
def calculateSemphyTreeEstimate(alignmentFile, treeArgs, seqNo):
    """Estimate a tree for an alignment by running the external Semphy program.

    Args:
        alignmentFile: alignment handed to formatForSemphy, whose result is
            what Semphy actually reads.
        treeArgs: settings object supplying SEMPHY_PATH, SEMPHY_ARGS_PAIRS,
            SEMPHY_ARGS_TREE and EXPECTED_CHARACTER_FREQUENCIES.
        seqNo: number of sequences; exactly 2 selects SEMPHY_ARGS_PAIRS,
            any other value selects SEMPHY_ARGS_TREE.

    Returns:
        The tree parsed from the first line of Semphy's output file, after
        binaryTree_depthFirstNumbers and correctTreeDistances have been
        applied to it.

    Exits the process with status 1 if the Semphy command reports a
    non-zero status. In that case the two temporary files are left in place.
    """
    semphyArgs = (treeArgs.SEMPHY_ARGS_PAIRS if seqNo == 2
                  else treeArgs.SEMPHY_ARGS_TREE)
    semphyAlignmentFile = formatForSemphy(alignmentFile)
    outputTreeFile = getTempFile()
    # Only three frequencies are passed on; the last expected frequency is
    # dropped by the [:-1] slice.
    characterFrequencies = " --ACGprob=%f,%f,%f" % tuple(
        treeArgs.EXPECTED_CHARACTER_FREQUENCIES[:-1])
    command = "%s --treeoutputfile=%s %s %s --sequence=%s " % (
        treeArgs.SEMPHY_PATH, outputTreeFile, semphyArgs,
        characterFrequencies, semphyAlignmentFile)
    logger.info("Calling Semphy with %s ", command)
    # close() on an os.popen pipe returns None on success and the exit
    # status otherwise, so a truthy value means the command failed.
    if os.popen(command).close():
        logger.info("tree building failed, so must exit")
        sys.exit(1)
    # The with-block guarantees the handle is closed even if the output
    # file is empty and indexing the first line raises.
    with open(outputTreeFile, 'r') as fileHandle:
        treeString = fileHandle.readlines()[0]
    binaryTree = newickTreeParser(treeString, False)
    binaryTree_depthFirstNumbers(binaryTree)
    # clean up the temporary files
    os.remove(semphyAlignmentFile)
    os.remove(outputTreeFile)
    correctTreeDistances(binaryTree)
    return binaryTree
Ejemplo n.º 3
0
def estimateTreeAlign(seqFiles, outputTreeFile, treeArgs):
    """Estimate a tree for the given sequence files and write it to a file.

    Starts from a star tree over the sequences, refines it with estimateTree
    and, when treeArgs.SPECIES_TREE_STRING is set, re-roots the result with
    calculateProbableRootOfGeneTree against the parsed species tree.

    Args:
        seqFiles: sequence of sequence-file names (sliced and indexed here).
        outputTreeFile: path that receives two lines: the printed tree and
            the space-separated sequence files in their final order.
        treeArgs: settings object supplying DEFAULT_DISTANCE,
            ITERATION_NUMBER, DO_SUBTREE_BRANCH_LENGTH_ESTIMATION,
            SPECIES_TREE_STRING and LEAF_SPECIES.

    Returns:
        A tuple (tree, seqFiles, outputAlignment) where seqFiles is a list
        and outputAlignment is whatever estimateTree returned.
    """
    origSeqFileOrder = seqFiles[:]
    tree = makeStarTree(len(seqFiles), 0, treeArgs.DEFAULT_DISTANCE)
    binaryTree_depthFirstNumbers(tree)
    labelTree(tree, strCounter([-1]))
    tree, seqFiles, outputAlignment = estimateTree(
        seqFiles, tree, treeArgs.ITERATION_NUMBER,
        treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, treeArgs)
    seqFiles = list(seqFiles)
    if treeArgs.SPECIES_TREE_STRING is not None:
        logger.info("Predicting root of tree using species tree")
        speciesTree = newickTreeParser(treeArgs.SPECIES_TREE_STRING)
        binaryTree_depthFirstNumbers(speciesTree)
        logger.info("Parsed species tree: %s" %
                    printBinaryTree(speciesTree, True))
        # One-element list used as a mutable counter shared with the closure.
        leafCounter = [-1]

        def speciesLabel():
            # Produces "<species>_<position>" for successive leaves; the
            # species is looked up via the file's index in the original order.
            leafCounter[0] += 1
            origIndex = origSeqFileOrder.index(seqFiles[leafCounter[0]])
            return "%s_%s" % (treeArgs.LEAF_SPECIES[origIndex],
                              str(leafCounter[0]))

        labelTree(tree, speciesLabel)
        tree, dupCount, lossCount = calculateProbableRootOfGeneTree(
            speciesTree, tree, processID=lambda x: x.split("_")[0])

        def stripSpeciesPrefix(node):
            # Restores leaf iDs to the part after the first underscore.
            # NOTE(review): this and processID above split on every "_", so
            # a LEAF_SPECIES entry containing "_" would not round-trip -
            # confirm species names never contain underscores.
            if node.internal:
                stripSpeciesPrefix(node.left)
                stripSpeciesPrefix(node.right)
            else:
                node.iD = node.iD.split('_')[1]

        stripSpeciesPrefix(tree)
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        logger.info("Reconciled tree with root : %s %s " %
                    (printBinaryTree(tree, True), " ".join(seqFiles)))
        logger.info("Number of dups needed for reconcilliations : %s " %
                    dupCount)
        logger.info("Number of losses needed for reconcilliations : %s " %
                    lossCount)
    seqFiles = list(seqFiles)
    # The with-block closes the file even if printing the tree raises.
    with open(outputTreeFile, 'w') as out:
        out.write("%s\n" % printBinaryTree(tree, True))
        out.write("%s\n" % " ".join(seqFiles))
    logger.info("Finished estimate tree")
    return tree, seqFiles, outputAlignment
Ejemplo n.º 4
0
def estimateTreeAlign(seqFiles, outputTreeFile, treeArgs):
    """Estimate a tree for the given sequence files and write it to a file.

    Starts from a star tree over the sequences, refines it with estimateTree
    and, when treeArgs.SPECIES_TREE_STRING is set, re-roots the result with
    calculateProbableRootOfGeneTree against the parsed species tree.

    Args:
        seqFiles: sequence of sequence-file names (sliced and indexed here).
        outputTreeFile: path that receives two lines: the printed tree and
            the space-separated sequence files in their final order.
        treeArgs: settings object supplying DEFAULT_DISTANCE,
            ITERATION_NUMBER, DO_SUBTREE_BRANCH_LENGTH_ESTIMATION,
            SPECIES_TREE_STRING and LEAF_SPECIES.

    Returns:
        A tuple (tree, seqFiles, outputAlignment) where seqFiles is a list
        and outputAlignment is whatever estimateTree returned.
    """
    origSeqFileOrder = seqFiles[:]
    tree = makeStarTree(len(seqFiles), 0, treeArgs.DEFAULT_DISTANCE)
    binaryTree_depthFirstNumbers(tree)
    labelTree(tree, strCounter([-1]))
    tree, seqFiles, outputAlignment = estimateTree(
        seqFiles, tree, treeArgs.ITERATION_NUMBER,
        treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, treeArgs)
    seqFiles = list(seqFiles)
    if treeArgs.SPECIES_TREE_STRING is not None:
        logger.info("Predicting root of tree using species tree")
        speciesTree = newickTreeParser(treeArgs.SPECIES_TREE_STRING)
        binaryTree_depthFirstNumbers(speciesTree)
        logger.info("Parsed species tree: %s" %
                    printBinaryTree(speciesTree, True))
        # One-element list used as a mutable counter shared with the closure.
        leafCounter = [-1]

        def speciesLabel():
            # Produces "<species>_<position>" for successive leaves; the
            # species is looked up via the file's index in the original order.
            leafCounter[0] += 1
            origIndex = origSeqFileOrder.index(seqFiles[leafCounter[0]])
            return "%s_%s" % (treeArgs.LEAF_SPECIES[origIndex],
                              str(leafCounter[0]))

        labelTree(tree, speciesLabel)
        tree, dupCount, lossCount = calculateProbableRootOfGeneTree(
            speciesTree, tree, processID=lambda x: x.split("_")[0])

        def stripSpeciesPrefix(node):
            # Restores leaf iDs to the part after the first underscore.
            # NOTE(review): this and processID above split on every "_", so
            # a LEAF_SPECIES entry containing "_" would not round-trip -
            # confirm species names never contain underscores.
            if node.internal:
                stripSpeciesPrefix(node.left)
                stripSpeciesPrefix(node.right)
            else:
                node.iD = node.iD.split('_')[1]

        stripSpeciesPrefix(tree)
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        logger.info("Reconciled tree with root : %s %s " %
                    (printBinaryTree(tree, True), " ".join(seqFiles)))
        logger.info("Number of dups needed for reconcilliations : %s " %
                    dupCount)
        logger.info("Number of losses needed for reconcilliations : %s " %
                    lossCount)
    seqFiles = list(seqFiles)
    # The with-block closes the file even if printing the tree raises.
    with open(outputTreeFile, 'w') as out:
        out.write("%s\n" % printBinaryTree(tree, True))
        out.write("%s\n" % " ".join(seqFiles))
    logger.info("Finished estimate tree")
    return tree, seqFiles, outputAlignment
Ejemplo n.º 5
0
def nestAlign(binaryTree, leafSeqFiles, outputFile, outputScoreFile, alignerArgs):
    """Align the leaf sequences of a tree by aligning nested sub-trees first.

    The sub-trees returned by calculatePath are aligned one after another
    with makeAlignment. For every internal child of such a sub-tree, the
    child's part of the alignment and the child's own sequence are extracted
    into temporary files, so later alignments can use that sequence as
    input. Finally the whole tree is aligned into outputFile and
    mergeTogetherAllAlignments is called over all collected alignments.

    Args:
        binaryTree: root of the tree; it is renumbered in place and its
            internal iDs are removed on entry and on exit.
        leafSeqFiles: sequence-file names, stored at the even indices of the
            per-node file table.
        outputFile: file name passed to the final makeAlignment call.
        outputScoreFile: passed through to every makeAlignment call.
        alignerArgs: settings object; MAX_NODE_NO is read here and the whole
            object is forwarded to makeAlignment.

    Returns:
        None.
    """
    logger.info("Starting Nester")
    maxNodeNo = alignerArgs.MAX_NODE_NO

    removeInternalIDs(binaryTree)

    logger.info("Binary tree : %s " % printBinaryTree(binaryTree, True, False))
    binaryTree_depthFirstNumbers(binaryTree)
    nodeNo = binaryTree.traversalID.midEnd
    logger.info("Labelled tree with numbers ")

    seqNo = len(leafSeqFiles)
    logger.info(" Sequence files : %s" % " ".join(leafSeqFiles))

    logger.info("Output file %s " % outputFile)

    labels = binaryTree_nodeNames(binaryTree)
    costs = calculateTreeNodeCosts(binaryTree)
    logger.info("Calculated node costs")
    for node in xrange(0, nodeNo):
        logger.info("Node : %s , reconstruction value : %f , %f" %
                    (labels[node], costs[node], 1.0 - costs[node]))
    pathCost, treePath = calculatePath(binaryTree, costs, maxNodeNo)
    logger.info(" Calculated nested path. Cost : %f , Path : %s" %
                (pathCost,
                 " ".join([labels[i.traversalID.mid] for i in treePath])))
    # Per-node tables indexed by traversalID.mid.
    alignmentFiles = [None] * nodeNo
    seqFiles = [None] * nodeNo
    # Leaf sequence files occupy the even slots.
    for i in xrange(0, seqNo):
        seqFiles[i * 2] = leafSeqFiles[i]
    logger.debug("About to start main nested loop")
    for subTree in treePath:
        assert subTree != binaryTree
        logger.info("Chosen sub tree to align : %s " %
                    printBinaryTree(subTree, True, False))
        alignmentFile = getTempFile()
        startTime = time.time()
        makeAlignment(subTree, seqFiles, alignmentFile, outputScoreFile,
                      alignerArgs)
        logger.info("Made alignment of subtree, time taken : %s (seconds)" %
                    (time.time() - startTime))
        # Numbering local to the sub-tree, used to locate the two children
        # inside the sub-tree alignment.
        subTreeTraversalIDs = binaryTree_depthFirstNumbers(
            subTree, labelTree=False, dontStopAtID=False)

        if subTree.left.internal:
            rootIDs = subTreeTraversalIDs[subTree]
            offset = rootIDs.midStart
            childXAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, 0, rootIDs.mid - offset,
                                childXAlignmentFile)
            # NOTE(review): this temp file is not removed in this function;
            # presumably mergeTogetherAllAlignments disposes of it - confirm.
            alignmentFiles[subTree.left.traversalID.mid] = childXAlignmentFile
            logger.info("Extracted alignment of left child : %s " %
                        printBinaryTree(subTree.left, True, False))

            leftIDs = subTreeTraversalIDs[subTree.left]
            assert offset == leftIDs.midStart
            childXSeqFile = getTempFile()
            leftIndex = leftIDs.mid - offset
            extractSubAlignment(childXAlignmentFile, leftIndex, leftIndex + 1,
                                childXSeqFile)
            seqFiles[subTree.left.traversalID.mid] = childXSeqFile
            logger.info("Extracted sequence of left child : %s " %
                        printBinaryTree(subTree.left, True, False))

        if subTree.right.internal:
            rootIDs = subTreeTraversalIDs[subTree]
            offset = rootIDs.midStart
            childYAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, rootIDs.mid + 1 - offset,
                                rootIDs.midEnd - offset, childYAlignmentFile)
            alignmentFiles[subTree.right.traversalID.mid] = childYAlignmentFile
            logger.info("Extracted alignment of right child : %s " %
                        printBinaryTree(subTree.right, True, False))

            # The right child's alignment starts at its own midStart, so the
            # offset is recomputed relative to the right child.
            rightIDs = subTreeTraversalIDs[subTree.right]
            offset = rightIDs.midStart
            childYSeqFile = getTempFile()
            rightIndex = rightIDs.mid - offset
            extractSubAlignment(childYAlignmentFile, rightIndex,
                                rightIndex + 1, childYSeqFile)
            seqFiles[subTree.right.traversalID.mid] = childYSeqFile
            logger.info("Extracted sequence of right child : %s " %
                        printBinaryTree(subTree.right, True, False))

        # labels tree, so we only print relevant bits
        subTree.left.iD = labels[subTree.left.traversalID.mid]
        subTree.right.iD = labels[subTree.right.traversalID.mid]
        os.remove(alignmentFile)
        logger.info("Finished loop and reduced tree to : %s " %
                    printBinaryTree(subTree, True, False))
    startTime = time.time()
    makeAlignment(binaryTree, seqFiles, outputFile, outputScoreFile,
                  alignerArgs)
    logger.info("Finished final nested alignment, time taken : %s (seconds)" %
                (time.time() - startTime))
    alignmentFiles[binaryTree.traversalID.mid] = outputFile
    mergeTogetherAllAlignments(binaryTree, alignmentFiles, labels, [0])
    logger.info("Merged together all alignments")
    # Only odd slots are cleaned up: the even slots hold the caller's leaf
    # files, which must not be deleted here.
    for i in xrange(1, nodeNo, 2):
        if seqFiles[i] is not None:
            os.remove(seqFiles[i])
    removeInternalIDs(binaryTree)
    logger.info("Have cleaned up, and am returning")
Ejemplo n.º 6
0
def stitchReconstruct(seqNo, inputSeqFiles, treeString, outputFile, outputScoreFile, inputAlignmentFile, alignerArgs):
    """Reconstruct an alignment chunk by chunk and stitch the chunks together.

    The input alignment is consumed in chunks via getNextAlignmentChunk. For
    each chunk an external command (built from
    alignerArgs.RECONSTRUCTION_PREFIX and logged as "Ortheus") is run through
    the shell. If it exits non-zero it is retried once with
    alignerArgs.CAUTIOUS_ARGS; if the retry also fails the process exits
    with status 1. Chunk results are appended to per-node temporary files,
    which are concatenated into outputFile at the end.

    Args:
        seqNo: number of leaf sequences; the tree must have seqNo * 2 - 1
            nodes (asserted).
        inputSeqFiles: not used by this function.
        treeString: Newick tree string; parsed here and also passed to the
            external command inside single quotes.
        outputFile: file that receives the concatenated alignment.
        outputScoreFile: file to which each chunk's score is appended.
        inputAlignmentFile: multi-FASTA input read with multiFastaRead.
        alignerArgs: settings object supplying RECONSTRUCTION_PREFIX,
            FAST_SETTING, RECONSTRUCTION_ARGS(_FAST), CAUTIOUS_ARGS,
            ALIGNMENT_CHUNK_MAX_COLUMN_SIZE, VITERBI_ALIGNMENT_COLUMN_GAP
            and EXPECTED_CHARACTER_FREQUENCIES.

    Returns:
        None.
    """
    startTime = time.time()  # epoch time in seconds

    logger.info("Starting Stitcher")
    reconstructionPrefix = alignerArgs.RECONSTRUCTION_PREFIX
    if alignerArgs.FAST_SETTING:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS_FAST
    else:
        reconstructionArgs = alignerArgs.RECONSTRUCTION_ARGS
    cautiousArgs = alignerArgs.CAUTIOUS_ARGS
    alignmentChunkMaxSeqSize = alignerArgs.ALIGNMENT_CHUNK_MAX_COLUMN_SIZE
    viterbiAlignmentColumnGap = alignerArgs.VITERBI_ALIGNMENT_COLUMN_GAP
    # parse tree
    binaryTree = newickTreeParser(treeString)
    binaryTree_depthFirstNumbers(binaryTree)
    logger.info("Newick tree read : %s " % printBinaryTree(binaryTree, True))
    labels = binaryTree_nodeNames(binaryTree)
    # labels at even indices are taken as the leaf labels
    leafLabels = [labels[i] for i in xrange(0, len(labels), 2)]
    identity = lambda x: x
    # load alignment iterator
    alignmentReader = multiFastaRead(inputAlignmentFile, identity)
    # number of sequences, including ancestors
    nodeNumber = binaryTree.traversalID.midEnd
    assert nodeNumber == seqNo * 2 - 1
    # create output files
    outputFiles, outputIterators = getOpenSeqFiles(nodeNumber, getTempFile)
    previousAlignment = []
    alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
        previousAlignment, alignmentReader, alignmentChunkMaxSeqSize, seqNo,
        leafLabels)
    tempTreeStatesFile = getTempFile()
    # Empty on the first chunk; later chunks pass the tree-states file.
    loopOptions = " "
    logger.info("Starting main loop")
    characterFrequenciesString = " ".join(
        [str(i) for i in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES])
    while alignmentSeqs is not None:
        if end:
            viterbiAlignmentColumnGap = 0
        tempAncestorFile = getTempFile()
        tempScoreFile = getTempFile()
        joinedSeqs = " ".join(alignmentSeqs)
        command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -n %s -x %s " % (
            reconstructionPrefix, treeString, alignmentFile, joinedSeqs,
            tempTreeStatesFile, viterbiAlignmentColumnGap, loopOptions,
            reconstructionArgs, tempAncestorFile, characterFrequenciesString,
            tempScoreFile)
        logger.info("Calling Ortheus with : %s", command)
        exitValue = os.system(command)
        if exitValue != 0:
            logger.info("Something went wrong calling Ortheus : %i ", exitValue)
            logger.info("Going to retry with caution settings")
            # NOTE(review): unlike the first attempt, the retry passes no
            # "-n <character frequencies>" option - confirm this is intended.
            command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -x %s" % (
                reconstructionPrefix, treeString, alignmentFile, joinedSeqs,
                tempTreeStatesFile, viterbiAlignmentColumnGap, loopOptions,
                cautiousArgs, tempAncestorFile, tempScoreFile)
            logger.info("Calling Ortheus with : %s", command)
            if os.system(command):
                logger.info("Already tried caution, so have to go")
                sys.exit(1)
        logger.info("Completed reconstruction of chunk")
        appendScore(tempScoreFile, outputScoreFile)
        os.remove(tempScoreFile)
        loopOptions = " -t " + tempTreeStatesFile
        tempAncestorFastaOffsets = getMultiFastaOffsets(tempAncestorFile)
        # The ancestor file is read twice, each time through a fresh reader.
        previousAlignment = removeFromLeft(
            multiFastaRead(tempAncestorFile, identity,
                           tempAncestorFastaOffsets),
            previousAlignment, nodeNumber, seqNo)
        appendToAlignment(
            multiFastaRead(tempAncestorFile, identity,
                           tempAncestorFastaOffsets),
            outputIterators, nodeNumber)
        logger.info("Added reconstructed chunk to complete alignment")
        os.remove(tempAncestorFile)
        removeSeqFiles(alignmentSeqs, seqNo)
        os.remove(alignmentFile)
        logger.info("Cleaned up at end of loop")
        alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
            previousAlignment, alignmentReader, alignmentChunkMaxSeqSize,
            seqNo, leafLabels)
    logger.info("Finished main loop")
    # load into single output file
    closeSeqIterators(outputIterators, nodeNumber)
    concatanateSeqFiles(outputFiles, outputFile, nodeNumber, labels)
    logger.info("Written out alignment to single file")
    # clean up
    os.remove(tempTreeStatesFile)
    removeSeqFiles(outputFiles, nodeNumber)
    logger.info("Cleaned up final files")
    logger.info("Finished, total time taken for stitcher: %s (seconds)" %
                (time.time() - startTime))
Ejemplo n.º 7
0
def nestAlign(binaryTree, leafSeqFiles, outputFile, outputScoreFile,
              alignerArgs):
    """Align the leaf sequences of a tree by aligning nested sub-trees first.

    The sub-trees returned by calculatePath are aligned one after another
    with makeAlignment. For every internal child of such a sub-tree, the
    child's part of the alignment and the child's own sequence are extracted
    into temporary files, so later alignments can use that sequence as
    input. Finally the whole tree is aligned into outputFile and
    mergeTogetherAllAlignments is called over all collected alignments.

    Args:
        binaryTree: root of the tree; it is renumbered in place and its
            internal iDs are removed on entry and on exit.
        leafSeqFiles: sequence-file names, stored at the even indices of the
            per-node file table.
        outputFile: file name passed to the final makeAlignment call.
        outputScoreFile: passed through to every makeAlignment call.
        alignerArgs: settings object; MAX_NODE_NO is read here and the whole
            object is forwarded to makeAlignment.

    Returns:
        None.
    """
    logger.info("Starting Nester")
    maxNodeNo = alignerArgs.MAX_NODE_NO

    removeInternalIDs(binaryTree)

    logger.info("Binary tree : %s " % printBinaryTree(binaryTree, True, False))
    binaryTree_depthFirstNumbers(binaryTree)
    nodeNo = binaryTree.traversalID.midEnd
    logger.info("Labelled tree with numbers ")

    seqNo = len(leafSeqFiles)
    logger.info(" Sequence files : %s" % " ".join(leafSeqFiles))

    logger.info("Output file %s " % outputFile)

    labels = binaryTree_nodeNames(binaryTree)
    costs = calculateTreeNodeCosts(binaryTree)
    logger.info("Calculated node costs")
    for node in xrange(0, nodeNo):
        logger.info("Node : %s , reconstruction value : %f , %f" %
                    (labels[node], costs[node], 1.0 - costs[node]))
    pathCost, treePath = calculatePath(binaryTree, costs, maxNodeNo)
    logger.info(" Calculated nested path. Cost : %f , Path : %s" %
                (pathCost,
                 " ".join([labels[i.traversalID.mid] for i in treePath])))
    # Per-node tables indexed by traversalID.mid.
    alignmentFiles = [None] * nodeNo
    seqFiles = [None] * nodeNo
    # Leaf sequence files occupy the even slots.
    for i in xrange(0, seqNo):
        seqFiles[i * 2] = leafSeqFiles[i]
    logger.debug("About to start main nested loop")
    for subTree in treePath:
        assert subTree != binaryTree
        logger.info("Chosen sub tree to align : %s " %
                    printBinaryTree(subTree, True, False))
        alignmentFile = getTempFile()
        startTime = time.time()
        makeAlignment(subTree, seqFiles, alignmentFile, outputScoreFile,
                      alignerArgs)
        logger.info("Made alignment of subtree, time taken : %s (seconds)" %
                    (time.time() - startTime))
        # Numbering local to the sub-tree, used to locate the two children
        # inside the sub-tree alignment.
        subTreeTraversalIDs = binaryTree_depthFirstNumbers(
            subTree, labelTree=False, dontStopAtID=False)

        if subTree.left.internal:
            rootIDs = subTreeTraversalIDs[subTree]
            offset = rootIDs.midStart
            childXAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, 0, rootIDs.mid - offset,
                                childXAlignmentFile)
            # NOTE(review): this temp file is not removed in this function;
            # presumably mergeTogetherAllAlignments disposes of it - confirm.
            alignmentFiles[subTree.left.traversalID.mid] = childXAlignmentFile
            logger.info("Extracted alignment of left child : %s " %
                        printBinaryTree(subTree.left, True, False))

            leftIDs = subTreeTraversalIDs[subTree.left]
            assert offset == leftIDs.midStart
            childXSeqFile = getTempFile()
            leftIndex = leftIDs.mid - offset
            extractSubAlignment(childXAlignmentFile, leftIndex, leftIndex + 1,
                                childXSeqFile)
            seqFiles[subTree.left.traversalID.mid] = childXSeqFile
            logger.info("Extracted sequence of left child : %s " %
                        printBinaryTree(subTree.left, True, False))

        if subTree.right.internal:
            rootIDs = subTreeTraversalIDs[subTree]
            offset = rootIDs.midStart
            childYAlignmentFile = getTempFile()
            extractSubAlignment(alignmentFile, rootIDs.mid + 1 - offset,
                                rootIDs.midEnd - offset, childYAlignmentFile)
            alignmentFiles[subTree.right.traversalID.mid] = childYAlignmentFile
            logger.info("Extracted alignment of right child : %s " %
                        printBinaryTree(subTree.right, True, False))

            # The right child's alignment starts at its own midStart, so the
            # offset is recomputed relative to the right child.
            rightIDs = subTreeTraversalIDs[subTree.right]
            offset = rightIDs.midStart
            childYSeqFile = getTempFile()
            rightIndex = rightIDs.mid - offset
            extractSubAlignment(childYAlignmentFile, rightIndex,
                                rightIndex + 1, childYSeqFile)
            seqFiles[subTree.right.traversalID.mid] = childYSeqFile
            logger.info("Extracted sequence of right child : %s " %
                        printBinaryTree(subTree.right, True, False))

        # labels tree, so we only print relevant bits
        subTree.left.iD = labels[subTree.left.traversalID.mid]
        subTree.right.iD = labels[subTree.right.traversalID.mid]
        os.remove(alignmentFile)
        logger.info("Finished loop and reduced tree to : %s " %
                    printBinaryTree(subTree, True, False))
    startTime = time.time()
    makeAlignment(binaryTree, seqFiles, outputFile, outputScoreFile,
                  alignerArgs)
    logger.info("Finished final nested alignment, time taken : %s (seconds)" %
                (time.time() - startTime))
    alignmentFiles[binaryTree.traversalID.mid] = outputFile
    mergeTogetherAllAlignments(binaryTree, alignmentFiles, labels, [0])
    logger.info("Merged together all alignments")
    # Only odd slots are cleaned up: the even slots hold the caller's leaf
    # files, which must not be deleted here.
    for i in xrange(1, nodeNo, 2):
        if seqFiles[i] is not None:
            os.remove(seqFiles[i])
    removeInternalIDs(binaryTree)
    logger.info("Have cleaned up, and am returning")