def calculateSemphyTreeEstimate(alignmentFile, treeArgs, seqNo):
    """Estimate a tree for an alignment by running the external Semphy program.

    The alignment is converted with formatForSemphy, Semphy is run through the
    shell, and the first line of the tree file it writes is parsed with
    newickTreeParser. The parsed tree is numbered with
    binaryTree_depthFirstNumbers and passed to correctTreeDistances before
    being returned.

    Arguments:
    alignmentFile -- alignment path handed to formatForSemphy.
    treeArgs -- settings object; SEMPHY_PATH, SEMPHY_ARGS_PAIRS,
        SEMPHY_ARGS_TREE and EXPECTED_CHARACTER_FREQUENCIES are read from it.
    seqNo -- exactly 2 selects SEMPHY_ARGS_PAIRS, any other value selects
        SEMPHY_ARGS_TREE (presumably the number of sequences in the alignment).

    Returns the parsed binary tree.

    Calls sys.exit(1) if the Semphy command reports a failure or leaves an
    empty tree file. The temporary files are removed on every exit path.
    """
    if seqNo == 2:
        semphyArgs = treeArgs.SEMPHY_ARGS_PAIRS
    else:
        semphyArgs = treeArgs.SEMPHY_ARGS_TREE
    semphyAlignmentFile = formatForSemphy(alignmentFile)
    tempFiles = [semphyAlignmentFile]
    try:
        outputTreeFile = getTempFile()
        tempFiles.append(outputTreeFile)
        # All but the last frequency are passed; the format needs exactly three values.
        characterFrequencies = " --ACGprob=%f,%f,%f" % tuple(treeArgs.EXPECTED_CHARACTER_FREQUENCIES[:-1])
        command = "%s --treeoutputfile=%s %s %s --sequence=%s " % (treeArgs.SEMPHY_PATH, outputTreeFile, semphyArgs, characterFrequencies, semphyAlignmentFile)
        logger.info("Calling Semphy with %s ", command)
        pipe = os.popen(command)
        # Drain the child's stdout before closing, so it can never be writing
        # into an already closed pipe. The output itself is not needed.
        pipe.read()
        if pipe.close():  # close() yields a non-None status only on failure
            logger.info("tree building failed, so must exit")
            sys.exit(1)
        fileHandle = open(outputTreeFile, 'r')
        try:
            treeString = fileHandle.readline()
        finally:
            fileHandle.close()
        if treeString == "":
            # An empty file previously surfaced as an IndexError with no context.
            logger.info("Semphy wrote an empty tree file, so must exit")
            sys.exit(1)
    finally:
        # clean up; best effort, so a missing file cannot mask the original error
        for tempFile in tempFiles:
            try:
                os.remove(tempFile)
            except OSError:
                pass
    binaryTree = newickTreeParser(treeString, False)
    binaryTree_depthFirstNumbers(binaryTree)
    correctTreeDistances(binaryTree)
    return binaryTree
Exemple #2
0
def calculateSemphyTreeEstimate(alignmentFile, treeArgs, seqNo):
    """Estimate a tree for an alignment by running the external Semphy program.

    The alignment is converted with formatForSemphy, Semphy is run through the
    shell, and the first line of the tree file it writes is parsed with
    newickTreeParser. The parsed tree is numbered with
    binaryTree_depthFirstNumbers and passed to correctTreeDistances before
    being returned.

    Arguments:
    alignmentFile -- alignment path handed to formatForSemphy.
    treeArgs -- settings object; SEMPHY_PATH, SEMPHY_ARGS_PAIRS,
        SEMPHY_ARGS_TREE and EXPECTED_CHARACTER_FREQUENCIES are read from it.
    seqNo -- exactly 2 selects SEMPHY_ARGS_PAIRS, any other value selects
        SEMPHY_ARGS_TREE (presumably the number of sequences in the alignment).

    Returns the parsed binary tree.

    Calls sys.exit(1) if the Semphy command reports a failure or leaves an
    empty tree file. The temporary files are removed on every exit path.
    """
    if seqNo == 2:
        semphyArgs = treeArgs.SEMPHY_ARGS_PAIRS
    else:
        semphyArgs = treeArgs.SEMPHY_ARGS_TREE
    semphyAlignmentFile = formatForSemphy(alignmentFile)
    tempFiles = [semphyAlignmentFile]
    try:
        outputTreeFile = getTempFile()
        tempFiles.append(outputTreeFile)
        # All but the last frequency are passed; the format needs exactly three values.
        characterFrequencies = " --ACGprob=%f,%f,%f" % tuple(
            treeArgs.EXPECTED_CHARACTER_FREQUENCIES[:-1])
        command = "%s --treeoutputfile=%s %s %s --sequence=%s " % (
            treeArgs.SEMPHY_PATH, outputTreeFile, semphyArgs, characterFrequencies,
            semphyAlignmentFile)
        logger.info("Calling Semphy with %s ", command)
        pipe = os.popen(command)
        # Drain the child's stdout before closing, so it can never be writing
        # into an already closed pipe. The output itself is not needed.
        pipe.read()
        if pipe.close():  # close() yields a non-None status only on failure
            logger.info("tree building failed, so must exit")
            sys.exit(1)
        fileHandle = open(outputTreeFile, 'r')
        try:
            treeString = fileHandle.readline()
        finally:
            fileHandle.close()
        if treeString == "":
            # An empty file previously surfaced as an IndexError with no context.
            logger.info("Semphy wrote an empty tree file, so must exit")
            sys.exit(1)
    finally:
        # clean up; best effort, so a missing file cannot mask the original error
        for tempFile in tempFiles:
            try:
                os.remove(tempFile)
            except OSError:
                pass
    binaryTree = newickTreeParser(treeString, False)
    binaryTree_depthFirstNumbers(binaryTree)
    correctTreeDistances(binaryTree)
    return binaryTree
Exemple #3
0
def _printUsageAndExit(alignerArgs, i):
    """Print the command-line help, including the Ortheus and Pecan help strings, then exit with status 0."""
    # Every print(...) below takes a single argument, so it produces the same
    # output whether it is parsed as a Python 2 print statement or a function call.
    print("Ortheus.py [MODIFIER_ARGUMENTS]")
    print("Version:  %s" % (VERSION_NO,))
    print("A top level script for running Ortheus and Pecan to produce substitution and indel aware reconstructed chunks of genome")
    print("If you would like to contribute to this program's development please contact me at bjp (AT) ebi (DOT) ac (DOT) uk ")
    print("Arguments:")
    i = printFirstMods(alignerArgs, i)
    i = printMods(alignerArgs, i)
    i = printModsStitcher(alignerArgs, i)
    i = printModsNester(alignerArgs, i)
    i = printEstimateTreeMods(alignerArgs, i)
    print("-------------Ortheus help string as follows (Changing these arguments may break the script)-------------")
    os.system("ortheus_core")
    print("-------------End Ortheus help string-------------")
    print("-------------Pecan help string as follows (Changing these arguments may break the script)-------------")
    os.system("%s bp.pecan.Pecan -help" % (alignerArgs.JAVA_PREFIX,))
    print("-------------End Pecan help string-------------")
    sys.exit(0)


def main():
    """Command-line entry point: parse the arguments, obtain a tree and optionally build the final alignment.

    With fewer than two command-line arguments the help text is printed and
    the process exits with status 0. Otherwise the arguments are parsed into
    the aligner settings. The tree is taken from NEWICK_TREE_STRING when that
    is set and is estimated with estimateTreeAlign otherwise. When
    MAKE_FINAL_ALIGNMENT is set, nestAlign is run. The elapsed time is printed
    to stdout at the end.

    Raises AssertionError if any argument is left over after parsing.
    """
    sys.stderr.write("Arguments received : %s \n" % "_".join(sys.argv))
    startTime = time.time()
    alignerArgs = getDefaultArgs()
    addDefaultArgs(alignerArgs)
    addDefaultStitcherArgs(alignerArgs)
    addDefaultNesterArgs(alignerArgs)
    addDefaultEstimateTreeArgs(alignerArgs)
    i = loggerIndices
    removeReservedIndices(i, alignerArgs)
    if len(sys.argv) < 3:
        _printUsageAndExit(alignerArgs, i)

    mods = sys.argv[1:]
    leftover = []
    # The two lists are handed to successive parsers in alternating order.
    # NOTE(review): presumably each parser moves the arguments it does not
    # recognise from its first list into its last - confirm against the parsers.
    i = parseFirstMods(mods, alignerArgs, i, leftover)
    i = parseMods(leftover, alignerArgs, i, mods)
    i = parseModsStitcher(mods, alignerArgs, i, leftover)
    i = parseModsNester(leftover, alignerArgs, i, mods)
    i = parseEstimateTreeMods(mods, alignerArgs, i, leftover)
    if leftover:
        logger.info("Ooops, remaining arguments %s ", " ".join(leftover))
        # Raised explicitly: a bare "assert False" is stripped under "python -O",
        # which would let the run continue with unrecognised arguments.
        raise AssertionError("Unrecognised arguments: %s" % " ".join(leftover))
    logger.info("Arguments received : %s " % " ".join(sys.argv))
    logger.info("Sequence files : %s " % " ".join(alignerArgs.SEQUENCE_FILES))
    if alignerArgs.EMPIRICALLY_ESTIMATE_CHARACTER_FREQUENCIES:
        alignerArgs.EXPECTED_CHARACTER_FREQUENCIES = empiricallyEstimateNucleotideFrequencies(alignerArgs.SEQUENCE_FILES)
        # A distinct loop name: Python 2 list comprehensions leak their variable
        # and would otherwise overwrite the argument index "i".
        logger.info("Empirically estimated character frequencies : %s " % " ".join([ str(frequency) for frequency in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES ]))
    # Remove any score file from an earlier run; it is fine if none exists.
    try:
        os.remove(alignerArgs.OUTPUT_SCORE_FILE)
    except OSError:
        pass
    if alignerArgs.NEWICK_TREE_STRING is not None:
        binaryTree = newickTreeParser(alignerArgs.NEWICK_TREE_STRING)
        logger.info("Newick tree read : %s " % printBinaryTree(binaryTree, True))
    else:
        binaryTree, seqFiles, outputAlignment = estimateTreeAlign(alignerArgs.SEQUENCE_FILES, alignerArgs.OUTPUT_TREE_FILE, alignerArgs)
        os.remove(outputAlignment) #the alignment produced during tree estimation is discarded for now
        # Continue with the sequence-file list returned by the tree estimation.
        alignerArgs.SEQUENCE_FILES = seqFiles
    if alignerArgs.MAKE_FINAL_ALIGNMENT:
        nestAlign(binaryTree, alignerArgs.SEQUENCE_FILES, alignerArgs.OUTPUT_FILE, alignerArgs.OUTPUT_SCORE_FILE, alignerArgs)
    print("total_time %s " % (time.time()-startTime))
Exemple #4
0
def estimateTreeAlign(seqFiles, outputTreeFile, treeArgs):
    """Estimate a tree for the sequence files and write it to outputTreeFile.

    A star tree over the sequences is built and refined with estimateTree.
    When treeArgs.SPECIES_TREE_STRING is set, the tree is additionally
    re-rooted with calculateProbableRootOfGeneTree against that species tree
    and the sequence files are re-derived with getSubtreeSeqs.

    Arguments:
    seqFiles -- sequence files; must support len() and slicing.
    outputTreeFile -- path that receives two lines: the tree as printed by
        printBinaryTree, then the space-separated sequence files.
    treeArgs -- settings object; DEFAULT_DISTANCE, ITERATION_NUMBER,
        DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, SPECIES_TREE_STRING and
        LEAF_SPECIES are read from it.

    Returns the tuple (tree, seqFiles, outputAlignment), where seqFiles is a
    list and outputAlignment is the third value returned by estimateTree.
    """
    origSeqFileOrder = seqFiles[:]
    tree = makeStarTree(len(seqFiles), 0, treeArgs.DEFAULT_DISTANCE)
    binaryTree_depthFirstNumbers(tree)
    labelTree(tree, strCounter([-1]))
    tree, seqFiles, outputAlignment = estimateTree(
        seqFiles, tree, treeArgs.ITERATION_NUMBER,
        treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, treeArgs)
    seqFiles = list(seqFiles)
    if treeArgs.SPECIES_TREE_STRING is not None:
        logger.info("Predicting root of tree using species tree")
        speciesTree = newickTreeParser(treeArgs.SPECIES_TREE_STRING)
        binaryTree_depthFirstNumbers(speciesTree)
        logger.info("Parsed species tree: %s" %
                    printBinaryTree(speciesTree, True))
        i = [-1]  # one-element list so the nested function can advance the counter

        def fn():
            # Label for the next leaf: "<species>_<position in seqFiles>".
            i[0] += 1
            j = origSeqFileOrder.index(seqFiles[i[0]])
            return "%s_%s" % (treeArgs.LEAF_SPECIES[j], str(i[0]))

        labelTree(tree, fn)
        # NOTE(review): processID still keeps only the text before the FIRST
        # underscore, so a species name containing "_" is truncated here. It is
        # left unchanged because how calculateProbableRootOfGeneTree applies it
        # is not visible from this function - confirm before changing.
        tree, dupCount, lossCount = calculateProbableRootOfGeneTree(
            speciesTree, tree, processID=lambda x: x.split("_")[0])

        def fn2(node):
            # Restore each leaf label to the bare index that fn appended.
            if node.internal:
                fn2(node.left)
                fn2(node.right)
            else:
                # Split on the LAST underscore: the index never contains one,
                # whereas the species name in front of it may.
                node.iD = node.iD.rsplit('_', 1)[1]

        fn2(tree)
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        logger.info("Reconciled tree with root : %s %s " %
                    (printBinaryTree(tree, True), " ".join(seqFiles)))
        logger.info("Number of dups needed for reconcilliations : %s " %
                    dupCount)
        logger.info("Number of losses needed for reconcilliations : %s " %
                    lossCount)
    seqFiles = list(seqFiles)
    out = open(outputTreeFile, 'w')
    try:
        out.write("%s\n" % printBinaryTree(tree, True))
        out.write("%s\n" % " ".join(seqFiles))
    finally:
        # Close the file even if formatting or writing fails.
        out.close()
    logger.info("Finished estimate tree")
    return tree, seqFiles, outputAlignment
def estimateTreeAlign(seqFiles, outputTreeFile, treeArgs):
    """Estimate a tree for the sequence files and write it to outputTreeFile.

    A star tree over the sequences is built and refined with estimateTree.
    When treeArgs.SPECIES_TREE_STRING is set, the tree is additionally
    re-rooted with calculateProbableRootOfGeneTree against that species tree
    and the sequence files are re-derived with getSubtreeSeqs.

    Arguments:
    seqFiles -- sequence files; must support len() and slicing.
    outputTreeFile -- path that receives two lines: the tree as printed by
        printBinaryTree, then the space-separated sequence files.
    treeArgs -- settings object; DEFAULT_DISTANCE, ITERATION_NUMBER,
        DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, SPECIES_TREE_STRING and
        LEAF_SPECIES are read from it.

    Returns the tuple (tree, seqFiles, outputAlignment), where seqFiles is a
    list and outputAlignment is the third value returned by estimateTree.
    """
    origSeqFileOrder = seqFiles[:]
    tree = makeStarTree(len(seqFiles), 0, treeArgs.DEFAULT_DISTANCE)
    binaryTree_depthFirstNumbers(tree)
    labelTree(tree, strCounter([-1]))
    tree, seqFiles, outputAlignment = estimateTree(seqFiles, tree, treeArgs.ITERATION_NUMBER, \
                                                    treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, treeArgs)
    seqFiles = list(seqFiles)
    if treeArgs.SPECIES_TREE_STRING is not None:
        logger.info("Predicting root of tree using species tree")
        speciesTree = newickTreeParser(treeArgs.SPECIES_TREE_STRING)
        binaryTree_depthFirstNumbers(speciesTree)
        logger.info("Parsed species tree: %s" % printBinaryTree(speciesTree, True))
        i = [-1]  # one-element list so the nested function can advance the counter
        def fn():
            # Label for the next leaf: "<species>_<position in seqFiles>".
            i[0] += 1
            j = origSeqFileOrder.index(seqFiles[i[0]])
            return "%s_%s" % (treeArgs.LEAF_SPECIES[j], str(i[0]))
        labelTree(tree, fn)
        # NOTE(review): processID still keeps only the text before the FIRST
        # underscore, so a species name containing "_" is truncated here. It is
        # left unchanged because how calculateProbableRootOfGeneTree applies it
        # is not visible from this function - confirm before changing.
        tree, dupCount, lossCount = calculateProbableRootOfGeneTree(speciesTree, tree, processID=lambda x : x.split("_")[0])
        def fn2(node):
            # Restore each leaf label to the bare index that fn appended.
            if node.internal:
                fn2(node.left)
                fn2(node.right)
            else:
                # Split on the LAST underscore: the index never contains one,
                # whereas the species name in front of it may.
                node.iD = node.iD.rsplit('_', 1)[1]
        fn2(tree)
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        logger.info("Reconciled tree with root : %s %s " % (printBinaryTree(tree, True), " ".join(seqFiles)))
        logger.info("Number of dups needed for reconcilliations : %s " % dupCount)
        logger.info("Number of losses needed for reconcilliations : %s " % lossCount)
    seqFiles = list(seqFiles)
    out = open(outputTreeFile, 'w')
    try:
        out.write("%s\n" % printBinaryTree(tree, True))
        out.write("%s\n" % " ".join(seqFiles))
    finally:
        # Close the file even if formatting or writing fails.
        out.close()
    logger.info("Finished estimate tree")
    return tree, seqFiles, outputAlignment
Exemple #6
0
def stitchReconstruct(seqNo, inputSeqFiles, treeString, outputFile, outputScoreFile, inputAlignmentFile, alignerArgs):
    """Reconstruct an alignment chunk by chunk with the external Ortheus command and stitch the chunks together.

    The input alignment is read in chunks with getNextAlignmentChunk. For each
    chunk the reconstruction command is run through os.system; if it exits
    non-zero it is retried once with alignerArgs.CAUTIOUS_ARGS, and a second
    failure ends the process with sys.exit(1). The score of each chunk is
    appended to outputScoreFile, and the reconstructed sequences are
    accumulated per node and finally concatenated into outputFile.

    Arguments:
    seqNo -- number of input sequences; the tree must have seqNo * 2 - 1 nodes.
    inputSeqFiles -- not used by this function.
    treeString -- newick tree, parsed here and also passed to the command.
    outputFile -- path of the final stitched alignment.
    outputScoreFile -- file that the per-chunk scores are appended to.
    inputAlignmentFile -- multi-fasta alignment to be reconstructed.
    alignerArgs -- settings object; RECONSTRUCTION_PREFIX, FAST_SETTING,
        RECONSTRUCTION_ARGS_FAST, RECONSTRUCTION_ARGS, CAUTIOUS_ARGS,
        ALIGNMENT_CHUNK_MAX_COLUMN_SIZE, VITERBI_ALIGNMENT_COLUMN_GAP and
        EXPECTED_CHARACTER_FREQUENCIES are read from it.
    """
    beganAt = time.time() #epoch time in seconds

    logger.info("Starting Stitcher")
    commandPrefix = alignerArgs.RECONSTRUCTION_PREFIX
    normalArgs = alignerArgs.RECONSTRUCTION_ARGS_FAST if alignerArgs.FAST_SETTING else alignerArgs.RECONSTRUCTION_ARGS
    fallbackArgs = alignerArgs.CAUTIOUS_ARGS
    maxChunkColumns = alignerArgs.ALIGNMENT_CHUNK_MAX_COLUMN_SIZE
    columnGap = alignerArgs.VITERBI_ALIGNMENT_COLUMN_GAP

    #parse and number the tree
    binaryTree = newickTreeParser(treeString)
    binaryTree_depthFirstNumbers(binaryTree)
    logger.info("Newick tree read : %s " % printBinaryTree(binaryTree, True))
    labels = binaryTree_nodeNames(binaryTree)
    #labels at even positions are used as the leaf labels
    leafLabels = [ labels[position] for position in range(0, len(labels), 2) ]

    #alignment iterator; sequences are passed through unchanged
    identity = lambda x : x
    alignmentReader = multiFastaRead(inputAlignmentFile, identity)

    #number of sequences, including ancestors
    nodeNumber = binaryTree.traversalID.midEnd
    assert nodeNumber == seqNo * 2 - 1

    #one output file (and iterator) per node
    outputFiles, outputIterators = getOpenSeqFiles(nodeNumber, getTempFile)

    def fetchChunk(carriedOver):
        #next (sequence files, alignment file, end flag) triple from the reader
        return getNextAlignmentChunk(carriedOver, alignmentReader, maxChunkColumns, seqNo, leafLabels)

    previousAlignment = []
    alignmentSeqs, alignmentFile, end = fetchChunk(previousAlignment)
    treeStatesFile = getTempFile()
    loopOptions = " "  #the first call gets no "-t" option; see the end of the loop body
    logger.info("Starting main loop")
    frequencyString = " ".join([ str(frequency) for frequency in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES ])
    while alignmentSeqs != None:
        if end:
            #once the end flag is set the column gap is dropped to zero
            columnGap = 0
        ancestorFile = getTempFile()
        scoreFile = getTempFile()
        joinedSeqs = " ".join(alignmentSeqs)
        command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -n %s -x %s " % (
            commandPrefix, treeString, alignmentFile, joinedSeqs, treeStatesFile,
            columnGap, loopOptions, normalArgs, ancestorFile, frequencyString, scoreFile)
        logger.info("Calling Ortheus with : %s", command)
        exitValue = os.system(command)
        if exitValue != 0:
            #any non-zero exit leads to a single retry with the cautious arguments
            logger.info("Something went wrong calling Ortheus : %i ", exitValue)
            logger.info("Going to retry with caution settings")
            #the retry command carries no "-n" character-frequency option
            command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -x %s" % (
                commandPrefix, treeString, alignmentFile, joinedSeqs, treeStatesFile,
                columnGap, loopOptions, fallbackArgs, ancestorFile, scoreFile)
            logger.info("Calling Ortheus with : %s", command)
            if os.system(command) != 0:
                logger.info("Already tried caution, so have to go")
                sys.exit(1)
        logger.info("Completed reconstruction of chunk")
        appendScore(scoreFile, outputScoreFile)
        os.remove(scoreFile)
        #later calls are given the tree states file through "-t"
        loopOptions = " -t " + treeStatesFile
        ancestorOffsets = getMultiFastaOffsets(ancestorFile)
        #the ancestor file is read twice: once for the next chunk's carry-over, once for the output
        previousAlignment = removeFromLeft(multiFastaRead(ancestorFile, identity, ancestorOffsets), previousAlignment, nodeNumber, seqNo)
        appendToAlignment(multiFastaRead(ancestorFile, identity, ancestorOffsets), outputIterators, nodeNumber)
        logger.info("Added reconstructed chunk to complete alignment")
        os.remove(ancestorFile)
        removeSeqFiles(alignmentSeqs, seqNo)
        os.remove(alignmentFile)
        logger.info("Cleaned up at end of loop")
        alignmentSeqs, alignmentFile, end = fetchChunk(previousAlignment)
    logger.info("Finished main loop")

    #load into single output file
    closeSeqIterators(outputIterators, nodeNumber)
    concatanateSeqFiles(outputFiles, outputFile, nodeNumber, labels)
    logger.info("Written out alignment to single file")

    #clean up
    os.remove(treeStatesFile)
    removeSeqFiles(outputFiles, nodeNumber)
    logger.info("Cleaned up final files")
    logger.info("Finished, total time taken for stitcher: %s (seconds)" % (time.time()-beganAt))