def calculateSemphyTreeEstimate(alignmentFile, treeArgs, seqNo):
    """Run Semphy on an alignment and return the tree it estimates.

    alignmentFile -- alignment handed to formatForSemphy() to produce
        Semphy's input file.
    treeArgs -- settings object; SEMPHY_PATH, SEMPHY_ARGS_PAIRS,
        SEMPHY_ARGS_TREE and EXPECTED_CHARACTER_FREQUENCIES are read.
    seqNo -- number of sequences; exactly 2 selects SEMPHY_ARGS_PAIRS,
        anything else selects SEMPHY_ARGS_TREE.

    Returns the tree parsed from the first line of Semphy's output file,
    after binaryTree_depthFirstNumbers() and correctTreeDistances() have
    been applied to it.

    Exits the process with status 1 if Semphy reports a non-zero status or
    leaves the output file empty. Both temporary files are removed on every
    path, including those failure exits.
    """
    if seqNo == 2:
        semphyArgs = treeArgs.SEMPHY_ARGS_PAIRS
    else:
        semphyArgs = treeArgs.SEMPHY_ARGS_TREE
    semphyAlignmentFile = formatForSemphy(alignmentFile)
    outputTreeFile = getTempFile()
    try:
        # All but the last expected frequency are passed; the format string
        # needs exactly three values.
        characterFrequencies = " --ACGprob=%f,%f,%f" % tuple(
            treeArgs.EXPECTED_CHARACTER_FREQUENCIES[:-1])
        command = "%s --treeoutputfile=%s %s %s --sequence=%s " % (
            treeArgs.SEMPHY_PATH, outputTreeFile, semphyArgs,
            characterFrequencies, semphyAlignmentFile)
        logger.info("Calling Semphy with %s ", command)
        pipe = os.popen(command)
        # close() returns a non-zero status when the command failed.
        if pipe.close():
            logger.info("tree building failed, so must exit")
            sys.exit(1)
        # Only the first line holds the tree.
        with open(outputTreeFile, 'r') as fileHandle:
            treeString = fileHandle.readline()
        if not treeString:
            logger.info("tree building failed, so must exit")
            sys.exit(1)
    finally:
        # clean up, also when bailing out through sys.exit above
        for tempFile in (semphyAlignmentFile, outputTreeFile):
            if os.path.exists(tempFile):
                os.remove(tempFile)
    binaryTree = newickTreeParser(treeString, False)
    binaryTree_depthFirstNumbers(binaryTree)
    correctTreeDistances(binaryTree)
    return binaryTree
def calculateSemphyTreeEstimate(alignmentFile, treeArgs, seqNo):
    """Run Semphy on an alignment and return the tree it estimates.

    alignmentFile -- alignment handed to formatForSemphy() to produce
        Semphy's input file.
    treeArgs -- settings object; SEMPHY_PATH, SEMPHY_ARGS_PAIRS,
        SEMPHY_ARGS_TREE and EXPECTED_CHARACTER_FREQUENCIES are read.
    seqNo -- number of sequences; exactly 2 selects SEMPHY_ARGS_PAIRS,
        anything else selects SEMPHY_ARGS_TREE.

    Returns the tree parsed from the first line of Semphy's output file,
    after binaryTree_depthFirstNumbers() and correctTreeDistances() have
    been applied to it.

    Exits the process with status 1 if Semphy reports a non-zero status or
    leaves the output file empty. Both temporary files are removed on every
    path, including those failure exits.
    """
    if seqNo == 2:
        semphyArgs = treeArgs.SEMPHY_ARGS_PAIRS
    else:
        semphyArgs = treeArgs.SEMPHY_ARGS_TREE
    semphyAlignmentFile = formatForSemphy(alignmentFile)
    outputTreeFile = getTempFile()
    try:
        # All but the last expected frequency are passed; the format string
        # needs exactly three values.
        characterFrequencies = " --ACGprob=%f,%f,%f" % tuple(
            treeArgs.EXPECTED_CHARACTER_FREQUENCIES[:-1])
        command = "%s --treeoutputfile=%s %s %s --sequence=%s " % (
            treeArgs.SEMPHY_PATH, outputTreeFile, semphyArgs,
            characterFrequencies, semphyAlignmentFile)
        logger.info("Calling Semphy with %s ", command)
        pipe = os.popen(command)
        # close() returns a non-zero status when the command failed.
        if pipe.close():
            logger.info("tree building failed, so must exit")
            sys.exit(1)
        # Only the first line holds the tree.
        with open(outputTreeFile, 'r') as fileHandle:
            treeString = fileHandle.readline()
        if not treeString:
            logger.info("tree building failed, so must exit")
            sys.exit(1)
    finally:
        # clean up, also when bailing out through sys.exit above
        for tempFile in (semphyAlignmentFile, outputTreeFile):
            if os.path.exists(tempFile):
                os.remove(tempFile)
    binaryTree = newickTreeParser(treeString, False)
    binaryTree_depthFirstNumbers(binaryTree)
    correctTreeDistances(binaryTree)
    return binaryTree
def main():
    """Command-line entry point.

    Builds the default argument object, prints usage (followed by the
    Ortheus and Pecan help output) and exits with status 0 when fewer than
    two command-line arguments are given, otherwise parses the modifiers.
    A tree is then either parsed from NEWICK_TREE_STRING or estimated with
    estimateTreeAlign(), and nestAlign() is run when MAKE_FINAL_ALIGNMENT is
    set. The elapsed wall-clock time is printed at the end.

    Raises AssertionError if any command-line arguments remain unparsed.
    """
    sys.stderr.write("Arguments received : %s \n" % "_".join(sys.argv))
    startTime = time.time()

    alignerArgs = getDefaultArgs()
    addDefaultArgs(alignerArgs)
    addDefaultStitcherArgs(alignerArgs)
    addDefaultNesterArgs(alignerArgs)
    addDefaultEstimateTreeArgs(alignerArgs)

    i = loggerIndices
    removeReservedIndices(i, alignerArgs)

    if len(sys.argv) < 3:
        print("Ortheus.py [MODIFIER_ARGUMENTS]")
        # Joined by hand so the output matches a two-item print statement.
        print(" ".join(("Version: ", str(VERSION_NO))))
        print("A top level script for running Ortheus and Pecan to produce substitution and indel aware reconstructed chunks of genome")
        print("If you would like to contribute to this program's development please contact me at bjp (AT) ebi (DOT) ac (DOT) uk ")
        print("Arguments:")
        i = printFirstMods(alignerArgs, i)
        i = printMods(alignerArgs, i)
        i = printModsStitcher(alignerArgs, i)
        i = printModsNester(alignerArgs, i)
        i = printEstimateTreeMods(alignerArgs, i)
        print("-------------Ortheus help string as follows (Changing these arguments may break the script)-------------")
        os.system("ortheus_core")
        print("-------------End Ortheus help string-------------")
        print("-------------Pecan help string as follows (Changing these arguments may break the script)-------------")
        os.system("%s bp.pecan.Pecan -help" % (alignerArgs.JAVA_PREFIX,))
        print("-------------End Pecan help string-------------")
        sys.exit(0)

    # The two lists swap roles from one parser to the next; presumably each
    # parser moves what it does not recognise into its last argument --
    # TODO confirm against the parse* helpers.
    mods = sys.argv[1:]
    otherMods = []
    i = parseFirstMods(mods, alignerArgs, i, otherMods)
    i = parseMods(otherMods, alignerArgs, i, mods)
    i = parseModsStitcher(mods, alignerArgs, i, otherMods)
    i = parseModsNester(otherMods, alignerArgs, i, mods)
    i = parseEstimateTreeMods(mods, alignerArgs, i, otherMods)
    if otherMods:
        logger.info("Ooops, remaining arguments %s ", " ".join(otherMods))
        # Raised explicitly: a bare assert is removed under "python -O" and
        # the leftover arguments would then be ignored silently.
        raise AssertionError("Unparsed arguments remain: %s" % " ".join(otherMods))

    logger.info("Arguments received : %s " % " ".join(sys.argv))
    logger.info("Sequence files : %s " % " ".join(alignerArgs.SEQUENCE_FILES))
    if alignerArgs.EMPIRICALLY_ESTIMATE_CHARACTER_FREQUENCIES:
        alignerArgs.EXPECTED_CHARACTER_FREQUENCIES = empiricallyEstimateNucleotideFrequencies(alignerArgs.SEQUENCE_FILES)
        logger.info("Empirically estimated character frequencies : %s " %
                    " ".join([str(freq) for freq in alignerArgs.EXPECTED_CHARACTER_FREQUENCIES]))

    # Discard any score file left at the same path; it is fine if none exists.
    try:
        os.remove(alignerArgs.OUTPUT_SCORE_FILE)
    except OSError:
        pass

    if alignerArgs.NEWICK_TREE_STRING is not None:
        binaryTree = newickTreeParser(alignerArgs.NEWICK_TREE_STRING)
        logger.info("Newick tree read : %s " % printBinaryTree(binaryTree, True))
    else:
        binaryTree, seqFiles, outputAlignment = estimateTreeAlign(
            alignerArgs.SEQUENCE_FILES, alignerArgs.OUTPUT_TREE_FILE, alignerArgs)
        os.remove(outputAlignment)  # the intermediate alignment is not kept, for now
        # Continue with the sequence-file order returned alongside the tree.
        alignerArgs.SEQUENCE_FILES = seqFiles

    if alignerArgs.MAKE_FINAL_ALIGNMENT:
        nestAlign(binaryTree, alignerArgs.SEQUENCE_FILES, alignerArgs.OUTPUT_FILE,
                  alignerArgs.OUTPUT_SCORE_FILE, alignerArgs)
    print("total_time %s " % (time.time() - startTime))
def estimateTreeAlign(seqFiles, outputTreeFile, treeArgs):
    """Estimate a tree for the sequence files and write it to outputTreeFile.

    seqFiles -- list of sequence files (copied with a slice, so it must
        support slicing).
    outputTreeFile -- path that receives two lines: the tree as printed by
        printBinaryTree() and the space-separated sequence files.
    treeArgs -- settings object; DEFAULT_DISTANCE, ITERATION_NUMBER,
        DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, SPECIES_TREE_STRING and
        LEAF_SPECIES are read here, and the object is also passed on to
        estimateTree().

    When SPECIES_TREE_STRING is set, the tree is re-rooted with
    calculateProbableRootOfGeneTree() and the sequence files are re-derived
    with getSubtreeSeqs().

    Returns (tree, seqFiles, outputAlignment); outputAlignment is whatever
    estimateTree() returned.
    """
    origSeqFileOrder = seqFiles[:]
    tree = makeStarTree(len(seqFiles), 0, treeArgs.DEFAULT_DISTANCE)
    binaryTree_depthFirstNumbers(tree)
    labelTree(tree, strCounter([-1]))
    tree, seqFiles, outputAlignment = estimateTree(
        seqFiles, tree, treeArgs.ITERATION_NUMBER,
        treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, treeArgs)
    seqFiles = list(seqFiles)

    if treeArgs.SPECIES_TREE_STRING is not None:
        logger.info("Predicting root of tree using species tree")
        speciesTree = newickTreeParser(treeArgs.SPECIES_TREE_STRING)
        binaryTree_depthFirstNumbers(speciesTree)
        logger.info("Parsed species tree: %s" % printBinaryTree(speciesTree, True))

        # One-element list so the nested function can advance the counter.
        leafCounter = [-1]

        def speciesLabel():
            # Temporary label "<species>_<leaf index>" for the next leaf.
            leafCounter[0] += 1
            j = origSeqFileOrder.index(seqFiles[leafCounter[0]])
            return "%s_%s" % (treeArgs.LEAF_SPECIES[j], str(leafCounter[0]))

        labelTree(tree, speciesLabel)
        # Split on the LAST underscore only: the index suffix never contains
        # one, but a species name may (e.g. "homo_sapiens").
        tree, dupCount, lossCount = calculateProbableRootOfGeneTree(
            speciesTree, tree, processID=lambda x: x.rsplit("_", 1)[0])

        def restoreLeafIndices(node):
            # Strip the species prefix again, leaving the index as the leaf iD.
            if node.internal:
                restoreLeafIndices(node.left)
                restoreLeafIndices(node.right)
            else:
                node.iD = node.iD.rsplit('_', 1)[1]

        restoreLeafIndices(tree)
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        logger.info("Reconciled tree with root : %s %s " % (printBinaryTree(tree, True), " ".join(seqFiles)))
        logger.info("Number of dups needed for reconcilliations : %s " % dupCount)
        logger.info("Number of losses needed for reconcilliations : %s " % lossCount)

    seqFiles = list(seqFiles)
    with open(outputTreeFile, 'w') as out:
        out.write("%s\n" % printBinaryTree(tree, True))
        out.write("%s\n" % " ".join(seqFiles))
    logger.info("Finished estimate tree")
    return tree, seqFiles, outputAlignment
def estimateTreeAlign(seqFiles, outputTreeFile, treeArgs):
    """Estimate a tree for the sequence files and write it to outputTreeFile.

    seqFiles -- list of sequence files (copied with a slice, so it must
        support slicing).
    outputTreeFile -- path that receives two lines: the tree as printed by
        printBinaryTree() and the space-separated sequence files.
    treeArgs -- settings object; DEFAULT_DISTANCE, ITERATION_NUMBER,
        DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, SPECIES_TREE_STRING and
        LEAF_SPECIES are read here, and the object is also passed on to
        estimateTree().

    When SPECIES_TREE_STRING is set, the tree is re-rooted with
    calculateProbableRootOfGeneTree() and the sequence files are re-derived
    with getSubtreeSeqs().

    Returns (tree, seqFiles, outputAlignment); outputAlignment is whatever
    estimateTree() returned.
    """
    origSeqFileOrder = seqFiles[:]
    tree = makeStarTree(len(seqFiles), 0, treeArgs.DEFAULT_DISTANCE)
    binaryTree_depthFirstNumbers(tree)
    labelTree(tree, strCounter([-1]))
    tree, seqFiles, outputAlignment = estimateTree(
        seqFiles, tree, treeArgs.ITERATION_NUMBER,
        treeArgs.DO_SUBTREE_BRANCH_LENGTH_ESTIMATION, treeArgs)
    seqFiles = list(seqFiles)

    if treeArgs.SPECIES_TREE_STRING is not None:
        logger.info("Predicting root of tree using species tree")
        speciesTree = newickTreeParser(treeArgs.SPECIES_TREE_STRING)
        binaryTree_depthFirstNumbers(speciesTree)
        logger.info("Parsed species tree: %s" % printBinaryTree(speciesTree, True))

        # One-element list so the nested function can advance the counter.
        leafCounter = [-1]

        def speciesLabel():
            # Temporary label "<species>_<leaf index>" for the next leaf.
            leafCounter[0] += 1
            j = origSeqFileOrder.index(seqFiles[leafCounter[0]])
            return "%s_%s" % (treeArgs.LEAF_SPECIES[j], str(leafCounter[0]))

        labelTree(tree, speciesLabel)
        # Split on the LAST underscore only: the index suffix never contains
        # one, but a species name may (e.g. "homo_sapiens").
        tree, dupCount, lossCount = calculateProbableRootOfGeneTree(
            speciesTree, tree, processID=lambda x: x.rsplit("_", 1)[0])

        def restoreLeafIndices(node):
            # Strip the species prefix again, leaving the index as the leaf iD.
            if node.internal:
                restoreLeafIndices(node.left)
                restoreLeafIndices(node.right)
            else:
                node.iD = node.iD.rsplit('_', 1)[1]

        restoreLeafIndices(tree)
        seqFiles = getSubtreeSeqs(seqFiles, tree)
        logger.info("Reconciled tree with root : %s %s " % (printBinaryTree(tree, True), " ".join(seqFiles)))
        logger.info("Number of dups needed for reconcilliations : %s " % dupCount)
        logger.info("Number of losses needed for reconcilliations : %s " % lossCount)

    seqFiles = list(seqFiles)
    with open(outputTreeFile, 'w') as out:
        out.write("%s\n" % printBinaryTree(tree, True))
        out.write("%s\n" % " ".join(seqFiles))
    logger.info("Finished estimate tree")
    return tree, seqFiles, outputAlignment
def stitchReconstruct(seqNo, inputSeqFiles, treeString, outputFile, outputScoreFile, inputAlignmentFile, alignerArgs):
    """Reconstruct an alignment chunk by chunk and write it to outputFile.

    seqNo -- number of leaf sequences; the tree must have seqNo * 2 - 1 nodes.
    inputSeqFiles -- not used in this function.
    treeString -- newick tree, parsed here and also passed to Ortheus via -b.
    outputFile -- receives the concatenated per-node sequences.
    outputScoreFile -- each chunk's score file is appended to it.
    inputAlignmentFile -- multi-fasta alignment that is read in chunks.
    alignerArgs -- settings object; RECONSTRUCTION_PREFIX, FAST_SETTING,
        RECONSTRUCTION_ARGS(_FAST), CAUTIOUS_ARGS,
        ALIGNMENT_CHUNK_MAX_COLUMN_SIZE, VITERBI_ALIGNMENT_COLUMN_GAP and
        EXPECTED_CHARACTER_FREQUENCIES are read.

    For every chunk Ortheus is run through os.system; on a non-zero status
    it is run once more with CAUTIOUS_ARGS, and if that also fails the
    process exits with status 1.
    """
    def passThrough(x):
        return x

    stitcherStart = time.time()  # epoch time in seconds
    logger.info("Starting Stitcher")

    ortheusPrefix = alignerArgs.RECONSTRUCTION_PREFIX
    normalArgs = (alignerArgs.RECONSTRUCTION_ARGS_FAST if alignerArgs.FAST_SETTING
                  else alignerArgs.RECONSTRUCTION_ARGS)
    fallbackArgs = alignerArgs.CAUTIOUS_ARGS
    chunkMaxColumns = alignerArgs.ALIGNMENT_CHUNK_MAX_COLUMN_SIZE
    columnGap = alignerArgs.VITERBI_ALIGNMENT_COLUMN_GAP

    # parse tree
    binaryTree = newickTreeParser(treeString)
    binaryTree_depthFirstNumbers(binaryTree)
    logger.info("Newick tree read : %s ", printBinaryTree(binaryTree, True))
    labels = binaryTree_nodeNames(binaryTree)
    # every second label, starting with the first, is used as a leaf label
    leafLabels = [labels[pos] for pos in range(0, len(labels), 2)]

    # load alignment iterator
    alignmentReader = multiFastaRead(inputAlignmentFile, passThrough)

    # number of sequences, including ancestors
    nodeNumber = binaryTree.traversalID.midEnd
    assert nodeNumber == seqNo * 2 - 1

    # create output files
    outputFiles, outputIterators = getOpenSeqFiles(nodeNumber, getTempFile)

    previousAlignment = []
    alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
        previousAlignment, alignmentReader, chunkMaxColumns, seqNo, leafLabels)
    tempTreeStatesFile = getTempFile()
    loopOptions = " "
    logger.info("Starting main loop")
    characterFrequenciesString = " ".join(map(str, alignerArgs.EXPECTED_CHARACTER_FREQUENCIES))

    # while there is another chunk
    while alignmentSeqs is not None:
        if end:
            columnGap = 0
        tempAncestorFile = getTempFile()
        tempScoreFile = getTempFile()

        # Leading fields shared by the normal and the cautious command line.
        sharedFields = (ortheusPrefix, treeString, alignmentFile,
                        " ".join(alignmentSeqs), tempTreeStatesFile,
                        columnGap, loopOptions)
        command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -n %s -x %s " % (
            sharedFields + (normalArgs, tempAncestorFile,
                            characterFrequenciesString, tempScoreFile))
        logger.info("Calling Ortheus with : %s", command)
        exitValue = os.system(command)
        if exitValue != 0:
            logger.info("Something went wrong calling Ortheus : %i ", exitValue)
            # A stricter policy (exit unless the status is 73) used to be
            # sketched here; it is disabled, so every failure is retried.
            logger.info("Going to retry with caution settings")
            # The cautious command carries no -n character frequencies.
            command = "%s -b '%s' -c %s -a %s -u %s -s %s %s %s -d %s -x %s" % (
                sharedFields + (fallbackArgs, tempAncestorFile, tempScoreFile))
            logger.info("Calling Ortheus with : %s", command)
            if os.system(command):
                logger.info("Already tried caution, so have to go")
                sys.exit(1)
        logger.info("Completed reconstruction of chunk")

        appendScore(tempScoreFile, outputScoreFile)
        os.remove(tempScoreFile)
        # from now on every call also gets -t with the tree states file
        loopOptions = " -t " + tempTreeStatesFile

        tempAncestorFastaOffsets = getMultiFastaOffsets(tempAncestorFile)
        # The chunk is read twice, each time through a fresh reader.
        previousAlignment = removeFromLeft(
            multiFastaRead(tempAncestorFile, passThrough, tempAncestorFastaOffsets),
            previousAlignment, nodeNumber, seqNo)
        appendToAlignment(
            multiFastaRead(tempAncestorFile, passThrough, tempAncestorFastaOffsets),
            outputIterators, nodeNumber)
        logger.info("Added reconstructed chunk to complete alignment")

        os.remove(tempAncestorFile)
        removeSeqFiles(alignmentSeqs, seqNo)
        os.remove(alignmentFile)
        logger.info("Cleaned up at end of loop")

        alignmentSeqs, alignmentFile, end = getNextAlignmentChunk(
            previousAlignment, alignmentReader, chunkMaxColumns, seqNo, leafLabels)
    logger.info("Finished main loop")

    # load into single output file
    closeSeqIterators(outputIterators, nodeNumber)
    concatanateSeqFiles(outputFiles, outputFile, nodeNumber, labels)
    logger.info("Written out alignment to single file")

    # clean up
    os.remove(tempTreeStatesFile)
    removeSeqFiles(outputFiles, nodeNumber)
    logger.info("Cleaned up final files")
    logger.info("Finished, total time taken for stitcher: %s (seconds)", time.time() - stitcherStart)