def buildHmmScores(hmmPaths, queriesPath, scoreFileHmmFileMap):
    """Create scoring tasks that run every HMM in hmmPaths against the query
    sequences, splitting the queries into chunks of at most 1000 taxa.

    scoreFileHmmFileMap is populated in place, mapping each score output file
    back to the HMM that produced it. Returns the list of scoring tasks.
    """
    queries = sequenceutils.readFromFasta(queriesPath, removeDashes=True)
    baseName = os.path.basename(queriesPath).split('.')[0]
    dirName = os.path.join(os.path.dirname(queriesPath),
                           "chunks_{}".format(baseName))
    if not os.path.exists(dirName):
        os.makedirs(dirName)

    chunkSize = 1000
    taxa = list(queries.keys())
    inputOutputs = []
    for chunkIdx in range(math.ceil(len(taxa) / chunkSize)):
        # Slicing already clamps to the list length; no explicit min() needed.
        chunkTaxa = taxa[chunkIdx * chunkSize:(chunkIdx + 1) * chunkSize]
        chunkFile = os.path.join(
            dirName, "{}_chunk_{}.txt".format(baseName, chunkIdx + 1))
        sequenceutils.writeFasta(queries, chunkFile, chunkTaxa)
        for hmmPath in hmmPaths:
            # Score files live next to their HMM, so names are unique per HMM.
            scoreFile = os.path.join(
                os.path.dirname(hmmPath),
                "{}_chunk_{}_score.txt".format(baseName, chunkIdx + 1))
            inputOutputs.append((hmmPath, chunkFile, scoreFile))
            scoreFileHmmFileMap[scoreFile] = hmmPath

    return [getHmmScores(hmm, inFile, outFile)
            for hmm, inFile, outFile in inputOutputs]
def buildInducedSubalignment(**kwargs):
    """Project a subalignment onto the final alignment's columns.

    Reads the column-membership file (first line: insertion column indexes,
    lower-cased in the output; each later line: the subalignment columns that
    merge into one output column), builds the induced alignment, and writes it
    to the requested output file via a temp file moved into place.
    """
    columnsFile = kwargs["alignmentColumnsPath"]
    subalignmentFile = kwargs["subalignmentPath"]
    outputFile = kwargs["outputFile"]
    tempFile = os.path.join(os.path.dirname(outputFile),
                            "temp_{}".format(os.path.basename(outputFile)))

    with open(columnsFile) as reader:
        insertIdxs = {int(tok) for tok in reader.readline().strip().split()}
        columns = [{int(tok) for tok in line.strip().split()}
                   for line in reader]

    subsetAlign = sequenceutils.readFromFasta(subalignmentFile,
                                              removeDashes=False)
    induced = {taxon: ['-'] * len(columns) for taxon in subsetAlign}
    for colIdx, column in enumerate(columns):
        for taxon, sequence in subsetAlign.items():
            for srcIdx in column:
                letter = sequence.seq[srcIdx]
                if letter == '-':
                    continue
                if srcIdx in insertIdxs:
                    letter = letter.lower()
                # Each output column receives at most one letter per taxon.
                assert induced[taxon][colIdx] == '-'
                induced[taxon][colIdx] = letter

    for taxon in induced:
        induced[taxon] = sequenceutils.Sequence(taxon, "".join(induced[taxon]))
    sequenceutils.writeFasta(induced, tempFile)
    shutil.move(tempFile, outputFile)
def decomposeGuideTree(subsetsDir, sequencesPath, guideTreePath,
                       maxSubsetSize, maxNumSubsets):
    """Split the sequences into subsets by decomposing the guide tree.

    Annotates every edge with its leaf count, decomposes the tree into
    subtrees of bounded size, writes one FASTA file per resulting taxon
    subset, and returns the subset file paths.
    """
    sequences = sequenceutils.readFromFasta(sequencesPath, removeDashes=False)
    guideTree = dendropy.Tree.get(path=guideTreePath, schema="newick",
                                  preserve_underscores=True)
    guideTree.collapse_basal_bifurcation()

    # Bottom-up: each edge carries the number of leaves below it.
    for edge in guideTree.postorder_edge_iter():
        childEdges = edge.head_node.child_edges()
        edge.childs = sum(e.childs for e in childEdges) if childEdges else 1
    guideTree.childs = guideTree.seed_node.edge.childs

    subsetPaths = []
    subtrees = decomposeTree(guideTree, maxSubsetSize, maxNumSubsets)
    for num, subtree in enumerate(subtrees, start=1):
        taxa = [leaf.taxon.label for leaf in subtree.leaf_nodes()]
        subsetPath = os.path.join(subsetsDir, "subset_{}.txt".format(num))
        sequenceutils.writeFasta(sequences, subsetPath, taxa)
        subsetPaths.append(subsetPath)
    return subsetPaths
def hmmAlignQueries(hmmPath, queriesPath):
    """Chunk the query sequences (1000 taxa per chunk) and create one
    hmmalign task per chunk against the given HMM. Returns the tasks."""
    queries = sequenceutils.readFromFasta(queriesPath, removeDashes=True)
    baseName = os.path.basename(queriesPath).split('.')[0]
    dirName = os.path.join(os.path.dirname(queriesPath),
                           "chunks_{}".format(baseName))
    if not os.path.exists(dirName):
        os.makedirs(dirName)

    chunkSize = 1000
    taxa = list(queries.keys())
    tasks = []
    for chunkIdx in range(math.ceil(len(taxa) / chunkSize)):
        # Slicing clamps to the list length automatically.
        chunkTaxa = taxa[chunkIdx * chunkSize:(chunkIdx + 1) * chunkSize]
        inputFile = os.path.join(
            dirName, "{}_chunk_{}.txt".format(baseName, chunkIdx + 1))
        outputFile = os.path.join(
            dirName, "{}_chunk_{}_aligned.txt".format(baseName, chunkIdx + 1))
        sequenceutils.writeFasta(queries, inputFile, chunkTaxa)
        tasks.append(buildHmmAlignment(hmmPath, inputFile, outputFile))
    return tasks
def assignBackboneTaxa(context, numTaxa, unalignedFile):
    """Pick up to numTaxa random taxa from each subset as the backbone.

    Writes the chosen unaligned sequences to unalignedFile and returns the
    backbone as a dict mapping taxon -> sequence.

    Bug fix: the previous version called random.shuffle(subset), silently
    reordering the lists inside context.subsets as a side effect on the
    caller's data. random.sample draws the same uniform random selection
    without mutating the input lists.
    """
    backbone = {}
    for subset in context.subsets:
        # random.sample raises ValueError if asked for more items than
        # exist, so clamp the sample size (the old slice did this implicitly).
        for taxon in random.sample(subset, min(numTaxa, len(subset))):
            backbone[taxon] = context.unalignedSequences[taxon]
    sequenceutils.writeFasta(backbone, unalignedFile)
    return backbone
def randomDecomposition(subsetsDir, sequences, numSubsets):
    """Shuffle the taxa and deal them round-robin into numSubsets subsets,
    writing each subset to its own FASTA file. Returns the subset paths."""
    shuffledTaxa = list(sequences.keys())
    random.shuffle(shuffledTaxa)

    subsetPaths = []
    for num in range(numSubsets):
        # Stride slicing deals taxa out like cards: subset k gets
        # positions k, k+numSubsets, k+2*numSubsets, ...
        subsetTaxa = shuffledTaxa[num::numSubsets]
        subsetPath = os.path.join(subsetsDir, "subset_{}.txt".format(num + 1))
        sequenceutils.writeFasta(sequences, subsetPath, subsetTaxa)
        subsetPaths.append(subsetPath)
    return subsetPaths
def reassignTaxons(subsetsDir, subsetSeedPaths, sequences, unusedTaxa):
    """Assign the unused taxa to subsets by HMM affinity.

    Builds one HMM per seed subset, scores every unused taxon against every
    HMM, and places each taxon with the HMM that scored it highest. Seed taxa
    keep their original subsets. Writes one FASTA file per final subset and
    returns the file paths.
    """
    unusedPath = os.path.join(subsetsDir, "unassigned_sequences.txt")
    sequenceutils.writeFasta(sequences, unusedPath, unusedTaxa)

    # One HMM per seed subset, each in its own working directory.
    hmmMap = {}
    for seedPath in subsetSeedPaths:
        hmmDirName = "hmm_{}".format(os.path.basename(seedPath)).replace(".", "_")
        hmmDir = os.path.join(os.path.dirname(seedPath), hmmDirName)
        if not os.path.exists(hmmDir):
            os.makedirs(hmmDir)
        hmmMap[seedPath] = os.path.join(hmmDir, "hmm_model.txt")
    hmmTasks = hmmutils.buildHmms(hmmMap)
    task.submitTasks(hmmTasks)
    task.awaitTasks(hmmTasks)
    hmmPaths = [t.outputFile for t in hmmTasks]

    # Score every unused taxon against every HMM.
    scoreFileHmmFileMap = {}
    scoreTasks = hmmutils.buildHmmScores(hmmPaths, unusedPath,
                                         scoreFileHmmFileMap)
    task.submitTasks(scoreTasks)

    # Track, per taxon, the HMM with the strictly best score seen so far.
    bestScores = {}
    taxonHmmMap = {}
    for finished in task.asCompleted(scoreTasks):
        searchResults = hmmutils.readSearchFile(finished.outputFile)
        for taxon, scores in searchResults.items():
            if scores[1] > bestScores.get(taxon, -float("inf")):
                bestScores[taxon] = scores[1]
                taxonHmmMap[taxon] = scoreFileHmmFileMap[finished.outputFile]

    subsetTaxons = {hmmPath: [] for hmmPath in hmmPaths}
    for taxon, hmmPath in taxonHmmMap.items():
        subsetTaxons[hmmPath].append(taxon)
    # Seed taxa stay with the HMM built from their original subset.
    for seedPath, hmmPath in hmmMap.items():
        for taxon in sequenceutils.readFromFasta(seedPath):
            subsetTaxons[hmmPath].append(taxon)

    subsetPaths = []
    for num, subset in enumerate(subsetTaxons.values(), start=1):
        subsetPath = os.path.join(subsetsDir, "subset_{}.txt".format(num))
        sequenceutils.writeFasta(sequences, subsetPath, subset)
        subsetPaths.append(subsetPath)
    return subsetPaths
def buildInitialTreeAlign(tempDir, sequencesPath):
    """Build the initial backbone alignment and estimate a tree over it.

    Returns (treePath, alignPath, unusedTaxa). On a cache hit (both output
    files already exist) the unused-taxa list is not recoverable, so None
    is returned in its place.

    Bug fix: the cached branch previously returned only a 2-tuple while the
    fresh branch returned a 3-tuple, so any caller unpacking three values
    would crash on a warm start. Both branches now return three values.
    """
    outputTreePath = os.path.join(tempDir, "initial_tree.tre")
    outputAlignPath = os.path.join(tempDir, "initial_align.txt")
    if os.path.exists(outputTreePath) and os.path.exists(outputAlignPath):
        # Cache hit: unusedTaxa was computed by a previous run and is lost.
        return outputTreePath, outputAlignPath, None

    # Start from a clean working directory.
    if os.path.exists(tempDir):
        shutil.rmtree(tempDir)
    os.makedirs(tempDir)

    initialAlign, unusedTaxa = decomposer.initial_tree.buildInitialAlignment(
        sequencesPath, tempDir, Configs.decompositionSkeletonSize, 1000)
    sequenceutils.writeFasta(initialAlign, outputAlignPath)
    #external_tools.runRaxmlNg(outputAlignPath, tempDir, outputTreePath, 8).run()
    external_tools.runFastTree(outputAlignPath, tempDir, outputTreePath).run()
    return outputTreePath, outputAlignPath, unusedTaxa
def requestHmmExtensionTasks(context, backbone, alignedFile):
    """Build an HMM over the backbone alignment and create hmmalign tasks
    for every taxon not already in the backbone. Returns the align tasks."""
    baseName = os.path.basename(alignedFile)
    hmmDir = os.path.join(context.graph.workingDir, "hmm_{}".format(baseName))
    extensionUnalignedFile = os.path.join(hmmDir, "queries.txt")
    hmmPath = os.path.join(hmmDir, "hmm_model.txt")
    if not os.path.exists(hmmDir):
        os.makedirs(hmmDir)

    # Everything outside the backbone becomes a query for extension.
    backboneExtension = {
        taxon: sequence
        for taxon, sequence in context.unalignedSequences.items()
        if taxon not in backbone
    }
    sequenceutils.writeFasta(backboneExtension, extensionUnalignedFile)

    # The HMM build runs synchronously; the alignments run as tasks.
    hmmutils.buildHmmOverAlignment(alignedFile, hmmPath).run()
    return hmmutils.hmmAlignQueries(hmmPath, extensionUnalignedFile)
def writeUnconstrainedAlignment(context):
    """Write the final alignment directly from the cluster structure.

    Each cluster becomes one output column. For every element of a cluster,
    the representative (first) taxon of that element's subalignment
    contributes its next unaligned letter; all other taxa keep a gap in
    that column.
    """
    graph = context.graph
    numColumns = len(graph.clusters)
    alignment = {
        taxon: sequenceutils.Sequence(taxon, ['-'] * numColumns)
        for taxon in context.unalignedSequences
    }

    # Per-taxon cursor into its unaligned sequence.
    cursors = {taxon: 0 for taxon in context.unalignedSequences}
    for columnIdx, cluster in enumerate(graph.clusters):
        for element in cluster:
            bsub, bpos = graph.matSubPosMap[element]
            taxon = context.subalignments[bsub][0]
            position = cursors[taxon]
            alignment[taxon].seq[columnIdx] = \
                context.unalignedSequences[taxon].seq[position]
            cursors[taxon] = position + 1

    # Collapse the per-column character lists into strings before writing.
    for taxon in alignment:
        alignment[taxon].seq = "".join(alignment[taxon].seq)
    sequenceutils.writeFasta(alignment, context.outputFile)
    Configs.log("Wrote final alignment to {}".format(context.outputFile))
def buildInitialAlignment(sequences, tempDir, skeletonSize, initialAlignSize,
                          outputAlignPath):
    """Align a skeleton subset with MAFFT, then extend the alignment with up
    to (initialAlignSize - skeletonSize) additional taxa via hmmalign.

    sequences maps taxon -> sequence; the finished alignment is written to
    outputAlignPath.

    Bug fix: unusedTaxa (the taxa excluded from the initial alignment) was
    computed but never returned, so the function implicitly returned None
    and callers could not tell which taxa were left out. It is now returned;
    callers that ignored the old implicit None are unaffected.
    """
    skeletonPath = os.path.join(tempDir, "skeleton_sequences.txt")
    queriesPath = os.path.join(tempDir, "queries.txt")
    hmmDir = os.path.join(tempDir, "skeleton_hmm")
    hmmPath = os.path.join(hmmDir, "hmm_model.txt")
    initialInsertPath = os.path.join(tempDir, "initial_insert_align.txt")
    if not os.path.exists(hmmDir):
        os.makedirs(hmmDir)

    # Clamp the initial alignment size to the number of available sequences.
    if initialAlignSize is None or initialAlignSize > len(sequences):
        initialAlignSize = len(sequences)
    skeletonTaxa, remainingTaxa = decomposer.chooseSkeletonTaxa(
        sequences, skeletonSize)
    additional = initialAlignSize - skeletonSize
    random.shuffle(remainingTaxa)
    remainingTaxa, unusedTaxa = (remainingTaxa[:additional],
                                 remainingTaxa[additional:])

    sequenceutils.writeFasta(sequences, skeletonPath, skeletonTaxa)
    external_tools.runMafft(skeletonPath, None, tempDir, outputAlignPath,
                            Configs.numCores).run()

    if len(remainingTaxa) > 0:
        sequenceutils.writeFasta(sequences, queriesPath, remainingTaxa)
        hmmutils.buildHmmOverAlignment(outputAlignPath, hmmPath).run()
        hmmTasks = hmmutils.hmmAlignQueries(hmmPath, queriesPath)
        task.submitTasks(hmmTasks)
        # Merge each chunk's alignment as soon as its task completes.
        for hmmTask in task.asCompleted(hmmTasks):
            hmmutils.mergeHmmAlignments([hmmTask.outputFile], outputAlignPath,
                                        includeInsertions=False)
            if Configs.graphBuildMethod == "initial":
                hmmutils.mergeHmmAlignments([hmmTask.outputFile],
                                            initialInsertPath,
                                            includeInsertions=True)
    return unusedTaxa
def writeUnpackedAlignment(context):
    """Assemble the final alignment by inducing each subalignment onto the
    final column structure and appending the results to one output file.

    For every subalignment path, a column-membership file is written (first
    line: insertion positions; each later line: the subalignment positions
    that merge into one final column). A buildInducedSubalignment task then
    produces the induced alignment, which is appended to a temp file as each
    task completes; the temp file is finally moved into place atomically.
    """
    graph = context.graph
    filePath = context.outputFile
    tempFile = os.path.join(os.path.dirname(filePath),
                            "temp_{}".format(os.path.basename(filePath)))
    # Remove any stale partial output, since results are appended below.
    if os.path.exists(tempFile):
        os.remove(tempFile)

    # clusterMap[path][idx] lists the positions of subalignment `path`
    # that belong to final column `idx`.
    clusterMap = {
        path: [[] for c in graph.clusters]
        for path in context.subalignmentPaths
    }
    for idx, cluster in enumerate(graph.clusters):
        for b in cluster:
            bsub, bpos = graph.matSubPosMap[b]
            clusterMap[context.subalignmentPaths[bsub]][idx].append(bpos)

    # inserts[path] lists this subalignment's insertion positions.
    inserts = {path: [] for path in context.subalignmentPaths}
    for b in graph.insertions:
        bsub, bpos = graph.matSubPosMap[b]
        inserts[context.subalignmentPaths[bsub]].append(bpos)

    Configs.log("Assembling final alignment in {}".format(filePath))
    inducedSubalignTasks = []
    for bsub, subalignPath in enumerate(context.subalignmentPaths):
        alignmentColumnsPath = os.path.join(
            context.graph.workingDir,
            "alignment_columns_{}".format(os.path.basename(subalignPath)))
        # First line: insertion indexes; then one line per final column.
        with open(alignmentColumnsPath, 'w') as textFile:
            textFile.write("{}\n".format(" ".join(
                [str(c) for c in inserts[subalignPath]])))
            for cluster in clusterMap[subalignPath]:
                textFile.write("{}\n".format(" ".join(
                    [str(c) for c in cluster])))
        inducedAlignPath = os.path.join(
            graph.workingDir,
            "induced_{}".format(os.path.basename(subalignPath)))
        args = {
            "alignmentColumnsPath": alignmentColumnsPath,
            "subalignmentPath": subalignPath,
            "outputFile": inducedAlignPath
        }
        inducedTask = task.Task(taskType="buildInducedSubalignment",
                                outputFile=args["outputFile"],
                                taskArgs=args)
        inducedSubalignTasks.append(inducedTask)
        #inducedTask.submitTask()
    # Submit all tasks together, then append results in completion order.
    task.submitTasks(inducedSubalignTasks)
    for inducedTask in task.asCompleted(inducedSubalignTasks):
        inducedAlign = sequenceutils.readFromFasta(inducedTask.outputFile,
                                                   removeDashes=False)
        Configs.log(
            "Appending induced alignment, {} sequences of length {}..".format(
                len(inducedAlign), len(next(iter(inducedAlign.values())).seq)))
        sequenceutils.writeFasta(inducedAlign, tempFile, append=True)
        # Per-task intermediates are no longer needed once appended.
        os.remove(inducedTask.taskArgs["alignmentColumnsPath"])
        os.remove(inducedTask.outputFile)
    # Atomic publish of the completed alignment.
    shutil.move(tempFile, filePath)
    Configs.log("Wrote final alignment to {}".format(filePath))
def combineHmmAlignments(alignFiles, outputAlignmentPath, includeInsertions):
    """Merge several Stockholm alignment files into one mapping (later files
    overwrite duplicate taxa) and write the result as a single FASTA file."""
    merged = {}
    for alignFile in alignFiles:
        fileAlignment = sequenceutils.readFromStockholm(alignFile,
                                                        includeInsertions)
        merged.update(fileAlignment)
    sequenceutils.writeFasta(merged, outputAlignmentPath, None)
def mergeHmmAlignments(alignFiles, outputAlignmentPath, includeInsertions):
    """Append each Stockholm alignment file, in order, to the output FASTA
    file (the output is written in append mode for every input file)."""
    for alignFile in alignFiles:
        stockholmAlignment = sequenceutils.readFromStockholm(alignFile,
                                                             includeInsertions)
        sequenceutils.writeFasta(stockholmAlignment, outputAlignmentPath,
                                 None, True)