コード例 #1
0
def buildInducedSubalignment(**kwargs):
    """Project a subalignment onto the column layout listed in a columns file.

    Keyword arguments read:
        alignmentColumnsPath: text file. Its first line holds the
            whitespace-separated source indices whose letters are written
            in lower case; every following line holds the source indices
            that are merged into one output column (an empty line yields an
            all-gap column).
        subalignmentPath: FASTA file with the source alignment, read with
            dashes preserved.
        outputFile: destination path of the induced alignment.

    The result is first written to a "temp_"-prefixed sibling of
    ``outputFile`` and then moved into place.

    Raises:
        AssertionError: if a taxon has non-gap letters at two source indices
            that are assigned to the same output column.
    """
    columnsFile = kwargs["alignmentColumnsPath"]
    sourcePath = kwargs["subalignmentPath"]
    finalPath = kwargs["outputFile"]
    folder, fileName = os.path.split(finalPath)
    stagingPath = os.path.join(folder, "temp_{}".format(fileName))

    columns = []
    with open(columnsFile) as handle:
        # The header line lists the indices treated as insertions.
        insertIdxs = set(map(int, handle.readline().strip().split()))
        for row in handle:
            columns.append(set(map(int, row.strip().split())))

    source = sequenceutils.readFromFasta(sourcePath, removeDashes=False)
    rows = {name: ['-'] * len(columns) for name in source}

    for outIdx, members in enumerate(columns):
        for name in source:
            residues = source[name].seq
            for srcIdx in members:
                symbol = residues[srcIdx]
                if symbol == '-':
                    continue
                if srcIdx in insertIdxs:
                    symbol = symbol.lower()
                # Each output cell may receive at most one letter per taxon.
                assert rows[name][outIdx] == '-'
                rows[name][outIdx] = symbol

    induced = {
        name: sequenceutils.Sequence(name, "".join(cells))
        for name, cells in rows.items()
    }
    sequenceutils.writeFasta(induced, stagingPath)
    shutil.move(stagingPath, finalPath)
コード例 #2
0
def buildDecomposition(context, subsetsDir):
    """Split the context's sequences into subsets and record the subset paths.

    Creates ``subsetsDir`` when it does not exist, loads
    ``context.unalignedSequences`` from ``context.sequencesPath`` (dashes
    removed) if it has not been loaded yet, and then stores the resulting
    paths in ``context.subsetPaths`` using one of three strategies:

    * random decomposition, when either the configured strategy or the
      context's guide tree is "random" and ``Configs.outputPath`` equals
      ``context.outputFile``;
    * KMH decomposition, when the configured strategy is "kmh";
    * guide-tree decomposition otherwise, after building an initial tree.

    Args:
        context: alignment context object; read and updated in place.
        subsetsDir: directory that receives the subset files.
    """
    if not os.path.exists(subsetsDir):
        os.makedirs(subsetsDir)

    if context.unalignedSequences is None:
        context.unalignedSequences = sequenceutils.readFromFasta(
            context.sequencesPath, removeDashes=True)

    randomRequested = (Configs.decompositionStrategy == "random"
                       or context.guideTree == "random")
    if randomRequested and Configs.outputPath == context.outputFile:
        context.subsetPaths = randomDecomposition(
            subsetsDir, context.unalignedSequences,
            Configs.decompositionMaxNumSubsets)
        return

    if Configs.decompositionStrategy == "kmh":
        Configs.log("Decomposing {} with KMH..".format(context.sequencesPath))
        Configs.log("Targetting {} subsets..".format(
            Configs.decompositionMaxNumSubsets))
        context.subsetPaths = kmh.buildSubsetsKMH(context, subsetsDir)
        return

    # Default strategy: cut a guide tree into subsets.
    treePath = initial_tree.buildInitialTree(context, subsetsDir,
                                             context.guideTree)
    message = (
        "Using target subset size of {}, and maximum number of subsets {}.."
        .format(Configs.decompositionMaxSubsetSize,
                Configs.decompositionMaxNumSubsets))
    Configs.log(message)
    context.subsetPaths = treeutils.decomposeGuideTree(
        subsetsDir, context.sequencesPath, treePath,
        Configs.decompositionMaxSubsetSize,
        Configs.decompositionMaxNumSubsets)
コード例 #3
0
def compressSubalignment(**kwargs):
    """Summarise a subalignment as per-column letter counts and back-links.

    Keyword arguments read:
        subalignmentPath: FASTA file with the alignment, read with dashes
            preserved.
        outputFile: destination path of the summary text file.

    Output format: the first line lists, for every column, how many
    sequences have a non-gap character there. It is followed by one line
    per column listing the distinct indices of the previous non-gap column
    of each sequence that has a letter in this column (nothing is listed
    for a sequence's first letter).

    The file is written to a "temp_"-prefixed sibling of ``outputFile`` and
    then moved into place. The alignment width is taken from the first
    sequence returned by the reader.
    """
    sourcePath = kwargs["subalignmentPath"]
    outputPath = kwargs["outputFile"]
    folder, fileName = os.path.split(outputPath)
    stagingPath = os.path.join(folder, "temp_{}".format(fileName))

    records = sequenceutils.readFromFasta(sourcePath, removeDashes=False)
    width = len(next(iter(records.values())).seq)

    letterCounts = []
    backLinks = []
    # Column index of the most recent letter seen for each sequence.
    previousColumn = dict.fromkeys(records, -1)
    for col in range(width):
        occupied = [
            name for name, record in records.items()
            if record.seq[col] != '-'
        ]
        letterCounts.append(len(occupied))

        links = set()
        for name in occupied:
            prior = previousColumn[name]
            if prior != -1:
                links.add(prior)
            previousColumn[name] = col
        backLinks.append(links)

    with open(stagingPath, 'w') as textFile:
        textFile.write("{}\n".format(" ".join(map(str, letterCounts))))
        for links in backLinks:
            textFile.write("{}\n".format(" ".join(map(str, links))))
    shutil.move(stagingPath, outputPath)
コード例 #4
0
ファイル: graph_builder.py プロジェクト: PaulSV123/CMB-cambio
def requestBackboneTasks(context):
    """Choose the backbone alignment files for ``context``.

    The source of the backbones is selected in this order:

    * user-supplied files already present in ``context.backbonePaths``;
      the sequences read from each file are added to
      ``context.backboneTaxa``;
    * ``Configs.graphBuildMethod == "mafft"``: MAFFT backbones are
      requested via ``requestMafftBackbones``;
    * ``"subsethmm"``: the subalignment files themselves become the
      backbones and are all registered in ``context.backboneExtend``;
    * ``"initial"``: the single "initial_insert_align.txt" file under the
      working directory's decomposition/initial_tree folder is used.

    Finally, unless ``Configs.constrain`` is set or the method is
    "subsethmm", the subalignment paths are appended to the backbone list.

    Args:
        context: alignment context object; updated in place.
    """
    if context.backbonePaths:
        Configs.log("Using {} user-defined backbone files..".format(
            len(context.backbonePaths)))
        for path in context.backbonePaths:
            context.backboneTaxa.update(sequenceutils.readFromFasta(path))

    elif Configs.graphBuildMethod == "mafft":
        Configs.log("Using {} MAFFT backbones..".format(Configs.mafftRuns))
        requestMafftBackbones(context)

    elif Configs.graphBuildMethod == "subsethmm":
        Configs.log(
            "Using {} HMM-extended subalignments as backbone files..".format(
                len(context.subalignmentPaths)))
        # NOTE(review): this makes backbonePaths the same list object as
        # subalignmentPaths; confirm no later code mutates one expecting the
        # other to stay unchanged.
        context.backbonePaths = context.subalignmentPaths
        context.backboneExtend.update(context.backbonePaths)

    elif Configs.graphBuildMethod == "initial":
        Configs.log(
            "Using the initial decomposition alignment as the single backbone.."
        )
        initialAlignPath = os.path.join(context.workingDir, "decomposition",
                                        "initial_tree",
                                        "initial_insert_align.txt")
        context.backbonePaths = [initialAlignPath]

    if not Configs.constrain and Configs.graphBuildMethod != "subsethmm":
        context.backbonePaths.extend(context.subalignmentPaths)
コード例 #5
0
ファイル: graph_builder.py プロジェクト: PaulSV123/CMB-cambio
def addAlignmentFileToGraph(context, alignedFile):
    """Add the pairwise column evidence of one backbone file to the graph.

    The backbone is read from ``alignedFile``; its width is taken from the
    first sequence returned by the reader. When the file is registered in
    ``context.backboneExtend``, HMM extension tasks are submitted and the
    sequences from every finished task's Stockholm output are merged into
    the backbone. The backbone is then converted into a per-column map, and
    for every pair of entries (a, b) sharing a column the product of their
    values is added to ``graph.matrix[a][b]`` while ``graph.matrixLock`` is
    held.

    With ``Configs.graphBuildRestrict`` set, a pair is skipped when both
    entries map to the same subset but to different positions in it.

    Args:
        context: alignment context object providing ``graph`` and
            ``backboneExtend``.
        alignedFile: path of the backbone alignment file.
    """
    Configs.log("Feeding backbone {} to the graph..".format(alignedFile))
    backbone = sequenceutils.readFromFasta(alignedFile)
    firstRecord = next(iter(backbone.values()))
    numColumns = len(firstRecord.seq)

    if alignedFile in context.backboneExtend:
        pending = requestHmmExtensionTasks(context, backbone, alignedFile)
        task.submitTasks(pending)
        for finished in task.asCompleted(pending):
            extended = sequenceutils.readFromStockholm(
                finished.outputFile, includeInsertions=True)
            backbone.update(extended)

    columnMaps = backboneToAlignMap(context, backbone, numColumns)
    Configs.log(
        "Constructed backbone alignment map from {}".format(alignedFile))

    graph = context.graph
    with graph.matrixLock:
        for col in range(numColumns):
            for a, aWeight in columnMaps[col].items():
                for b, bWeight in columnMaps[col].items():

                    if Configs.graphBuildRestrict:
                        aSub, aPos = graph.matSubPosMap[a]
                        bSub, bPos = graph.matSubPosMap[b]
                        sameSubsetOtherPosition = (aSub == bSub
                                                   and aPos != bPos)
                        if sameSubsetOtherPosition:
                            continue

                    accumulated = graph.matrix[a].get(b, 0)
                    graph.matrix[a][b] = accumulated + aWeight * bWeight
    Configs.log("Fed backbone {} to the graph.".format(alignedFile))
コード例 #6
0
def decomposeGuideTree(subsetsDir, sequencesPath, guideTreePath, maxSubsetSize,
                       maxNumSubsets):
    """Cut a guide tree into subtrees and write one FASTA file per subtree.

    The Newick tree at ``guideTreePath`` is loaded, its basal bifurcation is
    collapsed, and every edge is annotated with ``childs``, the number of
    leaves below it. The annotated tree is passed to ``decomposeTree``
    together with the two limits; the leaf labels of each returned tree
    select the sequences written to "subset_<n>.txt" (numbered from 1)
    inside ``subsetsDir``.

    Args:
        subsetsDir: directory that receives the subset files.
        sequencesPath: FASTA file with all sequences (dashes preserved).
        guideTreePath: path of the Newick guide tree.
        maxSubsetSize: limit forwarded to ``decomposeTree``.
        maxNumSubsets: limit forwarded to ``decomposeTree``.

    Returns:
        The list of written subset file paths, in subtree order.
    """
    sequences = sequenceutils.readFromFasta(sequencesPath, removeDashes=False)
    guideTree = dendropy.Tree.get(path=guideTreePath,
                                  schema="newick",
                                  preserve_underscores=True)
    guideTree.collapse_basal_bifurcation()

    # Post-order guarantees child edges are counted before their parent.
    for edge in guideTree.postorder_edge_iter():
        below = edge.head_node.child_edges()
        edge.childs = sum(e.childs for e in below) if len(below) > 0 else 1
    guideTree.childs = guideTree.seed_node.edge.childs

    leafLabelGroups = [
        [leaf.taxon.label for leaf in subtree.leaf_nodes()]
        for subtree in decomposeTree(guideTree, maxSubsetSize, maxNumSubsets)
    ]

    subsetPaths = []
    for number, labels in enumerate(leafLabelGroups, start=1):
        path = os.path.join(subsetsDir, "subset_{}.txt".format(number))
        subsetPaths.append(path)
        sequenceutils.writeFasta(sequences, path, labels)
    return subsetPaths
コード例 #7
0
ファイル: hmmutils.py プロジェクト: PaulSV123/CMB-cambio
def buildHmmScores(hmmPaths, queriesPath, scoreFileHmmFileMap):
    """Create scoring tasks for every (HMM, query chunk) combination.

    The query sequences are read with dashes removed and written out in
    chunks of at most 1000 sequences to a "chunks_<base>" directory next to
    ``queriesPath``, where <base> is the query file name up to its first
    dot. For each chunk and each HMM a score file path is placed in the
    HMM's own directory.

    Args:
        hmmPaths: paths of the HMM files to score against.
        queriesPath: FASTA file with the query sequences.
        scoreFileHmmFileMap: dict filled in place, mapping each score file
            path to the HMM path it belongs to.

    Returns:
        A list with one ``getHmmScores`` result per (HMM, chunk) pair.
    """
    queries = sequenceutils.readFromFasta(queriesPath, removeDashes=True)
    baseName = os.path.basename(queriesPath).split('.')[0]
    chunkDir = os.path.join(os.path.dirname(queriesPath),
                            "chunks_{}".format(baseName))
    if not os.path.exists(chunkDir):
        os.makedirs(chunkDir)

    chunkSize = 1000
    taxa = list(queries.keys())

    jobs = []
    chunkStarts = range(0, len(taxa), chunkSize)
    for number, start in enumerate(chunkStarts, start=1):
        members = taxa[start:start + chunkSize]
        chunkPath = os.path.join(chunkDir,
                                 "{}_chunk_{}.txt".format(baseName, number))
        sequenceutils.writeFasta(queries, chunkPath, members)
        for hmmPath in hmmPaths:
            scorePath = os.path.join(
                os.path.dirname(hmmPath),
                "{}_chunk_{}_score.txt".format(baseName, number))
            jobs.append((hmmPath, chunkPath, scorePath))
            scoreFileHmmFileMap[scorePath] = hmmPath

    # Tasks are only built once every chunk file has been written.
    return [
        getHmmScores(hmmPath, chunkPath, scorePath)
        for hmmPath, chunkPath, scorePath in jobs
    ]
コード例 #8
0
ファイル: hmmutils.py プロジェクト: PaulSV123/CMB-cambio
def hmmAlignQueries(hmmPath, queriesPath):
    """Create HMM alignment tasks for the query sequences, chunk by chunk.

    The query sequences are read with dashes removed and written out in
    chunks of at most 1000 sequences to a "chunks_<base>" directory next to
    ``queriesPath``, where <base> is the query file name up to its first
    dot. Each chunk gets a matching "<base>_chunk_<n>_aligned.txt" output
    path in the same directory.

    Args:
        hmmPath: path of the HMM passed to ``buildHmmAlignment``.
        queriesPath: FASTA file with the query sequences.

    Returns:
        A list with one ``buildHmmAlignment`` result per chunk, in chunk
        order.
    """
    queries = sequenceutils.readFromFasta(queriesPath, removeDashes=True)
    baseName = os.path.basename(queriesPath).split('.')[0]
    chunkDir = os.path.join(os.path.dirname(queriesPath),
                            "chunks_{}".format(baseName))
    if not os.path.exists(chunkDir):
        os.makedirs(chunkDir)

    chunkSize = 1000
    taxa = list(queries.keys())

    chunkFiles = []
    chunkStarts = range(0, len(taxa), chunkSize)
    for number, start in enumerate(chunkStarts, start=1):
        members = taxa[start:start + chunkSize]
        chunkPath = os.path.join(chunkDir,
                                 "{}_chunk_{}.txt".format(baseName, number))
        alignedPath = os.path.join(
            chunkDir, "{}_chunk_{}_aligned.txt".format(baseName, number))
        sequenceutils.writeFasta(queries, chunkPath, members)
        chunkFiles.append((chunkPath, alignedPath))

    # Tasks are only built once every chunk file has been written.
    return [
        buildHmmAlignment(hmmPath, chunkPath, alignedPath)
        for chunkPath, alignedPath in chunkFiles
    ]
コード例 #9
0
def reassignTaxons(subsetsDir, subsetSeedPaths, sequences, unusedTaxa):
    """Assign unused taxa to seed subsets by best HMM score.

    Steps:
      1. Write the unused taxa to "unassigned_sequences.txt" in
         ``subsetsDir``.
      2. Build one HMM per seed subset, each in its own "hmm_<name>"
         directory beside the seed file (dots in the name become
         underscores), and wait for all builds to finish.
      3. Score the unused sequences against every HMM and remember, per
         taxon, the HMM giving the highest value at index 1 of its score
         entry.
      4. Group taxa by HMM, add the taxa already in each seed subset, and
         write the groups as "subset_<n>.txt" (numbered from 1).

    Args:
        subsetsDir: directory that receives the output files.
        subsetSeedPaths: FASTA files of the seed subsets.
        sequences: sequence collection passed to ``writeFasta``.
        unusedTaxa: taxa not yet assigned to any seed subset.

    Returns:
        The list of written subset file paths.
    """
    unusedPath = os.path.join(subsetsDir, "unassigned_sequences.txt")
    sequenceutils.writeFasta(sequences, unusedPath, unusedTaxa)

    hmmMap = {}
    for seedPath in subsetSeedPaths:
        seedDir, seedName = os.path.split(seedPath)
        hmmDir = os.path.join(seedDir,
                              "hmm_{}".format(seedName).replace(".", "_"))
        if not os.path.exists(hmmDir):
            os.makedirs(hmmDir)
        hmmMap[seedPath] = os.path.join(hmmDir, "hmm_model.txt")

    hmmTasks = hmmutils.buildHmms(hmmMap)
    task.submitTasks(hmmTasks)
    task.awaitTasks(hmmTasks)
    hmmPaths = [built.outputFile for built in hmmTasks]

    scoreFileHmmFileMap = {}
    scoreTasks = hmmutils.buildHmmScores(hmmPaths, unusedPath,
                                         scoreFileHmmFileMap)
    task.submitTasks(scoreTasks)

    bestScores = {}
    bestHmm = {}
    for finished in task.asCompleted(scoreTasks):
        scoreFile = finished.outputFile
        for taxon, scores in hmmutils.readSearchFile(scoreFile).items():
            if scores[1] > bestScores.get(taxon, -float("inf")):
                bestScores[taxon] = scores[1]
                bestHmm[taxon] = scoreFileHmmFileMap[scoreFile]

    # Newly assigned taxa come first, then the original seed members.
    members = {hmmPath: [] for hmmPath in hmmPaths}
    for taxon, hmmPath in bestHmm.items():
        members[hmmPath].append(taxon)
    for seedPath, hmmPath in hmmMap.items():
        for taxon in sequenceutils.readFromFasta(seedPath):
            members[hmmPath].append(taxon)

    subsetPaths = []
    for number, group in enumerate(members.values(), start=1):
        outPath = os.path.join(subsetsDir, "subset_{}.txt".format(number))
        subsetPaths.append(outPath)
        sequenceutils.writeFasta(sequences, outPath, group)

    return subsetPaths
コード例 #10
0
    def initializeBackboneSequenceMapping(self):
        """Populate ``self.backboneSubalignment`` for the backbone taxa.

        Backbone taxa are first grouped by subset index: when
        ``self.backboneTaxa`` is empty, each entry of ``self.subsets`` is
        used as-is for its index; otherwise every backbone taxon is looked
        up in ``self.taxonSubsetMap``.

        Without ``Configs.constrain`` the unaligned sequences are used
        directly. With it, every subalignment file is read (dashes
        preserved) and the aligned sequence of each grouped taxon is stored
        in ``self.backboneSubalignment``.
        """
        if len(self.backboneTaxa) == 0:
            taxaBySubset = dict(enumerate(self.subsets))
        else:
            taxaBySubset = {}
            for taxon in self.backboneTaxa:
                subsetIdx = self.taxonSubsetMap[taxon]
                taxaBySubset.setdefault(subsetIdx, []).append(taxon)

        if not Configs.constrain:
            self.backboneSubalignment = self.unalignedSequences
            return

        for subsetIdx, subalignPath in enumerate(self.subalignmentPaths):
            aligned = sequenceutils.readFromFasta(subalignPath,
                                                  removeDashes=False)
            for taxon in taxaBySubset.get(subsetIdx, []):
                self.backboneSubalignment[taxon] = aligned[taxon]
コード例 #11
0
ファイル: graph_builder.py プロジェクト: PaulSV123/CMB-cambio
def requestMafftBackbones(context):
    """Prepare ``Configs.mafftRuns`` MAFFT backbones and submit their tasks.

    For run n (numbered from 1) the files "backbone_<n>_unalign.txt" and
    "backbone_<n>_mafft.txt" in the graph's working directory are used. An
    already existing aligned file is read and its path appended to
    ``context.backbonePaths``; otherwise backbone taxa are selected via
    ``assignBackboneTaxa`` and a MAFFT alignment task is queued in
    ``context.backboneTasks``.

    Depending on ``Configs.graphBuildHmmExtend``, each aligned file is
    either registered in ``context.backboneExtend`` or its backbone is
    merged into ``context.backboneTaxa``. All queued tasks are submitted at
    the end.

    Args:
        context: alignment context object; updated in place.
    """
    # Per-subset share of the backbone size; never below one.
    perSubset = int(Configs.mafftSize / len(context.subsetPaths))
    numTaxa = max(1, perSubset)

    for run in range(1, Configs.mafftRuns + 1):
        unalignedFile = os.path.join(context.graph.workingDir,
                                     "backbone_{}_unalign.txt".format(run))
        alignedFile = os.path.join(context.graph.workingDir,
                                   "backbone_{}_mafft.txt".format(run))

        if not os.path.exists(alignedFile):
            backbone = assignBackboneTaxa(context, numTaxa, unalignedFile)
            mafftTask = external_tools.buildMafftAlignment(
                unalignedFile, alignedFile)
            context.backboneTasks.append(mafftTask)
        else:
            Configs.log("Existing backbone file found: {}".format(alignedFile))
            backbone = sequenceutils.readFromFasta(alignedFile)
            context.backbonePaths.append(alignedFile)

        if Configs.graphBuildHmmExtend:
            context.backboneExtend.add(alignedFile)
        else:
            context.backboneTaxa.update(backbone)

    task.submitTasks(context.backboneTasks)
コード例 #12
0
def writeUnpackedAlignment(context):
    """Assemble the final alignment from the graph's clusters.

    For every subalignment file an "alignment_columns_<name>" file is
    written to the graph's working directory: its first line lists the
    positions of that subalignment found in ``graph.insertions``, and each
    following line lists its positions belonging to one cluster, in cluster
    order. One "buildInducedSubalignment" task per subalignment is then
    submitted. As tasks complete, their induced alignments are appended to
    a "temp_"-prefixed sibling of ``context.outputFile`` and the
    intermediate files are deleted. The temporary file is finally moved to
    ``context.outputFile``.

    Args:
        context: alignment context object providing ``graph``,
            ``outputFile`` and ``subalignmentPaths``.
    """
    graph = context.graph
    filePath = context.outputFile
    subalignPaths = context.subalignmentPaths

    tempFile = os.path.join(os.path.dirname(filePath),
                            "temp_{}".format(os.path.basename(filePath)))
    # A leftover temp file would otherwise be appended to.
    if os.path.exists(tempFile):
        os.remove(tempFile)

    clusterMap = {}
    inserts = {}
    for path in subalignPaths:
        clusterMap[path] = [[] for _ in graph.clusters]
    for idx, cluster in enumerate(graph.clusters):
        for node in cluster:
            subIdx, pos = graph.matSubPosMap[node]
            clusterMap[subalignPaths[subIdx]][idx].append(pos)

    for path in subalignPaths:
        inserts[path] = []
    for node in graph.insertions:
        subIdx, pos = graph.matSubPosMap[node]
        inserts[subalignPaths[subIdx]].append(pos)

    Configs.log("Assembling final alignment in {}".format(filePath))
    inducedSubalignTasks = []
    for subalignPath in subalignPaths:
        fileName = os.path.basename(subalignPath)
        alignmentColumnsPath = os.path.join(
            graph.workingDir, "alignment_columns_{}".format(fileName))
        rows = [inserts[subalignPath]] + clusterMap[subalignPath]
        with open(alignmentColumnsPath, 'w') as textFile:
            textFile.writelines(
                "{}\n".format(" ".join(map(str, row))) for row in rows)

        inducedAlignPath = os.path.join(graph.workingDir,
                                        "induced_{}".format(fileName))
        args = {
            "alignmentColumnsPath": alignmentColumnsPath,
            "subalignmentPath": subalignPath,
            "outputFile": inducedAlignPath
        }
        inducedSubalignTasks.append(
            task.Task(taskType="buildInducedSubalignment",
                      outputFile=args["outputFile"],
                      taskArgs=args))

    task.submitTasks(inducedSubalignTasks)
    for finished in task.asCompleted(inducedSubalignTasks):
        inducedAlign = sequenceutils.readFromFasta(finished.outputFile,
                                                   removeDashes=False)
        numSequences = len(inducedAlign)
        width = len(next(iter(inducedAlign.values())).seq)
        Configs.log(
            "Appending induced alignment, {} sequences of length {}..".format(
                numSequences, width))
        sequenceutils.writeFasta(inducedAlign, tempFile, append=True)

        os.remove(finished.taskArgs["alignmentColumnsPath"])
        os.remove(finished.outputFile)
    shutil.move(tempFile, filePath)
    Configs.log("Wrote final alignment to {}".format(filePath))