def rgCluster(graph, lowerBound, upperBound, enforceTrace = True):
    """Cluster graph nodes by region growing, starting from singleton clusters.

    Every node index in ``range(lowerBound[s], upperBound[s])`` for each
    position ``s`` of the bound lists starts out as its own one-element
    cluster. A heap is then built with ``buildHeap`` and consumed by
    ``crunchHeap``, which operates on the bookkeeping structures set up here.

    Args:
        graph: graph object handed through to the heap helpers.
        lowerBound: first node index of each range.
        upperBound: one-past-the-last node index of each range.
        enforceTrace: forwarded to ``crunchHeap``; when true the clusters are
            additionally passed through ``orderClusters`` before returning.

    Returns:
        The list of clusters, each a list of node indices.
    """
    clusters, weightMap = [], []
    nodeClusters, clusterPos, clusterPointers = {}, {}, {}
    absorbed, cantConnects = set(), set()

    for s, start in enumerate(lowerBound):
        stop = upperBound[s]
        for node in range(start, stop):
            idx = len(clusters)
            clusters.append([node])
            weightMap.append({})
            nodeClusters[node] = idx
            clusterPos[idx] = {s: node}
            # NOTE(review): idx is a cluster index but is compared against the
            # node bounds; this presumably relies on node ids coinciding with
            # cluster indices (contiguous ranges starting at 0) - confirm.
            prevIdx = idx - 1 if idx > start else None
            nextIdx = idx + 1 if idx < stop - 1 else None
            clusterPointers[idx] = {s: (prevIdx, nextIdx)}

    heap = buildHeap(graph, nodeClusters, weightMap, lowerBound, upperBound)
    Configs.log("Built a heap of size {}..".format(len(heap)))
    crunchHeap(graph, heap, clusters, nodeClusters, clusterPos,
               clusterPointers, weightMap, cantConnects, absorbed,
               enforceTrace)

    if not enforceTrace:
        return clusters
    return orderClusters(graph, clusters, nodeClusters, lowerBound,
                         upperBound)
def rgSearch(graph):
    """Run the region-growing search and store its result in ``graph.clusters``.

    The node range of subalignment ``i`` starts at ``graph.subsetMatrixIdx[i]``
    and spans ``graph.subalignmentLengths[i]`` nodes. ``rgCluster`` is called
    with trace enforcement switched on.
    """
    Configs.log("Finding graph trace with region-growing search..")
    lowerBound, upperBound = [], []
    for i in range(len(graph.context.subalignments)):
        start = graph.subsetMatrixIdx[i]
        lowerBound.append(start)
        upperBound.append(start + graph.subalignmentLengths[i])
    graph.clusters = rgCluster(graph, lowerBound, upperBound, True)
def buildSubsetsKMH(context, subsetsDir):
    """Build the KMH decomposition and return the resulting subset paths.

    An initial tree and alignment are built with ``buildInitialTreeAlign``.
    If that step used every taxon, the guide tree is decomposed directly.
    Otherwise seed subsets are decomposed into ``<subsetsDir>/seed_subsets``
    and the unused taxa are distributed with ``reassignTaxons``.

    Args:
        context: alignment context; ``sequencesPath`` and
            ``unalignedSequences`` are read here.
        subsetsDir: directory that receives the decomposition files.

    Returns:
        Whatever ``treeutils.decomposeGuideTree`` / ``reassignTaxons`` return
        (the subset paths).
    """
    tempDir = os.path.join(subsetsDir, "initial_tree")
    Configs.log(
        "Building KMH decomposition on {} with skeleton size {}/{}..".format(
            context.sequencesPath, Configs.decompositionSkeletonSize, 1000))
    time1 = time.time()

    initialTreePath, initialAlignPath, unusedTaxa = buildInitialTreeAlign(
        tempDir, context.sequencesPath)

    if len(unusedTaxa) == 0:
        subsetPaths = treeutils.decomposeGuideTree(
            tempDir, initialAlignPath, initialTreePath,
            Configs.decompositionMaxSubsetSize,
            Configs.decompositionMaxNumSubsets)
    else:
        subsetSeedDir = os.path.join(subsetsDir, "seed_subsets")
        # exist_ok avoids the check-then-create race when another worker
        # creates the directory between the test and the mkdir.
        os.makedirs(subsetSeedDir, exist_ok=True)
        subsetSeedPaths = treeutils.decomposeGuideTree(
            subsetSeedDir, initialAlignPath, initialTreePath, None,
            Configs.decompositionMaxNumSubsets)
        subsetPaths = reassignTaxons(subsetsDir, subsetSeedPaths,
                                     context.unalignedSequences, unusedTaxa)

    time2 = time.time()
    Configs.log("Built KMH decomposition on {} in {} sec..".format(
        context.sequencesPath, time2 - time1))
    return subsetPaths
def writeGraphToFile(self, filePath):
    """Write ``self.matrix`` to ``filePath`` as one "row column value" line per entry."""
    with open(filePath, 'w') as textFile:
        for i, row in enumerate(self.matrix):
            for k, value in row.items():
                textFile.write("{} {} {}\n".format(i, k, value))
    Configs.log("Wrote matrix to {}".format(filePath))
def readGraphFromFile(self, filePath):
    """Load ``self.matrix`` from a file of "row column value" lines.

    The matrix is reset to ``self.matrixSize`` empty rows before reading.
    Blank lines are skipped. The value is parsed as an int when possible and
    as a float otherwise, so that non-integer weights written by the
    corresponding writer can be read back.

    Args:
        filePath: path of the text file to read.

    Raises:
        ValueError: if an index or value token is not numeric.
        IndexError: if a non-blank line has fewer than three tokens or a row
            index is outside the matrix.
    """
    self.matrix = [{} for _ in range(self.matrixSize)]
    with open(filePath) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Tolerate empty / trailing blank lines.
                continue
            row, col = int(tokens[0]), int(tokens[1])
            try:
                value = int(tokens[2])
            except ValueError:
                value = float(tokens[2])
            self.matrix[row][col] = value
    Configs.log("Read matrix from {}".format(filePath))
def atomizedClustering(graph):
    """Store the clustering produced by ``atomizedCluster`` in ``graph.clusters``.

    The bounds passed on are the per-subalignment node ranges
    ``[subsetMatrixIdx[i], subsetMatrixIdx[i] + subalignmentLengths[i])``.
    """
    Configs.log("Building a fully atomized clustering..")
    lowerBound, upperBound = [], []
    for i in range(len(graph.context.subalignments)):
        first = graph.subsetMatrixIdx[i]
        lowerBound.append(first)
        upperBound.append(first + graph.subalignmentLengths[i])
    graph.clusters = atomizedCluster(lowerBound, upperBound)
def naiveClustering(graph):
    """Store the clustering produced by ``naiveCluster`` in ``graph.clusters``.

    ``naiveCluster`` receives, for every subalignment, the first node index
    and the index one past its last node.
    """
    Configs.log("Building a naive left-justified clustering..")
    subsetCount = len(graph.context.subalignments)
    lowerBound = []
    upperBound = []
    for i in range(subsetCount):
        offset = graph.subsetMatrixIdx[i]
        lowerBound.append(offset)
        upperBound.append(offset + graph.subalignmentLengths[i])
    graph.clusters = naiveCluster(lowerBound, upperBound)
def rgFastClustering(graph):
    """Cluster the graph with ``rgFastCluster`` and persist the clusters.

    ``rgFastCluster`` is called with its last argument set to ``False``; the
    resulting clusters are stored on the graph and written to
    ``graph.clusterPath``.
    """
    Configs.log("Building a fast region-growing graph clustering..")
    lowerBound, upperBound = [], []
    for i in range(len(graph.context.subalignments)):
        begin = graph.subsetMatrixIdx[i]
        lowerBound.append(begin)
        upperBound.append(begin + graph.subalignmentLengths[i])
    graph.clusters = rgFastCluster(graph, lowerBound, upperBound, False)
    graph.writeClustersToFile(graph.clusterPath)
def initialize(self, graph):
    """Populate the per-cluster and per-element lookup tables from ``self.clusters``.

    For every cluster index ``i`` this records a one-element order list, the
    (previous, next) neighbours in cluster order (``None`` at either end), the
    owning cluster of each element, and the element stored under the key
    ``(i, subalignment)``. Neighbour weights are then updated for each element.
    """
    Configs.log("Initializing search context data structures..")
    for i in range(len(self.clusters)):
        self.clusterOrders[i] = [i]
        before = None if i <= 0 else i - 1
        after = None if i >= len(self.clusters) - 1 else i + 1
        self.clusterLL[i] = (before, after)
        for a in self.clusters[i]:
            asub, _ = graph.matSubPosMap[a]
            self.elementClusters[a] = i
            self.clusterSubs[i, asub] = a
            self.updateNeighborWeights(None, i,
                                       self.getNeighborList(graph, a))
def purgeDuplicateClusters(graph):
    """Drop clusters whose sorted contents repeat an earlier cluster.

    Each cluster list is sorted in place; the first occurrence of every
    distinct cluster is kept, in original order, and ``graph.clusters`` is
    replaced by the filtered list.
    """
    seen = set()
    kept = []
    for cluster in graph.clusters:
        cluster.sort()
        key = tuple(cluster)
        if key in seen:
            continue
        seen.add(key)
        kept.append(cluster)
    graph.clusters = kept
    Configs.log("Purged duplicate clusters. Found {} unique clusters..".format(
        len(graph.clusters)))
def runMlrMclClustering(graph):
    """Cluster the alignment graph with the external MLR-MCL tool.

    Existing files in ``graph.workingDir`` are reused: the tool only runs when
    the cluster file is missing, and the graph file is only written when it is
    missing too. The clusters are then read back, stored on the graph and
    written to ``graph.clusterPath``.
    """
    Configs.log("Running MLR-MCL alignment graph clustering..")
    graphPath = os.path.join(graph.workingDir, "graph_mlr_mcl.txt")
    clusterPath = os.path.join(graph.workingDir, "clusters_mlr_mcl.txt")

    clustersMissing = not os.path.exists(clusterPath)
    if clustersMissing:
        graphMissing = not os.path.exists(graphPath)
        if graphMissing:
            writeGraphToFile(graph, graphPath)
        mclTask = external_tools.runMlrMcl(graphPath, 30000, 0.5, 4,
                                           graph.workingDir, clusterPath)
        mclTask.run()

    graph.clusters = readClustersFromFile(clusterPath)
    graph.writeClustersToFile(graph.clusterPath)
def mergeSubalignments(context):
    """Run the full merge pipeline for ``context`` and log the elapsed time.

    Steps, in order: build the graph, cluster it, find a trace, optimize the
    trace, and write the alignment.
    """
    startMessage = "Merging {} subaligments..".format(
        len(context.subalignmentPaths))
    Configs.log(startMessage)
    time1 = time.time()

    buildGraph(context)
    graph = context.graph
    clusterGraph(graph)
    findTrace(context.graph)
    optimizeTrace(context.graph)
    writeAlignment(context)

    time2 = time.time()
    elapsed = time2 - time1
    Configs.log("Merged {} subalignments into {} in {} sec..".format(
        len(context.subalignmentPaths), context.outputFile, elapsed))
def optimizeClusters(graph, clusters):
    """Repeat optimization passes until a pass yields no positive gain.

    Args:
        graph: graph providing ``computeClusteringCost``.
        clusters: the initial clustering.

    Returns:
        The best clustering found; the tracked cost is the initial cost minus
        the gains reported by the accepted passes.
    """
    bestClusters = clusters
    bestCost = graph.computeClusteringCost(clusters)
    Configs.log(
        "Starting optimization from initial cost of {}..".format(bestCost))

    context = SearchContext(clusters)
    context.initialize(graph)

    passNum = 0
    while True:
        passNum += 1
        Configs.log("Starting optimization pass {}..".format(passNum))
        newClusters, gain = optimizationPass(graph, bestClusters, context)
        if not gain > 0:
            # No improvement in this pass: stop with the current best.
            break
        bestClusters = newClusters
        bestCost -= gain
        Configs.log(
            "New clustering with a cost of {} over {} clusters..".format(
                bestCost, len(bestClusters)))

    Configs.log("Final optimized cost of {} over {} clusters..".format(
        bestCost, len(bestClusters)))
    return bestClusters
def decomposeSequences(context):
    """Make sure ``context.subsetPaths`` is populated, decomposing if needed.

    Order of precedence: subset paths already present; subalignment paths
    (reused as subset paths); existing ``subset_<n>.txt`` files under
    ``<workingDir>/decomposition`` numbered consecutively from 1; otherwise a
    fresh decomposition via ``buildDecomposition``.
    """
    time1 = time.time()

    if len(context.subsetPaths) > 0:
        Configs.log("Subset paths already provided, skipping decomposition..")
    elif len(context.subalignmentPaths) > 0:
        context.subsetPaths = context.subalignmentPaths
        Configs.log(
            "Subalignment paths already provided, skipping decomposition..")
    else:
        subsetsDir = os.path.join(context.workingDir, "decomposition")
        context.subsetPaths = []

        # Pick up subset files left by an earlier run; stop at the first gap.
        n = 1
        filePath = os.path.join(subsetsDir, "subset_{}.txt".format(n))
        while os.path.exists(filePath):
            Configs.log("Detected existing subset file {}".format(filePath))
            context.subsetPaths.append(filePath)
            n += 1
            filePath = os.path.join(subsetsDir, "subset_{}.txt".format(n))

        if len(context.subsetPaths) == 0:
            buildDecomposition(context, subsetsDir)

    time2 = time.time()
    Configs.log("Decomposed {} into {} subsets in {} sec..".format(
        context.sequencesPath, len(context.subsetPaths), time2 - time1))
def requestBackboneTasks(context):
    """Select the backbone alignments that will be fed to the graph.

    Precedence:
      1. user-defined backbone files already in ``context.backbonePaths``
         (their taxa are added to ``context.backboneTaxa``);
      2. ``Configs.graphBuildMethod == "mafft"``: MAFFT backbones are requested;
      3. ``"subsethmm"``: the subalignments become the backbones and are
         marked for extension in ``context.backboneExtend``;
      4. ``"initial"``: the initial decomposition alignment is the only
         backbone.

    Unless ``Configs.constrain`` is set or the method is ``"subsethmm"``, the
    subalignment paths are appended to the backbone paths afterwards.
    """
    if len(context.backbonePaths) > 0:
        Configs.log("Using {} user-defined backbone files..".format(
            len(context.backbonePaths)))
        # The former self-assignment of context.backbonePaths was a no-op and
        # has been removed.
        for path in context.backbonePaths:
            context.backboneTaxa.update(sequenceutils.readFromFasta(path))
    elif Configs.graphBuildMethod == "mafft":
        Configs.log("Using {} MAFFT backbones..".format(Configs.mafftRuns))
        requestMafftBackbones(context)
    elif Configs.graphBuildMethod == "subsethmm":
        Configs.log(
            "Using {} HMM-extended subalignments as backbone files..".format(
                len(context.subalignmentPaths)))
        # backbonePaths aliases subalignmentPaths here; the extend() below is
        # skipped for this method, so the shared list is not self-extended.
        context.backbonePaths = context.subalignmentPaths
        context.backboneExtend.update(context.backbonePaths)
    elif Configs.graphBuildMethod == "initial":
        Configs.log(
            "Using the initial decomposition alignment as the single backbone.."
        )
        initialAlignPath = os.path.join(context.workingDir, "decomposition",
                                        "initial_tree",
                                        "initial_insert_align.txt")
        context.backbonePaths = [initialAlignPath]

    if not Configs.constrain and Configs.graphBuildMethod != "subsethmm":
        context.backbonePaths.extend(context.subalignmentPaths)
def readClustersFromFile(filePath):
    """Read a cluster-assignment file with one integer cluster id per line.

    The zero-based line number is the member index, and the integer on that
    line is the id of the cluster it belongs to.

    Clusters are returned in ascending id order. The ids do not have to form
    the contiguous range ``0..n-1`` (the previous implementation raised
    ``KeyError`` in that case); for contiguous ids the result is unchanged.

    Args:
        filePath: path of the assignment file.

    Returns:
        A list of clusters, each a list of member indices in file order.

    Raises:
        ValueError: if a line is not an integer.
    """
    assignments = {}
    with open(filePath) as f:
        for num, line in enumerate(f):
            cluster = int(line.strip())
            assignments.setdefault(cluster, []).append(num)
    clusters = [assignments[c] for c in sorted(assignments)]
    Configs.log("Found {} clusters..".format(len(clusters)))
    return clusters
def buildNodeEdgeDataStructure(self):
    """Build ``self.nodeEdges``: per node, its edges grouped by the neighbour's subalignment.

    ``self.nodeEdges[a][s]`` is the list of ``(neighbour, value)`` pairs from
    ``self.matrix[a]`` whose neighbour lies in subalignment ``s``, sorted by
    neighbour index. Edges within the node's own subalignment are left out.
    """
    Configs.log("Preparing node edge data structure..")
    k = len(self.subalignmentLengths)
    self.nodeEdges = {}
    for a in range(self.matrixSize):
        asub, _ = self.matSubPosMap[a]
        buckets = [[] for _ in range(k)]
        self.nodeEdges[a] = buckets
        for b, value in self.matrix[a].items():
            bsub, _ = self.matSubPosMap[b]
            if asub != bsub:
                buckets[bsub].append((b, value))
        for bucket in buckets:
            bucket.sort(key=lambda pair: pair[0])
    Configs.log("Prepared node edge data structure..")
def writeGraphToFile(graph, filePath):
    """Write ``graph.matrix`` as an MLR-MCL input graph file.

    The header line is "<vertices> <edges> 1", where the edge count is half
    the total number of matrix entries. Each following line lists one row's
    entries as "<key + 1> <value>" pairs.
    """
    Configs.log("Writing MLR-MCL graph file to {}".format(filePath))

    rows = [graph.matrix[i].items() for i in range(len(graph.matrix))]
    vertices = len(rows)
    edges = sum(len(pairs) for pairs in rows)
    # Each entry is counted once per endpoint row, hence the halving.
    edgeCount = int(edges / 2)
    lines = [
        " ".join("{} {}".format(a + 1, b) for a, b in pairs)
        for pairs in rows
    ]

    with open(filePath, 'w') as textFile:
        textFile.write("{} {} 1\n".format(vertices, edgeCount))
        for line in lines:
            textFile.write(line + "\n")

    Configs.log("Wrote graph with {} vertices and {} edges to {}".format(
        vertices, edgeCount, filePath))
def fmAlgorithm(graph):
    """Find a trace with ``fmPartition`` and store the clusters on the graph.

    The node-edge structure is built from the existing clusters when there
    are any, and from the plain graph otherwise. Only the clusters from the
    ``fmPartition`` result are kept.
    """
    Configs.log("Finding graph trace with FM Algorithm..")
    lowerBound, upperBound = [], []
    for i in range(len(graph.context.subalignments)):
        start = graph.subsetMatrixIdx[i]
        lowerBound.append(start)
        upperBound.append(start + graph.subalignmentLengths[i])

    hasClusters = graph.clusters is not None and len(graph.clusters) != 0
    if hasClusters:
        graph.buildNodeEdgeDataStructureFromClusters()
    else:
        graph.buildNodeEdgeDataStructure()

    clusters, _totalCost, _cuts = fmPartition(graph, lowerBound, upperBound)
    graph.clusters = clusters
def decomposeTree(tree, maxSubsetSize, numSubsets):
    """Split ``tree`` at centroid edges until enough or small enough subtrees exist.

    The subtree with the most leaves (``childs``) is bipartitioned repeatedly.
    Splitting stops once there are ``numSubsets`` subtrees, or once
    ``maxSubsetSize`` is given and the largest subtree does not exceed it.

    Returns:
        The list of subtrees.
    """
    trees = [tree]
    while len(trees) < numSubsets:
        largestTree = max(trees, key=lambda t: t.childs)
        numChilds = largestTree.childs
        if maxSubsetSize is not None and numChilds <= maxSubsetSize:
            break
        e = getCentroidEdge(largestTree)
        t1, t2 = bipartitionByEdge(largestTree, e)
        Configs.log(
            "Decomposing a tree with {} leaves into {} and {}..".format(
                numChilds, t1.childs, t2.childs))
        trees.remove(largestTree)
        trees.extend((t1, t2))
    return trees
def mwtSearch(graph):
    """Find a trace with ``mwtHeuristicSearch`` and store the clusters on the graph.

    The node-edge structure is built from the existing clusters when there
    are any, and from the plain graph otherwise.
    """
    Configs.log("Finding graph trace with MWT heuristic search..")
    subsetCount = len(graph.context.subalignments)
    lowerBound, upperBound = [], []
    for i in range(subsetCount):
        first = graph.subsetMatrixIdx[i]
        lowerBound.append(first)
        upperBound.append(first + graph.subalignmentLengths[i])

    noClusters = graph.clusters is None or len(graph.clusters) == 0
    if not noClusters:
        graph.buildNodeEdgeDataStructureFromClusters()
    else:
        graph.buildNodeEdgeDataStructure()

    clusters, _totalCost = mwtHeuristicSearch(graph, lowerBound, upperBound)
    graph.clusters = clusters
def mwtGreedySearch(graph):
    """Find a trace with ``greedySearch`` and store the clusters on the graph.

    The search starts from a state whose frontier is a copy of the lower
    bounds. The node-edge structure is built from the existing clusters when
    there are any, and from the plain graph otherwise.
    """
    Configs.log("Finding graph trace with MWT greedy search..")
    lowerBound, upperBound = [], []
    for i in range(len(graph.context.subalignments)):
        offset = graph.subsetMatrixIdx[i]
        lowerBound.append(offset)
        upperBound.append(offset + graph.subalignmentLengths[i])

    if graph.clusters is not None and len(graph.clusters) != 0:
        graph.buildNodeEdgeDataStructureFromClusters()
    else:
        graph.buildNodeEdgeDataStructure()

    context = MwtSearchContext(lowerBound, upperBound)
    state = MwtSearchState()
    state.frontier = list(lowerBound)
    clusters, _totalCost, _cycles = greedySearch(graph, state, context)
    graph.clusters = clusters
def runCommand(**kwargs):
    """Run an external command through the shell and move its output files.

    Keyword Args:
        command: the shell command line to execute.
        workingDir: directory the command runs in.
        fileCopyMap: optional mapping ``{source path: destination path}``;
            each source is moved to its destination after a successful run.

    Raises:
        KeyError: if ``command`` or ``workingDir`` is missing.
        subprocess.CalledProcessError: if the command exits with a non-zero
            status; the command, exit code and combined output are logged
            through ``Configs.error`` first.
    """
    command = kwargs["command"]
    Configs.log("Running an external tool, command: {}".format(command))
    # NOTE(review): shell=True executes the string verbatim in a shell, so
    # `command` must only ever be built from trusted input.
    runner = subprocess.run(command,
                            shell=True,
                            cwd=kwargs["workingDir"],
                            universal_newlines=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    try:
        runner.check_returncode()
    except subprocess.CalledProcessError:
        # Only a failed exit status is reported here; a bare `except:` would
        # also have intercepted KeyboardInterrupt / SystemExit.
        Configs.error("Command encountered error: {}".format(command))
        Configs.error("Exit code: {}".format(runner.returncode))
        Configs.error("Output: {}".format(runner.stdout))
        raise
    for srcPath, destPath in kwargs.get("fileCopyMap", {}).items():
        shutil.move(srcPath, destPath)
def run(self):
    """Execute this task unless its output file already exists.

    The function named ``self.taskType`` is looked up in the module given by
    ``Task.functionModuleMap`` and called with ``self.taskArgs``. Any
    exception is logged with its traceback and re-raised. ``self.isFinished``
    is set to ``True`` in every case.
    """
    try:
        if os.path.exists(self.outputFile):
            Configs.log("File already exists: {}".format(self.outputFile))
        else:
            Configs.log("Running a task, output file: {}".format(
                self.outputFile))
            moduleName = Task.functionModuleMap[self.taskType]
            mod = importlib.import_module(moduleName)
            func = getattr(mod, self.taskType)
            func(**self.taskArgs)
            Configs.log("Completed a task, output file: {}".format(
                self.outputFile))
    except Exception as exc:
        Configs.error("Task for {} threw an exception:\n{}".format(
            self.outputFile, exc))
        Configs.error(traceback.format_exc())
        raise
    finally:
        self.isFinished = True
def writeUnconstrainedAlignment(context):
    """Write the final alignment with one column per cluster in ``graph.clusters``.

    Every taxon starts as a row of ``'-'``. For each cluster, in order, every
    member node places the next unused character of its taxon's unaligned
    sequence into that cluster's column. The taxon of a node is taken as the
    first entry of the node's subalignment.
    """
    graph = context.graph
    numColumns = len(graph.clusters)

    alignment = {
        taxon: sequenceutils.Sequence(taxon, ['-'] * numColumns)
        for taxon in context.unalignedSequences
    }
    curIdxes = dict.fromkeys(context.unalignedSequences, 0)

    for idx, cluster in enumerate(graph.clusters):
        for b in cluster:
            bsub, _ = graph.matSubPosMap[b]
            taxon = context.subalignments[bsub][0]
            nextPos = curIdxes[taxon]
            alignment[taxon].seq[idx] = (
                context.unalignedSequences[taxon].seq[nextPos])
            curIdxes[taxon] = nextPos + 1

    for taxon in alignment:
        row = alignment[taxon]
        row.seq = "".join(row.seq)

    sequenceutils.writeFasta(alignment, context.outputFile)
    Configs.log("Wrote final alignment to {}".format(context.outputFile))
def buildDecomposition(context, subsetsDir):
    """Decompose the input sequences into subsets and set ``context.subsetPaths``.

    The strategy is chosen as follows:
      * random decomposition when the strategy or the guide tree is "random"
        and ``Configs.outputPath == context.outputFile``;
      * KMH decomposition when ``Configs.decompositionStrategy == "kmh"``;
      * otherwise a guide tree is built and decomposed.

    Args:
        context: alignment context; its unaligned sequences are loaded from
            ``context.sequencesPath`` if not already present.
        subsetsDir: directory for the subset files (created if missing).
    """
    # exist_ok avoids the check-then-create race when another worker creates
    # the directory between the test and the mkdir.
    os.makedirs(subsetsDir, exist_ok=True)

    if context.unalignedSequences is None:
        context.unalignedSequences = sequenceutils.readFromFasta(
            context.sequencesPath, removeDashes=True)

    if (Configs.decompositionStrategy == "random" or context.guideTree
            == "random") and Configs.outputPath == context.outputFile:
        context.subsetPaths = randomDecomposition(
            subsetsDir, context.unalignedSequences,
            Configs.decompositionMaxNumSubsets)
    elif Configs.decompositionStrategy == "kmh":
        Configs.log("Decomposing {} with KMH..".format(context.sequencesPath))
        Configs.log("Targetting {} subsets..".format(
            Configs.decompositionMaxNumSubsets))
        context.subsetPaths = kmh.buildSubsetsKMH(context, subsetsDir)
    else:
        guideTreePath = initial_tree.buildInitialTree(context, subsetsDir,
                                                      context.guideTree)
        Configs.log(
            "Using target subset size of {}, and maximum number of subsets {}.."
            .format(Configs.decompositionMaxSubsetSize,
                    Configs.decompositionMaxNumSubsets))
        context.subsetPaths = treeutils.decomposeGuideTree(
            subsetsDir, context.sequencesPath, guideTreePath,
            Configs.decompositionMaxSubsetSize,
            Configs.decompositionMaxNumSubsets)
def findTrace(graph):
    """Load a trace from ``graph.tracePath`` or compute one and save it there.

    When no trace file exists, duplicate clusters and cluster violations are
    purged first, then the method named by ``Configs.graphTraceMethod`` is
    run. An unrecognised method name runs no search. The clusters are written
    to the trace file in either case. Timing and the final cost are logged.
    """
    time1 = time.time()

    if not os.path.exists(graph.tracePath):
        purgeDuplicateClusters(graph)
        purgeClusterViolations(graph)

        method = Configs.graphTraceMethod
        if method == "minclusters":
            minClustersSearch(graph)
        elif method == "fm":
            fmAlgorithm(graph)
        elif method == "mwtgreedy":
            mwtGreedySearch(graph)
        elif method == "mwtsearch":
            mwtSearch(graph)
        elif method == "rg":
            rgSearch(graph)
        elif method == "rgfast":
            rgFastSearch(graph)
        elif method == "naive":
            naiveClustering(graph)

        graph.writeClustersToFile(graph.tracePath)
    else:
        Configs.log("Found existing trace file {}".format(graph.tracePath))
        graph.readClustersFromFile(graph.tracePath)

    time2 = time.time()
    elapsed = time2 - time1
    Configs.log("Found alignment graph trace in {} sec..".format(elapsed))
    Configs.log("Found a trace with {} clusters and a total cost of {}".format(
        len(graph.clusters), graph.computeClusteringCost(graph.clusters)))
def addAlignmentFileToGraph(context, alignedFile):
    """Add the pairwise column co-occurrences of one backbone alignment to the graph.

    The backbone is read from ``alignedFile``. If the file is listed in
    ``context.backboneExtend``, HMM extension tasks are submitted and their
    Stockholm outputs merged into the backbone as they complete. The backbone
    is converted to a per-column map (``backboneToAlignMap``), and for every
    column each ordered pair of entries ``(a, b)`` adds ``avalue * bvalue`` to
    ``graph.matrix[a][b]``. The update runs under ``graph.matrixLock``.

    With ``Configs.graphBuildRestrict`` set, pairs in the same subalignment
    but at different positions are skipped.

    Args:
        context: alignment context owning the graph.
        alignedFile: path of the backbone alignment (FASTA).
    """
    Configs.log("Feeding backbone {} to the graph..".format(alignedFile))
    backboneAlign = sequenceutils.readFromFasta(alignedFile)
    alignmentLength = len(next(iter(backboneAlign.values())).seq)

    if alignedFile in context.backboneExtend:
        extensionTasks = requestHmmExtensionTasks(context, backboneAlign,
                                                  alignedFile)
        task.submitTasks(extensionTasks)
        for extensionTask in task.asCompleted(extensionTasks):
            backboneAlign.update(
                sequenceutils.readFromStockholm(extensionTask.outputFile,
                                                includeInsertions=True))

    alignmap = backboneToAlignMap(context, backboneAlign, alignmentLength)
    Configs.log(
        "Constructed backbone alignment map from {}".format(alignedFile))

    graph = context.graph
    restrict = Configs.graphBuildRestrict
    with graph.matrixLock:
        for l in range(alignmentLength):
            column = alignmap[l]
            for a, avalue in column.items():
                # Everything that depends only on `a` is looked up once per
                # row instead of once per (a, b) pair.
                row = graph.matrix[a]
                if restrict:
                    asub, apos = graph.matSubPosMap[a]
                for b, bvalue in column.items():
                    if restrict:
                        bsub, bpos = graph.matSubPosMap[b]
                        if asub == bsub and apos != bpos:
                            continue
                    row[b] = row.get(b, 0) + avalue * bvalue
    Configs.log("Fed backbone {} to the graph.".format(alignedFile))
def initializeHeap(self, graph):
    """Reset the move bookkeeping and rebuild the cumulative gain table.

    ``self.gainStructure[i][j]`` becomes the running sum, over positions
    ``0..j`` of subalignment ``i``, of the weight stored in ``self.weights``
    for each node and its current cluster (0 when absent). The heap is then
    refilled through ``self.getPositiveMoves``.
    """
    Configs.log("Reinitializing heap and all that stuff..")
    self.gainStructure = []
    self.elementMoves = {}
    self.heap = []
    self.locked = set()
    # NOTE(review): the only message here that bypasses Configs.log - confirm
    # whether that is intentional.
    activeClusters = len(self.clusters) - len(self.deletedClusters)
    print("Working with {} clusters..".format(activeClusters))

    k = len(graph.context.subalignments)
    self.gainStructure = [[0] * graph.subalignmentLengths[i]
                          for i in range(k)]
    for i in range(k):
        row = self.gainStructure[i]
        for j in range(graph.subalignmentLengths[i]):
            node = graph.subsetMatrixIdx[i] + j
            weight = self.weights.get((node, self.elementClusters[node]), 0)
            # Prefix sum along the subalignment.
            if j == 0:
                row[j] = weight
            else:
                row[j] = weight + row[j - 1]

    self.getPositiveMoves(graph)
    Configs.log("Starting with {} candidate moves..".format(len(self.heap)))
def optimizeTrace(graph):
    """Optionally optimize ``graph.clusters`` and log how long it took.

    When ``Configs.graphTraceOptimize`` is set, singleton clusters are added
    and the clustering is replaced by the result of ``optimizeClusters``.
    Otherwise the trace is left untouched.
    """
    time1 = time.time()
    if not Configs.graphTraceOptimize:
        Configs.log("Skipping optimization pass..")
    else:
        Configs.log("Optimization pass..")
        graph.addSingletonClusters()
        graph.clusters = optimizeClusters(graph, graph.clusters)
        numClusters = len(graph.clusters)
        totalCost = graph.computeClusteringCost(graph.clusters)
        Configs.log(
            "Optimized the trace to {} clusters with a total cost of {}".
            format(numClusters, totalCost))
    time2 = time.time()
    Configs.log("Finished optimization in {} sec..".format(time2 - time1))