def readSimulationData(dataFolder):
    #Collect the real tree and the real allele matrix from every simulation subfolder in dataFolder

    trees = []
    alleles = []
    simulationDataIds = []

    for subdir, dirs, files in os.walk(dataFolder):

        if subdir == dataFolder:  #we are not interested in the root folder
            continue
        simulationDataIds.append(subdir)
        for file in files:

            #Also collect the real tree and inferred tree to compute the ancestry swap errors
            if re.match('RealTrees', file):  #read the file and obtain the real tree
                stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(
                    file, subdir)[0]
                tree = eval(stringDict)
                realTree = Graph(tree['vertices'], set(tree['edges']),
                                 tree['edges'])
                trees.append(realTree)
            if re.match('RealA', file):
                currentAlleles = np.loadtxt(subdir + '/' + file, dtype='str')
                alleles.append(currentAlleles)

    return trees, alleles, simulationDataIds
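A minimal usage sketch of readSimulationData; the folder name is hypothetical, and edgeList is assumed to behave as in the later examples:

#Hypothetical usage; the folder path is an assumption
trees, alleles, simulationDataIds = readSimulationData('simulations/snvs_0.02')
for dataId, realTree in zip(simulationDataIds, trees):
    print('%s: %d edges in the real tree' % (dataId, len(realTree.edgeList)))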
Code example #2
def computeMST(fullGraph, vertexNames):
    #Compute the minimum spanning arborescence on the full graph and wrap it in a new Graph
    newEdges = SpanningArborescence().computeMinimumSpanningArborescence(
        fullGraph, None)
    newGraph = Graph(vertexNames, None, None)
    newGraph.setEdges(newEdges)

    #Update the node names in the edges
    updatedEdges = []
    for edge in newGraph.edgeList:
        newEdge = (edge[0], vertexNames[edge[1]], vertexNames[edge[2]])
        updatedEdges.append(newEdge)
    newGraph.edgeList = updatedEdges
    newGraph.edges = set(updatedEdges)
    return newGraph
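The renaming loop above assumes each edge is a (weight, parent index, child index) tuple; a tiny sketch of that mapping with made-up vertex names:

#Hypothetical vertex names and edge; the (weight, parent, child) format mirrors the loop above
vertexNames = ['Healthy', 'Subclone1', 'Subclone2']
edge = (0.4, 1, 2)
namedEdge = (edge[0], vertexNames[edge[1]], vertexNames[edge[2]])
print(namedEdge)  #(0.4, 'Subclone1', 'Subclone2')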
Code example #3
    def generateInitialTree(self, vertexNames, dists, aDists, samples,
                            snvAnnotations, alleleAnnotations):
        #Generate a full tree with all the edges
        fullGraph = Graph(vertexNames, None, None)
        fullGraph.setEdgesFromDistances(dists)
        #Combine the edge annotations:
        allMessages = deepcopy(snvAnnotations)
        for k in alleleAnnotations.keys():
            if k in allMessages:
                allMessages[k] = allMessages[k] + alleleAnnotations[k]
            else:
                allMessages[k] = alleleAnnotations[k]

        fullGraph.edgeAnnotations = allMessages
        print fullGraph
        print fullGraph.edgeList
        #Remove the edges that, based on the allele distances, we are confident are not possible
        for sample1Ind in range(0, len(samples)):
            for sample2Ind in range(0, len(samples)):
                if aDists[sample1Ind, sample2Ind] == float("inf"):
                    fullGraph.removeEdge((0, sample1Ind, sample2Ind))

        return fullGraph
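A small standalone sketch of the edge-filtering step above, with a made-up allele distance matrix (removeEdge is assumed to take the same (0, parent, child) tuples used here):

import numpy as np

#Made-up allele distance matrix for three samples; inf marks a pair that cannot be parent and child
aDists = np.array([[0., 1., np.inf],
                   [1., 0., 2.],
                   [np.inf, 2., 0.]])
for sample1Ind in range(aDists.shape[0]):
    for sample2Ind in range(aDists.shape[1]):
        if aDists[sample1Ind, sample2Ind] == float("inf"):
            print('would remove edge (0, %d, %d)' % (sample1Ind, sample2Ind))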
Code example #4
def generateInitialTree(dists, vertexNames):
    fullGraph = Graph(vertexNames, None, None)
    fullGraph.setEdgesFromDistances(dists)

    return fullGraph
Code example #5
def readDataIncludingPermutations(dataFolder, noiseLevels):

    groupedCErrors = dict()
    groupedAErrors = dict()
    groupedMuErrors = dict()
    groupedTreeErrors = dict()
    groupedAmbiguityErrors = dict()

    groupedEuclideanErrors = dict()
    groupedAverageAncestrySwapErrors = dict()

    for noiseLevel in noiseLevels:
        simulationFolder = dataFolder + '/snvs_' + str(noiseLevel)

        #Read all the errors into one list for this noise level
        cErrors = []
        aErrors = []
        muErrors = []
        treeErrors = []
        ambiguityErrors = []
        ambiguityCorrectedErrors = []
        averagedAncestrySwapError = []

        treeSizes = []

        for subdir, dirs, files in os.walk(simulationFolder):

            if subdir == simulationFolder:  #we are not interested in the root folder
                continue

            for file in files:
                if re.match('cError',
                            file):  #read the file and obtain the error
                    cErrors += collectErrorsFromFile(file, subdir)
                if re.match('aError',
                            file):  #read the file and obtain the error
                    aErrors += collectErrorsFromFile(file, subdir)
                if re.match('muError',
                            file):  #read the file and obtain the error
                    muErrors += collectErrorsFromFile(file, subdir)
                if re.match('treeError',
                            file):  #read the file and obtain the error
                    treeErrors += collectErrorsFromFile(file, subdir)
                if re.match('RealTrees',
                            file):  #read the file and obtain the error
                    stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(
                        file, subdir)[0]
                    tree = eval(stringDict)
                    realTree = Graph(tree['vertices'], set(tree['edges']),
                                     tree['edges'])
                    treeSizes.append(len(realTree.edgeList))

                if re.match('EstimatedTrees',
                            file):  #read the file and obtain the error
                    stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(
                        file, subdir)[0]
                    tree = eval(stringDict)
                    inferredTree = Graph(tree['vertices'], set(tree['edges']),
                                         tree['edges'])

            [
                ancestrySwapErrorAbsentInInferred,
                ancestrySwapErrorPresentInInferred, noOfSamplePairs
            ] = computeTreeErrorOtherMetrics.computeAncestrySwapError(
                realTree, inferredTree)

            #Instead of reporting the raw error counts, report them as a fraction of the total number of sample pairs (i.e. of how bad we could have done)
            summedError = (ancestrySwapErrorAbsentInInferred +
                           ancestrySwapErrorPresentInInferred)
            averagedAncestrySwapError.append(summedError /
                                             float(noOfSamplePairs))

        #Gather the data per noise level
        groupedCErrors[noiseLevel] = cErrors
        groupedAErrors[noiseLevel] = aErrors
        groupedMuErrors[noiseLevel] = muErrors
        groupedTreeErrors[noiseLevel] = treeErrors
        groupedAmbiguityErrors[noiseLevel] = ambiguityCorrectedErrors
        groupedAverageAncestrySwapErrors[
            noiseLevel] = averagedAncestrySwapError

        #Move this to a function to make it better
        #Also compute the Euclidean distance trees for each noise level, and add this as an additional error
    #Return the grouped data
    return [
        groupedCErrors, groupedAErrors, groupedMuErrors, groupedTreeErrors,
        groupedAverageAncestrySwapErrors
    ]
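The normalisation applied to the ancestry swap error above, as a small worked sketch with made-up counts:

#Made-up counts for one simulation run
ancestrySwapErrorAbsentInInferred = 3
ancestrySwapErrorPresentInInferred = 2
noOfSamplePairs = 45

summedError = ancestrySwapErrorAbsentInInferred + ancestrySwapErrorPresentInInferred
print(summedError / float(noOfSamplePairs))  #~0.111, the fraction of sample pairs with a swapped ancestry relation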
Code example #6
	muErrorFile = muErrorFiles[0]
	
	cError = collectErrorsFromFile(cErrorFile)[0]
	aError = collectErrorsFromFile(aErrorFile)[0]
	muError = collectErrorsFromFile(muErrorFile)[0]
	
	cErrors.append(cError)
	aErrors.append(aError)
	muErrors.append(muError)
	
	realTreeFile = glob(subdir + "/RealTrees_1.txt")[0]
	inferredTreeFile = glob(subdir + "/EstimatedTrees_1.txt")[0]
	
	stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile('RealTrees_1.txt', subdir)[0]
	tree = eval(stringDict)
	realTree = Graph(tree['vertices'], set(tree['edges']), tree['edges'])
	
	stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile("EstimatedTrees_1.txt", subdir)[0]
	tree = eval(stringDict)
	inferredTree = Graph(tree['vertices'], set(tree['edges']), tree['edges'])

	#Compute the ancestry swap error
	[ancestrySwapErrorAbsentInInferred, ancestrySwapErrorPresentInInferred, noOfSamplePairs] = computeTreeErrorOtherMetrics.computeAncestrySwapError(realTree, inferredTree)

	summedError = (ancestrySwapErrorAbsentInInferred + ancestrySwapErrorPresentInInferred)
	ancestrySwapError = summedError / float(noOfSamplePairs)
	
	treeErrors.append(ancestrySwapError)


#Compute the pairwise error for each simulation
def readDataIncludingPermutations(dataFolder, noiseLevels):
	
	groupedCErrors = dict()
	groupedAErrors = dict()
	groupedMuErrors = dict()
	groupedTreeErrors = dict()
	groupedAncestrySwapErrors = dict()
	groupedAmbiguityErrors = dict()
	groupedPCErrors = dict()
	groupedPAErrors = dict()
	groupedPMuErrors = dict()
	groupedPTreeErrors = dict()
	groupedPAmbiguityErrors = dict()
	groupedPAncestrySwapErrors = dict()
	
	groupedEuclideanErrors = dict()
	
	for noiseLevel in noiseLevels:
		simulationFolder = dataFolder + '/snps_' + str(noiseLevel)
		
		#Read all the errors into one list for this noise level
		cErrors = []
		aErrors = []
		muErrors = []
		treeErrors = []
		ancestrySwapErrors = []
		ambiguityErrors = []
		ambiguityCorrectedErrors = []
		pCErrors = []
		pAErrors = []
		pMuErrors = []
		pTreeErrors = []
		pAmbiguityErrors = []
		pAmbiguityCorrectedErrors = []
		pAncestrySwapErrors = []
		
		realTree = None
		inferredTree = None
		treeSizes = []
		
		for subdir, dirs, files in os.walk(simulationFolder):
			
			
			if subdir == simulationFolder: #we are not interested in the root folder
				continue
			
			if re.search('horizontalShuffle', subdir) is not None:
				for file in files:
					if re.match('cError', file): #read the file and obtain the error
						pCErrors += collectErrorsFromFile(file, subdir)
					if re.match('aError', file): #read the file and obtain the error
						pAErrors += collectErrorsFromFile(file, subdir)
					if re.match('muError', file): #read the file and obtain the error
						pMuErrors += collectErrorsFromFile(file, subdir)
					if re.match('treeError', file): #read the file and obtain the error
						pTreeErrors += collectErrorsFromFile(file, subdir)
					if re.match('RealTrees', file): #read the file and obtain the error
						stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(file, subdir)[0]
						tree = eval(stringDict)
						realTree = Graph(tree['vertices'], set(tree['edges']), tree['edges'])
						treeSizes.append(len(realTree.edgeList))
					
					if re.match('EstimatedTrees', file): #read the file and obtain the error
						stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(file, subdir)[0]
						tree = eval(stringDict)
						inferredTree = Graph(tree['vertices'], set(tree['edges']), tree['edges'])
						
				
				#Compute the ancestry swap error
				[ancestrySwapErrorAbsentInInferred, ancestrySwapErrorPresentInInferred, noOfSamplePairs] = computeTreeErrorOtherMetrics.computeAncestrySwapError(realTree, inferredTree)

				summedError = (ancestrySwapErrorAbsentInInferred + ancestrySwapErrorPresentInInferred)
				pAncestrySwapErrors.append(summedError / float(noOfSamplePairs))	
			
			else:
				for file in files:
					if re.match('cError', file): #read the file and obtain the error
						cErrors += collectErrorsFromFile(file, subdir)
					if re.match('aError', file): #read the file and obtain the error
						aErrors += collectErrorsFromFile(file, subdir)
					if re.match('muError', file): #read the file and obtain the error
						muErrors += collectErrorsFromFile(file, subdir)
					if re.match('treeError', file): #read the file and obtain the error
						treeErrors += collectErrorsFromFile(file, subdir)
					if re.match('RealTrees', file): #read the file and obtain the error
						stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(file, subdir)[0]
						tree = eval(stringDict)
						realTree = Graph(tree['vertices'], set(tree['edges']), tree['edges'])
						treeSizes.append(len(realTree.edgeList))
					
					if re.match('EstimatedTrees', file): #read the file and obtain the error
						stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(file, subdir)[0]
						tree = eval(stringDict)
						inferredTree = Graph(tree['vertices'], set(tree['edges']), tree['edges'])
						
				
				#Compute the ancestry swap error
				[ancestrySwapErrorAbsentInInferred, ancestrySwapErrorPresentInInferred, noOfSamplePairs] = computeTreeErrorOtherMetrics.computeAncestrySwapError(realTree, inferredTree)

				summedError = (ancestrySwapErrorAbsentInInferred + ancestrySwapErrorPresentInInferred)
				ancestrySwapErrors.append(summedError / float(noOfSamplePairs))	
			
		#Gather the data per noise level
		groupedCErrors[noiseLevel] = cErrors
		groupedAErrors[noiseLevel] = aErrors
		groupedMuErrors[noiseLevel] = muErrors
		groupedTreeErrors[noiseLevel] = treeErrors
		groupedAncestrySwapErrors[noiseLevel] = ancestrySwapErrors
		groupedAmbiguityErrors[noiseLevel] = ambiguityCorrectedErrors
		groupedPCErrors[noiseLevel] = pCErrors
		groupedPAErrors[noiseLevel] = pAErrors
		groupedPMuErrors[noiseLevel] = pMuErrors
		groupedPTreeErrors[noiseLevel] = pTreeErrors
		groupedPAncestrySwapErrors[noiseLevel] = pAncestrySwapErrors
		groupedPAmbiguityErrors[noiseLevel] = pAmbiguityCorrectedErrors
		
		#Move this to a function to make it better
		#Also compute the Euclidean distance trees for each noise levels, add this as an additional error
	#Return the grouped data
	
	print "tree errors: ", groupedTreeErrors
	print "shuffled tree errors: ", groupedPTreeErrors
	
	return [groupedCErrors, groupedAErrors, groupedMuErrors, groupedTreeErrors, groupedAncestrySwapErrors, groupedPCErrors, groupedPAErrors, groupedPMuErrors, groupedPTreeErrors, groupedPAncestrySwapErrors]
Code example #8
def readData(dataFolder, noiseLevels, addition):

    groupedCErrors = dict()
    groupedAErrors = dict()
    groupedMuErrors = dict()
    groupedTreeErrors = dict()
    groupedAverageAncestrySwapErrors = dict()

    for noiseLevel in noiseLevels:
        simulationFolder = dataFolder + "_" + str(noiseLevel)

        #Read all the errors into one list for this noise level
        cErrors = []
        aErrors = []
        muErrors = []
        treeErrors = []
        averagedAncestrySwapError = []
        treeSizes = []

        for subdir, dirs, files in os.walk(simulationFolder):
            if subdir == simulationFolder:  #we are not interested in the root folder
                continue
            for file in files:
                if re.match('cError',
                            file):  #read the file and obtain the error
                    cErrors += collectErrorsFromFile(file, subdir)
                if re.match('aError',
                            file):  #read the file and obtain the error
                    aErrors += collectErrorsFromFile(file, subdir)
                if re.match('muError',
                            file):  #read the file and obtain the error
                    muErrors += collectErrorsFromFile(file, subdir)
                if re.match('treeError',
                            file):  #read the file and obtain the error
                    treeErrors += collectErrorsFromFile(file, subdir)
                if re.match('RealTrees',
                            file):  #read the file and obtain the error
                    stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(
                        file, subdir)[0]
                    tree = eval(stringDict)
                    realTree = Graph(tree['vertices'], set(tree['edges']),
                                     tree['edges'])
                    treeSizes.append(len(realTree.edgeList))

                if re.match('EstimatedTrees',
                            file):  #read the file and obtain the error
                    stringDict = computeTreeErrorOtherMetrics.collectErrorsFromFile(
                        file, subdir)[0]
                    tree = eval(stringDict)
                    inferredTree = Graph(tree['vertices'], set(tree['edges']),
                                         tree['edges'])

            [
                ancestrySwapErrorAbsentInInferred,
                ancestrySwapErrorPresentInInferred, noOfSamplePairs
            ] = computeTreeErrorOtherMetrics.computeAncestrySwapError(
                realTree, inferredTree)

            summedError = (ancestrySwapErrorAbsentInInferred +
                           ancestrySwapErrorPresentInInferred)
            averagedAncestrySwapError.append(summedError /
                                             float(noOfSamplePairs))

        #Gather the data per noise level
        groupedCErrors[noiseLevel] = cErrors
        groupedAErrors[noiseLevel] = aErrors
        groupedMuErrors[noiseLevel] = muErrors
        groupedTreeErrors[noiseLevel] = treeErrors
        groupedAverageAncestrySwapErrors[
            noiseLevel] = averagedAncestrySwapError

    #Return the grouped data
    return [
        groupedCErrors, groupedAErrors, groupedMuErrors, groupedTreeErrors,
        groupedAverageAncestrySwapErrors
    ]
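A hypothetical call to readData (the folder prefix and noise levels are assumptions; the unused addition argument is passed as an empty string):

#Hypothetical folder prefix and noise levels; folders are expected as <dataFolder>_<noiseLevel>
noiseLevels = [0.01, 0.02, 0.04]
[groupedC, groupedA, groupedMu, groupedTree, groupedAncestrySwap] = readData('Results/simulations', noiseLevels, '')
for noiseLevel in noiseLevels:
    print('%s: %d tree errors collected' % (noiseLevel, len(groupedTree[noiseLevel])))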
Code example #9
def build_order_2(v, e):
    g = Graph(v)
Code example #10
    def updateTree(self, fullGraph, allSomaticVariants, samples):
        #Copy the full graph, we iteratively update the full graph, but if there is no solution we can get back the original full graph and somatic variants (in case of precursors)
        originalGraph = deepcopy(fullGraph)
        originalSomaticVariants = deepcopy(allSomaticVariants)
        newSomaticVariants = deepcopy(allSomaticVariants)
        vertexNames = fullGraph.vertices

        #We go through the edges involved in causing ISA violation.
        #For either of these edges it may be efficient to place it elsewhere. This is the edge with the largest distance.
        #Remove this edge from the tree
        #Re-run Edmond's algorithm but then without this edge until the ISA is resolved.

        unresolved = False
        resolved = False
        removedEdge = None
        seenPrecursorSamples = []
        edgeGone = False
        iter = 0
        savedGraph = None
        #print "resolving the ISA"
        print "reconstructing tree: "
        #Rather than only storing the full score of the trees, we should keep a score indicating how many edges violate the ISA.
        #If we cannot solve the ISA for the tree, we report the tree which had the fewest number of violations.
        #For every list of trees associated with the # of violations, we can sort the trees by their weights. The tree with the smallest weight will be on top.
        treesAndIsaViolations = dict()  #store the trees by the number of violations. We store the tree objects, these have weights associated.
        newGraph = Graph(vertexNames, set(), [])

        #In the case that we ignore SNVs, we do not need to resolve the ISA by checking bad edges. The first found tree is the solution.

        while resolved is False:
            #print fullGraph.getGraph()
            newEdges = SpanningArborescence(
            ).computeMinimumSpanningArborescence(fullGraph, newSomaticVariants)

            #maybe we should check if all edges are present. If not, then we also introduce a precursor
            if newEdges is not False:
                precursorNeeded = False
                childrenFound = []
                for edge in newEdges:
                    child = edge[2]
                    for sampleInd in range(0, len(samples)):
                        if child == sampleInd:
                            childrenFound.append(child)
                if len(childrenFound) != (
                        len(samples) - 1
                ):  #we will always miss sample 0, this is a parent and not a child.
                    print newEdges
                    print "Warning: the tree misses nodes"
                    precursorNeeded = True

            if newEdges is False and settings.trees['precursor'] is False:
                treesAndIsaViolations[float("inf")] = []
                treesAndIsaViolations[float("inf")].append(deepcopy(newGraph))
                unresolved = True
                break

            #We only attempt to introduce a precursor if this is specified in the settings.
            if newEdges is False or precursorNeeded is True and settings.trees[
                    'precursor'] is True:  #we end up here when there is no more possible tree. In this case, we need to reset the tree and add precursor nodes.
                [newEdges, newSomaticVariants
                 ] = fullGraph.addPrecursorNode(originalGraph,
                                                originalSomaticVariants,
                                                samples)

            newVertices = deepcopy(vertexNames)
            newVertices.append(len(samples))  #add a new precursor state
            newGraph = Graph(newVertices, None, None)
            newGraph.setEdges(newEdges)
            newGraph.edgeAnnotations = fullGraph.edgeAnnotations

            if iter == 0:
                savedGraph = deepcopy(newGraph)
            iter += 1

            badEdgeData = newGraph.checkIfTreeIsValid(newSomaticVariants)
            badEdges = badEdgeData[0]
            violatingEdges = badEdgeData[1]

            #In this case, we do not want to resolve the ISA and the first reported tree is the minimum distance tree.
            #Thus, the bad edges are None by default.
            badEdges = None
            if badEdges is None:

                resolved = True
                break

            #The number of violating edges is a score for the trees.

            if len(violatingEdges) not in treesAndIsaViolations.keys():
                treesAndIsaViolations[len(violatingEdges)] = []

            treesAndIsaViolations[len(violatingEdges)].append(
                deepcopy(newGraph))
            print "bad edges: ", badEdges
            #Remove the edge with the largest distance
            #we choose the edge that together (somvar * distance) has the worst score.
            currentLargestDist = -float("inf")
            currentWorstEdge = 0
            if len(badEdges) > 0:
                edgeCounter = 0
                for edge in badEdges:
                    child = edge[2]
                    parent = edge[1]
                    print "current edge: ", edge
                    #totalDistance = (math.exp(violatingWeights[edgeCounter])) * (edge[0])
                    totalDistance = edge[0]
                    print "total distance: ", totalDistance
                    if totalDistance > currentLargestDist:
                        currentWorstEdge = edge
                        currentLargestDist = totalDistance

                    edgeCounter += 1
            #remove the problematic edge

            print "removing edge: ", currentWorstEdge
            fullGraph.removeEdge(currentWorstEdge)

            #print "removing edge: ", currentWorstEdge

        #if newGraph is None: #sometimes we cannot resolve the ISA
        #	newGraph = deepcopy(fullGraph)

        #check if the new graph contains all nodes, throw this warning only at the end
        childrenFound = []
        for edge in newGraph.edges:
            child = edge[2]
            for sampleInd in range(0, len(samples)):
                if child == sampleInd:
                    childrenFound.append(child)
        if len(childrenFound) != (
                len(samples) - 1
        ):  #we will always miss sample 0, this is a parent and not a child.
            print "Warning: missing too many samples to resolve the ISA, reporting truncated trees"

        #we also need to check if all edges are there. A tree that is truncated at only one or two positions is not bad, we can still report this to the user and not place one or two nodes.
        #if many more nodes are missing, the tree does not make much sense anymore. Here we can check if most of the tree is missing. If we miss more than 80% of nodes, report the
        #minimum spanning tree instead of an empty/half-empty tree.
        #This step does not work if the alleles are also involved in the lack of nodes! Then we need to work with a completely different set of weights.
        minimumTreeContent = 0.8
        #Use float division: in Python 2, integer division would make this check trigger whenever any node is missing
        if len(childrenFound) / float(len(samples) - 1) < minimumTreeContent:
            print "Less than 80% of nodes are placed in the evolutionary tree. Reporting the minimum spanning tree instead, ISA is not resolved"
        message = ""
        if unresolved is True or len(childrenFound) / float(
                len(samples) - 1
        ) < minimumTreeContent:  #if we did not succeed with introducing a precursor we should also report the best tree
            print "Did not resolve the ISA, selecting the tree with the fewest violations"
            message = "Did not resolve the ISA, reporting the tree with the fewest ISA violations"
            #in this case we select the best tree.
            bestKey = float("inf")

            for violationKey in treesAndIsaViolations.keys():
                if violationKey < bestKey:
                    bestKey = violationKey

            #If the bestKey is infinite, we were unable to reconstruct a tree.
            if bestKey == float("inf"):
                bestTree = newGraph
                print "the best tree: ", bestTree.edgeList
                print "Did not find a correct tree"
                return bestTree

            #Obtain the set of trees with this number of violations
            bestTree = None
            bestTreeWeight = float("inf")
            for tree in treesAndIsaViolations[bestKey]:
                if tree.getTotalWeight() < bestTreeWeight:
                    bestTree = tree
                    bestTreeWeight = tree.getTotalWeight()

            #print "number of violations: ", bestKey

            newGraph = deepcopy(
                bestTree
            )  #use the first made graph, this is without edit operations and precursors that failed.
        return [newGraph, message]
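The fallback selection at the end of updateTree, isolated as a runnable sketch; plain (weight, label) tuples stand in for the Graph objects and their getTotalWeight():

#Hypothetical violation counts mapped to (weight, label) pairs standing in for the Graph objects above
treesAndIsaViolations = {2: [(14.0, 'treeA'), (11.5, 'treeB')], 5: [(9.0, 'treeC')]}

bestKey = min(treesAndIsaViolations.keys())  #fewest ISA violations
bestTree = None
bestTreeWeight = float("inf")
for weight, label in treesAndIsaViolations[bestKey]:
    if weight < bestTreeWeight:  #lowest total weight among the least-violating trees
        bestTree = label
        bestTreeWeight = weight
print('%s with weight %.1f and %d ISA violations' % (bestTree, bestTreeWeight, bestKey))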
Code example #11
    def run(self, samples):

        eventDistances = self.eventDistances
        cCombinations = self.cCombinations

        #get all somatic variants in one binary numpy array
        allSomaticVariants = self.mergeVariantsIntoMatrix(samples)

        #set the segmentation
        samples = self.updateSampleSegmentation(samples)

        measurementLength = len(samples[0].measurements.measurements)

        #Define the original graph, here the parent is always the healthy cell for every subclone.
        vertexNames = []  #use this to convert back to the actual names later
        for sample in samples:
            vertexNames.append(sample.name)
        vertices = range(0, len(samples))

        #Define the first graph, where node 0 is the parent of all subclones
        edgeList = []
        for i in range(1, len(vertices)):
            edgeList.append((0, 0, i))
        currentGraph = Graph(vertices, set(edgeList), edgeList)

        maxIterNum = settings.general['maximumIterations'] - 1
        converged = False
        iteration = 0
        graphList = []  #store all the graphs that we have made up until that point
        #Keep a table that we print to a file afterwards in which we store the edges that have been present and their occurrence at each iteration

        iterationGraphs = dict()
        edges = dict()
        iterationMu = dict()  #store the mu of the samples per iteration
        iterationMessages = dict()
        while converged is not True:

            start_time = time.time()
            #This is where we get back after one iteration. The samples need to have their parents updated based on the current graph

            print "iteration: ", iteration

            #1. Infer the best C and mu combination per sample given our current tree
            samples = self.inferBestCMu(samples, currentGraph)

            #The best C and mu are stored for each sample, we wish to obtain this information for each sample individually and store it in a matrix that we can use to compute distances easily to infer a tree.
            [cMatrix, aMatrix] = self.getCAndAMatrices(samples)

            #Compute the distance matrix that we use to infer the tree
            [dists, aDists, sDists, snvAnnotations,
             alleleAnnotations] = self.computeDistanceMatrix(aMatrix, samples)

            #Generate the initial tree
            fullGraph = self.generateInitialTree(vertexNames, dists, aDists,
                                                 samples, snvAnnotations,
                                                 alleleAnnotations)

            #Update the tree to resolve the ISA
            [newGraph, message] = self.updateTree(fullGraph,
                                                  allSomaticVariants, samples)

            #make a check for convergence, see if the current tree has the same edges as the newly inferred edges. Are the weights the same? Otherwise continue until a max.
            #we should keep the current graphs in a list and report them based on their scores.
            if iteration == maxIterNum:
                converged = True
            else:
                for previousGraph in graphList:
                    if newGraph.compareIfGraphIsTheSame(previousGraph) is True:

                        converged = True

            graphList.append(currentGraph)
            currentGraph = newGraph

            # newEdgeList = []
            # for edge in newGraph.edgeList: #append the edges to the unique list, we will print this later
            # 	newEdge = (0, edge[1], edge[2])
            # 	newEdgeList.append(newEdge)
            # 	if newEdge not in uniqueEdges:
            # 		uniqueEdges.append(newEdge)
            #
            # edges[iteration] = newEdgeList

            print "best graph: "
            print currentGraph.getGraph()

            #print the total distance in the graph as well
            print currentGraph.getTotalWeight()

            #Store the graph of each iteration by its weight.
            iterationGraphs[iteration] = currentGraph

            #Loop through the current mu (tumor fraction) of the samples, store these in a dictionary
            allMu = []
            for sample in samples:
                allMu.append(sample.bestCMu[0].mu.mu[1])

            iterationMu[iteration] = allMu
            iterationMessages[iteration] = message
            iteration += 1

        iterationGraphs = self.updateTreeNodeNames(iterationGraphs,
                                                   vertexNames)

        return [
            cMatrix, aMatrix, samples, iterationGraphs, iterationMu,
            iterationMessages
        ]  #are the samples references between classes? Otherwise we may not need to return it for the mu values.
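A standalone sketch of the convergence check used in run; frozensets of edges stand in for the Graph objects, and set equality approximates compareIfGraphIsTheSame (an assumption):

#Frozensets of (weight, parent, child) edges stand in for Graph objects; set equality approximates compareIfGraphIsTheSame
graphList = [frozenset([(0, 0, 1), (0, 1, 2)])]
newGraphEdges = frozenset([(0, 0, 1), (0, 1, 2)])
maxIterNum = 9  #settings.general['maximumIterations'] - 1 in the method above
iteration = 3

converged = iteration == maxIterNum or any(newGraphEdges == previousGraph for previousGraph in graphList)
print(converged)  #True: the newly inferred tree matches an earlier iteration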
Code example #12
eAMatrix = np.loadtxt(sys.argv[2] + '/aMatrix.txt', dtype=str)
eMu = np.loadtxt(sys.argv[2] + '/EstimatedMu.txt', dtype=float)

#Obtain the tree
text_file = open(sys.argv[2] + '/EstimatedTree.txt', "r")
lines = text_file.read()
stringDict = []

for line in lines.split("\n"):
    if line != "":
        stringDict.append(line)

text_file.close()

tree = eval(stringDict[0])
eTree = Graph(tree['vertices'], set(tree['edges']), tree['edges'])

#Also read the real matrices with the ground truth

cMatrix = np.loadtxt(sys.argv[3] + '/RealC.txt', dtype=int)
aMatrix = np.loadtxt(sys.argv[3] + '/RealA.txt', dtype=str)
realMu = np.loadtxt(sys.argv[3] + '/RealMu.txt', dtype=float)

#Obtain the tree
text_file = open(sys.argv[3] + '/RealTree.txt', "r")
lines = text_file.read()
stringDict = []

for line in lines.split("\n"):
    if line != "":
        stringDict.append(line)