def setAlleleCombinations(self):
    """Append one AlleleCombination per possible A/B split of ``self.c[1]``.

    With ``k = self.c[1]``, every A count from 0 up to and including k is
    paired with the complementary B count (k - A). Each resulting tumor
    ``Alleles`` object is combined with a freshly created ``Alleles(1, 1)``
    (the normal alleles) and the pair is appended to
    ``self.alleleCombinations``. Nothing is returned.
    """
    totalCopies = self.c[1]
    for aCopies in range(totalCopies + 1):
        # Whatever is not an A copy must be a B copy.
        tumor = Alleles(aCopies, totalCopies - aCopies)
        # A new normal object per combination, so that combinations never
        # share one instance.
        normal = Alleles(1, 1)
        self.alleleCombinations.append(AlleleCombination([normal, tumor]))
def getAllelesByLaf(self, parentAlleles, laf):
    """Return the tumor alleles whose LAF interval contains ``laf``.

    The interval borders ("valleys") are looked up in
    ``self.mixtureModelValleys`` under the parent's allele string; they used
    to be detected on the fly from the mixture model, which was slow, and
    are now pre-computed. Consecutive borders delimit intervals, and the
    index of the interval containing ``laf`` selects the entry of
    ``self.c.alleleCombinations`` whose second alleles object is returned.

    Parameters:
        parentAlleles: object providing ``getAllelesAsString()``.
        laf: the measured LAF value to classify.

    Returns:
        ``Alleles(1, 1)`` when ``self.mu.mu[0] == 1`` (no tumor component,
        so the alleles can only be AB by assumption); otherwise
        ``alleles[1]`` of the selected allele combination.
    """
    # Without a tumor component the alleles can only be AB by assumption.
    if self.mu.mu[0] == 1:
        return Alleles(1, 1)

    borders = self.mixtureModelValleys[parentAlleles.getAllelesAsString()]

    # The alleles remain ordered, so the interval index is also the index of
    # the allele combination. Both borders are inclusive: a value lying
    # exactly on a border falls into two intervals and the later one wins.
    # A value outside every interval falls back to index 0.
    containing = [
        index for index in range(len(borders) - 1)
        if borders[index] <= laf <= borders[index + 1]
    ]
    chosen = containing[-1] if containing else 0

    return self.c.alleleCombinations[chosen].alleles[1]
def generateAlleleList(self, kmin, kmax):
    """Enumerate the allele strings for every copy number in kmin..kmax.

    For each copy number k (both bounds inclusive) and each A count from 0
    to k, the ``alleleString`` of ``Alleles(A, k - A)`` is recorded.

    Sets two attributes on ``self``:
        alleleList: the allele strings, ordered by k and then by A count.
        alleleListDict: allele string -> position in ``alleleList``, kept
            for lookup speed. Should a string occur more than once, the
            position of its last occurrence is stored.

    Parameters:
        kmin: smallest copy number to enumerate.
        kmax: largest copy number to enumerate (inclusive).
    """
    self.alleleList = []
    self.alleleListDict = dict()

    # One pass fills both containers, so every Alleles object is built once
    # and the list and its index dictionary cannot drift apart.
    for k in range(kmin, kmax + 1):
        for aCount in range(k + 1):
            alleleString = Alleles(aCount, k - aCount).alleleString
            # The next free list position is the index of this entry.
            self.alleleListDict[alleleString] = len(self.alleleList)
            self.alleleList.append(alleleString)
def computeAmbiguousPositions(kmin=1, kmax=6):
    """Count how many (alleles, mu) pairs generate each possible LAF.

    The allele list is taken from ``EventDistances(kmin, kmax)``. Every
    allele string with at least as many B's as A's is combined with the
    normal alleles ``Alleles(1, 1)``, and for each mu index 0..100 the LAF
    reported by ``AlleleCombination.computeLAF`` is tallied.

    Parameters:
        kmin: lower copy-number bound handed to ``EventDistances``. The
            default of 1 is the value used in the simulations.
        kmax: upper copy-number bound handed to ``EventDistances``. The
            default of 6 is the value used in the simulations.

    Returns:
        dict mapping each LAF value to the number of (alleles, mu index)
        pairs that produce it. A count above 1 means the LAF has more than
        one solution, i.e. the position is ambiguous.
    """
    # The event distances class already knows how to build the allele list.
    alleleList = EventDistances(kmin, kmax).alleleList
    normalAlleles = Alleles(1, 1)

    lafCounts = dict()
    for allele in alleleList:
        aCount = allele.count('A')
        bCount = allele.count('B')

        # Only B >= A is considered so that alleles are not duplicated.
        # NOTE(review): presumably the mirrored A > B case yields the same
        # LAF -- confirm.
        if bCount < aCount:
            continue

        alleleCombination = AlleleCombination(
            [normalAlleles, Alleles(aCount, bCount)])
        for muIndex in range(0, 101):
            # The LAF this combination would generate at this mu.
            laf = alleleCombination.computeLAF(Mu(muIndex))
            # Look the key up directly: under Python 2 going through
            # .keys() would build a list for every single membership test.
            lafCounts[laf] = lafCounts.get(laf, 0) + 1

    return lafCounts
def computeATreeError(aMatrix, lafMatrix, afMatrix, realTree):
    """Score the tree inferred from an allele matrix against ``realTree``.

    The allele strings in ``aMatrix`` are converted to ``Alleles`` objects,
    an all-against-all distance matrix between the sample columns is
    computed with ``FST().computeAlleleDistance``, a minimum spanning tree
    is derived from it, and that tree is scored with
    ``SimulationErrorHandler.computeTreeError``.

    Parameters:
        aMatrix: 2-D array of allele strings; columns are samples.
        lafMatrix: array whose column i supplies the LAF values of sample i.
        afMatrix: array whose column i supplies the AF values of sample i.
        realTree: reference tree; its ``vertices`` are used to build the
            graph and the MST.

    Returns:
        The value returned by ``computeTreeError([mst], realTree)``.
    """
    def toAlleles(alleleString):
        # The number of A's and B's in the string define the alleles object.
        aCount = len(re.findall('A', alleleString))
        bCount = len(re.findall('B', alleleString))
        return Alleles(aCount, bCount)

    sampleNum = aMatrix.shape[1]

    # Convert the A matrix to a matrix of actual allele objects.
    aObjMatrix = np.empty(aMatrix.shape, dtype=object)
    for rowIdx in range(aMatrix.shape[0]):
        for colIdx in range(aMatrix.shape[1]):
            aObjMatrix[rowIdx][colIdx] = toAlleles(aMatrix[rowIdx][colIdx])

    distanceMatrix = np.empty([sampleNum, sampleNum], dtype=float)
    [chromosomes, positions, segmentation,
     chromosomeArms] = parseReferenceFile()

    def makeDummySample(sampleIdx):
        # Dummy sample object carrying just what the FST function reads.
        # The positions are passed twice, as the third and fourth LAF
        # argument.
        dummy = Sample(None, None)
        dummy.measurements = LAF(lafMatrix[:, sampleIdx], chromosomes,
                                 positions, positions)
        dummy.measurements.segmentation = segmentation
        dummy.afMeasurements = afMatrix[:, sampleIdx]
        return dummy

    # Pairwise distance between samples, one whole column at a time.
    for first in range(sampleNum):
        for second in range(sampleNum):
            firstSample = makeDummySample(first)
            secondSample = makeDummySample(second)
            [_messages, distance] = FST().computeAlleleDistance(
                aObjMatrix[:, first], aObjMatrix[:, second], firstSample,
                secondSample)
            distanceMatrix[first, second] = distance

    # Compute the MST and compare it with the real tree.
    fullGraph = generateInitialTree(distanceMatrix, realTree.vertices)
    mst = computeMST(fullGraph, realTree.vertices)

    errorHandler = SimulationErrorHandler()
    return errorHandler.computeTreeError([mst], realTree)
def duplicateGenome(self, simulator):
    """Create and return a child Subclone with a doubled genome.

    Every entry of ``self.C`` becomes ``C([2, 2 * tumorCopyNumber])`` and
    every entry of ``self.A`` becomes an ``Alleles`` object with doubled A
    and B counts. The duplicate keeps the identifiers of its source and
    receives one extra identifier per added copy. The extra identifiers are
    shared by all positions on the same chromosome arm and regenerated
    whenever ``simulator.chromosomeArms`` changes between consecutive
    positions.

    The new subclone gets a deep copy of ``self.somaticVariants``, ``self``
    as parent and a random UUID as name, and is appended to
    ``self.children``.

    Parameters:
        simulator: object whose ``chromosomeArms`` sequence is indexed in
            parallel with ``self.A``.

    Returns:
        The newly created ``Subclone``.
    """
    doubledC = [C([2, entry.c[1] * 2]) for entry in self.C]

    # Allele identifiers have to stay the same across a region that lies on
    # the same chromosome arm.
    previousArm = simulator.chromosomeArms[0]
    armIdentifierA = 'A' + str(uuid.uuid4())
    armIdentifierB = 'B' + str(uuid.uuid4())

    doubledAlleles = []
    for position, source in enumerate(self.A):
        arm = simulator.chromosomeArms[position]
        if arm != previousArm:
            # A different arm starts here, so it gets new identifiers.
            armIdentifierA = 'A' + str(uuid.uuid4())
            armIdentifierB = 'B' + str(uuid.uuid4())

        duplicate = Alleles(source.ACount * 2, source.BCount * 2)
        duplicate.alleleIdentifiers += source.alleleIdentifiers
        # Doubling adds exactly as many copies as were already present.
        duplicate.alleleIdentifiers.extend(
            armIdentifierA for _ in range(source.ACount))
        duplicate.alleleIdentifiers.extend(
            armIdentifierB for _ in range(source.BCount))

        doubledAlleles.append(duplicate)
        previousArm = arm

    # This is used for duplicating a precursor; its somatic variants are
    # carried over as a deep copy.
    child = Subclone()
    child.C = doubledC
    child.A = doubledAlleles
    child.somaticVariants = deepcopy(self.somaticVariants)
    child.parent = self
    child.name = str(uuid.uuid4())
    self.children.append(child)

    return child
def parseTCOutputDistances(self, simulationClasses):
    """Compute an allele distance matrix for every simulation folder.

    For each key of ``simulationClasses`` (used as a directory path) the
    file ``EstimatedA_0.txt`` in that directory is loaded, its allele
    strings are converted to ``Alleles`` objects, and the all-to-all
    distance matrix is computed with ``PairwiseDistance().fstAlleleDistance``
    against the samples of the associated simulation class.

    Parameters:
        simulationClasses: mapping from directory path to an object with a
            ``samples`` attribute; every sample has a ``name``.

    Returns:
        dict mapping each directory path to ``[distanceMatrix, sampleOrder]``
        where ``sampleOrder`` lists the sample names in ``samples`` order.
    """
    def toAlleles(alleleString):
        # Count the number of A's and B's in the string.
        aCount = len(re.findall('A', alleleString))
        bCount = len(re.findall('B', alleleString))
        return Alleles(aCount, bCount)

    distanceMatrices = dict()
    for subdir in simulationClasses:
        simulationClass = simulationClasses[subdir]

        alleleStrings = np.loadtxt(subdir + '/EstimatedA_0.txt',
                                   dtype='object')

        # Convert the strings to allele objects.
        alleleObjects = np.empty(alleleStrings.shape, dtype='object')
        for rowIdx in range(alleleStrings.shape[0]):
            for colIdx in range(alleleStrings.shape[1]):
                alleleObjects[rowIdx][colIdx] = toAlleles(
                    alleleStrings[rowIdx][colIdx])

        distanceMatrix = PairwiseDistance().fstAlleleDistance(
            alleleObjects, simulationClass.samples)
        sampleOrder = [sample.name for sample in simulationClass.samples]

        distanceMatrices[subdir] = [distanceMatrix, sampleOrder]

    return distanceMatrices
def computeCorrectAmbiguityScore(LAFAndCombinations, simulationFolder):
    """Score how many ambiguous positions were inferred correctly.

    Walks every subdirectory of ``simulationFolder`` (the root itself is
    skipped), loads the real and estimated A matrices and the real and
    estimated mu from files whose names start with ``RealA``, ``RealMu``,
    ``EstimatedA`` and ``EstimatedMu``, and computes for every matrix
    position the LAF that the real alleles would generate at the real mu.
    A position is ambiguous when ``LAFAndCombinations`` lists more than one
    solution for that LAF.

    Parameters:
        LAFAndCombinations: mapping from LAF value to the number of
            (alleles, mu) combinations that generate it.
        simulationFolder: root folder whose subdirectories hold the
            simulation output files.

    Returns:
        A list ``[averageAmbiguities, averageAmbiguityScore,
        ambiguityScores, allPerColAmbiguities]``: the mean of the
        per-folder ambiguous fractions, the mean of the per-folder
        correct-ambiguous fractions, the list of per-folder
        correct-ambiguous fractions, and a dict from subdirectory to its
        per-column counter dict.

    Raises:
        ZeroDivisionError: when no subdirectory was processed, because the
            averages divide by the number of collected scores.
    """
    ambiguityScores = []
    ambiguities = []
    # NOTE(review): these three counters are initialised once and never
    # reset inside the folder loop, so the fractions appended per folder are
    # cumulative over all folders visited so far -- confirm this is intended.
    correctAmbiguityPositions = 0
    totalAmbiguousPositions = 0
    totalSize = 0

    # We need to read the actual A matrix values and also the mu.
    normalAlleles = Alleles(1, 1)

    # 1. Read the simulated A matrix.
    allPerColAmbiguities = dict()
    for subdir, dirs, files in os.walk(simulationFolder):
        if subdir == simulationFolder:  # we are not interested in the root folder
            continue

        # NOTE(review): each of the four values below is only assigned when
        # a matching file exists. A folder missing one of them reuses the
        # value loaded for a previous folder, or raises NameError if none
        # was loaded yet -- confirm every folder is complete.
        for file in files:
            if re.match('RealA', file):  # read the file and obtain the A matrix
                realAMatrix = np.loadtxt(subdir + '/' + file, dtype=str)
            if re.match('RealMu', file):  # also read the real mu
                realMu = collectErrorsFromFile(file, subdir)

            # Then load the inferred A and mu.
            if re.match('EstimatedA', file):  # read the file and obtain the A matrix
                estimatedAMatrix = np.loadtxt(subdir + '/' + file, dtype=str)
            if re.match('EstimatedMu', file):  # also read the estimated mu
                # estimatedMu is not used anywhere else in this function.
                estimatedMu = collectErrorsFromFile(file, subdir)

        # Compute the LAF that each measurement in the real data would generate.
        perColAmbiguityCount = dict()
        for row in range(0, realAMatrix.shape[0]):
            for col in range(0, realAMatrix.shape[1]):

                # Every column counter starts at the number of rows.
                if col not in perColAmbiguityCount:
                    perColAmbiguityCount[col] = realAMatrix.shape[0]

                totalSize += 1
                # Generate the allele object from the number of A's and B's.
                allele = realAMatrix[row][col]
                AOccurrences = [m.start() for m in re.finditer('A', allele)]
                ACount = len(AOccurrences)
                BOccurrences = [m.start() for m in re.finditer('B', allele)]
                BCount = len(BOccurrences)

                alleleObj = Alleles(ACount, BCount)
                alleleCombination = AlleleCombination(
                    [normalAlleles, alleleObj])

                # Compute the LAF this combination would generate.
                muNormal = 1 - (realMu[col])
                # Mu only takes integer indices. int() truncates, so a
                # product like 28.999... selects index 28.
                # NOTE(review): confirm truncation (not rounding) is wanted.
                realMuObj = Mu(int(muNormal * 100))
                realLAF = alleleCombination.computeLAF(realMuObj)

                # Check if this LAF is ambiguous. With a plain dict this
                # raises KeyError for a LAF that was never tabulated.
                ambiguousCount = LAFAndCombinations[realLAF]

                # If the ambiguous count > 1 and we are correct, we make a
                # note of that.
                if ambiguousCount > 1:
                    totalAmbiguousPositions += 1
                    if realAMatrix[row][col] == estimatedAMatrix[row][col]:
                        correctAmbiguityPositions += 1
                        # NOTE(review): the decrement is taken to belong to
                        # the "ambiguous and correct" branch; the original
                        # remark here was "determine how many positions are
                        # wrong" -- confirm the intended branch.
                        perColAmbiguityCount[col] -= 1

        # The scores are reported as a fraction of the total number of
        # positions. The alternative, a fraction of the ambiguous positions
        # only (correctAmbiguityPositions / float(totalAmbiguousPositions)),
        # is not used.
        ambiguityScores.append(correctAmbiguityPositions / float(totalSize))
        ambiguities.append(totalAmbiguousPositions / float(totalSize))
        allPerColAmbiguities[subdir] = perColAmbiguityCount

    # Compute an average over all processed folders.
    averageAmbiguityScore = sum(ambiguityScores) / float(len(ambiguityScores))
    averageAmbiguities = sum(ambiguities) / float(len(ambiguities))

    return [
        averageAmbiguities, averageAmbiguityScore, ambiguityScores,
        allPerColAmbiguities
    ]
precursorPloidy = int(settings.general['precursorPloidy']) precursorAlleleACount = int(settings.general['precursorAlleleACount']) precursorAlleleBCount = int(settings.general['precursorAlleleBCount']) #Check if the ploidy is correct totalPloidy = precursorAlleleACount + precursorAlleleBCount precursorTumorFrequency = 100 #Check if the ploidy is different from 2 (or allele balance). If true, then the precursor is not a healthy cell and we have 100% tumor if precursorAlleleACount != 1 or precursorAlleleBCount != 1 or precursorPloidy != 2: precursorTumorFrequency = 0 #In this case we have 100% tumor in the precursor. Only if the precursor it is normal it is 0% tumor. #Initialize the 'healthy' sample, this can now also be a precursor healthySample = Sample(None, None) healthySample.C = [C([2, precursorPloidy])] * measurementLength healthySample.A = [Alleles(precursorAlleleACount, precursorAlleleBCount) ] * measurementLength healthySample.Mu = [Mu(precursorTumorFrequency)] #obtain the chromosome, start and end information from the other samples healthySample.measurements = LAF([0.5] * measurementLength, tmpSamples[0].measurements.chromosomes, tmpSamples[0].measurements.starts, tmpSamples[0].measurements.ends) healthySample.somaticVariants = [0] * somVarNum healthySample.somaticVariantsInd = tmpSamples[0].somaticVariantsInd healthySample.setParent(None) healthySample.name = 'Precursor' #do not call it healthy, it may also be a 4N precursor. #Make a dummy bestCMu for the healthy sample eventDistances = targetClone.eventDistances bestCMuHealthy = CMuCombination(C([2, precursorPloidy]),
print "C error: ", cScore / cMatrixFloat.size #The A matrices need to be converted to A object matrices. aObjMatrix = np.empty(aMatrix.shape, dtype=object) eAObjMatrix = np.empty(aMatrix.shape, dtype=object) for row in range(0, aMatrix.shape[0]): for col in range(0, aMatrix.shape[1]): #generate allele object allele = aMatrix[row][col] AOccurrences = [m.start() for m in re.finditer('A', allele)] ACount = len(AOccurrences) BOccurrences = [m.start() for m in re.finditer('B', allele)] BCount = len(BOccurrences) alleleObj = Alleles(ACount, BCount) aObjMatrix[row][col] = alleleObj allele = eAMatrix[row][col] AOccurrences = [m.start() for m in re.finditer('A', allele)] ACount = len(AOccurrences) BOccurrences = [m.start() for m in re.finditer('B', allele)] BCount = len(BOccurrences) alleleObj = Alleles(ACount, BCount) eAObjMatrix[row][col] = alleleObj aData = simulationErrorHandler.computeAError(aObjMatrix, eAObjMatrix) print "A error: ", aData[0] / float(aMatrix.size) muError = simulationErrorHandler.computeMuErrorFromVectors(realMu, eMu)
def computeATreeError(aMatrix, lafMatrix, afMatrix, realTree, chromosomes,
                      positions):
    """Return the averaged ancestry swap error of the tree inferred from A.

    The segmentation is loaded from the file configured under
    ``simulationSettings.files['segmentationFile']``. The allele strings in
    ``aMatrix`` are converted to ``Alleles`` objects, an all-against-all
    distance matrix between the sample columns is computed with
    ``FST().computeAlleleDistance``, and a minimum spanning tree is derived
    from it. That tree is compared with ``realTree`` through
    ``computeAncestrySwapError``.

    Parameters:
        aMatrix: 2-D array of allele strings; columns are samples.
        lafMatrix: array whose column i supplies the LAF values of sample i.
        afMatrix: array whose column i supplies the AF values of sample i.
        realTree: reference tree; its ``vertices`` are used to build the
            graph and the MST.
        chromosomes: chromosome information handed to every ``LAF`` object.
        positions: position information, handed to ``LAF`` as both its
            third and fourth argument.

    Returns:
        The sum of both ancestry swap error counts divided by the number of
        sample pairs, as a float.
    """
    def toAlleles(alleleString):
        # The number of A's and B's in the string define the alleles object.
        aCount = len(re.findall('A', alleleString))
        bCount = len(re.findall('B', alleleString))
        return Alleles(aCount, bCount)

    segmentationFile = simulationSettings.files['segmentationFile']
    segmentation = Segmentation()
    segmentation.setSegmentationFromFile(segmentationFile)

    def makeDummySample(sampleIdx):
        # Dummy sample object carrying just what the FST function reads.
        dummy = Sample(None, None)
        dummy.measurements = LAF(lafMatrix[:, sampleIdx], chromosomes,
                                 positions, positions)
        dummy.measurements.segmentation = segmentation
        dummy.afMeasurements = afMatrix[:, sampleIdx]
        return dummy

    sampleNum = aMatrix.shape[1]

    # Convert the A matrix to a matrix of actual allele objects.
    aObjMatrix = np.empty(aMatrix.shape, dtype=object)
    for rowIdx in range(aMatrix.shape[0]):
        for colIdx in range(aMatrix.shape[1]):
            aObjMatrix[rowIdx][colIdx] = toAlleles(aMatrix[rowIdx][colIdx])

    # Pairwise distance between samples, one whole column at a time.
    distanceMatrix = np.empty([sampleNum, sampleNum], dtype=float)
    for first in range(sampleNum):
        for second in range(sampleNum):
            firstSample = makeDummySample(first)
            secondSample = makeDummySample(second)
            [_messages, distance] = FST().computeAlleleDistance(
                aObjMatrix[:, first], aObjMatrix[:, second], firstSample,
                secondSample)
            distanceMatrix[first, second] = distance

    # Compute the MST.
    fullGraph = generateInitialTree(distanceMatrix, realTree.vertices)
    inferredTree = computeMST(fullGraph, realTree.vertices)

    [absentInInferred, presentInInferred,
     noOfSamplePairs] = computeAncestrySwapError(realTree, inferredTree)

    summedError = (absentInInferred + presentInInferred)
    return summedError / float(noOfSamplePairs)