Beispiel #1
0
 def setAlleleCombinations(self):
     """Append one normal/tumor AlleleCombination per A/B split of the tumor copy number.

     The tumor copy number is read from ``self.c[1]``. For every A count from
     0 up to and including that number, the remaining copies are assigned to B,
     and a combination of a (1, 1) normal allele object with that tumor allele
     object is appended to ``self.alleleCombinations``.
     """
     tumorCopies = self.c[1]
     for aCopies in range(tumorCopies + 1):
         bCopies = tumorCopies - aCopies
         tumor = Alleles(aCopies, bCopies)
         # Each combination gets its own freshly built normal allele object.
         normal = Alleles(1, 1)
         combination = AlleleCombination([normal, tumor])
         self.alleleCombinations.append(combination)
Beispiel #2
0
    def getAllelesByLaf(self, parentAlleles, laf):
        """Return the tumor alleles whose threshold interval contains ``laf``.

        The thresholds are looked up in ``self.mixtureModelValleys`` under the
        parent's allele string. Consecutive thresholds form closed intervals;
        the index of the last interval containing ``laf`` selects the entry of
        ``self.c.alleleCombinations`` whose second allele object is returned.
        If no interval contains ``laf``, index 0 is used.

        Open question carried over from the original author: is this the right
        place for this step?
        """
        # Without a tumor component the alleles can only be AB by assumption.
        if self.mu.mu[0] == 1:
            return Alleles(1, 1)

        # The thresholds ("valleys") used to be derived from the parent's
        # mixture model on every call, which the original notes describe as a
        # slowdown of about 2 seconds; they are now looked up pre-computed.
        thresholds = self.mixtureModelValleys[
            parentAlleles.getAllelesAsString()]

        # The alleles remain ordered, so the interval index doubles as the
        # allele combination index. A value sitting exactly on a shared border
        # matches two intervals; the later one wins because the scan continues.
        chosen = 0
        for border in range(len(thresholds) - 1):
            lower = thresholds[border]
            upper = thresholds[border + 1]
            if lower <= laf <= upper:
                chosen = border

        return self.c.alleleCombinations[chosen].alleles[1]
Beispiel #3
0
    def generateAlleleList(self, kmin, kmax):
        """Build the ordered allele-string list and its string -> index lookup.

        For every copy number ``k`` from ``kmin`` to ``kmax`` (inclusive) and
        every A count from 0 to ``k``, the ``alleleString`` of
        ``Alleles(ACount, k - ACount)`` is appended to ``self.alleleList``.
        ``self.alleleListDict`` maps each of those strings to its position in
        the list, for lookup speed.

        Both attributes are empty when ``kmin > kmax``.
        """
        self.alleleList = [
            Alleles(ACount, k - ACount).alleleString
            for k in range(kmin, kmax + 1)
            for ACount in range(0, k + 1)
        ]

        # Derive the index from the list that was just built instead of
        # constructing every Alleles object a second time. Should a string ever
        # occur twice, the later position wins, exactly as with a running
        # counter.
        self.alleleListDict = {
            alleleString: index
            for index, alleleString in enumerate(self.alleleList)
        }
def computeAmbiguousPositions():
    """Count, per LAF value, how many (allele, mu) pairs generate that LAF.

    The allele strings come from ``EventDistances(kmin, kmax).alleleList``
    with the kmin/kmax used in the simulations (1 and 6). Only alleles with
    at least as many B copies as A copies are considered, so that alleles are
    not duplicated. Each one is combined with a (1, 1) normal allele and its
    LAF is computed for every mu index from 0 to 100.

    Returns:
        dict mapping each computed LAF value to the number of (allele, mu)
        pairs that produced it. A count above 1 marks a LAF with more than one
        solution, i.e. an ambiguous one.
    """
    kmin = 1  # the kmin and kmax used in the simulations
    kmax = 6
    eventDistance = EventDistances(kmin, kmax)
    alleleList = eventDistance.alleleList

    LAFAndCombinations = dict()
    normalAlleles = Alleles(1, 1)
    for allele in alleleList:
        # The allele is a string of 'A' and 'B' characters; count each letter.
        ACount = allele.count('A')
        BCount = allele.count('B')
        if BCount < ACount:
            continue

        alleleCombination = AlleleCombination(
            [normalAlleles, Alleles(ACount, BCount)])

        for muIndex in range(0, 101):
            # LAF that this combination would generate at this mu.
            lafValue = alleleCombination.computeLAF(Mu(muIndex))
            # dict.get keeps this a constant-time update; testing membership
            # against .keys() scans a list under Python 2.
            LAFAndCombinations[lafValue] = (
                LAFAndCombinations.get(lafValue, 0) + 1)

    # With this dictionary a caller can check whether a LAF has more than one
    # solution and, if so, whether the inferred position was correct.
    return LAFAndCombinations
def computeATreeError(aMatrix, lafMatrix, afMatrix, realTree):
    """Score the tree inferred from an allele matrix against ``realTree``.

    Every entry of ``aMatrix`` is converted to an ``Alleles`` object by
    counting its 'A' and 'B' characters. For each ordered pair of columns
    (samples) a distance is obtained from ``FST().computeAlleleDistance``,
    using dummy ``Sample`` objects that carry the matching ``lafMatrix`` and
    ``afMatrix`` columns plus the reference data from
    ``parseReferenceFile()``. A minimum spanning tree is built from the
    distances and handed to ``SimulationErrorHandler.computeTreeError``.

    Returns:
        Whatever ``computeTreeError([mst], realTree)`` returns.
    """
    sampleNum = aMatrix.shape[1]

    # Turn the matrix of allele strings into a matrix of Alleles objects.
    aObjMatrix = np.empty(aMatrix.shape, dtype=object)
    for row in range(aMatrix.shape[0]):
        for col in range(aMatrix.shape[1]):
            alleleString = aMatrix[row][col]
            aCopies = len(re.findall('A', alleleString))
            bCopies = len(re.findall('B', alleleString))
            aObjMatrix[row][col] = Alleles(aCopies, bCopies)

    distanceMatrix = np.empty([sampleNum, sampleNum], dtype=float)
    # The fourth item (chromosome arms) is unpacked but not needed here.
    chromosomes, positions, segmentation, _chromosomeArms = \
        parseReferenceFile()

    def makeDummySample(column):
        # Dummy sample object carrying just what the FST function reads.
        dummy = Sample(None, None)
        dummy.measurements = LAF(lafMatrix[:, column], chromosomes, positions,
                                 positions)
        dummy.measurements.segmentation = segmentation
        dummy.afMeasurements = afMatrix[:, column]
        return dummy

    # Pairwise distances between samples, one whole column at a time.
    for sample1 in range(sampleNum):
        for sample2 in range(sampleNum):
            firstSample = makeDummySample(sample1)
            secondSample = makeDummySample(sample2)
            _messages, dist = FST().computeAlleleDistance(
                aObjMatrix[:, sample1], aObjMatrix[:, sample2], firstSample,
                secondSample)
            distanceMatrix[sample1, sample2] = dist

    # Compute the MST and score it against the real tree.
    fullGraph = generateInitialTree(distanceMatrix, realTree.vertices)
    mst = computeMST(fullGraph, realTree.vertices)
    errorHandler = SimulationErrorHandler()
    return errorHandler.computeTreeError([mst], realTree)
Beispiel #6
0
	def duplicateGenome(self, simulator):
		"""Create a child subclone whose tumor copy numbers and alleles are doubled.

		Every entry of ``self.C`` becomes ``C([2, 2 * tumor copy number])`` and
		every entry of ``self.A`` gets twice its A and B counts. The existing
		allele identifiers are kept and one new identifier is added per extra
		copy. The new A/B identifiers are shared by consecutive positions on
		the same chromosome arm (``simulator.chromosomeArms``) and regenerated
		whenever the arm changes.

		The somatic variants are deep-copied unchanged. The original author
		notes this is used for a precursor without somatic variants; if it had
		any, they would need to be duplicated here as well.

		Returns:
			The new ``Subclone``, already appended to ``self.children`` and
			with ``self`` set as its parent.
		"""
		doubledC = [C([2, oldC.c[1] * 2]) for oldC in self.C]

		# Allele identifiers must be the same across a region on one arm.
		prevArm = simulator.chromosomeArms[0]
		currentAlleleA = 'A' + str(uuid.uuid4())
		currentAlleleB = 'B' + str(uuid.uuid4())

		doubledAlleles = []
		for position, oldAlleles in enumerate(self.A):
			arm = simulator.chromosomeArms[position]
			# A different arm than the previous position: start new names.
			if arm != prevArm:
				currentAlleleA = 'A' + str(uuid.uuid4())
				currentAlleleB = 'B' + str(uuid.uuid4())

			duplicated = Alleles(oldAlleles.ACount * 2, oldAlleles.BCount * 2)
			duplicated.alleleIdentifiers += oldAlleles.alleleIdentifiers
			# Doubling adds exactly as many copies as there already were.
			for _ in range(0, oldAlleles.ACount):
				duplicated.alleleIdentifiers.append(currentAlleleA)
			for _ in range(0, oldAlleles.BCount):
				duplicated.alleleIdentifiers.append(currentAlleleB)

			doubledAlleles.append(duplicated)
			prevArm = arm

		precursor = Subclone()
		precursor.C = doubledC
		precursor.A = doubledAlleles
		precursor.somaticVariants = deepcopy(self.somaticVariants)
		precursor.parent = self
		precursor.name = str(uuid.uuid4())
		self.children.append(precursor)

		return precursor
Beispiel #7
0
    def parseTCOutputDistances(self, simulationClasses):
        """Compute an all-to-all distance matrix from each estimated A matrix.

        For every ``subdir`` key of ``simulationClasses`` the file
        ``<subdir>/EstimatedA_0.txt`` is loaded, each allele string in it is
        converted to an ``Alleles`` object by counting its 'A' and 'B'
        characters, and ``PairwiseDistance().fstAlleleDistance`` is called
        with that matrix and the samples of the matching simulation class.

        Returns:
            dict mapping each subdir to ``[distanceMatrix, sampleOrder]``,
            where ``sampleOrder`` lists the sample names in the order of
            ``simulationClass.samples``.
        """
        distanceMatrices = dict()
        for subdir, simulationClass in simulationClasses.items():
            alleleStrings = np.loadtxt(subdir + '/EstimatedA_0.txt',
                                       dtype='object')

            # Convert the strings to allele objects, cell by cell.
            alleleObjects = np.empty(alleleStrings.shape, dtype='object')
            for row in range(alleleStrings.shape[0]):
                for col in range(alleleStrings.shape[1]):
                    cell = alleleStrings[row][col]
                    aCopies = len(re.findall('A', cell))
                    bCopies = len(re.findall('B', cell))
                    alleleObjects[row][col] = Alleles(aCopies, bCopies)

            distanceMatrix = PairwiseDistance().fstAlleleDistance(
                alleleObjects, simulationClass.samples)
            sampleOrder = [sample.name for sample in simulationClass.samples]

            distanceMatrices[subdir] = [distanceMatrix, sampleOrder]

        return distanceMatrices
def computeCorrectAmbiguityScore(LAFAndCombinations, simulationFolder):
    """Score how many ambiguous positions were inferred correctly.

    Every subdirectory below ``simulationFolder`` is scanned for files whose
    names start with ``RealA``, ``RealMu``, ``EstimatedA`` and
    ``EstimatedMu``. For each cell of the real A matrix the LAF it would
    generate (together with a (1, 1) normal allele, at the real mu of its
    column) is computed and looked up in ``LAFAndCombinations``. A count above
    1 marks the position as ambiguous; it is counted as correct when the
    estimated A matrix holds the same allele string at that cell.

    Args:
        LAFAndCombinations: dict mapping a LAF value to the number of
            (allele, mu) combinations that generate it.
        simulationFolder: root folder; the root itself is skipped.

    Returns:
        ``[averageAmbiguities, averageAmbiguityScore, ambiguityScores,
        allPerColAmbiguities]``. ``allPerColAmbiguities`` maps each subdir to
        a dict of per-column counts that start at the number of rows and are
        decremented for every correctly inferred ambiguous position.

    Raises:
        KeyError: if a computed LAF is not a key of ``LAFAndCombinations``.
        ZeroDivisionError: if no subdirectory was processed.

    NOTE(review): the three running counters are never reset between
    subdirectories, so every appended ratio is cumulative over all
    subdirectories visited so far - confirm this is intended.
    NOTE(review): if a subdirectory lacks one of the expected files, the
    matrices/mu read for a previous subdirectory are silently reused (or a
    NameError is raised for the first one) - confirm the inputs are complete.
    """
    ambiguityScores = []
    ambiguities = []
    correctAmbiguityPositions = 0
    totalAmbiguousPositions = 0
    totalSize = 0
    normalAlleles = Alleles(1, 1)

    allPerColAmbiguities = dict()

    for subdir, dirs, files in os.walk(simulationFolder):
        if subdir == simulationFolder:  # not interested in the root folder
            continue
        for fileName in files:
            # Simulated (real) A matrix and mu.
            if re.match('RealA', fileName):
                realAMatrix = np.loadtxt(subdir + '/' + fileName, dtype=str)
            if re.match('RealMu', fileName):
                realMu = collectErrorsFromFile(fileName, subdir)

            # Inferred A matrix and mu. The estimated mu is read but not used
            # in the scoring below.
            if re.match('EstimatedA', fileName):
                estimatedAMatrix = np.loadtxt(subdir + '/' + fileName,
                                              dtype=str)
            if re.match('EstimatedMu', fileName):
                estimatedMu = collectErrorsFromFile(fileName, subdir)

        # Compute the LAF that each measurement in the real data would generate.
        rowCount = realAMatrix.shape[0]
        perColAmbiguityCount = dict()
        for row in range(0, rowCount):
            for col in range(0, realAMatrix.shape[1]):
                perColAmbiguityCount.setdefault(col, rowCount)
                totalSize += 1

                # Build the allele object from the letter counts.
                allele = realAMatrix[row][col]
                alleleObj = Alleles(allele.count('A'), allele.count('B'))
                alleleCombination = AlleleCombination(
                    [normalAlleles, alleleObj])

                # Mu only takes integer indices. Round instead of truncating:
                # binary float error (e.g. 0.29 * 100 == 28.999999999999996)
                # would otherwise select the index one below the intended one.
                muNormal = 1 - (realMu[col])
                realMuObj = Mu(int(round(muNormal * 100)))
                realLAF = alleleCombination.computeLAF(realMuObj)

                # More than one combination for this LAF means it is ambiguous.
                ambiguousCount = LAFAndCombinations[realLAF]
                if ambiguousCount > 1:
                    totalAmbiguousPositions += 1

                    if realAMatrix[row][col] == estimatedAMatrix[row][col]:
                        correctAmbiguityPositions += 1
                        perColAmbiguityCount[col] -= 1

        # Both scores are reported as a fraction of the total number of
        # positions, not as a fraction of the ambiguous positions.
        ambiguityScores.append(correctAmbiguityPositions / float(totalSize))
        ambiguities.append(totalAmbiguousPositions / float(totalSize))
        allPerColAmbiguities[subdir] = perColAmbiguityCount

    averageAmbiguityScore = sum(ambiguityScores) / float(len(ambiguityScores))
    averageAmbiguities = sum(ambiguities) / float(len(ambiguities))

    return [
        averageAmbiguities, averageAmbiguityScore, ambiguityScores,
        allPerColAmbiguities
    ]
Beispiel #9
0
# Script fragment: builds the 'Precursor' reference sample from the
# precursor* entries of settings.general.
precursorPloidy = int(settings.general['precursorPloidy'])

precursorAlleleACount = int(settings.general['precursorAlleleACount'])
precursorAlleleBCount = int(settings.general['precursorAlleleBCount'])

# Sum of the configured A and B counts. NOTE(review): labelled as a ploidy
# check, but the value is not compared against precursorPloidy in this
# fragment - confirm whether a check is done further on.
totalPloidy = precursorAlleleACount + precursorAlleleBCount

# Value handed to Mu() below. 100 is kept when the precursor is a normal cell
# (one A, one B, ploidy 2); any deviation switches it to 0, which per the
# original author's note means the precursor is 100% tumor.
precursorTumorFrequency = 100
if precursorAlleleACount != 1 or precursorAlleleBCount != 1 or precursorPloidy != 2:
    precursorTumorFrequency = 0

# Initialize the 'healthy' sample; this can now also be a precursor.
healthySample = Sample(None, None)
# List multiplication repeats a reference to the SAME C / Alleles object at
# every position rather than creating one object per position.
healthySample.C = [C([2, precursorPloidy])] * measurementLength
healthySample.A = [Alleles(precursorAlleleACount, precursorAlleleBCount)
                   ] * measurementLength
healthySample.Mu = [Mu(precursorTumorFrequency)]
# Obtain the chromosome, start and end information from the first of the
# other samples; every LAF measurement is set to 0.5.
healthySample.measurements = LAF([0.5] * measurementLength,
                                 tmpSamples[0].measurements.chromosomes,
                                 tmpSamples[0].measurements.starts,
                                 tmpSamples[0].measurements.ends)
healthySample.somaticVariants = [0] * somVarNum
healthySample.somaticVariantsInd = tmpSamples[0].somaticVariantsInd
healthySample.setParent(None)
# Do not call it healthy, it may also be a 4N precursor.
healthySample.name = 'Precursor'

# Make a dummy bestCMu for the healthy sample.
eventDistances = targetClone.eventDistances
bestCMuHealthy = CMuCombination(C([2, precursorPloidy]),
# Script fragment (Python 2 print statements): reports the C error averaged
# over the number of entries in the C matrix.
print "C error: ", cScore / cMatrixFloat.size

# The real (aMatrix) and estimated (eAMatrix) A matrices hold allele strings;
# both are converted to matrices of Alleles objects of the same shape.
aObjMatrix = np.empty(aMatrix.shape, dtype=object)
eAObjMatrix = np.empty(aMatrix.shape, dtype=object)
for row in range(0, aMatrix.shape[0]):
    for col in range(0, aMatrix.shape[1]):
        # Real allele: count the 'A' and 'B' characters in the string.
        allele = aMatrix[row][col]
        AOccurrences = [m.start() for m in re.finditer('A', allele)]
        ACount = len(AOccurrences)
        BOccurrences = [m.start() for m in re.finditer('B', allele)]
        BCount = len(BOccurrences)

        alleleObj = Alleles(ACount, BCount)
        aObjMatrix[row][col] = alleleObj

        # Estimated allele at the same cell, converted the same way.
        allele = eAMatrix[row][col]
        AOccurrences = [m.start() for m in re.finditer('A', allele)]
        ACount = len(AOccurrences)
        BOccurrences = [m.start() for m in re.finditer('B', allele)]
        BCount = len(BOccurrences)

        alleleObj = Alleles(ACount, BCount)
        eAObjMatrix[row][col] = alleleObj

# The first element of the returned data is reported, averaged over the
# number of entries in the A matrix.
aData = simulationErrorHandler.computeAError(aObjMatrix, eAObjMatrix)
print "A error: ", aData[0] / float(aMatrix.size)

# Mu error computed from the real and the estimated mu vectors.
muError = simulationErrorHandler.computeMuErrorFromVectors(realMu, eMu)
def computeATreeError(aMatrix, lafMatrix, afMatrix, realTree, chromosomes,
                      positions):
    """Return the averaged ancestry swap error of the tree inferred from ``aMatrix``.

    The segmentation is loaded from the file configured under
    ``simulationSettings.files['segmentationFile']``. Every entry of
    ``aMatrix`` is converted to an ``Alleles`` object by counting its 'A' and
    'B' characters. For each ordered pair of columns (samples) a distance is
    obtained from ``FST().computeAlleleDistance``, using dummy ``Sample``
    objects that carry the matching ``lafMatrix`` and ``afMatrix`` columns.
    A minimum spanning tree built from those distances is compared with
    ``realTree`` through ``computeAncestrySwapError``.

    Returns:
        The sum of both ancestry swap error counts divided by the number of
        sample pairs.
    """
    segmentation = Segmentation()
    segmentation.setSegmentationFromFile(
        simulationSettings.files['segmentationFile'])

    sampleNum = aMatrix.shape[1]

    # Turn the matrix of allele strings into a matrix of Alleles objects.
    aObjMatrix = np.empty(aMatrix.shape, dtype=object)
    for row in range(aMatrix.shape[0]):
        for col in range(aMatrix.shape[1]):
            alleleString = aMatrix[row][col]
            aCopies = len(re.findall('A', alleleString))
            bCopies = len(re.findall('B', alleleString))
            aObjMatrix[row][col] = Alleles(aCopies, bCopies)

    def makeDummySample(column):
        # Dummy sample object carrying just what the FST function reads.
        dummy = Sample(None, None)
        dummy.measurements = LAF(lafMatrix[:, column], chromosomes, positions,
                                 positions)
        dummy.measurements.segmentation = segmentation
        dummy.afMeasurements = afMatrix[:, column]
        return dummy

    # Pairwise distances between samples, one whole column at a time.
    distanceMatrix = np.empty([sampleNum, sampleNum], dtype=float)
    for sample1 in range(sampleNum):
        for sample2 in range(sampleNum):
            firstSample = makeDummySample(sample1)
            secondSample = makeDummySample(sample2)
            _messages, dist = FST().computeAlleleDistance(
                aObjMatrix[:, sample1], aObjMatrix[:, sample2], firstSample,
                secondSample)
            distanceMatrix[sample1, sample2] = dist

    # Compute the MST and compare its ancestry relations with the real tree.
    fullGraph = generateInitialTree(distanceMatrix, realTree.vertices)
    inferredTree = computeMST(fullGraph, realTree.vertices)

    absentInInferred, presentInInferred, noOfSamplePairs = \
        computeAncestrySwapError(realTree, inferredTree)

    return (absentInInferred + presentInInferred) / float(noOfSamplePairs)