def getAltExonLength(variant, exonicPortionSize, intronicPortionSize, deNovoDonorInRefAcc=False, donor=True):
    """
    Returns the length of the alternate exon that results when splicing
    occurs at the de novo site found in the max MaxEntScan window.

    exonicPortionSize and intronicPortionSize are the number of bp treated as the
    exonic and intronic portions of the splice site
    donor=True evaluates a de novo donor, donor=False a de novo acceptor
    deNovoDonorInRefAcc=True if looking for a de novo donor in a ref acceptor site
    """
    # work out which exon the variant belongs to (or is closest to)
    if varInExon(variant):
        exonName = getVarExonNumberSNS(variant)
    elif varInSpliceRegion(variant, donor=donor, deNovo=False):
        exonName = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)["exonName"]
    else:
        exonName = getClosestExonNumberIntronicSNS(variant, "enigma", donor=donor)

    allExonBounds = extract.getExonBoundaries(variant)
    windowInfo = getMaxMaxEntScanScoreSlidingWindowSNS(variant, exonicPortionSize, STD_DE_NOVO_LENGTH,
                                                       donor=donor,
                                                       deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    splicePos = extract.getNewSplicePosition(variant["Pos"],
                                             verify.getVarStrand(variant),
                                             windowInfo["varWindowPosition"],
                                             windowInfo["inExonicPortion"],
                                             exonicPortionSize,
                                             intronicPortionSize,
                                             donor=donor)

    if verify.getVarStrand(variant) == "-":
        if donor:
            # +1 so that every position up to and including splicePos is counted
            # (numbering decreases from left to right on the minus strand)
            return int(allExonBounds[exonName]["exonStart"]) - splicePos + 1
        # -1 so that every position including splicePos is counted
        # (numbering decreases from left to right on the minus strand)
        return splicePos - int(allExonBounds[exonName]["exonEnd"]) - 1

    exonEnd = int(allExonBounds[exonName]["exonEnd"])
    if donor:
        # plus strand donor: shorten the reference exon length by the
        # distance between the reference exon end and the new splice position
        return getRefExonLength(variant, donor=donor) - (exonEnd - splicePos)
    return exonEnd - splicePos
def getClosestExonNumberIntronicSNS(variant, boundaries, donor=True):
    """
    Finds the exon whose boundary is closest to an intronic or UTR substitution variant.

    boundaries (either priors or enigma) is passed through to getVarLocation
    donor=True measures distance to each exon end, donor=False to each exon start
    Only exons whose boundary gives a positive distance are considered
    Returns the closest exon in the format "exonN"
    Returns "exon0" if variant is not an intron/UTR substitution outside of an exon
    """
    location = getVarLocation(variant, boundaries)
    if location != "intron_variant" and location != "UTR_variant":
        return "exon0"
    if extract.getVarType(variant) != "substitution":
        return "exon0"
    if varInExon(variant):
        return "exon0"

    exonBounds = extract.getExonBoundaries(variant)
    position = int(variant["Pos"])
    boundaryKey = "exonEnd" if donor else "exonStart"
    # distance is (position - boundary) for plus strand donors and minus strand acceptors,
    # and (boundary - position) for plus strand acceptors and minus strand donors
    positionMinusBoundary = (verify.getVarStrand(variant) == "+") == bool(donor)

    distances = {}
    for exonName, bounds in exonBounds.items():
        boundary = int(bounds[boundaryKey])
        if positionMinusBoundary:
            gap = position - boundary
        else:
            gap = boundary - position
        if gap > 0:
            distances[exonName] = gap

    nearest = min(distances.items(), key=lambda item: item[1])
    return nearest[0]
def varInCIDomain(variant, boundaries):
    """
    Determines if a variant is located in a clinically important (CI) domain.

    boundaries selects which set of CI domain boundaries (ENIGMA or PRIORS) is used
    Only BRCA1 and BRCA2 variants that are in an exon can be in a CI domain
    Returns True if variant is in a CI domain, False otherwise
    """
    position = int(variant["Pos"])
    gene = variant["Gene_Symbol"]
    strand = verify.getVarStrand(variant)

    if not varInExon(variant):
        return False

    if gene == "BRCA1":
        domainTable = brca1CIDomains[boundaries]
    elif gene == "BRCA2":
        domainTable = brca2CIDomains[boundaries]
    else:
        return False

    return any(
        verify.checkWithinBoundaries(strand, position, domain["domStart"], domain["domEnd"])
        for domain in domainTable.values()
    )
def varInCIDomain(variant, boundaries):
    """
    Determines if a variant is located in a clinically important (CI) domain.

    boundaries selects which set of CI domain boundaries (ENIGMA or PRIORS) is used
    Only BRCA1 and BRCA2 variants that are in an exon can be in a CI domain
    Returns True if variant is in a CI domain, False otherwise
    """
    position = int(variant["Pos"])
    gene = variant["Gene_Symbol"]
    strand = verify.getVarStrand(variant)

    if not varInExon(variant):
        return False

    if gene == "BRCA1":
        domainTable = brca1CIDomains[boundaries]
    elif gene == "BRCA2":
        domainTable = brca2CIDomains[boundaries]
    else:
        return False

    return any(
        verify.checkWithinBoundaries(strand, position, domain["domStart"], domain["domEnd"])
        for domain in domainTable.values()
    )
def getRefSpliceDonorBoundaries(variant, intronicLength, exonicLength):
    """
    Returns the reference splice donor boundaries for the variant's transcript.

    intronicLength = number of bp in intron considered part of the splice donor region
    exonicLength = number of bp in exon considered part of the splice donor region
    The donor region is the last exonicLength bp of the exon plus the first
    intronicLength bp of the intron
    The last exon (exon24 for BRCA1, exon27 for BRCA2) is removed before computing
    Returns a dictionary: key = exon name, value = {"donorStart": ..., "donorEnd": ...}
    """
    donorExons = getExonBoundaries(variant).copy()
    gene = variant["Gene_Symbol"]
    if gene == "BRCA1":
        del donorExons["exon24"]
    elif gene == "BRCA2":
        del donorExons["exon27"]

    onPlusStrand = verify.getVarStrand(variant) == "+"
    donorBoundaries = {}
    for exonName, bounds in donorExons.items():
        exonEnd = int(bounds["exonEnd"])
        if onPlusStrand:
            # + 1 on the start because genomic position in RefSeq starts to the right of the
            # first base, which affects the 5' side; donor start is 5' to exon end on + strand
            start, end = exonEnd - exonicLength + 1, exonEnd + intronicLength
        else:
            # + 1 on the end for the same RefSeq numbering reason, applied to the
            # intronic side for - strand transcripts
            start, end = exonEnd + exonicLength, exonEnd - intronicLength + 1
        donorBoundaries[exonName] = {"donorStart": start, "donorEnd": end}
    return donorBoundaries
def getVarSpliceRegionBounds(variant, donor=False, deNovo=False):
    """
    Returns the boundaries of the splice region in which a variant is located.

    If donor=True, uses reference splice donor regions
      *function CANNOT be used to return de novo donor splice region bounds*
    If donor=False and deNovo=False, uses reference splice acceptor regions
    If donor=False and deNovo=True, uses de novo splice acceptor regions
    Returns a dictionary with "exonName" plus the region start and end keys
    ("donorStart"/"donorEnd" or "acceptorStart"/"acceptorEnd")
    Returns None if the variant is not found in a matching region
    """
    if not varInSpliceRegion(variant, donor=donor, deNovo=deNovo):
        return None

    if donor:
        regionBounds = extract.getRefSpliceDonorBoundaries(variant, STD_DONOR_INTRONIC_LENGTH,
                                                           STD_DONOR_EXONIC_LENGTH)
        startKey, endKey = "donorStart", "donorEnd"
    else:
        if deNovo:
            regionBounds = extract.getSpliceAcceptorBoundaries(variant, STD_ACC_INTRONIC_LENGTH,
                                                               STD_DE_NOVO_LENGTH)
        else:
            regionBounds = extract.getSpliceAcceptorBoundaries(variant, STD_ACC_INTRONIC_LENGTH,
                                                               STD_ACC_EXONIC_LENGTH)
        startKey, endKey = "acceptorStart", "acceptorEnd"

    for exonName, bounds in regionBounds.items():
        start = bounds[startKey]
        end = bounds[endKey]
        if verify.checkWithinBoundaries(verify.getVarStrand(variant), int(variant["Pos"]), start, end):
            return {"exonName": exonName, startKey: start, endKey: end}
    return None
def getSpliceAcceptorBoundaries(variant, intronicLength, exonicLength):
    """
    Returns the splice acceptor boundaries for the variant's transcript.

    intronicLength = number of bp in intron considered part of the splice acceptor region
    exonicLength = number of bp in exon considered part of the splice acceptor region
    The acceptor region is the last intronicLength bp of the intron plus the first
    exonicLength bp of the exon
    exon1 is removed for BRCA1 and BRCA2 before computing
    Returns a dictionary: key = exon name, value = {"acceptorStart": ..., "acceptorEnd": ...}
    """
    acceptorExons = getExonBoundaries(variant).copy()
    gene = variant["Gene_Symbol"]
    if gene == "BRCA1" or gene == "BRCA2":
        del acceptorExons["exon1"]

    onPlusStrand = verify.getVarStrand(variant) == "+"
    acceptorBoundaries = {}
    for exonName, bounds in acceptorExons.items():
        exonStart = int(bounds["exonStart"])
        if onPlusStrand:
            # + 1 on the start because genomic position in RefSeq starts to the right of the
            # first base, which affects the 5' side; acceptor start is 5' to exon start on + strand
            start, end = exonStart - intronicLength + 1, exonStart + exonicLength
        else:
            # + 1 on the end for the same RefSeq numbering reason, applied to the
            # exonic side for - strand transcripts
            start, end = exonStart + intronicLength, exonStart - exonicLength + 1
        acceptorBoundaries[exonName] = {"acceptorStart": start, "acceptorEnd": end}
    return acceptorBoundaries
def getRefExonLength(variant, donor=True):
    """
    Returns the length of the reference exon associated with a variant.

    If variant is in an exon, uses that exon
    If variant is in a reference splice region, uses the exon named for that region
    Otherwise uses the closest exon as determined by getClosestExonNumberIntronicSNS
      with the donor argument (donor=True measures to exon ends, donor=False to exon starts)
    """
    if varInExon(variant):
        exonName = getVarExonNumberSNS(variant)
    elif varInSpliceRegion(variant, donor=donor, deNovo=False):
        exonName = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)["exonName"]
    else:
        exonName = getClosestExonNumberIntronicSNS(variant, "enigma", donor=donor)

    exonBounds = extract.getExonBoundaries(variant)
    onMinusStrand = verify.getVarStrand(variant) == "-"
    exonStart = int(exonBounds[exonName]["exonStart"])
    exonEnd = int(exonBounds[exonName]["exonEnd"])

    # no +1 in either direction: due to RefSeq numbering one boundary is already 1 bp too long
    # (exonEnd is the first intronic base on the minus strand,
    #  exonStart is the last intronic base on the plus strand)
    return exonStart - exonEnd if onMinusStrand else exonEnd - exonStart
def varInSpliceRegion(variant, donor=False, deNovo=False):
    """
    Determines if a variant is in a splice donor/acceptor region of the reference transcript.

    donor=True,  deNovo=False: reference splice donor region
    donor=True,  deNovo=True:  de novo splice donor region
                               (reference donor region or anywhere in an exon)
    donor=False, deNovo=False: reference splice acceptor region
    donor=False, deNovo=True:  de novo splice acceptor region
    Returns True if variant is in the requested splice region, False otherwise
    """
    if donor:
        # reference donor boundaries; for deNovo=True the entirety of the exon is added below
        regionBounds = extract.getRefSpliceDonorBoundaries(variant, STD_DONOR_INTRONIC_LENGTH,
                                                           STD_DONOR_EXONIC_LENGTH)
        startKey, endKey = "donorStart", "donorEnd"
    else:
        if deNovo:
            regionBounds = extract.getSpliceAcceptorBoundaries(variant, STD_ACC_INTRONIC_LENGTH,
                                                               STD_DE_NOVO_LENGTH)
        else:
            regionBounds = extract.getSpliceAcceptorBoundaries(variant, STD_ACC_INTRONIC_LENGTH,
                                                               STD_ACC_EXONIC_LENGTH)
        startKey, endKey = "acceptorStart", "acceptorEnd"

    for bounds in regionBounds.values():
        start = bounds[startKey]
        end = bounds[endKey]
        if verify.checkWithinBoundaries(verify.getVarStrand(variant), int(variant["Pos"]), start, end):
            return True
        # de novo donor region also includes the entirety of the exon
        if donor and deNovo and varInExon(variant):
            return True
    return False
def getRefSpliceDonorBoundaries(variant, intronicLength, exonicLength):
    """
    Returns the reference splice donor boundaries for the variant's transcript.

    intronicLength = number of bp in intron considered part of the splice donor region
    exonicLength = number of bp in exon considered part of the splice donor region
    The donor region is the last exonicLength bp of the exon plus the first
    intronicLength bp of the intron
    The last exon (exon24 for BRCA1, exon27 for BRCA2) is removed before computing
    Returns a dictionary: key = exon name, value = {"donorStart": ..., "donorEnd": ...}
    """
    donorExons = getExonBoundaries(variant).copy()
    gene = variant["Gene_Symbol"]
    if gene == "BRCA1":
        del donorExons["exon24"]
    elif gene == "BRCA2":
        del donorExons["exon27"]

    onPlusStrand = verify.getVarStrand(variant) == "+"
    donorBoundaries = {}
    for exonName, bounds in donorExons.items():
        exonEnd = int(bounds["exonEnd"])
        if onPlusStrand:
            # + 1 on the start because genomic position in RefSeq starts to the right of the
            # first base, which affects the 5' side; donor start is 5' to exon end on + strand
            start, end = exonEnd - exonicLength + 1, exonEnd + intronicLength
        else:
            # + 1 on the end for the same RefSeq numbering reason, applied to the
            # intronic side for - strand transcripts
            start, end = exonEnd + exonicLength, exonEnd - intronicLength + 1
        donorBoundaries[exonName] = {"donorStart": start, "donorEnd": end}
    return donorBoundaries
def getSpliceAcceptorBoundaries(variant, intronicLength, exonicLength):
    """
    Returns the splice acceptor boundaries for the variant's transcript.

    intronicLength = number of bp in intron considered part of the splice acceptor region
    exonicLength = number of bp in exon considered part of the splice acceptor region
    The acceptor region is the last intronicLength bp of the intron plus the first
    exonicLength bp of the exon
    exon1 is removed for BRCA1 and BRCA2 before computing
    Returns a dictionary: key = exon name, value = {"acceptorStart": ..., "acceptorEnd": ...}
    """
    acceptorExons = getExonBoundaries(variant).copy()
    gene = variant["Gene_Symbol"]
    if gene == "BRCA1" or gene == "BRCA2":
        del acceptorExons["exon1"]

    onPlusStrand = verify.getVarStrand(variant) == "+"
    acceptorBoundaries = {}
    for exonName, bounds in acceptorExons.items():
        exonStart = int(bounds["exonStart"])
        if onPlusStrand:
            # + 1 on the start because genomic position in RefSeq starts to the right of the
            # first base, which affects the 5' side; acceptor start is 5' to exon start on + strand
            start, end = exonStart - intronicLength + 1, exonStart + exonicLength
        else:
            # + 1 on the end for the same RefSeq numbering reason, applied to the
            # exonic side for - strand transcripts
            start, end = exonStart + intronicLength, exonStart - exonicLength + 1
        acceptorBoundaries[exonName] = {"acceptorStart": start, "acceptorEnd": end}
    return acceptorBoundaries
def getDeNovoFrameshiftAndCIStatus(variant, boundaries, donor=True, deNovoDonorInRefAcc=False):
    """
    Determines if a de novo splice position avoids both a frameshift and disruption of a CI domain.

    boundaries (enigma or priors) selects the CI domain boundaries passed to isCIDomainInRegion
    donor = True for de novo donors, False for de novo acceptors
    deNovoDonorInRefAcc = True if looking for a de novo donor in a ref acceptor site, False otherwise

    If de novo splicing would cause a frameshift, returns False
    Else, checks whether the region between the new splice position and the neighboring
      reference splice site (next exon's acceptor for donors, previous exon's donor for
      acceptors) contains a CI domain
    Returns True if there is no frameshift and no CI domain in that region, False otherwise
    Raises ValueError for a de novo acceptor (donor=False) variant that is neither in an exon
      nor in a reference splice acceptor region, because no exon can be assigned to it
    """
    frameshiftStatus = getDeNovoSpliceFrameshiftStatus(variant, donor=donor,
                                                       deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    # a frameshift rules the variant out regardless of CI domains
    if frameshiftStatus:
        return False

    # determine the exon (format "exonN") used to find the neighboring reference splice site
    if varInExon(variant):
        varExonNum = getVarExonNumberSNS(variant)
    elif varInSpliceRegion(variant, donor=donor, deNovo=False):
        spliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)
        varExonNum = spliceBounds["exonName"]
    elif donor:
        # if a variant is in an intron a de novo donor cannot splice out any of the exon
        # so no part of a CI domain will be spliced out
        return True
    else:
        # previously this path left varExonNum unbound and failed later with UnboundLocalError
        raise ValueError("cannot determine exon for de novo acceptor variant that is not in an exon "
                         "or reference splice acceptor region")

    varWindowPos = getVarWindowPosition(variant, donor=donor, deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    inExonicPortion = varInExonicPortion(variant, STD_EXONIC_PORTION, STD_DE_NOVO_LENGTH, donor=donor,
                                         deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    regionStart = extract.getNewSplicePosition(variant["Pos"], verify.getVarStrand(variant), varWindowPos,
                                               inExonicPortion, STD_EXONIC_PORTION,
                                               STD_ACC_INTRONIC_LENGTH, donor=donor)
    if donor:
        # parses N out of "exonN" ([4:] removes "exon") and adds 1 to get the next exon "exonN+1"
        nextExonNum = "exon" + str(int(varExonNum[4:]) + 1)
        # skips to exon 5 for variants in BRCA1 exon 3 because exon 4 does not exist
        # in the BRCA1 RefSeq transcript
        if variant["Gene_Symbol"] == "BRCA1" and nextExonNum == "exon4":
            nextExonNum = "exon5"
        refSpliceAccBounds = extract.getSpliceAcceptorBoundaries(variant, STD_ACC_INTRONIC_LENGTH,
                                                                 STD_ACC_EXONIC_LENGTH)
        regionEnd = refSpliceAccBounds[nextExonNum]["acceptorStart"]
    else:
        # parses N out of "exonN" ([4:] removes "exon") and subtracts 1 to get the
        # previous exon "exonN-1"
        prevExonNum = "exon" + str(int(varExonNum[4:]) - 1)
        # skips back to exon 3 for variants in BRCA1 exon 5 (no exon 4 in BRCA1 RefSeq transcript)
        if variant["Gene_Symbol"] == "BRCA1" and prevExonNum == "exon4":
            prevExonNum = "exon3"
        refSpliceDonorBounds = extract.getRefSpliceDonorBoundaries(variant, STD_DONOR_INTRONIC_LENGTH,
                                                                   STD_DONOR_EXONIC_LENGTH)
        regionEnd = refSpliceDonorBounds[prevExonNum]["donorEnd"]

    CIDomainInRegion = verify.isCIDomainInRegion(regionStart, regionEnd, boundaries, variant["Gene_Symbol"])
    if not CIDomainInRegion:
        return True
    return False
def getAltExonLength(variant, exonicPortionSize, intronicPortionSize, deNovoDonorInRefAcc=False, donor=True):
    """
    Returns the length of the alternate exon that results when splicing
    occurs at the de novo site found in the max MaxEntScan window.

    exonicPortionSize and intronicPortionSize are the number of bp treated as the
    exonic and intronic portions of the splice site
    donor=True evaluates a de novo donor, donor=False a de novo acceptor
    deNovoDonorInRefAcc=True if looking for a de novo donor in a ref acceptor site
    """
    # work out which exon the variant belongs to (or is closest to)
    if varInExon(variant):
        exonName = getVarExonNumberSNS(variant)
    elif varInSpliceRegion(variant, donor=donor, deNovo=False):
        exonName = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)["exonName"]
    else:
        exonName = getClosestExonNumberIntronicSNS(variant, "enigma", donor=donor)

    allExonBounds = extract.getExonBoundaries(variant)
    windowInfo = getMaxMaxEntScanScoreSlidingWindowSNS(variant, exonicPortionSize, STD_DE_NOVO_LENGTH,
                                                       donor=donor,
                                                       deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    splicePos = extract.getNewSplicePosition(variant["Pos"],
                                             verify.getVarStrand(variant),
                                             windowInfo["varWindowPosition"],
                                             windowInfo["inExonicPortion"],
                                             exonicPortionSize,
                                             intronicPortionSize,
                                             donor=donor)

    if verify.getVarStrand(variant) == "-":
        if donor:
            # +1 so that every position up to and including splicePos is counted
            # (numbering decreases from left to right on the minus strand)
            return int(allExonBounds[exonName]["exonStart"]) - splicePos + 1
        # -1 so that every position including splicePos is counted
        # (numbering decreases from left to right on the minus strand)
        return splicePos - int(allExonBounds[exonName]["exonEnd"]) - 1

    exonEnd = int(allExonBounds[exonName]["exonEnd"])
    if donor:
        # plus strand donor: shorten the reference exon length by the
        # distance between the reference exon end and the new splice position
        return getRefExonLength(variant, donor=donor) - (exonEnd - splicePos)
    return exonEnd - splicePos
def getMaxEntScanScoresSlidingWindowSNS(variant, windowSize, donor=False):
    """
    Scores every window of size windowSize that contains the variant.

    If donor=True, calculates MaxEntScan scores for splice donors
    If donor=False, calculates MaxEntScan scores for splice acceptors
    Each dictionary below is keyed by the variant's position in the window (1-windowSize)
    Returns a dictionary containing:
      1. windowSeqs - ref and alt seq for each window
      2. windowScores - ref and alt MaxEntScan scores and zscores for each window
      3. windowAltMaxEntScanScores - only the alt MaxEntScan score for each window
    """
    position = int(variant["Pos"])
    strand = verify.getVarStrand(variant)

    # +- (windowSize - 1) around the variant gives a (windowSize*2 - 1) bp region, enough for
    # every window of windowSize bp that has the variant in one of positions 1-windowSize
    # start/end are swapped on the minus strand to preserve the sequence returned by getRefAltSeqs
    flank = windowSize - 1
    if strand == "-":
        regionStart, regionEnd = position + flank, position - flank
    else:
        regionStart, regionEnd = position - flank, position + flank

    seqs = getRefAltSeqs(variant, regionStart, regionEnd)
    refSeq = seqs["refSeq"]
    altSeq = seqs["altSeq"]

    windowSeqs = {}
    windowScores = {}
    windowAltMaxEntScanScores = {}
    for shift in range(windowSize):
        # first window has the variant in the last position; it moves left as the window slides
        varPos = windowSize - shift
        refWindow = refSeq[shift:shift + windowSize]
        altWindow = altSeq[shift:shift + windowSize]
        windowSeqs[varPos] = {"refSeq": refWindow, "altSeq": altWindow}
        scores = getRefAltScores(refWindow, altWindow, donor=donor)
        refScores = scores["refScores"]
        altScores = scores["altScores"]
        windowScores[varPos] = {"refMaxEntScanScore": refScores["maxEntScanScore"],
                                "refZScore": refScores["zScore"],
                                "altMaxEntScanScore": altScores["maxEntScanScore"],
                                "altZScore": altScores["zScore"]}
        windowAltMaxEntScanScores[varPos] = scores["altScores"]["maxEntScanScore"]

    return {"windowSeqs": windowSeqs,
            "windowScores": windowScores,
            "windowAltMaxEntScanScores": windowAltMaxEntScanScores}
def isDeNovoWildTypeSplicePosDistanceDivisibleByThree(variant, exonicPortionSize, intronicPortionSize,
                                                      deNovoDonorInRefAcc=False, donor=True):
    """
    Compares the de novo splicing position of a variant with the wild-type splicing position.

    exonicPortionSize = length in bp considered to be in the exonic portion of the splice site
    intronicPortionSize = length in bp considered to be in the intronic portion of the splice site
    deNovoDonorInRefAcc = True if looking for a de novo donor in a reference splice acceptor region
    donor = True for a de novo donor, False for a de novo acceptor
    Returns True if the distance between de novo and wild-type splice positions is divisible by 3
      (de novo splicing would not cause a frameshift), False otherwise
    """
    if varInExon(variant):
        varExonNum = getVarExonNumberSNS(variant)
    elif varInSpliceRegion(variant, donor=donor, deNovo=False):
        spliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)
        varExonNum = spliceBounds["exonName"]
    else:
        varExonNum = getClosestExonNumberIntronicSNS(variant, "enigma", donor=donor)

    varStrand = verify.getVarStrand(variant)
    refExonBounds = extract.getExonBoundaries(variant)
    slidingWindowInfo = getMaxMaxEntScanScoreSlidingWindowSNS(variant, exonicPortionSize, STD_DE_NOVO_LENGTH,
                                                              donor=donor,
                                                              deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    deNovoSplicePos = extract.getNewSplicePosition(variant["Pos"], varStrand,
                                                   slidingWindowInfo["varWindowPosition"],
                                                   slidingWindowInfo["inExonicPortion"],
                                                   exonicPortionSize, intronicPortionSize, donor=donor)
    if donor:
        # int() matches how sibling functions read exon boundaries before doing arithmetic
        wildTypeSplicePos = int(refExonBounds[varExonNum]["exonEnd"])
        if varStrand == "+":
            distanceBetween = wildTypeSplicePos - deNovoSplicePos
        else:
            # +1 for minus strand donor because splice donor position is
            # to the right of splice cut position
            distanceBetween = deNovoSplicePos - (wildTypeSplicePos + 1)
    else:
        wildTypeSplicePos = int(refExonBounds[varExonNum]["exonStart"])
        if varStrand == "+":
            distanceBetween = abs(deNovoSplicePos - wildTypeSplicePos)
        else:
            # +1 for minus strand acceptor because splice acceptor position is
            # to the left of splice cut position
            distanceBetween = abs((wildTypeSplicePos + 1) - deNovoSplicePos)

    return distanceBetween % 3 == 0
def getRefAltSeqs(variant, rangeStart, rangeStop):
    """
    Builds the reference and alternate sequences for a variant over a genomic range.

    rangeStart and rangeStop are passed to getFastaSeq and getSeqLocDict
    Returns a dictionary with keys "refSeq" and "altSeq"
    """
    chrom = getVarChrom(variant)
    strand = verify.getVarStrand(variant)

    # minus strand transcripts request the non-plus-strand sequence
    usePlusStrand = strand != "-"
    refSeq = getFastaSeq(chrom, rangeStart, rangeStop, plusStrandSeq=usePlusStrand)

    # alt sequence is derived from a per-position dictionary of the reference sequence
    refSeqDict = getSeqLocDict(chrom, strand, rangeStart, rangeStop)
    altSeq = getAltSeq(getAltSeqDict(variant, refSeqDict), strand)
    return {"refSeq": refSeq, "altSeq": altSeq}
def getVarExonNumberSNS(variant):
    """
    Given a SNS variant, checks that variant is in an exon
    If variant is in an exon, returns the exon in which the variant is located in format "exonN"
    Returns None if variant is not in an exon
    """
    if not varInExon(variant):
        return None

    varGenPos = int(variant["Pos"])
    varExons = extract.getExonBoundaries(variant)
    varStrand = verify.getVarStrand(variant)
    for exon in varExons.keys():
        # int() keeps this comparison consistent with the same check in varInExon
        exonStart = int(varExons[exon]["exonStart"])
        exonEnd = int(varExons[exon]["exonEnd"])
        if varStrand == "+":
            # plus strand: exonStart is excluded, exonEnd is included
            if exonStart < varGenPos <= exonEnd:
                return exon
        else:
            # minus strand: numbering decreases, exonStart is included, exonEnd is excluded
            if exonEnd < varGenPos <= exonStart:
                return exon
    return None
def getVarDict(variant, boundaries):
    """
    Summarizes a single variant as a flat dictionary.

    boundaries is passed to getVarLocation to determine the variant location
    Returns a dictionary with the variant's gene, chromosome, strand, genomic coordinate,
    type, location, and HGVS cDNA under the keys "varGene", "varChrom", "varStrand",
    "varGenCoordinate", "varType", "varLoc", and "varHGVScDNA"
    """
    strand = getVarStrand(variant)
    variantType = getVarType(variant)
    location = getVarLocation(variant, boundaries)

    summary = {}
    summary["varGene"] = variant["Gene_Symbol"]
    summary["varChrom"] = variant["Chr"]
    summary["varStrand"] = strand
    summary["varGenCoordinate"] = variant["Pos"]
    summary["varType"] = variantType
    summary["varLoc"] = location
    summary["varHGVScDNA"] = variant["HGVS_cDNA"]
    return summary
def varInExon(variant):
    """
    Determines if a variant's genomic position is inside the transcript boundaries
    AND inside an exon.

    Returns True if variant is in an exon, False otherwise
    """
    if verify.varOutsideBoundaries(variant):
        return False

    position = int(variant["Pos"])
    exonBounds = extract.getExonBoundaries(variant)
    onPlusStrand = verify.getVarStrand(variant) == "+"

    for bounds in exonBounds.values():
        start = int(bounds["exonStart"])
        end = int(bounds["exonEnd"])
        if onPlusStrand:
            # plus strand: exon start is excluded, exon end is included
            inside = start < position <= end
        else:
            # minus strand: exon start is included, exon end is excluded
            inside = end < position <= start
        if inside:
            return True
    return False
def varInSpliceRegion(variant, donor=False, deNovo=False):
    """
    Determines if a variant is in a splice donor/acceptor region of the reference transcript.

    donor=True,  deNovo=False: reference splice donor region
    donor=True,  deNovo=True:  de novo splice donor region
                               (reference donor region or anywhere in an exon)
    donor=False, deNovo=False: reference splice acceptor region
    donor=False, deNovo=True:  de novo splice acceptor region
    Returns True if variant is in the requested splice region, False otherwise
    """
    if donor:
        # reference donor boundaries; for deNovo=True the entirety of the exon is added below
        regionBounds = extract.getRefSpliceDonorBoundaries(variant, STD_DONOR_INTRONIC_LENGTH,
                                                           STD_DONOR_EXONIC_LENGTH)
        startKey, endKey = "donorStart", "donorEnd"
    else:
        if deNovo:
            regionBounds = extract.getSpliceAcceptorBoundaries(variant, STD_ACC_INTRONIC_LENGTH,
                                                               STD_DE_NOVO_LENGTH)
        else:
            regionBounds = extract.getSpliceAcceptorBoundaries(variant, STD_ACC_INTRONIC_LENGTH,
                                                               STD_ACC_EXONIC_LENGTH)
        startKey, endKey = "acceptorStart", "acceptorEnd"

    for bounds in regionBounds.values():
        start = bounds[startKey]
        end = bounds[endKey]
        if verify.checkWithinBoundaries(verify.getVarStrand(variant), int(variant["Pos"]), start, end):
            return True
        # de novo donor region also includes the entirety of the exon
        if donor and deNovo and varInExon(variant):
            return True
    return False
def getClosestExonNumberIntronicSNS(variant, boundaries, donor=True):
    """
    Finds the exon whose boundary is closest to an intronic or UTR substitution variant.

    boundaries (either priors or enigma) is passed through to getVarLocation
    donor=True measures distance to each exon end, donor=False to each exon start
    Only exons whose boundary gives a positive distance are considered
    Returns the closest exon in the format "exonN"
    Returns "exon0" if variant is not an intron/UTR substitution outside of an exon
    """
    location = getVarLocation(variant, boundaries)
    if location != "intron_variant" and location != "UTR_variant":
        return "exon0"
    if extract.getVarType(variant) != "substitution":
        return "exon0"
    if varInExon(variant):
        return "exon0"

    exonBounds = extract.getExonBoundaries(variant)
    position = int(variant["Pos"])
    boundaryKey = "exonEnd" if donor else "exonStart"
    # distance is (position - boundary) for plus strand donors and minus strand acceptors,
    # and (boundary - position) for plus strand acceptors and minus strand donors
    positionMinusBoundary = (verify.getVarStrand(variant) == "+") == bool(donor)

    distances = {}
    for exonName, bounds in exonBounds.items():
        boundary = int(bounds[boundaryKey])
        if positionMinusBoundary:
            gap = position - boundary
        else:
            gap = boundary - position
        if gap > 0:
            distances[exonName] = gap

    nearest = min(distances.items(), key=lambda item: item[1])
    return nearest[0]
def getVarSpliceRegionBounds(variant, donor=False, deNovo=False):
    """
    Returns the bounds of the splice region that contains the variant

    donor=True: looks through the reference splice donor regions
        *function CANNOT be used to return de novo donor splice region bounds*
    donor=False, deNovo=False: looks through the reference splice acceptor regions
    donor=False, deNovo=True: looks through the de novo splice acceptor regions

    If the variant is in one of those regions, returns a dictionary with
      "exonName": name of the exon the region belongs to
      "donorStart"/"donorEnd" (donor=True) or "acceptorStart"/"acceptorEnd" (donor=False)
    Returns None if varInSpliceRegion is False for the variant or if no region contains it
    """
    if not varInSpliceRegion(variant, donor=donor, deNovo=deNovo):
        return None

    if donor:
        startKey, endKey = "donorStart", "donorEnd"
        allBounds = extract.getRefSpliceDonorBoundaries(
            variant, STD_DONOR_INTRONIC_LENGTH, STD_DONOR_EXONIC_LENGTH)
    else:
        startKey, endKey = "acceptorStart", "acceptorEnd"
        if deNovo:
            allBounds = extract.getSpliceAcceptorBoundaries(
                variant, STD_ACC_INTRONIC_LENGTH, STD_DE_NOVO_LENGTH)
        else:
            allBounds = extract.getSpliceAcceptorBoundaries(
                variant, STD_ACC_INTRONIC_LENGTH, STD_ACC_EXONIC_LENGTH)

    for exonName in allBounds.keys():
        boundStart = allBounds[exonName][startKey]
        boundEnd = allBounds[exonName][endKey]
        if verify.checkWithinBoundaries(verify.getVarStrand(variant), int(variant["Pos"]),
                                        boundStart, boundEnd):
            return {"exonName": exonName,
                    startKey: boundStart,
                    endKey: boundEnd}
    return None
def isDeNovoWildTypeSplicePosDistanceDivisibleByThree(variant, exonicPortionSize, intronicPortionSize,
                                                      deNovoDonorInRefAcc=False, donor=True):
    """
    Compares the de novo splice position of a variant with the wild-type splice position

    exonicPortionSize: length in bp considered to be the exonic portion of a splice site
    intronicPortionSize: length in bp considered to be the intronic portion of a splice site
    deNovoDonorInRefAcc: True if looking for a de novo donor in a reference splice acceptor region
    donor: True for de novo donors, False for de novo acceptors

    The wild-type position is the exonEnd (donor=True) or exonStart (donor=False) of the exon
    the variant is assigned to; the de novo position comes from the highest scoring sliding window

    Returns True if the distance between the two positions is divisible by 3
    (de novo splicing would not cause a frameshift), False otherwise
    """
    # exon that the variant is assigned to: containing exon, exon of the
    # reference splice region it is in, or else the closest exon
    if varInExon(variant):
        varExonNum = getVarExonNumberSNS(variant)
    elif varInSpliceRegion(variant, donor=donor, deNovo=False):
        varExonNum = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)["exonName"]
    else:
        varExonNum = getClosestExonNumberIntronicSNS(variant, "enigma", donor=donor)

    varStrand = verify.getVarStrand(variant)
    refExonBounds = extract.getExonBoundaries(variant)
    slidingWindowInfo = getMaxMaxEntScanScoreSlidingWindowSNS(
        variant, exonicPortionSize, STD_DE_NOVO_LENGTH,
        donor=donor, deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    deNovoSplicePos = extract.getNewSplicePosition(
        variant["Pos"], varStrand,
        slidingWindowInfo["varWindowPosition"], slidingWindowInfo["inExonicPortion"],
        exonicPortionSize, intronicPortionSize, donor=donor)

    onPlusStrand = varStrand == "+"
    if donor:
        wildTypeSplicePos = refExonBounds[varExonNum]["exonEnd"]
        if onPlusStrand:
            distanceBetween = wildTypeSplicePos - deNovoSplicePos
        else:
            # +1 for minus strand donor because splice donor position
            # is to the right of splice cut position
            distanceBetween = deNovoSplicePos - (wildTypeSplicePos + 1)
    else:
        wildTypeSplicePos = refExonBounds[varExonNum]["exonStart"]
        if onPlusStrand:
            distanceBetween = abs(deNovoSplicePos - wildTypeSplicePos)
        else:
            # +1 for minus strand acceptor because splice acceptor position
            # is to the left of splice cut position
            distanceBetween = abs((wildTypeSplicePos + 1) - deNovoSplicePos)

    return distanceBetween % 3 == 0
def getDeNovoFrameshiftAndCIStatus(variant, boundaries, donor=True, deNovoDonorInRefAcc=False):
    """
    Determines whether de novo splicing avoids both a frameshift and disruption of a CI domain

    variant: variant record, variant["Pos"] and variant["Gene_Symbol"] are read
    boundaries: boundary set (enigma or priors) passed to verify.isCIDomainInRegion
    donor: True for de novo donors, False for de novo acceptors
    deNovoDonorInRefAcc: True if looking for a de novo donor in a reference acceptor site

    Returns False if getDeNovoSpliceFrameshiftStatus reports a frameshift
    Returns True for donor=True when the variant is neither in an exon nor in a
    reference donor region (no part of an exon can be spliced out)
    Otherwise the region between the new splice position and the neighboring
    reference splice site is checked for CI domains:
      returns True if no CI domain is in that region, False if one is

    For donor=False with a variant that is neither in an exon nor in a reference
    acceptor region, the closest exon is used; previously this case raised an
    UnboundLocalError because no exon had been selected
    """
    frameshiftStatus = getDeNovoSpliceFrameshiftStatus(
        variant, donor=donor, deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    # a frameshift rules the variant out before any CI domain check
    if frameshiftStatus:
        return False

    # determine the exon used to locate the region skipped by new splicing
    if varInExon(variant):
        varExonNum = getVarExonNumberSNS(variant)
    elif varInSpliceRegion(variant, donor=donor, deNovo=False):
        spliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)
        varExonNum = spliceBounds["exonName"]
    elif donor:
        # if a variant is in an intron de novo donor cannot splice out any of the exon
        # so no part of a CI domain will be spliced out
        return True
    else:
        # same fallback as getAltExonLength and
        # isDeNovoWildTypeSplicePosDistanceDivisibleByThree use for intronic variants
        varExonNum = getClosestExonNumberIntronicSNS(variant, "enigma", donor=donor)

    # varExonNum is a string in the format "exonN"
    geneSymbol = variant["Gene_Symbol"]
    varWindowPos = getVarWindowPosition(
        variant, donor=donor, deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    inExonicPortion = varInExonicPortion(
        variant, STD_EXONIC_PORTION, STD_DE_NOVO_LENGTH,
        donor=donor, deNovoDonorInRefAcc=deNovoDonorInRefAcc)
    regionStart = extract.getNewSplicePosition(
        variant["Pos"], verify.getVarStrand(variant), varWindowPos, inExonicPortion,
        STD_EXONIC_PORTION, STD_ACC_INTRONIC_LENGTH, donor=donor)

    # [4:] removes "exon" from "exonN" so that N can be used as a number
    exonIndex = int(varExonNum[4:])
    if donor:
        # region ends at the acceptor of the next exon "exonN+1"
        nextExonNum = "exon" + str(exonIndex + 1)
        # skips to exon 5 for any variants in BRCA1 exon 3 because
        # exon 4 does not exist in BRCA1 RefSeq transcript
        if geneSymbol == "BRCA1" and nextExonNum == "exon4":
            nextExonNum = "exon5"
        refSpliceAccBounds = extract.getSpliceAcceptorBoundaries(
            variant, STD_ACC_INTRONIC_LENGTH, STD_ACC_EXONIC_LENGTH)
        regionEnd = refSpliceAccBounds[nextExonNum]["acceptorStart"]
    else:
        # region ends at the donor of the previous exon "exonN-1"
        prevExonNum = "exon" + str(exonIndex - 1)
        # skips back to exon 3 for variants in BRCA1 exon 5 (exon 4 is skipped as above)
        if geneSymbol == "BRCA1" and prevExonNum == "exon4":
            prevExonNum = "exon3"
        refSpliceDonorBounds = extract.getRefSpliceDonorBoundaries(
            variant, STD_DONOR_INTRONIC_LENGTH, STD_DONOR_EXONIC_LENGTH)
        regionEnd = refSpliceDonorBounds[prevExonNum]["donorEnd"]

    CIDomainInRegion = verify.isCIDomainInRegion(regionStart, regionEnd, boundaries, geneSymbol)
    if not CIDomainInRegion:
        return True
    return False
def getMaxMaxEntScanScoreSlidingWindowSNS(variant, exonicPortionSize, deNovoLength, donor=True, deNovo=False,
                                          deNovoDonorInRefAcc=False):
    """
    Determines the sliding window with the maximum alt MaxEntScan score for a variant

    NOTE(review): this module contains a second definition of this function further down;
    the later definition replaces this one when the module is imported

    Windows come from extract.getMaxEntScanScoresSlidingWindowSNS with
      window size STD_DONOR_SIZE if donor=True (potential de novo donor)
      window size STD_ACC_SIZE if donor=False (potential de novo acceptor)
    Only works for single nucleotide substitution variants and only for one of
    de novo donor OR de novo acceptor per call

    exonicPortionSize: length in bp considered to be the exonic portion of a splice site
    deNovoLength: length of the exonic portion of a de novo splice acceptor
    deNovo: not used by this function
    deNovoDonorInRefAcc: True if checking for de novo splice donors in reference splice
        acceptor sites, False otherwise

    If the variant is in a reference splice donor or acceptor region and
    deNovoDonorInRefAcc is False, every window whose ref sequence equals the native
    splice site sequence is removed from consideration
    (this deletes entries from the "windowAltMaxEntScanScores" dictionary in place)

    Returns a dictionary for the highest scoring window containing:
      refMaxEntScanScore, refZScore, altMaxEntScanScore, altZScore, refSeq, altSeq
      varStart: 0-based index of the variant in the window (for formatting)
      varLength: always 1
      varWindowPosition: 1-based position of the variant in the window
      inExonicPortion: True if the variant is in the exonicPortionSize bp exonic
          portion of the window, False otherwise
    """
    # default window size for a splice donor region or a splice acceptor region
    windowSize = STD_DONOR_SIZE if donor else STD_ACC_SIZE
    slidingWindowInfo = extract.getMaxEntScanScoresSlidingWindowSNS(
        variant, windowSize, donor=donor)
    windowAltMaxEntScanScores = slidingWindowInfo["windowAltMaxEntScanScores"]

    inRefSpliceDonorRegion = varInSpliceRegion(variant, donor=True, deNovo=False)
    inRefSpliceAccRegion = varInSpliceRegion(variant, donor=False, deNovo=False)

    # if variant in ref splice donor region (for de novo donor) or in ref splice acceptor
    # region (for de novo acceptor), the native splicing window must not be a candidate
    if (inRefSpliceDonorRegion or inRefSpliceAccRegion) and not deNovoDonorInRefAcc:
        if donor:
            refSpliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)
            onPlusStrand = verify.getVarStrand(variant) == "+"
            seqStart = refSpliceBounds["donorStart"]
            seqEnd = refSpliceBounds["donorEnd"]
        else:
            refSpliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=True)
            deNovoOffset = deNovoLength - exonicPortionSize
            onPlusStrand = verify.getVarStrand(variant) == "+"
            seqStart = refSpliceBounds["acceptorStart"]
            # de novo splice acceptor region is deNovoOffset bp longer than the reference
            # region, so acceptorEnd is moved back to get a refSeq of the correct length
            if onPlusStrand:
                # genomic position increases from left to right on plus strand
                seqEnd = refSpliceBounds["acceptorEnd"] - deNovoOffset
            else:
                # genomic position decreases from left to right on minus strand
                seqEnd = refSpliceBounds["acceptorEnd"] + deNovoOffset
        refSpliceSeq = extract.getFastaSeq(
            extract.getVarChrom(variant), seqStart, seqEnd, plusStrandSeq=onPlusStrand)

        # .items() instead of the Python 2 only .iteritems(), matching the rest of the function
        for position, seqs in slidingWindowInfo["windowSeqs"].items():
            if seqs["refSeq"] == refSpliceSeq:
                # removes reference splice window so it is not considered for de novo splicing
                del windowAltMaxEntScanScores[position]

    # (position, score) pair with the maximum alt MaxEntScan score
    maxAltWindowScore = max(windowAltMaxEntScanScores.items(), key=lambda k: k[1])
    maxVarPosition = maxAltWindowScore[0]
    maxScores = slidingWindowInfo["windowScores"][maxVarPosition]
    maxSeqs = slidingWindowInfo["windowSeqs"][maxVarPosition]

    if donor:
        # variant is in the first exonicPortionSize bp of the donor region
        inExonicPortion = maxVarPosition <= exonicPortionSize
    else:
        # variant is in the last exonicPortionSize bp of the acceptor region
        inExonicPortion = (STD_ACC_SIZE - maxVarPosition) < exonicPortionSize

    return {"refMaxEntScanScore": maxScores["refMaxEntScanScore"],
            "refZScore": maxScores["refZScore"],
            "altMaxEntScanScore": maxScores["altMaxEntScanScore"],
            "altZScore": maxScores["altZScore"],
            "refSeq": maxSeqs["refSeq"],
            "altSeq": maxSeqs["altSeq"],
            "varStart": maxVarPosition - 1,
            "varLength": 1,
            "varWindowPosition": maxVarPosition,
            "inExonicPortion": inExonicPortion}
def getClosestSpliceSiteScores(variant, deNovoOffset, donor=True, deNovo=False, deNovoDonorInRefAcc=False,
                               testMode=False):
    """
    Determines sequence, genomic position and scores of the reference splice site closest to a variant

    NOTE(review): this module contains a second definition of this function directly below;
    the later definition replaces this one when the module is imported

    deNovoOffset: difference between de novo acceptor length and exonic portion size
        *use deNovoOffset=0 when looking for the closest ref acceptor of an intronic variant
    donor: True looks for the closest splice donor, False for the closest splice acceptor
    deNovo: True accomodates for de novo splicing
        *only use deNovo=True together with donor=False,
         the returned sequence is not correct for donor=True and deNovo=True
    deNovoDonorInRefAcc: True if checking for de novo splice donor sites in reference
        splice acceptor sites, False otherwise
    testMode: True skips MaxEntScan scoring (used by unittests)

    The splice site is selected as follows:
      exonic variant (deNovo=False): splice site of the exon containing the variant
      intronic or UTR variant: splice site of the closest exon
      variant in a splice region (and deNovoDonorInRefAcc=False): that splice region

    Returns a dictionary containing
      exonName, sequence, genomicSplicePos
      and if testMode=False also exonStart, intronStart, maxEntScanScore, zScore
    If testMode=False and the exon name is "exon0" all values are "N/A"
    """
    # Pos is converted here although the resulting value is not used further down
    varGenPos = int(variant["Pos"])
    varChrom = extract.getVarChrom(variant)
    varLoc = getVarLocation(variant, "enigma")
    inExon = varInExon(variant)
    inIntronOrUTR = varLoc == "intron_variant" or varLoc == "UTR_variant"

    if (inExon and not deNovo) or inIntronOrUTR:
        if inExon:
            exonNumber = getVarExonNumberSNS(variant)
        else:
            exonNumber = getClosestExonNumberIntronicSNS(variant, "enigma", donor=donor)
        exonName = exonNumber
        if donor:
            boundsByExon = extract.getRefSpliceDonorBoundaries(
                variant, STD_DONOR_INTRONIC_LENGTH, STD_DONOR_EXONIC_LENGTH)
        else:
            boundsByExon = extract.getSpliceAcceptorBoundaries(
                variant, STD_ACC_INTRONIC_LENGTH, STD_ACC_EXONIC_LENGTH)
        closestSpliceBounds = boundsByExon[exonNumber]

    # a variant inside a splice region uses that region instead of the exon based choice
    if varInSpliceRegion(variant, donor=donor, deNovo=deNovo) and not deNovoDonorInRefAcc:
        closestSpliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=deNovo)
        exonName = closestSpliceBounds["exonName"]

    onPlusStrand = verify.getVarStrand(variant) == "+"
    if donor:
        donorStart = closestSpliceBounds["donorStart"]
        refSeq = extract.getFastaSeq(varChrom, donorStart, closestSpliceBounds["donorEnd"],
                                     plusStrandSeq=onPlusStrand)
        # splice site is 3 bp to the right of donor start because exon end is 3 bp to the
        # right of donor start: +3 on plus strand (numbering increases from left to right),
        # -3 on minus strand (numbering decreases from left to right)
        if onPlusStrand:
            genomicSplicePos = donorStart + 3
        else:
            genomicSplicePos = donorStart - 3
        exonStart = 0
        intronStart = STD_EXONIC_PORTION
    else:
        acceptorStart = closestSpliceBounds["acceptorStart"]
        acceptorEnd = closestSpliceBounds["acceptorEnd"]
        # de novo splice acceptor region is deNovoOffset bp longer than the reference splice
        # acceptor region, so acceptorEnd is shifted by deNovoOffset to reduce refSeq to the
        # correct length; splice site is 3 bp to the left of the reference acceptor end
        if onPlusStrand:
            # plus strand numbering increases from left to right, so subtract
            refSeq = extract.getFastaSeq(varChrom, acceptorStart, (acceptorEnd - deNovoOffset),
                                         plusStrandSeq=True)
            genomicSplicePos = acceptorEnd - 3 - deNovoOffset
        else:
            # minus strand numbering decreases from left to right, so add
            refSeq = extract.getFastaSeq(varChrom, acceptorStart, (acceptorEnd + deNovoOffset),
                                         plusStrandSeq=False)
            genomicSplicePos = acceptorEnd + 3 + deNovoOffset
        exonStart = len(refSeq) - STD_EXONIC_PORTION
        intronStart = 0

    if testMode:
        # to prevent issue with running max ent scan score on unittests
        return {"exonName": exonName,
                "sequence": refSeq.upper(),
                "genomicSplicePos": genomicSplicePos}

    if exonName == "exon0":
        return {"exonName": "N/A",
                "sequence": "N/A",
                "exonStart": "N/A",
                "intronStart": "N/A",
                "maxEntScanScore": "N/A",
                "zScore": "N/A",
                "genomicSplicePos": "N/A"}

    closestMaxEntScanScore = calcMaxEntScanMeanStd.runMaxEntScan(refSeq, donor=donor)
    closestZScore = extract.getZScore(closestMaxEntScanScore, donor=donor)
    return {"exonName": exonName,
            "sequence": refSeq.upper(),
            "exonStart": exonStart,
            "intronStart": intronStart,
            "maxEntScanScore": closestMaxEntScanScore,
            "zScore": closestZScore,
            "genomicSplicePos": genomicSplicePos}
def getClosestSpliceSiteScores(variant, deNovoOffset, donor=True, deNovo=False, deNovoDonorInRefAcc=False,
                               testMode=False):
    """
    Determines sequence, genomic position and scores of the reference splice site closest to a variant

    NOTE(review): an earlier definition with the same name appears directly above in this
    module; this later definition is the one in effect after import

    deNovoOffset: difference between de novo acceptor length and exonic portion size
        *use deNovoOffset=0 when looking for the closest ref acceptor of an intronic variant
    donor: True looks for the closest splice donor, False for the closest splice acceptor
    deNovo: True accomodates for de novo splicing
        *only use deNovo=True together with donor=False,
         the returned sequence is not correct for donor=True and deNovo=True
    deNovoDonorInRefAcc: True if checking for de novo splice donor sites in reference
        splice acceptor sites, False otherwise
    testMode: True skips MaxEntScan scoring (used by unittests)

    The splice site is selected as follows:
      exonic variant (deNovo=False): splice site of the exon containing the variant
      intronic or UTR variant: splice site of the closest exon
      variant in a splice region (and deNovoDonorInRefAcc=False): that splice region

    Returns a dictionary containing
      exonName, sequence, genomicSplicePos
      and if testMode=False also exonStart, intronStart, maxEntScanScore, zScore
    If testMode=False and the exon name is "exon0" all values are "N/A"
    """
    # Pos is converted here although the resulting value is not used further down
    varGenPos = int(variant["Pos"])
    varChrom = extract.getVarChrom(variant)
    varLoc = getVarLocation(variant, "enigma")
    variantInExon = varInExon(variant)
    variantInIntronOrUTR = varLoc == "intron_variant" or varLoc == "UTR_variant"

    if (variantInExon and not deNovo) or variantInIntronOrUTR:
        if variantInExon:
            exonNumber = getVarExonNumberSNS(variant)
        else:
            exonNumber = getClosestExonNumberIntronicSNS(variant, "enigma", donor=donor)
        exonName = exonNumber
        if donor:
            refSpliceBounds = extract.getRefSpliceDonorBoundaries(
                variant, STD_DONOR_INTRONIC_LENGTH, STD_DONOR_EXONIC_LENGTH)
        else:
            refSpliceBounds = extract.getSpliceAcceptorBoundaries(
                variant, STD_ACC_INTRONIC_LENGTH, STD_ACC_EXONIC_LENGTH)
        closestSpliceBounds = refSpliceBounds[exonNumber]

    # a variant inside a splice region uses that region instead of the exon based choice
    if varInSpliceRegion(variant, donor=donor, deNovo=deNovo) and not deNovoDonorInRefAcc:
        closestSpliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=deNovo)
        exonName = closestSpliceBounds["exonName"]

    plusStrand = verify.getVarStrand(variant) == "+"
    if donor:
        regionStart = closestSpliceBounds["donorStart"]
        refSeq = extract.getFastaSeq(varChrom, regionStart, closestSpliceBounds["donorEnd"],
                                     plusStrandSeq=plusStrand)
        # splice site is 3 bp to the right of donor start because exon end is 3 bp to the
        # right of donor start: +3 on plus strand (numbering increases from left to right),
        # -3 on minus strand (numbering decreases from left to right)
        if plusStrand:
            genomicSplicePos = regionStart + 3
        else:
            genomicSplicePos = regionStart - 3
        exonStart = 0
        intronStart = STD_EXONIC_PORTION
    else:
        regionStart = closestSpliceBounds["acceptorStart"]
        regionEnd = closestSpliceBounds["acceptorEnd"]
        # de novo splice acceptor region is deNovoOffset bp longer than the reference splice
        # acceptor region, so acceptorEnd is shifted by deNovoOffset to reduce refSeq to the
        # correct length; splice site is 3 bp to the left of the reference acceptor end
        if plusStrand:
            # plus strand numbering increases from left to right, so subtract
            refSeq = extract.getFastaSeq(varChrom, regionStart, (regionEnd - deNovoOffset),
                                         plusStrandSeq=True)
            genomicSplicePos = regionEnd - 3 - deNovoOffset
        else:
            # minus strand numbering decreases from left to right, so add
            refSeq = extract.getFastaSeq(varChrom, regionStart, (regionEnd + deNovoOffset),
                                         plusStrandSeq=False)
            genomicSplicePos = regionEnd + 3 + deNovoOffset
        exonStart = len(refSeq) - STD_EXONIC_PORTION
        intronStart = 0

    if testMode:
        # to prevent issue with running max ent scan score on unittests
        return {"exonName": exonName,
                "sequence": refSeq.upper(),
                "genomicSplicePos": genomicSplicePos}

    if exonName == "exon0":
        return {"exonName": "N/A",
                "sequence": "N/A",
                "exonStart": "N/A",
                "intronStart": "N/A",
                "maxEntScanScore": "N/A",
                "zScore": "N/A",
                "genomicSplicePos": "N/A"}

    closestMaxEntScanScore = calcMaxEntScanMeanStd.runMaxEntScan(refSeq, donor=donor)
    closestZScore = extract.getZScore(closestMaxEntScanScore, donor=donor)
    return {"exonName": exonName,
            "sequence": refSeq.upper(),
            "exonStart": exonStart,
            "intronStart": intronStart,
            "maxEntScanScore": closestMaxEntScanScore,
            "zScore": closestZScore,
            "genomicSplicePos": genomicSplicePos}
def getMaxMaxEntScanScoreSlidingWindowSNS(variant, exonicPortionSize, deNovoLength, donor=True, deNovo=False,
                                          deNovoDonorInRefAcc=False):
    """
    Determines the sliding window with the maximum alt MaxEntScan score for a variant

    NOTE(review): an earlier definition with the same name appears above in this module;
    this later definition is the one in effect after import

    Windows come from extract.getMaxEntScanScoresSlidingWindowSNS with
      window size STD_DONOR_SIZE if donor=True (potential de novo donor)
      window size STD_ACC_SIZE if donor=False (potential de novo acceptor)
    Only works for single nucleotide substitution variants and only for one of
    de novo donor OR de novo acceptor per call

    exonicPortionSize: length in bp considered to be the exonic portion of a splice site
    deNovoLength: length of the exonic portion of a de novo splice acceptor
    deNovo: not used by this function
    deNovoDonorInRefAcc: True if checking for de novo splice donors in reference splice
        acceptor sites, False otherwise

    If the variant is in a reference splice donor or acceptor region and
    deNovoDonorInRefAcc is False, every window whose ref sequence equals the native
    splice site sequence is removed from consideration
    (this deletes entries from the "windowAltMaxEntScanScores" dictionary in place)

    Returns a dictionary for the highest scoring window containing:
      refMaxEntScanScore, refZScore, altMaxEntScanScore, altZScore, refSeq, altSeq
      varStart: 0-based index of the variant in the window (for formatting)
      varLength: always 1
      varWindowPosition: 1-based position of the variant in the window
      inExonicPortion: True if the variant is in the exonicPortionSize bp exonic
          portion of the window, False otherwise
    """
    # default window size for a splice donor region or a splice acceptor region
    if donor:
        defaultWindowSize = STD_DONOR_SIZE
    else:
        defaultWindowSize = STD_ACC_SIZE
    slidingWindowInfo = extract.getMaxEntScanScoresSlidingWindowSNS(
        variant, defaultWindowSize, donor=donor)
    windowAltMaxEntScanScores = slidingWindowInfo["windowAltMaxEntScanScores"]

    inRefSpliceDonorRegion = varInSpliceRegion(variant, donor=True, deNovo=False)
    inRefSpliceAccRegion = varInSpliceRegion(variant, donor=False, deNovo=False)

    # if variant in ref splice donor region (for de novo donor) or in ref splice acceptor
    # region (for de novo acceptor), the native splicing window must not be a candidate
    if (inRefSpliceDonorRegion or inRefSpliceAccRegion) and not deNovoDonorInRefAcc:
        if donor:
            refSpliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=False)
            plusStrand = verify.getVarStrand(variant) == "+"
            refSeqStart = refSpliceBounds["donorStart"]
            refSeqEnd = refSpliceBounds["donorEnd"]
        else:
            refSpliceBounds = getVarSpliceRegionBounds(variant, donor=donor, deNovo=True)
            deNovoOffset = deNovoLength - exonicPortionSize
            plusStrand = verify.getVarStrand(variant) == "+"
            refSeqStart = refSpliceBounds["acceptorStart"]
            # de novo splice acceptor region is deNovoOffset bp longer than the reference
            # region, so acceptorEnd is moved back to get a refSeq of the correct length
            if plusStrand:
                # genomic position increases from left to right on plus strand
                refSeqEnd = refSpliceBounds["acceptorEnd"] - deNovoOffset
            else:
                # genomic position decreases from left to right on minus strand
                refSeqEnd = refSpliceBounds["acceptorEnd"] + deNovoOffset
        refSpliceSeq = extract.getFastaSeq(
            extract.getVarChrom(variant), refSeqStart, refSeqEnd, plusStrandSeq=plusStrand)

        # .items() instead of the Python 2 only .iteritems(), matching the rest of the function
        for position, seqs in slidingWindowInfo["windowSeqs"].items():
            if seqs["refSeq"] == refSpliceSeq:
                # removes reference splice window so it is not considered for de novo splicing
                del windowAltMaxEntScanScores[position]

    # (position, score) pair with the maximum alt MaxEntScan score
    maxAltWindowScore = max(windowAltMaxEntScanScores.items(), key=lambda k: k[1])
    maxVarPosition = maxAltWindowScore[0]
    maxScores = slidingWindowInfo["windowScores"][maxVarPosition]
    maxSeqs = slidingWindowInfo["windowSeqs"][maxVarPosition]

    if donor:
        # variant is in the first exonicPortionSize bp of the donor region
        inExonicPortion = maxVarPosition <= exonicPortionSize
    else:
        # variant is in the last exonicPortionSize bp of the acceptor region
        inExonicPortion = (STD_ACC_SIZE - maxVarPosition) < exonicPortionSize

    return {"refMaxEntScanScore": maxScores["refMaxEntScanScore"],
            "refZScore": maxScores["refZScore"],
            "altMaxEntScanScore": maxScores["altMaxEntScanScore"],
            "altZScore": maxScores["altZScore"],
            "refSeq": maxSeqs["refSeq"],
            "altSeq": maxSeqs["altSeq"],
            "varStart": maxVarPosition - 1,
            "varLength": 1,
            "varWindowPosition": maxVarPosition,
            "inExonicPortion": inExonicPortion}
def getMaxEntScanScoresSlidingWindowSNS(variant, windowSize, donor=False):
    """
    Builds and scores every window of size windowSize that contains the variant

    donor=True: MaxEntScan scores are calculated for splice donors
    donor=False: MaxEntScan scores are calculated for splice acceptors

    Returns a dictionary containing three dictionaries, each keyed by the
    position of the variant inside the window (windowSize down to 1):
      1. windowSeqs - ref and alt seq for each window
      2. windowScores - ref and alt MaxEntScan scores and zscores for each window
      3. windowAltMaxEntScanScores - only the alt MaxEntScan score for each window
    """
    varGenPos = int(variant["Pos"])
    varStrand = verify.getVarStrand(variant)

    # a region of (windowSize*2 - 1) bp centered on the variant holds every window of
    # windowSize bp that has the variant in one of positions 1-windowSize
    # start and end are swapped for the minus strand to preserve the sequence
    # returned by getRefAltSeqs
    offset = windowSize - 1
    if varStrand == "-":
        regionStart, regionEnd = varGenPos + offset, varGenPos - offset
    else:
        regionStart, regionEnd = varGenPos - offset, varGenPos + offset

    refAltSeqs = getRefAltSeqs(variant, regionStart, regionEnd)
    refSeq = refAltSeqs["refSeq"]
    altSeq = refAltSeqs["altSeq"]

    windowSeqs = {}
    windowScores = {}
    windowAltMaxEntScanScores = {}
    for windowStart in range(windowSize):
        windowEnd = windowStart + windowSize
        # the first window has the variant in its last position,
        # each shift to the right moves the variant one position towards the front
        varPos = windowSize - windowStart
        refWindowSeq = refSeq[windowStart:windowEnd]
        altWindowSeq = altSeq[windowStart:windowEnd]
        windowSeqs[varPos] = {"refSeq": refWindowSeq,
                              "altSeq": altWindowSeq}
        refAltWindowScores = getRefAltScores(refWindowSeq, altWindowSeq, donor=donor)
        refScores = refAltWindowScores["refScores"]
        altScores = refAltWindowScores["altScores"]
        windowScores[varPos] = {"refMaxEntScanScore": refScores["maxEntScanScore"],
                                "refZScore": refScores["zScore"],
                                "altMaxEntScanScore": altScores["maxEntScanScore"],
                                "altZScore": altScores["zScore"]}
        windowAltMaxEntScanScores[varPos] = altScores["maxEntScanScore"]

    return {"windowSeqs": windowSeqs,
            "windowScores": windowScores,
            "windowAltMaxEntScanScores": windowAltMaxEntScanScores}