def getGeneticChanges(self, abspos, variants):
    """
    Report "interesting" non-coding changes caused by variants at abspos.

    The intent is to flag variants that modify a splice site or interfere
    with the promoter region.  That analysis is not implemented yet: the
    function is deliberately short-circuited and always reports no changes.

    abspos   -- chromosomal position of the variant (currently unused)
    variants -- candidate bases at that position (currently unused)

    Returns an empty list.
    """
    ### The draft implementation that used to sit behind the early return
    ### could never run, and would have raised NameError if it had (it used
    ### the undefined names positionInSequence and resp).  It also fetched
    ### the whole txStart-txEnd span only to throw the result away, so that
    ### fetch is no longer performed.  The draft idea was: treat a negative
    ### self.getRelpos(abspos) as a possible promoter hit, otherwise report
    ### 'NUCLEOTIDE=<ref>/<variant>' for every variant that is neither the
    ### reference base nor a deletion ('-').
    return []
def getGeneticChanges(self, abspos, variants):
    """
    Report "interesting" non-coding changes caused by variants at abspos.

    The intent is to flag variants that modify a splice site or interfere
    with the promoter region.  That analysis is not implemented yet: the
    function is deliberately short-circuited and always reports no changes.

    abspos   -- chromosomal position of the variant (currently unused)
    variants -- candidate bases at that position (currently unused)

    Returns an empty list.
    """
    ### The draft implementation that used to sit behind the early return
    ### could never run, and would have raised NameError if it had (it used
    ### the undefined names positionInSequence and resp).  It also fetched
    ### the whole txStart-txEnd span only to throw the result away, so that
    ### fetch is no longer performed.  The draft idea was: treat a negative
    ### self.getRelpos(abspos) as a possible promoter hit, otherwise report
    ### 'NUCLEOTIDE=<ref>/<variant>' for every variant that is neither the
    ### reference base nor a deletion ('-').
    return []
def getProteinChanges(self, abspos, variants):
    """
    Determine the amino-acid changes caused by variants at position abspos.

    The coding sequence is stitched together from the parts of self.exons
    that fall inside cdsStart-cdsEnd.  Each variant base is substituted at
    the offset that corresponds to abspos, the reference and mutated
    sequences are translated (after reverse-complementing on the '-'
    strand), and the residue covering the variant is compared.

    abspos   -- chromosomal position of the variant
    variants -- candidate bases at that position; the reference base and
                '-' (deletion) are skipped

    Returns a (changes, warnings) tuple.  changes is a list of dicts, one
    per variant that alters the residue, with the keys REF_BASE,
    VARIANT_BASE, AMINOPOS (1-based), VARIANT_POS, CODING_LENGTH,
    PROTEIN_LENGTH, REF_AMINO and VAR_AMINO.  warnings is a Set of flag
    strings ('MRNA_LENGTH_NOT_3_MULTIPLE',
    'PROTEIN_LENGTH_LESS_THAN_AMINOPOS').
    """
    import getGenomeSpan

    ### Stitch together the coding sequence ...
    exonSequences = []
    variantPositions = []
    codingLength = 0
    for exonStart, exonEnd in self.exons:
        ### Forget exons that are entirely before cdsStart
        ### or after cdsEnd
        if exonStart <= self.cdsStart and exonEnd <= self.cdsStart:
            continue
        if exonStart >= self.cdsEnd and exonEnd >= self.cdsEnd:
            continue

        ### Otherwise, keep the part of the exon contained in the
        ### range cdsStart-cdsEnd
        fStart = max(exonStart, self.cdsStart)
        fEnd = min(exonEnd, self.cdsEnd)
        exon = getGenomeSpan.getGenomeSpan(self.chrom, fStart, fEnd)
        exonSequences.append(exon)
        codingLength += len(exon)

        ### Determine the relative position within the stitched sequence.
        ### This has to count the BASES collected so far; counting the
        ### exons (the length of the exon list) gives a meaningless,
        ### usually negative, offset.
        ### NOTE(review): the inclusive test and the distance from fEnd
        ### are kept as originally written; confirm them against the
        ### coordinate convention of getGenomeSpan and of abspos.
        if fStart <= abspos <= fEnd:
            variantPositions.append(codingLength - (fEnd - abspos))

    sequence = ''.join(exonSequences)
    onMinusStrand = self.strand == '-'

    changes = []
    warnings = Set()
    for positionInSequence in variantPositions:
        ### abspos == fEnd on the last coding part can point one past the
        ### end of the stitched sequence; there is no base to compare.
        if not 0 <= positionInSequence < len(sequence):
            continue

        ### For each variant specified, determine if there is an amino
        ### acid change in the protein when changing from the reference
        ### base at that position, to this "mutant" base
        refBase = sequence[positionInSequence]

        ### Don't bother calculating a matching base, or deletions
        candidates = [v for v in variants if v not in (refBase, '-')]
        if not candidates:
            continue

        ### The protein is read from the reverse complement on the '-'
        ### strand, so the base at forward offset p sits at offset
        ### len - 1 - p of the translated sequence.
        if onMinusStrand:
            translatedPos = len(sequence) - 1 - positionInSequence
        else:
            translatedPos = positionInSequence
        ### Floor division keeps this an integer index under true division.
        aminopos = translatedPos // 3

        ### The reference translation does not depend on the variant.
        mrna = Seq(sequence, alphabet)
        if onMinusStrand:
            mrna = mrna.reverse_complement()
        protein = translator.translate(mrna)

        ### These two warning flags are here temporarily so we can get an
        ### idea of how often this happens.  Stitched-together coding
        ### sequences are sometimes not divisible by three, so the
        ### remaining bases get truncated on translation.  It is a more
        ### specific problem when the SNP of interest is actually one of
        ### the truncated bases!
        if len(mrna) % 3 != 0:
            warnings.add('MRNA_LENGTH_NOT_3_MULTIPLE')
        if len(protein) <= aminopos:
            warnings.add('PROTEIN_LENGTH_LESS_THAN_AMINOPOS')
            continue
        refAmino = protein[aminopos]

        for variant in candidates:
            mutated = (sequence[:positionInSequence] + variant +
                       sequence[positionInSequence + 1:])
            mrna2 = Seq(mutated, alphabet)
            if onMinusStrand:
                mrna2 = mrna2.reverse_complement()
            protein2 = translator.translate(mrna2)

            varAmino = protein2[aminopos]
            if varAmino != refAmino:
                changes.append({
                    'REF_BASE': refBase,
                    'VARIANT_BASE': variant,
                    'AMINOPOS': aminopos + 1,
                    'VARIANT_POS': positionInSequence,
                    'CODING_LENGTH': len(sequence),
                    'PROTEIN_LENGTH': len(protein),
                    'REF_AMINO': refAmino,
                    'VAR_AMINO': varAmino,
                })

    return changes, warnings
def getProteinChanges(self, abspos, variants):
    """
    Determine the amino-acid changes caused by variants at position abspos.

    The coding sequence is stitched together from the parts of self.exons
    that fall inside cdsStart-cdsEnd.  Each variant base is substituted at
    the offset that corresponds to abspos, the reference and mutated
    sequences are translated (after reverse-complementing on the '-'
    strand), and the residue covering the variant is compared.

    abspos   -- chromosomal position of the variant
    variants -- candidate bases at that position; the reference base and
                '-' (deletion) are skipped

    Returns a (changes, warnings) tuple.  changes is a list of dicts, one
    per variant that alters the residue, with the keys REF_BASE,
    VARIANT_BASE, AMINOPOS (1-based), VARIANT_POS, CODING_LENGTH,
    PROTEIN_LENGTH, REF_AMINO and VAR_AMINO.  warnings is a Set of flag
    strings ('MRNA_LENGTH_NOT_3_MULTIPLE',
    'PROTEIN_LENGTH_LESS_THAN_AMINOPOS').
    """
    import getGenomeSpan

    ### Stitch together the coding sequence ...
    exonSequences = []
    variantPositions = []
    codingLength = 0
    for exonStart, exonEnd in self.exons:
        ### Forget exons that are entirely before cdsStart
        ### or after cdsEnd
        if exonStart <= self.cdsStart and exonEnd <= self.cdsStart:
            continue
        if exonStart >= self.cdsEnd and exonEnd >= self.cdsEnd:
            continue

        ### Otherwise, keep the part of the exon contained in the
        ### range cdsStart-cdsEnd
        fStart = max(exonStart, self.cdsStart)
        fEnd = min(exonEnd, self.cdsEnd)
        exon = getGenomeSpan.getGenomeSpan(self.chrom, fStart, fEnd)
        exonSequences.append(exon)
        codingLength += len(exon)

        ### Determine the relative position within the stitched sequence.
        ### This has to count the BASES collected so far; counting the
        ### exons (the length of the exon list) gives a meaningless,
        ### usually negative, offset.
        ### NOTE(review): the inclusive test and the distance from fEnd
        ### are kept as originally written; confirm them against the
        ### coordinate convention of getGenomeSpan and of abspos.
        if fStart <= abspos <= fEnd:
            variantPositions.append(codingLength - (fEnd - abspos))

    sequence = ''.join(exonSequences)
    onMinusStrand = self.strand == '-'

    changes = []
    warnings = Set()
    for positionInSequence in variantPositions:
        ### abspos == fEnd on the last coding part can point one past the
        ### end of the stitched sequence; there is no base to compare.
        if not 0 <= positionInSequence < len(sequence):
            continue

        ### For each variant specified, determine if there is an amino
        ### acid change in the protein when changing from the reference
        ### base at that position, to this "mutant" base
        refBase = sequence[positionInSequence]

        ### Don't bother calculating a matching base, or deletions
        candidates = [v for v in variants if v not in (refBase, '-')]
        if not candidates:
            continue

        ### The protein is read from the reverse complement on the '-'
        ### strand, so the base at forward offset p sits at offset
        ### len - 1 - p of the translated sequence.
        if onMinusStrand:
            translatedPos = len(sequence) - 1 - positionInSequence
        else:
            translatedPos = positionInSequence
        ### Floor division keeps this an integer index under true division.
        aminopos = translatedPos // 3

        ### The reference translation does not depend on the variant.
        mrna = Seq(sequence, alphabet)
        if onMinusStrand:
            mrna = mrna.reverse_complement()
        protein = translator.translate(mrna)

        ### These two warning flags are here temporarily so we can get an
        ### idea of how often this happens.  Stitched-together coding
        ### sequences are sometimes not divisible by three, so the
        ### remaining bases get truncated on translation.  It is a more
        ### specific problem when the SNP of interest is actually one of
        ### the truncated bases!
        if len(mrna) % 3 != 0:
            warnings.add('MRNA_LENGTH_NOT_3_MULTIPLE')
        if len(protein) <= aminopos:
            warnings.add('PROTEIN_LENGTH_LESS_THAN_AMINOPOS')
            continue
        refAmino = protein[aminopos]

        for variant in candidates:
            mutated = (sequence[:positionInSequence] + variant +
                       sequence[positionInSequence + 1:])
            mrna2 = Seq(mutated, alphabet)
            if onMinusStrand:
                mrna2 = mrna2.reverse_complement()
            protein2 = translator.translate(mrna2)

            varAmino = protein2[aminopos]
            if varAmino != refAmino:
                changes.append({
                    'REF_BASE': refBase,
                    'VARIANT_BASE': variant,
                    'AMINOPOS': aminopos + 1,
                    'VARIANT_POS': positionInSequence,
                    'CODING_LENGTH': len(sequence),
                    'PROTEIN_LENGTH': len(protein),
                    'REF_AMINO': refAmino,
                    'VAR_AMINO': varAmino,
                })

    return changes, warnings