def __calculate_maxentscore(self):
    """
    Compute the MaxEntScan scores of the reference and alternate sequences.

    Three attributes are stored, each rounded to two decimals:
    ``maxentscore_ref``, ``maxentscore_alt`` and ``maxent_foldchange``
    (alt score divided by ref score).

    Donor sites are scored with ``maxent.score5`` when both sequences are
    exactly 9 nt long; acceptor sites with ``maxent.score3`` when both are
    exactly 23 nt long.  In every other case both scores keep the
    placeholder value -1.0, which yields a fold change of 1.0.

    Interpretation notes: when a mutation occurs, if the WT score is above
    the threshold and the score variation (between WT and mutant) is under
    -10% for HSF (-30% for MaxEnt), the mutation is considered to break the
    splice site.  Conversely, if the WT score is under the threshold and
    the score variation is above +10% for HSF (+30% for MaxEnt), the
    mutation is considered to create a new splice site.
    """
    ref_score = -1.00
    alt_score = -1.00

    # Pick the scorer lazily so nothing is looked up for unscorable input.
    scorer = None
    matrix = None
    site_type = self.type
    if site_type == 'donor':
        if len(self.refseq) == 9 and len(self.altseq) == 9:
            scorer, matrix = maxent.score5, matrix5
    elif site_type == 'acceptor':
        if len(self.refseq) == 23 and len(self.altseq) == 23:
            scorer, matrix = maxent.score3, matrix3

    if scorer is not None:
        ref_score = scorer(self.refseq, matrix=matrix)
        alt_score = scorer(self.altseq, matrix=matrix)

    # The ratio is taken on the unrounded scores, before anything is stored.
    fold_change = alt_score / ref_score

    self.maxentscore_ref = round(ref_score, 2)
    self.maxentscore_alt = round(alt_score, 2)
    self.maxent_foldchange = round(fold_change, 2)
def check3Prime(self, sequence, length, sequenceStart, mutationStart,
                mutationTuple, matrix3):
    """
    Find the best-scoring 3' splice-site window near a mutation, per allele.

    ``length + 1`` windows of ``length`` bases are tried.  The first one
    ends immediately before the mutated base and the last one starts on
    it.  Every window is scored with ``maxent.score3`` twice: once with the
    reference base and once with the alternate base at the mutated
    position.  The highest-scoring window is tracked independently for the
    wild-type and the mutant allele.

    Args:
        sequence: sequence string that contains the mutated position.
        length: window length handed to the scorer.
        sequenceStart: coordinate of ``sequence[0]``; added to the window
            offsets that are returned.
        mutationStart: coordinate of the mutated base, in the same
            coordinate system as ``sequenceStart``.
        mutationTuple: ``(ref_base, alt_base)``; both are upper-cased.
        matrix3: scoring matrix forwarded to ``maxent.score3``.

    Returns:
        ``(wt_start, wt_sequence, wt_score, mu_start, mu_sequence,
        mu_score)``.  When no window could be scored the sequences are
        ``None``, the scores are ``-99.0`` and both starts equal
        ``sequenceStart``.
    """
    wtMaxScore = -99.0
    muMaxScore = -99.0
    wtMaxStart = 0
    muMaxStart = 0
    wtMaxSequence = None
    muMaxSequence = None

    mutationOffset = mutationStart - sequenceStart
    refBase = mutationTuple[0].upper()
    altBase = mutationTuple[1].upper()
    mutatedSequence = (sequence[:mutationOffset] + altBase +
                       sequence[mutationOffset + 1:])

    for i in range(length, -1, -1):
        start = mutationOffset - i
        if start < 0:
            # A negative slice start would wrap around to the end of the
            # sequence and never describe a real window.
            continue
        end = start + length

        # The reference base is forced at the mutated position instead of
        # trusting whatever ``sequence`` carries there.
        wtSequence = (sequence[start:mutationOffset] + refBase +
                      sequence[mutationOffset + 1:end])
        muSequence = mutatedSequence[start:end].strip()

        try:
            wtSequenceScore = maxent.score3(wtSequence, matrix3)
            muSequenceScore = maxent.score3(muSequence, matrix3)
        except (Exception, SystemExit):
            # Best effort: a window that cannot be scored is skipped for
            # both alleles.  SystemExit is listed because some MaxEnt
            # implementations reject bad input through sys.exit
            # (NOTE(review): confirm for the scorer in use).
            # KeyboardInterrupt is deliberately no longer swallowed.
            continue

        if wtSequenceScore > wtMaxScore:
            wtMaxScore = wtSequenceScore
            wtMaxStart = start
            wtMaxSequence = wtSequence
        if muSequenceScore > muMaxScore:
            muMaxScore = muSequenceScore
            muMaxStart = start
            muMaxSequence = muSequence

    return (wtMaxStart + sequenceStart, wtMaxSequence, wtMaxScore,
            muMaxStart + sequenceStart, muMaxSequence, muMaxScore)
def get_maxent(df):
    """
    Score the two candidate acceptor sites of every row with MaxEntScan.

    For each index label of ``df`` a window is cut from ``df.sequence``,
    running from 20 positions before to 3 positions after the value in
    ``acceptor1`` (respectively ``acceptor2``), and is passed to
    ``maxent.score3`` as a ``str``.

    Returns:
        A DataFrame sharing ``df``'s index, with the columns
        ``maxent3first`` and ``maxent3second``.
    """

    def acceptor_score(label, column):
        # Window boundaries are relative to the integer site position.
        site = int(df[column][label])
        window = df.sequence[label][site - 20:site + 3]
        return maxent.score3(str(window))

    scores = pd.DataFrame(index=df.index)
    scores['maxent3first'] = df.index.map(
        lambda label: acceptor_score(label, 'acceptor1'))
    scores['maxent3second'] = df.index.map(
        lambda label: acceptor_score(label, 'acceptor2'))
    return scores
def get_maxent(df):
    """
    Score the donor and acceptor site of every row with MaxEntScan.

    For each index label of ``df``:

    * ``maxent5`` is ``maxent.score5`` of ``df.sequence`` sliced from 3
      positions before to 6 positions after ``exonend``;
    * ``maxent3`` is ``maxent.score3`` of ``df.sequence`` sliced from 20
      positions before to 3 positions after ``exonstart``.

    Returns:
        A DataFrame sharing ``df``'s index, with the columns ``maxent5``
        and ``maxent3``.
    """

    def donor_score(label):
        site = int(df.exonend[label])
        return maxent.score5(str(df.sequence[label][site - 3:site + 6]))

    def acceptor_score(label):
        site = int(df.exonstart[label])
        return maxent.score3(str(df.sequence[label][site - 20:site + 3]))

    scores = pd.DataFrame(index=df.index)
    scores['maxent5'] = df.index.map(donor_score)
    scores['maxent3'] = df.index.map(acceptor_score)
    return scores
pos3 = int(lig[2]) - 1  # 3' junction position (third field minus one)

# Donor window: 3 positions before to 6 positions after the 5' junction.
d = int(pos5) - 3
fi = int(pos5) + 6
seq5 = seq[d:fi]

# Acceptor window: 20 positions before to 3 positions after the 3' junction.
deb = pos3 - 20
fin = pos3 + 3
seq3 = seq[deb:fin]

sco5 = maxent.score5(seq5)
sco3 = maxent.score3(seq3)

# One output row: the four input fields, both windows, both scores.
fichsorti.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
    lig[0], lig[1], lig[2], lig[3], seq5, seq3, sco5, sco3))

# Actually close both files: the previous ``f.closed`` / ``fichsorti.close``
# were bare attribute lookups that did nothing, so buffered output was only
# flushed at interpreter exit.
# NOTE(review): these two calls must run once, after the last record has
# been written - keep them outside any enclosing loop.
f.close()
fichsorti.close()
def cal_score(ss3_seq, matrix3, min_score):
    """
    Score a 3' splice-site sequence and flag whether it reaches a minimum.

    Args:
        ss3_seq: sequence handed to ``score3``.
        matrix3: scoring matrix forwarded as ``matrix=``.
        min_score: lower bound for an accepted score.

    Returns:
        ``(score, accepted)`` where ``accepted`` is ``False`` when the
        score is below ``min_score`` and ``True`` otherwise.
    """
    strength = score3(ss3_seq, matrix=matrix3)
    accepted = not (strength < min_score)
    return (strength, accepted)
# --- Annotate every RNA variant -------------------------------------------

fsm1rnavars['variantclass'] = fsm1rnavars.variant.apply(classify_variant)

# Variant classes other than the three labels below are converted with int()
# and used as the intron length.
fsm1rnavars['intronlength'] = fsm1rnavars.variantclass.apply(
    lambda x: int(x) if x not in ['wt', 'multiple', 'pointmut'] else np.nan)


def _intron_with_flanks(idx):
    """Slice the library sequence of variant ``idx`` around its intron.

    The slice runs from 5 positions before the first '|'-separated field
    of the variant string to 5 positions after the last one.  Variants
    whose intron length is not above 20 (including NaN) yield NaN.
    """
    if not fsm1rnavars.intronlength[idx] > 20:
        return np.nan
    bounds = fsm1rnavars.variant[idx].split('|')
    varseq = lib300.varseq[fsm1rnavars.libindex[idx]]
    return varseq[int(bounds[0]) - 5:int(bounds[-1]) + 5]


def _best_donor_score(intron):
    """Return the highest score5 over the 9-nt windows at offsets 3..7."""
    if not len(str(intron)) > 28:
        return np.nan
    return np.max([maxent.score5(intron[i - 3:i + 6]) for i in range(3, 8)])


def _best_acceptor_score(intron):
    """Return the highest score3 over the 23-nt windows at end offsets 4..7."""
    if not len(str(intron)) > 28:
        return np.nan
    return np.max([maxent.score3(intron[-i - 20:-i + 3])
                   for i in range(4, 8)])


fsm1rnavars['intronseq'] = fsm1rnavars.index.map(_intron_with_flanks)
fsm1rnavars['donorstrength'] = fsm1rnavars.intronseq.map(_best_donor_score)
fsm1rnavars['acceptorstrength'] = fsm1rnavars.intronseq.map(
    _best_acceptor_score)

# --- Per-library spliced read fraction --------------------------------------

rnacounts['numbersplicedreads_cum'] = fsm1rnavars[
    fsm1rnavars.intronlength > 20].groupby('libindex').numberreads.sum()

# Column arithmetic replaces the per-label lambda.  The NaN replacement is
# assigned back explicitly: ``inplace=True`` on a column selection does not
# reliably update the parent frame (it has no effect under Copy-on-Write).
rnacounts['fractionnumbersplicedreads'] = (
    rnacounts.numbersplicedreads_cum / rnacounts.numberreads_cum
).replace(to_replace=np.nan, value=0)

rnacounts.to_pickle('./mapping/RNA/rnacounts_min3_dlud_minlength10.pkl')
fsm1rnavars.to_pickle('./mapping/RNA/fsm1rnavars_min3_dlud_minlength10.pkl')
def cryptic_splice_site(self):
    """
    Search the neighbourhood of the reference site for a cryptic splice site.

    Positions are visited outwards from ``self.refseq_start`` (-1, +1, -2,
    +2, ...) up to 50 positions on each side.  At every position the
    genomic context (9 nt for donors, 23 nt for acceptors) is fetched, the
    alternate allele is substituted when it falls inside the window, the
    context is reverse-complemented on the minus strand, formatted and
    scored with MaxEnt.  A position is reported when its dinucleotide is
    canonical ('GT' / 'AG') or equals the one of the reference sequence,
    and its score either reaches the absolute threshold or reaches
    ``self.percent_threshold`` relative to the reference score.

    Background notes:
        1) nearby strong consensus splice sequence
        2) reconstitutes or disrupts in-frame splicing
        3) undergo NMD or not
        Consensus values go from 0 to 100 for HSF, -20 to +20 for MaxEnt.
        The threshold is defined at 65 for HSF, 3 for MaxEnt.
        This means that every signal with a score above the threshold is
        considered to be a splice site (donor or acceptor).
        Cite: http://www.umd.be/HSF3/technicaltips.html

    Returns:
        ``(pos, splice_context, maxentscore)`` for the first position that
        qualifies, or ``(0, '', 0)`` when none does.
    """
    refscore = self.maxentscore_ref
    chrom = self.chrom if 'chr' in self.chrom else 'chr' + self.chrom
    search_flank = 50

    def strong_enough(score, threshold):
        # Absolute criterion first; the relative one is undefined for a
        # reference score of 0 and used to raise ZeroDivisionError.
        if score >= threshold:
            return True
        if refscore == 0:
            return False
        return score / refscore >= self.percent_threshold

    # Closest positions first, alternating upstream / downstream.
    origin = self.refseq_start
    search_region = [pos
                     for distance in range(1, search_flank + 1)
                     for pos in (origin - distance, origin + distance)]

    for pos in search_region:
        if self.type == 'donor':
            splice_context = genome[chrom][pos:pos + 9].seq
            alt_index = self.offset - pos - 1
            if 0 < alt_index < 9:
                # NOTE(review): the end bound 10 - len(alt) is kept as is;
                # confirm it is the intended trim for multi-base alleles.
                splice_context = splice_context[:alt_index] + self.alt + \
                    splice_context[alt_index + len(self.alt):10 - len(self.alt)]
            if self.transcript.strand == '-':
                splice_context = self.reverse_complement(splice_context)
            splice_context = self.format_donor(splice_context)
            if len(splice_context) == 9:
                maxentscore = maxent.score5(splice_context, matrix=matrix5)
            else:
                maxentscore = 0
            if splice_context[3:5] in ['GT', self.refseq[3:5]] and \
                    strong_enough(maxentscore, self.donor_threshold):
                return pos, splice_context, maxentscore

        elif self.type == 'acceptor':
            splice_context = genome[chrom][pos:pos + 23].seq
            alt_index = self.offset - pos - 1
            if 0 < alt_index < 23:
                # NOTE(review): same remark as above for 24 - len(alt).
                splice_context = splice_context[:alt_index] + self.alt + \
                    splice_context[alt_index + len(self.alt):24 - len(self.alt)]
            if self.transcript.strand == '-':
                splice_context = self.reverse_complement(splice_context)
            splice_context = self.format_acceptor(splice_context)
            if len(splice_context) == 23:
                maxentscore = maxent.score3(splice_context, matrix=matrix3)
            else:
                maxentscore = 0
            if splice_context[18:20] in ['AG', self.refseq[18:20]] and \
                    strong_enough(maxentscore, self.acceptor_threshold):
                return pos, splice_context, maxentscore

    return 0, '', 0
def _score_fastatab(fastatab_path, scorefile, site_type, dinucleotide_start):
    """Score every record of one tab-separated FASTA file.

    Each non-blank line is expected to hold ``name<TAB>sequence``.  The
    part of the name before the first "(" is used as key.  One row per
    record is appended to ``scorefile`` and the same information is
    returned as ``{key: {"seq", "score", "dinucleotide",
    "standard_dinucleotide"}}``.  ``site_type`` is "donor" (score5, 'GT')
    or "acceptor" (score3, 'AG').
    """
    scored = {}
    with open(fastatab_path, 'r') as fastatab:
        # The matrix is loaded once per file, only after the input opened.
        if site_type == "donor":
            matrix = maxent.load_matrix5()
            score_site = maxent.score5
            canonical = "GT"
        else:
            matrix = maxent.load_matrix3()
            score_site = maxent.score3
            canonical = "AG"

        for line in fastatab:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            entry = stripped.split("\t")
            key = entry[0].split("(")[0]
            seq = entry[1].upper()
            dinucleotide = seq[dinucleotide_start:dinucleotide_start + 2]
            standard_dinucleotide = dinucleotide == canonical
            scored[key] = {
                "seq": seq,
                # Sequences containing N are not scored.
                "score": score_site(seq, matrix) if "N" not in seq else "NA",
                "dinucleotide": dinucleotide,
                "standard_dinucleotide": standard_dinucleotide
            }
            scorefile.write("\t".join([
                site_type, key, seq, str(scored[key]["score"]),
                dinucleotide, str(standard_dinucleotide)
            ]) + "\n")
    return scored


def read_and_score_fasta(outdir, species, donor_dinucleotide_start=3,
                         acceptor_dinucleotide_start=18):
    """
    Score the donor and acceptor sequences of ``species`` with MaxEnt.

    Reads ``<outdir>/<species>_donor.fastatab`` and
    ``<outdir>/<species>_acceptor.fastatab`` and writes
    ``<outdir>/<species>_donor_scores.tsv`` and
    ``<outdir>/<species>_acceptor_scores.tsv``.  Both output files are
    created (with their header) before any input is read and are closed
    even when reading or scoring fails.

    Args:
        outdir: directory holding the input files and receiving the output.
        species: file name prefix.
        donor_dinucleotide_start: index of the dinucleotide inside a donor
            sequence.
        acceptor_dinucleotide_start: index of the dinucleotide inside an
            acceptor sequence.

    Returns:
        ``(donor_dict, acceptor_dict)``; each maps the record key to a dict
        with the keys "seq", "score", "dinucleotide" and
        "standard_dinucleotide".
    """
    header = "\t".join([
        "splice_site_type", "location", "seq", "score", "dinucleotide",
        "dinucleotide_is_standard"
    ]) + "\n"
    prefix = outdir + "/" + species

    with open(prefix + "_acceptor_scores.tsv", 'w') as acceptor_scorefile, \
            open(prefix + "_donor_scores.tsv", 'w') as donor_scorefile:
        acceptor_scorefile.write(header)
        donor_scorefile.write(header)

        donor_dict = _score_fastatab(
            prefix + "_donor.fastatab", donor_scorefile, "donor",
            donor_dinucleotide_start)
        acceptor_dict = _score_fastatab(
            prefix + "_acceptor.fastatab", acceptor_scorefile, "acceptor",
            acceptor_dinucleotide_start)

    return donor_dict, acceptor_dict
for filename in os.listdir('../rawdata/ir/'): if ('coveragePYTHON-' in filename): splitcov = pd.read_pickle('../rawdata/ir/' + filename) rnareads = rnareads.add(splitcov) rnareads.to_pickle('../rawdata/ir/rnareads.pkl') rna_condition = analysis_functions.unbiased_mapping_ir(rnareads) rna_condition_final = analysis_functions.prepare_rnadf_ir(rna_condition) rna_condition_final.to_pickle('../rawdata/ir/rna_from_unbiased_mapping.pkl') rna_condition_final.to_csv('../rawdata/ir/rna_from_unbiased_mapping.csv') irdf = pd.read_pickle('../rawdata/ir/rna_from_unbiased_mapping.pkl') irdf['maxent5'] = irdf.index.map(lambda x: maxent.score5(irdf.varseq162[x][int( irdf.intronstart_varseq[x]) - 3:int(irdf.intronstart_varseq[x]) + 6])) irdf['maxent3'] = irdf.index.map(lambda x: maxent.score3(irdf.varseq162[x][int( irdf.intronend_varseqnew[x]) - 20:int(irdf.intronend_varseqnew[x]) + 3])) irdf['maxentadd'] = irdf.index.map(lambda x: irdf.maxent5[x] + irdf.maxent3[x]) irdf['exon1'] = irdf.index.map(lambda x: RNA.fold(irdf[ (irdf.intronstart_varseq > 24)].varseq162[x][int(irdf[ (irdf.intronstart_varseq > 24)].intronstart_varseq[x]) - 24:int(irdf[ (irdf.intronstart_varseq > 24)].intronstart_varseq[x])])[1] if (irdf.intronstart_varseq[x] > 24) else np.nan) irdf['donor'] = irdf.index.map(lambda x: RNA.fold(irdf[ (irdf.intronstart_varseq > 24)].varseq162[x][int(irdf[ (irdf.intronstart_varseq > 24)].intronstart_varseq[x]) - 12:int(irdf[ (irdf.intronstart_varseq > 24)].intronstart_varseq[x]) + 12])[1] if (irdf.intronstart_varseq[x] > 24) else np.nan) irdf['intron5'] = irdf.index.map(lambda x: RNA.fold(irdf[ (irdf.intronstart_varseq > 24)].varseq162[x][int(irdf[ (irdf.intronstart_varseq > 24)].intronstart_varseq[x]):int(irdf[