Beispiel #1
0
    def __calculate_maxentscore(self):
        """
        Calculate the MaxEntScan scores of the reference and alternate sites.

        A donor site is scored with ``maxent.score5`` when both
        ``self.refseq`` and ``self.altseq`` are exactly 9 characters long; an
        acceptor site is scored with ``maxent.score3`` when both are exactly
        23 characters long. In every other case both scores keep the
        placeholder value -1.00 (and the fold change is therefore 1.0).

        Interpretation of the result (original notes):
        When a mutation occurs, if the WT score is above the threshold and
        the score variation (between WT and Mutant) is under -10% for HSF
        (-30% for MaxEnt) we consider that the mutation breaks the splice
        site. In the other case, if the WT score is under the threshold and
        the score variation is above +10% for HSF (+30% for MaxEnt) we
        consider that the mutation creates a new splice site.

        Sets ``self.maxentscore_ref``, ``self.maxentscore_alt`` and
        ``self.maxent_foldchange``, each rounded to two decimals. The fold
        change is NaN when the reference score is exactly zero.
        """
        maxentscore_alt = maxentscore_ref = -1.00
        if self.type == 'donor':
            if len(self.refseq) == 9 and len(self.altseq) == 9:
                maxentscore_ref = maxent.score5(self.refseq, matrix=matrix5)
                maxentscore_alt = maxent.score5(self.altseq, matrix=matrix5)
        elif self.type == 'acceptor':
            if len(self.refseq) == 23 and len(self.altseq) == 23:
                maxentscore_ref = maxent.score3(self.refseq, matrix=matrix3)
                maxentscore_alt = maxent.score3(self.altseq, matrix=matrix3)

        if maxentscore_ref == 0:
            # The ratio is undefined for a zero reference score; report NaN
            # instead of failing with ZeroDivisionError.
            maxent_foldchange = float('nan')
        else:
            maxent_foldchange = maxentscore_alt / maxentscore_ref

        self.maxentscore_ref = round(maxentscore_ref, 2)
        self.maxentscore_alt = round(maxentscore_alt, 2)
        self.maxent_foldchange = round(maxent_foldchange, 2)
Beispiel #2
0
    def check3Prime(self, sequence, length, sequenceStart, mutationStart,
                    mutationTuple, matrix3):
        """
        Find the best-scoring 3' splice-site window around a one-base mutation.

        Windows of ``length`` characters are slid across the mutated position
        (from the window ending at the mutation to the window starting at
        it). Each window is scored with ``maxent.score3`` once for the
        wild-type sequence and once for the mutant sequence, and the maximum
        of each is tracked independently.

        Args:
            sequence: Sequence containing the mutated position.
            length: Window length handed to the scorer.
            sequenceStart: Coordinate of ``sequence[0]``; added back to the
                returned window starts.
            mutationStart: Coordinate of the mutated base, in the same
                coordinate system as ``sequenceStart``.
            mutationTuple: ``mutationTuple[0]`` is placed at the mutated
                position of the wild-type window, ``mutationTuple[1]`` at the
                mutated position of the mutant sequence. Exactly one character
                of ``sequence`` is replaced in both cases.
            matrix3: Scoring matrix passed through to ``maxent.score3``.

        Returns:
            ``(wtStart, wtSequence, wtScore, muStart, muSequence, muScore)``.
            When no window could be scored the sequences are ``None``, the
            scores are ``-99.0`` and both starts equal ``sequenceStart``.
        """
        wtMaxScore = -99.0
        muMaxScore = -99.0
        wtMaxStart = 0
        muMaxStart = 0
        wtMaxSequence = None
        muMaxSequence = None
        mutationOffset = mutationStart - sequenceStart
        mutatedSequence = (sequence[:mutationOffset] +
                           mutationTuple[1].upper() +
                           sequence[mutationOffset + 1:])

        for i in range(length, -1, -1):
            start = mutationOffset - i
            if start < 0:
                # The window would begin before the sequence; a negative slice
                # index would silently wrap around to the end of the string.
                continue
            end = start + length
            wtSequence = (sequence[start:mutationOffset] +
                          mutationTuple[0].upper() +
                          sequence[mutationOffset + 1:end])
            muSequence = mutatedSequence[start:end].strip()
            try:
                wtSequenceScore = maxent.score3(wtSequence, matrix3)
                muSequenceScore = maxent.score3(muSequence, matrix3)
            except (Exception, SystemExit):
                # Unscorable windows are skipped on purpose (best effort).
                # SystemExit is listed explicitly because the scorer appears
                # to exit on wrong-length input (NOTE(review): confirm for the
                # installed maxentpy version). KeyboardInterrupt is no longer
                # swallowed.
                continue
            if wtSequenceScore > wtMaxScore:
                wtMaxScore = wtSequenceScore
                wtMaxStart = start
                wtMaxSequence = wtSequence
            if muSequenceScore > muMaxScore:
                muMaxScore = muSequenceScore
                muMaxStart = start
                muMaxSequence = muSequence
        return (wtMaxStart + sequenceStart, wtMaxSequence, wtMaxScore,
                muMaxStart + sequenceStart, muMaxSequence, muMaxScore)
Beispiel #3
0
def get_maxent(df):
    """Score both candidate acceptor sites of every row with MaxEnt.

    For each index label of ``df`` the slice of ``df.sequence`` running from
    20 characters before to 3 characters after the acceptor position is
    passed (as ``str``) to ``maxent.score3``. This is done once for
    ``df.acceptor1`` and once for ``df.acceptor2``.

    Returns a new DataFrame on ``df``'s index with the columns
    ``maxent3first`` and ``maxent3second``.
    """

    def score_first(label):
        site = int(df.acceptor1[label])
        return maxent.score3(str(df.sequence[label][site - 20:site + 3]))

    def score_second(label):
        site = int(df.acceptor2[label])
        return maxent.score3(str(df.sequence[label][site - 20:site + 3]))

    scores = pd.DataFrame(index=df.index)
    scores['maxent3first'] = df.index.map(score_first)
    scores['maxent3second'] = df.index.map(score_second)
    return scores
Beispiel #4
0
def get_maxent(df):
    """Score the donor and acceptor site of every row with MaxEnt.

    For each index label of ``df``:
    * ``maxent5`` is ``maxent.score5`` of ``df.sequence`` sliced from 3
      characters before to 6 characters after ``df.exonend``;
    * ``maxent3`` is ``maxent.score3`` of ``df.sequence`` sliced from 20
      characters before to 3 characters after ``df.exonstart``.

    Returns a new DataFrame on ``df``'s index holding those two columns.
    """

    def donor_score(label):
        site = int(df.exonend[label])
        return maxent.score5(str(df.sequence[label][site - 3:site + 6]))

    def acceptor_score(label):
        site = int(df.exonstart[label])
        return maxent.score3(str(df.sequence[label][site - 20:site + 3]))

    scores = pd.DataFrame(index=df.index)
    scores['maxent5'] = df.index.map(donor_score)
    scores['maxent3'] = df.index.map(acceptor_score)
    return scores
Beispiel #5
0
				# 3' junction position, taken from the third field of the record
				# and shifted down by one (presumably 1-based -> 0-based; confirm
				# against the input format).
				pos3=int(lig[2])-1
				
				
				# Slice bounds of the donor context: 3 characters before pos5 up to
				# 6 characters after it (pos5 is set before this fragment).
				d=int(pos5)-3
				fi=int(pos5)+6
				
				seq5=seq[d:fi] 

				# Slice bounds of the acceptor context: 20 characters before pos3
				# up to 3 characters after it.
				deb=int(pos3)-20
				fin=int(pos3)+3
				seq3=seq[deb:fin]


				# MaxEnt score of the donor (5') context.
				sco5=maxent.score5(seq5)

				# MaxEnt score of the acceptor (3') context.
				sco3=maxent.score3(seq3)
				# One tab-separated output row: the first four input fields, both
				# context sequences, then both scores.
				fichsorti.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(lig[0],lig[1],lig[2],lig[3],seq5,seq3,sco5,sco3))


# Actually close both files: the bare attribute accesses `f.closed` and
# `fichsorti.close` evaluated to a value and did nothing, so buffered output
# was only flushed at interpreter exit. close() is a no-op on a file that is
# already closed.
f.close()
fichsorti.close()









Beispiel #6
0
def cal_score(ss3_seq, matrix3, min_score):
    """Score a 3' splice-site sequence and report whether it passes.

    Returns ``(score, passed)`` where ``score`` is the result of ``score3``
    with the given matrix and ``passed`` is ``False`` only when the score is
    strictly below ``min_score``.
    """
    site_score = score3(ss3_seq, matrix=matrix3)
    passed = not (site_score < min_score)
    return (site_score, passed)
# Class label of every variant, as returned by classify_variant.
fsm1rnavars['variantclass'] = fsm1rnavars.variant.apply(classify_variant)

# Numeric classes are intron lengths; the three symbolic classes have none.
fsm1rnavars['intronlength'] = fsm1rnavars.variantclass.apply(
    lambda x: int(x) if x not in ['wt', 'multiple', 'pointmut'] else np.nan)

# Intron sequence with 5 flanking characters on each side, cut out of the
# library sequence using the first and last '|'-separated fields of the
# variant string. Only computed for introns longer than 20.
fsm1rnavars['intronseq'] = fsm1rnavars.index.map(
    lambda x: lib300.varseq[fsm1rnavars.libindex[x]][
        int(fsm1rnavars.variant[x].split('|')[0]) - 5:
        int(fsm1rnavars.variant[x].split('|')[-1]) + 5]
    if fsm1rnavars.intronlength[x] > 20 else np.nan)

# Best 5' score over the five 9-character windows centred on i = 3..7.
# Missing sequences (NaN stringifies to 'nan', length 3) stay NaN.
fsm1rnavars['donorstrength'] = fsm1rnavars.intronseq.map(
    lambda x: np.max([maxent.score5(x[i - 3:i + 6]) for i in range(3, 8)])
    if (len(str(x)) > 28) else np.nan)
# Best 3' score over the four 23-character windows counted from the end of
# the sequence (i = 4..7).
fsm1rnavars['acceptorstrength'] = fsm1rnavars.intronseq.map(
    lambda x: np.max([maxent.score3(x[-i - 20:-i + 3]) for i in range(4, 8)])
    if (len(str(x)) > 28) else np.nan)

# Reads supporting spliced variants (intron length > 20), summed per library
# index, and their share of that index's reads.
rnacounts['numbersplicedreads_cum'] = fsm1rnavars[
    fsm1rnavars.intronlength > 20].groupby('libindex').numberreads.sum()
rnacounts['fractionnumbersplicedreads'] = rnacounts.index.map(
    lambda x: rnacounts.numbersplicedreads_cum[x] / rnacounts.numberreads_cum[
        x])
# Assign the result back: an in-place replace on a column selection is a
# chained modification that may act on a temporary copy and leave rnacounts
# unchanged (always so under pandas Copy-on-Write).
rnacounts['fractionnumbersplicedreads'] = rnacounts[
    'fractionnumbersplicedreads'].fillna(0)

rnacounts.to_pickle('./mapping/RNA/rnacounts_min3_dlud_minlength10.pkl')

fsm1rnavars.to_pickle('./mapping/RNA/fsm1rnavars_min3_dlud_minlength10.pkl')
Beispiel #8
0
    def cryptic_splice_site(self):
        """
        Search for a cryptic splice site near the reference site.

        Positions within 50 nt on either side of ``self.refseq_start`` are
        visited nearest-first, alternating upstream and downstream. At each
        position a 9-character (donor) or 23-character (acceptor) context is
        read from ``genome``, the alternate allele is substituted when it
        falls inside the window, the context is reverse-complemented for
        minus-strand transcripts, formatted and scored with MaxEnt. A
        context is accepted when it carries the canonical dinucleotide
        (``GT`` / ``AG``) or the one of the reference site, and its score
        reaches the absolute threshold or, relative to the reference score,
        the percent threshold.

        Background (original notes):
        1) nearby strong consensus splice sequence
        2) reconstitutes or disrupts in-frame splicing
        3) undergo NMD or not
        Consensus values go from 0 to 100 for HSF, -20 to +20 for MaxEnt.
        The threshold is defined at 65 for HSF, 3 for MaxEnt.
        This means that every signal with a score above the threshold is
        considered to be a splice site (donor or acceptor).
        Cite: http://www.umd.be/HSF3/technicaltips.html

        Returns:
            ``(pos, splice_context, maxentscore)`` of the first accepted
            position, or ``(0, '', 0)`` when none is found.
        """
        refscore = self.maxentscore_ref
        chrom = self.chrom if 'chr' in self.chrom else 'chr' + self.chrom
        search_flank = 50
        upstream = range(self.refseq_start - 1,
                         self.refseq_start - 1 - search_flank, -1)
        downstream = range(self.refseq_start + 1,
                           self.refseq_start + 1 + search_flank, 1)
        # Interleave so that the positions closest to the site come first.
        search_region = list(
            itertools.chain.from_iterable(zip(upstream, downstream)))

        def is_strong_enough(score, threshold):
            """Absolute threshold met, or relative threshold met vs. refscore."""
            if score >= threshold:
                return True
            # A reference score of zero makes the ratio undefined; treat the
            # relative criterion as not met instead of raising
            # ZeroDivisionError.
            return refscore != 0 and \
                score / refscore >= self.percent_threshold

        for pos in search_region:
            if self.type == 'donor':
                splice_context = genome[chrom][pos:pos + 9].seq
                alt_index = self.offset - pos - 1
                # NOTE(review): index 0 is excluded by the strict lower bound;
                # kept as in the original, confirm this is intended.
                if 0 < alt_index < 9:
                    splice_context = splice_context[:alt_index] + self.alt + \
                                     splice_context[alt_index + len(self.alt):10-len(self.alt)]
                if self.transcript.strand == '-':
                    splice_context = self.reverse_complement(splice_context)

                splice_context = self.format_donor(splice_context)
                if len(splice_context) == 9:
                    maxentscore = maxent.score5(splice_context, matrix=matrix5)
                else:
                    # Contexts of the wrong length cannot be scored.
                    maxentscore = 0
                if splice_context[3:5] in ['GT', self.refseq[3:5]] and \
                        is_strong_enough(maxentscore, self.donor_threshold):
                    return pos, splice_context, maxentscore

            elif self.type == 'acceptor':
                splice_context = genome[chrom][pos:pos + 23].seq
                alt_index = self.offset - pos - 1
                # NOTE(review): same strict lower bound as the donor branch.
                if 0 < alt_index < 23:
                    splice_context = splice_context[:alt_index] + self.alt + \
                                     splice_context[alt_index + len(self.alt):24-len(self.alt)]
                if self.transcript.strand == '-':
                    splice_context = self.reverse_complement(splice_context)

                splice_context = self.format_acceptor(splice_context)
                if len(splice_context) == 23:
                    maxentscore = maxent.score3(splice_context, matrix=matrix3)
                else:
                    # Contexts of the wrong length cannot be scored.
                    maxentscore = 0
                if splice_context[18:20] in ['AG', self.refseq[18:20]] and \
                        is_strong_enough(maxentscore, self.acceptor_threshold):
                    return pos, splice_context, maxentscore
        return 0, '', 0
Beispiel #9
0
def _score_fastatab(fastatab_path, scorefile, site_type, dinucleotide_start,
                    canonical_dinucleotide, load_matrix, score_site):
    """Score every record of one fastatab file and write one TSV row each.

    Each non-blank line is split on tabs: the first field (cut at the first
    ``(``) is the key, the second field (upper-cased) is the sequence.
    Sequences containing ``N`` are not scored and get the score ``"NA"``.

    Returns a dict mapping each key to its ``seq``, ``score``,
    ``dinucleotide`` and ``standard_dinucleotide`` entries.
    """
    scores = {}
    with open(fastatab_path, 'r') as file:
        matrix = load_matrix()
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            entry = stripped.split("\t")

            key = entry[0].split("(")[0]
            seq = entry[1].upper()
            dinucleotide = seq[dinucleotide_start:dinucleotide_start + 2]
            standard_dinucleotide = dinucleotide == canonical_dinucleotide

            scores[key] = {
                "seq": seq,
                "score": score_site(seq, matrix) if "N" not in seq else "NA",
                "dinucleotide": dinucleotide,
                "standard_dinucleotide": standard_dinucleotide,
            }

            scorefile.write("\t".join([
                site_type, key, seq,
                str(scores[key]["score"]), dinucleotide,
                str(standard_dinucleotide)
            ]) + "\n")
    return scores


def read_and_score_fasta(outdir,
                         species,
                         donor_dinucleotide_start=3,
                         acceptor_dinucleotide_start=18):
    """Score the donor and acceptor sequences of a species with MaxEnt.

    Reads ``<outdir>/<species>_donor.fastatab`` and
    ``<outdir>/<species>_acceptor.fastatab`` and writes
    ``<outdir>/<species>_donor_scores.tsv`` and
    ``<outdir>/<species>_acceptor_scores.tsv``, each with a header row
    followed by one row per input record.

    Args:
        outdir: Directory holding the input files; outputs go there too.
        species: File-name prefix of the inputs and outputs.
        donor_dinucleotide_start: Index of the donor dinucleotide within a
            donor sequence (compared with ``GT``).
        acceptor_dinucleotide_start: Index of the acceptor dinucleotide
            within an acceptor sequence (compared with ``AG``).

    Returns:
        ``(donor_dict, acceptor_dict)``, each mapping a record key to a dict
        with the keys ``seq``, ``score``, ``dinucleotide`` and
        ``standard_dinucleotide``.
    """
    prefix = outdir + "/" + species
    header = "\t".join([
        "splice_site_type", "location", "seq", "score", "dinucleotide",
        "dinucleotide_is_standard"
    ]) + "\n"

    # Context managers guarantee both outputs are closed even when reading
    # or scoring an input fails part-way through.
    with open(prefix + "_acceptor_scores.tsv", 'w') as acceptor_scorefile, \
            open(prefix + "_donor_scores.tsv", 'w') as donor_scorefile:
        acceptor_scorefile.write(header)
        donor_scorefile.write(header)

        donor_dict = _score_fastatab(
            prefix + "_donor.fastatab", donor_scorefile, "donor",
            donor_dinucleotide_start, "GT", maxent.load_matrix5,
            maxent.score5)
        acceptor_dict = _score_fastatab(
            prefix + "_acceptor.fastatab", acceptor_scorefile, "acceptor",
            acceptor_dinucleotide_start, "AG", maxent.load_matrix3,
            maxent.score3)

    return donor_dict, acceptor_dict
# Add every per-file coverage table found in the directory onto rnareads.
for filename in os.listdir('../rawdata/ir/'):
    if 'coveragePYTHON-' not in filename:
        continue
    splitcov = pd.read_pickle('../rawdata/ir/' + filename)
    rnareads = rnareads.add(splitcov)

rnareads.to_pickle('../rawdata/ir/rnareads.pkl')

# Run the project's unbiased mapping and store the prepared table in both
# pickle and CSV form.
rna_condition = analysis_functions.unbiased_mapping_ir(rnareads)
rna_condition_final = analysis_functions.prepare_rnadf_ir(rna_condition)
rna_condition_final.to_pickle('../rawdata/ir/rna_from_unbiased_mapping.pkl')
rna_condition_final.to_csv('../rawdata/ir/rna_from_unbiased_mapping.csv')

irdf = pd.read_pickle('../rawdata/ir/rna_from_unbiased_mapping.pkl')
# 5' score: slice from 3 characters before to 6 characters after the intron
# start of each variant sequence.
irdf['maxent5'] = irdf.index.map(
    lambda idx: maxent.score5(
        irdf.varseq162[idx][int(irdf.intronstart_varseq[idx]) - 3:
                            int(irdf.intronstart_varseq[idx]) + 6]))
# 3' score: slice from 20 characters before to 3 characters after the intron
# end of each variant sequence.
irdf['maxent3'] = irdf.index.map(
    lambda idx: maxent.score3(
        irdf.varseq162[idx][int(irdf.intronend_varseqnew[idx]) - 20:
                            int(irdf.intronend_varseqnew[idx]) + 3]))
# Sum of both splice-site scores.
irdf['maxentadd'] = irdf.index.map(
    lambda idx: irdf.maxent5[idx] + irdf.maxent3[idx])

# Element [1] of RNA.fold's result (presumably the minimum free energy in the
# ViennaRNA Python API; confirm) for windows around the intron start. Rows
# whose intron start is not above 24 get NaN.
#
# The row is read straight from irdf: the guard at the end of each lambda
# already ensures the row passes `intronstart_varseq > 24`, so re-filtering
# the whole frame three times per row only added quadratic cost.

# The 24 characters immediately upstream of the intron start.
irdf['exon1'] = irdf.index.map(
    lambda x: RNA.fold(irdf.varseq162[x][
        int(irdf.intronstart_varseq[x]) - 24:
        int(irdf.intronstart_varseq[x])
    ])[1] if (irdf.intronstart_varseq[x] > 24) else np.nan)
# The 24 characters centred on the intron start (12 on each side).
irdf['donor'] = irdf.index.map(
    lambda x: RNA.fold(irdf.varseq162[x][
        int(irdf.intronstart_varseq[x]) - 12:
        int(irdf.intronstart_varseq[x]) + 12
    ])[1] if (irdf.intronstart_varseq[x] > 24) else np.nan)
irdf['intron5'] = irdf.index.map(lambda x: RNA.fold(irdf[
    (irdf.intronstart_varseq > 24)].varseq162[x][int(irdf[
        (irdf.intronstart_varseq > 24)].intronstart_varseq[x]):int(irdf[