コード例 #1
0
def get_maxent(df):
    """Score the 5' splice-site window around both donor positions of each row.

    For every index label of ``df`` a 9-character window is cut from
    ``df.sequence`` (3 characters before the donor position through 6 after)
    and passed to ``maxent.score5``.

    Args:
        df: DataFrame providing the columns ``sequence``, ``donor1`` and
            ``donor2``.

    Returns:
        A DataFrame sharing ``df``'s index with the columns ``maxent5first``
        (window around ``donor1``) and ``maxent5second`` (window around
        ``donor2``).
    """

    def _donor_score(label, donor_column):
        # Look the sequence up first, then the donor position, then slice.
        seq = df.sequence[label]
        donor = int(getattr(df, donor_column)[label])
        return maxent.score5(str(seq[donor - 3:donor + 6]))

    mxnt = pd.DataFrame(index=df.index)
    for out_column, donor_column in (('maxent5first', 'donor1'),
                                     ('maxent5second', 'donor2')):
        mxnt[out_column] = df.index.map(
            lambda label, col=donor_column: _donor_score(label, col))
    return mxnt
コード例 #2
0
ファイル: splicing.py プロジェクト: liserjrqlxue/autopvs1
    def __calculate_maxentscore(self):
        """
        --- Calculate the MaxEntScan score ---
        Scores ``self.refseq`` and ``self.altseq`` and stores the results on
        the instance as ``maxentscore_ref``, ``maxentscore_alt`` and
        ``maxent_foldchange`` (alt / ref), each rounded to two decimals.

        Donor sites are scored only when both sequences are 9 long, acceptor
        sites only when both are 23 long.  In every other case both scores
        keep the placeholder -1.00, which makes the fold change 1.0.

        Interpretation note kept from the original author: if the WT score is
        above the threshold and the variation between WT and mutant is under
        -10% for HSF (-30% for MaxEnt), the mutation is considered to break
        the splice site; if the WT score is under the threshold and the
        variation is above +10% for HSF (+30% for MaxEnt), it is considered
        to create a new splice site.  Those thresholds are not applied here.
        """
        site_type = self.type
        ref_score = alt_score = -1.00

        if site_type == 'donor' and len(self.refseq) == 9 == len(self.altseq):
            ref_score = maxent.score5(self.refseq, matrix=matrix5)
            alt_score = maxent.score5(self.altseq, matrix=matrix5)
        elif site_type == 'acceptor' and \
                len(self.refseq) == 23 == len(self.altseq):
            ref_score = maxent.score3(self.refseq, matrix=matrix3)
            alt_score = maxent.score3(self.altseq, matrix=matrix3)

        fold_change = alt_score / ref_score

        self.maxentscore_ref = round(ref_score, 2)
        self.maxentscore_alt = round(alt_score, 2)
        self.maxent_foldchange = round(fold_change, 2)
コード例 #3
0
    def check5Prime(self, sequence, length, sequenceStart, mutationStart,
                    mutationTuple, matrix5):
        """Find the best-scoring 5' windows around a single-base mutation.

        Slides a window of ``length`` characters across the mutated position
        and scores each window twice with ``maxent.score5``: once with
        ``mutationTuple[0]`` at the mutated position (wild type) and once
        with ``mutationTuple[1]`` (mutant).  The highest-scoring window of
        each kind is reported.

        Args:
            sequence: Sequence string containing the mutated position.
            length: Window length handed to ``maxent.score5``.
            sequenceStart: Coordinate of ``sequence[0]``.
            mutationStart: Coordinate of the mutated base, in the same
                coordinate system as ``sequenceStart``.
            mutationTuple: Pair whose first item is placed in the wild-type
                window and whose second item is placed in the mutant window
                (both upper-cased).
            matrix5: Scoring matrix forwarded to ``maxent.score5``.

        Returns:
            Tuple ``(wtStart, wtSequence, wtScore, muStart, muSequence,
            muScore)``.  Start values are offset by ``sequenceStart``.  If no
            window could be scored, the sequences are ``None``, the scores
            are -99.0 and the starts equal ``sequenceStart``.
        """
        wtMaxScore = -99.0
        muMaxScore = -99.0
        wtMaxStart = 0
        muMaxStart = 0
        wtMaxSequence = None
        muMaxSequence = None
        mutationOffset = mutationStart - sequenceStart
        mutatedSequence = sequence[:mutationOffset] + mutationTuple[1].upper(
        ) + sequence[mutationOffset + 1:]

        for i in range(length, -1, -1):
            start = mutationOffset - i
            if start < 0:
                # The window would begin before the sequence.  A negative
                # slice index wraps around to the end of the string, so the
                # window is skipped explicitly instead of being scored.
                continue
            end = start + length
            wtSequence = sequence[start:mutationOffset] + mutationTuple[
                0].upper() + sequence[mutationOffset + 1:end]
            muSequence = mutatedSequence[start:end].strip()
            try:
                wtSequenceScore = maxent.score5(wtSequence, matrix5)
                muSequenceScore = maxent.score5(muSequence, matrix5)
            except Exception:
                # Best effort: a window the scorer rejects is skipped.
                # Catching Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                continue
            if wtSequenceScore > wtMaxScore:
                wtMaxScore = wtSequenceScore
                wtMaxStart = start
                wtMaxSequence = wtSequence
            if muSequenceScore > muMaxScore:
                muMaxScore = muSequenceScore
                muMaxStart = start
                muMaxSequence = muSequence
        return (wtMaxStart + sequenceStart, wtMaxSequence, wtMaxScore,
                muMaxStart + sequenceStart, muMaxSequence, muMaxScore)
コード例 #4
0
ファイル: test.py プロジェクト: la466/ess_stops
def get_sequences(length):
    """Write exon/intron junction sequences and collect splice-site decoys.

    Steps:
      1. Read exons from ``exons_file`` and keep, per (transcript, exon
         number), the first sequence longer than ``length``.
      2. Read introns from ``introns_file`` and keep, per (transcript,
         intron number), the first sequence whose transcript has a kept exon.
      3. For every exon with an intron of the same number, write the last
         ``length`` exon characters (lower-cased) followed by the first
         ``length`` intron characters to ``output_file``.
      4. Re-read ``output_file``; for each junction, score the 9-character
         window around the sequence midpoint with ``maxent.score5`` and look
         downstream for the first shifted window scoring at least as high.
         Junctions with such a window are printed and written to
         ``decoy_file``.

    Args:
        length: Number of characters taken on each side of the junction.
    """
    # 1. First qualifying sequence per transcript / exon number.
    exon_names, exon_seqs = files.read_fasta(exons_file)
    exons = {}
    for idx, name in enumerate(exon_names):
        if len(exon_seqs[idx]) > length:
            label = name.split('(')[0].split('.')
            exons.setdefault(label[0], {}).setdefault(
                int(label[1]), exon_seqs[idx])

    # 2. First sequence per transcript / intron number, known transcripts only.
    intron_names, intron_seqs = files.read_fasta(introns_file)
    introns = {}
    for idx, name in enumerate(intron_names):
        if name.split('.')[0] in exons:
            label = name.split('(')[0].split('.')
            introns.setdefault(label[0], {}).setdefault(
                int(label[1].split('-')[0]), intron_seqs[idx])

    # 3. Junction sequences: exon tail (lower case) + intron head.
    with open(output_file, 'w') as outfile:
        for transcript, transcript_exons in exons.items():
            if transcript not in introns:
                continue
            transcript_introns = introns[transcript]
            for exon_id, exon_seq in transcript_exons.items():
                if exon_id not in transcript_introns:
                    continue
                outfile.write(">{0}.{1}\n{2}{3}\n".format(
                    transcript, exon_id, exon_seq[-length:].lower(),
                    transcript_introns[exon_id][:length]))

    # 4. Score the real site and search downstream for a decoy.
    matrix5 = load_matrix5()
    fasta = files.read_fasta(output_file)
    entries = {entry_id: fasta.sequences[idx]
               for idx, entry_id in enumerate(fasta.ids)}

    decoys = []
    for entry_id, seq in entries.items():
        splice_site = len(seq) // 2
        real_splice_site_max_ent = maxent.score5(
            seq[splice_site - 3:splice_site + 6], matrix=matrix5)

        for shift in range(1, len(seq) - splice_site - 5):
            window = seq[splice_site + shift - 3:splice_site + shift + 6]
            query = "{0}{1}".format(window[:3].lower(), window[3:])
            max_ent_score = maxent.score5(query, matrix=matrix5)
            if max_ent_score >= real_splice_site_max_ent:
                print(entry_id, real_splice_site_max_ent, shift, query,
                      max_ent_score)
                decoys.append(entry_id)
                # Only the first qualifying window per entry is recorded.
                break

    with open(decoy_file, "w") as outfile:
        for decoy_id in decoys:
            outfile.write(">{0}\n{1}\n".format(decoy_id, entries[decoy_id]))
コード例 #5
0
def get_maxent(df):
    """Score the 5' and 3' splice-site windows of each row.

    For every index label of ``df``:
      * ``maxent5`` is ``maxent.score5`` of the 9-character window from 3
        before ``exonend`` through 6 after it;
      * ``maxent3`` is ``maxent.score3`` of the 23-character window from 20
        before ``exonstart`` through 3 after it.

    Args:
        df: DataFrame providing the columns ``sequence``, ``exonend`` and
            ``exonstart``.

    Returns:
        A DataFrame sharing ``df``'s index with the columns ``maxent5`` and
        ``maxent3``.
    """

    def _window(label, position_column, before, after):
        # Look the sequence up first, then the anchor position, then slice.
        seq = df.sequence[label]
        anchor = int(getattr(df, position_column)[label])
        return str(seq[anchor - before:anchor + after])

    def _score5(label):
        return maxent.score5(_window(label, 'exonend', 3, 6))

    def _score3(label):
        return maxent.score3(_window(label, 'exonstart', 20, 3))

    mxnt = pd.DataFrame(index=df.index)
    mxnt['maxent5'] = df.index.map(_score5)
    mxnt['maxent3'] = df.index.map(_score3)
    return mxnt
コード例 #6
0
				# Tail of a loop body (header not visible here): scores the 5' and 3'
				# junctions described by `lig` and appends one TSV row to `fichsorti`.
				pos5=int(lig[1])-1 # 5' junction position; minus 1 because Python counts from zero
				pos3=int(lig[2])-1 # 3' junction position (same zero-based shift)
				
				
				# 5' window: 9 characters, from 3 before pos5 up to (not including) 6 after.
				d=int(pos5)-3
				fi=int(pos5)+6
				
				seq5=seq[d:fi] 

				# 3' window: 23 characters, from 20 before pos3 up to (not including) 3 after.
				deb=int(pos3)-20
				fin=int(pos3)+3
				seq3=seq[deb:fin]


				sco5=maxent.score5(seq5)

				sco3=maxent.score3(seq3)
				# Output row: the first four input fields, both windows, both scores.
				fichsorti.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(lig[0],lig[1],lig[2],lig[3],seq5,seq3,sco5,sco3))


# Close both handles explicitly.  The previous statements (`f.closed`,
# `fichsorti.close`) only read attributes and never closed anything, so the
# output file was left to be flushed whenever the interpreter got to it.
# Closing an already-closed file is a harmless no-op.
f.close()
fichsorti.close()







コード例 #7
0
        return 'pointmut'


# Label every variant using classify_variant.
fsm1rnavars['variantclass'] = fsm1rnavars.variant.apply(
    lambda x: classify_variant(x))

# Every class other than 'wt', 'multiple' and 'pointmut' is parsed as an
# integer intron length; those three classes become NaN.
fsm1rnavars['intronlength'] = fsm1rnavars.variantclass.apply(
    lambda x: int(x) if x not in ['wt', 'multiple', 'pointmut'] else np.nan)

# For introns longer than 20: slice the library sequence from 5 before the
# first '|'-separated field of the variant to 5 after the last field.
fsm1rnavars['intronseq'] = fsm1rnavars.index.map(
    lambda x: lib300.varseq[fsm1rnavars.libindex[x]][int(fsm1rnavars.variant[
        x].split('|')[0]) - 5:int(fsm1rnavars.variant[x].split('|')[-1]) + 5]
    if fsm1rnavars.intronlength[x] > 20 else np.nan)

# Best 5' score over the 9-character windows anchored at offsets 3..7 from
# the start of intronseq (only for values whose string form exceeds 28).
fsm1rnavars['donorstrength'] = fsm1rnavars.intronseq.map(
    lambda x: np.max([maxent.score5(x[i - 3:i + 6]) for i in range(3, 8)])
    if (len(str(x)) > 28) else np.nan)
# Best 3' score over the 23-character windows anchored at offsets 4..7 from
# the end of intronseq.
fsm1rnavars['acceptorstrength'] = fsm1rnavars.intronseq.map(
    lambda x: np.max([maxent.score3(x[-i - 20:-i + 3]) for i in range(4, 8)])
    if (len(str(x)) > 28) else np.nan)

# Reads supporting introns longer than 20, summed per library index.
rnacounts['numbersplicedreads_cum'] = fsm1rnavars[
    fsm1rnavars.intronlength > 20].groupby('libindex').numberreads.sum()
rnacounts['fractionnumbersplicedreads'] = rnacounts.index.map(
    lambda x: rnacounts.numbersplicedreads_cum[x] / rnacounts.numberreads_cum[
        x])
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: that chained in-place form acts on a temporary object and
# is not guaranteed to update rnacounts (it has no effect under pandas
# Copy-on-Write).
rnacounts['fractionnumbersplicedreads'] = rnacounts[
    'fractionnumbersplicedreads'].replace(to_replace=np.nan, value=0)

rnacounts.to_pickle('./mapping/RNA/rnacounts_min3_dlud_minlength10.pkl')
コード例 #8
0
ファイル: splicing.py プロジェクト: liserjrqlxue/autopvs1
    def cryptic_splice_site(self):
        """
        Search for a cryptic splice site near the reference splice site.

        Candidate positions are visited outward from ``self.refseq_start``,
        alternating one step upstream and one step downstream, for up to 50
        positions on each side.  At each position the genomic context (9
        characters for a donor, 23 for an acceptor) is fetched, patched with
        ``self.alt`` when the variant falls inside it, reverse-complemented
        on the '-' strand, formatted, and scored with MaxEnt (score 0 if the
        formatted context has the wrong length).

        A position is a hit when the dinucleotide of its context (characters
        3-4 for a donor, 18-19 for an acceptor) is the canonical one
        ('GT' / 'AG') or equals the reference one, and its score reaches
        either the absolute threshold or ``self.percent_threshold`` relative
        to the reference score.

        Background from the original author: consensus values go from 0 to
        100 for HSF and -20 to +20 for MaxEnt; the threshold is defined at 65
        for HSF and 3 for MaxEnt, every signal above it being considered a
        splice site.
        Cite: http://www.umd.be/HSF3/technicaltips.html

        Returns:
            ``(pos, splice_context, maxentscore)`` for the first hit, or
            ``(0, '', 0)`` when nothing qualifies.
        """
        refscore = self.maxentscore_ref
        chrom = self.chrom if 'chr' in self.chrom else 'chr' + self.chrom
        search_flank = 50

        # Interleave upstream and downstream candidates: -1, +1, -2, +2, ...
        anchor = self.refseq_start
        search_region = []
        for step in range(1, search_flank + 1):
            search_region.append(anchor - step)
            search_region.append(anchor + step)

        site_type = self.type
        if site_type == 'donor':
            width, dinucleotide, canonical = 9, slice(3, 5), 'GT'
        elif site_type == 'acceptor':
            width, dinucleotide, canonical = 23, slice(18, 20), 'AG'
        else:
            # Neither site type is ever scored.
            return 0, '', 0

        for pos in search_region:
            context = genome[chrom][pos:pos + width].seq
            alt_index = self.offset - pos - 1
            if 0 < alt_index < width:
                # Put the alternate allele into the window.
                context = context[:alt_index] + self.alt + \
                    context[alt_index + len(self.alt):width + 1 - len(self.alt)]
            if self.transcript.strand == '-':
                context = self.reverse_complement(context)

            if site_type == 'donor':
                context = self.format_donor(context)
                if len(context) == width:
                    score = maxent.score5(context, matrix=matrix5)
                else:
                    score = 0
            else:
                context = self.format_acceptor(context)
                if len(context) == width:
                    score = maxent.score3(context, matrix=matrix3)
                else:
                    score = 0

            if context[dinucleotide] not in (canonical,
                                             self.refseq[dinucleotide]):
                continue
            if site_type == 'donor':
                threshold = self.donor_threshold
            else:
                threshold = self.acceptor_threshold
            if score >= threshold or \
                    score / refscore >= self.percent_threshold:
                return pos, context, score
        return 0, '', 0
コード例 #9
0
def cal_score(ss5_seq, matrix5, min_score):
    """Score a 5' splice-site sequence and flag whether it passes a cutoff.

    Args:
        ss5_seq: Sequence handed to ``score5``.
        matrix5: Scoring matrix forwarded to ``score5``.
        min_score: Cutoff; the flag is False only when the score is strictly
            below it.

    Returns:
        Tuple ``(score, passed)``.
    """
    ss5 = score5(ss5_seq, matrix=matrix5)
    # `not (a < b)` rather than `a >= b` so that any score not ordered below
    # the cutoff still counts as passed.
    passed = not (ss5 < min_score)
    return (ss5, passed)
コード例 #10
0
def _score_fastatab(fastatab_path, scorefile, site_type, load_matrix,
                    score_func, dinucleotide_start, standard):
    """Score each line of one .fastatab file and write one TSV row per line."""
    scores = {}
    with open(fastatab_path, 'r') as fastatab:
        matrix = load_matrix()

        for line in fastatab:
            line = line.strip()
            if not line:
                # A blank line (e.g. at the end of the file) has no sequence
                # column and would otherwise raise IndexError.
                continue
            entry = line.split("\t")

            key = entry[0].split("(")[0]
            seq = entry[1].upper()
            dinucleotide = seq[dinucleotide_start:dinucleotide_start + 2]
            standard_dinucleotide = dinucleotide == standard

            scores[key] = {
                "seq": seq,
                # Sequences containing N are reported as "NA", not scored.
                "score": score_func(seq, matrix) if "N" not in seq else "NA",
                "dinucleotide": dinucleotide,
                "standard_dinucleotide": standard_dinucleotide,
            }

            scorefile.write("\t".join([
                site_type, key, seq,
                str(scores[key]["score"]), dinucleotide,
                str(standard_dinucleotide)
            ]) + "\n")
    return scores


def read_and_score_fasta(outdir,
                         species,
                         donor_dinucleotide_start=3,
                         acceptor_dinucleotide_start=18):
    """Score donor and acceptor splice-site sequences of one species.

    Reads ``<outdir>/<species>_donor.fastatab`` and
    ``<outdir>/<species>_acceptor.fastatab`` (tab-separated; first column a
    location label, second column the sequence), scores each upper-cased
    sequence with ``maxent.score5`` / ``maxent.score3`` and writes the results
    to ``<outdir>/<species>_donor_scores.tsv`` and
    ``<outdir>/<species>_acceptor_scores.tsv``.

    Args:
        outdir: Directory holding the input files; outputs are written there.
        species: File-name prefix.
        donor_dinucleotide_start: Index of the donor dinucleotide in each
            donor sequence (compared against "GT").
        acceptor_dinucleotide_start: Index of the acceptor dinucleotide in
            each acceptor sequence (compared against "AG").

    Returns:
        ``(donor_dict, acceptor_dict)``.  Each maps the location label (text
        before the first "(") to a dict with the keys ``seq``, ``score``
        (``"NA"`` when the sequence contains N), ``dinucleotide`` and
        ``standard_dinucleotide``.

    Raises:
        OSError: If an input or output file cannot be opened.
        IndexError: If a non-blank input line has no second column.
    """
    header = "\t".join([
        "splice_site_type", "location", "seq", "score", "dinucleotide",
        "dinucleotide_is_standard"
    ]) + "\n"
    prefix = outdir + "/" + species

    # Both output files are context-managed so they are closed even when
    # reading or scoring raises part-way through.
    with open(prefix + "_acceptor_scores.tsv", 'w') as acceptor_scorefile, \
            open(prefix + "_donor_scores.tsv", 'w') as donor_scorefile:
        acceptor_scorefile.write(header)
        donor_scorefile.write(header)

        donor_dict = _score_fastatab(
            prefix + "_donor.fastatab", donor_scorefile, "donor",
            maxent.load_matrix5, maxent.score5, donor_dinucleotide_start,
            "GT")
        acceptor_dict = _score_fastatab(
            prefix + "_acceptor.fastatab", acceptor_scorefile, "acceptor",
            maxent.load_matrix3, maxent.score3, acceptor_dinucleotide_start,
            "AG")

    return donor_dict, acceptor_dict
コード例 #11
0
# Start with an empty string for every library index, then add in each
# per-chunk coverage pickle found in the raw-data directory.
rnareads = pd.Series([''], index=lib300.index).astype(str)

for filename in os.listdir('../rawdata/ir/'):
    if 'coveragePYTHON-' in filename:
        splitcov = pd.read_pickle('../rawdata/ir/' + filename)
        rnareads = rnareads.add(splitcov)

rnareads.to_pickle('../rawdata/ir/rnareads.pkl')

rna_condition = analysis_functions.unbiased_mapping_ir(rnareads)
rna_condition_final = analysis_functions.prepare_rnadf_ir(rna_condition)
rna_condition_final.to_pickle('../rawdata/ir/rna_from_unbiased_mapping.pkl')
rna_condition_final.to_csv('../rawdata/ir/rna_from_unbiased_mapping.csv')

irdf = pd.read_pickle('../rawdata/ir/rna_from_unbiased_mapping.pkl')
# 5' score: 9-character window from 3 before intronstart_varseq to 6 after.
irdf['maxent5'] = irdf.index.map(lambda x: maxent.score5(irdf.varseq162[x][int(
    irdf.intronstart_varseq[x]) - 3:int(irdf.intronstart_varseq[x]) + 6]))
# 3' score: 23-character window from 20 before intronend_varseqnew to 3 after.
irdf['maxent3'] = irdf.index.map(lambda x: maxent.score3(irdf.varseq162[x][int(
    irdf.intronend_varseqnew[x]) - 20:int(irdf.intronend_varseqnew[x]) + 3]))
irdf['maxentadd'] = irdf.index.map(lambda x: irdf.maxent5[x] + irdf.maxent3[x])

# The two folds below used to look every value up through
# irdf[irdf.intronstart_varseq > 24], rebuilding that filtered frame three
# times per row.  The conditional already restricts the fold to rows with
# intronstart_varseq > 24, so the lookups read irdf directly.
# RNA.fold(...)[1] is the second item of the fold result -- presumably the
# folding energy; confirm against the RNA package.

# 24 characters immediately before the intron start.
irdf['exon1'] = irdf.index.map(
    lambda x: RNA.fold(irdf.varseq162[x][
        int(irdf.intronstart_varseq[x]) - 24:
        int(irdf.intronstart_varseq[x])])[1]
    if (irdf.intronstart_varseq[x] > 24) else np.nan)
# 24 characters centred on the intron start (12 on each side).
irdf['donor'] = irdf.index.map(
    lambda x: RNA.fold(irdf.varseq162[x][
        int(irdf.intronstart_varseq[x]) - 12:
        int(irdf.intronstart_varseq[x]) + 12])[1]
    if (irdf.intronstart_varseq[x] > 24) else np.nan)
irdf['intron5'] = irdf.index.map(lambda x: RNA.fold(irdf[