Ejemplo n.º 1
0
 def __init__(self, annotation, reference):
     """Load the exon annotation table, both score matrices and the reference.

     Args:
         annotation: Base name of the annotation; the table is read from
             ``<ANNOTATION>/<annotation>.exon.gff.tsv``.
         reference: Value handed unchanged to ``pyfaidx.Fasta``.
     """
     exon_table_path = os.path.join(ANNOTATION, annotation + '.exon.gff.tsv')
     # The table has no header row; column 0 is forced to str so pandas does
     # not infer a numeric dtype for it.
     self.annotation = pd.read_csv(
         exon_table_path,
         sep='\t',
         header=None,
         dtype={0: str},
     )
     self.matrix5 = load_matrix5()
     self.matrix3 = load_matrix3()
     self.reference = pyfaidx.Fasta(reference)
Ejemplo n.º 2
0
    def __init__(self, side='5prime'):
        """Pick the score matrix and scoring function for one side.

        Args:
            side: Either ``'5prime'`` or ``'3prime'``.

        Raises:
            Exception: If ``side`` is neither of the two accepted values.
        """
        if side not in ('5prime', '3prime'):
            raise Exception("side should be 5prime or 3prime")
        self.side = side
        is_five_prime = self.side == '5prime'
        # '5prime' pairs load_matrix5 with score5; anything else that passed
        # the check above is '3prime' and pairs load_matrix3 with score3.
        self.matrix = load_matrix5() if is_five_prime else load_matrix3()
        self.model = score5 if is_five_prime else score3
Ejemplo n.º 3
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect candidate sites at every 'AGGT' motif inside one intron.

    Each motif hit is kept only if it passes, in this order: the distance
    check (``cal_distance``), the no-``N`` check on the fetched window, the
    score check (``cal_score`` with the matrix from ``load_matrix3``) and the
    conservation check against the bigWig track.

    Args:
        options: Mapping providing ``'--genome'``, ``'--bigwig'``,
            ``'--min-distance'``, ``'--min-score'`` and ``'--min-phastcons'``.
        chrom: Chromosome name passed to the FASTA and bigWig lookups.
        start: Intron start coordinate.
        end: Intron end coordinate.
        strand: ``'+'`` selects the plus-strand arithmetic; any other value
            is handled as the minus strand.
        intron_info: Opaque value returned unchanged alongside the hits.

    Returns:
        ``(intron_info, rs_list)`` when at least one site passed every
        filter, where each entry of ``rs_list`` is the string
        ``'pos|left_dist|right_dist|score|phastcons'``; otherwise
        ``(None, None)``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix3 = load_matrix3()
    phastcons_f = pyBigWig.open(options['--bigwig'])
    try:
        # parse options
        min_distance = int(options['--min-distance'])
        min_score = float(options['--min-score'])
        min_phastcons = float(options['--min-phastcons'])
        on_plus = strand == '+'
        # start to parse rs sites
        rs_list = []
        for m in re.finditer('AGGT', intron_fa):
            # The +2 / -2 offset moves from the motif start to the point
            # between 'AG' and 'GT'; on the minus strand the match offset is
            # counted back from `end`.
            # NOTE(review): assumes intron_fa is oriented 5'->3' for `strand`.
            if on_plus:
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            # 23-base window: 20 bases on the upstream side and 3 on the
            # downstream side of `pos`, mirrored for the minus strand.
            if on_plus:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 20, pos + 3))
            else:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 20),
                                     strand='-')
            if 'N' in ss3_seq:  # ensure there is no N
                continue
            ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
            if not score_flag:  # not high score
                continue
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            # stats() yields None when the track has no data for the range.
            if phastcons is None or phastcons < min_phastcons:  # not conserved
                continue
            rs_list.append('%d|%d|%d|%f|%f' % (pos, left_dist, right_dist,
                                               ss3, phastcons))
    finally:
        # Release the bigWig handle even when a lookup or conversion raises.
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
Ejemplo n.º 4
0
    def findSpliceSites(self):
        """Score every input record and print it with twelve extra columns.

        ``self.inputData`` is iterated line by line. The first line is the
        schema and is echoed with the twelve new column names appended. Every
        later line is a comma-separated record such as::

            chr15:28235701-28235871,chr15:28235789-28235790,G,A,1921,...,attggg...

        For each record the results of ``self.check5Prime`` and
        ``self.check3Prime`` are stored in ``self.fivePrimeSites`` and
        ``self.threePrimeSites`` under the record's first field, and their
        first six elements are appended to the printed line.
        """
        header_suffix = ",".join([
            "wt5start", "wt5sequence", "wt5score",
            "mu5start", "mu5sequence", "mu5score",
            "wt3start", "wt3sequence", "wt3score",
            "mu3start", "mu3sequence", "mu3score",
        ])
        # The matrices do not depend on the record, so load them once instead
        # of once per input line.
        matrix5 = maxent.load_matrix5()
        matrix3 = maxent.load_matrix3()
        for i, line in enumerate(self.inputData):
            record = line.strip()
            if i == 0:
                # first line is schema, every line after that is data
                print(record + "," + header_suffix)
                continue
            splitLine = record.split(',')
            region = splitLine[0]
            # First coordinate of the sequence interval ("chr:start-end").
            sequenceStart = int(region.split(":")[1].split("-")[0])
            # Second coordinate of the mutation interval.
            mutationStart = int(splitLine[1].split(":")[1].split("-")[1])
            mutationTuple = (splitLine[2], splitLine[3])
            sequence = splitLine[12].strip()
            five = self.check5Prime(
                sequence, self.fivePrimeSiteLength,
                sequenceStart, mutationStart, mutationTuple, matrix5)
            self.fivePrimeSites[region] = five
            three = self.check3Prime(
                sequence, self.threePrimeSiteLength,
                sequenceStart, mutationStart, mutationTuple, matrix3)
            self.threePrimeSites[region] = three
            # Index 0..5 explicitly so a short result still raises.
            fields = [record]
            fields.extend(str(five[k]) for k in range(6))
            fields.extend(str(three[k]) for k in range(6))
            print(",".join(fields))
Ejemplo n.º 5
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: Jiguang Peng
# datetime: 2019/6/27 17:54

import itertools
import re

from pyhgvs.models import Transcript
from maxentpy import maxent
from maxentpy.maxent import load_matrix5, load_matrix3
from read_data import transcripts, genome, domain_bed, hotspot_bed, curated_region, pathogenic_dict
from utils import contained_in_bed

# Both score matrices are loaded once, when this module is imported, and kept
# as module-level globals.
matrix5 = load_matrix5()
matrix3 = load_matrix3()


class Splicing:
    """Hold the variant fields copied from a VCF record.

    Attributes set by ``__init__``: ``chrom``, ``offset`` (the record's
    ``pos`` converted to ``int``), ``ref`` and ``alt``.
    """
    # Class-level cut-offs.
    # NOTE(review): their exact meaning and units are not visible in this
    # snippet; confirm against the methods that read them.
    donor_threshold = 3
    acceptor_threshold = 3
    percent_threshold = 0.7

    def __init__(self, vcfrecord, transcript):
        """Copy chromosome, position, ref and alt from ``vcfrecord``.

        Args:
            vcfrecord: Object exposing ``chrom``, ``pos``, ``ref`` and ``alt``.
            transcript: Accepted but not used in the visible code.
                NOTE(review): the snippet looks truncated; confirm.
        """
        chrom, pos = vcfrecord.chrom, vcfrecord.pos
        self.chrom = chrom
        self.offset = int(pos)
        self.ref, self.alt = vcfrecord.ref, vcfrecord.alt
Ejemplo n.º 6
0
def _score_fastatab(fastatab_path, scorefile, site_type, load_matrix,
                    score_site, dinucleotide_start, standard):
    """Score every sequence of one ``.fastatab`` file and log it to a TSV.

    Each input line is ``<name>\\t<sequence>``. One row is written to
    ``scorefile`` per line; the returned dict maps the name (truncated at the
    first ``"("``) to its ``seq``, ``score``, ``dinucleotide`` and
    ``standard_dinucleotide`` values.
    """
    scores = {}
    with open(fastatab_path, 'r') as file:
        matrix = load_matrix()
        for line in file:
            entry = line.strip().split("\t")
            key = entry[0].split("(")[0]
            seq = entry[1].upper()
            dinucleotide = seq[dinucleotide_start:dinucleotide_start + 2]
            standard_dinucleotide = dinucleotide == standard
            # Sequences containing N are not scored; "NA" is recorded instead.
            score = score_site(seq, matrix) if "N" not in seq else "NA"
            scores[key] = {
                "seq": seq,
                "score": score,
                "dinucleotide": dinucleotide,
                "standard_dinucleotide": standard_dinucleotide
            }
            scorefile.write("\t".join([
                site_type, key, seq,
                str(score), dinucleotide,
                str(standard_dinucleotide)
            ]) + "\n")
    return scores


def read_and_score_fasta(outdir,
                         species,
                         donor_dinucleotide_start=3,
                         acceptor_dinucleotide_start=18):
    """Score donor and acceptor sequences and write one TSV per site type.

    Reads ``<outdir>/<species>_donor.fastatab`` and
    ``<outdir>/<species>_acceptor.fastatab`` and writes
    ``<outdir>/<species>_donor_scores.tsv`` and
    ``<outdir>/<species>_acceptor_scores.tsv``. Donors are scored with
    ``maxent.score5`` and acceptors with ``maxent.score3``.

    Args:
        outdir: Directory holding the input files and receiving the outputs.
        species: File-name prefix shared by all four files.
        donor_dinucleotide_start: 0-based index of the dinucleotide inside a
            donor sequence; it is compared with ``"GT"``.
        acceptor_dinucleotide_start: 0-based index of the dinucleotide inside
            an acceptor sequence; it is compared with ``"AG"``.

    Returns:
        ``(donor_dict, acceptor_dict)``; each maps a location key to a dict
        with ``seq``, ``score`` (``"NA"`` when the sequence contains ``N``),
        ``dinucleotide`` and ``standard_dinucleotide``.
    """
    header = "\t".join([
        "splice_site_type", "location", "seq", "score", "dinucleotide",
        "dinucleotide_is_standard"
    ]) + "\n"
    prefix = outdir + "/" + species

    # Context managers close both score files even if reading or scoring
    # raises part-way through.
    with open(prefix + "_acceptor_scores.tsv", 'w') as acceptor_scorefile, \
            open(prefix + "_donor_scores.tsv", 'w') as donor_scorefile:
        acceptor_scorefile.write(header)
        donor_scorefile.write(header)

        donor_dict = _score_fastatab(
            prefix + "_donor.fastatab", donor_scorefile, "donor",
            maxent.load_matrix5, maxent.score5,
            donor_dinucleotide_start, "GT")
        acceptor_dict = _score_fastatab(
            prefix + "_acceptor.fastatab", acceptor_scorefile, "acceptor",
            maxent.load_matrix3, maxent.score3,
            acceptor_dinucleotide_start, "AG")

    return donor_dict, acceptor_dict