def __init__(self, annotation, reference):
    """Load the exon annotation table, both splice matrices and the genome.

    Args:
        annotation: Base name of the annotation; the file read is
            ``<annotation>.exon.gff.tsv`` inside the ``ANNOTATION``
            directory.
        reference: Path handed unchanged to ``pyfaidx.Fasta``.
    """
    exon_table_path = os.path.join(ANNOTATION, annotation + '.exon.gff.tsv')
    # The table has no header row; column 0 is forced to str so pandas does
    # not infer a numeric dtype for it.
    self.annotation = pd.read_csv(
        exon_table_path,
        sep='\t',
        header=None,
        dtype={0: str},
    )
    self.matrix5 = load_matrix5()
    self.matrix3 = load_matrix3()
    self.reference = pyfaidx.Fasta(reference)
def __init__(self, side='5prime'):
    """Pick the matrix and scoring function for one splice-site side.

    Args:
        side: Either ``'5prime'`` or ``'3prime'``.

    Raises:
        Exception: If ``side`` is neither of the two accepted values.
    """
    if side not in ('5prime', '3prime'):
        raise Exception("side should be 5prime or 3prime")
    self.side = side

    # Only two values survive the check above, so the else branch is 5prime.
    if self.side == '3prime':
        self.matrix = load_matrix3()
        self.model = score3
    else:
        self.matrix = load_matrix5()
        self.model = score5
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect the 'AGGT' sites of one intron that pass every filter.

    Each 'AGGT' match in the intron sequence is kept only if it passes, in
    order: the distance check (``cal_distance``), the no-'N' check on the
    splice-site window, the score check (``cal_score`` with the 3' matrix)
    and the conservation check against the bigWig track.

    Args:
        options: Mapping that provides '--genome', '--bigwig',
            '--min-distance', '--min-score' and '--min-phastcons'.
        chrom: Chromosome name passed to the fasta and bigWig readers.
        start: Intron start coordinate.
        end: Intron end coordinate.
        strand: '+' selects the forward-strand arithmetic; any other value
            is handled as the minus strand.
        intron_info: Opaque value returned unchanged when sites are found.

    Returns:
        ``(intron_info, rs_list)`` when at least one site passes, where each
        entry is ``'pos|left_dist|right_dist|score|phastcons'``; otherwise
        ``(None, None)``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix3 = load_matrix3()
    phastcons_f = pyBigWig.open(options['--bigwig'])
    # The bigWig handle is opened by this call, so it is released here even
    # when option parsing or any helper raises.
    try:
        # parse options
        min_distance = int(options['--min-distance'])
        min_score = float(options['--min-score'])
        min_phastcons = float(options['--min-phastcons'])
        forward = strand == '+'
        # start to parse rs sites
        rs_list = []
        for m in re.finditer('AGGT', intron_fa):
            # Offsets in intron_fa count from `start` on '+' and from `end`
            # otherwise; +/-2 moves from the motif start to its middle.
            if forward:
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            # 20 bases on one side of pos and 3 on the other; the window is
            # mirrored for the minus strand.
            if forward:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 20, pos + 3))
            else:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 20),
                                     strand='-')
            if 'N' in ss3_seq:  # ensure there is no N
                continue
            ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
            if not score_flag:  # not high score
                continue
            # stats() returns one value per bin; None means no data there.
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                # not conserved
                continue
            rs_feature = '%d|%d|%d|%f|%f' % (pos, left_dist, right_dist,
                                             ss3, phastcons)
            rs_list.append(rs_feature)
    finally:
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
def findSpliceSites(self):
    """Evaluate 5' and 3' splice sites for each input row and print CSV.

    ``self.inputData`` is iterated line by line. The first line is the
    schema and is echoed with twelve extra column names appended. Every
    later line is a data row, for example::

        chr15:28235701-28235871,chr15:28235789-28235790,G,A,1921,682,...,attgggactgtgac...

    For each data row the results of ``self.check5Prime`` and
    ``self.check3Prime`` are stored in ``self.fivePrimeSites`` and
    ``self.threePrimeSites`` under the first column, and the row is echoed
    with elements 0-5 of each result appended.
    """
    extra_columns = (
        "wt5start", "wt5sequence", "wt5score",
        "mu5start", "mu5sequence", "mu5score",
        "wt3start", "wt3sequence", "wt3score",
        "mu3start", "mu3sequence", "mu3score",
    )
    # The matrices do not depend on the row, so they are loaded once, on the
    # first data row, instead of once per row. Header-only input still never
    # touches maxent.
    matrix5 = matrix3 = None
    for index, line in enumerate(self.inputData):
        row = line.strip()
        if index == 0:
            print(",".join((row,) + extra_columns))
            continue
        if matrix5 is None:
            matrix5 = maxent.load_matrix5()
            matrix3 = maxent.load_matrix3()
        splitLine = row.split(',')
        region = splitLine[0]
        # "chrom:start-end" -> start of the sequence window.
        sequenceStart = int(region.split(":")[1].split("-")[0])
        # NOTE(review): this takes the value after the '-' of column 1, i.e.
        # the interval end, despite the name - confirm this is intended.
        mutationStart = int(splitLine[1].split(":")[1].split("-")[1])
        mutationTuple = (splitLine[2], splitLine[3])
        sequence = splitLine[12].strip()
        five = self.check5Prime(
            sequence, self.fivePrimeSiteLength, sequenceStart,
            mutationStart, mutationTuple, matrix5)
        self.fivePrimeSites[region] = five
        three = self.check3Prime(
            sequence, self.threePrimeSiteLength, sequenceStart,
            mutationStart, mutationTuple, matrix3)
        self.threePrimeSites[region] = three
        # Index explicitly so a short result still raises, as before.
        appended = [str(five[k]) for k in range(6)]
        appended += [str(three[k]) for k in range(6)]
        print(",".join([row] + appended))
#!/usr/bin/env python # -*- coding:utf-8 -*- # author: Jiguang Peng # datetime: 2019/6/27 17:54 import itertools import re from pyhgvs.models import Transcript from maxentpy import maxent from maxentpy.maxent import load_matrix5, load_matrix3 from read_data import transcripts, genome, domain_bed, hotspot_bed, curated_region, pathogenic_dict from utils import contained_in_bed matrix5 = load_matrix5() matrix3 = load_matrix3() class Splicing: """ splice class """ donor_threshold = 3 acceptor_threshold = 3 percent_threshold = 0.7 def __init__(self, vcfrecord, transcript): self.chrom = vcfrecord.chrom self.offset = int(vcfrecord.pos) self.ref = vcfrecord.ref self.alt = vcfrecord.alt
def _score_fastatab(fastatab_path, scorefile, site_type, load_matrix,
                    score_func, dinucleotide_start, standard):
    """Score each sequence of one fastatab file and write one TSV row per site."""
    sites = {}
    with open(fastatab_path, 'r') as fastatab:
        # Loaded after the input opens so a missing file fails first.
        matrix = load_matrix()
        for line in fastatab:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no record.
                continue
            entry = stripped.split("\t")
            # Drop everything from the first "(" in the name column.
            key = entry[0].split("(")[0]
            seq = entry[1].upper()
            dinucleotide = seq[dinucleotide_start:dinucleotide_start + 2]
            standard_dinucleotide = dinucleotide == standard
            # Sequences containing N are not scored; "NA" marks them.
            score = score_func(seq, matrix) if "N" not in seq else "NA"
            sites[key] = {
                "seq": seq,
                "score": score,
                "dinucleotide": dinucleotide,
                "standard_dinucleotide": standard_dinucleotide,
            }
            scorefile.write("\t".join([
                site_type, key, seq, str(score), dinucleotide,
                str(standard_dinucleotide),
            ]) + "\n")
    return sites


def read_and_score_fasta(outdir, species, donor_dinucleotide_start=3,
                         acceptor_dinucleotide_start=18):
    """Score donor and acceptor sequences and write per-site score tables.

    Reads ``<outdir>/<species>_donor.fastatab`` and
    ``<outdir>/<species>_acceptor.fastatab`` (tab-separated: name, sequence)
    and writes ``<outdir>/<species>_donor_scores.tsv`` and
    ``<outdir>/<species>_acceptor_scores.tsv``. Blank input lines are
    skipped.

    Args:
        outdir: Directory holding the inputs and receiving the outputs.
        species: Prefix shared by all four file names.
        donor_dinucleotide_start: Index of the two-base donor dinucleotide
            within each donor sequence; it is compared with "GT".
        acceptor_dinucleotide_start: Index of the two-base acceptor
            dinucleotide within each acceptor sequence; it is compared with
            "AG".

    Returns:
        ``(donor_dict, acceptor_dict)``. Each maps a site key to a dict with
        the keys "seq", "score" ("NA" when the sequence contains N),
        "dinucleotide" and "standard_dinucleotide".
    """
    prefix = outdir + "/" + species
    header = "\t".join([
        "splice_site_type", "location", "seq", "score", "dinucleotide",
        "dinucleotide_is_standard",
    ]) + "\n"
    # Context managers close both outputs even if reading or scoring raises.
    with open(prefix + "_acceptor_scores.tsv", 'w') as acceptor_scorefile, \
            open(prefix + "_donor_scores.tsv", 'w') as donor_scorefile:
        acceptor_scorefile.write(header)
        donor_scorefile.write(header)
        donor_dict = _score_fastatab(
            prefix + "_donor.fastatab", donor_scorefile, "donor",
            maxent.load_matrix5, maxent.score5,
            donor_dinucleotide_start, "GT")
        acceptor_dict = _score_fastatab(
            prefix + "_acceptor.fastatab", acceptor_scorefile, "acceptor",
            maxent.load_matrix3, maxent.score3,
            acceptor_dinucleotide_start, "AG")
    return donor_dict, acceptor_dict