def get_cds(self, seq_dict):
    """
    Build the coding sequence (CDS) of this transcript as a string.

    Each exon in self.exon_intervals is intersected with the
    thick_start:thick_stop window and the overlapping pieces are joined in
    exon order, using seq_dict[self.chromosome] as the sequence source.
    The joined sequence is reverse complemented when self.strand is falsy,
    so the result is in 5'-3' transcript orientation.

    Returns "" when thick_start and thick_stop are both 0, which is treated
    as marking a non-coding transcript.
    """
    chrom_seq = seq_dict[self.chromosome]
    assert self.stop <= len(chrom_seq)
    cds_start, cds_stop = self.thick_start, self.thick_stop
    # make sure this isn't a non-coding gene
    if cds_start == cds_stop == 0:
        return ""
    pieces = []
    for exon in self.exon_intervals:
        # Clamp the exon to the CDS window; an exon with no overlap clamps
        # to an empty range and contributes nothing.
        lo = max(exon.start, cds_start)
        hi = min(exon.stop, cds_stop)
        if lo < hi:
            pieces.append(chrom_seq[lo:hi])
    plus_strand_cds = "".join(pieces)
    if self.strand:
        return plus_strand_cds
    return reverse_complement(plus_strand_cds)
def get_sequence(self, seq_dict, stranded=True):
    """
    Return the sequence of seq_dict[self.chromosome] between self.start and
    self.stop.

    With stranded left at True the result is in transcript orientation: the
    slice as-is when self.strand is True, its reverse complement when
    self.strand is False. If stranded is False, the + strand slice is
    returned regardless of self.strand.

    Returns None when stranded is not False and self.strand is neither True
    nor False.
    """
    want_plus = stranded is False or self.strand is True
    if not want_plus and self.strand is not False:
        # strand is neither True nor False: nothing to return
        return None
    region = seq_dict[self.chromosome][self.start:self.stop]
    if want_plus:
        return region
    return reverse_complement(region)
def bam_to_rec(read):
    """
    Format a pysam read as a four-line FASTQ record string.

    Reads flagged is_reverse have their sequence reverse complemented and
    their quality string reversed before formatting.
    https://www.biostars.org/p/6970/
    """
    # NOTE(review): presumably read.seq / read.qual can be None for some
    # records; that case is not handled here -- confirm against callers.
    if not read.is_reverse:
        return "@{}\n{}\n+\n{}\n".format(read.qname, read.seq, read.qual)
    bases, quals = read.seq, read.qual
    flipped_bases = reverse_complement(bases)
    flipped_quals = quals[::-1]
    return "@{}\n{}\n+\n{}\n".format(read.qname, flipped_bases, flipped_quals)
def get_mrna(self, seq_dict):
    """
    Return the mRNA sequence of this transcript as a string.

    The slices of seq_dict[self.chromosome] covered by each exon in
    self.exon_intervals are joined in exon order. The joined sequence is
    returned unchanged when self.strand is True and reverse complemented
    otherwise, giving 5'-3' transcript orientation.
    """
    chrom_seq = seq_dict[self.chromosome]
    assert self.stop <= len(chrom_seq)
    spliced = "".join(chrom_seq[exon.start:exon.stop] for exon in self.exon_intervals)
    if self.strand is not True:
        return reverse_complement(spliced)
    return spliced
def get_cds(self, seq_dict, in_frame=True):
    """
    Build the coding sequence (CDS) of this transcript as a string.

    Each exon in self.exon_intervals is intersected with the
    thick_start:thick_stop window and the overlapping pieces are joined in
    exon order, using seq_dict[self.chromosome] as the sequence source.
    The joined sequence is reverse complemented when self.strand is falsy,
    so the result is in 5'-3' transcript orientation.

    When in_frame is True, the leading
    find_offset(self.exon_frames, self.strand) characters are dropped from
    the oriented CDS before it is returned.

    Returns "" when thick_start and thick_stop are both 0, which is treated
    as marking a non-coding transcript.

    Overrides get_cds in GenePredTranscript to provide frame functionality.
    TODO: now loses the store-and-reuse functionality to avoid slicing
    offset sizes each time.
    """
    chrom_seq = seq_dict[self.chromosome]
    assert self.stop <= len(chrom_seq)
    cds_start, cds_stop = self.thick_start, self.thick_stop
    # make sure this isn't a non-coding gene
    if cds_start == cds_stop == 0:
        return ""
    pieces = []
    for exon in self.exon_intervals:
        # Clamp the exon to the CDS window; an exon with no overlap clamps
        # to an empty range and contributes nothing.
        lo = max(exon.start, cds_start)
        hi = min(exon.stop, cds_stop)
        if lo < hi:
            pieces.append(chrom_seq[lo:hi])
    oriented = "".join(pieces)
    if not self.strand:
        oriented = reverse_complement(oriented)
    if in_frame is not True:
        return oriented
    # Trim the leading bases reported by find_offset for the exon frames.
    skip = find_offset(self.exon_frames, self.strand)
    return oriented[skip:]