def __set_fasta_seq(self, fasta_seq):
    if isinstance(fasta_seq, Seq.Seq):
        self.__fasta_seq = pyfaidx.Sequence(name=self.id, seq=str(fasta_seq))
    elif isinstance(fasta_seq, str):
        self.__fasta_seq = pyfaidx.Sequence(name=self.id, seq=fasta_seq)
        assert len(self.__fasta_seq) == len(fasta_seq)
    elif isinstance(fasta_seq, pyfaidx.Sequence):
        self.__fasta_seq = fasta_seq
    else:
        raise ValueError("Unknown type: {}".format(type(fasta_seq)))
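# Minimal standalone sketch (not part of the class above): the three input types
# the setter accepts all end up as an equivalent pyfaidx.Sequence. The name "tid1"
# and the sequence string are placeholders for illustration only.
import pyfaidx
from Bio import Seq

raw = "ATGCGTAA"
from_str = pyfaidx.Sequence(name="tid1", seq=raw)
from_biopython = pyfaidx.Sequence(name="tid1", seq=str(Seq.Seq(raw)))
already_pyfaidx = pyfaidx.Sequence(name="tid1", seq=raw)

# All three routes yield the same underlying sequence string.
assert from_str.seq == from_biopython.seq == already_pyfaidx.seq == raw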
def tests():
    seq_a = pyfaidx.Sequence('contig1', 'ATGCGT', start=1, end=6)
    print(seq_a, seq_a.start, seq_a.end)

    seq_b = pyfaidx.Sequence('contig1', 'GCGAGT', start=13, end=18)
    seq_a = connectFragment(seq_a, seq_b)  # Connect fragment after original
    print(seq_a, seq_a.start, seq_a.end)

    # Wrong contig
    seq_c = pyfaidx.Sequence('different_contig1', 'CCCCCC', start=7, end=12)
    seq_a = connectFragment(seq_a, seq_c)

    seq_c = pyfaidx.Sequence('contig1', 'CCCCCC', start=7, end=12)
    seq_a = connectFragment(seq_a, seq_c)  # Substitute fragment in centre
    print(seq_a, seq_a.start, seq_a.end)

    seq_a = pyfaidx.Sequence('contig1', 'ATGCGT', start=1, end=6)
    seq_b = pyfaidx.Sequence('contig1', 'GCGAGT', start=13, end=18)
    seq_b.percentIdentity = 97.1
    seq_b = connectFragment(seq_b, seq_a)  # Connect fragment before original
    print(seq_b, seq_b.start, seq_b.end, seq_b.percentIdentity)

    seq_c = pyfaidx.Sequence('contig1', 'CCC', start=5, end=7)
    seq_c.percentIdentity = 97.2
    seq_b = connectFragment(seq_b, seq_c)
    print(seq_b, seq_b.start, seq_b.end, seq_b.percentIdentity)

    seq_c = pyfaidx.Sequence('contig1', 'AATTT', start=8, end=12)
    seq_b = connectFragment(seq_b, seq_c)
    print(seq_b, seq_b.start, seq_b.end, len(seq_b))
def test_init(self):
    with self.assertRaises(ValueError):
        tcheck = TranscriptChecker(self.model, None)

    for wrong_splices in ["AGGT", None, 100]:
        with self.assertRaises(ValueError):
            tcheck = TranscriptChecker(self.model, self.model_fasta,
                                       canonical_splices=wrong_splices)

    tcheck = TranscriptChecker(self.model, self.model_fasta)
    self.assertEqual(tcheck.cdna_length, 1718)
    self.assertEqual(sorted(tcheck.exons),
                     sorted([(exon.start, exon.end) for exon in self.exons]))
    self.assertEqual(tcheck.fasta_seq, self.model_fasta)

    with self.subTest(initializer=Bio.Seq.Seq):
        _ = TranscriptChecker(self.model, Bio.Seq.Seq(str(self.model_fasta)))
    with self.subTest(initializer=str):
        _ = TranscriptChecker(self.model, str(self.model_fasta))
    with self.subTest(initializer=pyfaidx.Sequence):
        _ = TranscriptChecker(self.model,
                              pyfaidx.Sequence(seq=str(self.model_fasta), name=tcheck.id))

    # Now check initializing with a GFF/GTF line
    for out_format in ["gtf", "gff3"]:
        with self.subTest(out_format=out_format):
            line = self.model.format(out_format).split("\n")[0]
            try:
                tcheck = TranscriptChecker(line, self.model_fasta)
            except ValueError as exc:
                raise ValueError(line) from exc
def match_seq(rec: pd.Series, sequences: pyfaidx.Fasta) -> pyfaidx.Sequence:
    """Given a feature in a GTF/GFF read in by gtfparse, match_seq() will extract
    the corresponding DNA sequence and create a new pyfaidx.Sequence object.

    Parameters
    ----------
    rec : :class:`~pandas.Series`
        Information for a feature (i.e. gene, exon, etc...). Requires the following
        indices: seqname, strand, gene_name, feature, start, end, seq_hash
    sequences : :class:`~pyfaidx.Fasta`
        Object containing sequences to match against the positions in the index.

    Returns
    -------
    :class:`~pyfaidx.Sequence`
        Object with annotation from `rec` and sequence information from `sequences`.
    """
    try:
        rev: bool = bool(rec["strand"] == "-")
        seq = pyfaidx.Sequence(
            name=f"{rec['gene_name']}_"
            f"{rec['feature']}_"
            f"{rec['strand']}_"
            f"{rec['start']}_"
            f"{rec['end']}_"
            f"{rec['seq_hash']}",
            seq=sequences.get_seq(name=rec["seqname"], start=rec["start"],
                                  end=rec["end"], rc=rev).seq,
        )
        return seq
    except ValueError:
        print(f"problem with {rec['gene_name']} {rec['start']} "
              f"{rec['end']} {rec['seqname']} {rec['strand']}")
        return None
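# Minimal usage sketch under assumptions: "genome.fa" is a placeholder path to an
# indexed FASTA containing "chr1", and the Series mimics one row of a gtfparse
# DataFrame with the columns match_seq() expects (seq_hash is any precomputed id).
import pandas as pd
import pyfaidx

genome = pyfaidx.Fasta("genome.fa")
rec = pd.Series({
    "seqname": "chr1",
    "strand": "-",
    "gene_name": "GENE1",
    "feature": "exon",
    "start": 100,
    "end": 180,
    "seq_hash": "abc123",
})

seq = match_seq(rec, genome)   # reverse-complemented because strand is "-"
print(seq.name, len(seq))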
def pad_transcripts(self):

    """Expand the 5' and 3' ends of transcripts in the locus so that transcripts
    sharing an extreme are brought to a common start/end (within the configured
    ts_distance and ts_max_splices limits), then expand their ORFs accordingly."""

    try:
        self.fai = pyfaidx.Fasta(self.json_conf["reference"]["genome"])
    except KeyError:
        raise KeyError(self.json_conf.keys())

    five_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=False)
    three_graph = self.define_graph(self.transcripts, self.__share_extreme, three_prime=True)

    five_comm = deque(sorted(self.find_communities(five_graph),
                             key=lambda clique: min(self[_].start for _ in clique)))
    three_comm = deque(sorted(self.find_cliques(three_graph),
                              key=lambda clique: max(self[_].end for _ in clique),
                              reverse=True))

    five_found = set()

    # First do the 5' end
    __to_modify = dict()
    while len(five_comm) > 0:
        comm = five_comm.popleft()
        comm = deque(sorted(list(set.difference(set(comm), five_found)),
                            key=lambda tid: self[tid].start))
        if len(comm) == 1:
            continue
        first = comm.popleft()
        five_found.add(first)
        comm_start = self[first].start
        # self[first].strip_cds()
        for tid in comm:
            if ((self[tid].start - comm_start + 1) <
                    self.json_conf["pick"]["alternative_splicing"]["ts_distance"] and
                    len([_ for _ in self.splices if comm_start <= _ <= self[tid].start]) <
                    self.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] and
                    self[tid].start > comm_start):
                __to_modify[tid] = [comm_start, False]
                five_found.add(tid)
            else:
                continue
        comm = deque([_ for _ in comm if _ not in five_found])
        if comm:
            five_comm.appendleft(comm)

    # Then do the 3' end
    three_found = set()
    while len(three_comm) > 0:
        comm = three_comm.popleft()
        comm = deque(sorted(list(set.difference(set(comm), three_found)),
                            key=lambda tid: self[tid].end, reverse=True))
        if len(comm) == 1:
            continue
        first = comm.popleft()
        three_found.add(first)
        comm_end = self[first].end
        for tid in comm:
            if ((comm_end - self[tid].end + 1) <
                    self.json_conf["pick"]["alternative_splicing"]["ts_distance"] and
                    len([_ for _ in self.splices if self[tid].end <= _ <= comm_end]) <
                    self.json_conf["pick"]["alternative_splicing"]["ts_max_splices"] and
                    self[tid].end < comm_end):
                if tid in __to_modify:
                    __to_modify[tid][1] = comm_end
                else:
                    __to_modify[tid] = [False, comm_end]
                three_found.add(tid)
            else:
                continue
        comm = deque([_ for _ in comm if _ not in three_found])
        if comm:
            three_comm.appendleft(comm)

    # Now we can do the proper modification
    for tid in __to_modify:
        new_transcript = self[tid].copy()
        old_length = new_transcript.cdna_length
        # First get the ORFs
        if new_transcript.combined_cds_length > 0:
            internal_orfs = list(new_transcript.get_internal_orf_beds())
        else:
            internal_orfs = []
        # Remove the CDS and unfinalize
        new_transcript.strip_cds()
        new_transcript.unfinalize()
        upstream = 0
        downstream = 0

        if __to_modify[tid][0]:
            __new_exon = (__to_modify[tid][0], new_transcript.exons[0][1])
            upstream = new_transcript.start - __to_modify[tid][0]
            new_transcript.start = __to_modify[tid][0]
            new_transcript.remove_exon(new_transcript.exons[0])
            new_transcript.add_exon(__new_exon)
            new_transcript.exons = sorted(new_transcript.exons)
        if __to_modify[tid][1]:
            __new_exon = (new_transcript.exons[-1][0], __to_modify[tid][1])
            downstream = __to_modify[tid][1] - new_transcript.end
            new_transcript.end = __to_modify[tid][1]
            new_transcript.remove_exon(new_transcript.exons[-1])
            new_transcript.add_exon(__new_exon)
            new_transcript.exons = sorted(new_transcript.exons)

        # Now for the difficult part
        if internal_orfs and (__to_modify[tid][1] or __to_modify[tid][0]):
            self.logger.warning("Enlarging the ORFs for TID %s (%s)", tid, __to_modify[tid])
            new_orfs = []
            seq = ''
            for exon in new_transcript.exons:
                seq += self.fai[self.chrom][exon[0] - 1:exon[1]].seq
            seq = pyfaidx.Sequence(tid, seq)
            self.logger.warning("For TID %s we have new length %d, old length %d, exons:\n%s",
                                tid, len(seq), old_length, new_transcript.exons)
            if self.strand == "-":
                seq = seq.reverse.complement
                upstream, downstream = downstream, upstream
            for orf in internal_orfs:
                self.logger.warning("Old ORF: %s", str(orf))
                orf.expand(seq, upstream, downstream)
                self.logger.warning("New ORF: %s", str(orf))
                new_orfs.append(orf)

            from ..utilities.log_utils import create_default_logger
            new_transcript.logger = create_default_logger("TEMP")
            new_transcript.logger.setLevel("DEBUG")
            new_transcript.load_orfs(new_orfs)
            new_transcript.logger.setLevel("WARNING")

        # Now finalize again
        new_transcript.finalize()
        self.transcripts[tid] = new_transcript
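# Minimal standalone sketch (not Mikado code) of why upstream/downstream are
# swapped on the minus strand above: the cDNA is the reverse complement of the
# genomic slice, so bases padded after the genomic end become extra 5' cDNA
# sequence. The toy sequence and padding values are placeholders.
import pyfaidx

genomic = pyfaidx.Sequence("toy", "ATGAAATAG" + "CC")   # 2 bp padded downstream (genomic)
upstream, downstream = 0, 2

strand = "-"
if strand == "-":
    cdna = genomic.reverse.complement                   # pyfaidx Sequence -> Sequence
    upstream, downstream = downstream, upstream         # padding flips to the 5' side
else:
    cdna = genomic

print(cdna.seq, upstream, downstream)                   # GGCTATTTCAT 2 0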