def test_load_pos_and_neg(self):
    # Two ORFs on the same transcript, one per strand. Both are asserted to
    # be valid and to be returned by find_overlapping_cds, while load_orfs
    # is asserted to leave a single internal ORF on the transcript.
    forward = BED12(transcriptomic=True)
    forward.chrom = self.tr.id
    forward.start, forward.end, forward.strand = 0, self.tr.cdna_length - 1, "+"
    forward.name = "first"
    forward.thick_start, forward.thick_end = 101, 190
    self.assertFalse(forward.invalid)

    # Same span, opposite strand, different CDS coordinates.
    reverse = forward.copy()
    reverse.strand = "-"
    reverse.thick_start, reverse.thick_end = 1, 87
    reverse.name = "second"
    self.assertFalse(reverse.invalid)

    candidates = [forward, reverse]
    with self.assertLogs("null", "DEBUG") as captured:
        after_overlap_check = retrieval.find_overlapping_cds(
            self.tr, [forward, reverse])

    self.assertEqual(len(after_overlap_check), 2,
                     self.tr.json_conf["pick"]["orf_loading"])
    self.assertEqual(after_overlap_check, candidates,
                     [orf.name for orf in after_overlap_check])

    retrieval.load_orfs(self.tr, [forward, reverse])
    self.assertEqual(self.tr.number_internal_orfs, 1)
    self.assertEqual(self.tr.combined_cds_start, 201,
                     self.tr.combined_cds_start)
    self.assertEqual(self.tr.combined_cds_length, 90)
def as_bed12(transcript):
    """
    Build a BED12 object out of a transcript, for printing.

    The record is flagged as non-transcriptomic and header-less. Its thick
    interval spans the combined CDS when the transcript is coding, and
    collapses onto the record start otherwise.

    :param transcript: Mikado.loci.transcript.Transcript
    :return: the populated BED12 object
    """
    record = BED12()
    record.transcriptomic = False
    record.header = False
    record.chrom = transcript.chrom
    record.start = transcript.start
    record.end = transcript.end
    record.name = transcript.id
    record.score = transcript.score
    record.strand = transcript.strand

    if not transcript.is_coding:
        # No CDS: thick start and end both sit on the record start.
        record.thick_start = record.thick_end = record.start
    else:
        record.thick_start = transcript.combined_cds[0][0]
        record.thick_end = transcript.combined_cds[-1][1]

    record.block_count = transcript.exon_num
    record.block_sizes = [
        segment[1] - segment[0] + 1 for segment in transcript.exons
    ]

    # Each block starts where the previous one started, plus the previous
    # block size, plus the length of the intron in between.
    record.block_starts = [0]
    for index, gap in enumerate(sorted(transcript.introns)):
        next_start = (record.block_starts[index]
                      + record.block_sizes[index]
                      + gap[1] - gap[0] + 1)
        record.block_starts.append(next_start)

    return record
def test_load_invalid_length(self):
    # The ORF end is set 10 positions beyond the cDNA length; load_orfs is
    # expected to log a "Wrong ORF for <id>:" message at WARNING or above.
    overlong = BED12(transcriptomic=True)
    overlong.chrom = self.tr.id
    self.assertTrue(overlong.transcriptomic)
    overlong.start, overlong.strand = 0, "+"
    overlong.end = self.tr.cdna_length + 10
    overlong.thick_start, overlong.thick_end = 101, 190
    self.assertEqual(overlong.chrom, overlong.id, overlong.id)

    with self.assertLogs("null", "WARNING") as captured:
        retrieval.load_orfs(self.tr, [overlong])

    found_message = any(
        "Wrong ORF for {}:".format(self.tr.id) in entry
        for entry in captured.output
    )
    self.assertTrue(found_message, captured.output)
def test_filter_non_transcriptomic(self):
    # A copy of a valid ORF with its transcriptomic flag switched off is
    # expected to be dropped by find_overlapping_cds, leaving only the
    # original record.
    keeper = BED12(transcriptomic=True)
    keeper.chrom = self.tr.id
    keeper.name = "valid"
    keeper.start = 0
    keeper.end = self.tr.cdna_length - 1
    keeper.strand = "+"
    keeper.thick_start = 101
    keeper.thick_end = 190

    rejected = keeper.copy()
    rejected.name = "non-transcriptomic"
    rejected.transcriptomic = False

    retained = retrieval.find_overlapping_cds(self.tr, [rejected, keeper])
    self.assertEqual(retained, [keeper])
def prepare_info(self, transcript):
    """
    Collect the cDNA, the transcriptomic BED12 and the peptide of a transcript.

    The cDNA is taken from ``self.fai`` and upper-cased. The BED12 line is
    read from ``self.bedfile`` at the offset stored for the transcript and
    converted with ``to_transcriptomic`` (lenient mode). The peptide is the
    translation of ``cdna[max(phase, thick_start - 1):thick_end]``, using
    the converted record's coordinates.

    :param transcript: key used for both the sequence and the BED offset
        lookups; it must equal the name of the BED12 record at that offset.
    :return: tuple ``(cdna, new_bed, pep)``
    :raises AssertionError: if the BED name does not match ``transcript``,
        if the conversion turned a coding record into a non-coding one, or
        if a truthy original phase differs after the conversion.
    """
    cdna = str(self.fai[transcript]).upper()

    offset = self.__found_in_bed[transcript]
    self.bedfile.seek(offset)
    raw_line = self.bedfile.readline()
    source_bed = BED12(raw_line)
    assert source_bed.name == transcript, (source_bed.name, transcript, offset)

    new_bed = source_bed.to_transcriptomic(sequence=cdna, lenient=True)

    if new_bed.coding is False and source_bed.coding is True:
        message = "The transcriptomic BED has been transformed incorrectly. Reason: {}"
        raise AssertionError(message.format(new_bed.invalid_reason))

    if source_bed.phase and new_bed.phase != source_bed.phase:
        message = "The transcriptomic BED has been transformed incorrectly. Phases: {}, {}, {}"
        raise AssertionError(
            message.format(new_bed.phase, source_bed.phase, raw_line))

    # The slice never starts before the phase offset.
    cds_begin = max(0 + new_bed.phase, new_bed.thick_start - 1)
    cds_sequence = str(cdna[cds_begin:new_bed.thick_end])
    pep = str(Seq.Seq(cds_sequence).translate())

    return cdna, new_bed, pep
def test_load_invalid_multiple(self):
    # One valid ORF plus a copy whose CDS (1-89, phase 0) is asserted to be
    # invalid: after load_orfs the transcript must hold one internal ORF.
    good = BED12(transcriptomic=True)
    good.chrom = self.tr.id
    good.name = "valid"
    good.start = 0
    good.end = self.tr.cdna_length - 1
    good.strand = "+"
    good.thick_start = 101
    good.thick_end = 190

    bad = good.copy()
    bad.name = "invalid"
    bad.thick_start, bad.thick_end, bad.phase = 1, 89, 0

    self.assertTrue(bad.invalid)
    self.assertFalse(good.invalid, good.invalid_reason)

    with self.assertLogs("null", "DEBUG") as captured:
        retrieval.load_orfs(self.tr, [good, bad])

    self.assertEqual(self.tr.number_internal_orfs, 1)
def as_bed12(transcript, transcriptomic=False):
    """
    Build a BED12 object out of a transcript, for printing.

    The transcript is finalized first. The name field of the record is a
    ``key=value`` string carrying the ID, the coding status, the phase
    (coding transcripts only) and the alias (only when it is set and differs
    from the ID).

    :param transcript: Mikado.loci.transcript.Transcript
    :param transcriptomic: when true, the record is converted with
        ``BED12.to_transcriptomic`` and its ``chrom`` is replaced by the
        transcript ID before being returned.
    :return: the populated BED12 object
    """
    transcript.finalize()

    record = BED12(table=transcript.codon_table)
    record.transcriptomic = False
    record.header = False
    record.chrom = transcript.chrom
    record.start = transcript.start
    record.end = transcript.end

    if transcript.is_coding is True:
        # Phase of the first selected CDS segment, or of the last one when
        # the transcript is on the minus strand.
        cds_index = -1 if transcript.strand == "-" else 0
        try:
            phase = transcript.phases[transcript.selected_cds[cds_index]]
        except KeyError:
            raise KeyError(
                (transcript.selected_cds[cds_index], transcript.phases))
        name = "ID={ID};coding={coding};phase={phase}".format(
            ID=transcript.id,
            coding=transcript.is_coding,
            phase=phase)
    else:
        name = "ID={ID};coding={coding}".format(
            ID=transcript.id,
            coding=transcript.is_coding)

    alias = transcript.alias
    if alias is not None and alias != transcript.id:
        name += ";alias={}".format(alias)

    record.name = name
    record.score = transcript.score or 0
    record.strand = transcript.strand

    if transcript.is_coding:
        record.coding = True
        # Exactly one selected CDS segment must contain the CDS start.
        start_segments = [
            segment for segment in transcript.selected_cds
            if transcript.selected_cds_start in segment
        ]
        assert len(start_segments) == 1
        record.phase = transcript.phases[start_segments.pop()]
        record.thick_start = transcript.selected_cds[0][0]
        record.thick_end = transcript.selected_cds[-1][1]
    else:
        record.thick_start = record.thick_end = record.start

    record.block_count = transcript.exon_num
    record.block_sizes = [
        segment[1] - segment[0] + 1 for segment in transcript.exons
    ]

    # Each block starts where the previous one started, plus the previous
    # block size, plus the length of the intron in between.
    record.block_starts = [0]
    for index, gap in enumerate(sorted(transcript.introns)):
        next_start = (record.block_starts[index]
                      + record.block_sizes[index]
                      + gap[1] - gap[0] + 1)
        record.block_starts.append(next_start)

    if not transcriptomic:
        return record

    record = record.to_transcriptomic(alias=transcript.alias,
                                      start_adjustment=False,
                                      coding=transcript.is_coding)
    record.chrom = transcript.id
    return record
def transfer_cds(transcript: Transcript,
                 ref_cdna: str,
                 ref_bed: BED12,
                 target_cdna: str,
                 target_bed: BED12,
                 logger=create_null_logger()):
    """
    Transfer the CDS of a reference transcript onto a target transcript.

    Side effects on ``transcript.attributes``:

    * ``aligner_cds``: True when the thick interval of ``target_bed`` is
      unchanged by ``transfer_by_alignment``; False otherwise, and always
      False when the reference is non-coding.
    * ``was_coding``: set only when the reference is non-coding; holds the
      coding status of the transcript before its CDS is stripped.
    * ``original_cds``: True when the reference thick start/end, translated
      through the cDNA-to-cDNA alignment, coincide with the final thick
      start/end of ``target_bed``.

    :param transcript: the target transcript; ``None`` is passed through.
    :param ref_cdna: cDNA sequence of the reference.
    :param ref_bed: BED12 of the reference.
    :param target_cdna: cDNA sequence of the target.
    :param target_bed: BED12 of the target; it must be transcriptomic.
    :param logger: logger used for debug and warning messages.
    :return: tuple ``(transcript, target_bed, pep_coords)``. ``pep_coords``
        is ``(None, None, False)`` when ``transcript`` is None,
        ``(None, None, True)`` when the reference is non-coding, and
        otherwise ``(start, end, complete)`` where ``complete`` tells
        whether the coordinates returned by ``transfer_by_alignment`` are
        ``1`` and ``len(ref_pep)``.
    :raises AssertionError: if ``target_bed`` is not transcriptomic, or if
        the two cDNAs are identical and the transcript nevertheless ends up
        non-coding or without its original CDS.
    """
    if transcript is None:
        return transcript, target_bed, (None, None, False)

    transcript.finalize()
    assert target_bed.transcriptomic is True

    logger.debug("Starting with %s, phases: %s (BED %s)",
                 transcript.id, transcript.phases, target_bed.phase)

    if ref_bed.coding is False:
        # Exactly one argument for the single placeholder: a surplus
        # argument makes the logging machinery fail to format the record.
        logger.debug("%s is non coding, returning immediately.", transcript.id)
        transcript.attributes["aligner_cds"] = False
        transcript.attributes["was_coding"] = transcript.is_coding
        target_bed.coding = False
        transcript.strip_cds()
        return transcript, target_bed, (None, None, True)

    original_start, original_end = target_bed.thick_start, target_bed.thick_end

    ref_pep = str(
        Seq.Seq(
            str(ref_cdna[ref_bed.thick_start - 1:ref_bed.thick_end])
        ).translate(to_stop=False))

    if "*" in ref_pep:
        first_stop = ref_pep.index("*")
        if abs(first_stop * 3 - ref_bed.cds_len) in (0, 3):
            # This is the "good" case: the CDS is correct, and the first
            # stop sits at the very end of the CDS. Trim it off.
            ref_pep = ref_pep[:first_stop]
        else:
            logger.warning(
                "The sequence of %s has in frame stop codons. "
                "Adjusting the program to take this into account.",
                ref_bed.name)

    logger.debug("%s now has phases: %s (%s)",
                 transcript.id, transcript.phases, target_bed.phase)
    target_bed, pep_coords = transfer_by_alignment(
        ref_pep, target_cdna, target_bed, logger=logger)
    logger.debug("%s now has phases: %s; target bed: %s",
                 transcript.id, transcript.phases, target_bed.phase)
    pep_coords = (pep_coords[0], pep_coords[1],
                  (pep_coords[0] == 1 and pep_coords[1] == len(ref_pep)))

    if target_bed.thick_start == original_start and target_bed.thick_end == original_end:
        transcript.attributes["aligner_cds"] = True
        logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
    else:
        # The CDS moved: drop the old one and reload it from the new BED.
        transcript.attributes["aligner_cds"] = False
        transcript.strip_cds()
        if target_bed.coding is True:
            transcript.load_orfs([target_bed])

    logger.debug("%s now has phases: %s", transcript.id, transcript.phases)

    # Now we have to decide whether the transcript has the "original" CDS or not
    _, cigar = transfer.get_and_prepare_cigar(str(ref_cdna), str(target_cdna))
    ref_array, target_array = transfer.create_translation_array(cigar)
    # NOTE(review): if ref_array is a plain list, .index() signals a missing
    # coordinate with ValueError, which is not caught below - confirm that
    # IndexError is the only failure expected here.
    try:
        target_start = target_array[ref_array.index(ref_bed.thick_start)]
    except IndexError:
        target_start = target_bed.start
    try:
        target_end = target_array[ref_array.index(ref_bed.thick_end)]
    except IndexError:
        target_end = target_bed.end

    if target_start == target_bed.thick_start and target_end == target_bed.thick_end:
        transcript.attributes["original_cds"] = True
    else:
        transcript.attributes["original_cds"] = False

    if ref_cdna == target_cdna:
        # Identical sequences: the CDS must have been carried over verbatim.
        logger.debug("%s now has phases: %s", transcript.id, transcript.phases)
        if transcript.is_coding is False:
            raise AssertionError("{} not coding".format(transcript.id))
        elif transcript.attributes["original_cds"] is False:
            raise AssertionError("\n".join([
                str(_) for _ in [
                    transcript.id,
                    (target_bed.thick_start, target_start,
                     target_bed.thick_start == target_start),
                    (target_bed.thick_end, target_end,
                     target_bed.thick_end == target_end),
                    target_bed.thick_start == target_start
                    and target_bed.thick_end == target_end
                ]
            ]))

    return transcript, target_bed, pep_coords