Python TranscriptClean.processSpliceAnnotation Exemples

Exemple #1

0

Afficher le fichier

Fichier : test_attempt_jn_correction.py Projet : gongjingtang/TranscriptClean

    def test_too_far_away(self):
        """ The NCSJ must be left uncorrected when the closest annotated
            junction lies farther away than the maxDist parameter allows.

            Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical.
            chr1: 23,071,357 - 23,072,126
        """

        # Reference junctions and genome
        junction_file = "input_files/test_junctions.txt"
        scratch_dir = "scratch/test_jns/TC_tmp/"
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            junction_file, scratch_dir, {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Build the transcript straight from raw SAM fields
        fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:6"
        ]
        transcript = t2.Transcript(fields, genome, sjAnnot)

        # Junction 0 with maxDist = 1, which is below the distance of 2
        # asserted at the end of the test
        outcome = TC.attempt_jn_correction(
            transcript, 0, genome, donors, acceptors, sjAnnot, 1)
        status, why, distance = outcome

        assert status == False
        assert why == "TooFarFromAnnotJn"
        assert distance == 2

Exemple #2

0

Afficher le fichier

Fichier : test_attempt_jn_correction.py Projet : gongjingtang/TranscriptClean

    def test_crash(self):
        """ This is a Drosophila junction that borders a small match preceded by
            a 7 bp deletion. It is supposed to crash correction, which will result
            in a categorization of 'Other' in the log """

        # Process references
        sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
        tmp_dir = "scratch/dmel_crash/TC_tmp/"
        chroms = {"chr3R"}
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/drosophila_example/chr3R.fa")

        # Init transcript object (no optional NM/MD tags are supplied here)
        sam_fields = [
            "test_read", "0", "chr3R", "14890420", "255", "7M7D2M264N7M", "*",
            "0", "0", "GATCAAACAACAAGTC", "*"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjAnnot)

        jnNumber = 0
        maxDist = 5

        # Attempt to correct the splice junction; the attempt is expected to
        # be reported as a failure with reason "Other"
        correction_status, reason, dist = TC.attempt_jn_correction(
            transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)
        assert correction_status == False
        assert reason == "Other"
        assert dist == 5

Exemple #3

0

Afficher le fichier

Fichier : test_all_jns_annotated.py Projet : gongjingtang/TranscriptClean

    def test_noncanonical(self):
        """ Before correction the transcript is expected to be noncanonical
            and un-annotated; after correction it is expected to be both
            canonical and annotated """

        sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
        junction_file = "input_files/GM12878_SJs_chr1.tab"
        scratch_dir = "scratch/test_jIjM/TC_tmp/"

        # Bundle the reference data the way cleanNoncanonical expects it
        refs = dstruct.Struct()
        refs.genome = Fasta("input_files/hg38_chr1.fa")
        annotation = TC.processSpliceAnnotation(junction_file, scratch_dir,
                                                {"chr1"})
        refs.donors, refs.acceptors, refs.sjAnnot = annotation

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_record = handle.readline().strip()
        transcript, logInfo = TC.transcript_init(first_record, refs.genome,
                                                 refs.sjAnnot)

        assert transcript.allJnsAnnotated == False
        assert transcript.isCanonical == False

        # Now correct the junction and retest
        corrected, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo)

        assert corrected.allJnsAnnotated == True
        assert corrected.isCanonical == True

Exemple #4

0

Afficher le fichier

    def test_fix_donor_case3(self):
        """ Toy transcript with sequence AAGGT|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

            So-called case #3
        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        chroms = {"chr1"}
        donors, acceptors, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir,
                                                               chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = ["test_read", "0", "chr1", "23071357", "255", "5M762N3M", "*",
                      "0", "0", "AAGGTGAA", "*", "NM:i:0", "MD:Z:8"]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0
        donor = transcript.spliceJunctions[jnNumber].bounds[0]

        # Attempt to correct the splice donor side of the junction (left),
        # shifting it by -2
        new_seq, new_cigar = TC.fix_one_side_of_junction(transcript.CHROM,
                                                         transcript.POS, jnNumber,
                                                         donor, -2, genome,
                                                         transcript.SEQ,
                                                         transcript.CIGAR)

        assert new_seq == "AAGGAA"
        assert new_cigar == "3M764N3M"

Exemple #5

0

Afficher le fichier

Fichier : test_update_post_ncsj_correction.py Projet : gongjingtang/TranscriptClean

    def test_no_correction(self):
        """ Make sure that the attributes stay the same if no correction
            was performed
        """

        # Process references (only the junction annotation is needed here)
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        chroms = {"chr1"}
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0

        # Run the update function without having changed the transcript first
        TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

        junction = transcript.spliceJunctions[jnNumber]
        assert junction.motif_code == "0"
        assert junction.isCanonical == False
        assert transcript.MD == "MD:Z:4"
        assert transcript.isCanonical == False

Exemple #6

0

Afficher le fichier

Fichier : test_find_closest_annotated_bound.py Projet : gongjingtang/TranscriptClean

    def test_find_closest_splice_acceptor_minus(self):
        """ Find the closest splice acceptor, which is 1 bp downstream.
            Minus strand. Note that dist is relative to the genome, not to
            the direction of the transcript."""

        # Reference junctions
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Minus-strand intron spanning chr1:22071331-22073331
        intron = sj.SpliceJunction("test_read", 0, "chr1", 22071331, 22073331,
                                   "-", genome, sjDict)

        nearest = TC.find_closest_bound(intron.get_splice_acceptor(),
                                        acceptors)
        assert nearest.start == 22071329
        assert nearest.end == 22071330
        assert nearest.dist == -1

Exemple #7

0

Afficher le fichier

    def test_find_closest_sj_plus(self):
        """ Check the closest reference donor and acceptor reported by
            find_closest_ref_junction for a plus-strand junction."""

        # Reference junctions
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Plus-strand intron spanning chr1:23071350-23072124
        intron = sj.SpliceJunction("test_read", 0, "chr1", 23071350, 23072124,
                                   "+", genome, sjDict)

        nearest = TC.find_closest_ref_junction(intron, donors, acceptors)
        nearest_donor, nearest_acceptor = nearest
        assert nearest_donor.end == 23071360
        assert nearest_acceptor.end == 23072123

Exemple #8

0

Afficher le fichier

Fichier : test_find_closest_annotated_bound.py Projet : gongjingtang/TranscriptClean

    def test_find_closest_splice_donor_minus(self):
        """ For a toy case with multiple donors and acceptors in close
            proximity, test whether TC can find the closest reference donor
            to the supplied intron bound.

            Similar to before, there is an exact match for the donor, located
            at 23071360 in 1-based coordinates and 23071359 in 0-based."""

        # Reference junctions
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Minus-strand intron spanning chr1:23070360-23071360
        intron = sj.SpliceJunction("test_read", 0, "chr1", 23070360, 23071360,
                                   "-", genome, sjDict)

        nearest = TC.find_closest_bound(intron.get_splice_donor(), donors)
        assert nearest.start == 23071359
        assert nearest.end == 23071360
        assert nearest.dist == 0

Exemple #9

0

Afficher le fichier

    def test_tmp_files(self):
        """ Check that the expected tmp files are created."""

        sj_file = "input_files/toy_sjs_mixed_chroms.txt"
        chroms = {"chr1", "chr2"}
        tmp_dir = "scratch/sj_reading_test/"
        # Create the scratch directory without shelling out, so a failure
        # raises here instead of being silently ignored
        os.makedirs(tmp_dir, exist_ok=True)

        TC.processSpliceAnnotation(sj_file, tmp_dir, chroms, process="test")

        # Check if paths of tmp files are correct
        expected_files = [
            "scratch/sj_reading_test/splice_files/test_ref_splice_donors_tmp.bed",
            "scratch/sj_reading_test/splice_files/test_ref_splice_acceptors_tmp.bed",
            "scratch/sj_reading_test/splice_files/test_ref_splice_donors_tmp.sorted.bed",
            "scratch/sj_reading_test/splice_files/test_ref_splice_acceptors_tmp.sorted.bed",
        ]
        for path in expected_files:
            # Name the missing file in the failure message
            assert os.path.exists(path), path

Exemple #10

0

Afficher le fichier

Fichier : test_attempt_jn_correction.py Projet : gongjingtang/TranscriptClean

    def test_correct_jn(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126
        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_jns/TC_tmp/"
        chroms = {"chr1"}
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjAnnot)
        jnNumber = 0
        maxDist = 5

        # Attempt to correct the splice junction; with maxDist = 5 the 2 bp
        # offset asserted below is within range
        correction_status, reason, dist = TC.attempt_jn_correction(
            transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

        assert correction_status == True
        assert reason == "NA"
        assert dist == 2

Exemple #11

0

Afficher le fichier

Fichier : test_find_closest_annotated_bound.py Projet : gongjingtang/TranscriptClean

    def test_find_closest_splice_acceptor_plus(self):
        """ Find the closest splice acceptor, which is 17 bp upstream.
            Plus strand."""

        # Reference junctions
        donors, acceptors, sjDict = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Plus-strand intron spanning chr1:23071360-23072140
        intron = sj.SpliceJunction("test_read", 0, "chr1", 23071360, 23072140,
                                   "+", genome, sjDict)

        nearest = TC.find_closest_bound(intron.get_splice_acceptor(),
                                        acceptors)
        assert nearest.start == 23072122
        assert nearest.end == 23072123
        assert nearest.dist == -17

Exemple #12

0

Afficher le fichier

Fichier : test_high_level_ncsj_correction.py Projet : gongjingtang/TranscriptClean

    def test_crash_dmel(self):
        """ This is a Drosophila junction that borders a small match preceded by
            a 7 bp deletion. It is also supposed to crash correction, but did
            not in TC v2.0.1."""

        # Process references
        sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
        tmp_dir = "scratch/dmel/TC"
        # Create the scratch directory without shelling out, so a failure
        # raises here instead of being silently ignored
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = {"chr3R"}
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

        # Use the last alignment record in the file. Header lines ("@...")
        # and blank lines are skipped so they can never be mistaken for
        # the record.
        sam = "input_files/drosophila_example/no_SJ_corr.sam"
        sam_line = None
        with open(sam, 'r') as f:
            for raw_line in f:
                if raw_line.startswith("@") or not raw_line.strip():
                    continue
                sam_line = raw_line.strip().split('\t')
        assert sam_line is not None, "no alignment record found in " + sam

        # Init transcript object and remember its pre-correction state
        transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_line)
        orig_CIGAR = transcript.CIGAR
        orig_seq = transcript.SEQ
        orig_MD = transcript.MD
        expected_TE = "\t".join([
            "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
            "chr3R_14890436_14890699", "NC_SJ_boundary", "5", "Uncorrected",
            "Other"
        ]) + "\n"

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        new_transcript, TE_entries = TC.cleanNoncanonical(
            transcript, refs, maxDist, logInfo)

        # Printed so the log entry is visible in the captured output
        print(TE_entries)
        assert new_transcript.isCanonical == False
        assert TE_entries == expected_TE
        assert new_transcript.MD == orig_MD
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert new_transcript.CIGAR == orig_CIGAR
        assert new_transcript.SEQ == orig_seq

Exemple #13

0

Afficher le fichier

Fichier : test_all_jns_annotated.py Projet : gongjingtang/TranscriptClean

    def test_two_annotated_SJs(self):
        """ Transcript with 2 junctions and each match the provided reference
        """
        sam = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
        genome = Fasta("input_files/hg38_chr1.fa")
        sjFile = "input_files/GM12878_SJs_chr1.tab"
        tmp_dir = "scratch/test_jIjM/TC_tmp/"
        chroms = {"chr1"}
        # Only the junction annotation is needed for this check
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, chroms)

        # Only the first line of the SAM file is used
        with open(sam, 'r') as f:
            sam_line = f.readline().strip().split('\t')
        transcript = t2.Transcript(sam_line, genome, sjDict)

        assert transcript.allJnsAnnotated == True
        assert transcript.isCanonical == True

Exemple #14

0

Afficher le fichier

    def test_splice_donors(self):
        """ Make sure that the correct positions got labeled as splice donors """

        sj_file = "input_files/toy_sjs_mixed_chroms.txt"
        chroms = {"chr1", "chr2"}
        tmp_dir = "scratch/sj_reading_test/"
        # Create the scratch directory without shelling out, so a failure
        # raises here instead of being silently ignored
        os.makedirs(tmp_dir, exist_ok=True)

        donor_bt, accept_bt, annot = TC.processSpliceAnnotation(sj_file,
                                                                tmp_dir,
                                                                chroms,
                                                                process="test")

        # Remember, file is 1-based but BedTool is 0-based
        expected_donors = {99, 399}
        donors = {donor.start for donor in donor_bt}
        assert donors == expected_donors

Exemple #15

0

Afficher le fichier

Fichier : test_high_level_ncsj_correction.py Projet : gongjingtang/TranscriptClean

    def test_crash_correction(self):
        """ This is a case that is supposed to crash the NCSJ correction process,
           resulting in no correction. This is because the mapping has
           created a 7-bp micro-exon with a canonical but likely incorrect
           junction to its left, and a non-canonical junction on its right.
           Post-correction, we end up with two introns next to each other
           with a zero-length exon, which is not valid."""

        # Process references
        sjFile = "input_files/chr11_sjs.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        # Create the scratch directory without shelling out, so a failure
        # raises here instead of being silently ignored
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = {"chr11"}
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/hg38_chr11.fa")

        # Only the first line of the SAM file is used
        sam = "input_files/sams/microexon.sam"
        with open(sam, 'r') as f:
            sam_line = f.readline().strip().split('\t')

        # Init transcript object
        transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_line)

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      maxDist, logInfo)

        # CIGAR the transcript is expected to still carry afterwards
        orig_CIGAR = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                      "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                      "59M864N31M9891N69M1711N7M1341N47M13S")

        assert transcript.isCanonical == False
        assert transcript.MD == "MD:Z:2473"
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert transcript.CIGAR == orig_CIGAR

Exemple #16

0

Afficher le fichier

Fichier : test_high_level_ncsj_correction.py Projet : gongjingtang/TranscriptClean

    def test_correct_ncsj(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126
        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_ncsj/TC_tmp/"
        # Create the scratch directory without shelling out, so a failure
        # raises here instead of being silently ignored
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = {"chr1"}
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
        jnNumber = 0
        maxDist = 5
        logInfo = TC.init_log_info(sam_fields)

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      maxDist, logInfo)

        # Sequence, CIGAR and MD are all expected to reflect the shifted donor
        assert transcript.isCanonical == True
        assert transcript.spliceJunctions[jnNumber].isCanonical == True
        assert transcript.SEQ == "AAGGAA"
        assert transcript.CIGAR == "3M764N3M"
        assert transcript.MD == "MD:Z:6"
        assert logInfo.corrected_NC_SJs == 1

Exemple #17

0

Afficher le fichier

Fichier : test_update_post_ncsj_correction.py Projet : gongjingtang/TranscriptClean

    def test_update(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126
        """

        # Process references (only the junction annotation is needed here)
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        chroms = {"chr1"}
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0
        donor = transcript.spliceJunctions[jnNumber].bounds[0]

        # Attempt to correct the splice donor side of the junction (left),
        # shifting it by 2
        transcript.SEQ, transcript.CIGAR = TC.fix_one_side_of_junction(
            transcript.CHROM, transcript.POS, jnNumber, donor, 2, genome,
            transcript.SEQ, transcript.CIGAR)

        # Now test the update function
        TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

        junction = transcript.spliceJunctions[jnNumber]
        assert junction.motif_code == "21"
        assert junction.isCanonical == True
        assert transcript.MD == "MD:Z:6"
        assert transcript.isCanonical == True

Exemple #18

0

Afficher le fichier

    def test_chrom_filtering(self):
        """ Check that only chr1 and chr2 junctions get saved"""

        sj_file = "input_files/toy_sjs_mixed_chroms.txt"
        chroms = {"chr1", "chr2"}
        tmp_dir = "scratch/sj_reading_test/"
        # Create the scratch directory without shelling out, so a failure
        # raises here instead of being silently ignored
        os.makedirs(tmp_dir, exist_ok=True)

        donor_bt, accept_bt, annot = TC.processSpliceAnnotation(sj_file,
                                                                tmp_dir,
                                                                chroms,
                                                                process="test")

        # Check donor chroms
        donor_chroms = {pos.chrom for pos in donor_bt}
        assert donor_chroms == chroms

        # Check acceptor chroms
        acc_chroms = {pos.chrom for pos in accept_bt}
        assert acc_chroms == chroms

Exemple #19

0

Afficher le fichier

    def test_DIM_nc(self):
        """ Correct a transcript containing a deletion, insertion, mismatch,
            and noncanonical splice junction """

        # Initialize options etc.
        sam = "input_files/sams/deletion_insertion_mismatch_nc.sam"
        genome = Fasta("input_files/hg38_chr1.fa")
        sjFile = "input_files/GM12878_SJs_chr1.tab"
        tmp_dir = "scratch/example/TC_tmp/"
        # Create the scratch directory without shelling out, so a failure
        # raises here instead of being silently ignored
        os.makedirs(tmp_dir, exist_ok=True)
        chroms = {"chr1"}
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)

        refs = dstruct.Struct()
        refs.sjAnnot = sjAnnot
        refs.genome = genome
        refs.donors = donors
        refs.acceptors = acceptors
        refs.snps = {}
        refs.deletions = {}
        refs.insertions = {}

        options = dstruct.Struct()
        options.maxLenIndel = 5
        options.maxSJOffset = 5
        options.correctMismatches = "true"
        options.correctIndels = "true"
        options.correctSJs = "true"
        options.primaryOnly = True
        options.canonOnly = False

        # Only the first line of the SAM file is corrected
        with open(sam, 'r') as f:
            transcripts = [f.readline().strip()]

        outfiles = dstruct.Struct()
        outfiles.TElog = open(tmp_dir + "DIM_nc_clean.TE.log", 'w')
        outfiles.sam = open(tmp_dir + "DIM_nc_clean.sam", 'w')
        outfiles.fasta = open(tmp_dir + "DIM_nc_clean.fasta", 'w')
        outfiles.log = open(tmp_dir + "DIM_nc_clean.log", 'w')

        # Correct the transcript. The output handles are closed even if the
        # correction raises, so nothing leaks and the output is flushed
        # before it is read back below.
        try:
            TC.batch_correct(transcripts, options, refs, outfiles)
        finally:
            for handle in outfiles.values():
                handle.close()

        # Expected transcript attributes post-correction
        correct_CIGAR = ("12M1134N126M163N202M866N74M924N191M1777N127M2109N"
                         "157M88N159M932N633M274N117M7696N170M1215N629M938N"
                         "29M428N133M254N166M390N212M253N89M163N483M")
        correct_MD = "MD:Z:3709"
        correct_NM = "NM:i:0"
        correct_jI = (
            "jI:B:i,150941429,150942562,150942689,150942851,150943054,"
            "150943919,150943994,150944917,150945109,150946885,150947013,"
            "150949121,150949279,150949366,150949526,150950457,150951091,"
            "150951364,150951482,150959177,150959348,150960562,150961192,"
            "150962129,150962159,150962586,150962720,150962973,150963140,"
            "150963529,150963742,150963994,150964084,150964246")
        correct_jM = "jM:B:c,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21"

        # Read in transcript from outfile
        with open(tmp_dir + "DIM_nc_clean.sam", 'r') as f:
            sam_line = f.readline().strip().split('\t')
        transcript = t2.Transcript(sam_line, genome, sjAnnot)

        assert transcript.CIGAR == correct_CIGAR
        assert transcript.MD == correct_MD
        assert transcript.NM == correct_NM
        assert transcript.jI == correct_jI
        assert transcript.jM == correct_jM

        # Read logs and make sure they are OK
        expected_log = "\t".join([
            "c34150/f1p1/3707", "primary", "2", "0", "0", "1", "0", "0", "2",
            "0", "1", "0"
        ])

        with open(tmp_dir + "DIM_nc_clean.log", 'r') as f:
            log = f.readline().strip()
        assert log == expected_log