def test_no_correction(self):
        """ Make sure that the junction and transcript attributes stay the
            same when the update function is run on a transcript on which
            no correction was performed.
        """

        # Process references. Only the third return value (sjDict) is used
        # by this test.
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        chroms = set(["chr1"])
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0

        # Run the update function without changing SEQ/CIGAR beforehand
        TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

        # Junction- and transcript-level attributes must be unchanged
        junction = transcript.spliceJunctions[jnNumber]
        assert junction.motif_code == "0"
        assert junction.isCanonical == False
        assert transcript.MD == "MD:Z:4"
        assert transcript.isCanonical == False
    def test_find_closest_splice_acceptor_plus(self):
        """ Plus-strand junction: the closest reference splice acceptor is
            17 bp upstream, so the reported dist is expected to be -17. """

        # Build the reference donor/acceptor collections and annotation
        ref_donors, ref_acceptors, annot = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})

        # Create the junction from its intron bounds
        ref_genome = Fasta("input_files/hg38_chr1.fa")
        jn = sj.SpliceJunction("test_read", 0, "chr1", 23071360, 23072140,
                               "+", ref_genome, annot)

        # Look up the reference acceptor nearest to the junction's acceptor
        nearest = TC.find_closest_bound(jn.get_splice_acceptor(),
                                        ref_acceptors)
        assert (nearest.start, nearest.end) == (23072122, 23072123)
        assert nearest.dist == -17
예제 #3
0
    def test_variant_insertion(self):
        """ Toy read AAATTGA in which the two Ts are a 2 bp insertion that
            matches a known variant, so the insertion is expected to be left
            in place and logged as a variant match.
            chr1: 202,892,094 - 202,892,098. Insertion is between position
            202,892,096 and 202,892,097. The genomic position used to refer
            to it is 202,892,097 """

        read = [
            "test_read", "0", "chr1", "202892094", "255", "3M2I2M", "*", "0",
            "0", "AAATTGA", "*", "NM:i:2", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1"
        ]
        known_variants = {"chr1_202892096_202892098": "TT"}
        max_indel_len = 5

        ref_genome = Fasta("input_files/hg38_chr1.fa")
        log = TC.init_log_info(read)

        # Build the transcript without any splice annotation
        tx = t2.Transcript(read, ref_genome, None)

        # Apply insertion correction
        entries = TC.correctInsertions(tx, ref_genome, known_variants,
                                       max_indel_len, log)

        # Sequence and alignment must be untouched
        assert tx.SEQ == "AAATTGA"
        assert tx.CIGAR == "3M2I2M"

        # Exactly one log line, flagged as an uncorrected variant match
        expected_fields = ("test_read", "chr1_202892096_202892098",
                           "Insertion", "2", "Uncorrected", "VariantMatch")
        assert entries == "\t".join(expected_fields) + "\n"
    def test_sj_corr_off(self):
        """ Splice reference provided, but correction set to off. Expected
            behavior is to skip SJ ref initialization because it would be a
            waste of time """

        # Initialize options etc.
        sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        tmp_dir = "scratch/prep_refs/sj_off/TC_tmp/"
        # Create the scratch directory without spawning a shell; a failure
        # raises here instead of being silently ignored.
        os.makedirs(tmp_dir, exist_ok=True)

        options = dstruct.Struct()
        options.refGenome = "input_files/hg38_chr1.fa"
        options.tmp_dir = tmp_dir
        options.maxLenIndel = options.maxSJOffset = 5
        options.correctSJs = "false"
        options.variantFile = None
        options.sjAnnotFile = "input_files/test_junctions.txt"

        # The chromosome set returned by split_SAM is not needed here
        header, _, sam_chunks = TC.split_SAM(sam, 1)
        refs = TC.prep_refs(options, sam_chunks[0], header)

        # Check that variant dicts are empty (asserted one by one so that a
        # failure names the offending attribute)
        assert refs.snps == {}
        assert refs.insertions == {}
        assert refs.deletions == {}

        # Check that SJ bedtools and annot lookup are empty
        assert refs.donors is None
        assert refs.acceptors is None
        assert refs.sjAnnot == set()
    def test_variants(self):
        """ A variant file is provided, but no splice junction annotation.
            Only the SNP lookup is expected to be populated. """

        # Initialize options etc.
        sam = "input_files/vcf_test/read_with_snps.sam"
        tmp_dir = "scratch/prep_refs/variant/TC_tmp/"
        # Create the scratch directory without spawning a shell; a failure
        # raises here instead of being silently ignored.
        os.makedirs(tmp_dir, exist_ok=True)

        options = dstruct.Struct()
        options.refGenome = "input_files/hg38_chr11.fa"
        options.tmp_dir = tmp_dir
        options.maxLenIndel = options.maxSJOffset = 5
        options.correctSJs = "false"
        options.variantFile = "input_files/vcf_test/snps.vcf"
        options.sjAnnotFile = None

        # The chromosome set returned by split_SAM is not needed here
        header, _, sam_chunks = TC.split_SAM(sam, 1)
        refs = TC.prep_refs(options, sam_chunks[0], header)

        # Check that variant deletion and insertion dicts are empty, while
        # the SNP dict has at least one entry
        assert len(refs.insertions) == 0
        assert len(refs.deletions) == 0
        assert len(refs.snps) > 0

        # Check that SJ bedtools and annot lookup are empty
        assert refs.donors is None
        assert refs.acceptors is None
        assert refs.sjAnnot == set()
    def test_genome_only(self):
        """ Make sure that the prep_refs function works under the simplest
            possible option setting: no variants or SJs provided. """

        # Initialize options etc.
        sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        tmp_dir = "scratch/prep_refs/genome-only/TC_tmp/"
        # Create the scratch directory without spawning a shell; a failure
        # raises here instead of being silently ignored.
        os.makedirs(tmp_dir, exist_ok=True)

        options = dstruct.Struct()
        options.refGenome = "input_files/hg38_chr1.fa"
        options.tmp_dir = tmp_dir
        options.maxLenIndel = options.maxSJOffset = 5
        options.correctSJs = "false"
        options.variantFile = None
        options.sjAnnotFile = None

        # The chromosome set returned by split_SAM is not needed here
        header, _, sam_chunks = TC.split_SAM(sam, 1)
        refs = TC.prep_refs(options, sam_chunks[0], header)

        # Check that variant dicts are empty (asserted one by one so that a
        # failure names the offending attribute)
        assert refs.snps == {}
        assert refs.insertions == {}
        assert refs.deletions == {}

        # Check that SJ bedtools and annot lookup are empty
        assert refs.donors is None
        assert refs.acceptors is None
        assert refs.sjAnnot == set()
    def test_sjs(self):
        """ Genome and splice junction reference provided. Variant structs
            should still be empty. """

        # Initialize options etc.
        sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        tmp_dir = "scratch/prep_refs/sjs/TC_tmp/"
        # Create the scratch directory without spawning a shell; a failure
        # raises here instead of being silently ignored.
        os.makedirs(tmp_dir, exist_ok=True)

        options = dstruct.Struct()
        options.refGenome = "input_files/hg38_chr1.fa"
        options.tmp_dir = tmp_dir
        options.maxLenIndel = options.maxSJOffset = 5
        options.correctSJs = "true"
        options.variantFile = None
        options.sjAnnotFile = "input_files/test_junctions.txt"

        # The chromosome set returned by split_SAM is not needed here
        header, _, sam_chunks = TC.split_SAM(sam, 1)
        refs = TC.prep_refs(options, sam_chunks[0], header)

        # Check that variant dicts are empty (asserted one by one so that a
        # failure names the offending attribute)
        assert refs.snps == {}
        assert refs.insertions == {}
        assert refs.deletions == {}

        # Check SJ bedtools and annot lookup
        assert (refs.donors).count() == 3
        assert (refs.acceptors).count() == 2  # Same acceptor appears in 2 jns
        assert len(refs.sjAnnot) == 3
    def test_wrong_variant_mismatch(self):
        """ Toy read AACGA whose C mismatches the reference base 'A' at a
            position where a SNP is known, but the known allele (G) is not
            the base seen in the read. The mismatch is therefore expected to
            be corrected rather than reported as a variant match.
            chr1: 202,892,094 - 202,892,098. Mismatch is at 202,892,096 """

        read = [
            "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
            "AACGA", "*", "NM:i:1", "MD:Z:2A2", "jI:B:i,-1", "jM:B:c,-1"
        ]
        known_snps = {"chr1_202892096": ["G"]}

        ref_genome = Fasta("input_files/hg38_chr1.fa")
        log = TC.init_log_info(read)

        # Build the transcript without any splice annotation
        tx = t2.Transcript(read, ref_genome, None)

        # Apply mismatch correction
        entries = TC.correctMismatches(tx, ref_genome, known_snps, log)

        # The read base must have been reverted to the reference base
        assert tx.SEQ == "AAAGA"
        assert tx.CIGAR == "5M"

        # One log line, marked as corrected and not as a variant match
        assert entries.count('\n') == 1
        assert "Corrected" in entries
        assert "VariantMatch" not in entries
    def test_two_mismatches(self):
        """ Two mismatches in one read are both corrected. This mainly
            guards the formatting of the TE log string, which must then
            contain one line per mismatch. """

        read = [
            "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
            "ACCGA", "*", "NM:i:2", "MD:Z:1A0A2", "jI:B:i,-1", "jM:B:c,-1"
        ]
        no_variants = {}

        ref_genome = Fasta("input_files/hg38_chr1.fa")
        log = TC.init_log_info(read)

        # Build the transcript without any splice annotation
        tx = t2.Transcript(read, ref_genome, None)

        # Apply mismatch correction
        entries = TC.correctMismatches(tx, ref_genome, no_variants, log)

        # Both bases must have been reverted to the reference
        assert tx.SEQ == "AAAGA"
        assert tx.CIGAR == "5M"

        # Two log lines, each one reporting a correction
        print(entries)
        assert entries.count('\n') == 2
        assert entries.count('Corrected') == 2
예제 #10
0
    def test_find_closest_sj_plus(self):
        """ Look up the closest reference donor and the closest reference
            acceptor for a plus-strand junction in a single call. """

        # Build the reference donor/acceptor collections and annotation
        ref_donors, ref_acceptors, annot = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})

        # Create the junction from its intron bounds
        ref_genome = Fasta("input_files/hg38_chr1.fa")
        jn = sj.SpliceJunction("test_read", 0, "chr1", 23071350, 23072124,
                               "+", ref_genome, annot)

        nearest = TC.find_closest_ref_junction(jn, ref_donors, ref_acceptors)
        nearest_donor, nearest_acceptor = nearest
        assert nearest_donor.end == 23071360
        assert nearest_acceptor.end == 23072123
    def test_find_closest_splice_donor_minus(self):
        """ Minus-strand toy case with several reference donors and acceptors
            close together: check that TC picks the reference donor closest
            to the supplied intron bound.

            The donor has an exact reference match, located at 23071360 in
            1-based coordinates and 23071359 in 0-based, so dist must be 0.
        """

        # Build the reference donor/acceptor collections and annotation
        ref_donors, ref_acceptors, annot = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})

        # Create the junction from its intron bounds
        ref_genome = Fasta("input_files/hg38_chr1.fa")
        jn = sj.SpliceJunction("test_read", 0, "chr1", 23070360, 23071360,
                               "-", ref_genome, annot)

        # Look up the reference donor nearest to the junction's donor
        nearest = TC.find_closest_bound(jn.get_splice_donor(), ref_donors)
        assert (nearest.start, nearest.end) == (23071359, 23071360)
        assert nearest.dist == 0
예제 #12
0
    def test_fix_donor_case3(self):
        """ Toy transcript with sequence AAGGT|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

            So-called case #3: the donor side is moved by -2, which shortens
            the read sequence and the first match, and lengthens the intron.
        """

        # Process references. Only the third return value (sjDict) is used
        # by this test.
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        chroms = set(["chr1"])
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = ["test_read", "0", "chr1", "23071357", "255", "5M762N3M", "*",
                      "0", "0", "AAGGTGAA", "*",  "NM:i:0", "MD:Z:8"]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0
        donor = (transcript.spliceJunctions[jnNumber]).bounds[0]

        # Attempt to correct the splice donor side of the junction (left)
        new_seq, new_cigar = TC.fix_one_side_of_junction(transcript.CHROM,
                                                         transcript.POS, jnNumber,
                                                         donor, -2, genome,
                                                         transcript.SEQ,
                                                         transcript.CIGAR)

        assert new_seq == "AAGGAA"
        assert new_cigar == "3M764N3M"
예제 #13
0
    def test_variant_deletion(self):
        """ Toy read with a 1 bp deletion (CIGAR 2M1D2M) at a location that
            matches a known variant. Correct action is to leave the deletion
            in place and log it as a variant match. """

        read = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"
        ]
        known_variants = {"chr1_202892095_202892096": 1}
        max_indel_len = 5

        ref_genome = Fasta("input_files/hg38_chr1.fa")
        log = TC.init_log_info(read)

        # Build the transcript without any splice annotation
        tx = t2.Transcript(read, ref_genome, None)

        # Apply deletion correction
        entries = TC.correctDeletions(tx, ref_genome, known_variants,
                                      max_indel_len, log)

        # The deletion must still be present
        assert tx.SEQ == "AAGA"
        assert tx.CIGAR == "2M1D2M"

        # One TE log line, flagged as an uncorrected variant match
        expected_fields = ("test_read", "chr1_202892095_202892096",
                           "Deletion", "1", "Uncorrected", "VariantMatch")
        assert entries == "\t".join(expected_fields) + "\n"
예제 #14
0
    def test_not_correctable_deletion(self):
        """ Toy read with a 1 bp deletion (CIGAR 2M1D2M), with the maximum
            correctable length set to 0. The deletion is expected to stay in
            place and be logged as too large. """

        read = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"
        ]
        no_variants = {}
        max_indel_len = 0

        ref_genome = Fasta("input_files/hg38_chr1.fa")
        log = TC.init_log_info(read)

        # Build the transcript without any splice annotation
        tx = t2.Transcript(read, ref_genome, None)

        # Apply deletion correction
        entries = TC.correctDeletions(tx, ref_genome, no_variants,
                                      max_indel_len, log)

        # The deletion must still be present
        assert tx.SEQ == "AAGA"
        assert tx.CIGAR == "2M1D2M"

        # One TE log line, flagged as uncorrected because of its size
        expected_fields = ("test_read", "chr1_202892095_202892096",
                           "Deletion", "1", "Uncorrected", "TooLarge")
        assert entries == "\t".join(expected_fields) + "\n"
    def test_crash(self):
        """ This is a Drosophila junction that borders a small match preceded by
            a 7 bp deletion. It is supposed to crash correction, which will result
            in a categorization of 'Other' in the log """

        # Process references
        sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
        tmp_dir = "scratch/dmel_crash/TC_tmp/"
        chroms = set(["chr3R"])
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/drosophila_example/chr3R.fa")

        # Init transcript object (no optional SAM tags are supplied)
        sam_fields = [
            "test_read", "0", "chr3R", "14890420", "255", "7M7D2M264N7M", "*",
            "0", "0", "GATCAAACAACAAGTC", "*"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjAnnot)

        jnNumber = 0
        maxDist = 5
        # Attempt to correct the splice junction
        correction_status, reason, dist = TC.attempt_jn_correction(
            transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

        # The attempt must be reported as failed with reason 'Other'
        assert correction_status == False
        assert reason == "Other"
        assert dist == 5
예제 #16
0
    def test_correctable_deletion(self):
        """ Toy read AA-GA, where the '-' is a deletion of the base 'A'.
            With no variants and a max length of 5, the deletion is expected
            to be filled in, giving AAAGA with a 5M alignment.
            chr1: 202,892,094 - 202,892,098. Deletion is at 202,892,096 """

        read = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"
        ]
        no_variants = {}
        max_indel_len = 5

        ref_genome = Fasta("input_files/hg38_chr1.fa")
        log = TC.init_log_info(read)

        # Build the transcript without any splice annotation
        tx = t2.Transcript(read, ref_genome, None)

        # Apply deletion correction
        entries = TC.correctDeletions(tx, ref_genome, no_variants,
                                      max_indel_len, log)

        # The missing base must have been restored
        assert tx.SEQ == "AAAGA"
        assert tx.CIGAR == "5M"

        # One TE log line, reporting the correction
        expected_fields = ("test_read", "chr1_202892095_202892096",
                           "Deletion", "1", "Corrected", "NA")
        assert entries == "\t".join(expected_fields) + "\n"
    def test_too_far_away(self):
        """ A case where the NCSJ should not be corrected because it is too far
            away from the closest annotated junction relative to the maxDist
            parameter.

            Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical.
            chr1: 23,071,357 - 23,072,126
        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_jns/TC_tmp/"
        chroms = set(["chr1"])
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object. The MD tag covers the 4 aligned bases of
        # the 1M766N3M alignment.
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjAnnot)
        jnNumber = 0
        # Smaller than the expected distance of 2, so correction must be
        # refused
        maxDist = 1

        correction_status, reason, dist = TC.attempt_jn_correction(
            transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)
        assert correction_status == False
        assert reason == "TooFarFromAnnotJn"
        assert dist == 2
    def test_correct_jn(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

            With maxDist = 5 the junction is within range, so the correction
            attempt is expected to succeed.
        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_jns/TC_tmp/"
        chroms = set(["chr1"])
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjAnnot)
        jnNumber = 0
        maxDist = 5

        # Attempt to correct the splice junction
        correction_status, reason, dist = TC.attempt_jn_correction(
            transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

        assert correction_status == True
        assert reason == "NA"
        assert dist == 2
    def test_noncanonical(self):
        """ Before correction the transcript is expected to be noncanonical
            with un-annotated junctions; after cleanNoncanonical it should be
            canonical with all junctions annotated. """

        # Assemble the reference struct: genome plus splice annotation
        refs = dstruct.Struct()
        refs.genome = Fasta("input_files/hg38_chr1.fa")
        annotation = TC.processSpliceAnnotation(
            "input_files/GM12878_SJs_chr1.tab", "scratch/test_jIjM/TC_tmp/",
            {"chr1"})
        refs.donors, refs.acceptors, refs.sjAnnot = annotation

        # The first line of the SAM file is the read under test
        sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
        with open(sam_path, 'r') as sam_handle:
            first_line = sam_handle.readline().strip()
        transcript, logInfo = TC.transcript_init(first_line, refs.genome,
                                                 refs.sjAnnot)

        # State prior to correction
        assert transcript.allJnsAnnotated == False
        assert transcript.isCanonical == False

        # Correct the junction (max distance 5) and check again
        corrected, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo)

        assert corrected.allJnsAnnotated == True
        assert corrected.isCanonical == True
    def test_find_closest_splice_acceptor_minus(self):
        """ Minus-strand junction: the closest reference splice acceptor is
            1 bp downstream. Note that dist is relative to the genome, not to
            the direction of the transcript, hence the expected -1. """

        # Build the reference donor/acceptor collections and annotation
        ref_donors, ref_acceptors, annot = TC.processSpliceAnnotation(
            "input_files/test_junctions.txt", "scratch/test/TC_tmp/",
            {"chr1"})

        # Create the junction from its intron bounds
        ref_genome = Fasta("input_files/hg38_chr1.fa")
        jn = sj.SpliceJunction("test_read", 0, "chr1", 22071331, 22073331,
                               "-", ref_genome, annot)

        # Look up the reference acceptor nearest to the junction's acceptor
        nearest = TC.find_closest_bound(jn.get_splice_acceptor(),
                                        ref_acceptors)
        assert (nearest.start, nearest.end) == (22071329, 22071330)
        assert nearest.dist == -1
    def test_crash_dmel(self):
        """ This is a Drosophila junction that borders a small match preceded by
            a 7 bp deletion. It is also supposed to crash correction, but did
            not in TC v2.0.1. The transcript must come back unchanged and the
            junction must be logged as uncorrected with reason 'Other'. """

        # Process references
        sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
        tmp_dir = "scratch/dmel/TC"
        # Create the scratch directory without spawning a shell; a failure
        # raises here instead of being silently ignored.
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = set(["chr3R"])
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

        # Use the last alignment line of the SAM file. Header lines and
        # blank lines are skipped so that neither can end up being passed
        # on as the read.
        sam = "input_files/drosophila_example/no_SJ_corr.sam"
        sam_fields = None
        with open(sam, 'r') as f:
            for line in f:
                if line.startswith("@") or not line.strip():
                    continue
                sam_fields = line.strip().split('\t')
        assert sam_fields is not None, "No alignment found in " + sam

        # Init transcript object and remember its pre-correction state
        transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_fields)
        orig_CIGAR = transcript.CIGAR
        orig_seq = transcript.SEQ
        orig_MD = transcript.MD
        expected_TE = "\t".join([
            "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
            "chr3R_14890436_14890699", "NC_SJ_boundary", "5", "Uncorrected",
            "Other"
        ]) + "\n"

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        new_transcript, TE_entries = TC.cleanNoncanonical(
            transcript, refs, maxDist, logInfo)

        print(TE_entries)
        assert new_transcript.isCanonical == False
        assert TE_entries == expected_TE
        assert new_transcript.MD == orig_MD
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert new_transcript.CIGAR == orig_CIGAR
        assert new_transcript.SEQ == orig_seq
    def test_both_inside(self):
        """ Both transcript junction bounds lie inside the reference intron.

            Reference:     ----->|          |<-----
            Transcript:      ----->|      |<-----
            dist_0 = -2, dist_1 = +2, combined dist = 4
        """

        dist_0, dist_1 = -2, 2
        combined = TC.combinedJunctionDist(dist_0, dist_1)
        assert combined == 4
예제 #23
0
    def test_tmp_files(self):
        """ Check that the expected tmp files are created."""

        sj_file = "input_files/toy_sjs_mixed_chroms.txt"
        chroms = set(["chr1", "chr2"])
        tmp_dir = "scratch/sj_reading_test/"
        # Create the scratch directory without spawning a shell; a failure
        # raises here instead of being silently ignored.
        os.makedirs(tmp_dir, exist_ok=True)

        donor_bt, accept_bt, annot = TC.processSpliceAnnotation(sj_file,
                                                                tmp_dir,
                                                                chroms,
                                                                process="test")

        # Check if paths of tmp files are correct. The path is used as the
        # assertion message so that a failure names the missing file.
        expected_files = [
            "scratch/sj_reading_test/splice_files/test_ref_splice_donors_tmp.bed",
            "scratch/sj_reading_test/splice_files/test_ref_splice_acceptors_tmp.bed",
            "scratch/sj_reading_test/splice_files/test_ref_splice_donors_tmp.sorted.bed",
            "scratch/sj_reading_test/splice_files/test_ref_splice_acceptors_tmp.sorted.bed",
        ]
        for path in expected_files:
            assert os.path.exists(path), path
예제 #24
0
    def test_create_tmp_sam(self):
        """ Create a tmp sam file from the mock header and transcripts provided.
            Then, check that the file holds exactly the header lines followed
            by the transcript lines, in that order.
        """

        sam_header = ["HLine1", "HLine2"]
        sam_transcripts = [ "\t".join(["read1", "mapping", "chr1", "..."]),
                            "\t".join(["read2", "mapping", "chr2", "..."]) ]
        tmp_dir = "scratch/tmp_sam_test/"

        fname, chroms = TC.create_tmp_sam(sam_header, sam_transcripts, tmp_dir,
                                          process = "test")

        assert fname == "scratch/tmp_sam_test/split_uncorr_sams/test.sam"
        assert chroms == set(["chr1", "chr2"])

        # Now check the integrity of the output file. Comparing the whole
        # list catches missing lines as well as extra or reordered ones.
        with open(fname, 'r') as f:
            observed = [line.strip() for line in f]

        assert observed == sam_header + sam_transcripts
예제 #25
0
    def test_primary_monoexon_read(self):
        """ The supplied read is a primary alignment. This means that a
            transcript object is created, and the logInfo struct notes the
            primary status. All correction counters are expected to still
            hold "NA" right after initialization."""

        sam_file = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        with open(sam_file, 'r') as f:
            sam_line = f.readline().strip()

        genome = Fasta("input_files/hg38_chr1.fa")
        sjAnnot = set()

        transcript, logInfo = TC.transcript_init(sam_line, genome, sjAnnot)
        assert transcript.QNAME == "c21031/f2p3/3400"
        assert transcript.FLAG == 0
        assert transcript.CHROM == "chr1"
        assert transcript.POS == 192575775
        assert transcript.CIGAR == "155M"
        assert transcript.MD == "MD:Z:155"
        assert logInfo.Mapping == "primary"

        # Check each counter on its own; the attribute name is used as the
        # assertion message so that a failure identifies the counter.
        counters = [
            "corrected_deletions", "uncorrected_deletions",
            "variant_deletions", "corrected_insertions",
            "uncorrected_insertions", "variant_insertions",
            "corrected_mismatches", "uncorrected_mismatches",
            "corrected_NC_SJs", "uncorrected_NC_SJs",
        ]
        for counter in counters:
            assert getattr(logInfo, counter) == "NA", counter
    def test_left_same_right_inside(self):
        """ Left bound matches the reference, right bound lies inside it.

            Reference:     ----->|          |<-----
            Transcript:    ----->|        |<-----
            dist_0 = 0, dist_1 = +2, combined dist = 2
        """

        dist_0, dist_1 = 0, 2
        combined = TC.combinedJunctionDist(dist_0, dist_1)
        assert combined == 2
    def test_left_outside_right_inside(self):
        """ Left bound lies outside the reference intron, right bound inside.

            Reference:     ----->|          |<-----
            Transcript:   ----->|       |<-----
            dist_0 = +1, dist_1 = +4, combined dist = 3
        """

        dist_0, dist_1 = 1, 4
        combined = TC.combinedJunctionDist(dist_0, dist_1)
        assert combined == 3
    def test_crash_correction(self):
        """ This is a case that is supposed to crash the NCSJ correction process,
           resulting in no correction. This is because the mapping has
           created a 7-bp micro-exon with a canonical but likely incorrect
           junction to its left, and a non-canonical junction on its right.
           Post-correction, we end up with two introns next to each other
           with a zero-length exon, which is not valid."""

        # Process references
        sjFile = "input_files/chr11_sjs.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        # Create the scratch directory without spawning a shell; a failure
        # raises here instead of being silently ignored.
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = set(["chr11"])
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/hg38_chr11.fa")

        # The first line of the SAM file is the read under test
        sam = "input_files/sams/microexon.sam"
        with open(sam, 'r') as f:
            sam_line = f.readline().strip().split('\t')

        # Init transcript object
        transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_line)

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      maxDist, logInfo)

        # CIGAR that the transcript is expected to still carry afterwards
        orig_CIGAR = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                      "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                      "59M864N31M9891N69M1711N7M1341N47M13S")

        assert transcript.isCanonical == False
        assert transcript.MD == "MD:Z:2473"
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert transcript.CIGAR == orig_CIGAR
    def test_correct_ncsj(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

            After cleanNoncanonical the junction is expected to be canonical,
            with SEQ, CIGAR and MD updated accordingly.
        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_ncsj/TC_tmp/"
        # Create the scratch directory without spawning a shell; a failure
        # raises here instead of being silently ignored.
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = set(["chr1"])
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
        jnNumber = 0
        maxDist = 5
        logInfo = TC.init_log_info(sam_fields)

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      maxDist, logInfo)

        assert transcript.isCanonical == True
        assert transcript.spliceJunctions[jnNumber].isCanonical == True
        assert transcript.SEQ == "AAGGAA"
        assert transcript.CIGAR == "3M764N3M"
        assert transcript.MD == "MD:Z:6"
        assert logInfo.corrected_NC_SJs == 1
    def test_update(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

            The donor side is fixed first; the update function is then
            expected to refresh the junction motif, the canonical flags and
            the MD tag.
        """

        # Process references. Only the third return value (sjDict) is used
        # by this test.
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        chroms = set(["chr1"])
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, chroms)
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0
        donor_bound = (transcript.spliceJunctions[jnNumber]).bounds[0]

        # Attempt to correct the splice donor side of the junction (left)
        transcript.SEQ, transcript.CIGAR = TC.fix_one_side_of_junction(
            transcript.CHROM, transcript.POS, jnNumber, donor_bound, 2,
            genome, transcript.SEQ, transcript.CIGAR)

        # Now test the update function
        TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

        junction = transcript.spliceJunctions[jnNumber]
        assert junction.motif_code == "21"
        assert junction.isCanonical == True
        assert transcript.MD == "MD:Z:6"
        assert transcript.isCanonical == True