Exemple #1
0
    def test_not_correctable_deletion(self):
        """ Same deletion again, but correction cutoff set to 0 """

        sam_fields = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"
        ]

        genome = Fasta("input_files/hg38_chr1.fa")
        maxLen = 0
        spliceAnnot = None
        variants = {}
        logInfo = TC.init_log_info(sam_fields)

        # Init transcript object
        transcript = t2.Transcript(sam_fields, genome, spliceAnnot)

        # Run correction
        TE_entries = TC.correctDeletions(transcript, genome, variants, maxLen,
                                         logInfo)

        # Check to see if correction was successful
        assert transcript.SEQ == "AAGA"
        assert transcript.CIGAR == "2M1D2M"

        # Check TE log
        expected_TE = "\t".join([
            "test_read", "chr1_202892095_202892096", "Deletion", "1",
            "Uncorrected", "TooLarge"
        ]) + "\n"

        assert TE_entries == expected_TE
Exemple #2
0
    def test_variant_deletion(self):
        """ Same deletion again, but with a matching variant at the same 
            location. Correct action is to leave the deletion in place """

        sam_fields = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"
        ]

        genome = Fasta("input_files/hg38_chr1.fa")
        maxLen = 5
        spliceAnnot = None
        variants = {"chr1_202892095_202892096": 1}
        logInfo = TC.init_log_info(sam_fields)

        # Init transcript object
        transcript = t2.Transcript(sam_fields, genome, spliceAnnot)

        # Run correction
        TE_entries = TC.correctDeletions(transcript, genome, variants, maxLen,
                                         logInfo)

        # Check to see if deletion is still there as expected
        assert transcript.SEQ == "AAGA"
        assert transcript.CIGAR == "2M1D2M"

        # Check TE log
        expected_TE = "\t".join([
            "test_read", "chr1_202892095_202892096", "Deletion", "1",
            "Uncorrected", "VariantMatch"
        ]) + "\n"

        assert TE_entries == expected_TE
Exemple #3
0
    def test_variant_insertion(self):
        """ Toy transcript with sequence AAATTGA, where the Ts are a 2 bp 
            insertion that matches a known variant.
            chr1: 202,892,094 - 202,892,098. Insertion is between position
            202,892,096 and 202,892,097. The genomic position used to refer
            to it is 202,892,097 """

        sam_fields = [
            "test_read", "0", "chr1", "202892094", "255", "3M2I2M", "*", "0",
            "0", "AAATTGA", "*", "NM:i:2", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1"
        ]

        genome = Fasta("input_files/hg38_chr1.fa")
        maxLen = 5
        spliceAnnot = None
        variants = {"chr1_202892096_202892098": "TT"}
        logInfo = TC.init_log_info(sam_fields)

        # Init transcript object
        transcript = t2.Transcript(sam_fields, genome, spliceAnnot)

        # Run correction
        TE_entries = TC.correctInsertions(transcript, genome, variants, maxLen,
                                          logInfo)

        # Check to see if correction was successful
        assert transcript.SEQ == "AAATTGA"
        assert transcript.CIGAR == "3M2I2M"

        # Check the log entries
        expected_log = "\t".join([
            "test_read", "chr1_202892096_202892098", "Insertion", "2",
            "Uncorrected", "VariantMatch"
        ]) + "\n"
        assert TE_entries == expected_log
Exemple #4
0
    def test_correctable_deletion(self):
        """ Toy transcript with sequence AA-GA, where the '-' is a deletion of 
            the base 'A'.
            chr1: 202,892,094 - 202,892,098. Deletion is at 202,892,096 """

        sam_fields = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"
        ]

        genome = Fasta("input_files/hg38_chr1.fa")
        maxLen = 5
        spliceAnnot = None
        variants = {}
        logInfo = TC.init_log_info(sam_fields)

        # Init transcript object
        transcript = t2.Transcript(sam_fields, genome, spliceAnnot)

        # Run correction
        TE_entries = TC.correctDeletions(transcript, genome, variants, maxLen,
                                         logInfo)

        # Check to see if correction was successful
        assert transcript.SEQ == "AAAGA"
        assert transcript.CIGAR == "5M"

        # Check TE log
        expected_TE = "\t".join([
            "test_read", "chr1_202892095_202892096", "Deletion", "1",
            "Corrected", "NA"
        ]) + "\n"

        assert TE_entries == expected_TE
    def test_wrong_variant_mismatch(self):
        """ Toy transcript with sequence AACGA, where the C is a mismatch to the
            reference base 'A' in the location, but not matching, a known SNP.
            chr1: 202,892,094 - 202,892,098. Mismatch is at 202,892,096 """

        sam_fields = [
            "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
            "AACGA", "*", "NM:i:1", "MD:Z:2A2", "jI:B:i,-1", "jM:B:c,-1"
        ]

        genome = Fasta("input_files/hg38_chr1.fa")
        spliceAnnot = None
        variants = {"chr1_202892096": ["G"]}
        logInfo = TC.init_log_info(sam_fields)

        # Init transcript object
        transcript = t2.Transcript(sam_fields, genome, spliceAnnot)

        # Run correction
        error_entries = TC.correctMismatches(transcript, genome, variants,
                                             logInfo)

        # Check to see if correction was successful
        assert transcript.SEQ == "AAAGA"
        assert transcript.CIGAR == "5M"

        # Check the number and content of the transcript error entries
        assert error_entries.count('\n') == 1
        assert "Corrected" in error_entries
        assert "VariantMatch" not in error_entries
    def test_two_mismatches(self):
        """ Correct 2 mismatches in the same read. Useful for making sure that
            the TE log string is correct. """

        sam_fields = [
            "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
            "ACCGA", "*", "NM:i:2", "MD:Z:1A0A2", "jI:B:i,-1", "jM:B:c,-1"
        ]

        genome = Fasta("input_files/hg38_chr1.fa")
        spliceAnnot = None
        variants = {}
        logInfo = TC.init_log_info(sam_fields)

        # Init transcript object
        transcript = t2.Transcript(sam_fields, genome, spliceAnnot)

        # Run correction
        error_entries = TC.correctMismatches(transcript, genome, variants,
                                             logInfo)

        # Check to see if correction was successful
        assert transcript.SEQ == "AAAGA"
        assert transcript.CIGAR == "5M"

        # Check the number and content of the transcript error entries
        print(error_entries)
        assert error_entries.count('\n') == 2
        assert error_entries.count('Corrected') == 2
    def test_crash_dmel(self):
        """ This is a Drosophila junction that borders a small match preceded by
            a 7 bp deletion. It is also supposed to crash correction, but did
            not in TC v2.0.1."""

        # Process references
        sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
        tmp_dir = "scratch/dmel/TC"
        os.system("mkdir -p %s" % tmp_dir)
        refs = dstruct.Struct()
        chroms = set(["chr3R"])
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

        sam = "input_files/drosophila_example/no_SJ_corr.sam"
        with open(sam, 'r') as f:
            for sam_line in f:
                if sam_line.startswith("@"):
                    continue
                else:
                    sam_line = sam_line.strip().split('\t')

        # Init transcript object
        transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_line)
        orig_CIGAR = transcript.CIGAR
        orig_seq = transcript.SEQ
        orig_MD = transcript.MD
        expected_TE = "\t".join([
            "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
            "chr3R_14890436_14890699", "NC_SJ_boundary", "5", "Uncorrected",
            "Other"
        ]) + "\n"

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        new_transcript, TE_entries = TC.cleanNoncanonical(
            transcript, refs, maxDist, logInfo)

        print(TE_entries)
        assert new_transcript.isCanonical == False
        assert TE_entries == expected_TE
        assert new_transcript.MD == orig_MD
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert new_transcript.CIGAR == orig_CIGAR
        assert new_transcript.SEQ == orig_seq
    def test_crash_correction(self):
        """ This is a case that is supposed to crash the NCSJ correction process,
           resulting in no correction. This is because the mapping has
           created a 7-bp micro-exon with a canonical but likely incorrect
           junction to its left, and a non-canonical junction on its right.
           Post-correction, we end up with two introns next to each other
           with a zero-length exon, which is not valid."""

        # Process references
        sjFile = "input_files/chr11_sjs.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        os.system("mkdir -p %s" % tmp_dir)
        refs = dstruct.Struct()
        chroms = set(["chr11"])
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/hg38_chr11.fa")

        sam = "input_files/sams/microexon.sam"
        with open(sam, 'r') as f:
            sam_line = f.readline().strip().split('\t')

        # Init transcript object
        transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_line)

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      maxDist, logInfo)

        orig_CIGAR = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                      "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                      "59M864N31M9891N69M1711N7M1341N47M13S")

        assert transcript.isCanonical == False
        assert transcript.MD == "MD:Z:2473"
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert transcript.CIGAR == orig_CIGAR
    def test_correct_ncsj(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_ncsj/TC_tmp/"
        os.system("mkdir -p %s" % tmp_dir)
        refs = dstruct.Struct()
        chroms = set(["chr1"])
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
        jnNumber = 0
        maxDist = 5
        logInfo = TC.init_log_info(sam_fields)

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      maxDist, logInfo)

        assert transcript.isCanonical == True
        assert transcript.spliceJunctions[jnNumber].isCanonical == True
        assert transcript.SEQ == "AAGGAA"
        assert transcript.CIGAR == "3M764N3M"
        assert transcript.MD == "MD:Z:6"
        assert logInfo.corrected_NC_SJs == 1