Python Transcript Examples, transcript.Transcript Python Examples

Example #1

0

Show file

File: test_correct_mismatches.py Project: gongjingtang/TranscriptClean

    def test_two_mismatches(self):
        """ A single read carrying two mismatches should have both of them
            corrected. Doubles as a check that the TE log string is built
            correctly when it holds more than one entry. """

        required_fields = [
            "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
            "ACCGA", "*"
        ]
        optional_tags = ["NM:i:2", "MD:Z:1A0A2", "jI:B:i,-1", "jM:B:c,-1"]
        sam_fields = required_fields + optional_tags

        genome = Fasta("input_files/hg38_chr1.fa")
        logInfo = TC.init_log_info(sam_fields)

        # Build the transcript without a splice annotation
        transcript = t2.Transcript(sam_fields, genome, None)

        # Correct the mismatches; no known variants are supplied
        error_entries = TC.correctMismatches(transcript, genome, {}, logInfo)

        # The sequence is rewritten while the CIGAR stays a 5-base match
        assert transcript.SEQ == "AAAGA"
        assert transcript.CIGAR == "5M"

        # Two newlines and two 'Corrected' labels are expected in the log
        print(error_entries)
        for token in ('\n', 'Corrected'):
            assert error_entries.count(token) == 2

Example #2

0

Show file

    def test_fix_donor_case3(self):
        """ Toy transcript with sequence AAGGT|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

            So-called case #3
        """

        # Process references. Only the junction annotation is used by this
        # test, so the donor/acceptor return values are discarded.
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = ["test_read", "0", "chr1", "23071357", "255", "5M762N3M",
                      "*", "0", "0", "AAGGTGAA", "*", "NM:i:0", "MD:Z:8"]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0
        donor = transcript.spliceJunctions[jnNumber].bounds[0]

        # Attempt to correct the splice donor side of the junction (left),
        # passing a shift of -2
        new_seq, new_cigar = TC.fix_one_side_of_junction(
            transcript.CHROM, transcript.POS, jnNumber, donor, -2, genome,
            transcript.SEQ, transcript.CIGAR)

        # The left exon loses two bases and the intron grows by two
        assert new_seq == "AAGGAA"
        assert new_cigar == "3M764N3M"

Example #3

0

Show file

File: test_attempt_jn_correction.py Project: gongjingtang/TranscriptClean

    def test_correct_jn(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_jns/TC_tmp/"
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjAnnot)
        jnNumber = 0
        maxDist = 5

        # Attempt to correct the splice junction
        correction_status, reason, dist = TC.attempt_jn_correction(
            transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

        # Correction is expected to succeed with a reported distance of 2,
        # which is within maxDist
        assert correction_status == True
        assert reason == "NA"
        assert dist == 2

Example #4

0

Show file

    def test_mark_canonical(self):
        """ In this test, we check whether TC correctly detects the junction
            and labels it canonical.

            Toy transcript with sequence AAG|GAA, where the splice motif (GT-AG)
            is canonical.
            chr1: 23,071,357 - 23,072,126 """

        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "3M764N3M", "*", "0",
            "0", "AAGGAA", "*", "NM:i:0", "MD:Z:6"
        ]

        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object with an empty splice annotation
        transcript = t2.Transcript(sam_fields, genome, {})

        # Check if the intron bounds are correct
        intronBounds = transcript.getAllIntronBounds()
        assert intronBounds[0].pos == 23071360
        assert intronBounds[1].pos == 23072123

        # Check if the overall junction is labeled correctly
        assert transcript.spliceJunctions[0].isCanonical == True

        # Check if the overall transcript is labeled correctly
        assert transcript.isCanonical == True

Example #5

0

Show file

    def test_mark_noncanonical(self):
        """ In this test, we check whether TC correctly detects the junction
            and labels it noncanonical.

            Toy transcript with sequence GGT|GTG, where the splice motif (AA-CA)
            is noncanonical.
            chr1: 23,072,197 - 23,073,291.  """

        sam_fields = [
            "test_read", "0", "chr1", "23072197", "255", "3M1091N3M", "*", "0",
            "0", "GGTGTG", "*", "NM:i:0", "MD:Z:6"
        ]

        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object with an empty splice annotation
        transcript = t2.Transcript(sam_fields, genome, {})

        # Check if the intron bounds are correct
        intronBounds = transcript.getAllIntronBounds()
        assert intronBounds[0].pos == 23072200
        assert intronBounds[1].pos == 23073290

        # Check if the overall junction is labeled correctly
        assert transcript.spliceJunctions[0].isCanonical == False

        # Check if the overall transcript is labeled correctly
        assert transcript.isCanonical == False

Example #6

0

Show file

    def test_correctable_deletion(self):
        """ Read AA-GA on chr1: 202,892,094 - 202,892,098, where '-' marks a
            deleted 'A' at 202,892,096. With a size cutoff of 5 the deletion
            should be filled back in. """

        required_fields = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*"
        ]
        optional_tags = ["NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"]
        sam_fields = required_fields + optional_tags

        genome = Fasta("input_files/hg38_chr1.fa")
        logInfo = TC.init_log_info(sam_fields)

        # Build the transcript without a splice annotation
        transcript = t2.Transcript(sam_fields, genome, None)

        # Correct deletions: no known variants, maximum deletion size of 5
        TE_entries = TC.correctDeletions(transcript, genome, {}, 5, logInfo)

        # The missing base is restored and the CIGAR becomes one match block
        assert transcript.SEQ == "AAAGA"
        assert transcript.CIGAR == "5M"

        # The TE log should hold a single tab-delimited 'Corrected' entry
        expected_columns = ("test_read", "chr1_202892095_202892096",
                            "Deletion", "1", "Corrected", "NA")
        expected_TE = "\t".join(expected_columns) + "\n"

        assert TE_entries == expected_TE

Example #7

0

Show file

File: test_attempt_jn_correction.py Project: gongjingtang/TranscriptClean

    def test_crash(self):
        """ This is a Drosophila junction that borders a small match preceded by
            a 7 bp deletion. It is supposed to crash correction, which will result
            in a categorization of 'Other' in the log """

        # Process references
        sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
        tmp_dir = "scratch/dmel_crash/TC_tmp/"
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, {"chr3R"})
        genome = Fasta("input_files/drosophila_example/chr3R.fa")

        # Init transcript object (no optional NM/MD tags are supplied)
        sam_fields = [
            "test_read", "0", "chr3R", "14890420", "255", "7M7D2M264N7M", "*",
            "0", "0", "GATCAAACAACAAGTC", "*"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjAnnot)

        jnNumber = 0
        maxDist = 5

        # Attempt to correct the splice junction
        correction_status, reason, dist = TC.attempt_jn_correction(
            transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

        # The failed correction is reported as 'Other' at a distance of 5
        assert correction_status == False
        assert reason == "Other"
        assert dist == 5

Example #8

0

Show file

    def test_variant_insertion(self):
        """ Read AAATTGA on chr1: 202,892,094 - 202,892,098, where the two Ts
            are an insertion that matches a known variant. The insertion sits
            between 202,892,096 and 202,892,097, and 202,892,097 is the
            genomic position used to refer to it. The insertion must be left
            in place. """

        required_fields = [
            "test_read", "0", "chr1", "202892094", "255", "3M2I2M", "*", "0",
            "0", "AAATTGA", "*"
        ]
        optional_tags = ["NM:i:2", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1"]
        sam_fields = required_fields + optional_tags

        genome = Fasta("input_files/hg38_chr1.fa")
        known_variants = {"chr1_202892096_202892098": "TT"}
        logInfo = TC.init_log_info(sam_fields)

        # Build the transcript without a splice annotation
        transcript = t2.Transcript(sam_fields, genome, None)

        # Correct insertions with a maximum insertion size of 5
        TE_entries = TC.correctInsertions(transcript, genome, known_variants,
                                          5, logInfo)

        # Sequence and CIGAR must be untouched
        assert transcript.SEQ == "AAATTGA"
        assert transcript.CIGAR == "3M2I2M"

        # The log should record the insertion as an uncorrected variant match
        expected_columns = ("test_read", "chr1_202892096_202892098",
                            "Insertion", "2", "Uncorrected", "VariantMatch")
        expected_log = "\t".join(expected_columns) + "\n"
        assert TE_entries == expected_log

Example #9

0

Show file

File: SethWhitakerFinalProject.py Project: sethjwhitaker/INF360-projects

def main():
    """Run the interactive menu loop until the user chooses to exit.

    Menu choices, as returned by ``menuPrompt()``:
        "0" -- leave the loop
        "1" -- call ``enterGrades`` with the transcript object
        "2" -- call the transcript object's ``print`` method
    Any other value prints an error message and shows the menu again.
    """

    logging.debug("Main function called")

    # this is the object that will hold all of the data
    transcriptObj = transcript.Transcript()

    # display menu and call appropriate function based on user input
    while True:
        # display menu
        choice = menuPrompt()

        # Lazy %-style arguments: the message is only formatted when DEBUG
        # logging is enabled, and a non-string choice cannot raise TypeError.
        logging.debug("User entered: %s", choice)

        # exit loop if user chooses 0
        if choice == "0":
            break
        # if 1, prompt user for grade info and store in transcript object
        elif choice == "1":
            enterGrades(transcriptObj)
        # if 2, print the transcript to the console
        elif choice == "2":
            transcriptObj.print()
        else:
            logging.debug("Invalid choice by user")
            print("Invalid choice. Please try again.")

Example #10

0

Show file

File: test_update_post_ncsj_correction.py Project: gongjingtang/TranscriptClean

    def test_no_correction(self):
        """ Make sure that the attributes stay the same if no correction
            was performed
        """

        # Process references. Only the junction annotation is used by this
        # test, so the donor/acceptor return values are discarded.
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjDict)
        jnNumber = 0

        # Now test the update function
        TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

        # Junction and transcript attributes should be unchanged
        junction = transcript.spliceJunctions[jnNumber]
        assert junction.motif_code == "0"
        assert junction.isCanonical == False
        assert transcript.MD == "MD:Z:4"
        assert transcript.isCanonical == False

Example #11

0

Show file

    def test_not_correctable_deletion(self):
        """ The 1 bp deletion read once more, but with the correction size
            cutoff set to 0 so that the deletion has to be left alone. """

        required_fields = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*"
        ]
        optional_tags = ["NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"]
        sam_fields = required_fields + optional_tags

        genome = Fasta("input_files/hg38_chr1.fa")
        logInfo = TC.init_log_info(sam_fields)

        # Build the transcript without a splice annotation
        transcript = t2.Transcript(sam_fields, genome, None)

        # Correct deletions: no known variants, maximum deletion size of 0
        TE_entries = TC.correctDeletions(transcript, genome, {}, 0, logInfo)

        # Sequence and CIGAR must be untouched
        assert transcript.SEQ == "AAGA"
        assert transcript.CIGAR == "2M1D2M"

        # The TE log should flag the deletion as too large to correct
        expected_columns = ("test_read", "chr1_202892095_202892096",
                            "Deletion", "1", "Uncorrected", "TooLarge")
        expected_TE = "\t".join(expected_columns) + "\n"

        assert TE_entries == expected_TE

Example #12

0

Show file

    def test_variant_deletion(self):
        """ The 1 bp deletion read once more, this time with a known variant
            at the same location. The deletion is expected to be kept. """

        required_fields = [
            "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0",
            "0", "AAGA", "*"
        ]
        optional_tags = ["NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"]
        sam_fields = required_fields + optional_tags

        genome = Fasta("input_files/hg38_chr1.fa")
        known_variants = {"chr1_202892095_202892096": 1}
        logInfo = TC.init_log_info(sam_fields)

        # Build the transcript without a splice annotation
        transcript = t2.Transcript(sam_fields, genome, None)

        # Correct deletions with a maximum deletion size of 5
        TE_entries = TC.correctDeletions(transcript, genome, known_variants,
                                         5, logInfo)

        # The deletion should still be present
        assert transcript.SEQ == "AAGA"
        assert transcript.CIGAR == "2M1D2M"

        # The TE log should record an uncorrected variant match
        expected_columns = ("test_read", "chr1_202892095_202892096",
                            "Deletion", "1", "Uncorrected", "VariantMatch")
        expected_TE = "\t".join(expected_columns) + "\n"

        assert TE_entries == expected_TE

Example #13

0

Show file

File: test_attempt_jn_correction.py Project: gongjingtang/TranscriptClean

    def test_too_far_away(self):
        """ A case where the NCSJ should not be corrected because it is too far
            away from the closest annotated junction relative to the maxDist
            parameter.

            Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical.
            chr1: 23,071,357 - 23,072,126
        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_jns/TC_tmp/"
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, {"chr1"})
        genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object. The MD tag covers the 4 aligned bases of the
        # read (1M + 3M), matching the other tests that use this same read.
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, genome, sjAnnot)
        jnNumber = 0
        maxDist = 1

        # The expected distance of 2 exceeds maxDist, so correction is refused
        correction_status, reason, dist = TC.attempt_jn_correction(
            transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)
        assert correction_status == False
        assert reason == "TooFarFromAnnotJn"
        assert dist == 2

Example #14

0

Show file

File: test_correct_mismatches.py Project: gongjingtang/TranscriptClean

    def test_wrong_variant_mismatch(self):
        """ Read AACGA on chr1: 202,892,094 - 202,892,098. The C at
            202,892,096 mismatches the reference base 'A'; a SNP is known at
            that position, but the read base does not match it, so the
            mismatch should still be corrected. """

        required_fields = [
            "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
            "AACGA", "*"
        ]
        optional_tags = ["NM:i:1", "MD:Z:2A2", "jI:B:i,-1", "jM:B:c,-1"]
        sam_fields = required_fields + optional_tags

        genome = Fasta("input_files/hg38_chr1.fa")
        known_variants = {"chr1_202892096": ["G"]}
        logInfo = TC.init_log_info(sam_fields)

        # Build the transcript without a splice annotation
        transcript = t2.Transcript(sam_fields, genome, None)

        # Correct mismatches against the known variants
        error_entries = TC.correctMismatches(transcript, genome,
                                             known_variants, logInfo)

        # The mismatching base is replaced and the CIGAR is unchanged
        assert transcript.SEQ == "AAAGA"
        assert transcript.CIGAR == "5M"

        # Exactly one log line, marked corrected and not as a variant match
        assert error_entries.count('\n') == 1
        assert "Corrected" in error_entries
        assert "VariantMatch" not in error_entries

Example #15

0

Show file

 def _get_all_text(self):
     """Return the text of every transcript in ``self.series``.

     The result is cached on ``self.text``; a non-empty cached value is
     returned without fetching anything again.
     """
     if self.text:
         return self.text
     links = transcript.getLinksForTranscripts(self.series)
     transcripts = [transcript.Transcript(link) for link in links]
     # Lines within one transcript are separated by a space; consecutive
     # transcripts are concatenated with no separator. A single join avoids
     # rebuilding the string once per transcript.
     self.text = "".join(
         " ".join(line.text for line in tp.getAllLines())
         for tp in transcripts
     )
     return self.text

Example #16

0

Show file

File: test_all_jns_annotated.py Project: gongjingtang/TranscriptClean

    def test_two_annotated_SJs_without_ref(self):
        """ Read from perfectReferenceMatch_twoIntrons.sam, built with an empty
            splice annotation: none of its junctions can show up as annotated.
        """
        sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
        genome = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, {})

        assert transcript.allJnsAnnotated == False
        assert transcript.isCanonical == True

Example #17

0

Show file

File: test_all_jns_annotated.py Project: gongjingtang/TranscriptClean

    def test_no_jns(self):
        """ A transcript without any junctions should report
            transcript.allJnsAnnotated = True """

        sam_path = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        genome = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, {})

        assert transcript.allJnsAnnotated == True
        assert transcript.isCanonical == True

Example #18

0

Show file

    def test_perfect_match(self):
        """ A read that matches the reference perfectly must have CIGAR and
            sequence fields of the same length """

        sam_path = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        genome = Fasta("input_files/hg38_chr1.fa")
        empty_annot = set()

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, empty_annot)

        lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                      transcript.CIGAR)
        assert lengths_agree == True

Example #19

0

Show file

    def test_mismatch(self):
        """ SEQ and CIGAR lengths should agree for a spliced transcript that
            contains a mismatch. """

        sam_path = "input_files/sams/mismatch.sam"
        genome = Fasta("input_files/hg38_chr1.fa")
        empty_annot = set()

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, empty_annot)

        lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                      transcript.CIGAR)
        assert lengths_agree == True

Example #20

0

Show file

    def test_perfect_match_with_introns(self):
        """ SEQ and CIGAR lengths should agree for a transcript that is a
            perfect reference match containing introns. """

        sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
        genome = Fasta("input_files/hg38_chr1.fa")
        empty_annot = set()

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, empty_annot)

        lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                      transcript.CIGAR)
        assert lengths_agree == True

Example #21

0

Show file

File: test_high_level_ncsj_correction.py Project: gongjingtang/TranscriptClean

    def test_crash_dmel(self):
        """ This is a Drosophila junction that borders a small match preceded by
            a 7 bp deletion. It is also supposed to crash correction, but did
            not in TC v2.0.1."""

        # Process references
        sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
        tmp_dir = "scratch/dmel/TC"
        # Create the scratch directory directly instead of spawning a shell
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, {"chr3R"})
        refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

        # Collect the alignment records, skipping '@' header lines, and use
        # the last one. Collecting them explicitly means a header line can
        # never be handed to Transcript by mistake.
        sam = "input_files/drosophila_example/no_SJ_corr.sam"
        with open(sam, 'r') as f:
            records = [line.strip().split('\t') for line in f
                       if not line.startswith("@")]
        assert records, "no alignment records found in %s" % sam
        sam_line = records[-1]

        # Init transcript object
        transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_line)

        # Remember the pre-correction state so it can be compared afterwards
        orig_CIGAR = transcript.CIGAR
        orig_seq = transcript.SEQ
        orig_MD = transcript.MD
        expected_TE = "\t".join([
            "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
            "chr3R_14890436_14890699", "NC_SJ_boundary", "5", "Uncorrected",
            "Other"
        ]) + "\n"

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        new_transcript, TE_entries = TC.cleanNoncanonical(
            transcript, refs, maxDist, logInfo)

        # The junction must stay uncorrected and the read must be unchanged
        print(TE_entries)
        assert new_transcript.isCanonical == False
        assert TE_entries == expected_TE
        assert new_transcript.MD == orig_MD
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert new_transcript.CIGAR == orig_CIGAR
        assert new_transcript.SEQ == orig_seq

Example #22

0

Show file

File: test_compute_MD_NM.py Project: gongjingtang/TranscriptClean

    def test_insertion(self):
        """ The MD and NM tags should come out right for a spliced transcript
            that contains an insertion. """

        sam_path = "input_files/sams/insertion.sam"
        genome = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, {})

        expected_MD, expected_NM = "MD:Z:3069", "NM:i:2"
        assert transcript.MD == expected_MD
        assert transcript.NM == expected_NM

Example #23

0

Show file

    def test_insertion_deletion_mismatch_ncsj(self):
        """ SEQ and CIGAR lengths should agree for a transcript that contains
            an insertion, a deletion, a mismatch, and a noncanonical splice
            junction. """

        sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
        genome = Fasta("input_files/hg38_chr1.fa")
        empty_annot = set()

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, empty_annot)

        lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                      transcript.CIGAR)
        assert lengths_agree == True

Example #24

0

Show file

File: test_compute_MD_NM.py Project: gongjingtang/TranscriptClean

    def test_deletion_insertion_mismatch(self):
        """ The MD and NM tags should come out right for a spliced transcript
            that contains an insertion, a deletion, and a mismatch. """

        sam_path = "input_files/sams/deletion_insertion_mismatch.sam"
        genome = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, {})

        expected_MD = "MD:Z:475C0A0C0C0A1082^C1347G17C205"
        expected_NM = "NM:i:9"
        assert transcript.MD == expected_MD
        assert transcript.NM == expected_NM

Example #25

0

Show file

File: test_compute_MD_NM.py Project: gongjingtang/TranscriptClean

    def test_perfect_match_with_introns(self):
        """ The MD and NM tags should come out right for a transcript that is
            a perfect reference match containing introns. """

        sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
        genome = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, {})

        expected_MD, expected_NM = "MD:Z:3400", "NM:i:0"
        assert transcript.MD == expected_MD
        assert transcript.NM == expected_NM

Example #26

0

Show file

File: test_compute_MD_NM.py Project: gongjingtang/TranscriptClean

    def test_insertion_deletion_mismatch_ncsj(self):
        """ The MD and NM tags should come out right for a transcript that
            contains an insertion, a deletion, a mismatch, and a noncanonical
            splice junction. """

        sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
        genome = Fasta("input_files/hg38_chr1.fa")

        # Only the first line of the SAM file is used
        with open(sam_path, 'r') as handle:
            first_line = handle.readline()
            fields = first_line.strip().split('\t')
            transcript = t2.Transcript(fields, genome, {})

        expected_MD = "MD:Z:414G0A450^C405^C2435"
        expected_NM = "NM:i:5"
        assert transcript.MD == expected_MD
        assert transcript.NM == expected_NM

Example #27

0

Show file

    def test_pre_correction_dmel(self):
        """ Noisy Drosophila read: before any correction is applied, its CIGAR
            and SEQ strings should agree in length """

        sam_path = "input_files/drosophila_example/input_read.sam"
        genome = Fasta("input_files/drosophila_example/chr3R.fa")
        empty_annot = set()

        with open(sam_path, 'r') as handle:
            for raw_line in handle:
                # Lines starting with '@' are skipped
                if raw_line.startswith("@"):
                    continue
                # Each remaining line replaces the transcript, so the last
                # one in the file is the one that gets checked
                fields = raw_line.strip().split('\t')
                transcript = t2.Transcript(fields, genome, empty_annot)

        lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                      transcript.CIGAR)
        assert lengths_agree == True

Example #28

0

Show file

File: test_print_fasta_seq.py Project: gongjingtang/TranscriptClean

    def test_plus_strand(self):
        """ Plus-strand toy transcript with sequence AAAGA. The fasta output
            should carry the sequence exactly as given in the SAM fields."""

        required_fields = [
            "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
            "AAAGA", "*"
        ]
        optional_tags = ["NM:i:0", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1"]
        sam_fields = required_fields + optional_tags

        genome = Fasta("input_files/hg38_chr1.fa")

        # Build the transcript without a splice annotation
        transcript = ts.Transcript(sam_fields, genome, None)

        # Expected output: a header line followed by the sequence
        expected_fasta = "\n".join([">test_read", "AAAGA"])

        assert transcript.printableFa() == expected_fasta

Example #29

0

Show file

File: test_all_jns_annotated.py Project: gongjingtang/TranscriptClean

    def test_two_annotated_SJs(self):
        """ Transcript with 2 junctions and each match the provided reference
        """
        sam = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
        genome = Fasta("input_files/hg38_chr1.fa")
        sjFile = "input_files/GM12878_SJs_chr1.tab"
        tmp_dir = "scratch/test_jIjM/TC_tmp/"

        # Only the junction annotation is used by this test, so the
        # donor/acceptor return values are discarded.
        _, _, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir, {"chr1"})

        # Only the first line of the SAM file is used
        with open(sam, 'r') as f:
            sam_line = f.readline().strip().split('\t')
            transcript = t2.Transcript(sam_line, genome, sjDict)

        assert transcript.allJnsAnnotated == True
        assert transcript.isCanonical == True

Example #30

0

Show file

def get_transcript(sid, pin):
    """ Fetch the transcript page and build a Transcript from it.

    The page is fetched up to two times. If the page title is 'Login',
    ``login(sid, pin)`` is called and the next attempt is made; otherwise the
    grades, credits and gpa are parsed out of the page and returned together
    with the html as a ``transcript.Transcript``. When every attempt lands on
    the login page, the function falls through and returns None.
    """

    max_attempts = 2
    for _ in range(max_attempts):

        # Fetch the transcript page
        html = fetch_html.get_transcript()

        # Still on the login page: log in, then try again
        if parse_html.get_page_title(html) == 'Login':
            login(sid, pin)
            continue

        # Parse the page contents and wrap them in a Transcript instance
        grades = parse_html.get_grades(html)
        credit_info = parse_html.get_credits(html)
        gpa = parse_html.get_gpa(html)
        return transcript.Transcript(html, grades, credit_info, gpa)