def test_two_mismatches(self):
    """ Two mismatches in one read should both be corrected.

        The read ACCGA carries the MD tag 1A0A2, i.e. two adjacent
        mismatches. Apart from the corrected sequence, this test makes
        sure that the TE log string holds one 'Corrected' entry per
        mismatch.
    """
    read_fields = ["test_read", "0", "chr1", "202892094", "255", "5M", "*",
                   "0", "0", "ACCGA", "*",
                   "NM:i:2", "MD:Z:1A0A2", "jI:B:i,-1", "jM:B:c,-1"]
    reference = Fasta("input_files/hg38_chr1.fa")
    known_variants = {}
    log_info = TC.init_log_info(read_fields)

    # Build the transcript without a splice annotation
    transcript = t2.Transcript(read_fields, reference, None)

    # Correct the mismatches and collect the TE log text
    error_entries = TC.correctMismatches(transcript, reference,
                                         known_variants, log_info)

    # The sequence is expected to change while the CIGAR stays 5M
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # One newline-terminated 'Corrected' entry per mismatch
    print(error_entries)
    assert error_entries.count('\n') == 2
    assert error_entries.count('Corrected') == 2
def test_fix_donor_case3(self):
    """ Toy transcript with sequence AAGGT|GAA, where the splice motif is
        noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126

        So-called case #3. The donor side is shifted by -2, which is
        expected to turn 5M762N3M / AAGGTGAA into 3M764N3M / AAGGAA.
    """
    # Process references; only the junction dict is needed in this test
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    _donors, _acceptors, sjDict = TC.processSpliceAnnotation(
        sjFile, tmp_dir, {"chr1"})
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "5M762N3M",
                  "*", "0", "0", "AAGGTGAA", "*", "NM:i:0", "MD:Z:8"]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0
    donor = transcript.spliceJunctions[jnNumber].bounds[0]

    # Attempt to correct the splice donor side of the junction (left)
    new_seq, new_cigar = TC.fix_one_side_of_junction(
        transcript.CHROM, transcript.POS, jnNumber, donor, -2, genome,
        transcript.SEQ, transcript.CIGAR)

    assert new_seq == "AAGGAA"
    assert new_cigar == "3M764N3M"
def test_correct_jn(self):
    """ Toy transcript with sequence A|GAA, where the splice motif is
        noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126

        With maxDist = 5 the junction correction is expected to succeed
        and report a distance of 2.
    """
    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test_jns/TC_tmp/"
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, {"chr1"})
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "1M766N3M",
                  "*", "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"]
    transcript = t2.Transcript(sam_fields, genome, sjAnnot)
    jnNumber = 0
    maxDist = 5

    # Attempt to correct the splice junction
    correction_status, reason, dist = TC.attempt_jn_correction(
        transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

    assert correction_status == True
    assert reason == "NA"
    assert dist == 2
def test_mark_canonical(self):
    """ In this test, we check whether TC correctly detects the junction
        and labels it canonical.

        Toy transcript with sequence AAG|GAA, where the splice motif
        (GT-AG) is canonical.
        chr1: 23,071,357 - 23,072,126
    """
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "3M764N3M",
                  "*", "0", "0", "AAGGAA", "*", "NM:i:0", "MD:Z:6"]
    genome = Fasta("input_files/hg38_chr1.fa")
    spliceAnnot = {}

    # Init transcript object
    transcript = t2.Transcript(sam_fields, genome, spliceAnnot)

    # Check if the intron bounds are correct
    intronBounds = transcript.getAllIntronBounds()
    assert intronBounds[0].pos == 23071360
    assert intronBounds[1].pos == 23072123

    # Check if the overall junction is labeled correctly
    assert transcript.spliceJunctions[0].isCanonical == True

    # Check if the overall transcript is labeled correctly
    assert transcript.isCanonical == True
def test_mark_noncanonical(self):
    """ In this test, we check whether TC correctly detects the junction
        and labels it noncanonical.

        Toy transcript with sequence GGT|GTG, where the splice motif
        (AA-CA) is noncanonical.
        chr1: 23,072,197 - 23,073,291.
    """
    sam_fields = ["test_read", "0", "chr1", "23072197", "255", "3M1091N3M",
                  "*", "0", "0", "GGTGTG", "*", "NM:i:0", "MD:Z:6"]
    genome = Fasta("input_files/hg38_chr1.fa")
    spliceAnnot = {}

    # Init transcript object
    transcript = t2.Transcript(sam_fields, genome, spliceAnnot)

    # Check if the intron bounds are correct
    intronBounds = transcript.getAllIntronBounds()
    assert intronBounds[0].pos == 23072200
    assert intronBounds[1].pos == 23073290

    # Check if the overall junction is labeled correctly
    assert transcript.spliceJunctions[0].isCanonical == False

    # Check if the overall transcript is labeled correctly
    assert transcript.isCanonical == False
def test_correctable_deletion(self):
    """ A 1 bp deletion within the size cutoff should be filled back in.

        Toy transcript with sequence AA-GA, where the '-' is a deletion of
        the base 'A'.
        chr1: 202,892,094 - 202,892,098. Deletion is at 202,892,096
    """
    read_fields = ["test_read", "0", "chr1", "202892094", "255", "2M1D2M",
                   "*", "0", "0", "AAGA", "*",
                   "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"]
    reference = Fasta("input_files/hg38_chr1.fa")
    max_len = 5
    known_variants = {}
    log_info = TC.init_log_info(read_fields)

    # Build the transcript without a splice annotation
    transcript = t2.Transcript(read_fields, reference, None)

    # Run the deletion correction and keep the TE log text
    TE_entries = TC.correctDeletions(transcript, reference, known_variants,
                                     max_len, log_info)

    # The missing base is expected back, giving a gap-free alignment
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # Exactly one 'Corrected' deletion entry is expected in the TE log
    log_columns = ["test_read", "chr1_202892095_202892096", "Deletion",
                   "1", "Corrected", "NA"]
    assert TE_entries == "\t".join(log_columns) + "\n"
def test_crash(self):
    """ This is a Drosophila junction that borders a small match preceded
        by a 7 bp deletion. It is supposed to crash correction, which will
        result in a categorization of 'Other' in the log """

    # Process references
    sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
    tmp_dir = "scratch/dmel_crash/TC_tmp/"
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, {"chr3R"})
    genome = Fasta("input_files/drosophila_example/chr3R.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr3R", "14890420", "255",
                  "7M7D2M264N7M", "*", "0", "0", "GATCAAACAACAAGTC", "*"]
    transcript = t2.Transcript(sam_fields, genome, sjAnnot)
    jnNumber = 0
    maxDist = 5

    # Attempt to correct the splice junction
    correction_status, reason, dist = TC.attempt_jn_correction(
        transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

    # The failed attempt should be reported as 'Other'
    assert correction_status == False
    assert reason == "Other"
    assert dist == 5
def test_variant_insertion(self):
    """ An insertion that matches a known variant must be left in place.

        Toy transcript with sequence AAATTGA, where the Ts are a 2 bp
        insertion that matches a known variant.
        chr1: 202,892,094 - 202,892,098. Insertion is between position
        202,892,096 and 202,892,097. The genomic position used to refer
        to it is 202,892,097
    """
    read_fields = ["test_read", "0", "chr1", "202892094", "255", "3M2I2M",
                   "*", "0", "0", "AAATTGA", "*",
                   "NM:i:2", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1"]
    reference = Fasta("input_files/hg38_chr1.fa")
    max_len = 5
    known_variants = {"chr1_202892096_202892098": "TT"}
    log_info = TC.init_log_info(read_fields)

    # Build the transcript without a splice annotation
    transcript = t2.Transcript(read_fields, reference, None)

    # Run the insertion correction and keep the TE log text
    TE_entries = TC.correctInsertions(transcript, reference, known_variants,
                                      max_len, log_info)

    # Sequence and CIGAR are expected to be untouched
    assert transcript.SEQ == "AAATTGA"
    assert transcript.CIGAR == "3M2I2M"

    # The log should report the insertion as an uncorrected variant match
    log_columns = ["test_read", "chr1_202892096_202892098", "Insertion",
                   "2", "Uncorrected", "VariantMatch"]
    assert TE_entries == "\t".join(log_columns) + "\n"
def main():
    """Run the interactive menu loop until the user enters "0".

    Choice "1" prompts for grade info, choice "2" prints the transcript,
    and any other input produces an "invalid choice" message.
    """
    logging.debug("Main function called")

    # Single object that holds all of the data for this session
    record = transcript.Transcript()

    while True:
        # Show the menu and read the user's selection
        choice = menuPrompt()
        logging.debug("User entered: " + choice)

        if choice == "0":
            # Exit requested
            break

        if choice == "1":
            # Prompt for grade info and store it in the transcript object
            enterGrades(record)
            continue

        if choice == "2":
            # Print the transcript to the console
            record.print()
            continue

        logging.debug("Invalid choice by user")
        print("Invalid choice. Please try again.")
def test_no_correction(self):
    """ Make sure that the attributes stay the same if no correction
        was performed """

    # Process references; only the junction dict is needed in this test
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    _donors, _acceptors, sjDict = TC.processSpliceAnnotation(
        sjFile, tmp_dir, {"chr1"})
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "1M766N3M",
                  "*", "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0

    # Now test the update function (called without a prior correction)
    TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

    junction = transcript.spliceJunctions[jnNumber]
    assert junction.motif_code == "0"
    assert junction.isCanonical == False
    assert transcript.MD == "MD:Z:4"
    assert transcript.isCanonical == False
def test_not_correctable_deletion(self):
    """ Same 1 bp deletion (read AAGA, CIGAR 2M1D2M), but with the
        correction cutoff set to 0, so it must stay uncorrected """
    read_fields = ["test_read", "0", "chr1", "202892094", "255", "2M1D2M",
                   "*", "0", "0", "AAGA", "*",
                   "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"]
    reference = Fasta("input_files/hg38_chr1.fa")
    max_len = 0
    known_variants = {}
    log_info = TC.init_log_info(read_fields)

    # Build the transcript without a splice annotation
    transcript = t2.Transcript(read_fields, reference, None)

    # Run the deletion correction and keep the TE log text
    TE_entries = TC.correctDeletions(transcript, reference, known_variants,
                                     max_len, log_info)

    # Sequence and CIGAR are expected to be untouched
    assert transcript.SEQ == "AAGA"
    assert transcript.CIGAR == "2M1D2M"

    # The log should flag the deletion as too large to correct
    log_columns = ["test_read", "chr1_202892095_202892096", "Deletion",
                   "1", "Uncorrected", "TooLarge"]
    assert TE_entries == "\t".join(log_columns) + "\n"
def test_variant_deletion(self):
    """ Same 1 bp deletion (read AAGA, CIGAR 2M1D2M), but with a matching
        variant at the same location. Correct action is to leave the
        deletion in place """
    read_fields = ["test_read", "0", "chr1", "202892094", "255", "2M1D2M",
                   "*", "0", "0", "AAGA", "*",
                   "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1"]
    reference = Fasta("input_files/hg38_chr1.fa")
    max_len = 5
    known_variants = {"chr1_202892095_202892096": 1}
    log_info = TC.init_log_info(read_fields)

    # Build the transcript without a splice annotation
    transcript = t2.Transcript(read_fields, reference, None)

    # Run the deletion correction and keep the TE log text
    TE_entries = TC.correctDeletions(transcript, reference, known_variants,
                                     max_len, log_info)

    # The deletion should still be there
    assert transcript.SEQ == "AAGA"
    assert transcript.CIGAR == "2M1D2M"

    # The log should report the deletion as an uncorrected variant match
    log_columns = ["test_read", "chr1_202892095_202892096", "Deletion",
                   "1", "Uncorrected", "VariantMatch"]
    assert TE_entries == "\t".join(log_columns) + "\n"
def test_too_far_away(self):
    """ A case where the NCSJ should not be corrected because it is too
        far away from the closest annotated junction relative to the
        maxDist parameter.

        Toy transcript with sequence A|GAA, where the splice motif is
        noncanonical.
        chr1: 23,071,357 - 23,072,126
    """
    # Load the splice junction references for chr1
    junction_file = "input_files/test_junctions.txt"
    scratch_dir = "scratch/test_jns/TC_tmp/"
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        junction_file, scratch_dir, {"chr1"})
    reference = Fasta("input_files/hg38_chr1.fa")

    # NOTE(review): the MD tag below says 6 although the read has only
    # 4 bases; kept verbatim from the original test -- confirm whether
    # this is intentional.
    read_fields = ["test_read", "0", "chr1", "23071357", "255", "1M766N3M",
                   "*", "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:6"]
    transcript = t2.Transcript(read_fields, reference, sjAnnot)

    # Try to correct junction 0 with a maximum distance of only 1 bp
    junction_index = 0
    max_dist = 1
    correction_status, reason, dist = TC.attempt_jn_correction(
        transcript, junction_index, reference, donors, acceptors, sjAnnot,
        max_dist)

    assert correction_status == False
    assert reason == "TooFarFromAnnotJn"
    assert dist == 2
def test_wrong_variant_mismatch(self):
    """ A mismatch at a known SNP position is still corrected when the
        read base differs from the variant allele.

        Toy transcript with sequence AACGA, where the C is a mismatch to
        the reference base 'A'. A SNP is known at this location, but its
        allele (G) does not match the read base.
        chr1: 202,892,094 - 202,892,098. Mismatch is at 202,892,096
    """
    read_fields = ["test_read", "0", "chr1", "202892094", "255", "5M", "*",
                   "0", "0", "AACGA", "*",
                   "NM:i:1", "MD:Z:2A2", "jI:B:i,-1", "jM:B:c,-1"]
    reference = Fasta("input_files/hg38_chr1.fa")
    known_variants = {"chr1_202892096": ["G"]}
    log_info = TC.init_log_info(read_fields)

    # Build the transcript without a splice annotation
    transcript = t2.Transcript(read_fields, reference, None)

    # Correct the mismatches and collect the TE log text
    error_entries = TC.correctMismatches(transcript, reference,
                                         known_variants, log_info)

    # The mismatched base is expected to be replaced by the reference base
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # A single 'Corrected' entry, with no variant match reported
    assert error_entries.count('\n') == 1
    assert "Corrected" in error_entries
    assert "VariantMatch" not in error_entries
def _get_all_text(self):
    """Return the combined text of every transcript in ``self.series``.

    The result is cached in ``self.text``; a non-empty cached value is
    returned directly. Otherwise the transcript links for the series are
    fetched, one ``Transcript`` is built per link, and the lines of each
    transcript are joined with single spaces. Texts of consecutive
    transcripts are concatenated without any separator between them.
    """
    if self.text:
        return self.text

    links = transcript.getLinksForTranscripts(self.series)
    transcripts = [transcript.Transcript(link) for link in links]

    # Join once instead of growing the string with += in a loop; the
    # cache is only assigned after every transcript has been read.
    self.text = "".join(
        " ".join(line.text for line in tp.getAllLines())
        for tp in transcripts
    )
    return self.text
def test_two_annotated_SJs_without_ref(self):
    """ Two-intron read, but with an empty splice annotation, so no
        junctions can show up as annotated """
    sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, {})

    assert transcript.allJnsAnnotated == False
    assert transcript.isCanonical == True
def test_no_jns(self):
    """ transcript.allJnsAnnotated should be True for a transcript
        without junctions """
    sam_path = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, {})

    assert transcript.allJnsAnnotated == True
    assert transcript.isCanonical == True
def test_perfect_match(self):
    """ Since this read is a perfect match to the reference, its CIGAR
        and sequence fields should definitely be the same length """
    sam_path = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
    reference = Fasta("input_files/hg38_chr1.fa")
    junctions = set()

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, junctions)

    lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                  transcript.CIGAR)
    assert lengths_agree == True
def test_mismatch(self):
    """ Compare SEQ and CIGAR for a spliced transcript that contains a
        mismatch. """
    sam_path = "input_files/sams/mismatch.sam"
    reference = Fasta("input_files/hg38_chr1.fa")
    junctions = set()

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, junctions)

    lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                  transcript.CIGAR)
    assert lengths_agree == True
def test_perfect_match_with_introns(self):
    """ Compare SEQ and CIGAR for a transcript that is a perfect
        reference match containing introns. """
    sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
    reference = Fasta("input_files/hg38_chr1.fa")
    junctions = set()

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, junctions)

    lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                  transcript.CIGAR)
    assert lengths_agree == True
def test_crash_dmel(self):
    """ This is a Drosophila junction that borders a small match preceded
        by a 7 bp deletion. It is also supposed to crash correction, but
        did not in TC v2.0.1.

        After the failed attempt the transcript is expected to keep its
        original CIGAR, SEQ and MD, and the log should hold a single
        uncorrected 'Other' entry.
    """
    # Process references
    sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
    tmp_dir = "scratch/dmel/TC"
    # Create the scratch directory without shelling out; unlike
    # os.system("mkdir -p ..."), a failure here raises immediately.
    os.makedirs(tmp_dir, exist_ok=True)

    refs = dstruct.Struct()
    chroms = set(["chr3R"])
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

    # Skip the '@' header lines; the last line read supplies the fields.
    # NOTE(review): assumes the transcript is built once, after the loop --
    # confirm against the original layout.
    sam = "input_files/drosophila_example/no_SJ_corr.sam"
    with open(sam, 'r') as f:
        for sam_line in f:
            if not sam_line.startswith("@"):
                sam_line = sam_line.strip().split('\t')

    # Init transcript object
    transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_line)

    # Remember the pre-correction state for comparison afterwards
    orig_CIGAR = transcript.CIGAR
    orig_seq = transcript.SEQ
    orig_MD = transcript.MD
    expected_TE = "\t".join([
        "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
        "chr3R_14890436_14890699", "NC_SJ_boundary", "5", "Uncorrected",
        "Other"
    ]) + "\n"

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    new_transcript, TE_entries = TC.cleanNoncanonical(
        transcript, refs, maxDist, logInfo)
    print(TE_entries)

    assert new_transcript.isCanonical == False
    assert TE_entries == expected_TE
    assert new_transcript.MD == orig_MD
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert new_transcript.CIGAR == orig_CIGAR
    assert new_transcript.SEQ == orig_seq
def test_insertion(self):
    """ Compute the correct MD tag for a spliced transcript that contains
        an insertion. """
    sam_path = "input_files/sams/insertion.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, {})

    # Expected tags for this read
    assert transcript.MD == "MD:Z:3069"
    assert transcript.NM == "NM:i:2"
def test_insertion_deletion_mismatch_ncsj(self):
    """ Compare SEQ and CIGAR for a transcript that contains an
        insertion, deletion, mismatch, and noncanonical splice junction
        in it. """
    sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
    reference = Fasta("input_files/hg38_chr1.fa")
    junctions = set()

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, junctions)

    lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                  transcript.CIGAR)
    assert lengths_agree == True
def test_deletion_insertion_mismatch(self):
    """ Compute the correct MD tag for a spliced transcript that contains
        an insertion, deletion, and mismatch. """
    sam_path = "input_files/sams/deletion_insertion_mismatch.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, {})

    # Expected tags for this read
    assert transcript.MD == "MD:Z:475C0A0C0C0A1082^C1347G17C205"
    assert transcript.NM == "NM:i:9"
def test_perfect_match_with_introns(self):
    """ Compute the correct MD tag for a transcript that is a perfect
        reference match containing introns. """
    sam_path = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, {})

    # Expected tags for this read
    assert transcript.MD == "MD:Z:3400"
    assert transcript.NM == "NM:i:0"
def test_insertion_deletion_mismatch_ncsj(self):
    """ Compute the correct MD tag for a transcript that contains an
        insertion, deletion, mismatch, and noncanonical splice junction
        in it. """
    sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
    reference = Fasta("input_files/hg38_chr1.fa")

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline()
    read_fields = first_record.strip().split('\t')

    transcript = t2.Transcript(read_fields, reference, {})

    # Expected tags for this read
    assert transcript.MD == "MD:Z:414G0A450^C405^C2435"
    assert transcript.NM == "NM:i:5"
def test_pre_correction_dmel(self):
    """ This is a noisy Drosophila read, but prior to correction, the
        CIGAR and SEQ strings should definitely match """
    sam = "input_files/drosophila_example/input_read.sam"
    reference = Fasta("input_files/drosophila_example/chr3R.fa")
    junctions = set()

    # Skip the '@' header lines; the last line read supplies the fields.
    # NOTE(review): assumes the transcript is built once, after the loop --
    # confirm against the original layout.
    with open(sam, 'r') as handle:
        for sam_line in handle:
            if not sam_line.startswith("@"):
                sam_line = sam_line.strip().split('\t')

    transcript = t2.Transcript(sam_line, reference, junctions)

    lengths_agree = t2.check_seq_and_cigar_length(transcript.SEQ,
                                                  transcript.CIGAR)
    assert lengths_agree == True
def test_plus_strand(self):
    """ Toy transcript with sequence AAAGA on the plus strand. Sequence
        should be output as listed in the SAM fields. """
    read_fields = ["test_read", "0", "chr1", "202892094", "255", "5M", "*",
                   "0", "0", "AAAGA", "*",
                   "NM:i:0", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1"]
    reference = Fasta("input_files/hg38_chr1.fa")

    # Build the transcript without a splice annotation
    transcript = ts.Transcript(read_fields, reference, None)

    # The FASTA output is a header line followed by the sequence
    fasta_lines = [">test_read", "AAAGA"]
    assert transcript.printableFa() == "\n".join(fasta_lines)
def test_two_annotated_SJs(self):
    """ Transcript with 2 junctions and each match the provided
        reference """
    sam = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
    genome = Fasta("input_files/hg38_chr1.fa")

    # Process the splice annotation; only the junction dict is needed here
    sjFile = "input_files/GM12878_SJs_chr1.tab"
    tmp_dir = "scratch/test_jIjM/TC_tmp/"
    _donors, _acceptors, sjDict = TC.processSpliceAnnotation(
        sjFile, tmp_dir, {"chr1"})

    # Only the first line of the SAM file is used
    with open(sam, 'r') as f:
        sam_line = f.readline().strip().split('\t')

    transcript = t2.Transcript(sam_line, genome, sjDict)

    assert transcript.allJnsAnnotated == True
    assert transcript.isCanonical == True
def get_transcript(sid, pin):
    """ Get the transcript, including the list of classes taken, grades,
        and current gpa.

        The transcript page is fetched up to two times. Whenever the
        login page comes back instead, login(sid, pin) is called before
        the next attempt. Returns a transcript.Transcript on success and
        None if no attempt produced a transcript page.
    """
    max_attempts = 2

    for _ in range(max_attempts):
        # Fetch the transcript page
        html = fetch_html.get_transcript()

        if parse_html.get_page_title(html) == 'Login':
            # Not logged in: log in, then go around the loop again
            login(sid, pin)
            continue

        # Parse the page into the pieces the Transcript class expects
        grades = parse_html.get_grades(html)
        credit_info = parse_html.get_credits(html)
        gpa = parse_html.get_gpa(html)
        return transcript.Transcript(html, grades, credit_info, gpa)

    return None