def test_not_correctable_deletion(self):
    """ Deletion that exceeds the correction cutoff must be left alone.

        The read AA-GA (CIGAR 2M1D2M, starting at chr1:202,892,094) carries
        a 1 bp deletion, and the maximum correctable size is set to 0.
        The sequence and CIGAR must come back unchanged, and the TE log
        must hold a single 'Uncorrected' / 'TooLarge' deletion entry. """

    read = [
        "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0", "0",
        "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1",
    ]
    reference = Fasta("input_files/hg38_chr1.fa")
    size_cutoff = 0
    known_variants = {}
    log_info = TC.init_log_info(read)

    # Build the transcript; no splice annotation is supplied
    transcript = t2.Transcript(read, reference, None)

    # Attempt the deletion correction
    te_log = TC.correctDeletions(transcript, reference, known_variants,
                                 size_cutoff, log_info)

    # The read must be untouched
    assert transcript.SEQ == "AAGA"
    assert transcript.CIGAR == "2M1D2M"

    # Exactly one tab-separated TE log line is expected
    expected_fields = (
        "test_read", "chr1_202892095_202892096", "Deletion", "1",
        "Uncorrected", "TooLarge",
    )
    assert te_log == "\t".join(expected_fields) + "\n"
def test_variant_deletion(self):
    """ Deletion that coincides with a known variant must be kept.

        The read AA-GA (CIGAR 2M1D2M, starting at chr1:202,892,094) carries
        a 1 bp deletion, and the variant dict holds an entry under the key
        'chr1_202892095_202892096'. The correct action is to leave the
        deletion in place and log it as 'Uncorrected' / 'VariantMatch'. """

    read = [
        "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0", "0",
        "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1",
    ]
    reference = Fasta("input_files/hg38_chr1.fa")
    size_cutoff = 5
    known_variants = {"chr1_202892095_202892096": 1}
    log_info = TC.init_log_info(read)

    # Build the transcript; no splice annotation is supplied
    transcript = t2.Transcript(read, reference, None)

    # Attempt the deletion correction
    te_log = TC.correctDeletions(transcript, reference, known_variants,
                                 size_cutoff, log_info)

    # The deletion must still be present
    assert transcript.SEQ == "AAGA"
    assert transcript.CIGAR == "2M1D2M"

    # Exactly one tab-separated TE log line is expected
    expected_fields = (
        "test_read", "chr1_202892095_202892096", "Deletion", "1",
        "Uncorrected", "VariantMatch",
    )
    assert te_log == "\t".join(expected_fields) + "\n"
def test_variant_insertion(self):
    """ Insertion that matches a known variant must be kept.

        Toy transcript with sequence AAATTGA, where the Ts are a 2 bp
        insertion matching a known variant.
        chr1: 202,892,094 - 202,892,098. The insertion lies between
        positions 202,892,096 and 202,892,097; the genomic position used
        to refer to it is 202,892,097. The read must come back unchanged
        and the log must report 'Uncorrected' / 'VariantMatch'. """

    read = [
        "test_read", "0", "chr1", "202892094", "255", "3M2I2M", "*", "0", "0",
        "AAATTGA", "*", "NM:i:2", "MD:Z:5", "jI:B:i,-1", "jM:B:c,-1",
    ]
    reference = Fasta("input_files/hg38_chr1.fa")
    size_cutoff = 5
    known_variants = {"chr1_202892096_202892098": "TT"}
    log_info = TC.init_log_info(read)

    # Build the transcript; no splice annotation is supplied
    transcript = t2.Transcript(read, reference, None)

    # Attempt the insertion correction
    te_log = TC.correctInsertions(transcript, reference, known_variants,
                                  size_cutoff, log_info)

    # The insertion must still be present
    assert transcript.SEQ == "AAATTGA"
    assert transcript.CIGAR == "3M2I2M"

    # Exactly one tab-separated log line is expected
    expected_fields = (
        "test_read", "chr1_202892096_202892098", "Insertion", "2",
        "Uncorrected", "VariantMatch",
    )
    assert te_log == "\t".join(expected_fields) + "\n"
def test_correctable_deletion(self):
    """ Small deletion with no matching variant must be corrected.

        Toy transcript with sequence AA-GA, where the '-' is a deletion of
        the base 'A'.
        chr1: 202,892,094 - 202,892,098. Deletion is at 202,892,096.
        After correction the sequence is AAAGA, the CIGAR is 5M, and the
        TE log reports the deletion as 'Corrected'. """

    read = [
        "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*", "0", "0",
        "AAGA", "*", "NM:i:1", "MD:Z:2^A2", "jI:B:i,-1", "jM:B:c,-1",
    ]
    reference = Fasta("input_files/hg38_chr1.fa")
    size_cutoff = 5
    known_variants = {}
    log_info = TC.init_log_info(read)

    # Build the transcript; no splice annotation is supplied
    transcript = t2.Transcript(read, reference, None)

    # Attempt the deletion correction
    te_log = TC.correctDeletions(transcript, reference, known_variants,
                                 size_cutoff, log_info)

    # The deleted base must have been filled in
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # Exactly one tab-separated TE log line is expected
    expected_fields = (
        "test_read", "chr1_202892095_202892096", "Deletion", "1",
        "Corrected", "NA",
    )
    assert te_log == "\t".join(expected_fields) + "\n"
def test_wrong_variant_mismatch(self):
    """ Mismatch at a known SNP position, but with a different base.

        Toy transcript with sequence AACGA, where the C is a mismatch to
        the reference base 'A'. A variant is registered at that position
        ('chr1_202892096' -> ['G']), but the read base does not match it.
        chr1: 202,892,094 - 202,892,098. Mismatch is at 202,892,096.
        The mismatch must therefore be corrected to the reference base. """

    read = [
        "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
        "AACGA", "*", "NM:i:1", "MD:Z:2A2", "jI:B:i,-1", "jM:B:c,-1",
    ]
    reference = Fasta("input_files/hg38_chr1.fa")
    known_variants = {"chr1_202892096": ["G"]}
    log_info = TC.init_log_info(read)

    # Build the transcript; no splice annotation is supplied
    transcript = t2.Transcript(read, reference, None)

    # Attempt the mismatch correction
    te_log = TC.correctMismatches(transcript, reference, known_variants,
                                  log_info)

    # The mismatched base must have been replaced by the reference base
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # One log line, flagged as corrected and not as a variant match
    assert te_log.count('\n') == 1
    assert "Corrected" in te_log
    assert "VariantMatch" not in te_log
def test_two_mismatches(self):
    """ Two mismatches in the same read must both be corrected.

        The read ACCGA (CIGAR 5M, MD:Z:1A0A2) has two adjacent mismatches.
        Useful for making sure that the TE log string is assembled
        correctly when more than one entry is produced. """

    read = [
        "test_read", "0", "chr1", "202892094", "255", "5M", "*", "0", "0",
        "ACCGA", "*", "NM:i:2", "MD:Z:1A0A2", "jI:B:i,-1", "jM:B:c,-1",
    ]
    reference = Fasta("input_files/hg38_chr1.fa")
    known_variants = {}
    log_info = TC.init_log_info(read)

    # Build the transcript; no splice annotation is supplied
    transcript = t2.Transcript(read, reference, None)

    # Attempt the mismatch correction
    te_log = TC.correctMismatches(transcript, reference, known_variants,
                                  log_info)

    # Both mismatched bases must have been replaced
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # Show the log, then check for one 'Corrected' line per mismatch
    print(te_log)
    assert te_log.count('\n') == 2
    assert te_log.count('Corrected') == 2
def test_crash_dmel(self):
    """ Noncanonical Drosophila junction that must stay uncorrected.

        This is a Drosophila junction that borders a small match preceded
        by a 7 bp deletion. It is supposed to crash correction, but did
        not in TC v2.0.1. The transcript must therefore come back with its
        original SEQ, CIGAR and MD, and the junction must be logged as
        'Uncorrected'. """

    # Process references
    sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
    tmp_dir = "scratch/dmel/TC"
    # Create the scratch directory directly instead of shelling out to
    # `mkdir -p`; a failure now raises instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)
    refs = dstruct.Struct()
    chroms = {"chr3R"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

    # Skip the '@' header lines and keep the last alignment record.
    # NOTE(review): the fixture presumably holds a single alignment, and the
    # transcript is assumed to be built after the loop - confirm.
    sam = "input_files/drosophila_example/no_SJ_corr.sam"
    sam_line = None
    with open(sam, 'r') as f:
        for raw_line in f:
            if raw_line.startswith("@"):
                continue
            sam_line = raw_line.strip().split('\t')
    assert sam_line is not None, "no alignment record found in %s" % sam

    # Init transcript object
    transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_line)

    # Remember the pre-correction state so it can be compared afterwards
    orig_CIGAR = transcript.CIGAR
    orig_seq = transcript.SEQ
    orig_MD = transcript.MD
    expected_TE = "\t".join([
        "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
        "chr3R_14890436_14890699", "NC_SJ_boundary", "5", "Uncorrected",
        "Other"
    ]) + "\n"

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    new_transcript, TE_entries = TC.cleanNoncanonical(
        transcript, refs, maxDist, logInfo)

    print(TE_entries)
    assert new_transcript.isCanonical == False
    assert TE_entries == expected_TE
    assert new_transcript.MD == orig_MD
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert new_transcript.CIGAR == orig_CIGAR
    assert new_transcript.SEQ == orig_seq
def test_crash_correction(self):
    """ Micro-exon case in which NCSJ correction must be abandoned.

        This is a case that is supposed to crash the NCSJ correction
        process, resulting in no correction. This is because the mapping
        has created a 7-bp micro-exon with a canonical but likely incorrect
        junction to its left, and a non-canonical junction on its right.
        Post-correction, we would end up with two introns next to each
        other with a zero-length exon, which is not valid. """

    # Process references
    sjFile = "input_files/chr11_sjs.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    # Create the scratch directory directly instead of shelling out to
    # `mkdir -p`; a failure now raises instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)
    refs = dstruct.Struct()
    chroms = {"chr11"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/hg38_chr11.fa")

    # Only the first line of the SAM fixture is read
    sam = "input_files/sams/microexon.sam"
    with open(sam, 'r') as f:
        sam_line = f.readline().strip().split('\t')

    # Init transcript object
    transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_line)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs, maxDist,
                                                  logInfo)

    # CIGAR the read is expected to still have after the failed correction
    orig_CIGAR = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                  "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                  "59M864N31M9891N69M1711N7M1341N47M13S")

    assert transcript.isCanonical == False
    assert transcript.MD == "MD:Z:2473"
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert transcript.CIGAR == orig_CIGAR
def test_correct_ncsj(self):
    """ Noncanonical junction near a canonical donor must be corrected.

        Toy transcript with sequence A|GAA, where the splice motif is
        noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126. After correction the transcript is
        canonical, with sequence AAGGAA and CIGAR 3M764N3M. """

    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test_ncsj/TC_tmp/"
    # Create the scratch directory directly instead of shelling out to
    # `mkdir -p`; a failure now raises instead of being silently ignored.
    os.makedirs(tmp_dir, exist_ok=True)
    refs = dstruct.Struct()
    chroms = {"chr1"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = [
        "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
        "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
    ]
    transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
    jnNumber = 0  # index of the junction checked after correction
    maxDist = 5
    logInfo = TC.init_log_info(sam_fields)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs, maxDist,
                                                  logInfo)

    assert transcript.isCanonical == True
    assert transcript.spliceJunctions[jnNumber].isCanonical == True
    assert transcript.SEQ == "AAGGAA"
    assert transcript.CIGAR == "3M764N3M"
    assert transcript.MD == "MD:Z:6"
    assert logInfo.corrected_NC_SJs == 1