def test_too_far_away(self):
    """ The noncanonical junction must be left uncorrected when the
        closest annotated junction is farther away than maxDist permits.

        Toy transcript with sequence A|GAA, where the splice motif is
        noncanonical.
        chr1: 23,071,357 - 23,072,126 """

    # Build the splice reference structures, then load the genome
    annotation_path = "input_files/test_junctions.txt"
    scratch_dir = "scratch/test_jns/TC_tmp/"
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        annotation_path, scratch_dir, {"chr1"})
    genome = Fasta("input_files/hg38_chr1.fa")

    # Single-junction toy read
    read_fields = ["test_read", "0", "chr1", "23071357", "255", "1M766N3M",
                   "*", "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:6"]
    transcript = t2.Transcript(read_fields, genome, sjAnnot)

    # Junction 0 with maxDist = 1, which is below the asserted distance of 2
    outcome = TC.attempt_jn_correction(transcript, 0, genome, donors,
                                       acceptors, sjAnnot, 1)
    correction_status, reason, dist = outcome

    assert correction_status == False
    assert reason == "TooFarFromAnnotJn"
    assert dist == 2
def test_crash(self):
    """ This is a Drosophila junction that borders a small match preceded
        by a 7 bp deletion. It is supposed to crash correction, which will
        result in a categorization of 'Other' in the log. """

    # Process references
    sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
    tmp_dir = "scratch/dmel_crash/TC_tmp/"
    chroms = set(["chr3R"])
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/drosophila_example/chr3R.fa")

    # Init transcript object (no NM/MD tags are supplied for this read)
    sam_fields = ["test_read", "0", "chr3R", "14890420", "255",
                  "7M7D2M264N7M", "*", "0", "0", "GATCAAACAACAAGTC", "*"]
    transcript = t2.Transcript(sam_fields, genome, sjAnnot)
    jnNumber = 0
    maxDist = 5

    # Attempt to correct the splice junction
    correction_status, reason, dist = TC.attempt_jn_correction(
        transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

    # The attempt must be reported as a failure of category "Other"
    assert correction_status == False
    assert reason == "Other"
    assert dist == 5
def test_noncanonical(self):
    """ The transcript starts out noncanonical and un-annotated, and must
        be both canonical and annotated once its junction has been
        corrected. """

    sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"
    annotation_path = "input_files/GM12878_SJs_chr1.tab"
    scratch_dir = "scratch/test_jIjM/TC_tmp/"

    # Bundle the genome and splice references in one struct
    refs = dstruct.Struct()
    refs.genome = Fasta("input_files/hg38_chr1.fa")
    splice_refs = TC.processSpliceAnnotation(annotation_path, scratch_dir,
                                             {"chr1"})
    refs.donors, refs.acceptors, refs.sjAnnot = splice_refs

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_record = handle.readline().strip()
    transcript, logInfo = TC.transcript_init(first_record, refs.genome,
                                             refs.sjAnnot)

    # State before correction
    assert transcript.allJnsAnnotated == False
    assert transcript.isCanonical == False

    # Now correct the junction and retest
    upd_transcript, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo)

    assert upd_transcript.allJnsAnnotated == True
    assert upd_transcript.isCanonical == True
def test_fix_donor_case3(self):
    """ Toy transcript with sequence AAGGT|GAA, where the splice motif is
        noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126

        So-called case #3 """

    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    chroms = set(["chr1"])
    donors, acceptors, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir,
                                                           chroms)
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "5M762N3M",
                  "*", "0", "0", "AAGGTGAA", "*", "NM:i:0", "MD:Z:8"]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0
    donor = (transcript.spliceJunctions[jnNumber]).bounds[0]

    # Attempt to correct the splice donor side of the junction (left),
    # passing a distance of -2
    new_seq, new_cigar = TC.fix_one_side_of_junction(
        transcript.CHROM, transcript.POS, jnNumber, donor, -2, genome,
        transcript.SEQ, transcript.CIGAR)

    # Two bases leave the left exon and the intron grows by two
    assert new_seq == "AAGGAA"
    assert new_cigar == "3M764N3M"
def test_no_correction(self):
    """ Make sure that the attributes stay the same if no correction
        was performed """

    # Process references (only the junction annotation is needed here)
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    chroms = set(["chr1"])
    _donors, _acceptors, sjDict = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "1M766N3M",
                  "*", "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0

    # Run the update function without having changed SEQ or CIGAR
    TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

    # Junction and transcript attributes must still describe the
    # original noncanonical state
    junction = transcript.spliceJunctions[jnNumber]
    assert junction.motif_code == "0"
    assert junction.isCanonical == False
    assert transcript.MD == "MD:Z:4"
    assert transcript.isCanonical == False
def test_find_closest_splice_acceptor_minus(self):
    """ Locate the nearest splice acceptor for a minus-strand junction;
        it sits 1 bp downstream. Note that dist is expressed relative to
        the genome, not to the direction of the transcript. """

    # Reference junctions
    annotation_path = "input_files/test_junctions.txt"
    scratch_dir = "scratch/test/TC_tmp/"
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        annotation_path, scratch_dir, {"chr1"})

    # Minus-strand intron spanning chr1:22071331-22073331
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction("test_read", 0, "chr1", 22071331,
                                 22073331, "-", genome, sjDict)

    # Look up the closest annotated acceptor to this junction's acceptor
    closest_acceptor = TC.find_closest_bound(
        junction.get_splice_acceptor(), acceptors)

    assert closest_acceptor.start == 22071329
    assert closest_acceptor.end == 22071330
    assert closest_acceptor.dist == -1
def test_find_closest_sj_plus(self):
    """ For a plus-strand junction spanning chr1:23071350-23072124, check
        the end coordinates of the closest reference donor and acceptor
        returned by find_closest_ref_junction. """

    # Reference junctions
    annotation_path = "input_files/test_junctions.txt"
    scratch_dir = "scratch/test/TC_tmp/"
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        annotation_path, scratch_dir, {"chr1"})

    # Plus-strand intron to be matched against the reference
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction("test_read", 0, "chr1", 23071350,
                                 23072124, "+", genome, sjDict)

    nearest = TC.find_closest_ref_junction(junction, donors, acceptors)
    closest_donor, closest_acceptor = nearest

    assert closest_donor.end == 23071360
    assert closest_acceptor.end == 23072123
def test_find_closest_splice_donor_minus(self):
    """ Toy case with several donors and acceptors close together: check
        that TC picks the reference donor closest to the supplied intron
        bound. There is an exact match for the donor, located at 23071360
        in 1-based coordinates and 23071359 in 0-based. """

    # Reference junctions
    annotation_path = "input_files/test_junctions.txt"
    scratch_dir = "scratch/test/TC_tmp/"
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        annotation_path, scratch_dir, {"chr1"})

    # Minus-strand intron spanning chr1:23070360-23071360
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction("test_read", 0, "chr1", 23070360,
                                 23071360, "-", genome, sjDict)

    # Look up the closest annotated donor to this junction's donor
    closest_donor = TC.find_closest_bound(junction.get_splice_donor(),
                                          donors)

    assert closest_donor.start == 23071359
    assert closest_donor.end == 23071360
    assert closest_donor.dist == 0
def test_tmp_files(self):
    """ Check that the expected tmp files are created. """

    sj_file = "input_files/toy_sjs_mixed_chroms.txt"
    chroms = set(["chr1", "chr2"])
    tmp_dir = "scratch/sj_reading_test/"
    # Create the scratch directory without a shell; unlike os.system,
    # a failure here raises instead of being silently ignored
    os.makedirs(tmp_dir, exist_ok=True)

    donor_bt, accept_bt, annot = TC.processSpliceAnnotation(
        sj_file, tmp_dir, chroms, process="test")

    # Check if paths of tmp files are correct
    expected_paths = (
        "scratch/sj_reading_test/splice_files/test_ref_splice_donors_tmp.bed",
        "scratch/sj_reading_test/splice_files/test_ref_splice_acceptors_tmp.bed",
        "scratch/sj_reading_test/splice_files/test_ref_splice_donors_tmp.sorted.bed",
        "scratch/sj_reading_test/splice_files/test_ref_splice_acceptors_tmp.sorted.bed",
    )
    for path in expected_paths:
        assert os.path.exists(path), "missing tmp file: " + path
def test_correct_jn(self):
    """ Toy transcript with sequence A|GAA, where the splice motif is
        noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126 """

    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test_jns/TC_tmp/"
    chroms = set(["chr1"])
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "1M766N3M",
                  "*", "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"]
    transcript = t2.Transcript(sam_fields, genome, sjAnnot)
    jnNumber = 0
    maxDist = 5

    # Attempt to correct the splice junction
    correction_status, reason, dist = TC.attempt_jn_correction(
        transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

    # Correction succeeds, so no failure reason is reported
    assert correction_status == True
    assert reason == "NA"
    assert dist == 2
def test_find_closest_splice_acceptor_plus(self):
    """ Locate the nearest splice acceptor for a plus-strand junction;
        it sits 17 bp upstream. """

    # Reference junctions
    annotation_path = "input_files/test_junctions.txt"
    scratch_dir = "scratch/test/TC_tmp/"
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        annotation_path, scratch_dir, {"chr1"})

    # Plus-strand intron spanning chr1:23071360-23072140
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction("test_read", 0, "chr1", 23071360,
                                 23072140, "+", genome, sjDict)

    # Look up the closest annotated acceptor to this junction's acceptor
    closest_acceptor = TC.find_closest_bound(
        junction.get_splice_acceptor(), acceptors)

    assert closest_acceptor.start == 23072122
    assert closest_acceptor.end == 23072123
    assert closest_acceptor.dist == -17
def test_crash_dmel(self):
    """ This is a Drosophila junction that borders a small match preceded
        by a 7 bp deletion. It is also supposed to crash correction, but
        did not in TC v2.0.1. """

    # Process references
    sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
    tmp_dir = "scratch/dmel/TC"
    # Create the scratch directory without a shell; unlike os.system,
    # a failure here raises instead of being silently ignored
    os.makedirs(tmp_dir, exist_ok=True)
    refs = dstruct.Struct()
    chroms = set(["chr3R"])
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

    # Skip SAM header lines; after the loop sam_line holds the fields of
    # the last alignment record seen
    sam = "input_files/drosophila_example/no_SJ_corr.sam"
    with open(sam, 'r') as f:
        for sam_line in f:
            if sam_line.startswith("@"):
                continue
            sam_line = sam_line.strip().split('\t')

    # Init transcript object
    transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_line)

    # Remember the pre-correction state for comparison afterwards
    orig_CIGAR = transcript.CIGAR
    orig_seq = transcript.SEQ
    orig_MD = transcript.MD
    expected_TE = "\t".join([
        "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
        "chr3R_14890436_14890699", "NC_SJ_boundary", "5", "Uncorrected",
        "Other"
    ]) + "\n"

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    new_transcript, TE_entries = TC.cleanNoncanonical(
        transcript, refs, maxDist, logInfo)

    print(TE_entries)
    # The junction stays uncorrected and the read is left untouched
    assert new_transcript.isCanonical == False
    assert TE_entries == expected_TE
    assert new_transcript.MD == orig_MD
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert new_transcript.CIGAR == orig_CIGAR
    assert new_transcript.SEQ == orig_seq
def test_two_annotated_SJs(self):
    """ Transcript with 2 junctions and each match the provided
        reference """

    sam = "input_files/sams/perfectReferenceMatch_twoIntrons.sam"
    genome = Fasta("input_files/hg38_chr1.fa")
    sjFile = "input_files/GM12878_SJs_chr1.tab"
    tmp_dir = "scratch/test_jIjM/TC_tmp/"
    chroms = set(["chr1"])
    donors, acceptors, sjDict = TC.processSpliceAnnotation(sjFile, tmp_dir,
                                                           chroms)

    # Only the first line of the SAM file is used
    with open(sam, 'r') as f:
        sam_line = f.readline().strip().split('\t')
    transcript = t2.Transcript(sam_line, genome, sjDict)

    assert transcript.allJnsAnnotated == True
    assert transcript.isCanonical == True
def test_splice_donors(self):
    """ Make sure that the correct positions got labeled as splice
        donors """

    sj_file = "input_files/toy_sjs_mixed_chroms.txt"
    chroms = set(["chr1", "chr2"])
    tmp_dir = "scratch/sj_reading_test/"
    # Create the scratch directory without a shell; unlike os.system,
    # a failure here raises instead of being silently ignored
    os.makedirs(tmp_dir, exist_ok=True)

    donor_bt, accept_bt, annot = TC.processSpliceAnnotation(
        sj_file, tmp_dir, chroms, process="test")

    # Remember, file is 1-based but BedTool is 0-based
    expected_donors = set([99, 399])
    donors = {donor.start for donor in donor_bt}

    assert donors == expected_donors
def test_crash_correction(self):
    """ This is a case that is supposed to crash the NCSJ correction
        process, resulting in no correction. This is because the mapping
        has created a 7-bp micro-exon with a canonical but likely
        incorrect junction to its left, and a non-canonical junction on
        its right. Post-correction, we end up with two introns next to
        each other with a zero-length exon, which is not valid. """

    # Process references
    sjFile = "input_files/chr11_sjs.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    # Create the scratch directory without a shell; unlike os.system,
    # a failure here raises instead of being silently ignored
    os.makedirs(tmp_dir, exist_ok=True)
    refs = dstruct.Struct()
    chroms = set(["chr11"])
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/hg38_chr11.fa")

    # Only the first line of the SAM file is used
    sam = "input_files/sams/microexon.sam"
    with open(sam, 'r') as f:
        sam_line = f.readline().strip().split('\t')

    # Init transcript object
    transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_line)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs, maxDist,
                                                  logInfo)

    # CIGAR the transcript is expected to still carry after the failed
    # correction attempt
    orig_CIGAR = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                  "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                  "59M864N31M9891N69M1711N7M1341N47M13S")

    assert transcript.isCanonical == False
    assert transcript.MD == "MD:Z:2473"
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert transcript.CIGAR == orig_CIGAR
def test_correct_ncsj(self):
    """ Toy transcript with sequence A|GAA, where the splice motif is
        noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126 """

    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test_ncsj/TC_tmp/"
    # Create the scratch directory without a shell; unlike os.system,
    # a failure here raises instead of being silently ignored
    os.makedirs(tmp_dir, exist_ok=True)
    refs = dstruct.Struct()
    chroms = set(["chr1"])
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "1M766N3M",
                  "*", "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"]
    transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
    jnNumber = 0
    maxDist = 5
    logInfo = TC.init_log_info(sam_fields)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs, maxDist,
                                                  logInfo)

    # Junction, sequence, CIGAR, MD and the log counter must all reflect
    # the correction
    assert transcript.isCanonical == True
    assert transcript.spliceJunctions[jnNumber].isCanonical == True
    assert transcript.SEQ == "AAGGAA"
    assert transcript.CIGAR == "3M764N3M"
    assert transcript.MD == "MD:Z:6"
    assert logInfo.corrected_NC_SJs == 1
def test_update(self):
    """ Toy transcript with sequence A|GAA, where the splice motif is
        noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126 """

    # Process references (only the junction annotation is needed here)
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    chroms = set(["chr1"])
    _donors, _acceptors, sjDict = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = ["test_read", "0", "chr1", "23071357", "255", "1M766N3M",
                  "*", "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0
    donor = (transcript.spliceJunctions[jnNumber]).bounds[0]

    # Attempt to correct the splice donor side of the junction (left),
    # passing a distance of 2
    transcript.SEQ, transcript.CIGAR = TC.fix_one_side_of_junction(
        transcript.CHROM, transcript.POS, jnNumber, donor, 2, genome,
        transcript.SEQ, transcript.CIGAR)

    # Now test the update function
    TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

    # Junction and transcript attributes must reflect the corrected state
    junction = transcript.spliceJunctions[jnNumber]
    assert junction.motif_code == "21"
    assert junction.isCanonical == True
    assert transcript.MD == "MD:Z:6"
    assert transcript.isCanonical == True
def test_chrom_filtering(self):
    """ Check that only chr1 and chr2 junctions get saved """

    sj_file = "input_files/toy_sjs_mixed_chroms.txt"
    chroms = set(["chr1", "chr2"])
    tmp_dir = "scratch/sj_reading_test/"
    # Create the scratch directory without a shell; unlike os.system,
    # a failure here raises instead of being silently ignored
    os.makedirs(tmp_dir, exist_ok=True)

    donor_bt, accept_bt, annot = TC.processSpliceAnnotation(
        sj_file, tmp_dir, chroms, process="test")

    # Check donor chroms
    donor_chroms = {pos.chrom for pos in donor_bt}
    assert donor_chroms == chroms

    # Check acceptor chroms
    acc_chroms = {pos.chrom for pos in accept_bt}
    assert acc_chroms == chroms
def test_DIM_nc(self):
    """ Correct a transcript containing a deletion, insertion, mismatch,
        and noncanonical splice junction, then check the corrected SAM
        record and the first line of the transcript log. """

    # Initialize options etc.
    sam = "input_files/sams/deletion_insertion_mismatch_nc.sam"
    genome = Fasta("input_files/hg38_chr1.fa")
    sjFile = "input_files/GM12878_SJs_chr1.tab"
    tmp_dir = "scratch/example/TC_tmp/"
    # Create the scratch directory without a shell; unlike os.system,
    # a failure here raises instead of being silently ignored
    os.makedirs(tmp_dir, exist_ok=True)

    chroms = set(["chr1"])
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)

    refs = dstruct.Struct()
    refs.sjAnnot = sjAnnot
    refs.genome = genome
    refs.donors = donors
    refs.acceptors = acceptors
    refs.snps = {}
    refs.deletions = {}
    refs.insertions = {}

    options = dstruct.Struct()
    options.maxLenIndel = 5
    options.maxSJOffset = 5
    options.correctMismatches = "true"
    options.correctIndels = "true"
    options.correctSJs = "true"
    options.primaryOnly = True
    options.canonOnly = False

    # Only the first line of the SAM file is corrected
    with open(sam, 'r') as f:
        transcripts = [f.readline().strip()]

    # Open the outputs inside try/finally so that every handle opened so
    # far is closed (and flushed) even if opening or correction raises
    outfiles = dstruct.Struct()
    try:
        outfiles.TElog = open(tmp_dir + "DIM_nc_clean.TE.log", 'w')
        outfiles.sam = open(tmp_dir + "DIM_nc_clean.sam", 'w')
        outfiles.fasta = open(tmp_dir + "DIM_nc_clean.fasta", 'w')
        outfiles.log = open(tmp_dir + "DIM_nc_clean.log", 'w')

        # Correct the transcript
        TC.batch_correct(transcripts, options, refs, outfiles)
    finally:
        # Close the output files
        for handle in outfiles.values():
            handle.close()

    # Expected transcript attributes post-correction
    correct_CIGAR = ("12M1134N126M163N202M866N74M924N191M1777N127M2109N"
                     "157M88N159M932N633M274N117M7696N170M1215N629M938N"
                     "29M428N133M254N166M390N212M253N89M163N483M")
    correct_MD = "MD:Z:3709"
    correct_NM = "NM:i:0"
    correct_jI = (
        "jI:B:i,150941429,150942562,150942689,150942851,150943054,"
        "150943919,150943994,150944917,150945109,150946885,150947013,"
        "150949121,150949279,150949366,150949526,150950457,150951091,"
        "150951364,150951482,150959177,150959348,150960562,150961192,"
        "150962129,150962159,150962586,150962720,150962973,150963140,"
        "150963529,150963742,150963994,150964084,150964246")
    correct_jM = "jM:B:c,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21"

    # Read in transcript from outfile
    with open(tmp_dir + "DIM_nc_clean.sam", 'r') as f:
        sam_line = f.readline().strip().split('\t')
    transcript = t2.Transcript(sam_line, genome, sjAnnot)

    assert transcript.CIGAR == correct_CIGAR
    assert transcript.MD == correct_MD
    assert transcript.NM == correct_NM
    assert transcript.jI == correct_jI
    assert transcript.jM == correct_jM

    # Read logs and make sure they are OK
    expected_log = "\t".join([
        "c34150/f1p1/3707", "primary", "2", "0", "0", "1", "0", "0", "2",
        "0", "1", "0"
    ])

    with open(tmp_dir + "DIM_nc_clean.log", 'r') as f:
        log = f.readline().strip()
    assert log == expected_log