def test_no_correction(self):
    """Make sure that the attributes stay the same if no correction was
    performed.

    TC.update_post_ncsj_correction is called on an untouched toy read
    (AGAA, CIGAR 1M766N3M). The junction is then expected to have motif
    code "0" and be noncanonical, and the transcript is expected to be
    noncanonical with MD tag "MD:Z:4".
    """
    # Process references; only the junction annotation lookup is used below
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    chroms = {"chr1"}
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = [
        "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*",
        "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
    ]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0

    # Now test the update function (no correction was applied beforehand)
    TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

    junction = transcript.spliceJunctions[jnNumber]
    assert junction.motif_code == "0"
    assert junction.isCanonical == False
    assert transcript.MD == "MD:Z:4"
    assert transcript.isCanonical == False
def test_find_closest_splice_acceptor_plus(self):
    """Find the closest splice acceptor, which is 17 bp upstream.

    Plus strand.
    """
    # Build the reference donor/acceptor lookups
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt",
        "scratch/test/TC_tmp/",
        {"chr1"},
    )

    # Describe the intron whose acceptor we want to match
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction(
        "test_read",  # transcript ID
        0,            # junction number
        "chr1",
        23071360,     # intron start
        23072140,     # intron end
        "+",
        genome,
        sjDict,
    )

    query_acceptor = junction.get_splice_acceptor()
    match = TC.find_closest_bound(query_acceptor, acceptors)

    assert match.start == 23072122
    assert match.end == 23072123
    assert match.dist == -17
def test_variant_insertion(self):
    """Toy transcript with sequence AAATTGA, where the Ts are a 2 bp
    insertion that matches a known variant.

    chr1: 202,892,094 - 202,892,098. Insertion is between position
    202,892,096 and 202,892,097. The genomic position used to refer to it
    is 202,892,097. Because the insertion matches the variant, the read
    is expected to be left as-is and logged as an uncorrected VariantMatch.
    """
    read_fields = [
        "test_read", "0", "chr1", "202892094", "255", "3M2I2M", "*",
        "0", "0", "AAATTGA", "*", "NM:i:2", "MD:Z:5",
        "jI:B:i,-1", "jM:B:c,-1"
    ]
    known_variants = {"chr1_202892096_202892098": "TT"}
    max_indel_len = 5

    genome = Fasta("input_files/hg38_chr1.fa")
    log_info = TC.init_log_info(read_fields)

    # Init transcript object (no splice annotation is supplied)
    transcript = t2.Transcript(read_fields, genome, None)

    # Run correction
    te_log = TC.correctInsertions(transcript, genome, known_variants,
                                  max_indel_len, log_info)

    # The insertion must still be present in the sequence and the CIGAR
    assert transcript.SEQ == "AAATTGA"
    assert transcript.CIGAR == "3M2I2M"

    # Check the log entries
    log_columns = ["test_read", "chr1_202892096_202892098", "Insertion",
                   "2", "Uncorrected", "VariantMatch"]
    assert te_log == "\t".join(log_columns) + "\n"
def test_sj_corr_off(self):
    """Splice reference provided, but correction set to off.

    Expected behavior is to skip SJ ref initialization because it would be
    a waste of time.
    """
    # Initialize options etc.
    sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
    tmp_dir = "scratch/prep_refs/sj_off/TC_tmp/"
    # Create the scratch directory without going through a shell
    os.makedirs(tmp_dir, exist_ok=True)

    options = dstruct.Struct()
    options.refGenome = "input_files/hg38_chr1.fa"
    options.tmp_dir = tmp_dir
    options.maxLenIndel = options.maxSJOffset = 5
    options.correctSJs = "false"
    options.variantFile = None
    options.sjAnnotFile = "input_files/test_junctions.txt"

    header, chroms, sam_chunks = TC.split_SAM(sam, 1)
    refs = TC.prep_refs(options, sam_chunks[0], header)

    # Check that variant dicts are empty
    assert refs.snps == refs.insertions == refs.deletions == {}

    # Check that SJ bedtools and annot lookup are empty
    assert refs.donors is None
    assert refs.acceptors is None
    assert refs.sjAnnot == set()
def test_variants(self):
    """A variant file is provided.

    Only the SNP lookup is expected to be populated; the insertion and
    deletion lookups and all splice junction references should be empty.
    """
    # Initialize options etc.
    sam = "input_files/vcf_test/read_with_snps.sam"
    tmp_dir = "scratch/prep_refs/variant/TC_tmp/"
    # Create the scratch directory without going through a shell
    os.makedirs(tmp_dir, exist_ok=True)

    options = dstruct.Struct()
    options.refGenome = "input_files/hg38_chr11.fa"
    options.tmp_dir = tmp_dir
    options.maxLenIndel = options.maxSJOffset = 5
    options.correctSJs = "false"
    options.variantFile = "input_files/vcf_test/snps.vcf"
    options.sjAnnotFile = None

    header, chroms, sam_chunks = TC.split_SAM(sam, 1)
    refs = TC.prep_refs(options, sam_chunks[0], header)

    # Check that variant deletion and insertion dicts are empty
    assert len(refs.insertions) == 0
    assert len(refs.deletions) == 0
    assert len(refs.snps) > 0

    # Check that SJ bedtools and annot lookup are empty
    assert refs.donors is None
    assert refs.acceptors is None
    assert refs.sjAnnot == set()
def test_genome_only(self):
    """Make sure that the prep_refs function works under the simplest
    possible option setting: no variants or SJs provided.
    """
    # Initialize options etc.
    sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
    tmp_dir = "scratch/prep_refs/genome-only/TC_tmp/"
    # Create the scratch directory without going through a shell
    os.makedirs(tmp_dir, exist_ok=True)

    options = dstruct.Struct()
    options.refGenome = "input_files/hg38_chr1.fa"
    options.tmp_dir = tmp_dir
    options.maxLenIndel = options.maxSJOffset = 5
    options.correctSJs = "false"
    options.variantFile = None
    options.sjAnnotFile = None

    header, chroms, sam_chunks = TC.split_SAM(sam, 1)
    refs = TC.prep_refs(options, sam_chunks[0], header)

    # Check that variant dicts are empty
    assert refs.snps == refs.insertions == refs.deletions == {}

    # Check that SJ bedtools and annot lookup are empty
    assert refs.donors is None
    assert refs.acceptors is None
    assert refs.sjAnnot == set()
def test_sjs(self):
    """Genome and splice junction reference provided.

    Variant structs should still be empty.
    """
    # Initialize options etc.
    sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
    tmp_dir = "scratch/prep_refs/sjs/TC_tmp/"
    # Create the scratch directory without going through a shell
    os.makedirs(tmp_dir, exist_ok=True)

    options = dstruct.Struct()
    options.refGenome = "input_files/hg38_chr1.fa"
    options.tmp_dir = tmp_dir
    options.maxLenIndel = options.maxSJOffset = 5
    options.correctSJs = "true"
    options.variantFile = None
    options.sjAnnotFile = "input_files/test_junctions.txt"

    header, chroms, sam_chunks = TC.split_SAM(sam, 1)
    refs = TC.prep_refs(options, sam_chunks[0], header)

    # Check that variant dicts are empty
    assert refs.snps == refs.insertions == refs.deletions == {}

    # Check SJ bedtools and annot lookup
    assert refs.donors.count() == 3
    assert refs.acceptors.count() == 2  # Same acceptor appears in 2 jns
    assert len(refs.sjAnnot) == 3
def test_wrong_variant_mismatch(self):
    """Toy transcript with sequence AACGA, where the C is a mismatch to
    the reference base 'A' in the location, but not matching, a known SNP.

    chr1: 202,892,094 - 202,892,098. Mismatch is at 202,892,096. Since the
    read base (C) differs from the variant allele (G), the mismatch is
    expected to be corrected back to the reference.
    """
    read_fields = [
        "test_read", "0", "chr1", "202892094", "255", "5M", "*",
        "0", "0", "AACGA", "*", "NM:i:1", "MD:Z:2A2",
        "jI:B:i,-1", "jM:B:c,-1"
    ]
    known_snps = {"chr1_202892096": ["G"]}

    genome = Fasta("input_files/hg38_chr1.fa")
    log_info = TC.init_log_info(read_fields)

    # Init transcript object (no splice annotation is supplied)
    transcript = t2.Transcript(read_fields, genome, None)

    # Run correction
    te_log = TC.correctMismatches(transcript, genome, known_snps, log_info)

    # Check to see if correction was successful
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # Exactly one log entry, reported as corrected and not as a variant
    assert te_log.count('\n') == 1
    assert "Corrected" in te_log
    assert "VariantMatch" not in te_log
def test_two_mismatches(self):
    """Correct 2 mismatches in the same read.

    Useful for making sure that the TE log string is correct.
    """
    read_fields = [
        "test_read", "0", "chr1", "202892094", "255", "5M", "*",
        "0", "0", "ACCGA", "*", "NM:i:2", "MD:Z:1A0A2",
        "jI:B:i,-1", "jM:B:c,-1"
    ]
    no_variants = {}

    genome = Fasta("input_files/hg38_chr1.fa")
    log_info = TC.init_log_info(read_fields)

    # Init transcript object (no splice annotation is supplied)
    transcript = t2.Transcript(read_fields, genome, None)

    # Run correction
    te_log = TC.correctMismatches(transcript, genome, no_variants, log_info)

    # Check to see if correction was successful
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # One newline-terminated "Corrected" entry is expected per mismatch
    print(te_log)
    assert te_log.count('\n') == 2
    assert te_log.count('Corrected') == 2
def test_find_closest_sj_plus(self):
    """Look up the closest reference donor and acceptor for a plus-strand
    junction (chr1, 23071350 - 23072124) and check the end coordinate of
    each match.
    """
    # Build the reference donor/acceptor lookups
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt",
        "scratch/test/TC_tmp/",
        {"chr1"},
    )

    # Describe the query intron
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction(
        "test_read",  # transcript ID
        0,            # junction number
        "chr1",
        23071350,     # intron start
        23072124,     # intron end
        "+",
        genome,
        sjDict,
    )

    nearest = TC.find_closest_ref_junction(junction, donors, acceptors)
    nearest_donor, nearest_acceptor = nearest

    assert nearest_donor.end == 23071360
    assert nearest_acceptor.end == 23072123
def test_find_closest_splice_donor_minus(self):
    """For a toy case with multiple donors and acceptors in close
    proximity, test whether TC can find the closest reference donor to the
    supplied intron bound.

    Similar to before, there is an exact match for the donor, located at
    23071360 in 1-based coordinates and 23071359 in 0-based.
    """
    # Build the reference donor/acceptor lookups
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt",
        "scratch/test/TC_tmp/",
        {"chr1"},
    )

    # Describe the minus-strand intron whose donor we want to match
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction(
        "test_read",  # transcript ID
        0,            # junction number
        "chr1",
        23070360,     # intron start
        23071360,     # intron end
        "-",
        genome,
        sjDict,
    )

    query_donor = junction.get_splice_donor()
    match = TC.find_closest_bound(query_donor, donors)

    assert match.start == 23071359
    assert match.end == 23071360
    assert match.dist == 0
def test_fix_donor_case3(self):
    """Toy transcript with sequence AAGGT|GAA, where the splice motif is
    noncanonical but located 2 bp from a canonical splice donor.

    chr1: 23,071,357 - 23,072,126. So-called case #3: the donor side is
    fixed with a distance of -2, which is expected to shorten the first
    exon by 2 bases (5M762N3M -> 3M764N3M, AAGGTGAA -> AAGGAA).
    """
    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    chroms = {"chr1"}
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = [
        "test_read", "0", "chr1", "23071357", "255", "5M762N3M", "*",
        "0", "0", "AAGGTGAA", "*", "NM:i:0", "MD:Z:8"
    ]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0
    donor = transcript.spliceJunctions[jnNumber].bounds[0]

    # Attempt to correct the splice donor side of the junction (left)
    new_seq, new_cigar = TC.fix_one_side_of_junction(
        transcript.CHROM, transcript.POS, jnNumber, donor, -2, genome,
        transcript.SEQ, transcript.CIGAR)

    assert new_seq == "AAGGAA"
    assert new_cigar == "3M764N3M"
def test_variant_deletion(self):
    """Same deletion again, but with a matching variant at the same
    location.

    Correct action is to leave the deletion in place.
    """
    read_fields = [
        "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*",
        "0", "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2",
        "jI:B:i,-1", "jM:B:c,-1"
    ]
    known_variants = {"chr1_202892095_202892096": 1}
    max_indel_len = 5

    genome = Fasta("input_files/hg38_chr1.fa")
    log_info = TC.init_log_info(read_fields)

    # Init transcript object (no splice annotation is supplied)
    transcript = t2.Transcript(read_fields, genome, None)

    # Run correction
    te_log = TC.correctDeletions(transcript, genome, known_variants,
                                 max_indel_len, log_info)

    # Check to see if deletion is still there as expected
    assert transcript.SEQ == "AAGA"
    assert transcript.CIGAR == "2M1D2M"

    # Check TE log
    log_columns = ["test_read", "chr1_202892095_202892096", "Deletion",
                   "1", "Uncorrected", "VariantMatch"]
    assert te_log == "\t".join(log_columns) + "\n"
def test_not_correctable_deletion(self):
    """Same deletion again, but correction cutoff set to 0.

    The 1 bp deletion is expected to stay in place and be logged as
    uncorrected with reason "TooLarge".
    """
    read_fields = [
        "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*",
        "0", "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2",
        "jI:B:i,-1", "jM:B:c,-1"
    ]
    no_variants = {}
    max_indel_len = 0

    genome = Fasta("input_files/hg38_chr1.fa")
    log_info = TC.init_log_info(read_fields)

    # Init transcript object (no splice annotation is supplied)
    transcript = t2.Transcript(read_fields, genome, None)

    # Run correction
    te_log = TC.correctDeletions(transcript, genome, no_variants,
                                 max_indel_len, log_info)

    # The deletion must still be present
    assert transcript.SEQ == "AAGA"
    assert transcript.CIGAR == "2M1D2M"

    # Check TE log
    log_columns = ["test_read", "chr1_202892095_202892096", "Deletion",
                   "1", "Uncorrected", "TooLarge"]
    assert te_log == "\t".join(log_columns) + "\n"
def test_crash(self):
    """This is a Drosophila junction that borders a small match preceded
    by a 7 bp deletion.

    It is supposed to crash correction, which will result in a
    categorization of 'Other' in the log.
    """
    # Process references
    sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
    tmp_dir = "scratch/dmel_crash/TC_tmp/"
    chroms = {"chr3R"}
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/drosophila_example/chr3R.fa")

    # Init transcript object
    sam_fields = [
        "test_read", "0", "chr3R", "14890420", "255", "7M7D2M264N7M", "*",
        "0", "0", "GATCAAACAACAAGTC", "*"
    ]
    transcript = t2.Transcript(sam_fields, genome, sjAnnot)
    jnNumber = 0
    maxDist = 5

    # Attempt to correct the splice junction
    correction_status, reason, dist = TC.attempt_jn_correction(
        transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

    assert correction_status == False
    assert reason == "Other"
    assert dist == 5
def test_correctable_deletion(self):
    """Toy transcript with sequence AA-GA, where the '-' is a deletion of
    the base 'A'.

    chr1: 202,892,094 - 202,892,098. Deletion is at 202,892,096. With a
    length cutoff of 5 and no variants, the deleted base is expected to be
    filled back in.
    """
    read_fields = [
        "test_read", "0", "chr1", "202892094", "255", "2M1D2M", "*",
        "0", "0", "AAGA", "*", "NM:i:1", "MD:Z:2^A2",
        "jI:B:i,-1", "jM:B:c,-1"
    ]
    no_variants = {}
    max_indel_len = 5

    genome = Fasta("input_files/hg38_chr1.fa")
    log_info = TC.init_log_info(read_fields)

    # Init transcript object (no splice annotation is supplied)
    transcript = t2.Transcript(read_fields, genome, None)

    # Run correction
    te_log = TC.correctDeletions(transcript, genome, no_variants,
                                 max_indel_len, log_info)

    # Check to see if correction was successful
    assert transcript.SEQ == "AAAGA"
    assert transcript.CIGAR == "5M"

    # Check TE log
    log_columns = ["test_read", "chr1_202892095_202892096", "Deletion",
                   "1", "Corrected", "NA"]
    assert te_log == "\t".join(log_columns) + "\n"
def test_too_far_away(self):
    """A case where the NCSJ should not be corrected because it is too far
    away from the closest annotated junction relative to the maxDist
    parameter.

    Toy transcript with sequence A|GAA, where the splice motif is
    noncanonical. chr1: 23,071,357 - 23,072,126
    """
    # Build the reference donor/acceptor lookups
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt",
        "scratch/test_jns/TC_tmp/",
        {"chr1"},
    )
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    # NOTE(review): the MD tag is "MD:Z:6" although the read has only 4
    # aligned bases -- confirm this is intentional.
    read_fields = [
        "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*",
        "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:6"
    ]
    transcript = t2.Transcript(read_fields, genome, sjAnnot)

    # maxDist of 1 is smaller than the distance of 2 asserted below
    jn_index = 0
    max_dist = 1
    outcome = TC.attempt_jn_correction(
        transcript, jn_index, genome, donors, acceptors, sjAnnot, max_dist)
    was_corrected, why, distance = outcome

    assert was_corrected == False
    assert why == "TooFarFromAnnotJn"
    assert distance == 2
def test_correct_jn(self):
    """Toy transcript with sequence A|GAA, where the splice motif is
    noncanonical but located 2 bp from a canonical splice donor.

    chr1: 23,071,357 - 23,072,126. With maxDist = 5 the correction is
    expected to succeed with reason "NA" and a distance of 2.
    """
    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test_jns/TC_tmp/"
    chroms = {"chr1"}
    donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = [
        "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*",
        "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
    ]
    transcript = t2.Transcript(sam_fields, genome, sjAnnot)
    jnNumber = 0
    maxDist = 5

    # Attempt to correct the splice junction
    correction_status, reason, dist = TC.attempt_jn_correction(
        transcript, jnNumber, genome, donors, acceptors, sjAnnot, maxDist)

    assert correction_status == True
    assert reason == "NA"
    assert dist == 2
def test_noncanonical(self):
    """Transcript should be noncanonical and un-annotated prior to
    correction, but be canonical and annotated afterwards.
    """
    sam_path = "input_files/sams/deletion_insertion_mismatch_nc.sam"

    # Assemble the reference struct: splice annotation first, then genome
    refs = dstruct.Struct()
    refs.genome = Fasta("input_files/hg38_chr1.fa")
    splice_refs = TC.processSpliceAnnotation(
        "input_files/GM12878_SJs_chr1.tab",
        "scratch/test_jIjM/TC_tmp/",
        {"chr1"},
    )
    refs.donors, refs.acceptors, refs.sjAnnot = splice_refs

    # Only the first line of the SAM file is used
    with open(sam_path, 'r') as handle:
        first_line = handle.readline().strip()

    transcript, logInfo = TC.transcript_init(first_line, refs.genome,
                                             refs.sjAnnot)

    assert transcript.allJnsAnnotated == False
    assert transcript.isCanonical == False

    # Now correct the junction and retest
    corrected, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo)

    assert corrected.allJnsAnnotated == True
    assert corrected.isCanonical == True
def test_find_closest_splice_acceptor_minus(self):
    """Find the closest splice acceptor, which is 1 bp downstream.

    Minus strand. Note that dist is relative to the genome, not to the
    direction of the transcript.
    """
    # Build the reference donor/acceptor lookups
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt",
        "scratch/test/TC_tmp/",
        {"chr1"},
    )

    # Describe the minus-strand intron whose acceptor we want to match
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction(
        "test_read",  # transcript ID
        0,            # junction number
        "chr1",
        22071331,     # intron start
        22073331,     # intron end
        "-",
        genome,
        sjDict,
    )

    query_acceptor = junction.get_splice_acceptor()
    match = TC.find_closest_bound(query_acceptor, acceptors)

    assert match.start == 22071329
    assert match.end == 22071330
    assert match.dist == -1
def test_crash_dmel(self):
    """This is a Drosophila junction that borders a small match preceded
    by a 7 bp deletion.

    It is also supposed to crash correction, but did not in TC v2.0.1.
    The transcript is expected to come back unchanged (same SEQ, CIGAR and
    MD) with a single uncorrected 'Other' entry in the TE log.
    """
    # Process references
    sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
    tmp_dir = "scratch/dmel/TC"
    # Create the scratch directory without going through a shell
    os.makedirs(tmp_dir, exist_ok=True)

    refs = dstruct.Struct()
    chroms = {"chr3R"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

    # Use the last non-header record of the SAM file as the test read
    sam = "input_files/drosophila_example/no_SJ_corr.sam"
    sam_fields = None
    with open(sam, 'r') as f:
        for line in f:
            if not line.startswith("@"):
                sam_fields = line.strip().split('\t')
    assert sam_fields is not None, "no alignment record found in " + sam

    # Init transcript object
    transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_fields)

    # Remember the pre-correction state so we can check nothing changed
    orig_CIGAR = transcript.CIGAR
    orig_seq = transcript.SEQ
    orig_MD = transcript.MD
    expected_TE = "\t".join([
        "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
        "chr3R_14890436_14890699", "NC_SJ_boundary", "5",
        "Uncorrected", "Other"
    ]) + "\n"

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    new_transcript, TE_entries = TC.cleanNoncanonical(
        transcript, refs, maxDist, logInfo)

    print(TE_entries)
    assert new_transcript.isCanonical == False
    assert TE_entries == expected_TE
    assert new_transcript.MD == orig_MD
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert new_transcript.CIGAR == orig_CIGAR
    assert new_transcript.SEQ == orig_seq
def test_both_inside(self):
    """'Both inside' case for the combined junction distance.

    dist_0 = -2, dist_1 = +2, combined dist = 4
    """
    dist_0, dist_1 = -2, 2
    combined = TC.combinedJunctionDist(dist_0, dist_1)
    assert combined == 4
def test_tmp_files(self):
    """Check that the expected tmp files are created.

    After TC.processSpliceAnnotation runs with process="test", the
    unsorted and sorted donor/acceptor BED files are expected to exist
    under the splice_files subdirectory of the tmp dir.
    """
    sj_file = "input_files/toy_sjs_mixed_chroms.txt"
    chroms = {"chr1", "chr2"}
    tmp_dir = "scratch/sj_reading_test/"
    # Create the scratch directory without going through a shell
    os.makedirs(tmp_dir, exist_ok=True)

    donor_bt, accept_bt, annot = TC.processSpliceAnnotation(
        sj_file, tmp_dir, chroms, process="test")

    # Check if paths of tmp files are correct
    expected_files = [
        "scratch/sj_reading_test/splice_files/test_ref_splice_donors_tmp.bed",
        "scratch/sj_reading_test/splice_files/test_ref_splice_acceptors_tmp.bed",
        "scratch/sj_reading_test/splice_files/test_ref_splice_donors_tmp.sorted.bed",
        "scratch/sj_reading_test/splice_files/test_ref_splice_acceptors_tmp.sorted.bed",
    ]
    for path in expected_files:
        # The message names the missing file when the assertion fails
        assert os.path.exists(path), path
def test_create_tmp_sam(self):
    """Create a tmp sam file from the mock header and transcripts
    provided.

    Then, check the order of the lines in the tmp file just to be sure.
    The file must contain exactly the header lines followed by the
    transcript lines: extra, missing or reordered lines all fail.
    """
    sam_header = ["HLine1", "HLine2"]
    sam_transcripts = [
        "\t".join(["read1", "mapping", "chr1", "..."]),
        "\t".join(["read2", "mapping", "chr2", "..."])
    ]
    tmp_dir = "scratch/tmp_sam_test/"

    fname, chroms = TC.create_tmp_sam(sam_header, sam_transcripts, tmp_dir,
                                      process="test")

    assert fname == "scratch/tmp_sam_test/split_uncorr_sams/test.sam"
    assert chroms == {"chr1", "chr2"}

    # Now check the integrity of the output file. Comparing whole lists
    # also catches a truncated or empty file, which a per-line counter
    # would let through.
    expected_lines = sam_header + sam_transcripts
    with open(fname, 'r') as f:
        observed_lines = [line.strip() for line in f]
    assert observed_lines == expected_lines
def test_primary_monoexon_read(self):
    """The supplied read is a primary alignment.

    This means that a transcript object is created, and the logInfo struct
    notes the primary status.
    """
    # Only the first line of the SAM file is used
    with open("input_files/sams/perfectReferenceMatch_noIntrons.sam",
              'r') as handle:
        first_line = handle.readline().strip()

    genome = Fasta("input_files/hg38_chr1.fa")
    transcript, logInfo = TC.transcript_init(first_line, genome, set())

    # Fields parsed from the SAM record
    assert transcript.QNAME == "c21031/f2p3/3400"
    assert transcript.FLAG == 0
    assert transcript.CHROM == "chr1"
    assert transcript.POS == 192575775
    assert transcript.CIGAR == "155M"
    assert transcript.MD == "MD:Z:155"

    assert logInfo.Mapping == "primary"

    # Every correction counter is expected to hold "NA" at this point
    na_counters = (
        "corrected_deletions",
        "uncorrected_deletions",
        "variant_deletions",
        "corrected_insertions",
        "uncorrected_insertions",
        "variant_insertions",
        "corrected_mismatches",
        "uncorrected_mismatches",
        "corrected_NC_SJs",
        "uncorrected_NC_SJs",
    )
    for counter in na_counters:
        assert getattr(logInfo, counter) == "NA"
def test_left_same_right_inside(self):
    """'Left same, right inside' case for the combined junction distance.

    dist_0 = 0, dist_1 = +2, combined dist = 2
    """
    dist_0, dist_1 = 0, 2
    combined = TC.combinedJunctionDist(dist_0, dist_1)
    assert combined == 2
def test_left_outside_right_inside(self):
    """'Left outside, right inside' case for the combined junction
    distance.

    dist_0 = +1, dist_1 = +4, combined dist = 3
    """
    dist_0, dist_1 = 1, 4
    combined = TC.combinedJunctionDist(dist_0, dist_1)
    assert combined == 3
def test_crash_correction(self):
    """This is a case that is supposed to crash the NCSJ correction
    process, resulting in no correction.

    This is because the mapping has created a 7-bp micro-exon with a
    canonical but likely incorrect junction to its left, and a
    non-canonical junction on its right. Post-correction, we end up with
    two introns next to each other with a zero-length exon, which is not
    valid.
    """
    # Process references
    sjFile = "input_files/chr11_sjs.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    # Create the scratch directory without going through a shell
    os.makedirs(tmp_dir, exist_ok=True)

    refs = dstruct.Struct()
    chroms = {"chr11"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/hg38_chr11.fa")

    # Only the first line of the SAM file is used
    sam = "input_files/sams/microexon.sam"
    with open(sam, 'r') as f:
        sam_line = f.readline().strip().split('\t')

    # Init transcript object
    transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
    maxDist = 5
    logInfo = TC.init_log_info(sam_line)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                  maxDist, logInfo)

    # CIGAR the transcript is expected to still carry after the attempt
    orig_CIGAR = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                  "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                  "59M864N31M9891N69M1711N7M1341N47M13S")

    assert transcript.isCanonical == False
    assert transcript.MD == "MD:Z:2473"
    assert logInfo.corrected_NC_SJs == 0
    assert logInfo.uncorrected_NC_SJs == 1
    assert transcript.CIGAR == orig_CIGAR
def test_correct_ncsj(self):
    """Toy transcript with sequence A|GAA, where the splice motif is
    noncanonical but located 2 bp from a canonical splice donor.

    chr1: 23,071,357 - 23,072,126. After TC.cleanNoncanonical the
    transcript is expected to be canonical with sequence AAGGAA, CIGAR
    3M764N3M and MD tag MD:Z:6.
    """
    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test_ncsj/TC_tmp/"
    # Create the scratch directory without going through a shell
    os.makedirs(tmp_dir, exist_ok=True)

    refs = dstruct.Struct()
    chroms = {"chr1"}
    refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    refs.genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = [
        "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*",
        "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
    ]
    transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
    jnNumber = 0
    maxDist = 5
    logInfo = TC.init_log_info(sam_fields)

    assert transcript.isCanonical == False

    # Attempt to correct the splice junction
    transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                  maxDist, logInfo)

    assert transcript.isCanonical == True
    assert transcript.spliceJunctions[jnNumber].isCanonical == True
    assert transcript.SEQ == "AAGGAA"
    assert transcript.CIGAR == "3M764N3M"
    assert transcript.MD == "MD:Z:6"
    assert logInfo.corrected_NC_SJs == 1
def test_update(self):
    """Toy transcript with sequence A|GAA, where the splice motif is
    noncanonical but located 2 bp from a canonical splice donor.

    chr1: 23,071,357 - 23,072,126. The donor side is fixed first; then
    TC.update_post_ncsj_correction is expected to leave the junction with
    motif code "21" and canonical, and the transcript canonical with MD
    tag MD:Z:6.
    """
    # Process references
    sjFile = "input_files/test_junctions.txt"
    tmp_dir = "scratch/test/TC_tmp/"
    chroms = {"chr1"}
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        sjFile, tmp_dir, chroms)
    genome = Fasta("input_files/hg38_chr1.fa")

    # Init transcript object
    sam_fields = [
        "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*",
        "0", "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
    ]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0
    donor = transcript.spliceJunctions[jnNumber].bounds[0]

    # Attempt to correct the splice donor side of the junction (left)
    transcript.SEQ, transcript.CIGAR = TC.fix_one_side_of_junction(
        transcript.CHROM, transcript.POS, jnNumber, donor, 2, genome,
        transcript.SEQ, transcript.CIGAR)

    # Now test the update function
    TC.update_post_ncsj_correction(transcript, jnNumber, genome, sjDict)

    junction = transcript.spliceJunctions[jnNumber]
    assert junction.motif_code == "21"
    assert junction.isCanonical == True
    assert transcript.MD == "MD:Z:6"
    assert transcript.isCanonical == True