def test_sj_corr_off(self):
        """ Splice reference provided, but correction set to off.

            The options point sjAnnotFile at a junction file while
            correctSJs is "false". Expected behavior is for prep_refs to
            skip SJ ref initialization because it would be a waste of time,
            so the donor/acceptor references must be None and the junction
            annotation lookup must be an empty set. No variant file is
            given, so the variant dicts must be empty as well. """

        # Initialize options etc.
        sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        tmp_dir = "scratch/prep_refs/sj_off/TC_tmp/"
        # Create the scratch directory with the standard library instead of
        # shelling out to `mkdir -p`: a failure raises here rather than
        # being silently ignored, and no shell is involved.
        os.makedirs(tmp_dir, exist_ok=True)

        options = dstruct.Struct()
        options.refGenome = "input_files/hg38_chr1.fa"
        options.tmp_dir = tmp_dir
        options.maxLenIndel = options.maxSJOffset = 5
        options.correctSJs = "false"
        options.variantFile = None
        options.sjAnnotFile = "input_files/test_junctions.txt"

        # Split the SAM file into a single chunk and build refs from it
        header, chroms, sam_chunks = TC.split_SAM(sam, 1)
        refs = TC.prep_refs(options, sam_chunks[0], header)

        # Check that variant dicts are empty (one assert per dict so that a
        # failure names the offending structure)
        assert refs.snps == {}
        assert refs.insertions == {}
        assert refs.deletions == {}

        # Check that SJ bedtools and annot lookup are empty. Identity
        # comparison avoids invoking a custom __eq__ on a non-None object.
        assert refs.donors is None
        assert refs.acceptors is None
        assert refs.sjAnnot == set()
    def test_noncanonical(self):
        """ Transcript should be noncanonical and un-annotated prior to
            correction, but be canonical and annotated afterwards.

            The transcript is built from the first line of the SAM file
            with TC.transcript_init and then passed through
            TC.cleanNoncanonical with a maximum distance of 5. """

        sam = "input_files/sams/deletion_insertion_mismatch_nc.sam"
        sjFile = "input_files/GM12878_SJs_chr1.tab"
        tmp_dir = "scratch/test_jIjM/TC_tmp/"
        # Make sure the scratch directory exists before the splice
        # annotation is processed into it, so this test does not depend on
        # another test (or an earlier run) having created it.
        os.makedirs(tmp_dir, exist_ok=True)
        chroms = {"chr1"}

        # Process references
        refs = dstruct.Struct()
        refs.genome = Fasta("input_files/hg38_chr1.fa")
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.donors = donors
        refs.acceptors = acceptors
        refs.sjAnnot = sjAnnot

        # Only the first line of the SAM file is used
        with open(sam, 'r') as f:
            sam_line = f.readline().strip()
        transcript, logInfo = TC.transcript_init(sam_line, refs.genome,
                                                 refs.sjAnnot)

        assert transcript.allJnsAnnotated == False
        assert transcript.isCanonical == False

        # Now correct the junction and retest
        upd_transcript, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo)

        assert upd_transcript.allJnsAnnotated == True
        assert upd_transcript.isCanonical == True
    def test_variants(self):
        """ A variant file is provided.

            prep_refs is run with a VCF of SNPs, no splice junction
            annotation, and correctSJs set to "false". The SNP lookup must
            be populated, the insertion/deletion lookups must stay empty,
            and no splice junction references may be created. """

        # Initialize options etc.
        sam = "input_files/vcf_test/read_with_snps.sam"
        tmp_dir = "scratch/prep_refs/variant/TC_tmp/"
        # Create the scratch directory with the standard library instead of
        # shelling out to `mkdir -p`: a failure raises here rather than
        # being silently ignored, and no shell is involved.
        os.makedirs(tmp_dir, exist_ok=True)

        options = dstruct.Struct()
        options.refGenome = "input_files/hg38_chr11.fa"
        options.tmp_dir = tmp_dir
        options.maxLenIndel = options.maxSJOffset = 5
        options.correctSJs = "false"
        options.variantFile = "input_files/vcf_test/snps.vcf"
        options.sjAnnotFile = None

        # Split the SAM file into a single chunk and build refs from it
        header, chroms, sam_chunks = TC.split_SAM(sam, 1)
        refs = TC.prep_refs(options, sam_chunks[0], header)

        # Check that variant deletion and insertion dicts are empty, while
        # at least one SNP was loaded
        assert len(refs.insertions) == 0
        assert len(refs.deletions) == 0
        assert len(refs.snps) > 0

        # Check that SJ bedtools and annot lookup are empty. Identity
        # comparison avoids invoking a custom __eq__ on a non-None object.
        assert refs.donors is None
        assert refs.acceptors is None
        assert refs.sjAnnot == set()
    def test_sjs(self):
        """ Genome and splice junction reference provided. Variant structs
            should still be empty.

            With correctSJs set to "true" and a junction file supplied,
            prep_refs must return donor and acceptor references plus a
            junction annotation lookup with the expected number of
            entries. """

        # Initialize options etc.
        sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        tmp_dir = "scratch/prep_refs/sjs/TC_tmp/"
        # Create the scratch directory with the standard library instead of
        # shelling out to `mkdir -p`: a failure raises here rather than
        # being silently ignored, and no shell is involved.
        os.makedirs(tmp_dir, exist_ok=True)

        options = dstruct.Struct()
        options.refGenome = "input_files/hg38_chr1.fa"
        options.tmp_dir = tmp_dir
        options.maxLenIndel = options.maxSJOffset = 5
        options.correctSJs = "true"
        options.variantFile = None
        options.sjAnnotFile = "input_files/test_junctions.txt"

        # Split the SAM file into a single chunk and build refs from it
        header, chroms, sam_chunks = TC.split_SAM(sam, 1)
        refs = TC.prep_refs(options, sam_chunks[0], header)

        # Check that variant dicts are empty (one assert per dict so that a
        # failure names the offending structure)
        assert refs.snps == {}
        assert refs.insertions == {}
        assert refs.deletions == {}

        # Check SJ bedtools and annot lookup
        assert refs.donors.count() == 3
        assert refs.acceptors.count() == 2  # Same acceptor appears in 2 jns
        assert len(refs.sjAnnot) == 3
    def test_genome_only(self):
        """ Make sure that the prep_refs function works under the simplest
            possible option setting: no variants or SJs provided.

            Every variant lookup must be an empty dict, the donor/acceptor
            references must be None, and the junction annotation lookup
            must be an empty set. """

        # Initialize options etc.
        sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
        tmp_dir = "scratch/prep_refs/genome-only/TC_tmp/"
        # Create the scratch directory with the standard library instead of
        # shelling out to `mkdir -p`: a failure raises here rather than
        # being silently ignored, and no shell is involved.
        os.makedirs(tmp_dir, exist_ok=True)

        options = dstruct.Struct()
        options.refGenome = "input_files/hg38_chr1.fa"
        options.tmp_dir = tmp_dir
        options.maxLenIndel = options.maxSJOffset = 5
        options.correctSJs = "false"
        options.variantFile = None
        options.sjAnnotFile = None

        # Split the SAM file into a single chunk and build refs from it
        header, chroms, sam_chunks = TC.split_SAM(sam, 1)
        refs = TC.prep_refs(options, sam_chunks[0], header)

        # Check that variant dicts are empty (one assert per dict so that a
        # failure names the offending structure)
        assert refs.snps == {}
        assert refs.insertions == {}
        assert refs.deletions == {}

        # Check that SJ bedtools and annot lookup are empty. Identity
        # comparison avoids invoking a custom __eq__ on a non-None object.
        assert refs.donors is None
        assert refs.acceptors is None
        assert refs.sjAnnot == set()
    def test_crash_dmel(self):
        """ This is a Drosophila junction that borders a small match preceded by
            a 7 bp deletion. It is also supposed to crash correction, but did
            not in TC v2.0.1.

            After the correction attempt the transcript must still be
            noncanonical, its CIGAR, SEQ and MD must be unchanged, the log
            must count one uncorrected junction, and the TE log entry must
            report the junction as "Uncorrected". """

        # Process references
        sjFile = "input_files/drosophila_example/chr3R_SJs.tsv"
        # NOTE(review): unlike the other scratch paths used by these tests,
        # this one has no trailing slash - confirm that this is intended.
        tmp_dir = "scratch/dmel/TC"
        # Create the scratch directory with the standard library instead of
        # shelling out to `mkdir -p`: a failure raises here rather than
        # being silently ignored, and no shell is involved.
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = {"chr3R"}
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/drosophila_example/chr3R.fa")

        # Collect the alignment records, skipping "@" header lines and
        # blank lines. The last record is the one under test; selecting it
        # explicitly (instead of relying on a leaked loop variable) gives a
        # clear failure when the file contains no reads.
        sam = "input_files/drosophila_example/no_SJ_corr.sam"
        with open(sam, 'r') as f:
            records = [line.strip().split('\t') for line in f
                       if line.strip() and not line.startswith("@")]
        assert records, "no alignment records found in " + sam
        sam_line = records[-1]

        # Init transcript object
        transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_line)

        # Remember the pre-correction state for comparison afterwards
        orig_CIGAR = transcript.CIGAR
        orig_seq = transcript.SEQ
        orig_MD = transcript.MD
        expected_TE = "\t".join([
            "m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013",
            "chr3R_14890436_14890699", "NC_SJ_boundary", "5", "Uncorrected",
            "Other"
        ]) + "\n"

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        new_transcript, TE_entries = TC.cleanNoncanonical(
            transcript, refs, maxDist, logInfo)

        # Emit the TE log entries so they are available when inspecting a
        # failing run
        print(TE_entries)
        assert new_transcript.isCanonical == False
        assert TE_entries == expected_TE
        assert new_transcript.MD == orig_MD
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert new_transcript.CIGAR == orig_CIGAR
        assert new_transcript.SEQ == orig_seq
    def test_crash_correction(self):
        """ This is a case that is supposed to crash the NCSJ correction process,
           resulting in no correction. This is because the mapping has
           created a 7-bp micro-exon with a canonical but likely incorrect
           junction to its left, and a non-canonical junction on its right.
           Post-correction, we end up with two introns next to each other
           with a zero-length exon, which is not valid.

           The transcript returned by cleanNoncanonical must therefore
           still be noncanonical, carry the original CIGAR and MD tag, and
           be logged as one uncorrected junction. """

        # Process references
        sjFile = "input_files/chr11_sjs.txt"
        tmp_dir = "scratch/test/TC_tmp/"
        # Create the scratch directory with the standard library instead of
        # shelling out to `mkdir -p`: a failure raises here rather than
        # being silently ignored, and no shell is involved.
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = {"chr11"}
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/hg38_chr11.fa")

        # Only the first line of the SAM file is used
        sam = "input_files/sams/microexon.sam"
        with open(sam, 'r') as f:
            sam_line = f.readline().strip().split('\t')

        # Init transcript object
        transcript = t2.Transcript(sam_line, refs.genome, refs.sjAnnot)
        maxDist = 5
        logInfo = TC.init_log_info(sam_line)

        assert transcript.isCanonical == False

        # CIGAR that the transcript is expected to keep after the failed
        # correction attempt
        orig_CIGAR = ("1211M5612N57M464N30M2717N120M1097N23M2632N146M1225N"
                      "140M4770N72M5051N132M1513N87M567N142M3780N100M2160N"
                      "59M864N31M9891N69M1711N7M1341N47M13S")

        # Attempt to correct the splice junction
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      maxDist, logInfo)

        assert transcript.isCanonical == False
        assert transcript.MD == "MD:Z:2473"
        assert logInfo.corrected_NC_SJs == 0
        assert logInfo.uncorrected_NC_SJs == 1
        assert transcript.CIGAR == orig_CIGAR
    def test_correct_ncsj(self):
        """ Toy transcript with sequence A|GAA, where the splice motif
            is noncanonical but located 2 bp from a canonical splice donor.
            chr1: 23,071,357 - 23,072,126

            After correction the transcript and its first junction must be
            canonical, the sequence must read AAGGAA with CIGAR 3M764N3M and
            MD tag MD:Z:6, and the log must count one corrected junction.
        """

        # Process references
        sjFile = "input_files/test_junctions.txt"
        tmp_dir = "scratch/test_ncsj/TC_tmp/"
        # Create the scratch directory with the standard library instead of
        # shelling out to `mkdir -p`: a failure raises here rather than
        # being silently ignored, and no shell is involved.
        os.makedirs(tmp_dir, exist_ok=True)
        refs = dstruct.Struct()
        chroms = {"chr1"}
        refs.donors, refs.acceptors, refs.sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)
        refs.genome = Fasta("input_files/hg38_chr1.fa")

        # Init transcript object from hand-written SAM fields
        sam_fields = [
            "test_read", "0", "chr1", "23071357", "255", "1M766N3M", "*", "0",
            "0", "AGAA", "*", "NM:i:0", "MD:Z:4"
        ]
        transcript = t2.Transcript(sam_fields, refs.genome, refs.sjAnnot)
        jnNumber = 0  # index of the only junction in the toy transcript
        maxDist = 5
        logInfo = TC.init_log_info(sam_fields)

        assert transcript.isCanonical == False

        # Attempt to correct the splice junction
        transcript, TE_entries = TC.cleanNoncanonical(transcript, refs,
                                                      maxDist, logInfo)

        assert transcript.isCanonical == True
        assert transcript.spliceJunctions[jnNumber].isCanonical == True
        assert transcript.SEQ == "AAGGAA"
        assert transcript.CIGAR == "3M764N3M"
        assert transcript.MD == "MD:Z:6"
        assert logInfo.corrected_NC_SJs == 1
Beispiel #9
0
def make_novelty_type_struct(database, datasets):
    """ Create a data structure where it is possible to look up whether a gene
        or transcript belongs to a particular category of novelty.

        Args:
            database: Database location passed to sqlite3.connect.
            datasets: Dataset selection handed unchanged to every qutils
                fetch function.

        Returns:
            A dstruct.Struct with one set-valued attribute per category:
            known_genes, antisense_genes, intergenic_genes,
            known_transcripts, ISM_transcripts, ISM_prefix, ISM_suffix,
            NIC_transcripts, NNC_transcripts, antisense_transcripts,
            intergenic_transcripts and genomic_transcripts.

        The database connection is closed before returning, including when
        one of the queries raises. """

    # (attribute name, query helper) pairs, listed in the order in which
    # the queries are executed.
    categories = (
        ("known_genes", qutils.fetch_all_known_genes_detected),
        ("antisense_genes", qutils.fetch_antisense_genes),
        ("intergenic_genes", qutils.fetch_intergenic_novel_genes),
        ("known_transcripts", qutils.fetch_all_known_transcripts_detected),
        ("ISM_transcripts", qutils.fetch_all_ISM_transcripts),
        ("ISM_prefix", qutils.fetch_prefix_ISM_transcripts),
        ("ISM_suffix", qutils.fetch_suffix_ISM_transcripts),
        ("NIC_transcripts", qutils.fetch_NIC_transcripts),
        ("NNC_transcripts", qutils.fetch_NNC_transcripts),
        ("antisense_transcripts", qutils.fetch_antisense_transcripts),
        ("intergenic_transcripts", qutils.fetch_intergenic_transcripts),
        ("genomic_transcripts", qutils.fetch_genomic_transcripts),
    )

    conn = sqlite3.connect(database)
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        novelty_type = dstruct.Struct()
        for attribute, fetch in categories:
            # Each helper's result is materialized as a set for membership
            # lookups by the caller.
            setattr(novelty_type, attribute, set(fetch(cursor, datasets)))
    finally:
        # Release the connection even if a query fails part-way through.
        conn.close()

    return novelty_type
Beispiel #10
0
    def test_DIM_nc(self):
        """ Correct a transcript containing a deletion, insertion, mismatch,
            and noncanonical splice junction.

            The first line of the SAM file is run through TC.batch_correct
            with mismatch, indel and splice junction correction enabled.
            The corrected record is then read back from the output SAM file
            and its CIGAR, MD, NM, jI and jM fields are compared with the
            expected values, followed by the first line of the log file. """

        # Initialize options etc.
        sam = "input_files/sams/deletion_insertion_mismatch_nc.sam"
        genome = Fasta("input_files/hg38_chr1.fa")
        sjFile = "input_files/GM12878_SJs_chr1.tab"
        tmp_dir = "scratch/example/TC_tmp/"
        # Create the scratch directory with the standard library instead of
        # shelling out to `mkdir -p`: a failure raises here rather than
        # being silently ignored, and no shell is involved.
        os.makedirs(tmp_dir, exist_ok=True)
        chroms = {"chr1"}
        donors, acceptors, sjAnnot = TC.processSpliceAnnotation(
            sjFile, tmp_dir, chroms)

        refs = dstruct.Struct()
        refs.sjAnnot = sjAnnot
        refs.genome = genome
        refs.donors = donors
        refs.acceptors = acceptors
        refs.snps = {}
        refs.deletions = {}
        refs.insertions = {}

        options = dstruct.Struct()
        options.maxLenIndel = 5
        options.maxSJOffset = 5
        options.correctMismatches = "true"
        options.correctIndels = "true"
        options.correctSJs = "true"
        options.primaryOnly = True
        options.canonOnly = False

        # Only the first line of the SAM file is corrected
        with open(sam, 'r') as f:
            transcripts = [f.readline().strip()]

        # Correct the transcript. The output handles are closed in a
        # finally clause so that they are released (and flushed) even when
        # opening a later file or the correction itself raises.
        outfiles = dstruct.Struct()
        try:
            outfiles.TElog = open(tmp_dir + "DIM_nc_clean.TE.log", 'w')
            outfiles.sam = open(tmp_dir + "DIM_nc_clean.sam", 'w')
            outfiles.fasta = open(tmp_dir + "DIM_nc_clean.fasta", 'w')
            outfiles.log = open(tmp_dir + "DIM_nc_clean.log", 'w')
            TC.batch_correct(transcripts, options, refs, outfiles)
        finally:
            # Close the output files
            for handle in outfiles.values():
                handle.close()

        # Expected transcript attributes post-correction
        correct_CIGAR = ("12M1134N126M163N202M866N74M924N191M1777N127M2109N"
                         "157M88N159M932N633M274N117M7696N170M1215N629M938N"
                         "29M428N133M254N166M390N212M253N89M163N483M")
        correct_MD = "MD:Z:3709"
        correct_NM = "NM:i:0"
        correct_jI = (
            "jI:B:i,150941429,150942562,150942689,150942851,150943054,"
            "150943919,150943994,150944917,150945109,150946885,150947013,"
            "150949121,150949279,150949366,150949526,150950457,150951091,"
            "150951364,150951482,150959177,150959348,150960562,150961192,"
            "150962129,150962159,150962586,150962720,150962973,150963140,"
            "150963529,150963742,150963994,150964084,150964246")
        correct_jM = "jM:B:c,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21"

        # Read in transcript from outfile
        with open(tmp_dir + "DIM_nc_clean.sam", 'r') as f:
            sam_line = f.readline().strip().split('\t')
        transcript = t2.Transcript(sam_line, genome, sjAnnot)

        assert transcript.CIGAR == correct_CIGAR
        assert transcript.MD == correct_MD
        assert transcript.NM == correct_NM
        assert transcript.jI == correct_jI
        assert transcript.jM == correct_jM

        # Read logs and make sure they are OK
        expected_log = "\t".join([
            "c34150/f1p1/3707", "primary", "2", "0", "0", "1", "0", "0", "2",
            "0", "1", "0"
        ])

        with open(tmp_dir + "DIM_nc_clean.log", 'r') as f:
            log = f.readline().strip()
        assert log == expected_log