Ejemplo n.º 1
0
def check_get_transcript_from_args_add_three_tabix(fmt):
    """Check endpoints of transcripts and CDS read from tabix-indexed
    annotation files against expected values found in a BED file.

    Parameters
    ----------
    fmt : str
        Format of input file. Must be a key of the ``ref_files`` mapping
        defined below ("BED" or "GTF2")

    Raises
    ------
    KeyError
        If ``fmt`` is not "BED" or "GTF2", or if a parsed transcript has
        a name that is absent from the reference BED file
    """
    # check endpoints of transcript and CDS
    # GTF2 and BED format
    bed_ref = REF_FILES["100transcripts_bed"]
    cds = {}
    tx  = {}
    # Build expected coordinates, keyed by transcript name, from the
    # tab-delimited reference: columns 1/2 hold the transcript bounds,
    # column 3 the name, and columns 6/7 the CDS bounds.
    with open(bed_ref) as bed_fh:
        for line in bed_fh:
            items = line.strip("\n").split("\t")
            name = items[3]

            tx_start = int(items[1])
            tx_end   = int(items[2])
            tx[name] = (tx_start,tx_end)

            cds_start = int(items[6])
            cds_end   = int(items[7])
            if cds_start < cds_end:
                cds[name] = (cds_start,cds_end)
            else:
                # CDS columns that do not satisfy start < end are recorded
                # as "no CDS"
                cds[name] = (None,None)

    ref_files = {
             "BED"  : REF_FILES["100transcripts_bed_tabix"],
             "GTF2" : REF_FILES["100transcripts_gtf_tabix"]
             }
    parser = get_annotation_file_parser()
    argstr = "--annotation_format %s --annotation_files %s --tabix" % (fmt,ref_files[fmt])
    args = parser.parse_args(shlex.split(argstr))
    transcripts = list(get_transcripts_from_args(args))

    for my_tx in transcripts:
        name = my_tx.get_name()
        expected_cds_start, expected_cds_end = cds[name]
        expected_tx_start, expected_tx_end = tx[name]

        found_cds_start = my_tx.cds_genome_start
        found_cds_end   = my_tx.cds_genome_end

        found_tx_start = my_tx.spanning_segment.start
        found_tx_end   = my_tx.spanning_segment.end

        if found_cds_start is None:
            # a transcript without a CDS start must also lack a CDS end,
            # and the reference must agree that there is no CDS
            assert_true(found_cds_end is None)
            assert_true(expected_cds_start is None,"Unequal CDS start (expected %s, got %s) on transcript %s" % (expected_cds_start,found_cds_start,name))
            assert_true(expected_cds_end is None,"Unequal CDS stop (expected %s, got %s) on transcript %s" % (expected_cds_end,found_cds_end,name))
        else:
            assert_equal(found_cds_start,expected_cds_start,"Unequal CDS start (expected %s, got %s) on transcript %s" % (expected_cds_start,found_cds_start,name))
            assert_equal(found_cds_end,expected_cds_end,"Unequal CDS stop (expected %s, got %s) on transcript %s" % (expected_cds_end,found_cds_end,name))

        assert_equal(found_tx_start,expected_tx_start,"Unequal transcript start (expected %s, got %s) on transcript %s" % (expected_tx_start,found_tx_start,name))
        assert_equal(found_tx_end,expected_tx_end,"Unequal transcript end (expected %s, got %s) on transcript %s" % (expected_tx_end,found_tx_end,name))

    # Checked once, outside the loop, so that an empty parse result is
    # reported as a failure instead of silently skipping every assertion.
    assert_equal(len(transcripts),len(cds),"Not all transcripts found in input. Expected %s. Got %s." % (len(cds),len(transcripts)))
Ejemplo n.º 2
0
def check_get_transcript_from_args_add_three(parser,fmt,add_three,n=1):
    """Make sure stop codons are moved by three if ``add_three`` is specified
    in argument string and not moved by three if not specified.
    These are checked against expected values found in a BED file.
    
    Parameters
    ----------
    parser : :py:class:`argparse.ArgumentParser`
        Argument parser for annotation files

    fmt : str
        Format of input file ("BED", "BigBed", "GTF2", "GFF3", or "PSL")
        
    add_three : bool
        Whether or not to move stop codons by three nucleotides
    
    n : int
        Number of times input file was passed to parser (Default : 1)

    Raises
    ------
    KeyError
        If ``MINI`` has no entry for the lower-cased ``fmt``, or if a parsed
        transcript has a name that is absent from the reference BED file
    """
    # GTF file does not include stop codon; others do.
    # So, GTF file will be off by three relative to others
    # unless it includes explicit stop codon features
    #
    # we correct for that here
    gtf_offset = 3 if fmt == "GTF2" else 0
    bed_ref = MINI["bed_file"]

    # Expected (cds_start, cds_end) per transcript name, or (None, None)
    # when the reference row's CDS columns do not satisfy start < end.
    dtmp = {}
    with open(bed_ref) as bed_fh:
        for line in bed_fh:
            items = line.strip("\n").split("\t")
            name = items[3]
            strand = items[5]
            cds_start = int(items[6])
            cds_end   = int(items[7])
            if cds_start < cds_end:
                # On "+" the adjusted coordinate is the CDS end; on any
                # other strand value it is the CDS start.
                if strand == "+":
                    cds_end -= gtf_offset
                else:
                    cds_start += gtf_offset

                if add_three:
                    if strand == "+":
                        cds_end += 3
                    else:
                        cds_start -= 3
                dtmp[name] = (cds_start,cds_end)
            else:
                dtmp[name] = (None,None)
    
    # pass the same annotation file `n` times on the command line
    files = " ".join([MINI["%s_file" % fmt.lower()]] * n)
    argstr = "--annotation_format %s --annotation_files %s " % (fmt,files)
    if add_three:
        argstr += " --add_three"
    
    args = parser.parse_args(shlex.split(argstr))
    transcripts = list(get_transcripts_from_args(args))
    for tx in transcripts:
        name = tx.get_name()
        expected_start, expected_end = dtmp[name]
        found_start = tx.cds_genome_start
        found_end   = tx.cds_genome_end
        if found_start is None:
            # a transcript without a CDS start must also lack a CDS end,
            # and the reference must agree that there is no CDS
            assert_true(found_end is None)
            assert_true(expected_start is None,"Unequal CDS start (expected %s, got %s) on transcript %s" % (expected_start,found_start,name))
            assert_true(expected_end is None,"Unequal CDS stop (expected %s, got %s) on transcript %s" % (expected_end,found_end,name))
        else:
            assert_equal(found_start,expected_start,"Unequal CDS start (expected %s, got %s) on transcript %s" % (expected_start,found_start,name))
            assert_equal(found_end,expected_end,"Unequal CDS stop (expected %s, got %s) on transcript %s" % (expected_end,found_end,name))

    # each reference transcript should be returned once per copy of the input
    assert_equal(len(transcripts),n*len(dtmp),"Not all transcripts found in input. Expected %s. Got %s." % (n*len(dtmp),len(transcripts)))