def check_get_transcript_from_args_add_three_tabix(fmt):
    """Check transcript and CDS endpoints of transcripts read from tabix-indexed files.

    Expected endpoints are taken from the reference BED file
    ``REF_FILES["100transcripts_bed"]`` and compared against the transcripts
    produced by ``get_transcripts_from_args`` when the parser is given the
    tabix-compressed file of the requested format plus ``--tabix``.

    Parameters
    ----------
    fmt : str
        Format of the tabix-indexed input file; must be "BED" or "GTF2"
        (the only keys available in the lookup table below)
    """
    # check endpoints of transcript and CDS
    # GTF2 and BED format
    bed_ref = REF_FILES["100transcripts_bed"]
    cds = {}
    tx = {}

    # Use a context manager so the reference file is closed deterministically
    with open(bed_ref) as bed_fh:
        for line in bed_fh:
            items = line.strip("\n").split("\t")
            name = items[3]
            tx[name] = (int(items[1]), int(items[2]))

            cds_start = int(items[6])
            cds_end = int(items[7])
            # A zero-length (or inverted) CDS interval is treated as "no CDS"
            if cds_start < cds_end:
                cds[name] = (cds_start, cds_end)
            else:
                cds[name] = (None, None)

    ref_files = {
        "BED": REF_FILES["100transcripts_bed_tabix"],
        "GTF2": REF_FILES["100transcripts_gtf_tabix"],
    }

    parser = get_annotation_file_parser()
    argstr = "--annotation_format %s --annotation_files %s --tabix" % (fmt, ref_files[fmt])
    args = parser.parse_args(shlex.split(argstr))
    transcripts = list(get_transcripts_from_args(args))

    for my_tx in transcripts:
        name = my_tx.get_name()
        expected_cds_start, expected_cds_end = cds[name]
        found_cds_start = my_tx.cds_genome_start
        found_cds_end = my_tx.cds_genome_end

        found_tx_start = my_tx.spanning_segment.start
        found_tx_end = my_tx.spanning_segment.end
        expected_tx_start, expected_tx_end = tx[name]

        if found_cds_start is None:
            # Non-coding transcript: both CDS endpoints must be absent,
            # in the parsed transcript as well as in the reference
            assert_true(found_cds_end is None)
            assert_true(
                expected_cds_start is None,
                "Unequal CDS start (expected %s, got %s) on transcript %s"
                % (expected_cds_start, found_cds_start, name),
            )
            assert_true(
                expected_cds_end is None,
                "Unequal CDS stop (expected %s, got %s) on transcript %s"
                % (expected_cds_end, found_cds_end, name),
            )
        else:
            assert_equal(
                found_cds_start,
                expected_cds_start,
                "Unequal CDS start (expected %s, got %s) on transcript %s"
                % (expected_cds_start, found_cds_start, name),
            )
            assert_equal(
                found_cds_end,
                expected_cds_end,
                "Unequal CDS stop (expected %s, got %s) on transcript %s"
                % (expected_cds_end, found_cds_end, name),
            )

        # Transcript endpoints are checked for every transcript, coding or not
        assert_equal(
            found_tx_start,
            expected_tx_start,
            "Unequal transcript start (expected %s, got %s) on transcript %s"
            % (expected_tx_start, found_tx_start, name),
        )
        assert_equal(
            found_tx_end,
            expected_tx_end,
            "Unequal transcript end (expected %s, got %s) on transcript %s"
            % (expected_tx_end, found_tx_end, name),
        )

    # Every transcript listed in the reference must have been returned
    assert_equal(
        len(transcripts),
        len(cds),
        "Not all transcripts found in input. Expected %s. Got %s." % (len(cds), len(transcripts)),
    )
def check_get_transcript_from_args_add_three(parser, fmt, add_three, n=1):
    """Make sure stop codons are moved by three if ``add_three`` is specified
    in argument string and not moved by three if not specified. These are
    checked against expected values found in a BED file.

    Parameters
    ----------
    parser : :py:class:`argparse.ArgumentParser`
        Argument parser for annotation files

    fmt : str
        Format of input file ("BED", "BigBed", "GTF2", "GFF3", or "PSL")

    add_three : bool
        Whether or not to move stop codons by three nucleotides

    n : int
        Number of times input file was passed to parser (Default : 1)
    """
    # GTF file does not include stop codon; others do.
    # So, GTF file will be off by three relative to others
    # unless it includes explicit stop codon features
    #
    # we correct for that here
    gtf_offset = 3 if fmt == "GTF2" else 0

    bed_ref = MINI["bed_file"]
    dtmp = {}

    # Use a context manager so the reference file is closed deterministically
    with open(bed_ref) as bed_fh:
        for line in bed_fh:
            items = line.strip("\n").split("\t")
            name = items[3]
            strand = items[5]
            cds_start = int(items[6])
            cds_end = int(items[7])

            # A zero-length (or inverted) CDS interval is treated as "no CDS"
            if cds_start >= cds_end:
                dtmp[name] = (None, None)
                continue

            # The stop codon sits at the 3' end of the CDS: the genomic end
            # coordinate on the plus strand, the genomic start on the minus.
            if strand == "+":
                cds_end -= gtf_offset
                if add_three:
                    cds_end += 3
            else:
                cds_start += gtf_offset
                if add_three:
                    cds_start -= 3

            dtmp[name] = (cds_start, cds_end)

    # Pass the same annotation file `n` times on the command line
    files = " ".join([MINI["%s_file" % fmt.lower()]] * n)
    argstr = "--annotation_format %s --annotation_files %s " % (fmt, files)
    if add_three:
        argstr += " --add_three"

    args = parser.parse_args(shlex.split(argstr))
    transcripts = list(get_transcripts_from_args(args))

    for tx in transcripts:
        name = tx.get_name()
        expected_start, expected_end = dtmp[name]
        found_start = tx.cds_genome_start
        found_end = tx.cds_genome_end

        if found_start is None:
            # Non-coding transcript: both CDS endpoints must be absent,
            # in the parsed transcript as well as in the reference
            assert_true(found_end is None)
            assert_true(
                expected_start is None,
                "Unequal CDS start (expected %s, got %s) on transcript %s"
                % (expected_start, found_start, name),
            )
            assert_true(
                expected_end is None,
                "Unequal CDS stop (expected %s, got %s) on transcript %s"
                % (expected_end, found_end, name),
            )
        else:
            assert_equal(
                found_start,
                expected_start,
                "Unequal CDS start (expected %s, got %s) on transcript %s"
                % (expected_start, found_start, name),
            )
            assert_equal(
                found_end,
                expected_end,
                "Unequal CDS stop (expected %s, got %s) on transcript %s"
                % (expected_end, found_end, name),
            )

    # Each copy of the input file should contribute every reference transcript
    assert_equal(
        len(transcripts),
        n * len(dtmp),
        "Not all transcripts found in input. Expected %s. Got %s." % (n * len(dtmp), len(transcripts)),
    )