def setUpClass(cls):
    """Build the table columns shared by the tests of this class.

    Stores in ``cls.cols`` a dict mapping column names to arrays covering
    integer, string, float and object dtypes. Every column has ``size``
    elements (``size`` is a module-level name defined outside this block).
    """
    cls.cols = {
        "intA": numpy.random.randint(0, high=2**16, size=size),
        "intB": numpy.random.randint(-10, high=20, size=size),
        "idxA": numpy.arange(size),
        # cycle through the capital letters 'A'..'Z'
        "chrA": numpy.array([chr(65 + (X % (91 - 65))) for X in range(size)]),
        "strA": numpy.array([
            str(GenomicSegment("chrA", X, X + 500, "+")) for X in range(size)
        ]),
        # floor division keeps the coordinates integral under Python 3,
        # where `X / 2` would yield a float
        "strB": numpy.array([
            str(GenomicSegment("chrB", X // 2, X // 2 + 500, "-"))
            for X in range(size)
        ]),
        "floatA": 10 * numpy.random.randn(size) + 500,
        "floatB": (10**-5) * numpy.random.random(size),
        # sized like every other column rather than a hard-coded 5000
        "objA": numpy.tile(None, size),
        "objB": numpy.array([
            GenomicSegment("chrC", X, X + Y, "+")
            for X, Y in zip(
                range(size), numpy.random.randint(2, high=1000, size=size))
        ]),
    }
def setUpClass(cls):
    """Create the segments and attribute dictionaries used by the tests.

    Sets ``cls.ivs`` (four plus-strand segments on ``chrA``),
    ``cls.common_attr`` (attributes shared by every feature) and
    ``cls.attrs`` (one dict per feature: its own attributes merged with
    the shared ones).
    """
    spans = [(100, 190), (200, 203), (200, 201), (204, 206)]
    cls.ivs = [GenomicSegment("chrA", start, end, "+") for start, end in spans]

    cls.common_attr = {
        "common1": "common",
        "common2": "also common",
        "common3": "still common",
    }

    feature_specific = [
        {
            "common_diff_val": "unique_f1",
            "unique_f1_key": "something",
        },
        {
            "common_diff_val": "unique_f2",
            "unique_f2_key": "something",
            "unique_f2f3": "something else",
        },
        {
            "common_diff_val": "unique_f3",
            "unique_f3_key": "something",
            "unique_f2f3": "something else",
            "unique_f3f4": "f3 only",
        },
        {
            "common_diff_val": "unique_f4",
            "unique_f4_key": "something",
            "unique_f3f4": "f4 only",
        },
    ]

    # every feature also carries the shared attributes
    cls.attrs = [dict(own, **cls.common_attr) for own in feature_specific]
def test_window_landmark():
    """Yield landmark-window checks for spliced chains on both strands.

    For a two-exon chain on each strand, one check is generated per
    landmark position in ``range(0, 700, 50)``, using fixed upstream and
    downstream flank sizes.
    """
    upstream = 50
    downstream = 100
    exon_spans = ((50, 350), (500, 900))

    # test cases: plus- and minus-strand chains with splicing
    spliced_chains = []
    for strand in ("+", "-"):
        exons = [GenomicSegment("chrA", start, end, strand)
                 for start, end in exon_spans]
        spliced_chains.append(SegmentChain(*exons))

    for chain in spliced_chains:
        for landmark in range(0, 700, 50):
            yield check_window_landmark, chain, landmark, upstream, downstream
def setUpClass(cls):
    """Prepare mock reads and the count vectors expected from each mapping.

    Sets on the class:

    ``strands``
        The two strands under test
    ``segs``
        One 2000-nt mock segment per strand
    ``reads``
        Per strand, one alignment for each length in ``range(25, 40)``
        (built with ``cls.make_alignment``)
    ``expected``
        Expected count vectors keyed by ``(mapping, param, strand)``
    ``map_factories``
        Mapping names -> mapping factory classes
    """
    shortest, longest = 25, 40
    num_reads = longest - shortest

    cls.strands = ("+", "-")

    cls.segs = {}
    for strand in cls.strands:
        cls.segs[strand] = GenomicSegment("mock", 0, 2000, strand)

    cls.reads = {}
    for strand in ("+", "-"):
        cls.reads[strand] = [
            cls.make_alignment(0, length, strand)
            for length in range(shortest, longest)
        ]

    # start every expectation as an all-zero vector
    cls.expected = {
        (mapping, param, strand): numpy.zeros(2000)
        for mapping in ("fiveprime", "threeprime", "center")
        for param in (0, 10)
        for strand in cls.strands
    }

    # End mapping: on one strand all reads pile up at a single position,
    # on the other strand each read contributes one count at its own
    # position; the offset `param` shifts both patterns.
    end_mappings = (
        ("fiveprime", "+", "-"),
        ("threeprime", "-", "+"),
    )
    for mapping, piled_strand, spread_strand in end_mappings:
        for param in (0, 10):
            cls.expected[(mapping, param, piled_strand)][param] = num_reads
            lo = shortest - 1 - param
            hi = longest - 1 - param
            cls.expected[(mapping, param, spread_strand)][lo:hi] = 1

    # Center mapping: each read spreads one count evenly over the
    # positions left after trimming `param` nucleotides from each end.
    for length in range(shortest, longest):
        for strand in ("+", "-"):
            cls.expected[("center", 0, strand)][:length] += 1.0 / length
            cls.expected[("center", 10, strand)][10:length - 10] += \
                1.0 / (length - 2 * 10)

    cls.map_factories = {
        "fiveprime": FivePrimeMapFactory,
        "threeprime": ThreePrimeMapFactory,
        "fiveprime_variable": VariableFivePrimeMapFactory,
        "center": CenterMapFactory,
    }
def check_random_windows_against_wig(self, strand):
    """Yield comparisons of random windows between ``self.bw`` and a wiggle file.

    Counts are loaded from ``wigfile`` into a |GenomeArray| for `strand`.
    Random windows are drawn until 50 of them contain counts; for each,
    a check comparing the |GenomeArray| values with those fetched from
    ``self.bw`` is yielded.

    Parameters
    ----------
    strand : str
        Strand passed to ``add_from_wiggle`` and used for every window
    """
    chrom_lengths = self.chrdict
    chrom_names = list(self.chrdict)
    chrom_picks = numpy.random.randint(0, high=len(chrom_names), size=50)

    reference = GenomeArray()
    with open(wigfile) as fin:
        reference.add_from_wiggle(fin, strand)

    num_checked = 0
    while num_checked < 50:
        chrom = chrom_names[chrom_picks[num_checked]]
        chrom_len = chrom_lengths[chrom]

        start = numpy.random.randint(0, high=chrom_len - 2000)
        end = numpy.random.randint(start + 10000, high=start + 20000)

        # redraw until the window no longer runs off the chromosome
        while end > chrom_len:
            end = numpy.random.randint(start + 100, high=start + 10000)

        window = GenomicSegment(chrom, start, end, strand)
        expected = reference[window]

        # only windows that contain counts are compared
        if not expected.sum() > 0:
            continue

        num_checked += 1
        found = self.bw[window]
        yield self.check_vals_against_wig, expected, found
def covered_by_repetitive(query_junc, minus_range, plus_range, cross_hash):
    """Report whether either end of a splice junction lies in a repetitive region

    Parameters
    ----------
    query_junc : |SegmentChain|
        A two-exon fragment representing a query splice junction

    minus_range : int <= 0
        Maximum number of nucleotides the junction could be moved to the
        left without reducing sequence support for the junction;
        see :py:func:`find_match_range`

    plus_range : int >= 0
        Maximum number of nucleotides the junction could be moved to the
        right without reducing sequence support for the junction;
        see :py:func:`find_match_range`

    cross_hash : |GenomeHash|
        |GenomeHash| of 1-length features denoting repetitive regions
        of the genome

    Returns
    -------
    bool
        `True` if any position within `minus_range...plus_range` of
        either splice site of `query_junc` overlaps a feature in
        `cross_hash`. Otherwise, `False`
    """
    span = query_junc.spanning_segment
    first_exon_end = query_junc[0].end
    second_exon_start = query_junc[1].start

    # one window around each junction boundary; `+ 1` because the end
    # coordinate of a segment is exclusive
    windows = [
        GenomicSegment(span.chrom, edge + minus_range, edge + plus_range + 1,
                       span.strand)
        for edge in (first_exon_end, second_exon_start)
    ]

    overlapping = cross_hash.get_overlapping_features(SegmentChain(*windows))
    return len(overlapping) > 0
def test_get_chromosome_counts_zero_fill(self):
    """Yield whole-chromosome comparisons between ``self.bw`` and the wiggle file.

    For each chromosome in ``self.chrdict``, the plus-strand counts
    loaded from ``wigfile`` are compared with the output of
    ``self.bw.get_chromosome_counts``.
    """
    reference = GenomeArray()
    with open(wigfile) as fin:
        reference.add_from_wiggle(fin, "+")

    for chrom, chrom_len in self.chrdict.items():
        whole_chrom = GenomicSegment(chrom, 0, chrom_len, "+")
        from_wiggle = reference[whole_chrom]
        from_bigwig = self.bw.get_chromosome_counts(chrom)
        yield self.check_vals_against_wig, from_wiggle, from_bigwig
def test_variable_stratified_mapping_plus(self):
    """Set up stratified variable 5' mapping over a test BAM file.

    Builds a table of per-read-length offsets, two spliced test chains
    (one per strand), the expected count vectors loaded from text
    files, and a |BAMGenomeArray| configured with
    ``StratifiedVariableFivePrimeMapFactory``.
    """
    # NOTE(review): `chains` and `expected` are built but never compared
    # within the visible body -- the assertions may live beyond this
    # excerpt; confirm against the full file.

    # read length -> offset, for read lengths 26 through 30
    offsets = { 26 : 6, 27 : 22, 28 : 13, 29 : 4, 30 : 5 }
    chains = {
        "fw" : SegmentChain(GenomicSegment('chrII',392959,393180,'+'),
                            GenomicSegment('chrII',393510,394742,'+'),
                            GenomicSegment('chrII',394860,394901,'+'),
                            ID='YBR078W_mRNA'),
        "rc" : SegmentChain(GenomicSegment('chrVIII',189061,189749,'-'),
                            GenomicSegment('chrVIII',189850,190017,'-'),
                            ID='YHR041C_mRNA')
    }
    # tab-delimited reference vectors shipped with the package test data
    expected = {
        "fw" : numpy.loadtxt(resource_filename("plastid","test/data/stratmap/strat_fw_vec.txt"),delimiter="\t"),
        "rc" : numpy.loadtxt(resource_filename("plastid","test/data/stratmap/strat_rc_vec.txt"),delimiter="\t"),
    }
    ga = BAMGenomeArray([resource_filename("plastid","test/data/stratmap/strat.bam")])
    # 26 and 30 match the smallest and largest read lengths in `offsets`
    ga.set_mapping(StratifiedVariableFivePrimeMapFactory(offsets,26,30))
def filter(self, line):
    """Convert one line of `bowtie`_ output into a |SegmentChain|

    Parameters
    ----------
    line : str
        Tab-delimited alignment line

    Returns
    -------
    |SegmentChain|
        Single-segment chain spanning the aligned sequence, carrying the
        read name, aligned sequence, quality string and mismatch string
        as attributes
    """
    fields = line.strip("\n").split("\t")

    read_name = fields[0]
    strand = fields[1]
    ref_seq = fields[2]
    start = int(fields[3])

    aligned_seq = fields[4]
    attr = {
        'seq_as_aligned': aligned_seq,
        'qualstr': fields[5],
        # field 6 is not used
        'mismatch_str': fields[7],
        'type': "alignment",
        'ID': read_name,
    }

    segment = GenomicSegment(ref_seq, start, start + len(aligned_seq), strand)
    return SegmentChain(segment, **attr)
def test_fill_val_present_chrom(self):
    """Check fetch size and zero-fill over an empty region of a known chromosome."""
    default_reader = BigWigReader(bigwigfile)
    # The nan-filled and 10-filled readers are constructed but not
    # asserted on; the nan checks below are disabled.
    nan_reader = BigWigReader(bigwigfile, fill=numpy.nan)
    zero_reader = BigWigReader(bigwigfile, fill=0)
    ten_reader = BigWigReader(bigwigfile, fill=10)

    # empty region
    window = GenomicSegment("chrIV", 5, 10, "+")

    fetched = default_reader[window]
    assert_equal(len(fetched), len(window), "fetched wrong size")

    # Disabled checks:
    # assert_true(numpy.isnan(default_reader[window]).all(),
    #             "default not nan")
    #
    # assert_true(numpy.isnan(nan_reader[window]).all(),
    #             "nanfill didn't work")

    all_zero = (zero_reader[window] == 0).all()
    assert_true(all_zero, "0-fill didn't work")
def test_fill_val_absent_chrom(self):
    """Check fetch size and zero-fill over a chromosome missing from the file."""
    default_reader = BigWigReader(bigwigfile)
    # The nan-filled and 10-filled readers are constructed but not
    # asserted on; the nan checks below are disabled.
    nan_reader = BigWigReader(bigwigfile, fill=numpy.nan)
    zero_reader = BigWigReader(bigwigfile, fill=0)
    ten_reader = BigWigReader(bigwigfile, fill=10)

    # chrVI is not in dataset; this should be an empty array
    window = GenomicSegment("chrVI", 5, 1000, "+")

    fetched = default_reader[window]
    assert_equal(len(fetched), len(window), "fetched wrong size")

    # Disabled checks:
    # assert_true(numpy.isnan(default_reader[window]).all(),
    #             "default not nan")
    #
    # assert_true(numpy.isnan(nan_reader[window]).all(),
    #             "nanfill didn't work")

    all_zero = (zero_reader[window] == 0).all()
    assert_true(all_zero, "0-fill didn't work")
def revcomp_mask_chain(seg, k, offset=0):
    """Build the minus-strand counterpart of a single-interval plus-strand mask

    Parameters
    ----------
    seg : |SegmentChain|
        Plus-strand mask, including `offset`

    k : int
        Length of k-mers

    offset : int, optional
        Offset from 5' end of read at which to map mask (Default: `0`)

    Returns
    -------
    |SegmentChain|
        Mask on minus strand corresponding to `seg`
    """
    # Derivation of the shift, with FW / RC the plus- / minus-strand
    # coordinates of the same k-mer:
    #
    #     RC = FW + k - 1 - offset
    #
    # `seg` already holds FW + offset, therefore
    #
    #     RC + offset = (FW + offset) + k - 1 - offset
    #     RC          = (FW + offset) + k - 1 - 2*offset
    shift = k - 1 - 2 * offset

    plus_span = seg.spanning_segment
    minus_segment = GenomicSegment(plus_span.chrom,
                                   plus_span.start + shift,
                                   plus_span.end + shift,
                                   "-")
    return SegmentChain(minus_segment)
def test_search_fields_multivalue(self):
    """Search one indexed field for several values at once.

    A search for values without matches must return nothing; a search
    of ``Name`` for two transcript names must return exactly those two
    transcripts, in order.
    """

    def transcript(exons, **attr):
        # All expected transcripts share chromosome, strand and these
        # attributes; only exon coordinates and the remaining
        # attributes vary.
        segments = [GenomicSegment('2L', start, end, '+')
                    for start, end in exons]
        return SegmentChain(*segments,
                            color='#000000',
                            gene_id='FBgn0005278',
                            score='0.0',
                            type='exon',
                            **attr)

    bigbed = BigBedReader(self.bb_indexed)

    no_hits = list(
        bigbed.search("name", "should_have_no_match",
                      "should_also_have_no_match"))
    self.assertEqual([], no_hits)

    hits = list(bigbed.search("Name", "Sam-S-RE", "Sam-S-RK"))
    wanted = [
        transcript(
            [(106902, 107000), (107764, 107838), (108587, 108809),
             (110405, 110483), (110754, 110877), (111906, 112019),
             (112689, 113369), (113433, 114432)],
            Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
            ID='FBtr0089437',
            Name='Sam-S-RE',
            thickend='113542',
            thickstart='108685'),
        transcript(
            [(107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 111337)],
            Alias='na',
            ID='FBtr0308091',
            Name='Sam-S-RK',
            thickend='110900',
            thickstart='108685'),
    ]
    self.assertEqual(wanted, hits)
def _mask_pair(chrom, start, end, k, offset):
    """Return the (plus, minus) mask chains for one run of repetitive k-mers"""
    plus_chain = SegmentChain(GenomicSegment(chrom, start, end, "+"))
    minus_chain = revcomp_mask_chain(plus_chain, k, offset)
    return plus_chain, minus_chain


def fa_to_bed(toomany_fh, k, offset=0):
    """Find genomic origins of reads in a `bowtie`_ ``toomany`` file,
    merging reads from adjacent positions into single features

    Parameters
    ----------
    toomany_fh : file-like
        Open filehandle to fasta-formatted ``toomany`` file from `bowtie`_

    k : int
        Length of k-mers

    offset : int, optional
        Offset from 5' end of read at which to map read, if any
        (Default: `0`)

    Yields
    ------
    tuple
        Pair of (plus-strand |SegmentChain|, minus-strand |SegmentChain|)
        representing one repetitive region. Nothing is yielded if the
        file contains no reads.

    Raises
    ------
    MalformedFileError
        If a read name cannot be parsed, or if reads from the same
        chromosome are not sorted by strictly increasing position
    """
    last_chrom = None
    last_pos = None
    start_pos = None

    for n, read_name in enumerate(FastaNameReader(toomany_fh)):
        match = namepat.search(read_name)
        if match is None:
            msg = "Could not parse chromosome and position from read %s! Aborting." % read_name
            raise MalformedFileError(toomany_fh, msg, line_num=n)

        chrom, pos = match.groups()
        pos = int(pos) + offset

        if chrom != last_chrom:
            # new chromosome: the previous run (if any) is complete
            finished = None
            if last_chrom is not None:
                finished = _mask_pair(last_chrom, start_pos, last_pos + 1,
                                      k, offset)
            last_chrom = chrom
            start_pos = pos
            last_pos = pos
            if finished is not None:
                yield finished
        else:
            delta = pos - last_pos
            if delta > 1:
                # gap on the same chromosome: close the run, start a new one
                finished = _mask_pair(chrom, start_pos, last_pos + 1,
                                      k, offset)
                last_pos = pos
                start_pos = pos
                yield finished
            elif delta == 1:
                # adjacent k-mer: extend the current run
                last_pos = pos
            else:
                msg = "k-mers are not sorted at read %s! Aborting." % read_name
                raise MalformedFileError(toomany_fh, msg, line_num=n)

    # export final feature; skipped for an empty file, where no run was
    # ever started
    if last_chrom is not None:
        yield _mask_pair(last_chrom, start_pos, last_pos + 1, k, offset)
], # these below are all 1 nucleotide outside match range 'YBR215W_mRNA_0' : ['YBR215W_mRNA_0:0-105^189-2175(+)', 'YBR215W_mRNA_0:0-109^193-2175(+)'], 'YHL001W_mRNA_0' : ['YHL001W_mRNA_0:0-143^541-961(+)', 'YHL001W_mRNA_0:0-149^547-961(+)'], 'YIL018W_mRNA_0' : ['YIL018W_mRNA_0:0-28^428-1280(+)', 'YIL018W_mRNA_0:0-34^434-1280(+)'], 'YIL133C_mRNA_0' : ['YIL133C_mRNA_0:0-644^934-1007(-)', 'YIL133C_mRNA_0:0-650^940-1007(-)'], 'YIL156W_B_mRNA_0': ['YIL156W_B_mRNA_0:0-40^102-408(+)', 'YIL156W_B_mRNA_0:0-45^107-408(+)'], 'YKL006W_mRNA_0' : ['YKL006W_mRNA_0:0-154^552-954(+)', 'YKL006W_mRNA_0:0-160^558-954(+)'], 'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-324^396-729(-)', 'YMR194C_B_mRNA_0:0-328^400-729(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-410^648-697(-)', 'YPL249C_A_mRNA_0:0-417^655-697(-)'] } unmatched_query_juncs = { K : [SegmentChain.from_str(X) for X in V] for K,V in unmatched_query_juncs.items() } """Query junctions with no known matches""" unmatched_noncan_query_juncs = ["YNL130C:0-23^145-180(-)", "YNL130C:0-53^165-180(-)", "YNL130C:0-70^141-180(-)", "YNL130C:0-49^121-180(-)", ] unmatched_noncan_query_juncs = [SegmentChain.from_str(X) for X in unmatched_noncan_query_juncs] """Query junctions without canonical splice junctions in the match range""" repetitive_regions = [ "YBR215W_mRNA_0:190-193(+)", # threeprime splice site plus "YHL001W_mRNA_0:144-149(+)", # fiveprime splice site plus "YIL133C_mRNA_0:935-940(-)", # threeprime splice site minus "YMR194C_B_mRNA_0:325-328(-)", # fiveprime splice site minus ] cross_hash = GenomeHash([SegmentChain(GenomicSegment.from_str(X)) for X in repetitive_regions]) cross_hash_seqs = { X.chrom for X in cross_hash.feature_dict.values() }
reader = BED_Reader(cStringIO.StringIO(_NARROW_PEAK_TEXT), extra_columns=14) with warnings.catch_warnings(record=True) as warns: warnings.simplefilter("always") ltmp = list(reader) assert_greater_equal(len(warns), 0) #=============================================================================== # INDEX: test data #=============================================================================== # test dataset, constructed manually to include various edge cases _TEST_SEGMENTCHAINS = [ # single-interval SegmentChain(GenomicSegment("chrA", 100, 1100, "+"), ID="IVC1p"), SegmentChain(GenomicSegment("chrA", 100, 1100, "-"), ID="IVC1m"), # multi-interval SegmentChain(GenomicSegment("chrA", 100, 1100, "+"), GenomicSegment("chrA", 2100, 2600, "+"), ID="IVC2p"), SegmentChain(GenomicSegment("chrA", 100, 1100, "-"), GenomicSegment("chrA", 2100, 2600, "-"), ID="IVC2m"), # multi-interval, with score SegmentChain(GenomicSegment("chrA", 100, 1100, "+"), GenomicSegment("chrA", 2100, 2600, "+"), ID="IVC3p", score=500), SegmentChain(GenomicSegment("chrA", 100, 1100, "-"), GenomicSegment("chrA", 2100, 2600, "-"),
def test_search_fields_singlevalue(self):
    """Search indexed fields for a single value.

    Covers a search with no matches, a search of ``Name`` matching one
    transcript, and a search of ``gene_id`` matching every transcript
    of the gene (compared irrespective of order).
    """

    def transcript(exons, **attr):
        # All expected transcripts share chromosome, strand and these
        # attributes; only exon coordinates and the remaining
        # attributes vary.
        segments = [GenomicSegment('2L', start, end, '+')
                    for start, end in exons]
        return SegmentChain(*segments,
                            color='#000000',
                            gene_id='FBgn0005278',
                            score='0.0',
                            type='exon',
                            **attr)

    bigbed = BigBedReader(self.bb_indexed)

    no_hits = list(bigbed.search("name", "should_have_no_match"))
    self.assertEqual([], no_hits)

    def sam_s_re():
        # expected by both the `Name` and the `gene_id` search
        return transcript(
            [(106902, 107000), (107764, 107838), (108587, 108809),
             (110405, 110483), (110754, 110877), (111906, 112019),
             (112689, 113369), (113433, 114432)],
            Alias="'['M(2)21AB-RE', 'CG2674-RE']'",
            ID='FBtr0089437',
            Name='Sam-S-RE',
            thickend='113542',
            thickstart='108685')

    by_name = list(bigbed.search("Name", "Sam-S-RE"))
    self.assertEqual([sam_s_re()], by_name)

    by_gene = list(bigbed.search("gene_id", "FBgn0005278"))
    wanted = [
        sam_s_re(),
        transcript(
            [(107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 111337)],
            Alias='na',
            ID='FBtr0308091',
            Name='Sam-S-RK',
            thickend='110900',
            thickstart='108685'),
        transcript(
            [(107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (111906, 112019),
             (112689, 113369), (113433, 114210)],
            Alias="'['M(2)21AB-RB', 'CG2674-RB']'",
            ID='FBtr0089428',
            Name='Sam-S-RB',
            thickend='112741',
            thickstart='108685'),
        transcript(
            [(107760, 107838), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)],
            Alias="'['M(2)21AB-RA', 'CG2674-RA']'",
            ID='FBtr0089429',
            Name='Sam-S-RA',
            thickend='113542',
            thickstart='108685'),
        transcript(
            [(107760, 107956), (108587, 108809), (110405, 110483),
             (110754, 110877), (112689, 113369), (113433, 114432)],
            Alias='na',
            ID='FBtr0330656',
            Name='Sam-S-RL',
            thickend='112781',
            thickstart='108685'),
        transcript(
            [(107936, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114210)],
            Alias="'['M(2)21AB-RH', 'CG2674-RH']'",
            ID='FBtr0089432',
            Name='Sam-S-RH',
            thickend='113542',
            thickstart='108685'),
        transcript(
            [(107936, 108101), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)],
            Alias="'['M(2)21AB-RD', 'CG2674-RD']'",
            ID='FBtr0089430',
            Name='Sam-S-RD',
            thickend='113542',
            thickstart='108685'),
        transcript(
            [(107936, 108101), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (112689, 113369),
             (113433, 114432)],
            Alias="'['M(2)21AB-RC', 'CG2674-RC']'",
            ID='FBtr0089431',
            Name='Sam-S-RC',
            thickend='113542',
            thickstart='108685'),
        transcript(
            [(108088, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)],
            Alias="'['M(2)21AB-RF', 'CG2674-RF']'",
            ID='FBtr0089433',
            Name='Sam-S-RF',
            thickend='113542',
            thickstart='108685'),
        transcript(
            [(108132, 108346), (108587, 108809), (110405, 110483),
             (110754, 110877), (111906, 112019), (112689, 113369),
             (113433, 114432)],
            Alias="'['M(2)21AB-RI', 'CG2674-RI']'",
            ID='FBtr0089434',
            Name='Sam-S-RI',
            thickend='113542',
            thickstart='108685'),
        transcript(
            [(108132, 108226), (108587, 108809), (110405, 110483),
             (110754, 110877), (111004, 111117), (112689, 113369),
             (113433, 114432)],
            Alias="'['M(2)21AB-RJ', 'CG2674-RJ']'",
            ID='FBtr0089435',
            Name='Sam-S-RJ',
            thickend='113542',
            thickstart='108685'),
        transcript(
            [(109593, 109793), (110405, 110483), (110754, 110877),
             (111004, 111117), (112689, 113369), (113433, 114210)],
            Alias="'['M(2)21AB-RG', 'CG2674-RG']'",
            ID='FBtr0089436',
            Name='Sam-S-RG',
            thickend='113542',
            thickstart='109750'),
    ]
    # order of results is not part of the contract for this search
    self.assertEqual(sorted(wanted), sorted(by_gene))
CCCTCCTTCCGCTGGCCCCGACTGC >chr30b:1(+) CCTCCTTCCGCTGGCCCCGACTGCC >chr30b:2(+) CTCCTTCCGCTGGCCCCGACTGCCC >chr30b:3(+) TCCTTCCGCTGGCCCCGACTGCCCC >chr30b:4(+) CCTTCCGCTGGCCCCGACTGCCCCA >chr30b:5(+) CTTCCGCTGGCCCCGACTGCCCCAG """ CROSSMAP1 = [ ( SegmentChain(GenomicSegment("chr50a", 1, 10, "+")), SegmentChain(GenomicSegment("chr50a", 1 + 25 - 1, 10 + 25 - 1, "-")), ), ( SegmentChain(GenomicSegment("chr50a", 19, 26, "+")), SegmentChain(GenomicSegment("chr50a", 19 + 25 - 1, 26 + 25 - 1, "-")), ), ( SegmentChain(GenomicSegment("chr30b", 0, 6, "+")), SegmentChain(GenomicSegment("chr30b", 0 + 25 - 1, 6 + 25 - 1, "-")), ) ] CROSSMAP2 = [ ( SegmentChain(GenomicSegment("chr50a", 1 + 1000, 10 + 1000, "+")),
'YMR194C_B_mRNA_0': ['YMR194C_B_mRNA_0:0-324^396-729(-)', 'YMR194C_B_mRNA_0:0-328^400-729(-)'], 'YPL249C_A_mRNA_0': ['YPL249C_A_mRNA_0:0-410^648-697(-)', 'YPL249C_A_mRNA_0:0-417^655-697(-)'] } unmatched_query_juncs = { K: [SegmentChain.from_str(X) for X in V] for K, V in unmatched_query_juncs.items() } """Query junctions with no known matches""" unmatched_noncan_query_juncs = [ "YNL130C:0-23^145-180(-)", "YNL130C:0-53^165-180(-)", "YNL130C:0-70^141-180(-)", "YNL130C:0-49^121-180(-)", ] unmatched_noncan_query_juncs = [ SegmentChain.from_str(X) for X in unmatched_noncan_query_juncs ] """Query junctions without canonical splice junctions in the match range""" repetitive_regions = [ "YBR215W_mRNA_0:190-193(+)", # threeprime splice site plus "YHL001W_mRNA_0:144-149(+)", # fiveprime splice site plus "YIL133C_mRNA_0:935-940(-)", # threeprime splice site minus "YMR194C_B_mRNA_0:325-328(-)", # fiveprime splice site minus ] cross_hash = GenomeHash( [SegmentChain(GenomicSegment.from_str(X)) for X in repetitive_regions]) cross_hash_seqs = {X.chrom for X in cross_hash.feature_dict.values()}
def test_exit_status():
    """Yield exit-status checks for :py:mod:`plastid.bin.test_table_equality`.

    Random tables are written to temporary files in several variants:
    with and without a header row, with an extra column, with that
    extra column slightly perturbed, and with rows shuffled. Each
    yielded test passes a pair of those files, plus options, to
    ``check_exit_status`` together with the exit status expected.

    Temporary files are removed once all tests have been generated,
    and also if setup fails or the generator is closed early.
    """
    # define columns; every column has `size` elements
    cols = {
        "intA": numpy.random.randint(0, high=2**16, size=size),
        "intB": numpy.random.randint(-10, high=20, size=size),
        "idxA": numpy.arange(size),
        "chrA": numpy.array([chr(65 + (X % (91 - 65))) for X in range(size)]),
        "strA": numpy.array([str(GenomicSegment("chrA", X, X + 500, "+"))
                             for X in range(size)]),
        # floor division keeps coordinates integral under Python 3
        "strB": numpy.array([str(GenomicSegment("chrB", X // 2, X // 2 + 500, "-"))
                             for X in range(size)]),
        "floatA": 10 * numpy.random.randn(size) + 500,
        "floatB": (10**-5) * numpy.random.random(size),
        "objA": numpy.tile(None, size),
        "objB": numpy.array([GenomicSegment("chrC", X, X + Y, "+")
                             for X, Y in zip(range(size),
                                             numpy.random.randint(2, high=1000, size=size))]),
    }

    tmpfiles = []

    def _new_tmpfile():
        # delete=False: each file is closed after writing and is only
        # referred to by name afterwards
        fh = NamedTemporaryFile(delete=False, mode="w")
        tmpfiles.append(fh)
        return fh

    try:
        # allocate temp files we will use
        headerfile = _new_tmpfile()
        headerfile_extra_cols = _new_tmpfile()
        headerfile_extra_cols_diff = _new_tmpfile()
        headerfile_extra_cols_shuffled = _new_tmpfile()
        headerfile_shuffled = _new_tmpfile()

        noheaderfile = _new_tmpfile()
        noheaderfile_extra_cols = _new_tmpfile()
        noheaderfile_extra_cols_diff = _new_tmpfile()
        noheaderfile_extra_cols_shuffled = _new_tmpfile()
        noheaderfile_shuffled = _new_tmpfile()

        # write values; headerless files get a fixed column order so
        # that columns can be addressed by integer position
        keyorder = ["idxA"] + sorted(set(cols) - {"idxA"})

        table1 = pd.DataFrame(cols)
        table1.to_csv(headerfile, index=False, header=True, sep="\t")
        table1.to_csv(noheaderfile, index=False, header=False, sep="\t",
                      columns=keyorder)
        headerfile.close()
        noheaderfile.close()

        # same table plus an extra column
        table1["extra"] = 2**7 * numpy.random.random(size=size)
        table1.to_csv(headerfile_extra_cols, index=False, header=True, sep="\t")
        table1.to_csv(noheaderfile_extra_cols, index=False, header=False, sep="\t",
                      columns=["extra"] + keyorder)
        headerfile_extra_cols.close()
        noheaderfile_extra_cols.close()

        # extra column perturbed by less than the tolerance used below
        table1["extra"] += 10**-4 * numpy.random.random(size=size)
        table1.to_csv(headerfile_extra_cols_diff, index=False, header=True, sep="\t")
        table1.to_csv(noheaderfile_extra_cols_diff, index=False, header=False, sep="\t",
                      columns=["extra"] + keyorder)
        headerfile_extra_cols_diff.close()
        noheaderfile_extra_cols_diff.close()

        # rows shuffled
        shufidx = numpy.arange(size)
        shuffle(shufidx)

        table2 = pd.DataFrame({K: V[shufidx] for K, V in cols.items()})
        table2.to_csv(headerfile_shuffled, index=False, header=True, sep="\t")
        table2.to_csv(noheaderfile_shuffled, index=False, header=False, sep="\t",
                      columns=keyorder)
        headerfile_shuffled.close()
        noheaderfile_shuffled.close()

        # Index the underlying array: assigning a Series would be
        # re-aligned on its index by pandas and undo the shuffle
        table2["extra"] = table1["extra"].values[shufidx]
        table2.to_csv(headerfile_extra_cols_shuffled, index=False, header=True, sep="\t")
        table2.to_csv(noheaderfile_extra_cols_shuffled, index=False, header=False, sep="\t",
                      columns=["extra"] + keyorder)
        headerfile_extra_cols_shuffled.close()
        noheaderfile_extra_cols_shuffled.close()

        # Define tests, as tuples of:
        #   - Test name/description
        #   - Command-line arguments to pass to :py:mod:`plastid.bin.test_table_equality`
        #   - Expected exit code/returns status for :py:func:`main`
        tests = [
            ("same",
             "%s %s" % (headerfile.name, headerfile.name),
             0),
            ("diff_column_names",
             "%s %s" % (headerfile.name, headerfile_extra_cols.name),
             1),
            ("extra_column_names_ignored",
             "%s %s --exclude extra" % (headerfile.name, headerfile_extra_cols.name),
             0),
            ("shuffled_rows",
             "%s %s" % (headerfile.name, headerfile_shuffled.name),
             1),
            ("shuffled_rows_name_sort",
             "%s %s --sort_keys idxA" % (headerfile.name, headerfile_shuffled.name),
             0),
            ("shuffled_rows_multi_name_sort",
             "%s %s --sort_keys strB chrA" % (headerfile.name, headerfile_shuffled.name),
             0),
            ("same_column_names_diff_values",
             "%s %s" % (headerfile_extra_cols.name, headerfile_extra_cols_diff.name),
             1),
            ("same_column_names_diff_values_tol",
             "%s %s --tol 0.01" % (headerfile_extra_cols.name, headerfile_extra_cols_diff.name),
             0),
            ("same_column_names_diff_values_ignored",
             "%s %s --exclude extra" % (headerfile_extra_cols.name, headerfile_extra_cols_diff.name),
             0),
            ("shuffled_rows_extra_columns_ignored",
             "%s %s --exclude extra" % (headerfile.name, headerfile_extra_cols_shuffled.name),
             1),
            ("shuffled_rows_extra_columns_ignored_name_sort",
             "%s %s --exclude extra --sort_keys idxA" % (headerfile.name, headerfile_extra_cols_shuffled.name),
             0),
            ("noheader_same",
             "%s %s --no_header" % (noheaderfile.name, noheaderfile.name),
             0),
            ("noheader_extra_columns",
             "%s %s --no_header" % (noheaderfile.name, noheaderfile_extra_cols.name),
             1),
            ("noheader_shuffled_rows",
             "%s %s --no_header" % (noheaderfile.name, noheaderfile_shuffled.name),
             1),
            ("noheader_shuffled_rows_int_sort",
             "%s %s --no_header --sort_keys 0" % (noheaderfile.name, noheaderfile_shuffled.name),
             0),
            ("noheader_diff_values",
             "%s %s --no_header" % (noheaderfile_extra_cols.name, noheaderfile_extra_cols_diff.name),
             1),
            ("noheader_diff_values_tol",
             "%s %s --no_header --tol 0.01" % (noheaderfile_extra_cols.name, noheaderfile_extra_cols_diff.name),
             0),
            ("no_header_diff_values_ignored",
             "%s %s --no_header --exclude 0" % (noheaderfile_extra_cols.name, noheaderfile_extra_cols_diff.name),
             0),
            ("no_header_shuffled_rows_extra_columns_ignored_int_sort",
             "%s %s --no_header --exclude 0 --sort_keys 1" % (noheaderfile_extra_cols.name, noheaderfile_extra_cols_shuffled.name),
             0),
        ]

        for test_name, argstr, expected_exit in tests:
            yield check_exit_status, test_name, argstr, expected_exit
    finally:
        # clean up; close() is a no-op for handles already closed above
        for fh in tmpfiles:
            fh.close()
            os.unlink(fh.name)
        cleanup_resources()
def setUpClass(cls):
    """Create plus-strand masks and the minus-strand masks expected from them.

    ``cls.ivcs`` maps ``"plus"`` to three single-segment chains on
    ``chrA``, and each ``"minus_k<K>_off<OFFSET>"`` key to the same
    spans on the minus strand, shifted by ``K - 1 - 2 * OFFSET``.
    """
    plus_spans = [(0, 100), (50, 100), (50, 51)]

    def shifted_minus(k, offset):
        # minus-strand chains at the plus-strand spans moved by the shift
        shift = k - 1 - 2 * offset
        return [
            SegmentChain(GenomicSegment("chrA", start + shift, end + shift, "-"))
            for start, end in plus_spans
        ]

    cls.ivcs = {
        "plus": [
            SegmentChain(GenomicSegment("chrA", start, end, "+"))
            for start, end in plus_spans
        ],
        "minus_k25_off0": shifted_minus(25, 0),
        "minus_k50_off0": shifted_minus(50, 0),
        "minus_k25_off10": shifted_minus(25, 10),
        "minus_k50_off10": shifted_minus(50, 10),
    }
def find_canonicals_in_range(query_junc, minus_range, plus_range, genome, canonicals):
    """Find canonical splice junctions within `minus_range...plus_range`
    of `query_junc`

    A canonical junction is reported when its boundaries are:

      1. within `minus_range...plus_range` of the boundaries of the
         query junction,

      2. separated by the same nucleotide distance as the boundaries
         of `query_junc` (both boundaries are moved by the same shift),

      3. on the same chromosome and strand.

    Parameters
    ----------
    query_junc : |SegmentChain|
        A two-exon fragment representing a query splice junction

    minus_range : int <= 0
        Maximum number of nucleotides the junction could be moved to the
        left without reducing sequence support for the junction;
        see :py:func:`find_match_range`

    plus_range : int >= 0
        Maximum number of nucleotides the junction could be moved to the
        right without reducing sequence support for the junction;
        see :py:func:`find_match_range`

    genome : dict
        dict mapping chromosome names to
        :py:class:`Bio.SeqRecord.SeqRecord` s

    canonicals : list
        dinucleotide sequences to consider as canonical splice sites,
        as a list of tuples. e.g. `[("GT","AG"), ("GC","AG")]`

    Returns
    -------
    list
        List of |SegmentChains| representing canonical splice junctions
        in `minus_range...plus_range` of `query_junc`
    """
    chrom = query_junc.chrom
    strand = query_junc.strand

    first_exon = query_junc[0]
    second_exon = query_junc[1]
    first_start, first_end = first_exon.start, first_exon.end
    second_start, second_end = second_exon.start, second_exon.end

    matches = []
    for shift in range(minus_range, plus_range + 1):
        # boundaries of the candidate junction after shifting
        left_edge = first_end + shift
        right_edge = second_start + shift

        for pair in canonicals:
            # two nucleotides immediately after the first exon
            if str(genome[chrom][left_edge:left_edge + 2].seq) == pair[0]:
                # two nucleotides immediately before the second exon
                if str(genome[chrom][right_edge - 2:right_edge].seq) == pair[1]:
                    matches.append(
                        SegmentChain(
                            GenomicSegment(chrom, first_start, left_edge, strand),
                            GenomicSegment(chrom, right_edge, second_end, strand)))

    return matches