def test_simulate_reads(self):
    """Simulated 25-mer reads over each test sequence should match the expected output."""
    out_buffer = cStringIO.StringIO()
    # write simulated k-mers for every record in the short test genome
    for record in SeqIO.parse(cStringIO.StringIO(SHORT_FASTA), "fasta"):
        simulate_reads(record, k=25, fh=out_buffer)
    # rewind and compare the accumulated output against the known k-mer FASTA
    out_buffer.seek(0)
    self.assertEqual(out_buffer.read(), SHORT_FASTA_KMERS)
def test_track_subtype_raises_warning_if_wrong_extra_columns(self):
    """BED_Reader should warn when ``extra_columns`` disagrees with the track data.

    Bugfix: the original assertion checked ``len(warns) >= 0``, which is
    vacuously true and could never fail; at least one warning must have
    been recorded for this test to mean anything.
    """
    reader = BED_Reader(cStringIO.StringIO(_NARROW_PEAK_TEXT), extra_columns=14)
    with warnings.catch_warnings(record=True) as warns:
        warnings.simplefilter("always")  # ensure no warning category is suppressed
        ltmp = list(reader)  # force full parsing so warnings actually fire
    assert_greater_equal(len(warns), 1)
def test_fiveprime_variable_from_file(self):
    """Test fiveprime-variable mapping when offsets are read from a file.

    Expected count vectors are built by applying read-length-dependent
    offsets by hand, then compared against the output of a
    ``VariableFivePrimeMapFactory`` constructed via ``from_file``.

    Bugfix: removed the unused locals ``fancy_plus`` and ``fancy_minus``,
    which were allocated but never read.
    """
    offset_dicts = {}
    # offset for each read length 25..39 is half the read length
    fancy = {X: X // 2 for X in range(25, 40)}
    fancy_reads = {X: numpy.zeros(2000) for X in self.strands}
    # apply the offsets manually: counted from the 5' end on the plus
    # strand, and from the 3'-most position on the minus strand
    for read in self.reads["+"]:
        fancy_reads["+"][read.positions[fancy[len(read.positions)]]] += 1
    for read in self.reads["-"]:
        fancy_reads["-"][read.positions[-fancy[len(read.positions)] - 1]] += 1
    offset_dicts[("default_only", "+")] = ({"default": 0}, self.expected[("fiveprime", 0, "+")])
    offset_dicts[("default_only", "-")] = ({"default": 0}, self.expected[("fiveprime", 0, "-")])
    offset_dicts[("fancy", "+")] = (fancy, fancy_reads["+"])
    offset_dicts[("fancy", "-")] = (fancy, fancy_reads["-"])
    for (name, strand), (dict_, expected) in offset_dicts.items():
        # serialize the offset dict as tab-delimited text, the format from_file expects
        fhlike = cStringIO.StringIO("\n".join(["%s\t%s" % (K, V) for K, V in dict_.items()]))
        fn = VariableFivePrimeMapFactory.from_file(fhlike)
        reads_out, count_array = fn(self.reads[strand], self.segs[strand])
        msg = "Failed fiveprime variable mapping for test %s(%s)" % (name, strand)
        assert_true((count_array == expected).all(), msg)
def test_transcript_cds_start_end_8to12_columns(self):
    """Checks equality of endpoints of coding regions for Transcript objects"""
    # self.data maps BED column count `n` -> BED text; only inputs with
    # >= 8 columns (those carrying thickStart/thickEnd) define a CDS
    for n, data_str in sorted(self.data.items()):
        for c, (test_ivc, known_ivc) in enumerate(
                zip(
                    BED_Reader(cStringIO.StringIO(data_str), return_type=Transcript),
                    _TEST_TRANSCRIPTS)):
            if n >= 8:
                err_msg = "Failed thickstart/end equality on %s-column BED input: %s,%s" % (
                    n, known_ivc.attr, test_ivc.attr)
                # compare CDS start both via attr entries and via properties
                if known_ivc.attr.get("cds_genome_start", None) is not None:
                    yield self.check_equal, known_ivc.attr[
                        "cds_start"], test_ivc.attr["cds_start"], err_msg
                    yield self.check_equal, known_ivc.attr[
                        "cds_genome_start"], test_ivc.attr[
                            "cds_genome_start"], err_msg
                    yield self.check_equal, known_ivc.cds_genome_start, test_ivc.cds_genome_start, err_msg
                    yield self.check_equal, known_ivc.cds_start, test_ivc.cds_start, err_msg
                # compare CDS end the same way
                if known_ivc.attr.get("cds_genome_end", None) is not None:
                    yield self.check_equal, known_ivc.attr[
                        "cds_end"], test_ivc.attr["cds_end"], err_msg
                    yield self.check_equal, known_ivc.attr[
                        "cds_genome_end"], test_ivc.attr[
                            "cds_genome_end"], err_msg
                    yield self.check_equal, known_ivc.cds_genome_end, test_ivc.cds_genome_end, err_msg
                    yield self.check_equal, known_ivc.cds_end, test_ivc.cds_end, err_msg
        # `c` is the index of the last record parsed; 20 records are expected
        yield self.check_equal, c, 20 - 1, "Not all intervals loaded! Expected %s, found %s." % (
            20 - 1, c)
def setUpClass(cls):
    """Build BED test fixtures shared by all tests in this class.

    Joins the 12-column BED fixture with the extra-column fixture into
    ``cls.big_df``, then pre-renders column subsets (3-12 BED columns,
    with and without 4 extra columns) into ``cls.data`` and
    ``cls.extracol_data``.

    Fix: ``pd.read_table`` is deprecated (removed in pandas 2.0);
    ``pd.read_csv`` with ``sep="\\t"`` is the drop-in replacement.
    """
    cls.header = _BED_HEADER
    cls.data = {}
    cls.extracol_data = {}
    bed_df = pd.read_csv(cStringIO.StringIO(_BED12_DATA), header=None, sep="\t", index_col=None)
    extra_df = pd.read_csv(cStringIO.StringIO(_EXTRA_COLS), header=0, sep="\t", index_col=None)
    cls.big_df = pd.concat([bed_df, extra_df], axis=1)
    # pre-render BED text for each supported column count
    for n in (3, 4, 5, 6, 8, 9, 12):
        cls.data[n] = cls.get_bed_subset(cls.header, n, 0)
        cls.extracol_data[n] = cls.get_bed_subset(cls.header, n, 4)
def test_track_subtype_parsing(self):
    """Records parsed from narrowPeak text should equal the expected chains."""
    record_stream = zip(BED_Reader(cStringIO.StringIO(_NARROW_PEAK_TEXT)),
                        _NARROW_PEAK_CHAINS)
    for idx, (observed, expected) in enumerate(record_stream):
        # color and score are added by the reader; drop them before comparing attrs
        observed.attr.pop("color")
        observed.attr.pop("score")
        assert_dict_equal(observed.attr, expected.attr)
        assert_equal(observed, expected)
    # the last index must account for every expected chain
    assert_equal(idx, len(_NARROW_PEAK_CHAINS) - 1)
def test_fa_to_bed(self):
    """fa_to_bed should emit the expected BED blocks with and without an offset.

    CROSSMAP_BLOCK holds FASTA-formatted sequence with two blocks on one
    chromosome (chrA) and one block on the next (chrB).
    """
    self.maxDiff = None
    # without an offset
    unshifted = list(fa_to_bed(cStringIO.StringIO(CROSSMAP_BLOCK), 25, offset=0))
    self.assertListEqual(unshifted, CROSSMAP1)
    # same input, coordinates shifted by 1000
    shifted = list(fa_to_bed(cStringIO.StringIO(CROSSMAP_BLOCK), 25, offset=1000))
    self.assertListEqual(shifted, CROSSMAP2)
def get_bed_subset(cls, header, bed_cols, extra_cols=0):
    """Render the first `bed_cols` BED columns (plus `extra_cols` extras) as TSV text.

    Extra columns are taken from positions 12 onward of ``cls.big_df``.
    NOTE(review): `header` is accepted but never used here; it is kept
    for backward compatibility with existing callers.
    """
    col_indices = list(range(bed_cols)) + list(range(12, 12 + extra_cols))
    selected = cls.big_df.columns[col_indices]
    out = cStringIO.StringIO()
    # QUOTE_NONE keeps the output as plain, unquoted tab-delimited text
    cls.big_df.to_csv(out, columns=selected, sep="\t", index=False,
                      header=False, quoting=QUOTE_NONE)
    return out.getvalue()
def test_fa_to_bed_throws_expected_error(self):
    """fa_to_bed should raise MalformedFileError on unsorted k-mer input."""
    # take chrA's reads only and scramble their order so the input is unsorted
    lines = cStringIO.StringIO(SHORT_FASTA_KMERS).read().split("\n")
    lines = lines[40:50] + lines[30:40] + lines[0:16] + lines[20:30]
    scrambled = "\n".join(lines)
    # fa_to_bed is a generator; wrap it in list() so iteration -- and
    # therefore the error -- actually happens
    run = lambda stream, k, off: list(fa_to_bed(stream, k, offset=off))
    self.assertRaises(MalformedFileError, run, cStringIO.StringIO(scrambled), 25, 0)
    self.assertRaises(MalformedFileError, run, cStringIO.StringIO(scrambled), 25, 1000)
    self.assertRaises(MalformedFileError, run, cStringIO.StringIO(scrambled), 15, 0)
def __getitem__(self, roi, stranded=True):
    """Return list of features that overlap the region of interest (roi).

    Parameters
    ----------
    roi : |GenomicSegment| or |SegmentChain|
        Query feature indicating region of interest

    stranded : bool
        If `True`, retrieve only features on same strand as query feature.
        Otherwise, retrieve features on both strands

    Returns
    -------
    list
        Features that overlap `roi`

    Raises
    ------
    TypeError
        if `roi` is not a |GenomicSegment| or |SegmentChain|
    """
    # normalize the query: roi_chain enumerates segments to fetch,
    # roi_seg supplies the chromosome name
    if isinstance(roi, GenomicSegment):
        roi_seg = roi
        roi_chain = SegmentChain(roi)
    elif isinstance(roi, SegmentChain):
        roi_chain = roi
        roi_seg = roi.spanning_segment
    else:
        raise TypeError(
            "Query feature must be a GenomicSegment or SegmentChain")
    chrom = roi_seg.chrom
    # pull raw record text for every (tabix reader, query segment) pair,
    # then re-parse the combined text with the configured reader class
    feature_text = "\n".join(["\n".join(list(R.fetch(chrom, X.start, X.end))) \
                              for R in self.tabix_readers \
                              for X in roi_chain])
    features = (self._reader_class(cStringIO.StringIO(feature_text)))
    # tabix fetch is position-only; the overlap checks below enforce
    # strandedness when requested
    if stranded == True:
        features = [X for X in features if roi_chain.overlaps(X)]
    else:
        features = [
            X for X in features if roi_chain.unstranded_overlaps(X)
        ]
    return features
def test_ivcollection_thick_start_end_8to12_columns(self):
    """Checks equality of thickstart and thickend attributes for SegmentChain objects"""
    # self.data maps BED column count `n` -> BED text
    for n, data_str in sorted(self.data.items()):
        for c, (test_ivc, known_ivc) in enumerate(
                zip(
                    BED_Reader(cStringIO.StringIO(data_str), return_type=SegmentChain),
                    _TEST_SEGMENTCHAINS)):
            # thickstart/thickend only exist in BED files with >= 8 columns
            if n >= 8:
                err_msg = "Failed thickstart/end equality on %s-column BED input: %s,%s" % (
                    n, known_ivc.attr, test_ivc.attr)
                if known_ivc.attr.get("thickstart", None) is not None:
                    yield self.check_equal, known_ivc.attr[
                        "thickstart"], test_ivc.attr["thickstart"], err_msg
                if known_ivc.attr.get("thickend", None) is not None:
                    yield self.check_equal, known_ivc.attr.get(
                        "thickend"), test_ivc.attr["thickend"], err_msg
        # `c` is the index of the last record parsed; 20 records are expected
        yield self.check_equal, c, 20 - 1, "Not all intervals loaded! Expected %s, found %s." % (
            20 - 1, c)
def _do(self, data_str, data_format, expected_results):
    """Execute tests on various formats

    Parameters
    ----------
    data_str : str
        wiggle file data

    data_format : str
        name of expected data format

    expected_results : list
        tuples expected from the reader, in order of appearance
    """
    reader = WiggleReader(cStringIO.StringIO(data_str))
    for n, (tup1, tup2) in enumerate(zip(expected_results, reader)):
        # assertEquals is a deprecated alias; use assertEqual
        self.assertEqual(tup1, tup2)
    # every expected tuple must have been consumed
    self.assertEqual(
        n,
        len(expected_results) - 1,
        "Not all results processed: %s vs %s" % (n, len(expected_results)))
    self.assertEqual(reader.data_format, data_format)
def test_iter_bundles(self):
    """BundledPSL_Reader should group consecutive entries sharing a query name.

    Fix: the fixture file handle from ``open(MINI["psl_file"])`` was
    never closed; it is now managed with a ``with`` block.
    """
    # create a list of entries with same query name, so that these will
    # be bundled by the reader
    with open(MINI["psl_file"]) as fh:
        psl_transcripts = list(PSL_Reader(fh))
    repeats = numpy.random.randint(1, high=10, size=len(psl_transcripts))
    stmp = ""
    for tx, r in zip(psl_transcripts, repeats):
        for _ in range(r):
            stmp += tx.as_psl()
    fakefile = cStringIO.StringIO(stmp)
    reader = BundledPSL_Reader(fakefile)
    for n, tx_group in enumerate(reader):
        # make sure groups are correct length
        self.assertEqual(len(tx_group), repeats[n])
        # make sure each group has the correct query name
        names = [X.get_name() for X in tx_group]
        self.assertEqual(len(set(names)), 1)
        self.assertEqual(names[0], psl_transcripts[n].get_name())
    # every input transcript must have produced exactly one bundle
    self.assertEqual(n + 1, len(repeats))
    self.assertEqual(n + 1, len(psl_transcripts))
def test_fasta_name_reader(self):
    """FastaNameReader should yield only the sequence names from a FASTA file.

    Fix: the exhaustion check used the Python-2-only bound method
    ``reader.next``; it now uses the ``next()`` builtin, consistent with
    the calls above and compatible with Python 3.
    """
    # make sure we get out only names of sequences from FASTA file
    reader = FastaNameReader(cStringIO.StringIO(SHORT_FASTA))
    self.assertEqual(next(reader), "chr50a")
    self.assertEqual(next(reader), "chr30b")
    # reader must be exhausted after the two records
    self.assertRaises(StopIteration, next, reader)
def get_stream(self):
    """Return a SkipBlankReader wrapped around the blank-line test fixture."""
    fixture = cStringIO.StringIO(_NL_TEXT)
    return SkipBlankReader(fixture)
def get_stream(self):
    """Return a CommentReader wrapped around the comment-containing test fixture."""
    fixture = cStringIO.StringIO(_COMMENT_TEXT)
    return CommentReader(fixture)
2L FlyBase CDS 8997762 8998386 . + 1 ID=CDS_FBgn0040964:3_866;Name=CG18661-cds;Parent=FBtr0303880;parent_type=mRNA 2L FlyBase mRNA 9397110 9397988 . - . ID=FBtr0079813;Name=CG4438-RA;Parent=FBgn0032115; 2L FlyBase mRNA 9397110 9397988 . - . ID=FBtr0303900;Name=CG4438-RB;Parent=FBgn0032115; 2L FlyBase exon 9397110 9397772 . - . ID=FBgn0032115:2;Name=CG4438:2;Parent=FBtr0079813;parent_type=mRNA 2L FlyBase exon 9397110 9397764 . - . ID=FBgn0032115:1;Name=CG4438:1;Parent=FBtr0303900;parent_type=mRNA 2L FlyBase CDS 9397177 9397746 . - 0 ID=CDS_FBgn0032115:1_867;Name=CG4438-cds;Parent=FBtr0303900;parent_type=mRNA 2L FlyBase CDS 9397177 9397746 . - 0 ID=CDS_FBgn0032115:2_867;Name=CG4438-cds;Parent=FBtr0079813;parent_type=mRNA 2L FlyBase exon 9397833 9397988 . - . ID=FBgn0032115:3;Name=CG4438:3;Parent=FBtr0079813,FBtr0303900;parent_type=mRNA """.replace(" ", "\t") """GFF of transcripts used in these tests""" _TRANSCRIPTS = { X.get_name(): X for X in GFF3_TranscriptAssembler( SkipBlankReader(cStringIO.StringIO(_TRANSCRIPTS_GFF))) } """|Transcript| representation of transcripts used in these tests""" _CDS_START_QUERIES = [ "FBtr0081950", # minus-strand, CDS internal, UTR longer than flank "FBtr0081950_short_utr", # minus-strand, UTR will be shorter than flank "FBtr0081950_no_cds", # minus-strand, no CDS "FBtr0081950_no_utr", # minus-strand, CDS start at end of tx "FBtr0081950_at_splice", # minus-strand, CDS start at splice junction "FBtr0079531", "FBtr0079531_short_utr", "FBtr0079531_no_cds", "FBtr0079531_no_utr", "FBtr0079531_at_splice", ]
def test_via_backwards(self):
    """FunctionReader applying a reversal should yield each input line reversed."""
    flip = lambda line: line[::-1]
    reader = FunctionReader(cStringIO.StringIO(_NL_TEXT), flip)
    # compare each processed line against the raw line from a parallel stream
    for processed, raw in zip(reader, cStringIO.StringIO(_NL_TEXT)):
        self.assertEqual(processed, raw[::-1])
def test_via_upper(self):
    """FunctionReader applying str.upper should yield each input line uppercased."""
    reader = FunctionReader(cStringIO.StringIO(_NL_TEXT), str.upper)
    # compare each processed line against the raw line from a parallel stream
    for processed, raw in zip(reader, cStringIO.StringIO(_NL_TEXT)):
        self.assertEqual(processed, raw.upper())
def test_bed_import_3to12_columns(self):
    """Check BED imports of 3-12 columns for SegmentChain and Transcript readers.

    For each column count in ``self.data``, parsed records are compared
    against known fixtures, testing only the attributes that the given
    number of BED columns can encode.

    Bugfix: in the strand branch, ``err_msg`` was built but never passed
    to the yield; it is now included so failures are diagnosable.
    """
    tx_reader = functools.partial(BED_Reader, return_type=Transcript)
    tests = [
        (BED_Reader, _TEST_SEGMENTCHAINS, "reader_segmentchain"),
        (tx_reader, _TEST_TRANSCRIPTS, "reader_transcript"),
    ]
    for reader_fn, known_set, name in tests:
        for n, data_str in sorted(self.data.items()):
            c = 0
            for (test_ivc, known_ivc) in zip(reader_fn(cStringIO.StringIO(data_str)), known_set):
                # columns: chrom, start, end
                if n >= 3:
                    # no strand info, so we need to test iv.start, iv.end, iv.chrom
                    err_msg = "%s failed endpoint equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc, test_ivc)
                    yield self.check_equal, known_ivc.spanning_segment.start, test_ivc.spanning_segment.start, err_msg
                    yield self.check_equal, known_ivc.spanning_segment.end, test_ivc.spanning_segment.end, err_msg
                    yield self.check_equal, known_ivc.spanning_segment.chrom, test_ivc.spanning_segment.chrom, err_msg
                # column: name
                if n >= 4:
                    err_msg = "%s failed name equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc.attr, test_ivc.attr)
                    yield self.check_equal, known_ivc.attr["ID"], test_ivc.attr["ID"], err_msg
                # column: score
                if n >= 5:
                    err_msg = "%s failed score equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc.attr, test_ivc.attr)
                    yield self.check_equal, known_ivc.attr.get("score", 0), test_ivc.attr["score"], err_msg
                # column: strand
                if n >= 6:
                    err_msg = "%s failed strand equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc, test_ivc)
                    yield self.check_equal, known_ivc.spanning_segment.strand, test_ivc.spanning_segment.strand, err_msg
                # column: color
                if n >= 9:
                    err_msg = "%s failed color equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc.attr, test_ivc.attr)
                    yield self.check_equal, known_ivc.attr.get("color", "#000000"), test_ivc.attr["color"], err_msg
                # columns: exon/block info
                if n == 12:
                    err_msg = "%s failed block equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc, test_ivc)
                    for iv1, iv2 in zip(known_ivc, test_ivc):
                        assert_equal(iv1, iv2, err_msg)
                # position sets must match regardless of column count
                err_msg = "%s failed position set on %s-column BED input: %s,%s" % (
                    name, n, known_ivc, test_ivc)
                yield self.check_equal, known_ivc.get_position_set(), test_ivc.get_position_set(), err_msg
                c += 1
            yield self.check_equal, c, len(known_set), "Not all intervals loaded! Expected %s, found %s." % (
                len(known_set), c)
def test_bed_import_3to12plus4_columns_with_formatters(self):
    """Check BED imports with 4 extra columns parsed via named formatters.

    Same checks as the plain 3-12 column test, plus verification that
    each extra column lands in ``attr`` under its declared name with its
    declared type.

    Bugfixes: the missing-column message interpolated the loop index
    instead of the column name, and the strand yield dropped its
    ``err_msg`` argument.
    """
    names = [
        ("numcol", int),
        ("floatcol", float),
        ("strcol", str),
        ("attrcol", str),
    ]
    tx_reader = functools.partial(BED_Reader, return_type=Transcript, extra_columns=names)
    seg_reader = functools.partial(BED_Reader, return_type=SegmentChain, extra_columns=names)
    tests = [
        (seg_reader, _TEST_SEGMENTCHAINS, "reader_segmentchain"),
        (tx_reader, _TEST_TRANSCRIPTS, "reader_transcript"),
    ]
    for reader_fn, known_set, name in tests:
        for n, data_str in sorted(self.extracol_data.items()):
            c = 0
            for (test_ivc, known_ivc) in zip(reader_fn(cStringIO.StringIO(data_str)), known_set):
                # each extra column must appear in attr under its declared name,
                # with the value from the source DataFrame
                for x in range(4):
                    colname = names[x][0]
                    assert_true(
                        colname in test_ivc.attr,
                        "Column name '%s' not found in attr dict (%s BED columns)" % (colname, n))
                    assert_equal(test_ivc.attr[colname], self.big_df.iloc[c, 12 + x])
                # columns: chrom, start, end
                if n >= 3:
                    # no strand info, so we need to test iv.start, iv.end, iv.chrom
                    err_msg = "%s failed endpoint equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc, test_ivc)
                    yield self.check_equal, known_ivc.spanning_segment.start, test_ivc.spanning_segment.start, err_msg
                    yield self.check_equal, known_ivc.spanning_segment.end, test_ivc.spanning_segment.end, err_msg
                    yield self.check_equal, known_ivc.spanning_segment.chrom, test_ivc.spanning_segment.chrom, err_msg
                # column: name
                if n >= 4:
                    err_msg = "%s failed name equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc.attr, test_ivc.attr)
                    yield self.check_equal, known_ivc.attr["ID"], test_ivc.attr["ID"], err_msg
                # column: score
                if n >= 5:
                    err_msg = "%s failed score equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc.attr, test_ivc.attr)
                    yield self.check_equal, known_ivc.attr.get("score", 0), test_ivc.attr["score"], err_msg
                # column: strand
                if n >= 6:
                    err_msg = "%s failed strand equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc, test_ivc)
                    yield self.check_equal, known_ivc.spanning_segment.strand, test_ivc.spanning_segment.strand, err_msg
                # column: color
                if n >= 9:
                    err_msg = "%s failed color equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc.attr, test_ivc.attr)
                    yield self.check_equal, known_ivc.attr.get("color", "#000000"), test_ivc.attr["color"], err_msg
                # columns: exon/block info
                if n == 12:
                    err_msg = "%s failed block equality on %s-column BED input: %s,%s" % (
                        name, n, known_ivc, test_ivc)
                    for iv1, iv2 in zip(known_ivc, test_ivc):
                        assert_equal(iv1, iv2, err_msg)
                # position sets must match regardless of column count
                err_msg = "%s failed position set on %s-column BED input: %s,%s" % (
                    name, n, known_ivc, test_ivc)
                yield self.check_equal, known_ivc.get_position_set(), test_ivc.get_position_set(), err_msg
                c += 1
            yield self.check_equal, c, len(known_set), "Not all intervals loaded! Expected %s, found %s." % (
                len(known_set), c)