Esempio n. 1
0
 def test_simulate_reads(self):
     """Reads simulated with ``k=25`` from ``SHORT_FASTA`` must equal ``SHORT_FASTA_KMERS``."""
     out_buffer = cStringIO.StringIO()
     fasta_source = cStringIO.StringIO(SHORT_FASTA)
     for record in SeqIO.parse(fasta_source, "fasta"):
         # every record writes its reads into the same shared buffer
         simulate_reads(record, k=25, fh=out_buffer)
     # compare everything written to the buffer against the expected text
     self.assertEqual(out_buffer.getvalue(), SHORT_FASTA_KMERS)
Esempio n. 2
0
 def test_track_subtype_raises_warning_if_wrong_extra_columns(self):
     """Reading ``_NARROW_PEAK_TEXT`` with ``extra_columns=14`` must emit a warning.

     The reader is both created and exhausted inside the capture context so
     that a warning is recorded regardless of whether it is issued at
     construction time or while iterating.
     """
     with warnings.catch_warnings(record=True) as warns:
         # "always" disables the once-per-location suppression, which could
         # otherwise hide the warning if another test triggered it earlier
         warnings.simplefilter("always")
         reader = BED_Reader(cStringIO.StringIO(_NARROW_PEAK_TEXT),
                             extra_columns=14)
         # exhaust the reader; only the recorded warnings matter here
         list(reader)

     # at least one warning is required (``>= 0`` could never fail)
     assert_greater_equal(len(warns), 1)
Esempio n. 3
0
    def test_fiveprime_variable_from_file(self):
        """Check map functions built by ``VariableFivePrimeMapFactory.from_file``.

        Two offset tables are tested on each strand: one holding only a
        ``"default"`` offset of 0, and one ("fancy") mapping every read length
        in ``range(25, 40)`` to half that length. Each table is written as
        tab-separated ``key<TAB>value`` lines into an in-memory file, loaded
        through ``from_file``, and the resulting count array is compared
        against the expected counts.
        """
        # read length -> offset
        fancy = {X: X // 2 for X in range(25, 40)}

        # expected counts for the "fancy" table, computed by hand:
        # count each read at its offset-th position, measured from the start
        # of ``read.positions`` on "+" and from its end on "-"
        fancy_reads = {X: numpy.zeros(2000) for X in self.strands}
        for read in self.reads["+"]:
            fancy_reads["+"][read.positions[fancy[len(read.positions)]]] += 1
        for read in self.reads["-"]:
            fancy_reads["-"][read.positions[-fancy[len(read.positions)] - 1]] += 1

        # (test name, strand) -> (offset table, expected count array)
        offset_dicts = {
            ("default_only", "+"): ({"default": 0},
                                    self.expected[("fiveprime", 0, "+")]),
            ("default_only", "-"): ({"default": 0},
                                    self.expected[("fiveprime", 0, "-")]),
            ("fancy", "+"): (fancy, fancy_reads["+"]),
            ("fancy", "-"): (fancy, fancy_reads["-"]),
        }

        for (name, strand), (dict_, expected) in offset_dicts.items():
            fhlike = cStringIO.StringIO(
                "\n".join("%s\t%s" % (K, V) for K, V in dict_.items()))
            fn = VariableFivePrimeMapFactory.from_file(fhlike)
            # the map function also returns the reads; only counts are checked
            _, count_array = fn(self.reads[strand], self.segs[strand])
            msg = "Failed fiveprime variable mapping for test %s(%s)" % (name, strand)
            assert_true((count_array == expected).all(), msg)
Esempio n. 4
0
    def test_transcript_cds_start_end_8to12_columns(self):
        """Checks equality of endpoints of coding regions for Transcript objects

        Generator test: yields ``(self.check_equal, known, found, message)``
        tuples. For every BED text in ``self.data`` the transcripts read by
        ``BED_Reader`` are paired with ``_TEST_TRANSCRIPTS``; CDS start/end
        values are compared only for inputs with 8 or more columns. A final
        check per input verifies how many records were read.
        """
        for n, data_str in sorted(self.data.items()):
            # Reset for every input: otherwise an empty reader would either
            # raise NameError (first input) or silently reuse the index left
            # over from the previous input and pass the count check below.
            c = -1
            reader = BED_Reader(cStringIO.StringIO(data_str),
                                return_type=Transcript)
            for c, (test_ivc, known_ivc) in enumerate(
                    zip(reader, _TEST_TRANSCRIPTS)):
                if n < 8:
                    # coding-region checks only apply to 8+ column input;
                    # keep iterating so ``c`` still counts the records
                    continue

                err_msg = "Failed thickstart/end equality on %s-column BED input: %s,%s" % (
                    n, known_ivc.attr, test_ivc.attr)

                if known_ivc.attr.get("cds_genome_start") is not None:
                    yield self.check_equal, known_ivc.attr[
                        "cds_start"], test_ivc.attr["cds_start"], err_msg
                    yield self.check_equal, known_ivc.attr[
                        "cds_genome_start"], test_ivc.attr[
                            "cds_genome_start"], err_msg
                    yield self.check_equal, known_ivc.cds_genome_start, test_ivc.cds_genome_start, err_msg
                    yield self.check_equal, known_ivc.cds_start, test_ivc.cds_start, err_msg

                if known_ivc.attr.get("cds_genome_end") is not None:
                    yield self.check_equal, known_ivc.attr[
                        "cds_end"], test_ivc.attr["cds_end"], err_msg
                    yield self.check_equal, known_ivc.attr[
                        "cds_genome_end"], test_ivc.attr[
                            "cds_genome_end"], err_msg
                    yield self.check_equal, known_ivc.cds_genome_end, test_ivc.cds_genome_end, err_msg
                    yield self.check_equal, known_ivc.cds_end, test_ivc.cds_end, err_msg

            # ``c`` is the zero-based index of the last record read
            yield self.check_equal, c, 20 - 1, "Not all intervals loaded! Expected %s, found %s." % (
                20 - 1, c)
Esempio n. 5
0
    def setUpClass(cls):
        """Build the BED text fixtures shared by the tests of this class.

        ``_BED12_DATA`` (no header row) and ``_EXTRA_COLS`` (first row is the
        header) are parsed as tab-separated tables and joined column-wise into
        ``cls.big_df``. For each supported BED column count, ``cls.data`` holds
        the text with just those columns and ``cls.extracol_data`` the same
        text followed by 4 extra columns.
        """
        cls.header = _BED_HEADER
        cls.data = {}
        cls.extracol_data = {}

        def _parse_table(text, header_row):
            # tab-separated text -> DataFrame, without using any column as index
            return pd.read_table(cStringIO.StringIO(text),
                                 header=header_row,
                                 sep="\t",
                                 index_col=None)

        core_columns = _parse_table(_BED12_DATA, None)
        additional_columns = _parse_table(_EXTRA_COLS, 0)
        cls.big_df = pd.concat([core_columns, additional_columns], axis=1)

        bed_column_counts = (3, 4, 5, 6, 8, 9, 12)
        for num_cols in bed_column_counts:
            cls.data[num_cols] = cls.get_bed_subset(cls.header, num_cols, 0)
            cls.extracol_data[num_cols] = cls.get_bed_subset(
                cls.header, num_cols, 4)
Esempio n. 6
0
    def test_track_subtype_parsing(self):
        """Records read from ``_NARROW_PEAK_TEXT`` must match ``_NARROW_PEAK_CHAINS``.

        The ``"color"`` and ``"score"`` attributes are dropped from each
        record before its attribute dict is compared (presumably because the
        expected chains do not carry them -- confirm against the fixtures).
        """
        reader = BED_Reader(cStringIO.StringIO(_NARROW_PEAK_TEXT))

        # -1 so that an empty reader fails the count assertion below instead
        # of raising NameError on an unbound loop variable
        c = -1
        for c, (found, expected) in enumerate(zip(reader,
                                                  _NARROW_PEAK_CHAINS)):
            found.attr.pop("color")
            found.attr.pop("score")
            assert_dict_equal(found.attr, expected.attr)
            assert_equal(found, expected)

        # ``c`` is the zero-based index of the last record compared
        assert_equal(c, len(_NARROW_PEAK_CHAINS) - 1)
Esempio n. 7
0
    def test_fa_to_bed(self):
        """Check ``fa_to_bed`` output on ``CROSSMAP_BLOCK`` with and without an offset.

        Per the original author's notes, the input is a block of
        FASTA-formatted sequence with two blocks on chrA and one on chrB.
        """
        # show complete diffs when the list comparison fails
        self.maxDiff = None

        # (offset, expected blocks): offset 0 first, then offset 1000
        scenarios = (
            (0, CROSSMAP1),
            (1000, CROSSMAP2),
        )
        for offset, expected_blocks in scenarios:
            # each run needs its own stream positioned at the start
            stream = cStringIO.StringIO(CROSSMAP_BLOCK)
            found_blocks = list(fa_to_bed(stream, 25, offset=offset))
            self.assertListEqual(found_blocks, expected_blocks)
Esempio n. 8
0
 def get_bed_subset(cls, header, bed_cols, extra_cols=0):
     """Return selected columns of ``cls.big_df`` as tab-separated text.

     Parameters
     ----------
     header
         Not used by this method

     bed_cols : int
         Number of leading columns to keep (positions ``0 .. bed_cols - 1``)

     extra_cols : int, optional
         Number of additional columns to keep, taken from position 12
         onwards (Default: 0)

     Returns
     -------
     str
         Unquoted, tab-delimited text without header row or index column
     """
     # column positions: the leading BED columns, then the requested extras
     positions = list(range(bed_cols))
     positions.extend(range(12, 12 + extra_cols))
     selected = cls.big_df.columns[positions]

     out = cStringIO.StringIO()
     cls.big_df.to_csv(out,
                       columns=selected,
                       sep="\t",
                       index=False,
                       header=False,
                       quoting=QUOTE_NONE)
     return out.getvalue()
Esempio n. 9
0
    def test_fa_to_bed_throws_expected_error(self):
        """``fa_to_bed`` must raise ``MalformedFileError`` on out-of-order input.

        The input is built by rearranging slices of the lines of
        ``SHORT_FASTA_KMERS``; the same scrambled text is then fed to
        ``fa_to_bed`` under three argument combinations.
        """
        lines = cStringIO.StringIO(SHORT_FASTA_KMERS).read().split("\n")
        # rearrange slices of the input so the blocks are no longer in order
        scrambled = "\n".join(
            lines[40:50] + lines[30:40] + lines[0:16] + lines[20:30])

        def consume(stream, size, offset):
            # fa_to_bed is a generator; it only raises once it is iterated
            return list(fa_to_bed(stream, size, offset=offset))

        # (second positional argument, offset) for each run
        for size, offset in ((25, 0), (25, 1000), (15, 0)):
            fresh_stream = cStringIO.StringIO(scrambled)
            self.assertRaises(MalformedFileError, consume, fresh_stream, size,
                              offset)
Esempio n. 10
0
    def __getitem__(self, roi, stranded=True):
        """Return list of features that overlap the region of interest (roi).

        Note that subscript syntax (``obj[roi]``) can only supply `roi`; to
        pass `stranded`, the method has to be called explicitly.

        Parameters
        ----------
        roi : |GenomicSegment| or |SegmentChain|
            Query feature indicating region of interest

        stranded : bool
            If `True`, retrieve only features on same strand as query feature.
            Otherwise, retrieve features on both strands


        Returns
        -------
        list
            Features that overlap `roi`

        Raises
        ------
        TypeError
            if `roi` is not a |GenomicSegment| or |SegmentChain|
        """
        if isinstance(roi, GenomicSegment):
            roi_seg = roi
            roi_chain = SegmentChain(roi)
        elif isinstance(roi, SegmentChain):
            roi_chain = roi
            roi_seg = roi.spanning_segment
        else:
            raise TypeError(
                "Query feature must be a GenomicSegment or SegmentChain")

        chrom = roi_seg.chrom

        # Query every tabix reader once per segment of the chain and glue all
        # returned lines into a single text that the feature reader can parse
        feature_text = "\n".join(
            "\n".join(R.fetch(chrom, X.start, X.end))
            for R in self.tabix_readers
            for X in roi_chain)

        features = self._reader_class(cStringIO.StringIO(feature_text))

        # The fetches above are made per segment; filter the parsed features
        # with the chain's own overlap test, strand-aware or not as requested
        if stranded:
            overlaps = roi_chain.overlaps
        else:
            overlaps = roi_chain.unstranded_overlaps
        return [X for X in features if overlaps(X)]
Esempio n. 11
0
    def test_ivcollection_thick_start_end_8to12_columns(self):
        """Checks equality of thickstart and thickend attributes for SegmentChain objects

        Generator test: yields ``(self.check_equal, known, found, message)``
        tuples. For every BED text in ``self.data`` the chains read by
        ``BED_Reader`` are paired with ``_TEST_SEGMENTCHAINS``; thickstart and
        thickend are compared only for inputs with 8 or more columns. A final
        check per input verifies how many records were read.
        """
        for n, data_str in sorted(self.data.items()):
            # Reset for every input: otherwise an empty reader would either
            # raise NameError (first input) or silently reuse the index left
            # over from the previous input and pass the count check below.
            c = -1
            reader = BED_Reader(cStringIO.StringIO(data_str),
                                return_type=SegmentChain)
            for c, (test_ivc, known_ivc) in enumerate(
                    zip(reader, _TEST_SEGMENTCHAINS)):
                if n < 8:
                    # thickstart/thickend checks only apply to 8+ column
                    # input; keep iterating so ``c`` still counts the records
                    continue

                err_msg = "Failed thickstart/end equality on %s-column BED input: %s,%s" % (
                    n, known_ivc.attr, test_ivc.attr)
                if known_ivc.attr.get("thickstart") is not None:
                    yield self.check_equal, known_ivc.attr[
                        "thickstart"], test_ivc.attr["thickstart"], err_msg
                if known_ivc.attr.get("thickend") is not None:
                    yield self.check_equal, known_ivc.attr.get(
                        "thickend"), test_ivc.attr["thickend"], err_msg

            # ``c`` is the zero-based index of the last record read
            yield self.check_equal, c, 20 - 1, "Not all intervals loaded! Expected %s, found %s." % (
                20 - 1, c)
Esempio n. 12
0
    def _do(self, data_str, data_format, expected_results):
        """Execute tests on various formats

        Parameters
        ----------
        data_str : str
            wiggle file data

        data_format : str
            name of expected data format

        expected_results : sequence
            Items expected from the reader, in order. Each is compared for
            equality with the corresponding item yielded by `WiggleReader`
        """
        reader = WiggleReader(cStringIO.StringIO(data_str))

        # -1 so that an empty reader (or empty expectation) fails the count
        # assertion below instead of raising NameError
        n = -1
        for n, (tup1, tup2) in enumerate(zip(expected_results, reader)):
            self.assertEqual(tup1, tup2)

        # ``n`` is the zero-based index of the last pair compared
        self.assertEqual(
            n,
            len(expected_results) - 1,
            "Not all results processed: %s vs %s" % (n, len(expected_results)))

        # checked after iteration, as in the original test
        self.assertEqual(reader.data_format, data_format)
Esempio n. 13
0
 def test_iter_bundles(self):
     """``BundledPSL_Reader`` must group consecutive entries that share a query name.

     Every transcript from the PSL test file is written out a random number
     of times (1-9) in a row; the bundled reader is then expected to return
     one group per transcript, with matching size and name.
     """
     # close the source file as soon as it has been read
     with open(MINI["psl_file"]) as psl_fh:
         psl_transcripts = list(PSL_Reader(psl_fh))

     # without this, an empty input would make every check below vacuous
     self.assertGreater(len(psl_transcripts), 0)

     # create a list of entries with same query name, so that these will be bundled by reader
     repeats = numpy.random.randint(1, high=10, size=len(psl_transcripts))
     stmp = "".join(tx.as_psl()
                    for tx, r in zip(psl_transcripts, repeats)
                    for _ in range(r))

     reader = BundledPSL_Reader(cStringIO.StringIO(stmp))

     # -1 so that an empty reader fails the count checks instead of raising
     # NameError on an unbound loop variable
     n = -1
     for n, tx_group in enumerate(reader):
         # make sure groups are correct length
         self.assertEqual(len(tx_group), repeats[n])

         # make sure each group has the correct query name
         names = [X.get_name() for X in tx_group]
         self.assertEqual(len(set(names)), 1)
         self.assertEqual(names[0], psl_transcripts[n].get_name())

     # one group per source transcript
     self.assertEqual(n + 1, len(repeats))
     self.assertEqual(n + 1, len(psl_transcripts))
Esempio n. 14
0
 def test_fasta_name_reader(self):
     """``FastaNameReader`` must yield only the sequence names, then stop."""
     # make sure we get out only names of sequences from FASTA file
     reader = FastaNameReader(cStringIO.StringIO(SHORT_FASTA))
     self.assertEqual(next(reader), "chr50a")
     self.assertEqual(next(reader), "chr30b")
     # use the ``next`` builtin, as above: a bound ``reader.next`` exists only
     # under the Python 2 iterator protocol
     self.assertRaises(StopIteration, next, reader)
Esempio n. 15
0
 def get_stream(self):
     """Return a new ``SkipBlankReader`` wrapping an in-memory copy of ``_NL_TEXT``."""
     raw_stream = cStringIO.StringIO(_NL_TEXT)
     return SkipBlankReader(raw_stream)
Esempio n. 16
0
 def get_stream(self):
     """Return a new ``CommentReader`` wrapping an in-memory copy of ``_COMMENT_TEXT``."""
     raw_stream = cStringIO.StringIO(_COMMENT_TEXT)
     return CommentReader(raw_stream)
Esempio n. 17
0
2L    FlyBase    CDS    8997762    8998386    .    +    1    ID=CDS_FBgn0040964:3_866;Name=CG18661-cds;Parent=FBtr0303880;parent_type=mRNA

2L    FlyBase    mRNA    9397110    9397988    .    -    .    ID=FBtr0079813;Name=CG4438-RA;Parent=FBgn0032115;
2L    FlyBase    mRNA    9397110    9397988    .    -    .    ID=FBtr0303900;Name=CG4438-RB;Parent=FBgn0032115;
2L    FlyBase    exon    9397110    9397772    .    -    .    ID=FBgn0032115:2;Name=CG4438:2;Parent=FBtr0079813;parent_type=mRNA
2L    FlyBase    exon    9397110    9397764    .    -    .    ID=FBgn0032115:1;Name=CG4438:1;Parent=FBtr0303900;parent_type=mRNA
2L    FlyBase    CDS    9397177    9397746    .    -    0    ID=CDS_FBgn0032115:1_867;Name=CG4438-cds;Parent=FBtr0303900;parent_type=mRNA
2L    FlyBase    CDS    9397177    9397746    .    -    0    ID=CDS_FBgn0032115:2_867;Name=CG4438-cds;Parent=FBtr0079813;parent_type=mRNA
2L    FlyBase    exon    9397833    9397988    .    -    .    ID=FBgn0032115:3;Name=CG4438:3;Parent=FBtr0079813,FBtr0303900;parent_type=mRNA
""".replace("    ", "\t")
"""GFF of transcripts used in these tests"""

# Map each assembled transcript's name (``get_name()``) to the transcript
# itself. The GFF text is read from memory, wrapped in SkipBlankReader
# (presumably to drop the blank lines in the text -- confirm) and handed to
# GFF3_TranscriptAssembler.
_TRANSCRIPTS = {
    X.get_name(): X
    for X in GFF3_TranscriptAssembler(
        SkipBlankReader(cStringIO.StringIO(_TRANSCRIPTS_GFF)))
}
"""|Transcript| representation of transcripts used in these tests"""

# Names of the transcripts queried in the tests, presumably keys of
# ``_TRANSCRIPTS`` -- verify against the code that consumes this list.
# NOTE(review): the FBtr0079531 entries look like counterparts of the
# commented minus-strand FBtr0081950 cases on the other strand -- confirm.
_CDS_START_QUERIES = [
    "FBtr0081950",  # minus-strand, CDS internal, UTR longer than flank
    "FBtr0081950_short_utr",  # minus-strand, UTR will be shorter than flank
    "FBtr0081950_no_cds",  # minus-strand, no CDS
    "FBtr0081950_no_utr",  # minus-strand, CDS start at end of tx
    "FBtr0081950_at_splice",  # minus-strand, CDS start at splice junction   
    "FBtr0079531",
    "FBtr0079531_short_utr",
    "FBtr0079531_no_cds",
    "FBtr0079531_no_utr",
    "FBtr0079531_at_splice",
]
Esempio n. 18
0
 def test_via_backwards(self):
     """``FunctionReader`` must yield each line of ``_NL_TEXT`` reversed."""
     def reverse(text):
         return text[::-1]

     transformed = FunctionReader(cStringIO.StringIO(_NL_TEXT), reverse)
     untouched = cStringIO.StringIO(_NL_TEXT)
     # walk both streams in lockstep and compare line by line
     for got, raw in zip(transformed, untouched):
         self.assertEqual(got, reverse(raw))
Esempio n. 19
0
 def test_via_upper(self):
     """``FunctionReader`` must yield each line of ``_NL_TEXT`` upper-cased."""
     transformed = FunctionReader(cStringIO.StringIO(_NL_TEXT), str.upper)
     untouched = cStringIO.StringIO(_NL_TEXT)
     # walk both streams in lockstep and compare line by line
     for got, raw in zip(transformed, untouched):
         expected = raw.upper()
         self.assertEqual(got, expected)
Esempio n. 20
0
    def test_bed_import_3to12_columns(self):
        """Check ``BED_Reader`` output for every BED text in ``self.data``.

        Generator test: yields ``(self.check_equal, known, found[, message])``
        tuples. Each input is read twice -- with the reader's default return
        type, compared against ``_TEST_SEGMENTCHAINS``, and with
        ``return_type=Transcript``, compared against ``_TEST_TRANSCRIPTS``.
        The properties that are checked depend on the number of columns
        ``n`` of the input. A final check per input verifies that as many
        records were read as there are known features.
        """
        tx_reader = functools.partial(BED_Reader, return_type=Transcript)
        tests = [
            (BED_Reader, _TEST_SEGMENTCHAINS, "reader_segmentchain"),
            (tx_reader, _TEST_TRANSCRIPTS, "reader_transcript"),
        ]
        for reader_fn, known_set, name in tests:
            for n, data_str in sorted(self.data.items()):
                c = 0
                for (test_ivc,
                     known_ivc) in zip(reader_fn(cStringIO.StringIO(data_str)),
                                       known_set):
                    # columns: chrom, start, end
                    if n >= 3:
                        # no strand info, so we need to test iv.start, iv.end, iv.chrom
                        err_msg = "%s failed endpoint equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc, test_ivc)
                        yield self.check_equal, known_ivc.spanning_segment.start, test_ivc.spanning_segment.start, err_msg
                        yield self.check_equal, known_ivc.spanning_segment.end, test_ivc.spanning_segment.end, err_msg
                        yield self.check_equal, known_ivc.spanning_segment.chrom, test_ivc.spanning_segment.chrom, err_msg
                    # column: name
                    if n >= 4:
                        err_msg = "%s failed name equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc.attr, test_ivc.attr)
                        yield self.check_equal, known_ivc.attr[
                            "ID"], test_ivc.attr["ID"], err_msg
                    # column: score
                    if n >= 5:
                        err_msg = "%s failed score equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc.attr, test_ivc.attr)
                        yield self.check_equal, known_ivc.attr.get(
                            "score", 0), test_ivc.attr["score"], err_msg
                    # column : strand
                    if n >= 6:
                        err_msg = "%s failed strand equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc, test_ivc)
                        # pass the message along like every other check does
                        yield self.check_equal, known_ivc.spanning_segment.strand, test_ivc.spanning_segment.strand, err_msg
                    # column: color
                    if n >= 9:
                        err_msg = "%s failed color equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc.attr, test_ivc.attr)
                        yield self.check_equal, known_ivc.attr.get(
                            "color",
                            "#000000"), test_ivc.attr["color"], err_msg
                    # columns: exon/block info
                    if n == 12:
                        err_msg = "%s failed block equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc, test_ivc)
                        # segments are asserted immediately rather than yielded
                        for iv1, iv2 in zip(known_ivc, test_ivc):
                            assert_equal(iv1, iv2, err_msg)
                        err_msg = "%s failed position set on %s-column BED input: %s,%s" % (
                            name, n, known_ivc, test_ivc)
                        yield self.check_equal, known_ivc.get_position_set(
                        ), test_ivc.get_position_set(), err_msg

                    c += 1

                # ``c`` counts the records read for this input
                yield self.check_equal, c, len(
                    known_set
                ), "Not all intervals loaded! Expected %s, found %s." % (
                    len(known_set), c)
Esempio n. 21
0
    def test_bed_import_3to12plus4_columns_with_formatters(self):
        """Check ``BED_Reader`` on BED text that carries 4 extra, typed columns.

        Generator test: yields ``(self.check_equal, known, found[, message])``
        tuples. Every text in ``self.extracol_data`` is read with
        ``extra_columns`` given as ``(name, formatter)`` pairs, once as
        |SegmentChain| and once as |Transcript|. For each record the extra
        columns must appear in ``attr`` under their names with the values
        stored in ``self.big_df``; the standard BED columns are then checked
        according to the number of BED columns ``n`` of the input.
        """
        names = [
            ("numcol", int),
            ("floatcol", float),
            ("strcol", str),
            ("attrcol", str),
        ]

        tx_reader = functools.partial(BED_Reader,
                                      return_type=Transcript,
                                      extra_columns=names)
        seg_reader = functools.partial(BED_Reader,
                                       return_type=SegmentChain,
                                       extra_columns=names)
        tests = [
            (seg_reader, _TEST_SEGMENTCHAINS, "reader_segmentchain"),
            (tx_reader, _TEST_TRANSCRIPTS, "reader_transcript"),
        ]
        for reader_fn, known_set, name in tests:
            for n, data_str in sorted(self.extracol_data.items()):
                c = 0
                for (test_ivc,
                     known_ivc) in zip(reader_fn(cStringIO.StringIO(data_str)),
                                       known_set):
                    for x in range(4):
                        colname = names[x][0]
                        # report the missing column by name, not by index
                        assert_true(
                            colname in test_ivc.attr,
                            "Column name '%s' not found in attr dict (%s BED columns)"
                            % (colname, n))
                        # extra columns sit at positions 12.. of ``big_df``;
                        # row ``c`` corresponds to the record being checked
                        assert_equal(test_ivc.attr[colname],
                                     self.big_df.iloc[c, 12 + x])

                    # columns: chrom, start, end
                    if n >= 3:
                        # no strand info, so we need to test iv.start, iv.end, iv.chrom
                        err_msg = "%s failed endpoint equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc, test_ivc)
                        yield self.check_equal, known_ivc.spanning_segment.start, test_ivc.spanning_segment.start, err_msg
                        yield self.check_equal, known_ivc.spanning_segment.end, test_ivc.spanning_segment.end, err_msg
                        yield self.check_equal, known_ivc.spanning_segment.chrom, test_ivc.spanning_segment.chrom, err_msg
                    # column: name
                    if n >= 4:
                        err_msg = "%s failed name equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc.attr, test_ivc.attr)
                        yield self.check_equal, known_ivc.attr[
                            "ID"], test_ivc.attr["ID"], err_msg
                    # column: score
                    if n >= 5:
                        err_msg = "%s failed score equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc.attr, test_ivc.attr)
                        yield self.check_equal, known_ivc.attr.get(
                            "score", 0), test_ivc.attr["score"], err_msg
                    # column : strand
                    if n >= 6:
                        err_msg = "%s failed strand equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc, test_ivc)
                        # pass the message along like every other check does
                        yield self.check_equal, known_ivc.spanning_segment.strand, test_ivc.spanning_segment.strand, err_msg
                    # column: color
                    if n >= 9:
                        err_msg = "%s failed color equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc.attr, test_ivc.attr)
                        yield self.check_equal, known_ivc.attr.get(
                            "color",
                            "#000000"), test_ivc.attr["color"], err_msg
                    # columns: exon/block info
                    if n == 12:
                        err_msg = "%s failed block equality on %s-column BED input: %s,%s" % (
                            name, n, known_ivc, test_ivc)
                        # segments are asserted immediately rather than yielded
                        for iv1, iv2 in zip(known_ivc, test_ivc):
                            assert_equal(iv1, iv2, err_msg)
                        err_msg = "%s failed position set on %s-column BED input: %s,%s" % (
                            name, n, known_ivc, test_ivc)
                        yield self.check_equal, known_ivc.get_position_set(
                        ), test_ivc.get_position_set(), err_msg

                    c += 1

                # ``c`` counts the records read for this input
                yield self.check_equal, c, len(
                    known_set
                ), "Not all intervals loaded! Expected %s, found %s." % (
                    len(known_set), c)