Beispiel #1
0
def sff_filter(in_file, out_file, iterator_filter, inter):
    """Filter an SFF file, writing the surviving reads to a new SFF file.

    The Roche XML manifest of ``in_file`` is copied to the output when it can
    be read; a ``ValueError`` while reading it means the output is written
    without a manifest.

    Args:
        in_file: Path of the SFF file to read (opened in binary mode).
        out_file: Path of the SFF file to write (opened in binary mode).
        iterator_filter: Callable taking an iterator and returning the
            iterable of items to keep. It receives single records, or the
            output of ``pair(...)`` when ``inter`` is true.
        inter: If true, records are grouped with ``pair`` before filtering and
            the kept groups are flattened again before writing (presumably
            interleaved paired reads -- confirm against ``pair``).

    Returns:
        The number of records written, or half that number (the number of
        pairs, always an ``int``) when ``inter`` is true.
    """
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # No readable manifest; write the output without one.
            manifest = None
        in_handle.seek(0)  # start again after getting manifest

        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            if inter:
                from itertools import chain

                kept_pairs = iterator_filter(pair(SffIterator(in_handle)))
                count = writer.write_file(chain.from_iterable(kept_pairs))
                assert count % 2 == 0, "Odd number of records? %i" % count
                # Floor division keeps the pair count an int on Python 3,
                # where "/" would turn it into a float.
                count //= 2
            else:
                count = writer.write_file(
                    iterator_filter(SffIterator(in_handle)))
    return count
Beispiel #2
0
    def test_read(self):
        """Compare raw and trimmed SFF reads with the FASTA/QUAL reference files."""
        sff_path = "Roche/E3MFGYR02_random_10_reads.sff"
        with open(sff_path, "rb") as stream:
            raw_reads = list(SffIterator(stream))
        with open(sff_path, "rb") as stream:
            trimmed_reads = list(SffIterator(stream, trim=True))

        # Reference files for the untrimmed reads.
        raw_fasta = list(
            SeqIO.parse("Roche/E3MFGYR02_random_10_reads_no_trim.fasta", "fasta")
        )
        raw_qual = list(
            SeqIO.parse("Roche/E3MFGYR02_random_10_reads_no_trim.qual", "qual")
        )

        # Reference files for the trimmed reads.
        trimmed_fasta = list(
            SeqIO.parse("Roche/E3MFGYR02_random_10_reads.fasta", "fasta")
        )
        trimmed_qual = list(
            SeqIO.parse("Roche/E3MFGYR02_random_10_reads.qual", "qual")
        )

        rows = zip(
            raw_reads, trimmed_reads, raw_fasta, raw_qual, trimmed_fasta, trimmed_qual
        )
        for raw, trimmed, fa_raw, qu_raw, fa_trim, qu_trim in rows:
            # Untrimmed read versus its FASTA/QUAL counterparts.
            untrimmed_ids = {raw.id, fa_raw.id, qu_raw.id}
            self.assertEqual(len(untrimmed_ids), 1)  # one distinct identifier
            self.assertEqual(raw.seq, fa_raw.seq)
            self.assertEqual(
                raw.letter_annotations["phred_quality"],
                qu_raw.letter_annotations["phred_quality"],
            )
            # Trimmed read versus its FASTA/QUAL counterparts.
            trimmed_ids = {raw.id, trimmed.id, fa_trim.id, qu_trim.id}
            self.assertEqual(len(trimmed_ids), 1)  # one distinct identifier
            self.assertEqual(trimmed.seq, fa_trim.seq)
            self.assertEqual(
                trimmed.letter_annotations["phred_quality"],
                qu_trim.letter_annotations["phred_quality"],
            )
Beispiel #3
0
    def test_both_ways(self):
        """Roche index and slow scan must agree and match the number of reads."""
        sff_path = "Roche/E3MFGYR02_random_10_reads.sff"
        with open(sff_path, "rb") as stream:
            fast_index = sorted(_sff_read_roche_index(stream))
        with open(sff_path, "rb") as stream:
            slow_index = sorted(_sff_do_slow_index(stream))
        self.assertEqual(fast_index, slow_index)

        expected_reads = len(fast_index)
        with open(sff_path, "rb") as stream:
            self.assertEqual(expected_reads, len(list(SffIterator(stream))))
        with open(sff_path, "rb") as stream:
            in_memory = BytesIO(stream.read())
            self.assertEqual(expected_reads, len(list(SffIterator(in_memory))))

        # The text-mode checks below only run on Python 2 outside Windows.
        if sys.platform == "win32" or sys.version_info[0] >= 3:
            return

        # Can be lazy and treat as binary...
        with open(sff_path, "r") as stream:
            self.assertEqual(expected_reads, len(list(SffIterator(stream))))
        with open(sff_path, "r") as stream:
            slow_index = sorted(_sff_read_roche_index(stream))
        self.assertEqual(fast_index, slow_index)
        with open(sff_path, "r") as stream:
            slow_index = sorted(_sff_do_slow_index(stream))
        self.assertEqual(fast_index, slow_index)
        with open(sff_path, "r") as stream:
            self.assertEqual(expected_reads, len(list(SffIterator(stream))))
        with open(sff_path, "r") as stream:
            in_memory = BytesIO(stream.read())
            self.assertEqual(expected_reads, len(list(SffIterator(in_memory))))
Beispiel #4
0
 def test_both_ways(self):
     """Roche index and slow scan must agree and match the number of reads."""
     sff_path = "Roche/E3MFGYR02_random_10_reads.sff"
     with open(sff_path, "rb") as stream:
         roche_index = sorted(_sff_read_roche_index(stream))
     with open(sff_path, "rb") as stream:
         scanned_index = sorted(_sff_do_slow_index(stream))
     self.assertEqual(roche_index, scanned_index)

     expected_reads = len(roche_index)
     # Count the reads straight from the file handle...
     with open(sff_path, "rb") as stream:
         self.assertEqual(expected_reads, len(list(SffIterator(stream))))
     # ...and again from an in-memory copy of the file.
     with open(sff_path, "rb") as stream:
         in_memory = BytesIO(stream.read())
         self.assertEqual(expected_reads, len(list(SffIterator(in_memory))))
def sff_filter(in_file, pos_file, neg_file, wanted):
    """Split an SFF file into reads whose cleaned name is / is not in ``wanted``.

    The Roche XML manifest of ``in_file`` is copied to each output when it can
    be read; a ``ValueError`` while reading it means the outputs are written
    without a manifest.

    Args:
        in_file: Path of the SFF file to read (opened in binary mode).
        pos_file: Path for the reads where ``clean_name(rec.id) in wanted``,
            or ``None`` to skip this output.
        neg_file: Path for the reads where ``clean_name(rec.id) not in wanted``,
            or ``None`` to skip this output.
        wanted: Container of names, tested with ``in``.

    Returns:
        Tuple ``(pos_count, neg_count)`` with the values returned by
        ``SffWriter.write_file`` for each output (0 for a skipped output).
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys.exit("SFF filtering requires Biopython 1.54 or later")

    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    pos_count = neg_count = 0
    # Context managers make sure every handle is closed even if reading or
    # writing raises part way through.
    with open(in_file, "rb") as in_handle:  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            manifest = None

        # This makes two passes through the SFF file, which isn't so
        # efficient, but this makes the code simple.
        if pos_file is not None:
            with open(pos_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again after getting manifest
                pos_count = writer.write_file(
                    rec for rec in SffIterator(in_handle)
                    if clean_name(rec.id) in wanted)
        if neg_file is not None:
            with open(neg_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again
                neg_count = writer.write_file(
                    rec for rec in SffIterator(in_handle)
                    if clean_name(rec.id) not in wanted)
    # At the time of writing, Galaxy doesn't show SFF file read counts,
    # so the counts are returned for the caller to report.
    return pos_count, neg_count
Beispiel #6
0
    def test_read_wrong(self):
        """Check the ValueError cases for the "Roche/greek.sff" example file."""
        filename = "Roche/greek.sff"
        # Reading the Roche XML manifest from this file is expected to fail.
        with open(filename, "rb") as handle:
            self.assertRaises(ValueError, ReadRocheXmlManifest, handle)

        with open(filename, "rb") as handle:
            # A first pass over the freshly opened handle must complete
            # without raising.
            for record in SffIterator(handle):
                pass

            def fileiter(handle):
                """Iterate over the SFF records in handle, reading each id."""
                for record in SffIterator(handle):
                    # print(record.id)
                    i = record.id

            # A second pass over the same, already consumed handle must raise
            # ValueError (presumably because the handle is no longer at the
            # start of the file -- confirm against SffIterator).
            self.assertRaises(ValueError, fileiter, handle)
Beispiel #7
0
class TestAlternativeIndexes(unittest.TestCase):
    """Compare SFF files with differently placed index blocks to the reference reads."""

    filename = "Roche/E3MFGYR02_random_10_reads.sff"
    # The reference reads are loaded once, when the class body is executed.
    with open(filename, "rb") as handle:
        sff = list(SffIterator(handle))

    def check_same(self, new_sff):
        """Assert new_sff matches the reference reads in length, ids and sequences."""
        self.assertEqual(len(self.sff), len(new_sff))
        for expected, actual in zip(self.sff, new_sff):
            self.assertEqual(expected.id, actual.id)
            self.assertEqual(expected.seq, actual.seq)

    def test_alt_index_at_end(self):
        """Read the file with an alternative index at the end."""
        with open("Roche/E3MFGYR02_alt_index_at_end.sff", "rb") as stream:
            reads = list(SffIterator(stream))
        self.check_same(reads)

    def test_alt_index_at_start(self):
        """Read the file with an alternative index at the start."""
        with open("Roche/E3MFGYR02_alt_index_at_start.sff", "rb") as stream:
            reads = list(SffIterator(stream))
        self.check_same(reads)

    def test_alt_index_in_middle(self):
        """Read the file with an alternative index in the middle."""
        with open("Roche/E3MFGYR02_alt_index_in_middle.sff", "rb") as stream:
            reads = list(SffIterator(stream))
        self.check_same(reads)

    def test_index_at_start(self):
        """Read the file with its index at the start."""
        with open("Roche/E3MFGYR02_index_at_start.sff", "rb") as stream:
            reads = list(SffIterator(stream))
        self.check_same(reads)

    def test_index_in_middle(self):
        """Read the file with its index in the middle."""
        with open("Roche/E3MFGYR02_index_in_middle.sff", "rb") as stream:
            reads = list(SffIterator(stream))
        self.check_same(reads)

    def test_trim(self):
        """Trimmed reads must match the reference reads in number and ids."""
        with open(self.filename, "rb") as stream:
            trimmed = list(SffIterator(stream, trim=True))
        self.assertEqual(len(self.sff), len(trimmed))
        for expected, actual in zip(self.sff, trimmed):
            self.assertEqual(expected.id, actual.id)
Beispiel #8
0
 def test_write(self):
     """Re-write the example SFF file and require byte-identical output."""
     sff_path = "Roche/E3MFGYR02_random_10_reads.sff"
     with open(sff_path, "rb") as stream:
         manifest = ReadRocheXmlManifest(stream)
     with open(sff_path, "rb") as stream:
         reads = list(SffIterator(stream))

     # First pass: give the writer a list of records.
     list_buffer = BytesIO()
     list_writer = SffWriter(list_buffer, xml=manifest)
     list_writer.write_file(reads)
     written = list_buffer.getvalue()

     # Second pass: an iterator must produce exactly the same bytes.
     iter_buffer = BytesIO()
     iter_writer = SffWriter(iter_buffer, xml=manifest)
     iter_writer.write_file(iter(reads))
     self.assertEqual(written, iter_buffer.getvalue())

     # Finally the output must be 100% identical to the original file.
     with open(sff_path, "rb") as stream:
         expected = stream.read()
     self.assertEqual(len(written), len(expected))
     self.assertEqual(written, expected)
     del written
Beispiel #9
0
def sff_filter(in_file, out_file, iterator_filter):
    """Copy in_file to out_file as SFF, keeping the reads iterator_filter yields.

    The Roche XML manifest is carried over when it can be read; a ValueError
    while reading it means the output is written without one. Returns the
    value reported by SffWriter.write_file.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        stop_err("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Before Biopython 1.56 the manifest reader was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    with open(in_file, "rb") as source:
        try:
            xml_manifest = ReadRocheXmlManifest(source)
        except ValueError:
            xml_manifest = None
        source.seek(0)
        with open(out_file, "wb") as target:
            sff_writer = SffWriter(target, xml=xml_manifest)
            # Rewind once more so the reads are streamed from the start.
            source.seek(0)
            kept_reads = iterator_filter(SffIterator(source))
            written = sff_writer.write_file(kept_reads)
    return written
Beispiel #10
0
                elif keep_negatives:
                    if len(seq) >= min_len:
                        negs += 1
                        yield record
                    else:
                        short_neg += 1

    in_handle = open(in_file, "rb")
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    in_handle.seek(0)
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    writer.write_file(process(SffIterator(in_handle)))
    #End of SFF code
elif seq_format.lower().startswith("fastq"):
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastqReader(in_handle)
    writer = fastqWriter(out_handle)
    if forward:
        for record in reader:
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
                if len(record.sequence) >= min_len:
Beispiel #11
0
 def fileiter(handle):
     """Walk every SFF record in handle, reading each record's id."""
     for rec in SffIterator(handle):
         # print(rec.id)
         _ = rec.id
Beispiel #12
0
 def test_trim(self):
     """Trimmed reads must match the reference reads in number and ids."""
     with open(self.filename, "rb") as stream:
         trimmed = list(SffIterator(stream, trim=True))
     self.assertEqual(len(self.sff), len(trimmed))
     for reference, clipped in zip(self.sff, trimmed):
         self.assertEqual(reference.id, clipped.id)
Beispiel #13
0
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    in_handle = open(in_file, "rb")  # must be binary mode!
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    in_handle.seek(0)  # start again after getting manifest
    count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
    out_handle.close()
    in_handle.close()
else:
    # Use Galaxy for FASTA, QUAL or FASTQ
    if seq_format.lower() in ["fasta", "csfasta"] or seq_format.lower().startswith(
        "qual"
    ):
        from galaxy_utils.sequence.fasta import fastaReader, fastaWriter

        reader = fastaReader(open(in_file, "rU"))
        writer = fastaWriter(open(out_file, "w"))
        marker = ">"
    elif seq_format.lower().startswith("fastq"):
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
Beispiel #14
0
 def test_index_in_middle(self):
     """Read the file with its index in the middle and compare to the reference."""
     with open("Roche/E3MFGYR02_index_in_middle.sff", "rb") as stream:
         reads = list(SffIterator(stream))
     self.check_same(reads)
Beispiel #15
0
 def test_alt_index_at_start(self):
     """Read the file with an alternative index at the start and compare to the reference."""
     with open("Roche/E3MFGYR02_alt_index_at_start.sff", "rb") as stream:
         reads = list(SffIterator(stream))
     self.check_same(reads)
    except ImportError:
        #Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
    in_handle = open(in_file, "rb")  #must be binary mode!
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    #This makes two passes through the SFF file, which isn't so efficient,
    #but this makes the code simple.
    pos_count = neg_count = 0
    if out_positive_file is not None:
        out_handle = open(out_positive_file, "wb")
        writer = SffWriter(out_handle, xml=manifest)
        in_handle.seek(0)  #start again after getting manifest
        pos_count = writer.write_file(rec for rec in SffIterator(in_handle)
                                      if clean_name(rec.id) in ids)
        out_handle.close()
    if out_negative_file is not None:
        out_handle = open(out_negative_file, "wb")
        writer = SffWriter(out_handle, xml=manifest)
        in_handle.seek(0)  #start again
        neg_count = writer.write_file(rec for rec in SffIterator(in_handle)
                                      if clean_name(rec.id) not in ids)
        out_handle.close()
    #And we're done
    in_handle.close()
    #At the time of writing, Galaxy doesn't show SFF file read counts,
    #so it is useful to put them in stdout and thus shown in job info.
    print "%i with and %i without specified IDs" % (pos_count, neg_count)
elif seq_format.lower() == "fasta":
Beispiel #17
0
    # Ugly code to make test files...
    # Build a fake index block and pad it with NUL characters so that its
    # length is a multiple of 8.
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index) % 8
    if padding:
        padding = 8 - padding
    index += chr(0) * padding
    assert len(index) % 8 == 0

    # Ugly bit of code to make a fake index at start
    # (this rebuilds exactly the same padded index string as above).
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index) % 8
    if padding:
        padding = 8 - padding
    index += chr(0) * padding
    # Load the reference reads that supply the header values below.
    with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
        records = list(SffIterator(handle))
    # NOTE(review): the output is opened in text mode ("w") and a str index is
    # written to it, while the input is read as binary -- presumably
    # Python 2-era code; confirm before running on Python 3.
    with open("Roche/E3MFGYR02_alt_index_at_start.sff", "w") as out_handle:
        w = SffWriter(out_handle, index=False, xml=None)
        # Fake the header...
        # (sets the writer's private fields by hand, taking the flow key and
        # flow characters from the first record)
        w._number_of_reads = len(records)
        w._index_start = 0
        w._index_length = 0
        w._key_sequence = records[0].annotations["flow_key"]
        w._flow_chars = records[0].annotations["flow_chars"]
        w._number_of_flows_per_read = len(w._flow_chars)
        # First header write uses a zero index offset and length; the file
        # position afterwards is recorded as the index start.
        w.write_header()
        w._index_start = out_handle.tell()
        w._index_length = len(index)
        # Rewind and write the header again so it carries the index details.
        out_handle.seek(0)
        w.write_header()  # this time with index info
        # The fake index block goes directly after the header.
        w.handle.write(index)
Beispiel #18
0
if __name__ == "__main__":
    # Run the tests with verbose output when executed as a script.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

if False:
    # Ugly code to make test files...
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index) % 8
    if padding:
        padding = 8 - padding
    index += chr(0) * padding
    assert len(index) % 8 == 0

    # Ugly bit of code to make a fake index at start
    records = list(
        SffIterator(open("Roche/E3MFGYR02_random_10_reads.sff", "rb")))
    out_handle = open("Roche/E3MFGYR02_alt_index_at_start.sff", "w")
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index) % 8
    if padding:
        padding = 8 - padding
    index += chr(0) * padding
    w = SffWriter(out_handle, index=False, xml=None)
    # Fake the header...
    w._number_of_reads = len(records)
    w._index_start = 0
    w._index_length = 0
    w._key_sequence = records[0].annotations["flow_key"]
    w._flow_chars = records[0].annotations["flow_chars"]
    w._number_of_flows_per_read = len(w._flow_chars)
    w.write_header()