Esempio n. 1
0
 def from_lines(cls, lines):
     """Build an MSA from FASTA formatted lines.

     The lines are handed to FASTA.from_lines and the records it produces
     are passed on to the MSA constructor. Each record is expected to
     follow the FASTA layout, where anything after the first space in the
     header is meta information:
       >NAME META-INFORMATION
       SEQUENCE

     Returns a new MSA object; 'cls' is not used by the body."""
     records = FASTA.from_lines(lines)
     return MSA(records)
Esempio n. 2
0
 def from_lines(cls, lines):
     """Parse FASTA formatted lines and wrap the result in an MSA.

     'lines' (a file or list of lines) is passed to FASTA.from_lines; the
     parsed records are then given to the MSA constructor. Records are
     expected in FASTA format, with optional meta information following
     the first space in the header:
       >NAME META-INFORMATION
       SEQUENCE

     Returns a new MSA object; the 'cls' argument is not consulted."""
     parsed = FASTA.from_lines(lines)
     return MSA(parsed)
Esempio n. 3
0
def test_fasta__from_lines__multiple_records():
    """Three records are parsed; multi-line sequences are concatenated and
    header text after the first space becomes the record's meta field."""
    first = [">first\n", "TGTTCTCCACCGTGCACAAC\n", "CCTTCATCCA\n"]
    second = [">Second XT:1:0\n", "GAGAGCTCAGCTAAC\n"]
    third = [">Third\n", "CGCTGACCAAAAACGGACAG\n", "GGCATTCGGC\n"]
    fasta_lines = first + second + third

    expected = [
        FASTA("first", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA"),
        FASTA("Second", "XT:1:0", "GAGAGCTCAGCTAAC"),
        FASTA("Third", None, "CGCTGACCAAAAACGGACAGGGCATTCGGC"),
    ]

    observed = FASTA.from_lines(fasta_lines)
    assert_list_equal(observed, expected)
Esempio n. 4
0
def test_fasta__from_lines__multiple_records():
    """Parsing several records joins wrapped sequence lines per record and
    splits 'NAME META' headers into name and meta."""
    fasta_lines = [
        ">first\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
        ">Second XT:1:0\n",
        "GAGAGCTCAGCTAAC\n",
        ">Third\n",
        "CGCTGACCAAAAACGGACAG\n",
        "GGCATTCGGC\n",
    ]
    record_1 = FASTA("first", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")
    record_2 = FASTA("Second", "XT:1:0", "GAGAGCTCAGCTAAC")
    record_3 = FASTA("Third", None, "CGCTGACCAAAAACGGACAGGGCATTCGGC")

    assert_list_equal(FASTA.from_lines(fasta_lines),
                      [record_1, record_2, record_3])
Esempio n. 5
0
    def _read_mitochondria(self, tar_handle, filename):
        """Read and validate mitochondrial FASTA records from an archive.

        Every sequence is upper-cased and may only contain A, C, G, T, N
        and '-'; record names must be unique. The sequences must either all
        have the same length, or have exactly two distinct lengths that
        differ by self.settings["MitoPadding"].

        Args:
            tar_handle: Object providing getmember(name) and
                extractfile(name) -- presumably an open tarfile.TarFile.
            filename: Name of the archive member containing the sequences.

        Returns:
            A dict mapping record names to FASTA records, or None if the
            archive has no member named 'filename'.

        Raises:
            ZonkeyDBError: On unexpected nucleotides, duplicate names, a
                member without any records, or inconsistent lengths.
        """
        try:
            tar_handle.getmember(filename)
        except KeyError:
            # Missing MT file is allowed
            return None

        handle = tar_handle.extractfile(filename)

        # Built once; the set of permitted characters never changes
        valid_nucleotides = frozenset("ACGTN-")

        results = {}
        for record in FASTA.from_lines(handle):
            record = FASTA(name=record.name,
                           meta=record.meta,
                           sequence=record.sequence.upper())

            unexpected = set(record.sequence) - valid_nucleotides
            if unexpected:
                unexpected = ", ".join(map(repr, sorted(unexpected)))
                # The filename fills the first placeholder and the offending
                # characters the second; these used to be swapped.
                raise ZonkeyDBError("Unexpected nucleotide in %s; only A, C, "
                                    "G, T, N, and - are allowed, not %s"
                                    % (filename, unexpected))
            elif record.name in results:
                raise ZonkeyDBError("Duplicate sequence name in %s: %r"
                                    % (filename, record.name))

            results[record.name] = record

        # values() rather than itervalues(), which only exists on Python 2
        lengths = frozenset(len(record.sequence)
                            for record in results.values())

        if not lengths:
            raise ZonkeyDBError("No mitochondrial sequences found in %r"
                                % (filename,))
        elif len(lengths) > 2:
            lengths = tuple(sorted(lengths))
            lengths_s = "%s, and %s" % (", ".join(map(str, lengths[:-1])),
                                        lengths[-1])

            raise ZonkeyDBError("At most two different sequence lengths "
                                "expected for mitochondrial sequences, but "
                                "found %i different lengths in %r: %s"
                                % (len(lengths), filename, lengths_s))
        elif len(lengths) != 1:
            # Exactly two lengths: unpadded sequences are allowed, provided
            # that the difference matches the configured padding
            delta_len = max(lengths) - min(lengths)
            mito_padding = self.settings["MitoPadding"]

            if delta_len != mito_padding:
                raise ZonkeyDBError("Length difference between mitochondrial "
                                    "sequences in %r does not match the "
                                    "padding; expected a difference of %i bp, "
                                    "but found a %i bp difference."
                                    % (filename, mito_padding, delta_len))

        return results
Esempio n. 6
0
    def _read_mitochondria(self, tar_handle, filename):
        """Load mitochondrial FASTA records from 'filename' in an archive.

        Sequences are converted to upper-case and checked against the
        alphabet A, C, G, T, N and '-'. Names must not repeat. Sequence
        lengths must be identical, or fall into exactly two groups whose
        difference equals self.settings["MitoPadding"].

        Args:
            tar_handle: Object exposing getmember(name) and
                extractfile(name) -- presumably an open tarfile.TarFile.
            filename: Archive member holding the FASTA records.

        Returns:
            Dict of record name to FASTA record, or None when 'filename'
            is not present in the archive.

        Raises:
            ZonkeyDBError: If a sequence contains other characters, a name
                is duplicated, no records are found, or the lengths are
                inconsistent.
        """
        try:
            tar_handle.getmember(filename)
        except KeyError:
            # Missing MT file is allowed
            return None

        handle = tar_handle.extractfile(filename)

        # Loop invariant, so only constructed once
        allowed = frozenset("ACGTN-")

        results = {}
        for record in FASTA.from_lines(handle):
            record = FASTA(name=record.name,
                           meta=record.meta,
                           sequence=record.sequence.upper())

            unexpected = set(record.sequence) - allowed
            if unexpected:
                unexpected = ", ".join(map(repr, sorted(unexpected)))
                # Argument order follows the message: filename first, then
                # the unexpected characters (previously the wrong way around)
                raise ZonkeyDBError("Unexpected nucleotide in %s; only A, C, "
                                    "G, T, N, and - are allowed, not %s" %
                                    (filename, unexpected))
            elif record.name in results:
                raise ZonkeyDBError("Duplicate sequence name in %s: %r" %
                                    (filename, record.name))

            results[record.name] = record

        # dict.values() is available on both Python 2 and 3, unlike
        # dict.itervalues()
        lengths = frozenset(
            len(record.sequence) for record in results.values())

        if not lengths:
            raise ZonkeyDBError("No mitochondrial sequences found in %r" %
                                (filename, ))
        elif len(lengths) > 2:
            lengths = tuple(sorted(lengths))
            lengths_s = "%s, and %s" % (", ".join(map(
                str, lengths[:-1])), lengths[-1])

            raise ZonkeyDBError("At most two different sequence lengths "
                                "expected for mitochondrial sequences, but "
                                "found %i different lengths in %r: %s" %
                                (len(lengths), filename, lengths_s))
        elif len(lengths) != 1:
            # Two distinct lengths remain; unpadded sequences are allowed
            # as long as the difference equals the configured padding
            delta_len = max(lengths) - min(lengths)
            mito_padding = self.settings["MitoPadding"]

            if delta_len != mito_padding:
                raise ZonkeyDBError("Length difference between mitochondrial "
                                    "sequences in %r does not match the "
                                    "padding; expected a difference of %i bp, "
                                    "but found a %i bp difference." %
                                    (filename, mito_padding, delta_len))

        return results
Esempio n. 7
0
def test_fasta__from_lines_single_record():
    """A single record spanning two sequence lines yields one FASTA record
    holding the concatenated sequence."""
    fasta_lines = [
        ">single\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
    ]
    expected = [FASTA("single", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")]

    observed = list(FASTA.from_lines(fasta_lines))
    assert observed == expected
Esempio n. 8
0
def test_fasta__from_lines__empty_name__alone():
    """Consume FASTA.from_lines for a lone record whose header is a bare '>'.

    NOTE(review): the visible body asserts nothing; presumably an
    expected-exception decorator precedes this def -- confirm.
    """
    records = FASTA.from_lines([">\n", "ACGT\n"])
    list(records)
Esempio n. 9
0
def test_fasta__from_lines__empty_record_last():
    """Consume FASTA.from_lines when the final header has no sequence lines.

    NOTE(review): the visible body asserts nothing; presumably an
    expected-exception decorator precedes this def -- confirm.
    """
    records = FASTA.from_lines([">fasta1\n", "ACGT\n", ">fasta2\n"])
    list(records)
Esempio n. 10
0
def test_fasta__from_lines__empty_record_name_only__first():
    """Consume FASTA.from_lines when the first header is directly followed
    by another header.

    NOTE(review): the visible body asserts nothing; presumably an
    expected-exception decorator precedes this def -- confirm.
    """
    lines = [">fasta1\n", ">fasta2\n", "AGTC\n"]
    list(FASTA.from_lines(lines))
Esempio n. 11
0
def test_fasta__from_lines__no_records():
    """Parsing an empty list of lines is expected to give an empty list."""
    observed = FASTA.from_lines([])
    assert_list_equal(observed, [])
Esempio n. 12
0
def test_fasta__from_lines__empty_name__with_others():
    """Consume FASTA.from_lines for a nameless record followed by a record
    named 'Foo'.

    NOTE(review): the visible body asserts nothing; presumably an
    expected-exception decorator precedes this def -- confirm.
    """
    records = FASTA.from_lines([">\n", "ACGT\n", ">Foo\n", "ACGGTA\n"])
    list(records)
Esempio n. 13
0
def test_fasta__from_lines__no_records():
    """Parsing an empty list of lines yields no records."""
    observed = list(FASTA.from_lines([]))
    assert observed == []
Esempio n. 14
0
def test_fasta__from_lines__empty_name__alone():
    """Exhaust FASTA.from_lines for one record with an empty ('>') header.

    NOTE(review): no assertion is visible here; presumably an
    expected-exception decorator sits above this def -- confirm.
    """
    list(FASTA.from_lines([">\n", "ACGT\n"]))
Esempio n. 15
0
def test_fasta__from_lines__missing_name__alone():
    """Consume FASTA.from_lines for a sequence line without any header.

    NOTE(review): the visible body asserts nothing; presumably an
    expected-exception decorator precedes this def -- confirm.
    """
    records = FASTA.from_lines(["ACGT\n"])
    list(records)
Esempio n. 16
0
def test_fasta__from_lines__empty_record_last():
    """Exhaust FASTA.from_lines when the trailing header lacks a sequence.

    NOTE(review): no assertion is visible here; presumably an
    expected-exception decorator sits above this def -- confirm.
    """
    list(FASTA.from_lines([">fasta1\n", "ACGT\n", ">fasta2\n"]))
Esempio n. 17
0
def test_fasta__from_lines__empty_record__middle():
    """Consume FASTA.from_lines when a header without sequence lines
    ('fasta1') sits between two complete records.

    NOTE(review): the visible body asserts nothing; presumably an
    expected-exception decorator precedes this def -- confirm.
    """
    records = FASTA.from_lines(
        [">fasta0\n", "ACGT\n", ">fasta1\n", ">fasta2\n", "AGTC\n"])
    list(records)
Esempio n. 18
0
def test_fasta__from_lines__empty_record_name_only__nothing_else():
    """Consume FASTA.from_lines for input consisting of a single header.

    NOTE(review): the visible body asserts nothing; presumably an
    expected-exception decorator precedes this def -- confirm.
    """
    lines = [">fasta1\n"]
    list(FASTA.from_lines(lines))
Esempio n. 19
0
def test_fasta__from_lines_single_record():
    """One record split over two sequence lines is parsed into a single
    FASTA record with the lines joined."""
    header = ">single\n"
    sequence_lines = ["TGTTCTCCACCGTGCACAAC\n", "CCTTCATCCA\n"]
    record = FASTA("single", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")

    observed = FASTA.from_lines([header] + sequence_lines)
    assert_list_equal(observed, [record])
Esempio n. 20
0
def test_fasta__from_lines__empty_record_name_only__nothing_else():
    """A lone header without any sequence must raise FASTAError."""
    lines = [">fasta1\n"]

    with pytest.raises(FASTAError):
        list(FASTA.from_lines(lines))
Esempio n. 21
0
def test_fasta__from_lines__empty_record_name_only__nothing_else():
    """Exhaust FASTA.from_lines for input that is just one header line.

    NOTE(review): no assertion is visible here; presumably an
    expected-exception decorator sits above this def -- confirm.
    """
    records = FASTA.from_lines([">fasta1\n"])
    list(records)
Esempio n. 22
0
def test_fasta__from_lines__empty_record_name_only__first():
    """A first header immediately followed by another header must raise
    FASTAError."""
    lines = [">fasta1\n", ">fasta2\n", "AGTC\n"]

    with pytest.raises(FASTAError):
        list(FASTA.from_lines(lines))
Esempio n. 23
0
def test_fasta__from_lines__empty_record__middle():
    """Exhaust FASTA.from_lines when the middle header ('fasta1') has no
    sequence lines.

    NOTE(review): no assertion is visible here; presumably an
    expected-exception decorator sits above this def -- confirm.
    """
    list(FASTA.from_lines([
        ">fasta0\n",
        "ACGT\n",
        ">fasta1\n",
        ">fasta2\n",
        "AGTC\n",
    ]))
Esempio n. 24
0
def test_fasta__from_lines__empty_record_last():
    """A trailing header without sequence lines must raise FASTAError."""
    with pytest.raises(FASTAError):
        list(FASTA.from_lines([">fasta1\n", "ACGT\n", ">fasta2\n"]))
Esempio n. 25
0
def test_fasta__from_lines__missing_name__alone():
    """Exhaust FASTA.from_lines for a single sequence line with no header.

    NOTE(review): no assertion is visible here; presumably an
    expected-exception decorator sits above this def -- confirm.
    """
    list(FASTA.from_lines(["ACGT\n"]))
Esempio n. 26
0
def test_fasta__from_lines__missing_name__alone():
    """A sequence line that is not preceded by a header must raise
    FASTAError."""
    lines = ["ACGT\n"]

    with pytest.raises(FASTAError):
        list(FASTA.from_lines(lines))
Esempio n. 27
0
def test_fasta__from_lines__empty_name__with_others():
    """Exhaust FASTA.from_lines for a bare '>' header followed by a second,
    named record.

    NOTE(review): no assertion is visible here; presumably an
    expected-exception decorator sits above this def -- confirm.
    """
    list(FASTA.from_lines([">\n", "ACGT\n", ">Foo\n", "ACGGTA\n"]))
Esempio n. 28
0
def test_fasta__from_lines__empty_name__alone():
    """A record whose header is a bare '>' must raise FASTAError."""
    lines = [">\n", "ACGT\n"]

    with pytest.raises(FASTAError):
        list(FASTA.from_lines(lines))
Esempio n. 29
0
def test_fasta__from_lines__empty_name__with_others():
    """A nameless record must raise FASTAError even when a valid record
    follows it."""
    lines = [">\n", "ACGT\n", ">Foo\n", "ACGGTA\n"]

    with pytest.raises(FASTAError):
        list(FASTA.from_lines(lines))
Esempio n. 30
0
def test_fasta__from_lines__no_records():
    """No input lines are expected to produce an empty list of records."""
    no_lines = []
    expected = []
    assert_list_equal(FASTA.from_lines(no_lines), expected)