def from_lines(cls, lines):
    """Build an MSA from FASTA-formatted lines.

    Each record is expected to look like

        >NAME META-INFORMATION
        SEQUENCE

    where the optional meta information is whatever follows the first
    space in the header line.

    Args:
        lines: A file handle or list of lines in FASTA format; passed
            unchanged to FASTA.from_lines.

    Returns:
        An MSA constructed from the records produced by FASTA.from_lines.
    """
    # NOTE(review): this constructs MSA directly rather than cls, so a
    # subclass calling from_lines still gets a plain MSA. Left unchanged to
    # preserve existing behavior -- confirm whether that is intended.
    return MSA(FASTA.from_lines(lines))
def test_fasta__from_lines__multiple_records():
    """Several records are parsed in input order.

    Covers multi-line sequences (joined into one string) and a header
    carrying meta information after the name.
    """
    raw_lines = [
        ">first\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
        ">Second XT:1:0\n",
        "GAGAGCTCAGCTAAC\n",
        ">Third\n",
        "CGCTGACCAAAAACGGACAG\n",
        "GGCATTCGGC\n",
    ]
    wanted = [
        FASTA("first", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA"),
        FASTA("Second", "XT:1:0", "GAGAGCTCAGCTAAC"),
        FASTA("Third", None, "CGCTGACCAAAAACGGACAGGGCATTCGGC"),
    ]

    parsed = FASTA.from_lines(raw_lines)
    assert_list_equal(parsed, wanted)
def _read_mitochondria(self, tar_handle, filename):
    """Read and validate mitochondrial FASTA records from a tar archive.

    Sequences are upper-cased before validation and storage.

    Args:
        tar_handle: Open tar archive providing getmember() and
            extractfile().
        filename: Name of the FASTA member inside the archive.

    Returns:
        None if the archive has no member called `filename`; otherwise a
        dict mapping record names to FASTA records with upper-cased
        sequences.

    Raises:
        ZonkeyDBError: If a sequence contains characters other than
            A, C, G, T, N or -; if a record name occurs twice; if no
            records are found; if more than two distinct sequence lengths
            are present; or if two lengths are present and they do not
            differ by exactly self.settings["MitoPadding"].
    """
    try:
        tar_handle.getmember(filename)
    except KeyError:
        # Missing MT file is allowed
        return None

    # Built once instead of on every iteration of the loop below.
    allowed = frozenset("ACGTN-")

    handle = tar_handle.extractfile(filename)
    results = {}
    for record in FASTA.from_lines(handle):
        record = FASTA(name=record.name,
                       meta=record.meta,
                       sequence=record.sequence.upper())

        unexpected = set(record.sequence) - allowed
        if unexpected:
            unexpected = ", ".join(map(repr, sorted(unexpected)))
            # The message reads "in <file> ... not <chars>", so the filename
            # must come first (the arguments used to be swapped).
            raise ZonkeyDBError("Unexpected nucleotide in %s; only A, C, "
                                "G, T, N, and - are allowed, not %s"
                                % (filename, unexpected))
        elif record.name in results:
            raise ZonkeyDBError("Duplicate sequence name in %s: %r"
                                % (filename, record.name))

        results[record.name] = record

    # values() works on both Python 2 and Python 3 dicts.
    lengths = frozenset(len(record.sequence)
                        for record in results.values())

    if not lengths:
        raise ZonkeyDBError("No mitochondrial sequences found in %r"
                            % (filename,))
    elif len(lengths) > 2:
        lengths = tuple(sorted(lengths))
        lengths_s = "%s, and %s" % (", ".join(map(str, lengths[:-1])),
                                    lengths[-1])

        raise ZonkeyDBError("At most two different sequence lengths "
                            "expected for mitochondrial sequences, but "
                            "found %i different lengths in %r: %s"
                            % (len(lengths), filename, lengths_s))
    elif len(lengths) != 1:
        # Unpadded sequences are allowed
        delta_len = max(lengths) - min(lengths)
        mito_padding = self.settings["MitoPadding"]

        if delta_len != mito_padding:
            raise ZonkeyDBError("Length difference between mitochondrial "
                                "sequences in %r does not match the "
                                "padding; expected a difference of %i bp, "
                                "but found a %i bp difference."
                                % (filename, mito_padding, delta_len))

    return results
def _read_mitochondria(self, tar_handle, filename):
    """Read and validate mitochondrial FASTA records from a tar archive.

    Sequences are upper-cased before validation and storage.

    Args:
        tar_handle: Open tar archive providing getmember() and
            extractfile().
        filename: Name of the FASTA member inside the archive.

    Returns:
        None if the archive has no member called `filename`; otherwise a
        dict mapping record names to FASTA records with upper-cased
        sequences.

    Raises:
        ZonkeyDBError: If a sequence contains characters other than
            A, C, G, T, N or -; if a record name occurs twice; if no
            records are found; if more than two distinct sequence lengths
            are present; or if two lengths are present and they do not
            differ by exactly self.settings["MitoPadding"].
    """
    try:
        tar_handle.getmember(filename)
    except KeyError:
        # Missing MT file is allowed
        return None

    # Built once instead of on every iteration of the loop below.
    allowed = frozenset("ACGTN-")

    handle = tar_handle.extractfile(filename)
    results = {}
    for record in FASTA.from_lines(handle):
        record = FASTA(name=record.name,
                       meta=record.meta,
                       sequence=record.sequence.upper())

        unexpected = set(record.sequence) - allowed
        if unexpected:
            unexpected = ", ".join(map(repr, sorted(unexpected)))
            # The message reads "in <file> ... not <chars>", so the filename
            # must come first (the arguments used to be swapped).
            raise ZonkeyDBError("Unexpected nucleotide in %s; only A, C, "
                                "G, T, N, and - are allowed, not %s"
                                % (filename, unexpected))
        elif record.name in results:
            raise ZonkeyDBError("Duplicate sequence name in %s: %r"
                                % (filename, record.name))

        results[record.name] = record

    # values() works on both Python 2 and Python 3 dicts.
    lengths = frozenset(len(record.sequence)
                        for record in results.values())

    if not lengths:
        raise ZonkeyDBError("No mitochondrial sequences found in %r"
                            % (filename, ))
    elif len(lengths) > 2:
        lengths = tuple(sorted(lengths))
        lengths_s = "%s, and %s" % (", ".join(map(str, lengths[:-1])),
                                    lengths[-1])

        raise ZonkeyDBError("At most two different sequence lengths "
                            "expected for mitochondrial sequences, but "
                            "found %i different lengths in %r: %s"
                            % (len(lengths), filename, lengths_s))
    elif len(lengths) != 1:
        # Unpadded sequences are allowed
        delta_len = max(lengths) - min(lengths)
        mito_padding = self.settings["MitoPadding"]

        if delta_len != mito_padding:
            raise ZonkeyDBError("Length difference between mitochondrial "
                                "sequences in %r does not match the "
                                "padding; expected a difference of %i bp, "
                                "but found a %i bp difference."
                                % (filename, mito_padding, delta_len))

    return results
def test_fasta__from_lines_single_record():
    """A single record split over two sequence lines parses to one FASTA."""
    fasta_lines = [
        ">single\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
    ]
    wanted = [FASTA("single", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")]

    observed = list(FASTA.from_lines(fasta_lines))
    assert observed == wanted
def test_fasta__from_lines__empty_name__alone():
    """Parse a lone record whose header is a bare ">" with no name.

    NOTE(review): nothing is asserted in this body; presumably a decorator
    or harness outside this view checks that the parser rejects the
    input -- confirm.
    """
    nameless_record = [">\n", "ACGT\n"]
    parsed = FASTA.from_lines(nameless_record)
    # Consume the result fully so parsing runs even if it is lazy.
    list(parsed)
def test_fasta__from_lines__empty_record_last():
    """Parse input whose final header has no sequence lines after it.

    NOTE(review): nothing is asserted in this body; presumably a decorator
    or harness outside this view checks that the parser rejects the
    input -- confirm.
    """
    truncated = [
        ">fasta1\n",
        "ACGT\n",
        ">fasta2\n",
    ]
    parsed = FASTA.from_lines(truncated)
    # Consume the result fully so parsing runs even if it is lazy.
    list(parsed)
def test_fasta__from_lines__empty_record_name_only__first():
    """Parse input whose first header is immediately followed by another.

    NOTE(review): nothing is asserted in this body; presumably a decorator
    or harness outside this view checks that the parser rejects the
    input -- confirm.
    """
    fasta_lines = [
        ">fasta1\n",
        ">fasta2\n",
        "AGTC\n",
    ]
    parsed = FASTA.from_lines(fasta_lines)
    # Consume the result fully so parsing runs even if it is lazy.
    list(parsed)
def test_fasta__from_lines__no_records():
    """Empty input produces no records."""
    no_lines = []
    parsed = FASTA.from_lines(no_lines)
    assert_list_equal(parsed, [])
def test_fasta__from_lines__empty_name__with_others():
    """Parse a nameless ">" record followed by a properly named record.

    NOTE(review): nothing is asserted in this body; presumably a decorator
    or harness outside this view checks that the parser rejects the
    input -- confirm.
    """
    fasta_lines = [
        ">\n",
        "ACGT\n",
        ">Foo\n",
        "ACGGTA\n",
    ]
    parsed = FASTA.from_lines(fasta_lines)
    # Consume the result fully so parsing runs even if it is lazy.
    list(parsed)
def test_fasta__from_lines__no_records():
    """Empty input produces no records."""
    records = list(FASTA.from_lines([]))
    assert records == []
def test_fasta__from_lines__missing_name__alone():
    """Parse a bare sequence line that has no preceding header.

    NOTE(review): nothing is asserted in this body; presumably a decorator
    or harness outside this view checks that the parser rejects the
    input -- confirm.
    """
    headerless = ["ACGT\n"]
    parsed = FASTA.from_lines(headerless)
    # Consume the result fully so parsing runs even if it is lazy.
    list(parsed)
def test_fasta__from_lines__empty_record__middle():
    """Parse input where a header between two full records has no sequence.

    NOTE(review): nothing is asserted in this body; presumably a decorator
    or harness outside this view checks that the parser rejects the
    input -- confirm.
    """
    fasta_lines = [
        ">fasta0\n",
        "ACGT\n",
        ">fasta1\n",
        ">fasta2\n",
        "AGTC\n",
    ]
    parsed = FASTA.from_lines(fasta_lines)
    # Consume the result fully so parsing runs even if it is lazy.
    list(parsed)
def test_fasta__from_lines__empty_record_name_only__nothing_else():
    """Parse input consisting of a single header line and nothing else.

    NOTE(review): nothing is asserted in this body; presumably a decorator
    or harness outside this view checks that the parser rejects the
    input -- confirm.
    """
    header_only = [">fasta1\n"]
    parsed = FASTA.from_lines(header_only)
    # Consume the result fully so parsing runs even if it is lazy.
    list(parsed)
def test_fasta__from_lines_single_record():
    """A single record split over two sequence lines parses to one FASTA."""
    fasta_lines = [
        ">single\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
    ]
    wanted = [FASTA("single", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")]

    parsed = FASTA.from_lines(fasta_lines)
    assert_list_equal(parsed, wanted)
def test_fasta__from_lines__empty_record_name_only__nothing_else():
    """A lone header line with no sequence must raise FASTAError."""
    header_only = [">fasta1\n"]

    with pytest.raises(FASTAError):
        parsed = FASTA.from_lines(header_only)
        list(parsed)
def test_fasta__from_lines__empty_record_name_only__first():
    """A first header directly followed by another must raise FASTAError."""
    fasta_lines = [
        ">fasta1\n",
        ">fasta2\n",
        "AGTC\n",
    ]

    with pytest.raises(FASTAError):
        parsed = FASTA.from_lines(fasta_lines)
        list(parsed)
def test_fasta__from_lines__empty_record_last():
    """A trailing header with no sequence after it must raise FASTAError."""
    truncated = [
        ">fasta1\n",
        "ACGT\n",
        ">fasta2\n",
    ]

    with pytest.raises(FASTAError):
        parsed = FASTA.from_lines(truncated)
        list(parsed)
def test_fasta__from_lines__missing_name__alone():
    """A sequence line with no preceding header must raise FASTAError."""
    headerless = ["ACGT\n"]

    with pytest.raises(FASTAError):
        parsed = FASTA.from_lines(headerless)
        list(parsed)
def test_fasta__from_lines__empty_name__alone():
    """A record whose header is a bare ">" must raise FASTAError."""
    nameless_record = [">\n", "ACGT\n"]

    with pytest.raises(FASTAError):
        parsed = FASTA.from_lines(nameless_record)
        list(parsed)
def test_fasta__from_lines__empty_name__with_others():
    """A nameless ">" record must raise FASTAError even when followed by a
    properly named record."""
    fasta_lines = [
        ">\n",
        "ACGT\n",
        ">Foo\n",
        "ACGGTA\n",
    ]

    with pytest.raises(FASTAError):
        parsed = FASTA.from_lines(fasta_lines)
        list(parsed)