Esempio n. 1
0
def _alignment_to_phylip(obj, fh):
    """Write an alignment to ``fh`` in PHYLIP format.

    A header line holding the sequence count and the sequence length is
    written first. Each sequence then gets one line: its ID padded to a
    10-character field, followed by the sequence text broken into
    space-separated chunks of 10 characters.

    Parameters
    ----------
    obj : alignment-like
        Object to serialize. It must provide ``is_empty()``,
        ``sequence_length()``, ``ids()`` and ``sequence_count()``, and
        iterating over it must yield sequences exposing ``metadata['id']``.
    fh : file-like
        Destination handle; only its ``write`` method is used.

    Raises
    ------
    PhylipFormatError
        If the alignment holds no sequences, holds no positions, or
        contains an ID longer than 10 characters.

    """
    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # One width governs both the ID column and the sequence chunking.
    field_width = 10

    # Validate every ID up front so nothing is written for a bad alignment.
    oversized_id = next(
        (name for name in obj.ids() if len(name) > field_width), None)
    if oversized_id is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (field_width, oversized_id))

    header = '{0:d} {1:d}\n'.format(obj.sequence_count(), n_positions)
    fh.write(header)

    row_template = '{0:%d}{1}\n' % field_width
    for record in obj:
        chunks = _chunk_str(str(record), field_width, ' ')
        fh.write(row_template.format(record.metadata['id'], chunks))
Esempio n. 2
0
def _alignment_to_phylip(obj, fh):
    """Serialize an alignment to ``fh`` using the PHYLIP layout.

    The output starts with a header line (sequence count, then sequence
    length). One line per sequence follows: the sequence ID padded to a
    10-character field, then the sequence text split into space-separated
    chunks of 10 characters.

    Parameters
    ----------
    obj : alignment-like
        Object to serialize. It must provide ``is_empty()``,
        ``sequence_length()``, ``ids()`` and ``sequence_count()``, and
        iterating over it must yield sequences exposing an ``id`` attribute.
    fh : file-like
        Destination handle; only its ``write`` method is used.

    Raises
    ------
    PhylipFormatError
        If the alignment holds no sequences, holds no positions, or
        contains an ID longer than 10 characters.

    """
    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    alignment_width = obj.sequence_length()
    if alignment_width == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # The ID column and the sequence chunks share the same width.
    column_width = 10

    # All IDs are checked before any output is produced.
    offending_id = next(
        (label for label in obj.ids() if len(label) > column_width), None)
    if offending_id is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (column_width, offending_id))

    n_sequences = obj.sequence_count()
    fh.write('{0:d} {1:d}\n'.format(n_sequences, alignment_width))

    line_template = '{0:%d}{1}\n' % column_width
    for entry in obj:
        spaced_sequence = _chunk_str(str(entry), column_width, ' ')
        fh.write(line_template.format(entry.id, spaced_sequence))
Esempio n. 3
0
def _generator_to_fasta(obj, fh, qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None):
    """Write the records in ``obj`` to ``fh`` in FASTA format.

    When ``qual`` is not ``None``, a matching QUAL record (same header,
    space-separated quality scores) is written to ``qual`` for every
    sequence record.

    Parameters
    ----------
    obj : iterable
        Source of sequences; handed unchanged to
        ``_format_fasta_like_records``.
    fh : file-like
        Destination for FASTA records; only ``write`` is used.
    qual : file-like or None, optional
        Destination for quality scores. ``None`` disables QUAL output.
        NOTE(review): the ``FileSentinel`` default is presumably resolved
        to a real handle or ``None`` by the caller -- confirm.
    id_whitespace_replacement : str, optional
        Forwarded to ``_format_fasta_like_records``.
    description_newline_replacement : str, optional
        Forwarded to ``_format_fasta_like_records``.
    max_width : int or None, optional
        Maximum line width for sequence and quality data. ``None`` writes
        each on a single line.

    Raises
    ------
    ValueError
        If ``max_width`` is given and is less than 1.

    """
    write_qual = qual is not None

    qual_wrapper = None
    if max_width is not None:
        if max_width < 1:
            raise ValueError(
                "Maximum line width must be greater than zero (max_width=%d)."
                % max_width)
        if write_qual:
            # define text wrapper for splitting quality scores here for
            # efficiency. textwrap docs recommend reusing a TextWrapper
            # instance when it is used many times. configure text wrapper to
            # never break "words" (i.e., integer quality scores) across lines
            qual_wrapper = textwrap.TextWrapper(
                width=max_width, break_long_words=False,
                break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        write_qual)
    for header, seq_str, qual_scores in formatted_records:
        if max_width is not None:
            seq_str = _chunk_str(seq_str, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, seq_str))

        if write_qual:
            # ``np.str`` was only an alias of the builtin ``str``; it was
            # deprecated in NumPy 1.20 and removed in 1.24, so use the
            # builtin directly (identical result on older NumPy).
            qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
            if max_width is not None:
                qual_str = qual_wrapper.fill(qual_str)
            qual.write('>%s\n%s\n' % (header, qual_str))
Esempio n. 4
0
def _generator_to_fasta(obj, fh, qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None):
    """Write the records in ``obj`` to ``fh`` in FASTA format.

    When ``qual`` is not ``None``, a matching QUAL record (same header,
    space-separated quality scores) is written to ``qual`` for every
    sequence record.

    Parameters
    ----------
    obj : iterable
        Source of sequences; handed unchanged to
        ``_format_fasta_like_records``.
    fh : file-like
        Destination for FASTA records; only ``write`` is used.
    qual : file-like or None, optional
        Destination for quality scores. ``None`` disables QUAL output.
        NOTE(review): the ``FileSentinel`` default is presumably resolved
        to a real handle or ``None`` by the caller -- confirm.
    id_whitespace_replacement : str, optional
        Forwarded to ``_format_fasta_like_records``.
    description_newline_replacement : str, optional
        Forwarded to ``_format_fasta_like_records``.
    max_width : int or None, optional
        Maximum line width for sequence and quality data. ``None`` writes
        each on a single line.

    Raises
    ------
    ValueError
        If ``max_width`` is given and is less than 1.

    """
    write_qual = qual is not None

    qual_wrapper = None
    if max_width is not None:
        if max_width < 1:
            raise ValueError(
                "Maximum line width must be greater than zero (max_width=%d)."
                % max_width)
        if write_qual:
            # define text wrapper for splitting quality scores here for
            # efficiency. textwrap docs recommend reusing a TextWrapper
            # instance when it is used many times. configure text wrapper to
            # never break "words" (i.e., integer quality scores) across lines
            qual_wrapper = textwrap.TextWrapper(
                width=max_width, break_long_words=False,
                break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        write_qual)
    for header, seq_str, qual_scores in formatted_records:
        if max_width is not None:
            seq_str = _chunk_str(seq_str, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, seq_str))

        if write_qual:
            # ``np.str`` was only an alias of the builtin ``str``; it was
            # deprecated in NumPy 1.20 and removed in 1.24, so use the
            # builtin directly (identical result on older NumPy).
            qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
            if max_width is not None:
                qual_str = qual_wrapper.fill(qual_str)
            qual.write('>%s\n%s\n' % (header, qual_str))
Esempio n. 5
0
 def test_even_split(self):
     # Inputs whose length is an exact multiple of the chunk size.
     # Each case is (string, chunk size, separator, expected result).
     cases = (
         ('abcdef', 6, ' ', 'abcdef'),
         ('abcdef', 3, ' ', 'abc def'),
         ('abcdef', 2, ' ', 'ab cd ef'),
         ('abcdef', 1, ' ', 'a b c d e f'),
         ('a', 1, ' ', 'a'),
         # An empty separator reassembles the original string.
         ('abcdef', 2, '', 'abcdef'),
     )
     for text, size, separator, expected in cases:
         self.assertEqual(_chunk_str(text, size, separator), expected)
Esempio n. 6
0
 def test_even_split(self):
     # The string length divides evenly by the chunk size in every case.
     # Rows are (input, n, delimiter, expected output).
     table = [
         ('abcdef', 6, ' ', 'abcdef'),
         ('abcdef', 3, ' ', 'abc def'),
         ('abcdef', 2, ' ', 'ab cd ef'),
         ('abcdef', 1, ' ', 'a b c d e f'),
         ('a', 1, ' ', 'a'),
         # Joining on the empty string gives back the input unchanged.
         ('abcdef', 2, '', 'abcdef'),
     ]
     for given, n, delimiter, wanted in table:
         self.assertEqual(_chunk_str(given, n, delimiter), wanted)
Esempio n. 7
0
def _generator_to_fasta(obj, fh, id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None):
    """Write each sequence in ``obj`` to ``fh`` as a FASTA record.

    A record is a ``>`` header line (the ID, plus a space and the
    description when the description is non-empty) followed by the sequence
    text, optionally split across lines of at most ``max_width`` characters.

    Parameters
    ----------
    obj : iterable
        Yields sequences exposing ``id``, ``description``, ``len()`` and
        ``str()``.
    fh : file-like
        Destination handle; only its ``write`` method is used.
    id_whitespace_replacement : str or None, optional
        Replacement for every whitespace character in an ID. ``None``
        leaves IDs untouched.
    description_newline_replacement : str or None, optional
        Replacement for every newline in a description. ``None`` leaves
        descriptions untouched.
    max_width : int or None, optional
        Chunk size passed to ``_chunk_str`` for the sequence text. ``None``
        writes the sequence on a single line.

    Raises
    ------
    FASTAFormatError
        If either replacement string contains a newline, or if a sequence
        is empty.

    """
    replacements = (id_whitespace_replacement,
                    description_newline_replacement)
    if any(text is not None and '\n' in text for text in replacements):
        raise FASTAFormatError(
            "Newline character (\\n) cannot be used to replace whitespace in "
            "biological sequence IDs, nor to replace newlines in biological "
            "sequence descriptions. Otherwise, the FASTA-formatted file will "
            "be invalid.")

    # Compiled once; reused for every record.
    whitespace_re = re.compile(r'\s')
    newline_re = re.compile(r'\n')

    for position, record in enumerate(obj, start=1):
        if len(record) < 1:
            raise FASTAFormatError(
                "Cannot write %s biological sequence in FASTA format because "
                "it does not contain any characters (i.e., it is an "
                "empty/blank sequence). Empty sequences are not supported in "
                "the FASTA file format." % cardinal_to_ordinal(position))

        identifier = record.id
        if id_whitespace_replacement is not None:
            identifier = whitespace_re.sub(id_whitespace_replacement,
                                           identifier)

        description = record.description
        if description_newline_replacement is not None:
            description = newline_re.sub(description_newline_replacement,
                                         description)

        # The description is optional; omit the separating space without it.
        header = ('%s %s' % (identifier, description) if description
                  else identifier)

        sequence_text = str(record)
        if max_width is not None:
            sequence_text = _chunk_str(sequence_text, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, sequence_text))
Esempio n. 8
0
    def test_invalid_n(self):
        # Chunk sizes below 1 must raise ValueError, with the offending
        # value echoed in the message as 'n=<value>'.
        #
        # assertRaisesRegexp is a deprecated alias that was removed in
        # Python 3.12; assertRaisesRegex is the supported spelling.
        with self.assertRaisesRegex(ValueError, 'n=0'):
            _chunk_str('abcdef', 0, ' ')

        with self.assertRaisesRegex(ValueError, 'n=-42'):
            _chunk_str('abcdef', -42, ' ')
Esempio n. 9
0
 def test_uneven_split(self):
     # The final chunk is shorter than the chunk size in every case.
     # Each case is (string, chunk size, separator, expected result).
     cases = (
         ('abcdef', 5, '|', 'abcde|f'),
         ('abcdef', 4, '|', 'abcd|ef'),
         # Multi-character separators are inserted verbatim.
         ('abcdefg', 3, ' - ', 'abc - def - g'),
     )
     for text, size, separator, expected in cases:
         self.assertEqual(_chunk_str(text, size, separator), expected)
Esempio n. 10
0
 def test_no_split(self):
     # Inputs no longer than the chunk size come back unchanged, with no
     # separator inserted. Cases are (string, chunk size, separator).
     cases = (
         ('', 2, '\n'),
         ('a', 100, '\n'),
         ('abcdef', 42, '|'),
     )
     for text, size, separator in cases:
         self.assertEqual(_chunk_str(text, size, separator), text)
Esempio n. 11
0
    def test_invalid_n(self):
        # A chunk size of zero or less must raise ValueError, and the
        # message must echo the rejected value as 'n=<value>'.
        #
        # assertRaisesRegexp is a deprecated alias that was removed in
        # Python 3.12; assertRaisesRegex is the supported spelling.
        with self.assertRaisesRegex(ValueError, 'n=0'):
            _chunk_str('abcdef', 0, ' ')

        with self.assertRaisesRegex(ValueError, 'n=-42'):
            _chunk_str('abcdef', -42, ' ')
Esempio n. 12
0
 def test_uneven_split(self):
     # The string length is not a multiple of n, so the trailing chunk is
     # short. Rows are (input, n, delimiter, expected output).
     table = [
         ('abcdef', 5, '|', 'abcde|f'),
         ('abcdef', 4, '|', 'abcd|ef'),
         # The delimiter may be longer than one character.
         ('abcdefg', 3, ' - ', 'abc - def - g'),
     ]
     for given, n, delimiter, wanted in table:
         self.assertEqual(_chunk_str(given, n, delimiter), wanted)
Esempio n. 13
0
 def test_no_split(self):
     # When the input fits inside a single chunk (including the empty
     # string) the output equals the input and no delimiter appears.
     # Rows are (input, n, delimiter).
     table = [
         ('', 2, '\n'),
         ('a', 100, '\n'),
         ('abcdef', 42, '|'),
     ]
     for given, n, delimiter in table:
         self.assertEqual(_chunk_str(given, n, delimiter), given)