Example #1
0
def _tabular_msa_to_phylip(obj, fh):
    """Write the alignment ``obj`` to ``fh`` in PHYLIP format.

    A header line holding the sequence count and the position count is
    written first. Each sequence then gets one line: its index label padded
    to a 10-character field, followed by the sequence split into
    space-separated groups of 10 characters.

    Parameters
    ----------
    obj : TabularMSA
        Alignment to write. It must expose ``shape.sequence``,
        ``shape.position`` and ``index``, and iterate over its sequences.
    fh : file-like
        Open handle providing ``write``.

    Raises
    ------
    PhylipFormatError
        If the alignment has no sequences, no positions, or an index label
        whose string form is longer than 10 characters.
    """
    # Width of the label column; also reused as the sequence group size.
    field_width = 10

    n_seqs = obj.shape.sequence
    if n_seqs < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.shape.position
    if n_positions < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # Validate every label up front so nothing is written for bad input.
    names = list(map(str, obj.index))
    oversized = next((nm for nm in names if len(nm) > field_width), None)
    if oversized is not None:
        raise PhylipFormatError(
            "``TabularMSA`` can only be written in PHYLIP format if all "
            "sequence index labels have %d or fewer characters. Found "
            "sequence with index label '%s' that exceeds this limit. Use "
            "``TabularMSA.reassign_index`` to assign shorter index labels."
            % (field_width, oversized))

    fh.write('{0:d} {1:d}\n'.format(n_seqs, n_positions))

    row_template = '{0:%d}{1}\n' % field_width
    for name, record in zip(names, obj):
        grouped = chunk_str(str(record), field_width, ' ')
        fh.write(row_template.format(name, grouped))
Example #2
0
def _alignment_to_phylip(obj, fh):
    """Write the alignment ``obj`` to ``fh`` in PHYLIP format.

    The output starts with a header line holding the sequence count and the
    sequence length. Each sequence then gets one line: its ID padded to a
    10-character field, followed by the sequence split into space-separated
    groups of 10 characters.

    Parameters
    ----------
    obj : Alignment
        Alignment to write. It must provide ``is_empty``,
        ``sequence_length``, ``sequence_count`` and ``ids``, and iterate over
        sequences carrying ``metadata['id']``.
    fh : file-like
        Open handle providing ``write``.

    Raises
    ------
    PhylipFormatError
        If the alignment is empty, has zero positions, or contains an ID
        longer than 10 characters.
    """
    # Width of the ID column; also reused as the sequence group size.
    id_width = 10

    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # Validate every ID up front so nothing is written for bad input.
    oversized = next((sid for sid in obj.ids() if len(sid) > id_width), None)
    if oversized is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (id_width, oversized))

    n_seqs = obj.sequence_count()
    fh.write('{0:d} {1:d}\n'.format(n_seqs, n_positions))

    row_template = '{0:%d}{1}\n' % id_width
    for record in obj:
        grouped = chunk_str(str(record), id_width, ' ')
        fh.write(row_template.format(record.metadata['id'], grouped))
Example #3
0
def _alignment_to_phylip(obj, fh):
    """Serialize ``obj`` to the handle ``fh`` as PHYLIP text.

    First a header line with the number of sequences and the number of
    positions is emitted. Every sequence follows on its own line, with the
    sequence ID padded to 10 characters and the residues broken into
    space-separated groups of 10.

    Parameters
    ----------
    obj : Alignment
        Alignment to write. It must provide ``is_empty``,
        ``sequence_length``, ``sequence_count`` and ``ids``, and iterate over
        sequences carrying ``metadata['id']``.
    fh : file-like
        Open handle providing ``write``.

    Raises
    ------
    PhylipFormatError
        If the alignment is empty, has zero positions, or contains an ID
        longer than 10 characters.
    """
    # Width of the ID column; also reused as the sequence group size.
    id_width = 10

    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # Reject over-long IDs before any output is produced.
    oversized = next((sid for sid in obj.ids() if len(sid) > id_width), None)
    if oversized is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (id_width, oversized))

    n_seqs = obj.sequence_count()
    fh.write('{0:d} {1:d}\n'.format(n_seqs, n_positions))

    row_template = '{0:%d}{1}\n' % id_width
    for record in obj:
        grouped = chunk_str(str(record), id_width, ' ')
        fh.write(row_template.format(record.metadata['id'], grouped))
Example #4
0
def _generator_to_fasta(obj, fh, qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None,
                        lowercase=None):
    """Write sequence records to ``fh`` as FASTA, optionally with QUAL output.

    Each record produced by ``_format_fasta_like_records`` is written to
    ``fh`` as a ``>header`` line followed by the sequence. When ``qual`` is
    not ``None``, a matching ``>header`` record holding the space-separated
    quality scores is written to ``qual`` as well.

    Parameters
    ----------
    obj : iterable
        Source of sequences; passed through to
        ``_format_fasta_like_records``.
    fh : file-like
        Handle receiving the FASTA records.
    qual : file-like or None, optional
        Handle receiving the quality score records, or ``None`` to skip them.
        NOTE(review): the ``FileSentinel`` default is presumably resolved to
        a real handle or ``None`` by the I/O layer before this writer is
        called -- confirm.
    id_whitespace_replacement : str, optional
        Forwarded to ``_format_fasta_like_records``.
    description_newline_replacement : str, optional
        Forwarded to ``_format_fasta_like_records``.
    max_width : int or None, optional
        Maximum line width for sequence and quality data. ``None`` disables
        wrapping.
    lowercase : optional
        Forwarded to ``_format_fasta_like_records``.

    Raises
    ------
    ValueError
        If ``max_width`` is given and is less than 1.
    """
    write_qual = qual is not None

    if max_width is not None:
        if max_width < 1:
            raise ValueError(
                "Maximum line width must be greater than zero (max_width=%d)."
                % max_width)
        if write_qual:
            # define text wrapper for splitting quality scores here for
            # efficiency. textwrap docs recommend reusing a TextWrapper
            # instance when it is used many times. configure text wrapper to
            # never break "words" (i.e., integer quality scores) across lines
            qual_wrapper = textwrap.TextWrapper(
                width=max_width, break_long_words=False,
                break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        write_qual, lowercase)
    for header, seq_str, qual_scores in formatted_records:
        if max_width is not None:
            seq_str = chunk_str(seq_str, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, seq_str))

        if write_qual:
            # Use the builtin ``str`` as the dtype: the ``np.str`` alias was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24.
            qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
            if max_width is not None:
                qual_str = qual_wrapper.fill(qual_str)
            qual.write('>%s\n%s\n' % (header, qual_str))
Example #5
0
def _tabular_msa_to_phylip(obj, fh):
    """Serialize the alignment ``obj`` to the handle ``fh`` as PHYLIP text.

    First a header line with the number of sequences and the number of
    positions is emitted. Every sequence follows on its own line, with the
    index label padded to 10 characters and the residues broken into
    space-separated groups of 10.

    Parameters
    ----------
    obj : TabularMSA
        Alignment to write. It must expose ``shape.sequence``,
        ``shape.position`` and ``index``, and iterate over its sequences.
    fh : file-like
        Open handle providing ``write``.

    Raises
    ------
    PhylipFormatError
        If the alignment has no sequences, no positions, or an index label
        whose string form is longer than 10 characters.
    """
    # Width of the label column; also reused as the sequence group size.
    field_width = 10

    n_seqs = obj.shape.sequence
    if n_seqs < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.shape.position
    if n_positions < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # Reject over-long labels before any output is produced.
    names = list(map(str, obj.index))
    oversized = next((nm for nm in names if len(nm) > field_width), None)
    if oversized is not None:
        raise PhylipFormatError(
            "``TabularMSA`` can only be written in PHYLIP format if all "
            "sequence index labels have %d or fewer characters. Found "
            "sequence with index label '%s' that exceeds this limit. Use "
            "``TabularMSA.reassign_index`` to assign shorter index labels."
            % (field_width, oversized))

    fh.write('{0:d} {1:d}\n'.format(n_seqs, n_positions))

    row_template = '{0:%d}{1}\n' % field_width
    for name, record in zip(names, obj):
        grouped = chunk_str(str(record), field_width, ' ')
        fh.write(row_template.format(name, grouped))
Example #6
0
def _generator_to_fasta(obj,
                        fh,
                        qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ',
                        max_width=None,
                        lowercase=None):
    """Write sequence records to ``fh`` as FASTA, optionally with QUAL output.

    Each record produced by ``_format_fasta_like_records`` is written to
    ``fh`` as a ``>header`` line followed by the sequence. When ``qual`` is
    not ``None``, a matching ``>header`` record holding the space-separated
    quality scores is written to ``qual`` as well.

    Parameters
    ----------
    obj : iterable
        Source of sequences; passed through to
        ``_format_fasta_like_records``.
    fh : file-like
        Handle receiving the FASTA records.
    qual : file-like or None, optional
        Handle receiving the quality score records, or ``None`` to skip them.
        NOTE(review): the ``FileSentinel`` default is presumably resolved to
        a real handle or ``None`` by the I/O layer before this writer is
        called -- confirm.
    id_whitespace_replacement : str, optional
        Forwarded to ``_format_fasta_like_records``.
    description_newline_replacement : str, optional
        Forwarded to ``_format_fasta_like_records``.
    max_width : int or None, optional
        Maximum line width for sequence and quality data. ``None`` disables
        wrapping.
    lowercase : optional
        Forwarded to ``_format_fasta_like_records``.

    Raises
    ------
    ValueError
        If ``max_width`` is given and is less than 1.
    """
    write_qual = qual is not None

    if max_width is not None:
        if max_width < 1:
            raise ValueError(
                "Maximum line width must be greater than zero (max_width=%d)."
                % max_width)
        if write_qual:
            # define text wrapper for splitting quality scores here for
            # efficiency. textwrap docs recommend reusing a TextWrapper
            # instance when it is used many times. configure text wrapper to
            # never break "words" (i.e., integer quality scores) across lines
            qual_wrapper = textwrap.TextWrapper(width=max_width,
                                                break_long_words=False,
                                                break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        write_qual, lowercase)
    for header, seq_str, qual_scores in formatted_records:
        if max_width is not None:
            seq_str = chunk_str(seq_str, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, seq_str))

        if write_qual:
            # Use the builtin ``str`` as the dtype: the ``np.str`` alias was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24.
            qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
            if max_width is not None:
                qual_str = qual_wrapper.fill(qual_str)
            qual.write('>%s\n%s\n' % (header, qual_str))
Example #7
0
 def _format_chunked_seq(self, line_idxs, num_chars, column_width):
     """Build display lines for the requested rows of sequence data.

     Each returned string starts with the offset of the row's first
     character (``line_idx * num_chars``), left-justified to
     ``column_width``, followed by that row's characters grouped into
     space-separated chunks of ``self._chunk_size``.
     """
     def render(offset):
         # Slice one row out of the wrapped object and group its characters.
         residues = str(self._obj[offset:offset + num_chars])
         grouped = chunk_str(residues, self._chunk_size, ' ')
         return ('%d' % offset).ljust(column_width) + grouped

     return [render(row * num_chars) for row in line_idxs]
Example #8
0
 def _format_chunked_seq(self, line_idxs, num_chars, column_width):
     """Return formatted text lines for the given row indices.

     For every index in ``line_idxs`` the row starts at offset
     ``line_idx * num_chars``. The line is that offset, left-justified to
     ``column_width``, followed by the row's characters split into
     space-separated chunks of ``self._chunk_size``.
     """
     def render(offset):
         # Slice one row out of the wrapped object and group its characters.
         residues = str(self._obj[offset:offset + num_chars])
         grouped = chunk_str(residues, self._chunk_size, ' ')
         return ('%d' % offset).ljust(column_width) + grouped

     return [render(row * num_chars) for row in line_idxs]
Example #9
0
 def test_even_split(self):
     """chunk_str joins equal-sized chunks when n divides the length."""
     # (arguments to chunk_str, expected result), checked in order.
     cases = [
         (('abcdef', 6, ' '), 'abcdef'),
         (('abcdef', 3, ' '), 'abc def'),
         (('abcdef', 2, ' '), 'ab cd ef'),
         (('abcdef', 1, ' '), 'a b c d e f'),
         (('a', 1, ' '), 'a'),
         (('abcdef', 2, ''), 'abcdef'),
     ]
     for args, expected in cases:
         self.assertEqual(chunk_str(*args), expected)
Example #10
0
 def test_even_split(self):
     """Inputs whose length is a multiple of n split into equal chunks."""
     # (arguments to chunk_str, expected result), checked in order.
     cases = [
         (('abcdef', 6, ' '), 'abcdef'),
         (('abcdef', 3, ' '), 'abc def'),
         (('abcdef', 2, ' '), 'ab cd ef'),
         (('abcdef', 1, ' '), 'a b c d e f'),
         (('a', 1, ' '), 'a'),
         (('abcdef', 2, ''), 'abcdef'),
     ]
     for args, expected in cases:
         self.assertEqual(chunk_str(*args), expected)
Example #11
0
def _serialize_origin(seq, indent=9):
    '''Yield the lines of an ORIGIN section for ``seq``.

    The sequence is emitted 60 characters per line, each line split into
    space-separated groups of 10 and prefixed with the 1-based position of
    its first character, right-aligned to ``indent`` columns. The first
    yielded string additionally starts with an ``ORIGIN`` header line.
    Nothing is yielded for an empty sequence.

    Parameters
    ----------
    seq : str
        Sequence to serialize.
    indent : int, optional
        Width of the right-aligned position column.

    Yields
    ------
    str
        One newline-terminated line of output per 60 characters of ``seq``.
    '''
    chars_per_line = 60
    chars_per_group = 10
    template = '{n:>{indent}} {s}\n'

    for offset in range(0, len(seq), chars_per_line):
        grouped = chunk_str(
            seq[offset:offset + chars_per_line], chars_per_group, ' ')
        # Positions are reported 1-based, hence ``offset + 1``.
        record = template.format(n=offset + 1, indent=indent, s=grouped)
        # Only the very first line carries the section header.
        yield ('ORIGIN\n' + record) if offset == 0 else record
Example #12
0
def _serialize_origin(seq, indent=9):
    '''Generate ORIGIN-section text for ``seq``, one line at a time.

    Every yielded line covers up to 60 characters of the sequence, grouped
    into space-separated blocks of 10. Each line is prefixed with the
    1-based position of its first character, right-aligned to ``indent``
    columns. The first yielded string is preceded by an ``ORIGIN`` header
    line. An empty sequence yields nothing.

    Parameters
    ----------
    seq : str
        Sequence to serialize.
    indent : int, optional
        Width of the right-aligned position column.

    Yields
    ------
    str
        One newline-terminated line of output per 60 characters of ``seq``.
    '''
    chars_per_line = 60
    chars_per_group = 10
    template = '{n:>{indent}} {s}\n'

    for offset in range(0, len(seq), chars_per_line):
        grouped = chunk_str(
            seq[offset:offset + chars_per_line], chars_per_group, ' ')
        # Positions are reported 1-based, hence ``offset + 1``.
        record = template.format(n=offset + 1, indent=indent, s=grouped)
        # Only the very first line carries the section header.
        yield ('ORIGIN\n' + record) if offset == 0 else record
Example #13
0
    def test_invalid_n(self):
        """chunk_str rejects a chunk size below one with a ValueError."""
        # (invalid n, text the error message must contain), checked in order.
        cases = ((0, 'n=0'), (-42, 'n=-42'))
        for bad_n, message in cases:
            with six.assertRaisesRegex(self, ValueError, message):
                chunk_str('abcdef', bad_n, ' ')
Example #14
0
 def test_uneven_split(self):
     """A trailing shorter chunk is kept when n does not divide the length."""
     # (arguments to chunk_str, expected result), checked in order.
     cases = [
         (('abcdef', 5, '|'), 'abcde|f'),
         (('abcdef', 4, '|'), 'abcd|ef'),
         (('abcdefg', 3, ' - '), 'abc - def - g'),
     ]
     for args, expected in cases:
         self.assertEqual(chunk_str(*args), expected)
Example #15
0
 def test_no_split(self):
     """Strings no longer than n come back unchanged, without separators."""
     # (arguments to chunk_str, expected result), checked in order.
     cases = [
         (('', 2, '\n'), ''),
         (('a', 100, '\n'), 'a'),
         (('abcdef', 42, '|'), 'abcdef'),
     ]
     for args, expected in cases:
         self.assertEqual(chunk_str(*args), expected)
Example #16
0
    def test_invalid_n(self):
        """Zero and negative chunk sizes make chunk_str raise ValueError."""
        # (invalid n, text the error message must contain), checked in order.
        cases = ((0, 'n=0'), (-42, 'n=-42'))
        for bad_n, message in cases:
            with six.assertRaisesRegex(self, ValueError, message):
                chunk_str('abcdef', bad_n, ' ')
Example #17
0
 def test_uneven_split(self):
     """The final chunk may be shorter when the length is not a multiple."""
     # (arguments to chunk_str, expected result), checked in order.
     cases = [
         (('abcdef', 5, '|'), 'abcde|f'),
         (('abcdef', 4, '|'), 'abcd|ef'),
         (('abcdefg', 3, ' - '), 'abc - def - g'),
     ]
     for args, expected in cases:
         self.assertEqual(chunk_str(*args), expected)
Example #18
0
 def test_no_split(self):
     """Empty or short inputs are returned as-is with no separator added."""
     # (arguments to chunk_str, expected result), checked in order.
     cases = [
         (('', 2, '\n'), ''),
         (('a', 100, '\n'), 'a'),
         (('abcdef', 42, '|'), 'abcdef'),
     ]
     for args, expected in cases:
         self.assertEqual(chunk_str(*args), expected)