def _tabular_msa_to_phylip(obj, fh):
    """Write the alignment ``obj`` to ``fh`` in PHYLIP format.

    The output starts with a header line holding the sequence count and the
    alignment length. Each following line holds one index label padded to 10
    characters, then the sequence split into space-separated chunks of 10
    characters.

    Parameters
    ----------
    obj
        Alignment exposing ``shape.sequence``, ``shape.position``, ``index``
        and iteration over its sequences.
    fh
        Writable text file handle.

    Raises
    ------
    PhylipFormatError
        If the alignment has no sequences, has no positions, or has an index
        label longer than 10 characters once converted to ``str``.
    """
    n_seqs = obj.shape.sequence
    if n_seqs < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.shape.position
    if n_positions < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # The label column and the sequence chunks share the same width.
    chunk_size = 10
    labels = list(map(str, obj.index))

    # Report the first label (in index order) that does not fit the column.
    oversized = next((lbl for lbl in labels if len(lbl) > chunk_size), None)
    if oversized is not None:
        raise PhylipFormatError(
            "``TabularMSA`` can only be written in PHYLIP format if all "
            "sequence index labels have %d or fewer characters. Found "
            "sequence with index label '%s' that exceeds this limit. Use "
            "``TabularMSA.reassign_index`` to assign shorter index labels."
            % (chunk_size, oversized))

    fh.write('{0:d} {1:d}\n'.format(n_seqs, n_positions))

    row_template = '{0:%d}{1}\n' % chunk_size
    for lbl, sequence in zip(labels, obj):
        body = chunk_str(str(sequence), chunk_size, ' ')
        fh.write(row_template.format(lbl, body))
def _alignment_to_phylip(obj, fh):
    """Write the alignment ``obj`` to ``fh`` in PHYLIP format.

    The output starts with a header line holding the sequence count and the
    alignment length. Each following line holds one sequence ID padded to 10
    characters, then the sequence split into space-separated chunks of 10
    characters.

    Parameters
    ----------
    obj
        Alignment exposing ``is_empty()``, ``sequence_length()``, ``ids()``,
        ``sequence_count()`` and iteration over sequences that carry
        ``metadata['id']``.
    fh
        Writable text file handle.

    Raises
    ------
    PhylipFormatError
        If the alignment is empty, has zero positions, or has a sequence ID
        longer than 10 characters.
    """
    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # The ID column and the sequence chunks share the same width.
    chunk_size = 10

    # Report the first ID (in alignment order) that does not fit the column.
    oversized = next(
        (seq_id for seq_id in obj.ids() if len(seq_id) > chunk_size), None)
    if oversized is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (chunk_size, oversized))

    n_seqs = obj.sequence_count()
    fh.write('{0:d} {1:d}\n'.format(n_seqs, n_positions))

    row_template = '{0:%d}{1}\n' % chunk_size
    for sequence in obj:
        body = chunk_str(str(sequence), chunk_size, ' ')
        fh.write(row_template.format(sequence.metadata['id'], body))
def _generator_to_fasta(obj, fh, qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None,
                        lowercase=None):
    """Write FASTA records, and optionally QUAL records, for ``obj``.

    Parameters
    ----------
    obj
        Passed to ``_format_fasta_like_records``, which yields
        ``(header, sequence string, quality scores)`` triples.
    fh
        Writable text file handle receiving the FASTA records.
    qual
        Writable text file handle receiving the quality scores, or ``None``
        to skip writing them.
        NOTE(review): the ``FileSentinel`` default is presumably replaced by
        the I/O registry before this function runs -- confirm.
    id_whitespace_replacement, description_newline_replacement, lowercase
        Forwarded unchanged to ``_format_fasta_like_records``.
    max_width : int, optional
        When given, sequence data is split into lines of at most this many
        characters, and quality scores are wrapped to this width without
        breaking an individual score across lines.

    Raises
    ------
    ValueError
        If ``max_width`` is given and is less than 1.
    """
    if max_width is not None:
        if max_width < 1:
            raise ValueError(
                "Maximum line width must be greater than zero (max_width=%d)."
                % max_width)
        if qual is not None:
            # define text wrapper for splitting quality scores here for
            # efficiency. textwrap docs recommend reusing a TextWrapper
            # instance when it is used many times. configure text wrapper to
            # never break "words" (i.e., integer quality scores) across lines
            qual_wrapper = textwrap.TextWrapper(
                width=max_width, break_long_words=False,
                break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        qual is not None, lowercase)
    for header, seq_str, qual_scores in formatted_records:
        if max_width is not None:
            seq_str = chunk_str(seq_str, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, seq_str))

        if qual is not None:
            # Use the builtin ``str`` as dtype: the ``np.str`` alias was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24.
            qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
            if max_width is not None:
                qual_str = qual_wrapper.fill(qual_str)
            qual.write('>%s\n%s\n' % (header, qual_str))
def _generator_to_fasta(obj, fh, qual=FileSentinel,
                        id_whitespace_replacement='_',
                        description_newline_replacement=' ', max_width=None,
                        lowercase=None):
    """Write FASTA records, and optionally QUAL records, for ``obj``.

    Parameters
    ----------
    obj
        Passed to ``_format_fasta_like_records``, which yields
        ``(header, sequence string, quality scores)`` triples.
    fh
        Writable text file handle receiving the FASTA records.
    qual
        Writable text file handle receiving the quality scores, or ``None``
        to skip writing them.
        NOTE(review): the ``FileSentinel`` default is presumably replaced by
        the I/O registry before this function runs -- confirm.
    id_whitespace_replacement, description_newline_replacement, lowercase
        Forwarded unchanged to ``_format_fasta_like_records``.
    max_width : int, optional
        When given, sequence data is split into lines of at most this many
        characters, and quality scores are wrapped to this width without
        breaking an individual score across lines.

    Raises
    ------
    ValueError
        If ``max_width`` is given and is less than 1.
    """
    if max_width is not None:
        if max_width < 1:
            raise ValueError(
                "Maximum line width must be greater than zero (max_width=%d)."
                % max_width)
        if qual is not None:
            # define text wrapper for splitting quality scores here for
            # efficiency. textwrap docs recommend reusing a TextWrapper
            # instance when it is used many times. configure text wrapper to
            # never break "words" (i.e., integer quality scores) across lines
            qual_wrapper = textwrap.TextWrapper(width=max_width,
                                                break_long_words=False,
                                                break_on_hyphens=False)

    formatted_records = _format_fasta_like_records(
        obj, id_whitespace_replacement, description_newline_replacement,
        qual is not None, lowercase)
    for header, seq_str, qual_scores in formatted_records:
        if max_width is not None:
            seq_str = chunk_str(seq_str, max_width, '\n')

        fh.write('>%s\n%s\n' % (header, seq_str))

        if qual is not None:
            # Use the builtin ``str`` as dtype: the ``np.str`` alias was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24.
            qual_str = ' '.join(np.asarray(qual_scores, dtype=str))
            if max_width is not None:
                qual_str = qual_wrapper.fill(qual_str)
            qual.write('>%s\n%s\n' % (header, qual_str))
def _format_chunked_seq(self, line_idxs, num_chars, column_width):
    """Return one formatted text line per entry of ``line_idxs``.

    Line ``i`` covers positions ``i * num_chars`` up to (not including)
    ``(i + 1) * num_chars`` of ``self._obj``. Each line is the starting
    position, left-justified to ``column_width``, followed by the characters
    split into space-separated chunks of ``self._chunk_size``.
    """
    def render(line_idx):
        start = line_idx * num_chars
        stop = start + num_chars
        residues = str(self._obj[start:stop])
        chunks = chunk_str(residues, self._chunk_size, ' ')
        position_column = ('%d' % start).ljust(column_width)
        return position_column + chunks

    return [render(line_idx) for line_idx in line_idxs]
def _format_chunked_seq(self, line_idxs, num_chars, column_width):
    """Return one formatted text line per entry of ``line_idxs``.

    Line ``i`` covers positions ``i * num_chars`` up to (not including)
    ``(i + 1) * num_chars`` of ``self._obj``. Each line is the starting
    position, left-justified to ``column_width``, followed by the characters
    split into space-separated chunks of ``self._chunk_size``.
    """
    formatted = []
    for idx in line_idxs:
        offset = idx * num_chars
        window = self._obj[offset:offset + num_chars]
        body = chunk_str(str(window), self._chunk_size, ' ')
        # Left-hand column shows where this line starts in the sequence.
        gutter = ('%d' % offset).ljust(column_width)
        formatted.append(gutter + body)
    return formatted
def test_even_split(self):
    """Chunk sizes that divide the string length leave no short tail."""
    # (input string, chunk size, separator, expected result)
    cases = [
        ('abcdef', 6, ' ', 'abcdef'),
        ('abcdef', 3, ' ', 'abc def'),
        ('abcdef', 2, ' ', 'ab cd ef'),
        ('abcdef', 1, ' ', 'a b c d e f'),
        ('a', 1, ' ', 'a'),
        # An empty separator reassembles the original string.
        ('abcdef', 2, '', 'abcdef'),
    ]
    for text, size, sep, expected in cases:
        self.assertEqual(chunk_str(text, size, sep), expected)
def _serialize_origin(seq, indent=9):
    '''Yield the ORIGIN section for ``seq`` one text line at a time.

    The sequence is consumed 60 characters per line. Each line holds the
    1-based position of its first character, right-aligned to ``indent``
    columns, then a space and the characters in space-separated groups of
    10. The first yielded string is prefixed with an ``ORIGIN`` header line.
    Nothing is yielded for an empty ``seq``.

    Parameters
    ----------
    seq : str
    indent : int
        Width of the right-aligned position column.
    '''
    line_size = 60
    frag_size = 10
    for offset in range(0, len(seq), line_size):
        # Positions are reported 1-based.
        position = offset + 1
        fragments = chunk_str(seq[offset:offset + line_size], frag_size, ' ')
        row = '{n:>{indent}} {s}\n'.format(
            n=position, indent=indent, s=fragments)
        if offset == 0:
            row = 'ORIGIN\n' + row
        yield row
def test_invalid_n(self):
    """Non-positive chunk sizes raise ValueError mentioning the value."""
    # (invalid chunk size, regex expected in the error message)
    cases = ((0, 'n=0'), (-42, 'n=-42'))
    for bad_n, pattern in cases:
        with six.assertRaisesRegex(self, ValueError, pattern):
            chunk_str('abcdef', bad_n, ' ')
def test_uneven_split(self):
    """A chunk size that does not divide the length leaves a short tail."""
    # (input string, chunk size, separator, expected result)
    cases = [
        ('abcdef', 5, '|', 'abcde|f'),
        ('abcdef', 4, '|', 'abcd|ef'),
        # Multi-character separators are inserted verbatim.
        ('abcdefg', 3, ' - ', 'abc - def - g'),
    ]
    for text, size, sep, expected in cases:
        self.assertEqual(chunk_str(text, size, sep), expected)
def test_no_split(self):
    """Strings no longer than the chunk size come back unchanged."""
    # (input string, chunk size, separator, expected result)
    cases = [
        ('', 2, '\n', ''),
        ('a', 100, '\n', 'a'),
        ('abcdef', 42, '|', 'abcdef'),
    ]
    for text, size, sep, expected in cases:
        self.assertEqual(chunk_str(text, size, sep), expected)