def _parse_phylip_raw(fh):
    """Raw parser for PHYLIP files.

    Reads the header line from ``fh``, then validates every remaining line
    as an ID+sequence record.

    Returns a list of raw (seq, id) values. It is the responsibility of the
    caller to construct the correct in-memory object to hold the data.

    Raises ``PhylipFormatError`` if the file is empty or if the number of
    records read differs from the count declared in the header. Errors
    raised by ``_validate_header`` and ``_validate_line`` propagate
    unchanged.
    """
    # The whole file is parsed eagerly instead of yielding record by record:
    # the header declares how many sequences to expect, so the file cannot
    # be considered valid until it has been read to the end.

    # The very first line must be the header.
    try:
        header = next(_line_generator(fh))
    except StopIteration:
        raise PhylipFormatError("This file is empty.")
    n_seqs, seq_len = _validate_header(header)

    # Every line after the header must be an ID+sequence record; blank
    # lines are rejected by ``_validate_line``.
    records = [_validate_line(line, seq_len)
               for line in _line_generator(fh)]

    if len(records) == n_seqs:
        return records
    raise PhylipFormatError("The number of sequences is not %s " % n_seqs +
                            "as specified in the header.")
def _tabular_msa_to_phylip(obj, fh):
    """Write the alignment ``obj`` to ``fh`` in PHYLIP format.

    The output is a header line ``"<n sequences> <n positions>"`` followed
    by one line per sequence: the index label padded to 10 characters, then
    the sequence as formatted by ``chunk_str``.

    Raises ``PhylipFormatError`` (before anything is written) if the
    alignment has no sequences, has no positions, or has an index label
    whose string form is longer than 10 characters.
    """
    n_sequences = obj.shape.sequence
    if n_sequences < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.shape.position
    if n_positions < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # One width serves two purposes: the maximum label length (and label
    # column width) and the chunk size handed to ``chunk_str``.
    chunk_size = 10

    labels = list(map(str, obj.index))
    # Only the first offending label (in index order) is reported.
    too_long = next((lbl for lbl in labels if len(lbl) > chunk_size), None)
    if too_long is not None:
        raise PhylipFormatError(
            "``TabularMSA`` can only be written in PHYLIP format if all "
            "sequence index labels have %d or fewer characters. Found "
            "sequence with index label '%s' that exceeds this limit. Use "
            "``TabularMSA.reassign_index`` to assign shorter index labels."
            % (chunk_size, too_long))

    fh.write('{0:d} {1:d}\n'.format(n_sequences, n_positions))

    # '{0:10}' pads the label with spaces up to the fixed column width.
    line_template = '{0:%d}{1}\n' % chunk_size
    for label, sequence in zip(labels, obj):
        fh.write(line_template.format(
            label, chunk_str(str(sequence), chunk_size, ' ')))
def _alignment_to_phylip(obj, fh):
    """Write the alignment ``obj`` to ``fh`` in PHYLIP format.

    The output is a header line ``"<sequence count> <sequence length>"``
    followed by one line per sequence: the sequence's ``metadata['id']``
    padded to 10 characters, then the sequence as formatted by
    ``chunk_str``.

    Raises ``PhylipFormatError`` (before anything is written) if the
    alignment is empty, has a sequence length of zero, or contains an ID
    longer than 10 characters.
    """
    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # One width serves two purposes: the maximum ID length (and ID column
    # width) and the chunk size handed to ``chunk_str``.
    chunk_size = 10

    # Only the first offending ID (in ``obj.ids()`` order) is reported.
    too_long = next(
        (seq_id for seq_id in obj.ids() if len(seq_id) > chunk_size), None)
    if too_long is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (chunk_size, too_long))

    n_sequences = obj.sequence_count()
    fh.write('{0:d} {1:d}\n'.format(n_sequences, n_positions))

    # '{0:10}' pads the ID with spaces up to the fixed column width.
    line_template = '{0:%d}{1}\n' % chunk_size
    for sequence in obj:
        body = chunk_str(str(sequence), chunk_size, ' ')
        fh.write(line_template.format(sequence.metadata['id'], body))
def _validate_line(line, seq_len): if not line: raise PhylipFormatError("Empty lines are not allowed.") ID = line[:10].strip() seq = line[10:].replace(' ', '') if len(seq) != seq_len: raise PhylipFormatError( "The length of sequence %s is not %s as specified in the header." % (ID, seq_len)) return (seq, ID)
def _validate_header(header): header_vals = header.split() try: n_seqs, seq_len = [int(x) for x in header_vals] if n_seqs < 1 or seq_len < 1: raise PhylipFormatError( 'The number of sequences and the length must be positive.') except ValueError: raise PhylipFormatError( 'Found non-header line when attempting to read the 1st record ' '(header line should have two space-separated integers): ' '"%s"' % header) return n_seqs, seq_len
def _alignment_to_phylip(obj, fh):
    """Write the alignment ``obj`` to ``fh`` in PHYLIP format.

    The output is a header line ``"<sequence count> <sequence length>"``
    followed by one line per sequence: the sequence's ``id`` padded to 10
    characters, then the sequence as formatted by ``_chunk_str``.

    Raises ``PhylipFormatError`` (before anything is written) if
    ``obj.is_valid()`` is false, the alignment is empty, its sequence
    length is zero, or it contains an ID longer than 10 characters.
    """
    if not obj.is_valid():
        # TODO update this error message when #670 is resolved
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all sequences "
            "are of equal length and contain only valid characters within "
            "their character sets.")

    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # One width serves two purposes: the maximum ID length (and ID column
    # width) and the chunk size handed to ``_chunk_str``.
    chunk_size = 10

    # Only the first offending ID (in ``obj.ids()`` order) is reported.
    too_long = next(
        (seq_id for seq_id in obj.ids() if len(seq_id) > chunk_size), None)
    if too_long is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (chunk_size, too_long))

    n_sequences = obj.sequence_count()
    fh.write('{0:d} {1:d}\n'.format(n_sequences, n_positions))

    # '{0:10}' pads the ID with spaces up to the fixed column width.
    line_template = '{0:%d}{1}\n' % chunk_size
    for sequence in obj:
        body = _chunk_str(str(sequence), chunk_size)
        fh.write(line_template.format(sequence.id, body))