Ejemplo n.º 1
0
def _parse_phylip_raw(fh):
    """Read every record of a PHYLIP file without interpreting it.

    The result is a list of raw ``(seq, id)`` tuples, one per data line, in
    file order.  Building the proper in-memory object from those tuples is
    left to the caller.

    Raises ``PhylipFormatError`` if the file has no lines at all, or if the
    number of data lines differs from the count given in the header.  Errors
    raised while validating the header or an individual line propagate
    unchanged.

    """
    # The whole file is collected before anything is returned (rather than
    # yielding records one by one): the header announces how many sequences
    # to expect, and that can only be checked after the last line was read.

    # The very first line must be the header.
    try:
        first_line = next(_line_generator(fh))
    except StopIteration:
        raise PhylipFormatError("This file is empty.")
    expected_count, expected_length = _validate_header(first_line)

    # Every remaining line must be ID+sequence; blank lines are not allowed.
    # NOTE(review): a second line generator is created over the same handle,
    # so this presumably relies on ``fh`` remembering its position - confirm.
    records = [_validate_line(row, expected_length)
               for row in _line_generator(fh)]
    if len(records) == expected_count:
        return records
    raise PhylipFormatError(
        "The number of sequences is not %s " % expected_count +
        "as specified in the header.")
Ejemplo n.º 2
0
def _tabular_msa_to_phylip(obj, fh):
    """Write ``obj`` to the handle ``fh`` in PHYLIP format.

    A header line holding the sequence count and the position count is
    written first.  It is followed by one line per sequence: the index label
    padded to a 10-character field, immediately followed by the sequence
    text as returned by ``chunk_str`` (presumably 10-character groups
    separated by single spaces - confirm against ``chunk_str``).

    Raises ``PhylipFormatError`` when ``obj`` reports fewer than one
    sequence, fewer than one position, or when any index label (after
    ``str`` conversion) is longer than 10 characters.  Nothing is written to
    ``fh`` in those cases, since all checks run before the first write.

    """
    n_sequences = obj.shape.sequence
    if n_sequences < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.shape.position
    if n_positions < 1:
        raise PhylipFormatError(
            "TabularMSA can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # The same width serves as the label field width and the chunk length.
    width = 10
    labels = [str(entry) for entry in obj.index]

    # Report the first label (in index order) that does not fit the field.
    oversized = next((name for name in labels if len(name) > width), None)
    if oversized is not None:
        raise PhylipFormatError(
            "``TabularMSA`` can only be written in PHYLIP format if all "
            "sequence index labels have %d or fewer characters. Found "
            "sequence with index label '%s' that exceeds this limit. Use "
            "``TabularMSA.reassign_index`` to assign shorter index labels."
            % (width, oversized))

    header_line = '{0:d} {1:d}\n'.format(n_sequences, n_positions)
    fh.write(header_line)

    # Build the per-record template once, e.g. '{0:10}{1}\n'.
    record_template = '{0:%d}{1}\n' % width
    for name, sequence in zip(labels, obj):
        body = chunk_str(str(sequence), width, ' ')
        fh.write(record_template.format(name, body))
Ejemplo n.º 3
0
def _alignment_to_phylip(obj, fh):
    """Write the alignment ``obj`` to the handle ``fh`` in PHYLIP format.

    Output is a header line with the sequence count and the sequence length,
    then one line per sequence made of the sequence's ``metadata['id']``
    padded to a 10-character field followed by the text returned by
    ``chunk_str`` (presumably 10-character groups separated by single
    spaces - confirm against ``chunk_str``).

    Raises ``PhylipFormatError`` when the alignment is empty, when its
    sequence length is zero, or when any ID is longer than 10 characters.
    All checks run before anything is written to ``fh``.

    """
    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # The same width serves as the ID field width and the chunk length.
    width = 10

    # Report the first ID (in ``obj.ids()`` order) that overflows the field.
    oversized = next((name for name in obj.ids() if len(name) > width), None)
    if oversized is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (width, oversized))

    n_sequences = obj.sequence_count()
    header_line = '{0:d} {1:d}\n'.format(n_sequences, n_positions)
    fh.write(header_line)

    # Build the per-record template once, e.g. '{0:10}{1}\n'.
    record_template = '{0:%d}{1}\n' % width
    for sequence in obj:
        body = chunk_str(str(sequence), width, ' ')
        fh.write(record_template.format(sequence.metadata['id'], body))
Ejemplo n.º 4
0
def _validate_line(line, seq_len):
    if not line:
        raise PhylipFormatError("Empty lines are not allowed.")
    ID = line[:10].strip()
    seq = line[10:].replace(' ', '')
    if len(seq) != seq_len:
        raise PhylipFormatError(
            "The length of sequence %s is not %s as specified in the header." %
            (ID, seq_len))
    return (seq, ID)
Ejemplo n.º 5
0
def _validate_header(header):
    header_vals = header.split()
    try:
        n_seqs, seq_len = [int(x) for x in header_vals]
        if n_seqs < 1 or seq_len < 1:
            raise PhylipFormatError(
                'The number of sequences and the length must be positive.')
    except ValueError:
        raise PhylipFormatError(
            'Found non-header line when attempting to read the 1st record '
            '(header line should have two space-separated integers): '
            '"%s"' % header)
    return n_seqs, seq_len
Ejemplo n.º 6
0
def _alignment_to_phylip(obj, fh):
    """Write the alignment ``obj`` to the handle ``fh`` in PHYLIP format.

    Output is a header line with the sequence count and the sequence length,
    then one line per sequence made of ``seq.id`` padded to a 10-character
    field followed by the text returned by ``_chunk_str`` (presumably the
    sequence split into 10-character groups - confirm against
    ``_chunk_str``).

    Raises ``PhylipFormatError`` when ``obj.is_valid()`` is false, when the
    alignment is empty, when its sequence length is zero, or when any ID is
    longer than 10 characters.  All checks run before anything is written to
    ``fh``.

    """
    is_valid = obj.is_valid()
    if not is_valid:
        # TODO update this error message when #670 is resolved
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all sequences "
            "are of equal length and contain only valid characters within "
            "their character sets.")

    if obj.is_empty():
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one sequence in the alignment.")

    n_positions = obj.sequence_length()
    if n_positions == 0:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if there is at "
            "least one position in the alignment.")

    # The same width serves as the ID field width and the chunk length.
    width = 10

    # Report the first ID (in ``obj.ids()`` order) that overflows the field.
    oversized = next((name for name in obj.ids() if len(name) > width), None)
    if oversized is not None:
        raise PhylipFormatError(
            "Alignment can only be written in PHYLIP format if all "
            "sequence IDs have %d or fewer characters. Found sequence "
            "with ID '%s' that exceeds this limit. Use "
            "Alignment.update_ids to assign shorter IDs." %
            (width, oversized))

    n_sequences = obj.sequence_count()
    header_line = '{0:d} {1:d}\n'.format(n_sequences, n_positions)
    fh.write(header_line)

    # Build the per-record template once, e.g. '{0:10}{1}\n'.
    record_template = '{0:%d}{1}\n' % width
    for sequence in obj:
        body = _chunk_str(str(sequence), width)
        fh.write(record_template.format(sequence.id, body))