Ejemplo n.º 1
0
 def create_alignment(
     records,
     aligned_sequences,
     strands,
     annotations,
     column_annotations,
     score,
 ):
     """Assemble an Alignment from the data gathered while parsing.

     The coordinates are inferred from the gapped ``aligned_sequences``.
     Each row is mirrored if its strand is "-", and then shifted by the
     start of the first defined range of the corresponding record's
     sequence.  ``annotations``, ``column_annotations`` and ``score`` are
     attached to the alignment only when they are not None.
     """
     coordinates = Alignment.infer_coordinates(aligned_sequences)
     for record, strand, row in zip(records, strands, coordinates):
         if strand == "-":
             # Mirror the row so it runs along the reverse strand.
             span = row[-1] - row[0]
             row[:] = span - row
         # In-place shift; ``row`` is modified inside ``coordinates``.
         row += record.seq.defined_ranges[0][0]
     alignment = Alignment(records, coordinates)
     optional_attributes = (
         ("annotations", annotations),
         ("column_annotations", column_annotations),
         ("score", score),
     )
     for attribute, value in optional_attributes:
         if value is not None:
             setattr(alignment, attribute, value)
     return alignment
Ejemplo n.º 2
0
    def parse(self, stream):
        """Parse the next alignment from the stream.

        Generator reading one Clustal-style alignment from ``stream`` (an
        iterable of text lines) and yielding it as an ``Alignment``.  The
        first block of lines defines the sequence identifiers; lines in any
        following blocks are appended to the sequences in the same order.
        Lines starting with a space are treated as consensus lines and are
        stored under ``column_annotations["clustal_consensus"]``.

        Nothing is yielded if ``stream`` is None or contains no sequence
        lines.

        Raises ValueError if a sequence line does not consist of two or
        three whitespace-separated fields, if the optional third field is
        not an integer or does not match the number of letters read so far,
        or if the consensus length differs from the number of columns.
        Internal consistency checks use ``assert``.
        """
        if stream is None:
            # End the generator with a plain return: raising StopIteration
            # inside a generator is converted to RuntimeError (PEP 479).
            return

        # Identifiers are stored in a list and matched by position in the
        # later blocks, so rows sharing an identifier stay separate rows.
        ids = []
        seqs = []
        aligned_seqs = []
        consensus = ""
        index = None  # Used to extract the consensus

        # Use the first block to get the sequence identifiers
        for line in stream:
            if line.startswith(" "):
                # Sequence consensus line...
                assert len(ids) > 0
                assert index is not None
                length = len(aligned_seq)  # noqa: F821
                consensus = line[index:index + length]
                break
            elif line.strip():
                # Sequences identifier...
                fields = line.split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                seqid, aligned_seq = fields[:2]
                ids.append(seqid)
                aligned_seqs.append(aligned_seq)
                seq = aligned_seq.replace("-", "")
                seqs.append(seq)

                # Record the sequence position to get the consensus
                if index is None:
                    index = line.find(aligned_seq, len(seqid))

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line) from None
                    if len(seq) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            else:
                # Blank line: it ends the first block (which then has no
                # consensus line) only once a sequence line has been seen.
                if index:
                    break
        else:
            # The stream ended without a consensus line or a blank line.
            if not ids:
                # No sequence lines at all: there is no alignment to yield.
                return
            # Otherwise the stream held a single block terminated by the
            # end of the input; fall through and build the alignment.

        assert index is not None

        # Confirm all same length
        length = len(aligned_seqs[0])
        for aligned_seq in aligned_seqs:
            assert len(aligned_seq) == length
        if consensus:
            assert len(consensus) == length

        n = len(seqs)
        i = 0  # row that the next sequence line belongs to
        # Loop over any remaining blocks...
        for line in stream:
            if line.startswith(" "):  # Sequence consensus line
                assert index is not None
                # aligned_seq is the most recently read sequence fragment,
                # so its length is the width of the current block.
                length = len(aligned_seq)
                consensus += line[index:index + length]
            elif not line.strip():  # Blank line
                continue
            else:
                seqid = ids[i]
                # Sequences identifier...
                fields = line.split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                assert seqid == fields[0]
                aligned_seq = fields[1]
                aligned_seqs[i] += aligned_seq
                seq = aligned_seq.replace("-", "")
                seqs[i] += seq

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line) from None
                    # The count is cumulative over all blocks read so far.
                    if len(seqs[i]) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
                i += 1
                if i == n:
                    # Wrap around for the next block.
                    i = 0

        records = [
            SeqRecord(Seq(seq), id=seqid, description=seqid)
            for (seqid, seq) in zip(ids, seqs)
        ]
        coordinates = Alignment.infer_coordinates(aligned_seqs)
        alignment = Alignment(records, coordinates)
        # TODO - Handle alignment annotation better, for now
        # mimic the old parser in Bio.Clustalw
        if consensus:
            rows, columns = alignment.shape
            if len(consensus) != columns:
                raise ValueError(
                    "Alignment has %i columns, consensus length is %i, '%s'" %
                    (columns, len(consensus), consensus))
            alignment.column_annotations = {}
            alignment.column_annotations["clustal_consensus"] = consensus
        yield alignment
Ejemplo n.º 3
0
    def parse(self, stream):
        """Parse the alignments in the stream, yielding them one by one.

        Generator over ``stream`` (an iterable of text lines).  Each
        alignment consists of a metadata section delimited by
        "#=======..." lines, followed by blocks of sequence lines.  The
        metadata is collected in the ``annotations`` dictionary of the
        yielded ``Alignment``; match lines are concatenated and stored
        under ``column_annotations["emboss_consensus"]``.

        Reads ``self.align_format`` while parsing sequence lines.

        Nothing is yielded if ``stream`` is None or empty.

        Raises ValueError for unexpected lines, for unknown metadata keys,
        and if the number of sequences or the alignment length is missing
        from the metadata.  Internal consistency checks use ``assert``.
        """
        if stream is None:
            # End the generator with a plain return: raising StopIteration
            # inside a generator is converted to RuntimeError (PEP 479).
            return

        identifiers = None
        number_of_sequences = None
        annotations = {}
        for line in stream:
            line = line.rstrip("\r\n")
            if identifiers is None:
                # searching for alignment metadata start
                if not line:
                    continue
                elif line.startswith("#---------------------------------------"):
                    # may appear between alignments
                    continue
                elif line.startswith("#======================================="):
                    # found the alignment metadata start
                    identifiers = []
                    ncols = None
                    sequences = None
                else:
                    raise ValueError("Unexpected line: %s" % line)
            elif sequences is None:
                # parsing the alignment metadata
                if line == "#=======================================":
                    # reached the end of alignment metadata
                    if len(identifiers) == 0:
                        raise ValueError("Number of sequences missing!")
                    if ncols is None:
                        raise ValueError("Length of alignment missing!")
                    sequences = [""] * number_of_sequences
                    aligned_sequences = [""] * number_of_sequences
                    consensus = ""
                    starts = [0] * number_of_sequences
                    column = 0
                    index = 0
                    continue
                if line.strip() == "#":
                    continue
                if not line.startswith("# "):
                    # Format the message before constructing the exception;
                    # applying % to the exception object raises TypeError.
                    raise ValueError("Unexpected line: %s" % line)
                try:
                    key, value = line[2:].split(":", 1)
                except ValueError:
                    # An equal sign is used for Longest_Identity,
                    # Longest_Similarity, Shortest_Identity, and
                    # Shortest_Similarity, which are included if command line
                    # argument -nobrief was used.
                    key, value = line[2:].split(" = ", 1)
                if key == "Aligned_sequences":
                    number_of_sequences = int(value.strip())
                    assert len(identifiers) == 0
                    # Should now expect the record identifiers...
                    for i, line in enumerate(stream):
                        if not line.startswith("# "):
                            raise ValueError("Unexpected line: %s" % line)
                        # Split on the first colon only, so that an
                        # identifier may itself contain a colon.
                        number, identifier = line[2:].split(":", 1)
                        assert i + 1 == int(number)
                        identifiers.append(identifier.strip())
                        if len(identifiers) == number_of_sequences:
                            break
                elif key == "Matrix":
                    annotations["matrix"] = value.strip()
                elif key == "Gap_penalty":
                    annotations["gap_penalty"] = float(value.strip())
                elif key == "Extend_penalty":
                    annotations["extend_penalty"] = float(value.strip())
                elif key == "Length":
                    ncols = int(value.strip())
                elif key == "Identity":
                    # Only the count before the "/" is stored.
                    annotations["identity"] = int(value.strip().split("/")[0])
                elif key == "Similarity":
                    annotations["similarity"] = int(value.strip().split("/")[0])
                elif key == "Gaps":
                    annotations["gaps"] = int(value.strip().split("/")[0])
                elif key == "Score":
                    annotations["score"] = float(value.strip())
                # TODO:
                # The following are generated if the -nobrief command line
                # argument used. We could simply calculate them from the
                # alignment, but then we have to define what we mean by
                # "similar". For now, simply store them as an annotation.
                elif key == "Longest_Identity":
                    annotations["longest_identity"] = value.strip()
                elif key == "Longest_Similarity":
                    annotations["longest_similarity"] = value.strip()
                elif key == "Shortest_Identity":
                    annotations["shortest_identity"] = value.strip()
                elif key == "Shortest_Similarity":
                    annotations["shortest_similarity"] = value.strip()
                else:
                    raise ValueError("Failed to parse line '%s'" % line)
            else:
                # parse the sequences
                if not line:
                    # empty line
                    if index == number_of_sequences:
                        # reached the end of an alignment block
                        index = 0
                        if column == ncols:
                            # reached the end of the sequences
                            coordinates = Alignment.infer_coordinates(aligned_sequences)
                            records = []
                            n = len(sequences)
                            for i in range(n):
                                start = starts[i]
                                if start == 0:
                                    sequence = Seq(sequences[i])
                                else:
                                    coordinates[i, :] += start
                                    # create a partially defined sequence
                                    length = start + len(sequences[i])
                                    data = {start: sequences[i]}
                                    sequence = Seq(data, length=length)
                                record = SeqRecord(sequence, identifiers[i])
                                records.append(record)
                            alignment = Alignment(records, coordinates)
                            if annotations:
                                alignment.annotations = annotations
                            if consensus:
                                alignment.column_annotations = {
                                    "emboss_consensus": consensus
                                }
                            yield alignment
                            # Reset the state to search for the next
                            # alignment's metadata section.
                            identifiers = None
                            annotations = {}
                    continue
                # The first 21 characters hold the identifier and the start
                # position; they are blank on a match line.
                prefix = line[:21].strip()
                if prefix == "":
                    # match line
                    consensus += line[21:71]
                else:
                    identifier, start = prefix.split(None, 1)
                    assert identifiers[index].startswith(identifier)
                    aligned_sequence, end = line[21:].split(None, 1)
                    start = int(start) - 1  # Python counting
                    end = int(end)
                    length = len(sequences[index])
                    sequence = aligned_sequence.replace("-", "")
                    if length == 0 and len(sequence) > 0:
                        # Record the start
                        starts[index] = start
                    else:
                        if self.align_format == "srspair" and len(sequence) == 0:
                            start += 1
                        assert start == starts[index] + length
                    assert end == start + len(sequence)
                    sequences[index] += sequence
                    aligned_sequences[index] += aligned_sequence
                    if index == 0:
                        column += len(aligned_sequence)
                    else:
                        assert column == len(aligned_sequences[index])
                    index += 1
Ejemplo n.º 4
0
    def parse(self, stream):
        """Parse the alignments in the stream, yielding them one by one.

        Generator over ``stream`` (an iterable of text lines).  Each
        alignment consists of a metadata section delimited by
        "#=======..." lines, followed by blocks of sequence lines.  The
        recognized metadata values (matrix, gap_penalty, extend_penalty,
        identity, similarity, gaps, score) are stored as attributes on the
        yielded ``Alignment``; other metadata keys are ignored.  Match
        lines are concatenated and stored under
        ``column_annotations["emboss_consensus"]``.

        Nothing is yielded if ``stream`` is None or empty.

        Raises ValueError for unexpected lines and if the number of
        sequences or the alignment length is missing from the metadata.
        Internal consistency checks use ``assert``.
        """
        if stream is None:
            # End the generator with a plain return: raising StopIteration
            # inside a generator is converted to RuntimeError (PEP 479).
            return

        identifiers = None
        number_of_sequences = None
        for line in stream:
            line = line.rstrip("\r\n")
            if identifiers is None:
                # searching for alignment metadata start
                if not line:
                    continue
                elif line.startswith(
                        "#---------------------------------------"):
                    # may appear between alignments
                    continue
                elif line.startswith(
                        "#======================================="):
                    # found the alignment metadata start
                    identifiers = []
                    ncols = None
                    sequences = None
                    matrix = None
                    gap_penalty = None
                    extend_penalty = None
                    identity = None
                    similarity = None
                    gaps = None
                    score = None
                else:
                    raise ValueError("Unexpected line: %s" % line)
            elif sequences is None:
                # parsing the alignment metadata
                if line == "#=======================================":
                    # reached the end of alignment metadata
                    if len(identifiers) == 0:
                        raise ValueError("Number of sequences missing!")
                    if ncols is None:
                        raise ValueError("Length of alignment missing!")
                    sequences = [""] * number_of_sequences
                    aligned_sequences = [""] * number_of_sequences
                    consensus = ""
                    # A start of 0 means "not recorded yet"; the positions
                    # read from the file are stored as given and converted
                    # to Python counting when the alignment is created.
                    starts = [0] * number_of_sequences
                    ends = [0] * number_of_sequences
                    column = 0
                    index = 0
                    continue
                if line.strip() == "#":
                    continue
                if not line.startswith("# "):
                    # Format the message before constructing the exception;
                    # applying % to the exception object raises TypeError.
                    raise ValueError("Unexpected line: %s" % line)
                try:
                    key, value = line[2:].split(":", 1)
                except ValueError:
                    # Some metadata lines use " = " instead of a colon as
                    # the separator; accept them so that they are skipped
                    # like any other key that is not stored below.
                    key, value = line[2:].split(" = ", 1)
                if key == "Aligned_sequences":
                    number_of_sequences = int(value.strip())
                    assert len(identifiers) == 0
                    # Should now expect the record identifiers...
                    for i, line in enumerate(stream):
                        if not line.startswith("# "):
                            raise ValueError("Unexpected line: %s" % line)
                        # Split on the first colon only, so that an
                        # identifier may itself contain a colon.
                        number, identifier = line[2:].split(":", 1)
                        assert i + 1 == int(number)
                        identifiers.append(identifier.strip())
                        if len(identifiers) == number_of_sequences:
                            break
                elif key == "Matrix":
                    matrix = value.strip()
                elif key == "Gap_penalty":
                    gap_penalty = float(value.strip())
                elif key == "Extend_penalty":
                    extend_penalty = float(value.strip())
                elif key == "Length":
                    ncols = int(value.strip())
                elif key == "Identity":
                    # Only the count before the "/" is stored.
                    identity = int(value.strip().split("/")[0])
                elif key == "Similarity":
                    similarity = int(value.strip().split("/")[0])
                elif key == "Gaps":
                    gaps = int(value.strip().split("/")[0])
                elif key == "Score":
                    score = float(value.strip())
            else:
                # parse the sequences
                if not line:
                    # empty line
                    if index == number_of_sequences:
                        # reached the end of an alignment block
                        index = 0
                        if column == ncols:
                            # reached the end of the sequences
                            coordinates = Alignment.infer_coordinates(
                                aligned_sequences)
                            for i, start in enumerate(starts):
                                start -= 1  # Python counting
                                coordinates[i, :] += start
                            sequences = [
                                Seq(sequence) for sequence in sequences
                            ]
                            records = [
                                SeqRecord(sequence, id=identifier)
                                for sequence, identifier in zip(
                                    sequences, identifiers)
                            ]
                            alignment = Alignment(records, coordinates)
                            if matrix is not None:
                                alignment.matrix = matrix
                            if gap_penalty is not None:
                                alignment.gap_penalty = gap_penalty
                            if extend_penalty is not None:
                                alignment.extend_penalty = extend_penalty
                            if identity is not None:
                                alignment.identity = identity
                            if similarity is not None:
                                alignment.similarity = similarity
                            if gaps is not None:
                                alignment.gaps = gaps
                            if score is not None:
                                alignment.score = score
                            if consensus:
                                alignment.column_annotations = {
                                    "emboss_consensus": consensus
                                }
                            yield alignment
                            # Reset the state to search for the next
                            # alignment's metadata section.
                            identifiers = None
                    continue
                # The first 21 characters hold the identifier and the start
                # position; they are blank on a match line.
                prefix = line[:21].strip()
                if prefix == "":
                    # match line
                    consensus += line[21:71]
                else:
                    identifier, start = prefix.split(None, 1)
                    aligned_sequence, end = line[21:].split(None, 1)
                    start = int(start)
                    end = int(end)
                    sequence = aligned_sequence.replace("-", "")
                    if len(sequences[index]) > 0:
                        # Not the first fragment of this row: check that it
                        # continues where the previous fragment ended.
                        length = len(sequence)
                        if length == 0:
                            assert start == ends[index]
                            assert end == ends[index]
                        else:
                            assert start == ends[index] + 1
                            assert end == ends[index] + length
                    assert identifiers[index].startswith(identifier)
                    if starts[index] == 0:
                        # Record the start and end
                        starts[index] = start
                    ends[index] = end
                    sequences[index] += sequence
                    aligned_sequences[index] += aligned_sequence
                    if index == 0:
                        column += len(aligned_sequence)
                    else:
                        assert column == len(aligned_sequences[index])
                    index += 1