Example #1
0
    def parse(self, stream):
        """Read one alignment from the stream and yield it.

        Names and per-sequence fragments are obtained from
        ``self._read_file(stream)``; the fragments of each sequence are
        joined into one aligned row. The gap character "-" is stripped from
        the rows before they are stored in the records.

        Raises ValueError if the number of rows differs from
        ``self.number_of_seqs``, if any row's length differs from
        ``self.length_of_seqs``, or if a row contains a dot.
        """
        names, fragments = self._read_file(stream)
        aligned_rows = ["".join(pieces) for pieces in fragments]

        found = len(aligned_rows)
        if found != self.number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (found, self.number_of_seqs))
        for row in aligned_rows:
            row_length = len(row)
            if row_length != self.length_of_seqs:
                raise ValueError(
                    "Expected all sequences to have length %d; found %d" %
                    (self.length_of_seqs, row_length))
            if "." in row:
                raise ValueError(
                    "PHYLIP format no longer allows dots in sequence")

        # The coordinates are inferred from the gapped rows, so this must
        # happen before the gaps are removed.
        coordinates = Alignment.infer_coordinates(aligned_rows)
        ungapped_rows = [row.replace("-", "") for row in aligned_rows]
        records = []
        for name, ungapped in zip(names, ungapped_rows):
            records.append(SeqRecord(Seq(ungapped), id=name))
        yield Alignment(records, coordinates)
Example #2
0
    def parse(self, stream):
        """Yield the alignments found in the stream, one per block.

        The first description line is taken from ``self._line`` (which is
        deleted afterwards). Every later line starting with ">" opens a new
        sequence, a line starting with "=" closes the current alignment, and
        any other line is appended to the most recently opened sequence.

        Nothing is yielded if ``stream`` is None.

        Raises ValueError if a strand is neither "+" nor "-".
        """
        if stream is None:
            return

        first_line = self._line
        del self._line
        header = self._parse_description(first_line)
        # Unpacking checks that the description has the expected five fields.
        identifier, start, end, strand, comments = header
        headers = [header]
        rows = [""]

        for raw in stream:
            text = raw.strip()
            if text.startswith("="):
                # End of this alignment; more alignments may follow.
                coordinates = Alignment.infer_coordinates(rows)
                records = []
                for index, (header, row) in enumerate(zip(headers, rows)):
                    identifier, start, end, strand, comments = header
                    expected = end - start
                    ungapped = row.replace("-", "")
                    assert len(ungapped) == expected
                    if strand == "-":
                        ungapped = reverse_complement(ungapped, inplace=False)
                        coordinates[index, :] = (
                            len(ungapped) - coordinates[index, :])
                    elif strand != "+":
                        raise ValueError("Unexpected strand '%s'" % strand)
                    # Shift from row-local positions to the positions given
                    # in the description line.
                    coordinates[index] += start
                    if start == 0:
                        data = Seq(ungapped)
                    else:
                        # Only the stretch start..end of the sequence is known.
                        data = Seq({start: ungapped}, length=end)
                    records.append(
                        SeqRecord(data, id=identifier, description=comments))

                yield Alignment(records, coordinates)

                headers = []
                rows = []
            elif text.startswith(">"):
                header = self._parse_description(text)
                identifier, start, end, strand, comments = header
                headers.append(header)
                rows.append("")
            else:
                rows[-1] += text
Example #3
0
 def create_alignment(self, line):
     """Parse one line of FASTA output and return an Alignment object.

     The line is split on whitespace and must contain exactly 13 columns,
     read in this order: query id, target id, percentage identity,
     alignment length, mismatches, gap opens, query start, query end,
     target start, target end, e-value, bit score, and the alignment
     itself encoded as BTOP or CIGAR (selected by
     ``self._alignment_representation``).

     The returned alignment holds the target record first and the query
     record second; both sequences have a known length but undefined
     contents. The program, database, mismatches, e-value and bit score
     are stored in ``alignment.annotations``.

     Raises ValueError if ``self._alignment_representation`` is neither
     "BTOP" nor "CIGAR", or if a numeric column cannot be converted.
     Raises AssertionError if the column count, the query id, or the
     reported percentage identity is inconsistent.
     """
     columns = line.split()
     assert len(columns) == 13
     annotations = {}
     annotations["program"] = self._program
     annotations["database"] = self._database
     if self._query_id is not None:
         assert columns[0] == self._query_id
     query_id = columns[0]
     target_id = columns[1]
     percentage_identity = float(columns[2])
     alignment_length = int(columns[3])
     mismatches = int(columns[4])
     matches = alignment_length - mismatches
     # Cross-check the reported identity against the value implied by the
     # alignment length and mismatch count (allowing for rounding).
     difference = abs(100 * matches / alignment_length -
                      percentage_identity)
     assert difference < 0.015
     # Converted so that a non-integer value is rejected; not stored.
     gap_opens = int(columns[5])
     # Starts are converted from 1-based to 0-based positions.
     query_start = int(columns[6]) - 1
     query_end = int(columns[7])
     target_start = int(columns[8]) - 1
     target_end = int(columns[9])
     annotations["mismatches"] = mismatches
     annotations["evalue"] = float(columns[10])
     annotations["bit_score"] = float(columns[11])
     if self._alignment_representation == "BTOP":
         coordinates = self.parse_btop(columns[12])
     elif self._alignment_representation == "CIGAR":
         coordinates = self.parse_cigar(columns[12])
     else:
         # Without this branch `coordinates` would be unbound below.
         raise ValueError("Unknown alignment representation '%s'" %
                          self._alignment_representation)
     coordinates[0, :] += target_start
     query_size = self._query_size
     if query_start < query_end:
         coordinates[1, :] += query_start
     else:
         # mapped to reverse strand
         coordinates[1, :] = coordinates[1, ::-1]
         coordinates[1, :] += query_size - query_start - 1
     query_sequence = Seq(None, length=query_size)
     query = SeqRecord(query_sequence, id=query_id)
     if self._query_description is not None:
         query.description = self._query_description
     target_sequence = Seq(None, length=target_end)
     target = SeqRecord(target_sequence, id=target_id)
     records = [target, query]
     alignment = Alignment(records, coordinates)
     alignment.annotations = annotations
     return alignment
Example #4
0
    def parse(self, stream):
        """Parse the next alignment from the stream.

        The actual parsing is delegated to the Bio.Nexus module.

        You are expected to call this function via Bio.Align
        (and not use it directly).

        NOTE - Only ONE alignment matrix is expected per Nexus file, so
        this iterator yields at most one Alignment; it yields nothing if
        the parsed file has no matrix.
        """
        nexus = Nexus.Nexus(stream)
        if not nexus.matrix:
            # No alignment found
            return

        # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
        # The original and the modified names are kept in two parallel lists:
        original_names = nexus.unaltered_taxlabels
        matrix_names = nexus.taxlabels
        assert len(original_names) == len(matrix_names)

        # TODO - Can we extract any annotation too?
        molecule_types = {
            "dna": "DNA",
            "nucleotide": "DNA",
            "rna": "RNA",
            "protein": "protein",
        }
        molecule_type = molecule_types.get(nexus.datatype)
        if molecule_type is None:
            annotations = None
        else:
            # One dictionary, passed to every record.
            annotations = {"molecule_type": molecule_type}

        aligned_seqs = [str(nexus.matrix[key]) for key in matrix_names]
        records = []
        for original, key in zip(original_names, matrix_names):
            ungapped = nexus.matrix[key].replace("-", "")
            records.append(
                SeqRecord(ungapped, id=original, annotations=annotations))
        coordinates = Alignment.infer_coordinates(aligned_seqs)
        yield Alignment(records, coordinates)
def rename(align, first, second, splitchar="_"):
    """Yield a new alignment with shortened record names for each input one.

    Each record id is split on "_". When ``second`` is truthy, the new name
    joins the first three characters of fields ``first`` and ``second`` with
    ``splitchar``; the second piece is title-cased unless ``splitchar`` is
    "_". Otherwise the new name is field ``first`` unchanged.

    The record's ``id`` and ``name`` are overwritten in place before the
    record is appended to the new alignment.
    """
    for source in align:
        renamed = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for record in source:
            fields = record.id.split('_')
            if not second:
                label = fields[first]
            else:
                head = fields[first][0:3]
                tail = fields[second][0:3]
                if splitchar != "_":
                    tail = tail.title()
                label = splitchar.join([head, tail])
            record.id = record.name = label
            renamed.append(record)
        yield renamed
Example #6
0
 def create_alignment(
     records,
     aligned_sequences,
     strands,
     annotations,
     column_annotations,
     score,
 ):
     """Build an Alignment from the collected rows and optional metadata.

     Coordinates are inferred from ``aligned_sequences``. Rows whose strand
     is "-" are mirrored, and every row is shifted by the start of the
     first defined range of its record's sequence. ``annotations``,
     ``column_annotations`` and ``score`` are attached to the alignment
     only when they are not None.
     """
     coordinates = Alignment.infer_coordinates(aligned_sequences)
     for record, strand, row in zip(records, strands, coordinates):
         if strand == "-":
             # The right-hand side is evaluated in full before the row is
             # overwritten in place.
             row[:] = row[-1] - row[0] - row
         offset = record.seq.defined_ranges[0][0]
         row += offset
     alignment = Alignment(records, coordinates)
     optional = (
         ("annotations", annotations),
         ("column_annotations", column_annotations),
         ("score", score),
     )
     for attribute, value in optional:
         if value is not None:
             setattr(alignment, attribute, value)
     return alignment
Example #7
0
    def parse(self, stream):
        """Parse the next alignment from the stream.

        The first block of lines supplies the sequence identifiers and the
        first stretch of each aligned sequence; any further blocks are
        appended to the sequences in the same order. A line starting with a
        space is treated as the consensus line of the current block.

        Yields a single Alignment. If a consensus was found, it is stored
        in ``alignment.column_annotations["clustal_consensus"]``.

        Nothing is yielded if ``stream`` is None, or if the stream ends
        before the first block is terminated by a consensus or blank line.

        Raises ValueError if a sequence line does not have two or three
        fields, if its optional letter count is not an integer or does not
        match the sequence, or if the consensus length differs from the
        number of alignment columns.
        """
        if stream is None:
            # A plain return ends the generator; raising StopIteration here
            # would be converted into RuntimeError (PEP 479).
            return

        # If the alignment contains entries with the same sequence
        # identifier (not a good idea - but seems possible), then this
        # dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        aligned_seqs = []
        consensus = ""
        index = None  # Used to extract the consensus

        # Use the first block to get the sequence identifiers
        for line in stream:
            if line.startswith(" "):
                # Sequence consensus line...
                assert len(ids) > 0
                assert index is not None
                length = len(aligned_seq)  # noqa: F821
                consensus = line[index:index + length]
                break
            elif line.strip():
                # Sequences identifier...
                fields = line.split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                seqid, aligned_seq = fields[:2]
                ids.append(seqid)
                aligned_seqs.append(aligned_seq)
                seq = aligned_seq.replace("-", "")
                seqs.append(seq)

                # Record the sequence position to get the consensus
                if index is None:
                    index = line.find(aligned_seq, len(seqid))

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line) from None
                    if len(seq) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            else:
                # no consensus line
                if index:
                    break
        else:
            # The stream ended before the first block was complete.
            return

        assert index is not None

        # Confirm all same length
        length = len(aligned_seqs[0])
        for aligned_seq in aligned_seqs:
            assert len(aligned_seq) == length
        if consensus:
            assert len(consensus) == length

        n = len(seqs)
        i = 0
        # Loop over any remaining blocks...
        for line in stream:
            if line.startswith(" "):  # Sequence consensus line
                assert index is not None
                # aligned_seq is the last sequence stretch read in this block
                length = len(aligned_seq)
                consensus += line[index:index + length]
            elif not line.strip():  # Blank line
                continue
            else:
                seqid = ids[i]
                # Sequences identifier...
                fields = line.split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                assert seqid == fields[0]
                aligned_seq = fields[1]
                aligned_seqs[i] += aligned_seq
                seq = aligned_seq.replace("-", "")
                seqs[i] += seq

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line) from None
                    # Here the count is cumulative over all blocks so far.
                    if len(seqs[i]) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
                # Sequences appear in the same order in every block.
                i += 1
                if i == n:
                    i = 0

        records = [
            SeqRecord(Seq(seq), id=seqid, description=seqid)
            for (seqid, seq) in zip(ids, seqs)
        ]
        coordinates = Alignment.infer_coordinates(aligned_seqs)
        alignment = Alignment(records, coordinates)
        # TODO - Handle alignment annotation better, for now
        # mimic the old parser in Bio.Clustalw
        if consensus:
            rows, columns = alignment.shape
            if len(consensus) != columns:
                raise ValueError(
                    "Alignment has %i columns, consensus length is %i, '%s'" %
                    (columns, len(consensus), consensus))
            alignment.column_annotations = {}
            alignment.column_annotations["clustal_consensus"] = consensus
        yield alignment
Example #8
0
    def parse(self, stream):
        """Parse the next alignment from the stream.

        Each alignment starts with a metadata section delimited by lines
        beginning with "#=======================================", followed
        by blocks of sequence lines. An Alignment is yielded once the number
        of columns read equals the "Length" value from the metadata.

        Recognized metadata values are stored in ``alignment.annotations``;
        the match lines are stored in
        ``alignment.column_annotations["emboss_consensus"]``.

        Nothing is yielded if ``stream`` is None.

        Raises ValueError on an unexpected line, on an unrecognized
        metadata key, or if the number of sequences or the alignment length
        is missing from the metadata.
        """
        if stream is None:
            # A plain return ends the generator; raising StopIteration here
            # would be converted into RuntimeError (PEP 479).
            return

        identifiers = None
        number_of_sequences = None
        annotations = {}
        for line in stream:
            line = line.rstrip("\r\n")
            if identifiers is None:
                # searching for alignment metadata start
                if not line:
                    continue
                elif line.startswith("#---------------------------------------"):
                    # may appear between alignments
                    continue
                elif line.startswith("#======================================="):
                    # found the alignment metadata start
                    identifiers = []
                    ncols = None
                    sequences = None
                else:
                    raise ValueError("Unexpected line: %s" % line)
            elif sequences is None:
                # parsing the alignment metadata
                if line == "#=======================================":
                    # reached the end of alignment metadata
                    if len(identifiers) == 0:
                        raise ValueError("Number of sequences missing!")
                    if ncols is None:
                        raise ValueError("Length of alignment missing!")
                    sequences = [""] * number_of_sequences
                    aligned_sequences = [""] * number_of_sequences
                    consensus = ""
                    starts = [0] * number_of_sequences
                    column = 0
                    index = 0
                    continue
                if line.strip() == "#":
                    continue
                if not line.startswith("# "):
                    # The % must be applied to the message, not to the
                    # exception object.
                    raise ValueError("Unexpected line: %s" % line)
                try:
                    key, value = line[2:].split(":", 1)
                except ValueError:
                    # An equal sign is used for Longest_Identity,
                    # Longest_Similarity, Shortest_Identity, and
                    # Shortest_Similarity, which are included if command line
                    # argument -nobrief was used.
                    key, value = line[2:].split(" = ", 1)
                if key == "Aligned_sequences":
                    number_of_sequences = int(value.strip())
                    assert len(identifiers) == 0
                    # Should now expect the record identifiers...
                    # (this consumes lines from the same stream as the
                    # outer loop)
                    for i, line in enumerate(stream):
                        if not line.startswith("# "):
                            raise ValueError("Unexpected line: %s" % line)
                        number, identifier = line[2:].split(":")
                        assert i + 1 == int(number)
                        identifiers.append(identifier.strip())
                        if len(identifiers) == number_of_sequences:
                            break
                elif key == "Matrix":
                    annotations["matrix"] = value.strip()
                elif key == "Gap_penalty":
                    annotations["gap_penalty"] = float(value.strip())
                elif key == "Extend_penalty":
                    annotations["extend_penalty"] = float(value.strip())
                elif key == "Length":
                    ncols = int(value.strip())
                elif key == "Identity":
                    annotations["identity"] = int(value.strip().split("/")[0])
                elif key == "Similarity":
                    annotations["similarity"] = int(value.strip().split("/")[0])
                elif key == "Gaps":
                    annotations["gaps"] = int(value.strip().split("/")[0])
                elif key == "Score":
                    annotations["score"] = float(value.strip())
                # TODO:
                # The following are generated if the -nobrief command line
                # argument used. We could simply calculate them from the
                # alignment, but then we have to define what we mean by
                # "similar". For now, simply store them as an annotation.
                elif key == "Longest_Identity":
                    annotations["longest_identity"] = value.strip()
                elif key == "Longest_Similarity":
                    annotations["longest_similarity"] = value.strip()
                elif key == "Shortest_Identity":
                    annotations["shortest_identity"] = value.strip()
                elif key == "Shortest_Similarity":
                    annotations["shortest_similarity"] = value.strip()
                else:
                    raise ValueError("Failed to parse line '%s'" % line)
            else:
                # parse the sequences
                if not line:
                    # empty line
                    if index == number_of_sequences:
                        # reached the end of an alignment block
                        index = 0
                        if column == ncols:
                            # reached the end of the sequences
                            coordinates = Alignment.infer_coordinates(aligned_sequences)
                            records = []
                            n = len(sequences)
                            for i in range(n):
                                start = starts[i]
                                if start == 0:
                                    sequence = Seq(sequences[i])
                                else:
                                    coordinates[i, :] += start
                                    # create a partially defined sequence
                                    length = start + len(sequences[i])
                                    data = {start: sequences[i]}
                                    sequence = Seq(data, length=length)
                                record = SeqRecord(sequence, identifiers[i])
                                records.append(record)
                            alignment = Alignment(records, coordinates)
                            if annotations:
                                alignment.annotations = annotations
                            if consensus:
                                alignment.column_annotations = {
                                    "emboss_consensus": consensus
                                }
                            yield alignment
                            # Reset so that the next alignment's metadata
                            # is searched for.
                            identifiers = None
                            annotations = {}
                    continue
                # The first 21 characters hold the identifier and start
                # position, or are blank on a match line.
                prefix = line[:21].strip()
                if prefix == "":
                    # match line
                    consensus += line[21:71]
                else:
                    identifier, start = prefix.split(None, 1)
                    assert identifiers[index].startswith(identifier)
                    aligned_sequence, end = line[21:].split(None, 1)
                    start = int(start) - 1  # Python counting
                    end = int(end)
                    length = len(sequences[index])
                    sequence = aligned_sequence.replace("-", "")
                    if length == 0 and len(sequence) > 0:
                        # Record the start
                        starts[index] = start
                    else:
                        if self.align_format == "srspair" and len(sequence) == 0:
                            start += 1
                        assert start == starts[index] + length
                    assert end == start + len(sequence)
                    sequences[index] += sequence
                    aligned_sequences[index] += aligned_sequence
                    if index == 0:
                        column += len(aligned_sequence)
                    else:
                        assert column == len(aligned_sequences[index])
                    index += 1
Example #9
0
    def parse(self, stream):
        """Parse the next alignment from the stream.

        Each alignment starts with a metadata section delimited by lines
        beginning with "#=======================================", followed
        by blocks of sequence lines. An Alignment is yielded once the number
        of columns read equals the "Length" value from the metadata.

        Recognized metadata values (matrix, gap_penalty, extend_penalty,
        identity, similarity, gaps, score) are set as attributes on the
        alignment; unrecognized keys are ignored. The match lines are stored
        in ``alignment.column_annotations["emboss_consensus"]``.

        Nothing is yielded if ``stream`` is None.

        Raises ValueError on an unexpected line, or if the number of
        sequences or the alignment length is missing from the metadata.
        """
        if stream is None:
            # A plain return ends the generator; raising StopIteration here
            # would be converted into RuntimeError (PEP 479).
            return

        identifiers = None
        number_of_sequences = None
        for line in stream:
            line = line.rstrip("\r\n")
            if identifiers is None:
                # searching for alignment metadata start
                if not line:
                    continue
                elif line.startswith(
                        "#---------------------------------------"):
                    # may appear between alignments
                    continue
                elif line.startswith(
                        "#======================================="):
                    # found the alignment metadata start
                    identifiers = []
                    ncols = None
                    sequences = None
                    matrix = None
                    gap_penalty = None
                    extend_penalty = None
                    identity = None
                    similarity = None
                    gaps = None
                    score = None
                else:
                    raise ValueError("Unexpected line: %s" % line)
            elif sequences is None:
                # parsing the alignment metadata
                if line == "#=======================================":
                    # reached the end of alignment metadata
                    if len(identifiers) == 0:
                        raise ValueError("Number of sequences missing!")
                    if ncols is None:
                        raise ValueError("Length of alignment missing!")
                    sequences = [""] * number_of_sequences
                    aligned_sequences = [""] * number_of_sequences
                    consensus = ""
                    starts = [0] * number_of_sequences
                    ends = [0] * number_of_sequences
                    column = 0
                    index = 0
                    continue
                if line.strip() == "#":
                    continue
                if not line.startswith("# "):
                    # The % must be applied to the message, not to the
                    # exception object.
                    raise ValueError("Unexpected line: %s" % line)
                try:
                    key, value = line[2:].split(":", 1)
                except ValueError:
                    # Some metadata lines use "key = value" rather than
                    # "key: value"; none of those keys are stored here.
                    key, value = line[2:].split(" = ", 1)
                if key == "Aligned_sequences":
                    number_of_sequences = int(value.strip())
                    assert len(identifiers) == 0
                    # Should now expect the record identifiers...
                    # (this consumes lines from the same stream as the
                    # outer loop)
                    for i, line in enumerate(stream):
                        if not line.startswith("# "):
                            raise ValueError("Unexpected line: %s" % line)
                        number, identifier = line[2:].split(":")
                        assert i + 1 == int(number)
                        identifiers.append(identifier.strip())
                        if len(identifiers) == number_of_sequences:
                            break
                elif key == "Matrix":
                    matrix = value.strip()
                elif key == "Gap_penalty":
                    gap_penalty = float(value.strip())
                elif key == "Extend_penalty":
                    extend_penalty = float(value.strip())
                elif key == "Length":
                    ncols = int(value.strip())
                elif key == "Identity":
                    identity = int(value.strip().split("/")[0])
                elif key == "Similarity":
                    similarity = int(value.strip().split("/")[0])
                elif key == "Gaps":
                    gaps = int(value.strip().split("/")[0])
                elif key == "Score":
                    score = float(value.strip())
            else:
                # parse the sequences
                if not line:
                    # empty line
                    if index == number_of_sequences:
                        # reached the end of an alignment block
                        index = 0
                        if column == ncols:
                            # reached the end of the sequences
                            coordinates = Alignment.infer_coordinates(
                                aligned_sequences)
                            for i, start in enumerate(starts):
                                start -= 1  # Python counting
                                coordinates[i, :] += start
                            sequences = [
                                Seq(sequence) for sequence in sequences
                            ]
                            records = [
                                SeqRecord(sequence, id=identifier)
                                for sequence, identifier in zip(
                                    sequences, identifiers)
                            ]
                            alignment = Alignment(records, coordinates)
                            if matrix is not None:
                                alignment.matrix = matrix
                            if gap_penalty is not None:
                                alignment.gap_penalty = gap_penalty
                            if extend_penalty is not None:
                                alignment.extend_penalty = extend_penalty
                            if identity is not None:
                                alignment.identity = identity
                            if similarity is not None:
                                alignment.similarity = similarity
                            if gaps is not None:
                                alignment.gaps = gaps
                            if score is not None:
                                alignment.score = score
                            if consensus:
                                alignment.column_annotations = {
                                    "emboss_consensus": consensus
                                }
                            yield alignment
                            # Reset so that the next alignment's metadata
                            # is searched for.
                            identifiers = None
                    continue
                # The first 21 characters hold the identifier and start
                # position, or are blank on a match line.
                prefix = line[:21].strip()
                if prefix == "":
                    # match line
                    consensus += line[21:71]
                else:
                    identifier, start = prefix.split(None, 1)
                    aligned_sequence, end = line[21:].split(None, 1)
                    start = int(start)
                    end = int(end)
                    sequence = aligned_sequence.replace("-", "")
                    if len(sequences[index]) > 0:
                        # Check continuity with the previous block.
                        length = len(sequence)
                        if length == 0:
                            assert start == ends[index]
                            assert end == ends[index]
                        else:
                            assert start == ends[index] + 1
                            assert end == ends[index] + length
                    assert identifiers[index].startswith(identifier)
                    if starts[index] == 0:
                        # Record the start and end
                        starts[index] = start
                    ends[index] = end
                    sequences[index] += sequence
                    aligned_sequences[index] += aligned_sequence
                    if index == 0:
                        column += len(aligned_sequence)
                    else:
                        assert column == len(aligned_sequences[index])
                    index += 1
Example #10
0
    def parse(self, stream):
        """Parse the next alignment from the stream."""
        if stream is None:
            raise StopIteration

        try:
            line = next(stream)
        except StopIteration:
            raise ValueError("Empty file.") from None
        # Whitelisted headers we know about.
        known_headers = [
            "!!NA_MULTIPLE_ALIGNMENT", "!!AA_MULTIPLE_ALIGNMENT", "PileUp"
        ]
        # Examples in "Molecular Biology Software Training Manual GCG version 10"
        # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
        # would often start as follows:
        #
        # !!AA_MUTIPLE_ALIGNMENT 1.0
        # PileUp of: @/usr/users2/culhane/...
        #
        # etc with other seemingly free format text before getting to the
        # MSF/Type/Check line and the following Name: lines block and // line.
        #
        # MUSCLE just has a line "PileUp", while other sources just use the line
        # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
        # (nucleotide).
        if line.strip().split()[0] not in known_headers:
            raise ValueError(
                "%s is not a known GCG MSF header: %s" %
                (line.strip().split()[0], ", ".join(known_headers)))

        for line in stream:
            line = line.rstrip("\n")
            if "MSF: " in line and line.endswith(".."):
                break
        else:
            raise ValueError(
                "Reached end of file without MSF/Type/Check header line")

        # Quoting from "Molecular Biology Software Training Manual GCG version 10"
        # by BBSRC Bioscuences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
        # Page 31:
        #
        # "Header information is before a .. (double dot) in a GCG format file.
        #  The file will also have a checksum specific for that file."
        #
        # This was followed by a single non-aligned sequence, but this convention
        # appears to also be used in the GCG MSF files. Quoting other examples in
        # this reference, page 31:
        #
        # localpileup_17.msf  MSF: 195  Type: P  January 6, 2000 15:41  Check: 4365 ..
        #
        # Except from page 148:
        #
        # localpileup_106.msf  MSF: 457  Type: P  November 28, 2000 16:09  Check: 2396 ..
        #
        # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
        #
        #   MSF: 689  Type: N  Check: 0000  ..
        #
        # By observation, the MSF value is the column count, type is N (nucleotide)
        # or P (protein / amino acid).
        #
        # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
        #
        # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
        # !!NA_MULTIPLE_ALIGNMENT 1.0
        #
        #   stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
        #
        #   Name: G26680     Len: 633  Check: 4334 Weight: 1.00
        #   Name: G26685     Len: 633  Check: 3818 Weight: 1.00
        #   Name: G29385     Len: 633  Check:  391 Weight: 1.00
        #
        # //
        #
        parts = line.split()
        offset = parts.index("MSF:")
        if parts[offset + 2] != "Type:" or parts[-3] not in ("Check:",
                                                             "CompCheck:"):
            raise ValueError(
                "GCG MSF header line should be "
                "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
                " not: %r" % line)
        try:
            aln_length = int(parts[offset + 1])
        except ValueError:
            raise ValueError(
                "GCG MSF header line should have MSF: <int> for column count, not %r"
                % parts[offset + 1]) from None
        seq_type = parts[offset + 3]
        if seq_type not in ["P", "N"]:
            raise ValueError(
                "GCG MSF header line should have 'Type: P' (protein) "
                "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type)

        # There should be a blank line after that header line, then the Name: lines
        #
        # The Name may be followed by 'oo', as shown here:
        #
        # PileUp
        #
        #
        #
        #    MSF:  628  Type: P    Check:   147   ..
        #
        #  Name: AK1H_ECOLI/1-378 oo  Len:  628  Check:  3643  Weight:  1.000
        #  Name: AKH_HAEIN/1-382 oo  Len:  628  Check:  6504  Weight:  1.000
        #
        # //
        names = []
        remaining = []
        checks = []
        weights = []
        for line in stream:
            line = line.strip()
            if line == "//":
                break
            if line.startswith("Name: "):
                words = line.split()
                try:
                    index_name = words.index("Name:")
                    index_len = words.index("Len:")
                    index_weight = words.index("Weight:")
                    index_check = words.index("Check:")
                except ValueError:
                    raise ValueError(
                        f"Malformed GCG MSF name line: {line!r}") from None
                name = words[index_name + 1]
                length = int(words[index_len + 1])
                weight = float(words[index_weight + 1])
                check = words[index_check + 1]
                if name in names:
                    raise ValueError(f"Duplicated ID of {name!r}")
                names.append(name)
                remaining.append(length)
                checks.append(check)
                weights.append(weight)
        else:
            raise ValueError(
                "End of file while looking for end of header // line.")

        try:
            line = next(stream)
        except StopIteration:
            raise ValueError(
                "End of file after // line, expected sequences.") from None
        if line.strip():
            raise ValueError(
                "After // line, expected blank line before sequences.")

        # Now load the sequences
        seqs = [""] * len(names)
        for line in stream:
            words = line.split()
            if not words:
                continue
            name = words[0]
            try:
                index = names.index(name)
            except ValueError:
                # This may be a coordinate line
                for word in words:
                    if not word.isdigit():
                        break
                else:
                    # all words are integers; assume this is a coordinate line
                    continue
                raise ValueError(
                    f"Unexpected line '{line}' in input") from None
            seq = "".join(words[1:])
            length = remaining[index] - (len(seq) - seq.count("-"))
            if length < 0:
                raise ValueError(
                    "Received longer sequence than expected for %s" % name)
            seqs[index] += seq
            remaining[index] = length
            if all(length == 0 for length in remaining):
                break
        else:
            raise ValueError("End of file where expecting sequence data.")

        length = max(len(seq) for seq in seqs)
        if length != aln_length:
            warnings.warn(
                "GCG MSF headers said alignment length %i, but found %i" %
                (aln_length, length),
                BiopythonParserWarning,
            )
            aln_length = length

        # Combine list of strings into single string, remap gaps
        for index, seq in enumerate(seqs):
            seq = "".join(seq).replace("~", "-").replace(".", "-")
            if len(seq) < aln_length:
                seq += "-" * (aln_length - len(seq))
            seqs[index] = seq

        coordinates = Alignment.infer_coordinates(seqs)
        seqs = (Seq(seq.replace("-", "")) for seq in seqs)
        records = [
            SeqRecord(
                seq,
                id=name,
                name=name,
                description=name,
                annotations={"weight": weight},
            ) for (name, seq, weight) in zip(names, seqs, weights)
        ]

        alignment = Alignment(records, coordinates)
        # This will check alignment lengths are self-consistent:
        rows, columns = alignment.shape
        if columns != aln_length:
            raise ValueError(
                "GCG MSF headers said alignment length %i, but found %i" %
                (aln_length, columns))
        yield alignment
Example #11
0
    def parse(self, stream):
        """Parse the next alignment from the stream.

        Every line is split on whitespace into exactly 21 columns and turned
        into one two-row Alignment (target record first, query record second).
        NOTE(review): the column layout looks like the UCSC PSL format --
        confirm against the enclosing class.

        Yields one Alignment per line, with the attributes ``matches``,
        ``misMatches``, ``repMatches`` and ``nCount`` taken from columns 0-3.
        Nothing is yielded if ``stream`` is None.

        Raises ValueError if a line does not have 21 columns, or if the block
        counts, insert counts or start/end columns disagree with the
        coordinates computed from the block lists.
        """
        if stream is None:
            # End the generator cleanly.  Raising StopIteration inside a
            # generator body is converted to RuntimeError (PEP 479).
            return

        # A line stored on self.line (if any) is consumed first, ahead of the
        # remaining lines of the stream.
        line = self.line
        self.line = None
        if line is not None:
            lines = chain([line], stream)
        else:
            lines = stream
        for line in lines:
            words = line.split()
            if len(words) != 21:
                raise ValueError("line has %d columns; expected 21" %
                                 len(words))
            strand = words[8]
            qName = words[9]
            qSize = int(words[10])
            tName = words[13]
            tSize = int(words[14])
            blockCount = int(words[17])
            # The three block columns are comma-separated lists that may end
            # with a trailing comma.
            blockSizes = [
                int(blockSize)
                for blockSize in words[18].rstrip(",").split(",")
            ]
            qStarts = [
                int(start) for start in words[19].rstrip(",").split(",")
            ]
            tStarts = [
                int(start) for start in words[20].rstrip(",").split(",")
            ]
            if len(blockSizes) != blockCount:
                raise ValueError(
                    "Inconsistent number of blocks (%d found, expected %d)" %
                    (len(blockSizes), blockCount))
            if len(qStarts) != blockCount:
                raise ValueError(
                    "Inconsistent number of query start positions (%d found, expected %d)"
                    % (len(qStarts), blockCount))
            if len(tStarts) != blockCount:
                raise ValueError(
                    "Inconsistent number of target start positions (%d found, expected %d)"
                    % (len(tStarts), blockCount))
            # Only the sequence lengths are known, not the sequence contents.
            target_sequence = Seq(None, length=tSize)
            target_record = SeqRecord(target_sequence, id=tName)
            query_sequence = Seq(None, length=qSize)
            query_record = SeqRecord(query_sequence, id=qName)
            records = [target_record, query_record]
            qBlockSizes = numpy.array(blockSizes)
            qStarts = numpy.array(qStarts)
            tStarts = numpy.array(tStarts)
            if strand in ("++", "+-"):
                # protein sequence aligned against translated DNA sequence
                tBlockSizes = 3 * qBlockSizes
            else:
                tBlockSizes = qBlockSizes
            # Walk the blocks.  Whenever a block starts away from the running
            # target or query position, add a coordinate column for that jump
            # before adding the column for the end of the block itself.
            qPosition = qStarts[0]
            tPosition = tStarts[0]
            coordinates = [[tPosition, qPosition]]
            for tBlockSize, qBlockSize, tStart, qStart in zip(
                    tBlockSizes, qBlockSizes, tStarts, qStarts):
                if tStart != tPosition:
                    coordinates.append([tStart, qPosition])
                    tPosition = tStart
                if qStart != qPosition:
                    coordinates.append([tPosition, qStart])
                    qPosition = qStart
                tPosition += tBlockSize
                qPosition += qBlockSize
                coordinates.append([tPosition, qPosition])
            coordinates = numpy.array(coordinates).transpose()
            # Recount the inserts from the coordinates so they can be checked
            # against columns 4-7.  A step that advances only one sequence is
            # counted as an insert unless it touches either end of that
            # sequence.
            qNumInsert = 0
            qBaseInsert = 0
            tNumInsert = 0
            tBaseInsert = 0
            tStart, qStart = coordinates[:, 0]
            for tEnd, qEnd in coordinates[:, 1:].transpose():
                tCount = tEnd - tStart
                qCount = qEnd - qStart
                if tCount == 0:
                    if qStart > 0 and qEnd < qSize:
                        qNumInsert += 1
                        qBaseInsert += qCount
                    qStart = qEnd
                elif qCount == 0:
                    if tStart > 0 and tEnd < tSize:
                        tNumInsert += 1
                        tBaseInsert += tCount
                    tStart = tEnd
                else:
                    tStart = tEnd
                    qStart = qEnd
            if qNumInsert != int(words[4]):
                raise ValueError(
                    "Inconsistent qNumInsert found (%s, expected %d)" %
                    (words[4], qNumInsert))
            if qBaseInsert != int(words[5]):
                raise ValueError(
                    "Inconsistent qBaseInsert found (%s, expected %d)" %
                    (words[5], qBaseInsert))
            if tNumInsert != int(words[6]):
                raise ValueError(
                    "Inconsistent tNumInsert found (%s, expected %d)" %
                    (words[6], tNumInsert))
            if tBaseInsert != int(words[7]):
                raise ValueError(
                    "Inconsistent tBaseInsert found (%s, expected %d)" %
                    (words[7], tBaseInsert))
            qStart = int(words[11])
            qEnd = int(words[12])
            tStart = int(words[15])
            tEnd = int(words[16])
            # For these strands the stored coordinates are mirrored
            # (size - position) and the start/end columns are swapped to
            # match before the consistency checks below.
            if strand == "-":
                qStart, qEnd = qEnd, qStart
                coordinates[1, :] = qSize - coordinates[1, :]
            elif strand == "+-":
                tStart, tEnd = tEnd, tStart
                coordinates[0, :] = tSize - coordinates[0, :]
            if tStart != coordinates[0, 0]:
                raise ValueError(
                    "Inconsistent tStart found (%d, expected %d)" %
                    (tStart, coordinates[0, 0]))
            if tEnd != coordinates[0, -1]:
                raise ValueError("Inconsistent tEnd found (%d, expected %d)" %
                                 (tEnd, coordinates[0, -1]))
            if qStart != coordinates[1, 0]:
                raise ValueError(
                    "Inconsistent qStart found (%d, expected %d)" %
                    (qStart, coordinates[1, 0]))
            if qEnd != coordinates[1, -1]:
                raise ValueError("Inconsistent qEnd found (%d, expected %d)" %
                                 (qEnd, coordinates[1, -1]))
            alignment = Alignment(records, coordinates)
            alignment.matches = int(words[0])
            alignment.misMatches = int(words[1])
            alignment.repMatches = int(words[2])
            alignment.nCount = int(words[3])
            yield alignment
Example #12
0
    def parse(self, stream):
        """Parse the next alignment from the stream.

        Lines are stripped and dispatched on their prefix:

        - ``# STOCKHOLM 1.0`` starts a new alignment and resets all state;
        - ``//`` ends the alignment, which is then built and yielded;
        - lines not starting with ``#`` are ``<seqname> <sequence>`` rows;
        - ``#=GF``, ``#=GC``, ``#=GS`` and ``#=GR`` lines are collected as
          per-file, per-column, per-sequence and per-sequence-per-column
          annotations respectively.

        Yields one Alignment for every ``//`` line.  Nothing is yielded if
        ``stream`` is None.

        Raises ValueError if a sequence line cannot be split into a name and
        a sequence, or if aligned sequences differ in length.

        NOTE(review): several format checks below use ``assert`` and are
        therefore skipped when Python runs with -O; consider raising
        ValueError instead if callers do not depend on AssertionError.
        """
        if stream is None:
            # End the generator cleanly.  Raising StopIteration inside a
            # generator body is converted to RuntimeError (PEP 479).
            return

        for line in stream:
            line = line.strip()
            if not line:
                continue
            elif line == "# STOCKHOLM 1.0":
                # Starting a new alignment
                records = []
                aligned_sequences = []
                references = []
                reference_comments = []
                database_references = []
                nested_domains = []
                gf = defaultdict(list)
                gc = {}
                gs = defaultdict(lambda: {"DR": []})
                gr = defaultdict(dict)
                length = None
            elif line == "//":
                # Reached the end of the alignment.
                skipped_columns = []
                coordinates = Alignment.infer_coordinates(
                    aligned_sequences, skipped_columns
                )
                skipped_columns = set(skipped_columns)
                alignment = Alignment(records, coordinates)
                alignment.annotations = {}
                if references:
                    alignment.annotations["references"] = []
                    for reference in references:
                        # NOTE(review): dict() drops the defaultdict
                        # behaviour, so a reference that never received an
                        # RT, RA or RL line raises KeyError here.
                        reference = dict(reference)
                        reference["title"] = " ".join(reference["title"])
                        reference["author"] = " ".join(reference["author"])
                        reference["location"] = " ".join(reference["location"])
                        alignment.annotations["references"].append(reference)
                if database_references:
                    alignment.annotations["database references"] = database_references
                if nested_domains:
                    alignment.annotations["nested domains"] = nested_domains
                rows, columns = alignment.shape
                AlignmentIterator._store_per_file_annotations(alignment, gf, rows)
                AlignmentIterator._store_per_column_annotations(
                    alignment, gc, columns, skipped_columns
                )
                AlignmentIterator._store_per_sequence_annotations(alignment, gs)
                AlignmentIterator._store_per_sequence_and_per_column_annotations(
                    alignment, gr
                )
                yield alignment
            elif not line.startswith("#"):
                # Sequence
                # Format: "<seqname> <sequence>"
                try:
                    seqname, aligned_sequence = line.split(None, 1)
                except ValueError:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError(
                        "Could not split line into sequence name and aligned sequence:\n"
                        + line
                    ) from None
                # The first sequence row fixes the expected row length.
                if length is None:
                    length = len(aligned_sequence)
                elif length != len(aligned_sequence):
                    raise ValueError(
                        f"Aligned sequence {seqname} consists of {len(aligned_sequence)} letters, expected {length} letters)"
                    )
                # Both "." and "-" are treated as gap characters.
                aligned_sequence = aligned_sequence.replace(".", "-")
                sequence = aligned_sequence.replace("-", "")
                aligned_sequences.append(aligned_sequence)
                seq = Seq(sequence)
                record = SeqRecord(seq, id=seqname)
                records.append(record)
            elif line.startswith("#=GF "):
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                if feature == "RN":
                    # "RN [n]" opens a new reference; the following RM, RT,
                    # RA and RL lines are attached to it.  RC comment lines
                    # seen before this RN are attached here.
                    assert text.startswith("[")
                    assert text.endswith("]")
                    number = int(text[1:-1])
                    reference = defaultdict(list)
                    reference["number"] = number
                    if reference_comments:
                        reference["comment"] = " ".join(reference_comments)
                        reference_comments = []
                    references.append(reference)
                elif feature == "RM":
                    assert not reference["medline"]
                    reference["medline"] = text
                elif feature == "RT":
                    reference["title"].append(text)
                elif feature == "RA":
                    reference["author"].append(text)
                elif feature == "RL":
                    reference["location"].append(text)
                elif feature == "RC":
                    reference_comments.append(text)
                elif feature == "DR":
                    database_reference = {"reference": text}
                    database_references.append(database_reference)
                elif feature == "DC":
                    # Comment on the most recent DR entry.
                    assert "comment" not in database_reference
                    database_reference["comment"] = text
                elif feature == "NE":
                    nested_domain = {"accession": text}
                    nested_domains.append(nested_domain)
                elif feature == "NL":
                    # Location of the most recent NE entry.
                    assert "location" not in nested_domain
                    nested_domain["location"] = text
                else:
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    gf[feature].append(text)
            elif line.startswith("#=GC "):
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                feature, text = line[5:].strip().split(None, 2)
                if feature not in gc:
                    gc[feature] = ""
                gc[feature] += text.strip()  # append to any previous entry
                # Might be interleaved blocks, so can't check length yet
            elif line.startswith("#=GS "):
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                try:
                    seqname, feature, text = line[5:].strip().split(None, 2)
                except ValueError:
                    # Free text can sometimes be empty, which a one line split throws an error for.
                    # See https://github.com/biopython/biopython/issues/2982 for more details
                    seqname, feature = line[5:].strip().split(None, 1)
                    text = ""
                if feature == "DR":
                    gs[seqname][feature].append(text)
                else:
                    assert feature not in gs[seqname]
                    gs[seqname][feature] = text
            elif line[:5] == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                terms = line[5:].split(None, 2)
                # The markup must refer to the most recently named sequence.
                assert terms[0] == seqname
                feature = terms[1]
                gr[seqname][feature] = terms[2].strip()
Example #13
0
    def parse(self, stream):
        """Parse the next alignment from the stream.

        Every line is split on whitespace into 3 to 12 columns and turned
        into one two-row Alignment (target record first, query record
        second).  The columns are read, in order, as chrom, chromStart,
        chromEnd, name, score, strand, thickStart, thickEnd, itemRgb,
        blockCount, blockSizes and blockStarts.
        NOTE(review): this matches the UCSC BED column order -- confirm
        against the enclosing class.

        Yields one Alignment per line.  The optional attributes ``score``,
        ``thickStart``, ``thickEnd`` and ``itemRgb`` are set only when the
        line has enough columns.  Nothing is yielded if ``stream`` is None.

        Raises ValueError if the number of columns is outside 3-12, if the
        block lists disagree with blockCount, or if chromStart/chromEnd
        disagree with the computed coordinates.
        """
        if stream is None:
            # End the generator cleanly.  Raising StopIteration inside a
            # generator body is converted to RuntimeError (PEP 479).
            return

        for line in stream:
            words = line.split()
            bedN = len(words)
            if bedN < 3 or bedN > 12:
                raise ValueError("expected between 3 and 12 columns, found %d" % bedN)
            chrom = words[0]
            chromStart = int(words[1])
            chromEnd = int(words[2])
            if bedN > 3:
                name = words[3]
            else:
                name = None
            if bedN > 5:
                strand = words[5]
            else:
                strand = "+"
            if bedN > 9:
                blockCount = int(words[9])
                # Comma-separated lists that may end with a trailing comma.
                blockSizes = [
                    int(blockSize) for blockSize in words[10].rstrip(",").split(",")
                ]
                blockStarts = [
                    int(blockStart) for blockStart in words[11].rstrip(",").split(",")
                ]
                if len(blockSizes) != blockCount:
                    raise ValueError(
                        "Inconsistent number of block sizes (%d found, expected %d)"
                        % (len(blockSizes), blockCount)
                    )
                if len(blockStarts) != blockCount:
                    raise ValueError(
                        "Inconsistent number of block start positions (%d found, expected %d)"
                        % (len(blockStarts), blockCount)
                    )
                blockSizes = numpy.array(blockSizes)
                blockStarts = numpy.array(blockStarts)
                # Block starts are relative to chromStart (added below).  A
                # block that starts away from the running target position
                # adds a column where only the target advances.
                tPosition = 0
                qPosition = 0
                coordinates = [[tPosition, qPosition]]
                for blockSize, blockStart in zip(blockSizes, blockStarts):
                    if blockStart != tPosition:
                        coordinates.append([blockStart, qPosition])
                        tPosition = blockStart
                    tPosition += blockSize
                    qPosition += blockSize
                    coordinates.append([tPosition, qPosition])
                coordinates = numpy.array(coordinates).transpose()
                qSize = sum(blockSizes)
            else:
                # No block columns: a single ungapped block covers the range.
                blockSize = chromEnd - chromStart
                coordinates = numpy.array([[0, blockSize], [0, blockSize]])
                qSize = blockSize
            # Shift the target row from block-relative to absolute positions.
            coordinates[0, :] += chromStart
            query_sequence = Seq(None, length=qSize)
            query_record = SeqRecord(query_sequence, id=name)
            target_record = SeqRecord(None, id=chrom)
            records = [target_record, query_record]
            if strand == "-":
                # Mirror the query coordinates (size - position).
                coordinates[1, :] = qSize - coordinates[1, :]
            if chromStart != coordinates[0, 0]:
                raise ValueError(
                    "Inconsistent chromStart found (%d, expected %d)"
                    % (chromStart, coordinates[0, 0])
                )
            if chromEnd != coordinates[0, -1]:
                raise ValueError(
                    "Inconsistent chromEnd found (%d, expected %d)"
                    % (chromEnd, coordinates[0, -1])
                )
            alignment = Alignment(records, coordinates)
            # From here on, each optional column is attached only if present;
            # the alignment is yielded as soon as the columns run out.
            if bedN <= 4:
                yield alignment
                continue
            score = words[4]
            # Keep the score as int when it is integral, float when numeric,
            # and the raw string otherwise.
            try:
                score = float(score)
            except ValueError:
                pass
            else:
                if score.is_integer():
                    score = int(score)
            alignment.score = score
            if bedN <= 6:
                yield alignment
                continue
            alignment.thickStart = int(words[6])
            if bedN <= 7:
                yield alignment
                continue
            alignment.thickEnd = int(words[7])
            if bedN <= 8:
                yield alignment
                continue
            alignment.itemRgb = words[8]
            yield alignment
def rename_alignment_taxa(aln, name_map):
    """Build a new alignment holding the records of aln under new names.

    Each record's ``id`` and ``name`` are both set to ``name_map[record.id]``
    (looked up with the record's current id).  The records of ``aln`` are
    modified in place and then appended to the returned alignment.

    Raises whatever ``name_map`` raises for an id it does not contain
    (KeyError for a plain dict).
    """
    renamed = Alignment([], alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
    for record in aln:
        new_label = name_map[record.id]
        record.id = new_label
        record.name = new_label
        renamed.append(record)
    return renamed