Esempio n. 1
0
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Iterate over the records of a FASTA file, yielding SeqRecord objects.

    Arguments:

     - handle - input file
     - alphabet - optional alphabet, handed to every Seq that is created
     - title2ids - optional function which is called with each FASTA
       title line (without the leading >) and must return the id, name
       and description (in that order) as a tuple of strings. If it is
       not supplied, the whole title line becomes the description and
       its first word is used as both the id and the name.

    By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
    with no custom handling of the title lines:

    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle):
    ...         print(record.id)
    ...
    alpha
    beta
    gamma
    alpha
    delta

    However, you can supply a title2ids function to alter this:

    >>> def take_upper(title):
    ...     return title.split(None, 1)[0].upper(), "", title
    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle, title2ids=take_upper):
    ...         print(record.id)
    ...
    ALPHA
    BETA
    GAMMA
    ALPHA
    DELTA

    """
    # Decide once, up front, which naming scheme applies to every record.
    custom_titles = bool(title2ids)
    for header, residues in SimpleFastaParser(handle):
        if custom_titles:
            record_id, record_name, record_descr = title2ids(header)
        else:
            try:
                leading_word = header.split(None, 1)[0]
            except IndexError:
                # Only an empty title line is allowed to have no words.
                assert not header, repr(header)
                # Should we use SeqRecord default for no ID?
                leading_word = ""
            record_id = leading_word
            record_name = leading_word
            record_descr = header
        yield SeqRecord(Seq(residues, alphabet),
                        id=record_id,
                        name=record_name,
                        description=record_descr)
Esempio n. 2
0
    def add_sequence(self,
                     descriptor,
                     sequence,
                     start=None,
                     end=None,
                     weight=1.0):
        """Add a sequence to the alignment (DEPRECATED).

        The start, end, and weight arguments are not supported! This method
        only provides limited backwards compatibility with the old
        Bio.Align.Generic.Alignment object. Please use the append method with
        a SeqRecord instead, since add_sequence is likely to be removed in a
        future release of Biopython.

        Arguments:
            - descriptor - used as both the id and the description of the
              SeqRecord that is appended.
            - sequence - sequence data, wrapped in a Seq using the
              alignment's alphabet.
            - start, end, weight - must be left at their defaults.

        Always issues a BiopythonDeprecationWarning. Raises ValueError if
        start, end or weight is given a non-default value.
        """
        import warnings
        import Bio
        warnings.warn(
            "The start, end, and weight arguments are not supported! This method only provides limited backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the append method with a SeqRecord instead, as the add_sequence method is likely to be removed in a future release of Biopython.",
            Bio.BiopythonDeprecationWarning)
        # Should we handle start/end/strand information somehow? What for?
        # TODO - Should we handle weights somehow? See also AlignInfo code...
        if start is not None or end is not None or weight != 1.0:
            # Adjacent string literals are concatenated verbatim, so every
            # fragment has to carry its own trailing space.
            raise ValueError("The add_sequence method is obsolete, and only "
                             "provides limited backwards compatibility. The "
                             "start, end and weight arguments are not "
                             "supported.")
        self.append(
            SeqRecord(Seq(sequence, self._alphabet),
                      id=descriptor,
                      description=descriptor))
Esempio n. 3
0
    def gap_consensus(self,
                      threshold=.7,
                      ambiguous="X",
                      consensus_alpha=None,
                      require_multiple=0):
        """Build a consensus sequence in which gaps may win a column.

        Works like dumb_consensus(), except that the gap character is
        counted like any other letter and can therefore appear in the
        result.

        Arguments:
            - threshold - minimum fraction of a column that the most common
              letter must reach to be used in the consensus.
            - ambiguous - character written when no single letter wins the
              column (default "X").
            - consensus_alpha - alphabet for the returned Seq; guessed from
              the alignment when None.
            - require_multiple - if true, a column holding a single letter
              is reported as ambiguous.

        Things to do:
            - Let the user define that with only one gap, the result
              character in consensus is gap.
            - Let the user select gap character, now
              it takes the same as input.
        """
        # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
        consensus = ''

        width = self.alignment.get_alignment_length()
        for column in range(width):
            # Tally the letters found in this column.
            tallies = {}
            letters_seen = 0
            for record in self.alignment:
                # Sequences may be of different lengths, so skip any
                # record which does not reach this column.
                if column >= len(record.seq):
                    continue
                letter = record.seq[column]
                tallies[letter] = tallies.get(letter, 0) + 1
                letters_seen += 1

            # Collect the letter(s) sharing the highest count.
            top_count = 0
            leaders = []
            for letter, count in tallies.items():
                if count > top_count:
                    top_count = count
                    leaders = [letter]
                elif count == top_count:
                    leaders.append(letter)

            if require_multiple and letters_seen == 1:
                call = ambiguous
            elif len(leaders) == 1 and (
                    float(top_count) / float(letters_seen)) >= threshold:
                call = leaders[0]
            else:
                call = ambiguous
            consensus += call

        # we need to guess a consensus alphabet if one isn't specified
        if consensus_alpha is None:
            # TODO - Should we make this into a Gapped alphabet?
            consensus_alpha = self._guess_consensus_alphabet(ambiguous)

        return Seq(consensus, consensus_alpha)
Esempio n. 4
0
    def add_sequence(self,
                     descriptor,
                     sequence,
                     start=None,
                     end=None,
                     weight=1.0):
        """Add a sequence to the alignment.

        This doesn't do any kind of alignment, it just adds in the sequence
        object, which is assumed to be prealigned with the existing
        sequences.

        Arguments:
            - descriptor - The descriptive id of the sequence being added.
              This will be used as the resulting SeqRecord's
              .id property (and, for historical compatibility,
              also the .description property)
            - sequence - A string with sequence info.
            - start - You can explicitly set the start point of the sequence.
              This is useful (at least) for BLAST alignments, which can
              just be partial alignments of sequences. A value of 0 is
              recorded; only None means "not given".
            - end - Specify the end of the sequence, which is important
              for the same reason as the start. As with start, only None
              means "not given".
            - weight - The weight to place on the sequence in the alignment.
              By default, all sequences have the same weight. (0.0 =>
              no weight, 1.0 => highest weight)

        The start, end and weight values are stored in the new record's
        annotations dictionary under the keys 'start', 'end' and 'weight'.
        """
        new_seq = Seq(sequence, self._alphabet)

        # We are now effectively using the SeqRecord's .id as
        # the primary identifier (e.g. in Bio.SeqIO) so we should
        # populate it with the descriptor.
        # For backwards compatibility, also store this in the
        # SeqRecord's description property.
        new_record = SeqRecord(new_seq, id=descriptor, description=descriptor)

        # hack! We really need to work out how to deal with annotations
        # and features in biopython. Right now, I'll just use the
        # generic annotations dictionary we've got to store the start
        # and end, but we should think up something better. I don't know
        # if I'm really a big fan of the LocatableSeq thing they've got
        # in BioPerl, but I'm not positive what the best thing to do on
        # this is...
        # Compare against None rather than testing truthiness, otherwise a
        # coordinate of zero would be silently dropped.
        if start is not None:
            new_record.annotations['start'] = start
        if end is not None:
            new_record.annotations['end'] = end

        # another hack to add weight information to the sequence
        new_record.annotations['weight'] = weight

        self._records.append(new_record)
Esempio n. 5
0
    def _set_seq(self, seq, seq_type):
        """Checks the given sequence for attribute setting

        :param seq: sequence to check
        :type seq: string or SeqRecord
        :param seq_type: sequence type
        :type seq_type: string, choice of 'hit' or 'query'

        """
        assert seq_type in ('hit', 'query')
        if seq is None:
            return seq # return immediately if seq is None
        else:
            if not isinstance(seq, (basestring, SeqRecord)):
                raise TypeError("%s sequence must be a string or a SeqRecord"
                        " object." % seq_type)
        # check length if the opposite sequence is not None
        opp_type = 'hit' if seq_type == 'query' else 'query'
        opp_seq = getattr(self, '_%s' % opp_type, None)
        if opp_seq is not None:
            if len(seq) != len(opp_seq):
                raise ValueError("Sequence lengths do not match. Expected: "
                        "%r (%s); found: %r (%s)." % (len(opp_seq), opp_type,
                        len(seq), seq_type))

        seq_id = getattr(self, '%s_id' % seq_type)
        seq_desc = getattr(self, '%s_description' % seq_type)
        seq_feats = getattr(self, '%s_features' % seq_type)
        seq_name = 'aligned %s sequence' % seq_type

        if isinstance(seq, SeqRecord):
            seq.id = seq_id
            seq.description = seq_desc
            seq.name = seq_name
            seq.features = seq_feats
            seq.seq.alphabet = self.alphabet
        elif isinstance(seq, basestring):
            seq = SeqRecord(Seq(seq, self.alphabet), id=seq_id, name=seq_name,
                    description=seq_desc, features=seq_feats)

        return seq
Esempio n. 6
0
    def build_hsp():
        """Assemble the query/match pair as a two-record MultipleSeqAlignment.

        Takes no arguments. The names it reads (query_tags, match_tags,
        align_tags, header_tags, global_tags, query_seq, match_seq, query_id,
        match_id, query_descr, match_descr, alphabet and handle) are not
        defined in this function - presumably they are local variables of
        the enclosing function; confirm against the surrounding code.

        Returns the alignment with the query record first and the match
        record second. The header tags and then the alignment tags are
        copied into the private alignment._annotations dictionary, so an
        alignment tag replaces a header tag which has the same key.

        Raises ValueError if both tag dictionaries are empty. Raises
        AssertionError if only one of them is empty, if alphabet is None,
        or (after printing debug details) if an assertion fails while the
        aligned regions are extracted, including the check that the two
        regions have the same length.
        """
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        # NOTE(review): evalue is assigned here but not used again in this
        # function.
        evalue = align_tags.get("fa_expect")
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                # Use the match sequence as given, without extracting a region
                m = match_seq
                # Quick hack until I can work out how -, * and / characters
                # and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            # Dump everything needed to diagnose the mismatch, then re-raise
            # the original assertion failure.
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = {}

        # Want to record both the query header tags, and the alignment tags.
        # The alignment tags are copied last, so they win on a key clash.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        # Query
        # =====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        # The alphabet is refined only after the record has been appended,
        # i.e. the record is appended while it still has the alignment's
        # own alphabet.
        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            # Wrap in Gapped unless the alphabet already defines a gap_char
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        # Match
        # =====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            # Wrap in Gapped unless the alphabet already defines a gap_char
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Esempio n. 7
0
    def __next__(self):
        """Parse and return the next Stockholm alignment from the handle.

        Reading starts with the header line saved in self._header by the
        previous call, if there is one, and otherwise with the next line
        of self.handle. Lines are consumed until the end of the file or
        until the next '# STOCKHOLM 1.0' header, which is saved in
        self._header for the following call.

        As a side effect the parsed data is stored on the instance as
        self.ids, self.sequences, self.seq_annotation (#=GS lines) and
        self.seq_col_annotation (#=GR lines).

        Returns a MultipleSeqAlignment with one SeqRecord per identifier,
        in order of first appearance. The #=GR dictionary is attached as
        the private alignment._annotations attribute.

        Raises StopIteration at the end of the file, or if no sequence
        lines were found after the header. Raises ValueError if the header
        is missing, a sequence line cannot be split into identifier and
        sequence, the record count differs from
        self.records_per_alignment, or the sequences differ in length.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            # Empty file - just give up.
            raise StopIteration
        if not line.strip() == '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = OrderedDict(
        )  # Really only need an OrderedSet, but python lacks this
        gs = {}
        gr = {}
        # NOTE(review): gf is filled from the #=GF lines below but is never
        # read or stored on self within this method.
        gf = {}
        passed_end_alignment = False
        while True:
            line = handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == '# STOCKHOLM 1.0':
                # Start of the next alignment, keep it for the next call
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier "
                                     "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids[id] = True
                # Interleaved files repeat the identifier, so append to any
                # sequence already stored; "." is converted to "-".
                seqs.setdefault(id, '')
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    # (the unpacking fails with ValueError if there is a
                    # feature name but no text)
                    feature, text = line[5:].strip().split(None, 1)
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    # These lines are recognised but not stored.
                    pass
                elif line[:5] == '#=GS ':
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip(
                    )  # append to any previous entry
                    # TODO - Should we check the length matches the alignment length?
                    #       For interlaced sequences the GR data can be split over
                    #       multiple lines
            # Next line...

        assert len(seqs) <= len(ids)
        # assert len(gs)   <= len(ids)
        # assert len(gr)   <= len(ids)

        # Store the parsed data on the instance (this happens even when
        # StopIteration is raised below because nothing was found).
        self.ids = ids.keys()
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
                raise ValueError(
                    "Found %i records in this alignment, told to expect %i" %
                    (len(ids), self.records_per_alignment))

            # Every sequence must match the length of the first one stored
            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError(
                        "Sequences have different lengths, or repeated identifier"
                    )
                name, start, end = self._identifier_split(id)
                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=id,
                                   name=name,
                                   description=id,
                                   annotations={"accession": name})
                # Accession will be overridden by _populate_meta_data if an explicit
                # accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            # TODO - Introduce an annotated alignment class?
            # For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            # Header found, but no sequence data followed it
            raise StopIteration
Esempio n. 8
0
    def __next__(self):
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration

        # Whitelisted headers we know about
        known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS', 'Kalign']
        if line.strip().split()[0] not in known_headers:
            raise ValueError(
                "%s is not a known CLUSTAL header: %s" %
                (line.strip().split()[0], ", ".join(known_headers)))

        # find the clustal version in the header line
        version = None
        for word in line.split():
            if word[0] == '(' and word[-1] == ')':
                word = word[1:-1]
            if word[0] in '0123456789':
                version = word
                break

        # There should be two blank lines after the header line
        line = handle.readline()
        while line.strip() == "":
            line = handle.readline()

        # If the alignment contains entries with the same sequence
        # identifier (not a good idea - but seems possible), then this
        # dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  # Used to extract the consensus

        # Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                # Sequences identifier...
                fields = line.rstrip().split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                # Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            elif line[0] == " ":
                # Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                # Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                # No consensus
                break
            line = handle.readline()
            if not line:
                break  # end of file

        assert line.strip() == ""
        assert seq_cols is not None

        # Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        # Loop over any remaining blocks...
        done = False
        while not done:
            # There should be a blank line between each block.
            # Also want to ignore any consensus line from the
            # previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if line.split(None, 1)[0] in known_headers:
                # Found concatenated alignment.
                done = True
                self._header = line
                break

            for i in range(len(ids)):
                assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
                fields = line.rstrip().split()

                # We expect there to be two fields, there can be an optional
                # "sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % repr(line))

                if fields[0] != ids[i]:
                    raise ValueError(
                        "Identifiers out of order? Got '%s' but expected '%s'"
                        % (fields[0], ids[i]))

                if fields[1] != line[seq_cols]:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                        seq_cols, start)
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                # Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    # This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)

                # Read in the next line
                line = handle.readline()
            # There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                # Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise StopIteration

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
                   for (i, s) in zip(ids, seqs))
        alignment = MultipleSeqAlignment(records, self.alphabet)
        # TODO - Handle alignment annotation better, for now
        # mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            alignment_length = len(seqs[0])
            assert len(consensus) == alignment_length, \
                   "Alignment length is %i, consensus length is %i, '%s'" \
                   % (alignment_length, len(consensus), consensus)
            alignment._star_info = consensus
        return alignment
Esempio n. 9
0
    def __next__(self):
        """Parse and return the next sequential PHYLIP alignment.

        Reading starts with the header line saved in self._header by the
        previous call, if there is one, and otherwise with the next line
        of self.handle. The header gives the number of sequences and their
        length. Each record then starts on a new line and may continue
        over several lines until the declared length has been read; blank
        lines inside a record are skipped. Once all the records are read,
        the rest of the file is scanned for the header of a further
        alignment, which is saved in self._header for the following call.

        Returns a MultipleSeqAlignment with one SeqRecord per record.

        Raises StopIteration at the end of the file. Raises ValueError if
        the header is not two integers, the record count differs from
        self.records_per_alignment, a record is longer than declared, or
        a sequence contains a dot.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration
        line = line.strip()
        parts = [x for x in line.split() if x]
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            while len(s) < length_of_seqs:
                # The sequence may be split into multiple lines
                line = handle.readline()
                if not line:
                    break  # end of file
                # Test for the end of the file BEFORE stripping; afterwards
                # a blank line could not be told apart from the end of file.
                line = line.strip()
                if line == "":
                    continue  # blank line within the record
                s = "".join([s, line.replace(" ", "")])
                if len(s) > length_of_seqs:
                    raise ValueError(
                        "Found a record of length %i, should be %i" %
                        (len(s), length_of_seqs))
            if "." in s:
                raise ValueError(
                    "PHYLIP format no longer allows dots in sequence")
            seqs.append(s)
        while True:
            # Find other alignments in the file
            line = handle.readline()
            if not line:
                break
            if self._is_header(line):
                self._header = line
                break

        records = (SeqRecord(Seq(s, self.alphabet),
                             id=i,
                             name=i,
                             description=i) for (i, s) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
Esempio n. 10
0
    def __next__(self):
        """Parse and return the next alignment from the handle.

        The first line must hold exactly two integers. The first is used as
        the number of sequences; the second is only checked to be an integer.
        That many lines are then read and each is split into an identifier
        and the start of its sequence by ``self._split_id``. Any further
        non-blank lines are consumed in blocks of the same number of lines,
        each line being appended (with spaces removed) to the matching
        sequence, until end of file or until a line accepted by
        ``self._is_header`` is met. Such a line is stored in
        ``self._header`` so the following call starts from it.

        Returns:
            A MultipleSeqAlignment built from one SeqRecord per sequence,
            with the identifier used as id, name and description.

        Raises:
            StopIteration: if there is no header line left to read.
            ValueError: if the header is malformed, if the declared record
                count differs from ``self.records_per_alignment``, if a
                sequence contains a dot, if continuation data follows a
                header that declares no sequences, or if the file ends
                part-way through a continuation block.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration
        line = line.strip()
        # str.split() with no argument never yields empty strings.
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            # The declared sequence length is only validated as an integer.
            int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
                and self.records_per_alignment != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        ids = []
        # One list of string fragments per sequence; joined at the end.
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        # NOTE(review): at end of file readline() returns "", which is still
        # handed to _split_id here - confirm that is the intended handling
        # of a truncated first block.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            if "." in s:
                raise ValueError(
                    "PHYLIP format no longer allows dots in sequence")
            seqs.append([s])

        # Look for further blocks
        line = ""
        while True:
            # Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                # Looks like the start of a concatenated alignment
                self._header = line
                break

            if number_of_seqs < 1:
                # With no sequences the block loop below would consume
                # nothing, so this while loop would spin forever on the
                # same line. Report the malformed input instead.
                raise ValueError(
                    "Found sequence data but the header declares %i sequences"
                    % number_of_seqs)

            # One continuation line per sequence, in the original order.
            for i in range(number_of_seqs):
                s = line.strip().replace(" ", "")
                if "." in s:
                    raise ValueError(
                        "PHYLIP format no longer allows dots in sequence")
                seqs[i].append(s)
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        records = (SeqRecord(Seq("".join(s), self.alphabet),
                             id=i,
                             name=i,
                             description=i) for (i, s) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
Esempio n. 11
0
    def dumb_consensus(self,
                       threshold=.7,
                       ambiguous="X",
                       consensus_alpha=None,
                       require_multiple=0):
        """Build a quick simple-majority consensus of the alignment.

        Every column is judged on its own. The gap characters "-" and "."
        are ignored and the remaining residues are tallied. The column is
        given its most frequent residue when that residue is the sole winner
        (no tie) and its share of the counted residues is at least
        ``threshold``; in every other case the ``ambiguous`` character is
        used. No substitution matrix or other weighting is involved.

        Arguments:
            - threshold - Minimum fraction of the counted (non-gap) residues
              in a column that the most common residue must reach.
            - ambiguous - Character placed in the consensus when no residue
              qualifies for a column.
            - consensus_alpha - Alphabet for the returned sequence. When it
              is None, ``self._guess_consensus_alphabet`` is asked for one.
            - require_multiple - If true, a column in which only a single
              residue was counted (one sequence plus gaps) is always
              reported as ambiguous.

        Returns a Seq holding one character per alignment column.
        """
        width = self.alignment.get_alignment_length()
        calls = []

        for column in range(width):
            # Residue -> number of occurrences in this column (gaps excluded).
            tallies = {}
            counted = 0

            for row in self.alignment:
                sequence = row.seq
                # Rows may be shorter than the alignment length.
                if column >= len(sequence):
                    continue
                residue = sequence[column]
                if residue == '-' or residue == '.':
                    continue
                tallies[residue] = tallies.get(residue, 0) + 1
                counted += 1

            # Collect every residue that shares the highest count.
            if tallies:
                top_count = max(tallies.values())
            else:
                top_count = 0
            leaders = [res for res in tallies if tallies[res] == top_count]

            if require_multiple and counted == 1:
                call = ambiguous
            elif len(leaders) == 1 and \
                    float(top_count) / float(counted) >= threshold:
                # The length test above guarantees counted is non-zero here.
                call = leaders[0]
            else:
                call = ambiguous
            calls.append(call)

        consensus = ''.join(calls)

        # Fall back to a guessed alphabet when the caller supplied none.
        if consensus_alpha is None:
            consensus_alpha = self._guess_consensus_alphabet(ambiguous)

        return Seq(consensus, consensus_alpha)
Esempio n. 12
0
    def __next__(self):
        """Parse and return the next alignment from the handle.

        Lines are skipped until a "#=======================================" line
        is found. The "#" header lines that follow are scanned for an
        "Aligned_sequences" entry (the record count, followed by one numbered
        identifier line per record) and a "Length" entry. The remaining lines
        are then read until the next "#---..." or "#===..." separator, which
        is saved in ``self._header`` for the following call. Data lines longer
        than 21 characters are split at column 21: the left part is expected
        to hold the identifier and start position, the right part the
        sequence fragment and end position. Fragments are assigned to the
        records in round-robin order.

        Returns:
            A MultipleSeqAlignment with one SeqRecord per identifier.

        Raises:
            StopIteration: if no further alignment header is found.
            ValueError: if the record count or length is missing from the
                header, if the record count differs from
                ``self.records_per_alignment``, if a line cannot be
                recognised, or if an assembled sequence does not have the
                declared length.
            AssertionError: if one of the internal consistency checks on
                identifiers or start/end coordinates fails.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration

        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                raise StopIteration

        length_of_seqs = None
        number_of_seqs = None
        ids = []

        # startswith() rather than line[0]: at end of file readline() gives
        # "" and indexing it would raise IndexError; this way a truncated
        # header falls through to the explicit checks below.
        while line.startswith("#"):
            # Read in the rest of this alignment header,
            # try and discover the number of records expected
            # and their length
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i + 1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            elif key == "length":
                length_of_seqs = int(parts[1].strip())

            # And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if self.records_per_alignment is not None \
                and self.records_per_alignment != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        seqs = [""] * len(ids)
        # Start position taken from the first fragment seen for each record.
        seq_starts = []
        # Which record the next data line belongs to (round-robin).
        index = 0

        # Parse the seqs
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    # identifier, seq start position, seq, seq end position
                    # (an aligned seq is broken up into multiple lines)
                    seq_id, start = id_start
                    seq, end = seq_end
                    # start and end are still strings at this comparison.
                    if start == end:
                        # Special case, either a single letter is present,
                        # or no letters at all.
                        if seq.replace("-", "") == "":
                            start = int(start)
                            end = int(end)
                        else:
                            start = int(start) - 1
                            end = int(end)
                    else:
                        assert seq.replace("-", "") != "", repr(line)
                        start = int(start) - 1  # python counting
                        end = int(end)

                    # The identifier is truncated...
                    assert 0 <= index and index < number_of_seqs, \
                           "Expected index %i in range [0,%i)" \
                           % (index, number_of_seqs)
                    assert seq_id == ids[index] \
                        or seq_id == ids[index][:len(seq_id)]

                    if len(seq_starts) == index:
                        # Record the start
                        seq_starts.append(start)

                    # Check the start...
                    if start == end:
                        assert seq.replace("-", "") == "", line
                    else:
                        assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, seq_id, repr(seqs[index]),
                               start, line)

                    seqs[index] += seq

                    # Check the end ...
                    assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, seq_id, repr(seqs[index]),
                               seq_starts[index], end, line)

                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                else:
                    # just a start value, this is just alignment annotation (?)
                    pass
            elif line.strip() == "":
                # Just a spacer?
                pass
            else:
                # Report bad input with a proper exception: an assert would
                # be stripped under -O, and a parser should not print.
                raise ValueError(
                    "Unrecognised EMBOSS pairwise line: %r" % line)

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
                    or line.rstrip() == "#=======================================":
                # End of alignment
                self._header = line
                break

        # Every round of fragments must have covered all records.
        assert index == 0

        if self.records_per_alignment is not None \
                and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = []
        for seq_id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                # EMBOSS 2.9.0 is known to use spaces instead of minus signs
                # for leading gaps, and thus fails to parse.  This old version
                # is still used as of Dec 2008 behind the EBI SOAP webservice:
                # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            records.append(
                SeqRecord(Seq(seq, self.alphabet), id=seq_id,
                          description=seq_id))
        return MultipleSeqAlignment(records, self.alphabet)