Example #1
0
def PhdIterator(handle):
    """Returns SeqRecord objects from a PHD file.

    This uses the Bio.Sequencing.Phd module to do the hard work.
    """
    phd_records = Phd.parse(handle)
    for phd_record in phd_records:
        #Convert the PHY record into a SeqRecord...
        #The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1'
        #from unit test example file phd_solexa.
        #This will cause problems if used as the record identifier
        #(e.g. output for FASTQ format).
        name = phd_record.file_name.split(None, 1)[0]
        seq_record = SeqRecord(phd_record.seq,
                               id=name,
                               name=name,
                               description=phd_record.file_name)
        #Just re-use the comments dictionary as the SeqRecord's annotations
        seq_record.annotations = phd_record.comments
        #And store the qualities and peak locations as per-letter-annotation
        seq_record.letter_annotations["phred_quality"] = \
            [int(site[1]) for site in phd_record.sites]
        try:
            seq_record.letter_annotations["peak_location"] = \
                [int(site[2]) for site in phd_record.sites]
        except IndexError:
            # peak locations are not always there according to
            # David Gordon (the Consed author)
            pass
        yield seq_record
Example #2
0
def PhdIterator(handle):
    """Returns SeqRecord objects from a PHD file.

    This uses the Bio.Sequencing.Phd module to do the hard work.
    """
    phd_records = Phd.parse(handle)
    for phd_record in phd_records:
        #Convert the PHY record into a SeqRecord...
        #The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1'
        #from unit test example file phd_solexa.
        #This will cause problems if used as the record identifier
        #(e.g. output for FASTQ format).
        name = phd_record.file_name.split(None, 1)[0]
        seq_record = SeqRecord(phd_record.seq,
                               id=name, name=name,
                               description=phd_record.file_name)
        #Just re-use the comments dictionary as the SeqRecord's annotations
        seq_record.annotations = phd_record.comments
        #And store the qualities and peak locations as per-letter-annotation
        seq_record.letter_annotations["phred_quality"] = \
            [int(site[1]) for site in phd_record.sites]
        try:
            seq_record.letter_annotations["peak_location"] = \
                [int(site[2]) for site in phd_record.sites]
        except IndexError:
            # peak locations are not always there according to
            # David Gordon (the Consed author)
            pass
        yield seq_record
Example #3
0
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet
    title2ids - A function that, when given the title of the FASTA
    file (without the beginning >), will return the id, name and
    description (in that order) for the record as a tuple of strings.

    If this is not given, then the entire title line will be used
    as the description, and the first word as the id and name.

    By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
    with no custom handling of the title lines:

    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle):
    ...         print(record.id)
    ...
    alpha
    beta
    gamma
    alpha
    delta

    However, you can supply a title2ids function to alter this:

    >>> def take_upper(title):
    ...     return title.split(None, 1)[0].upper(), "", title
    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle, title2ids=take_upper):
    ...         print(record.id)
    ...
    ALPHA
    BETA
    GAMMA
    ALPHA
    DELTA

    """
    if title2ids:
        for title, sequence in SimpleFastaParser(handle):
            id, name, descr = title2ids(title)
            yield SeqRecord(Seq(sequence, alphabet),
                            id=id, name=name, description=descr)
    else:
        for title, sequence in SimpleFastaParser(handle):
            try:
                first_word = title.split(None, 1)[0]
            except IndexError:
                assert not title, repr(title)
                #Should we use SeqRecord default for no ID?
                first_word = ""
            yield SeqRecord(Seq(sequence, alphabet),
                            id=first_word, name=first_word, description=title)
Example #4
0
    def add_sequence(self,
                     descriptor,
                     sequence,
                     start=None,
                     end=None,
                     weight=1.0):
        """Add a sequence to the alignment.

        This doesn't do any kind of alignment, it just adds in the sequence
        object, which is assumed to be prealigned with the existing
        sequences.

        Arguments:
         - descriptor - The descriptive id of the sequence being added.
                       This will be used as the resulting SeqRecord's
                       .id property (and, for historical compatibility,
                       also the .description property)
         - sequence - A string with sequence info.
         - start - You can explicitly set the start point of the sequence.
                   This is useful (at least) for BLAST alignments, which can
                   just be partial alignments of sequences.
         - end - Specify the end of the sequence, which is important
                 for the same reason as the start.
         - weight - The weight to place on the sequence in the alignment.
                    By default, all sequences have the same weight. (0.0 =>
                    no weight, 1.0 => highest weight)
        """
        new_seq = Seq(sequence, self._alphabet)

        #We are now effectively using the SeqRecord's .id as
        #the primary identifier (e.g. in Bio.SeqIO) so we should
        #populate it with the descriptor.
        #For backwards compatibility, also store this in the
        #SeqRecord's description property.
        new_record = SeqRecord(new_seq, id=descriptor, description=descriptor)

        # hack! We really need to work out how to deal with annotations
        # and features in biopython. Right now, I'll just use the
        # generic annotations dictionary we've got to store the start
        # and end, but we should think up something better. I don't know
        # if I'm really a big fan of the LocatableSeq thing they've got
        # in BioPerl, but I'm not positive what the best thing to do on
        # this is...
        if start:
            new_record.annotations['start'] = start
        if end:
            new_record.annotations['end'] = end

        # another hack to add weight information to the sequence
        new_record.annotations['weight'] = weight

        self._records.append(new_record)
Example #5
0
    def to_generic(self, alphabet):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from SAP.Bio.Align, through which you can manipulate and query
        the object.

        alphabet is the specified alphabet for the sequences in the code (for
        example IUPAC.IUPACProtein).

        Thanks to James Casbon for the code.
        """
        #TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
        seq_parts = []
        seq_names = []
        parse_number = 0
        n = 0
        for name, start, seq, end in self.alignment:
            if name == 'QUERY':  # QUERY is the first in each alignment block
                parse_number += 1
                n = 0

            if parse_number == 1:  # create on first_parse, append on all others
                seq_parts.append(seq)
                seq_names.append(name)
            else:
                seq_parts[n] += seq
                n += 1

        generic = MultipleSeqAlignment([], alphabet)
        for (name, seq) in zip(seq_names, seq_parts):
            generic.append(SeqRecord(Seq(seq, alphabet), name))

        return generic
Example #6
0
def TabIterator(handle, alphabet=single_letter_alphabet):
    """Iterates over tab separated lines (as SeqRecord objects).

    Each line of the file should contain one tab only, dividing the line
    into an identifier and the full sequence.

    handle - input file
    alphabet - optional alphabet

    The first field is taken as the record's .id and .name (regardless of
    any spaces within the text) and the second field is the sequence.

    Any blank lines are ignored.
    """
    for line in handle:
        try:
            title, seq = line.split("\t")  # will fail if more than one tab!
        except:
            if line.strip() == "":
                #It's a blank line, ignore it
                continue
            raise ValueError("Each line should have one tab separating the" +
                             " title and sequence, this line has %i tabs: %s" %
                             (line.count("\t"), repr(line)))
        title = title.strip()
        seq = seq.strip()  # removes the trailing new line
        yield SeqRecord(Seq(seq, alphabet),
                        id=title,
                        name=title,
                        description="")
Example #7
0
def NexusIterator(handle, seq_count=None):
    """Returns SeqRecord objects from a Nexus file.

    Thus uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one MultipleSeqAlignment.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        #No alignment found
        raise StopIteration

    #Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    #The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i"
               % (len(n.unaltered_taxlabels), seq_count))

    #ToDo - Can we extract any annotation too?
    records = (SeqRecord(n.matrix[new_name], id=new_name,
                         name=old_name, description="")
               for old_name, new_name
               in zip(n.unaltered_taxlabels, n.taxlabels))
    #All done
    yield MultipleSeqAlignment(records, n.alphabet)
Example #8
0
 def toMultipleSeqAlignment(self):
     """Return a MultipleSeqAlignment containing all the
     SeqRecord in the CodonAlignment using Seq to store 
     sequences
     """
     alignments = [SeqRecord(rec.seq.toSeq(), id=rec.id) for \
             rec in self._records]
     return MultipleSeqAlignment(alignments)
Example #9
0
 def from_msa(cls, align, alphabet=default_codon_alphabet):
     """Function to convert a MultipleSeqAlignment to CodonAlignment.
     It is the user's responsibility to ensure all the requirement
     needed by CodonAlignment is met.
     """
     rec = [SeqRecord(CodonSeq(str(i.seq), alphabet=alphabet), id=i.id) \
              for i in align._records]
     return cls(rec, alphabet=alphabet)
Example #10
0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)"""
    mult_align_dict = {}
    for j in align_dict.abs(1).pos_align_dict:
        mult_align_dict[j] = ''

    for i in range(1, len(align_dict) + 1):
        # loop on positions
        for j in align_dict.abs(i).pos_align_dict:
            # loop within a position
            mult_align_dict[j] += align_dict.abs(i).pos_align_dict[j].aa
    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for i in sorted(mult_align_dict):
        fssp_align.append(
            SeqRecord(Seq(mult_align_dict[i], alpha),
                      sum_dict[i].pdb2 + sum_dict[i].chain2))
    return fssp_align
Example #11
0
    def __iter__(self):
        """Iterate over the records in the XML file.
        Returns the last parsed record."""

        record = None
        try:
            for event, node in self._events:

                if event == "START_ELEMENT" and node.namespaceURI == self._namespace:

                    if node.localName == self._recordTag:
                        #create an empty SeqRecord
                        record = SeqRecord('', id='')

                    #call matching methods with attributes only
                    if hasattr(self, "_attr_" + node.localName):
                        getattr(self, "_attr_" + node.localName)(
                            self._attributes(node), record)

                    #call matching methods with DOM tree
                    if hasattr(self, "_elem_" + node.localName):
                        #read the element and all nested elements into a DOM tree
                        self._events.expandNode(node)
                        node.normalize()

                        getattr(self, "_elem_" + node.localName)(node, record)

                elif event == "END_ELEMENT" and node.namespaceURI == self._namespace and node.localName == self._recordTag:
                    yield record

        except SAXParseException as e:

            if e.getLineNumber() == 1 and e.getColumnNumber() == 0:
                #empty file
                pass
            else:
                import os
                if e.getLineNumber() == 1 and e.getColumnNumber() == 1 \
                        and os.name == "java":
                    #empty file, see http://bugs.jython.org/issue1774
                    pass
                else:
                    raise
Example #12
0
    def _set_seq(self, seq, seq_type):
        """Checks the given sequence for attribute setting

        Arguments:
        seq -- String or SeqRecord to check
        seq_type -- String of sequence type, must be 'hit' or 'query'

        """
        assert seq_type in ('hit', 'query')
        if seq is None:
            return seq  # return immediately if seq is None
        else:
            if not isinstance(seq, (basestring, SeqRecord)):
                raise TypeError("%s sequence must be a string or a SeqRecord"
                                " object." % seq_type)
        # check length if the opposite sequence is not None
        opp_type = 'hit' if seq_type == 'query' else 'query'
        opp_seq = getattr(self, '_%s' % opp_type, None)
        if opp_seq is not None:
            if len(seq) != len(opp_seq):
                raise ValueError("Sequence lengths do not match. Expected: "
                                 "%r (%s); found: %r (%s)." %
                                 (len(opp_seq), opp_type, len(seq), seq_type))

        seq_id = getattr(self, '%s_id' % seq_type)
        seq_desc = getattr(self, '%s_description' % seq_type)
        seq_feats = getattr(self, '%s_features' % seq_type)
        seq_name = 'aligned %s sequence' % seq_type

        if isinstance(seq, SeqRecord):
            seq.id = seq_id
            seq.description = seq_desc
            seq.name = seq_name
            seq.features = seq_feats
            seq.seq.alphabet = self.alphabet
        elif isinstance(seq, basestring):
            seq = SeqRecord(Seq(seq, self.alphabet),
                            id=seq_id,
                            name=seq_name,
                            description=seq_desc,
                            features=seq_feats)

        return seq
Example #13
0
    def add_sequence(self, descriptor, sequence, start = None, end = None,
                     weight = 1.0):
        """Add a sequence to the alignment (DEPRECATED).

        The start, end, and weight arguments are not supported! This method
        only provides limited backwards compatibility with the old
        Bio.Align.Generic.Alignment object. Please use the append method with
        a SeqRecord instead, since add_sequence is likely to be removed in a
        future release of Biopython.
        """
        import warnings
        import Bio
        warnings.warn("The start, end, and weight arguments are not supported! This method only provides limited backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the append method with a SeqRecord instead, as the add_sequence method is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning)
        #Should we handle start/end/strand information somehow? What for?
        #TODO - Should we handle weights somehow? See also AlignInfo code...
        if start is not None or end is not None or weight != 1.0:
            raise ValueError("The add_Sequence method is obsolete, and only "
                             "provides limited backwards compatibily. The"
                             "start, end and weight arguments are not "
                             "supported.")
        self.append(SeqRecord(Seq(sequence, self._alphabet),
                              id = descriptor, description = descriptor))
Example #14
0
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect", None)
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Example #15
0
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r"
                             % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect", None)
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(Seq(q, alphabet),
                           id=query_id,
                           name="query",
                           description=query_descr,
                           annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(Seq(m, alphabet),
                           id=match_id,
                           name="match",
                           description=match_descr,
                           annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
Example #16
0
    def __next__(self):
        try:
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            #Empty file - just give up.
            raise StopIteration
        if not line.strip() == '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = self.handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == '# STOCKHOLM 1.0':
                self._header = line
                break
            elif line == "//":
                #The "//" line indicates the end of the alignment.
                #There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                #blank line, ignore
                pass
            elif line[0] != "#":
                #Sequence
                #Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    #This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier "
                                      + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, '')
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                #Comment line or meta-data
                if line[:5] == "#=GF ":
                    #Generic per-File annotation, free text
                    #Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    #Each feature key could be used more than once,
                    #so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    #Generic per-Column annotation, exactly 1 char per column
                    #Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == '#=GS ':
                    #Generic per-Sequence annotation, free text
                    #Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    #Generic per-Sequence AND per-Column markup
                    #Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip()  # append to any previous entry
                    #TODO - Should we check the length matches the alignment length?
                    #       For iterlaced sequences the GR data can be split over
                    #       multiple lines
            #Next line...

        assert len(seqs) <= len(ids)
        #assert len(gs)   <= len(ids)
        #assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
                raise ValueError("Found %i records in this alignment, told to expect %i"
                                 % (len(ids), self.records_per_alignment))

            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError("Sequences have different lengths, or repeated identifier")
                name, start, end = self._identifier_split(id)
                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=id, name=name, description=id,
                                   annotations={"accession": name})
                #Accession will be overridden by _populate_meta_data if an explicit
                #accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            #TODO - Introduce an annotated alignment class?
            #For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
Example #17
0
    def __next__(self):

        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                raise StopIteration

        length_of_seqs = None
        number_of_seqs = None
        ids = []
        seqs = []

        while line[0] == "#":
            #Read in the rest of this alignment header,
            #try and discover the number of records expected
            #and their length
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i+1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            #And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (number_of_seqs, self.records_per_alignment))

        seqs = ["" for id in ids]
        seq_starts = []
        index = 0

        #Parse the seqs
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    #identifier, seq start position, seq, seq end position
                    #(an aligned seq is broken up into multiple lines)
                    id, start = id_start
                    seq, end = seq_end
                    if start == end:
                        #Special case, either a single letter is present,
                        #or no letters at all.
                        if seq.replace("-", "") == "":
                            start = int(start)
                            end = int(end)
                        else:
                            start = int(start) - 1
                            end = int(end)
                    else:
                        assert seq.replace("-", "") != "", repr(line)
                        start = int(start) - 1  # python counting
                        end = int(end)

                    #The identifier is truncated...
                    assert 0 <= index and index < number_of_seqs, \
                           "Expected index %i in range [0,%i)" \
                           % (index, number_of_seqs)
                    assert id == ids[index] or id == ids[index][:len(id)]

                    if len(seq_starts) == index:
                        #Record the start
                        seq_starts.append(start)

                    #Check the start...
                    if start == end:
                        assert seq.replace("-", "") == "", line
                    else:
                        assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]),
                               start, line)

                    seqs[index] += seq

                    #Check the end ...
                    assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]),
                               seq_starts[index], end, line)

                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                else:
                    #just a start value, this is just alignment annotation (?)
                    #print "Skipping: " + line.rstrip()
                    pass
            elif line.strip() == "":
                #Just a spacer?
                pass
            else:
                print(line)
                assert False

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
            or line.rstrip() == "#=======================================":
                #End of alignment
                self._header = line
                break

        assert index == 0

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), self.records_per_alignment))

        records = []
        for id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                #EMBOSS 2.9.0 is known to use spaces instead of minus signs
                #for leading gaps, and thus fails to parse.  This old version
                #is still used as of Dec 2008 behind the EBI SOAP webservice:
                #http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            records.append(SeqRecord(Seq(seq, self.alphabet),
                                     id=id, description=id))
        return MultipleSeqAlignment(records, self.alphabet)
Example #18
0
    def __next__(self):
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        line = line.strip()
        parts = [x for x in line.split() if x]
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            while len(s) < length_of_seqs:
                # The sequence may be split into multiple lines
                line = handle.readline().strip()
                if not line:
                    break
                if line == "":
                    continue
                s = "".join([s, line.strip().replace(" ", "")])
                if len(s) > length_of_seqs:
                    raise ValueError(
                        "Found a record of length %i, should be %i" %
                        (len(s), length_of_seqs))
            if "." in s:
                raise ValueError(
                    "PHYLIP format no longer allows dots in sequence")
            seqs.append(s)
        while True:
            # Find other alignments in the file
            line = handle.readline()
            if not line:
                break
            if self._is_header(line):
                self._header = line
                break

        records = (SeqRecord(Seq(s, self.alphabet),
                             id=i,
                             name=i,
                             description=i) for (i, s) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
Example #19
0
def PdbSeqresIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the
    PDB file header, not the atoms of the 3D structure.

    Specifically, these PDB records are handled: DBREF, SEQADV, SEQRES, MODRES

    See: http://www.wwpdb.org/documentation/format23/sect3.html
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from SAP.Bio.SeqUtils import seq1

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # NB: We only actually need chain ID and the residues here;
            # commented bits are placeholders from the wwPDB spec.
            # Serial number of the SEQRES record for the current chain.
            # Starts at 1 and increments by one each line.
            # Reset to 1 for each chain.
            # ser_num = int(line[8:10])
            # Chain identifier. This may be any single legal character,
            # including a blank which is used if there is only one chain.
            chn_id = line[11]
            # Number of residues in the chain (repeated on every record)
            # num_res = int(line[13:17])
            residues = [seq1(res, custom_map=protein_letters_3to1) for res in line[19:].split()]
            chains[chn_id].extend(residues)
        elif rec_name == 'DBREF':
            #  ID code of this entry (PDB ID)
            pdb_id = line[7:11]
            # Chain identifier.
            chn_id = line[12]
            # Initial sequence number of the PDB sequence segment.
            # seq_begin = int(line[14:18])
            # Initial insertion code of the PDB sequence segment.
            # icode_begin = line[18]
            # Ending sequence number of the PDB sequence segment.
            # seq_end = int(line[20:24])
            # Ending insertion code of the PDB sequence segment.
            # icode_end = line[24]
            # Sequence database name.
            database = line[26:32].strip()
            # Sequence database accession code.
            db_acc = line[33:41].strip()
            # Sequence database identification code.
            db_id_code = line[42:54].strip()
            # Initial sequence number of the database seqment.
            # db_seq_begin = int(line[55:60])
            # Insertion code of initial residue of the segment, if PDB is the
            # reference.
            # db_icode_begin = line[60]
            # Ending sequence number of the database segment.
            # db_seq_end = int(line[62:67])
            # Insertion code of the ending residue of the segment, if PDB is the
            # reference.
            # db_icode_end = line[67]
            metadata[chn_id].append({'pdb_id': pdb_id, 'database': database,
                                    'db_acc': db_acc, 'db_id_code': db_id_code})
        # ENH: 'SEQADV' 'MODRES'

    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id in metadata:
            m = metadata[chn_id][0]
            record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id)
            record.description = ("%s:%s %s" % (m['database'],
                                                m['db_acc'],
                                                m['db_id_code']))
            for melem in metadata[chn_id]:
                record.dbxrefs.extend([
                    "%s:%s" % (melem['database'], melem['db_acc']),
                    "%s:%s" % (melem['database'], melem['db_id_code'])])
        else:
            record.id = chn_id
        yield record
Example #20
0
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries
    are converted to "X" in the sequence.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["residues"] = List of residue ID strings
    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region, and  None is included as the corresponding entry in
    the list record.annotations["residues"].

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.
    """
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from SAP.Bio.PDB import PDBParser
    from SAP.Bio.SeqUtils import seq1

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=protein_letters_3to1)

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from SAP.Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [res for res in chain.get_unpacked_list()
                    if seq1(res.get_resname().upper(),
                        custom_map=protein_letters_3to1) != "X"]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i+1] != rnum + 1:
                # It's a gap!
                gaps.append((i+1, rnum, rnumbers[i+1]))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(restype(x) for x in  residues[prev_idx:i])
                    prev_idx = i
                    res_out.append('X'*gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  UserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    break
            else:
                # Last segment
                res_out.extend(restype(x) for x in residues[prev_idx:])
        else:
            # No gaps
            res_out = [restype(x) for x in residues]
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(Seq(''.join(res_out), generic_protein),
                id=record_id,
                description=record_id,
                )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
Example #21
0
    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        The seqrecord.annotations dictionary is packed like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype':   prop.datatype,
                                                 'unit':       prop.unit,
                                                 'id_ref':     prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            return dict((key, val) for key, val in dct.items()
                        if val is not None)

        seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                           **clean_dict({
                               'id':    str(self.accession),
                               'name':  self.symbol,
                               'description': self.name,
                               # 'dbxrefs': None,
                               }))
        if self.domain_architecture:
            seqrec.features = [dom.to_seqfeature()
                               for dom in self.domain_architecture.domains]
        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = clean_dict({
                'id_ref':       self.id_ref,
                'id_source':    self.id_source,
                'location':     self.location,
                'uri':          self.uri and clean_dict({
                                    'value': self.uri.value,
                                    'desc': self.uri.desc,
                                    'type': self.uri.type,
                                    }),
                'annotations':  self.annotations and [
                    clean_dict({
                        'ref':          ann.ref,
                        'source':       ann.source,
                        'evidence':     ann.evidence,
                        'type':         ann.type,
                        'confidence':   ann.confidence and [
                                            ann.confidence.value,
                                            ann.confidence.type],
                        'properties':   [clean_dict({
                                            'value':      prop.value,
                                            'ref':        prop.ref,
                                            'applies_to': prop.applies_to,
                                            'datatype':   prop.datatype,
                                            'unit':       prop.unit,
                                            'id_ref':     prop.id_ref })
                                         for prop in ann.properties],
                        }) for ann in self.annotations],
                })
        return seqrec
Example #22
0
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.
    """
    #Skip any file header text before the first record (;; lines)
    while True:
        line = handle.readline()
        if not line:
            break  # Premature end of file, or just empty?
        if not line.startswith(";;"):
            break

    while line:
        #Now iterate over the records
        if line[0] != ";":
            raise ValueError(
                "Records should start with ';' and not:\n%s" % repr(line))

        #Try and agree with SeqRecord convention from the GenBank parser,
        #(and followed in the SwissProt parser) which stores the comments
        #as a long string with newlines under annotations key 'comment'.

        #Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()
        title = line.rstrip()

        seq_lines = []
        while True:
            line = handle.readline()
            if not line:
                break
            if line[0] == ";":
                break
            #Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
        seq_str = "".join(seq_lines)
        if seq_str.endswith("1"):
            #Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError(
                "Potential terminator digit one found within sequence.")

        #Return the record and then continue...
        record = SeqRecord(Seq(seq_str, alphabet),
                           id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    #We should be at the end of the file now
    assert not line
Example #23
0
def PirIterator(handle):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet
    title2ids - A function that, when given the title of the FASTA
    file (without the beginning >), will return the id, name and
    description (in that order) for the record as a tuple of strings.

    If this is not given, then the entire title line will be used
    as the description, and the first word as the id and name.

    Note that use of title2ids matches that of Bio.Fasta.SequenceParser
    but the defaults are slightly different.
    """
    #Skip any text before the first record (e.g. blank lines, comments)
    while True:
        line = handle.readline()
        if line == "":
            return  # Premature end of file, or just empty?
        if line[0] == ">":
            break

    while True:
        if line[0] != ">":
            raise ValueError(
                "Records in PIR files should start with '>' character")
        pir_type = line[1:3]
        if pir_type not in _pir_alphabets or line[3] != ";":
            raise ValueError("Records should start with '>XX;' "
                             "where XX is a valid sequence type")
        identifier = line[4:].strip()
        description = handle.readline().strip()

        lines = []
        line = handle.readline()
        while True:
            if not line:
                break
            if line[0] == ">":
                break
            #Remove trailing whitespace, and any internal spaces
            lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()
        seq = "".join(lines)
        if seq[-1] != "*":
            #Note the * terminator is present on nucleotide sequences too,
            #it is not a stop codon!
            raise ValueError(
                "Sequences in PIR files should include a * terminator!")

        #Return the record and then continue...
        record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
                           id=identifier,
                           name=identifier,
                           description=description)
        record.annotations["PIR-type"] = pir_type
        yield record

        if not line:
            return  # StopIteration
    assert False, "Should not reach this line"
Example #24
0
    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        The seqrecord.annotations dictionary is packed like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{ 'ref':      ann.ref,
                                'source':   ann.source,
                                'evidence': ann.evidence,
                                'type':     ann.type,
                                'confidence': [ ann.confidence.value,
                                                ann.confidence.type ],
                                'properties': [{ 'value': prop.value,
                                                 'ref': prop.ref,
                                                 'applies_to': prop.applies_to,
                                                 'datatype':   prop.datatype,
                                                 'unit':       prop.unit,
                                                 'id_ref':     prop.id_ref }
                                               for prop in ann.properties],
                              } for ann in self.annotations],
            }
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            return dict(
                (key, val) for key, val in dct.items() if val is not None)

        seqrec = SeqRecord(
            Seq(self.mol_seq.value, self.get_alphabet()),
            **clean_dict({
                'id': str(self.accession),
                'name': self.symbol,
                'description': self.name,
                # 'dbxrefs': None,
            }))
        if self.domain_architecture:
            seqrec.features = [
                dom.to_seqfeature() for dom in self.domain_architecture.domains
            ]
        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = clean_dict({
            'id_ref':
            self.id_ref,
            'id_source':
            self.id_source,
            'location':
            self.location,
            'uri':
            self.uri and clean_dict({
                'value': self.uri.value,
                'desc': self.uri.desc,
                'type': self.uri.type,
            }),
            'annotations':
            self.annotations and [
                clean_dict({
                    'ref':
                    ann.ref,
                    'source':
                    ann.source,
                    'evidence':
                    ann.evidence,
                    'type':
                    ann.type,
                    'confidence':
                    ann.confidence
                    and [ann.confidence.value, ann.confidence.type],
                    'properties': [
                        clean_dict({
                            'value': prop.value,
                            'ref': prop.ref,
                            'applies_to': prop.applies_to,
                            'datatype': prop.datatype,
                            'unit': prop.unit,
                            'id_ref': prop.id_ref
                        }) for prop in ann.properties
                    ],
                }) for ann in self.annotations
            ],
        })
        return seqrec
Example #25
0
def PirIterator(handle):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet
    title2ids - A function that, when given the title of the FASTA
    file (without the beginning >), will return the id, name and
    description (in that order) for the record as a tuple of strings.

    If this is not given, then the entire title line will be used
    as the description, and the first word as the id and name.

    Note that use of title2ids matches that of Bio.Fasta.SequenceParser
    but the defaults are slightly different.
    """
    #Skip any text before the first record (e.g. blank lines, comments)
    while True:
        line = handle.readline()
        if line == "":
            return  # Premature end of file, or just empty?
        if line[0] == ">":
            break

    while True:
        if line[0] != ">":
            raise ValueError(
                "Records in PIR files should start with '>' character")
        pir_type = line[1:3]
        if pir_type not in _pir_alphabets or line[3] != ";":
            raise ValueError(
                "Records should start with '>XX;' "
                "where XX is a valid sequence type")
        identifier = line[4:].strip()
        description = handle.readline().strip()

        lines = []
        line = handle.readline()
        while True:
            if not line:
                break
            if line[0] == ">":
                break
            #Remove trailing whitespace, and any internal spaces
            lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()
        seq = "".join(lines)
        if seq[-1] != "*":
            #Note the * terminator is present on nucleotide sequences too,
            #it is not a stop codon!
            raise ValueError(
                "Sequences in PIR files should include a * terminator!")

        #Return the record and then continue...
        record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
                           id=identifier, name=identifier,
                           description=description)
        record.annotations["PIR-type"] = pir_type
        yield record

        if not line:
            return  # StopIteration
    assert False, "Should not reach this line"
Example #26
0
    def parse(self):
        """Parse the input."""
        assert self.entry.tag == NS + 'entry'

        def append_to_annotations(key, value):
            if key not in self.ParsedSeqRecord.annotations:
                self.ParsedSeqRecord.annotations[key] = []
            if value not in self.ParsedSeqRecord.annotations[key]:
                self.ParsedSeqRecord.annotations[key].append(value)

        def _parse_name(element):
            self.ParsedSeqRecord.name = element.text
            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)

        def _parse_accession(element):
            append_to_annotations('accessions', element.text)  # to cope with SwissProt plain text parser
            self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)

        def _parse_protein(element):
            """Parse protein names (PRIVATE)."""
            descr_set = False
            for protein_element in element:
                if protein_element.tag in [NS + 'recommendedName', NS + 'alternativeName']:  # recommendedName tag are parsed before
                    #use protein fields for name and description
                    for rec_name in protein_element:
                        ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''),
                                             rec_name.tag.replace(NS, ''))
                        append_to_annotations(ann_key, rec_name.text)
                        if (rec_name.tag == NS + 'fullName') and not descr_set:
                            self.ParsedSeqRecord.description = rec_name.text
                            descr_set = True
                elif protein_element.tag == NS + 'component':
                    pass  # not parsed
                elif protein_element.tag == NS + 'domain':
                    pass  # not parsed

        def _parse_gene(element):
            for genename_element in element:
                if 'type' in genename_element.attrib:
                    ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''),
                                              genename_element.attrib['type'])
                    if genename_element.attrib['type'] == 'primary':
                        self.ParsedSeqRecord.annotations[ann_key] = genename_element.text
                    else:
                        append_to_annotations(ann_key, genename_element.text)

        def _parse_geneLocation(element):
            append_to_annotations('geneLocation', element.attrib['type'])

        def _parse_organism(element):
            organism_name = com_name = sci_name = ''
            for organism_element in element:
                if organism_element.tag == NS + 'name':
                    if organism_element.text:
                        if organism_element.attrib['type'] == 'scientific':
                            sci_name = organism_element.text
                        elif organism_element.attrib['type'] == 'common':
                            com_name = organism_element.text
                        else:
                            #e.g. synonym
                            append_to_annotations("organism_name", organism_element.text)
                elif organism_element.tag == NS + 'dbReference':
                    self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type'] + ':' + organism_element.attrib['id'])
                elif organism_element.tag == NS + 'lineage':
                    for taxon_element in organism_element:
                        if taxon_element.tag == NS + 'taxon':
                            append_to_annotations('taxonomy', taxon_element.text)
            if sci_name and com_name:
                organism_name = '%s (%s)' % (sci_name, com_name)
            elif sci_name:
                organism_name = sci_name
            elif com_name:
                organism_name = com_name
            self.ParsedSeqRecord.annotations['organism'] = organism_name

        def _parse_organismHost(element):
            for organism_element in element:
                if organism_element.tag == NS + 'name':
                    append_to_annotations("organism_host", organism_element.text)

        def _parse_keyword(element):
            append_to_annotations('keywords', element.text)

        def _parse_comment(element):
            """Parse comments (PRIVATE).

            Comment fields are very heterogeneus. each type has his own (frequently mutated) schema.
            To store all the contained data, more complex data structures are needed, such as
            annotated dictionaries. This is left to end user, by optionally setting:

            return_raw_comments=True

            The original XML is returned in the annotation fields.

            Available comment types at december 2009:
                "allergen"
                "alternative products"
                "biotechnology"
                "biophysicochemical properties"
                "catalytic activity"
                "caution"
                "cofactor"
                "developmental stage"
                "disease"
                "domain"
                "disruption phenotype"
                "enzyme regulation"
                "function"
                "induction"
                "miscellaneous"
                "pathway"
                "pharmaceutical"
                "polymorphism"
                "PTM"
                "RNA editing"
                "similarity"
                "subcellular location"
                "sequence caution"
                "subunit"
                "tissue specificity"
                "toxic dose"
                "online information"
                "mass spectrometry"
                "interaction"
            """

            simple_comments = ["allergen",
                               "biotechnology",
                               "biophysicochemical properties",
                               "catalytic activity",
                               "caution",
                               "cofactor",
                               "developmental stage",
                               "disease",
                               "domain",
                               "disruption phenotype",
                               "enzyme regulation",
                               "function",
                               "induction",
                               "miscellaneous",
                               "pathway",
                               "pharmaceutical",
                               "polymorphism",
                               "PTM",
                               "RNA editing",  # positions not parsed
                               "similarity",
                               "subunit",
                               "tissue specificity",
                               "toxic dose",
                               ]

            if element.attrib['type'] in simple_comments:
                ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '')
                for text_element in element.getiterator(NS + 'text'):
                    if text_element.text:
                        append_to_annotations(ann_key, text_element.text)
            elif element.attrib['type'] == 'subcellular location':
                for subloc_element in element.getiterator(NS + 'subcellularLocation'):
                    for el in subloc_element:
                        if el.text:
                            ann_key = 'comment_%s_%s' % (element.attrib['type'].replace(' ', ''), el.tag.replace(NS, ''))
                            append_to_annotations(ann_key, el.text)
            elif element.attrib['type'] == 'interaction':
                for interact_element in element.getiterator(NS + 'interactant'):
                    ann_key = 'comment_%s_intactId' % element.attrib['type']
                    append_to_annotations(ann_key, interact_element.attrib['intactId'])
            elif element.attrib['type'] == 'alternative products':
                for alt_element in element.getiterator(NS + 'isoform'):
                    ann_key = 'comment_%s_isoform' % element.attrib['type'].replace(' ', '')
                    for id_element in alt_element.getiterator(NS + 'id'):
                        append_to_annotations(ann_key, id_element.text)
            elif element.attrib['type'] == 'mass spectrometry':
                ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '')
                start = end = 0
                for loc_element in element.getiterator(NS + 'location'):
                    pos_els = loc_element.getiterator(NS + 'position')
                    pos_els = list(pos_els)
                    # this try should be avoided, maybe it is safer to skip position parsing for mass spectrometry
                    try:
                        if pos_els:
                            end = int(pos_els[0].attrib['position'])
                            start = end - 1
                        else:
                            start =  int(list(loc_element.getiterator(NS + 'begin'))[0].attrib['position']) - 1
                            end = int(list(loc_element.getiterator(NS + 'end'))[0].attrib['position'])
                    except:  # undefined positions or erroneously mapped
                        pass
                mass = element.attrib['mass']
                method = element.attrib['method']
                if start == end == 0:
                    append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method))
                else:
                    append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method))
            elif element.attrib['type'] == 'sequence caution':
                pass  # not parsed: few information, complex structure
            elif element.attrib['type'] == 'online information':
                for link_element in element.getiterator(NS + 'link'):
                    ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '')
                    for id_element in link_element.getiterator(NS + 'link'):
                        append_to_annotations(ann_key,
                                              '%s@%s' % (element.attrib['name'], link_element.attrib['uri']))

            #return raw XML comments if needed
            if self.return_raw_comments:
                ann_key = 'comment_%s_xml' % element.attrib['type'].replace(' ', '')
                append_to_annotations(ann_key, ElementTree.tostring(element))

        def _parse_dbReference(element):
            self.ParsedSeqRecord.dbxrefs.append(element.attrib['type'] + ':' + element.attrib['id'])
            #e.g.
            # <dbReference type="PDB" key="11" id="2GEZ">
            #   <property value="X-ray" type="method"/>
            #   <property value="2.60 A" type="resolution"/>
            #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
            # </dbReference>
            if 'type' in element.attrib:
                if element.attrib['type'] == 'PDB':
                    method = ""
                    resolution = ""
                    for ref_element in element:
                        if ref_element.tag == NS + 'property':
                            dat_type = ref_element.attrib['type']
                            if dat_type == 'method':
                                method = ref_element.attrib['value']
                            if dat_type == 'resolution':
                                resolution = ref_element.attrib['value']
                            if dat_type == 'chains':
                                pairs = ref_element.attrib['value'].split(',')
                                for elem in pairs:
                                    pair = elem.strip().split('=')
                                    if pair[1] != '-':
                                        #TODO - How best to store these, do SeqFeatures make sense?
                                        feature = SeqFeature.SeqFeature()
                                        feature.type = element.attrib['type']
                                        feature.qualifiers['name'] = element.attrib['id']
                                        feature.qualifiers['method'] = method
                                        feature.qualifiers['resolution'] = resolution
                                        feature.qualifiers['chains'] = pair[0].split('/')
                                        start = int(pair[1].split('-')[0]) - 1
                                        end = int(pair[1].split('-')[1])
                                        feature.location = SeqFeature.FeatureLocation(start, end)
                                        #self.ParsedSeqRecord.features.append(feature)

            for ref_element in element:
                if ref_element.tag == NS + 'property':
                    pass  # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs

        def _parse_reference(element):
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ''
            pub_type = ''
            pub_date = ''
            for ref_element in element:
                if ref_element.tag == NS + 'citation':
                    pub_type = ref_element.attrib['type']
                    if pub_type == 'submission':
                        pub_type += ' to the ' + ref_element.attrib['db']
                    if 'name' in ref_element.attrib:
                        journal_name = ref_element.attrib['name']
                    pub_date = ref_element.attrib.get('date', '')
                    j_volume = ref_element.attrib.get('volume', '')
                    j_first = ref_element.attrib.get('first', '')
                    j_last = ref_element.attrib.get('last', '')
                    for cit_element in ref_element:
                        if cit_element.tag == NS + 'title':
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + 'authorList':
                            for person_element in cit_element:
                                authors.append(person_element.attrib['name'])
                        elif cit_element.tag == NS + 'dbReference':
                            self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type']
                                                                + ':' + cit_element.attrib['id'])
                            if cit_element.attrib['type'] == 'PubMed':
                                reference.pubmed_id = cit_element.attrib['id']
                            elif ref_element.attrib['type'] == 'MEDLINE':
                                reference.medline_id = cit_element.attrib['id']
                elif ref_element.tag == NS + 'scope':
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + 'source':
                    for source_element in ref_element:
                        if source_element.tag == NS + 'tissue':
                            tissues.append(source_element.text)
            if scopes:
                scopes_str = 'Scope: ' + ', '.join(scopes)
            else:
                scopes_str = ''
            if tissues:
                tissues_str = 'Tissue: ' + ', '.join(tissues)
            else:
                tissues_str = ''

            # locations cannot be parsed since they are actually written in
            # free text inside scopes so all the references are put in the
            # annotation.
            reference.location = []
            reference.authors = ', '.join(authors)
            if journal_name:
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
                        volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
                else:
                    reference.journal = journal_name
            reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
            append_to_annotations('references', reference)

        def _parse_position(element, offset=0):
            try:
                position = int(element.attrib['position']) + offset
            except KeyError as err:
                position = None
            status = element.attrib.get('status', '')
            if status == 'unknown':
                assert position is None
                return SeqFeature.UnknownPosition()
            elif not status:
                return SeqFeature.ExactPosition(position)
            elif status == 'greater than':
                return SeqFeature.AfterPosition(position)
            elif status == 'less than':
                return SeqFeature.BeforePosition(position)
            elif status == 'uncertain':
                return SeqFeature.UncertainPosition(position)
            else:
                raise NotImplementedError("Position status %r" % status)

        def _parse_feature(element):
            feature = SeqFeature.SeqFeature()
            for k, v in element.attrib.items():
                feature.qualifiers[k] = v
            feature.type = element.attrib.get('type', '')
            if 'id' in element.attrib:
                feature.id = element.attrib['id']
            for feature_element in element:
                if feature_element.tag == NS + 'location':
                    position_elements = feature_element.findall(NS + 'position')
                    if position_elements:
                        element = position_elements[0]
                        start_position = _parse_position(element, -1)
                        end_position = _parse_position(element)
                    else:
                        element = feature_element.findall(NS + 'begin')[0]
                        start_position = _parse_position(element, -1)
                        element = feature_element.findall(NS + 'end')[0]
                        end_position = _parse_position(element)
                    feature.location = SeqFeature.FeatureLocation(start_position, end_position)
                else:
                    try:
                        feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text
                    except:
                        pass  # skip unparsable tag
            self.ParsedSeqRecord.features.append(feature)

        def _parse_proteinExistence(element):
            append_to_annotations('proteinExistence', element.attrib['type'])

        def _parse_evidence(element):
            for k, v in element.attrib.items():
                ann_key = k
                append_to_annotations(ann_key, v)

        def  _parse_sequence(element):
            for k, v in element.attrib.items():
                if k in ("length", "mass", "version"):
                    self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v)
                else:
                    self.ParsedSeqRecord.annotations['sequence_%s' % k] = v
            seq = ''.join((element.text.split()))
            self.ParsedSeqRecord.seq = Seq.Seq(seq, self.alphabet)

        #============================================#
        #Initialize SeqRecord
        self.ParsedSeqRecord = SeqRecord('', id='')

        #Entry attribs parsing
        #Unknown dataset should not happen!
        self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset')
        #add attribs to annotations
        for k, v in self.entry.attrib.items():
            if k in ("version"):
                #original
                #self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v)
                #To cope with swissProt plain text parser. this can cause errors
                #if the attrib has the same name of an other annotation
                self.ParsedSeqRecord.annotations[k] = int(v)
            else:
                #self.ParsedSeqRecord.annotations["entry_%s" % k] = v
                self.ParsedSeqRecord.annotations[k] = v  # to cope with swissProt plain text parser

        #Top-to-bottom entry children parsing
        for element in self.entry:
            if element.tag == NS + 'name':
                _parse_name(element)
            elif element.tag == NS + 'accession':
                _parse_accession(element)
            elif element.tag == NS + 'protein':
                _parse_protein(element)
            elif element.tag == NS + 'gene':
                _parse_gene(element)
            elif element.tag == NS + 'geneLocation':
                _parse_geneLocation(element)
            elif element.tag == NS + 'organism':
                _parse_organism(element)
            elif element.tag == NS + 'organismHost':
                _parse_organismHost(element)
            elif element.tag == NS + 'keyword':
                _parse_keyword(element)
            elif element.tag == NS + 'comment':
                _parse_comment(element)
            elif element.tag == NS + 'dbReference':
                _parse_dbReference(element)
            elif element.tag == NS + 'reference':
                _parse_reference(element)
            elif element.tag == NS + 'feature':
                _parse_feature(element)
            elif element.tag == NS + 'proteinExistence':
                _parse_proteinExistence(element)
            elif element.tag == NS + 'evidence':
                _parse_evidence(element)
            elif element.tag == NS + 'sequence':
                _parse_sequence(element)
            else:
                pass

        #remove duplicate dbxrefs
        self.ParsedSeqRecord.dbxrefs = sorted(list(set(self.ParsedSeqRecord.dbxrefs)))

        # use first accession as id
        if not self.ParsedSeqRecord.id:
            self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations['accessions'][0]

        return self.ParsedSeqRecord
Example #27
0
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Ace files include the base quality for each position, which are taken
    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from SAP.Bio import SeqIO
    >>> with open("Ace/consed_sample.ace", "rU") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    However, ACE files do not include a base quality for any gaps in the
    consensus sequence, and these are represented in Biopython with a quality
    of zero. Using zero is perhaps misleading as there may be very strong
    evidence to support the gap in the consensus. Previous versions of
    Biopython therefore used None instead, but this complicated usage, and
    prevented output of the gapped sequence as FASTQ format.

    >>> from SAP.Bio import SeqIO
    >>> with open("Ace/contig1.ace", "rU") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(handle):
        #Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        #Assume its DNA unless there is a U in it,
        if "U" in consensus_seq_str:
            if "T" in consensus_seq_str:
                #Very odd! Error?
                alpha = generic_nucleotide
            else:
                alpha = generic_rna
        else:
            alpha = generic_dna

        if "*" in consensus_seq_str:
            #For consistency with most other file formats, map
            #any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        #TODO? - Base segments (BS lines) which indicates which read
        #phrap has chosen to be the consensus at a particular position.
        #Perhaps as SeqFeature objects?

        #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        #Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        #Consensus base quality (BQ lines).  Note that any gaps (originally
        #as * characters) in the consensus do not get a quality entry, so
        #we assign a quality of None (zero would be missleading as there may
        #be excelent support for having a gap here).
        quals = []
        i = 0
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
            else:
                quals.append(ace_contig.quality[i])
                i += 1
        assert i == len(ace_contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record
Example #28
0
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Ace files include the base quality for each position, which are taken
    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from SAP.Bio import SeqIO
    >>> with open("Ace/consed_sample.ace", "rU") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    However, ACE files do not include a base quality for any gaps in the
    consensus sequence, and these are represented in Biopython with a quality
    of zero. Using zero is perhaps misleading as there may be very strong
    evidence to support the gap in the consensus. Previous versions of
    Biopython therefore used None instead, but this complicated usage, and
    prevented output of the gapped sequence as FASTQ format.

    >>> from SAP.Bio import SeqIO
    >>> with open("Ace/contig1.ace", "rU") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(handle):
        #Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        #Assume its DNA unless there is a U in it,
        if "U" in consensus_seq_str:
            if "T" in consensus_seq_str:
                #Very odd! Error?
                alpha = generic_nucleotide
            else:
                alpha = generic_rna
        else:
            alpha = generic_dna

        if "*" in consensus_seq_str:
            #For consistency with most other file formats, map
            #any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        #TODO? - Base segments (BS lines) which indicates which read
        #phrap has chosen to be the consensus at a particular position.
        #Perhaps as SeqFeature objects?

        #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        #Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        #Consensus base quality (BQ lines).  Note that any gaps (originally
        #as * characters) in the consensus do not get a quality entry, so
        #we assign a quality of None (zero would be missleading as there may
        #be excelent support for having a gap here).
        quals = []
        i = 0
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
            else:
                quals.append(ace_contig.quality[i])
                i += 1
        assert i == len(ace_contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record
Example #29
0
def PdbAtomIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file

    The sequences are derived from the 3D structure (ATOM records), not the
    SEQRES lines in the PDB file header.

    Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries
    are converted to "X" in the sequence.

    In addition to information from the PDB header (which is the same for all
    records), the following chain specific information is placed in the
    annotation:

    record.annotations["residues"] = List of residue ID strings
    record.annotations["chain"] = Chain ID (typically A, B ,...)
    record.annotations["model"] = Model ID (typically zero)

    Where amino acids are missing from the structure, as indicated by residue
    numbering, the sequence is filled in with 'X' characters to match the size
    of the missing region, and  None is included as the corresponding entry in
    the list record.annotations["residues"].

    This function uses the Bio.PDB module to do most of the hard work. The
    annotation information could be improved but this extra parsing should be
    done in parse_pdb_header, not this module.
    """
    # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO
    from SAP.Bio.PDB import PDBParser
    from SAP.Bio.SeqUtils import seq1

    def restype(residue):
        """Return a residue's type as a one-letter code.

        Non-standard residues (e.g. CSD, ANP) are returned as 'X'.
        """
        return seq1(residue.resname, custom_map=protein_letters_3to1)

    # Deduce the PDB ID from the PDB header
    # ENH: or filename?
    from SAP.Bio.File import UndoHandle
    undo_handle = UndoHandle(handle)
    firstline = undo_handle.peekline()
    if firstline.startswith("HEADER"):
        pdb_id = firstline[62:66]
    else:
        warnings.warn("First line is not a 'HEADER'; can't determine PDB ID")
        pdb_id = '????'

    struct = PDBParser().get_structure(pdb_id, undo_handle)
    model = struct[0]
    for chn_id, chain in sorted(model.child_dict.items()):
        # HETATM mod. res. policy: remove mod if in sequence, else discard
        residues = [
            res for res in chain.get_unpacked_list()
            if seq1(res.get_resname().upper(), custom_map=protein_letters_3to1)
            != "X"
        ]
        if not residues:
            continue
        # Identify missing residues in the structure
        # (fill the sequence with 'X' residues in these regions)
        gaps = []
        rnumbers = [r.id[1] for r in residues]
        for i, rnum in enumerate(rnumbers[:-1]):
            if rnumbers[i + 1] != rnum + 1:
                # It's a gap!
                gaps.append((i + 1, rnum, rnumbers[i + 1]))
        if gaps:
            res_out = []
            prev_idx = 0
            for i, pregap, postgap in gaps:
                if postgap > pregap:
                    gapsize = postgap - pregap - 1
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    prev_idx = i
                    res_out.append('X' * gapsize)
                else:
                    warnings.warn("Ignoring out-of-order residues after a gap",
                                  UserWarning)
                    # Keep the normal part, drop the out-of-order segment
                    # (presumably modified or hetatm residues, e.g. 3BEG)
                    res_out.extend(restype(x) for x in residues[prev_idx:i])
                    break
            else:
                # Last segment
                res_out.extend(restype(x) for x in residues[prev_idx:])
        else:
            # No gaps
            res_out = [restype(x) for x in residues]
        record_id = "%s:%s" % (pdb_id, chn_id)
        # ENH - model number in SeqRecord id if multiple models?
        # id = "Chain%s" % str(chain.id)
        # if len(structure) > 1 :
        #     id = ("Model%s|" % str(model.id)) + id

        record = SeqRecord(
            Seq(''.join(res_out), generic_protein),
            id=record_id,
            description=record_id,
        )

        # The PDB header was loaded as a dictionary, so let's reuse it all
        record.annotations = struct.header.copy()
        # Plus some chain specifics:
        record.annotations["model"] = model.id
        record.annotations["chain"] = chain.id

        # Start & end
        record.annotations["start"] = int(rnumbers[0])
        record.annotations["end"] = int(rnumbers[-1])

        # ENH - add letter annotations -- per-residue info, e.g. numbers

        yield record
Example #30
0
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.
    """
    # raise exception is alphabet is not dna
    if alphabet is not None:
        if isinstance(Alphabet._get_base_alphabet(alphabet),
                      Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(Alphabet._get_base_alphabet(alphabet),
                      Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # handle empty file gracefully
        raise StopIteration
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', }

    # initialize annotations
    annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT)))

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT,
                           handle.read(struct.calcsize(_HEADFMT)))

    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # stop iteration if all desired tags have been extracted
        # 4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id
        # todo

        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except:
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id, name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
Example #31
0
    def __next__(self):
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        line = line.strip()
        parts = [x for x in line.split() if x]
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            if "." in s:
                raise ValueError(
                    "PHYLIP format no longer allows dots in sequence")
            seqs.append([s])

        #Look for further blocks
        line = ""
        while True:
            #Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                #Looks like the start of a concatenated alignment
                self._header = line
                break

            #print "New block..."
            for i in range(number_of_seqs):
                s = line.strip().replace(" ", "")
                if "." in s:
                    raise ValueError(
                        "PHYLIP format no longer allows dots in sequence")
                seqs[i].append(s)
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        records = (SeqRecord(Seq("".join(s), self.alphabet),
                             id=i,
                             name=i,
                             description=i) for (i, s) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
Example #32
0
def PdbSeqresIterator(handle):
    """Returns SeqRecord objects for each chain in a PDB file.

    The sequences are derived from the SEQRES lines in the
    PDB file header, not the atoms of the 3D structure.

    Specifically, these PDB records are handled: DBREF, SEQADV, SEQRES, MODRES

    See: http://www.wwpdb.org/documentation/format23/sect3.html
    """
    # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils
    from SAP.Bio.SeqUtils import seq1

    chains = collections.defaultdict(list)
    metadata = collections.defaultdict(list)
    for line in handle:
        rec_name = line[0:6].strip()
        if rec_name == 'SEQRES':
            # NB: We only actually need chain ID and the residues here;
            # commented bits are placeholders from the wwPDB spec.
            # Serial number of the SEQRES record for the current chain.
            # Starts at 1 and increments by one each line.
            # Reset to 1 for each chain.
            # ser_num = int(line[8:10])
            # Chain identifier. This may be any single legal character,
            # including a blank which is used if there is only one chain.
            chn_id = line[11]
            # Number of residues in the chain (repeated on every record)
            # num_res = int(line[13:17])
            residues = [
                seq1(res, custom_map=protein_letters_3to1)
                for res in line[19:].split()
            ]
            chains[chn_id].extend(residues)
        elif rec_name == 'DBREF':
            #  ID code of this entry (PDB ID)
            pdb_id = line[7:11]
            # Chain identifier.
            chn_id = line[12]
            # Initial sequence number of the PDB sequence segment.
            # seq_begin = int(line[14:18])
            # Initial insertion code of the PDB sequence segment.
            # icode_begin = line[18]
            # Ending sequence number of the PDB sequence segment.
            # seq_end = int(line[20:24])
            # Ending insertion code of the PDB sequence segment.
            # icode_end = line[24]
            # Sequence database name.
            database = line[26:32].strip()
            # Sequence database accession code.
            db_acc = line[33:41].strip()
            # Sequence database identification code.
            db_id_code = line[42:54].strip()
            # Initial sequence number of the database seqment.
            # db_seq_begin = int(line[55:60])
            # Insertion code of initial residue of the segment, if PDB is the
            # reference.
            # db_icode_begin = line[60]
            # Ending sequence number of the database segment.
            # db_seq_end = int(line[62:67])
            # Insertion code of the ending residue of the segment, if PDB is the
            # reference.
            # db_icode_end = line[67]
            metadata[chn_id].append({
                'pdb_id': pdb_id,
                'database': database,
                'db_acc': db_acc,
                'db_id_code': db_id_code
            })
        # ENH: 'SEQADV' 'MODRES'

    for chn_id, residues in sorted(chains.items()):
        record = SeqRecord(Seq(''.join(residues), generic_protein))
        record.annotations = {"chain": chn_id}
        if chn_id in metadata:
            m = metadata[chn_id][0]
            record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id)
            record.description = (
                "%s:%s %s" % (m['database'], m['db_acc'], m['db_id_code']))
            for melem in metadata[chn_id]:
                record.dbxrefs.extend([
                    "%s:%s" % (melem['database'], melem['db_acc']),
                    "%s:%s" % (melem['database'], melem['db_id_code'])
                ])
        else:
            record.id = chn_id
        yield record
Example #33
0
    def __next__(self):
        try:
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            # Empty file - just give up.
            raise StopIteration
        if not line.strip() == "# STOCKHOLM 1.0":
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = self.handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == "# STOCKHOLM 1.0":
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, "")
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == "#=GC ":
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == "#=GS ":
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip()  # append to any previous entry
                    # TODO - Should we check the length matches the alignment length?
                    #       For iterlaced sequences the GR data can be split over
                    #       multiple lines
            # Next line...

        assert len(seqs) <= len(ids)
        # assert len(gs)   <= len(ids)
        # assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None and self.records_per_alignment != len(ids):
                raise ValueError(
                    "Found %i records in this alignment, told to expect %i" % (len(ids), self.records_per_alignment)
                )

            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError("Sequences have different lengths, or repeated identifier")
                name, start, end = self._identifier_split(id)
                record = SeqRecord(
                    Seq(seq, self.alphabet), id=id, name=name, description=id, annotations={"accession": name}
                )
                # Accession will be overridden by _populate_meta_data if an explicit
                # accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            # TODO - Introduce an annotated alignment class?
            # For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
Example #34
0
def _get_codon_rec(pro,
                   nucl,
                   span_mode,
                   alphabet,
                   gap_char="-",
                   codon_table=default_codon_table,
                   complete_protein=False,
                   max_score=10):
    """Generate codon alignment based on regular re match (PRIVATE)

    span_mode is a tuple returned by _check_corr. The first element
    is the span of a re search, and the second element is the mode
    for the match.

    mode
     - 0: direct match
     - 1: mismatch (no indels)
     - 2: frameshift

    """
    import re
    from SAP.Bio.Seq import Seq

    nucl_seq = nucl.seq.ungap(gap_char)
    codon_seq = ""
    span = span_mode[0]
    mode = span_mode[1]
    aa2re = _get_aa_regex(codon_table)
    if mode in (0, 1):
        if len(pro.seq.ungap(gap_char)) * 3 != (span[1] - span[0]):
            raise ValueError("Protein Record {0} and Nucleotide Record {1} "
                             "do not match!".format((pro.id, nucl.id)))
        aa_num = 0
        for aa in pro.seq:
            if aa == "-":
                codon_seq += "---"
            elif complete_protein is True and aa_num == 0:
                this_codon = nucl_seq._data[span[0]:span[0] + 3]
                if not re.search(_codons2re[codon_table.start_codons],
                                 this_codon.upper()):
                    max_score -= 1
                    warnings.warn("start codon of {0} ({1} {2}) does not "
                                  "correspond to {3} "
                                  "({4})".format(pro.id, aa, aa_num, nucl.id,
                                                 this_codon))
                if max_score == 0:
                    raise RuntimeError("max_score reached for {0}! Please "
                                       "raise up the tolerance to get an "
                                       "alignment in anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
            else:
                this_codon = nucl_seq._data[(span[0] +
                                             3 * aa_num):(span[0] + 3 *
                                                          (aa_num + 1))]
                if not str(Seq(this_codon.upper()).translate()) == aa:
                    max_score -= 1
                    warnings.warn("%s(%s %d) does not correspond to %s(%s)" %
                                  (pro.id, aa, aa_num, nucl.id, this_codon))
                if max_score == 0:
                    raise RuntimeError("max_score reached for {0}! Please "
                                       "raise up the tolerance to get an "
                                       "alignment in anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
        return SeqRecord(CodonSeq(codon_seq, alphabet=alphabet), id=nucl.id)
    elif mode == 2:
        from collections import deque
        shift_pos = deque([])
        shift_start = []
        match = span_mode[2]
        m_groupdict = list(match.groupdict().keys())
        # backward frameshift
        for i in m_groupdict:
            shift_pos.append(match.span(i))
            shift_start.append(match.start(i))
        rf_table = []
        i = match.start()
        while True:
            rf_table.append(i)
            i += 3
            if i in shift_start and \
                    m_groupdict[shift_start.index(i)].isupper():
                shift_index = shift_start.index(i)
                shift_val = 6 - (shift_pos[shift_index][1] -
                                 shift_pos[shift_index][0])
                rf_table.append(i)
                rf_table.append(i + 3 - shift_val)
                i = shift_pos[shift_index][1]
            elif i in shift_start and \
                    m_groupdict[shift_start.index(i)].islower():
                i = shift_pos[shift_start.index(i)][1]
            if i >= match.end():
                break
        aa_num = 0
        for aa in pro.seq:
            if aa == "-":
                codon_seq += "---"
            elif complete_protein is True and aa_num == 0:
                this_codon = nucl_seq._data[rf_table[0]:rf_table[0] + 3]
                if not re.search(_codons2re[codon_table.start_codons],
                                 this_codon.upper()):
                    max_score -= 1
                    warnings.warn("start codon of {0}({1} {2}) does not "
                                  "correspond to {3}({4})".format(
                                      pro.id, aa, aa_num, nucl.id, this_codon))
                    codon_seq += this_codon
                    aa_num += 1
            else:
                if aa_num < len(pro.seq.ungap('-'))-1 and \
                        rf_table[aa_num+1]-rf_table[aa_num]-3 < 0:
                    max_score -= 1
                    start = rf_table[aa_num]
                    end = start + (3 - shift_val)
                    ngap = shift_val
                    this_codon = nucl_seq._data[start:end] + '-' * ngap
                elif rf_table[aa_num] - rf_table[aa_num - 1] - 3 > 0:
                    max_score -= 1
                    start = rf_table[aa_num - 1] + 3
                    end = rf_table[aa_num]
                    ngap = 3 - (rf_table[aa_num] - rf_table[aa_num - 1] - 3)
                    this_codon = nucl_seq._data[start:end] + '-'*ngap + \
                            nucl_seq._data[rf_table[aa_num]:rf_table[aa_num]+3]
                else:
                    start = rf_table[aa_num]
                    end = start + 3
                    this_codon = nucl_seq._data[start:end]
                    if not str(Seq(this_codon.upper()).translate()) == aa:
                        max_score -= 1
                        warnings.warn("Codon of {0}({1} {2}) does not "
                                      "correspond to {3}({4})".format(
                                          pro.id, aa, aa_num, nucl.id,
                                          this_codon))
                if max_score == 0:
                    raise RuntimeError("max_score reached for {0}! Please "
                                       "raise up the tolerance to get an "
                                       "alignment in anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
        return SeqRecord(CodonSeq(codon_seq,
                                  alphabet=alphabet,
                                  rf_table=rf_table),
                         id=nucl.id)
Example #35
0
    def __next__(self):
        handle = self.handle
        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        #Whitelisted headers we know about
        known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS']
        if line.strip().split()[0] not in known_headers:
            raise ValueError(
                "%s is not a known CLUSTAL header: %s" %
                (line.strip().split()[0], ", ".join(known_headers)))

        # find the clustal version in the header line
        version = None
        for word in line.split():
            if word[0] == '(' and word[-1] == ')':
                word = word[1:-1]
            if word[0] in '0123456789':
                version = word
                break

        #There should be two blank lines after the header line
        line = handle.readline()
        while line.strip() == "":
            line = handle.readline()

        #If the alignment contains entries with the same sequence
        #identifier (not a good idea - but seems possible), then this
        #dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  # Used to extract the consensus

        #Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                #Sequences identifier...
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                #Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            elif line[0] == " ":
                #Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                #No consensus
                break
            line = handle.readline()
            if not line:
                break  # end of file

        assert line.strip() == ""
        assert seq_cols is not None

        #Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        #Loop over any remaining blocks...
        done = False
        while not done:
            #There should be a blank line between each block.
            #Also want to ignore any consensus line from the
            #previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if line.split(None, 1)[0] in known_headers:
                #Found concatenated alignment.
                done = True
                self._header = line
                break

            for i in range(len(ids)):
                assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % repr(line))

                if fields[0] != ids[i]:
                    raise ValueError(
                        "Identifiers out of order? Got '%s' but expected '%s'"
                        % (fields[0], ids[i]))

                if fields[1] != line[seq_cols]:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                        seq_cols, start)
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                #Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)

                #Read in the next line
                line = handle.readline()
            #There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise StopIteration

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
                   for (i, s) in zip(ids, seqs))
        alignment = MultipleSeqAlignment(records, self.alphabet)
        #TODO - Handle alignment annotation better, for now
        #mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            alignment_length = len(seqs[0])
            assert len(consensus) == alignment_length, \
                   "Alignment length is %i, consensus length is %i, '%s'" \
                   % (alignment_length, len(consensus), consensus)
            alignment._star_info = consensus
        return alignment
Example #36
0
File: IgIO.py Project: cbirdlab/sap
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.
    """
    #Skip any file header text before the first record (;; lines)
    while True:
        line = handle.readline()
        if not line:
            break  # Premature end of file, or just empty?
        if not line.startswith(";;"):
            break

    while line:
        #Now iterate over the records
        if line[0] != ";":
            raise ValueError("Records should start with ';' and not:\n%s" %
                             repr(line))

        #Try and agree with SeqRecord convention from the GenBank parser,
        #(and followed in the SwissProt parser) which stores the comments
        #as a long string with newlines under annotations key 'comment'.

        #Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()
        title = line.rstrip()

        seq_lines = []
        while True:
            line = handle.readline()
            if not line:
                break
            if line[0] == ";":
                break
            #Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
        seq_str = "".join(seq_lines)
        if seq_str.endswith("1"):
            #Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError(
                "Potential terminator digit one found within sequence.")

        #Return the record and then continue...
        record = SeqRecord(Seq(seq_str, alphabet), id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    #We should be at the end of the file now
    assert not line