예제 #1
0
파일: Record.py 프로젝트: cbirdlab/sap
    def to_generic(self, alphabet):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from SAP.Bio.Align, through which you can manipulate and query
        the object.

        alphabet is the specified alphabet for the sequences in the code (for
        example IUPAC.IUPACProtein).

        Thanks to James Casbon for the code.
        """
        #TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
        seq_parts = []
        seq_names = []
        parse_number = 0
        n = 0
        for name, start, seq, end in self.alignment:
            if name == 'QUERY':  # QUERY is the first in each alignment block
                parse_number += 1
                n = 0

            if parse_number == 1:  # create on first_parse, append on all others
                seq_parts.append(seq)
                seq_names.append(name)
            else:
                seq_parts[n] += seq
                n += 1

        generic = MultipleSeqAlignment([], alphabet)
        for (name, seq) in zip(seq_names, seq_parts):
            generic.append(SeqRecord(Seq(seq, alphabet), name))

        return generic
예제 #2
0
 def __getitem__(self, index, alphabet=None):
     """Return a CodonAlignment object for single indexing
     """
     if isinstance(index, int):
         return self._records[index]
     elif isinstance(index, slice):
         return CodonAlignment(self._records[index], self._alphabet)
     elif len(index) != 2:
         raise TypeError("Invalid index type.")
     # Handle double indexing
     row_index, col_index = index
     if isinstance(row_index, int):
         return self._records[row_index][col_index]
     elif isinstance(col_index, int):
         return "".join(str(rec[col_index]) for rec in \
                                                 self._records[row_index])
     else:
         if alphabet is None:
             from SAP.Bio.Alphabet import generic_nucleotide
             return MultipleSeqAlignment((rec[col_index] for rec in \
                                                 self._records[row_index]),
                                          generic_nucleotide)
         else:
             return MultipleSeqAlignment((rec[col_index] for rec in \
                                                 self._records[row_index]),
                                          generic_nucleotide)
예제 #3
0
파일: Record.py 프로젝트: kaspermunch/sap
    def to_generic(self, alphabet):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from SAP.Bio.Align, through which you can manipulate and query
        the object.

        alphabet is the specified alphabet for the sequences in the code (for
        example IUPAC.IUPACProtein).

        Thanks to James Casbon for the code.
        """
        #TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
        seq_parts = []
        seq_names = []
        parse_number = 0
        n = 0
        for name, start, seq, end in self.alignment:
            if name == 'QUERY':  # QUERY is the first in each alignment block
                parse_number += 1
                n = 0

            if parse_number == 1:  # create on first_parse, append on all others
                seq_parts.append(seq)
                seq_names.append(name)
            else:
                seq_parts[n] += seq
                n += 1

        generic = MultipleSeqAlignment([], alphabet)
        for (name, seq) in zip(seq_names, seq_parts):
            generic.append(SeqRecord(Seq(seq, alphabet), name))

        return generic
예제 #4
0
 def _aln_get(self):
     if self.query is None and self.hit is None:
         return None
     elif self.hit is None:
         return MultipleSeqAlignment([self.query], self.alphabet)
     elif self.query is None:
         return MultipleSeqAlignment([self.hit], self.alphabet)
     else:
         return MultipleSeqAlignment([self.query, self.hit], self.alphabet)
예제 #5
0
    def __init__(self, records='', name=None, alphabet=default_codon_alphabet):

        MultipleSeqAlignment.__init__(self, records, alphabet=alphabet)

        # check the type of the alignment to be nucleotide
        for rec in self:
            if not isinstance(rec.seq, CodonSeq):
                raise TypeError("CodonSeq object are expected in each "
                                "SeqRecord in CodonAlignment")

        assert self.get_alignment_length() % 3 == 0, \
            "Alignment length is not a triple number"
예제 #6
0
    def __init__(self, records='', name=None, alphabet=default_codon_alphabet):

        MultipleSeqAlignment.__init__(self, records, alphabet=alphabet)

        # check the type of the alignment to be nucleotide
        for rec in self:
            if not isinstance(rec.seq, CodonSeq):
                raise TypeError("CodonSeq object are expected in each "
                                "SeqRecord in CodonAlignment")

        assert self.get_alignment_length() % 3 == 0, \
            "Alignment length is not a triple number"
예제 #7
0
파일: PhyloXML.py 프로젝트: kaspermunch/sap
 def to_alignment(self):
     """Construct an alignment from the aligned sequences in this tree."""
     def is_aligned_seq(elem):
         if isinstance(elem, Sequence) and elem.mol_seq.is_aligned:
             return True
         return False
     seqs = self._filter_search(is_aligned_seq, 'preorder', True)
     try:
         first_seq = next(seqs)
     except StopIteration:
         # No aligned sequences were found --> empty MSA
         return MultipleSeqAlignment([])
     msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                first_seq.get_alphabet())
     msa.extend(seq.to_seqrecord() for seq in seqs)
     return msa
예제 #8
0
def NexusIterator(handle, seq_count=None):
    """Returns SeqRecord objects from a Nexus file.

    Thus uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one MultipleSeqAlignment.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        #No alignment found
        raise StopIteration

    #Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    #The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i"
               % (len(n.unaltered_taxlabels), seq_count))

    #ToDo - Can we extract any annotation too?
    records = (SeqRecord(n.matrix[new_name], id=new_name,
                         name=old_name, description="")
               for old_name, new_name
               in zip(n.unaltered_taxlabels, n.taxlabels))
    #All done
    yield MultipleSeqAlignment(records, n.alphabet)
예제 #9
0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)"""
    mult_align_dict = {}
    for j in align_dict.abs(1).pos_align_dict:
        mult_align_dict[j] = ''

    for i in range(1, len(align_dict)+1):
        # loop on positions
        for j in align_dict.abs(i).pos_align_dict:
            # loop within a position
            mult_align_dict[j] += align_dict.abs(i).pos_align_dict[j].aa
    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for i in sorted(mult_align_dict):
        fssp_align.append(SeqRecord(Seq(mult_align_dict[i], alpha),
                                    sum_dict[i].pdb2+sum_dict[i].chain2))
    return fssp_align
예제 #10
0
 def toMultipleSeqAlignment(self):
     """Return a MultipleSeqAlignment containing all the
     SeqRecord in the CodonAlignment using Seq to store 
     sequences
     """
     alignments = [SeqRecord(rec.seq.toSeq(), id=rec.id) for \
             rec in self._records]
     return MultipleSeqAlignment(alignments)
예제 #11
0
    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree."""
        def is_aligned_seq(elem):
            if isinstance(elem, Sequence) and elem.mol_seq.is_aligned:
                return True
            return False

        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        try:
            first_seq = next(seqs)
        except StopIteration:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa
예제 #12
0
파일: __init__.py 프로젝트: cbirdlab/sap
def write(sequences, handle, format):
    """Write complete set of sequences to a file.

     - sequences - A list (or iterator) of SeqRecord objects, or (if using
                   Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).
    """
    from SAP.Bio import AlignIO

    #Try and give helpful error messages:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    if isinstance(sequences, SeqRecord):
        #This raised an exception in order version of Biopython
        sequences = [sequences]

    if format in _BinaryFormats:
        mode = 'wb'
    else:
        mode = 'w'

    with as_handle(handle, mode) as fp:
        #Map the file format to a writer class
        if format in _FormatToWriter:
            writer_class = _FormatToWriter[format]
            count = writer_class(fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            #Try and turn all the records into a single alignment,
            #and write that using Bio.AlignIO
            alignment = MultipleSeqAlignment(sequences)
            alignment_count = AlignIO.write([alignment], fp, format)
            assert alignment_count == 1, \
                "Internal error - the underlying writer " \
                " should have returned 1, not %s" % repr(alignment_count)
            count = len(alignment)
            del alignment_count, alignment
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        assert isinstance(count, int), "Internal error - the underlying %s " \
            "writer should have returned the record count, not %s" \
            % (format, repr(count))

    return count
예제 #13
0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)"""
    mult_align_dict = {}
    for j in align_dict.abs(1).pos_align_dict:
        mult_align_dict[j] = ''

    for i in range(1, len(align_dict) + 1):
        # loop on positions
        for j in align_dict.abs(i).pos_align_dict:
            # loop within a position
            mult_align_dict[j] += align_dict.abs(i).pos_align_dict[j].aa
    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for i in sorted(mult_align_dict):
        fssp_align.append(
            SeqRecord(Seq(mult_align_dict[i], alpha),
                      sum_dict[i].pdb2 + sum_dict[i].chain2))
    return fssp_align
예제 #14
0
파일: __init__.py 프로젝트: cbirdlab/sap
def _SeqIO_to_alignment_iterator(handle,
                                 format,
                                 alphabet=None,
                                 seq_count=None):
    """Uses Bio.SeqIO to create an MultipleSeqAlignment iterator (PRIVATE).

    Arguments:
     - handle    - handle to the file.
     - format    - string describing the file format.
     - alphabet  - optional Alphabet object, useful when the sequence type
                   cannot be automatically inferred from the file itself
                   (e.g. fasta, phylip, clustal)
     - seq_count - Optional integer, number of sequences expected in each
                   alignment.  Recommended for fasta format files.

    If count is omitted (default) then all the sequences in the file are
    combined into a single MultipleSeqAlignment.
    """
    from SAP.Bio import SeqIO
    assert format in SeqIO._FormatToIterator

    if seq_count:
        #Use the count to split the records into batches.
        seq_record_iterator = SeqIO.parse(handle, format, alphabet)

        records = []
        for record in seq_record_iterator:
            records.append(record)
            if len(records) == seq_count:
                yield MultipleSeqAlignment(records, alphabet)
                records = []
        if len(records) > 0:
            raise ValueError("Check seq_count argument, not enough sequences?")
    else:
        #Must assume that there is a single alignment using all
        #the SeqRecord objects:
        records = list(SeqIO.parse(handle, format, alphabet))
        if records:
            yield MultipleSeqAlignment(records, alphabet)
    raise StopIteration
예제 #15
0
    def __next__(self):
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        line = line.strip()
        parts = [x for x in line.split() if x]
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            while len(s) < length_of_seqs:
                # The sequence may be split into multiple lines
                line = handle.readline().strip()
                if not line:
                    break
                if line == "":
                    continue
                s = "".join([s, line.strip().replace(" ", "")])
                if len(s) > length_of_seqs:
                    raise ValueError(
                        "Found a record of length %i, should be %i" %
                        (len(s), length_of_seqs))
            if "." in s:
                raise ValueError(
                    "PHYLIP format no longer allows dots in sequence")
            seqs.append(s)
        while True:
            # Find other alignments in the file
            line = handle.readline()
            if not line:
                break
            if self._is_header(line):
                self._header = line
                break

        records = (SeqRecord(Seq(s, self.alphabet),
                             id=i,
                             name=i,
                             description=i) for (i, s) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
예제 #16
0
    def __next__(self):
        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        line = line.strip()
        parts = [x for x in line.split() if x]
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        assert self._is_header(line)

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            if "." in s:
                raise ValueError(
                    "PHYLIP format no longer allows dots in sequence")
            seqs.append([s])

        #Look for further blocks
        line = ""
        while True:
            #Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                #Looks like the start of a concatenated alignment
                self._header = line
                break

            #print "New block..."
            for i in range(number_of_seqs):
                s = line.strip().replace(" ", "")
                if "." in s:
                    raise ValueError(
                        "PHYLIP format no longer allows dots in sequence")
                seqs[i].append(s)
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        records = (SeqRecord(Seq("".join(s), self.alphabet),
                             id=i,
                             name=i,
                             description=i) for (i, s) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
예제 #17
0
    def __next__(self):
        handle = self.handle
        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        #Whitelisted headers we know about
        known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS']
        if line.strip().split()[0] not in known_headers:
            raise ValueError("%s is not a known CLUSTAL header: %s" %
                             (line.strip().split()[0],
                              ", ".join(known_headers)))

        # find the clustal version in the header line
        version = None
        for word in line.split():
            if word[0] == '(' and word[-1] == ')':
                word = word[1:-1]
            if word[0] in '0123456789':
                version = word
                break

        #There should be two blank lines after the header line
        line = handle.readline()
        while line.strip() == "":
            line = handle.readline()

        #If the alignment contains entries with the same sequence
        #identifier (not a good idea - but seems possible), then this
        #dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  # Used to extract the consensus

        #Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                #Sequences identifier...
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                #Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
            elif line[0] == " ":
                #Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                #No consensus
                break
            line = handle.readline()
            if not line:
                break  # end of file

        assert line.strip() == ""
        assert seq_cols is not None

        #Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        #Loop over any remaining blocks...
        done = False
        while not done:
            #There should be a blank line between each block.
            #Also want to ignore any consensus line from the
            #previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line:
                    break # end of file
            if not line:
                break # end of file

            if line.split(None, 1)[0] in known_headers:
                #Found concatenated alignment.
                done = True
                self._header = line
                break

            for i in range(len(ids)):
                assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % repr(line))

                if fields[0] != ids[i]:
                    raise ValueError("Identifiers out of order? Got '%s' but expected '%s'"
                                      % (fields[0], ids[i]))

                if fields[1] != line[seq_cols]:
                    start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
                    assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start)
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                #Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)

                #Read in the next line
                line = handle.readline()
            #There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise StopIteration

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), self.records_per_alignment))

        records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
                   for (i, s) in zip(ids, seqs))
        alignment = MultipleSeqAlignment(records, self.alphabet)
        #TODO - Handle alignment annotation better, for now
        #mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            alignment_length = len(seqs[0])
            assert len(consensus) == alignment_length, \
                   "Alignment length is %i, consensus length is %i, '%s'" \
                   % (alignment_length, len(consensus), consensus)
            alignment._star_info = consensus
        return alignment
예제 #18
0
    def __next__(self):
        try:
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            #Empty file - just give up.
            raise StopIteration
        if not line.strip() == '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = self.handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == '# STOCKHOLM 1.0':
                self._header = line
                break
            elif line == "//":
                #The "//" line indicates the end of the alignment.
                #There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                #blank line, ignore
                pass
            elif line[0] != "#":
                #Sequence
                #Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    #This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier "
                                      + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, '')
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                #Comment line or meta-data
                if line[:5] == "#=GF ":
                    #Generic per-File annotation, free text
                    #Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    #Each feature key could be used more than once,
                    #so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    #Generic per-Column annotation, exactly 1 char per column
                    #Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == '#=GS ':
                    #Generic per-Sequence annotation, free text
                    #Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    #Generic per-Sequence AND per-Column markup
                    #Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    #if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip()  # append to any previous entry
                    #TODO - Should we check the length matches the alignment length?
                    #       For iterlaced sequences the GR data can be split over
                    #       multiple lines
            #Next line...

        assert len(seqs) <= len(ids)
        #assert len(gs)   <= len(ids)
        #assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
                raise ValueError("Found %i records in this alignment, told to expect %i"
                                 % (len(ids), self.records_per_alignment))

            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError("Sequences have different lengths, or repeated identifier")
                name, start, end = self._identifier_split(id)
                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=id, name=name, description=id,
                                   annotations={"accession": name})
                #Accession will be overridden by _populate_meta_data if an explicit
                #accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            #TODO - Introduce an annotated alignment class?
            #For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
예제 #19
0
    def __next__(self):
        try:
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            # Empty file - just give up.
            raise StopIteration
        if not line.strip() == "# STOCKHOLM 1.0":
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = self.handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == "# STOCKHOLM 1.0":
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " + "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, "")
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == "#=GC ":
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == "#=GS ":
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip()  # append to any previous entry
                    # TODO - Should we check the length matches the alignment length?
                    #       For iterlaced sequences the GR data can be split over
                    #       multiple lines
            # Next line...

        assert len(seqs) <= len(ids)
        # assert len(gs)   <= len(ids)
        # assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if self.records_per_alignment is not None and self.records_per_alignment != len(ids):
                raise ValueError(
                    "Found %i records in this alignment, told to expect %i" % (len(ids), self.records_per_alignment)
                )

            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError("Sequences have different lengths, or repeated identifier")
                name, start, end = self._identifier_split(id)
                record = SeqRecord(
                    Seq(seq, self.alphabet), id=id, name=name, description=id, annotations={"accession": name}
                )
                # Accession will be overridden by _populate_meta_data if an explicit
                # accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            # TODO - Introduce an annotated alignment class?
            # For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
예제 #20
0
파일: FastaIO.py 프로젝트: kaspermunch/sap
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r"
                             % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect", None)
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(Seq(q, alphabet),
                           id=query_id,
                           name="query",
                           description=query_descr,
                           annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(Seq(m, alphabet),
                           id=match_id,
                           name="match",
                           description=match_descr,
                           annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
예제 #21
0
    def __next__(self):

        handle = self.handle

        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        while line.rstrip() != "#=======================================":
            line = handle.readline()
            if not line:
                raise StopIteration

        length_of_seqs = None
        number_of_seqs = None
        ids = []
        seqs = []

        while line[0] == "#":
            #Read in the rest of this alignment header,
            #try and discover the number of records expected
            #and their length
            parts = line[1:].split(":", 1)
            key = parts[0].lower().strip()
            if key == "aligned_sequences":
                number_of_seqs = int(parts[1].strip())
                assert len(ids) == 0
                # Should now expect the record identifiers...
                for i in range(number_of_seqs):
                    line = handle.readline()
                    parts = line[1:].strip().split(":", 1)
                    assert i+1 == int(parts[0].strip())
                    ids.append(parts[1].strip())
                assert len(ids) == number_of_seqs
            if key == "length":
                length_of_seqs = int(parts[1].strip())

            #And read in another line...
            line = handle.readline()

        if number_of_seqs is None:
            raise ValueError("Number of sequences missing!")
        if length_of_seqs is None:
            raise ValueError("Length of sequences missing!")

        if self.records_per_alignment is not None \
        and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (number_of_seqs, self.records_per_alignment))

        seqs = ["" for id in ids]
        seq_starts = []
        index = 0

        #Parse the seqs
        while line:
            if len(line) > 21:
                id_start = line[:21].strip().split(None, 1)
                seq_end = line[21:].strip().split(None, 1)
                if len(id_start) == 2 and len(seq_end) == 2:
                    #identifier, seq start position, seq, seq end position
                    #(an aligned seq is broken up into multiple lines)
                    id, start = id_start
                    seq, end = seq_end
                    if start == end:
                        #Special case, either a single letter is present,
                        #or no letters at all.
                        if seq.replace("-", "") == "":
                            start = int(start)
                            end = int(end)
                        else:
                            start = int(start) - 1
                            end = int(end)
                    else:
                        assert seq.replace("-", "") != "", repr(line)
                        start = int(start) - 1  # python counting
                        end = int(end)

                    #The identifier is truncated...
                    assert 0 <= index and index < number_of_seqs, \
                           "Expected index %i in range [0,%i)" \
                           % (index, number_of_seqs)
                    assert id == ids[index] or id == ids[index][:len(id)]

                    if len(seq_starts) == index:
                        #Record the start
                        seq_starts.append(start)

                    #Check the start...
                    if start == end:
                        assert seq.replace("-", "") == "", line
                    else:
                        assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]),
                               start, line)

                    seqs[index] += seq

                    #Check the end ...
                    assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                            % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]),
                               seq_starts[index], end, line)

                    index += 1
                    if index >= number_of_seqs:
                        index = 0
                else:
                    #just a start value, this is just alignment annotation (?)
                    #print "Skipping: " + line.rstrip()
                    pass
            elif line.strip() == "":
                #Just a spacer?
                pass
            else:
                print(line)
                assert False

            line = handle.readline()
            if line.rstrip() == "#---------------------------------------" \
            or line.rstrip() == "#=======================================":
                #End of alignment
                self._header = line
                break

        assert index == 0

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (len(ids), self.records_per_alignment))

        records = []
        for id, seq in zip(ids, seqs):
            if len(seq) != length_of_seqs:
                #EMBOSS 2.9.0 is known to use spaces instead of minus signs
                #for leading gaps, and thus fails to parse.  This old version
                #is still used as of Dec 2008 behind the EBI SOAP webservice:
                #http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
                raise ValueError("Error parsing alignment - sequences of "
                                 "different length? You could be using an "
                                 "old version of EMBOSS.")
            records.append(SeqRecord(Seq(seq, self.alphabet),
                                     id=id, description=id))
        return MultipleSeqAlignment(records, self.alphabet)
예제 #22
0
    def __next__(self):
        handle = self.handle
        try:
            #Header we saved from when we were parsing
            #the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()
        if not line:
            raise StopIteration

        #Whitelisted headers we know about
        known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS']
        if line.strip().split()[0] not in known_headers:
            raise ValueError(
                "%s is not a known CLUSTAL header: %s" %
                (line.strip().split()[0], ", ".join(known_headers)))

        # find the clustal version in the header line
        version = None
        for word in line.split():
            if word[0] == '(' and word[-1] == ')':
                word = word[1:-1]
            if word[0] in '0123456789':
                version = word
                break

        #There should be two blank lines after the header line
        line = handle.readline()
        while line.strip() == "":
            line = handle.readline()

        #If the alignment contains entries with the same sequence
        #identifier (not a good idea - but seems possible), then this
        #dictionary based parser will merge their sequences.  Fix this?
        ids = []
        seqs = []
        consensus = ""
        seq_cols = None  # Used to extract the consensus

        #Use the first block to get the sequence identifiers
        while True:
            if line[0] != " " and line.strip() != "":
                #Sequences identifier...
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % line)

                ids.append(fields[0])
                seqs.append(fields[1])

                #Record the sequence position to get the consensus
                if seq_cols is None:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end
                assert fields[1] == line[seq_cols]

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(fields[1].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)
            elif line[0] == " ":
                #Sequence consensus line...
                assert len(ids) == len(seqs)
                assert len(ids) > 0
                assert seq_cols is not None
                consensus = line[seq_cols]
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Check for blank line (or end of file)
                line = handle.readline()
                assert line.strip() == ""
                break
            else:
                #No consensus
                break
            line = handle.readline()
            if not line:
                break  # end of file

        assert line.strip() == ""
        assert seq_cols is not None

        #Confirm all same length
        for s in seqs:
            assert len(s) == len(seqs[0])
        if consensus:
            assert len(consensus) == len(seqs[0])

        #Loop over any remaining blocks...
        done = False
        while not done:
            #There should be a blank line between each block.
            #Also want to ignore any consensus line from the
            #previous block.
            while (not line) or line.strip() == "":
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if line.split(None, 1)[0] in known_headers:
                #Found concatenated alignment.
                done = True
                self._header = line
                break

            for i in range(len(ids)):
                assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
                fields = line.rstrip().split()

                #We expect there to be two fields, there can be an optional
                #"sequence number" field containing the letter count.
                if len(fields) < 2 or len(fields) > 3:
                    raise ValueError("Could not parse line:\n%s" % repr(line))

                if fields[0] != ids[i]:
                    raise ValueError(
                        "Identifiers out of order? Got '%s' but expected '%s'"
                        % (fields[0], ids[i]))

                if fields[1] != line[seq_cols]:
                    start = len(fields[0]) + line[len(fields[0]):].find(
                        fields[1])
                    assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                        seq_cols, start)
                    end = start + len(fields[1])
                    seq_cols = slice(start, end)
                    del start, end

                #Append the sequence
                seqs[i] += fields[1]
                assert len(seqs[i]) == len(seqs[0])

                if len(fields) == 3:
                    #This MAY be an old style file with a letter count...
                    try:
                        letters = int(fields[2])
                    except ValueError:
                        raise ValueError(
                            "Could not parse line, bad sequence number:\n%s" %
                            line)
                    if len(seqs[i].replace("-", "")) != letters:
                        raise ValueError(
                            "Could not parse line, invalid sequence number:\n%s"
                            % line)

                #Read in the next line
                line = handle.readline()
            #There should now be a consensus line
            if consensus:
                assert line[0] == " "
                assert seq_cols is not None
                consensus += line[seq_cols]
                assert len(consensus) == len(seqs[0])
                assert not line[:seq_cols.start].strip()
                assert not line[seq_cols.stop:].strip()
                #Read in the next line
                line = handle.readline()

        assert len(ids) == len(seqs)
        if len(seqs) == 0 or len(seqs[0]) == 0:
            raise StopIteration

        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i" %
                (len(ids), self.records_per_alignment))

        records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
                   for (i, s) in zip(ids, seqs))
        alignment = MultipleSeqAlignment(records, self.alphabet)
        #TODO - Handle alignment annotation better, for now
        #mimic the old parser in Bio.Clustalw
        if version:
            alignment._version = version
        if consensus:
            alignment_length = len(seqs[0])
            assert len(consensus) == alignment_length, \
                   "Alignment length is %i, consensus length is %i, '%s'" \
                   % (alignment_length, len(consensus), consensus)
            alignment._star_info = consensus
        return alignment
예제 #23
0
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect", None)
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment