def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Iterate over the records of a FASTA file, yielding SeqRecord objects.

    Arguments:
     - handle - input file
     - alphabet - optional alphabet
     - title2ids - A function that, when given the title of the FASTA
       file (without the beginning >), will return the id, name and
       description (in that order) for the record as a tuple of strings.
       If this is not given, then the entire title line will be used
       as the description, and the first word as the id and name.

    By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
    with no custom handling of the title lines:

    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle):
    ...         print(record.id)
    ...
    alpha
    beta
    gamma
    alpha
    delta

    However, you can supply a title2ids function to alter this:

    >>> def take_upper(title):
    ...     return title.split(None, 1)[0].upper(), "", title
    >>> with open("Fasta/dups.fasta") as handle:
    ...     for record in FastaIterator(handle, title2ids=take_upper):
    ...         print(record.id)
    ...
    ALPHA
    BETA
    GAMMA
    ALPHA
    DELTA

    """
    if title2ids:
        # Caller decides how the title line maps to id / name / description.
        for header, residues in SimpleFastaParser(handle):
            record_id, record_name, record_descr = title2ids(header)
            yield SeqRecord(Seq(residues, alphabet),
                            id=record_id,
                            name=record_name,
                            description=record_descr)
        return

    # Default: first word of the title is both id and name, and the
    # whole title line is kept as the description.
    for header, residues in SimpleFastaParser(handle):
        words = header.split(None, 1)
        if words:
            first_word = words[0]
        else:
            # Only a completely empty title should produce no words.
            assert not header, repr(header)
            # Should we use SeqRecord default for no ID?
            first_word = ""
        yield SeqRecord(Seq(residues, alphabet),
                        id=first_word,
                        name=first_word,
                        description=header)
def add_sequence(self, descriptor, sequence, start=None, end=None, weight=1.0):
    """Add a sequence to the alignment (DEPRECATED).

    The start, end, and weight arguments are not supported! This method
    only provides limited backwards compatibility with the old
    Bio.Align.Generic.Alignment object. Please use the append method with
    a SeqRecord instead, since add_sequence is likely to be removed in a
    future release of Biopython.

    Arguments:
     - descriptor - used as both the id and the description of the
       SeqRecord that is appended.
     - sequence - sequence data; wrapped in a Seq using self._alphabet.
     - start, end, weight - must be left at their defaults
       (None, None and 1.0).

    A BiopythonDeprecationWarning is always issued. ValueError is raised
    if start, end or weight is given a non-default value.
    """
    import warnings
    import Bio
    # stacklevel=2 attributes the warning to the caller of add_sequence
    # rather than to this line.
    warnings.warn(
        "The start, end, and weight arguments are not supported! This method only provides limited backwards compatibility with the old Bio.Align.Generic.Alignment object. Please use the append method with a SeqRecord instead, as the add_sequence method is likely to be removed in a future release of Biopython.",
        Bio.BiopythonDeprecationWarning,
        stacklevel=2)
    # Should we handle start/end/strand information somehow? What for?
    # TODO - Should we handle weights somehow? See also AlignInfo code...
    if start is not None or end is not None or weight != 1.0:
        # Each fragment ends with a space so that the implicit string
        # concatenation does not glue words together.
        raise ValueError("The add_sequence method is obsolete, and only "
                         "provides limited backwards compatibility. The "
                         "start, end and weight arguments are not "
                         "supported.")
    self.append(
        SeqRecord(Seq(sequence, self._alphabet),
                  id=descriptor,
                  description=descriptor))
def gap_consensus(self, threshold=.7, ambiguous="X", consensus_alpha=None,
                  require_multiple=0):
    """Build a quick consensus like dumb_consensus(), but counting gaps too.

    Every character found in a column (gap characters included) is
    counted. The single most common character is used for that column
    if its share of the counted characters is at least ``threshold``;
    otherwise ``ambiguous`` is used. With ``require_multiple`` set, a
    column for which only one character was counted is always reported
    as ``ambiguous``.

    If ``consensus_alpha`` is None the alphabet is obtained from
    self._guess_consensus_alphabet(ambiguous).

    Things to do:
     - Let the user define that with only one gap, the result
       character in consensus is gap.
     - Let the user select gap character, now
       it takes the same as input.
    """
    # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
    columns = []
    for pos in range(self.alignment.get_alignment_length()):
        # Tally every character present at this column.
        counts = {}
        total = 0
        for record in self.alignment:
            # Sequences may be of different lengths; skip those that
            # have already ended.
            if pos >= len(record.seq):
                continue
            letter = record.seq[pos]
            counts[letter] = counts.get(letter, 0) + 1
            total += 1

        # Collect the character(s) sharing the highest count.
        leaders = []
        top_count = 0
        for letter, count in counts.items():
            if count > top_count:
                leaders = [letter]
                top_count = count
            elif count == top_count:
                leaders.append(letter)

        lone_observation = require_multiple and total == 1
        if (not lone_observation and len(leaders) == 1
                and float(top_count) / float(total) >= threshold):
            columns.append(leaders[0])
        else:
            columns.append(ambiguous)

    consensus = "".join(columns)

    # we need to guess a consensus alphabet if one isn't specified
    if consensus_alpha is None:
        # TODO - Should we make this into a Gapped alphabet?
        consensus_alpha = self._guess_consensus_alphabet(ambiguous)

    return Seq(consensus, consensus_alpha)
def add_sequence(self, descriptor, sequence, start=None, end=None, weight=1.0):
    """Add a sequence to the alignment.

    This doesn't do any kind of alignment, it just adds in the sequence
    object, which is assumed to be prealigned with the existing
    sequences.

    Arguments:
     - descriptor - The descriptive id of the sequence being added.
       This will be used as the resulting SeqRecord's
       .id property (and, for historical compatibility,
       also the .description property)
     - sequence - A string with sequence info.
     - start - You can explicitly set the start point of the sequence.
       This is useful (at least) for BLAST alignments, which can
       just be partial alignments of sequences.
     - end - Specify the end of the sequence, which is important
       for the same reason as the start.
     - weight - The weight to place on the sequence in the alignment.
       By default, all sequences have the same weight. (0.0 =>
       no weight, 1.0 => highest weight)

    The start and end values are stored in the new record's annotations
    under 'start' and 'end' whenever they are not None (so a coordinate
    of 0 is kept); the weight is always stored under 'weight'.
    """
    new_seq = Seq(sequence, self._alphabet)

    # We are now effectively using the SeqRecord's .id as
    # the primary identifier (e.g. in Bio.SeqIO) so we should
    # populate it with the descriptor.
    # For backwards compatibility, also store this in the
    # SeqRecord's description property.
    new_record = SeqRecord(new_seq,
                           id=descriptor,
                           description=descriptor)

    # hack! We really need to work out how to deal with annotations
    # and features in biopython. Right now, I'll just use the
    # generic annotations dictionary we've got to store the start
    # and end, but we should think up something better. I don't know
    # if I'm really a big fan of the LocatableSeq thing they've got
    # in BioPerl, but I'm not positive what the best thing to do on
    # this is...
    # Compare against None rather than relying on truthiness, so that
    # an explicit coordinate of 0 is not silently discarded.
    if start is not None:
        new_record.annotations['start'] = start
    if end is not None:
        new_record.annotations['end'] = end

    # another hack to add weight information to the sequence
    new_record.annotations['weight'] = weight

    self._records.append(new_record)
def _set_seq(self, seq, seq_type):
    """Validate a sequence and normalise it for attribute setting.

    :param seq: sequence to check
    :type seq: string or SeqRecord
    :param seq_type: sequence type
    :type seq_type: string, choice of 'hit' or 'query'

    Returns None when ``seq`` is None. Otherwise returns a SeqRecord whose
    id, description, name and features are taken from this object's
    ``<seq_type>_id``, ``<seq_type>_description`` and
    ``<seq_type>_features`` attributes, and whose alphabet is
    ``self.alphabet``. A SeqRecord passed in is updated in place and
    returned; a string is wrapped in a new SeqRecord.

    Raises TypeError if ``seq`` is neither a string nor a SeqRecord, and
    ValueError if its length differs from the already-set opposite
    sequence.
    """
    assert seq_type in ('hit', 'query')

    # Nothing to validate or wrap.
    if seq is None:
        return seq

    if not isinstance(seq, (basestring, SeqRecord)):
        raise TypeError("%s sequence must be a string or a SeqRecord"
                        " object." % seq_type)

    # check length if the opposite sequence is not None
    if seq_type == 'query':
        opp_type = 'hit'
    else:
        opp_type = 'query'
    other = getattr(self, '_%s' % opp_type, None)
    if other is not None and len(seq) != len(other):
        raise ValueError("Sequence lengths do not match. Expected: "
                         "%r (%s); found: %r (%s)." % (len(other), opp_type,
                                                      len(seq), seq_type))

    new_id = getattr(self, '%s_id' % seq_type)
    new_description = getattr(self, '%s_description' % seq_type)
    new_features = getattr(self, '%s_features' % seq_type)
    new_name = 'aligned %s sequence' % seq_type

    if isinstance(seq, SeqRecord):
        # Re-label the caller's record in place.
        seq.id = new_id
        seq.description = new_description
        seq.name = new_name
        seq.features = new_features
        seq.seq.alphabet = self.alphabet
        return seq

    if isinstance(seq, basestring):
        return SeqRecord(Seq(seq, self.alphabet), id=new_id, name=new_name,
                         description=new_description, features=new_features)

    return seq
def build_hsp():
    """Build a two-record alignment (query and match) from the parsed tags.

    Reads the enclosing parser state (query/match ids, descriptions,
    sequences and tag dictionaries, plus the header, alignment and global
    tags). Returns a MultipleSeqAlignment whose private ``_annotations``
    dict holds the header tags overlaid with the alignment tags.

    Raises ValueError if neither query nor match tags are available.
    """
    if not (query_tags or match_tags):
        raise ValueError("No data for query %r, match %r"
                         % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags
    # Looked up as in the original code; the value is not used below.
    evalue = align_tags.get("fa_expect")
    q = "?"  # Just for printing len(q) in debug below
    m = "?"  # Just for printing len(m) in debug below
    tool = global_tags.get("tool", "").upper()
    try:
        q = _extract_alignment_region(query_seq, query_tags)
        if tool in ["TFASTX"] and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        assert len(q) == len(m)
    except AssertionError as err:
        # Dump the state that led to the length mismatch, then re-raise.
        print("Darn... amino acids vs nucleotide coordinates?")
        print(tool)
        print(query_seq)
        print(query_tags)
        print("%s %i" % (q, len(q)))
        print(match_seq)
        print(match_tags)
        print("%s %i" % (m, len(m)))
        print(handle.name)
        raise err

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    for tag_source in (header_tags, align_tags):
        for key, value in tag_source.items():
            alignment._annotations[key] = value

    def _append_record(letters, tags, record_id, record_name, record_descr):
        """Create one record, append it, then refine its alphabet."""
        record = SeqRecord(
            Seq(letters, alphabet),
            id=record_id,
            name=record_name,
            description=record_descr,
            annotations={"original_length": int(tags["sq_len"])})
        # TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(tags["al_start"])
        record._al_stop = int(tags["al_stop"])
        alignment.append(record)

        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            if tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in letters:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    # Query
    # =====
    _append_record(q, query_tags, query_id, "query", query_descr)

    # Match
    # =====
    _append_record(m, match_tags, match_id, "match", match_descr)

    return alignment
def __next__(self):
    """Parse and return the next alignment from a Stockholm file.

    Reads from self.handle, starting from a header line remembered in
    self._header by the previous call if there is one. Besides returning
    a MultipleSeqAlignment, this stores the identifiers, sequences and
    the #=GS / #=GR annotation dictionaries on self (``ids``,
    ``sequences``, ``seq_annotation``, ``seq_col_annotation``).

    Raises StopIteration at end of file or when no sequences were found,
    and ValueError for a missing header, an unsplittable sequence line,
    an unexpected record count or inconsistent sequence lengths.
    """
    handle = self.handle

    line = self._header
    if line is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        self._header = None

    if not line:
        # Empty file - just give up.
        raise StopIteration
    if line.strip() != '# STOCKHOLM 1.0':
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.

    seqs = {}
    ids = OrderedDict()  # Really only need an OrderedSet, but python lacks this
    gs = {}
    gr = {}
    gf = {}
    passed_end_alignment = False

    while True:
        line = handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n

        if line == '# STOCKHOLM 1.0':
            # Start of the next alignment; remember it for the next call.
            self._header = line
            break
        if line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
            continue
        if line == "":
            # blank line, ignore
            continue

        if line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError("Could not split line into identifier "
                                 "and sequence:\n" + line)
            seq_id, letters = parts
            ids.setdefault(seq_id, True)
            seqs[seq_id] = seqs.get(seq_id, '') + letters.replace(".", "-")
            continue

        # Comment line or meta-data
        if line.startswith("#=GF "):
            # Generic per-File annotation, free text
            # Format: #=GF <feature> <free text>
            feature, text = line[5:].strip().split(None, 1)
            # Each feature key could be used more than once,
            # so store the entries as a list of strings.
            gf.setdefault(feature, []).append(text)
        elif line.startswith('#=GC '):
            # Generic per-Column annotation, exactly 1 char per column
            # Format: "#=GC <feature> <exactly 1 char per column>"
            pass
        elif line.startswith('#=GS '):
            # Generic per-Sequence annotation, free text
            # Format: "#=GS <seqname> <feature> <free text>"
            seq_id, feature, text = line[5:].strip().split(None, 2)
            gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
        elif line.startswith("#=GR "):
            # Generic per-Sequence AND per-Column markup
            # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
            seq_id, feature, text = line[5:].strip().split(None, 2)
            markup = gr.setdefault(seq_id, {})
            # append to any previous entry
            markup[feature] = markup.get(feature, "") + text.strip()
            # TODO - Should we check the length matches the alignment length?
            #        For iterlaced sequences the GR data can be split over
            #        multiple lines
        # Next line...

    assert len(seqs) <= len(ids)
    # assert len(gs)   <= len(ids)
    # assert len(gr)   <= len(ids)

    self.ids = ids.keys()
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        raise StopIteration

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    alignment_length = len(list(seqs.values())[0])
    records = []  # Alignment obj will put them all in a list anyway
    for seq_id in ids:
        letters = seqs[seq_id]
        if alignment_length != len(letters):
            raise ValueError(
                "Sequences have different lengths, or repeated identifier"
            )
        name, start, end = self._identifier_split(seq_id)
        record = SeqRecord(Seq(letters, self.alphabet),
                           id=seq_id, name=name, description=seq_id,
                           annotations={"accession": name})
        # Accession will be overridden by _populate_meta_data if an explicit
        # accession is provided:
        record.annotations["accession"] = name

        if start is not None:
            record.annotations["start"] = start
        if end is not None:
            record.annotations["end"] = end

        self._populate_meta_data(seq_id, record)
        records.append(record)

    alignment = MultipleSeqAlignment(records, self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = gr

    return alignment
def __next__(self):
    """Parse and return the next alignment from a CLUSTAL-style file.

    Reads from self.handle, starting from a header line remembered in
    self._header by the previous call if there is one. The first block
    establishes the sequence identifiers; later blocks must list the
    same identifiers in the same order. If a consensus line is present
    it is stored on the returned alignment as ``_star_info``, and a
    version number found in the header as ``_version``.

    Raises StopIteration at end of file, when the file ends straight
    after the header line, or when no sequence data was found. Raises
    ValueError for an unknown or blank header line, an unparsable
    sequence line, identifiers out of order, or an unexpected number
    of records.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration

    # Whitelisted headers we know about
    known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS', 'Kalign']
    header_words = line.split()
    # A whitespace-only first line has no words at all; report it as a
    # bad header instead of failing with an IndexError.
    header_name = header_words[0] if header_words else repr(line)
    if header_name not in known_headers:
        raise ValueError(
            "%s is not a known CLUSTAL header: %s"
            % (header_name, ", ".join(known_headers)))

    # find the clustal version in the header line
    version = None
    for word in header_words:
        if word[0] == '(' and word[-1] == ')':
            word = word[1:-1]
        if word[0] in '0123456789':
            version = word
            break

    # There should be two blank lines after the header line
    line = handle.readline()
    # readline() returns "" at end of file, which also strips to "", so
    # the end-of-file test must be explicit or this would loop forever.
    while line and line.strip() == "":
        line = handle.readline()
    if not line:
        # Header with nothing after it: no alignment to return.
        raise StopIteration

    # If the alignment contains entries with the same sequence
    # identifier (not a good idea - but seems possible), then this
    # dictionary based parser will merge their sequences.  Fix this?
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  # Used to extract the consensus

    # Use the first block to get the sequence identifiers
    while True:
        if line[0] != " " and line.strip() != "":
            # Sequences identifier...
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            # Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end
            assert fields[1] == line[seq_cols]

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)
        elif line[0] == " ":
            # Sequence consensus line...
            assert len(ids) == len(seqs)
            assert len(ids) > 0
            assert seq_cols is not None
            consensus = line[seq_cols]
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Check for blank line (or end of file)
            line = handle.readline()
            assert line.strip() == ""
            break
        else:
            # No consensus
            break
        line = handle.readline()
        if not line:
            break  # end of file

    assert line.strip() == ""
    assert seq_cols is not None

    # Confirm all same length
    for s in seqs:
        assert len(s) == len(seqs[0])
    if consensus:
        assert len(consensus) == len(seqs[0])

    # Loop over any remaining blocks...
    done = False
    while not done:
        # There should be a blank line between each block.
        # Also want to ignore any consensus line from the
        # previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            # Found concatenated alignment.
            done = True
            self._header = line
            break

        for i in range(len(ids)):
            assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % repr(line))

            if fields[0] != ids[i]:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'"
                    % (fields[0], ids[i]))

            if fields[1] != line[seq_cols]:
                # The sequence column width changed (e.g. a shorter
                # final block); it must still start at the same column.
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                assert start == seq_cols.start, 'Old location %s -> %i:XX' % (
                    seq_cols, start)
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end

            # Append the sequence
            seqs[i] += fields[1]
            assert len(seqs[i]) == len(seqs[0])

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)

            # Read in the next line
            line = handle.readline()
        # There should now be a consensus line
        if consensus:
            assert line[0] == " "
            assert seq_cols is not None
            consensus += line[seq_cols]
            assert len(consensus) == len(seqs[0])
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        raise StopIteration

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
               for (i, s) in zip(ids, seqs))
    alignment = MultipleSeqAlignment(records, self.alphabet)
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        alignment_length = len(seqs[0])
        assert len(consensus) == alignment_length, \
            "Alignment length is %i, consensus length is %i, '%s'" \
            % (alignment_length, len(consensus), consensus)
        alignment._star_info = consensus
    return alignment
def __next__(self):
    """Parse and return the next alignment from a sequential PHYLIP file.

    Reads from self.handle, starting from a header line remembered in
    self._header by the previous call if there is one. The header gives
    the number of sequences and their length; each record's sequence may
    continue over several lines until the declared length is reached.
    Blank lines inside a wrapped sequence are skipped.

    Raises StopIteration at end of file. Raises ValueError if the header
    is not two integers, the record count differs from
    self.records_per_alignment, a record grows longer than the declared
    length, or a sequence contains a dot.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration
    line = line.strip()
    parts = line.split()
    if len(parts) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(line)

    if self.records_per_alignment is not None \
            and self.records_per_alignment != number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (number_of_seqs, self.records_per_alignment))

    ids = []
    seqs = []

    # By default, expects STRICT truncation / padding to 10 characters.
    # Does not require any whitespace between name and seq.
    for _ in range(number_of_seqs):
        line = handle.readline().rstrip()
        sequence_id, s = self._split_id(line)
        ids.append(sequence_id)
        while len(s) < length_of_seqs:
            # The sequence may be split into multiple lines
            line = handle.readline()
            if not line:
                # End of file. This must be tested before stripping,
                # because a blank line also strips to "".
                break
            line = line.strip()
            if not line:
                # Blank line inside the record - keep reading.
                continue
            s = "".join([s, line.replace(" ", "")])
            if len(s) > length_of_seqs:
                raise ValueError(
                    "Found a record of length %i, should be %i"
                    % (len(s), length_of_seqs))
        if "." in s:
            raise ValueError(
                "PHYLIP format no longer allows dots in sequence")
        seqs.append(s)

    while True:
        # Find other alignments in the file
        line = handle.readline()
        if not line:
            break
        if self._is_header(line):
            self._header = line
            break

    records = (SeqRecord(Seq(letters, self.alphabet),
                         id=name, name=name, description=name)
               for (name, letters) in zip(ids, seqs))
    return MultipleSeqAlignment(records, self.alphabet)
def __next__(self):
    """Parse and return the next alignment from an interleaved PHYLIP file.

    Reads from self.handle, starting from a header line remembered in
    self._header by the previous call if there is one. The first block
    carries the identifiers plus the first piece of each sequence; every
    later block carries one further piece per sequence, in the same
    order. A line recognised by self._is_header ends this alignment and
    is remembered for the next call.

    Raises StopIteration at end of file. Raises ValueError if the header
    is not two integers, the record count differs from
    self.records_per_alignment, a sequence contains a dot, or the file
    ends part way through a block.
    """
    handle = self.handle

    saved_header = self._header
    if saved_header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = saved_header
        self._header = None

    if not line:
        raise StopIteration
    line = line.strip()
    fields = [x for x in line.split() if x]
    if len(fields) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(fields[0])
        length_of_seqs = int(fields[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(line)

    expected = self.records_per_alignment
    if expected is not None and expected != number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (number_of_seqs, self.records_per_alignment))

    ids = []
    seqs = []  # one list of sequence pieces per record

    # By default, expects STRICT truncation / padding to 10 characters.
    # Does not require any whitespace between name and seq.
    for _ in range(number_of_seqs):
        line = handle.readline().rstrip()
        sequence_id, piece = self._split_id(line)
        ids.append(sequence_id)
        if "." in piece:
            raise ValueError(
                "PHYLIP format no longer allows dots in sequence")
        seqs.append([piece])

    # Look for further blocks
    line = ""
    while True:
        # Skip any blank lines between blocks...
        while line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if self._is_header(line):
            # Looks like the start of a concatenated alignment
            self._header = line
            break

        # One line per record, in the order seen in the first block.
        for index in range(number_of_seqs):
            piece = line.strip().replace(" ", "")
            if "." in piece:
                raise ValueError(
                    "PHYLIP format no longer allows dots in sequence")
            seqs[index].append(piece)
            line = handle.readline()
            if (not line) and index + 1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line:
            break  # end of file

    records = (SeqRecord(Seq("".join(pieces), self.alphabet),
                         id=name, name=name, description=name)
               for (name, pieces) in zip(ids, seqs))
    return MultipleSeqAlignment(records, self.alphabet)
def dumb_consensus(self, threshold=.7, ambiguous="X", consensus_alpha=None,
                   require_multiple=0):
    """Output a fast consensus sequence of the alignment.

    This doesn't do anything fancy at all. It will just go through the
    sequence residue by residue and count up the number of each type
    of residue (ie. A or G or T or C for DNA) in all sequences in the
    alignment, ignoring the gap characters '-' and '.'. If the fraction
    of the most common residue type is at least the passed threshold,
    then we will add that residue type, otherwise an ambiguous character
    will be added.

    This could be made a lot fancier (ie. to take a substitution matrix
    into account), but it just meant for a quick and dirty consensus.

    Arguments:
     - threshold - The threshold value that is required to add a
       particular atom.
     - ambiguous - The ambiguous character to be added when the
       threshold is not reached.
     - consensus_alpha - The alphabet to return for the consensus
       sequence. If this is None, then we will try to guess the
       alphabet.
     - require_multiple - If set as 1, this will require that more than
       1 sequence be part of an alignment to put it in the consensus
       (ie. not just 1 sequence and gaps).

    """
    # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X"
    columns = []
    for pos in range(self.alignment.get_alignment_length()):
        # Tally the non-gap residues present at this column.
        counts = {}
        total = 0
        for record in self.alignment:
            # Sequences may be of different lengths; skip those that
            # have already ended.
            if pos >= len(record.seq):
                continue
            residue = record.seq[pos]
            if residue == '-' or residue == '.':
                continue
            counts[residue] = counts.get(residue, 0) + 1
            total = total + 1

        # Collect the residue(s) sharing the highest count.
        leaders = []
        top_count = 0
        for residue, count in counts.items():
            if count > top_count:
                leaders = [residue]
                top_count = count
            elif count == top_count:
                leaders.append(residue)

        lone_observation = require_multiple and total == 1
        if (not lone_observation and len(leaders) == 1
                and float(top_count) / float(total) >= threshold):
            columns.append(leaders[0])
        else:
            columns.append(ambiguous)

    consensus = "".join(columns)

    # we need to guess a consensus alphabet if one isn't specified
    if consensus_alpha is None:
        consensus_alpha = self._guess_consensus_alphabet(ambiguous)

    return Seq(consensus, consensus_alpha)
def __next__(self):
    """Parse and return the next alignment from EMBOSS pairs/simple output.

    Reads from self.handle, starting from a header line remembered in
    self._header by the previous call if there is one. Lines are skipped
    up to the "#=====" ruler, the "#" header lines that follow supply
    the number of sequences, their identifiers and the alignment length,
    and the sequence lines are then accumulated record by record.

    Raises StopIteration if no further alignment header is found.
    Raises ValueError if the sequence count or length is missing from
    the header (including when the file ends inside the header), if the
    record count differs from self.records_per_alignment, if a line is
    neither sequence data nor blank, or if a parsed sequence does not
    have the declared length.
    """
    handle = self.handle

    if self._header is None:
        line = handle.readline()
    else:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        self._header = None

    if not line:
        raise StopIteration

    while line.rstrip() != "#=======================================":
        line = handle.readline()
        if not line:
            raise StopIteration

    length_of_seqs = None
    number_of_seqs = None
    ids = []

    # startswith() is False for the "" returned at end of file, so a
    # file truncated inside the header falls through to the "missing"
    # errors below instead of raising IndexError.
    while line.startswith("#"):
        # Read in the rest of this alignment header,
        # try and discover the number of records expected
        # and their length
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers...
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        if key == "length":
            length_of_seqs = int(parts[1].strip())

        # And read in another line...
        line = handle.readline()

    if number_of_seqs is None:
        raise ValueError("Number of sequences missing!")
    if length_of_seqs is None:
        raise ValueError("Length of sequences missing!")

    if self.records_per_alignment is not None \
            and self.records_per_alignment != number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (number_of_seqs, self.records_per_alignment))

    seqs = ["" for _ in ids]
    seq_starts = []
    index = 0

    # Parse the seqs
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                # identifier, seq start position, seq, seq end position
                # (an aligned seq is broken up into multiple lines)
                name, start = id_start
                seq, end = seq_end
                if start == end:
                    # Special case, either a single letter is present,
                    # or no letters at all.
                    if seq.replace("-", "") == "":
                        start = int(start)
                        end = int(end)
                    else:
                        start = int(start) - 1
                        end = int(end)
                else:
                    assert seq.replace("-", "") != "", repr(line)
                    start = int(start) - 1  # python counting
                    end = int(end)

                # The identifier is truncated...
                assert 0 <= index and index < number_of_seqs, \
                    "Expected index %i in range [0,%i)" \
                    % (index, number_of_seqs)
                assert name == ids[index] or name == ids[index][:len(name)]

                if len(seq_starts) == index:
                    # Record the start
                    seq_starts.append(start)

                # Check the start...
                if start == end:
                    assert seq.replace("-", "") == "", line
                else:
                    assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \
                        "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                        % (len(seqs[index].replace("-", "")), index, name, repr(seqs[index]),
                           start, line)

                seqs[index] += seq

                # Check the end ...
                assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \
                    "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                    % (len(seqs[index].replace("-", "")), index, name, repr(seqs[index]),
                       seq_starts[index], end, line)

                index += 1
                if index >= number_of_seqs:
                    index = 0
            else:
                # just a start value, this is just alignment annotation (?)
                # print "Skipping: " + line.rstrip()
                pass
        elif line.strip() == "":
            # Just a spacer?
            pass
        else:
            # Report the offending line through the exception rather
            # than printing it and relying on an assert.
            raise ValueError("Unexpected line in alignment:\n%s" % line)

        line = handle.readline()
        if line.rstrip() == "#---------------------------------------" \
                or line.rstrip() == "#=======================================":
            # End of alignment
            self._header = line
            break

    assert index == 0

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    records = []
    for name, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            # EMBOSS 2.9.0 is known to use spaces instead of minus signs
            # for leading gaps, and thus fails to parse. This old version
            # is still used as of Dec 2008 behind the EBI SOAP webservice:
            # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
            raise ValueError("Error parsing alignment - sequences of "
                             "different length? You could be using an "
                             "old version of EMBOSS.")
        records.append(
            SeqRecord(Seq(seq, self.alphabet), id=name, description=name))
    return MultipleSeqAlignment(records, self.alphabet)