def next(self):
    """Parse and return the next alignment read from ``self.handle``.

    One alignment consists of a header line whose first word is one of
    the known program names, a first block of sequence lines that defines
    the identifiers (optionally followed by a consensus line), and any
    number of further blocks repeating those identifiers in the same
    order.  If the header of a following, concatenated alignment is met
    it is stored on ``self._header`` so the next call resumes from it.

    Returns the populated ``Alignment``; its ``_version`` attribute is set
    when the header carries a parenthesised version starting with a digit
    and ``_star_info`` is set when consensus lines were present.  Returns
    ``None`` when the handle is exhausted, or when the header is followed
    only by blank lines up to the end of the file.

    Raises ``ValueError`` when the header is missing or not recognised, a
    sequence line cannot be parsed (including a file that ends part-way
    through a block), identifiers are out of order, an old-style letter
    count is wrong, or the number of records differs from
    ``self.records_per_alignment``.  The remaining layout checks are
    ``assert`` statements and therefore raise ``AssertionError``.
    """
    #NOTE(review): the assert based format checks disappear when Python
    #runs with -O; they are kept as asserts here so that callers still
    #see the same exception type.
    handle = self.handle
    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    #Whitelisted headers we know about
    known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
    header_words = line.split()
    #A blank or whitespace only first line has no header word at all;
    #report that as a bad header rather than failing with IndexError.
    if header_words:
        header_name = header_words[0]
    else:
        header_name = ""
    if header_name not in known_headers:
        raise ValueError("%s is not a known CLUSTAL header: %s" %
                         (header_name, ", ".join(known_headers)))

    # find the clustal version in the header line
    version = None
    for word in header_words:
        if word[0] == '(' and word[-1] == ')':
            word = word[1:-1]
            #A bare "()" leaves an empty string, which holds no version.
            if word and word[0] in '0123456789':
                version = word
                break

    #There should be two blank lines after the header line
    line = handle.readline()
    #readline() returns "" at end of file, which also strips to "", so
    #the end of file must be tested explicitly or this loop never ends.
    while line and line.strip() == "":
        line = handle.readline()
    if not line:
        #Header with no sequence data at all - nothing to return.
        return None

    #Identifiers are kept in a list (not a dictionary), so entries
    #sharing an identifier are stored as separate records.
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  #: Used to extract the consensus

    #Use the first block to get the sequence identifiers
    while True:
        if line[0] != " " and line.strip() != "":
            #Sequences identifier...
            fields = line.rstrip().split()

            #We expect there to be two fields, there can be an optional
            #"sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            #Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) \
                        + line[len(fields[0]):].find(fields[1])
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end
            assert fields[1] == line[seq_cols]

            if len(fields) == 3:
                #This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
        elif line[0] == " ":
            #Sequence consensus line...
            assert len(ids) == len(seqs)
            assert len(ids) > 0
            assert seq_cols is not None
            consensus = line[seq_cols]
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            #Check for blank line (or end of file)
            line = handle.readline()
            assert line.strip() == ""
            break
        else:
            #No consensus
            break
        line = handle.readline()
        if not line:
            break  #end of file

    assert line.strip() == ""
    assert seq_cols is not None

    #Confirm all same length
    for s in seqs:
        assert len(s) == len(seqs[0])
    if consensus:
        assert len(consensus) == len(seqs[0])

    #Loop over any remaining blocks...
    while True:
        #There should be a blank line between each block.
        #Also want to ignore any consensus line from the
        #previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            #Found concatenated alignment, keep its header for next time.
            self._header = line
            break

        for i, expected_id in enumerate(ids):
            #Slicing (not indexing) so that an empty line at end of file
            #falls through to the "Could not parse line" error below.
            assert line[:1] != " ", "Unexpected line:\n%s" % repr(line)
            fields = line.rstrip().split()

            #We expect there to be two fields, there can be an optional
            #"sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % repr(line))

            if fields[0] != expected_id:
                raise ValueError("Identifiers out of order? Got '%s' but expected '%s'"
                                 % (fields[0], expected_id))

            if fields[1] != line[seq_cols]:
                #Sequence text has a different width in this block (the
                #start column must not move), so update the slice.
                start = len(fields[0]) \
                        + line[len(fields[0]):].find(fields[1])
                assert start == seq_cols.start, \
                       'Old location %s -> %i:XX' % (seq_cols, start)
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end

            #Append the sequence
            seqs[i] += fields[1]
            assert len(seqs[i]) == len(seqs[0])

            if len(fields) == 3:
                #This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)

            #Read in the next line
            line = handle.readline()
        #There should now be a consensus line
        if consensus:
            #Slicing keeps this an assertion failure (not an IndexError)
            #when the file ends before the consensus line.
            assert line[:1] == " "
            assert seq_cols is not None
            consensus += line[seq_cols]
            assert len(consensus) == len(seqs[0])
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            #Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        return None

    if self.records_per_alignment is not None \
    and self.records_per_alignment != len(ids):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    alignment_length = len(seqs[0])
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != alignment_length:
            raise ValueError("Error parsing alignment - sequences of different length?")
        alignment.add_sequence(seq_id, seq)
    #TODO - Handle alignment annotation better, for now
    #mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        assert len(consensus) == alignment_length, \
               "Alignment length is %i, consensus length is %i, '%s'" \
               % (alignment_length, len(consensus), consensus)
        alignment._star_info = consensus
    return alignment
def next(self):
    """Parse and return the next alignment read from ``self.handle``.

    One alignment consists of a header line whose first word is one of
    the known program names, a first block of sequence lines that defines
    the identifiers (optionally followed by a consensus line), and any
    number of further blocks repeating those identifiers in the same
    order.  If the header of a following, concatenated alignment is met
    it is stored on ``self._header`` so the next call resumes from it.

    Returns the populated ``Alignment``; its ``_version`` attribute is set
    when the header carries a parenthesised version starting with a digit
    and ``_star_info`` is set when consensus lines were present.  Returns
    ``None`` when the handle is exhausted, or when the header is followed
    only by blank lines up to the end of the file.

    Raises ``ValueError`` when the header is missing or not recognised, a
    sequence line cannot be parsed (including a file that ends part-way
    through a block), identifiers are out of order, an old-style letter
    count is wrong, or the number of records differs from
    ``self.records_per_alignment``.  The remaining layout checks are
    ``assert`` statements and therefore raise ``AssertionError``.
    """
    #NOTE(review): the assert based format checks disappear when Python
    #runs with -O; they are kept as asserts here so that callers still
    #see the same exception type.
    handle = self.handle
    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    #Whitelisted headers we know about
    known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
    header_words = line.split()
    #A blank or whitespace only first line has no header word at all;
    #report that as a bad header rather than failing with IndexError.
    if header_words:
        header_name = header_words[0]
    else:
        header_name = ""
    if header_name not in known_headers:
        raise ValueError("%s is not a known CLUSTAL header: %s" %
                         (header_name, ", ".join(known_headers)))

    # find the clustal version in the header line
    version = None
    for word in header_words:
        if word[0] == '(' and word[-1] == ')':
            word = word[1:-1]
            #A bare "()" leaves an empty string, which holds no version.
            if word and word[0] in '0123456789':
                version = word
                break

    #There should be two blank lines after the header line
    line = handle.readline()
    #readline() returns "" at end of file, which also strips to "", so
    #the end of file must be tested explicitly or this loop never ends.
    while line and line.strip() == "":
        line = handle.readline()
    if not line:
        #Header with no sequence data at all - nothing to return.
        return None

    #Identifiers are kept in a list (not a dictionary), so entries
    #sharing an identifier are stored as separate records.
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  #: Used to extract the consensus

    #Use the first block to get the sequence identifiers
    while True:
        if line[0] != " " and line.strip() != "":
            #Sequences identifier...
            fields = line.rstrip().split()

            #We expect there to be two fields, there can be an optional
            #"sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            #Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end
            assert fields[1] == line[seq_cols]

            if len(fields) == 3:
                #This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s" %
                        line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)
        elif line[0] == " ":
            #Sequence consensus line...
            assert len(ids) == len(seqs)
            assert len(ids) > 0
            assert seq_cols is not None
            consensus = line[seq_cols]
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            #Check for blank line (or end of file)
            line = handle.readline()
            assert line.strip() == ""
            break
        else:
            #No consensus
            break
        line = handle.readline()
        if not line:
            break  #end of file

    assert line.strip() == ""
    assert seq_cols is not None

    #Confirm all same length
    for s in seqs:
        assert len(s) == len(seqs[0])
    if consensus:
        assert len(consensus) == len(seqs[0])

    #Loop over any remaining blocks...
    while True:
        #There should be a blank line between each block.
        #Also want to ignore any consensus line from the
        #previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            #Found concatenated alignment, keep its header for next time.
            self._header = line
            break

        for i, expected_id in enumerate(ids):
            #Slicing (not indexing) so that an empty line at end of file
            #falls through to the "Could not parse line" error below.
            assert line[:1] != " ", "Unexpected line:\n%s" % repr(line)
            fields = line.rstrip().split()

            #We expect there to be two fields, there can be an optional
            #"sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % repr(line))

            if fields[0] != expected_id:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'"
                    % (fields[0], expected_id))

            if fields[1] != line[seq_cols]:
                #Sequence text has a different width in this block (the
                #start column must not move), so update the slice.
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                assert start == seq_cols.start, \
                       'Old location %s -> %i:XX' % (seq_cols, start)
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end

            #Append the sequence
            seqs[i] += fields[1]
            assert len(seqs[i]) == len(seqs[0])

            if len(fields) == 3:
                #This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s" %
                        line)
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)

            #Read in the next line
            line = handle.readline()
        #There should now be a consensus line
        if consensus:
            #Slicing keeps this an assertion failure (not an IndexError)
            #when the file ends before the consensus line.
            assert line[:1] == " "
            assert seq_cols is not None
            consensus += line[seq_cols]
            assert len(consensus) == len(seqs[0])
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            #Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        return None

    if self.records_per_alignment is not None \
    and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    alignment_length = len(seqs[0])
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != alignment_length:
            raise ValueError(
                "Error parsing alignment - sequences of different length?")
        alignment.add_sequence(seq_id, seq)
    #TODO - Handle alignment annotation better, for now
    #mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        assert len(consensus) == alignment_length, \
               "Alignment length is %i, consensus length is %i, '%s'" \
               % (alignment_length, len(consensus), consensus)
        alignment._star_info = consensus
    return alignment