def create_alignment(
    records,
    aligned_sequences,
    strands,
    annotations,
    column_annotations,
    score,
):
    """Build an Alignment from parsed records and their gapped sequences.

    The coordinates are inferred from ``aligned_sequences`` and then
    adjusted in place, one row per record:

    - a row whose strand is "-" is replaced by ``row[-1] - row[0] - row``;
    - every row is shifted by the start of the first defined range of the
      record's sequence (``record.seq.defined_ranges[0][0]``).

    ``annotations``, ``column_annotations`` and ``score`` are stored on the
    alignment only when they are not None.  Returns the new Alignment.
    """
    coordinates = Alignment.infer_coordinates(aligned_sequences)
    for record, strand, row in zip(records, strands, coordinates):
        if strand == "-":
            # NOTE(review): presumably mirrors the row so that it counts
            # along the reverse strand - confirm against the callers.
            row[:] = row[-1] - row[0] - row
        row += record.seq.defined_ranges[0][0]

    alignment = Alignment(records, coordinates)
    optional_attributes = (
        ("annotations", annotations),
        ("column_annotations", column_annotations),
        ("score", score),
    )
    for attribute, value in optional_attributes:
        if value is not None:
            setattr(alignment, attribute, value)
    return alignment
def parse(self, stream):
    """Parse the next alignment from the stream.

    This is a generator.  It reads blocks of "identifier sequence [count]"
    lines from ``stream``, concatenates the pieces that belong to each row,
    and yields a single Alignment built from them.  Nothing is yielded if
    ``stream`` is None or contains no sequence lines.

    Lines starting with a space are treated as consensus lines; if any
    consensus was collected it is stored in
    ``alignment.column_annotations["clustal_consensus"]``.

    Raises:
        ValueError: if a sequence line does not have two or three fields,
            if the optional third field is not an integer or differs from
            the number of letters read so far for that row, or if the
            consensus length differs from the number of alignment columns.
        AssertionError: if the blocks are internally inconsistent (for
            example rows of different length, or an identifier that does
            not match the one seen at the same position in the first block).
    """
    if stream is None:
        # A plain return ends the generator.  ``raise StopIteration`` inside
        # a generator is converted into RuntimeError (PEP 479).
        return

    # Rows are tracked by their position within a block, so entries that
    # share an identifier are kept as separate rows.
    ids = []
    seqs = []
    aligned_seqs = []
    consensus = ""
    index = None  # Column where the aligned text starts; used for the consensus

    # Use the first block to get the sequence identifiers
    for line in stream:
        if line.startswith(" "):
            # Sequence consensus line...
            assert len(ids) > 0
            assert index is not None
            length = len(aligned_seq)  # noqa: F821
            consensus = line[index : index + length]
            break
        elif line.strip():
            # Sequences identifier...
            fields = line.split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            seqid, aligned_seq = fields[:2]
            ids.append(seqid)
            aligned_seqs.append(aligned_seq)
            seq = aligned_seq.replace("-", "")
            seqs.append(seq)

            # Record the sequence position to get the consensus
            if index is None:
                index = line.find(aligned_seq, len(seqid))

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s" % line
                    ) from None
                if len(seq) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s" % line
                    )
        else:
            # Blank line: it ends the first block when there was no
            # consensus line; leading blank lines are skipped.
            if index:
                break
    else:
        # The stream ended before the first block was terminated.
        if not ids:
            # No sequence lines at all: nothing to yield.
            return
        # Otherwise the file simply ends right after its only block; fall
        # through and build the alignment from what was read.

    assert index is not None

    # Confirm all same length
    length = len(aligned_seqs[0])
    for aligned_seq in aligned_seqs:
        assert len(aligned_seq) == length
    if consensus:
        assert len(consensus) == length

    n = len(seqs)
    i = 0  # Position of the next expected row within the current block
    # Loop over any remaining blocks...
    for line in stream:
        if line.startswith(" "):
            # Sequence consensus line; it spans as many columns as the
            # most recently read sequence chunk.
            assert index is not None
            length = len(aligned_seq)
            consensus += line[index : index + length]
        elif not line.strip():
            # Blank line
            continue
        else:
            seqid = ids[i]
            # Sequences identifier...
            fields = line.split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            assert seqid == fields[0]
            aligned_seq = fields[1]
            aligned_seqs[i] += aligned_seq
            seq = aligned_seq.replace("-", "")
            seqs[i] += seq

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s" % line
                    ) from None
                # Here the count is cumulative over all blocks read so far.
                if len(seqs[i]) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s" % line
                    )
            i += 1
            if i == n:
                i = 0

    records = [
        SeqRecord(Seq(seq), id=seqid, description=seqid)
        for (seqid, seq) in zip(ids, seqs)
    ]
    coordinates = Alignment.infer_coordinates(aligned_seqs)
    alignment = Alignment(records, coordinates)
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if consensus:
        _, columns = alignment.shape
        if len(consensus) != columns:
            raise ValueError(
                "Alignment has %i columns, consensus length is %i, '%s'"
                % (columns, len(consensus), consensus)
            )
        alignment.column_annotations = {}
        alignment.column_annotations["clustal_consensus"] = consensus
    yield alignment
def parse(self, stream):
    """Parse the next alignment from the stream.

    This is a generator that yields one Alignment for every alignment found
    in ``stream``.  Each alignment consists of a metadata section enclosed
    by two "#=====..." lines, followed by blocks of sequence lines that are
    separated by empty lines.  Nothing is yielded if ``stream`` is None.

    Metadata values are collected in a dictionary that is stored as
    ``alignment.annotations`` (keys: matrix, gap_penalty, extend_penalty,
    identity, similarity, gaps, score, longest_identity,
    longest_similarity, shortest_identity, shortest_similarity).  The text
    of the match lines is stored in
    ``alignment.column_annotations["emboss_consensus"]``.

    ``self.align_format`` is read while checking the sequence positions.

    Raises:
        ValueError: on an unexpected line, an unknown metadata key, or
            when the number of sequences or the alignment length is
            missing from the metadata.
        AssertionError: if the positions or identifiers on the sequence
            lines are inconsistent with the data read so far.
    """
    if stream is None:
        # A plain return ends the generator.  ``raise StopIteration`` inside
        # a generator is converted into RuntimeError (PEP 479).
        return

    separator = "#---------------------------------------"
    border = "#======================================="
    identifiers = None
    number_of_sequences = None
    annotations = {}
    for line in stream:
        line = line.rstrip("\r\n")
        if identifiers is None:
            # searching for alignment metadata start
            if not line:
                continue
            elif line.startswith(separator):
                # may appear between alignments
                continue
            elif line.startswith(border):
                # found the alignment metadata start
                identifiers = []
                ncols = None
                sequences = None
            else:
                raise ValueError("Unexpected line: %s" % line)
        elif sequences is None:
            # parsing the alignment metadata
            if line == border:
                # reached the end of alignment metadata
                if not identifiers:
                    raise ValueError("Number of sequences missing!")
                if ncols is None:
                    raise ValueError("Length of alignment missing!")
                sequences = [""] * number_of_sequences
                aligned_sequences = [""] * number_of_sequences
                consensus = ""
                starts = [0] * number_of_sequences
                column = 0
                index = 0
                continue
            if line.strip() == "#":
                continue
            if not line.startswith("# "):
                # The % must be applied to the message, not to the exception.
                raise ValueError("Unexpected line: %s" % line)
            try:
                key, value = line[2:].split(":", 1)
            except ValueError:
                # An equal sign is used for Longest_Identity,
                # Longest_Similarity, Shortest_Identity, and
                # Shortest_Similarity, which are included if command line
                # argument -nobrief was used.
                key, value = line[2:].split(" = ", 1)
            value = value.strip()
            if key == "Aligned_sequences":
                number_of_sequences = int(value)
                assert len(identifiers) == 0
                # Should now expect the record identifiers...
                # These lines come straight from the stream, so they still
                # carry their line endings.
                for i, line in enumerate(stream):
                    if not line.startswith("# "):
                        raise ValueError("Unexpected line: %s" % line)
                    # Split on the first colon only, so that identifiers
                    # containing a colon are kept intact.
                    number, identifier = line[2:].split(":", 1)
                    assert i + 1 == int(number)
                    identifiers.append(identifier.strip())
                    if len(identifiers) == number_of_sequences:
                        break
            elif key == "Matrix":
                annotations["matrix"] = value
            elif key == "Gap_penalty":
                annotations["gap_penalty"] = float(value)
            elif key == "Extend_penalty":
                annotations["extend_penalty"] = float(value)
            elif key == "Length":
                ncols = int(value)
            elif key == "Identity":
                # Only the count before the "/" is kept.
                annotations["identity"] = int(value.split("/")[0])
            elif key == "Similarity":
                annotations["similarity"] = int(value.split("/")[0])
            elif key == "Gaps":
                annotations["gaps"] = int(value.split("/")[0])
            elif key == "Score":
                annotations["score"] = float(value)
            # TODO:
            # The following are generated if the -nobrief command line
            # argument used. We could simply calculate them from the
            # alignment, but then we have to define what we mean by
            # "similar". For now, simply store them as an annotation.
            elif key == "Longest_Identity":
                annotations["longest_identity"] = value
            elif key == "Longest_Similarity":
                annotations["longest_similarity"] = value
            elif key == "Shortest_Identity":
                annotations["shortest_identity"] = value
            elif key == "Shortest_Similarity":
                annotations["shortest_similarity"] = value
            else:
                raise ValueError("Failed to parse line '%s'" % line)
        else:
            # parse the sequences
            if not line:
                # empty line
                if index == number_of_sequences:
                    # reached the end of an alignment block
                    index = 0
                    if column == ncols:
                        # reached the end of the sequences
                        coordinates = Alignment.infer_coordinates(aligned_sequences)
                        records = []
                        for i, (start, letters) in enumerate(zip(starts, sequences)):
                            if start == 0:
                                sequence = Seq(letters)
                            else:
                                coordinates[i, :] += start
                                # create a partially defined sequence
                                length = start + len(letters)
                                sequence = Seq({start: letters}, length=length)
                            records.append(SeqRecord(sequence, identifiers[i]))
                        alignment = Alignment(records, coordinates)
                        if annotations:
                            alignment.annotations = annotations
                        if consensus:
                            alignment.column_annotations = {
                                "emboss_consensus": consensus
                            }
                        yield alignment
                        # Start looking for the next alignment.
                        identifiers = None
                        annotations = {}
                continue
            # The first 21 characters hold the identifier and the start
            # position; the aligned text follows.
            prefix = line[:21].strip()
            if prefix == "":
                # match line
                consensus += line[21:71]
            else:
                identifier, start = prefix.split(None, 1)
                assert identifiers[index].startswith(identifier)
                aligned_sequence, end = line[21:].split(None, 1)
                start = int(start) - 1  # Python counting
                end = int(end)
                length = len(sequences[index])
                sequence = aligned_sequence.replace("-", "")
                if length == 0 and len(sequence) > 0:
                    # Record the start
                    starts[index] = start
                else:
                    # NOTE(review): presumably srspair output reports the
                    # position before the gap for an all-gap row - confirm.
                    if self.align_format == "srspair" and len(sequence) == 0:
                        start += 1
                    assert start == starts[index] + length
                assert end == start + len(sequence)
                sequences[index] += sequence
                aligned_sequences[index] += aligned_sequence
                if index == 0:
                    column += len(aligned_sequence)
                else:
                    assert column == len(aligned_sequences[index])
                index += 1
def parse(self, stream):
    """Parse the next alignment from the stream.

    This is a generator that yields one Alignment for every alignment found
    in ``stream``.  Each alignment consists of a metadata section enclosed
    by two "#=====..." lines, followed by blocks of sequence lines that are
    separated by empty lines.  Nothing is yielded if ``stream`` is None.

    Recognized metadata values are stored as attributes on the alignment
    (``matrix``, ``gap_penalty``, ``extend_penalty``, ``identity``,
    ``similarity``, ``gaps``, ``score``) when present; other metadata keys
    are ignored.  The text of the match lines is stored in
    ``alignment.column_annotations["emboss_consensus"]``.

    Raises:
        ValueError: on an unexpected line, or when the number of sequences
            or the alignment length is missing from the metadata.
        AssertionError: if the positions or identifiers on the sequence
            lines are inconsistent with the data read so far.
    """
    if stream is None:
        # A plain return ends the generator.  ``raise StopIteration`` inside
        # a generator is converted into RuntimeError (PEP 479).
        return

    separator = "#---------------------------------------"
    border = "#======================================="
    identifiers = None
    number_of_sequences = None
    for line in stream:
        line = line.rstrip("\r\n")
        if identifiers is None:
            # searching for alignment metadata start
            if not line:
                continue
            elif line.startswith(separator):
                # may appear between alignments
                continue
            elif line.startswith(border):
                # found the alignment metadata start
                identifiers = []
                ncols = None
                sequences = None
                matrix = None
                gap_penalty = None
                extend_penalty = None
                identity = None
                similarity = None
                gaps = None
                score = None
            else:
                raise ValueError("Unexpected line: %s" % line)
        elif sequences is None:
            # parsing the alignment metadata
            if line == border:
                # reached the end of alignment metadata
                if not identifiers:
                    raise ValueError("Number of sequences missing!")
                if ncols is None:
                    raise ValueError("Length of alignment missing!")
                sequences = [""] * number_of_sequences
                aligned_sequences = [""] * number_of_sequences
                consensus = ""
                starts = [0] * number_of_sequences
                ends = [0] * number_of_sequences
                column = 0
                index = 0
                continue
            if line.strip() == "#":
                continue
            if not line.startswith("# "):
                # The % must be applied to the message, not to the exception.
                raise ValueError("Unexpected line: %s" % line)
            key, value = line[2:].split(":", 1)
            value = value.strip()
            if key == "Aligned_sequences":
                number_of_sequences = int(value)
                assert len(identifiers) == 0
                # Should now expect the record identifiers...
                # These lines come straight from the stream, so they still
                # carry their line endings.
                for i, line in enumerate(stream):
                    if not line.startswith("# "):
                        raise ValueError("Unexpected line: %s" % line)
                    # Split on the first colon only, so that identifiers
                    # containing a colon are kept intact.
                    number, identifier = line[2:].split(":", 1)
                    assert i + 1 == int(number)
                    identifiers.append(identifier.strip())
                    if len(identifiers) == number_of_sequences:
                        break
            elif key == "Matrix":
                matrix = value
            elif key == "Gap_penalty":
                gap_penalty = float(value)
            elif key == "Extend_penalty":
                extend_penalty = float(value)
            elif key == "Length":
                ncols = int(value)
            elif key == "Identity":
                # Only the count before the "/" is kept.
                identity = int(value.split("/")[0])
            elif key == "Similarity":
                similarity = int(value.split("/")[0])
            elif key == "Gaps":
                gaps = int(value.split("/")[0])
            elif key == "Score":
                score = float(value)
        else:
            # parse the sequences
            if not line:
                # empty line
                if index == number_of_sequences:
                    # reached the end of an alignment block
                    index = 0
                    if column == ncols:
                        # reached the end of the sequences
                        coordinates = Alignment.infer_coordinates(aligned_sequences)
                        for i, start in enumerate(starts):
                            # starts hold the positions as written in the
                            # file; subtract one for Python counting
                            coordinates[i, :] += start - 1
                        records = [
                            SeqRecord(Seq(sequence), id=identifier)
                            for sequence, identifier in zip(sequences, identifiers)
                        ]
                        alignment = Alignment(records, coordinates)
                        if matrix is not None:
                            alignment.matrix = matrix
                        if gap_penalty is not None:
                            alignment.gap_penalty = gap_penalty
                        if extend_penalty is not None:
                            alignment.extend_penalty = extend_penalty
                        if identity is not None:
                            alignment.identity = identity
                        if similarity is not None:
                            alignment.similarity = similarity
                        if gaps is not None:
                            alignment.gaps = gaps
                        if score is not None:
                            alignment.score = score
                        if consensus:
                            alignment.column_annotations = {
                                "emboss_consensus": consensus
                            }
                        yield alignment
                        # Start looking for the next alignment.
                        identifiers = None
                continue
            # The first 21 characters hold the identifier and the start
            # position; the aligned text follows.
            prefix = line[:21].strip()
            if prefix == "":
                # match line
                consensus += line[21:71]
            else:
                identifier, start = prefix.split(None, 1)
                aligned_sequence, end = line[21:].split(None, 1)
                start = int(start)
                end = int(end)
                sequence = aligned_sequence.replace("-", "")
                if len(sequences[index]) > 0:
                    # Check that this row continues where the previous
                    # block for the same sequence ended.
                    length = len(sequence)
                    if length == 0:
                        assert start == ends[index]
                        assert end == ends[index]
                    else:
                        assert start == ends[index] + 1
                        assert end == ends[index] + length
                assert identifiers[index].startswith(identifier)
                if starts[index] == 0:
                    # Record the start (0 means "not recorded yet")
                    starts[index] = start
                # The end is updated on every row so that the continuity
                # checks above work for the next block.
                ends[index] = end
                sequences[index] += sequence
                aligned_sequences[index] += aligned_sequence
                if index == 0:
                    column += len(aligned_sequence)
                else:
                    assert column == len(aligned_sequences[index])
                index += 1