def create_alignment(self, line):
    """Parse one line of FASTA output and return an Alignment object."""
    fields = line.split()
    assert len(fields) == 13
    if self._query_id is not None:
        assert fields[0] == self._query_id
    query_id = fields[0]
    target_id = fields[1]
    percentage_identity = float(fields[2])
    alignment_length = int(fields[3])
    mismatches = int(fields[4])
    # Sanity check: the reported percent identity must agree (within
    # rounding) with the value recomputed from length and mismatches.
    matches = alignment_length - mismatches
    assert abs(100 * matches / alignment_length - percentage_identity) < 0.015
    int(fields[5])  # gap opens; validated as an integer but not stored
    query_start = int(fields[6]) - 1  # convert to zero-based counting
    query_end = int(fields[7])
    target_start = int(fields[8]) - 1  # convert to zero-based counting
    target_end = int(fields[9])
    annotations = {
        "program": self._program,
        "database": self._database,
        "mismatches": mismatches,
        "evalue": float(fields[10]),
        "bit_score": float(fields[11]),
    }
    if self._alignment_representation == "BTOP":
        coordinates = self.parse_btop(fields[12])
    elif self._alignment_representation == "CIGAR":
        coordinates = self.parse_cigar(fields[12])
    coordinates[0, :] += target_start
    query_size = self._query_size
    if query_start < query_end:
        # Query mapped to the forward strand.
        coordinates[1, :] += query_start
    else:
        # Query mapped to the reverse strand.
        coordinates[1, :] = coordinates[1, ::-1]
        coordinates[1, :] += query_size - query_start - 1
    query = SeqRecord(Seq(None, length=query_size), id=query_id)
    if self._query_description is not None:
        query.description = self._query_description
    target = SeqRecord(Seq(None, length=target_end), id=target_id)
    alignment = Alignment([target, query], coordinates)
    alignment.annotations = annotations
    return alignment
def create_alignment(
    records,
    aligned_sequences,
    strands,
    annotations,
    column_annotations,
    score,
):
    """Create the Alignment object from the collected alignment data."""
    coordinates = Alignment.infer_coordinates(aligned_sequences)
    for seq_record, strand, coordinate_row in zip(records, strands, coordinates):
        if strand == "-":
            # Flip the coordinates so they run along the reverse strand.
            coordinate_row[:] = (
                coordinate_row[-1] - coordinate_row[0] - coordinate_row
            )
        # Shift by the offset at which the (possibly partially defined)
        # sequence data begins.
        coordinate_row += seq_record.seq.defined_ranges[0][0]
    alignment = Alignment(records, coordinates)
    # Attach the optional pieces of metadata only when they were collected.
    for attribute, value in (
        ("annotations", annotations),
        ("column_annotations", column_annotations),
        ("score", score),
    ):
        if value is not None:
            setattr(alignment, attribute, value)
    return alignment
def parse(self, stream):
    """Parse the next alignment from the stream.

    Iterates over the EMBOSS output in ``stream`` and yields one
    Alignment object per alignment found.

    Raises:
     - StopIteration if ``stream`` is None;
     - ValueError if the contents deviate from the expected EMBOSS
       output format.
    """
    if stream is None:
        raise StopIteration
    identifiers = None  # None while searching for the metadata header
    number_of_sequences = None
    annotations = {}
    for line in stream:
        line = line.rstrip("\r\n")
        if identifiers is None:
            # Searching for the start of the alignment metadata.
            if not line:
                continue
            elif line.startswith("#---------------------------------------"):
                # may appear between alignments
                continue
            elif line.startswith("#======================================="):
                # found the alignment metadata start
                identifiers = []
                ncols = None
                sequences = None
            else:
                raise ValueError("Unexpected line: %s" % line)
        elif sequences is None:
            # Parsing the alignment metadata.
            if line == "#=======================================":
                # Reached the end of the alignment metadata.
                if len(identifiers) == 0:
                    raise ValueError("Number of sequences missing!")
                if ncols is None:
                    raise ValueError("Length of alignment missing!")
                sequences = [""] * number_of_sequences
                aligned_sequences = [""] * number_of_sequences
                consensus = ""
                starts = [0] * number_of_sequences
                column = 0
                index = 0
                continue
            if line.strip() == "#":
                continue
            if not line.startswith("# "):
                # Bug fix: previously written as
                # raise ValueError("Unexpected line: %s") % line, which
                # applied "%" to the exception instance and raised a
                # TypeError instead of the intended ValueError.
                raise ValueError("Unexpected line: %s" % line)
            try:
                key, value = line[2:].split(":", 1)
            except ValueError:
                # An equal sign is used for Longest_Identity,
                # Longest_Similarity, Shortest_Identity, and
                # Shortest_Similarity, which are included if command line
                # argument -nobrief was used.
                key, value = line[2:].split(" = ", 1)
            if key == "Aligned_sequences":
                number_of_sequences = int(value.strip())
                assert len(identifiers) == 0
                # Should now expect the record identifiers...
                for i, line in enumerate(stream):
                    if not line.startswith("# "):
                        # Bug fix: same misplaced "%" as above.
                        raise ValueError("Unexpected line: %s" % line)
                    number, identifier = line[2:].split(":")
                    assert i + 1 == int(number)
                    identifiers.append(identifier.strip())
                    if len(identifiers) == number_of_sequences:
                        break
            elif key == "Matrix":
                annotations["matrix"] = value.strip()
            elif key == "Gap_penalty":
                annotations["gap_penalty"] = float(value.strip())
            elif key == "Extend_penalty":
                annotations["extend_penalty"] = float(value.strip())
            elif key == "Length":
                ncols = int(value.strip())
            elif key == "Identity":
                annotations["identity"] = int(value.strip().split("/")[0])
            elif key == "Similarity":
                annotations["similarity"] = int(value.strip().split("/")[0])
            elif key == "Gaps":
                annotations["gaps"] = int(value.strip().split("/")[0])
            elif key == "Score":
                annotations["score"] = float(value.strip())
            # TODO:
            # The following are generated if the -nobrief command line
            # argument used. We could simply calculate them from the
            # alignment, but then we have to define what we mean by
            # "similar". For now, simply store them as an annotation.
            elif key == "Longest_Identity":
                annotations["longest_identity"] = value.strip()
            elif key == "Longest_Similarity":
                annotations["longest_similarity"] = value.strip()
            elif key == "Shortest_Identity":
                annotations["shortest_identity"] = value.strip()
            elif key == "Shortest_Similarity":
                annotations["shortest_similarity"] = value.strip()
            else:
                raise ValueError("Failed to parse line '%s'" % line)
        else:
            # Parsing the aligned sequence blocks.
            if not line:
                # Empty line: separates alignment blocks.
                if index == number_of_sequences:
                    # Reached the end of an alignment block.
                    index = 0
                    if column == ncols:
                        # Reached the end of the sequences.
                        coordinates = Alignment.infer_coordinates(
                            aligned_sequences
                        )
                        records = []
                        n = len(sequences)
                        for i in range(n):
                            start = starts[i]
                            if start == 0:
                                sequence = Seq(sequences[i])
                            else:
                                coordinates[i, :] += start
                                # Create a partially defined sequence.
                                length = start + len(sequences[i])
                                data = {start: sequences[i]}
                                sequence = Seq(data, length=length)
                            record = SeqRecord(sequence, identifiers[i])
                            records.append(record)
                        alignment = Alignment(records, coordinates)
                        if annotations:
                            alignment.annotations = annotations
                        if consensus:
                            alignment.column_annotations = {
                                "emboss_consensus": consensus
                            }
                        yield alignment
                        identifiers = None
                        annotations = {}
                continue
            prefix = line[:21].strip()
            if prefix == "":
                # Match (consensus) line.
                consensus += line[21:71]
            else:
                identifier, start = prefix.split(None, 1)
                assert identifiers[index].startswith(identifier)
                aligned_sequence, end = line[21:].split(None, 1)
                start = int(start) - 1  # Python counting
                end = int(end)
                length = len(sequences[index])
                sequence = aligned_sequence.replace("-", "")
                if length == 0 and len(sequence) > 0:
                    # Record the start.
                    starts[index] = start
                else:
                    if self.align_format == "srspair" and len(sequence) == 0:
                        start += 1
                    assert start == starts[index] + length
                    assert end == start + len(sequence)
                sequences[index] += sequence
                aligned_sequences[index] += aligned_sequence
                if index == 0:
                    column += len(aligned_sequence)
                else:
                    assert column == len(aligned_sequences[index])
                index += 1
def parse(self, stream):
    """Parse the next alignment from the stream.

    Reads Stockholm 1.0 formatted data from ``stream`` and yields one
    Alignment object per "# STOCKHOLM 1.0" ... "//" section, with GF/GC/
    GS/GR markup stored as annotations on the alignment and its records.
    """
    if stream is None:
        raise StopIteration
    for line in stream:
        line = line.strip()
        if not line:
            continue
        elif line == "# STOCKHOLM 1.0":
            # Starting a new alignment: reset all per-alignment state.
            records = []  # one SeqRecord per sequence line
            aligned_sequences = []  # gapped sequence strings
            references = []  # literature references (#=GF RN/RM/RT/RA/RL)
            reference_comments = []  # #=GF RC lines preceding the next RN
            database_references = []  # #=GF DR/DC entries
            nested_domains = []  # #=GF NE/NL entries
            gf = defaultdict(list)  # other per-file annotations
            gc = {}  # per-column annotations
            gs = defaultdict(lambda: {"DR": []})  # per-sequence annotations
            gr = defaultdict(dict)  # per-sequence, per-column markup
            length = None  # expected aligned-sequence length, set by first sequence
        elif line == "//":
            # Reached the end of the alignment.
            skipped_columns = []
            coordinates = Alignment.infer_coordinates(
                aligned_sequences, skipped_columns
            )
            skipped_columns = set(skipped_columns)
            alignment = Alignment(records, coordinates)
            alignment.annotations = {}
            if references:
                alignment.annotations["references"] = []
                for reference in references:
                    reference = dict(reference)
                    # Multi-line title/author/location entries were
                    # accumulated as lists; join them into single strings.
                    reference["title"] = " ".join(reference["title"])
                    reference["author"] = " ".join(reference["author"])
                    reference["location"] = " ".join(reference["location"])
                    alignment.annotations["references"].append(reference)
            if database_references:
                alignment.annotations["database references"] = database_references
            if nested_domains:
                alignment.annotations["nested domains"] = nested_domains
            rows, columns = alignment.shape
            # Delegate storage of the collected GF/GC/GS/GR markup to the
            # class-level helpers.
            AlignmentIterator._store_per_file_annotations(alignment, gf, rows)
            AlignmentIterator._store_per_column_annotations(
                alignment, gc, columns, skipped_columns
            )
            AlignmentIterator._store_per_sequence_annotations(alignment, gs)
            AlignmentIterator._store_per_sequence_and_per_column_annotations(
                alignment, gr
            )
            yield alignment
        elif not line.startswith("#"):
            # Sequence
            # Format: "<seqname> <sequence>"
            try:
                seqname, aligned_sequence = line.split(None, 1)
            except ValueError:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError(
                    "Could not split line into sequence name and aligned sequence:\n"
                    + line
                ) from None
            if length is None:
                # First sequence of this alignment fixes the expected width.
                length = len(aligned_sequence)
            elif length != len(aligned_sequence):
                raise ValueError(
                    f"Aligned sequence {seqname} consists of {len(aligned_sequence)} letters, expected {length} letters)"
                )
            # Stockholm uses both "." and "-" for gaps; normalize to "-".
            aligned_sequence = aligned_sequence.replace(".", "-")
            sequence = aligned_sequence.replace("-", "")
            aligned_sequences.append(aligned_sequence)
            seq = Seq(sequence)
            record = SeqRecord(seq, id=seqname)
            records.append(record)
        elif line.startswith("#=GF "):
            # Generic per-File annotation, free text
            # Format: #=GF <feature> <free text>
            feature, text = line[5:].strip().split(None, 1)
            if feature == "RN":
                # Reference number, e.g. "[1]"; starts a new reference.
                assert text.startswith("[")
                assert text.endswith("]")
                number = int(text[1:-1])
                reference = defaultdict(list)
                reference["number"] = number
                if reference_comments:
                    reference["comment"] = " ".join(reference_comments)
                    reference_comments = []
                references.append(reference)
            elif feature == "RM":
                # Medline ID; the defaultdict yields [] (falsy) if unset,
                # so this asserts the ID is only given once per reference.
                assert not reference["medline"]
                reference["medline"] = text
            elif feature == "RT":
                reference["title"].append(text)
            elif feature == "RA":
                reference["author"].append(text)
            elif feature == "RL":
                reference["location"].append(text)
            elif feature == "RC":
                # Comment lines apply to the *next* RN entry.
                reference_comments.append(text)
            elif feature == "DR":
                database_reference = {"reference": text}
                database_references.append(database_reference)
            elif feature == "DC":
                # Comment for the most recent DR entry.
                assert "comment" not in database_reference
                database_reference["comment"] = text
            elif feature == "NE":
                nested_domain = {"accession": text}
                nested_domains.append(nested_domain)
            elif feature == "NL":
                # Location for the most recent NE entry.
                assert "location" not in nested_domain
                nested_domain["location"] = text
            else:
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                gf[feature].append(text)
        elif line.startswith("#=GC "):
            # Generic per-Column annotation, exactly 1 char per column
            # Format: "#=GC <feature> <exactly 1 char per column>"
            feature, text = line[5:].strip().split(None, 2)
            if feature not in gc:
                gc[feature] = ""
            gc[feature] += text.strip()  # append to any previous entry
            # Might be interleaved blocks, so can't check length yet
        elif line.startswith("#=GS "):
            # Generic per-Sequence annotation, free text
            # Format: "#=GS <seqname> <feature> <free text>"
            try:
                seqname, feature, text = line[5:].strip().split(None, 2)
            except ValueError:
                # Free text can sometimes be empty, which a one line split throws an error for.
                # See https://github.com/biopython/biopython/issues/2982 for more details
                seqname, feature = line[5:].strip().split(None, 1)
                text = ""
            if feature == "DR":
                # Database references may repeat; collected as a list.
                gs[seqname][feature].append(text)
            else:
                assert feature not in gs[seqname]
                gs[seqname][feature] = text
        elif line[:5] == "#=GR ":
            # Generic per-Sequence AND per-Column markup
            # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
            terms = line[5:].split(None, 2)
            # NOTE(review): this assumes each #=GR line immediately follows
            # its sequence line, so ``seqname`` still names that sequence —
            # confirm against the inputs this parser is expected to handle.
            assert terms[0] == seqname
            feature = terms[1]
            gr[seqname][feature] = terms[2].strip()