def parse(self, stream):
    """Yield the alignment stored in a PHYLIP stream.

    The rows are obtained from ``self._read_file(stream)``, which returns the
    sequence names together with, for every sequence, an iterable of string
    pieces that are joined into one aligned row.

    Raises:
        ValueError: if the number of rows differs from ``self.number_of_seqs``,
            if a row's length differs from ``self.length_of_seqs``, or if a
            row contains a dot.
    """
    names, chunks = self._read_file(stream)
    aligned_rows = ["".join(pieces) for pieces in chunks]

    found = len(aligned_rows)
    if found != self.number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (found, self.number_of_seqs)
        )

    for row in aligned_rows:
        width = len(row)
        if width != self.length_of_seqs:
            raise ValueError(
                "Expected all sequences to have length %d; found %d"
                % (self.length_of_seqs, width)
            )
        if "." in row:
            raise ValueError("PHYLIP format no longer allows dots in sequence")

    # Coordinates are inferred from the gapped rows; the records themselves
    # carry the ungapped letters only.
    coordinates = Alignment.infer_coordinates(aligned_rows)
    records = []
    for name, row in zip(names, aligned_rows):
        ungapped = row.replace("-", "")
        records.append(SeqRecord(Seq(ungapped), id=name))
    yield Alignment(records, coordinates)
def parse(self, stream):
    """Yield one alignment for every ``=``-terminated block in the stream.

    The first description line is taken from ``self._line`` (which is deleted
    afterwards); later description lines start with ``>``.  Every description
    is decoded by ``self._parse_description`` into the tuple
    ``(identifier, start, end, strand, comments)``.  Any other line is
    appended to the sequence of the most recent description.

    Nothing is yielded when ``stream`` is None.

    Raises:
        ValueError: if a description carries a strand other than "+" or "-".
    """
    if stream is None:
        return

    first_line = self._line
    del self._line
    header = self._parse_description(first_line)
    # Unpacking checks that the description has exactly five fields.
    identifier, start, end, strand, comments = header
    headers = [header]
    rows = [""]

    for raw in stream:
        text = raw.strip()
        if text.startswith(">"):
            header = self._parse_description(text)
            identifier, start, end, strand, comments = header
            headers.append(header)
            rows.append("")
            continue
        if not text.startswith("="):
            rows[-1] += text
            continue

        # An "=" line closes the current alignment; more may follow.
        coordinates = Alignment.infer_coordinates(rows)
        records = []
        for position, (header, row) in enumerate(zip(headers, rows)):
            identifier, start, end, strand, comments = header
            letters = row.replace("-", "")
            assert len(letters) == end - start
            if strand == "-":
                letters = reverse_complement(letters, inplace=False)
                coordinates[position, :] = len(letters) - coordinates[position, :]
            elif strand != "+":
                raise ValueError("Unexpected strand '%s'" % strand)
            coordinates[position] += start
            if start == 0:
                sequence = Seq(letters)
            else:
                # Partially defined sequence: only [start, end) is known.
                sequence = Seq({start: letters}, length=end)
            records.append(
                SeqRecord(sequence, id=identifier, description=comments)
            )
        yield Alignment(records, coordinates)
        headers = []
        rows = []
def create_alignment(self, line):
    """Parse one line of tabular FASTA output and return an Alignment object.

    The line must split into 13 whitespace-separated columns: query id,
    target id, percentage identity, alignment length, mismatches, gap opens,
    query start, query end, target start, target end, e-value, bit score and
    the alignment string (BTOP or CIGAR, selected by
    ``self._alignment_representation``).

    Returns:
        An Alignment of ``[target, query]`` records with sequences of known
        length but undefined content, annotated with program, database,
        mismatches, evalue and bit_score.

    Raises:
        ValueError: if ``self._alignment_representation`` is neither "BTOP"
            nor "CIGAR" (previously this surfaced as an UnboundLocalError),
            or if a numeric column cannot be converted.
        AssertionError: if the column count, the query id or the reported
            percentage identity is inconsistent.
    """
    columns = line.split()
    assert len(columns) == 13
    annotations = {
        "program": self._program,
        "database": self._database,
    }
    if self._query_id is not None:
        assert columns[0] == self._query_id
    query_id = columns[0]
    target_id = columns[1]
    percentage_identity = float(columns[2])
    alignment_length = int(columns[3])
    mismatches = int(columns[4])
    matches = alignment_length - mismatches
    # The reported percentage must agree with the match count to within
    # rounding of the printed value.
    difference = abs(100 * matches / alignment_length - percentage_identity)
    assert difference < 0.015
    int(columns[5])  # gap opens: validated as an integer but not stored
    # Starts are converted from 1-based to 0-based; ends are used as given.
    query_start = int(columns[6]) - 1
    query_end = int(columns[7])
    target_start = int(columns[8]) - 1
    target_end = int(columns[9])
    annotations["mismatches"] = mismatches
    annotations["evalue"] = float(columns[10])
    annotations["bit_score"] = float(columns[11])
    representation = self._alignment_representation
    if representation == "BTOP":
        coordinates = self.parse_btop(columns[12])
    elif representation == "CIGAR":
        coordinates = self.parse_cigar(columns[12])
    else:
        raise ValueError(
            "Unsupported alignment representation %r; expected 'BTOP' or 'CIGAR'"
            % (representation,)
        )
    coordinates[0, :] += target_start
    query_size = self._query_size
    if query_start < query_end:
        coordinates[1, :] += query_start
    else:
        # mapped to reverse strand
        coordinates[1, :] = coordinates[1, ::-1]
        coordinates[1, :] += query_size - query_start - 1
    query_sequence = Seq(None, length=query_size)
    query = SeqRecord(query_sequence, id=query_id)
    if self._query_description is not None:
        query.description = self._query_description
    target_sequence = Seq(None, length=target_end)
    target = SeqRecord(target_sequence, id=target_id)
    alignment = Alignment([target, query], coordinates)
    alignment.annotations = annotations
    return alignment
def parse(self, stream):
    """Yield the alignment stored in a Nexus stream.

    Reading the file is delegated to the Bio.Nexus module.  You are expected
    to call this function via Bio.Align (and not use it directly).

    NOTE - Only ONE alignment matrix is expected per Nexus file, so this
    generator yields at most one Alignment (none if no matrix was found).
    """
    nexus = Nexus.Nexus(stream)
    if not nexus.matrix:
        # No alignment found
        return

    # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    # The original names and the modified names are kept in these two lists:
    original_names = nexus.unaltered_taxlabels
    unique_names = nexus.taxlabels
    assert len(original_names) == len(unique_names)

    # TODO - Can we extract any annotation too?
    molecule_types = {
        "dna": "DNA",
        "nucleotide": "DNA",
        "rna": "RNA",
        "protein": "protein",
    }
    molecule_type = molecule_types.get(nexus.datatype)
    if molecule_type is None:
        annotations = None
    else:
        # One dictionary, shared by every record of the alignment.
        annotations = {"molecule_type": molecule_type}

    aligned_rows = []
    for unique_name in unique_names:
        aligned_rows.append(str(nexus.matrix[unique_name]))

    records = []
    for original_name, unique_name in zip(original_names, unique_names):
        ungapped = nexus.matrix[unique_name].replace("-", "")
        records.append(
            SeqRecord(ungapped, id=original_name, annotations=annotations)
        )

    coordinates = Alignment.infer_coordinates(aligned_rows)
    yield Alignment(records, coordinates)
def rename(align, first, second, splitchar="_"):
    """Yield copies of each alignment with shortened record names.

    Every record id is split on underscores.  When ``second`` is truthy the
    new name is the first three characters of the ``first`` and ``second``
    fields joined with ``splitchar`` (the second field is title-cased unless
    ``splitchar`` is an underscore); otherwise the new name is the whole
    ``first`` field.  The new name is written to both ``id`` and ``name`` of
    the record, which is modified in place and appended to a fresh alignment.
    """
    for source in align:
        renamed = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for record in source:
            parts = record.id.split("_")
            if not second:
                label = parts[first]
            else:
                head = parts[first][:3]
                tail = parts[second][:3]
                if splitchar != "_":
                    tail = tail.title()
                label = splitchar.join((head, tail))
            record.id = record.name = label
            renamed.append(record)
        yield renamed
def create_alignment(
    records,
    aligned_sequences,
    strands,
    annotations,
    column_annotations,
    score,
):
    """Build an Alignment from the rows collected for one alignment block.

    Coordinates are inferred from ``aligned_sequences``.  Each row is then
    mirrored if its strand is "-" and shifted by the start of the first
    defined range of the corresponding record's sequence.  ``annotations``,
    ``column_annotations`` and ``score`` are attached to the alignment only
    when they are not None.
    """
    coordinates = Alignment.infer_coordinates(aligned_sequences)
    for record, strand, row in zip(records, strands, coordinates):
        if strand == "-":
            span = row[-1] - row[0]
            row[:] = span - row
        # In-place addition so the change lands in ``coordinates`` itself.
        row += record.seq.defined_ranges[0][0]

    alignment = Alignment(records, coordinates)
    optional_attributes = (
        ("annotations", annotations),
        ("column_annotations", column_annotations),
        ("score", score),
    )
    for attribute, value in optional_attributes:
        if value is not None:
            setattr(alignment, attribute, value)
    return alignment
def parse(self, stream):
    """Parse the next alignment from a Clustal stream and yield it.

    The first block of the alignment provides the sequence identifiers; the
    rows of all following blocks are matched to those identifiers by their
    position within the block.  A line starting with a space is treated as
    the consensus line of the preceding block.

    Nothing is yielded when ``stream`` is None or when the stream ends before
    a first block was read.  (A generator must finish with ``return``: under
    PEP 479 an explicit ``raise StopIteration`` surfaces as RuntimeError.)

    Raises:
        ValueError: if a sequence line does not have two or three fields, if
            the optional third field is not an integer or does not match the
            number of letters read so far, or if the consensus length differs
            from the number of alignment columns.
        AssertionError: if rows are inconsistent (different lengths within
            the first block, or identifiers in a different order in later
            blocks).
    """
    if stream is None:
        return

    ids = []
    seqs = []
    aligned_seqs = []
    consensus = ""
    index = None  # Used to extract the consensus

    # Use the first block to get the sequence identifiers
    for line in stream:
        if line.startswith(" "):
            # Sequence consensus line...
            assert len(ids) > 0
            assert index is not None
            length = len(aligned_seq)  # noqa: F821
            consensus = line[index : index + length]
            break
        elif line.strip():
            # Sequences identifier...
            fields = line.split()
            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)
            seqid, aligned_seq = fields[:2]
            ids.append(seqid)
            aligned_seqs.append(aligned_seq)
            seq = aligned_seq.replace("-", "")
            seqs.append(seq)
            # Record the sequence position to get the consensus
            if index is None:
                index = line.find(aligned_seq, len(seqid))
            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s" % line
                    ) from None
                if len(seq) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line
                    )
        else:
            # Blank line: ends the first block if one was read (there was
            # no consensus line); otherwise keep looking.
            if index:
                break
    else:
        # The stream ended without a complete first block.
        return

    assert index is not None

    # Confirm all same length
    length = len(aligned_seqs[0])
    for aligned_seq in aligned_seqs:
        assert len(aligned_seq) == length
    if consensus:
        assert len(consensus) == length

    n = len(seqs)
    i = 0
    # Loop over any remaining blocks...
    for line in stream:
        if line.startswith(" "):
            # Sequence consensus line
            assert index is not None
            length = len(aligned_seq)
            consensus += line[index : index + length]
        elif not line.strip():
            # Blank line
            continue
        else:
            seqid = ids[i]
            # Sequences identifier...
            fields = line.split()
            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)
            assert seqid == fields[0]
            aligned_seq = fields[1]
            aligned_seqs[i] += aligned_seq
            seq = aligned_seq.replace("-", "")
            seqs[i] += seq
            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s" % line
                    ) from None
                # The count is cumulative over all blocks read so far.
                if len(seqs[i]) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line
                    )
            i += 1
            if i == n:
                i = 0

    records = [
        SeqRecord(Seq(seq), id=seqid, description=seqid)
        for (seqid, seq) in zip(ids, seqs)
    ]
    coordinates = Alignment.infer_coordinates(aligned_seqs)
    alignment = Alignment(records, coordinates)
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if consensus:
        rows, columns = alignment.shape
        if len(consensus) != columns:
            raise ValueError(
                "Alignment has %i columns, consensus length is %i, '%s'"
                % (columns, len(consensus), consensus)
            )
        alignment.column_annotations = {}
        alignment.column_annotations["clustal_consensus"] = consensus
    yield alignment
def parse(self, stream):
    """Parse EMBOSS alignment output and yield one Alignment per record.

    The parser is a three-state machine driven by ``identifiers`` and
    ``sequences``:

    * ``identifiers is None``: looking for the ``#====`` line that opens the
      metadata of the next alignment;
    * ``sequences is None``: reading ``# key: value`` metadata lines until the
      closing ``#====`` line;
    * otherwise: reading blocks of aligned rows, separated by empty lines.

    Metadata values are stored in the ``annotations`` dictionary of the
    alignment; match lines are stored as the ``emboss_consensus`` column
    annotation.  ``self.align_format`` is consulted while reading rows.

    Nothing is yielded when ``stream`` is None.  (A generator must finish
    with ``return``: under PEP 479 an explicit ``raise StopIteration``
    surfaces as RuntimeError.)

    Raises:
        ValueError: on an unexpected line, an unknown metadata key, or
            missing sequence count / alignment length.
        AssertionError: if row numbering, identifiers or coordinates in the
            file are inconsistent.
    """
    if stream is None:
        return
    identifiers = None
    number_of_sequences = None
    annotations = {}
    for line in stream:
        line = line.rstrip("\r\n")
        if identifiers is None:
            # searching for alignment metadata start
            if not line:
                continue
            elif line.startswith("#---------------------------------------"):
                # may appear between alignments
                continue
            elif line.startswith("#======================================="):
                # found the alignment metadata start
                identifiers = []
                ncols = None
                sequences = None
            else:
                raise ValueError("Unexpected line: %s" % line)
        elif sequences is None:
            # parsing the alignment metadata
            if line == "#=======================================":
                # reached the end of alignment metadata
                if len(identifiers) == 0:
                    raise ValueError("Number of sequences missing!")
                if ncols is None:
                    raise ValueError("Length of alignment missing!")
                sequences = [""] * number_of_sequences
                aligned_sequences = [""] * number_of_sequences
                consensus = ""
                starts = [0] * number_of_sequences
                column = 0
                index = 0
                continue
            if line.strip() == "#":
                continue
            if not line.startswith("# "):
                # The "%" must be applied to the message, not to the
                # exception object (which would raise TypeError instead).
                raise ValueError("Unexpected line: %s" % line)
            try:
                key, value = line[2:].split(":", 1)
            except ValueError:
                # An equal sign is used for Longest_Identity,
                # Longest_Similarity, Shortest_Identity, and
                # Shortest_Similarity, which are included if command line
                # argument -nobrief was used.
                key, value = line[2:].split(" = ", 1)
            if key == "Aligned_sequences":
                number_of_sequences = int(value.strip())
                assert len(identifiers) == 0
                # Should now expect the record identifiers...
                # These lines are consumed directly from the shared stream,
                # so the outer loop resumes after the last identifier line.
                for i, line in enumerate(stream):
                    if not line.startswith("# "):
                        raise ValueError("Unexpected line: %s" % line)
                    number, identifier = line[2:].split(":")
                    assert i + 1 == int(number)
                    identifiers.append(identifier.strip())
                    if len(identifiers) == number_of_sequences:
                        break
            elif key == "Matrix":
                annotations["matrix"] = value.strip()
            elif key == "Gap_penalty":
                annotations["gap_penalty"] = float(value.strip())
            elif key == "Extend_penalty":
                annotations["extend_penalty"] = float(value.strip())
            elif key == "Length":
                ncols = int(value.strip())
            elif key == "Identity":
                annotations["identity"] = int(value.strip().split("/")[0])
            elif key == "Similarity":
                annotations["similarity"] = int(value.strip().split("/")[0])
            elif key == "Gaps":
                annotations["gaps"] = int(value.strip().split("/")[0])
            elif key == "Score":
                annotations["score"] = float(value.strip())
            # TODO:
            # The following are generated if the -nobrief command line
            # argument used. We could simply calculate them from the
            # alignment, but then we have to define what we mean by
            # "similar". For now, simply store them as an annotation.
            elif key == "Longest_Identity":
                annotations["longest_identity"] = value.strip()
            elif key == "Longest_Similarity":
                annotations["longest_similarity"] = value.strip()
            elif key == "Shortest_Identity":
                annotations["shortest_identity"] = value.strip()
            elif key == "Shortest_Similarity":
                annotations["shortest_similarity"] = value.strip()
            else:
                raise ValueError("Failed to parse line '%s'" % line)
        else:
            # parse the sequences
            if not line:
                # empty line
                if index == number_of_sequences:
                    # reached the end of an alignment block
                    index = 0
                    if column == ncols:
                        # reached the end of the sequences
                        coordinates = Alignment.infer_coordinates(
                            aligned_sequences
                        )
                        records = []
                        n = len(sequences)
                        for i in range(n):
                            start = starts[i]
                            if start == 0:
                                sequence = Seq(sequences[i])
                            else:
                                coordinates[i, :] += start
                                # create a partially defined sequence
                                length = start + len(sequences[i])
                                data = {start: sequences[i]}
                                sequence = Seq(data, length=length)
                            record = SeqRecord(sequence, identifiers[i])
                            records.append(record)
                        alignment = Alignment(records, coordinates)
                        if annotations:
                            alignment.annotations = annotations
                        if consensus:
                            alignment.column_annotations = {
                                "emboss_consensus": consensus
                            }
                        yield alignment
                        # Go back to searching for the next alignment.
                        identifiers = None
                        annotations = {}
                continue
            # The first 21 characters hold the identifier and start position
            # of a row, or are blank on a match line.
            prefix = line[:21].strip()
            if prefix == "":
                # match line
                consensus += line[21:71]
            else:
                identifier, start = prefix.split(None, 1)
                assert identifiers[index].startswith(identifier)
                aligned_sequence, end = line[21:].split(None, 1)
                start = int(start) - 1  # Python counting
                end = int(end)
                length = len(sequences[index])
                sequence = aligned_sequence.replace("-", "")
                if length == 0 and len(sequence) > 0:
                    # Record the start
                    starts[index] = start
                else:
                    if self.align_format == "srspair" and len(sequence) == 0:
                        start += 1
                    assert start == starts[index] + length
                assert end == start + len(sequence)
                sequences[index] += sequence
                aligned_sequences[index] += aligned_sequence
                if index == 0:
                    column += len(aligned_sequence)
                else:
                    assert column == len(aligned_sequences[index])
                index += 1
def parse(self, stream):
    """Parse EMBOSS alignment output and yield one Alignment per record.

    The parser is a three-state machine driven by ``identifiers`` and
    ``sequences``:

    * ``identifiers is None``: looking for the ``#====`` line that opens the
      metadata of the next alignment;
    * ``sequences is None``: reading ``# key: value`` metadata lines until the
      closing ``#====`` line (unrecognised keys are ignored);
    * otherwise: reading blocks of aligned rows, separated by empty lines.

    Recognised metadata values are stored as attributes of the alignment
    (``matrix``, ``gap_penalty``, ``extend_penalty``, ``identity``,
    ``similarity``, ``gaps``, ``score``); match lines are stored as the
    ``emboss_consensus`` column annotation.

    Nothing is yielded when ``stream`` is None.  (A generator must finish
    with ``return``: under PEP 479 an explicit ``raise StopIteration``
    surfaces as RuntimeError.)

    Raises:
        ValueError: on an unexpected line, or missing sequence count /
            alignment length.
        AssertionError: if row numbering, identifiers or coordinates in the
            file are inconsistent.
    """
    if stream is None:
        return
    identifiers = None
    number_of_sequences = None
    for line in stream:
        line = line.rstrip("\r\n")
        if identifiers is None:
            # searching for alignment metadata start
            if not line:
                continue
            elif line.startswith("#---------------------------------------"):
                # may appear between alignments
                continue
            elif line.startswith("#======================================="):
                # found the alignment metadata start
                identifiers = []
                ncols = None
                sequences = None
                matrix = None
                gap_penalty = None
                extend_penalty = None
                identity = None
                similarity = None
                gaps = None
                score = None
            else:
                raise ValueError("Unexpected line: %s" % line)
        elif sequences is None:
            # parsing the alignment metadata
            if line == "#=======================================":
                # reached the end of alignment metadata
                if len(identifiers) == 0:
                    raise ValueError("Number of sequences missing!")
                if ncols is None:
                    raise ValueError("Length of alignment missing!")
                sequences = [""] * number_of_sequences
                aligned_sequences = [""] * number_of_sequences
                consensus = ""
                starts = [0] * number_of_sequences
                ends = [0] * number_of_sequences
                column = 0
                index = 0
                continue
            if line.strip() == "#":
                continue
            if not line.startswith("# "):
                # The "%" must be applied to the message, not to the
                # exception object (which would raise TypeError instead).
                raise ValueError("Unexpected line: %s" % line)
            key, value = line[2:].split(":", 1)
            if key == "Aligned_sequences":
                number_of_sequences = int(value.strip())
                assert len(identifiers) == 0
                # Should now expect the record identifiers...
                # These lines are consumed directly from the shared stream,
                # so the outer loop resumes after the last identifier line.
                for i, line in enumerate(stream):
                    if not line.startswith("# "):
                        raise ValueError("Unexpected line: %s" % line)
                    number, identifier = line[2:].split(":")
                    assert i + 1 == int(number)
                    identifiers.append(identifier.strip())
                    if len(identifiers) == number_of_sequences:
                        break
            elif key == "Matrix":
                matrix = value.strip()
            elif key == "Gap_penalty":
                gap_penalty = float(value.strip())
            elif key == "Extend_penalty":
                extend_penalty = float(value.strip())
            elif key == "Length":
                ncols = int(value.strip())
            elif key == "Identity":
                identity = int(value.strip().split("/")[0])
            elif key == "Similarity":
                similarity = int(value.strip().split("/")[0])
            elif key == "Gaps":
                gaps = int(value.strip().split("/")[0])
            elif key == "Score":
                score = float(value.strip())
        else:
            # parse the sequences
            if not line:
                # empty line
                if index == number_of_sequences:
                    # reached the end of an alignment block
                    index = 0
                    if column == ncols:
                        # reached the end of the sequences
                        coordinates = Alignment.infer_coordinates(
                            aligned_sequences
                        )
                        for i, start in enumerate(starts):
                            start -= 1  # Python counting
                            coordinates[i, :] += start
                        sequences = [Seq(sequence) for sequence in sequences]
                        records = [
                            SeqRecord(sequence, id=identifier)
                            for sequence, identifier in zip(
                                sequences, identifiers
                            )
                        ]
                        alignment = Alignment(records, coordinates)
                        if matrix is not None:
                            alignment.matrix = matrix
                        if gap_penalty is not None:
                            alignment.gap_penalty = gap_penalty
                        if extend_penalty is not None:
                            alignment.extend_penalty = extend_penalty
                        if identity is not None:
                            alignment.identity = identity
                        if similarity is not None:
                            alignment.similarity = similarity
                        if gaps is not None:
                            alignment.gaps = gaps
                        if score is not None:
                            alignment.score = score
                        if consensus:
                            alignment.column_annotations = {
                                "emboss_consensus": consensus
                            }
                        yield alignment
                        # Go back to searching for the next alignment.
                        identifiers = None
                continue
            # The first 21 characters hold the identifier and start position
            # of a row, or are blank on a match line.
            prefix = line[:21].strip()
            if prefix == "":
                # match line
                consensus += line[21:71]
            else:
                identifier, start = prefix.split(None, 1)
                aligned_sequence, end = line[21:].split(None, 1)
                start = int(start)
                end = int(end)
                sequence = aligned_sequence.replace("-", "")
                if len(sequences[index]) > 0:
                    # Continuity check against the end of the previous block.
                    length = len(sequence)
                    if length == 0:
                        assert start == ends[index]
                        assert end == ends[index]
                    else:
                        assert start == ends[index] + 1
                        assert end == ends[index] + length
                assert identifiers[index].startswith(identifier)
                if starts[index] == 0:
                    # Record the start
                    starts[index] = start
                # The end is refreshed on every row so that the continuity
                # check above always compares against the previous block.
                ends[index] = end
                sequences[index] += sequence
                aligned_sequences[index] += aligned_sequence
                if index == 0:
                    column += len(aligned_sequence)
                else:
                    assert column == len(aligned_sequences[index])
                index += 1
def parse(self, stream):
    """Parse a GCG MSF stream and yield its alignment.

    The stream is read in four stages: the first line (which must start with
    a known header word), free text up to the ``MSF: ... ..`` header line,
    the ``Name:`` lines up to ``//``, and finally the interleaved sequence
    blocks.  The gap characters ``~`` and ``.`` are remapped to ``-`` and
    short rows are padded with ``-`` to the alignment length.  Each record
    carries its ``Weight:`` value in ``annotations["weight"]``.

    Nothing is yielded when ``stream`` is None.  (A generator must finish
    with ``return``: under PEP 479 an explicit ``raise StopIteration``
    surfaces as RuntimeError.)

    Raises:
        ValueError: if the file is empty, a header is unknown or malformed,
            a name is duplicated, the file ends prematurely, a line is
            unexpected, or a sequence is longer than declared.

    Warns:
        BiopythonParserWarning: if the column count in the header differs
            from the longest row found.
    """
    if stream is None:
        return
    try:
        line = next(stream)
    except StopIteration:
        raise ValueError("Empty file.") from None
    # Whitelisted headers we know about.
    known_headers = [
        "!!NA_MULTIPLE_ALIGNMENT",
        "!!AA_MULTIPLE_ALIGNMENT",
        "PileUp",
    ]
    # Examples in "Molecular Biology Software Training Manual GCG version 10"
    # by BBSRC Biosciences IT Services (BITS), Harpenden, UK, Copyright 1996-2001
    # would often start as follows:
    #
    # !!AA_MUTIPLE_ALIGNMENT 1.0
    # PileUp of: @/usr/users2/culhane/...
    #
    # etc with other seemingly free format text before getting to the
    # MSF/Type/Check line and the following Name: lines block and // line.
    #
    # MUSCLE just has a line "PileUp", while other sources just use the line
    # "!!AA_MULTIPLE_ALIGNMENT" (amino acid) or "!!NA_MULTIPLE_ALIGNMENT"
    # (nucleotide).
    if line.strip().split()[0] not in known_headers:
        raise ValueError(
            "%s is not a known GCG MSF header: %s"
            % (line.strip().split()[0], ", ".join(known_headers))
        )

    for line in stream:
        line = line.rstrip("\n")
        if "MSF: " in line and line.endswith(".."):
            break
    else:
        raise ValueError("Reached end of file without MSF/Type/Check header line")

    # Quoting from "Molecular Biology Software Training Manual GCG version 10"
    # by BBSRC Biosciences IT Services (BITS), Harpenden, UK. Copyright 1996-2001.
    # Page 31:
    #
    # "Header information is before a .. (double dot) in a GCG format file.
    # The file will also have a checksum specific for that file."
    #
    # This was followed by a single non-aligned sequence, but this convention
    # appears to also be used in the GCG MSF files. Quoting other examples in
    # this reference, page 31:
    #
    # localpileup_17.msf MSF: 195 Type: P January 6, 2000 15:41 Check: 4365 ..
    #
    # Except from page 148:
    #
    # localpileup_106.msf MSF: 457 Type: P November 28, 2000 16:09 Check: 2396 ..
    #
    # Quoting output from MUSCLE v3.8, have two leading spaces and a zero checksum:
    #
    # MSF: 689 Type: N Check: 0000 ..
    #
    # By observation, the MSF value is the column count, type is N (nucleotide)
    # or P (protein / amino acid).
    #
    # In a possible bug, EMBOSS v6.6.0.0 uses CompCheck: rather than Check: as shown,
    #
    # $ seqret -sequence Tests/Fasta/f002 -auto -stdout -osformat msf
    # !!NA_MULTIPLE_ALIGNMENT 1.0
    #
    # stdout MSF: 633 Type: N 01/08/19 CompCheck: 8543 ..
    #
    # Name: G26680 Len: 633 Check: 4334 Weight: 1.00
    # Name: G26685 Len: 633 Check: 3818 Weight: 1.00
    # Name: G29385 Len: 633 Check: 391 Weight: 1.00
    #
    # //
    #
    parts = line.split()
    offset = parts.index("MSF:")
    if parts[offset + 2] != "Type:" or parts[-3] not in ("Check:", "CompCheck:"):
        raise ValueError(
            "GCG MSF header line should be "
            "'<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..', "
            " not: %r" % line
        )
    try:
        aln_length = int(parts[offset + 1])
    except ValueError:
        raise ValueError(
            "GCG MSF header line should have MSF: <int> for column count, not %r"
            % parts[offset + 1]
        ) from None
    seq_type = parts[offset + 3]
    if seq_type not in ["P", "N"]:
        raise ValueError(
            "GCG MSF header line should have 'Type: P' (protein) "
            "or 'Type: N' (nucleotide), not 'Type: %s'" % seq_type
        )

    # There should be a blank line after that header line, then the Name: lines
    #
    # The Name may be followed by 'oo', as shown here:
    #
    # PileUp
    #
    #
    #
    # MSF: 628 Type: P Check: 147 ..
    #
    # Name: AK1H_ECOLI/1-378 oo Len: 628 Check: 3643 Weight: 1.000
    # Name: AKH_HAEIN/1-382 oo Len: 628 Check: 6504 Weight: 1.000
    #
    # //
    names = []
    remaining = []
    checks = []
    weights = []
    for line in stream:
        line = line.strip()
        if line == "//":
            break
        if line.startswith("Name: "):
            words = line.split()
            try:
                index_name = words.index("Name:")
                index_len = words.index("Len:")
                index_weight = words.index("Weight:")
                index_check = words.index("Check:")
            except ValueError:
                raise ValueError(f"Malformed GCG MSF name line: {line!r}") from None
            name = words[index_name + 1]
            length = int(words[index_len + 1])
            weight = float(words[index_weight + 1])
            check = words[index_check + 1]
            if name in names:
                raise ValueError(f"Duplicated ID of {name!r}")
            names.append(name)
            remaining.append(length)
            checks.append(check)
            weights.append(weight)
    else:
        raise ValueError("End of file while looking for end of header // line.")

    try:
        line = next(stream)
    except StopIteration:
        raise ValueError("End of file after // line, expected sequences.") from None
    if line.strip():
        raise ValueError("After // line, expected blank line before sequences.")

    # Now load the sequences
    seqs = [""] * len(names)
    for line in stream:
        words = line.split()
        if not words:
            continue
        name = words[0]
        try:
            index = names.index(name)
        except ValueError:
            # This may be a coordinate line
            for word in words:
                if not word.isdigit():
                    break
            else:
                # all words are integers; assume this is a coordinate line
                continue
            raise ValueError(f"Unexpected line '{line}' in input") from None
        seq = "".join(words[1:])
        # Count down the declared length; "-" characters are not counted.
        length = remaining[index] - (len(seq) - seq.count("-"))
        if length < 0:
            raise ValueError("Received longer sequence than expected for %s" % name)
        seqs[index] += seq
        remaining[index] = length
        if all(length == 0 for length in remaining):
            break
    else:
        raise ValueError("End of file where expecting sequence data.")

    length = max(len(seq) for seq in seqs)
    if length != aln_length:
        warnings.warn(
            "GCG MSF headers said alignment length %i, but found %i"
            % (aln_length, length),
            BiopythonParserWarning,
        )
        aln_length = length

    # Combine list of strings into single string, remap gaps
    for index, seq in enumerate(seqs):
        seq = "".join(seq).replace("~", "-").replace(".", "-")
        if len(seq) < aln_length:
            seq += "-" * (aln_length - len(seq))
        seqs[index] = seq

    coordinates = Alignment.infer_coordinates(seqs)
    seqs = (Seq(seq.replace("-", "")) for seq in seqs)
    records = [
        SeqRecord(
            seq,
            id=name,
            name=name,
            description=name,
            annotations={"weight": weight},
        )
        for (name, seq, weight) in zip(names, seqs, weights)
    ]

    alignment = Alignment(records, coordinates)
    # This will check alignment lengths are self-consistent:
    rows, columns = alignment.shape
    if columns != aln_length:
        raise ValueError(
            "GCG MSF headers said alignment length %i, but found %i"
            % (aln_length, columns)
        )
    yield alignment
def parse(self, stream):
    """Parse PSL lines from the stream and yield one Alignment per line.

    If ``self.line`` holds a line (it is reset to None here), that line is
    processed before the lines of ``stream``.  Every line must consist of 21
    whitespace-separated columns, used as follows (0-based):

        0-3    matches, misMatches, repMatches, nCount (stored as attributes)
        4-7    qNumInsert, qBaseInsert, tNumInsert, tBaseInsert (verified)
        8      strand
        9-12   qName, qSize, qStart, qEnd
        13-16  tName, tSize, tStart, tEnd
        17     blockCount
        18-20  comma-separated blockSizes, qStarts, tStarts

    The coordinates are built from the block lists and then checked against
    the insert counts and the start/end columns.

    Nothing is yielded when ``stream`` is None.  (A generator must finish
    with ``return``: under PEP 479 an explicit ``raise StopIteration``
    surfaces as RuntimeError.)

    Raises:
        ValueError: if a line does not have 21 columns, or if the block
            lists, insert counts or start/end columns are inconsistent.
    """
    if stream is None:
        return
    line = self.line
    self.line = None
    if line is not None:
        lines = chain([line], stream)
    else:
        lines = stream
    for line in lines:
        words = line.split()
        if len(words) != 21:
            raise ValueError("line has %d columns; expected 21" % len(words))
        strand = words[8]
        qName = words[9]
        qSize = int(words[10])
        tName = words[13]
        tSize = int(words[14])
        blockCount = int(words[17])
        # The comma-separated lists carry a trailing comma.
        blockSizes = [
            int(blockSize) for blockSize in words[18].rstrip(",").split(",")
        ]
        qStarts = [int(start) for start in words[19].rstrip(",").split(",")]
        tStarts = [int(start) for start in words[20].rstrip(",").split(",")]
        if len(blockSizes) != blockCount:
            raise ValueError(
                "Inconsistent number of blocks (%d found, expected %d)"
                % (len(blockSizes), blockCount)
            )
        if len(qStarts) != blockCount:
            raise ValueError(
                "Inconsistent number of query start positions (%d found, expected %d)"
                % (len(qStarts), blockCount)
            )
        if len(tStarts) != blockCount:
            raise ValueError(
                "Inconsistent number of target start positions (%d found, expected %d)"
                % (len(tStarts), blockCount)
            )
        target_sequence = Seq(None, length=tSize)
        target_record = SeqRecord(target_sequence, id=tName)
        query_sequence = Seq(None, length=qSize)
        query_record = SeqRecord(query_sequence, id=qName)
        records = [target_record, query_record]
        qBlockSizes = numpy.array(blockSizes)
        qStarts = numpy.array(qStarts)
        tStarts = numpy.array(tStarts)
        if strand in ("++", "+-"):
            # protein sequence aligned against translated DNA sequence
            tBlockSizes = 3 * qBlockSizes
        else:
            tBlockSizes = qBlockSizes
        # Walk the blocks, adding a coordinate column wherever the target or
        # the query jumps (a gap) and at the end of every block.
        qPosition = qStarts[0]
        tPosition = tStarts[0]
        coordinates = [[tPosition, qPosition]]
        for tBlockSize, qBlockSize, tStart, qStart in zip(
            tBlockSizes, qBlockSizes, tStarts, qStarts
        ):
            if tStart != tPosition:
                coordinates.append([tStart, qPosition])
                tPosition = tStart
            if qStart != qPosition:
                coordinates.append([tPosition, qStart])
                qPosition = qStart
            tPosition += tBlockSize
            qPosition += qBlockSize
            coordinates.append([tPosition, qPosition])
        coordinates = numpy.array(coordinates).transpose()
        # Recount the inserts from the coordinates; inserts touching either
        # end of a sequence are not counted.
        qNumInsert = 0
        qBaseInsert = 0
        tNumInsert = 0
        tBaseInsert = 0
        tStart, qStart = coordinates[:, 0]
        for tEnd, qEnd in coordinates[:, 1:].transpose():
            tCount = tEnd - tStart
            qCount = qEnd - qStart
            if tCount == 0:
                if qStart > 0 and qEnd < qSize:
                    qNumInsert += 1
                    qBaseInsert += qCount
                qStart = qEnd
            elif qCount == 0:
                if tStart > 0 and tEnd < tSize:
                    tNumInsert += 1
                    tBaseInsert += tCount
                tStart = tEnd
            else:
                tStart = tEnd
                qStart = qEnd
        if qNumInsert != int(words[4]):
            raise ValueError(
                "Inconsistent qNumInsert found (%s, expected %d)"
                % (words[4], qNumInsert)
            )
        if qBaseInsert != int(words[5]):
            raise ValueError(
                "Inconsistent qBaseInsert found (%s, expected %d)"
                % (words[5], qBaseInsert)
            )
        if tNumInsert != int(words[6]):
            raise ValueError(
                "Inconsistent tNumInsert found (%s, expected %d)"
                % (words[6], tNumInsert)
            )
        if tBaseInsert != int(words[7]):
            raise ValueError(
                "Inconsistent tBaseInsert found (%s, expected %d)"
                % (words[7], tBaseInsert)
            )
        qStart = int(words[11])
        qEnd = int(words[12])
        tStart = int(words[15])
        tEnd = int(words[16])
        if strand == "-":
            qStart, qEnd = qEnd, qStart
            coordinates[1, :] = qSize - coordinates[1, :]
        elif strand == "+-":
            tStart, tEnd = tEnd, tStart
            coordinates[0, :] = tSize - coordinates[0, :]
        if tStart != coordinates[0, 0]:
            raise ValueError(
                "Inconsistent tStart found (%d, expected %d)"
                % (tStart, coordinates[0, 0])
            )
        if tEnd != coordinates[0, -1]:
            raise ValueError(
                "Inconsistent tEnd found (%d, expected %d)"
                % (tEnd, coordinates[0, -1])
            )
        if qStart != coordinates[1, 0]:
            raise ValueError(
                "Inconsistent qStart found (%d, expected %d)"
                % (qStart, coordinates[1, 0])
            )
        if qEnd != coordinates[1, -1]:
            raise ValueError(
                "Inconsistent qEnd found (%d, expected %d)"
                % (qEnd, coordinates[1, -1])
            )
        alignment = Alignment(records, coordinates)
        alignment.matches = int(words[0])
        alignment.misMatches = int(words[1])
        alignment.repMatches = int(words[2])
        alignment.nCount = int(words[3])
        yield alignment
def parse(self, stream):
    """Parse a Stockholm stream and yield one Alignment per record.

    A record starts at a ``# STOCKHOLM 1.0`` line and ends at ``//``.  Within
    a record, lines are dispatched on their prefix:

    * no ``#``: ``<seqname> <aligned sequence>`` (dots are treated as gaps);
    * ``#=GF``: per-file annotation; references (RN/RM/RT/RA/RL/RC),
      database references (DR/DC) and nested domains (NE/NL) are collected
      separately, everything else is stored as a list of strings per feature;
    * ``#=GC``: per-column annotation, concatenated across blocks;
    * ``#=GS``: per-sequence annotation (DR may occur more than once);
    * ``#=GR``: per-sequence and per-column annotation.

    At ``//`` the collected data is handed to the ``_store_*`` helpers of
    ``AlignmentIterator`` and the alignment is yielded.

    Nothing is yielded when ``stream`` is None.  (A generator must finish
    with ``return``: under PEP 479 an explicit ``raise StopIteration``
    surfaces as RuntimeError.)

    Raises:
        ValueError: if a sequence line cannot be split into name and
            sequence, or if aligned sequences differ in length.
        AssertionError: if reference, database-reference, nested-domain,
            GS or GR lines appear in an inconsistent order.
    """
    if stream is None:
        return
    for line in stream:
        line = line.strip()
        if not line:
            continue
        elif line == "# STOCKHOLM 1.0":
            # Starting a new alignment
            records = []
            aligned_sequences = []
            references = []
            reference_comments = []
            database_references = []
            nested_domains = []
            gf = defaultdict(list)
            gc = {}
            gs = defaultdict(lambda: {"DR": []})
            gr = defaultdict(dict)
            length = None
        elif line == "//":
            # Reached the end of the alignment.
            skipped_columns = []
            coordinates = Alignment.infer_coordinates(
                aligned_sequences, skipped_columns
            )
            skipped_columns = set(skipped_columns)
            alignment = Alignment(records, coordinates)
            alignment.annotations = {}
            if references:
                alignment.annotations["references"] = []
                for reference in references:
                    # Multi-line fields were collected as lists of strings.
                    reference = dict(reference)
                    reference["title"] = " ".join(reference["title"])
                    reference["author"] = " ".join(reference["author"])
                    reference["location"] = " ".join(reference["location"])
                    alignment.annotations["references"].append(reference)
            if database_references:
                alignment.annotations["database references"] = database_references
            if nested_domains:
                alignment.annotations["nested domains"] = nested_domains
            rows, columns = alignment.shape
            AlignmentIterator._store_per_file_annotations(alignment, gf, rows)
            AlignmentIterator._store_per_column_annotations(
                alignment, gc, columns, skipped_columns
            )
            AlignmentIterator._store_per_sequence_annotations(alignment, gs)
            AlignmentIterator._store_per_sequence_and_per_column_annotations(
                alignment, gr
            )
            yield alignment
        elif not line.startswith("#"):
            # Sequence
            # Format: "<seqname> <sequence>"
            try:
                seqname, aligned_sequence = line.split(None, 1)
            except ValueError:
                # This might be someone attempting to store a zero length sequence?
                raise ValueError(
                    "Could not split line into sequence name and aligned sequence:\n"
                    + line
                ) from None
            if length is None:
                length = len(aligned_sequence)
            elif length != len(aligned_sequence):
                raise ValueError(
                    f"Aligned sequence {seqname} consists of {len(aligned_sequence)} letters, expected {length} letters)"
                )
            aligned_sequence = aligned_sequence.replace(".", "-")
            sequence = aligned_sequence.replace("-", "")
            aligned_sequences.append(aligned_sequence)
            seq = Seq(sequence)
            record = SeqRecord(seq, id=seqname)
            records.append(record)
        elif line.startswith("#=GF "):
            # Generic per-File annotation, free text
            # Format: #=GF <feature> <free text>
            feature, text = line[5:].strip().split(None, 1)
            if feature == "RN":
                # A reference number opens a new reference; the following
                # RM/RT/RA/RL lines are added to it.
                assert text.startswith("[")
                assert text.endswith("]")
                number = int(text[1:-1])
                reference = defaultdict(list)
                reference["number"] = number
                if reference_comments:
                    # RC lines seen since the previous reference are
                    # attached to this one.
                    reference["comment"] = " ".join(reference_comments)
                    reference_comments = []
                references.append(reference)
            elif feature == "RM":
                assert not reference["medline"]
                reference["medline"] = text
            elif feature == "RT":
                reference["title"].append(text)
            elif feature == "RA":
                reference["author"].append(text)
            elif feature == "RL":
                reference["location"].append(text)
            elif feature == "RC":
                reference_comments.append(text)
            elif feature == "DR":
                database_reference = {"reference": text}
                database_references.append(database_reference)
            elif feature == "DC":
                assert "comment" not in database_reference
                database_reference["comment"] = text
            elif feature == "NE":
                nested_domain = {"accession": text}
                nested_domains.append(nested_domain)
            elif feature == "NL":
                assert "location" not in nested_domain
                nested_domain["location"] = text
            else:
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                gf[feature].append(text)
        elif line.startswith("#=GC "):
            # Generic per-Column annotation, exactly 1 char per column
            # Format: "#=GC <feature> <exactly 1 char per column>"
            feature, text = line[5:].strip().split(None, 2)
            if feature not in gc:
                gc[feature] = ""
            gc[feature] += text.strip()  # append to any previous entry
            # Might be interleaved blocks, so can't check length yet
        elif line.startswith("#=GS "):
            # Generic per-Sequence annotation, free text
            # Format: "#=GS <seqname> <feature> <free text>"
            try:
                seqname, feature, text = line[5:].strip().split(None, 2)
            except ValueError:
                # Free text can sometimes be empty, which a one line split throws an error for.
                # See https://github.com/biopython/biopython/issues/2982 for more details
                seqname, feature = line[5:].strip().split(None, 1)
                text = ""
            if feature == "DR":
                gs[seqname][feature].append(text)
            else:
                assert feature not in gs[seqname]
                gs[seqname][feature] = text
        elif line[:5] == "#=GR ":
            # Generic per-Sequence AND per-Column markup
            # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
            terms = line[5:].split(None, 2)
            # The markup must belong to the sequence name seen most recently.
            assert terms[0] == seqname
            feature = terms[1]
            gr[seqname][feature] = terms[2].strip()
def parse(self, stream):
    """Parse the next alignment from the stream.

    Each line of ``stream`` is split on whitespace into BED columns and
    turned into one pairwise ``Alignment`` of a target record (``chrom``)
    and a query record (``name``), which is then yielded.

    Columns used, by zero-based index: 0 chrom, 1 chromStart, 2 chromEnd,
    3 name, 4 score, 5 strand, 6 thickStart, 7 thickEnd, 8 itemRgb,
    9 blockCount, 10 blockSizes, 11 blockStarts. Optional columns that are
    present are stored as attributes on the yielded alignment (``score``,
    ``thickStart``, ``thickEnd``, ``itemRgb``).

    Arguments:
     - stream - iterable of text lines, or None, in which case nothing
       is yielded.

    Raises ValueError if a line has fewer than 3 or more than 12 columns,
    if the number of block sizes or block starts disagrees with
    blockCount, if chromStart/chromEnd disagree with the block
    coordinates, or if an integer column cannot be converted by ``int``.
    NOTE(review): a line with exactly 10 or 11 columns raises IndexError,
    because columns 10 and 11 are read whenever more than 9 are present.
    """
    if stream is None:
        # End the generator with a plain return: raising StopIteration
        # inside a generator body is turned into RuntimeError (PEP 479).
        return

    for line in stream:
        words = line.split()
        bedN = len(words)
        if bedN < 3 or bedN > 12:
            raise ValueError("expected between 3 and 12 columns, found %d" % bedN)

        chrom = words[0]
        chromStart = int(words[1])
        chromEnd = int(words[2])
        name = words[3] if bedN > 3 else None
        strand = words[5] if bedN > 5 else "+"

        if bedN > 9:
            blockCount = int(words[9])
            # The comma-separated lists may carry a trailing comma.
            blockSizes = [
                int(blockSize) for blockSize in words[10].rstrip(",").split(",")
            ]
            blockStarts = [
                int(blockStart) for blockStart in words[11].rstrip(",").split(",")
            ]
            if len(blockSizes) != blockCount:
                raise ValueError(
                    "Inconsistent number of block sizes (%d found, expected %d)"
                    % (len(blockSizes), blockCount)
                )
            if len(blockStarts) != blockCount:
                raise ValueError(
                    "Inconsistent number of block start positions (%d found, expected %d)"
                    % (len(blockStarts), blockCount)
                )
            blockSizes = numpy.array(blockSizes)
            blockStarts = numpy.array(blockStarts)
            # Walk the blocks, recording [target, query] positions at every
            # block boundary; a jump in the target position between blocks
            # adds a step in which only the target advances.
            tPosition = 0
            qPosition = 0
            coordinates = [[tPosition, qPosition]]
            for blockSize, blockStart in zip(blockSizes, blockStarts):
                if blockStart != tPosition:
                    coordinates.append([blockStart, qPosition])
                    tPosition = blockStart
                tPosition += blockSize
                qPosition += blockSize
                coordinates.append([tPosition, qPosition])
            # After transposing, row 0 holds target and row 1 query positions.
            coordinates = numpy.array(coordinates).transpose()
            qSize = sum(blockSizes)
        else:
            # No block columns: a single ungapped block spanning the feature.
            blockSize = chromEnd - chromStart
            coordinates = numpy.array([[0, blockSize], [0, blockSize]])
            qSize = blockSize

        # Block starts are relative to chromStart; shift the target row.
        coordinates[0, :] += chromStart

        query_sequence = Seq(None, length=qSize)
        query_record = SeqRecord(query_sequence, id=name)
        target_record = SeqRecord(None, id=chrom)
        records = [target_record, query_record]

        if strand == "-":
            # Count query positions from the other end for the minus strand.
            coordinates[1, :] = qSize - coordinates[1, :]

        if chromStart != coordinates[0, 0]:
            raise ValueError(
                "Inconsistent chromStart found (%d, expected %d)"
                % (chromStart, coordinates[0, 0])
            )
        if chromEnd != coordinates[0, -1]:
            raise ValueError(
                "Inconsistent chromEnd found (%d, expected %d)"
                % (chromEnd, coordinates[0, -1])
            )

        alignment = Alignment(records, coordinates)

        # Attach the optional columns that are present on this line.
        if bedN > 4:
            score = words[4]
            try:
                score = float(score)
            except ValueError:
                # Not numeric: keep the score as the original string.
                pass
            else:
                if score.is_integer():
                    score = int(score)
            alignment.score = score
        if bedN > 6:
            alignment.thickStart = int(words[6])
        if bedN > 7:
            alignment.thickEnd = int(words[7])
        if bedN > 8:
            alignment.itemRgb = words[8]
        yield alignment
def rename_alignment_taxa(aln, name_map):
    """Build a new alignment whose records are relabelled via name_map.

    Every record in ``aln`` has its ``id`` and ``name`` overwritten with
    ``name_map[record.id]`` (looked up with the record's current id) and is
    then appended to a fresh, initially empty gapped-DNA alignment. The
    records are modified in place, so ``aln`` sees the new labels as well.

    Arguments:
     - aln      - iterable of records exposing ``id`` and ``name``.
     - name_map - mapping from current record id to the replacement label;
       a missing id fails in whatever way ``name_map[...]`` does
       (KeyError for a plain dict).

    Returns the newly built alignment.
    """
    gapped_dna = Gapped(IUPAC.unambiguous_dna, "-")
    renamed = Alignment([], alphabet=gapped_dna)
    for record in aln:
        # Look up the label before touching the record's id.
        label = name_map[record.id]
        record.id = label
        record.name = label
        renamed.append(record)
    return renamed