def build_hsp(): if not query_tags and not match_tags: raise ValueError("No data for query %r, match %r" % (query_id, match_id)) assert query_tags, query_tags assert match_tags, match_tags evalue = align_tags.get("fa_expect") q = "?" # Just for printing len(q) in debug below m = "?" # Just for printing len(m) in debug below tool = global_tags.get("tool", "").upper() q = _extract_alignment_region(query_seq, query_tags) if tool in ["TFASTX"] and len(match_seq) == len(q): m = match_seq # Quick hack until I can work out how -, * and / characters # and the apparent mix of aa and bp coordinates works. else: m = _extract_alignment_region(match_seq, match_tags) if len(q) != len(m): message = """Darn... amino acids vs nucleotide coordinates? tool: {0} query_seq: {1} query_tags: {2} {3} length: {4} match_seq: {5} match_tags: {6} {7} length: {8} handle.name: {9} """.format(tool, query_seq, query_tags, q, len(q), match_seq, match_tags, m, len(m), handle.name) raise ValueError(message) assert alphabet is not None alignment = MultipleSeqAlignment([], alphabet) # TODO - Introduce an annotated alignment class? # See also Bio/AlignIO/MafIO.py for same requirement. # For now, store the annotation a new private property: alignment._annotations = {} # Want to record both the query header tags, and the alignment tags. for key, value in header_tags.items(): alignment._annotations[key] = value for key, value in align_tags.items(): alignment._annotations[key] = value # Query # ===== record = SeqRecord(Seq(q, alphabet), id=query_id, name="query", description=query_descr, annotations={"original_length": int(query_tags["sq_len"])}) # TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_tags["al_start"]) record._al_stop = int(query_tags["al_stop"]) alignment.append(record) # TODO - What if a specific alphabet has been requested? # TODO - Use an IUPAC alphabet? # TODO - Can FASTA output RNA? if alphabet == single_letter_alphabet and "sq_type" in query_tags: if query_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif query_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in q: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") # Match # ===== record = SeqRecord(Seq(m, alphabet), id=match_id, name="match", description=match_descr, annotations={"original_length": int(match_tags["sq_len"])}) # TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_tags["al_start"]) record._al_stop = int(match_tags["al_stop"]) alignment.append(record) # This is still a very crude way of dealing with the alphabet: if alphabet == single_letter_alphabet and "sq_type" in match_tags: if match_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif match_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in m: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") return alignment
def build_hsp(): if not query_tags and not match_tags: raise ValueError("No data for query %r, match %r" % (query_id, match_id)) assert query_tags, query_tags assert match_tags, match_tags evalue = align_tags.get("fa_expect") q = "?" # Just for printing len(q) in debug below m = "?" # Just for printing len(m) in debug below tool = global_tags.get("tool", "").upper() q = _extract_alignment_region(query_seq, query_tags) if tool in ["TFASTX"] and len(match_seq) == len(q): m = match_seq # Quick hack until I can work out how -, * and / characters # and the apparent mix of aa and bp coordinates works. else: m = _extract_alignment_region(match_seq, match_tags) if len(q) != len(m): message = """Darn... amino acids vs nucleotide coordinates? tool: {0} query_seq: {1} query_tags: {2} {3} length: {4} match_seq: {5} match_tags: {6} {7} length: {8} handle.name: {9} """.format(tool, query_seq, query_tags, q, len(q), match_seq, match_tags, m, len(m), handle.name) raise ValueError(message) assert alphabet is not None alignment = MultipleSeqAlignment([], alphabet) # TODO - Introduce an annotated alignment class? # See also Bio/AlignIO/MafIO.py for same requirement. # For now, store the annotation a new private property: alignment._annotations = {} # Want to record both the query header tags, and the alignment tags. for key, value in header_tags.items(): alignment._annotations[key] = value for key, value in align_tags.items(): alignment._annotations[key] = value # Query # ===== record = SeqRecord( Seq(q, alphabet), id=query_id, name="query", description=query_descr, annotations={"original_length": int(query_tags["sq_len"])}) # TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_tags["al_start"]) record._al_stop = int(query_tags["al_stop"]) alignment.append(record) # TODO - What if a specific alphabet has been requested? # TODO - Use an IUPAC alphabet? # TODO - Can FASTA output RNA? if alphabet == single_letter_alphabet and "sq_type" in query_tags: if query_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif query_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in q: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") # Match # ===== record = SeqRecord( Seq(m, alphabet), id=match_id, name="match", description=match_descr, annotations={"original_length": int(match_tags["sq_len"])}) # TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_tags["al_start"]) record._al_stop = int(match_tags["al_stop"]) alignment.append(record) # This is still a very crude way of dealing with the alphabet: if alphabet == single_letter_alphabet and "sq_type" in match_tags: if match_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif match_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in m: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") return alignment
def FastaM10Iterator(handle, alphabet=single_letter_alphabet): """Alignment iterator for the FASTA tool's pairwise alignment output. This is for reading the pairwise alignments output by Bill Pearson's FASTA program when called with the -m 10 command line option for machine readable output. For more details about the FASTA tools, see the website http://fasta.bioch.virginia.edu/ and the paper: W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 This class is intended to be used via the Bio.AlignIO.parse() function by specifying the format as "fasta-m10" as shown in the following code: from Bio import AlignIO handle = ... for a in AlignIO.parse(handle, "fasta-m10"): assert len(a) == 2, "Should be pairwise!" print "Alignment length %i" % a.get_alignment_length() for record in a: print record.seq, record.name, record.id Note that this is not a full blown parser for all the information in the FASTA output - for example, most of the header and all of the footer is ignored. Also, the alignments are not batched according to the input queries. Also note that there can be up to about 30 letters of flanking region included in the raw FASTA output as contextual information. This is NOT part of the alignment itself, and is not included in the resulting MultipleSeqAlignment objects returned. """ if alphabet is None: alphabet = single_letter_alphabet state_PREAMBLE = -1 state_NONE = 0 state_QUERY_HEADER = 1 state_ALIGN_HEADER = 2 state_ALIGN_QUERY = 3 state_ALIGN_MATCH = 4 state_ALIGN_CONS = 5 def build_hsp(): if not query_tags and not match_tags: raise ValueError("No data for query %r, match %r" \ % (query_id, match_id)) assert query_tags, query_tags assert match_tags, match_tags evalue = align_tags.get("fa_expect", None) q = "?" #Just for printing len(q) in debug below m = "?" #Just for printing len(m) in debug below tool = global_tags.get("tool", "").upper() try: q = _extract_alignment_region(query_seq, query_tags) if tool in ["TFASTX"] and len(match_seq) == len(q): m = match_seq #Quick hack until I can work out how -, * and / characters #and the apparent mix of aa and bp coordindates works. else: m = _extract_alignment_region(match_seq, match_tags) assert len(q) == len(m) except AssertionError, err: print "Darn... amino acids vs nucleotide coordinates?" print tool print query_seq print query_tags print q, len(q) print match_seq print match_tags print m, len(m) print handle.name raise err assert alphabet is not None alignment = MultipleSeqAlignment([], alphabet) #TODO - Introduce an annotated alignment class? #For now, store the annotation a new private property: alignment._annotations = {} #Want to record both the query header tags, and the alignment tags. for key, value in header_tags.iteritems(): alignment._annotations[key] = value for key, value in align_tags.iteritems(): alignment._annotations[key] = value #Query #===== record = SeqRecord( Seq(q, alphabet), id=query_id, name="query", description=query_descr, annotations={"original_length": int(query_tags["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_tags["al_start"]) record._al_stop = int(query_tags["al_stop"]) alignment.append(record) #TODO - What if a specific alphabet has been requested? #TODO - Use an IUPAC alphabet? #TODO - Can FASTA output RNA? if alphabet == single_letter_alphabet and "sq_type" in query_tags: if query_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif query_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in q: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") #Match #===== record = SeqRecord( Seq(m, alphabet), id=match_id, name="match", description=match_descr, annotations={"original_length": int(match_tags["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_tags["al_start"]) record._al_stop = int(match_tags["al_stop"]) alignment.append(record) #This is still a very crude way of dealing with the alphabet: if alphabet == single_letter_alphabet and "sq_type" in match_tags: if match_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif match_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in m: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") return alignment
def FastaM10Iterator(handle, alphabet=single_letter_alphabet): """Alignment iterator for the FASTA tool's pairwise alignment output. This is for reading the pairwise alignments output by Bill Pearson's FASTA program when called with the -m 10 command line option for machine readable output. For more details about the FASTA tools, see the website http://fasta.bioch.virginia.edu/ and the paper: W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 This class is intended to be used via the Bio.AlignIO.parse() function by specifying the format as "fasta-m10" as shown in the following code: from Bio import AlignIO handle = ... for a in AlignIO.parse(handle, "fasta-m10"): assert len(a) == 2, "Should be pairwise!" print "Alignment length %i" % a.get_alignment_length() for record in a: print record.seq, record.name, record.id Note that this is not a full blown parser for all the information in the FASTA output - for example, most of the header and all of the footer is ignored. Also, the alignments are not batched according to the input queries. Also note that there can be up to about 30 letters of flanking region included in the raw FASTA output as contextual information. This is NOT part of the alignment itself, and is not included in the resulting MultipleSeqAlignment objects returned. """ if alphabet is None: alphabet = single_letter_alphabet state_PREAMBLE = -1 state_NONE = 0 state_QUERY_HEADER = 1 state_ALIGN_HEADER = 2 state_ALIGN_QUERY = 3 state_ALIGN_MATCH = 4 state_ALIGN_CONS = 5 def build_hsp(): if not query_tags and not match_tags: raise ValueError("No data for query %r, match %r" % (query_id, match_id)) assert query_tags, query_tags assert match_tags, match_tags evalue = align_tags.get("fa_expect", None) q = "?" # Just for printing len(q) in debug below m = "?" # Just for printing len(m) in debug below tool = global_tags.get("tool", "").upper() try: q = _extract_alignment_region(query_seq, query_tags) if tool in ["TFASTX"] and len(match_seq) == len(q): m = match_seq # Quick hack until I can work out how -, * and / characters # and the apparent mix of aa and bp coordinates works. else: m = _extract_alignment_region(match_seq, match_tags) assert len(q) == len(m) except AssertionError, err: print "Darn... amino acids vs nucleotide coordinates?" print tool print query_seq print query_tags print q, len(q) print match_seq print match_tags print m, len(m) print handle.name raise err assert alphabet is not None alignment = MultipleSeqAlignment([], alphabet) # TODO - Introduce an annotated alignment class? # For now, store the annotation a new private property: alignment._annotations = {} # Want to record both the query header tags, and the alignment tags. for key, value in header_tags.iteritems(): alignment._annotations[key] = value for key, value in align_tags.iteritems(): alignment._annotations[key] = value # Query # ===== record = SeqRecord( Seq(q, alphabet), id=query_id, name="query", description=query_descr, annotations={"original_length": int(query_tags["sq_len"])}, ) # TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_tags["al_start"]) record._al_stop = int(query_tags["al_stop"]) alignment.append(record) # TODO - What if a specific alphabet has been requested? # TODO - Use an IUPAC alphabet? # TODO - Can FASTA output RNA? if alphabet == single_letter_alphabet and "sq_type" in query_tags: if query_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif query_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in q: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") # Match # ===== record = SeqRecord( Seq(m, alphabet), id=match_id, name="match", description=match_descr, annotations={"original_length": int(match_tags["sq_len"])}, ) # TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_tags["al_start"]) record._al_stop = int(match_tags["al_stop"]) alignment.append(record) # This is still a very crude way of dealing with the alphabet: if alphabet == single_letter_alphabet and "sq_type" in match_tags: if match_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif match_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in m: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") return alignment
def next(self): """Reads from the handle to construct and return the next alignment. This returns the pairwise alignment of query and match/library sequences as an MultipleSeqAlignment object containing two rows. """ handle = self.handle try: #Header we saved from when we were parsing #the previous alignment. line = self._header del self._header except AttributeError: line = handle.readline() if not line: raise StopIteration if line.startswith("#"): #Skip the file header before the alignments. e.g. line = self._skip_file_header(line) while ">>>" in line and not line.startswith(">>>"): #Moved onto the next query sequence! self._query_descr = "" self._query_header_annotation = {} #Read in the query header line = self._parse_query_header(line) #Now should be some alignments, but if not we move onto the next query if not line: #End of file raise StopIteration if ">>><<<" in line: #Reached the end of the alignments, no need to read the footer... raise StopIteration #Should start >>... and not >>>... assert line[0:2] == ">>" and not line[2] == ">", line query_seq_parts, match_seq_parts = [], [] query_annotation, match_annotation = {}, {} match_descr = "" alignment_annotation = {} #This should be followed by the target match ID line, then more tags. #e.g. """ >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] ; fa_frame: f ; fa_initn: 52 ; fa_init1: 52 ; fa_opt: 70 ; fa_z-score: 105.5 ; fa_bits: 27.5 ; fa_expect: 0.082 ; sw_score: 70 ; sw_ident: 0.279 ; sw_sim: 0.651 ; sw_overlap: 43 """ if (not line[0:2] == ">>") or line[0:3] == ">>>": raise ValueError("Expected target line starting '>>'") match_descr = line[2:].strip() #Handle the following "alignment hit" tagged data, e.g. line = handle.readline() line = self._parse_tag_section(line, alignment_annotation) assert not line[0:2] == "; " #Then we have the alignment numbers and sequence for the query """ >gi|10955265| .. ; sq_len: 346 ; sq_offset: 1 ; sq_type: p ; al_start: 197 ; al_stop: 238 ; al_display_start: 167 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL GEYFTENKPKYIIREIHQET """ if not (line[0] == ">" and line.strip().endswith("..")): raise ValueError("Expected line starting '>' and ending '..'") assert self._query_descr.startswith(line[1:].split(None, 1)[0]) #Handle the following "query alignment" tagged data line = handle.readline() line = self._parse_tag_section(line, query_annotation) assert not line[0:2] == "; " #Now should have the aligned query sequence (with leading flanking region) while not line[0] == ">": query_seq_parts.append(line.strip()) line = handle.readline() #Handle the following "match alignment" data """ >gi|152973545|ref|YP_001338596.1| .. ; sq_len: 242 ; sq_type: p ; al_start: 52 ; al_stop: 94 ; al_display_start: 22 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR QDFAFTRKMRREARQVEQSW """ #Match identifier if not (line[0] == ">" and line.strip().endswith("..")): raise ValueError( "Expected line starting '>' and ending '..', got '%s'" % repr(line)) assert match_descr.startswith(line[1:].split(None, 1)[0]) #Tagged data, line = handle.readline() line = self._parse_tag_section(line, match_annotation) assert not line[0:2] == "; " #Now should have the aligned query sequence with flanking region... #but before that, since FASTA 35.4.1 there can be an consensus here, """ ; al_cons: .::. : :. ---. :: :. . : ..-:::-: :.: ..:...: etc """ while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): match_seq_parts.append(line.strip()) line = handle.readline() if line[0:2] == "; ": assert line.strip() == "; al_cons:" align_consensus_parts = [] line = handle.readline() while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): align_consensus_parts.append(line.strip()) line = handle.readline() #If we do anything with this in future, must remove any flanking region. align_consensus = "".join(align_consensus_parts) del align_consensus_parts assert not line[0:2] == "; " else: align_consensus = None assert (line[0] == ">" or ">>>" in line) self._header = line #We built a list of strings and then joined them because #its faster than appending to a string. query_seq = "".join(query_seq_parts) match_seq = "".join(match_seq_parts) del query_seq_parts, match_seq_parts #Note, query_seq and match_seq will usually be of different lengths, apparently #because in the m10 format leading gaps are added but not trailing gaps! #Remove the flanking regions, query_align_seq = self._extract_alignment_region( query_seq, query_annotation) match_align_seq = self._extract_alignment_region( match_seq, match_annotation) #How can we do this for the (optional) consensus? #The "sq_offset" values can be specified with the -X command line option. #They appear to just shift the origin used in the calculation of the coordinates. if len(query_align_seq) != len(match_align_seq): raise ValueError( "Problem parsing the alignment sequence coordinates, " "following should be the same length but are not:\n" "%s - len %i\n%s - len %i" % (query_align_seq, len(query_align_seq), match_align_seq, len(match_align_seq))) if "sw_overlap" in alignment_annotation: if int(alignment_annotation["sw_overlap"]) != len(query_align_seq): raise ValueError("Specified sw_overlap = %s does not match expected value %i" \ % (alignment_annotation["sw_overlap"], len(query_align_seq))) #TODO - Look at the "sq_type" to assign a sensible alphabet? alphabet = self.alphabet alignment = MultipleSeqAlignment([], alphabet) #TODO - Introduce an annotated alignment class? #For now, store the annotation a new private property: alignment._annotations = {} #Want to record both the query header tags, and the alignment tags. for key, value in self._query_header_annotation.iteritems(): alignment._annotations[key] = value for key, value in alignment_annotation.iteritems(): alignment._annotations[key] = value #Query #===== record = SeqRecord( Seq(query_align_seq, alphabet), id=self._query_descr.split(None, 1)[0].strip(","), name="query", description=self._query_descr, annotations={"original_length": int(query_annotation["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_annotation["al_start"]) record._al_stop = int(query_annotation["al_stop"]) alignment.append(record) #TODO - What if a specific alphabet has been requested? #TODO - Use an IUPAC alphabet? #TODO - Can FASTA output RNA? if alphabet == single_letter_alphabet and "sq_type" in query_annotation: if query_annotation["sq_type"] == "D": record.seq.alphabet = generic_dna elif query_annotation["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in query_align_seq: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") #Match #===== record = SeqRecord( Seq(match_align_seq, alphabet), id=match_descr.split(None, 1)[0].strip(","), name="match", description=match_descr, annotations={"original_length": int(match_annotation["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_annotation["al_start"]) record._al_stop = int(match_annotation["al_stop"]) alignment.append(record) #This is still a very crude way of dealing with the alphabet: if alphabet == single_letter_alphabet and "sq_type" in match_annotation: if match_annotation["sq_type"] == "D": record.seq.alphabet = generic_dna elif match_annotation["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in match_align_seq: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") return alignment
def next(self): """Reads from the handle to construct and return the next alignment. This returns the pairwise alignment of query and match/library sequences as an MultipleSeqAlignment object containing two rows. """ handle = self.handle try: #Header we saved from when we were parsing #the previous alignment. line = self._header del self._header except AttributeError: line = handle.readline() if not line: return None if line.startswith("#"): #Skip the file header before the alignments. e.g. line = self._skip_file_header(line) while ">>>" in line and not line.startswith(">>>"): #Moved onto the next query sequence! self._query_descr = "" self._query_header_annotation = {} #Read in the query header line = self._parse_query_header(line) #Now should be some alignments, but if not we move onto the next query if not line: #End of file return None if ">>><<<" in line: #Reached the end of the alignments, no need to read the footer... return None #Should start >>... and not >>>... assert line[0:2] == ">>" and not line[2] == ">", line query_seq_parts, match_seq_parts = [], [] query_annotation, match_annotation = {}, {} match_descr = "" alignment_annotation = {} #This should be followed by the target match ID line, then more tags. #e.g. """ >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] ; fa_frame: f ; fa_initn: 52 ; fa_init1: 52 ; fa_opt: 70 ; fa_z-score: 105.5 ; fa_bits: 27.5 ; fa_expect: 0.082 ; sw_score: 70 ; sw_ident: 0.279 ; sw_sim: 0.651 ; sw_overlap: 43 """ if (not line[0:2] == ">>") or line[0:3] == ">>>": raise ValueError("Expected target line starting '>>'") match_descr = line[2:].strip() #Handle the following "alignment hit" tagged data, e.g. line = handle.readline() line = self._parse_tag_section(line, alignment_annotation) assert not line[0:2] == "; " #Then we have the alignment numbers and sequence for the query """ >gi|10955265| .. ; sq_len: 346 ; sq_offset: 1 ; sq_type: p ; al_start: 197 ; al_stop: 238 ; al_display_start: 167 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL GEYFTENKPKYIIREIHQET """ if not (line[0] == ">" and line.strip().endswith("..")): raise ValueError("Expected line starting '>' and ending '..'") assert self._query_descr.startswith(line[1:].split(None,1)[0]) #Handle the following "query alignment" tagged data line = handle.readline() line = self._parse_tag_section(line, query_annotation) assert not line[0:2] == "; " #Now should have the aligned query sequence (with leading flanking region) while not line[0] == ">": query_seq_parts.append(line.strip()) line = handle.readline() #Handle the following "match alignment" data """ >gi|152973545|ref|YP_001338596.1| .. ; sq_len: 242 ; sq_type: p ; al_start: 52 ; al_stop: 94 ; al_display_start: 22 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR QDFAFTRKMRREARQVEQSW """ #Match identifier if not (line[0] == ">" and line.strip().endswith("..")): raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line)) assert match_descr.startswith(line[1:].split(None,1)[0]) #Tagged data, line = handle.readline() line = self._parse_tag_section(line, match_annotation) assert not line[0:2] == "; " #Now should have the aligned query sequence with flanking region... #but before that, since FASTA 35.4.1 there can be an consensus here, """ ; al_cons: .::. : :. ---. :: :. . : ..-:::-: :.: ..:...: etc """ while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): match_seq_parts.append(line.strip()) line = handle.readline() if line[0:2] == "; ": assert line.strip() == "; al_cons:" align_consensus_parts = [] line = handle.readline() while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line): align_consensus_parts.append(line.strip()) line = handle.readline() #If we do anything with this in future, must remove any flanking region. align_consensus = "".join(align_consensus_parts) del align_consensus_parts assert not line[0:2] == "; " else: align_consensus = None assert (line[0] == ">" or ">>>" in line) self._header = line #We built a list of strings and then joined them because #its faster than appending to a string. query_seq = "".join(query_seq_parts) match_seq = "".join(match_seq_parts) del query_seq_parts, match_seq_parts #Note, query_seq and match_seq will usually be of different lengths, apparently #because in the m10 format leading gaps are added but not trailing gaps! #Remove the flanking regions, query_align_seq = self._extract_alignment_region(query_seq, query_annotation) match_align_seq = self._extract_alignment_region(match_seq, match_annotation) #How can we do this for the (optional) consensus? #The "sq_offset" values can be specified with the -X command line option. #They appear to just shift the origin used in the calculation of the coordinates. if len(query_align_seq) != len(match_align_seq): raise ValueError("Problem parsing the alignment sequence coordinates, " "following should be the same length but are not:\n" "%s - len %i\n%s - len %i" % (query_align_seq, len(query_align_seq), match_align_seq, len(match_align_seq))) if "sw_overlap" in alignment_annotation: if int(alignment_annotation["sw_overlap"]) != len(query_align_seq): raise ValueError("Specified sw_overlap = %s does not match expected value %i" \ % (alignment_annotation["sw_overlap"], len(query_align_seq))) #TODO - Look at the "sq_type" to assign a sensible alphabet? alphabet = self.alphabet alignment = MultipleSeqAlignment([], alphabet) #TODO - Introduce an annotated alignment class? #For now, store the annotation a new private property: alignment._annotations = {} #Want to record both the query header tags, and the alignment tags. for key, value in self._query_header_annotation.iteritems(): alignment._annotations[key] = value for key, value in alignment_annotation.iteritems(): alignment._annotations[key] = value #Query #===== record = SeqRecord(Seq(query_align_seq, alphabet), id = self._query_descr.split(None,1)[0].strip(","), name = "query", description = self._query_descr, annotations = {"original_length" : int(query_annotation["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_annotation["al_start"]) record._al_stop = int(query_annotation["al_stop"]) alignment.append(record) #TODO - What if a specific alphabet has been requested? #TODO - Use an IUPAC alphabet? #TODO - Can FASTA output RNA? if alphabet == single_letter_alphabet and "sq_type" in query_annotation: if query_annotation["sq_type"] == "D": record.seq.alphabet = generic_dna elif query_annotation["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in query_align_seq: if not hasattr(record.seq.alphabet,"gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") #Match #===== record = SeqRecord(Seq(match_align_seq, alphabet), id = match_descr.split(None,1)[0].strip(","), name = "match", description = match_descr, annotations = {"original_length" : int(match_annotation["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_annotation["al_start"]) record._al_stop = int(match_annotation["al_stop"]) alignment.append(record) #This is still a very crude way of dealing with the alphabet: if alphabet == single_letter_alphabet and "sq_type" in match_annotation: if match_annotation["sq_type"] == "D": record.seq.alphabet = generic_dna elif match_annotation["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in match_align_seq: if not hasattr(record.seq.alphabet,"gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") return alignment
def build_hsp(): if not query_tags and not match_tags: raise ValueError("No data for query %r, match %r" % (query_id, match_id)) assert query_tags, query_tags assert match_tags, match_tags evalue = align_tags.get("fa_expect") tool = global_tags.get("tool", "").upper() q = _extract_alignment_region(query_seq, query_tags) if tool in ["TFASTX"] and len(match_seq) == len(q): m = match_seq # Quick hack until I can work out how -, * and / characters # and the apparent mix of aa and bp coordinates works. else: m = _extract_alignment_region(match_seq, match_tags) if len(q) != len(m): raise ValueError(f"""\ Darn... amino acids vs nucleotide coordinates? tool: {tool} query_seq: {query_seq} query_tags: {query_tags} {q} length: {len(q)} match_seq: {match_seq} match_tags: {match_tags} {m} length: {len(m)} handle.name: {handle.name} """) annotations = {} records = [] # Want to record both the query header tags, and the alignment tags. annotations.update(header_tags) annotations.update(align_tags) # Query # ===== record = SeqRecord( Seq(q), id=query_id, name="query", description=query_descr, annotations={"original_length": int(query_tags["sq_len"])}, ) # TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_tags["al_start"]) record._al_stop = int(query_tags["al_stop"]) # TODO - Can FASTA output RNA? if "sq_type" in query_tags: if query_tags["sq_type"] == "D": record.annotations["molecule_type"] = "DNA" elif query_tags["sq_type"] == "p": record.annotations["molecule_type"] = "protein" records.append(record) # Match # ===== record = SeqRecord( Seq(m), id=match_id, name="match", description=match_descr, annotations={"original_length": int(match_tags["sq_len"])}, ) # TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_tags["al_start"]) record._al_stop = int(match_tags["al_stop"]) if "sq_type" in match_tags: if match_tags["sq_type"] == "D": record.annotations["molecule_type"] = "DNA" elif match_tags["sq_type"] == "p": record.annotations["molecule_type"] = "protein" records.append(record) return MultipleSeqAlignment(records, annotations=annotations)