def to_generic(self, alphabet):
    """Retrieve generic alignment object for the given alignment.

    Rather than the raw tuples held in ``self.alignment``, this hands back
    a MultipleSeqAlignment object (from SAP.Bio.Align) through which the
    alignment can be manipulated and queried.

    alphabet is the alphabet to attach to the alignment and to every
    sequence in it (for example IUPAC.IUPACProtein).

    Thanks to James Casbon for the code.
    """
    # TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
    row_names = []
    row_text = []
    blocks_seen = 0
    row = 0

    for label, _start, chunk, _end in self.alignment:
        if label == 'QUERY':
            # QUERY is the first entry of each alignment block, so it
            # marks a new block and restarts the row counter.
            blocks_seen += 1
            row = 0

        if blocks_seen == 1:
            # First block: create the rows.
            row_text.append(chunk)
            row_names.append(label)
            continue

        # Every other block: extend the rows created by the first block.
        row_text[row] += chunk
        row += 1

    generic = MultipleSeqAlignment([], alphabet)
    for label, text in zip(row_names, row_text):
        generic.append(SeqRecord(Seq(text, alphabet), label))
    return generic
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment).

    One row is created per key found in the position-1 entry of
    ``align_dict``.  Each row's sequence is the concatenation of the ``aa``
    attribute of that key's entry at every position from 1 to
    ``len(align_dict)``.  Rows are appended in sorted key order and are
    identified by ``pdb2 + chain2`` of the matching ``sum_dict`` entry.
    """
    # Seed every row with an empty string, using the keys of position 1.
    rows = dict.fromkeys(align_dict.abs(1).pos_align_dict, '')

    for position in range(1, len(align_dict) + 1):  # loop on positions
        column = align_dict.abs(position).pos_align_dict
        for key in column:  # loop within a position
            rows[key] += column[key].aa

    gapped_protein = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    alignment = MultipleSeqAlignment([], alphabet=gapped_protein)
    for key in sorted(rows):
        sequence = Seq(rows[key], gapped_protein)
        label = sum_dict[key].pdb2 + sum_dict[key].chain2
        alignment.append(SeqRecord(sequence, label))
    return alignment
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment).

    The keys of the position-1 entry of ``align_dict`` define the rows.
    For each position 1..len(align_dict) the ``aa`` attribute of every
    entry at that position is added to the end of its row.  The finished
    rows are wrapped in SeqRecord objects (id = ``pdb2 + chain2`` taken
    from ``sum_dict``) and appended in sorted key order.
    """
    residues_by_key = {}
    for key in align_dict.abs(1).pos_align_dict:
        residues_by_key[key] = ''

    last_position = len(align_dict)
    for position in range(1, last_position + 1):  # loop on positions
        for key in align_dict.abs(position).pos_align_dict:
            # loop within a position
            entry = align_dict.abs(position).pos_align_dict[key]
            residues_by_key[key] = residues_by_key[key] + entry.aa

    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    result = MultipleSeqAlignment([], alphabet=alpha)
    for key in sorted(residues_by_key):
        row_seq = Seq(residues_by_key[key], alpha)
        row_id = sum_dict[key].pdb2 + sum_dict[key].chain2
        result.append(SeqRecord(row_seq, row_id))
    return result
def build_hsp():
    """Build the two-row alignment (query, then match) for the current HSP.

    Takes no arguments: every input (``query_tags``, ``match_tags``,
    ``query_seq``, ``match_seq``, ``header_tags``, ``align_tags``,
    ``global_tags``, ``alphabet``, the ids/descriptions and ``handle``)
    is read from the enclosing scope.

    Returns:
        A MultipleSeqAlignment containing the query record followed by the
        match record.  The header tags and the alignment tags are merged
        into the private ``_annotations`` dict on the alignment, and each
        record carries private ``_al_start`` / ``_al_stop`` integers.

    Raises:
        ValueError: if both ``query_tags`` and ``match_tags`` are empty.
        AssertionError: if only one of the tag sets is empty, if the
            extracted query and match regions differ in length (debug
            details are printed before re-raising), or if ``alphabet``
            is None.
    """
    if not query_tags and not match_tags:
        raise ValueError("No data for query %r, match %r"
                         % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags

    q = "?"  # Just for printing len(q) in debug below
    m = "?"  # Just for printing len(m) in debug below
    tool = global_tags.get("tool", "").upper()
    try:
        q = _extract_alignment_region(query_seq, query_tags)
        if tool == "TFASTX" and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        assert len(q) == len(m)
    except AssertionError:
        print("Darn... amino acids vs nucleotide coordinates?")
        print(tool)
        print(query_seq)
        print(query_tags)
        print("%s %i" % (q, len(q)))
        print(match_seq)
        print(match_tags)
        print("%s %i" % (m, len(m)))
        print(handle.name)
        # Bare raise re-raises the active exception unchanged.
        raise

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}
    # Want to record both the query header tags, and the alignment tags.
    # Alignment tags are applied second, so they win on a key clash.
    for key, value in header_tags.items():
        alignment._annotations[key] = value
    for key, value in align_tags.items():
        alignment._annotations[key] = value

    def _add_record(seq_text, tags, rec_id, rec_name, rec_descr):
        """Append one row to ``alignment`` and then refine its alphabet."""
        record = SeqRecord(
            Seq(seq_text, alphabet),
            id=rec_id,
            name=rec_name,
            description=rec_descr,
            annotations={"original_length": int(tags["sq_len"])})
        # TODO - handle start/end coordinates properly.
        # Short term hack for now:
        record._al_start = int(tags["al_start"])
        record._al_stop = int(tags["al_stop"])
        alignment.append(record)

        # The alphabet is deliberately adjusted only AFTER append(), as in
        # the original statement order.
        # NOTE(review): presumably append() checks the record's alphabet
        # against the alignment's - confirm before reordering.
        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            if tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in seq_text:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    # Query
    # =====
    _add_record(q, query_tags, query_id, "query", query_descr)

    # Match
    # =====
    _add_record(m, match_tags, match_id, "match", match_descr)

    return alignment
def build_hsp():
    """Build the two-row alignment (query, then match) for the current HSP.

    Takes no arguments: every input (``query_tags``, ``match_tags``,
    ``query_seq``, ``match_seq``, ``header_tags``, ``align_tags``,
    ``global_tags``, ``alphabet``, the ids/descriptions and ``handle``)
    is read from the enclosing scope.

    Returns:
        A MultipleSeqAlignment containing the query record followed by the
        match record.  The header tags and the alignment tags are merged
        into the private ``_annotations`` dict on the alignment, and each
        record carries private ``_al_start`` / ``_al_stop`` integers.

    Raises:
        ValueError: if both ``query_tags`` and ``match_tags`` are empty.
        AssertionError: if only one of the tag sets is empty, if the
            extracted query and match regions differ in length (debug
            details are printed before re-raising), or if ``alphabet``
            is None.
    """
    if not query_tags and not match_tags:
        raise ValueError("No data for query %r, match %r"
                         % (query_id, match_id))
    assert query_tags, query_tags
    assert match_tags, match_tags

    q = "?"  # Just for printing len(q) in debug below
    m = "?"  # Just for printing len(m) in debug below
    tool = global_tags.get("tool", "").upper()
    try:
        q = _extract_alignment_region(query_seq, query_tags)
        if tool == "TFASTX" and len(match_seq) == len(q):
            m = match_seq
            # Quick hack until I can work out how -, * and / characters
            # and the apparent mix of aa and bp coordinates works.
        else:
            m = _extract_alignment_region(match_seq, match_tags)
        assert len(q) == len(m)
    except AssertionError:
        print("Darn... amino acids vs nucleotide coordinates?")
        print(tool)
        print(query_seq)
        print(query_tags)
        print("%s %i" % (q, len(q)))
        print(match_seq)
        print(match_tags)
        print("%s %i" % (m, len(m)))
        print(handle.name)
        # Bare raise re-raises the active exception unchanged.
        raise

    assert alphabet is not None
    alignment = MultipleSeqAlignment([], alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}
    # Want to record both the query header tags, and the alignment tags.
    # Alignment tags are applied second, so they win on a key clash.
    for key, value in header_tags.items():
        alignment._annotations[key] = value
    for key, value in align_tags.items():
        alignment._annotations[key] = value

    def _append_row(seq_text, tags, rec_id, rec_name, rec_descr):
        """Append one row to ``alignment`` and then refine its alphabet."""
        record = SeqRecord(
            Seq(seq_text, alphabet),
            id=rec_id,
            name=rec_name,
            description=rec_descr,
            annotations={"original_length": int(tags["sq_len"])})
        # TODO - handle start/end coordinates properly.
        # Short term hack for now:
        record._al_start = int(tags["al_start"])
        record._al_stop = int(tags["al_stop"])
        alignment.append(record)

        # The alphabet is deliberately adjusted only AFTER append(), as in
        # the original statement order.
        # NOTE(review): presumably append() checks the record's alphabet
        # against the alignment's - confirm before reordering.
        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        # This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            if tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in seq_text:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    # Query
    # =====
    _append_row(q, query_tags, query_id, "query", query_descr)

    # Match
    # =====
    _append_row(m, match_tags, match_id, "match", match_descr)

    return alignment