def __next__(self): try: line = self._header del self._header except AttributeError: line = self.handle.readline() if not line: # Empty file - just give up. raise StopIteration if not line.strip() == "# STOCKHOLM 1.0": raise ValueError("Did not find STOCKHOLM header") # Note: If this file follows the PFAM conventions, there should be # a line containing the number of sequences, e.g. "#=GF SQ 67" # We do not check for this - perhaps we should, and verify that # if present it agrees with our parsing. seqs = {} ids = [] gs = {} gr = {} gf = {} passed_end_alignment = False while True: line = self.handle.readline() if not line: break # end of file line = line.strip() # remove trailing \n if line == "# STOCKHOLM 1.0": self._header = line break elif line == "//": # The "//" line indicates the end of the alignment. # There may still be more meta-data passed_end_alignment = True elif line == "": # blank line, ignore pass elif line[0] != "#": # Sequence # Format: "<seqname> <sequence>" assert not passed_end_alignment parts = [x.strip() for x in line.split(" ", 1)] if len(parts) != 2: # This might be someone attempting to store a zero length sequence? raise ValueError("Could not split line into identifier " + "and sequence:\n" + line) id, seq = parts if id not in ids: ids.append(id) seqs.setdefault(id, "") seqs[id] += seq.replace(".", "-") elif len(line) >= 5: # Comment line or meta-data if line[:5] == "#=GF ": # Generic per-File annotation, free text # Format: #=GF <feature> <free text> feature, text = line[5:].strip().split(None, 1) # Each feature key could be used more than once, # so store the entries as a list of strings. if feature not in gf: gf[feature] = [text] else: gf[feature].append(text) elif line[:5] == "#=GC ": # Generic per-Column annotation, exactly 1 char per column # Format: "#=GC <feature> <exactly 1 char per column>" pass elif line[:5] == "#=GS ": # Generic per-Sequence annotation, free text # Format: "#=GS <seqname> <feature> <free text>" id, feature, text = line[5:].strip().split(None, 2) # if id not in ids: # ids.append(id) if id not in gs: gs[id] = {} if feature not in gs[id]: gs[id][feature] = [text] else: gs[id][feature].append(text) elif line[:5] == "#=GR ": # Generic per-Sequence AND per-Column markup # Format: "#=GR <seqname> <feature> <exactly 1 char per column>" id, feature, text = line[5:].strip().split(None, 2) # if id not in ids: # ids.append(id) if id not in gr: gr[id] = {} if feature not in gr[id]: gr[id][feature] = "" gr[id][feature] += text.strip() # append to any previous entry # TODO - Should we check the length matches the alignment length? # For iterlaced sequences the GR data can be split over # multiple lines # Next line... assert len(seqs) <= len(ids) # assert len(gs) <= len(ids) # assert len(gr) <= len(ids) self.ids = ids self.sequences = seqs self.seq_annotation = gs self.seq_col_annotation = gr if ids and seqs: if self.records_per_alignment is not None and self.records_per_alignment != len(ids): raise ValueError( "Found %i records in this alignment, told to expect %i" % (len(ids), self.records_per_alignment) ) alignment_length = len(list(seqs.values())[0]) records = [] # Alignment obj will put them all in a list anyway for id in ids: seq = seqs[id] if alignment_length != len(seq): raise ValueError("Sequences have different lengths, or repeated identifier") name, start, end = self._identifier_split(id) record = SeqRecord( Seq(seq, self.alphabet), id=id, name=name, description=id, annotations={"accession": name} ) # Accession will be overridden by _populate_meta_data if an explicit # accession is provided: record.annotations["accession"] = name if start is not None: record.annotations["start"] = start if end is not None: record.annotations["end"] = end self._populate_meta_data(id, record) records.append(record) alignment = MultipleSeqAlignment(records, self.alphabet) # TODO - Introduce an annotated alignment class? # For now, store the annotation a new private property: alignment._annotations = gr return alignment else: raise StopIteration
def __next__(self): try: line = self._header del self._header except AttributeError: line = self.handle.readline() if not line: #Empty file - just give up. raise StopIteration if not line.strip() == '# STOCKHOLM 1.0': raise ValueError("Did not find STOCKHOLM header") # Note: If this file follows the PFAM conventions, there should be # a line containing the number of sequences, e.g. "#=GF SQ 67" # We do not check for this - perhaps we should, and verify that # if present it agrees with our parsing. seqs = {} ids = [] gs = {} gr = {} gf = {} passed_end_alignment = False while True: line = self.handle.readline() if not line: break # end of file line = line.strip() # remove trailing \n if line == '# STOCKHOLM 1.0': self._header = line break elif line == "//": #The "//" line indicates the end of the alignment. #There may still be more meta-data passed_end_alignment = True elif line == "": #blank line, ignore pass elif line[0] != "#": #Sequence #Format: "<seqname> <sequence>" assert not passed_end_alignment parts = [x.strip() for x in line.split(" ", 1)] if len(parts) != 2: #This might be someone attempting to store a zero length sequence? raise ValueError("Could not split line into identifier " + "and sequence:\n" + line) id, seq = parts if id not in ids: ids.append(id) seqs.setdefault(id, '') seqs[id] += seq.replace(".", "-") elif len(line) >= 5: #Comment line or meta-data if line[:5] == "#=GF ": #Generic per-File annotation, free text #Format: #=GF <feature> <free text> feature, text = line[5:].strip().split(None, 1) #Each feature key could be used more than once, #so store the entries as a list of strings. if feature not in gf: gf[feature] = [text] else: gf[feature].append(text) elif line[:5] == '#=GC ': #Generic per-Column annotation, exactly 1 char per column #Format: "#=GC <feature> <exactly 1 char per column>" pass elif line[:5] == '#=GS ': #Generic per-Sequence annotation, free text #Format: "#=GS <seqname> <feature> <free text>" id, feature, text = line[5:].strip().split(None, 2) #if id not in ids: # ids.append(id) if id not in gs: gs[id] = {} if feature not in gs[id]: gs[id][feature] = [text] else: gs[id][feature].append(text) elif line[:5] == "#=GR ": #Generic per-Sequence AND per-Column markup #Format: "#=GR <seqname> <feature> <exactly 1 char per column>" id, feature, text = line[5:].strip().split(None, 2) #if id not in ids: # ids.append(id) if id not in gr: gr[id] = {} if feature not in gr[id]: gr[id][feature] = "" gr[id][feature] += text.strip() # append to any previous entry #TODO - Should we check the length matches the alignment length? # For iterlaced sequences the GR data can be split over # multiple lines #Next line... assert len(seqs) <= len(ids) #assert len(gs) <= len(ids) #assert len(gr) <= len(ids) self.ids = ids self.sequences = seqs self.seq_annotation = gs self.seq_col_annotation = gr if ids and seqs: if self.records_per_alignment is not None \ and self.records_per_alignment != len(ids): raise ValueError("Found %i records in this alignment, told to expect %i" % (len(ids), self.records_per_alignment)) alignment_length = len(list(seqs.values())[0]) records = [] # Alignment obj will put them all in a list anyway for id in ids: seq = seqs[id] if alignment_length != len(seq): raise ValueError("Sequences have different lengths, or repeated identifier") name, start, end = self._identifier_split(id) record = SeqRecord(Seq(seq, self.alphabet), id=id, name=name, description=id, annotations={"accession": name}) #Accession will be overridden by _populate_meta_data if an explicit #accession is provided: record.annotations["accession"] = name if start is not None: record.annotations["start"] = start if end is not None: record.annotations["end"] = end self._populate_meta_data(id, record) records.append(record) alignment = MultipleSeqAlignment(records, self.alphabet) #TODO - Introduce an annotated alignment class? #For now, store the annotation a new private property: alignment._annotations = gr return alignment else: raise StopIteration
def build_hsp(): if not query_tags and not match_tags: raise ValueError("No data for query %r, match %r" % (query_id, match_id)) assert query_tags, query_tags assert match_tags, match_tags evalue = align_tags.get("fa_expect", None) q = "?" # Just for printing len(q) in debug below m = "?" # Just for printing len(m) in debug below tool = global_tags.get("tool", "").upper() try: q = _extract_alignment_region(query_seq, query_tags) if tool in ["TFASTX"] and len(match_seq) == len(q): m = match_seq #Quick hack until I can work out how -, * and / characters #and the apparent mix of aa and bp coordinates works. else: m = _extract_alignment_region(match_seq, match_tags) assert len(q) == len(m) except AssertionError as err: print("Darn... amino acids vs nucleotide coordinates?") print(tool) print(query_seq) print(query_tags) print("%s %i" % (q, len(q))) print(match_seq) print(match_tags) print("%s %i" % (m, len(m))) print(handle.name) raise err assert alphabet is not None alignment = MultipleSeqAlignment([], alphabet) #TODO - Introduce an annotated alignment class? #For now, store the annotation a new private property: alignment._annotations = {} #Want to record both the query header tags, and the alignment tags. for key, value in header_tags.items(): alignment._annotations[key] = value for key, value in align_tags.items(): alignment._annotations[key] = value #Query #===== record = SeqRecord(Seq(q, alphabet), id=query_id, name="query", description=query_descr, annotations={"original_length": int(query_tags["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_tags["al_start"]) record._al_stop = int(query_tags["al_stop"]) alignment.append(record) #TODO - What if a specific alphabet has been requested? #TODO - Use an IUPAC alphabet? #TODO - Can FASTA output RNA? if alphabet == single_letter_alphabet and "sq_type" in query_tags: if query_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif query_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in q: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") #Match #===== record = SeqRecord(Seq(m, alphabet), id=match_id, name="match", description=match_descr, annotations={"original_length": int(match_tags["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_tags["al_start"]) record._al_stop = int(match_tags["al_stop"]) alignment.append(record) #This is still a very crude way of dealing with the alphabet: if alphabet == single_letter_alphabet and "sq_type" in match_tags: if match_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif match_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in m: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") return alignment
def build_hsp(): if not query_tags and not match_tags: raise ValueError("No data for query %r, match %r" % (query_id, match_id)) assert query_tags, query_tags assert match_tags, match_tags evalue = align_tags.get("fa_expect", None) q = "?" # Just for printing len(q) in debug below m = "?" # Just for printing len(m) in debug below tool = global_tags.get("tool", "").upper() try: q = _extract_alignment_region(query_seq, query_tags) if tool in ["TFASTX"] and len(match_seq) == len(q): m = match_seq #Quick hack until I can work out how -, * and / characters #and the apparent mix of aa and bp coordinates works. else: m = _extract_alignment_region(match_seq, match_tags) assert len(q) == len(m) except AssertionError as err: print("Darn... amino acids vs nucleotide coordinates?") print(tool) print(query_seq) print(query_tags) print("%s %i" % (q, len(q))) print(match_seq) print(match_tags) print("%s %i" % (m, len(m))) print(handle.name) raise err assert alphabet is not None alignment = MultipleSeqAlignment([], alphabet) #TODO - Introduce an annotated alignment class? #For now, store the annotation a new private property: alignment._annotations = {} #Want to record both the query header tags, and the alignment tags. for key, value in header_tags.items(): alignment._annotations[key] = value for key, value in align_tags.items(): alignment._annotations[key] = value #Query #===== record = SeqRecord( Seq(q, alphabet), id=query_id, name="query", description=query_descr, annotations={"original_length": int(query_tags["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(query_tags["al_start"]) record._al_stop = int(query_tags["al_stop"]) alignment.append(record) #TODO - What if a specific alphabet has been requested? #TODO - Use an IUPAC alphabet? #TODO - Can FASTA output RNA? if alphabet == single_letter_alphabet and "sq_type" in query_tags: if query_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif query_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in q: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") #Match #===== record = SeqRecord( Seq(m, alphabet), id=match_id, name="match", description=match_descr, annotations={"original_length": int(match_tags["sq_len"])}) #TODO - handle start/end coordinates properly. Short term hack for now: record._al_start = int(match_tags["al_start"]) record._al_stop = int(match_tags["al_stop"]) alignment.append(record) #This is still a very crude way of dealing with the alphabet: if alphabet == single_letter_alphabet and "sq_type" in match_tags: if match_tags["sq_type"] == "D": record.seq.alphabet = generic_dna elif match_tags["sq_type"] == "p": record.seq.alphabet = generic_protein if "-" in m: if not hasattr(record.seq.alphabet, "gap_char"): record.seq.alphabet = Gapped(record.seq.alphabet, "-") return alignment