def next(self):
    """Parse the next Stockholm alignment from ``self.handle``.

    Reads from the "# STOCKHOLM 1.0" header line up to the next header (or
    the end of the file), collecting the sequence lines and the #=GF, #=GS
    and #=GR annotation lines.  The parsed identifiers, sequences, #=GS and
    #=GR data are also stored on ``self.ids``, ``self.sequences``,
    ``self.seq_annotation`` and ``self.seq_col_annotation``.

    Returns an Alignment with one record per sequence identifier, or None
    if the handle is exhausted or the block contained no sequences.

    Raises ValueError if the header is missing, a sequence line cannot be
    split into identifier and sequence, an annotation line has too few
    fields, the sequences differ in length, or the number of records does
    not match ``self.records_per_alignment``.
    """
    try:
        # Header line saved while parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = self.handle.readline()
    if not line:
        # Empty file - just give up.
        return None
    if not line.strip() == "# STOCKHOLM 1.0":
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}
    passed_end_alignment = False
    while True:
        line = self.handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == "# STOCKHOLM 1.0":
            # Start of the next alignment; keep it for the next call.
            self._header = line
            break
        elif line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            # blank line, ignore
            pass
        elif line[0] != "#":
            # Sequence
            # Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length
                # sequence?
                raise ValueError("Could not split line into identifier "
                                 + "and sequence:\n" + line)
            seq_id, seq = parts
            # ids and seqs are kept in step, so the dict lookup replaces
            # a linear scan of the ids list.
            if seq_id not in seqs:
                ids.append(seq_id)
                seqs[seq_id] = ""
            seqs[seq_id] += seq.replace(".", "-")
        elif len(line) >= 5:
            # Comment line or meta-data
            prefix = line[:5]
            if prefix == "#=GF ":
                # Generic per-File annotation, free text
                # Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                # Each feature key could be used more than once,
                # so store the entries as a list of strings.
                gf.setdefault(feature, []).append(text)
            elif prefix == "#=GC ":
                # Generic per-Column annotation, exactly 1 char per column
                # Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif prefix == "#=GS ":
                # Generic per-Sequence annotation, free text
                # Format: "#=GS <seqname> <feature> <free text>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
            elif prefix == "#=GR ":
                # Generic per-Sequence AND per-Column markup
                # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                markup = gr.setdefault(seq_id, {})
                # For interlaced sequences the GR data can be split over
                # multiple lines, so append to any previous entry.
                # TODO - Should we check the length matches the alignment
                # length?
                markup[feature] = markup.get(feature, "") + text.strip()
        # Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        return None

    if (self.records_per_alignment is not None
            and self.records_per_alignment != len(ids)):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment)
        )

    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = gr

    # Use the first identifier's sequence as the reference length; indexing
    # dict.values() only works on Python 2 and picks an arbitrary entry.
    alignment_length = len(seqs[ids[0]])
    for seq_id in ids:
        seq = seqs[seq_id]
        if alignment_length != len(seq):
            raise ValueError("Sequences have different lengths, or repeated identifier")
        name, start, end = self._identifier_split(seq_id)
        alignment.add_sequence(seq_id, seq, start=start, end=end)

        record = alignment.get_all_seqs()[-1]

        assert record.id == seq_id or record.description == seq_id

        record.id = seq_id
        record.name = name
        record.description = seq_id

        # will be overridden by _populate_meta_data if an explicit
        # accession is provided:
        record.annotations["accession"] = name

        self._populate_meta_data(seq_id, record)
    return alignment
def next(self):
    """Read from the handle and return the next pairwise alignment.

    Returns the alignment of the query and match/library sequences as an
    Alignment object containing two rows (named "query" and "match"), or
    None once the end of the file or the "#-" footer line is reached.

    Raises ValueError when an expected '>>' or '>' line is missing, when
    the file ends inside the query sequence, or when the "sw_overlap" tag
    disagrees with the parsed query alignment length.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None
    if line.startswith("#-"):
        # Reached the end of the alignments, no need to read the footer...
        return None
    if line.startswith("##"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    # NOTE(review): a line restored from self._header starts with '>' and
    # would fail this assertion - confirm that path is never taken for the
    # files this parser targets.
    assert line.startswith("#"), line
    while not line.startswith("#="):
        line = handle.readline()
        if not line:
            # End of file before a query header; stop instead of spinning
            # forever on empty reads.
            return None
    # Moved onto the next query sequence!
    self._query_descr = ""
    self._query_header_annotation = {}
    # Read in the query header
    line = self._parse_query_header(line)
    if not line:
        # End of file
        return None

    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match numbering line, then
    # more tags, e.g.
    #
    #   >>#2
    #   ; sw_score: 41.0
    #   ; sw_ident: 0.846
    #   ; sw_overlap: 13
    #
    # Must start with '>>' but not '>>>' (this still guards the input when
    # assertions are disabled).
    if not line.startswith(">>") or line.startswith(">>>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()

    # Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split()[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned query sequence (with leading flanking
    # region)
    while not line.startswith(">"):
        if not line:
            # An empty read means end of file; without this check the loop
            # would never terminate.
            raise ValueError("Premature end of file in query sequence")
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #
    #   >gi|152973545|ref|YP_001338596.1| ..
    #   ; sq_len: 242
    #   ; sq_type: p
    #   ; al_start: 52
    #   ; al_stop: 94
    #   ; al_display_start: 22
    #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #   RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #   QDFAFTRKMRREARQVEQSW
    #
    # Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    # The '>>' line only carried a hit number here, so prefix it with the
    # identifier from this line.
    match_descr = line[1:].split()[0] + match_descr

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned match sequence with flanking region...
    while not (line.startswith(">") or ">>>" in line) and not line.startswith("#"):
        match_seq_parts.append(line.strip())
        line = handle.readline()
        if not line:
            # End of file
            return None
    if line.startswith(">") or ">>>" in line:
        # Start of the next hit/query; keep it for the next call.
        self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

    # The "sq_offset" values can be specified with the -X command line
    # option.  They appear to just shift the origin used in the calculation
    # of the coordinates.
    # TODO - Work out how exactly the use of -X offsets changes things.

    # The query/match length comparison is deliberately skipped: it is not
    # useful when using stretcher.

    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError(
                "Specified sw_overlap = %s does not match expected value %i"
                % (alignment_annotation["sw_overlap"], len(query_align_seq))
            )

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    for key, value in self._query_header_annotation.items():
        alignment._annotations[key] = value
    for key, value in alignment_annotation.items():
        alignment._annotations[key] = value

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr or record.description == self._query_descr
    assert record.seq.tostring() == query_align_seq
    record.id = self._query_descr.split()[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    # Keep every parsed query tag on the record as well.
    for key, value in query_annotation.items():
        record.annotations[key] = value

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    assert record.seq.tostring() == match_align_seq
    record.id = match_descr.split()[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    # Copy the match's own tags; iterating the query's keys here raised
    # KeyError for tags (such as sq_offset) present only on the query.
    for key, value in match_annotation.items():
        record.annotations[key] = value

    return alignment
def next(self):
    """Read from the handle and return the next pairwise alignment.

    Returns the alignment of the query and match/library sequences as an
    Alignment object containing two rows (named "query" and "match"), or
    None once the end of the file or the "#-" footer line is reached.

    Raises ValueError when an expected '>>' or '>' line is missing, when
    the file ends inside the query sequence, or when the "sw_overlap" tag
    disagrees with the parsed query alignment length.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None
    if line.startswith("#-"):
        # Reached the end of the alignments, no need to read the footer...
        return None
    if line.startswith("##"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    # NOTE(review): a line restored from self._header starts with '>' and
    # would fail this assertion - confirm that path is never taken for the
    # files this parser targets.
    assert line.startswith("#"), line
    while not line.startswith("#="):
        line = handle.readline()
        if not line:
            # End of file before a query header; stop instead of spinning
            # forever on empty reads.
            return None
    # Moved onto the next query sequence!
    self._query_descr = ""
    self._query_header_annotation = {}
    # Read in the query header
    line = self._parse_query_header(line)
    if not line:
        # End of file
        return None

    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match numbering line, then
    # more tags, e.g.
    #
    #   >>#2
    #   ; sw_score: 41.0
    #   ; sw_ident: 0.846
    #   ; sw_overlap: 13
    #
    # Must start with '>>' but not '>>>' (this still guards the input when
    # assertions are disabled).
    if not line.startswith(">>") or line.startswith(">>>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()

    # Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split()[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned query sequence (with leading flanking
    # region)
    while not line.startswith(">"):
        if not line:
            # An empty read means end of file; without this check the loop
            # would never terminate.
            raise ValueError("Premature end of file in query sequence")
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #
    #   >gi|152973545|ref|YP_001338596.1| ..
    #   ; sq_len: 242
    #   ; sq_type: p
    #   ; al_start: 52
    #   ; al_stop: 94
    #   ; al_display_start: 22
    #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #   RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #   QDFAFTRKMRREARQVEQSW
    #
    # Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    # The '>>' line only carried a hit number here, so prefix it with the
    # identifier from this line.
    match_descr = line[1:].split()[0] + match_descr

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned match sequence with flanking region...
    while not (line.startswith(">") or ">>>" in line) and not line.startswith("#"):
        match_seq_parts.append(line.strip())
        line = handle.readline()
        if not line:
            # End of file
            return None
    if line.startswith(">") or ">>>" in line:
        # Start of the next hit/query; keep it for the next call.
        self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

    # The "sq_offset" values can be specified with the -X command line
    # option.  They appear to just shift the origin used in the calculation
    # of the coordinates.
    # TODO - Work out how exactly the use of -X offsets changes things.

    # The query/match length comparison is deliberately skipped: it is not
    # useful when using stretcher.

    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError(
                "Specified sw_overlap = %s does not match expected value %i"
                % (alignment_annotation["sw_overlap"], len(query_align_seq))
            )

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    for key, value in self._query_header_annotation.items():
        alignment._annotations[key] = value
    for key, value in alignment_annotation.items():
        alignment._annotations[key] = value

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr or record.description == self._query_descr
    assert record.seq.tostring() == query_align_seq
    record.id = self._query_descr.split()[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    # Keep every parsed query tag on the record as well.
    for key, value in query_annotation.items():
        record.annotations[key] = value

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    assert record.seq.tostring() == match_align_seq
    record.id = match_descr.split()[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    # Copy the match's own tags; iterating the query's keys here raised
    # KeyError for tags (such as sq_offset) present only on the query.
    for key, value in match_annotation.items():
        record.annotations[key] = value

    return alignment
def next(self):
    """Parse the next Stockholm alignment from ``self.handle``.

    Reads from the '# STOCKHOLM 1.0' header line up to the next header (or
    the end of the file), collecting the sequence lines and the #=GF, #=GS
    and #=GR annotation lines.  The parsed identifiers, sequences, #=GS and
    #=GR data are also stored on ``self.ids``, ``self.sequences``,
    ``self.seq_annotation`` and ``self.seq_col_annotation``.

    Returns an Alignment with one record per sequence identifier, or None
    if the handle is exhausted or the block contained no sequences.

    Raises ValueError if the header is missing, a sequence line cannot be
    split into identifier and sequence, an annotation line has too few
    fields, the sequences differ in length, or the number of records does
    not match ``self.records_per_alignment``.
    """
    try:
        # Header line saved while parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = self.handle.readline()
    if not line:
        #Empty file - just give up.
        return None
    if not line.strip() == '# STOCKHOLM 1.0':
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}
    passed_end_alignment = False
    while True:
        line = self.handle.readline()
        if not line:
            break  #end of file
        line = line.strip()  #remove trailing \n
        if line == '# STOCKHOLM 1.0':
            #Start of the next alignment; keep it for the next call.
            self._header = line
            break
        elif line == "//":
            #The "//" line indicates the end of the alignment.
            #There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            #blank line, ignore
            pass
        elif line[0] != "#":
            #Sequence
            #Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                #This might be someone attempting to store a zero length
                #sequence?
                raise ValueError("Could not split line into identifier "
                                 + "and sequence:\n" + line)
            seq_id, seq = parts
            #ids and seqs are kept in step, so the dict lookup replaces
            #a linear scan of the ids list.
            if seq_id not in seqs:
                ids.append(seq_id)
                seqs[seq_id] = ''
            seqs[seq_id] += seq.replace(".", "-")
        elif len(line) >= 5:
            #Comment line or meta-data
            prefix = line[:5]
            if prefix == "#=GF ":
                #Generic per-File annotation, free text
                #Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                #Each feature key could be used more than once,
                #so store the entries as a list of strings.
                gf.setdefault(feature, []).append(text)
            elif prefix == '#=GC ':
                #Generic per-Column annotation, exactly 1 char per column
                #Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif prefix == '#=GS ':
                #Generic per-Sequence annotation, free text
                #Format: "#=GS <seqname> <feature> <free text>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
            elif prefix == "#=GR ":
                #Generic per-Sequence AND per-Column markup
                #Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                seq_id, feature, text = line[5:].strip().split(None, 2)
                markup = gr.setdefault(seq_id, {})
                #For interlaced sequences the GR data can be split over
                #multiple lines, so append to any previous entry.
                #TODO - Should we check the length matches the alignment
                #length?
                markup[feature] = markup.get(feature, "") + text.strip()
        #Next line...

    assert len(seqs) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        return None

    if (self.records_per_alignment is not None
            and self.records_per_alignment != len(ids)):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)

    #TODO - Introduce an annotated alignment class?
    #For now, store the annotation a new private property:
    alignment._annotations = gr

    #Use the first identifier's sequence as the reference length; indexing
    #dict.values() only works on Python 2 and picks an arbitrary entry.
    alignment_length = len(seqs[ids[0]])
    for seq_id in ids:
        seq = seqs[seq_id]
        if alignment_length != len(seq):
            raise ValueError(
                "Sequences have different lengths, or repeated identifier"
            )
        name, start, end = self._identifier_split(seq_id)
        alignment.add_sequence(seq_id, seq, start=start, end=end)

        record = alignment.get_all_seqs()[-1]

        assert record.id == seq_id or record.description == seq_id

        record.id = seq_id
        record.name = name
        record.description = seq_id

        #will be overridden by _populate_meta_data if an explicit
        #accession is provided:
        record.annotations["accession"] = name

        self._populate_meta_data(seq_id, record)
    return alignment
def next(self):
    """Read from the handle and return the next pairwise alignment.

    Returns the alignment of the query and match/library sequences as an
    Alignment object containing two rows (named "query" and "match"), or
    None at the end of the file or once the '>>><<<' marker is reached.

    Raises ValueError when an expected '>>' or '>' line is missing, when
    the file ends inside a sequence or consensus section, when the two
    aligned regions differ in length, or when the "sw_overlap" tag
    disagrees with the parsed alignment length.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith("#"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header
        line = self._parse_query_header(line)
        # Now should be some alignments, but if not we move onto the next
        # query
    if not line:
        # End of file
        return None
    if ">>><<<" in line:
        # Reached the end of the alignments, no need to read the footer...
        return None

    # Should start >>... and not >>>...
    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match ID line, then more tags.
    # e.g.
    #
    #   >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
    #   ; fa_frame: f
    #   ; fa_initn:  52
    #   ; fa_init1:  52
    #   ; fa_opt:  70
    #   ; fa_z-score: 105.5
    #   ; fa_bits: 27.5
    #   ; fa_expect:  0.082
    #   ; sw_score: 70
    #   ; sw_ident: 0.279
    #   ; sw_sim: 0.651
    #   ; sw_overlap: 43
    if not line.startswith(">>") or line.startswith(">>>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split(None, 1)[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned query sequence (with leading flanking
    # region)
    while not line.startswith(">"):
        if not line:
            # Empty read means end of file; report it as a parse error
            # rather than failing on an index into an empty string.
            raise ValueError("Premature end of file in query sequence")
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #
    #   >gi|152973545|ref|YP_001338596.1| ..
    #   ; sq_len: 242
    #   ; sq_type: p
    #   ; al_start: 52
    #   ; al_stop: 94
    #   ; al_display_start: 22
    #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #   RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #   QDFAFTRKMRREARQVEQSW
    #
    # Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    assert match_descr.startswith(line[1:].split(None, 1)[0])

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned match sequence with flanking region...
    # but after that, since FASTA 35.4.1 there can be a consensus, e.g.
    #
    #   ; al_cons:
    #   .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
    #   etc
    while not (line.startswith("; ") or line.startswith(">") or ">>>" in line):
        if not line:
            raise ValueError("Premature end of file in match sequence")
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line.startswith("; "):
        assert line.strip() == "; al_cons:"
        align_consensus_parts = []
        line = handle.readline()
        while not (line.startswith("; ") or line.startswith(">") or ">>>" in line):
            if not line:
                raise ValueError("Premature end of file in alignment consensus")
            align_consensus_parts.append(line.strip())
            line = handle.readline()
        # If we do anything with this in future, must remove any flanking
        # region.
        align_consensus = "".join(align_consensus_parts)
        del align_consensus_parts
        assert not line.startswith("; ")
    else:
        align_consensus = None
    assert line.startswith(">") or ">>>" in line
    # Start of the next hit/query (or the end marker); keep for next call.
    self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
    # How can we do this for the (optional) consensus?

    # The "sq_offset" values can be specified with the -X command line
    # option.  They appear to just shift the origin used in the calculation
    # of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError("Problem parsing the alignment sequence coordinates, "
                         "following should be the same length but are not:\n"
                         "%s - len %i\n%s - len %i"
                         % (query_align_seq, len(query_align_seq),
                            match_align_seq, len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError(
                "Specified sw_overlap = %s does not match expected value %i"
                % (alignment_annotation["sw_overlap"], len(query_align_seq))
            )

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = Alignment(alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = {}

    # Want to record both the query header tags, and the alignment tags.
    for key, value in self._query_header_annotation.items():
        alignment._annotations[key] = value
    for key, value in alignment_annotation.items():
        alignment._annotations[key] = value

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr or record.description == self._query_descr
    record.id = self._query_descr.split(None, 1)[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    # TODO - handle start/end coordinates properly. Short term hack for now:
    record._al_start = int(query_annotation["al_start"])
    record._al_stop = int(query_annotation["al_stop"])

    # TODO - What if a specific alphabet has been requested?
    # TODO - Use an IUPAC alphabet?
    # TODO - Can FASTA output RNA?
    if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
        if query_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif query_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in query_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    record.id = match_descr.split(None, 1)[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    # TODO - handle start/end coordinates properly. Short term hack for now.
    # These are the match's own coordinates, so read them from the match
    # tags rather than the query tags.
    record._al_start = int(match_annotation["al_start"])
    record._al_stop = int(match_annotation["al_stop"])

    # This is still a very crude way of dealing with the alphabet:
    if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
        if match_annotation["sq_type"] == "D":
            record.seq.alphabet = generic_dna
        elif match_annotation["sq_type"] == "p":
            record.seq.alphabet = generic_protein
    if "-" in match_align_seq:
        if not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
def next(self):
    """Read from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows (the query row
    first, then the match row).

    Returns None at the end of the file, or when the ">>><<<" line that
    ends the alignments is reached.

    Raises ValueError if the input does not follow the expected layout
    (including input that stops part way through an alignment), if the
    two aligned regions differ in length, or if their length disagrees
    with the reported "sw_overlap" value.
    """
    handle = self.handle

    try:
        # Line saved by the previous call (see the self._header assignment
        # below): the first line after the alignment parsed last time.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith("#"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        # Moved onto the next query sequence, so forget the previous
        # query and read in the new query header.
        self._query_descr = ""
        self._query_header_annotation = {}
        line = self._parse_query_header(line)
        # Now there should be some alignments, but if not we loop round
        # and move onto the next query.
    if not line:
        # End of file
        return None
    if ">>><<<" in line:
        # Reached the end of the alignments, no need to read the footer.
        return None

    def ends_sequence(text):
        """Return True if text is a tag line, a '>' line or holds '>>>'."""
        return text.startswith("; ") or text.startswith(">") or ">>>" in text

    # This should be the target match ID line (starting >> and not >>>),
    # followed by the "alignment hit" tags, e.g.
    #
    #   >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS ...
    #   ; fa_frame: f
    #   ; fa_initn:  52
    #   ; sw_score: 70
    #   ; sw_overlap: 43
    if not line.startswith(">>") or line.startswith(">>>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    alignment_annotation = {}
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)

    # Then we have the identifier, tags and sequence for the query, e.g.
    #
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #
    # str.startswith is used rather than indexing so that an empty line
    # (end of file) is reported as a ValueError, not an IndexError.
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    query_id = line[1:].split(None, 1)[0]
    if not self._query_descr.startswith(query_id):
        raise ValueError("Query identifier %r does not match the query "
                         "description %r" % (query_id, self._query_descr))
    query_annotation = {}
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    if line.startswith("; "):
        raise ValueError("Expected the aligned query sequence, got %r"
                         % line)

    # Now should have the aligned query sequence (with leading flanking
    # region), which runs until the '>' line of the match.
    query_seq_parts = []
    while not line.startswith(">"):
        if not line:
            raise ValueError("Premature end of file while reading the "
                             "aligned query sequence")
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Next the same three parts (identifier, tags, sequence) for the
    # match, e.g.
    #
    #   >gi|152973545|ref|YP_001338596.1| ..
    #   ; sq_len: 242
    #   ; sq_type: p
    #   ; al_start: 52
    #   ; al_stop: 94
    #   ; al_display_start: 22
    #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'"
                         % repr(line))
    match_id = line[1:].split(None, 1)[0]
    if not match_descr.startswith(match_id):
        raise ValueError("Match identifier %r does not match the match "
                         "description %r" % (match_id, match_descr))
    match_annotation = {}
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    if line.startswith("; "):
        raise ValueError("Expected the aligned match sequence, got %r"
                         % line)

    # Now should have the aligned match sequence with flanking region.
    match_seq_parts = []
    while not ends_sequence(line):
        if not line:
            raise ValueError("Premature end of file while reading the "
                             "aligned match sequence")
        match_seq_parts.append(line.strip())
        line = handle.readline()

    # Since FASTA 35.4.1 there can be a consensus after the sequence, e.g.
    #
    #   ; al_cons:
    #   .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...:
    #
    # It is read and discarded.  If we do anything with it in future, we
    # must remove any flanking region.
    if line.startswith("; "):
        if line.strip() != "; al_cons:":
            raise ValueError("Expected '; al_cons:' line, got %r" % line)
        line = handle.readline()
        while not ends_sequence(line):
            if not line:
                raise ValueError("Premature end of file while reading "
                                 "the alignment consensus")
            line = handle.readline()
        if line.startswith("; "):
            raise ValueError("Unexpected tag line after the alignment "
                             "consensus: %r" % line)

    # Here line starts with '>' or contains '>>>'.  Save it so the next
    # call starts from it.
    self._header = line

    # We built lists of strings and then join them because it is faster
    # than appending to a string.
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)

    # Remove the flanking regions.
    # The "sq_offset" values can be specified with the -X command line
    # option.  They appear to just shift the origin used in the
    # calculation of the coordinates.
    query_align_seq = self._extract_alignment_region(query_seq,
                                                     query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq,
                                                     match_annotation)

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError("Problem parsing the alignment sequence coordinates, "
                         "following should be the same length but are not:\n"
                         "%s - len %i\n%s - len %i"
                         % (query_align_seq, len(query_align_seq),
                            match_align_seq, len(match_align_seq)))
    if ("sw_overlap" in alignment_annotation
            and int(alignment_annotation["sw_overlap"]) != len(query_align_seq)):
        raise ValueError("Specified sw_overlap = %s does not match expected value %i"
                         % (alignment_annotation["sw_overlap"],
                            len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = Alignment(alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation in a new private property, recording
    # both the query header tags and the alignment tags (the alignment
    # tags are applied second, so they win if a tag is in both).
    alignment._annotations = {}
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    def add_row(descr, aligned_seq, name, annotation):
        """Append one sequence to the alignment and annotate its record."""
        # TODO - Once the alignment object gets an append method, use it.
        # (i.e. an add SeqRecord method)
        alignment.add_sequence(descr, aligned_seq)
        record = alignment.get_all_seqs()[-1]
        # Internal sanity check that we picked up the row just added.
        assert record.id == descr or record.description == descr
        # Use the first word of the description, without any commas at
        # its ends, as the identifier.
        record.id = descr.split(None, 1)[0].strip(",")
        record.name = name
        record.annotations["original_length"] = int(annotation["sq_len"])
        # This is still a very crude way of dealing with the alphabet.
        # The reported sequence type is only used when the alphabet we
        # were given is the single letter alphabet.
        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in annotation:
            if annotation["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif annotation["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in aligned_seq and not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    add_row(self._query_descr, query_align_seq, "query", query_annotation)
    add_row(match_descr, match_align_seq, "match", match_annotation)
    return alignment