def main():
    """Rewrite each input nexus alignment using names from ``abbreviator``.

    The nexus files selected by ``args.nexus`` are read twice: once to
    collect every sequence name, and once to write a renamed copy of each
    file into ``args.output`` under the same basename.  The index of each
    finished file is printed as a progress indicator.
    """
    args = get_args()
    files = get_files(args.nexus)
    # First pass: gather every sequence name so all files share one name map.
    old_names = set()
    for f in files:
        for align in AlignIO.parse(f, 'nexus'):
            old_names.update(seq.name for seq in align)
    name_map = abbreviator(old_names)
    # Second pass: rebuild each alignment under the mapped names.
    # NOTE(review): `files` is iterated twice, so get_files must return a
    # re-iterable collection (e.g. a list) - confirm.
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_align.add_sequence(name_map[seq.name], str(seq.seq))
        outf = os.path.join(args.output, os.path.split(f)[1])
        # The context manager closes (and therefore flushes) the output
        # handle even when the writer fails.
        with open(outf, 'w') as out_handle:
            try:
                AlignIO.write(new_align, out_handle, 'nexus')
            except ValueError:
                # Debugging hook kept from the original implementation.
                pdb.set_trace()
        print(count)
def ace2fasta(in_file, out_file):
    """Write every contig of an ACE file to ``out_file`` in FASTA format.

    For each contig the consensus is written first, followed by every read
    whose name does not contain "pseudo".  Reads are trimmed with
    ``cut_ends`` and padded to the consensus length with ``pad_read``.

    Errors raised while parsing the ACE file now propagate; the original
    bare ``except`` reported them as a normal end of input.
    """
    with open(in_file, 'r') as ace_handle, open(out_file, "w") as output_file:
        for contig in Ace.parse(ace_handle):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            # The consensus is always the first row of the alignment.
            align.add_sequence(contig.name, contig.sequence)
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                seq = cut_ends(read.rd.sequence,
                               read.qa.qual_clipping_start,
                               read.qa.qual_clipping_end)
                # contig.af is indexed in parallel with contig.reads.
                seq = pad_read(seq, contig.af[readn].padded_start,
                               consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            output_file.write(align.format("fasta"))
    print("All contigs treated")
def _domain_alignment(self, alignment, domain_region, alignment_index):
    """Cut the columns that span a domain out of ``alignment``.

    The row at ``alignment_index`` is the reference.  Its non-gap symbols
    are counted from the left; the column where the count first equals
    ``domain_region.start`` becomes the start column and the column where
    it first equals ``domain_region.stop`` becomes the stop column.

    Returns a tuple ``(sub_alignment, start_column, stop_column)``.  Each
    row of the sub-alignment is the slice ``[start_column:stop_column]`` of
    the original row, so the stop column itself is excluded.

    Raises AssertionError when either column cannot be located.
    """
    reference = str(alignment[alignment_index].seq)
    first_col = None
    last_col = None
    residues_seen = 0
    for col, symbol in enumerate(reference):
        if symbol != '-':
            residues_seen += 1
        if first_col is None and residues_seen == domain_region.start:
            first_col = col
        if last_col is None and residues_seen == domain_region.stop:
            last_col = col
            break
    assert first_col is not None, str(first_col)
    assert last_col is not None, str(last_col)
    trimmed = Alignment(alphabet=alignment._alphabet)
    for row in alignment:
        trimmed.add_sequence(row.id, str(row.seq)[first_col:last_col])
    return (trimmed, first_col, last_col)
def add_gaps_to_align(organisms, missing, align, verbatim=False, genera=False, min_taxa=3):
    """Pad an alignment with all-'?' rows for organisms it does not contain.

    Parameters:
        organisms: collection of expected row names; a deep copy is
            consumed, the caller's object is left untouched.
        missing: optional mapping from organism (or ``"<organism>*"``) to
            the loci recorded as missing for it; when truthy, each padded
            organism is asserted to list the current locus.
        align: iterable of alignments.
        verbatim, genera: select how a row name is derived from
            ``seq.name`` (see the branches below).
        min_taxa: alignments with fewer rows make the function return None.

    Returns the padded alignment built for the last element of ``align``,
    or None when ``align`` is empty or an element has fewer than
    ``min_taxa`` rows.

    Raises ValueError (from ``remove``) when a derived row name is not in
    ``organisms``, and AssertionError/KeyError when ``missing`` does not
    list the locus for a padded organism.
    """
    local_organisms = copy.deepcopy(organisms)
    # Bound up front so an empty `align` returns None instead of raising
    # UnboundLocalError at the final return.
    new_align = None
    for a in align:
        if len(a) < min_taxa:
            new_align = None
            break
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        overall_length = len(a[0])
        for seq in a:
            name_parts = seq.name.split('_')
            if genera and any(sp for sp in genera if sp in seq.name):
                new_seq_name = '_'.join(name_parts[-1:])
            elif not verbatim:
                new_seq_name = '_'.join(name_parts[-2:])
            else:
                new_seq_name = seq.name.lower()
            new_align.add_sequence(new_seq_name, str(seq.seq))
            local_organisms.remove(new_seq_name)
        for org in local_organisms:
            # `seq` is still the last row of the loop above; the locus name
            # is derived from the leading part of that row's name.
            if genera and any(sp for sp in genera if sp in seq.name):
                loc = '_'.join(seq.name.split('_')[:-1])
            elif not verbatim:
                loc = '_'.join(seq.name.split('_')[:-2])
            else:
                loc = seq.name
            if missing:
                try:
                    assert loc in missing[org], "Locus missing"
                except (KeyError, AssertionError):
                    # Fall back to the starred form of the organism key.
                    assert loc in missing['{}*'.format(org)], "Locus missing"
            new_align.add_sequence(org, '?' * overall_length)
    return new_align
def main():
    """Rewrite each input nexus alignment with shortened sequence names.

    For every file returned by ``get_files(options.input)`` the sequences
    are copied into a new alignment.  Sequences whose name contains
    ".copy" are dropped; every other name is split on "_" and rebuilt from
    the fields starting at ``options.position``.  The result is written to
    ``options.output`` under the same basename, and the file index is
    printed as a progress indicator.
    """
    options, args = interface()
    files = get_files(options.input)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                if '.copy' in seq.name:
                    continue
                new_seq_name = '_'.join(
                    seq.name.split('_')[options.position:])
                new_align.add_sequence(new_seq_name, str(seq.seq))
        outf = os.path.join(options.output, os.path.split(f)[1])
        # The context manager closes (and therefore flushes) the output
        # handle even when the writer fails.
        with open(outf, 'w') as out_handle:
            try:
                AlignIO.write(new_align, out_handle, 'nexus')
            except ValueError:
                # Debugging hook kept from the original implementation.
                pdb.set_trace()
        print(count)
def _domain_alignment(self, alignment, domain_region, alignment_index):
    """Return the slice of ``alignment`` that covers ``domain_region``.

    Non-gap symbols of the row at ``alignment_index`` are counted from the
    left.  The first column at which the count equals
    ``domain_region.start`` is the start column; the first column at which
    it equals ``domain_region.stop`` is the stop column.

    Returns ``(domain_alignment, column_start, column_stop)``; every row is
    sliced as ``[column_start:column_stop]`` (stop column excluded).

    Raises AssertionError if either column is never reached.
    """
    guide = str(alignment[alignment_index].seq)
    start_col = None
    stop_col = None
    non_gap_total = 0
    for position, letter in enumerate(guide):
        if letter != '-':
            non_gap_total += 1
        if start_col is None and non_gap_total == domain_region.start:
            start_col = position
        if stop_col is None and non_gap_total == domain_region.stop:
            stop_col = position
            break
    assert start_col is not None, str(start_col)
    assert stop_col is not None, str(stop_col)
    # Copy the selected columns of every row into a fresh alignment that
    # shares the source alphabet.
    subset = Alignment(alphabet=alignment._alphabet)
    for entry in alignment:
        subset.add_sequence(entry.id, str(entry.seq)[start_col:stop_col])
    return (subset, start_col, stop_col)
def createAlignment(sequences, alphabet):
    """Build an Alignment from ``sequences``.

    Rows are named "sequence0", "sequence1", ... in input order.
    """
    result = Alignment(alphabet)
    for index, row in enumerate(sequences):
        result.add_sequence("sequence%d" % index, row)
    return result
def phylip(handle):
    """Read a PHYLIP-style file with one record per line into an Alignment.

    The first line must hold exactly two whitespace-separated fields (the
    header); their values are not otherwise used.  Every following
    non-blank line is ``<name> <sequence>``; whitespace inside the sequence
    part (e.g. blocks of ten symbols) is removed.  Blank lines are skipped.

    Returns a gapped protein Alignment.

    Raises ValueError for a malformed header or a record line without a
    sequence.
    """
    # Unpacking validates that the header has exactly two fields.
    _seq_count, _columns = handle.readline().split()
    from Bio.Align.Generic import Alignment
    from Bio.Alphabet import IUPAC, Gapped
    alignment = Alignment(Gapped(IUPAC.protein, "-"))
    for line in handle:
        fields = line.split()
        if not fields:
            # Blank line (often a trailing newline at end of file).
            continue
        if len(fields) < 2:
            raise ValueError("Could not split line into name and "
                             "sequence:\n" + line)
        alignment.add_sequence(fields[0], "".join(fields[1:]))
    return alignment
def build_align(self, seq):
    """Store ``seq`` in ``self.friendly`` as an alignment of segments.

    ``seq`` is cut into consecutive slices of ``self.segment_size`` symbols
    (the last slice may be shorter) and each slice is added as one row.
    """
    rows = Alignment(Gapped(DNAAlphabet()))
    alphabet = self.alphabet  # read here but not used below
    total = len(seq)
    width = self.segment_size
    # NOTE(review): `name` is not defined in this method; it must resolve
    # to a module-level name or this raises NameError - confirm.
    for piece in (seq[offset:offset + width]
                  for offset in range(0, total, width)):
        rows.add_sequence(name, piece)
    self.friendly = rows
def createAlignment(sequences, alphabet):
    """Return an Alignment holding ``sequences`` in input order.

    Each row is named "sequence" followed by its zero-based position.
    """
    built = Alignment(alphabet)
    for position, text in enumerate(sequences):
        built.add_sequence("sequence{0}".format(position), text)
    return built
def testCulledColumnMapper(self):
    """Check that CulledColumnMapper maps back to original columns.

    With columns 0, 1, 4 and 8 culled from "ABCDEFGHI" the remaining
    letters are "CDFGH"; mapper[i] must give the original index of the
    i-th remaining letter.
    """
    source = "ABCDEFGHI"
    removed = [0, 1, 4, 8]
    expected = "CDFGH"
    aln = Alignment(Gapped(IUPAC.protein, "-"))
    aln.add_sequence("test", source)
    mapper = CulledColumnMapper(aln, removed)
    for position in range(len(expected)):
        assert source[mapper[position]] == expected[position]
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count, for every contig, the sequences whose name carries each tag.

    Writes a tab-separated matrix to ``out_file``: a header line
    (``gene_name``, ``gene_length``, one column per entry of ``tags`` in
    the given order, then ``XX_noTag``) followed by one line per contig.
    Contigs whose FASTA rendering holds fewer than ``min_seq`` sequences
    are skipped.  A sequence is counted under every tag found in its name;
    a sequence with no tag is counted under ``XX_noTag`` unless its name
    contains "Consensus".

    Counts are written in header order.  The original wrote them in
    sorted-key order, which misaligned the columns whenever ``tags`` was
    unsorted or a tag sorted after "XX_noTag".

    Raises IndexError when the first sequence name of a contig does not
    match ``Contig_<digits>``.
    """
    print("")
    print("USING MATRIX OUTPUT FORMAT")
    print("")
    no_tag = "XX_noTag"
    with open(in_ace, 'r') as ace_handle, open(out_file, "w") as output_file:
        output_file.write("gene_name\tgene_length")
        for tag in tags:
            output_file.write("\t" + tag)
        output_file.write("\t" + no_tag)
        output_file.write("\n")
        for contig in Ace.parse(ace_handle):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                seq = cut_ends(read.rd.sequence,
                               read.qa.qual_clipping_start,
                               read.qa.qual_clipping_end)
                # contig.af is indexed in parallel with contig.reads.
                seq = pad_read(seq, contig.af[readn].padded_start,
                               consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta_2list(align.format("fasta"))
            if len(sequences) < min_seq:
                continue
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            # Padding characters ("*") do not count towards the length.
            contig_seq = sequences[0][1].replace("*", "")
            contig_length = str(len(contig_seq))
            output_file.write(contig_name + "\t" + contig_length)
            print("Treating %s" % contig_name)
            counts = dict.fromkeys(tags, 0)
            counts[no_tag] = 0
            for fasta in sequences:
                found_tag = False
                for tag in tags:
                    if tag in fasta[0]:
                        counts[tag] += 1
                        found_tag = True
                if not found_tag and "Consensus" not in fasta[0]:
                    counts[no_tag] += 1
            # Emit the counts in exactly the order used for the header.
            for tag in tags:
                output_file.write("\t" + str(counts[tag]))
            output_file.write("\t" + str(counts[no_tag]))
            output_file.write("\n")
    print("***All contigs treated***")
def main():
    """Copy each input nexus alignment without the taxa in ``args.taxa``.

    Every file returned by ``get_files(args.input)`` is rewritten into
    ``args.output`` under the same basename, keeping only the rows whose
    name is not listed in ``args.taxa``.  The file index is printed as a
    progress indicator.
    """
    args = get_args()
    nexus_files = get_files(args.input)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in align:
                if taxon.name not in args.taxa:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        # Close the output handle deterministically so the file is flushed
        # before the next one is processed.
        with open(outf, 'w') as out_handle:
            AlignIO.write(new_align, out_handle, 'nexus')
        print(count)
def rename(align, first, second):
    """Yield a renamed copy of every alignment in ``align``.

    Each sequence id is split on "_".  When ``second`` is truthy the new
    name is the first three characters of field ``first`` and of field
    ``second`` joined by "_"; otherwise it is field ``first`` unchanged.

    ``first`` may be 0.  The original tested ``first and second``, so a
    zero ``first`` with a truthy ``second`` matched neither branch and the
    name was left unbound or carried over from the previous sequence.
    A falsy ``second`` (None or 0) still means "no second field".
    """
    for a in align:
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for seq in a:
            split_name = seq.id.split('_')
            if second:
                new_seq_name = '_'.join([split_name[first][0:3],
                                         split_name[second][0:3]])
            else:
                new_seq_name = split_name[first]
            new_align.add_sequence(new_seq_name, str(seq.seq))
        yield new_align
def rename(align, first, second):
    """Yield a renamed copy of every alignment in ``align``.

    Each sequence id is split on "_".  When ``second`` is truthy the new
    name is the first three characters of field ``first`` and of field
    ``second`` joined by "_"; otherwise it is field ``first`` unchanged.

    ``first`` may be 0.  The original tested ``first and second``, so a
    zero ``first`` with a truthy ``second`` matched neither branch and the
    name was left unbound or carried over from the previous sequence.
    A falsy ``second`` (None or 0) still means "no second field".
    """
    for a in align:
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for seq in a:
            split_name = seq.id.split('_')
            if second:
                new_seq_name = '_'.join(
                    [split_name[first][0:3], split_name[second][0:3]])
            else:
                new_seq_name = split_name[first]
            new_align.add_sequence(new_seq_name, str(seq.seq))
        yield new_align
def main():
    """Copy each input nexus alignment, keeping only the selected taxa.

    The set of taxa to keep comes from ``get_samples_to_run`` applied to
    all taxon names found in the input files.  Every file is rewritten into
    ``args.output`` under the same basename with only those rows, and the
    file index is printed as a progress indicator.
    """
    args = get_args()
    nexus_files = get_files(args.input)
    taxa = get_all_taxon_names(nexus_files)
    taxa_to_keep = get_samples_to_run(args, taxa)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in align:
                if taxon.name in taxa_to_keep:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        # Close the output handle deterministically so the file is flushed
        # before the next one is processed.
        with open(outf, 'w') as out_handle:
            AlignIO.write(new_align, out_handle, 'nexus')
        print(count)
def proteins_alignment_to_biopython(al, seq1, seq2, name1, name2):
    """Convert our internal alignment format into a BioPython Alignment.

    ``al`` is an iterable of ``(a, b)`` index pairs into ``seq1`` and
    ``seq2``; an index of -1 marks a gap in that sequence.  Residues are
    upper-cased.  Returns a two-row gapped protein Alignment named
    ``name1`` / ``name2``.

    A gap is written for ``seq2`` as well as ``seq1``.  The original
    omitted it, so the second row came out shorter than the first whenever
    ``b`` was -1.
    """
    row1 = []
    row2 = []
    for a, b in al:
        row1.append(seq1[a].upper() if a != -1 else "-")
        row2.append(seq2[b].upper() if b != -1 else "-")
    align = Alignment(Gapped(IUPAC.protein, "-"))
    align.add_sequence(name1, "".join(row1))
    align.add_sequence(name2, "".join(row2))
    return align
def parse_ace(ace_file):
    """Read the first contig of an ACE file and align its reads.

    Returns ``(contig, align)``: the first record produced by
    ``Ace.parse`` and an alignment whose first row is the consensus,
    followed by one row per read.  Reads are trimmed with ``cut_ends`` and
    padded with ``pad_read``; each row is named
    ``<read name>_<contig.af[i].coru>``.

    Raises StopIteration when the file holds no contig.
    """
    # The whole body runs inside the context manager so the handle stays
    # open while the record is used and is closed on every exit path.
    with open(ace_file, 'r') as ace_handle:
        contig = next(Ace.parse(ace_handle))
        align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
        align.add_sequence(contig.name, contig.sequence)
        consensus_length = len(contig.sequence)
        for readn, read in enumerate(contig.reads):
            # contig.af is indexed in parallel with contig.reads.
            placement = contig.af[readn]
            seq = cut_ends(read.rd.sequence,
                           read.qa.qual_clipping_start,
                           read.qa.qual_clipping_end)
            seq = pad_read(seq, placement.padded_start, consensus_length)
            align.add_sequence(read.rd.name + "_" + placement.coru, seq)
        return contig, align
def get_alignment(self):
    """Construct an alignment from the aligned sequences in this tree.

    Collects every ``Sequence`` node whose ``mol_seq.is_aligned`` is true
    and adds it to an Alignment that uses the alphabet of the first one.

    Returns the Alignment, or None (after issuing a warning) when the tree
    holds no aligned sequence.  The original fell through after the
    warning and failed on the unbound ``first_seq``.
    """
    def seq_is_aligned(node):
        return bool(isinstance(node, Sequence) and node.mol_seq.is_aligned)

    seqs = self.depth_first_search(self, seq_is_aligned)
    try:
        first_seq = next(seqs)
    except StopIteration:
        warnings.warn("No aligned sequences were found in this tree.",
                      Warning, stacklevel=2)
        # No alphabet is available without a first sequence, so no
        # Alignment can be built.
        return None
    aln = Alignment(first_seq.get_alphabet())
    aln.add_sequence(str(first_seq), first_seq.mol_seq.value)
    for seq in seqs:
        aln.add_sequence(str(seq), seq.mol_seq.value)
    return aln
def get_alignment(self):
    """Build an alignment from the aligned sequences found in this tree.

    Nodes are taken from ``self._filter_search`` in 'preorder'; the
    alphabet of the first match is used for the Alignment.  Returns None
    when nothing matches, because an Alignment cannot be created without
    an alphabet.
    """
    def is_aligned_seq(node):
        return bool(isinstance(node, Sequence) and node.mol_seq.is_aligned)

    aln = None
    for seq in self._filter_search(is_aligned_seq, 'preorder', True):
        if aln is None:
            # The first hit decides the alphabet of the whole alignment.
            aln = Alignment(seq.get_alphabet())
        aln.add_sequence(str(seq), seq.mol_seq.value)
    return aln
def strarray2biopy(align):
    """Convert a (character rows, ID list) pair into a Biopython alignment.

    ``align[0]`` holds one iterable of single characters per sequence and
    ``align[1]`` the matching identifiers.  Returns a gapped unambiguous
    DNA Alignment with one row per sequence.
    """
    rows, labels = align[0], align[1]
    result = Alignment(Gapped(IUPAC.unambiguous_dna))
    for index, row in enumerate(rows):
        result.add_sequence(labels[index], ''.join(row))
    return result
def add_gaps_to_align(organisms, missing, align, verbatim=False, genera=False, min_taxa=3):
    """Pad an alignment with all-'?' rows for organisms it does not contain.

    Parameters:
        organisms: collection of expected row names; a deep copy is
            consumed, the caller's object is left untouched.
        missing: optional mapping from organism (or ``"<organism>*"``) to
            the loci recorded as missing for it; when truthy, each padded
            organism is asserted to list the current locus.
        align: iterable of alignments.
        verbatim, genera: select how a row name is derived from
            ``seq.name`` (see the branches below).
        min_taxa: alignments with fewer rows make the function return None.

    Returns the padded alignment built for the last element of ``align``,
    or None when ``align`` is empty or an element has fewer than
    ``min_taxa`` rows.

    Raises ValueError (from ``remove``) when a derived row name is not in
    ``organisms``, and AssertionError/KeyError when ``missing`` does not
    list the locus for a padded organism.
    """
    local_organisms = copy.deepcopy(organisms)
    # Bound up front so an empty `align` returns None instead of raising
    # UnboundLocalError at the final return.
    new_align = None
    for a in align:
        if len(a) < min_taxa:
            new_align = None
            break
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        overall_length = len(a[0])
        for seq in a:
            name_parts = seq.name.split('_')
            if genera and any(sp for sp in genera if sp in seq.name):
                new_seq_name = '_'.join(name_parts[-1:])
            elif not verbatim:
                new_seq_name = '_'.join(name_parts[-2:])
            else:
                new_seq_name = seq.name.lower()
            new_align.add_sequence(new_seq_name, str(seq.seq))
            local_organisms.remove(new_seq_name)
        for org in local_organisms:
            # `seq` is still the last row of the loop above; the locus name
            # is derived from the leading part of that row's name.
            if genera and any(sp for sp in genera if sp in seq.name):
                loc = '_'.join(seq.name.split('_')[:-1])
            elif not verbatim:
                loc = '_'.join(seq.name.split('_')[:-2])
            else:
                loc = seq.name
            if missing:
                try:
                    assert loc in missing[org], "Locus missing"
                except (KeyError, AssertionError):
                    # Fall back to the starred form of the organism key.
                    assert loc in missing['{}*'.format(
                        org)], "Locus missing"
            new_align.add_sequence(org, '?' * overall_length)
    return new_align
def main():
    """Rewrite each input nexus alignment with shortened sequence names.

    For every file returned by ``get_files(options.input)`` the sequences
    are copied into a new alignment.  Sequences whose name contains
    ".copy" are dropped; every other name is split on "_" and rebuilt from
    the fields starting at ``options.position``.  The result is written to
    ``options.output`` under the same basename, and the file index is
    printed as a progress indicator.
    """
    options, args = interface()
    files = get_files(options.input)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, "nexus"):
            for seq in align:
                if ".copy" in seq.name:
                    continue
                new_seq_name = "_".join(seq.name.split("_")[options.position :])
                new_align.add_sequence(new_seq_name, str(seq.seq))
        outf = os.path.join(options.output, os.path.split(f)[1])
        # The context manager closes (and therefore flushes) the output
        # handle even when the writer fails.
        with open(outf, "w") as out_handle:
            try:
                AlignIO.write(new_align, out_handle, "nexus")
            except ValueError:
                # Debugging hook kept from the original implementation.
                pdb.set_trace()
        print(count)
def formatData (AlignData, Score):
    """Return one single-row alignment per record that scores well enough.

    Each record of ``AlignData.Alignment`` is scored column by column
    against ``Score`` (indexed by column):

    * column score <= 450 and a residue present: -2
    * column score >= 2000 and a residue present: +2
    * column score >= 2000 and a gap ("-"): -1

    Records whose total is at least -250 are wrapped in their own gapped
    protein Alignment; the list of those alignments is returned.

    The score is kept per record rather than in a table of 6196 slots, so
    inputs with more records no longer raise IndexError.

    Raises IndexError when a sequence is longer than ``Score``.
    """
    LIMIT1 = 450
    LIMIT2 = 2000
    MIN_POINTS = -250
    DataList = []
    for record in AlignData.Alignment:
        text = record.seq.tostring()
        points = 0
        for j, c in enumerate(text):
            if Score[j] <= LIMIT1 and c != '-':
                points -= 2
            if Score[j] >= LIMIT2:
                # NOTE(review): the gap penalty is read as belonging to the
                # high-score branch only - confirm against the intended
                # scoring rule.
                if c != '-':
                    points += 2
                else:
                    points -= 1
        if points >= MIN_POINTS:
            NewAlignData = Alignment(Gapped(IUPAC.protein, "-"))
            NewAlignData.add_sequence(record.id, text)
            DataList.append(NewAlignData)
    return DataList
def next(self):
    """Parse and return the next Stockholm alignment from ``self.handle``.

    Returns an Alignment, or None when the handle is exhausted or the
    block holds no sequences.

    Raises ValueError when the block does not start with the
    "# STOCKHOLM 1.0" header, when a sequence line cannot be split into
    identifier and sequence, when the number of records differs from
    ``self.records_per_alignment`` (if that is set), or when the sequences
    differ in length.

    Side effects: sets ``self.ids``, ``self.sequences``,
    ``self.seq_annotation`` and ``self.seq_col_annotation``, and stores a
    header line in ``self._header`` when the next alignment's header has
    already been read.
    """
    try:
        # Header left over from reading the previous alignment, if any.
        line = self._header
        del self._header
    except AttributeError:
        line = self.handle.readline()
    if not line:
        #Empty file - just give up.
        return
    if not line.strip() == '# STOCKHOLM 1.0':
        raise ValueError("Did not find STOCKHOLM header")
        #import sys
        #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0'

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.

    seqs = {}  # identifier -> sequence text, joined over interleaved blocks
    ids = []   # identifiers in order of first appearance
    gs = {}    # identifier -> {feature: [text, ...]} from "#=GS" lines
    gr = {}    # identifier -> {feature: text} from "#=GR" lines
    gf = {}    # feature -> [text, ...] from "#=GF" lines (not stored on self)
    passed_end_alignment = False
    while 1:
        line = self.handle.readline()
        if not line: break #end of file
        line = line.strip() #remove trailing \n
        if line == '# STOCKHOLM 1.0':
            # Start of the next alignment: keep it for the following call.
            self._header = line
            break
        elif line == "//":
            #The "//" line indicates the end of the alignment.
            #There may still be more meta-data
            passed_end_alignment = True
        elif line == "":
            #blank line, ignore
            pass
        elif line[0] != "#":
            #Sequence
            #Format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                #This might be someone attempting to store a zero length sequence?
                raise ValueError("Could not split line into identifier " \
                                 + "and sequence:\n" + line)
            id, seq = parts
            if id not in ids:
                ids.append(id)
            seqs.setdefault(id, '')
            # "." is converted to "-" before the text is stored.
            seqs[id] += seq.replace(".", "-")
        elif len(line) >= 5:
            #Comment line or meta-data
            if line[:5] == "#=GF ":
                #Generic per-File annotation, free text
                #Format: #=GF <feature> <free text>
                feature, text = line[5:].strip().split(None, 1)
                #Each feature key could be used more than once,
                #so store the entries as a list of strings.
                if feature not in gf:
                    gf[feature] = [text]
                else:
                    gf[feature].append(text)
            elif line[:5] == '#=GC ':
                #Generic per-Column annotation, exactly 1 char per column
                #Format: "#=GC <feature> <exactly 1 char per column>"
                pass
            elif line[:5] == '#=GS ':
                #Generic per-Sequence annotation, free text
                #Format: "#=GS <seqname> <feature> <free text>"
                id, feature, text = line[5:].strip().split(None, 2)
                #if id not in ids :
                #    ids.append(id)
                if id not in gs:
                    gs[id] = {}
                if feature not in gs[id]:
                    gs[id][feature] = [text]
                else:
                    gs[id][feature].append(text)
            elif line[:5] == "#=GR ":
                #Generic per-Sequence AND per-Column markup
                #Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                id, feature, text = line[5:].strip().split(None, 2)
                #if id not in ids :
                #    ids.append(id)
                if id not in gr:
                    gr[id] = {}
                if feature not in gr[id]:
                    gr[id][feature] = ""
                gr[id][feature] += text.strip( ) # append to any previous entry
                #TODO - Should we check the length matches the alignment length?
                #       For iterlaced sequences the GR data can be split over
                #       multiple lines
        #Next line...

    assert len(seqs) <= len(ids)
    #assert len(gs) <= len(ids)
    #assert len(gr) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if ids and seqs:
        if self.records_per_alignment is not None \
        and self.records_per_alignment != len(ids) :
            raise ValueError("Found %i records in this alignment, told to expect %i" \
                             % (len(ids), self.records_per_alignment))

        alignment = Alignment(self.alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = gr

        # NOTE(review): indexing seqs.values() relies on Python 2, where
        # dict.values() returns a list.
        alignment_length = len(seqs.values()[0])
        for id in ids:
            seq = seqs[id]
            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )
            name, start, end = self._identifier_split(id)
            alignment.add_sequence(id, seq, start=start, end=end)

            # add_sequence appends, so the new record is the last one.
            record = alignment.get_all_seqs()[-1]

            assert record.id == id or record.description == id

            record.id = id
            record.name = name
            record.description = id

            #will be overridden by _populate_meta_data if an explicit
            #accession is provided:
            record.annotations["accession"] = name

            self._populate_meta_data(id, record)
        return alignment
    else:
        return None
def next(self) :
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows.

    Returns None at end of file or when the ">>><<<" end marker is
    reached.  Raises ValueError when an expected identifier line is
    missing or the two aligned regions differ in length.  The line that
    follows the alignment is saved in ``self._header`` for the next call.
    """
    handle = self.handle

    try :
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith("#") :
        #Skip the file header before the alignments.  e.g.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>") :
        #Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        #Read in the query header
        line = self._parse_query_header(line)
        #Now should be some alignments, but if not we move onto the next query
    if not line :
        #End of file
        return None
    if ">>><<<" in line :
        #Reached the end of the alignments, no need to read the footer...
        return None

    #Should start >>... and not >>>...
    assert line[0:2] == ">>" and not line[2] == ">", line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    #This should be followed by the target match ID line, then more tags.
    #e.g.
    """ >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] ; fa_frame: f ; fa_initn: 52 ; fa_init1: 52 ; fa_opt: 70 ; fa_z-score: 105.5 ; fa_bits: 27.5 ; fa_expect: 0.082 ; sw_score: 70 ; sw_ident: 0.279 ; sw_sim: 0.651 ; sw_overlap: 43 """
    if (not line[0:2] == ">>") or line[0:3] == ">>>" :
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    #Handle the following "alignment hit" tagged data, e.g.
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line[0:2] == "; "

    #Then we have the alignment numbers and sequence for the query
    """ >gi|10955265| .. 
; sq_len: 346 ; sq_offset: 1 ; sq_type: p ; al_start: 197 ; al_stop: 238 ; al_display_start: 167 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL GEYFTENKPKYIIREIHQET """
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split(None,1)[0])

    #Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line[0:2] == "; "

    #Now should have the aligned query sequence (with leading flanking region)
    while not line[0] == ">" :
        query_seq_parts.append(line.strip())
        line = handle.readline()

    #Handle the following "match alignment" data
    """ >gi|152973545|ref|YP_001338596.1| .. ; sq_len: 242 ; sq_type: p ; al_start: 52 ; al_stop: 94 ; al_display_start: 22 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR QDFAFTRKMRREARQVEQSW """
    #Match identifier
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    assert match_descr.startswith(line[1:].split(None,1)[0])

    #Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line[0:2] == "; "

    #Now should have the aligned query sequence with flanking region...
    #but before that, since FASTA 35.4.1 there can be an consensus here,
    """ ; al_cons: .::. : :. ---. :: :. . 
: ..-:::-: :.: ..:...: etc """
    while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line[0:2] == "; " :
        assert line.strip() == "; al_cons:"
        align_consensus_parts = []
        line = handle.readline()
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            align_consensus_parts.append(line.strip())
            line = handle.readline()
        #If we do anything with this in future, must remove any flanking region.
        align_consensus = "".join(align_consensus_parts)
        del align_consensus_parts
        assert not line[0:2] == "; "
    else :
        align_consensus = None
    assert (line[0] == ">" or ">>>" in line)
    # Keep the line we stopped on; the next call starts from it.
    self._header = line

    #We built a list of strings and then joined them because
    #its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    #Note, query_seq and match_seq will usually be of different lengths, apparently
    #because in the m10 format leading gaps are added but not trailing gaps!

    #Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
    #How can we do this for the (optional) consensus?

    #The "sq_offset" values can be specified with the -X command line option.
    #They appear to just shift the origin used in the calculation of the coordinates.

    if len(query_align_seq) != len(match_align_seq) :
        raise ValueError("Problem parsing the alignment sequence coordinates, "
                         "following should be the same length but are not:\n"
                         "%s - len %i\n%s - len %i" % (query_align_seq,
                                                       len(query_align_seq),
                                                       match_align_seq,
                                                       len(match_align_seq)))
    if "sw_overlap" in alignment_annotation :
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
            raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    #TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = Alignment(alphabet)

    #TODO - Introduce an annotated alignment class?
    #For now, store the annotation a new private property:
    alignment._annotations = {}

    #Want to record both the query header tags, and the alignment tags.
    #Alignment tags are copied second, so they win on a duplicate key.
    for key, value in self._query_header_annotation.iteritems() :
        alignment._annotations[key] = value
    for key, value in alignment_annotation.iteritems() :
        alignment._annotations[key] = value

    #TODO - Once the alignment object gets an append method, use it.
    #(i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    # add_sequence appends, so the new record is the last one.
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr or record.description == self._query_descr
    #assert record.seq.tostring() == query_align_seq
    record.id = self._query_descr.split(None,1)[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    #TODO - What if a specific alphabet has been requested?
    #TODO - Use an IUPAC alphabet?
    #TODO - Can FASTA output RNA?
    if alphabet == single_letter_alphabet and "sq_type" in query_annotation :
        if query_annotation["sq_type"] == "D" :
            record.seq.alphabet = generic_dna
        elif query_annotation["sq_type"] == "p" :
            record.seq.alphabet = generic_protein
    if "-" in query_align_seq :
        if not hasattr(record.seq.alphabet,"gap_char") :
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    #assert record.seq.tostring() == match_align_seq
    record.id = match_descr.split(None,1)[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    #This is still a very crude way of dealing with the alphabet:
    if alphabet == single_letter_alphabet and "sq_type" in match_annotation :
        if match_annotation["sq_type"] == "D" :
            record.seq.alphabet = generic_dna
        elif match_annotation["sq_type"] == "p" :
            record.seq.alphabet = generic_protein
    if "-" in match_align_seq :
        if not hasattr(record.seq.alphabet,"gap_char") :
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
from Bio.Align.FormatConvert import FormatConverter
from Bio.Align import AlignInfo
from Bio.Fasta import FastaAlign
from Bio.SubsMat import FreqTable
from Bio.Align.Generic import Alignment

#Very simple tests on an empty alignment
#(an alignment without rows reports length 0 and an empty record list)
alignment = Alignment(Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert alignment.get_all_seqs() == []
del alignment

#Basic tests on simple three string alignment
#(the same 26 letters in mixed, lower and upper case)
alignment = Alignment(Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment.add_sequence("mixed", letters)
alignment.add_sequence("lower", letters.lower())
alignment.add_sequence("upper", letters.upper())
assert alignment.get_alignment_length() == 26
assert len(alignment.get_all_seqs()) == 3
#Rows come back in insertion order with their text unchanged
assert alignment.get_seq_by_num(0).tostring() == letters
assert alignment.get_seq_by_num(1).tostring() == letters.lower()
assert alignment.get_seq_by_num(2).tostring() == letters.upper()
#The name passed to add_sequence is exposed as the record description
assert alignment.get_all_seqs()[0].description == "mixed"
assert alignment.get_all_seqs()[1].description == "lower"
assert alignment.get_all_seqs()[2].description == "upper"
#Each column is the three case variants of one letter, top row first
for (col, letter) in enumerate(letters) :
    assert alignment.get_column(col) == letter \
                                      + letter.lower() \
                                      + letter.upper()
#Check row extractions:
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage, stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file.

    For every contig with more than ``coverage`` reads, the reads are
    clipped, padded and aligned to the consensus, consensus padding
    columns ("*") are removed, and ``best_snps`` selects haplotypes.  A
    contig is reported when it has at least ``ngroups`` groups (first
    three characters of the read name) and every group has at least
    ``nhaplo`` haplotypes.

    Output:
        out_file: tab-separated haplotype table.
        out_bamova: per-marker population counts.
        fasta_output/<contig>.fasta: haplotype sequences of each reported
            contig (the directory must already exist).

    ``win_len``, ``step`` and ``stars`` are accepted but not used.

    Fix: the alignment clipping end is now applied when it is tighter than
    the quality clipping end.  The original compared ``clipe2`` with
    itself, so that branch never ran.
    """
    marker_number = 0
    # Processing stops once this many contigs have been read.
    cutoff = 100000
    with open(in_ace, 'r') as ace_handle, \
            open(out_file, "w") as output_file, \
            open(out_bamova, "w") as bamova_file:
        output_file.write("Contig_nb\tWindow\tHaplotype\n")
        contig_counter = 0
        ntreated = 0
        for contig in Ace.parse(ace_handle):
            pass_haplo = False
            contig_counter += 1
            # One read is not counted towards coverage.
            if len(contig.reads) - 1 < coverage:
                continue
            ntreated += 1
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
            align.add_sequence(contig.name, contig.sequence)
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                # Use the tighter of the quality and alignment clipping
                # windows at both ends.
                clipst = max(read.qa.qual_clipping_start,
                             read.qa.align_clipping_start)
                clipe = min(read.qa.qual_clipping_end,
                            read.qa.align_clipping_end)
                # contig.af is indexed in parallel with contig.reads.
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
            concensus = sequences[0][1]
            # Delete padding columns right-to-left so earlier indices stay
            # valid.
            error_positions = multi_find("*", concensus)[::-1]
            for p in error_positions:
                sequences = [[s[0], s[1][0:p] + s[1][p+1:]]
                             for s in sequences]
            concensus = sequences[0][1]
            sequences = [[s[0], correct_sequence(concensus, s[1])]
                         for s in sequences[1:]]
            sequences, snp_pos = snp_positions(sequences)
            haplotypes = best_snps(sequences, snp_pos, coverage)
            if haplotypes != "Empty":
                variants = sorted(set(h[-1] for h in haplotypes[-1]))
                groups = sorted(set(h[0][:3] for h in haplotypes[-1]))
                if len(groups) >= ngroups:
                    pass_haplo = True
                    for g in groups:
                        members = [h for h in haplotypes[-1]
                                   if h[0].startswith(g)]
                        if len(members) < nhaplo:
                            pass_haplo = False
                if pass_haplo:
                    print(contig.name)
                    bamova_file.write("Marker" + str(marker_number) + "\n")
                    for group_number, g in enumerate(groups):
                        bamova_file.write("Population\t" + str(group_number))
                        for v in variants:
                            hits = [h for h in haplotypes[-1]
                                    if h[-1] == v and h[0].startswith(g)]
                            bamova_file.write("\t" + str(len(hits)))
                        bamova_file.write("\n")
                    with open("fasta_output/" + contig.name + ".fasta", "w") as f:
                        output_file.write(contig.name + "\n")
                        for h in haplotypes[-1]:
                            f.write(">" + h[0] + str(marker_number) + "\n" + h[2] + "\n")
                            # Shift positions so the first one becomes 1.
                            h[1] = [x - h[1][0] + 1 for x in h[1]]
                            output_file.write("Marker" + str(marker_number) + "\t" +
                                              "\t".join([str(x) for x in h]) + "\t" +
                                              ":".join(variants) + "\n")
                    marker_number += 1
            output_file.flush()
            bamova_file.flush()
            if contig_counter > cutoff:
                break
        print("\n%s contigs out of %s were treated" % (ntreated, contig_counter))
def next(self):
    """Parse the next alignment from ``self.handle``.

    Reads one CLUSTAL-style alignment (header line, first block, then any
    further blocks) and returns it as an Alignment built with
    ``self.alphabet``.  If a concatenated alignment follows, its header
    line is stored in ``self._header`` for the next call.

    Returns:
        The Alignment, or None when the handle is exhausted or the header
        is not followed by any sequence data.

    Raises:
        ValueError: unknown or blank header line, unparsable sequence
            line, identifiers out of order, inconsistent letter counts or
            sequence lengths, or a record count different from
            ``self.records_per_alignment``.
    """
    handle = self.handle
    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    # Whitelisted headers we know about
    known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
    header_words = line.split()
    # A whitespace-only line has no first word; report it like any other
    # unknown header instead of failing with IndexError.
    first_word = header_words[0] if header_words else ""
    if first_word not in known_headers:
        raise ValueError("%s is not a known CLUSTAL header: %s" %
                         (first_word, ", ".join(known_headers)))

    # find the clustal version in the header line
    version = None
    for word in header_words:
        if word[0] == '(' and word[-1] == ')':
            word = word[1:-1]
        # "()" leaves an empty word, so guard before indexing it.
        if word and word[0] in '0123456789':
            version = word
            break

    # There should be two blank lines after the header line.  readline()
    # returns "" forever at end of file, so stop on that as well as on the
    # first non-blank line.
    line = handle.readline()
    while line and line.strip() == "":
        line = handle.readline()
    if not line:
        # Header with no sequence data: treated like the empty-alignment
        # case at the end of this method.
        return None

    # If the alignment contains entries with the same sequence
    # identifier (not a good idea - but seems possible), then this
    # dictionary based parser will merge their sequences.  Fix this?
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  # Used to extract the consensus

    # Use the first block to get the sequence identifiers
    while True:
        if line[0] != " " and line.strip() != "":
            # Sequences identifier...
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            # Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end
            assert fields[1] == line[seq_cols]

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)
        elif line[0] == " ":
            # Sequence consensus line...
            assert len(ids) == len(seqs)
            assert len(ids) > 0
            assert seq_cols is not None
            consensus = line[seq_cols]
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Check for blank line (or end of file)
            line = handle.readline()
            assert line.strip() == ""
            break
        else:
            # No consensus
            break
        line = handle.readline()
        if not line:
            break  # end of file

    assert line.strip() == ""
    assert seq_cols is not None

    # Confirm all same length
    for s in seqs:
        assert len(s) == len(seqs[0])
    if consensus:
        assert len(consensus) == len(seqs[0])

    # Loop over any remaining blocks...
    done = False
    while not done:
        # There should be a blank line between each block.
        # Also want to ignore any consensus line from the
        # previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            # Found concatenated alignment.
            done = True
            self._header = line
            break

        for i in range(len(ids)):
            assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % repr(line))

            if fields[0] != ids[i]:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'"
                    % (fields[0], ids[i]))

            if fields[1] != line[seq_cols]:
                # The last block may be narrower; the sequence must still
                # start in the same column.
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                assert start == seq_cols.start, \
                    'Old location %s -> %i:XX' % (seq_cols, start)
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end

            # Append the sequence
            seqs[i] += fields[1]
            assert len(seqs[i]) == len(seqs[0])

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)

            # Read in the next line
            line = handle.readline()
        # There should now be a consensus line
        if consensus:
            assert line[0] == " "
            assert seq_cols is not None
            consensus += line[seq_cols]
            assert len(consensus) == len(seqs[0])
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        return None

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    alignment_length = len(seqs[0])
    for i in range(len(ids)):
        if len(seqs[i]) != alignment_length:
            raise ValueError(
                "Error parsing alignment - sequences of different length?")
        alignment.add_sequence(ids[i], seqs[i])
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        assert len(consensus) == alignment_length, \
            "Alignment length is %i, consensus length is %i, '%s'" \
            % (alignment_length, len(consensus), consensus)
        alignment._star_info = consensus
    return alignment
def main():
    """Align DNA sequences codon-wise via a MUSCLE protein alignment.

    Reads FASTA DNA records from the file named by the first command line
    argument (or from fileinput's default source when none is given),
    translates them with NCBI translation table 11, aligns the proteins
    with MUSCLE through a pipe, maps every aligned residue back to its
    codon (gaps become "---") and writes the DNA alignment in PHYLIP
    format to ``<input base name>_aln.phy`` (``out_aln.phy`` without an
    input file name).
    """
    # Local import: only this function needs it for the output path.
    import os

    # Configuration
    # Select the desired NCBI translation table
    translationTable = 11

    # Open the DNA sequence file and read the fasta sequences into a
    # dictionary
    dnaFileName = argv[1] if len(argv) > 1 else None
    dnaSeqFile = fileinput.input(dnaFileName)
    try:
        dnaSeqDict = SeqIO.to_dict(SeqIO.parse(dnaSeqFile, "fasta"))
    finally:
        dnaSeqFile.close()

    # Translate the sequences, replacing stop codons with X (unknown aa)
    # so muscle doesn't drop them
    aaSeqRecords = []
    for key in dnaSeqDict:
        protein = str(dnaSeqDict[key].seq.translate(table=translationTable))
        aaSeqRecords.append(SeqRecord(Seq(protein.replace('*', 'X')), id=key))

    # Align the aa sequences
    commandLine = str(MuscleCommandline(seqtype='protein'))
    # don't pipe stderr or muscle hangs
    childProcess = subprocess.Popen(commandLine,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    shell=(sys.platform != "win32"))
    SeqIO.write(aaSeqRecords, childProcess.stdin, "fasta")
    childProcess.stdin.close()
    try:
        aaAlignment = AlignIO.read(childProcess.stdout, "fasta")
    finally:
        # Reap the child so it does not linger as a zombie process.
        childProcess.stdout.close()
        childProcess.wait()

    # Convert the aa alignment into a dna alignment
    dnaAlignment = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for taxon in aaAlignment:
        dnaSource = str(dnaSeqDict[taxon.id].seq)
        codons = []
        aaCount = 0
        for aaResidue in taxon.seq:
            if aaResidue == '-':
                codons.append('---')
            else:
                codons.append(dnaSource[aaCount * 3:aaCount * 3 + 3])
                aaCount += 1
        # As we add the sequences to the alignment remove gene name from
        # the sequence id so the taxon names match the PAML constraint tree
        dnaAlignment.add_sequence(taxon.id.split('_')[0], "".join(codons))

    if dnaFileName:
        # Cut at the first dot of the FILE name only.  Splitting the whole
        # path made "./x.fasta" produce an empty base name.
        directory, fileName = os.path.split(dnaFileName)
        outFileName = os.path.join(directory,
                                   fileName.split('.')[0] + '_aln.phy')
    else:
        outFileName = 'out_aln.phy'

    # NOTE: the interleaved "I" tag that PAML requires is deliberately not
    # added to the header here, because Biopython-based scripts cannot
    # open the alignment once it is present; pamlize.py can add it right
    # before running PAML.
    with open(outFileName, 'w') as outFile:
        AlignIO.write([dnaAlignment], outFile, "phylip")
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file.

    Every contig with enough reads is rebuilt as a padded alignment: the
    consensus first, then each read whose name does not contain "pseudo",
    trimmed with cut_ends() and padded with pad_read().  Consensus "*"
    columns are removed from all rows, the reads go through
    correct_sequence(), snp_positions() and best_snps(), and contigs whose
    haplotypes pass the group filters are written out as one marker each.

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the tab-separated haplotype report to write.
        out_bamova: path of the per-marker, per-group variant count file.
        win_len, step, stars: accepted for interface compatibility; they
            are not used by this function.
        coverage: contigs with ``len(contig.reads) - 1 < coverage`` are
            skipped; the value is also forwarded to best_snps().
        ngroups: minimum number of distinct groups (first three characters
            of the haplotype names) a contig must show.
        nhaplo: minimum number of haplotypes required in every group.

    Side effects:
        Writes ``fasta_output/<contig name>.fasta`` for each retained
        contig (open() does not create the directory), prints the name of
        each retained contig and a final summary line.
    """
    cutoff = 100000  # stop once more than this many contigs have been read
    marker_number = 0
    contig_counter = 0
    ntreated = 0
    with open(in_ace, 'r') as ace_file:
        with open(out_file, "w") as output_file:
            with open(out_bamova, "w") as bamova_file:
                output_file.write("Contig_nb\tWindow\tHaplotype\n")
                for contig in Ace.parse(ace_file):
                    contig_counter += 1
                    if len(contig.reads) - 1 < coverage:
                        continue
                    ntreated += 1

                    align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                    align.add_sequence(contig.name, contig.sequence)
                    for readn, read in enumerate(contig.reads):
                        if "pseudo" in read.rd.name:
                            continue
                        # Keep only the part of the read that lies inside
                        # BOTH the quality and the alignment clipping
                        # ranges.  The original end test compared clipe2
                        # with itself, so the alignment clipping end was
                        # never applied.
                        clipst = max(read.qa.qual_clipping_start,
                                     read.qa.align_clipping_start)
                        clipe = min(read.qa.qual_clipping_end,
                                    read.qa.align_clipping_end)
                        start = contig.af[readn].padded_start
                        seq = cut_ends(read.rd.sequence, clipst, clipe)
                        seq = pad_read(seq, start, len(contig.sequence))
                        align.add_sequence(read.rd.name, seq)

                    sequences = read_fasta(align.format("fasta"))
                    sequences = [[s[0].replace(">", ""), s[1]]
                                 for s in sequences]
                    # Drop every consensus "*" column from all rows.  The
                    # positions are handled in the reverse of the order
                    # multi_find() returns them.
                    concensus = sequences[0][1]
                    for p in multi_find("*", concensus)[::-1]:
                        sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                     for s in sequences]
                    concensus = sequences[0][1]
                    sequences = [[s[0], correct_sequence(concensus, s[1])]
                                 for s in sequences[1:]]
                    sequences, snp_pos = snp_positions(sequences)
                    haplotypes = best_snps(sequences, snp_pos, coverage)

                    if haplotypes != "Empty":
                        found = haplotypes[-1]
                        variants = sorted(set(h[-1] for h in found))
                        groups = sorted(set(h[0][:3] for h in found))
                        # A contig passes when it shows enough groups and
                        # every group has at least nhaplo haplotypes.
                        pass_haplo = len(groups) >= ngroups and all(
                            len([h for h in found if h[0].startswith(g)])
                            >= nhaplo
                            for g in groups)
                        if pass_haplo:
                            print(contig.name)
                            marker = "Marker" + str(marker_number)
                            bamova_file.write(marker + "\n")
                            for group_number, g in enumerate(groups):
                                bamova_file.write(
                                    "Population\t" + str(group_number))
                                for v in variants:
                                    n_variant = len(
                                        [h for h in found
                                         if h[-1] == v
                                         and h[0].startswith(g)])
                                    bamova_file.write("\t" + str(n_variant))
                                bamova_file.write("\n")
                            fasta_path = ("fasta_output/" + contig.name +
                                          ".fasta")
                            with open(fasta_path, "w") as f:
                                output_file.write(contig.name + "\n")
                                for h in found:
                                    f.write(">" + h[0] + str(marker_number) +
                                            "\n" + h[2] + "\n")
                                    # Rebase the values in h[1] so that the
                                    # first one becomes 1.
                                    first = h[1][0]
                                    h[1] = [x - first + 1 for x in h[1]]
                                    output_file.write(
                                        marker + "\t" +
                                        "\t".join([str(x) for x in h]) +
                                        "\t" + ":".join(variants) + "\n")
                            marker_number += 1

                    output_file.flush()
                    bamova_file.flush()
                    if contig_counter > cutoff:
                        break
    print("\n" + str(ntreated) + " contigs out of " + str(contig_counter) +
          " were treated")
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.

    For every contig of the ACE file that has an entry in ``snp_dict``,
    counts, per position and per tag, which character each aligned row
    shows at that position, and writes one line per (position, tag).

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the tab-separated count table to write.
        snp_dict: maps a contig name ("Contig_<number>") to its SNP
            positions; contigs without an entry are skipped.
        tags: substrings looked up in the row names; a row is counted
            under the first tag found in its name, else under "XX_noTag".
        win_len: window length; rows with more than
            ``(win_len - 1) / 2 - 3`` "-" characters in the window around
            the position are ignored.
        max_del: maximum number of window columns where exactly one of
            reference and row has "*" while the other has a letter.
        stars: when equal to True, positions are first converted with
            correct_position() against the reference row.
    """
    nucleotides = "ACGTN*-"
    win_buffer = (win_len - 1) / 2
    with open(in_ace, 'r') as ace_file:
        with open(out_file, "w") as output_file:
            output_file.write(
                "Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
            # Iterating the parser directly ends on exhaustion only; the
            # former bare "except:" around .next() also treated parse
            # errors as the end of the file.
            for contig in Ace.parse(ace_file):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    # Keep only the part of the read inside BOTH the
                    # quality and the alignment clipping ranges.  The
                    # original end test compared clipe2 with itself, so
                    # the alignment clipping end was never applied.
                    clipst = max(read.qa.qual_clipping_start,
                                 read.qa.align_clipping_start)
                    clipe = min(read.qa.qual_clipping_end,
                                read.qa.align_clipping_end)
                    start = contig.af[readn].padded_start
                    seq = cut_ends(read.rd.sequence, clipst, clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    align.add_sequence(read.rd.name, seq)

                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating " + contig_name)
                try:
                    positions = snp_dict[contig_name]
                except KeyError:
                    continue  # no SNP listed for this contig

                reference = sequences[0][1]
                d = {}
                for pos in positions:
                    # Kept as an equality test so that non-boolean truthy
                    # values of `stars` behave exactly as before.
                    if stars == True:
                        pos_ok = correct_position(pos, reference)
                    else:
                        pos_ok = pos
                    left = max(pos_ok - 5, 0)
                    # takes into account the middle nucleotide
                    right = pos_ok + 1 + 5
                    ref_window = reference[left:right]

                    # Make sure every tag has a zeroed counter per symbol.
                    site = d.setdefault(pos, {})
                    site.setdefault("XX_noTag", {})
                    for tag in tags:
                        site.setdefault(tag, {})
                    for tag_counts in site.values():
                        for nuc in nucleotides:
                            tag_counts.setdefault(nuc, 0)

                    for fasta in sequences:
                        window = fasta[1][left:right]
                        if window.count("-") > win_buffer - 3:
                            # Need at least 3 nucleotides on each side
                            continue
                        for tag in tags:
                            if tag in fasta[0]:
                                t = tag
                                break
                        else:
                            t = "XX_noTag"
                        del_count = 0
                        if len(ref_window) == len(window):
                            for ref_nuc, nuc in zip(ref_window, window):
                                if (ref_nuc.isalpha() and nuc == "*") or \
                                        (nuc.isalpha() and ref_nuc == "*"):
                                    del_count += 1
                        if del_count > max_del:
                            continue
                        n = fasta[1][pos_ok - 1].upper()
                        site[t][n] += 1

                for p in sorted(d):
                    for t in sorted(d[p]):
                        output_file.write(
                            contig_name + "\t" + str(p) + "\t" + str(t))
                        for n in nucleotides:
                            output_file.write("\t" + str(d[p][t][n]))
                        output_file.write("\n")
    print("***All contigs treated***")
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    For each contig of the ACE file, every read row is compared with the
    consensus row in consecutive windows of ``window_len`` columns.  A
    read's index is its number of counted differences divided by its
    compared length; the mean over the reads with at least one difference
    is written as ``<contig name>\\t<mean>`` ("NA" when no read
    qualifies).

    Args:
        in_ace: path of the ACE file to read.
        out_file: path of the two-column, tab-separated output file.
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER
    with open(in_ace, 'r') as ace_file:
        with open(out_file, "w") as output_file:
            # Iterating the parser directly ends on exhaustion only; the
            # former bare "except:" around .next() also treated parse
            # errors as the end of the file.
            for contig in Ace.parse(ace_file):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                for readn, read in enumerate(contig.reads):
                    if "pseudo" in read.rd.name:
                        continue
                    clipst = read.qa.qual_clipping_start
                    clipe = read.qa.qual_clipping_end
                    start = contig.af[readn].padded_start
                    seq = cut_ends(read.rd.sequence, clipst, clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    align.add_sequence(read.rd.name, seq)

                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating " + contig_name)

                reference = sequences[0][1]
                len_contig = len(reference)
                number_indexes = 0
                total_indexes = 0
                for seq in sequences[1:]:
                    read_seq = seq[1]
                    # Skip the leading run of "-" padding.
                    start = len(read_seq) - len(read_seq.lstrip("-"))
                    len_seq = 0
                    count = 0
                    for window in range(start, len_contig, window_len):
                        nuc_contig = reference[window:window + window_len]
                        nuc_seq = read_seq[window:window + window_len]
                        if "-" in nuc_seq:
                            len_seq += len(nuc_seq.replace("-", ""))
                        else:
                            diff = count_diff(nuc_contig, nuc_seq, max_diff)
                            # NOTE(review): windows for which count_diff()
                            # sets its second value add neither differences
                            # nor length - confirm against count_diff().
                            if diff[1] == False:
                                count += diff[0]
                                len_seq += window_len
                    # Count the "*" in the sequence string.  The original
                    # called .count() on the [name, sequence] pair, which
                    # never subtracted anything.
                    len_seq -= read_seq.count("*")
                    if len_seq >= min_len_seq and count > 0:
                        number_indexes += 1
                        total_indexes += float(count) / len_seq

                if number_indexes:
                    mean_index = float(total_indexes) / number_indexes
                else:
                    mean_index = "NA"
                output_file.write(contig_name + "\t" + str(mean_index) + "\n")
    print("***All contigs treated***")
# SeqRecord arguments:
#   seq: Seq object, required
# Additional attributes:
#   name, description: name and more info of sequence
#   dbxrefs: list of strings, each string an id of a DB
#   features: list of SeqFeature objects, those found in Genbank records
#   annotations: dictionary with further info, can't be set on
#                initialization
seqrec = SeqRecord(
    Seq('mdstnvrsgmksrkkkpkttvidddddcmtcsacqsklvkisditkvsldyintmrgntlacaacgsslkllndfas',
        Bio.Alphabet.generic_protein),
    id='P20994.1',
    name='P20994',
    description='Protein A19',
    dbxrefs=['Pfam:PF05077', 'InterPro:IPR007769', 'DIP:2186N'],
)
seqrec.annotations['note'] = 'A simple note'
print(seqrec)

# Sequence alignment data type: it stores an alignment, it does not
# compute one.
from Bio.Align.Generic import Alignment

seq1 = 'MHQAIFIYQIGYPLKSGYIQSIRSPEYDNW'
seq2 = 'MH--IFIYQIGYALKSGYIQSIRSPEY-NW'
# Instance of the Alignment class, then one row per sequence.
align = Alignment(Bio.Alphabet.Gapped(IUPAC.protein))
align.add_sequence('asp', seq1)
align.add_sequence('unk', seq2)
print(align)

# Alignment methods
# get_all_seqs: return all sequences in the alignment as a list of
# SeqRecord (iterating over "align" itself gives the same records)
for s in align.get_all_seqs():
    print('-> ' + str(s.seq))

# get_seq_by_num(n): return only the selected sequence by index
print(str(align.get_seq_by_num(1)))  # Seq object
print(align[0])  # SeqRecord object
print(str(align[0].seq))

# get_alignment_length(): get length of alignment
print(align.get_alignment_length())

# get_column(n): return a string with all the letters in the n column
print(align.get_column(0))
print(align.get_column(2))
# Gap-aware consensus of the current summary, then the matrix built
# against it.
consensus = summary.gap_consensus(ambiguous="N")
print(consensus)
print("")
print(summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                        axis_seq=consensus))
print("")

# Have a generic alphabet, without a declared gap char, so must
# provide the frequencies and chars to ignore explicitly.
print(summary.information_content(e_freq_table=expected,
                                  chars_to_ignore=['-']))
print("")

print("Trying a protein sequence with gaps and stops")

# Protein alphabet that declares both the gap ("-") and the stop ("*")
# characters.
alpha = Alphabet.HasStopCodon(
    Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
a = Alignment(alpha)
a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
print(a)
print("=" * a.get_alignment_length())

s = SummaryInfo(a)
c = s.dumb_consensus(ambiguous="X")
print(c)
c = s.gap_consensus(ambiguous="X")
print(c)
print("")
# Gaps and stops are ignored for both summaries below.
print(s.pos_specific_score_matrix(chars_to_ignore=['-', '*'],
                                  axis_seq=c))
print(s.information_content(chars_to_ignore=['-', '*']))
from Bio import Clustalw
from Bio.Align import AlignInfo
from Bio import AlignIO
from Bio.SubsMat import FreqTable
from Bio.Align.Generic import Alignment

# Very simple tests on an empty alignment: no columns, no rows.
alignment = Alignment(Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert len(alignment) == 0
del alignment

# Basic tests on simple three string alignment: the same 26 letters in
# mixed, lower and upper case.
alignment = Alignment(Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment.add_sequence("mixed", letters)
alignment.add_sequence("lower", letters.lower())
alignment.add_sequence("upper", letters.upper())

assert alignment.get_alignment_length() == 26
assert len(alignment) == 3

# Rows come back in insertion order...
assert alignment.get_seq_by_num(0).tostring() == letters
assert alignment.get_seq_by_num(1).tostring() == letters.lower()
assert alignment.get_seq_by_num(2).tostring() == letters.upper()

# ...and the name given to add_sequence() is used as the description.
assert alignment[0].description == "mixed"
assert alignment[1].description == "lower"
assert alignment[2].description == "upper"

# Each column is the letter as given, lower-cased, then upper-cased.
for (col, letter) in enumerate(letters):
    assert alignment.get_column(col) == "".join(
        [letter, letter.lower(), letter.upper()])

# Check row extractions:
def next(self):
    """Parse the next alignment from ``self.handle``.

    Reads one CLUSTAL-style alignment (header line, first block, then any
    further blocks) and returns it as an Alignment built with
    ``self.alphabet``.  If a concatenated alignment follows, its header
    line is stored in ``self._header`` for the next call.

    Returns:
        The Alignment, or None when the handle is exhausted or the header
        is not followed by any sequence data.

    Raises:
        ValueError: unknown or blank header line, unparsable sequence
            line, identifiers out of order, inconsistent letter counts or
            sequence lengths, or a record count different from
            ``self.records_per_alignment``.
    """
    handle = self.handle
    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    # Whitelisted headers we know about
    known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
    header_words = line.split()
    # A whitespace-only line has no first word; report it like any other
    # unknown header instead of failing with IndexError.
    first_word = header_words[0] if header_words else ""
    if first_word not in known_headers:
        raise ValueError("%s is not a known CLUSTAL header: %s" %
                         (first_word, ", ".join(known_headers)))

    # find the clustal version in the header line
    version = None
    for word in header_words:
        if word[0] == '(' and word[-1] == ')':
            word = word[1:-1]
        # "()" leaves an empty word, so guard before indexing it.
        if word and word[0] in '0123456789':
            version = word
            break

    # There should be two blank lines after the header line.  readline()
    # returns "" forever at end of file, so stop on that as well as on the
    # first non-blank line.
    line = handle.readline()
    while line and line.strip() == "":
        line = handle.readline()
    if not line:
        # Header with no sequence data: treated like the empty-alignment
        # case at the end of this method.
        return None

    # If the alignment contains entries with the same sequence
    # identifier (not a good idea - but seems possible), then this
    # dictionary based parser will merge their sequences.  Fix this?
    ids = []
    seqs = []
    consensus = ""
    seq_cols = None  # Used to extract the consensus

    # Use the first block to get the sequence identifiers
    while True:
        if line[0] != " " and line.strip() != "":
            # Sequences identifier...
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            # Record the sequence position to get the consensus
            if seq_cols is None:
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end
            assert fields[1] == line[seq_cols]

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)
        elif line[0] == " ":
            # Sequence consensus line...
            assert len(ids) == len(seqs)
            assert len(ids) > 0
            assert seq_cols is not None
            consensus = line[seq_cols]
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Check for blank line (or end of file)
            line = handle.readline()
            assert line.strip() == ""
            break
        else:
            # No consensus
            break
        line = handle.readline()
        if not line:
            break  # end of file

    assert line.strip() == ""
    assert seq_cols is not None

    # Confirm all same length
    for s in seqs:
        assert len(s) == len(seqs[0])
    if consensus:
        assert len(consensus) == len(seqs[0])

    # Loop over any remaining blocks...
    done = False
    while not done:
        # There should be a blank line between each block.
        # Also want to ignore any consensus line from the
        # previous block.
        while (not line) or line.strip() == "":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if line.split(None, 1)[0] in known_headers:
            # Found concatenated alignment.
            done = True
            self._header = line
            break

        for i in range(len(ids)):
            assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
            fields = line.rstrip().split()

            # We expect there to be two fields, there can be an optional
            # "sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % repr(line))

            if fields[0] != ids[i]:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'"
                    % (fields[0], ids[i]))

            if fields[1] != line[seq_cols]:
                # The last block may be narrower; the sequence must still
                # start in the same column.
                start = len(fields[0]) + line[len(fields[0]):].find(
                    fields[1])
                assert start == seq_cols.start, \
                    'Old location %s -> %i:XX' % (seq_cols, start)
                end = start + len(fields[1])
                seq_cols = slice(start, end)
                del start, end

            # Append the sequence
            seqs[i] += fields[1]
            assert len(seqs[i]) == len(seqs[0])

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)

            # Read in the next line
            line = handle.readline()
        # There should now be a consensus line
        if consensus:
            assert line[0] == " "
            assert seq_cols is not None
            consensus += line[seq_cols]
            assert len(consensus) == len(seqs[0])
            assert not line[:seq_cols.start].strip()
            assert not line[seq_cols.stop:].strip()
            # Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        return None

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    alignment_length = len(seqs[0])
    for i in range(len(ids)):
        if len(seqs[i]) != alignment_length:
            raise ValueError(
                "Error parsing alignment - sequences of different length?")
        alignment.add_sequence(ids[i], seqs[i])
    # TODO - Handle alignment annotation better, for now
    # mimic the old parser in Bio.Clustalw
    if version:
        alignment._version = version
    if consensus:
        assert len(consensus) == alignment_length, \
            "Alignment length is %i, consensus length is %i, '%s'" \
            % (alignment_length, len(consensus), consensus)
        alignment._star_info = consensus
    return alignment
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows, or None when
    the end of the alignments (a line starting "#-") or the end of the
    file is reached.

    The parsed tag sections are kept: the query header and alignment tags
    go into ``alignment._annotations``, and each record's ``annotations``
    receives "original_length" plus every tag of its own section.

    Raises:
        ValueError: a line does not have the expected ">>" / ">" shape, or
            "sw_overlap" disagrees with the extracted query length.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith('#-'):
        # Reached the end of the alignments, no need to read the footer...
        return None
    if line.startswith("##"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
        assert line.startswith('#'), line
        # Stop at end of file too: readline() keeps returning "" there.
        while line and not line.startswith('#='):
            line = handle.readline()
    if line.startswith('#='):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header
        line = self._parse_query_header(line)
    if not line:
        # End of file
        return None

    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    # This should be followed by the target match numbering line, then
    # more tags.  e.g.
    #
    #   >>#2
    #   ; sw_score: 41.0
    #   ; sw_ident: 0.846
    #   ; sw_overlap: 13
    if not line.startswith(">>") and not line.startswith(">>>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split()[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned query sequence (with leading flanking
    # region)
    while not line.startswith(">"):
        if not line:
            # End of file inside the query sequence: give up like the
            # match loop below does, instead of looping forever.
            return None
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, e.g.
    #
    #   >gi|152973545|ref|YP_001338596.1| ..
    #   ; sq_len: 242
    #   ; sq_type: p
    #   ; al_start: 52
    #   ; al_stop: 94
    #   ; al_display_start: 22
    #   IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #   RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #   QDFAFTRKMRREARQVEQSW
    #
    # Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError(
            "Expected line starting '>' and ending '..', got '%s'"
            % repr(line))
    match_descr = line[1:].split()[0] + match_descr

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned match sequence with flanking region...
    while not (line.startswith(">") or ">>>" in line) \
            and not line.startswith('#'):
        match_seq_parts.append(line.strip())
        line = handle.readline()
        if not line:
            # End of file
            return None
    if line.startswith('>') or '>>>' in line:
        # Start of the next alignment: keep it for the next call.
        self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq,
                                                     query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq,
                                                     match_annotation)

    # The "sq_offset" values can be specified with the -X command line
    # option.  They appear to just shift the origin used in the
    # calculation of the coordinates.  Until some point in the v35 series,
    # FASTA always recorded one for the query offset, and omitted the
    # match offset.
    # TODO - Work out how exactly the use of -X offsets changes things.
    #
    # The two extracted sequences are deliberately NOT required to have
    # the same length: that check is not useful when using stretcher.

    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError(
                "Specified sw_overlap = %s does not match expected value %i"
                % (alignment_annotation["sw_overlap"],
                   len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property.  Want to
    # record both the query header tags, and the alignment tags.
    alignment._annotations = {}
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr \
        or record.description == self._query_descr
    assert record.seq.tostring() == query_align_seq
    record.id = self._query_descr.split()[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    # Local addition: keep every query tag on the record.
    record.annotations.update(query_annotation)

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    assert record.seq.tostring() == match_align_seq
    record.id = match_descr.split()[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    # Local addition: keep every match tag on the record.  This used to
    # walk the QUERY keys while reading the match tags, so any tag present
    # only for the query (e.g. sq_offset) raised KeyError.
    record.annotations.update(match_annotation)

    return alignment
def next(self):
    """Parse and return the next alignment read from ``self.handle``.

    The first line must hold two integers: the number of sequences and
    the length of each sequence.  One line per sequence follows, whose
    first ten characters are taken as the name and the remainder as
    sequence.  Any further blocks hold sequence data only and are read
    until end of file or until a line accepted by ``self._is_header``.

    Returns the populated ``Alignment(self.alphabet)``, or None when
    there is no more input.

    Raises ValueError if the first line is not two integers, if the
    number of sequences disagrees with ``self.records_per_alignment``,
    if the file ends part way through a block, or if a sequence does not
    have the declared length.
    """
    stream = self.handle

    try:
        # Header line read ahead while finishing the previous alignment.
        current = self._header
        del self._header
    except AttributeError:
        current = stream.readline()
    if not current:
        return None

    current = current.strip()
    # split() without arguments never produces empty fields.
    fields = current.split()
    if len(fields) != 2:
        raise ValueError("First line should have two integers")
    try:
        expected_count = int(fields[0])
        expected_length = int(fields[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(current)

    if (self.records_per_alignment is not None
            and self.records_per_alignment != expected_count):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (expected_count, self.records_per_alignment))

    # First block: STRICT truncation/padding of the name to 10 characters,
    # with no white space required between name and sequence.
    first_block = [stream.readline().rstrip() for _ in range(expected_count)]
    names = [row[:10].strip() for row in first_block]
    pieces_per_seq = [[row[10:].strip().replace(" ", "")]
                      for row in first_block]

    # Any further blocks carry sequence data only.
    current = ""
    while True:
        # Skip any blank lines between blocks...
        while not current.strip():
            current = stream.readline()
            if not current:
                break  # end of file
        if not current:
            break  # end of file

        if self._is_header(current):
            # Looks like the start of a concatenated alignment; keep the
            # line for the next call.
            self._header = current
            break

        for position, pieces in enumerate(pieces_per_seq, 1):
            pieces.append(current.strip().replace(" ", ""))
            current = stream.readline()
            if not current and position < expected_count:
                raise ValueError("End of file mid-block")
        if not current:
            break  # end of file

    alignment = Alignment(self.alphabet)
    for position, (name, pieces) in enumerate(zip(names, pieces_per_seq), 1):
        sequence = "".join(pieces)
        if len(sequence) != expected_length:
            raise ValueError("Sequence %i length %i, expected length %i"
                             % (position, len(sequence), expected_length))
        alignment.add_sequence(name, sequence)

        # Force id, name and description of the new record to the parsed name.
        record = alignment.get_all_seqs()[-1]
        assert name == record.id or name == record.description
        record.id = name
        record.name = name
        record.description = name
    return alignment
def next(self):
    """Parse and return the next alignment read from ``self.handle``.

    The first line must hold two integers: the number of sequences and
    the length of each sequence.  One line per sequence follows, whose
    first ten characters are taken as the name and the remainder as
    sequence.  Any further blocks hold sequence data only and are read
    until end of file or until a line accepted by ``self._is_header``.

    Returns the populated ``Alignment(self.alphabet)``, or None when
    there is no more input.

    Raises ValueError if the first line is not two integers, if the
    number of sequences disagrees with ``self.records_per_alignment``,
    if the file ends part way through a block, or if a sequence does not
    have the declared length.
    """
    stream = self.handle

    try:
        # Header line read ahead while finishing the previous alignment.
        current = self._header
        del self._header
    except AttributeError:
        current = stream.readline()
    if not current:
        return None

    current = current.strip()
    # split() without arguments never produces empty fields.
    fields = current.split()
    if len(fields) != 2:
        raise ValueError("First line should have two integers")
    try:
        expected_count = int(fields[0])
        expected_length = int(fields[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(current)

    if (self.records_per_alignment is not None
            and self.records_per_alignment != expected_count):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (expected_count, self.records_per_alignment))

    # First block: STRICT truncation/padding of the name to 10 characters,
    # with no white space required between name and sequence.
    first_block = [stream.readline().rstrip() for _ in range(expected_count)]
    names = [row[:10].strip() for row in first_block]
    pieces_per_seq = [[row[10:].strip().replace(" ", "")]
                      for row in first_block]

    # Any further blocks carry sequence data only.
    current = ""
    while True:
        # Skip any blank lines between blocks...
        while not current.strip():
            current = stream.readline()
            if not current:
                break  # end of file
        if not current:
            break  # end of file

        if self._is_header(current):
            # Looks like the start of a concatenated alignment; keep the
            # line for the next call.
            self._header = current
            break

        for position, pieces in enumerate(pieces_per_seq, 1):
            pieces.append(current.strip().replace(" ", ""))
            current = stream.readline()
            if not current and position < expected_count:
                raise ValueError("End of file mid-block")
        if not current:
            break  # end of file

    alignment = Alignment(self.alphabet)
    for position, (name, pieces) in enumerate(zip(names, pieces_per_seq), 1):
        sequence = "".join(pieces)
        if len(sequence) != expected_length:
            raise ValueError("Sequence %i length %i, expected length %i"
                             % (position, len(sequence), expected_length))
        alignment.add_sequence(name, sequence)

        # Force id, name and description of the new record to the parsed name.
        record = alignment.get_all_seqs()[-1]
        assert name == record.id or name == record.description
        record.id = name
        record.name = name
        record.description = name
    return alignment
def pairwise(in_ace, out_file, window_len=8, max_diff=3, min_len_seq=100):
    """Calculate pairwise differentiation indexes.

    For every contig parsed from the ACE file ``in_ace`` the consensus and
    its reads (reads whose name contains "pseudo" are left out) are put in
    one alignment.  Each read is then compared with the consensus window by
    window, and one line ``<contig name>\\t<mean index>`` is written to
    ``out_file``.  The mean is taken over the reads that are long enough
    and show at least one difference; it is "NA" when there is no such read.

    Parameters:
     - in_ace       path of the ACE file to read
     - out_file     path of the tab separated file to write
     - window_len   number of alignment columns compared at a time
     - max_diff     passed to ``count_diff`` as its third argument
     - min_len_seq  minimum compared length for a read to be counted

    Errors raised while parsing the ACE file propagate to the caller
    instead of being reported as a normal end of input.
    """
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                # The consensus is always the first row of the alignment.
                align.add_sequence(contig.name, contig.sequence)
                consensus_len = len(contig.sequence)
                for read, location in zip(contig.reads, contig.af):
                    seq = cut_ends(read.rd.sequence,
                                   read.qa.qual_clipping_start,
                                   read.qa.qual_clipping_end)
                    seq = pad_read(seq, location.padded_start, consensus_len)
                    if "pseudo" not in read.rd.name:
                        align.add_sequence(read.rd.name, seq)

                sequences = read_fasta(align.format("fasta"))
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                print("Treating %s" % contig_name)

                reference = sequences[0][1]
                len_contig = len(reference)
                number_indexes = 0
                total_indexes = 0
                for seq in sequences[1:]:
                    read_seq = seq[1]
                    # Start comparing after the leading run of gaps, if any.
                    leading_gaps = re.match("-+", read_seq)
                    start = len(leading_gaps.group(0)) if leading_gaps else 0
                    len_seq = 0
                    count = 0
                    for window in range(start, len_contig, window_len):
                        nuc_contig = reference[window:window + window_len]
                        nuc_seq = read_seq[window:window + window_len]
                        if "-" in nuc_seq:
                            len_seq += len(nuc_seq.replace("-", ""))
                        else:
                            diff = count_diff(nuc_contig, nuc_seq, max_diff)
                            # Exact comparison kept on purpose: the return
                            # type of count_diff is not visible from here.
                            if diff[1] == False:
                                count += diff[0]
                                len_seq += window_len
                    # Count the "*" characters in the sequence string itself;
                    # counting on the (name, sequence) pair always gave 0.
                    # NOTE(review): confirm "*" should be discounted over the
                    # whole read rather than per compared window.
                    len_seq -= read_seq.count("*")
                    if len_seq >= min_len_seq and count > 0:
                        number_indexes += 1
                        total_indexes += float(count) / len_seq

                if number_indexes:
                    mean_index = float(total_indexes) / number_indexes
                else:
                    mean_index = "NA"
                output_file.write(contig_name + "\t" + str(mean_index) + "\n")
            print("***All contigs treated***")
# Show the consensus, then the position specific score matrix computed
# along it with gap characters left out.
print consensus
print
print summary.pos_specific_score_matrix(chars_to_ignore=['-'], axis_seq=consensus)
print
# We have a generic alphabet, without a declared gap character, so we must
# provide the expected frequencies and the characters to ignore explicitly.
print summary.information_content(e_freq_table=expected, chars_to_ignore=['-'])
print
print "Trying a protein sequence with gaps and stops"

# Protein alphabet declaring "-" as the gap and "*" as the stop symbol.
alpha = Alphabet.HasStopCodon(
    Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
a = Alignment(alpha)
a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
print a
# Ruler of "=" characters, one per alignment column.
print "=" * a.get_alignment_length()

s = SummaryInfo(a)
# Both consensus flavours, using "X" as the ambiguous symbol.
c = s.dumb_consensus(ambiguous="X")
print c
c = s.gap_consensus(ambiguous="X")
print c
print
# Gaps and stops are both left out of the statistics; the gap consensus is
# used as the axis of the score matrix.
print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)

print s.information_content(chars_to_ignore=['-', '*'])
def next(self):
    """Parse and return the next CLUSTAL alignment from ``self.handle``.

    The first block supplies the sequence identifiers; every later block
    must list the same identifiers in the same order.  Lines starting
    with a space (the consensus line) and blank lines are skipped.  A
    further "CLUSTAL" header found while reading blocks is saved in
    ``self._header`` for the following call.

    Returns the populated ``Alignment(self.alphabet)``, or None when
    there is no more input or no sequence data was found.

    Raises ValueError if the header is missing, a line cannot be parsed,
    an optional letter count is wrong, identifiers are out of order, the
    number of records disagrees with ``self.records_per_alignment``, or
    the sequences have different lengths.
    """
    handle = self.handle
    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None
    if line[:7] != 'CLUSTAL':
        raise ValueError("Did not find CLUSTAL header")

    # There should be two blank lines after the header line.  Also stop at
    # end of file: readline() keeps returning "" there, which would
    # otherwise look like one more blank line for ever.
    line = handle.readline()
    while line and line.strip() == "":
        line = handle.readline()

    # Entries sharing the same identifier (not a good idea - but seems
    # possible) are kept as separate rows, matched by position.
    ids = []
    seqs = []

    # Use the first block to get the sequence identifiers
    while line.strip() != "":
        if line[0] != " ":
            # Sequence identifier line.  We expect two fields; there can
            # be an optional third "sequence number" field containing the
            # letter count.
            fields = line.rstrip().split()
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            if len(fields) == 3:
                # This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)
        # else: sequence consensus line, nothing to record.
        line = handle.readline()
        if not line:
            break  # end of file

    assert line.strip() == ""

    # Loop over any remaining blocks...
    done = False
    while not done:
        # There should be a blank line between each block.  Also want to
        # ignore any consensus line from the previous block.
        while (not line) or line.strip() == "" or line[0] == " ":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        if not ids:
            # Without identifiers the loop below would not consume this
            # line and we would spin on it for ever.
            if line[:7] == 'CLUSTAL':
                self._header = line
                break
            raise ValueError("Could not parse line:\n%s" % line)

        for i in range(len(ids)):
            fields = line.rstrip().split()

            # Same layout as in the first block: identifier, sequence and
            # an optional letter count.
            if len(fields) < 2 or len(fields) > 3:
                if line[:7] == 'CLUSTAL':
                    # Found concatenated alignment.
                    done = True
                    self._header = line
                    break
                raise ValueError("Could not parse line:\n%s" % line)

            if fields[0] != ids[i]:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'"
                    % (fields[0], ids[i]))

            # Append the sequence
            seqs[i] += fields[1]

            if len(fields) == 3:
                # This MAY be an old style file with a letter count, which
                # is cumulative over the blocks read so far.
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)

            # Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        return None

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    alignment_length = len(seqs[0])
    for identifier, sequence in zip(ids, seqs):
        if len(sequence) != alignment_length:
            raise ValueError(
                "Error parsing alignment - sequences of different length?")
        alignment.add_sequence(identifier, sequence)
    return alignment
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows, or None when
    there is nothing further to parse.

    The tags of the query header and of the alignment are stored in
    ``alignment._annotations``; the tags of each sequence are copied into
    the ``annotations`` of its record, next to "original_length".

    Raises ValueError when an expected '>>' or '>' line is missing, when
    the file ends inside the query sequence, or when "sw_overlap" does
    not equal the length of the extracted query alignment.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith('#-'):
        # Reached the end of the alignments, no need to read the footer...
        return None
    if line.startswith("##"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    assert line.startswith('#'), line
    while not line.startswith('#='):
        line = handle.readline()
        if not line:
            # End of file without a further '#=' header: nothing left.
            return None

    # Moved onto the next query sequence!  Read in the query header.
    self._query_descr = ""
    self._query_header_annotation = {}
    line = self._parse_query_header(line)
    if not line:
        # End of file
        return None

    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    alignment_annotation = {}

    # This should be followed by the target match numbering line, then
    # more tags, e.g.
    #
    #   >>#2
    #   ; sw_score: 41.0
    #   ; sw_ident: 0.846
    #   ; sw_overlap: 13
    if not line.startswith(">>") or line.startswith(">>>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data.
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split()[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned query sequence (with leading flanking
    # region), which runs up to the '>' line of the match.
    while not line.startswith(">"):
        if not line:
            raise ValueError("Premature end of file in query sequence")
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, laid out like the query
    # section above.  First the match identifier:
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'"
                         % repr(line))
    # The '>>' line only carried a number here, so prefix it with the
    # identifier found on this line.
    match_descr = line[1:].split()[0] + match_descr

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    # Now should have the aligned match sequence with flanking region; it
    # ends at the next '>' / '>>>' / '#' line or at end of file.
    while line and not (line.startswith(">") or ">>>" in line
                        or line.startswith('#')):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line.startswith('>') or '>>>' in line:
        self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq,
                                                     query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq,
                                                     match_annotation)

    # TODO - Work out how exactly the use of -X ("sq_offset") offsets
    # changes things; they are currently accepted without any adjustment.
    # The two aligned sequences are deliberately not required to have the
    # same length (not useful when using stretcher).

    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i"
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property.  Want to
    # record both the query header tags, and the alignment tags.
    alignment._annotations = {}
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    rows = (
        (self._query_descr, query_align_seq, query_annotation, "query"),
        (match_descr, match_align_seq, match_annotation, "match"),
    )
    for descr, aligned, annotation, role in rows:
        alignment.add_sequence(descr, aligned)
        record = alignment.get_all_seqs()[-1]
        assert record.id == descr or record.description == descr
        assert record.seq.tostring() == aligned
        record.id = descr.split()[0].strip(",")
        record.name = role
        record.annotations["original_length"] = int(annotation["sq_len"])
        # Keep every tag of this sequence's own section on its record.
        record.annotations.update(annotation)

    return alignment
def next(self):
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows, or None at the
    end of the file or of the alignments (a line containing ">>><<<").

    Each record gets the annotation "original_length" (from "sq_len")
    and the private attributes ``_al_start`` / ``_al_stop`` taken from
    the "al_start" / "al_stop" tags of its own section.

    Raises ValueError when an expected '>>' or '>' line is missing, when
    the two extracted sequences differ in length, or when "sw_overlap"
    does not equal that length.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith("#"):
        # Skip the file header before the alignments.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        # Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        # Read in the query header.  Now should be some alignments, but
        # if not we move onto the next query.
        line = self._parse_query_header(line)
    if not line:
        # End of file
        return None
    if ">>><<<" in line:
        # Reached the end of the alignments, no need to read the footer...
        return None

    # Should start >>... and not >>>...
    assert line[0:2] == ">>" and not line[2] == ">", line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    alignment_annotation = {}

    # This should be followed by the target match ID line, then more tags,
    # e.g.
    #
    #   >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [...]
    #   ; fa_frame: f
    #   ; fa_initn:  52
    #   ; fa_init1:  52
    #   ; fa_opt:  70
    #   ; fa_z-score: 105.5
    #   ; fa_bits: 27.5
    #   ; fa_expect:  0.082
    #   ; sw_score: 70
    #   ; sw_ident: 0.279
    #   ; sw_sim: 0.651
    #   ; sw_overlap: 43
    if (not line[0:2] == ">>") or line[0:3] == ">>>":
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()
    # Handle the following "alignment hit" tagged data.
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line[0:2] == "; "

    # Then we have the alignment numbers and sequence for the query, e.g.
    #
    #   >gi|10955265| ..
    #   ; sq_len: 346
    #   ; sq_offset: 1
    #   ; sq_type: p
    #   ; al_start: 197
    #   ; al_stop: 238
    #   ; al_display_start: 167
    #   DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #   QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #   GEYFTENKPKYIIREIHQET
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split(None, 1)[0])

    # Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned query sequence (with leading flanking
    # region).  Indexing line[0] makes an unexpected end of file fail
    # loudly instead of looping on empty reads.
    while not line[0] == ">":
        query_seq_parts.append(line.strip())
        line = handle.readline()

    # Handle the following "match alignment" data, laid out like the query
    # section above.  First the match identifier:
    if not (line[0] == ">" and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'"
                         % repr(line))
    assert match_descr.startswith(line[1:].split(None, 1)[0])

    # Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line[0:2] == "; "

    # Now should have the aligned match sequence with flanking region...
    # but after that, since FASTA 35.4.1 there can be a consensus here:
    #
    #   ; al_cons:
    #   .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...:
    #   etc
    while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line[0:2] == "; ":
        assert line.strip() == "; al_cons:"
        # The consensus is not used at present, so its lines are only
        # skipped.  If we do anything with it in future, must remove any
        # flanking region.
        line = handle.readline()
        while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
            line = handle.readline()
        assert not line[0:2] == "; "
    assert (line[0] == ">" or ">>>" in line)
    self._header = line

    # We built a list of strings and then joined them because
    # its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    # Note, query_seq and match_seq will usually be of different lengths,
    # apparently because in the m10 format leading gaps are added but not
    # trailing gaps!

    # Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq,
                                                     query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq,
                                                     match_annotation)
    # How can we do this for the (optional) consensus?

    # The "sq_offset" values can be specified with the -X command line
    # option.  They appear to just shift the origin used in the
    # calculation of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError("Problem parsing the alignment sequence coordinates, "
                         "following should be the same length but are not:\n"
                         "%s - len %i\n%s - len %i"
                         % (query_align_seq, len(query_align_seq),
                            match_align_seq, len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i"
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    # TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = Alignment(alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property.  Want to
    # record both the query header tags, and the alignment tags.
    alignment._annotations = {}
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    # TODO - Once the alignment object gets an append method, use it.
    # (i.e. an add SeqRecord method)
    rows = (
        (self._query_descr, query_align_seq, query_annotation, "query"),
        (match_descr, match_align_seq, match_annotation, "match"),
    )
    for descr, aligned, annotation, role in rows:
        alignment.add_sequence(descr, aligned)
        record = alignment.get_all_seqs()[-1]
        assert record.id == descr or record.description == descr
        record.id = descr.split(None, 1)[0].strip(",")
        record.name = role
        record.annotations["original_length"] = int(annotation["sq_len"])
        # TODO - handle start/end coordinates properly. Short term hack
        # for now.  Each row uses the coordinates of its own section.
        record._al_start = int(annotation["al_start"])
        record._al_stop = int(annotation["al_stop"])

        # This is still a very crude way of dealing with the alphabet:
        # TODO - What if a specific alphabet has been requested?
        # TODO - Use an IUPAC alphabet?
        # TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in annotation:
            if annotation["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif annotation["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in aligned and not hasattr(record.seq.alphabet, "gap_char"):
            record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    return alignment
def next(self):
    """Parse and return the next Stockholm alignment from ``self.handle``.

    The identifiers, sequences, "#=GS" annotation and "#=GR" markup that
    were read are also stored on the instance as ``ids``, ``sequences``,
    ``seq_annotation`` and ``seq_col_annotation``.  "#=GF" lines are
    parsed but not kept, and "#=GC" lines are ignored.

    Returns the populated ``Alignment(self.alphabet)``, or None when the
    handle is exhausted or no sequence line was found.

    Raises ValueError if the "# STOCKHOLM 1.0" header is missing, a
    sequence line cannot be split into identifier and sequence, the
    number of records disagrees with ``self.records_per_alignment``, or
    the sequences differ in length.
    """
    try:
        # Header line read ahead while finishing the previous alignment.
        header = self._header
        del self._header
    except AttributeError:
        header = self.handle.readline()
    if not header:
        # Empty file - just give up.
        return None
    if header.strip() != "# STOCKHOLM 1.0":
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    sequences = {}
    names = []
    per_seq_notes = {}     # from "#=GS" lines
    per_seq_markup = {}    # from "#=GR" lines
    per_file_notes = {}    # from "#=GF" lines
    seen_terminator = False

    while True:
        raw = self.handle.readline()
        if not raw:
            break  # end of file
        entry = raw.strip()  # remove trailing \n
        if entry == "# STOCKHOLM 1.0":
            # Start of the next alignment; keep it for the next call.
            self._header = entry
            break
        if entry == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            seen_terminator = True
            continue
        if not entry:
            continue  # blank line, ignore

        if entry[0] != "#":
            # Sequence, format: "<seqname> <sequence>"
            assert not seen_terminator
            pieces = [chunk.strip() for chunk in entry.split(" ", 1)]
            if len(pieces) != 2:
                # This might be someone attempting to store a zero length
                # sequence?
                raise ValueError("Could not split line into identifier "
                                 + "and sequence:\n" + entry)
            name, residues = pieces
            if name not in names:
                names.append(name)
            # Interleaved files repeat the name; "." gaps become "-".
            sequences[name] = sequences.get(name, "") + residues.replace(".", "-")
            continue

        if len(entry) < 5:
            continue

        # Comment line or meta-data
        tag = entry[:5]
        payload = entry[5:].strip()
        if tag == "#=GF ":
            # Generic per-File annotation, free text
            # Format: #=GF <feature> <free text>
            # Each feature key could be used more than once,
            # so store the entries as a list of strings.
            feature, text = payload.split(None, 1)
            per_file_notes.setdefault(feature, []).append(text)
        elif tag == "#=GS ":
            # Generic per-Sequence annotation, free text
            # Format: "#=GS <seqname> <feature> <free text>"
            name, feature, text = payload.split(None, 2)
            per_seq_notes.setdefault(name, {}).setdefault(feature, []).append(text)
        elif tag == "#=GR ":
            # Generic per-Sequence AND per-Column markup
            # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
            # For interlaced sequences the GR data can be split over
            # multiple lines, so append to any previous entry.
            # TODO - Should we check the length matches the alignment length?
            name, feature, text = payload.split(None, 2)
            markup = per_seq_markup.setdefault(name, {})
            markup[feature] = markup.get(feature, "") + text.strip()
        # "#=GC " (per-Column annotation) and anything else is skipped.

    assert len(sequences) <= len(names)

    self.ids = names
    self.sequences = sequences
    self.seq_annotation = per_seq_notes
    self.seq_col_annotation = per_seq_markup

    if not (names and sequences):
        return None

    if (self.records_per_alignment is not None
            and self.records_per_alignment != len(names)):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(names), self.records_per_alignment)
        )

    alignment = Alignment(self.alphabet)

    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = per_seq_markup

    # Every sequence must match this length, so any one of them can
    # serve as the reference.
    alignment_length = len(sequences[names[0]])
    for name in names:
        residues = sequences[name]
        if alignment_length != len(residues):
            raise ValueError("Sequences have different lengths, or repeated identifier")
        short_name, start, end = self._identifier_split(name)
        alignment.add_sequence(name, residues, start=start, end=end)

        record = alignment.get_all_seqs()[-1]
        assert record.id == name or record.description == name

        record.id = name
        record.name = short_name
        record.description = name

        # will be overridden by _populate_meta_data if an explicit
        # accession is provided:
        record.annotations["accession"] = short_name

        self._populate_meta_data(name, record)
    return alignment
def next(self):
    """Parse and return the next EMBOSS pairwise alignment.

    Input is read from ``self.handle``.  Lines are skipped up to the
    "#=======" header, from which the "Aligned_sequences" count, the
    identifiers that follow it and the "Length" are taken.  The sequence
    lines are then read until the next "#-------" or "#=======" line,
    which is saved in ``self._header`` for the following call.

    Returns the populated ``Alignment(self.alphabet)``, or None when no
    further alignment header is found.

    Raises ValueError if the header lacks the number or length of the
    sequences (including a file that ends inside the header), if the
    record count disagrees with ``self.records_per_alignment``, if a
    line of the alignment body is not recognised, or if a sequence does
    not have the declared length.  Coordinate consistency is checked
    with assertions.
    """
    handle = self.handle

    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    while line.rstrip() != "#=======================================":
        line = handle.readline()
        if not line:
            return None

    length_of_seqs = None
    number_of_seqs = None
    ids = []

    # Read in the rest of this alignment header, try and discover the
    # number of records expected and their length.  startswith() also
    # ends the loop at end of file, where the line is empty.
    while line.startswith("#"):
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers, numbered from 1.
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        if key == "length":
            length_of_seqs = int(parts[1].strip())

        # And read in another line...
        line = handle.readline()

    if number_of_seqs is None:
        raise ValueError("Number of sequences missing!")
    if length_of_seqs is None:
        raise ValueError("Length of sequences missing!")

    if self.records_per_alignment is not None \
            and self.records_per_alignment != number_of_seqs:
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (number_of_seqs, self.records_per_alignment))

    seqs = [""] * len(ids)
    index = 0  # which sequence the next sequence line belongs to

    # Parse the seqs
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                # identifier, seq start position, seq, seq end position
                # (an aligned seq is broken up into multiple lines)
                name, start = id_start
                seq, end = seq_end

                assert 0 <= index < number_of_seqs, \
                    "Expected index %i in range [0,%i)" \
                    % (index, number_of_seqs)
                # The identifier is truncated...
                assert ids[index].startswith(name)

                start = int(start)
                letters_so_far = len(seqs[index].replace("-", ""))
                new_letters = len(seq.replace("-", ""))

                # Check the start...
                if start == 0:
                    # Special case when one sequence starts long before
                    # the other
                    assert letters_so_far == 0
                    assert new_letters == 0, line
                elif start == letters_so_far:
                    # Special case when one sequence ends long before
                    # the other
                    assert new_letters == 0, line
                else:
                    assert start - 1 == letters_so_far, \
                        "Found %i chars so far for sequence %i (%s), file says start %i:\n%s" \
                        % (letters_so_far, index, name, start, seqs[index])

                seqs[index] += seq
                letters_so_far += new_letters

                # Check the end; the message reports the same ungapped
                # count that is being compared.
                assert int(end) == letters_so_far, \
                    "Found %i chars so far for %s, file says end %i:\n%s" \
                    % (letters_so_far, name, int(end), repr(seqs[index]))

                index += 1
                if index >= number_of_seqs:
                    index = 0
            # else: just a start value, this is just alignment
            # annotation (?)
        elif line.strip() == "":
            # Just a spacer?
            pass
        else:
            raise ValueError("Unrecognised EMBOSS pairwise line: %r\n" % line)

        line = handle.readline()
        if line.rstrip() in ("#---------------------------------------",
                             "#======================================="):
            # End of alignment
            self._header = line
            break

    assert index == 0

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError("Found %i records in this alignment, told to expect %i"
                         % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    for name, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            # EMBOSS 2.9.0 is known to use spaces instead of minus signs
            # for leading gaps, and thus fails to parse.  This old version
            # is still used as of Dec 2008 behind the EBI SOAP webservice:
            # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
            raise ValueError("Error parsing alignment - sequences of "
                             "different length? You could be using an "
                             "old version of EMBOSS.")
        alignment.add_sequence(name, seq)
    return alignment
def next(self):
    """Return the next pairwise alignment read from ``self.handle``.

    The parser first locates a ``#=======...`` header line, then reads
    the ``Aligned_sequences`` and ``Length`` fields from the ``#`` header
    lines, and finally collects the sequence lines up to the next
    ``#-------...`` or ``#=======...`` line.  That terminating line is
    saved in ``self._header`` so the following call can resume from it.

    Returns:
        An ``Alignment`` created with ``self.alphabet`` containing one
        sequence per identifier from the header, or ``None`` if the
        handle is exhausted before an alignment header is found.

    Raises:
        ValueError: if the sequence count or length is missing from the
            header (including a file truncated inside the header), if
            the count does not match ``self.records_per_alignment``, or
            if a sequence does not have the declared length.
        AssertionError: if sequence lines are inconsistent with the
            header, or a line of an unrecognised form is found.
    """
    handle = self.handle
    start_of_header = "#======================================="
    end_of_alignment = "#---------------------------------------"

    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    while line.rstrip() != start_of_header:
        line = handle.readline()
        if not line:
            return None

    length_of_seqs = None
    number_of_seqs = None
    ids = []

    # Read in the rest of this alignment header, trying to discover the
    # number of records expected and their length.
    # readline() yields "" at end of file, so test with startswith():
    # line[0] would raise IndexError when the header is cut short.
    while line.startswith("#"):
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers...
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        if key == "length":
            length_of_seqs = int(parts[1].strip())
        # And read in another line...
        line = handle.readline()

    if number_of_seqs is None:
        raise ValueError("Number of sequences missing!")
    if length_of_seqs is None:
        raise ValueError("Length of sequences missing!")

    if self.records_per_alignment is not None \
            and self.records_per_alignment != number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (number_of_seqs, self.records_per_alignment))

    seqs = [""] * len(ids)
    index = 0

    # Parse the seqs
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                # identifier, seq start position, seq, seq end position
                # (an aligned seq is broken up into multiple lines)
                name, start = id_start
                seq, end = seq_end
                assert 0 <= index and index < number_of_seqs, \
                    "Expected index %i in range [0,%i)" \
                    % (index, number_of_seqs)
                # The identifier is truncated...
                assert ids[index].startswith(name)

                # Check the start...
                start = int(start)
                seen = len(seqs[index].replace("-", ""))
                added = len(seq.replace("-", ""))
                if start == 0:
                    # Special case when one sequence starts long
                    # before the other
                    assert seen == 0
                    assert added == 0, line
                elif start == seen:
                    # Special case when one sequence ends long
                    # before the other
                    assert added == 0, line
                else:
                    assert start - 1 == seen, \
                        "Found %i chars so far for sequence %i (%s), file says start %i:\n%s" \
                        % (seen, index, name, start, seqs[index])

                seqs[index] += seq

                # Check the end ...
                end = int(end)
                seen += added
                # The message reports the ungapped count because that is
                # what is compared with the end position.
                assert end == seen, \
                    "Found %i chars so far for %s, file says end %i:\n%s" \
                    % (seen, name, end, repr(seqs[index]))

                index += 1
                if index >= number_of_seqs:
                    index = 0
            # Otherwise: just a start value, this is just alignment
            # annotation (?) - nothing to record.
        elif line.strip() == "":
            # Just a spacer?
            pass
        else:
            # An explicit raise survives "python -O" (a bare
            # "assert False" would be stripped); the exception type is
            # left as AssertionError for existing callers.
            raise AssertionError(
                "Unrecognised EMBOSS pairwise line: %r" % line)

        line = handle.readline()
        if line.rstrip() in (end_of_alignment, start_of_header):
            # End of alignment; keep the marker for the next call.
            self._header = line
            break

    assert index == 0

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    for name, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            # EMBOSS 2.9.0 is known to use spaces instead of minus signs
            # for leading gaps, and thus fails to parse.  This old version
            # is still used as of Dec 2008 behind the EBI SOAP webservice:
            # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
            raise ValueError("Error parsing alignment - sequences of "
                             "different length? You could be using an "
                             "old version of EMBOSS.")
        alignment.add_sequence(name, seq)
    return alignment
def next(self):
    """Read the next pairwise alignment from ``self.handle``.

    Scans for a ``#=======...`` header line, reads the
    ``Aligned_sequences`` and ``Length`` header fields, then gathers the
    sequence lines until a ``#-------...`` or ``#=======...`` line is
    met.  That line is stored in ``self._header`` for the next call.

    Returns:
        An ``Alignment`` built from ``self.alphabet`` with one sequence
        per identifier named in the header, or ``None`` when the handle
        is exhausted before an alignment header is found.

    Raises:
        SyntaxError: if the header lacks the sequence count or length
            (including a file truncated inside the header), or if a
            parsed sequence does not have the declared length.
        ValueError: if the sequence count differs from
            ``self.records_per_alignment``.
        AssertionError: if sequence lines disagree with the header or a
            line of an unrecognised form is found.
    """
    # NOTE(review): SyntaxError is kept for the parse errors below so that
    # callers catching it keep working; ValueError would be the more
    # conventional choice - confirm before changing.
    handle = self.handle
    header_marker = "#======================================="
    footer_marker = "#---------------------------------------"

    try:
        # Header we saved from when we were parsing the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    while line.rstrip() != header_marker:
        line = handle.readline()
        if not line:
            return None

    length_of_seqs = None
    number_of_seqs = None
    ids = []

    # Read in the rest of this alignment header, and try to discover the
    # number of records expected and their length.
    # startswith() copes with the "" that readline() returns at end of
    # file, where line[0] would raise IndexError.
    while line.startswith("#"):
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers...
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        if key == "length":
            length_of_seqs = int(parts[1].strip())
        # And read in another line...
        line = handle.readline()

    if number_of_seqs is None:
        raise SyntaxError("Number of sequences missing!")
    if length_of_seqs is None:
        raise SyntaxError("Length of sequences missing!")

    if self.records_per_alignment is not None \
            and self.records_per_alignment != number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (number_of_seqs, self.records_per_alignment))

    seqs = [""] * len(ids)
    index = 0

    # Parse the seqs
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                # identifier, seq start position, seq, seq end position
                # (an aligned seq is broken up into multiple lines)
                seq_id, start = id_start
                seq, end = seq_end
                assert 0 <= index and index < number_of_seqs, \
                    "Expected index %i in range [0,%i)" \
                    % (index, number_of_seqs)
                # The identifier is truncated...
                assert ids[index].startswith(seq_id)

                # Check the start...
                # The messages report the ungapped count, which is the
                # value actually compared with the file's positions.
                start = int(start)
                letters_so_far = len(seqs[index].replace("-", ""))
                assert start - 1 == letters_so_far, \
                    "Found %i chars so far for %s, file says start %i:\n%s" \
                    % (letters_so_far, seq_id, start, seqs[index])

                seqs[index] += seq

                # Check the end ...
                end = int(end)
                letters_so_far = len(seqs[index].replace("-", ""))
                assert end == letters_so_far, \
                    "Found %i chars so far for %s, file says end %i:\n%s" \
                    % (letters_so_far, seq_id, end, seqs[index])

                index += 1
                if index >= number_of_seqs:
                    index = 0
            # Otherwise: just a start value, this is just alignment
            # annotation (?) - skip it.
        elif line.strip() == "":
            # Just a spacer?
            pass
        else:
            # Raised explicitly so the check is not removed by
            # "python -O" (unlike "assert False"); the exception type
            # stays AssertionError for existing callers.
            raise AssertionError(
                "Unrecognised EMBOSS pairwise line: %r" % line)

        line = handle.readline()
        if line.rstrip() in (footer_marker, header_marker):
            # End of alignment; remember the marker for the next call.
            self._header = line
            break

    assert index == 0

    if self.records_per_alignment is not None \
            and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            raise SyntaxError(
                "Error parsing alignment - sequences of different length?")
        alignment.add_sequence(seq_id, seq)
    return alignment