def add_gaps_to_align(organisms, missing, align, verbatim=False, genera=False, min_taxa=3):
    """Rebuild an alignment with shortened taxon names, padding absent taxa.

    Args:
        organisms: names of every taxon expected in the output.  A deep copy
            is taken, so the caller's list is never modified.
        missing: optional mapping of taxon name (or ``"<name>*"``) to the
            loci recorded as missing for it.  A falsy value disables the
            consistency check.
        align: iterable of alignments whose records expose ``name``/``seq``.
        verbatim: when true (and no ``genera`` match), the lower-cased record
            name is used as the taxon name.
        genera: optional collection of strings; when a non-empty member
            occurs in a record name only the last ``_``-separated field is
            kept as the taxon name.
        min_taxa: alignments with fewer records are rejected.

    Returns:
        The rebuilt ``Alignment`` for the last alignment processed, or
        ``None`` when an alignment has fewer than ``min_taxa`` records or
        ``align`` yields nothing.

    Raises:
        AssertionError: a padded taxon's locus is not listed in ``missing``.
        KeyError: ``missing`` has neither ``org`` nor ``"<org>*"``.
        ValueError: a derived taxon name is not in ``organisms``.
    """
    local_organisms = copy.deepcopy(organisms)
    # Defined up front so an empty ``align`` returns None instead of raising.
    new_align = None
    for a in align:
        if len(a) < min_taxa:
            new_align = None
            break
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        overall_length = len(a[0])
        for seq in a:
            if genera and any(sp for sp in genera if sp in seq.name):
                new_seq_name = '_'.join(seq.name.split('_')[-1:])
            elif not verbatim:
                new_seq_name = '_'.join(seq.name.split('_')[-2:])
            else:
                new_seq_name = seq.name.lower()
            new_align.add_sequence(new_seq_name, str(seq.seq))
            local_organisms.remove(new_seq_name)
        # The locus name is derived from the LAST record of the alignment
        # (the original relied on the leaked loop variable); it does not
        # depend on the organism, so compute it once.
        if genera and any(sp for sp in genera if sp in seq.name):
            loc = '_'.join(seq.name.split('_')[:-1])
        elif not verbatim:
            loc = '_'.join(seq.name.split('_')[:-2])
        else:
            loc = seq.name
        for org in local_organisms:
            if missing:
                # Look under the plain name first, then the starred name.
                try:
                    listed = loc in missing[org]
                except KeyError:
                    listed = False
                if not listed and loc not in missing['{}*'.format(org)]:
                    raise AssertionError("Locus missing")
            new_align.add_sequence(org, '?' * overall_length)
    return new_align
def main():
    """Rewrite nexus files with the leading name fields stripped.

    For every file returned by ``get_files(options.input)`` the sequences of
    all alignments are copied into one new alignment.  Each name is split on
    ``_`` and only the fields from ``options.position`` onwards are kept.
    Records whose name contains ``.copy`` are skipped.  The result is written
    under the same base name in ``options.output`` and the file index is
    printed.

    Raises:
        ValueError: the alignment could not be written; the message names
            the output file.
    """
    options, args = interface()
    files = get_files(options.input)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                if '.copy' in seq.name:
                    continue
                new_seq_name = '_'.join(
                    seq.name.split('_')[options.position:])
                new_align.add_sequence(new_seq_name, str(seq.seq))
        outf = os.path.join(options.output, os.path.split(f)[1])
        # Context manager closes the handle even when writing fails.
        with open(outf, 'w') as handle:
            try:
                AlignIO.write(new_align, handle, 'nexus')
            except ValueError as err:
                # Replaces a leftover pdb.set_trace() debugging hook.
                raise ValueError("Could not write %s: %s" % (outf, err))
        print(count)
def _domain_alignment(self, alignment, domain_region, alignment_index):
    """Cut the columns that span a domain out of an alignment.

    Residues (non-gap characters) of the record at ``alignment_index`` are
    counted from 1; the first column at which the count equals
    ``domain_region.start`` / ``domain_region.stop`` becomes the start / stop
    column.

    Returns:
        ``(domain_alignment, column_start, column_stop)`` where every record
        of ``domain_alignment`` is the slice ``[column_start:column_stop]``
        of the corresponding input record.

    Raises:
        AssertionError: the start or stop residue was never reached.
    """
    protein_seq = str(alignment[alignment_index].seq)

    # Figure out which columns encapsulate the domain.
    aa_count = 0
    column_start = None
    column_stop = None
    for column, aa in enumerate(protein_seq):
        if aa != '-':
            aa_count += 1
        if aa_count == domain_region.start and column_start is None:
            column_start = column
        if aa_count == domain_region.stop and column_stop is None:
            column_stop = column
            break
    assert column_start is not None, str(column_start)
    assert column_stop is not None, str(column_stop)

    domain_alignment = Alignment(alphabet=alignment._alphabet)
    # NOTE(review): the slice excludes the column holding the stop residue;
    # confirm whether domain_region.stop is meant to be exclusive.
    for record in alignment:
        domain_alignment.add_sequence(
            record.id, str(record.seq)[column_start:column_stop])
    return (domain_alignment, column_start, column_stop)
def main():
    """Rewrite nexus files with abbreviated sequence names.

    A first pass over the files from ``get_files(args.nexus)`` collects every
    sequence name; ``abbreviator`` turns that set into a name map.  A second
    pass copies the sequences of each file into a new alignment under the
    mapped names, writes it with the same base name into ``args.output`` and
    prints the file index.

    Raises:
        ValueError: the alignment could not be written; the message names
            the output file.
    """
    args = get_args()
    files = get_files(args.nexus)
    # Pass 1: gather all names so the abbreviations are consistent.
    old_names = set()
    for f in files:
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                old_names.add(seq.name)
    name_map = abbreviator(old_names)
    # Pass 2: rewrite each file with the mapped names.
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, 'nexus'):
            for seq in align:
                new_align.add_sequence(name_map[seq.name], str(seq.seq))
        outf = os.path.join(args.output, os.path.split(f)[1])
        # Context manager closes the handle even when writing fails.
        with open(outf, 'w') as handle:
            try:
                AlignIO.write(new_align, handle, 'nexus')
            except ValueError as err:
                # Replaces a leftover pdb.set_trace() debugging hook.
                raise ValueError("Could not write %s: %s" % (outf, err))
        print(count)
def _domain_alignment(self, alignment, domain_region, alignment_index):
    """Cut the columns that span a domain out of an alignment.

    Residues (non-gap characters) of the record at ``alignment_index`` are
    counted from 1; the first column at which the count equals
    ``domain_region.start`` / ``domain_region.stop`` becomes the start / stop
    column.

    Returns:
        ``(domain_alignment, column_start, column_stop)`` where every record
        of ``domain_alignment`` is the slice ``[column_start:column_stop]``
        of the corresponding input record.

    Raises:
        AssertionError: the start or stop residue was never reached.
    """
    protein_seq = str(alignment[alignment_index].seq)

    # Figure out which columns encapsulate the domain.
    aa_count = 0
    column_start = None
    column_stop = None
    for column, aa in enumerate(protein_seq):
        if aa != '-':
            aa_count += 1
        if aa_count == domain_region.start and column_start is None:
            column_start = column
        if aa_count == domain_region.stop and column_stop is None:
            column_stop = column
            break
    assert column_start is not None, str(column_start)
    assert column_stop is not None, str(column_stop)

    domain_alignment = Alignment(alphabet=alignment._alphabet)
    # NOTE(review): the slice excludes the column holding the stop residue;
    # confirm whether domain_region.stop is meant to be exclusive.
    for record in alignment:
        domain_alignment.add_sequence(
            record.id, str(record.seq)[column_start:column_stop])
    return (domain_alignment, column_start, column_stop)
def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
    """Initialise the base Alignment and the two extra text fields.

    ``alphabet`` defaults to a gapped ambiguous-DNA alphabet.
    """
    Alignment.__init__(self, alphabet)

    # _star_info represents all of those stars in the aln output format
    self._star_info, self._version = '', ''
def ace2fasta(in_file, out_file):
    """Write the consensus of every contig in an ACE file as FASTA.

    Each contig becomes a one-record alignment holding the consensus with
    the ``*`` characters removed; the reads are deliberately not added in
    this variant.  Parser errors now propagate instead of being reported as
    normal completion.
    """
    with open(in_file, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                # Add consensus sequence to alignment
                align.add_sequence(contig.name,
                                   contig.sequence.replace("*", ""))
                output_file.write(align.format("fasta"))
    print("All contigs treated")
def createAlignment(sequences, alphabet):
    """Create an Alignment object from a list of sequences.

    Records are named ``sequence0``, ``sequence1``, ... in input order.
    """
    align = Alignment(alphabet)
    for index, sequence in enumerate(sequences):
        align.add_sequence("sequence" + str(index), sequence)
    return align
def phylip(handle):
    """Read whitespace-separated ``name sequence`` lines into an Alignment.

    The first line must hold exactly two whitespace-separated fields; their
    values are not used.  Every following non-blank line must hold a name
    and a sequence.  Blank lines are skipped.

    Returns:
        An ``Alignment`` using a gapped protein alphabet.

    Raises:
        ValueError: the header or a data line does not split into exactly
            two fields.
    """
    from Bio.Align.Generic import Alignment
    from Bio.Alphabet import IUPAC, Gapped
    # Unpacking validates the header shape; the counts themselves are unused.
    _seq_count, _columns = handle.readline().split()
    alignment = Alignment(Gapped(IUPAC.protein, "-"))
    for line in handle:
        if not line.strip():
            # Tolerate blank lines rather than failing the unpack below.
            continue
        name, seq = line.split()
        alignment.add_sequence(name, seq)
    return alignment
def createAlignment(sequences, alphabet):
    """Create an Alignment object from a list of sequences.

    Records are named ``sequence0``, ``sequence1``, ... in input order.
    """
    align = Alignment(alphabet)
    for index, sequence in enumerate(sequences):
        align.add_sequence("sequence" + str(index), sequence)
    return align
def build_align(self, seq):
    """Split ``seq`` into ``self.segment_size`` chunks stored as an alignment.

    The resulting alignment is kept in ``self.friendly``; nothing is
    returned.
    """
    align = Alignment(Gapped(DNAAlphabet()))
    # Read but never used below; kept so attribute access is unchanged.
    alphabet = self.alphabet
    for offset in range(0, len(seq), self.segment_size):
        # NOTE(review): ``name`` is not defined in this method - presumably
        # a module-level global; confirm.
        align.add_sequence(name, seq[offset:offset + self.segment_size])
    self.friendly = align
def testCulledColumnMapper(self):
    """Mapper indices must point at the original columns that were kept."""
    original = "ABCDEFGHI"
    culled = [0, 1, 4, 8]
    # what remains of ``original`` once the culled columns are dropped
    expected = "CDFGH"

    align = Alignment(Gapped(IUPAC.protein, "-"))
    align.add_sequence("test", original)
    mapper = CulledColumnMapper(align, culled)

    for position, expected_aa in enumerate(expected):
        assert original[mapper[position]] == expected_aa
def rename(align, first, second):
    """Yield copies of each alignment with names rebuilt from ``_`` fields.

    Args:
        align: iterable of alignments whose records expose ``id``/``seq``.
        first: index of the first name field to use.
        second: index of an optional second field; a falsy value selects
            single-field naming.

    Yields:
        One new ``Alignment`` per input alignment.  With ``second`` set the
        name is the first three characters of both fields joined by ``_``;
        otherwise it is the whole ``first`` field.
    """
    for a in align:
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for seq in a:
            split_name = seq.id.split('_')
            # Decide on ``second`` alone: the old ``first and second`` test
            # treated field index 0 as unset, leaving the name unassigned.
            if second:
                new_seq_name = '_'.join([split_name[first][0:3],
                                         split_name[second][0:3]])
            else:
                new_seq_name = split_name[first]
            new_align.add_sequence(new_seq_name, str(seq.seq))
        yield new_align
def main():
    """Copy nexus alignments, dropping the taxa named in ``args.taxa``.

    Each file from ``get_files(args.input)`` is rewritten under the same base
    name in ``args.output``; the file index is printed after each write.
    """
    args = get_args()
    nexus_files = get_files(args.input)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in align:
                if taxon.name not in args.taxa:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        # Context manager closes the handle even when writing fails.
        with open(outf, 'w') as handle:
            AlignIO.write(new_align, handle, 'nexus')
        print(count)
def rename(align, first, second):
    """Yield copies of each alignment with names rebuilt from ``_`` fields.

    Args:
        align: iterable of alignments whose records expose ``id``/``seq``.
        first: index of the first name field to use.
        second: index of an optional second field; a falsy value selects
            single-field naming.

    Yields:
        One new ``Alignment`` per input alignment.  With ``second`` set the
        name is the first three characters of both fields joined by ``_``;
        otherwise it is the whole ``first`` field.
    """
    for a in align:
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for seq in a:
            split_name = seq.id.split('_')
            # Decide on ``second`` alone: the old ``first and second`` test
            # treated field index 0 as unset, leaving the name unassigned.
            if second:
                new_seq_name = '_'.join([split_name[first][0:3],
                                         split_name[second][0:3]])
            else:
                new_seq_name = split_name[first]
            new_align.add_sequence(new_seq_name, str(seq.seq))
        yield new_align
def main():
    """Copy nexus alignments, keeping only the selected taxa.

    The set of taxa to keep comes from ``get_samples_to_run(args, taxa)``
    where ``taxa`` is the result of ``get_all_taxon_names`` over the input
    files.  Each file is rewritten under the same base name in
    ``args.output``; the file index is printed after each write.
    """
    args = get_args()
    nexus_files = get_files(args.input)
    taxa = get_all_taxon_names(nexus_files)
    taxa_to_keep = get_samples_to_run(args, taxa)
    for count, align_file in enumerate(nexus_files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(align_file, "nexus"):
            for taxon in align:
                if taxon.name in taxa_to_keep:
                    new_align.add_sequence(taxon.name, str(taxon.seq))
        outf = os.path.join(args.output, os.path.basename(align_file))
        # Context manager closes the handle even when writing fails.
        with open(outf, 'w') as handle:
            AlignIO.write(new_align, handle, 'nexus')
        print(count)
def proteins_alignment_to_biopython(al, seq1, seq2, name1, name2):
    """Convert our internal alignment format into BioPython Alignment.

    Args:
        al: iterable of ``(a, b)`` index pairs into ``seq1`` / ``seq2``;
            ``-1`` marks a gap on that side.
        seq1, seq2: the two sequences being aligned.
        name1, name2: record names for the two rows.

    Returns:
        A two-record ``Alignment`` over a gapped protein alphabet.  Both rows
        have one upper-cased character or ``-`` per pair.
    """
    row1 = []
    row2 = []
    for a, b in al:
        row1.append(seq1[a].upper() if a != -1 else "-")
        # The second row used to omit the gap character, so the two rows
        # ended up with different lengths.
        row2.append(seq2[b].upper() if b != -1 else "-")
    align = Alignment(Gapped(IUPAC.protein, "-"))
    align.add_sequence(name1, "".join(row1))
    align.add_sequence(name2, "".join(row2))
    return align
def NexusIterator(handle, seq_count=None):
    """Returns SeqRecord objects from a Nexus file.

    Thus uses the Bio.Nexus module to do the hard work.

    You are expected to call this function via Bio.SeqIO or Bio.AlignIO
    (and not use it directly).

    NOTE - We only expect ONE alignment matrix per Nexus file,
    meaning this iterator will only yield one Alignment.

    Raises:
        ValueError: ``seq_count`` is given and differs from the number of
            taxa found.
    """
    n = Nexus.Nexus(handle)
    if not n.matrix:
        # No alignment found.  A plain return ends the generator;
        # ``raise StopIteration`` here becomes a RuntimeError under PEP 479.
        return
    alignment = Alignment(n.alphabet)

    # Bio.Nexus deals with duplicated names by adding a '.copy' suffix.
    # The original names and the modified names are kept in these two lists:
    assert len(n.unaltered_taxlabels) == len(n.taxlabels)

    if seq_count and seq_count != len(n.unaltered_taxlabels):
        raise ValueError("Found %i sequences, but seq_count=%i"
                         % (len(n.unaltered_taxlabels), seq_count))

    for old_name, new_name in zip(n.unaltered_taxlabels, n.taxlabels):
        assert new_name.startswith(old_name)
        seq = n.matrix[new_name]  # already a Seq object with the alphabet set
        # ToDo - Can we extract any annotation too?
        # ToDo - Avoid abusing the private _records list
        alignment._records.append(
            SeqRecord(seq, id=new_name, name=old_name, description=""))
    # All done
    yield alignment
def __str__(self):
    """Return the base alignment text, plus ``self._secStruct`` if set.

    A truthy ``_secStruct`` is appended on its own line.
    """
    base_text = _Alignment.__str__(self)
    if not self._secStruct:
        return base_text
    return base_text + '\n' + str(self._secStruct)
def get_alignment(self):
    """Construct an alignment from the aligned sequences in this tree.

    Returns:
        An ``Alignment`` built from every ``Sequence`` node whose
        ``mol_seq.is_aligned`` is true, using the first sequence's alphabet.
        ``None`` (after a warning) when there is no such node.
    """
    def seq_is_aligned(node):
        return bool(isinstance(node, Sequence) and node.mol_seq.is_aligned)

    seqs = self.depth_first_search(self, seq_is_aligned)
    try:
        first_seq = next(seqs)
    except StopIteration:
        warnings.warn("No aligned sequences were found in this tree.",
                      Warning, stacklevel=2)
        # Without a first sequence there is no alphabet to build from;
        # falling through used to fail on the unbound ``first_seq``.
        return None
    aln = Alignment(first_seq.get_alphabet())
    aln.add_sequence(str(first_seq), first_seq.mol_seq.value)
    for seq in seqs:
        aln.add_sequence(str(seq), seq.mol_seq.value)
    return aln
def get_column(self, col):
    """Returns a string containing a given column (OBSOLETE).

    This is a method provided for backwards compatibility with the old
    Bio.Align.Generic.Alignment object. You are encouraged to use the
    slice notation instead.

    The call is forwarded unchanged to ``_Alignment.get_column``.
    """
    return _Alignment.get_column(self, col)
def trim_alignment(self, method = 'edges', remove_probe = None, bases = None, consensus = True, window_size = 20, threshold = 0.5):
    """Trim the alignment and store the result in ``self.trimmed_alignment``.

    ``method`` selects the strategy: ``'edges'`` uses ``self._find_ends``,
    ``'running'`` uses ``self.running_average(window_size, threshold)``, and
    ``'static'`` keeps ``bases`` columns on either side of a centre point.
    With ``remove_probe`` set and ``self.ploc`` available, the region between
    ``self.ploc[0]`` and ``self.ploc[1]`` is cut out instead.  When
    ``consensus`` is true, ``self._alignment_summary`` is run on the result.
    Nothing is returned.
    """
    if method == 'edges':
        # find edges of the alignment
        start = self._find_ends(forward=True)
        end = self._find_ends(forward=False)
    elif method == 'running':
        start, end = self.running_average(window_size, threshold)
    # create a new alignment object to hold our alignment
    self.trimmed_alignment = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
    # NOTE(review): several branches below set self.trimmed_alignment to
    # None without leaving the loop; a later iteration would then fail on
    # ``None._records`` - confirm whether a break was intended.
    for sequence in self.alignment:
        # ignore the probe sequence we added
        if (method == 'edges' or method == 'running') and not remove_probe:
            # Alignment.add_sequence() only accepts a name and a plain
            # string, so the sliced record is appended to the private
            # _records list directly to keep the whole record object.
            self.trimmed_alignment._records.append(sequence[start:end])
        elif method == 'static' and not remove_probe and bases:
            # get middle of alignment and trim out from that - there's a
            # weakness here in that we are not actually locating the probe
            # region, we're just locating the middle of the alignment
            mid_point = len(sequence)/2
            if self._base_checker(bases, sequence, mid_point):
                self.trimmed_alignment._records.append(
                    sequence[mid_point-bases:mid_point+bases]
                    )
            else:
                self.trimmed_alignment = None
        # NOTE(review): this branch looks unreachable - the previous
        # condition already matches whenever this one does.
        elif method == 'static' and not remove_probe and bases and self.ploc:
            # trim out from the probe location stored in self.ploc
            if self._base_checker(bases, sequence, self.ploc):
                self.trimmed_alignment._records.append(
                    sequence[self.ploc[0]-bases:self.ploc[1]+bases]
                    )
            else:
                self.trimmed_alignment = None
        elif remove_probe and self.ploc:
            # we have to drop to sequence level to add sequence slices
            # where we basically slice around the probes location
            temp = sequence.seq[:self.ploc[0]] + sequence.seq[self.ploc[1]:]
            self.trimmed_alignment._records.append( \
                self._record_formatter(temp)
                )
        # NOTE(review): this branch also looks unreachable - the previous
        # ``remove_probe and self.ploc`` condition catches it first.
        elif method == 'static' and remove_probe and bases and self.ploc:
            if self._base_checker(bases, sequence, self.ploc):
                temp = sequence.seq[self.ploc[0]-bases:self.ploc[0]] + \
                    sequence.seq[self.ploc[1]:self.ploc[1]+bases]
                self.trimmed_alignment._records.append( \
                    self._record_formatter(temp)
                    )
            else:
                self.trimmed_alignment = None
    # build a dumb consensus
    if consensus:
        self.trimmed_alignment_summary, self.trimmed_alignment_consensus = \
            self._alignment_summary(self.trimmed_alignment)
def ace2fasta(in_file, out_file):
    """Write every contig of an ACE file, with its reads, as FASTA.

    For each contig the consensus is added first, then every read whose name
    does not contain ``pseudo``.  Reads are clipped with ``cut_ends`` and
    padded to the consensus length with ``pad_read``.  Parser errors now
    propagate instead of being reported as normal completion.
    """
    with open(in_file, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                # Add consensus sequence to alignment
                align.add_sequence(contig.name, contig.sequence)
                for readn, read in enumerate(contig.reads):
                    clipst = read.qa.qual_clipping_start
                    clipe = read.qa.qual_clipping_end
                    start = contig.af[readn].padded_start
                    seq = cut_ends(read.rd.sequence, clipst, clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    if "pseudo" not in read.rd.name:
                        align.add_sequence(read.rd.name, seq)
                output_file.write(align.format("fasta"))
    print("All contigs treated")
def get_alignment(self):
    """Construct an alignment from the aligned sequences in this tree.

    Returns:
        An ``Alignment`` built from every ``Sequence`` node whose
        ``mol_seq.is_aligned`` is true, using the first sequence's alphabet.
        ``None`` when there is no such node.
    """
    def is_aligned_seq(node):
        return bool(isinstance(node, Sequence) and node.mol_seq.is_aligned)

    seqs = self._filter_search(is_aligned_seq, 'preorder', True)
    try:
        # Built-in next() works on any iterator; ``.next()`` is Python 2 only.
        first_seq = next(seqs)
    except StopIteration:
        # No aligned sequences were found.
        # Can't construct an Alignment without an alphabet, so return None.
        return None
    aln = Alignment(first_seq.get_alphabet())
    aln.add_sequence(str(first_seq), first_seq.mol_seq.value)
    for seq in seqs:
        aln.add_sequence(str(seq), seq.mol_seq.value)
    return aln
def strarray2biopy(align):
    """Convert a 2d character array plus ID list into a biopython alignment.

    ``align[0]`` holds the rows of characters and ``align[1]`` the matching
    IDs; the result uses a gapped unambiguous-DNA alphabet.
    """
    rows, ids = align[0], align[1]
    alignment = Alignment(Gapped(IUPAC.unambiguous_dna))
    for index, row in enumerate(rows):
        # Start from '' and concatenate, as the original did, so the element
        # types accepted are unchanged.
        bases = ''
        for base in row:
            bases = bases + base
        alignment.add_sequence(ids[index], bases)
    return alignment
def get_column(self, col):
    """Returns a string containing a given column (DEPRECATED).

    This is a method provided for backwards compatibility with the old
    Bio.Align.Generic.Alignment object. Please use the slice notation
    instead, since get_column is likely to be removed in a future release
    of Biopython.
    """
    import warnings
    import Bio
    # Adjacent literals concatenate to exactly the original message.
    message = ("This method is deprecated and is provided for backwards "
               "compatibility with the old Bio.Align.Generic.Alignment object. "
               "Please use the slice notation instead, as get_column is likely "
               "to be removed in a future release of Biopython.")
    warnings.warn(message, Bio.BiopythonDeprecationWarning)
    return _Alignment.get_column(self, col)
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count sequences with each tag in all contigs.

    Writes a tab-separated matrix to ``out_file``: a header row, then one
    row per contig with its name, consensus length (``*`` removed), the
    number of sequences whose FASTA header contains each tag, and the number
    that contain no tag and are not the consensus.  Contigs with fewer than
    ``min_seq`` sequences are skipped.  Counts are written in header order.
    """
    print("")
    print("USING MATRIX OUTPUT FORMAT")
    print("")
    # Single source of truth for column order, so header and rows agree.
    # The old code wrote counts in sorted() order under a tags-order header.
    columns = list(tags) + ["XX_noTag"]
    with open(in_ace, 'r') as ace_handle:
        with open(out_file, "w") as output_file:
            output_file.write("gene_name\tgene_length")
            for tag in tags:
                output_file.write("\t" + tag)
            output_file.write("\tXX_noTag")
            output_file.write("\n")
            for contig in Ace.parse(ace_handle):
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
                align.add_sequence(contig.name, contig.sequence)
                for readn, read in enumerate(contig.reads):
                    clipst = read.qa.qual_clipping_start
                    clipe = read.qa.qual_clipping_end
                    start = contig.af[readn].padded_start
                    seq = cut_ends(read.rd.sequence, clipst, clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    if "pseudo" not in read.rd.name:
                        align.add_sequence(read.rd.name, seq)
                sequences = read_fasta_2list(align.format("fasta"))
                if len(sequences) < min_seq:
                    continue
                contig_name = re.findall("(Contig_[0-9]+)",
                                         sequences[0][0])[0]
                contig_seq = sequences[0][1].replace("*", "")
                contig_length = str(len(contig_seq))
                output_file.write(contig_name + "\t" + contig_length)
                print("Treating %s" % contig_name)
                counts = dict.fromkeys(columns, 0)
                for fasta in sequences:
                    found_tag = False
                    for tag in tags:
                        if fasta[0].find(tag) > -1:
                            counts[tag] += 1
                            found_tag = True
                    if not found_tag and fasta[0].find("Consensus") < 0:
                        counts["XX_noTag"] += 1
                for column in columns:
                    output_file.write("\t" + str(counts[column]))
                output_file.write("\n")
    print("***All contigs treated***")
def formatData (AlignData, Score):
    """Score each record of ``AlignData.Alignment`` and keep the good ones.

    Every column of a record adjusts its score according to ``Score[j]``
    and whether the character is a gap.  Records scoring at least -250 are
    each wrapped in a one-record protein ``Alignment``; the list of those
    alignments is returned.
    """
    LIMIT1 = 450
    LIMIT2 = 2000
    # NOTE(review): 6196 is a hard-coded upper bound on the record count;
    # more records would raise IndexError below - confirm its origin.
    i = 0; ScorePoints = []
    for i in xrange(6196):
        ScorePoints.append(0)
    i = 0
    for record in AlignData.Alignment:
        j = 0
        for c in record.seq.tostring():
            # Column score at or below LIMIT1: a residue costs 2 points.
            if (Score[j] <= LIMIT1):
                if c != '-':
                    ScorePoints[i] -= 2
            # Column score at or above LIMIT2: a residue earns 2 points and
            # a gap costs 1.
            # NOTE(review): indentation was lost in this copy; the ``else``
            # is assumed to pair with the inner ``if c != '-'`` - confirm.
            if (Score[j] >= LIMIT2):
                if c != '-':
                    ScorePoints[i] += 2
                else:
                    ScorePoints[i] -= 1
            j += 1
        i+=1
    i = 0
    DataList = list()
    for record in AlignData.Alignment:
        if(ScorePoints[i] >= -250):
            NewAlignData = Alignment(Gapped(IUPAC.protein,"-"))
            NewAlignData.add_sequence(record.id,record.seq.tostring())
            DataList.append(NewAlignData)
        i+=1
    return DataList
def main():
    """Rewrite nexus files with the leading name fields stripped.

    For every file returned by ``get_files(options.input)`` the sequences of
    all alignments are copied into one new alignment.  Each name is split on
    ``_`` and only the fields from ``options.position`` onwards are kept.
    Records whose name contains ``.copy`` are skipped.  The result is written
    under the same base name in ``options.output`` and the file index is
    printed.

    Raises:
        ValueError: the alignment could not be written; the message names
            the output file.
    """
    options, args = interface()
    files = get_files(options.input)
    for count, f in enumerate(files):
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for align in AlignIO.parse(f, "nexus"):
            for seq in align:
                if ".copy" in seq.name:
                    continue
                new_seq_name = "_".join(
                    seq.name.split("_")[options.position:])
                new_align.add_sequence(new_seq_name, str(seq.seq))
        outf = os.path.join(options.output, os.path.split(f)[1])
        # Context manager closes the handle even when writing fails.
        with open(outf, "w") as handle:
            try:
                AlignIO.write(new_align, handle, "nexus")
            except ValueError as err:
                # Replaces a leftover pdb.set_trace() debugging hook.
                raise ValueError("Could not write %s: %s" % (outf, err))
        print(count)
def getHaplotypes(aln, n=10, fmin=0.0):
    """Get the haplotypes of the alignment aln.

    Identical sequence strings are counted.  The ``n`` most frequent ones
    whose frequency exceeds ``fmin`` are returned as records named
    ``Hap0001``, ``Hap0002``, ... by rank, with the frequency stored in
    ``name``.
    """
    # NOTE(review): ``alphabet`` and ``Bio`` are not defined in this
    # function - presumably module-level names; confirm.
    from Bio.Align.Generic import Alignment
    haplotypes = Alignment(alphabet)

    count = {}
    for record in aln:
        text = record.seq.tostring()
        count[text] = count.get(text, 0) + 1

    ranked = sorted(count.keys(), key=lambda x: count[x], reverse=True)
    for rank, seq in enumerate(ranked[:n]):
        frequency = count[seq] / float(len(aln))
        if frequency > fmin:
            record = Bio.SeqIO.SeqRecord(Bio.Seq.Seq(seq, alphabet),
                                         id="Hap%04i" % (rank + 1),
                                         name="%f" % (frequency))
            haplotypes._records.append(record)
    return haplotypes
def trim_ambiguous_bases(self):
    """Snip ambiguous bases from ``self.trimmed_alignment``.

    Finds every column containing ``N`` and keeps the widest run of columns
    between two such columns.  The result goes to
    ``self.perfect_trimmed_alignment``: the trimmed alignment itself when it
    is falsy or has no ``N`` column, ``None`` when no block is found.
    """
    trimmed = self.trimmed_alignment
    if not trimmed:
        self.perfect_trimmed_alignment = trimmed
        return

    width = trimmed.get_alignment_length()
    ambiguous_bases = [column for column in range(width)
                       if 'N' in trimmed.get_column(column)]
    if not ambiguous_bases:
        self.perfect_trimmed_alignment = trimmed
        return

    # Bracket the ambiguous columns with the first and last column so the
    # chunks before the first and after the last ambiguous base compete too.
    # NOTE(review): with these sentinels column 0 and the final column can
    # never be part of the kept block - confirm that is intended.
    ambiguous_bases.insert(0, 0)
    ambiguous_bases.append(trimmed.get_alignment_length() - 1)

    # create a new alignment object to hold our alignment
    self.perfect_trimmed_alignment = \
        Alignment(Gapped(IUPAC.unambiguous_dna, "-"))

    maximum = 0
    best_block = None
    for left, right in zip(ambiguous_bases, ambiguous_bases[1:]):
        if right - left > maximum:
            maximum = right - left
            best_block = (left, right)

    # make sure we catch cases where there is no best block
    if best_block:
        for sequence in trimmed:
            self.perfect_trimmed_alignment._records.append(
                sequence[best_block[0] + 1:best_block[1]])
    else:
        self.perfect_trimmed_alignment = None
def bam2Alignment(samfile, chrom=None, start=None, stop=None, minlen=1, out=sys.stdout):
    """Read alignment from samfile and return Alignment object.

    Reads fetched for ``chrom:start-stop`` are converted with
    ``getSeqRecord`` and kept when the length test below passes.  ``out``
    is accepted but not used.
    """
    # NOTE(review): ``start`` and ``stop`` default to None but are used in
    # arithmetic below - callers presumably always pass integers; confirm.
    reads = samfile.fetch(chrom, start, stop)
    from Bio.Align.Generic import Alignment
    from Bio.Alphabet import IUPAC, Gapped
    aln = Alignment(Gapped(IUPAC.ambiguous_dna))
    for read in reads:
        # Total length of CIGAR operations with code 4 or 1.
        soft_clipped = sum(op[1] for op in read.cigar if op[0] in (4, 1))
        required = minlen + soft_clipped
        if (read.rlen - start + read.pos + 1 > required
                and stop - read.pos + 1 >= required):
            aln._records.append(getSeqRecord(read, start=start, stop=stop))
    return aln
class Record:
    """Hold Saf information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at Saf data.

    Attributes:
    alignment
    """

    def __init__(self):
        """Start with an empty alignment over the generic alphabet."""
        self.alignment = Alignment(Bio.Alphabet.generic_alphabet)

    def __str__(self):
        """Return each record's description line followed by its sequence."""
        pieces = []
        for sequence_record in self.alignment.get_all_seqs():
            pieces.append('%s\n' % sequence_record.description)
            pieces.append(out_sequence(sequence_record.seq.data))
        return ''.join(pieces)
class Record:
    """Hold Saf information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at Saf data.

    Attributes:
    alignment
    """

    def __init__(self):
        """Start with an empty alignment over the generic alphabet."""
        self.alignment = Alignment(Bio.Alphabet.generic_alphabet)

    def __str__(self):
        """Return each record's description line followed by its sequence."""
        pieces = []
        for sequence_record in self.alignment.get_all_seqs():
            pieces.append('%s\n' % sequence_record.description)
            pieces.append(out_sequence(sequence_record.seq.data))
        return ''.join(pieces)
def parse_ace(ace_file):
    """Return the first contig of an ACE file and its read alignment.

    The consensus is added first, then every read, clipped with ``cut_ends``
    and padded to the consensus length with ``pad_read``.  Each read is
    named ``<read name>_<coru>``.  Only the first contig is read.

    Returns:
        ``(contig, align)``.

    Raises:
        StopIteration: the file contains no contig.
    """
    # The handle is only needed while the first contig is parsed.
    with open(ace_file, 'r') as ace_handle:
        contig = next(Ace.parse(ace_handle))
    align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
    align.add_sequence(contig.name, contig.sequence)
    for readn, read in enumerate(contig.reads):
        clipst = read.qa.qual_clipping_start
        clipe = read.qa.qual_clipping_end
        start = contig.af[readn].padded_start
        seq = cut_ends(read.rd.sequence, clipst, clipe)
        seq = pad_read(seq, start, len(contig.sequence))
        align.add_sequence(read.rd.name + "_" + contig.af[readn].coru, seq)
    return contig, align
def add_gaps_to_align(organisms, missing, align, verbatim=False, genera=False, min_taxa=3):
    """Rebuild an alignment with shortened taxon names, padding absent taxa.

    Args:
        organisms: names of every taxon expected in the output.  A deep copy
            is taken, so the caller's list is never modified.
        missing: optional mapping of taxon name (or ``"<name>*"``) to the
            loci recorded as missing for it.  A falsy value disables the
            consistency check.
        align: iterable of alignments whose records expose ``name``/``seq``.
        verbatim: when true (and no ``genera`` match), the lower-cased record
            name is used as the taxon name.
        genera: optional collection of strings; when a non-empty member
            occurs in a record name only the last ``_``-separated field is
            kept as the taxon name.
        min_taxa: alignments with fewer records are rejected.

    Returns:
        The rebuilt ``Alignment`` for the last alignment processed, or
        ``None`` when an alignment has fewer than ``min_taxa`` records or
        ``align`` yields nothing.

    Raises:
        AssertionError: a padded taxon's locus is not listed in ``missing``.
        KeyError: ``missing`` has neither ``org`` nor ``"<org>*"``.
        ValueError: a derived taxon name is not in ``organisms``.
    """
    local_organisms = copy.deepcopy(organisms)
    # Defined up front so an empty ``align`` returns None instead of raising.
    new_align = None
    for a in align:
        if len(a) < min_taxa:
            new_align = None
            break
        new_align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        overall_length = len(a[0])
        for seq in a:
            if genera and any(sp for sp in genera if sp in seq.name):
                new_seq_name = '_'.join(seq.name.split('_')[-1:])
            elif not verbatim:
                new_seq_name = '_'.join(seq.name.split('_')[-2:])
            else:
                new_seq_name = seq.name.lower()
            new_align.add_sequence(new_seq_name, str(seq.seq))
            local_organisms.remove(new_seq_name)
        # The locus name is derived from the LAST record of the alignment
        # (the original relied on the leaked loop variable); it does not
        # depend on the organism, so compute it once.
        if genera and any(sp for sp in genera if sp in seq.name):
            loc = '_'.join(seq.name.split('_')[:-1])
        elif not verbatim:
            loc = '_'.join(seq.name.split('_')[:-2])
        else:
            loc = seq.name
        for org in local_organisms:
            if missing:
                # Look under the plain name first, then the starred name.
                try:
                    listed = loc in missing[org]
                except KeyError:
                    listed = False
                if not listed and loc not in missing['{}*'.format(org)]:
                    raise AssertionError("Locus missing")
            new_align.add_sequence(org, '?' * overall_length)
    return new_align
# standard library import os # biopython from Bio import Alphabet from Bio import Seq from Bio.Alphabet import IUPAC from Bio import Clustalw from Bio.Align import AlignInfo from Bio import AlignIO from Bio.SubsMat import FreqTable from Bio.Align.Generic import Alignment #Very simple tests on an empty alignment alignment = Alignment(Alphabet.generic_alphabet) assert alignment.get_alignment_length() == 0 assert len(alignment) == 0 del alignment #Basic tests on simple three string alignment alignment = Alignment(Alphabet.generic_alphabet) letters = "AbcDefGhiJklMnoPqrStuVwxYz" alignment.add_sequence("mixed", letters) alignment.add_sequence("lower", letters.lower()) alignment.add_sequence("upper", letters.upper()) assert alignment.get_alignment_length() == 26 assert len(alignment) == 3 assert alignment.get_seq_by_num(0).tostring() == letters assert alignment.get_seq_by_num(1).tostring() == letters.lower() assert alignment.get_seq_by_num(2).tostring() == letters.upper()
# NOTE(review): ``summary``, ``expected`` and ``SummaryInfo`` are defined
# outside the visible part of this script.
consensus = summary.gap_consensus(ambiguous="N")
print consensus
print
print summary.pos_specific_score_matrix(chars_to_ignore=['-'], axis_seq=consensus)
print
# We have a generic alphabet without a declared gap char, so we must
# provide the frequencies and the chars to ignore explicitly.
print summary.information_content(e_freq_table=expected, chars_to_ignore=['-'])
print
print "Trying a protein sequence with gaps and stops"

# Protein alphabet that declares both a gap character and a stop symbol.
alpha = Alphabet.HasStopCodon(
    Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
a = Alignment(alpha)
a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
print a
print "=" * a.get_alignment_length()

# Both consensus flavours, with "X" marking ambiguous columns.
s = SummaryInfo(a)
c = s.dumb_consensus(ambiguous="X")
print c
c = s.gap_consensus(ambiguous="X")
print c
print
# The gap and stop characters are left out of the statistics.
print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
print s.information_content(chars_to_ignore=['-', '*'])
def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
    """Initialise the base Alignment.

    ``alphabet`` defaults to a gapped ambiguous-DNA alphabet.
    """
    Alignment.__init__(self, alphabet)
def next(self):
    """Parse and return the next alignment from ``self.handle``.

    The first line must hold two integers: the number of sequences and
    their length.  The first block gives a 10-character name followed by
    sequence data on each line; later blocks hold sequence data only.  A
    line accepted by ``self._is_header`` ends the current alignment and is
    saved for the following call.

    Returns:
        The parsed ``Alignment``, or ``None`` at end of file.

    Raises:
        ValueError: malformed header, unexpected record count, truncated
            block, or a sequence whose length differs from the header.
    """
    handle = self.handle
    try:
        # Header we saved from when we were parsing
        # the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return
    line = line.strip()
    parts = filter(None, line.split())
    if len(parts) != 2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")

    assert self._is_header(line)

    if self.records_per_alignment is not None \
    and self.records_per_alignment != number_of_seqs :
        raise ValueError("Found %i records in this alignment, told to expect %i" \
                         % (number_of_seqs, self.records_per_alignment))

    ids = []
    seqs = []

    # Expects STRICT truncation/padding to 10 characters.
    # Does not require any white space between name and seq.
    for i in range(0, number_of_seqs):
        line = handle.readline().rstrip()
        ids.append(line[:10].strip()) # first ten characters
        # Each entry is a list of chunks, joined once all blocks are read.
        seqs.append([line[10:].strip().replace(" ", "")])

    # Look for further blocks
    line = ""
    while True:
        # Skip any blank lines between blocks...
        while "" == line.strip():
            line = handle.readline()
            if not line: break # end of file
        if not line: break # end of file

        if self._is_header(line):
            # Looks like the start of a concatenated alignment
            self._header = line
            break

        # A continuation block: one line of sequence data per record, in
        # the same order as the first block.
        for i in range(0, number_of_seqs):
            seqs[i].append(line.strip().replace(" ", ""))
            line = handle.readline()
            if (not line) and i + 1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line: break # end of file

    alignment = Alignment(self.alphabet)
    for i in range(0, number_of_seqs):
        seq = "".join(seqs[i])
        if len(seq) != length_of_seqs:
            raise ValueError("Sequence %i length %i, expected length %i" \
                             % (i+1, len(seq), length_of_seqs))
        alignment.add_sequence(ids[i], seq)

        # Force id, name and description of the record just added to the
        # identifier read from the file.
        record = alignment.get_all_seqs()[-1]
        assert ids[i] == record.id or ids[i] == record.description
        record.id = ids[i]
        record.name = ids[i]
        record.description = ids[i]
    return alignment
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage, stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file.

    Contigs with too few reads for ``coverage`` are skipped.  For the rest,
    reads are clipped to the tighter of the quality and alignment clipping
    bounds, padded, stripped of consensus ``*`` columns and passed to
    ``snp_positions`` / ``best_snps``.  A contig is reported when it has at
    least ``ngroups`` groups (first three characters of the read name), each
    with at least ``nhaplo`` haplotypes.  Reported contigs are written to
    ``out_file``, ``out_bamova`` and ``fasta_output/<contig>.fasta``.

    ``win_len``, ``step`` and ``stars`` are accepted but not used here.
    Processing stops after 100000 contigs.
    """
    # NOTE(review): block nesting was reconstructed from a copy without
    # indentation (in particular where marker_number is incremented);
    # confirm against the original file.
    marker_number = 0
    cutoff = 100000
    with open(in_ace, 'r') as ace_handle:
        ace_gen = Ace.parse(ace_handle)
        with open(out_file, "w") as output_file:
            with open(out_bamova, "w") as bamova_file:
                output_file.write("Contig_nb\tWindow\tHaplotype\n")
                contig_counter = 0
                ntreated = 0
                for contig in ace_gen:
                    pass_haplo = False
                    contig_counter += 1
                    align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                    align.add_sequence(contig.name, contig.sequence)
                    if len(contig.reads) - 1 < coverage:
                        continue
                    ntreated += 1
                    for readn, read in enumerate(contig.reads):
                        # Keep the tighter of the two clipping windows.  The
                        # end bound used to compare clipe2 with itself, so
                        # alignment clipping never applied at the read end.
                        clipst = max(read.qa.qual_clipping_start,
                                     read.qa.align_clipping_start)
                        clipe = min(read.qa.qual_clipping_end,
                                    read.qa.align_clipping_end)
                        start = contig.af[readn].padded_start
                        seq = cut_ends(read.rd.sequence, clipst, clipe)
                        seq = pad_read(seq, start, len(contig.sequence))
                        if "pseudo" not in read.rd.name:
                            align.add_sequence(read.rd.name, seq)
                    sequences = read_fasta(align.format("fasta"))
                    sequences = [[s[0].replace(">", ""), s[1]]
                                 for s in sequences]
                    concensus = sequences[0][1]
                    # Drop consensus '*' columns from every sequence, right
                    # to left so earlier positions stay valid.
                    error_positions = multi_find("*", concensus)[::-1]
                    for p in error_positions:
                        sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                     for s in sequences]
                    concensus = sequences[0][1]
                    sequences = [[s[0], correct_sequence(concensus, s[1])]
                                 for s in sequences[1:]]
                    sequences, snp_pos = snp_positions(sequences)
                    haplotypes = best_snps(sequences, snp_pos, coverage)
                    if haplotypes != "Empty":
                        variants = sorted(set(h[-1] for h in haplotypes[-1]))
                        groups = sorted(set(h[0][:3]
                                            for h in haplotypes[-1]))
                        if len(groups) >= ngroups:
                            pass_haplo = True
                            for g in groups:
                                in_group = [h[0] for h in haplotypes[-1]
                                            if h[0].startswith(g)]
                                if len(in_group) < nhaplo:
                                    pass_haplo = False
                        if pass_haplo:
                            print(contig.name)
                            bamova_file.write(
                                "Marker" + str(marker_number) + "\n")
                            for group_number, g in enumerate(groups):
                                bamova_file.write(
                                    "Population\t" + str(group_number))
                                for v in variants:
                                    matching = [
                                        h for h in haplotypes[-1]
                                        if h[-1] == v and h[0].startswith(g)]
                                    bamova_file.write(
                                        "\t" + str(len(matching)))
                                bamova_file.write("\n")
                            fasta_path = ("fasta_output/" + contig.name
                                          + ".fasta")
                            with open(fasta_path, "w") as f:
                                output_file.write(contig.name + "\n")
                                for h in haplotypes[-1]:
                                    f.write(">" + h[0] + str(marker_number)
                                            + "\n" + h[2] + "\n")
                                    # Re-base positions so the first is 1.
                                    h[1] = [x - h[1][0] + 1 for x in h[1]]
                                    output_file.write(
                                        "Marker" + str(marker_number) + "\t"
                                        + "\t".join([str(x) for x in h])
                                        + "\t" + ":".join(variants) + "\n")
                            marker_number += 1
                    output_file.flush()
                    bamova_file.flush()
                    if contig_counter > cutoff:
                        break
                print("\n%s contigs out of %s were treated"
                      % (str(ntreated), str(contig_counter)))
def next(self):
    """Parse the next Stockholm alignment from ``self.handle``.

    Returns an Alignment built from the sequence rows of the block, or
    None when the handle is exhausted or the block has no sequence rows.
    Raises ValueError when the "# STOCKHOLM 1.0" header is missing, when a
    sequence row cannot be split into identifier and sequence, when the
    number of records differs from ``self.records_per_alignment`` or when
    the rows differ in length.

    As a side effect ``self.ids``, ``self.sequences``,
    ``self.seq_annotation`` and ``self.seq_col_annotation`` are replaced
    with the data of the block just read.
    """
    if hasattr(self, "_header"):
        # Header line stored while reading the previous alignment.
        line = self._header
        del self._header
    else:
        line = self.handle.readline()
    if not line:
        # Empty file - just give up.
        return None
    if line.strip() != "# STOCKHOLM 1.0":
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}
    passed_end_alignment = False
    while True:
        line = self.handle.readline()
        if not line:
            break  # end of file
        line = line.strip()  # remove trailing \n
        if line == "# STOCKHOLM 1.0":
            # Start of the next alignment: remember it for the next call.
            self._header = line
            break
        if line == "//":
            # The "//" line indicates the end of the alignment.
            # There may still be more meta-data
            passed_end_alignment = True
            continue
        if line == "":
            continue
        if not line.startswith("#"):
            # Sequence row, format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                # This might be someone attempting to store a zero length
                # sequence?
                raise ValueError(
                    "Could not split line into identifier and sequence:\n"
                    + line)
            seq_id, seq = parts
            if seq_id not in ids:
                ids.append(seq_id)
            seqs[seq_id] = seqs.get(seq_id, "") + seq.replace(".", "-")
            continue

        # Comment line or meta-data; anything shorter than a five
        # character tag (or with an unknown tag) is ignored.
        tag = line[:5]
        if tag == "#=GF ":
            # Generic per-File annotation: "#=GF <feature> <free text>"
            # Each feature key could be used more than once,
            # so store the entries as a list of strings.
            feature, text = line[5:].strip().split(None, 1)
            gf.setdefault(feature, []).append(text)
        elif tag == "#=GC ":
            # Generic per-Column annotation, exactly 1 char per column
            pass
        elif tag == "#=GS ":
            # Generic per-Sequence annotation:
            # "#=GS <seqname> <feature> <free text>"
            seq_id, feature, text = line[5:].strip().split(None, 2)
            gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
        elif tag == "#=GR ":
            # Generic per-Sequence AND per-Column markup:
            # "#=GR <seqname> <feature> <exactly 1 char per column>"
            # For interlaced sequences the GR data can be split over
            # multiple lines, so append to any previous entry.
            seq_id, feature, text = line[5:].strip().split(None, 2)
            markup = gr.setdefault(seq_id, {})
            markup[feature] = markup.get(feature, "") + text.strip()
            # TODO - Should we check the length matches the alignment
            # length?

    assert len(seqs) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        return None

    expected = self.records_per_alignment
    if expected is not None and expected != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), expected))

    alignment = Alignment(self.alphabet)
    # TODO - Introduce an annotated alignment class?
    # For now, store the annotation a new private property:
    alignment._annotations = gr

    alignment_length = len(list(seqs.values())[0])
    for seq_id in ids:
        seq = seqs[seq_id]
        if len(seq) != alignment_length:
            raise ValueError(
                "Sequences have different lengths, or repeated identifier")
        name, start, end = self._identifier_split(seq_id)
        alignment.add_sequence(seq_id, seq, start=start, end=end)

        record = alignment.get_all_seqs()[-1]
        assert record.id == seq_id or record.description == seq_id
        record.id = seq_id
        record.name = name
        record.description = seq_id
        # will be overridden by _populate_meta_data if an explicit
        # accession is provided:
        record.annotations["accession"] = name
        self._populate_meta_data(seq_id, record)
    return alignment
# standard library
import os

# biopython
from Bio import Alphabet
from Bio import Seq
from Bio.Alphabet import IUPAC
from Bio import Clustalw
from Bio.Align.FormatConvert import FormatConverter
from Bio.Align import AlignInfo
from Bio.Fasta import FastaAlign
from Bio.SubsMat import FreqTable
from Bio.Align.Generic import Alignment

# An empty alignment reports zero columns and holds no records.
alignment = Alignment(Alphabet.generic_alphabet)
assert alignment.get_alignment_length() == 0
assert alignment.get_all_seqs() == []
del alignment

# A simple three row alignment built from one mixed-case string: the rows
# must come back in insertion order with their text unchanged.
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
alignment = Alignment(Alphabet.generic_alphabet)
for name, text in (("mixed", letters),
                   ("lower", letters.lower()),
                   ("upper", letters.upper())):
    alignment.add_sequence(name, text)
assert alignment.get_alignment_length() == 26
assert len(alignment.get_all_seqs()) == 3
for index, text in enumerate((letters, letters.lower(), letters.upper())):
    assert alignment.get_seq_by_num(index).tostring() == text
def next(self):
    """Parse the next Stockholm alignment from ``self.handle``.

    Returns an Alignment built from the sequence rows of the block, or
    None when the handle is exhausted or the block has no sequence rows.
    Raises ValueError when the '# STOCKHOLM 1.0' header is missing, when a
    sequence row cannot be split into identifier and sequence, when the
    number of records differs from ``self.records_per_alignment`` or when
    the rows differ in length.

    As a side effect ``self.ids``, ``self.sequences``,
    ``self.seq_annotation`` and ``self.seq_col_annotation`` are replaced
    with the data of the block just read.
    """
    if hasattr(self, "_header"):
        #Header line stored while reading the previous alignment.
        line = self._header
        del self._header
    else:
        line = self.handle.readline()
    if not line:
        #Empty file - just give up.
        return None
    if line.strip() != '# STOCKHOLM 1.0':
        raise ValueError("Did not find STOCKHOLM header")

    # Note: If this file follows the PFAM conventions, there should be
    # a line containing the number of sequences, e.g. "#=GF SQ 67"
    # We do not check for this - perhaps we should, and verify that
    # if present it agrees with our parsing.
    seqs = {}
    ids = []
    gs = {}
    gr = {}
    gf = {}
    passed_end_alignment = False
    while True:
        line = self.handle.readline()
        if not line:
            break  #end of file
        line = line.strip()  #remove trailing \n
        if line == '# STOCKHOLM 1.0':
            #Start of the next alignment: remember it for the next call.
            self._header = line
            break
        if line == "//":
            #The "//" line indicates the end of the alignment.
            #There may still be more meta-data
            passed_end_alignment = True
            continue
        if line == "":
            continue
        if not line.startswith("#"):
            #Sequence row, format: "<seqname> <sequence>"
            assert not passed_end_alignment
            parts = [x.strip() for x in line.split(" ", 1)]
            if len(parts) != 2:
                #This might be someone attempting to store a zero length
                #sequence?
                raise ValueError(
                    "Could not split line into identifier and sequence:\n"
                    + line)
            seq_id, seq = parts
            if seq_id not in ids:
                ids.append(seq_id)
            seqs[seq_id] = seqs.get(seq_id, '') + seq.replace(".", "-")
            continue

        #Comment line or meta-data; anything shorter than a five
        #character tag (or with an unknown tag) is ignored.
        tag = line[:5]
        if tag == "#=GF ":
            #Generic per-File annotation: #=GF <feature> <free text>
            #Each feature key could be used more than once,
            #so store the entries as a list of strings.
            feature, text = line[5:].strip().split(None, 1)
            gf.setdefault(feature, []).append(text)
        elif tag == '#=GC ':
            #Generic per-Column annotation, exactly 1 char per column
            pass
        elif tag == '#=GS ':
            #Generic per-Sequence annotation:
            #"#=GS <seqname> <feature> <free text>"
            seq_id, feature, text = line[5:].strip().split(None, 2)
            gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
        elif tag == "#=GR ":
            #Generic per-Sequence AND per-Column markup:
            #"#=GR <seqname> <feature> <exactly 1 char per column>"
            #For interlaced sequences the GR data can be split over
            #multiple lines, so append to any previous entry.
            seq_id, feature, text = line[5:].strip().split(None, 2)
            markup = gr.setdefault(seq_id, {})
            markup[feature] = markup.get(feature, "") + text.strip()
            #TODO - Should we check the length matches the alignment
            #length?

    assert len(seqs) <= len(ids)

    self.ids = ids
    self.sequences = seqs
    self.seq_annotation = gs
    self.seq_col_annotation = gr

    if not (ids and seqs):
        return None

    expected = self.records_per_alignment
    if expected is not None and expected != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), expected))

    alignment = Alignment(self.alphabet)
    #TODO - Introduce an annotated alignment class?
    #For now, store the annotation a new private property:
    alignment._annotations = gr

    alignment_length = len(list(seqs.values())[0])
    for seq_id in ids:
        seq = seqs[seq_id]
        if len(seq) != alignment_length:
            raise ValueError(
                "Sequences have different lengths, or repeated identifier")
        name, start, end = self._identifier_split(seq_id)
        alignment.add_sequence(seq_id, seq, start=start, end=end)

        record = alignment.get_all_seqs()[-1]
        assert record.id == seq_id or record.description == seq_id
        record.id = seq_id
        record.name = name
        record.description = seq_id
        #will be overridden by _populate_meta_data if an explicit
        #accession is provided:
        record.annotations["accession"] = name
        self._populate_meta_data(seq_id, record)
    return alignment
def to_alignment(sequences, alphabet=None, strict=True):
    """Returns a multiple sequence alignment (OBSOLETE).

     - sequences - An iterator that returns SeqRecord objects,
                   or simply a list of SeqRecord objects.
                   All the record sequences must be the same length.
     - alphabet  - Optional alphabet.  Strongly recommended.  When omitted
                   it is derived from the alphabets of the records.
     - strict    - Optional, defaults to True.  Should error checking
                   be done?

    Raises ValueError for an invalid alphabet and, when strict, for
    records of differing length or with an incompatible alphabet; raises
    TypeError when strict and a record has None for its sequence.

    Using this function is now discouraged.  Rather than doing this:

    >>> from Bio import SeqIO
    >>> handle = open("Clustalw/protein.aln")
    >>> alignment = SeqIO.to_alignment(SeqIO.parse(handle, "clustal"))
    >>> handle.close()

    You are now encouraged to use Bio.AlignIO instead, e.g.

    >>> from Bio import AlignIO
    >>> handle = open("Clustalw/protein.aln")
    >>> alignment = AlignIO.read(handle, "clustal")
    >>> handle.close()
    """
    #TODO - Move this functionality into the Alignment class instead?
    from Bio.Alphabet import generic_alphabet
    from Bio.Alphabet import _consensus_alphabet
    if alphabet is None:
        # The records are needed twice, so materialise the iterator.
        sequences = list(sequences)
        alphabet = _consensus_alphabet([rec.seq.alphabet for rec in sequences
                                        if rec.seq is not None])

    if not isinstance(alphabet, (Alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet")

    alignment_length = None
    alignment = Alignment(alphabet)
    for record in sequences:
        if strict:
            # This check used to sit after the length and alphabet checks,
            # both of which already dereference record.seq, so it could
            # never produce its message.  It has to come first.
            if record.seq is None:
                raise TypeError("SeqRecord (id=%s) has None for its sequence."
                                % record.id)

            if alignment_length is None:
                alignment_length = len(record.seq)
            elif alignment_length != len(record.seq):
                raise ValueError("Sequences must all be the same length")

            seq_alphabet = record.seq.alphabet
            assert isinstance(seq_alphabet, (Alphabet, AlphabetEncoder)), \
                "Sequence does not have a valid alphabet"

            #TODO - Move this alphabet comparison code into the Alphabet
            #module/class?
            #TODO - Is a normal alphabet "ungapped" by default, or does it
            #just mean undecided?
            if isinstance(seq_alphabet, Alphabet) \
            and isinstance(alphabet, Alphabet):
                #Comparing two non-gapped alphabets
                if not isinstance(seq_alphabet, alphabet.__class__):
                    raise ValueError(
                        "Incompatible sequence alphabet %s for %s alignment"
                        % (seq_alphabet, alphabet))
            elif isinstance(seq_alphabet, AlphabetEncoder) \
            and isinstance(alphabet, Alphabet):
                raise ValueError(
                    "Sequence has a gapped alphabet, alignment does not")
            elif isinstance(seq_alphabet, Alphabet) \
            and isinstance(alphabet, Gapped):
                #Sequence isn't gapped, alignment is.
                if not isinstance(seq_alphabet, alphabet.alphabet.__class__):
                    raise ValueError(
                        "Incompatible sequence alphabet %s for %s alignment"
                        % (seq_alphabet, alphabet))
            else:
                #Comparing two gapped alphabets
                if not isinstance(seq_alphabet, alphabet.__class__):
                    raise ValueError(
                        "Incompatible sequence alphabet %s for %s alignment"
                        % (seq_alphabet, alphabet))
                if seq_alphabet.gap_char != alphabet.gap_char:
                    raise ValueError(
                        "Sequence gap characters != alignment gap char")
            #ToDo, additional checks on the specified alignment...
            #Should we look at the alphabet.contains() method?

        #This is abusing the "private" records list,
        #we should really have a method like add_sequence
        #but which takes SeqRecord objects.  See also Bug 1944
        alignment._records.append(record)
    return alignment
def next(self):
    """Parse the next CLUSTAL alignment from ``self.handle``.

    Returns an Alignment, or None when the handle is exhausted or no
    sequence data follows the header.  Raises ValueError when the
    'CLUSTAL' header is missing, when a row cannot be parsed, when the
    optional trailing letter count disagrees with the sequence, when the
    identifiers of a later block are out of order, when the number of
    records differs from ``self.records_per_alignment`` or when the rows
    end up with different lengths.

    If a further 'CLUSTAL' header is met (concatenated alignments) it is
    stored in ``self._header`` for the next call.
    """
    handle = self.handle
    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None
    if line[:7] != 'CLUSTAL':
        raise ValueError("Did not find CLUSTAL header")

    #There should be two blank lines after the header line.  Stop at end
    #of file too: a header-only (truncated) file used to spin forever
    #here because readline() keeps returning "".
    line = handle.readline()
    while line and line.strip() == "":
        line = handle.readline()

    #If the alignment contains entries with the same sequence
    #identifier (not a good idea - but seems possible), then this
    #dictionary based parser will merge their sequences.  Fix this?
    ids = []
    seqs = []

    #Use the first block to get the sequence identifiers
    while line.strip() != "":
        if line[0] != " ":
            #Sequences identifier...
            fields = line.rstrip().split()

            #We expect there to be two fields, there can be an optional
            #"sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                raise ValueError("Could not parse line:\n%s" % line)

            ids.append(fields[0])
            seqs.append(fields[1])

            if len(fields) == 3:
                #This MAY be an old style file with a letter count...
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(fields[1].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)
        #else: sequence consensus line, ignored
        line = handle.readline()
        if not line:
            break  #end of file

    assert line.strip() == ""

    #Loop over any remaining blocks...
    done = False
    while not done:
        #There should be a blank line between each block.
        #Also want to ignore any consensus line from the
        #previous block.
        while line.strip() == "" or line[0] == " ":
            line = handle.readline()
            if not line:
                break  # end of file
        if not line:
            break  # end of file

        for i, expected_id in enumerate(ids):
            fields = line.rstrip().split()

            #We expect there to be two fields, there can be an optional
            #"sequence number" field containing the letter count.
            if len(fields) < 2 or len(fields) > 3:
                if line[:7] == 'CLUSTAL':
                    #Found concatenated alignment.
                    done = True
                    self._header = line
                    break
                raise ValueError("Could not parse line:\n%s" % line)

            if fields[0] != expected_id:
                raise ValueError(
                    "Identifiers out of order? Got '%s' but expected '%s'"
                    % (fields[0], expected_id))

            #Append the sequence
            seqs[i] += fields[1]

            if len(fields) == 3:
                #This MAY be an old style file with a letter count...
                #(the count is cumulative over the blocks read so far)
                try:
                    letters = int(fields[2])
                except ValueError:
                    raise ValueError(
                        "Could not parse line, bad sequence number:\n%s"
                        % line)
                if len(seqs[i].replace("-", "")) != letters:
                    raise ValueError(
                        "Could not parse line, invalid sequence number:\n%s"
                        % line)

            #Read in the next line
            line = handle.readline()

    assert len(ids) == len(seqs)
    if len(seqs) == 0 or len(seqs[0]) == 0:
        return None

    if self.records_per_alignment is not None \
    and self.records_per_alignment != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), self.records_per_alignment))

    alignment = Alignment(self.alphabet)
    alignment_length = len(seqs[0])
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != alignment_length:
            raise ValueError(
                "Error parsing alignment - sequences of different length?")
        alignment.add_sequence(seq_id, seq)
    return alignment
def __init__(self):
    """Initialise the base Alignment with a gapped unambiguous DNA alphabet.

    The gap character is '-'.
    """
    gapped_dna = Gapped(IUPAC.unambiguous_dna, '-')
    Alignment.__init__(self, gapped_dna)
def next(self):
    """Parse the next alignment block from ``self.handle``.

    A block starts at a '#=======...' line, followed by '#' header lines
    giving 'Aligned_sequences' (with the numbered identifiers) and
    'Length', followed by the sequence rows.  Returns an Alignment, or
    None when no further block header is found.

    Raises SyntaxError when the header lacks the number or the length of
    the sequences, or when a parsed row has a different length; raises
    ValueError when the number of records differs from
    ``self.records_per_alignment``.  Malformed rows trip assertions.
    """
    block_marker = "#======================================="
    end_marker = "#---------------------------------------"

    handle = self.handle
    if hasattr(self, "_header"):
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    else:
        line = handle.readline()
    if not line:
        return None

    #Skip forward to the start of the block header.
    while line.rstrip() != block_marker:
        line = handle.readline()
        if not line:
            return None

    length_of_seqs = None
    number_of_seqs = None
    ids = []

    while line[0] == "#":
        #Read in the rest of this alignment header,
        #try and discover the number of records expected
        #and their length
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers...
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        if key == "length":
            length_of_seqs = int(parts[1].strip())

        #And read in another line...
        line = handle.readline()

    if number_of_seqs is None:
        raise SyntaxError("Number of sequences missing!")
    if length_of_seqs is None:
        raise SyntaxError("Length of sequences missing!")

    expected = self.records_per_alignment
    if expected is not None and expected != number_of_seqs:
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (number_of_seqs, expected))

    seqs = [""] * len(ids)
    index = 0  #row of the alignment the next sequence line belongs to

    #Parse the seqs
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                #identifier, seq start position, seq, seq end position
                #(an aligned seq is broken up into multiple lines)
                seq_id, start = id_start
                seq, end = seq_end

                assert 0 <= index and index < number_of_seqs, \
                    "Expected index %i in range [0,%i)" \
                    % (index, number_of_seqs)
                #The identifier is truncated...
                assert ids[index].startswith(seq_id)

                #Check the start...
                assert int(start) - 1 == len(seqs[index].replace("-", "")), \
                    "Found %i chars so far for %s, file says start %i:\n%s" \
                    % (len(seqs[index]), seq_id, int(start), seqs[index])

                seqs[index] += seq

                #Check the end ...
                assert int(end) == len(seqs[index].replace("-", "")), \
                    "Found %i chars so far for %s, file says end %i:\n%s" \
                    % (len(seqs[index]), seq_id, int(end), seqs[index])

                index = (index + 1) if index + 1 < number_of_seqs else 0
            #else: just a start value, this is just alignment
            #annotation (?) - skipped
        elif line.strip() != "":
            #Neither a sequence row nor a spacer.
            print(line)
            assert False

        line = handle.readline()
        if line.rstrip() in (end_marker, block_marker):
            #End of alignment
            self._header = line
            break

    assert index == 0

    if expected is not None and expected != len(ids):
        raise ValueError(
            "Found %i records in this alignment, told to expect %i"
            % (len(ids), expected))

    alignment = Alignment(self.alphabet)
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            raise SyntaxError(
                "Error parsing alignment - sequences of different length?")
        alignment.add_sequence(seq_id, seq)
    return alignment
def next(self) :
    """Reads from the handle to construct and return the next alignment.

    This returns the pairwise alignment of query and match/library
    sequences as an Alignment object containing two rows.

    Returns None at the end of the file, or when a line starting '#-'
    (treated as the end of the alignments) is read.  Raises ValueError
    when an expected '>' / '>>' line is malformed or missing (including a
    file truncated inside the query sequence), or when the recorded
    sw_overlap disagrees with the parsed query alignment length.
    """
    handle = self.handle

    try :
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        #Debug trace kept from the original implementation.
        print(self._header.strip() + ' --> self_header')
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith('#-') :
        #Reached the end of the alignments, no need to read the footer...
        return None
    if line.startswith("##") :
        #Skip the file header before the alignments.
        line = self._skip_file_header(line)
        assert line.startswith('#'), line
        while not line.startswith('#=') :
            line = self.handle.readline()
            if not line :
                #End of file without any query header; without this
                #check the loop would never terminate.
                return None
    if line.startswith('#='):
        #Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        #Read in the query header
        line = self._parse_query_header(line)
    if not line :
        #End of file
        return None

    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    match_descr = ""
    alignment_annotation = {}

    #This should be followed by the target match numbering line, then
    #more tags, e.g.
    #
    #    >>#2
    #    ; sw_score: 41.0
    #    ; sw_ident: 0.846
    #    ; sw_overlap: 13
    if not line.startswith(">>") and not line.startswith(">>>") :
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()

    #Handle the following "alignment hit" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    #Then we have the alignment numbers and sequence for the query, e.g.
    #
    #    >gi|10955265| ..
    #    ; sq_len: 346
    #    ; sq_offset: 1
    #    ; sq_type: p
    #    ; al_start: 197
    #    ; al_stop: 238
    #    ; al_display_start: 167
    #    DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #    QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #    GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split()[0])

    #Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    #Now should have the aligned query sequence (with leading flanking
    #region).  Stop at end of file as well, so that a truncated file is
    #reported by the check below instead of looping forever.
    while line and not line.startswith(">") :
        query_seq_parts.append(line.strip())
        line = handle.readline()

    #Handle the following "match alignment" data, e.g.
    #
    #    >gi|152973545|ref|YP_001338596.1| ..
    #    ; sq_len: 242
    #    ; sq_type: p
    #    ; al_start: 52
    #    ; al_stop: 94
    #    ; al_display_start: 22
    #    IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #    RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #    QDFAFTRKMRREARQVEQSW
    #Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    match_descr = line[1:].split()[0] + match_descr

    #Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    #Now should have the aligned match sequence with flanking region...
    #The sequence ends at the next '>' / '>>>' line, at a '#' line, or at
    #the end of the file (previously an endless loop).
    while line and not (line.startswith(">") or ">>>" in line
                        or line.startswith('#')):
        match_seq_parts.append(line.strip())
        line = handle.readline()
    if line.startswith('>') or '>>>' in line:
        self._header = line

    #We built a list of strings and then joined them because
    #its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    #Note, query_seq and match_seq will usually be of different lengths,
    #apparently because in the m10 format leading gaps are added but not
    #trailing gaps!

    #Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

    #The "sq_offset" values can be specified with the -X command line
    #option.  They appear to just shift the origin used in the
    #calculation of the coordinates.  Note that until some point in the
    #v35 series, FASTA always recorded one for the query offset, and
    #omitted the match offset.
    #TODO - Work out how exactly the use of -X offsets changes things;
    #offsets other than "1" are currently accepted without any handling.

    #The two aligned regions are deliberately not required to have the
    #same length: this is not useful when using stretcher.

    if "sw_overlap" in alignment_annotation :
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
            raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    #TODO - Look at the "sq_type" to assign a sensible alphabet?
    alignment = Alignment(self.alphabet)

    #TODO - Introduce an annotated alignment class?
    #For now, store the annotation a new private property:
    alignment._annotations = {}

    #Want to record both the query header tags, and the alignment tags
    #(alignment tags win when a key occurs in both).
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    #TODO - Once the alignment object gets an append method, use it.
    #(i.e. an add SeqRecord method)
    alignment.add_sequence(self._query_descr, query_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == self._query_descr or record.description == self._query_descr
    assert record.seq.tostring() == query_align_seq
    record.id = self._query_descr.split()[0].strip(",")
    record.name = "query"
    record.annotations["original_length"] = int(query_annotation["sq_len"])
    #Keep every query tag on the record as well.
    for key in query_annotation :
        record.annotations[key] = query_annotation[key]

    alignment.add_sequence(match_descr, match_align_seq)
    record = alignment.get_all_seqs()[-1]
    assert record.id == match_descr or record.description == match_descr
    assert record.seq.tostring() == match_align_seq
    record.id = match_descr.split()[0].strip(",")
    record.name = "match"
    record.annotations["original_length"] = int(match_annotation["sq_len"])
    #Keep every match tag on the record.  This used to iterate over the
    #QUERY keys, which raised KeyError whenever the match section lacked
    #one of them (e.g. sq_offset) and dropped match-only tags.
    for key in match_annotation :
        record.annotations[key] = match_annotation[key]

    return alignment
class Align(object):
    """Align the sequences of one input file and trim the result.

    ``run_alignment`` (or ``_read``) fills ``self.alignment``;
    ``trim_alignment`` fills ``self.trimmed_alignment`` and
    ``trim_ambiguous_bases`` fills ``self.perfect_trimmed_alignment``.
    ``get_probe_location`` stores the probe columns in ``self.ploc``.
    """

    def __init__(self, input):
        self.input = input
        self.alignment = None
        self.trimmed_alignment = None
        self.perfect_trimmed_alignment = None
        # (start, end) columns of the probe; set by get_probe_location()
        self.ploc = None

    def _clean(self, outtemp):
        """Delete the temporary alignment file AND the input file."""
        # cleanup temp file
        os.remove(outtemp)
        # cleanup input file
        os.remove(self.input)

    def _find_ends(self, forward=True):
        """determine the first (or last) position where all reads in an
        alignment start/stop matching

        Scans the columns left-to-right (``forward``) or right-to-left and
        returns the index of the first column without a '-'.  When every
        column contains a gap the last index visited is returned.
        """
        columns = range(self.alignment.get_alignment_length())
        if not forward:
            columns = reversed(columns)
        for col in columns:
            if '-' not in self.alignment.get_column(col):
                break
        return col

    def _base_checker(self, bases, sequence, loc):
        """ensure that any trimming that occurs does not start beyond the
        end of the sequence being trimmed

        ``loc`` is either a single position (an int, or a sequence holding
        one int) or a ``(start, end)`` pair.  Returns True when at least
        ``bases`` positions exist before ``start`` and from ``end`` on,
        otherwise False.
        """
        # deal with the case where we just want to measure out from the
        # middle of a particular sequence.  An int has no len(), which
        # made the mid-point call from trim_alignment() raise TypeError.
        if isinstance(loc, int):
            loc = (loc, loc)
        elif len(loc) == 1:
            loc = (loc[0], loc[0])
        room_before = len(sequence.seq[:loc[0]])
        room_after = len(sequence.seq[loc[1]:])
        return bases <= room_before and bases <= room_after

    def _record_formatter(self, temp, sequence=None):
        """return a string formatted as a biopython sequence record

        ``temp`` is wrapped in a SeqRecord.  When ``sequence`` (the record
        the data was cut from) is given, its id, name and description are
        copied over.  The original code read ``sequence`` without ever
        receiving it, which raised NameError on every call.
        """
        temp_record = SeqRecord(temp)
        if sequence is not None:
            temp_record.id = sequence.id
            temp_record.name = sequence.name
            temp_record.description = sequence.description
        return temp_record

    def _alignment_summary(self, alignment):
        """return summary data for an alignment object using the AlignInfo
        class from BioPython

        Returns ``(summary, consensus)`` where consensus comes from
        ``summary.dumb_consensus()``.
        """
        summary = AlignInfo.SummaryInfo(alignment)
        consensus = summary.dumb_consensus()
        return summary, consensus

    def _read(self, format):
        """read an alignment from the CLI - largely for testing purposes"""
        # AlignIO.read() consumes the handle, so it can be closed at once.
        with open(self.input, 'rU') as handle:
            self.alignment = AlignIO.read(handle, format)

    def get_probe_location(self):
        '''Pull the probe sequence from an alignment object and determine
        its position within the read

        Stores ``(start, end)`` in ``self.ploc``: the column after the
        leading gaps and the column where the trailing gaps begin.
        Assumes the alignment holds a record whose id is 'probe'.
        '''
        # probe at bottom => reverse order
        for record in self.alignment[::-1]:
            if record.id == 'probe':
                start = re.search('^-*', str(record.seq))
                end = re.search('-*$', str(record.seq))
                # should be first record
                break
        # ooh, this seems so very backwards
        self.ploc = (start.end(), end.start(),)

    def run_alignment(self, clean = True, consensus = True):
        """Align, as originally written gets bogged down. Add communicate
        method and move away from pipes for holding information (this has
        always been problematic for me with multiprocessing).  Move to
        tempfile-based output.

        Runs MUSCLE on ``self.input``, reads the result into
        ``self.alignment``, optionally builds the consensus and optionally
        removes the temporary file and the input file.
        """
        # create results file
        fd, outtemp = tempfile.mkstemp(suffix='.align')
        os.close(fd)
        # run MUSCLE on the temp file
        cline = MuscleCommandline(input=self.input, out=outtemp)
        # NOTE(review): the command line is executed through the shell;
        # self.input must come from a trusted source.
        stdout, stderr = subprocess.Popen(str(cline),
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE,
                shell=True).communicate(None)
        with open(outtemp, 'rU') as handle:
            self.alignment = AlignIO.read(handle, "fasta",
                    alphabet = Gapped(IUPAC.unambiguous_dna, "-"))
        # build a dumb consensus
        if consensus:
            self.alignment_summary, self.alignment_consensus = \
                self._alignment_summary(self.alignment)
        # cleanup temp files
        if clean:
            self._clean(outtemp)

    def running_average(self, window_size, threshold):
        """Return ``(start_clip, end_clip)`` from a windowed identity scan.

        Every column scores 1 when all rows agree and 0 otherwise.  The
        mean score of ``window_size`` columns is computed from both ends
        and each clip is placed where the mean, rounded to one decimal,
        first reaches ``threshold``.

        NOTE(review): when no window from the end reaches the threshold
        ``end_clip`` is never assigned and the return statement raises
        UnboundLocalError (likewise ``start_clip`` when the alignment is
        not longer than the window).
        """
        # iterate across the columns of the alignment and determine presence
        # or absence of base-identity in the column
        differences = []
        for column in range(self.alignment.get_alignment_length()):
            column_values = self.alignment.get_column(column)
            # get the count of different bases in a column (converting
            # it to a set gets only the unique values)
            if len(set(column_values)) > 1:
                differences.append(0)
            else:
                differences.append(1)
        total = len(differences)
        # compute the running average from the start => end of the sequence
        forward_average = []
        for start in range(total):
            end = start + window_size
            if end < total:
                window = differences[start:end]
                forward_average.append(sum(window) / float(len(window)))
        # compute the running average from the end => start of the sequence
        # we do this, because, otherwise, this end would be neglected.
        reverse_average = []
        for end in reversed(range(-total, 0)):
            start = end - window_size
            if start > -total:
                window = differences[start:end]
                reverse_average.append(sum(window) / float(len(window)))
        # find where each running average first reaches some threshold
        # identity over the run span chosen.
        for start_clip, avg in enumerate(forward_average):
            if round(avg, 1) >= float(threshold):
                break
        for temp_end_clip, avg in enumerate(reverse_average):
            if round(avg, 1) >= float(threshold):
                end_clip = total - temp_end_clip
                break
        return start_clip, end_clip

    def trim_alignment(self, method = 'edges', remove_probe = None,
            bases = None, consensus = True, window_size = 20,
            threshold = 0.5):
        """Trim the alignment

        - method 'edges' / 'running' (without remove_probe): keep the
          columns between the ends found by ``_find_ends`` or
          ``running_average``.
        - method 'static' with ``bases`` (without remove_probe): keep
          ``bases`` columns on either side of the probe when its location
          is known, otherwise on either side of the middle of the record.
        - remove_probe: cut the probe columns out; with method 'static'
          and ``bases`` only ``bases`` columns of each flank are kept.

        The result is stored in ``self.trimmed_alignment``.  It is None
        when a record is too short for the requested ``bases``; the
        consensus attributes are then None as well.  Raises ValueError
        when remove_probe is requested before the probe was located.
        """
        if method == 'edges':
            # find edges of the alignment
            start = self._find_ends(forward=True)
            end = self._find_ends(forward=False)
        elif method == 'running':
            start, end = self.running_average(window_size, threshold)

        ploc = getattr(self, 'ploc', None)
        if remove_probe and ploc is None:
            raise ValueError("probe location unknown; call "
                             "get_probe_location() before remove_probe")

        # create a new alignment object to hold our alignment
        trimmed = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
        for sequence in self.alignment:
            record = None
            too_short = False
            if method in ('edges', 'running') and not remove_probe:
                # biopython only lets us add a name and str(sequence) to
                # an Alignment, so record slices are appended to the
                # private list instead
                record = sequence[start:end]
            elif method == 'static' and not remove_probe and bases:
                # The probe-aware case used to sit behind the mid-point
                # case and was unreachable; prefer it when available.
                if ploc is not None:
                    left, right = ploc
                else:
                    # middle of the alignment - not the probe region
                    left = right = len(sequence) // 2
                if self._base_checker(bases, sequence, (left, right)):
                    record = sequence[left - bases:right + bases]
                else:
                    too_short = True
            elif method == 'static' and remove_probe and bases:
                # Previously shadowed by the generic remove_probe branch.
                if self._base_checker(bases, sequence, ploc):
                    temp = sequence.seq[ploc[0] - bases:ploc[0]] + \
                        sequence.seq[ploc[1]:ploc[1] + bases]
                    record = self._record_formatter(temp, sequence)
                else:
                    too_short = True
            elif remove_probe:
                # we have to drop to sequence level to add sequence slices
                # where we basically slice around the probes location
                temp = sequence.seq[:ploc[0]] + sequence.seq[ploc[1]:]
                record = self._record_formatter(temp, sequence)

            if too_short:
                # Stop here: carrying on would dereference None.
                trimmed = None
                break
            if record is not None:
                trimmed._records.append(record)
        self.trimmed_alignment = trimmed

        # build a dumb consensus
        if consensus:
            if trimmed is None:
                self.trimmed_alignment_summary = None
                self.trimmed_alignment_consensus = None
            else:
                self.trimmed_alignment_summary, \
                    self.trimmed_alignment_consensus = \
                    self._alignment_summary(trimmed)

    def trim_ambiguous_bases(self):
        """snip ambiguous bases from a trimmed_alignment

        Finds every column containing 'N' and keeps the longest run of
        columns without one (the first such run on ties).  Without any
        ambiguous column the trimmed alignment is reused as is; when no
        clean column remains ``self.perfect_trimmed_alignment`` is None.
        """
        length = self.trimmed_alignment.get_alignment_length()
        ambiguous_bases = [
            column for column in range(length)
            if 'N' in self.trimmed_alignment.get_column(column)
        ]
        if not ambiguous_bases:
            self.perfect_trimmed_alignment = self.trimmed_alignment
            return

        # Sentinels just outside the alignment so that the chunks before
        # the first and after the last ambiguous column are considered in
        # full (sentinels 0 and length - 1 lost the outermost column).
        bounds = [-1] + ambiguous_bases + [length]
        best_span = None
        best_length = 0
        for left, right in zip(bounds, bounds[1:]):
            clean = right - left - 1
            if clean > best_length:
                best_length = clean
                best_span = (left + 1, right)

        # make sure we catch cases where there is no best block
        if best_span is None:
            self.perfect_trimmed_alignment = None
            return

        # create a new alignment object to hold our alignment
        perfect = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for sequence in self.trimmed_alignment:
            perfect._records.append(sequence[best_span[0]:best_span[1]])
        self.perfect_trimmed_alignment = perfect
print(consensus)
consensus = summary.gap_consensus(ambiguous="N")
print(consensus)
print("")
print(summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                        axis_seq=consensus))
print("")
# The alphabet is generic and declares no gap character, so both the
# expected frequencies and the characters to ignore are passed explicitly.
print(summary.information_content(e_freq_table=expected,
                                  chars_to_ignore=['-']))
print("")

print("Trying a protein sequence with gaps and stops")

# Protein alphabet with "-" as gap character and "*" as stop symbol.
alpha = Alphabet.HasStopCodon(
    Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
a = Alignment(alpha)
for row_id, row_text in (
        ("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-"),
        ("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*"),
        ("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")):
    a.add_sequence(row_id, row_text)
print(a)
print("=" * a.get_alignment_length())

s = SummaryInfo(a)
# Both consensus flavours, with "X" for ambiguous columns.
c = s.dumb_consensus(ambiguous="X")
print(c)
c = s.gap_consensus(ambiguous="X")
print(c)
print("")
print(s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c))

print(s.information_content(chars_to_ignore=['-', '*']))
def next(self):
    """Read from the handle and return the next pairwise alignment.

    The query and match/library sequences are returned as an Alignment
    object containing two rows, named "query" and "match".  The tags
    parsed for each sequence are copied into that record's annotations,
    and the query header tags plus the alignment tags are stored on the
    private ``alignment._annotations`` dictionary.

    Returns None when the input is exhausted, when a line starting "#-"
    is reached, or when the file ends while the match sequence is being
    read (the partially read alignment is discarded in that case).

    Raises ValueError if an expected '>>' or '>' line is missing, if the
    file ends inside the query sequence, or if the reported sw_overlap
    disagrees with the length of the aligned query sequence.

    NOTE(review): the nesting of the header-skipping section below was
    reconstructed from whitespace-collapsed source - confirm it against
    the upstream file.
    """
    handle = self.handle

    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith('#-'):
        #Reached the end of the alignments, no need to read the footer...
        return None

    if line.startswith("##"):
        #Skip the file header before the alignments.
        line = self._skip_file_header(line)
        assert line.startswith('#'), line
        while not line.startswith('#='):
            line = handle.readline()
            if not line:
                #File ended before any alignment header was found;
                #without this check the loop would never terminate.
                return None
    if line.startswith('#='):
        #Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        #Read in the query header
        line = self._parse_query_header(line)
    if not line:
        #End of file
        return None

    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    alignment_annotation = {}

    #This should be followed by the target match numbering line, then
    #more tags, e.g.
    #
    #  >>#2
    #  ; sw_score: 41.0
    #  ; sw_ident: 0.846
    #  ; sw_overlap: 13
    #
    #The test must use "or": a line starting '>>>' also starts '>>'.
    if not line.startswith(">>") or line.startswith(">>>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()

    #Handle the following "alignment hit" tagged data.
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    #Then we have the alignment numbers and sequence for the query, e.g.
    #
    #  >gi|10955265| ..
    #  ; sq_len: 346
    #  ; sq_offset: 1
    #  ; sq_type: p
    #  ; al_start: 197
    #  ; al_stop: 238
    #  ; al_display_start: 167
    #  DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #  QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #  GEYFTENKPKYIIREIHQET
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split()[0])

    #Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    #Now should have the aligned query sequence (with leading flanking region)
    while not line.startswith(">"):
        if not line:
            #readline() keeps returning "" at end of file, so without
            #this check a truncated file would loop forever.
            raise ValueError("Premature end of file in query sequence")
        query_seq_parts.append(line.strip())
        line = handle.readline()

    #Handle the following "match alignment" data, e.g.
    #
    #  >gi|152973545|ref|YP_001338596.1| ..
    #  ; sq_len: 242
    #  ; sq_type: p
    #  ; al_start: 52
    #  ; al_stop: 94
    #  ; al_display_start: 22
    #  IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #  RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #  QDFAFTRKMRREARQVEQSW
    #
    #Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    #The '>>' line only carried a number here, so prefix the real identifier.
    match_descr = line[1:].split()[0] + match_descr

    #Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    #Now should have the aligned match sequence with flanking region...
    while not (line.startswith(">") or ">>>" in line) and not line.startswith('#'):
        match_seq_parts.append(line.strip())
        line = handle.readline()
        if not line:
            #End of file
            return None
    if line.startswith('>') or '>>>' in line:
        #Start of the next alignment, keep it for the next call.
        self._header = line

    #We built a list of strings and then joined them because
    #its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    #Note, query_seq and match_seq will usually be of different lengths, apparently
    #because in the m10 format leading gaps are added but not trailing gaps!

    #Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)

    #The "sq_offset" values can be specified with the -X command line option.
    #They appear to just shift the origin used in the calculation of the
    #coordinates, and are deliberately ignored here.
    #TODO - Work out how exactly the use of -X offsets changes things.

    #The two aligned sequences are deliberately not required to have the
    #same length (that check is not useful when using stretcher).
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    #TODO - Look at the "sq_type" to assign a sensible alphabet?
    alignment = Alignment(self.alphabet)

    #TODO - Introduce an annotated alignment class?
    #For now, store the annotation a new private property.
    #Want to record both the query header tags, and the alignment tags.
    alignment._annotations = {}
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    def add_row(descr, aligned_seq, name, tags):
        """Append one sequence to the alignment and annotate its record."""
        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        alignment.add_sequence(descr, aligned_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == descr or record.description == descr
        assert record.seq.tostring() == aligned_seq
        record.id = descr.split()[0].strip(",")
        record.name = name
        record.annotations["original_length"] = int(tags["sq_len"])
        #Local addition: expose every parsed tag of THIS sequence.
        record.annotations.update(tags)

    add_row(self._query_descr, query_align_seq, "query", query_annotation)
    add_row(match_descr, match_align_seq, "match", match_annotation)
    return alignment
def _back_translate(aligned_protein, dna):
    """Thread ungapped DNA onto a gapped protein row, one codon per residue."""
    codons = []
    offset = 0
    for residue in aligned_protein:
        if residue == '-':
            codons.append('---')
        else:
            codons.append(dna[offset:offset + 3])
            offset += 3
    return ''.join(codons)


def _alignment_file_name(dnaFileName):
    """Return the output name: the input name up to its first dot + '_aln.phy'."""
    if not dnaFileName:
        return 'out_aln.phy'
    import os
    # Only look for the dot in the file name itself, so that a path such as
    # "./genes.fasta" gives "./genes_aln.phy" rather than "_aln.phy".
    directory, baseName = os.path.split(dnaFileName)
    return os.path.join(directory, baseName.split('.')[0] + '_aln.phy')


def main():
    """Codon-align a FASTA file of DNA sequences via a protein alignment.

    The DNA sequences are read from the file named by the first command
    line argument (fileinput's default input when none is given),
    translated, aligned as proteins by piping them through the MUSCLE
    command line, and the protein alignment is converted back into a DNA
    alignment which is written in PHYLIP format to "<input stem>_aln.phy"
    ("out_aln.phy" when no file name was given).
    """
    # Configuration
    # Select the desired NCBI translation table
    translationTable = 11

    # Open the DNA sequence file and read the fasta sequences into a dictionary
    if len(argv) > 1:
        dnaFileName = argv[1]
    else:
        dnaFileName = None
    dnaSeqFile = fileinput.input(dnaFileName)
    try:
        dnaSeqDict = SeqIO.to_dict(SeqIO.parse(dnaSeqFile, "fasta"))
    finally:
        dnaSeqFile.close()

    # Translate the sequences, replacing stop codons with X (unknown aa)
    # so muscle doesn't drop them
    aaSeqRecords = []
    for key, dnaRecord in dnaSeqDict.items():
        protein = str(dnaRecord.seq.translate(table=translationTable))
        aaSeqRecords.append(SeqRecord(Seq(protein.replace('*', 'X')), id=key))

    # Align the aa sequences
    commandLine = str(MuscleCommandline(seqtype='protein'))
    # don't pipe stderr or muscle hangs
    childProcess = subprocess.Popen(commandLine,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    shell=(sys.platform != "win32"))
    try:
        SeqIO.write(aaSeqRecords, childProcess.stdin, "fasta")
        childProcess.stdin.close()
        aaAlignment = AlignIO.read(childProcess.stdout, "fasta")
    finally:
        # Always release the pipes and reap the child, even on failure;
        # stdin must be closed before waiting or the child never finishes.
        if not childProcess.stdin.closed:
            childProcess.stdin.close()
        childProcess.stdout.close()
        childProcess.wait()

    # Convert the aa alignment into a dna alignment
    dnaAlignment = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for taxon in aaAlignment:
        dnaSeq = _back_translate(str(taxon.seq), str(dnaSeqDict[taxon.id].seq))
        # As we add the sequences to the alignment remove the gene name from
        # the sequence id so the taxon names match the PAML constraint tree
        dnaAlignment.add_sequence(taxon.id.split('_')[0], dnaSeq)

    outFile = open(_alignment_file_name(dnaFileName), 'w+')
    try:
        AlignIO.write([dnaAlignment], outFile, "phylip")
        # NOTE: the ' I' (interleaved) flag that PAML requires is deliberately
        # not appended to the PHYLIP header here: with it in place the file
        # can no longer be opened by Biopython-based scripts (for manual
        # editing etc).  Per the original author's note, pamlize.py adds it
        # right before running PAML.
    finally:
        outFile.close()
def next(self):
    """Read from the handle and return the next pairwise alignment.

    The query and match/library sequences are returned as an Alignment
    object containing two rows, named "query" and "match".  The query
    header tags and the alignment tags are stored on the private
    ``alignment._annotations`` dictionary.

    Returns None at the end of the input or once the ">>><<<" marker
    that ends the alignments is reached.

    Raises ValueError if an expected '>>' or '>' line is missing, if the
    file ends in the middle of an alignment, if the two aligned
    sequences differ in length, or if the reported sw_overlap disagrees
    with the length of the aligned query sequence.
    """
    handle = self.handle

    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    if line.startswith("#"):
        #Skip the file header before the alignments.
        line = self._skip_file_header(line)
    while ">>>" in line and not line.startswith(">>>"):
        #Moved onto the next query sequence!
        self._query_descr = ""
        self._query_header_annotation = {}
        #Read in the query header
        line = self._parse_query_header(line)
        #Now should be some alignments, but if not we move onto the next query
    if not line:
        #End of file
        return None
    if ">>><<<" in line:
        #Reached the end of the alignments, no need to read the footer...
        return None

    #Should start >>... and not >>>...
    assert line.startswith(">>") and not line.startswith(">>>"), line

    query_seq_parts, match_seq_parts = [], []
    query_annotation, match_annotation = {}, {}
    alignment_annotation = {}

    def at_section_end(text):
        """True on a tag line, a '>' line or a '>>>' query header."""
        return text.startswith("; ") or text.startswith(">") or ">>>" in text

    #This should be followed by the target match ID line, then more tags.
    #e.g.
    #
    #  >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
    #  ; fa_frame: f
    #  ; fa_initn:  52
    #  ; fa_init1:  52
    #  ; fa_opt:  70
    #  ; fa_z-score: 105.5
    #  ; fa_bits: 27.5
    #  ; fa_expect:  0.082
    #  ; sw_score: 70
    #  ; sw_ident: 0.279
    #  ; sw_sim: 0.651
    #  ; sw_overlap: 43
    if not line.startswith(">>") or line.startswith(">>>"):
        raise ValueError("Expected target line starting '>>'")
    match_descr = line[2:].strip()

    #Handle the following "alignment hit" tagged data.
    line = handle.readline()
    line = self._parse_tag_section(line, alignment_annotation)
    assert not line.startswith("; ")

    #Then we have the alignment numbers and sequence for the query, e.g.
    #
    #  >gi|10955265| ..
    #  ; sq_len: 346
    #  ; sq_offset: 1
    #  ; sq_type: p
    #  ; al_start: 197
    #  ; al_stop: 238
    #  ; al_display_start: 167
    #  DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
    #  QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
    #  GEYFTENKPKYIIREIHQET
    #
    #(startswith is used rather than line[0] so that an empty line at
    #end of file gives this ValueError instead of an IndexError.)
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..'")
    assert self._query_descr.startswith(line[1:].split(None,1)[0])

    #Handle the following "query alignment" tagged data
    line = handle.readline()
    line = self._parse_tag_section(line, query_annotation)
    assert not line.startswith("; ")

    #Now should have the aligned query sequence (with leading flanking region)
    while not line.startswith(">"):
        if not line:
            raise ValueError("Premature end of file in query sequence")
        query_seq_parts.append(line.strip())
        line = handle.readline()

    #Handle the following "match alignment" data, e.g.
    #
    #  >gi|152973545|ref|YP_001338596.1| ..
    #  ; sq_len: 242
    #  ; sq_type: p
    #  ; al_start: 52
    #  ; al_stop: 94
    #  ; al_display_start: 22
    #  IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
    #  RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
    #  QDFAFTRKMRREARQVEQSW
    #
    #Match identifier
    if not (line.startswith(">") and line.strip().endswith("..")):
        raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
    assert match_descr.startswith(line[1:].split(None,1)[0])

    #Tagged data,
    line = handle.readline()
    line = self._parse_tag_section(line, match_annotation)
    assert not line.startswith("; ")

    #Now should have the aligned match sequence with flanking region...
    while not at_section_end(line):
        if not line:
            raise ValueError("Premature end of file in match sequence")
        match_seq_parts.append(line.strip())
        line = handle.readline()

    #...and after that, since FASTA 35.4.1 there can be a consensus, e.g.
    #
    #  ; al_cons:
    #  .::. : :. ---.  :: :. . :  ..-:::-:  :.:  ..:...:
    #  etc
    if line.startswith("; "):
        assert line.strip() == "; al_cons:"
        #The consensus is not used (yet), just read past it.  If we do
        #anything with it in future, must remove any flanking region.
        line = handle.readline()
        while not at_section_end(line):
            if not line:
                raise ValueError("Premature end of file in alignment consensus")
            line = handle.readline()
        assert not line.startswith("; ")
    assert line.startswith(">") or ">>>" in line
    #Start of the next alignment or query, keep it for the next call.
    self._header = line

    #We built a list of strings and then joined them because
    #its faster than appending to a string.
    query_seq = "".join(query_seq_parts)
    match_seq = "".join(match_seq_parts)
    del query_seq_parts, match_seq_parts
    #Note, query_seq and match_seq will usually be of different lengths, apparently
    #because in the m10 format leading gaps are added but not trailing gaps!

    #Remove the flanking regions,
    query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
    match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
    #How can we do this for the (optional) consensus?

    #The "sq_offset" values can be specified with the -X command line option.
    #They appear to just shift the origin used in the calculation of the coordinates.

    if len(query_align_seq) != len(match_align_seq):
        raise ValueError("Problem parsing the alignment sequence coordinates, "
                         "following should be the same length but are not:\n"
                         "%s - len %i\n%s - len %i" % (query_align_seq,
                                                       len(query_align_seq),
                                                       match_align_seq,
                                                       len(match_align_seq)))
    if "sw_overlap" in alignment_annotation:
        if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
            raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
                             % (alignment_annotation["sw_overlap"],
                                len(query_align_seq)))

    #TODO - Look at the "sq_type" to assign a sensible alphabet?
    alphabet = self.alphabet
    alignment = Alignment(alphabet)

    #TODO - Introduce an annotated alignment class?
    #For now, store the annotation a new private property.
    #Want to record both the query header tags, and the alignment tags.
    alignment._annotations = {}
    alignment._annotations.update(self._query_header_annotation)
    alignment._annotations.update(alignment_annotation)

    def add_row(descr, aligned_seq, name, tags):
        """Append one sequence to the alignment and set up its record."""
        #TODO - Once the alignment object gets an append method, use it.
        #(i.e. an add SeqRecord method)
        alignment.add_sequence(descr, aligned_seq)
        record = alignment.get_all_seqs()[-1]
        assert record.id == descr or record.description == descr
        record.id = descr.split(None,1)[0].strip(",")
        record.name = name
        record.annotations["original_length"] = int(tags["sq_len"])
        #This is still a very crude way of dealing with the alphabet:
        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in tags:
            if tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in aligned_seq:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

    add_row(self._query_descr, query_align_seq, "query", query_annotation)
    add_row(match_descr, match_align_seq, "match", match_annotation)
    return alignment
def next(self):
    """Read from the handle and return the next EMBOSS alignment.

    Lines are skipped up to the next "#=======" header line, the header
    is scanned for the "Aligned_sequences" count (followed by the record
    identifiers) and the "Length" value, and the sequence lines are then
    collected until the next "#-------" or "#=======" line, which is
    saved for the following call.

    Returns an Alignment, or None when no further header is found.

    Raises ValueError if the number or the length of the sequences is
    missing from the header, if the number of records differs from
    self.records_per_alignment (when that is not None), if a line in
    the sequence section cannot be interpreted, or if a parsed sequence
    does not have the length given in the header.
    """
    handle = self.handle

    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None

    while line.rstrip() != "#=======================================":
        line = handle.readline()
        if not line:
            return None

    length_of_seqs = None
    number_of_seqs = None
    ids = []

    #startswith (rather than line[0]) so that a file which ends inside
    #the header falls through to the "missing" errors below instead of
    #raising an IndexError.
    while line.startswith("#"):
        #Read in the rest of this alignment header,
        #try and discover the number of records expected
        #and their length
        parts = line[1:].split(":", 1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers...
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":", 1)
                assert i + 1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        elif key == "length":
            length_of_seqs = int(parts[1].strip())

        #And read in another line...
        line = handle.readline()

    if number_of_seqs is None:
        raise ValueError("Number of sequences missing!")
    if length_of_seqs is None:
        raise ValueError("Length of sequences missing!")

    if self.records_per_alignment is not None \
    and self.records_per_alignment != number_of_seqs:
        raise ValueError("Found %i records in this alignment, told to expect %i" \
                         % (number_of_seqs, self.records_per_alignment))

    seqs = [""] * len(ids)
    index = 0

    #Parse the seqs
    while line:
        if len(line) > 21:
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                #identifier, seq start position, seq, seq end position
                #(an aligned seq is broken up into multiple lines)
                seq_id, start = id_start
                seq, end = seq_end
                start = int(start)
                end = int(end)

                assert 0 <= index < number_of_seqs, \
                       "Expected index %i in range [0,%i)" \
                       % (index, number_of_seqs)
                #The identifier is truncated...
                assert ids[index].startswith(seq_id)

                #Coordinates count letters only, so compare without gaps.
                letters_so_far = len(seqs[index].replace("-", ""))
                letters_in_chunk = len(seq.replace("-", ""))

                #Check the start...
                if start == 0:
                    #Special case when one sequence starts long before the other
                    assert letters_so_far == 0
                    assert letters_in_chunk == 0, line
                elif start == letters_so_far:
                    #Special case when one sequence ends long before the other
                    assert letters_in_chunk == 0, line
                else:
                    assert start - 1 == letters_so_far, \
                    "Found %i chars so far for sequence %i (%s), file says start %i:\n%s" \
                        % (letters_so_far, index, seq_id, start, seqs[index])

                seqs[index] += seq

                #Check the end ...
                letters_so_far += letters_in_chunk
                assert end == letters_so_far, \
                    "Found %i chars so far for %s, file says end %i:\n%s" \
                        % (letters_so_far, seq_id, end, repr(seqs[index]))

                index += 1
                if index >= number_of_seqs:
                    index = 0
            else:
                #just a start value, this is just alignment annotation (?)
                pass
        elif line.strip() == "":
            #Just a spacer?
            pass
        else:
            raise ValueError("Unrecognised line in EMBOSS alignment: %r" % line)

        line = handle.readline()
        if line.rstrip() == "#---------------------------------------" \
        or line.rstrip() == "#=======================================":
            #End of alignment
            self._header = line
            break

    assert index == 0

    alignment = Alignment(self.alphabet)
    for seq_id, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            #EMBOSS 2.9.0 is known to use spaces instead of minus signs
            #for leading gaps, and thus fails to parse.  This old version
            #is still used as of Dec 2008 behind the EBI SOAP webservice:
            #http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
            raise ValueError("Error parsing alignment - sequences of "
                             "different length? You could be using an "
                             "old version of EMBOSS.")
        alignment.add_sequence(seq_id, seq)
    return alignment