def get_column(self, col):
    """Return alignment column ``col`` as a string (OBSOLETE).

    This method only exists for backwards compatibility with the old
    Bio.Align.Generic.Alignment object.  You are encouraged to use the
    slice notation instead.
    """
    # Delegate straight to the underlying _Alignment implementation.
    column = _Alignment.get_column(self, col)
    return column
def get_column(self, col):
    """Return alignment column ``col`` as a string (DEPRECATED).

    This method only exists for backwards compatibility with the old
    Bio.Align.Generic.Alignment object.  Please use the slice notation
    instead, since get_column is likely to be removed in a future
    release of Biopython.
    """
    import warnings
    import Bio
    # The message is assembled from adjacent literals purely for line
    # length; the resulting text is unchanged.
    message = ("This method is deprecated and is provided for backwards "
               "compatibility with the old Bio.Align.Generic.Alignment object. "
               "Please use the slice notation instead, as get_column is likely "
               "to be removed in a future release of Biopython.")
    warnings.warn(message, Bio.BiopythonDeprecationWarning)
    return _Alignment.get_column(self, col)
# Basic tests on a simple three-row alignment: the same 26 letters in
# mixed, lower and upper case.
alignment = Alignment(Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
expected_rows = [
    ("mixed", letters),
    ("lower", letters.lower()),
    ("upper", letters.upper()),
]
for row_name, row_text in expected_rows:
    alignment.add_sequence(row_name, row_text)

assert alignment.get_alignment_length() == 26
assert len(alignment) == 3
# Sequence content first, then descriptions, in row order.
for row_index, (row_name, row_text) in enumerate(expected_rows):
    assert alignment.get_seq_by_num(row_index).tostring() == row_text
for row_index, (row_name, row_text) in enumerate(expected_rows):
    assert alignment[row_index].description == row_name
# Every column is the letter in its mixed, lower and upper form.
for (col, letter) in enumerate(letters):
    expected_column = letter + letter.lower() + letter.upper()
    assert alignment.get_column(col) == expected_column

# Check row extractions:
assert alignment[0].id == "mixed"
assert alignment[-1].id == "upper"

# Check sub-alignment extraction by row slicing:
flipped = alignment[::-1]
assert isinstance(flipped, Alignment)
assert flipped[0].id == "upper"
assert flipped[2].id == "mixed"

# Drop the scratch names so they do not leak into the tests that follow.
del flipped, expected_column, expected_rows, row_index, row_name, row_text
del alignment
del letters

print("testing reading and writing clustal format...")
test_dir = os.path.join(os.getcwd(), 'Clustalw')
class Align(object):
    """Align the sequences of an input file with MUSCLE and trim the result.

    Attributes filled in by the methods below:

    * ``alignment`` -- set by run_alignment() or _read()
    * ``trimmed_alignment`` -- set by trim_alignment(); None when the
      alignment had to be dropped
    * ``perfect_trimmed_alignment`` -- set by trim_ambiguous_bases()
    * ``ploc`` -- (start, end) columns of the probe, set by
      get_probe_location(); None until then or when no probe is present
    """

    def __init__(self, input):
        """Remember the path of the input file; nothing is read yet."""
        self.input = input
        self.alignment = None
        self.trimmed_alignment = None
        self.perfect_trimmed_alignment = None
        # Initialised here so trim_alignment() can test it even when
        # get_probe_location() has not been called.
        self.ploc = None

    def _clean(self, outtemp):
        """Delete the temporary alignment file and then the input file."""
        for path in (outtemp, self.input):
            os.remove(path)

    def _find_ends(self, forward=True):
        """Return the index of the first (or last) column without a gap.

        Columns are scanned left to right when ``forward`` is true and
        right to left otherwise.  If every column contains a gap the index
        of the last column visited is returned; None is returned for an
        alignment with no columns.
        """
        columns = range(self.alignment.get_alignment_length())
        if not forward:
            columns = reversed(columns)
        col = None
        for col in columns:
            if '-' not in self.alignment.get_column(col):
                break
        return col

    def _base_checker(self, bases, sequence, loc):
        """Return True when ``bases`` fit on both sides of ``loc``.

        ``loc`` is either a single position (a plain integer or a
        one-element sequence) or a ``(start, end)`` pair.  The check makes
        sure trimming does not start beyond either end of ``sequence``.
        """
        # A bare integer (the mid-point case) has no len(); normalise every
        # accepted form to a (left, right) pair first.
        try:
            span = tuple(loc)
        except TypeError:
            span = (loc,)
        if len(span) == 1:
            span = span * 2
        left, right = span[0], span[1]
        upstream = len(sequence.seq[:left])
        downstream = len(sequence.seq[right:])
        return bases <= upstream and bases <= downstream

    def _record_formatter(self, temp, sequence=None):
        """Wrap ``temp`` in a SeqRecord, copying metadata from ``sequence``.

        ``sequence`` is the record ``temp`` was sliced from; its id, name
        and description are copied when it is given.
        """
        temp_record = SeqRecord(temp)
        if sequence is not None:
            temp_record.id = sequence.id
            temp_record.name = sequence.name
            temp_record.description = sequence.description
        return temp_record

    def _alignment_summary(self, alignment):
        """Return ``(summary, consensus)`` for ``alignment``.

        ``summary`` is an AlignInfo.SummaryInfo object and ``consensus`` is
        its dumb_consensus().
        """
        summary = AlignInfo.SummaryInfo(alignment)
        return summary, summary.dumb_consensus()

    def _read(self, format):
        """Read an existing alignment from the input file (for testing)."""
        with open(self.input, 'rU') as handle:
            self.alignment = AlignIO.read(handle, format)

    def get_probe_location(self):
        """Locate the record whose id is 'probe' and store its extent.

        ``self.ploc`` becomes ``(start, end)`` where ``start`` is the number
        of leading gap characters of the probe row and ``end`` is the index
        at which its trailing gaps begin.  ``self.ploc`` is set to None when
        the alignment has no probe record.
        """
        self.ploc = None
        # probe at bottom => search in reverse order
        for record in self.alignment[::-1]:
            if record.id == 'probe':
                text = str(record.seq)
                leading = re.search('^-*', text)
                trailing = re.search('-*$', text)
                # end of the leading gaps .. start of the trailing gaps
                self.ploc = (leading.end(), trailing.start(),)
                break

    def run_alignment(self, clean=True, consensus=True):
        """Run MUSCLE on the input file and load the resulting alignment.

        The output goes to a temporary file that is parsed as FASTA.  With
        ``consensus`` the summary and dumb consensus are stored on the
        instance; with ``clean`` the temporary file and the input file are
        deleted afterwards.
        """
        # create results file
        fd, outtemp = tempfile.mkstemp(suffix='.align')
        os.close(fd)
        # run MUSCLE, writing to the temp file
        cline = MuscleCommandline(input=self.input, out=outtemp)
        subprocess.Popen(str(cline),
                         stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         shell=True).communicate(None)
        with open(outtemp, 'rU') as handle:
            self.alignment = AlignIO.read(
                handle, "fasta",
                alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
        # build a dumb consensus
        if consensus:
            self.alignment_summary, self.alignment_consensus = \
                self._alignment_summary(self.alignment)
        # cleanup temp files
        if clean:
            self._clean(outtemp)

    def running_average(self, window_size, threshold):
        """Find clip points from a running average of column identity.

        Each column scores 1 when all of its characters are identical and 0
        otherwise.  Averages over ``window_size`` columns are computed from
        the start and, separately, from the end of the alignment.

        Returns ``(start_clip, end_clip)`` for the first window from each
        side whose average, rounded to one decimal, reaches ``threshold``,
        or ``(None, None)`` when either side never reaches it.
        """
        length = self.alignment.get_alignment_length()
        differences = [
            0 if len(set(self.alignment.get_column(column))) > 1 else 1
            for column in range(length)
        ]
        total = len(differences)

        # running average from the start => end of the sequence
        forward_average = []
        for start in range(total):
            end = start + window_size
            if end < total:
                window = differences[start:end]
                forward_average.append(sum(window) / float(len(window)))

        # ... and from the end => start, because otherwise that end would
        # be neglected
        reverse_average = []
        for end in range(-1, -total - 1, -1):
            start = end - window_size
            if start > -total:
                window = differences[start:end]
                reverse_average.append(sum(window) / float(len(window)))

        cutoff = float(threshold)

        def first_reaching(averages):
            # index of the first average at or above the cutoff, else None
            for index, average in enumerate(averages):
                if round(average, 1) >= cutoff:
                    return index
            return None

        start_clip = first_reaching(forward_average)
        end_offset = first_reaching(reverse_average)
        if start_clip is None or end_offset is None:
            return None, None
        return start_clip, total - end_offset

    def trim_alignment(self, method='edges', remove_probe=None, bases=None,
                       consensus=True, window_size=20, threshold=0.5):
        """Trim ``self.alignment`` into ``self.trimmed_alignment``.

        ``method`` selects the strategy:

        * ``'edges'`` -- clip at the columns returned by _find_ends()
        * ``'running'`` -- clip at the columns returned by running_average()
        * ``'static'`` -- keep ``bases`` columns either side of the probe
          (when ``self.ploc`` is set) or of the mid-point of each record

        With ``remove_probe`` and a known probe location the probe columns
        are cut out; combined with ``'static'`` and ``bases`` only the
        ``bases`` columns flanking the probe are kept.

        ``self.trimmed_alignment`` is set to None when the clip points could
        not be determined or a record is too short for ``bases``.  With
        ``consensus`` the summary and dumb consensus of the trimmed
        alignment are stored on the instance.
        """
        start = end = None
        if method == 'edges':
            # NOTE(review): ``end`` is the index of the last gap-free column
            # and is used below as an exclusive slice bound, so that column
            # is not kept -- confirm this is intended.
            start = self._find_ends(forward=True)
            end = self._find_ends(forward=False)
        elif method == 'running':
            start, end = self.running_average(window_size, threshold)
        windowed = method in ('edges', 'running')
        static = method == 'static'

        # create a new alignment object to hold our alignment
        self.trimmed_alignment = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
        # Sliced records are appended to the private ``_records`` list so
        # that each record keeps its metadata.
        for sequence in self.alignment:
            trimmed = None
            if windowed and not remove_probe:
                if start is not None and end is not None:
                    trimmed = sequence[start:end]
            # The probe-anchored rules are tested before the more general
            # rules that share their leading conditions; otherwise they
            # could never be selected.
            elif static and not remove_probe and bases and self.ploc:
                if self._base_checker(bases, sequence, self.ploc):
                    trimmed = sequence[
                        self.ploc[0] - bases:self.ploc[1] + bases]
            elif static and not remove_probe and bases:
                # trim out from the middle of the record
                mid_point = len(sequence) // 2
                if self._base_checker(bases, sequence, mid_point):
                    trimmed = sequence[mid_point - bases:mid_point + bases]
            elif static and remove_probe and bases and self.ploc:
                if self._base_checker(bases, sequence, self.ploc):
                    temp = sequence.seq[self.ploc[0] - bases:self.ploc[0]] + \
                        sequence.seq[self.ploc[1]:self.ploc[1] + bases]
                    trimmed = self._record_formatter(temp, sequence)
            elif remove_probe and self.ploc:
                # drop to sequence level and slice around the probe
                temp = sequence.seq[:self.ploc[0]] + \
                    sequence.seq[self.ploc[1]:]
                trimmed = self._record_formatter(temp, sequence)
            else:
                # no rule applies to this combination of arguments
                continue
            if trimmed is None:
                # one untrimmable record invalidates the whole alignment
                self.trimmed_alignment = None
                break
            self.trimmed_alignment._records.append(trimmed)

        # build a dumb consensus
        if consensus and self.trimmed_alignment is not None:
            self.trimmed_alignment_summary, \
                self.trimmed_alignment_consensus = \
                self._alignment_summary(self.trimmed_alignment)

    def _longest_unambiguous_block(self, ambiguous_columns, length):
        """Return the slice bounds of the widest run between ambiguous columns.

        ``ambiguous_columns`` is the ascending list of column indexes that
        contain an ambiguous base and ``length`` the alignment length.
        Returns ``(first, last)`` suitable for ``record[first:last]``, or
        None when no run is wider than zero.  The earliest run wins ties.
        """
        # NOTE(review): the 0 and length - 1 sentinels mean column 0 and the
        # final column are never part of the returned block, even when they
        # are unambiguous -- confirm this is intended.
        bounds = [0] + list(ambiguous_columns) + [length - 1]
        widest = 0
        block = None
        for left, right in zip(bounds, bounds[1:]):
            if right - left > widest:
                widest = right - left
                block = (left + 1, right)
        return block

    def trim_ambiguous_bases(self):
        """Snip ambiguous bases ('N') from ``self.trimmed_alignment``.

        The widest run of columns free of 'N' is stored in
        ``self.perfect_trimmed_alignment``.  The trimmed alignment itself is
        reused when it has no 'N' at all; the result is None when there is
        no trimmed alignment or no usable run.
        """
        if self.trimmed_alignment is None:
            self.perfect_trimmed_alignment = None
            return
        length = self.trimmed_alignment.get_alignment_length()
        ambiguous_bases = [
            column for column in range(length)
            if 'N' in self.trimmed_alignment.get_column(column)
        ]
        if not ambiguous_bases:
            self.perfect_trimmed_alignment = self.trimmed_alignment
            return
        block = self._longest_unambiguous_block(ambiguous_bases, length)
        if block is None:
            # make sure we catch cases where there is no best block
            self.perfect_trimmed_alignment = None
            return
        first, last = block
        # create a new alignment object to hold our alignment
        self.perfect_trimmed_alignment = \
            Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for sequence in self.trimmed_alignment:
            self.perfect_trimmed_alignment._records.append(
                sequence[first:last])
# Basic tests on a simple three-row alignment: the same 26 letters in
# mixed, lower and upper case.
alignment = Alignment(Alphabet.generic_alphabet)
letters = "AbcDefGhiJklMnoPqrStuVwxYz"
expected_rows = [
    ("mixed", letters),
    ("lower", letters.lower()),
    ("upper", letters.upper()),
]
for row_name, row_text in expected_rows:
    alignment.add_sequence(row_name, row_text)

assert alignment.get_alignment_length() == 26
assert len(alignment.get_all_seqs()) == 3
# Sequence content first, then descriptions, in row order.
for row_index, (row_name, row_text) in enumerate(expected_rows):
    assert alignment.get_seq_by_num(row_index).tostring() == row_text
for row_index, (row_name, row_text) in enumerate(expected_rows):
    assert alignment.get_all_seqs()[row_index].description == row_name
# Every column is the letter in its mixed, lower and upper form.
for (col, letter) in enumerate(letters):
    expected_column = letter + letter.lower() + letter.upper()
    assert alignment.get_column(col) == expected_column

# Check row extractions:
assert alignment[0].id == "mixed"
assert alignment[-1].id == "upper"

# Check sub-alignment extraction by row slicing:
flipped = alignment[::-1]
assert isinstance(flipped, Alignment)
assert flipped[0].id == "upper"
assert flipped[2].id == "mixed"

# Drop the scratch names so they do not leak into the tests that follow.
del flipped, expected_column, expected_rows, row_index, row_name, row_text
del alignment
del letters

print("testing reading and writing clustal format...")
test_dir = os.path.join(os.getcwd(), 'Clustalw')
# Build an Alignment instance over a gapped protein alphabet and add the
# two sequences prepared earlier.
gapped_protein = Bio.Alphabet.Gapped(IUPAC.protein)
align = Alignment(gapped_protein)
align.add_sequence('asp', seq1)
align.add_sequence('unk', seq2)
print(align)

# --- Alignment methods ---

# get_all_seqs(): every sequence in the alignment as a list of SeqRecord
# (iterating over ``align`` directly gives the same records).
for s in align.get_all_seqs():
    print('-> ' + str(s.seq))

# get_seq_by_num(n): only the sequence at index n, as a Seq object.
second_seq = align.get_seq_by_num(1)
print(str(second_seq))
# Indexing gives a SeqRecord; its .seq attribute is the sequence itself.
print(align[0])
print(str(align[0].seq))

# get_alignment_length(): length of the alignment.
print(align.get_alignment_length())

# get_column(n): a string with all the letters in column n.
for column_index in (0, 2):
    print(align.get_column(column_index))

# --- AlignInfo module: extract information from alignment objects ---
# (print_info_content function; SummaryInfo and PSSM classes)
from Bio.Align import AlignInfo

summary = AlignInfo.SummaryInfo(align)
print(summary.information_content())
print(summary.dumb_consensus())
print(summary.gap_consensus())
# get a column by index
print(summary.get_column(2))
# complete description
print(summary.alignment)

# Bio.SeqIO: interface to input & output sequence file formats,
# passed to your program as SeqRecord objects.
class Align(object):
    """Align the sequences of an input file with MUSCLE and trim the result.

    Attributes filled in by the methods below:

    * ``alignment`` -- set by run_alignment() or _read()
    * ``trimmed_alignment`` -- set by trim_alignment(); None when the
      alignment had to be dropped
    * ``perfect_trimmed_alignment`` -- set by trim_ambiguous_bases()
    * ``ploc`` -- (start, end) columns of the probe, set by
      get_probe_location(); None until then or when no probe is present
    """

    def __init__(self, input):
        """Remember the path of the input file; nothing is read yet."""
        self.input = input
        self.alignment = None
        self.trimmed_alignment = None
        self.perfect_trimmed_alignment = None
        # Initialised here so trim_alignment() can test it even when
        # get_probe_location() has not been called.
        self.ploc = None

    def _clean(self, outtemp):
        """Delete the temporary alignment file and then the input file."""
        for path in (outtemp, self.input):
            os.remove(path)

    def _find_ends(self, forward=True):
        """Return the index of the first (or last) column without a gap.

        Columns are scanned left to right when ``forward`` is true and
        right to left otherwise.  If every column contains a gap the index
        of the last column visited is returned; None is returned for an
        alignment with no columns.
        """
        columns = range(self.alignment.get_alignment_length())
        if not forward:
            columns = reversed(columns)
        col = None
        for col in columns:
            if '-' not in self.alignment.get_column(col):
                break
        return col

    def _base_checker(self, bases, sequence, loc):
        """Return True when ``bases`` fit on both sides of ``loc``.

        ``loc`` is either a single position (a plain integer or a
        one-element sequence) or a ``(start, end)`` pair.  The check makes
        sure trimming does not start beyond either end of ``sequence``.
        """
        # A bare integer (the mid-point case) has no len(); normalise every
        # accepted form to a (left, right) pair first.
        try:
            span = tuple(loc)
        except TypeError:
            span = (loc,)
        if len(span) == 1:
            span = span * 2
        left, right = span[0], span[1]
        upstream = len(sequence.seq[:left])
        downstream = len(sequence.seq[right:])
        return bases <= upstream and bases <= downstream

    def _record_formatter(self, temp, sequence=None):
        """Wrap ``temp`` in a SeqRecord, copying metadata from ``sequence``.

        ``sequence`` is the record ``temp`` was sliced from; its id, name
        and description are copied when it is given.
        """
        temp_record = SeqRecord(temp)
        if sequence is not None:
            temp_record.id = sequence.id
            temp_record.name = sequence.name
            temp_record.description = sequence.description
        return temp_record

    def _alignment_summary(self, alignment):
        """Return ``(summary, consensus)`` for ``alignment``.

        ``summary`` is an AlignInfo.SummaryInfo object and ``consensus`` is
        its dumb_consensus().
        """
        summary = AlignInfo.SummaryInfo(alignment)
        return summary, summary.dumb_consensus()

    def _read(self, format):
        """Read an existing alignment from the input file (for testing)."""
        with open(self.input, 'rU') as handle:
            self.alignment = AlignIO.read(handle, format)

    def get_probe_location(self):
        """Locate the record whose id is 'probe' and store its extent.

        ``self.ploc`` becomes ``(start, end)`` where ``start`` is the number
        of leading gap characters of the probe row and ``end`` is the index
        at which its trailing gaps begin.  ``self.ploc`` is set to None when
        the alignment has no probe record.
        """
        self.ploc = None
        # probe at bottom => search in reverse order
        for record in self.alignment[::-1]:
            if record.id == 'probe':
                text = str(record.seq)
                leading = re.search('^-*', text)
                trailing = re.search('-*$', text)
                # end of the leading gaps .. start of the trailing gaps
                self.ploc = (leading.end(), trailing.start(),)
                break

    def run_alignment(self, clean=True, consensus=True):
        """Run MUSCLE on the input file and load the resulting alignment.

        The output goes to a temporary file that is parsed as FASTA.  With
        ``consensus`` the summary and dumb consensus are stored on the
        instance; with ``clean`` the temporary file and the input file are
        deleted afterwards.
        """
        # create results file
        fd, outtemp = tempfile.mkstemp(suffix='.align')
        os.close(fd)
        # run MUSCLE, writing to the temp file
        cline = MuscleCommandline(input=self.input, out=outtemp)
        subprocess.Popen(str(cline),
                         stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         shell=True).communicate(None)
        with open(outtemp, 'rU') as handle:
            self.alignment = AlignIO.read(
                handle, "fasta",
                alphabet=Gapped(IUPAC.unambiguous_dna, "-"))
        # build a dumb consensus
        if consensus:
            self.alignment_summary, self.alignment_consensus = \
                self._alignment_summary(self.alignment)
        # cleanup temp files
        if clean:
            self._clean(outtemp)

    def running_average(self, window_size, threshold, proportion=0.3, k=None,
                        running_probe=False):
        """Find clip points from a running average of column identity.

        Each column scores 1 when its characters are all identical and 0
        otherwise.  Before scoring:

        * with ``running_probe`` the character of row ``k`` (the probe) is
          removed from the column;
        * otherwise gap characters are removed from the column when it holds
          no more than ``round(proportion * number_of_rows)`` of them.

        The scores are averaged over a sliding window of ``window_size``
        columns.  Returns ``(start_clip, end_clip)``: the start of the first
        window whose average is at least ``threshold`` and the end of the
        last such window, or ``(None, None)`` when no window qualifies.

        Raises ValueError when ``running_probe`` is set without ``k``.
        """
        if running_probe and k is None:
            raise ValueError(
                "running_probe=True requires k, the row index of the probe")
        members = len(self.alignment)
        max_gaps = int(round(proportion * members, 0))
        differences = []
        for column in range(self.alignment.get_alignment_length()):
            column_list = list(self.alignment.get_column(column))
            if running_probe:
                # drop the probe's character from the column
                del column_list[k]
            elif column_list.count('-') <= max_gaps:
                # use proportional removal of gaps
                column_list = [i for i in column_list if i != '-']
            # converting to a set keeps only the unique values
            differences.append(0 if len(set(column_list)) > 1 else 1)
        if not differences:
            # numpy.convolve rejects empty input
            return None, None
        differences = numpy.array(differences)
        weight = numpy.repeat(1.0, window_size) / window_size
        # Keep only positions where the window lies fully inside the data.
        # An explicit end index is used because a negative end offset of
        # -(window_size - 1) becomes 0 for window_size == 1 and would
        # select nothing.
        averages = numpy.convolve(
            differences, weight)[window_size - 1:len(differences)]
        good = numpy.where(averages >= threshold)[0]
        if len(good) == 0:
            return None, None
        # remember to add window size onto end of trim
        return good[0], good[-1] + window_size

    def trim_alignment(self, method='edges', remove_probe=None, bases=None,
                       consensus=True, window_size=20, threshold=0.5):
        """Trim ``self.alignment`` into ``self.trimmed_alignment``.

        ``method`` selects the strategy:

        * ``'edges'`` -- clip at the columns returned by _find_ends()
        * ``'running'`` -- clip at the columns returned by running_average()
        * ``'running-probe'`` -- as ``'running'`` but ignoring the row whose
          name is 'probe'; raises ValueError when there is no such row
        * ``'static'`` -- keep ``bases`` columns either side of the probe
          (when ``self.ploc`` is set) or of the mid-point of each record
        * ``'notrim'`` -- reuse ``self.alignment`` unchanged

        With ``remove_probe`` and a known probe location the probe columns
        are cut out; combined with ``'static'`` and ``bases`` only the
        ``bases`` columns flanking the probe are kept.

        ``self.trimmed_alignment`` is set to None, and a message is printed,
        when the clip points could not be determined or a record is too
        short for ``bases``.  With ``consensus`` the summary and dumb
        consensus of the trimmed alignment are stored on the instance.
        """
        start = end = None
        if method == 'edges':
            # NOTE(review): ``end`` is the index of the last gap-free column
            # and is used below as an exclusive slice bound, so that column
            # is not kept -- confirm this is intended.
            start = self._find_ends(forward=True)
            end = self._find_ends(forward=False)
        elif method == 'running':
            start, end = self.running_average(window_size, threshold)
        elif method == 'running-probe':
            # get position of probe
            probe_index = None
            for index, record in enumerate(self.alignment):
                if record.name == 'probe':
                    probe_index = index
                    break
            if probe_index is None:
                raise ValueError(
                    "method 'running-probe' needs a record named 'probe'")
            # keyword arguments: the next positional slot of
            # running_average() is ``proportion``, not ``k``
            start, end = self.running_average(
                window_size, threshold, k=probe_index, running_probe=True)
        windowed = method in ('edges', 'running', 'running-probe')
        static = method == 'static'

        if method == 'notrim':
            self.trimmed_alignment = self.alignment
        else:
            # create a new alignment object to hold our alignment
            self.trimmed_alignment = Alignment(
                Gapped(IUPAC.ambiguous_dna, "-"))
            # Sliced records are appended to the private ``_records`` list
            # so that each record keeps its metadata.
            for sequence in self.alignment:
                trimmed = None
                if windowed and not remove_probe:
                    if start is not None and start >= 0 and end:
                        trimmed = sequence[start:end]
                # The probe-anchored rules are tested before the more
                # general rules that share their leading conditions;
                # otherwise they could never be selected.
                elif static and not remove_probe and bases and self.ploc:
                    if self._base_checker(bases, sequence, self.ploc):
                        trimmed = sequence[
                            self.ploc[0] - bases:self.ploc[1] + bases]
                elif static and not remove_probe and bases:
                    # trim out from the middle of the record
                    mid_point = len(sequence) // 2
                    if self._base_checker(bases, sequence, mid_point):
                        trimmed = sequence[
                            mid_point - bases:mid_point + bases]
                elif static and remove_probe and bases and self.ploc:
                    if self._base_checker(bases, sequence, self.ploc):
                        temp = \
                            sequence.seq[self.ploc[0] - bases:self.ploc[0]] + \
                            sequence.seq[self.ploc[1]:self.ploc[1] + bases]
                        trimmed = self._record_formatter(temp, sequence)
                elif remove_probe and self.ploc:
                    # drop to sequence level and slice around the probe
                    temp = sequence.seq[:self.ploc[0]] + \
                        sequence.seq[self.ploc[1]:]
                    trimmed = self._record_formatter(temp, sequence)
                else:
                    # no rule applies to this combination of arguments
                    continue
                if trimmed is None:
                    # one untrimmable record invalidates the whole alignment
                    self.trimmed_alignment = None
                    break
                self.trimmed_alignment._records.append(trimmed)

        # build a dumb consensus
        if consensus and self.trimmed_alignment:
            self.trimmed_alignment_summary, \
                self.trimmed_alignment_consensus = \
                self._alignment_summary(self.trimmed_alignment)
        if not self.trimmed_alignment:
            print("\tAlignment {0} dropped due to trimming".format(
                self.alignment._records[0].description.split('|')[1]))

    def _longest_unambiguous_block(self, ambiguous_columns, length):
        """Return the slice bounds of the widest run between ambiguous columns.

        ``ambiguous_columns`` is the ascending list of column indexes that
        contain an ambiguous base and ``length`` the alignment length.
        Returns ``(first, last)`` suitable for ``record[first:last]``, or
        None when no run is wider than zero.  The earliest run wins ties.
        """
        # NOTE(review): the 0 and length - 1 sentinels mean column 0 and the
        # final column are never part of the returned block, even when they
        # are unambiguous -- confirm this is intended.
        bounds = [0] + list(ambiguous_columns) + [length - 1]
        widest = 0
        block = None
        for left, right in zip(bounds, bounds[1:]):
            if right - left > widest:
                widest = right - left
                block = (left + 1, right)
        return block

    def trim_ambiguous_bases(self):
        """Snip ambiguous bases ('N') from ``self.trimmed_alignment``.

        The widest run of columns free of 'N' is stored in
        ``self.perfect_trimmed_alignment``.  The trimmed alignment itself is
        reused when it is missing, empty or has no 'N' at all; the result is
        None when no usable run exists.
        """
        if not self.trimmed_alignment:
            self.perfect_trimmed_alignment = self.trimmed_alignment
            return
        length = self.trimmed_alignment.get_alignment_length()
        ambiguous_bases = [
            column for column in range(length)
            if 'N' in self.trimmed_alignment.get_column(column)
        ]
        if not ambiguous_bases:
            self.perfect_trimmed_alignment = self.trimmed_alignment
            return
        block = self._longest_unambiguous_block(ambiguous_bases, length)
        if block is None:
            # make sure we catch cases where there is no best block
            self.perfect_trimmed_alignment = None
            return
        first, last = block
        # create a new alignment object to hold our alignment
        self.perfect_trimmed_alignment = \
            Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
        for sequence in self.trimmed_alignment:
            self.perfect_trimmed_alignment._records.append(
                sequence[first:last])