def alignTwoSequencesWithBiopython(seq1, seq2, match=5, mismatch=-1,
                                   gap_opening=-10, gap_extension=-1):
    """Globally align two sequences with Biopython's ``globalms``.

    Only the first alignment returned by ``pairwise2.align.globalms`` is
    used.

    :arg seq1: first sequence, passed to ``globalms`` unchanged
    :arg seq2: second sequence, passed to ``globalms`` unchanged
    :arg match: score for a match, default is 5
    :arg mismatch: score for a mismatch, default is -1
    :arg gap_opening: score for opening a gap, default is -10
    :arg gap_extension: score for extending a gap, default is -1

    Returns a tuple ``(alignment, seq_indices, msa_indices)`` for use with
    showAlignment.  *alignment* is an :class:`MSA` with the two aligned
    sequences as rows.  Both index arrays start with 0 and have one more
    element than the alignment has columns: element *k* is the number of
    non-gap characters found in the first *k* columns of the first
    (*seq_indices*) or second (*msa_indices*) aligned sequence.
    """
    best = pairwise2.align.globalms(seq1, seq2, match, mismatch,
                                    gap_opening, gap_extension)[0]
    first_row, second_row = best[0], best[1]

    first_counts = [0]
    second_counts = [0]
    for column, first_char in enumerate(first_row):
        second_char = second_row[column]
        # a gap keeps the running count, a residue advances it by one
        first_counts.append(first_counts[-1] + (first_char != '-' and 1 or 0))
        second_counts.append(second_counts[-1] + (second_char != '-' and 1 or 0))

    rows = array([array(list(first_row)), array(list(second_row))])
    return MSA(msa=rows), array(first_counts), array(second_counts)
def alignTwoSequencesWithBiopython(seq1, seq2, **kwargs):
    """Align two sequences with Biopython's ``globalms`` or ``localms``.

    Only the first alignment returned by Biopython is used.  Alignment
    parameters can be provided as keyword arguments; anything not given
    falls back to the module-level defaults (``MATCH_SCORE``,
    ``MISMATCH_SCORE``, ``GAP_PENALTY``, ``GAP_EXT_PENALTY`` and
    ``ALIGNMENT_METHOD``).

    :arg match: a positive integer, used to reward finding a match
    :type match: int

    :arg mismatch: a negative integer, used to penalise finding a mismatch
    :type mismatch: int

    :arg gap_opening: a negative integer, used to penalise opening a gap
    :type gap_opening: int

    :arg gap_extension: a negative integer, used to penalise extending a gap
    :type gap_extension: int

    :arg method: method for pairwise2 alignment.
        Possible values are 'local' and 'global'
    :type method: str

    Returns a tuple ``(alignment, seq_indices, msa_indices)`` for use with
    :func:`.showAlignment`.  *alignment* is an :class:`MSA` with the two
    aligned sequences as rows.  Both index arrays start with 0 and have one
    more element than the alignment has columns: element *k* is the number
    of non-gap characters in the first *k* columns of the first
    (*seq_indices*) or second (*msa_indices*) aligned sequence.

    Raises :exc:`ValueError` when *method* is neither 'local' nor 'global'.
    """
    scores = (kwargs.get('match', MATCH_SCORE),
              kwargs.get('mismatch', MISMATCH_SCORE),
              kwargs.get('gap_opening', GAP_PENALTY),
              kwargs.get('gap_extension', GAP_EXT_PENALTY))
    method = kwargs.get('method', ALIGNMENT_METHOD)

    if method == 'local':
        results = pairwise2.align.localms(seq1, seq2, *scores)
    elif method == 'global':
        results = pairwise2.align.globalms(seq1, seq2, *scores)
    else:
        raise ValueError('method should be local or global')

    top = results[0]
    first_row, second_row = top[0], top[1]

    first_counts = [0]
    second_counts = [0]
    for column, first_char in enumerate(first_row):
        # a gap keeps the running count, a residue advances it by one
        if first_char != '-':
            first_counts.append(first_counts[-1] + 1)
        else:
            first_counts.append(first_counts[-1])
        if second_row[column] != '-':
            second_counts.append(second_counts[-1] + 1)
        else:
            second_counts.append(second_counts[-1])

    rows = array([array(list(first_row)), array(list(second_row))])
    return MSA(msa=rows), array(first_counts), array(second_counts)
def alignSequenceToMSA(seq, msa, **kwargs):
    """Align a sequence to one sequence of an MSA and build two index arrays.

    The query sequence taken from *seq* is globally aligned, with
    Biopython's ``pairwise2.align.globalms``, to the row of *msa* that
    *label* refers to.  Only the first alignment returned is used.

    :arg seq: an object with an associated sequence string
        or a sequence string itself
    :type seq: :class:`Atomic`, :class:`Sequence`, str

    :arg msa: MSA object
    :type msa: :class:`.MSA`

    :arg label: a label for a sequence in msa or a PDB ID.
        ``msa.getIndex(label)`` must return a sequence index.  When omitted
        it is taken from the title of the atoms (text before the first
        ``'_'``) or from the label of the :class:`Sequence`; it is required
        when *seq* is a plain string.
    :type label: str

    :arg chain: which chain from pdb to use for alignment, default is `'A'`.
        **None** uses *seq* without any chain selection.  This value is
        ignored if seq is not an :class:`Atomic` object.
    :type chain: str

    :arg match: a positive integer, used to reward finding a match,
        default is 5
    :type match: int

    :arg mismatch: a negative integer, used to penalise finding a mismatch,
        default is -1
    :type mismatch: int

    :arg gap_opening: a negative integer, used to penalise opening a gap,
        default is -10
    :type gap_opening: int

    :arg gap_extension: a negative integer, used to penalise extending a
        gap, default is -1
    :type gap_extension: int

    Returns a tuple ``(alignment, seq_indices, msa_indices)``.  *alignment*
    is a two-row :class:`MSA` (query first, MSA sequence second).  The index
    arrays have one element per alignment column holding the number of
    non-gap characters seen so far, that column included, in the query
    (*seq_indices*) and in the MSA sequence (*msa_indices*).

    Raises :exc:`TypeError` for unsupported *seq*, *msa* or *chain* types or
    when *label* does not map onto a single sequence, and :exc:`ValueError`
    when no atoms are selected, no label can be determined, or the label
    lookup fails.
    """
    label = kwargs.get('label', None)
    chain = kwargs.get('chain', 'A')
    match = kwargs.get('match', 5)
    mismatch = kwargs.get('mismatch', -1)
    gap_opening = kwargs.get('gap_opening', -10)
    gap_extension = kwargs.get('gap_extension', -1)

    ag = None
    if isinstance(seq, Atomic):
        if isinstance(chain, str):
            ag = seq.select('chain {0}'.format(chain))
        elif chain is None:
            ag = seq
        else:
            raise TypeError('chain should be a string or **None**')
        if ag is None:
            # nothing was selected; fail here rather than on ag.select below
            raise ValueError('seq may be None or chain ID may be invalid')
        sequence = ag.select('ca').getSequence()
        query_label = ag.getTitle()
    elif isinstance(seq, Sequence):
        sequence = str(seq)
        query_label = seq.getLabel()
    elif isinstance(seq, str):
        sequence = seq
        query_label = 'query'
    else:
        raise TypeError(
            'seq must be an atomic class, sequence class, or str not {0}'.
            format(type(seq)))

    if not isinstance(msa, MSA):
        raise TypeError('msa must be an MSA instance')

    if label is None:
        if ag is not None:
            label = ag.getTitle().split('_')[0]
        elif isinstance(seq, Sequence):
            label = seq.getLabel()
        else:
            raise ValueError(
                'A label cannot be extracted from seq so please provide one.')

    try:
        seqIndex = msa.getIndex(label)
    except Exception:
        raise ValueError('Please provide a label that can be found in msa.')

    if not isinstance(seqIndex, int):
        raise TypeError(
            'The output from querying that label against msa is not a single sequence.'
        )

    # Gaps already present in the MSA row become '.', so only gaps
    # introduced by this pairwise alignment show up as '-' below.
    refMsaSeq = str(msa[seqIndex]).upper().replace('-', '.')

    best = pairwise2.align.globalms(sequence, refMsaSeq, match, mismatch,
                                    gap_opening, gap_extension)[0]
    aligned_seq, aligned_ref = best[0], best[1]

    seq_indices = []
    msa_indices = []
    n_seq = 0
    n_ref = 0
    for column, residue in enumerate(aligned_seq):
        if residue != '-':
            n_seq += 1
        if aligned_ref[column] != '-':
            n_ref += 1
        seq_indices.append(n_seq)
        msa_indices.append(n_ref)

    alignment = MSA(msa=array([array(list(aligned_seq)),
                               array(list(aligned_ref))]),
                    labels=[query_label, label])

    return alignment, array(seq_indices), array(msa_indices)
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw, clustalw2, another command-line tool or
    Biopython and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing
        sequences as Atomic objects with :func:`getSequence` or Sequence
        objects or strings.  If strings are used then labels should be
        provided using ``labels``, otherwise they are numbered from 1.
        For an :class:`.MSA` entry inside a list only its first sequence
        is used.
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the
        prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences.  Spaces in
        labels are replaced by underscores.
    :type labels: list

    :arg align: whether to align the sequences,
        default True
    :type align: bool

    :arg method: alignment method.  A value containing ``'biopython'`` uses
        :func:`alignTwoSequencesWithBiopython` (two sequences only), a
        value containing ``'clustalw'`` runs clustalw or clustalw2, and any
        other value is looked up as an executable and run the same way.
        Default is ``'clustalw'``.
    :type method: str

    Remaining keyword arguments are passed on to
    :func:`alignTwoSequencesWithBiopython` when Biopython is used.

    Raises :exc:`TypeError` when the sequences cannot be read,
    :exc:`ValueError` when Biopython is requested for anything other than
    two sequences and :exc:`EnvironmentError` when the alignment
    executable cannot be found.
    """
    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')
    use_biopython = 'biopython' in method

    # 1. check if sequences are in a fasta file and if not make an MSA
    msa = None
    filename = None
    if isinstance(sequences, str):
        filename = sequences
    elif isinstance(sequences, MSA):
        msa = sequences
    else:
        try:
            strseqs = []
            fetched_labels = []
            for i, sequence in enumerate(sequences):
                if isinstance(sequence, Atomic):
                    strseq = sequence.ca.getSequence()
                    label = sequence.getTitle()
                elif isinstance(sequence, Sequence):
                    strseq = str(sequence)
                    label = sequence.getLabel()
                elif isinstance(sequence, MSA):
                    strseq = str(sequence[0])
                    label = sequence.getLabel(0)
                    LOGGER.warn(
                        'Only the first sequence in the MSA at entry {0} is used.'
                        .format(i))
                elif isinstance(sequence, str):
                    strseq = sequence
                    label = str(i + 1)
                else:
                    raise TypeError('sequences should be a list of strings, '
                                    'Atomic, or Sequence instances')
                strseqs.append(strseq)
                fetched_labels.append(label)

            # pad with gaps to the longest extracted string so that the
            # character array is rectangular
            max_len = max([len(strseq) for strseq in strseqs]) if strseqs else 0
            sequences = array([array(list(strseq + '-' * (max_len - len(strseq))))
                               for strseq in strseqs])
        except TypeError:
            # keep the informative message raised above
            raise
        except Exception:
            raise TypeError('sequences should be iterable')

        # "if a list" is a pythonic way to check if a list is empty or not (or none)
        if not labels and fetched_labels:
            labels = fetched_labels

        if labels:
            labels = [label.replace(' ', '_') for label in labels]

        # labels checkers are removed because they will be properly handled
        # in MSA class initialization
        msa = MSA(msa=sequences, title=title, labels=labels)

    if not align:
        if msa is None:
            # only a file was given: read it so there is an MSA to return
            msa = parseMSA(filename)
        return msa

    # every external tool needs the sequences in a file
    if not use_biopython and filename is None:
        filename = writeMSA(title + '.fasta', msa)

    # 2. find and run alignment method
    if use_biopython:
        if len(sequences) != 2:
            raise ValueError(
                'Provide only two sequences or another method. '
                'Biopython pairwise alignment can only be used '
                'to build an MSA with two sequences.')
        msa, _, _ = alignTwoSequencesWithBiopython(
            sequences[0], sequences[1], **kwargs)
        return msa

    if 'clustalw' in method:
        alignTool = which('clustalw')
        if alignTool is None:
            alignTool = which('clustalw2')
        if alignTool is None:
            raise EnvironmentError(
                'The executable for clustalw was not found, '
                'install clustalw or add it to the path.')
    else:
        alignTool = which(method)
        if alignTool is None:
            raise EnvironmentError(
                'The executable for {0} was not found, '
                'install it or add it to the path.'.format(method))

    os.system('"%s" %s -OUTORDER=INPUT' % (alignTool, filename))

    # 3. parse and return the new MSA
    return parseMSA(title + '.aln')
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs :func:`buildMSA` for each chain and optionally joins the results.
    Returns either a single :class:`MSA` or a dictionary containing an
    :class:`MSA` for each chain, keyed by the chain identifier of the first
    structure.  A dictionary with a single entry is unwrapped to its MSA.

    :arg PDBs: a list of :class:`AtomGroup` objects.  All of them must
        have the same number of chains; the *j*-th chains of all
        structures are aligned with each other.
    :type PDBs: list

    :arg join_chains: whether to join chain alignments,
        default is True.  Ignored when there is only one chain.
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments,
        default is '/' as used by PIR format alignments
    :type join_char: str

    Raises :exc:`TypeError` when *PDBs* is a scalar or contains something
    that is not :class:`Atomic`, and :exc:`ValueError` when it is empty or
    the structures have different numbers of chains.
    """
    if isscalar(PDBs):
        raise TypeError('PDBs should be array-like')

    # len() instead of truth testing: the latter is ambiguous for arrays
    if PDBs is None or len(PDBs) == 0:
        raise ValueError('PDBs should not be empty')

    pdbs = []
    chains = []
    labels = []
    for pdb in PDBs:
        if not isinstance(pdb, Atomic):
            raise TypeError(
                'each entry in PDBs must be a :class:`Atomic` instance')
        pdb_chains = list(pdb.getHierView())
        if chains and len(pdb_chains) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')

        pdbs.append(pdb)
        chains.append(pdb_chains)
        chids = ''.join([chain.getChid() for chain in pdb_chains])
        labels.append(pdb.getTitle() + '_' + chids)

    n_chains = len(chains[0])

    chain_alignments = []
    alignments = {}
    for j in range(n_chains):
        chid = chains[0][j].getChid()
        # plain list of the j-th chain of every structure
        msa = buildMSA([pdb_chains[j] for pdb_chains in chains],
                       title='chain_' + chid, labels=labels)
        msa = refineMSA(msa, colocc=1e-9)  # remove gap-only cols

        chain_alignments.append(msa)
        alignments[chid] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')

    if n_chains == 1:
        join_chains = False

    if join_chains:
        joined_msaarr = []
        for i, label in enumerate(labels):
            pieces = []
            for chain_alignment in chain_alignments:
                # The aligner may have reordered the rows, so look the
                # structure up by label and fall back to its input position
                # when the label is missing or ambiguous.
                row = chain_alignment.getIndex(label)
                if not isinstance(row, int):
                    row = i
                pieces.append(str(chain_alignment[row]))
            # one row per structure: its chains joined by join_char
            joined_msaarr.append(array(list(join_char.join(pieces))))

        result = MSA(array(joined_msaarr), title='joined_chains',
                     labels=[pdb.getTitle() for pdb in pdbs])
    else:
        result = alignments
        if len(result) == 1:
            result = list(result.values())[0]

    return result
def alignSequenceToMSA(seq, msa, **kwargs):
    """
    Align a sequence from a PDB or Sequence to a sequence from an MSA
    and create two sets of indices.

    The alignment and the two sets of indices are returned as a tuple
    ``(alignment, seq_indices, msa_indices)``.  *alignment* is a two-row
    :class:`.MSA` (query first, MSA sequence second).  Each index array has
    one element per alignment column holding the number of non-gap
    characters seen so far, that column included, in the query
    (*seq_indices*) and in the MSA sequence (*msa_indices*).  When *seq* is
    :class:`.Atomic`, *seq_indices* is shifted so that the first residue
    gets the first residue number of the selected atoms.

    :arg seq: an object with an associated sequence string
        or a sequence string itself
    :type seq: :class:`.Atomic`, :class:`.Sequence`, str

    :arg msa: a multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg label: a label for a sequence in msa or a PDB ID.
        ``msa.getIndex(label)`` must return a sequence index.  When the
        label is not found and is 4 or 5 characters long, its first four
        characters are parsed as a PDB ID and the database references in
        the header are tried instead.  *label* is required when *seq* is a
        plain string.
    :type label: str

    :arg chain: which chain from pdb to use for alignment, default is
        **None**, which does no selection on *seq*.  This value will be
        ignored if seq is not an :class:`.Atomic` object.
    :type chain: str

    Parameters for Biopython ``pairwise2`` alignments can be provided as
    keyword arguments.  Defaults are the module-level ``MATCH_SCORE``,
    ``MISMATCH_SCORE``, ``GAP_PENALTY``, ``GAP_EXT_PENALTY`` and
    ``ALIGNMENT_METHOD``.

    :arg match: a positive integer, used to reward finding a match
    :type match: int

    :arg mismatch: a negative integer, used to penalise finding a mismatch
    :type mismatch: int

    :arg gap_opening: a negative integer, used to penalise opening a gap
    :type gap_opening: int

    :arg gap_extension: a negative integer, used to penalise extending a gap
    :type gap_extension: int

    :arg method: method for pairwise2 alignment.
        Possible values are ``"local"`` and ``"global"``
    :type method: str
    """
    label = kwargs.get('label', None)
    chain = kwargs.get('chain', None)

    match = kwargs.get('match', MATCH_SCORE)
    mismatch = kwargs.get('mismatch', MISMATCH_SCORE)
    gap_opening = kwargs.get('gap_opening', GAP_PENALTY)
    gap_extension = kwargs.get('gap_extension', GAP_EXT_PENALTY)
    method = kwargs.get('method', ALIGNMENT_METHOD)

    ag = None
    if isinstance(seq, Atomic):
        if isinstance(chain, str):
            ag = seq.select('chain {0}'.format(chain))
        elif chain is None:
            ag = seq
            chids = ag.getChids()
            if len(unique(chids)) > 1:
                LOGGER.warn('%s consists of multiple chains. Please consider selecting one chain'%(seq.getTitle()))
        else:
            raise TypeError('chain should be a string or None')

        if ag is None:
            raise ValueError('seq may be None or chain ID may be invalid')

        sequence = ag.select('ca').getSequence()
        query_label = ag.getTitle()
    elif isinstance(seq, Sequence):
        sequence = str(seq)
        query_label = seq.getLabel()
    elif isinstance(seq, str):
        sequence = seq
        query_label = 'query'
    else:
        raise TypeError('seq must be an atomic class, sequence class, or str not {0}'
                        .format(type(seq)))

    if not isinstance(msa, MSA):
        raise TypeError('msa must be an MSA instance')

    if label is None:
        if ag is not None:
            label = ag.getTitle().split('_')[0]
        elif isinstance(seq, Sequence):
            label = seq.getLabel()
        else:
            raise ValueError('A label cannot be extracted from seq so please provide one.')

    index = msa.getIndex(label)

    if index is None and len(label) in (4, 5):
        # treat the label as a PDB ID and try the database references
        # listed in the header of that entry
        from prody import parsePDB
        pdb_id = label[:4]
        try:
            _, header = parsePDB(pdb_id, header=True)
        except Exception as err:
            raise IOError('failed to parse header for {0} ({1})'
                          .format(pdb_id, str(err)))

        for poly in header['polymers']:
            if chain and poly.chid != chain:
                continue
            for dbref in poly.dbrefs:
                # the idcode is tried before the accession
                for attr in ('idcode', 'accession'):
                    identifier = getattr(dbref, attr)
                    index = msa.getIndex(identifier)
                    if index is not None:
                        LOGGER.info('{0} {1} {2} for {3}{4} '
                                    'is found in {5}.'.format(
                                        dbref.database, attr, identifier,
                                        pdb_id, poly.chid, str(msa)))
                        label = identifier
                        break
                if index is not None:
                    break
            if index is not None:
                break

    if index is None:
        raise ValueError('label is not in msa, or msa is not indexed')
    try:
        len(index)
    except TypeError:
        pass
    else:
        raise ValueError('label {0} maps onto multiple sequences, '
                         'so cannot be used for refinement'.format(label))

    if not isinstance(index, int):
        raise TypeError('The output from querying that label against msa is not a single sequence.')

    # Gaps already present in the MSA row become '.', so only gaps
    # introduced by this pairwise alignment show up as '-' below.
    refMsaSeq = str(msa[index]).upper().replace('-', '.')

    if method == 'local':
        aligner = pairwise2.align.localms
    elif method == 'global':
        aligner = pairwise2.align.globalms
    else:
        raise ValueError('method should be local or global')

    best = aligner(sequence, refMsaSeq, match, mismatch,
                   gap_opening, gap_extension, one_alignment_only=1)[0]
    aligned_seq, aligned_ref = best[0], best[1]

    seq_counts = []
    msa_counts = []
    n_seq = 0
    n_ref = 0
    for column, residue in enumerate(aligned_seq):
        if residue != '-':
            n_seq += 1
        if aligned_ref[column] != '-':
            n_ref += 1
        seq_counts.append(n_seq)
        msa_counts.append(n_ref)

    seq_indices = array(seq_counts)
    msa_indices = array(msa_counts)

    if ag is not None:
        # shift so that the first residue maps onto the first residue number
        seq_indices += ag.getResnums()[0] - 1

    alignment = MSA(msa=array([array(list(aligned_seq)),
                               array(list(aligned_ref))]),
                    labels=[query_label, label])

    return alignment, seq_indices, msa_indices
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing
        sequences as Atomic objects with :func:`getSequence` or Sequence
        objects or strings.  If strings are used then labels should be
        provided using ``labels``, otherwise they are numbered from 1.
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the
        prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences
    :type labels: list

    :arg align: whether to do alignment with clustalw(2),
        default True
    :type align: bool

    Raises :exc:`TypeError` when the sequences cannot be read and
    :exc:`EnvironmentError` when neither clustalw nor clustalw2 is found.
    """
    align = kwargs.get('align', True)

    # 1. check if sequences are in a fasta file and if not make an MSA
    msa = None
    filename = None
    if isinstance(sequences, str):
        filename = sequences
    elif isinstance(sequences, MSA):
        msa = sequences
    else:
        try:
            strseqs = []
            fetched_labels = []
            for i, sequence in enumerate(sequences):
                if isinstance(sequence, Atomic):
                    strseq = sequence.getSequence()
                    label = sequence.getTitle()
                elif isinstance(sequence, Sequence):
                    strseq = str(sequence)
                    label = sequence.getLabel()
                elif isinstance(sequence, str):
                    strseq = sequence
                    label = str(i + 1)
                else:
                    raise TypeError('sequences should be a list of strings, '
                                    'Atomic, or Sequence instances')
                strseqs.append(strseq)
                fetched_labels.append(label)

            # pad with gaps to the longest extracted string so that the
            # character array is rectangular
            max_len = max([len(strseq) for strseq in strseqs]) if strseqs else 0
            sequences = array([array(list(strseq + '-' * (max_len - len(strseq))))
                               for strseq in strseqs])
        except TypeError:
            # keep the informative message raised above
            raise
        except Exception:
            raise TypeError('sequences should be iterable')

        # "if a list" is a pythonic way to check if a list is empty or not (or none)
        if not labels and fetched_labels:
            labels = fetched_labels

        # labels checkers are removed because they will be properly handled
        # in MSA class initialization
        msa = MSA(msa=sequences, title=title, labels=labels)

    if not align:
        if msa is None:
            # only a file was given: read it so there is an MSA to return
            msa = parseMSA(filename)
        return msa

    if filename is None:
        filename = writeMSA(title + '.fasta', msa)

    # 2. find and run alignment method
    clustalw = which('clustalw')
    if clustalw is None:
        clustalw = which('clustalw2')
    if clustalw is None:
        raise EnvironmentError(
            'The executable for clustalw was not found, '
            'install clustalw or add it to the path.')

    os.system('"%s" %s' % (clustalw, filename))

    # 3. parse and return the new MSA
    return parseMSA(title + '.aln')
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs buildMSA for each chain and optionally joins the results.
    Returns either a single MSA or a dictionary containing an MSA for
    each chain.

    Every chain alignment after the first is reordered so that its rows
    follow the order of the first one, and is written to
    ``chain_<chid>.aln``.

    :arg PDBs: a list or array of :class:`Atomic` objects.  All of them
        must have the same number of chains; the *j*-th chains of all
        structures are aligned with each other.
    :type PDBs: list or :class:`~numpy.ndarray`

    :arg join_chains: whether to join chain alignments,
        default is True
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments,
        default is '/' as used by PIR format alignments
    :type join_char: str

    Raises :exc:`TypeError` when *PDBs* is not a list or array or contains
    something that is not :class:`Atomic`, and :exc:`ValueError` when it
    is empty or the structures have different numbers of chains.
    """
    if not isinstance(PDBs, (list, ndarray)):
        raise TypeError('PDBs should be a list or array')

    # len() works for lists and arrays alike; comparing an array with []
    # is done element-wise and never detects an empty array
    if len(PDBs) == 0:
        raise ValueError('PDBs should not be an empty list')

    chains = []
    labels = []
    for pdb in PDBs:
        if not isinstance(pdb, Atomic):
            raise TypeError(
                'each entry in PDBs must be a :class:`Atomic` instance')
        pdb_chains = list(pdb.getHierView())
        if chains and len(pdb_chains) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')

        chains.append(pdb_chains)
        chids = ''.join([chain.getChid() for chain in pdb_chains])
        labels.append(pdb.getTitle().split('_')[0] + '_' + chids)

    chain_alignments = []
    alignments = {}
    reference_labels = None
    for j in range(len(chains[0])):
        prefix = 'chain_' + chains[0][j].getChid()
        # plain list of the j-th chain of every structure
        msa = buildMSA([pdb_chains[j] for pdb_chains in chains],
                       title=prefix, labels=labels)

        if j == 0:
            # row order of the first alignment is the reference order
            reference_labels = [sequence.getLabel() for sequence in msa]
        else:
            # make all alignments have the sequences in the same order
            # as the 0th
            rows = msa.getArray()
            msaarr = array([rows[msa.getIndex(label)]
                            for label in reference_labels])
            msa = MSA(msaarr, title='reordered_msa_1',
                      labels=list(reference_labels))
            writeMSA(prefix + '.aln', msa)

        chain_alignments.append(msa)

        # the key is the j-th character of the chain identifiers stored in
        # the first label of the reference alignment
        alignments[reference_labels[0].split('_')[1][j]] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')

    if join_chains:
        orig_labels = [sequence.getLabel() for sequence in chain_alignments[0]]
        rows_by_chain = [[str(sequence) for sequence in chain_alignment]
                         for chain_alignment in chain_alignments]

        # zip(*...) regroups the rows per structure, one piece per chain
        joined_msaarr = array([array(list(join_char.join(pieces)))
                               for pieces in zip(*rows_by_chain)])

        result = MSA(joined_msaarr, title='joined_chains',
                     labels=orig_labels)
        result = refineMSA(result, colocc=1e-9)  # remove gap-only cols
    else:
        result = alignments

    return result
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs :func:`buildMSA` for each chain and optionally joins the results.
    Returns either a single :class:`MSA` or a dictionary containing an
    :class:`MSA` for each chain, keyed by the chain identifier of the first
    structure.  A dictionary with a single entry is unwrapped to its MSA.

    :arg PDBs: a list of :class:`AtomGroup` objects.  All of them must
        have the same number of chains; the *j*-th chains of all
        structures are aligned with each other.
    :type PDBs: list

    :arg join_chains: whether to join chain alignments,
        default is True.  Ignored when there is only one chain.
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments,
        default is '/' as used by PIR format alignments
    :type join_char: str

    Raises :exc:`TypeError` when *PDBs* is a scalar or contains something
    that is not :class:`Atomic`, and :exc:`ValueError` when it is empty or
    the structures have different numbers of chains.
    """
    if isscalar(PDBs):
        raise TypeError('PDBs should be array-like')

    # len() instead of truth testing: the latter is ambiguous for arrays
    if PDBs is None or len(PDBs) == 0:
        raise ValueError('PDBs should not be empty')

    pdbs = []
    chains = []
    labels = []
    for pdb in PDBs:
        if not isinstance(pdb, Atomic):
            raise TypeError('each entry in PDBs must be a :class:`Atomic` instance')
        pdb_chains = list(pdb.getHierView())
        if chains and len(pdb_chains) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')

        pdbs.append(pdb)
        chains.append(pdb_chains)
        chids = ''.join([chain.getChid() for chain in pdb_chains])
        labels.append(pdb.getTitle() + '_' + chids)

    n_chains = len(chains[0])

    chain_alignments = []
    alignments = {}
    for j in range(n_chains):
        chid = chains[0][j].getChid()
        # plain list of the j-th chain of every structure
        msa = buildMSA([pdb_chains[j] for pdb_chains in chains],
                       title='chain_' + chid, labels=labels)
        msa = refineMSA(msa, colocc=1e-9)  # remove gap-only cols

        chain_alignments.append(msa)
        alignments[chid] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')

    if n_chains == 1:
        join_chains = False

    if join_chains:
        joined_msaarr = []
        for i, label in enumerate(labels):
            pieces = []
            for chain_alignment in chain_alignments:
                # The aligner may have reordered the rows, so look the
                # structure up by label and fall back to its input position
                # when the label is missing or ambiguous.
                row = chain_alignment.getIndex(label)
                if not isinstance(row, int):
                    row = i
                pieces.append(str(chain_alignment[row]))
            # one row per structure: its chains joined by join_char
            joined_msaarr.append(array(list(join_char.join(pieces))))

        result = MSA(array(joined_msaarr), title='joined_chains',
                     labels=[pdb.getTitle() for pdb in pdbs])
    else:
        result = alignments
        if len(result) == 1:
            result = list(result.values())[0]

    return result