def getResnums(self, gaps=False):
    """Return list of residue numbers associated with non-gapped *seq*.

    When *gaps* is **True**, return a list with one entry per position of
    the sequence array, in which gap positions appear as **None**.

    Residue numbers are inferred from the full label.  When the label does
    not contain parsable residue number information, or when the labelled
    range does not match the number of residues, a range of numbers
    starting from 1 is returned instead."""

    title, start, end = splitSeqLabel(self.getLabel(True))
    nres = self.numResidues()
    try:
        start, end = int(start), int(end)
    except (TypeError, ValueError):
        # int() raises TypeError for None and ValueError for non-numeric
        # text; anything else (e.g. KeyboardInterrupt) must propagate
        LOGGER.info('Cannot parse label start, end values, Setting '
                    'resnums 1 to {0:d}'.format(nres))
        start, end = 1, nres
    else:
        if (end - start + 1) != nres:
            LOGGER.info('Label start-end position does not match '
                        'length of ungapped sequence. Setting '
                        'resnums 1 to {0:d}'.format(nres))
            start, end = 1, nres

    # an iterator lets the gapped branch consume one number per residue
    resnums = iter(range(start, end + 1))
    if gaps:
        return [next(resnums) if torf else None
                for torf in char.isalpha(self._array)]
    return list(resnums)
def getResnums(self, gaps=False):
    """Return list of residue numbers associated with non-gapped *seq*.

    When *gaps* is **True**, return a list with one entry per position of
    the sequence array, in which gap positions appear as **None**.

    Residue numbers are inferred from the full label.  When the label does
    not contain parsable residue number information, or when the labelled
    range does not match the number of residues, a range of numbers
    starting from 1 is returned instead."""

    title, start, end = splitSeqLabel(self.getLabel(True))
    nres = self.numResidues()
    try:
        start, end = int(start), int(end)
    except (TypeError, ValueError):
        # int() raises TypeError for None and ValueError for non-numeric
        # text; anything else (e.g. KeyboardInterrupt) must propagate
        LOGGER.info('Cannot parse label start, end values, Setting '
                    'resnums 1 to {0:d}'.format(nres))
        start, end = 1, nres
    else:
        if (end - start + 1) != nres:
            LOGGER.info('Label start-end position does not match '
                        'length of ungapped sequence. Setting '
                        'resnums 1 to {0:d}'.format(nres))
            start, end = 1, nres

    # an iterator lets the gapped branch consume one number per residue
    resnums = iter(range(start, end + 1))
    if gaps:
        return [
            next(resnums) if torf else None
            for torf in char.isalpha(self._array)
        ]
    return list(resnums)
def testRowCol(self):
    """Check combined row and column occupancy refinement against a
    manual NumPy computation on the same alignment."""

    rowocc = 0.9
    colocc = 1.0
    refined = refineMSA(FASTA, rowocc=rowocc, colocc=colocc)._getArray()

    # row occupancy is the fraction of alphabetic characters per sequence;
    # take the alignment width from the data instead of hard-coding it
    ncols = float(FASTA_ALPHA.shape[1])
    rows = FASTA_ALPHA.sum(1) / ncols >= rowocc
    expected = FASTA._getArray()[rows]

    # column occupancy is evaluated on the rows that survived
    cols = (char.isalpha(expected).sum(0, dtype=float) /
            expected.shape[0]) >= colocc
    expected = expected.take(cols.nonzero()[0], 1)
    assert_array_equal(refined, expected)
def testRowCol(self):
    """Check combined row and column occupancy refinement against a
    manual NumPy computation on the same alignment."""

    rowocc = 0.9
    colocc = 1.0
    refined = refineMSA(FASTA, rowocc=rowocc, colocc=colocc)._getArray()

    # row occupancy is the fraction of alphabetic characters per sequence;
    # take the alignment width from the data instead of hard-coding it
    ncols = float(FASTA_ALPHA.shape[1])
    rows = FASTA_ALPHA.sum(1) / ncols >= rowocc
    expected = FASTA._getArray()[rows]

    # column occupancy is evaluated on the rows that survived
    cols = (char.isalpha(expected).sum(0, dtype=float) /
            expected.shape[0]) >= colocc
    expected = expected.take(cols.nonzero()[0], 1)
    assert_array_equal(refined, expected)
def getResnums(self, gaps=False, report_match=False):
    """Return list of residue numbers associated with non-gapped *seq*.

    When *gaps* is **True**, return a list with one entry per position of
    the sequence array, in which gap positions appear as **None**.

    Residue numbers are inferred from the full label if possible.  When the
    label does not contain parsable residue number information, or when
    the labelled range does not match the number of residues, a range of
    numbers starting from 1 is returned.

    When *report_match* is **True**, return a tuple ``(match, resnums)``
    where *match* tells whether the start-end entry of the label was
    parsed and agreed with the length of the ungapped sequence."""

    title, start, end = splitSeqLabel(self.getLabel(True))
    nres = self.numResidues()
    match = False
    try:
        start, end = int(start), int(end)
    except (TypeError, ValueError):
        # int() raises TypeError for None and ValueError for non-numeric
        # text; anything else (e.g. KeyboardInterrupt) must propagate
        LOGGER.info(
            'Cannot parse start and end values from sequence label {0}. Setting '
            'resnums 1 to {1:d}'.format(title, nres))
        start, end = 1, nres
    else:
        if (end - start + 1) != nres:
            LOGGER.info('Label {0} start-end entry does not match '
                        'length of ungapped sequence. Setting '
                        'resnums 1 to {1:d}'.format(title, nres))
            start, end = 1, nres
        else:
            LOGGER.info('Label {0} start-end entry matches '
                        'length of ungapped sequence. Setting '
                        'resnums {1:d} to {2:d}'.format(title, start, end))
            match = True

    # an iterator lets the gapped branch consume one number per residue
    resnums = iter(range(start, end + 1))
    if gaps:
        result = [
            next(resnums) if torf else None
            for torf in char.isalpha(self._array)
        ]
    else:
        result = list(resnums)

    if report_match:
        return match, result
    return result
def refineMSA(msa, label=None, rowocc=None, seqid=None, colocc=None, **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    The order of refinements are applied in the order of arguments.  If *label*
    and *seqid* are specified, sequence matching *label* will be kept in the
    refined :class:`.MSA` although it may be similar to some other sequence.

    A character array is returned when *msa* is a character array, otherwise
    a new :class:`.MSA` instance is returned.

    :raises TypeError: when *msa* is neither a character array nor an MSA
        instance, when *label* is not a string or is used with a plain
        array, or when *rowocc*/*colocc* cannot be converted to float
    :raises ValueError: when *msa* has wrong dtype or dimensionality, when
        *label* cannot be mapped to exactly one sequence, or when no
        refinement criterion is given
    :raises IOError: when the PDB header for *label* cannot be parsed"""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None
    index = None
    if label is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            upper, lower = label.upper(), label.lower()
        except AttributeError:
            raise TypeError('label must be a string')

        if msa is None:
            raise TypeError('msa must be an MSA instance, '
                            'label cannot be used')

        # try the label as given, then upper and lower case variants
        index = msa.getIndex(label)
        if index is None:
            index = msa.getIndex(upper)
        if index is None:
            index = msa.getIndex(lower)

        chain = None
        if index is None and (len(label) == 4 or len(label) == 5):
            # label looks like a PDB identifier with optional chain id, so
            # look up database references listed in the PDB header
            from prody import parsePDB
            try:
                structure, header = parsePDB(label[:4], header=True)
            except Exception as err:
                raise IOError('failed to parse header for {0} ({1})'
                              .format(label[:4], str(err)))

            chid = label[4:].upper()
            matched = None
            for poly in header['polymers']:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if index is None:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in chain {3}.'.format(
                                            dbref.database, dbref.idcode,
                                            label[:4], poly.chid, str(msa)))
                            break
                    if index is None:
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in chain {3}.'.format(
                                            dbref.database, dbref.accession,
                                            label[:4], poly.chid, str(msa)))
                            break
                if index is not None:
                    # remember the polymer that matched and stop; otherwise
                    # the loop variable ends up at the last polymer in the
                    # header and the wrong chain would be selected below
                    matched = poly
                    break
            if matched is not None:
                chain = structure[matched.chid]

        if index is None:
            raise ValueError('label is not in msa, or msa is not indexed')
        # an index that has a length refers to more than one sequence
        try:
            len(index)
        except TypeError:
            pass
        else:
            raise ValueError('label {0} maps onto multiple sequences, '
                             'so cannot be used for refinement'.format(label))

        title.append('label=' + label)
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        LOGGER.report('Label refinement reduced number of columns from {0} to '
                      '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

        if chain is not None and not kwargs.get('keep', False):
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            from prody.proteins.compare import importBioPairwise2
            from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE
            from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY
            pw2 = importBioPairwise2()
            chseq = chain.getSequence()
            # tobytes() returns the same bytes as the tostring() alias,
            # which was removed in NumPy 2.0
            algn = pw2.align.localms(arr[index].tobytes().upper(), chseq,
                                     MATCH_SCORE, MISMATCH_SCORE,
                                     GAP_PENALTY, GAP_EXT_PENALTY,
                                     one_alignment_only=1)
            # one flag per column of the refined alignment: True when the
            # aligned structure sequence has a residue at that position
            torf = []
            for s, c in zip(*algn[0][:2]):
                if s == '-':
                    # gap in the MSA sequence, not a column of arr
                    continue
                elif c != '-':
                    torf.append(True)
                else:
                    torf.append(False)
            torf = array(torf)
            tsum = torf.sum()
            assert tsum <= before, 'problem in mapping sequence to structure'
            if tsum < before:
                arr = arr.take(torf.nonzero()[0], 1)
                LOGGER.report('Structure refinement reduced number of '
                              'columns from {0} to {1} in %.2fs.'
                              .format(before, arr.shape[1]), '_refine')
            else:
                LOGGER.debug('All residues in the sequence are contained in '
                             'PDB structure {0}.'.format(label))

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            # shift the label index by the number of rows dropped before it
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report('Row occupancy refinement reduced number of rows from '
                      '{0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            # always keep the sequence that matched the label
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        # keep rows expressed as indices into the original alignment
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report('Sequence identity refinement reduced number of rows '
                      'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report('Column occupancy refinement reduced number of columns '
                      'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
                      '_refine')

    if not title:
        raise ValueError('label, rowocc, seqid, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr

    if rows is None:
        # no sequence was removed, labels and mapping remain valid
        from copy import copy
        labels = copy(msa._labels)
        mapping = copy(msa._mapping)
    else:
        labels = msa._labels
        labels = [labels[i] for i in rows]
        mapping = None
    return MSA(arr, title=msa.getTitle() + ' refined ({0})'
               .format(', '.join(title)), labels=labels, mapping=mapping)
def refineMSA(msa, index=None, label=None, rowocc=None, seqid=None,
              colocc=None, **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg index: remove columns that are gaps in the sequence with that index,
        when given, *label* is ignored
    :type index: int

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    The order of refinements are applied in the order of arguments.  If *label*
    and *seqid* are specified, sequence matching *label* will be kept in the
    refined :class:`.MSA` although it may be similar to some other sequence.

    A character array is returned when *msa* is a character array, otherwise
    a new :class:`.MSA` instance is returned.

    :raises TypeError: when *msa* is neither a character array nor an MSA
        instance, when *label* is not a string or is used with a plain
        array, or when *rowocc*/*colocc* cannot be converted to float
    :raises ValueError: when *msa* has wrong dtype or dimensionality, when
        *label* cannot be mapped to exactly one sequence, or when no
        refinement criterion is given
    :raises IOError: when the PDB header for *label* cannot be parsed"""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None

    if index is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('index=' + str(index))
        LOGGER.report(
            'Index refinement reduced number of columns from {0} to '
            '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

    if label is not None:
        if index is not None:
            LOGGER.info('An index was provided so the label will be ignored.')

        else:
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            try:
                upper, lower = label.upper(), label.lower()
            except AttributeError:
                raise TypeError('label must be a string')

            if msa is None:
                raise TypeError('msa must be an MSA instance, '
                                'label cannot be used')

            # try the label as given, then upper and lower case variants
            index = msa.getIndex(label)
            if index is None:
                index = msa.getIndex(upper)
            if index is None:
                index = msa.getIndex(lower)

            chain = None
            if index is None and (len(label) == 4 or len(label) == 5):
                # label looks like a PDB identifier with optional chain id,
                # so look up database references listed in the PDB header
                from prody import parsePDB
                try:
                    structure, header = parsePDB(label[:4], header=True)
                except Exception as err:
                    raise IOError(
                        'failed to parse header for {0} ({1})'.format(
                            label[:4], str(err)))

                chid = label[4:].upper()
                matched = None
                for poly in header['polymers']:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if index is None:
                            index = msa.getIndex(dbref.idcode)
                            if index is not None:
                                LOGGER.info('{0} idcode {1} for {2}{3} '
                                            'is found in chain {4}.'.format(
                                                dbref.database, dbref.idcode,
                                                label[:4], poly.chid,
                                                str(msa)))
                                break
                        if index is None:
                            index = msa.getIndex(dbref.accession)
                            if index is not None:
                                LOGGER.info('{0} accession {1} for {2}{3} '
                                            'is found in chain {4}.'.format(
                                                dbref.database,
                                                dbref.accession,
                                                label[:4], poly.chid,
                                                str(msa)))
                                break
                    if index is not None:
                        # remember the polymer that matched and stop;
                        # otherwise the loop variable ends up at the last
                        # polymer in the header and the wrong chain would
                        # be selected below
                        matched = poly
                        break
                if matched is not None:
                    chain = structure[matched.chid]

            if index is None:
                raise ValueError('label is not in msa, or msa is not indexed')
            # an index that has a length refers to more than one sequence
            try:
                len(index)
            except TypeError:
                pass
            else:
                raise ValueError(
                    'label {0} maps onto multiple sequences, '
                    'so cannot be used for refinement'.format(label))

            title.append('label=' + label)
            cols = char.isalpha(arr[index]).nonzero()[0]
            arr = arr.take(cols, 1)
            LOGGER.report(
                'Label refinement reduced number of columns from {0} to '
                '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

            # chain is only defined on this branch, so the structure based
            # refinement has to stay inside it
            if chain is not None and not kwargs.get('keep', False):
                before = arr.shape[1]
                LOGGER.timeit('_refine')
                from prody.proteins.compare import importBioPairwise2
                from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE
                from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY
                pw2 = importBioPairwise2()
                chseq = chain.getSequence()
                # tobytes() returns the same bytes as the tostring() alias,
                # which was removed in NumPy 2.0
                algn = pw2.align.localms(arr[index].tobytes().upper(), chseq,
                                         MATCH_SCORE, MISMATCH_SCORE,
                                         GAP_PENALTY, GAP_EXT_PENALTY,
                                         one_alignment_only=1)
                # one flag per column of the refined alignment: True when
                # the aligned structure sequence has a residue there
                torf = []
                for s, c in zip(*algn[0][:2]):
                    if s == '-':
                        # gap in the MSA sequence, not a column of arr
                        continue
                    elif c != '-':
                        torf.append(True)
                    else:
                        torf.append(False)
                torf = array(torf)
                tsum = torf.sum()
                assert tsum <= before, \
                    'problem in mapping sequence to structure'
                if tsum < before:
                    arr = arr.take(torf.nonzero()[0], 1)
                    LOGGER.report(
                        'Structure refinement reduced number of '
                        'columns from {0} to {1} in %.2fs.'.format(
                            before, arr.shape[1]), '_refine')
                else:
                    LOGGER.debug(
                        'All residues in the sequence are contained in '
                        'PDB structure {0}.'.format(label))

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            # shift the reference index by the number of rows dropped
            # before it
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report(
            'Row occupancy refinement reduced number of rows from '
            '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            # always keep the reference sequence
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        # keep rows expressed as indices into the original alignment
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report(
            'Sequence identity refinement reduced number of rows '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
            '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report(
            'Column occupancy refinement reduced number of columns '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
            '_refine')

    if not title:
        raise ValueError(
            'label, index, seqid, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr

    if rows is None:
        # no sequence was removed, labels remain valid
        from copy import copy
        labels = copy(msa._labels)
    else:
        labels = msa._labels
        labels = [labels[i] for i in rows]
    return MSA(arr,
               title=msa.getTitle() +
               ' refined ({0})'.format(', '.join(title)),
               labels=labels)
def numResidues(self):
    """Return the number of alphabet characters in the sequence array."""

    # ndarray.sum counts the True flags in C instead of iterating the
    # boolean array element by element with the builtin sum
    return char.isalpha(self._array).sum()
def numGaps(self):
    """Return the number of gap (non-alphabet) characters in the sequence
    array."""

    # local name avoids shadowing numpy's ``array``; ndarray.sum counts the
    # alphabetic positions in C instead of via the builtin sum
    seq = self._array
    return len(seq) - char.isalpha(seq).sum()
__copyright__ = 'Copyright (C) 2010-2012 Ahmet Bakan' from prody.tests import TestCase from numpy import array, log, zeros, char from numpy.testing import assert_array_equal, assert_array_almost_equal from prody.tests.test_datafiles import * from prody import LOGGER, refineMSA, parseMSA, calcMSAOccupancy, mergeMSA from prody import uniqueSequences LOGGER.verbosity = None FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta')) FASTA_ALPHA = char.isalpha(FASTA._msa) NUMSEQ = FASTA.numSequences() * 1. class TestRefinement(TestCase): def testLabel(self): label = 'FSHB_BOVIN' index = FASTA.getIndex(label) refined = refineMSA(FASTA, label=label)._getArray() expected = FASTA._getArray().take(FASTA_ALPHA[index].nonzero()[0], 1) assert_array_equal(refined, expected) def testRowocc(self):
def numResidues(self):
    """Return the number of alphabet characters in the sequence array."""

    # ndarray.sum counts the True flags in C instead of iterating the
    # boolean array element by element with the builtin sum
    return char.isalpha(self._array).sum()
def numGaps(self):
    """Return the number of gap (non-alphabet) characters in the sequence
    array."""

    # local name avoids shadowing numpy's ``array``; ndarray.sum counts the
    # alphabetic positions in C instead of via the builtin sum
    seq = self._array
    return len(seq) - char.isalpha(seq).sum()
from prody.tests import TestCase

from numpy import array, log, zeros, char, ones, fromfile
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, calcShannonEntropy, buildMutinfoMatrix, parseMSA
from prody import calcMSAOccupancy, buildSeqidMatrix, uniqueSequences
from prody import buildOMESMatrix, buildSCAMatrix

LOGGER.verbosity = None

# Reference alignment shared by the tests, with precomputed masks
FASTA = parseMSA(pathDatafile("msa_Cys_knot.fasta"))
FASTA_ALPHA = char.isalpha(FASTA._msa)
FASTA_UPPER = char.upper(FASTA._msa)
FASTA_NUMBER, FASTA_LENGTH = FASTA_ALPHA.shape

# Pairwise identity matrix computed with plain loops: for each pair of
# sequences, the fraction of identical (case-insensitive) characters over
# the columns where at least one of the two sequences is alphabetic
FASTA_EYE = zeros((FASTA_NUMBER, FASTA_NUMBER))
for i in range(FASTA_NUMBER):
    FASTA_EYE[i, i] = 1
    for j in range(i + 1, FASTA_NUMBER):
        score = 0.0
        ncols = 0
        for k in range(FASTA_LENGTH):
            # columns that are gaps in both sequences are not counted
            if not (FASTA_ALPHA[i, k] or FASTA_ALPHA[j, k]):
                continue
            ncols += 1
            if FASTA_UPPER[i, k] == FASTA_UPPER[j, k]:
                score += 1
        # the matrix is symmetric, fill both halves at once
        identity = score / ncols
        FASTA_EYE[i, j] = identity
        FASTA_EYE[j, i] = identity