def testAmbiguous(self):
    """Compare entropy of two identical 'bjzxBJZX' rows with -log(1/[2, 2, 2, 20] * 2).

    NOTE(review): B, J, Z and X are presumably treated as ambiguous residue
    codes by calcShannonEntropy, as the test name suggests -- confirm.
    """
    letters = list("bjzxBJZX")
    # Two identical sequences, one character per column.
    msa = array([letters, list(letters)], dtype="|S1")

    per_column = array([2, 2, 2, 20] * 2)
    expect = -log(1.0 / per_column)

    result = calcShannonEntropy(msa)
    assert_array_almost_equal(expect, result)
def testTwenty(self):
    """Entropy of one column holding each of 20 distinct letters once.

    The expected value is -log(1/20).
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    # One row per letter, a single column.
    rows = [[letter] for letter in alphabet]
    msa = array(rows, dtype="|S1")

    expect = -log(1.0 / 20)
    result = calcShannonEntropy(msa)
    assert_array_almost_equal(expect, result)
def evol_conserv(msa, **kwargs):
    """Compute Shannon entropy for an MSA file and write it to a text file.

    *msa* is a filename handed to :func:`prody.parseMSA`.  All keyword
    arguments are also forwarded unchanged to
    :func:`prody.calcShannonEntropy`.

    Keyword arguments read here:

    :arg prefix: output filename prefix; when absent it is derived from
        *msa* by stripping the extension (and a preceding ``.gz``) and
        appending ``'_conserv'``
    :arg numformat: number format passed to ``writeArray``, default ``'%12g'``
    :arg figent: when true, also save a figure of the entropy
    :arg figwidth, figheight: figure size, defaults ``8`` and ``6``
    :arg figargs: extra positional arguments for ``showShannonEntropy``
    :arg figformat: figure format / file extension, default ``'pdf'``
    :arg figdpi: figure resolution, default ``300``
    """

    import prody
    # LOGGER is needed by the ImportError branch below; import it here the
    # same way evol_coevol does so that branch cannot fail with NameError.
    from prody import LOGGER
    from prody import parseMSA, calcShannonEntropy, showShannonEntropy
    from prody import writeArray
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # compressed input: strip the real extension as well
            prefix, ext = splitext(prefix)
        prefix += '_conserv'

    msa = parseMSA(msa)
    entropy = calcShannonEntropy(msa, **kwargs)

    writeArray(prefix + '.txt', entropy,
               format=kwargs.get('numformat', '%12g'))

    if kwargs.get('figent'):
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warn('Matplotlib could not be imported, '
                        'figures are not saved.')
        else:
            # the figure is only written to disk, never displayed
            prody.SETTINGS['auto_show'] = False
            width = kwargs.get('figwidth', 8)
            height = kwargs.get('figheight', 6)
            figargs = kwargs.get('figargs', ())
            figure = plt.figure(figsize=(width, height))
            showShannonEntropy(entropy, *figargs, msa=msa)
            figformat = kwargs.get('figformat', 'pdf')
            figure.savefig(prefix + '.' + figformat, format=figformat,
                           dpi=kwargs.get('figdpi', 300))
def testTwenty(self):
    """A single column with 20 different letters should give -log(1/20)."""
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    # Shape (20, 1): each letter becomes its own one-character sequence.
    msa = array([[residue] for residue in residues], dtype='|S1')
    expect = -log(1. / 20)
    assert_array_almost_equal(expect, calcShannonEntropy(msa))
def testSmallProbability(self):
    """Entropy of a column with one 'A' and 999999 'C' among 1000000 rows.

    The expected value is -sum(p * log(p)) over the two frequencies.
    """
    msa = zeros((1000000, 1), '|S1')
    # First row is the rare character, every other row the common one.
    msa[:1] = 'A'
    msa[1:] = 'C'

    freqs = array([1., 999999.]) / 1000000
    expect = -(freqs * log(freqs)).sum()

    result = calcShannonEntropy(msa)
    assert_array_almost_equal(expect, result)
def testSmallProbability(self):
    """Check entropy when one of two characters occurs once in 1000000 rows."""
    shape = (1000000, 1)
    msa = zeros(shape, "|S1")
    msa[0] = "A"   # single occurrence
    msa[1:] = "C"  # remaining 999999 rows

    probabilities = array([1.0, 999999.0]) / 1000000
    weighted = probabilities * log(probabilities)
    expect = -weighted.sum()

    result = calcShannonEntropy(msa)
    assert_array_almost_equal(expect, result)
def testSmallProbability(self):
    """Single-column alignment: one 'A', 999999 'C'.

    Expected entropy is computed directly as -sum(p * log(p)).
    """
    msa = zeros((1000000, 1), '|S1')
    msa[0] = 'A'
    msa[1:] = 'C'

    p = array([1., 999999.]) / 1000000
    terms = p * log(p)
    expect = -terms.sum()

    assert_array_almost_equal(expect, calcShannonEntropy(msa))
def testAmbiguous(self):
    """Two identical 'bjzxBJZX' rows; expected entropy is -log(1/[2, 2, 2, 20] * 2).

    NOTE(review): the characters are presumably ambiguous residue codes in
    lower and upper case, per the test name -- confirm.
    """
    sequences = [list('bjzxBJZX') for _ in range(2)]
    msa = array(sequences, dtype='|S1')

    denominators = array([2, 2, 2, 20] * 2)
    expect = -log(1. / denominators)

    assert_array_almost_equal(expect, calcShannonEntropy(msa))
def testGapDividend(self):
    """Entropy with ``omitgaps=True`` for six sequences plus an all-gap row.

    The expected values, -log(1/[1, 2, 3, 6]), are computed from the six
    non-gap rows only.
    """
    sequences = ["AAAA", "AAAC", "AACD", "ACCE", "ACDF", "ACDG", "----"]
    msa = array([list(seq) for seq in sequences], dtype="|S1")

    distinct = array([1, 2, 3, 6])
    expect = -log(1.0 / distinct)

    result = calcShannonEntropy(msa, omitgaps=True)
    assert_array_almost_equal(expect, result)
def testSixSequences(self):
    """Entropy of a six-sequence, sixteen-column alignment in mixed case.

    The expected values repeat -log(1/[1, 2, 3, 6]) for each of the four
    four-column groups.
    """
    sequences = (
        'AAAAaaaaAAAAaaaa',
        'AAACaaacAAACaaac',
        'AACDaacdAACDaacd',
        'ACCEacceacceACCE',
        'ACDFacdfacdfACDF',
        'ACDGacdgacdgACDG',
    )
    msa = array([list(seq) for seq in sequences], dtype='|S1')

    distinct = array([1, 2, 3, 6] * 4)
    expect = -log(1. / distinct)

    result = calcShannonEntropy(msa)
    assert_array_almost_equal(expect, result)
def testGapDividend(self):
    """Six sequences followed by an all-gap row, evaluated with omitgaps=True.

    Expected per-column entropy is -log(1/[1, 2, 3, 6]).
    """
    rows = ('AAAA', 'AAAC', 'AACD', 'ACCE', 'ACDF', 'ACDG', '----')
    msa = array([list(row) for row in rows], dtype='|S1')

    counts = array([1, 2, 3, 6])
    expect = -log(1. / counts)

    assert_array_almost_equal(expect, calcShannonEntropy(msa, omitgaps=True))
def testSixSequences(self):
    """Six sequences of sixteen mixed-case characters.

    Each group of four columns is expected to give -log(1/[1, 2, 3, 6]).
    """
    rows = [
        'AAAAaaaaAAAAaaaa',
        'AAACaaacAAACaaac',
        'AACDaacdAACDaacd',
        'ACCEacceacceACCE',
        'ACDFacdfacdfACDF',
        'ACDGacdgacdgACDG',
    ]
    msa = array([list(row) for row in rows], dtype='|S1')

    per_column = array([1, 2, 3, 6] * 4)
    expect = -log(1. / per_column)

    assert_array_almost_equal(expect, calcShannonEntropy(msa))
def testGapDividend(self):
    """Check ``omitgaps=True`` on an alignment whose last row is all gaps.

    The expectation, -log(1/[1, 2, 3, 6]), corresponds to the six rows
    that contain letters.
    """
    letters_rows = ['AAAA', 'AAAC', 'AACD', 'ACCE', 'ACDF', 'ACDG']
    gap_row = '----'
    msa = array([list(row) for row in letters_rows + [gap_row]],
                dtype='|S1')

    n_distinct = array([1, 2, 3, 6])
    expect = -log(1. / n_distinct)

    result = calcShannonEntropy(msa, omitgaps=True)
    assert_array_almost_equal(expect, result)
def testSixSequences(self):
    """Compare entropy of a 6 x 16 mixed-case alignment with the expectation.

    The expected array is -log(1/[1, 2, 3, 6]) repeated four times.
    """
    alignment = [
        "AAAAaaaaAAAAaaaa",
        "AAACaaacAAACaaac",
        "AACDaacdAACDaacd",
        "ACCEacceacceACCE",
        "ACDFacdfacdfACDF",
        "ACDGacdgacdgACDG",
    ]
    msa = array([list(sequence) for sequence in alignment], dtype="|S1")

    n_distinct = array([1, 2, 3, 6] * 4)
    expect = -log(1.0 / n_distinct)

    result = calcShannonEntropy(msa)
    assert_array_almost_equal(expect, result)
def calcEvolProperties(self, resid='all', refresh=False, folder=None,
                       max_cols=None, max_seqs=25000, **kwargs):
    ''' Computes Evol properties, i.e. Shannon entropy and Mutual
    Information, from Pfam Multiple Sequence Alignments, for a given
    residue or for all Pfam domains found.

    For every selected Pfam entry the keys 'mapping', 'ref_MSA',
    'entropy', 'MutInfo' and 'DirInfo' are (re)set in ``self.Pfam``.
    The Direct Information computation is currently disabled, so
    'DirInfo' is always left as NaN.

    :arg resid: residue number, or 'all' to process every Pfam entry
    :arg refresh: if **True**, recompute even when a mapping is stored
    :type refresh: bool
    :arg folder: currently unused (only resolved from SETTINGS)
    :arg max_cols: currently unused
    :arg max_seqs: row occupancy filtering is tightened until the MSA
        has at most this many sequences (or the threshold reaches 1)
    :arg kwargs: passed to both ``parseMSA`` and the final ``refineMSA``

    :returns: dictionary mapping each processed Pfam ID to its entry
    :raises RuntimeError: if *resid* falls in no Pfam domain

    Failures while processing a single Pfam entry are logged; a failure
    in the fetch/slice/map stage stores the error message in 'mapping'.
    '''
    assert type(refresh) is bool
    # recover Pfam mapping (if not found already)
    self._searchPfam(refresh=refresh)
    if resid == 'all':
        # snapshot of the keys, independent of later dict changes
        PF_list = list(self.Pfam)
    else:
        # get list of Pfam domains containing resid
        PF_list = [
            k for k in self.Pfam
            if any(int(segment['start']) <= resid <= int(segment['end'])
                   for segment in self.Pfam[k]['locations'])
        ]
        if len(PF_list) == 0:
            raise RuntimeError(
                'No Pfam domain for resid {}.'.format(resid))
        if len(PF_list) > 1:
            LOGGER.warn('Residue {} is found in multiple '.format(resid) +
                        '({}) Pfam domains.'.format(len(PF_list)))
    if folder is None:
        # NOTE: the value is not used below, since MSAs are no longer
        # saved locally; kept for backward compatibility.
        folder = SETTINGS.get('rhapsody_local_folder', './')
    # iterate over Pfam families
    for PF in PF_list:
        d = self.Pfam[PF]
        # skip if properties are pre-computed
        if not refresh and d.get('mapping') is not None:
            continue
        d['mapping'] = None
        d['ref_MSA'] = None
        d['entropy'] = np.nan
        d['MutInfo'] = np.nan
        d['DirInfo'] = np.nan
        try:
            LOGGER.info('Processing {}...'.format(PF))
            # fetch & parse MSA without saving downloaded MSA
            f = fetchPfamMSA(PF)
            try:
                msa = parseMSA(f, **kwargs)
            finally:
                # remove the downloaded file even when parsing fails
                os.remove(f)
            # slice MSA to match all segments of the Uniprot sequence
            sliced_msa, indexes = self._sliceMSA(msa)
            # get mapping between Uniprot sequence and Pfam domain
            d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))
            # a string in 'mapping' marks this entry as failed
            d['mapping'] = str(e)
            continue
        try:
            # refine MSA ('seqid' param. is set as in PolyPhen-2)
            rowocc = 0.6
            while True:
                sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                rowocc += 0.02
                if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                    break
            ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
            d['ref_MSA'] = ref_msa
            # compute evolutionary properties
            d['entropy'] = calcShannonEntropy(ref_msa)
            d['MutInfo'] = buildMutinfoMatrix(ref_msa)
            # d['DirInfo'] = buildDirectInfoMatrix(ref_msa)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))
    return {k: self.Pfam[k] for k in PF_list}
def evol_coevol(msa, **kwargs):
    """Compute mutual information for an MSA file and write result matrices.

    *msa* is a filename handed to :func:`prody.parseMSA`.  All keyword
    arguments are also forwarded to :func:`prody.buildMutinfoMatrix`
    (and to :func:`prody.calcShannonEntropy` when a normalization needs
    the entropy).

    One matrix is written for the plain mutual information, plus one for
    every requested normalization and correction.  Each is saved as
    ``prefix + suffix + '.txt'``, optionally as a heatmap (``.hm``) and
    optionally as a figure.

    Keyword arguments read here:

    :arg prefix: output filename prefix; when absent it is derived from
        *msa* by stripping the extension (and a preceding ``.gz``) and
        appending ``'_mutinfo'``
    :arg numformat: number format for ``writeArray``, default ``'%12g'``
    :arg heatmap: when true, also write heatmap files
    :arg normalization: iterable of normalization names, or **None**
    :arg correction: iterable of correction names, or **None**
    :arg figcoevol: when true, also save a figure per matrix
    :arg cmin, cmax: color limits, default to the matrix min and max
    :arg figwidth, figheight: figure size, defaults ``8`` and ``6``
    :arg xlabel, title: passed to ``showMutinfoMatrix``
    :arg figformat: figure format / file extension, default ``'pdf'``
    :arg figdpi: figure resolution, default ``300``
    """

    from numpy import arange
    import prody
    from prody import parseMSA, buildMutinfoMatrix, showMutinfoMatrix
    from prody import applyMutinfoCorr, calcShannonEntropy
    from prody import writeArray, LOGGER, applyMutinfoNorm, writeHeatmap
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # compressed input: strip the real extension as well
            prefix, ext = splitext(prefix)
        prefix += '_mutinfo'

    msa = parseMSA(msa)
    mutinfo = buildMutinfoMatrix(msa, **kwargs)
    numformat = kwargs.get('numformat', '%12g')
    heatmap = kwargs.get('heatmap', False)
    if heatmap:
        hmargs = {
            'xlabel': 'Residue',
            'ylabel': 'Residue',
            'xorigin': 1,
            'xstep': 1,
            'residue': arange(msa.numResidues()),
        }

    # (what, which) pairs; (None, None) stands for the plain MI matrix
    todo = [(None, None)]
    norm = kwargs.get('normalization', [])
    corr = kwargs.get('correction', [])
    if norm is not None:
        if 'joint' in norm:
            todo.append(('norm', 'joint'))
        for which in norm:
            # 'joint' was queued above; skip it here so that it is not
            # computed and written twice
            if which == 'joint':
                continue
            todo.append(('norm', which))
    if corr is not None:
        for which in corr:
            todo.append(('corr', which))

    entropy = None  # computed lazily, only if a normalization needs it

    for what, which in todo:
        if what is None:
            matrix = mutinfo
            suffix = ''
            tuffix = ' Mutual Information'
        elif which == 'joint':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            matrix = buildMutinfoMatrix(msa, norm=True, **kwargs)
            suffix = '_norm_joint'
            tuffix = ' MI - Normalization: ' + which
        elif what == 'norm':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            if entropy is None:
                entropy = calcShannonEntropy(msa, **kwargs)
            matrix = applyMutinfoNorm(mutinfo, entropy, norm=which)
            suffix = '_norm_' + which
            tuffix = ' MI - Normalization: ' + which
        else:
            LOGGER.info('Applying {0} correction.'.format(repr(which)))
            matrix = applyMutinfoCorr(mutinfo, which)
            suffix = '_corr_' + which
            tuffix = ' MI - Correction: ' + which

        writeArray(prefix + suffix + '.txt', matrix, format=numformat)

        if heatmap:
            writeHeatmap(prefix + suffix + '.hm', matrix,
                         title=msa.getTitle() + tuffix, **hmargs)

        if kwargs.get('figcoevol'):
            try:
                import matplotlib.pyplot as plt
            except ImportError:
                LOGGER.warn('Matplotlib could not be imported, '
                            'figures are not saved.')
            else:
                cmin = kwargs.get('cmin', matrix.min())
                cmax = kwargs.get('cmax', matrix.max())
                # the figure is only written to disk, never displayed
                prody.SETTINGS['auto_show'] = False
                width = kwargs.get('figwidth', 8)
                height = kwargs.get('figheight', 6)
                xlabel = kwargs.get('xlabel')
                title = kwargs.get('title')
                figure = plt.figure(figsize=(width, height))
                showMutinfoMatrix(matrix, msa=msa, clim=(cmin, cmax),
                                  xlabel=xlabel, title=title)
                figformat = kwargs.get('figformat', 'pdf')
                figure.savefig(prefix + suffix + '.' + figformat,
                               format=figformat,
                               dpi=kwargs.get('figdpi', 300))