def evol_conserv(msa, **kwargs):
    """Compute per-column Shannon entropy of an MSA file and save it.

    The entropy array is written to ``<prefix>.txt``.  When the *figent*
    keyword is true, a plot is also saved as ``<prefix>.<figformat>``.

    :arg msa: filename of the multiple sequence alignment

    Recognised keywords: *prefix* (output name without extension; defaults
    to the input name without its extension(s) plus ``_conserv``),
    *numformat*, *figent*, *figwidth*, *figheight*, *figargs*, *figformat*
    and *figdpi*.  All keywords are also forwarded to
    ``calcShannonEntropy``.
    """

    import prody
    from prody import parseMSA, calcShannonEntropy, showShannonEntropy
    # LOGGER was used below without being imported in this function.
    from prody import writeArray, LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # strip the real extension hidden behind ".gz"
            prefix, ext = splitext(prefix)
        prefix += '_conserv'

    msa = parseMSA(msa)
    entropy = calcShannonEntropy(msa, **kwargs)

    writeArray(prefix + '.txt', entropy,
               format=kwargs.get('numformat', '%12g'))

    if not kwargs.get('figent'):
        return

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warn('Matplotlib could not be imported, '
                    'figures are not saved.')
    else:
        # figures are written to disk, never shown interactively
        prody.SETTINGS['auto_show'] = False
        width = kwargs.get('figwidth', 8)
        height = kwargs.get('figheight', 6)
        figargs = kwargs.get('figargs', ())
        figure = plt.figure(figsize=(width, height))
        showShannonEntropy(entropy, msa=msa, *figargs)
        figformat = kwargs.get('figformat', 'pdf')
        figure.savefig(prefix + '.' + figformat, format=figformat,
                       dpi=kwargs.get('figdpi', 300))
def evol_occupancy(msa, **kwargs):
    """Compute occupancy of an MSA file and save it as text (and figures).

    One array is written per requested axis to
    ``<prefix>_<axis>.txt``.  When the *figocc* keyword is true, a plot is
    saved next to each text file as ``<prefix>_<axis>.<figformat>``.

    :arg msa: filename of the multiple sequence alignment

    Recognised keywords: *prefix* (defaults to the input name without its
    extension(s) plus ``_occupancy``), *occaxis* (``'row'`` by default;
    ``'both'`` computes ``'row'`` and ``'col'``), *numformat*, *figocc*,
    *figwidth*, *figheight*, *xlabel*, *title*, *label*, *figformat* and
    *figdpi*.
    """

    import prody
    from prody import parseMSA, calcMSAOccupancy, showMSAOccupancy
    # LOGGER was used below without being imported in this function.
    from prody import writeArray, LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            prefix, ext = splitext(prefix)
        prefix += '_occupancy'

    msa = parseMSA(msa)
    numformat = kwargs.get('numformat', '%12g')

    occaxis = kwargs.get('occaxis', 'row')
    axes = ['row', 'col'] if occaxis == 'both' else [occaxis]
    # Always a list: indexing a bare string here used to yield just '_',
    # so single-axis output was named '<prefix>_.txt'.
    suffixes = ['_' + axis for axis in axes]
    occupancy = [calcMSAOccupancy(msa, occ=axis) for axis in axes]

    for suffix, occ in zip(suffixes, occupancy):
        writeArray(prefix + suffix + '.txt', occ, format=numformat)

    if not kwargs.get('figocc'):
        return

    # import once instead of once per occupancy array
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warn('Matplotlib could not be imported, '
                    'figures are not saved.')
        return

    # figures are written to disk, never shown interactively
    prody.SETTINGS['auto_show'] = False
    width = kwargs.get('figwidth', 8)
    height = kwargs.get('figheight', 6)
    xlabel = kwargs.get('xlabel')
    title = kwargs.get('title')
    label = kwargs.get('label')
    figformat = kwargs.get('figformat', 'pdf')
    figdpi = kwargs.get('figdpi', 300)

    for suffix, occ in zip(suffixes, occupancy):
        figure = plt.figure(figsize=(width, height))
        showMSAOccupancy(msa=msa, occ=occ, label=label,
                         xlabel=xlabel, title=title)
        figure.savefig(prefix + suffix + '.' + figformat,
                       format=figformat, dpi=figdpi)
def evol_occupancy(msa, **kwargs):
    """Compute occupancy of an MSA file and save it as text (and figures).

    One array is written per requested axis to
    ``<prefix>_<axis>.txt``.  When the *figocc* keyword is true, a plot is
    saved next to each text file as ``<prefix>_<axis>.<figformat>``.

    :arg msa: filename of the multiple sequence alignment

    Recognised keywords: *prefix* (defaults to the input name without its
    extension(s) plus ``_occupancy``), *occaxis* (``'row'`` by default;
    ``'both'`` computes ``'row'`` and ``'col'``), *numformat*, *figocc*,
    *figwidth*, *figheight*, *xlabel*, *title*, *label*, *figformat* and
    *figdpi*.
    """

    import prody
    from prody import parseMSA, calcMSAOccupancy, showMSAOccupancy
    # LOGGER was used below without being imported in this function.
    from prody import writeArray, LOGGER
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            prefix, ext = splitext(prefix)
        prefix += '_occupancy'

    msa = parseMSA(msa)
    numformat = kwargs.get('numformat', '%12g')

    occaxis = kwargs.get('occaxis', 'row')
    if occaxis == 'both':
        axes = ['row', 'col']
    else:
        axes = [occaxis]

    # Keep (suffix, array) pairs together.  The suffix used to be a bare
    # string in the single-axis case, and indexing it produced only '_',
    # giving files named '<prefix>_.txt'.
    results = []
    for axis in axes:
        results.append(('_' + axis, calcMSAOccupancy(msa, occ=axis)))

    for suffix, occ in results:
        writeArray(prefix + suffix + '.txt', occ, format=numformat)

    if kwargs.get('figocc'):
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warn('Matplotlib could not be imported, '
                        'figures are not saved.')
        else:
            # figures are written to disk, never shown interactively
            prody.SETTINGS['auto_show'] = False
            width = kwargs.get('figwidth', 8)
            height = kwargs.get('figheight', 6)
            xlabel = kwargs.get('xlabel')
            title = kwargs.get('title')
            label = kwargs.get('label')
            figformat = kwargs.get('figformat', 'pdf')
            for suffix, occ in results:
                figure = plt.figure(figsize=(width, height))
                showMSAOccupancy(msa=msa, occ=occ, label=label,
                                 xlabel=xlabel, title=title)
                figure.savefig(prefix + suffix + '.' + figformat,
                               format=figformat,
                               dpi=kwargs.get('figdpi', 300))
def evol_refine(msa, **kwargs):
    """Refine an MSA file and write the result to a new file.

    :arg msa: filename of the multiple sequence alignment

    The *outname* keyword sets the output filename.  When it is missing,
    the name is derived from the input by inserting ``_refined`` before
    the extension, e.g. ``aln.fasta`` -> ``aln_refined.fasta`` and
    ``aln.fasta.gz`` -> ``aln_refined.fasta.gz``.  All keywords are
    forwarded to both ``refineMSA`` and ``writeMSA``.
    """

    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Split the name that already lost ".gz" (the input used to be
            # split a second time, which changed nothing) and keep the
            # compound extension so the format can still be recognised.
            outname, inner = splitext(outname)
            ext = inner + ext
        outname += '_refined' + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)
def evol_refine(msa, **kwargs):
    """Refine an MSA file and write the result to a new file.

    :arg msa: filename of the multiple sequence alignment

    The *outname* keyword sets the output filename.  When it is missing,
    the name is derived from the input by inserting ``_refined`` before
    the extension, e.g. ``aln.fasta`` -> ``aln_refined.fasta`` and
    ``aln.fasta.gz`` -> ``aln_refined.fasta.gz``.  All keywords are
    forwarded to both ``refineMSA`` and ``writeMSA``.
    """

    import prody
    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        stem, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Peel the real extension off the already-stripped stem; the
            # original split the input again, leaving ".fasta" inside the
            # stem and producing names like "aln.fasta_refined.gz".
            stem, inner = splitext(stem)
            ext = inner + ext
        outname = stem + '_refined' + ext

    refined = refineMSA(parseMSA(msa), **kwargs)
    writeMSA(outname, refined, **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)
def evol_refine(msa, **kwargs):
    """Refine an MSA file and write the result to a new file.

    :arg msa: filename of the multiple sequence alignment

    The *outname* keyword sets the output filename.  When it is missing,
    the name is derived from the input by inserting ``_refined`` before
    the extension, e.g. ``aln.fasta`` -> ``aln_refined.fasta`` and
    ``aln.fasta.gz`` -> ``aln_refined.fasta.gz``.  All keywords are
    forwarded to both ``refineMSA`` and ``writeMSA``.
    """

    import prody
    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get("outname")
    if outname is None:
        outname, ext = splitext(msa)
        if ext.lower() == ".gz":
            # Re-split the stripped name, not the input: splitting the
            # input again was a no-op and left the inner extension in
            # front of "_refined".
            outname, inner = splitext(outname)
            ext = inner + ext
        outname += "_refined" + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info("Refined MSA is written in file: " + outname)
def evol_merge(*msa, **kwargs):
    """Merge two or more MSA files and write the merged alignment.

    :arg msa: filenames of the alignments to merge (at least two)

    The *outname* keyword sets the output filename; by default it is the
    title of the first file plus ``_merged`` and the extension registered
    for that file's format.  All keywords are forwarded to ``writeMSA``.

    :raises ValueError: if fewer than two filenames are given
    :raises IOError: if any of the files cannot be parsed
    """

    # Validate before importing anything so a bad call fails fast.
    if len(msa) < 2:
        raise ValueError('multiple msa filenames must be specified')

    import prody
    from prody import parseMSA, mergeMSA, LOGGER, writeMSA, MSAFile
    from prody.sequence.msafile import MSAEXTMAP
    from os.path import splitext

    # An explicit loop keeps the failing filename available to the error
    # message; a comprehension variable is not visible in the handler on
    # Python 3 and raised NameError instead of the intended IOError.
    msaobj = []
    for fn in msa:
        try:
            msaobj.append(parseMSA(fn))
        except Exception:
            raise IOError('failed to parse {0}'.format(fn))

    msafile = MSAFile(msa[0])
    outname = kwargs.get('outname') or (msafile.getTitle() + '_merged' +
                                        MSAEXTMAP[msafile.format])
    writeMSA(outname, mergeMSA(*msaobj), **kwargs)
    LOGGER.info('Merged MSA is saved as: {0}'.format(outname))
def evol_coevol(msa, **kwargs):
    """Compute mutual information for an MSA file and save the matrices.

    The raw matrix is written to ``<prefix>.txt``.  For every requested
    normalization and correction an additional matrix is written with a
    ``_norm_<name>`` or ``_corr_<name>`` suffix.  Heatmap files (``.hm``)
    and figures are written next to each text file when requested.

    :arg msa: filename of the multiple sequence alignment

    Recognised keywords: *prefix* (defaults to the input name without its
    extension(s) plus ``_mutinfo``), *numformat*, *heatmap*,
    *normalization* and *correction* (iterables of names), *figcoevol*,
    *cmin*, *cmax*, *figwidth*, *figheight*, *xlabel*, *title*,
    *figformat* and *figdpi*.  All keywords are also forwarded to
    ``buildMutinfoMatrix`` and ``calcShannonEntropy``.
    """

    from numpy import arange
    import prody
    from prody import parseMSA, buildMutinfoMatrix, showMutinfoMatrix
    from prody import applyMutinfoCorr, calcShannonEntropy
    from prody import writeArray, LOGGER, applyMutinfoNorm, writeHeatmap
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            prefix, ext = splitext(prefix)
        prefix += '_mutinfo'

    msa = parseMSA(msa)
    mutinfo = buildMutinfoMatrix(msa, **kwargs)
    numformat = kwargs.get('numformat', '%12g')
    heatmap = kwargs.get('heatmap', False)
    if heatmap:
        hmargs = {
            'xlabel': 'Residue', 'ylabel': 'Residue',
            'xorigin': 1, 'xstep': 1,
            'residue': arange(msa.numResidues())}

    # Work list of (kind, name); (None, None) stands for the raw matrix.
    todo = [(None, None)]
    norm = kwargs.get('normalization', [])
    corr = kwargs.get('correction', [])
    if norm is not None:
        if 'joint' in norm:
            todo.append(('norm', 'joint'))
        for which in norm:
            # 'joint' was queued above; the old test compared against the
            # misspelling 'join', so joint normalization ran twice.
            if which == 'joint':
                continue
            todo.append(('norm', which))
    if corr is not None:
        for which in corr:
            todo.append(('corr', which))

    # Resolve matplotlib once rather than on every matrix.
    plt = None
    if kwargs.get('figcoevol'):
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            plt = None
            LOGGER.warn('Matplotlib could not be imported, '
                        'figures are not saved.')

    entropy = None  # computed lazily, only if a normalization needs it
    for what, which in todo:
        if what is None:
            matrix = mutinfo
            suffix = ''
            tuffix = ' Mutual Information'
        elif which == 'joint':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            matrix = buildMutinfoMatrix(msa, norm=True, **kwargs)
            suffix = '_norm_joint'
            tuffix = ' MI - Normalization: ' + which
        elif what == 'norm':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            if entropy is None:
                entropy = calcShannonEntropy(msa, **kwargs)
            matrix = applyMutinfoNorm(mutinfo, entropy, norm=which)
            suffix = '_norm_' + which
            tuffix = ' MI - Normalization: ' + which
        else:
            LOGGER.info('Applying {0} correction.'.format(repr(which)))
            matrix = applyMutinfoCorr(mutinfo, which)
            suffix = '_corr_' + which
            tuffix = ' MI - Correction: ' + which

        writeArray(prefix + suffix + '.txt', matrix, format=numformat)

        if heatmap:
            writeHeatmap(prefix + suffix + '.hm', matrix,
                         title=msa.getTitle() + tuffix, **hmargs)

        if plt is not None:
            cmin = kwargs.get('cmin', matrix.min())
            cmax = kwargs.get('cmax', matrix.max())
            # figures are written to disk, never shown interactively
            prody.SETTINGS['auto_show'] = False
            width = kwargs.get('figwidth', 8)
            height = kwargs.get('figheight', 6)
            xlabel = kwargs.get('xlabel')
            title = kwargs.get('title')
            figure = plt.figure(figsize=(width, height))
            showMutinfoMatrix(matrix, msa=msa, clim=(cmin, cmax),
                              xlabel=xlabel, title=title)
            figformat = kwargs.get('figformat', 'pdf')
            figure.savefig(prefix + suffix + '.' + figformat,
                           format=figformat,
                           dpi=kwargs.get('figdpi', 300))
def evol_rankorder(mutinfo, **kwargs):
    """Rank-order residue pairs of a mutual information matrix.

    The square matrix stored in *mutinfo* is loaded, the entries below the
    diagonal are sorted in decreasing order and the top pairs are written
    to ``<outname>_rankorder<ext>``, filtered either by sequence
    separation or, when a matching PDB chain is available and *usedist*
    is set, by CA-CA distance.

    :arg mutinfo: filename of the text file holding the matrix

    Recognised keywords: *delimiter*, *msa*, *label*, *pdb*, *outname*,
    *zscore*, *numpairs*, *seqsep*, *usedist* and *dist*.  *numpairs*,
    *seqsep* and (with *usedist*) *dist* are used in comparisons and
    arithmetic, so callers are expected to supply numbers for them.

    :raises ValueError: if the file does not contain a square matrix
    """

    from prody import parseMSA, LOGGER, parsePDB, calcMSAOccupancy
    from prody.utilities import openFile
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label = kwargs.get('msa'), kwargs.get('label')
    pdb, pdbflag = kwargs.get('pdb'), False
    resnum = None

    # --- residue numbers from a PDB chain, if one matches the matrix ---
    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            for chain in pdb.iterChains():
                sel = chain.select('protein and name CA')
                if sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
            else:
                # reported once, after no chain matched
                LOGGER.info('Number of residues in PDB does not match '
                            'mutinfo matrix, ignoring PDB input')

    # --- otherwise residue numbers from the MSA, else serial indices ---
    if not pdbflag:
        if msa is not None:
            msa = parseMSA(msa)
            if msa.numResidues() != shape[0]:
                LOGGER.info('Input MSA and mutinfo do not have similar no '
                            'of residues, ignoring MSA')
            else:
                index = msa.getIndex(label)
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    # fall back to the sequence with the highest occupancy
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                label, seq, start, end = msa[index]
                if (start and end is not None) and (start < end):
                    resnum = np.arange(start, end + 1)
                    if len(resnum) != shape[0]:
                        LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                    'not have similar no of residues, using '
                                    'serial indexing'.format(label, start,
                                                             end))
                        label = 'Serial Index'
                        resnum = np.arange(1, shape[0] + 1)
                    else:
                        LOGGER.info('Residue numbers will be based on '
                                    'label: {0}'.format(label))
                else:
                    LOGGER.info('Could not identify residue indexes from MSA'
                                ' using serial indexing')
                    label = 'Serial Index'
                    resnum = np.arange(1, shape[0] + 1)
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')
            resnum = np.arange(1, shape[0] + 1)

    if resnum is None:
        # The MSA was given but rejected above; without this fallback the
        # indexing below failed on None.
        resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.
                format(str(resnum[0]), str(resnum[-1])))

    # --- output filename ---
    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # strip the inner extension too and keep it in front of ".gz"
            outname, inner = splitext(outname)
            ext = inner + ext
    else:
        outname, ext = splitext(str(outname))
        # splitext returns '' (never None) when there is no extension
        if not ext:
            ext = '.txt'
    outname += '_rankorder' + ext

    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Sort the strictly-lower-triangle entries in decreasing order.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]

    count = 1
    i = 0
    if label is None:
        label = 'Serial Index'
    numpairs = kwargs.get('numpairs')
    size = len(row)
    seqsep = kwargs.get('seqsep')
    usedist = kwargs.get('usedist')

    f = openFile(outname, 'wb')
    try:
        if not usedist or not pdbflag:
            if usedist:
                LOGGER.info('use-struct-sep set to true, but PDB not given '
                            'or incorrect residue number. Using sequence '
                            'separation')
            elif pdbflag:
                LOGGER.info('use-dist not set, using sequence separation'
                            ' to report coevolving pairs')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tSequence Separation:' + str(seqsep) + '\n'))
            if pdbflag:
                f.write((header + '\tDistance\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.
                                format(count, resnum[row[i]],
                                       resnum[column[i]],
                                       mi[row[i], column[i]],
                                       distance[row[i], column[i]]))
                        count += 1
                    i += 1
            else:
                f.write((header + '\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\n'.
                                format(count, resnum[row[i]],
                                       resnum[column[i]],
                                       mi[row[i], column[i]]))
                        count += 1
                    i += 1
        else:
            structsep = kwargs.get('dist')
            # tab added before the cutoff field, matching the
            # sequence-separation header above
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tDistance Cutoff:' + str(structsep) + '\n'))
            f.write((header + '\tDistance\n'))
            while count <= numpairs and i < size:
                if distance[row[i], column[i]] > structsep:
                    f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.
                            format(count, resnum[row[i]],
                                   resnum[column[i]],
                                   mi[row[i], column[i]],
                                   distance[row[i], column[i]]))
                    count += 1
                i += 1
    finally:
        # close the output even if a write fails part-way
        f.close()
# Module-level fixtures for the sequence-analysis tests: the Cys-knot FASTA
# alignment and arrays derived from it, all built once at import time.
__author__ = 'Ahmet Bakan'
__copyright__ = 'Copyright (C) 2010-2012 Ahmet Bakan'

from prody.tests import TestCase

from numpy import array, log, zeros, char, ones
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, calcShannonEntropy, buildMutinfoMatrix, parseMSA
from prody import calcMSAOccupancy, buildSeqidMatrix, uniqueSequences

# presumably turns off ProDy log output while the tests run -- confirm
LOGGER.verbosity = None

FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta'))
# Element-wise masks/copies of the raw character array of the alignment.
FASTA_ALPHA = char.isalpha(FASTA._msa)
FASTA_UPPER = char.upper(FASTA._msa)

FASTA_NUMBER, FASTA_LENGTH = FASTA_ALPHA.shape
# Square matrix with one row/column per sequence; the diagonal is set to 1.
FASTA_EYE = zeros((FASTA_NUMBER, FASTA_NUMBER))
for i in range(FASTA_NUMBER):
    FASTA_EYE[i, i] = 1
    for j in range(i + 1, FASTA_NUMBER):
        # For the pair (i, j): ncols counts columns where at least one of
        # the two sequences has an alphabetic character, score counts those
        # columns where the upper-cased characters are equal.
        # NOTE(review): score/ncols are not consumed in the visible part of
        # this file; the statement storing the result likely follows.
        score = 0.0
        ncols = 0
        for k in range(FASTA_LENGTH):
            if FASTA_ALPHA[i, k] or FASTA_ALPHA[j, k]:
                if FASTA_UPPER[i, k] == FASTA_UPPER[j, k]:
                    score += 1
                ncols += 1
# Module-level fixtures and tests for MSA refinement.
__author__ = 'Ahmet Bakan'
__copyright__ = 'Copyright (C) 2010-2012 Ahmet Bakan'

from prody.tests import TestCase

from numpy import array, log, zeros, char
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, refineMSA, parseMSA, calcMSAOccupancy, mergeMSA
from prody import uniqueSequences

# presumably turns off ProDy log output while the tests run -- confirm
LOGGER.verbosity = None

FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta'))
# Boolean mask of alphabetic characters in the raw alignment array.
FASTA_ALPHA = char.isalpha(FASTA._msa)
# Number of sequences as a float (multiplied by 1. to force float type).
NUMSEQ = FASTA.numSequences() * 1.


class TestRefinement(TestCase):
    # Tests for refineMSA against expectations computed from FASTA directly.

    def testLabel(self):
        # Refining by label must give the columns of the full alignment at
        # which the labelled sequence has an alphabetic character.

        label = 'FSHB_BOVIN'
        index = FASTA.getIndex(label)
        refined = refineMSA(FASTA, label=label)._getArray()

        expected = FASTA._getArray().take(FASTA_ALPHA[index].nonzero()[0], 1)

        assert_array_equal(refined, expected)

    # NOTE(review): the body of this test is outside the visible source.
    def testRowocc(self):
from os.path import join try: from io import StringIO except ImportError: from io import StringIO from numpy import array, log, zeros, char from numpy.testing import assert_array_equal, dec from prody.tests.datafiles import * from prody.tests import TEMPDIR from prody import MSA, MSAFile, parseMSA, LOGGER, writeMSA LOGGER.verbosity = None FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta')) SELEX = parseMSA(pathDatafile('msa_Cys_knot.slx')) STOCK = parseMSA(pathDatafile('msa_Cys_knot.sth')) FASTA_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.fasta'))) SELEX_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.sth'))) STOCK_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.sth'))) class TestMSAFile(TestCase): def testMSAFile(self): self.assertListEqual(FASTA_LIST, SELEX_LIST) self.assertListEqual(FASTA_LIST, STOCK_LIST) def testWriteFasta(self):
def evol_rankorder(mutinfo, **kwargs):
    """Rank-order residue pairs of a mutual information matrix.

    The square matrix stored in *mutinfo* is loaded, the entries below the
    diagonal are sorted in decreasing order and the top pairs are written
    to ``<outname>_rankorder<ext>``, filtered either by sequence
    separation or, when a matching PDB chain is available and *usedist*
    is set, by CA-CA distance.

    :arg mutinfo: filename of the text file holding the matrix

    Recognised keywords: *delimiter*, *msa*, *label*, *pdb*, *outname*,
    *zscore*, *numpairs*, *seqsep*, *usedist* and *dist*.  *numpairs*,
    *seqsep* and (with *usedist*) *dist* are used in comparisons and
    arithmetic, so callers are expected to supply numbers for them.

    :raises ValueError: if the file does not contain a square matrix
    """

    from prody import parseMSA, LOGGER, parsePDB, calcMSAOccupancy
    from prody.utilities import openFile
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label = kwargs.get('msa'), kwargs.get('label')
    pdb, pdbflag = kwargs.get('pdb'), False
    resnum = None

    # --- residue numbers from a PDB chain, if one matches the matrix ---
    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            for chain in pdb.iterChains():
                sel = chain.select('protein and name CA')
                if sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
            else:
                # reported once, after no chain matched
                LOGGER.info('Number of residues in PDB does not match '
                            'mutinfo matrix, ignoring PDB input')

    # --- otherwise residue numbers from the MSA, else serial indices ---
    if not pdbflag:
        if msa is not None:
            msa = parseMSA(msa)
            if msa.numResidues() != shape[0]:
                LOGGER.info('Input MSA and mutinfo do not have similar no '
                            'of residues, ignoring MSA')
            else:
                index = msa.getIndex(label)
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    # fall back to the sequence with the highest occupancy
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                label, seq, start, end = msa[index]
                if (start and end is not None) and (start < end):
                    resnum = np.arange(start, end + 1)
                    if len(resnum) != shape[0]:
                        LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                    'not have similar no of residues, using '
                                    'serial indexing'.format(
                                        label, start, end))
                        label = 'Serial Index'
                        resnum = np.arange(1, shape[0] + 1)
                    else:
                        LOGGER.info('Residue numbers will be based on '
                                    'label: {0}'.format(label))
                else:
                    LOGGER.info('Could not identify residue indexes from MSA'
                                ' using serial indexing')
                    label = 'Serial Index'
                    resnum = np.arange(1, shape[0] + 1)
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')
            resnum = np.arange(1, shape[0] + 1)

    if resnum is None:
        # The MSA was given but rejected above; without this fallback the
        # indexing below failed on None.
        resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.format(
        str(resnum[0]), str(resnum[-1])))

    # --- output filename ---
    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # strip the inner extension too and keep it in front of ".gz"
            outname, inner = splitext(outname)
            ext = inner + ext
    else:
        outname, ext = splitext(str(outname))
        # splitext returns '' (never None) when there is no extension
        if not ext:
            ext = '.txt'
    outname += '_rankorder' + ext

    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Sort the strictly-lower-triangle entries in decreasing order.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]

    count = 1
    i = 0
    if label is None:
        label = 'Serial Index'
    numpairs = kwargs.get('numpairs')
    size = len(row)
    seqsep = kwargs.get('seqsep')
    usedist = kwargs.get('usedist')

    f = openFile(outname, 'wb')
    try:
        if not usedist or not pdbflag:
            if usedist:
                LOGGER.info('use-struct-sep set to true, but PDB not given '
                            'or incorrect residue number. Using sequence '
                            'separation')
            elif pdbflag:
                LOGGER.info('use-dist not set, using sequence separation'
                            ' to report coevolving pairs')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tSequence Separation:' + str(seqsep) + '\n'))
            if pdbflag:
                f.write((header + '\tDistance\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.format(
                            count, resnum[row[i]], resnum[column[i]],
                            mi[row[i], column[i]],
                            distance[row[i], column[i]]))
                        count += 1
                    i += 1
            else:
                f.write((header + '\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\n'.format(
                            count, resnum[row[i]], resnum[column[i]],
                            mi[row[i], column[i]]))
                        count += 1
                    i += 1
        else:
            structsep = kwargs.get('dist')
            # tab added before the cutoff field, matching the
            # sequence-separation header above
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tDistance Cutoff:' + str(structsep) + '\n'))
            f.write((header + '\tDistance\n'))
            while count <= numpairs and i < size:
                if distance[row[i], column[i]] > structsep:
                    f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.format(
                        count, resnum[row[i]], resnum[column[i]],
                        mi[row[i], column[i]],
                        distance[row[i], column[i]]))
                    count += 1
                i += 1
    finally:
        # close the output even if a write fails part-way
        f.close()
# Exploratory script: slice a kinase alignment to a set of columns, write it
# out, then build sequence-identity / SCA matrices and a tree for a second
# alignment and draw the tree with matplotlib.
import prody.sequence as sequence
import prody
import matplotlib.pyplot as plt

alignment = prody.MSAFile("pkinase.fasta")

# Get positions -> by hand for now.
positions = [72, 83, 117, 119, 194, 251, 354, 355, 357, 429, 432]
# TODO: use alignSequenceToMSA instead to derive positions automatically.
# TODO: set up a web service to get the correspondence between MSA positions
# and a particular PDB structure.

# Restrict the alignment file to the selected columns and save the result.
alignment.setSlice(positions)
prody.writeMSA("test.fasta", alignment)

# NOTE(review): a different file is parsed here than the one written above;
# presumably "pocket_type1.fasta" was prepared separately -- confirm.
pa = prody.parseMSA("pocket_type1.fasta")
labs = pa.getLabels()

seqidmatrix = prody.buildSeqidMatrix(pa)
scamatrix = prody.buildSCAMatrix(pa)

# The tree is built from the sequence-identity matrix, labelled by sequence.
tree = prody.calcTree(names=labs, distance_matrix=seqidmatrix)

plt.figure()
show = prody.showTree(tree, format='plt')
from prody.tests import TestCase import os from os.path import join from numpy import array, log, zeros, char from numpy.testing import assert_array_equal, dec from prody.tests.datafiles import * from prody.tests import TEMPDIR from prody import MSA, MSAFile, parseMSA, LOGGER, writeMSA from prody.utilities import createStringIO LOGGER.verbosity = None FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta')) SELEX = parseMSA(pathDatafile('msa_Cys_knot.slx')) STOCK = parseMSA(pathDatafile('msa_Cys_knot.sth')) FASTA_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.fasta'))) SELEX_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.sth'))) STOCK_LIST = list(MSAFile(pathDatafile('msa_Cys_knot.sth'))) class TestMSAFile(TestCase): def testMSAFile(self): self.assertListEqual(FASTA_LIST, SELEX_LIST) self.assertListEqual(FASTA_LIST, STOCK_LIST) def testWriteFasta(self):
def testSelex(self):
    """Write SELEX to a temporary file, parse it back and compare."""

    filename = writeMSA(join(TEMPDIR, 'test.slx'), SELEX)
    try:
        # NOTE(review): the written path is passed through pathDatafile
        # before parsing; confirm that helper accepts paths outside the
        # test data directory.
        selex = parseMSA(pathDatafile(filename))
        self.assertListEqual(list(SELEX), list(selex))
    finally:
        # remove the temporary file even when parsing or the assertion
        # fails, so failed runs do not leave files behind in TEMPDIR
        if os.path.isfile(filename):
            os.remove(filename)
def calcEvolProperties(self, resid='all', refresh=False, folder=None,
                       max_cols=None, max_seqs=25000, **kwargs):
    '''
    Computes Evol properties, i.e. Shannon entropy and Mutual
    Information, from Pfam Multiple Sequence Alignments, for a given
    residue. Direct Information is currently not computed and its entry
    is left as NaN.

    :arg resid: residue number, or ``'all'`` to process every Pfam domain
        stored in ``self.Pfam``
    :arg refresh: passed to the Pfam search; when true, domains that
        already have a mapping are processed again
    :type refresh: bool
    :arg folder: currently unused by the active code path
    :arg max_cols: currently unused by the active code path
    :arg max_seqs: refinement by row occupancy is repeated with an
        increasing threshold until the MSA has at most this many sequences
    :arg kwargs: forwarded to ``parseMSA`` and to the final ``refineMSA``

    :returns: dictionary mapping each selected Pfam accession to its entry
        in ``self.Pfam``; the entries are updated in place with the keys
        ``'mapping'``, ``'ref_MSA'``, ``'entropy'``, ``'MutInfo'`` and
        ``'DirInfo'``
    :raises RuntimeError: if *resid* falls in no Pfam domain
    '''
    assert type(refresh) is bool
    # recover Pfam mapping (if not found already)
    self._searchPfam(refresh=refresh)
    if resid == 'all':
        # snapshot of the keys, independent of later changes to the dict
        PF_list = list(self.Pfam)
    else:
        # get list of Pfam domains containing resid
        PF_list = [
            k for k in self.Pfam
            if any(
                int(segment['start']) <= resid <= int(segment['end'])
                for segment in self.Pfam[k]['locations']
            )
        ]
        if len(PF_list) == 0:
            raise RuntimeError(
                'No Pfam domain for resid {}.'.format(resid))
        if len(PF_list) > 1:
            LOGGER.warn('Residue {} is found in multiple '.format(resid) +
                        '({}) Pfam domains.'.format(len(PF_list)))
    if folder is None:
        # kept for the (disabled) option of caching downloaded MSAs
        folder = SETTINGS.get('rhapsody_local_folder', './')
    # iterate over Pfam families
    for PF in PF_list:
        d = self.Pfam[PF]
        # skip if properties are pre-computed
        if not refresh and d.get('mapping') is not None:
            continue
        d['mapping'] = None
        d['ref_MSA'] = None
        d['entropy'] = np.nan
        d['MutInfo'] = np.nan
        d['DirInfo'] = np.nan
        try:
            LOGGER.info('Processing {}...'.format(PF))
            # fetch & parse MSA without keeping the downloaded file
            f = fetchPfamMSA(PF)
            try:
                msa = parseMSA(f, **kwargs)
            finally:
                # delete the download even when parsing fails
                os.remove(f)
            # slice MSA to match all segments of the Uniprot sequence
            sliced_msa, indexes = self._sliceMSA(msa)
            # get mapping between Uniprot sequence and Pfam domain
            d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
        except Exception as e:
            # record the failure in place of the mapping and move on
            LOGGER.warn('{}: {}'.format(PF, e))
            d['mapping'] = str(e)
            continue
        try:
            # refine MSA ('seqid' param. is set as in PolyPhen-2)
            # Raise the row-occupancy threshold in steps of 0.02 until the
            # alignment is small enough or the threshold reaches 1.
            rowocc = 0.6
            while True:
                sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                rowocc += 0.02
                if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                    break
            ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
            d['ref_MSA'] = ref_msa
            # compute evolutionary properties
            d['entropy'] = calcShannonEntropy(ref_msa)
            d['MutInfo'] = buildMutinfoMatrix(ref_msa)
            # Direct Information is disabled:
            # d['DirInfo'] = buildDirectInfoMatrix(ref_msa)
        except Exception as e:
            # best effort: keep whatever was computed before the failure
            LOGGER.warn('{}: {}'.format(PF, e))
    return {k: self.Pfam[k] for k in PF_list}
# Module-level fixtures for the sequence-analysis tests: the Cys-knot FASTA
# alignment and arrays derived from it, all built once at import time.
__copyright__ = "Copyright (C) 2010-2012 Ahmet Bakan"

from prody.tests import TestCase

from numpy import array, log, zeros, char, ones, fromfile
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, calcShannonEntropy, buildMutinfoMatrix, parseMSA
from prody import calcMSAOccupancy, buildSeqidMatrix, uniqueSequences
from prody import buildOMESMatrix, buildSCAMatrix

# presumably turns off ProDy log output while the tests run -- confirm
LOGGER.verbosity = None

FASTA = parseMSA(pathDatafile("msa_Cys_knot.fasta"))
# Element-wise masks/copies of the raw character array of the alignment.
FASTA_ALPHA = char.isalpha(FASTA._msa)
FASTA_UPPER = char.upper(FASTA._msa)

FASTA_NUMBER, FASTA_LENGTH = FASTA_ALPHA.shape
# Square matrix with one row/column per sequence; the diagonal is set to 1.
FASTA_EYE = zeros((FASTA_NUMBER, FASTA_NUMBER))
for i in range(FASTA_NUMBER):
    FASTA_EYE[i, i] = 1
    for j in range(i + 1, FASTA_NUMBER):
        # For the pair (i, j): ncols counts columns where at least one of
        # the two sequences has an alphabetic character, score counts those
        # columns where the upper-cased characters are equal.
        # NOTE(review): score/ncols are not consumed in the visible part of
        # this file; the statement storing the result likely follows.
        score = 0.0
        ncols = 0
        for k in range(FASTA_LENGTH):
            if FASTA_ALPHA[i, k] or FASTA_ALPHA[j, k]:
                if FASTA_UPPER[i, k] == FASTA_UPPER[j, k]:
                    score += 1
                ncols += 1