def evol_occupancy(msa, **kwargs):
    """Compute MSA occupancy, write it to text file(s) and optionally plot it.

    *msa* is a filename; it is used to derive the default output prefix and
    is then parsed with ``parseMSA``.

    Recognised keyword arguments:

    * ``prefix`` -- output filename prefix.  When absent, the *msa* filename
      without its extension (and without a trailing ``.gz``) plus
      ``'_occupancy'`` is used.
    * ``occaxis`` -- ``'row'`` (default), ``'both'`` (rows and columns), or
      any other value, which is passed straight to ``calcMSAOccupancy``.
    * ``numformat`` -- number format handed to ``writeArray``
      (default ``'%12g'``).
    * ``figocc`` -- when truthy, one figure per occupancy array is saved.
    * ``figwidth``, ``figheight``, ``figformat``, ``figdpi`` -- figure size
      (default 8 x 6), file format (default ``'pdf'``) and dpi (default 300).
    * ``xlabel``, ``title``, ``label`` -- forwarded to ``showMSAOccupancy``.

    Output files are named ``<prefix>_<axis>.txt`` and, for figures,
    ``<prefix>_<axis>.<figformat>``.
    """

    import prody
    # LOGGER is imported here so the Matplotlib warning below cannot fail
    # with a NameError when the module does not define it globally.
    from prody import LOGGER, parseMSA, calcMSAOccupancy
    from prody import showMSAOccupancy, writeArray
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Strip the extension hidden underneath the ".gz" as well.
            prefix, ext = splitext(prefix)
        prefix += '_occupancy'

    msa = parseMSA(msa)

    numformat = kwargs.get('numformat', '%12g')
    occaxis = kwargs.get('occaxis', 'row')
    axes = ['row', 'col'] if occaxis == 'both' else [occaxis]

    # Keep each suffix paired with its array.  Previously a single-axis run
    # stored the suffix as a plain string, so indexing it produced just '_'
    # and the output was named "<prefix>_.txt".
    results = [('_' + axis, calcMSAOccupancy(msa, occ=axis)) for axis in axes]

    for suffix, occ in results:
        writeArray(prefix + suffix + '.txt', occ, format=numformat)

    if not kwargs.get('figocc'):
        return

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warn('Matplotlib could not be imported, '
                    'figures are not saved.')
        return

    # Figures are written to disk, so interactive display is turned off.
    prody.SETTINGS['auto_show'] = False
    width = kwargs.get('figwidth', 8)
    height = kwargs.get('figheight', 6)
    xlabel = kwargs.get('xlabel')
    title = kwargs.get('title')
    label = kwargs.get('label')
    figformat = kwargs.get('figformat', 'pdf')
    figdpi = kwargs.get('figdpi', 300)

    for suffix, occ in results:
        figure = plt.figure(figsize=(width, height))
        showMSAOccupancy(msa=msa, occ=occ, label=label,
                         xlabel=xlabel, title=title)
        figure.savefig(prefix + suffix + '.' + figformat,
                       format=figformat, dpi=figdpi)
def evol_occupancy(msa, **kwargs):
    """Compute MSA occupancy, write it to text file(s) and optionally plot it.

    *msa* is a filename; it is used to derive the default output prefix and
    is then parsed with ``parseMSA``.

    Recognised keyword arguments:

    * ``prefix`` -- output filename prefix.  When absent, the *msa* filename
      without its extension (and without a trailing ``.gz``) plus
      ``'_occupancy'`` is used.
    * ``occaxis`` -- ``'row'`` (default), ``'both'`` (rows and columns), or
      any other value, which is passed straight to ``calcMSAOccupancy``.
    * ``numformat`` -- number format handed to ``writeArray``
      (default ``'%12g'``).
    * ``figocc`` -- when truthy, one figure per occupancy array is saved.
    * ``figwidth``, ``figheight``, ``figformat``, ``figdpi`` -- figure size
      (default 8 x 6), file format (default ``'pdf'``) and dpi (default 300).
    * ``xlabel``, ``title``, ``label`` -- forwarded to ``showMSAOccupancy``.

    Output files are named ``<prefix>_<axis>.txt`` and, for figures,
    ``<prefix>_<axis>.<figformat>``.
    """

    import prody
    # LOGGER is imported here so the Matplotlib warning below cannot fail
    # with a NameError when the module does not define it globally.
    from prody import LOGGER, parseMSA, calcMSAOccupancy
    from prody import showMSAOccupancy, writeArray
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Strip the extension hidden underneath the ".gz" as well.
            prefix, ext = splitext(prefix)
        prefix += '_occupancy'

    msa = parseMSA(msa)

    numformat = kwargs.get('numformat', '%12g')
    occaxis = kwargs.get('occaxis', 'row')
    axes = ['row', 'col'] if occaxis == 'both' else [occaxis]

    # Keep each suffix paired with its array.  Previously a single-axis run
    # stored the suffix as a plain string, so indexing it produced just '_'
    # and the output was named "<prefix>_.txt".
    results = [('_' + axis, calcMSAOccupancy(msa, occ=axis)) for axis in axes]

    for suffix, occ in results:
        writeArray(prefix + suffix + '.txt', occ, format=numformat)

    if not kwargs.get('figocc'):
        return

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warn('Matplotlib could not be imported, '
                    'figures are not saved.')
        return

    # Figures are written to disk, so interactive display is turned off.
    prody.SETTINGS['auto_show'] = False
    width = kwargs.get('figwidth', 8)
    height = kwargs.get('figheight', 6)
    xlabel = kwargs.get('xlabel')
    title = kwargs.get('title')
    label = kwargs.get('label')
    figformat = kwargs.get('figformat', 'pdf')
    figdpi = kwargs.get('figdpi', 300)

    for suffix, occ in results:
        figure = plt.figure(figsize=(width, height))
        showMSAOccupancy(msa=msa, occ=occ, label=label,
                         xlabel=xlabel, title=title)
        figure.savefig(prefix + suffix + '.' + figformat,
                       format=figformat, dpi=figdpi)
def testAll(self):
    """refineMSA with label, seqid, rowocc and colocc given together must
    match the same refinement steps applied by hand to the raw array."""

    label = "FSHB_BOVIN"
    seqid, rowocc, colocc = 0.98, 0.9, 0.9

    refined = refineMSA(FASTA, label=label, seqid=seqid,
                        rowocc=rowocc, colocc=colocc)

    # Keep the columns where FASTA_ALPHA is nonzero for the labelled row.
    label_row = FASTA.getIndex(label)
    label_columns = FASTA_ALPHA[label_row].nonzero()[0]
    expected = FASTA._getArray().take(label_columns, 1)

    # Keep the rows selected by uniqueSequences at the identity cutoff.
    expected = expected[uniqueSequences(expected, seqid)]

    # Keep the rows whose occupancy reaches the row cutoff.
    row_mask = calcMSAOccupancy(expected, "row") >= rowocc
    expected = expected[row_mask]

    # Keep the columns whose occupancy reaches the column cutoff.
    kept_columns = (calcMSAOccupancy(expected) >= colocc).nonzero()[0]
    expected = expected.take(kept_columns, 1)

    assert_array_equal(refined._getArray(), expected)
def testAll(self):
    """refineMSA with label, seqid, rowocc and colocc given together must
    match the same refinement steps applied by hand to the raw array."""

    label = 'FSHB_BOVIN'
    seqid, rowocc, colocc = 0.98, 0.9, 0.9

    refined = refineMSA(FASTA, label=label, seqid=seqid,
                        rowocc=rowocc, colocc=colocc)

    # Keep the columns where FASTA_ALPHA is nonzero for the labelled row.
    label_row = FASTA.getIndex(label)
    label_columns = FASTA_ALPHA[label_row].nonzero()[0]
    expected = FASTA._getArray().take(label_columns, 1)

    # Keep the rows selected by uniqueSequences at the identity cutoff.
    expected = expected[uniqueSequences(expected, seqid)]

    # Keep the rows whose occupancy reaches the row cutoff.
    row_mask = calcMSAOccupancy(expected, 'row') >= rowocc
    expected = expected[row_mask]

    # Keep the columns whose occupancy reaches the column cutoff.
    kept_columns = (calcMSAOccupancy(expected) >= colocc).nonzero()[0]
    expected = expected.take(kept_columns, 1)

    assert_array_equal(refined._getArray(), expected)
def evol_rankorder(mutinfo, **kwargs):
    """Rank residue pairs of a mutual information matrix and write them out.

    *mutinfo* is the path of a text file holding a square matrix; it is
    loaded with ``np.loadtxt``.  Pairs from the strict lower triangle are
    sorted by decreasing value and written, one per line, to a tab-separated
    file.

    Residue numbers are taken from, in order of preference:

    1. the first chain of the PDB whose ``protein and name CA`` selection
       has as many atoms as the matrix has rows,
    2. the start/end numbers attached to a sequence of the MSA,
    3. serial numbers ``1..N``.

    Recognised keyword arguments:

    * ``delimiter`` -- column delimiter passed to ``np.loadtxt``.
    * ``msa``, ``label`` -- MSA file and the label of the sequence used for
      numbering.  When the label is not found, the first sequence with the
      highest row occupancy is used.
    * ``pdb`` -- passed to ``parsePDB``; also provides the pairwise
      distances reported in the output.
    * ``outname`` -- output filename; ``'_rankorder'`` is inserted before
      its extension, and ``'.txt'`` is used when it has none.  Defaults to a
      name derived from *mutinfo*.
    * ``zscore`` -- when truthy, each column is normalised to zero mean and
      unit standard deviation before ranking.
    * ``numpairs`` -- maximum number of pairs written.
    * ``seqsep`` -- a pair is reported only when its row index exceeds its
      column index by more than this value.
    * ``usedist``, ``dist`` -- when ``usedist`` is truthy and a matching PDB
      chain was found, pairs are filtered by distance (greater than
      ``dist``) instead of by sequence separation.

    ``numpairs``, ``seqsep`` and ``dist`` get no default here and are used
    in comparisons directly, so the caller must supply the relevant ones.

    Raises :exc:`ValueError` when the loaded matrix is not square.
    """

    # NOTE(review): ``np`` and ``calcAllDist`` are not imported in this
    # function; presumably they are module-level names -- verify.
    from prody import parseMSA, LOGGER, parsePDB, calcMSAOccupancy
    from prody.utilities import openFile
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label = kwargs.get('msa'), kwargs.get('label')
    pdb, pdbflag = kwargs.get('pdb'), False
    resnum = None

    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            # Best effort: an unreadable PDB falls through to the MSA or
            # serial numbering.  KeyboardInterrupt/SystemExit propagate.
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            chains = list(pdb.iterChains())
            for chain in chains:
                sel = chain.select('protein and name CA')
                # A chain without matching atoms gives no selection.
                if sel is not None and sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
                else:
                    LOGGER.info('Number of residues in PDB does not match '
                                'mutinfo matrix, ignoring PDB input')

    if not pdbflag:
        if msa is not None:
            msa = parseMSA(msa)
            if msa.numResidues() != shape[0]:
                LOGGER.info('Input MSA and mutinfo do not have similar no '
                            'of residues, ignoring MSA')
            else:
                index = msa.getIndex(label)
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    # Use the first sequence with the highest occupancy.
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                    label, seq, start, end = msa[index]
                else:
                    label, seq, start, end = msa[index]
                if (start and end is not None) and (start < end):
                    resnum = np.arange(start, end + 1)
                    if len(resnum) != shape[0]:
                        LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                    'not have similar no of residues, using '
                                    'serial indexing'.format(label, start,
                                                             end))
                        label = 'Serial Index'
                        resnum = np.arange(1, shape[0] + 1)
                    else:
                        LOGGER.info('Residue numbers will be based on '
                                    'label: {0}'.format(label))
                else:
                    LOGGER.info('Could not identify residue indexes from MSA'
                                ' using serial indexing')
                    label = 'Serial Index'
                    resnum = np.arange(1, shape[0] + 1)
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')
            resnum = np.arange(1, shape[0] + 1)

    if resnum is None:
        # The MSA was given but ignored because its width did not match the
        # matrix.  Without this fallback ``resnum[0]`` below would fail.
        resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.
                format(str(resnum[0]), str(resnum[-1])))

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # NOTE(review): this splits *mutinfo* again, so ``outname`` is
            # unchanged and ``ext`` stays '.gz'.  Kept as is; confirm the
            # intended output name for gzipped input.
            outname, _ = splitext(str(mutinfo))
    else:
        outname, ext = splitext(str(outname))
        # splitext returns '' (never None) when there is no extension.
        if not ext:
            ext = '.txt'

    outname += '_rankorder' + ext
    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Strict lower triangle: every pair once, with row index > column index.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]

    count = 1
    i = 0

    # NOTE(review): the file is opened in 'wb' but str objects are written;
    # confirm that openFile handles this on Python 3.
    f = openFile(outname, 'wb')
    try:
        if label is None:
            label = 'Serial Index'

        numpairs = kwargs.get('numpairs')
        size = len(row)
        seqsep = kwargs.get('seqsep')
        usedist = kwargs.get('usedist')

        if not usedist or not pdbflag:
            if usedist:
                LOGGER.info('use-struct-sep set to true, but PDB not given '
                            'or incorrect residue number. '
                            'Using sequence separation')
            elif pdbflag:
                LOGGER.info('use-dist not set, using sequence separation'
                            ' to report coevolving pairs')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tSequence Separation:' + str(seqsep) + '\n'))
            if pdbflag:
                f.write((header + '\tDistance\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.
                                format(count, resnum[row[i]],
                                       resnum[column[i]],
                                       mi[row[i], column[i]],
                                       distance[row[i], column[i]]))
                        count += 1
                    i += 1
            else:
                f.write((header + '\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\n'.
                                format(count, resnum[row[i]],
                                       resnum[column[i]],
                                       mi[row[i], column[i]]))
                        count += 1
                    i += 1
        else:
            structsep = kwargs.get('dist')
            # Fields are tab separated; the tab before "Distance Cutoff:"
            # was missing and fused it with the residue number range.
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tDistance Cutoff:' + str(structsep) + '\n'))
            f.write((header + '\tDistance\n'))
            while count <= numpairs and i < size:
                if distance[row[i], column[i]] > structsep:
                    f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.
                            format(count, resnum[row[i]],
                                   resnum[column[i]],
                                   mi[row[i], column[i]],
                                   distance[row[i], column[i]]))
                    count += 1
                i += 1
    finally:
        # Close the output even when writing fails part-way.
        f.close()
def testSequenceOccupancy(self):
    """calcMSAOccupancy(FASTA, 'sequence') must equal FASTA_ALPHA summed
    along axis 1 and divided by FASTA.numResidues()."""

    observed = calcMSAOccupancy(FASTA, 'sequence')
    residue_total = FASTA.numResidues() * 1.0
    expected = FASTA_ALPHA.sum(1) / residue_total
    assert_array_equal(observed, expected)
def testResidueOccupancy(self):
    """calcMSAOccupancy(FASTA, 'residue') must equal FASTA_ALPHA summed
    along axis 0 and divided by FASTA.numSequences()."""

    observed = calcMSAOccupancy(FASTA, 'residue')
    sequence_total = FASTA.numSequences() * 1.0
    expected = FASTA_ALPHA.sum(0) / sequence_total
    assert_array_equal(observed, expected)
def testSequenceCount(self):
    """With count=1, calcMSAOccupancy(FASTA, 'sequence') must equal
    FASTA_ALPHA summed along axis 1."""

    observed = calcMSAOccupancy(FASTA, 'sequence', count=1)
    expected = FASTA_ALPHA.sum(1)
    assert_array_equal(observed, expected)
def testResidueCount(self):
    """With count=1, calcMSAOccupancy(FASTA, 'residue') must equal
    FASTA_ALPHA summed along axis 0."""

    observed = calcMSAOccupancy(FASTA, 'residue', count=1)
    expected = FASTA_ALPHA.sum(0)
    assert_array_equal(observed, expected)
def testSequenceOccupancy(self):
    """calcMSAOccupancy(FASTA, "sequence") must equal FASTA_ALPHA summed
    along axis 1 and divided by FASTA.numResidues()."""

    observed = calcMSAOccupancy(FASTA, "sequence")
    residue_total = FASTA.numResidues() * 1.0
    expected = FASTA_ALPHA.sum(1) / residue_total
    assert_array_equal(observed, expected)
def testResidueOccupancy(self):
    """calcMSAOccupancy(FASTA, "residue") must equal FASTA_ALPHA summed
    along axis 0 and divided by FASTA.numSequences()."""

    observed = calcMSAOccupancy(FASTA, "residue")
    sequence_total = FASTA.numSequences() * 1.0
    expected = FASTA_ALPHA.sum(0) / sequence_total
    assert_array_equal(observed, expected)
def testSequenceCount(self):
    """With count=1, calcMSAOccupancy(FASTA, "sequence") must equal
    FASTA_ALPHA summed along axis 1."""

    observed = calcMSAOccupancy(FASTA, "sequence", count=1)
    expected = FASTA_ALPHA.sum(1)
    assert_array_equal(observed, expected)
def testResidueCount(self):
    """With count=1, calcMSAOccupancy(FASTA, "residue") must equal
    FASTA_ALPHA summed along axis 0."""

    observed = calcMSAOccupancy(FASTA, "residue", count=1)
    expected = FASTA_ALPHA.sum(0)
    assert_array_equal(observed, expected)
def evol_rankorder(mutinfo, **kwargs):
    """Rank residue pairs of a mutual information matrix and write them out.

    *mutinfo* is the path of a text file holding a square matrix; it is
    loaded with ``np.loadtxt``.  Pairs from the strict lower triangle are
    sorted by decreasing value and written, one per line, to a tab-separated
    file.

    Residue numbers are taken from, in order of preference:

    1. the first chain of the PDB whose ``protein and name CA`` selection
       has as many atoms as the matrix has rows,
    2. the start/end numbers attached to a sequence of the MSA,
    3. serial numbers ``1..N``.

    Recognised keyword arguments:

    * ``delimiter`` -- column delimiter passed to ``np.loadtxt``.
    * ``msa``, ``label`` -- MSA file and the label of the sequence used for
      numbering.  When the label is not found, the first sequence with the
      highest row occupancy is used.
    * ``pdb`` -- passed to ``parsePDB``; also provides the pairwise
      distances reported in the output.
    * ``outname`` -- output filename; ``'_rankorder'`` is inserted before
      its extension, and ``'.txt'`` is used when it has none.  Defaults to a
      name derived from *mutinfo*.
    * ``zscore`` -- when truthy, each column is normalised to zero mean and
      unit standard deviation before ranking.
    * ``numpairs`` -- maximum number of pairs written.
    * ``seqsep`` -- a pair is reported only when its row index exceeds its
      column index by more than this value.
    * ``usedist``, ``dist`` -- when ``usedist`` is truthy and a matching PDB
      chain was found, pairs are filtered by distance (greater than
      ``dist``) instead of by sequence separation.

    ``numpairs``, ``seqsep`` and ``dist`` get no default here and are used
    in comparisons directly, so the caller must supply the relevant ones.

    Raises :exc:`ValueError` when the loaded matrix is not square.
    """

    # NOTE(review): ``np`` and ``calcAllDist`` are not imported in this
    # function; presumably they are module-level names -- verify.
    from prody import parseMSA, LOGGER, parsePDB, calcMSAOccupancy
    from prody.utilities import openFile
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label = kwargs.get('msa'), kwargs.get('label')
    pdb, pdbflag = kwargs.get('pdb'), False
    resnum = None

    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            # Best effort: an unreadable PDB falls through to the MSA or
            # serial numbering.  KeyboardInterrupt/SystemExit propagate.
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            chains = list(pdb.iterChains())
            for chain in chains:
                sel = chain.select('protein and name CA')
                # A chain without matching atoms gives no selection.
                if sel is not None and sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
                else:
                    LOGGER.info('Number of residues in PDB does not match '
                                'mutinfo matrix, ignoring PDB input')

    if not pdbflag:
        if msa is not None:
            msa = parseMSA(msa)
            if msa.numResidues() != shape[0]:
                LOGGER.info('Input MSA and mutinfo do not have similar no '
                            'of residues, ignoring MSA')
            else:
                index = msa.getIndex(label)
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    # Use the first sequence with the highest occupancy.
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                    label, seq, start, end = msa[index]
                else:
                    label, seq, start, end = msa[index]
                if (start and end is not None) and (start < end):
                    resnum = np.arange(start, end + 1)
                    if len(resnum) != shape[0]:
                        LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                    'not have similar no of residues, using '
                                    'serial indexing'.format(label, start,
                                                             end))
                        label = 'Serial Index'
                        resnum = np.arange(1, shape[0] + 1)
                    else:
                        LOGGER.info('Residue numbers will be based on '
                                    'label: {0}'.format(label))
                else:
                    LOGGER.info('Could not identify residue indexes from MSA'
                                ' using serial indexing')
                    label = 'Serial Index'
                    resnum = np.arange(1, shape[0] + 1)
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')
            resnum = np.arange(1, shape[0] + 1)

    if resnum is None:
        # The MSA was given but ignored because its width did not match the
        # matrix.  Without this fallback ``resnum[0]`` below would fail.
        resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.
                format(str(resnum[0]), str(resnum[-1])))

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # NOTE(review): this splits *mutinfo* again, so ``outname`` is
            # unchanged and ``ext`` stays '.gz'.  Kept as is; confirm the
            # intended output name for gzipped input.
            outname, _ = splitext(str(mutinfo))
    else:
        outname, ext = splitext(str(outname))
        # splitext returns '' (never None) when there is no extension.
        if not ext:
            ext = '.txt'

    outname += '_rankorder' + ext
    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Strict lower triangle: every pair once, with row index > column index.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]

    count = 1
    i = 0

    # NOTE(review): the file is opened in 'wb' but str objects are written;
    # confirm that openFile handles this on Python 3.
    f = openFile(outname, 'wb')
    try:
        if label is None:
            label = 'Serial Index'

        numpairs = kwargs.get('numpairs')
        size = len(row)
        seqsep = kwargs.get('seqsep')
        usedist = kwargs.get('usedist')

        if not usedist or not pdbflag:
            if usedist:
                LOGGER.info('use-struct-sep set to true, but PDB not given '
                            'or incorrect residue number. '
                            'Using sequence separation')
            elif pdbflag:
                LOGGER.info('use-dist not set, using sequence separation'
                            ' to report coevolving pairs')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tSequence Separation:' + str(seqsep) + '\n'))
            if pdbflag:
                f.write((header + '\tDistance\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.
                                format(count, resnum[row[i]],
                                       resnum[column[i]],
                                       mi[row[i], column[i]],
                                       distance[row[i], column[i]]))
                        count += 1
                    i += 1
            else:
                f.write((header + '\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\n'.
                                format(count, resnum[row[i]],
                                       resnum[column[i]],
                                       mi[row[i], column[i]]))
                        count += 1
                    i += 1
        else:
            structsep = kwargs.get('dist')
            # Fields are tab separated; the tab before "Distance Cutoff:"
            # was missing and fused it with the residue number range.
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tDistance Cutoff:' + str(structsep) + '\n'))
            f.write((header + '\tDistance\n'))
            while count <= numpairs and i < size:
                if distance[row[i], column[i]] > structsep:
                    f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.
                            format(count, resnum[row[i]],
                                   resnum[column[i]],
                                   mi[row[i], column[i]],
                                   distance[row[i], column[i]]))
                    count += 1
                i += 1
    finally:
        # Close the output even when writing fails part-way.
        f.close()