Example #1
0
    def testNoAmbiguity(self):
        """Check the 'OX' / 'XO' alignment with ambiguity handling disabled.

        The expected matrix has ``log(2)`` off the diagonal and zeros on it.
        The comparison is made once with the default settings and once with
        ``turbo=False``."""

        msa = array([list('OX'), list('XO')], dtype='|S1')

        expect = array([[0., log(2.)], [log(2.), 0.]])
        # NOTE(review): this keyword used to be spelled ``ambiquity``.  ProDy
        # documents the option as ``ambiguity``; a misspelled keyword would be
        # absorbed by **kwargs and the option would stay at its default --
        # confirm against the installed ProDy version.
        result = buildMutinfoMatrix(msa, ambiguity=False)
        assert_array_almost_equal(expect, result, err_msg='turbo failed')
        result = buildMutinfoMatrix(msa, ambiguity=False, turbo=False)
        assert_array_almost_equal(expect, result, err_msg='w/out turbo failed')
Example #2
0
    def testAmbiguity7(self):
        """Compare the matrix built from the 'bx' / 'xb' alignment with a
        hand-computed off-diagonal value, first with ``debug=False`` and then
        with ``turbo=False``."""

        alignment = array([list("bx"), list("xb")], dtype="|S1")

        # The expected off-diagonal entry is the sum of two weighted log terms.
        first_term = 72 * 0.0125 * log(0.0125 / 0.0250 / 0.275)
        second_term = 4 * 0.0250 * log(0.0250 / 0.275 / 0.275)
        off_diagonal = first_term + second_term
        expected = array([[0.0, off_diagonal], [off_diagonal, 0.0]])

        default_result = buildMutinfoMatrix(alignment, debug=False)
        assert_array_almost_equal(expected, default_result, err_msg="turbo failed")

        plain_result = buildMutinfoMatrix(alignment, turbo=False)
        assert_array_almost_equal(expected, plain_result, err_msg="w/out turbo failed")
Example #3
0
    def testAmbiguity3(self):
        """The one-sequence alignment 'XX' is expected to produce an all-zero
        2x2 matrix, both with ``debug=False`` and with ``turbo=False``."""

        all_zero = zeros((2, 2))
        single_row = array([list("XX")], dtype="|S1")

        # Each entry pairs the keyword arguments of one call with the message
        # reported if that call's result differs from the expectation.
        runs = (
            ({"debug": False}, "turbo failed"),
            ({"turbo": False}, "w/out turbo failed"),
        )
        for options, message in runs:
            outcome = buildMutinfoMatrix(single_row, **options)
            assert_array_almost_equal(all_zero, outcome, err_msg=message)
Example #4
0
    def testNoAmbiguity(self):
        """Check the "OX" / "XO" alignment with ambiguity handling disabled.

        The expected matrix has ``log(2)`` off the diagonal and zeros on it.
        The comparison is made once with the default settings and once with
        ``turbo=False``."""

        msa = array([list("OX"), list("XO")], dtype="|S1")

        expect = array([[0.0, log(2.0)], [log(2.0), 0.0]])
        # NOTE(review): this keyword used to be spelled ``ambiquity``.  ProDy
        # documents the option as ``ambiguity``; a misspelled keyword would be
        # absorbed by **kwargs and the option would stay at its default --
        # confirm against the installed ProDy version.
        result = buildMutinfoMatrix(msa, ambiguity=False)
        assert_array_almost_equal(expect, result, err_msg="turbo failed")
        result = buildMutinfoMatrix(msa, ambiguity=False, turbo=False)
        assert_array_almost_equal(expect, result, err_msg="w/out turbo failed")
Example #5
0
    def testAmbiguity3(self):
        """Build the matrix for the single sequence 'XX' and require every
        entry of the 2x2 result to be zero, for both call variants."""

        one_sequence = array([list('XX')], dtype='|S1')
        expected = zeros((2, 2))

        first = buildMutinfoMatrix(one_sequence, debug=False)
        assert_array_almost_equal(expected, first, err_msg='turbo failed')

        second = buildMutinfoMatrix(one_sequence, turbo=False)
        assert_array_almost_equal(expected, second,
                                  err_msg='w/out turbo failed')
Example #6
0
    def testAmbiguity7(self):
        """Check the 'bx' / 'xb' alignment against a hand-computed value.

        The same expectation is applied to a call with ``debug=False`` and to
        a call with ``turbo=False``."""

        msa = array([list(row) for row in ('bx', 'xb')], dtype='|S1')

        mi = (72 * 0.0125 * log(0.0125 / 0.0250 / 0.275) +
              4 * 0.0250 * log(0.0250 / 0.275 / 0.275))
        expect = array([[0., mi], [mi, 0.]])

        for call_kwargs, failure in (({'debug': False}, 'turbo failed'),
                                     ({'turbo': False}, 'w/out turbo failed')):
            result = buildMutinfoMatrix(msa, **call_kwargs)
            assert_array_almost_equal(expect, result, err_msg=failure)
Example #7
0
    def testAmbiguity4(self):
        """Check the three-sequence alignment "Bb", "jJ", "Zz" against the
        value ``log((1/12) / (1/6) / (1/6))`` off the diagonal."""

        rows = [list(pair) for pair in ("Bb", "jJ", "Zz")]
        alignment = array(rows, dtype="|S1")

        off_diagonal = log((1.0 / 12) / (1.0 / 6) / (1.0 / 6))
        expected = array([[0.0, off_diagonal], [off_diagonal, 0.0]])

        default_result = buildMutinfoMatrix(alignment, debug=False)
        assert_array_almost_equal(expected, default_result, err_msg="turbo failed")

        plain_result = buildMutinfoMatrix(alignment, turbo=False)
        assert_array_almost_equal(expected, plain_result, err_msg="w/out turbo failed")
Example #8
0
    def testAmbiguity2(self):
        """Check the 'AB' / 'BZ' alignment against a hand-computed value,
        once with ``debug=False`` and once with ``turbo=False``."""

        alignment = array([list('AB'), list('BZ')], dtype='|S1')

        # Two groups of terms make up the expected off-diagonal entry.
        quarter_terms = 2 * .25 * log(.25 / .5 / .25)
        eighth_terms = 4 * .125 * log(.125 / .25 / .25)
        value = quarter_terms + eighth_terms
        expected = array([[0., value], [value, 0.]])

        outcome = buildMutinfoMatrix(alignment, debug=False)
        assert_array_almost_equal(expected, outcome, err_msg='turbo failed')
        outcome = buildMutinfoMatrix(alignment, turbo=False)
        assert_array_almost_equal(expected, outcome,
                                  err_msg='w/out turbo failed')
Example #9
0
    def testTwentyReversed(self):
        """Pair each of twenty letters with the letter at the mirrored
        position of the same string and expect ``log(20)`` off the diagonal."""

        letters = "ACDEFGHIKLMNPQRSTVWY"
        # Column 0 walks the string forwards, column 1 walks it backwards.
        pairs = [[fwd, bwd] for fwd, bwd in zip(letters, reversed(letters))]
        alignment = array(pairs, dtype="|S1")

        off_diagonal = log(20.0)
        expected = array([[0.0, off_diagonal], [off_diagonal, 0.0]])

        default_result = buildMutinfoMatrix(alignment)
        assert_array_almost_equal(expected, default_result, err_msg="turbo failed")

        plain_result = buildMutinfoMatrix(alignment, turbo=False)
        assert_array_almost_equal(expected, plain_result, err_msg="w/out turbo failed")
Example #10
0
    def testTwentyReversed(self):
        """Two columns holding the same twenty letters in opposite order are
        expected to give ``log(20)`` off the diagonal and zero on it."""

        alphabet = 'ACDEFGHIKLMNPQRSTVWY'
        mirrored = alphabet[::-1]
        msa = array([[a, b] for a, b in zip(alphabet, mirrored)],
                    dtype='|S1')

        mi = log(20.)
        expect = array([[0., mi], [mi, 0.]])

        # First call uses the defaults, second call disables turbo.
        for call_kwargs, failure in (({}, 'turbo failed'),
                                     ({'turbo': False}, 'w/out turbo failed')):
            result = buildMutinfoMatrix(msa, **call_kwargs)
            assert_array_almost_equal(expect, result, err_msg=failure)
Example #11
0
    def testTwenty(self):
        """Two identical columns of twenty distinct letters are expected to
        give ``log(20)`` off the diagonal and zero on it."""

        alphabet = 'ACDEFGHIKLMNPQRSTVWY'
        rows = [[letter, letter] for letter in alphabet]
        alignment = array(rows, dtype='|S1')

        value = log(20.)
        expected = array([[0., value], [value, 0.]])

        outcome = buildMutinfoMatrix(alignment, debug=False)
        assert_array_almost_equal(expected, outcome, err_msg='turbo failed')
        outcome = buildMutinfoMatrix(alignment, turbo=False)
        assert_array_almost_equal(expected, outcome,
                                  err_msg='w/out turbo failed')
Example #12
0
    def testTwenty(self):
        """Build an alignment whose two columns both spell the same twenty
        letters and compare the result with ``log(20)`` off the diagonal."""

        letters = "ACDEFGHIKLMNPQRSTVWY"
        # Every row repeats one letter in both columns.
        msa = array(list(zip(letters, letters)), dtype="|S1")

        mi = log(20.0)
        expect = array([[0.0, mi], [mi, 0.0]])

        runs = (
            ({"debug": False}, "turbo failed"),
            ({"turbo": False}, "w/out turbo failed"),
        )
        for options, message in runs:
            result = buildMutinfoMatrix(msa, **options)
            assert_array_almost_equal(expect, result, err_msg=message)
Example #13
0
    def testInf(self):
        """Check a 500x10 alignment filled with '.' except for row 95, where
        columns 8 and 9 hold 's' and 'i'.

        Only the (8, 9) and (9, 8) entries of the expected 10x10 matrix are
        non-zero."""

        alignment = zeros((500, 10), '|S1')
        alignment[:, :] = '.'
        alignment[95, 8:10] = ['s', 'i']

        pair_value = 0.002 * log(500.) + 0.998 * log(1. / 0.998)
        expected = zeros((10, 10))
        expected[8, 9] = pair_value
        expected[9, 8] = pair_value

        outcome = buildMutinfoMatrix(alignment, debug=False)
        assert_array_almost_equal(expected, outcome, err_msg='turbo failed')
        outcome = buildMutinfoMatrix(alignment, turbo=False)
        assert_array_almost_equal(expected, outcome,
                                  err_msg='w/out turbo failed')
Example #14
0
    def testAmbiguity4(self):
        """Compare the matrix for the sequences 'Bb', 'jJ' and 'Zz' with
        ``log((1/12) / (1/6) / (1/6))`` off the diagonal, for both call
        variants."""

        sequences = ('Bb', 'jJ', 'Zz')
        msa = array([list(s) for s in sequences], dtype='|S1')

        mi = log((1. / 12) / (1. / 6) / (1. / 6))
        expect = array([[0., mi], [mi, 0.]])

        for call_kwargs, failure in (({'debug': False}, 'turbo failed'),
                                     ({'turbo': False}, 'w/out turbo failed')):
            result = buildMutinfoMatrix(msa, **call_kwargs)
            assert_array_almost_equal(expect, result, err_msg=failure)
Example #15
0
    def testAmbiguity6(self):
        """Each of the one-sequence alignments 'bb', 'jj' and 'zz' is expected
        to produce an all-zero 2x2 matrix."""

        all_zero = zeros((2, 2))
        candidates = ('bb', 'jj', 'zz')

        for letters in candidates:
            single_row = array([list(letters)], dtype='|S1')
            outcome = buildMutinfoMatrix(single_row, debug=False)
            # The failing sequence is named in the assertion message.
            assert_array_almost_equal(all_zero, outcome,
                                      err_msg=letters + ' failed')
Example #16
0
    def testNorm(self):
        """With ``norm=True``, two columns holding the same twenty letters in
        opposite order are expected to give 1 off the diagonal."""

        alphabet = 'ACDEFGHIKLMNPQRSTVWY'
        pairs = [[fwd, bwd] for fwd, bwd in zip(alphabet, reversed(alphabet))]
        alignment = array(pairs, dtype='|S1')

        expected = array([[0., 1.], [1., 0.]])
        outcome = buildMutinfoMatrix(alignment, norm=True)
        assert_array_almost_equal(expected, outcome, err_msg='norm failed')
Example #17
0
    def testSixSequences(self):
        """Check a four-column alignment in which only the last two columns
        are expected to share information (``log(2)``); every other entry of
        the expected 4x4 matrix is zero."""

        sequences = ('ACCA', 'ACDA', 'ACEC', 'ACGC')
        alignment = array([list(s) for s in sequences], dtype='|S1')

        ln2 = log(2.)
        expected = array([
            [0., 0., 0., 0.],
            [0., 0., 0., 0.],
            [0., 0., 0., ln2],
            [0., 0., ln2, 0.],
        ])

        # First call uses the defaults, second call disables turbo.
        for call_kwargs, failure in (({}, 'turbo failed'),
                                     ({'turbo': False}, 'w/out turbo failed')):
            outcome = buildMutinfoMatrix(alignment, **call_kwargs)
            assert_array_almost_equal(expected, outcome, err_msg=failure)
Example #18
0
    def testNorm2(self):
        """With ``norm=True``, check a column of twenty distinct letters
        against a column that alternates 'U' (even rows) and 'O' (odd rows)."""

        alphabet = 'ACDEFGHIKLMNPQRSTVWY'
        # 'UO'[0] is 'U' for even row indices, 'UO'[1] is 'O' for odd ones.
        rows = [[letter, 'UO'[index % 2]]
                for index, letter in enumerate(alphabet)]
        alignment = array(rows, dtype='|S1')

        numerator = log(1. / 20. / (1. / 20. * 1. / 2.))
        denominator = -log(1. / 20.)
        ratio = numerator / denominator
        expected = array([[0., ratio], [ratio, 0.]])

        outcome = buildMutinfoMatrix(alignment, norm=True)
        assert_array_almost_equal(expected, outcome, err_msg='norm failed')
Example #19
0
    def testAmbiguity5(self):
        """Each listed two-letter, one-sequence alignment is expected to
        produce an all-zero 2x2 matrix."""

        all_zero = array([[0., 0.], [0., 0.]])
        candidates = (
            'bx', 'Xb', 'jX', 'Xj', 'xz', 'ZX',
            'bj', 'jb', 'bz', 'zb', 'jz', 'zj',
        )

        for letters in candidates:
            single_row = array([list(letters)], dtype='|S1')
            outcome = buildMutinfoMatrix(single_row, debug=False)
            # The failing sequence is named in the assertion message.
            assert_array_almost_equal(all_zero, outcome,
                                      err_msg=letters + ' failed')
Example #20
0
 def calcEvolProperties(self,
                        resid='all',
                        refresh=False,
                        folder=None,
                        max_cols=None,
                        max_seqs=25000,
                        **kwargs):
     ''' Computes Evol properties, i.e. Shannon entropy and Mutual
     Information, from Pfam Multiple Sequence Alignments, either for
     every mapped Pfam domain or for the domains containing a given
     residue.

     For each selected domain the entries 'mapping', 'ref_MSA',
     'entropy', 'MutInfo' and 'DirInfo' of ``self.Pfam[domain]`` are
     (re)set. Direct Information is not computed here, so 'DirInfo'
     is left as NaN. Failures are logged as warnings instead of being
     raised; a failure while fetching/parsing/mapping the MSA is also
     stored as a string in 'mapping'.

     :arg resid: ``'all'`` or a residue number used to select the Pfam
         domains whose 'locations' segments contain it
     :arg refresh: if **True**, recompute domains that already have a
         'mapping'; also forwarded to ``self._searchPfam``
     :type refresh: bool
     :arg folder: currently unused because the downloaded MSA is not
         kept; defaults to the 'rhapsody_local_folder' setting
     :arg max_cols: currently unused
     :arg max_seqs: sequence-count threshold at which the row-occupancy
         refinement stops
     :arg kwargs: forwarded to both ``parseMSA`` and the final
         ``refineMSA`` call

     :returns: dictionary mapping each selected Pfam ID to its entry in
         ``self.Pfam``
     :raises RuntimeError: if no Pfam domain contains *resid*
     '''
     assert type(refresh) is bool
     # recover Pfam mapping (if not found already)
     self._searchPfam(refresh=refresh)
     if resid == 'all':
         PF_list = list(self.Pfam)
     else:
         # get list of Pfam domains containing resid
         PF_list = [
             k for k in self.Pfam if any(
                 int(segment['start']) <= resid <= int(segment['end'])
                 for segment in self.Pfam[k]['locations'])
         ]
         if not PF_list:
             raise RuntimeError(
                 'No Pfam domain for resid {}.'.format(resid))
         if len(PF_list) > 1:
             LOGGER.warn('Residue {} is found in multiple '
                         '({}) Pfam domains.'.format(resid, len(PF_list)))
     if folder is None:
         folder = SETTINGS.get('rhapsody_local_folder', './')
     # iterate over Pfam families
     for PF in PF_list:
         d = self.Pfam[PF]
         # skip if properties are pre-computed
         if not refresh and d.get('mapping') is not None:
             continue
         d['mapping'] = None
         d['ref_MSA'] = None
         d['entropy'] = np.nan
         d['MutInfo'] = np.nan
         d['DirInfo'] = np.nan
         try:
             LOGGER.info('Processing {}...'.format(PF))
             # fetch & parse MSA without saving downloaded MSA
             msa_file = fetchPfamMSA(PF)
             try:
                 msa = parseMSA(msa_file, **kwargs)
             except Exception:
                 # Do not leave the downloaded file behind when parsing
                 # fails; the cleanup is best-effort so that the parsing
                 # error is the one that gets reported.
                 try:
                     os.remove(msa_file)
                 except (OSError, TypeError):
                     pass
                 raise
             os.remove(msa_file)
             # slice MSA to match all segments of the Uniprot sequence
             sliced_msa, indexes = self._sliceMSA(msa)
             # get mapping between Uniprot sequence and Pfam domain
             d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
         except Exception as e:
             LOGGER.warn('{}: {}'.format(PF, e))
             d['mapping'] = str(e)
             continue
         try:
             # Tighten the row-occupancy cutoff in steps of 0.02, starting
             # from 0.6, until the MSA has at most max_seqs sequences or
             # the cutoff reaches 1.
             rowocc = 0.6
             while True:
                 sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                 rowocc += 0.02
                 if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                     break
             # refine MSA ('seqid' param. is set as in PolyPhen-2)
             ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
             d['ref_MSA'] = ref_msa
             # compute evolutionary properties
             d['entropy'] = calcShannonEntropy(ref_msa)
             d['MutInfo'] = buildMutinfoMatrix(ref_msa)
         except Exception as e:
             LOGGER.warn('{}: {}'.format(PF, e))
     return {k: self.Pfam[k] for k in PF_list}
Example #21
0
def evol_coevol(msa, **kwargs):
    """Build the mutual information matrix of an MSA file and write it,
    together with any requested normalized/corrected variants, to disk.

    *msa* is the filename handed to ``parseMSA``.  For the plain matrix and
    for every requested variant a text file ``<prefix><suffix>.txt`` is
    written; a heatmap file and/or a figure are written as well when asked
    for.

    Keyword arguments read here (all of them are also forwarded to
    ``buildMutinfoMatrix``):

    * ``prefix`` -- output filename prefix; by default it is derived from
      *msa* by dropping the extension (and a trailing ``.gz``) and appending
      ``_mutinfo``
    * ``numformat`` -- number format for the text output (default ``'%12g'``)
    * ``heatmap`` -- also write ``<prefix><suffix>.hm`` files
    * ``normalization`` / ``correction`` -- iterables of method names;
      ``'joint'`` normalization is obtained by rebuilding the matrix with
      ``norm=True``, the other normalizations via ``applyMutinfoNorm``
    * ``figcoevol`` -- save a figure per matrix; tuned by ``cmin``, ``cmax``,
      ``figwidth``, ``figheight``, ``xlabel``, ``title``, ``figformat`` and
      ``figdpi``
    """

    from os.path import splitext

    from numpy import arange

    import prody
    from prody import parseMSA, buildMutinfoMatrix, showMutinfoMatrix
    from prody import applyMutinfoCorr, calcShannonEntropy
    from prody import writeArray, LOGGER, applyMutinfoNorm, writeHeatmap

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            prefix, ext = splitext(prefix)
        prefix += '_mutinfo'

    msa = parseMSA(msa)
    mutinfo = buildMutinfoMatrix(msa, **kwargs)
    numformat = kwargs.get('numformat', '%12g')
    heatmap = kwargs.get('heatmap', False)
    if heatmap:
        hmargs = {
            'xlabel': 'Residue', 'ylabel': 'Residue',
            'xorigin': 1, 'xstep': 1,
            'residue': arange(msa.numResidues()),
        }

    # Work list of (kind, method) pairs; (None, None) is the plain matrix.
    todo = [(None, None)]
    norm = kwargs.get('normalization', [])
    corr = kwargs.get('correction', [])
    if norm is not None:
        if 'joint' in norm:
            todo.append(('norm', 'joint'))
        # 'joint' has been queued above, so skip it here; the previous check
        # compared against 'join' and therefore queued it a second time.
        todo.extend(('norm', which) for which in norm if which != 'joint')
    if corr is not None:
        todo.extend(('corr', which) for which in corr)
    entropy = None  # computed lazily, only if a normalization needs it

    for what, which in todo:
        if what is None:
            matrix = mutinfo
            suffix = ''
            tuffix = ' Mutual Information'
        elif which == 'joint':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            matrix = buildMutinfoMatrix(msa, norm=True, **kwargs)
            suffix = '_norm_joint'
            tuffix = ' MI - Normalization: ' + which
        elif what == 'norm':
            LOGGER.info('Applying {0} normalization.'.format(repr(which)))
            if entropy is None:
                entropy = calcShannonEntropy(msa, **kwargs)
            matrix = applyMutinfoNorm(mutinfo, entropy, norm=which)
            suffix = '_norm_' + which
            tuffix = ' MI - Normalization: ' + which
        else:
            LOGGER.info('Applying {0} correction.'.format(repr(which)))
            matrix = applyMutinfoCorr(mutinfo, which)
            suffix = '_corr_' + which
            tuffix = ' MI - Correction: ' + which

        writeArray(prefix + suffix + '.txt', matrix, format=numformat)

        if heatmap:
            writeHeatmap(prefix + suffix + '.hm', matrix,
                         title=msa.getTitle() + tuffix, **hmargs)

        if kwargs.get('figcoevol'):
            try:
                import matplotlib.pyplot as plt
            except ImportError:
                LOGGER.warn('Matplotlib could not be imported, '
                            'figures are not saved.')
            else:
                cmin = kwargs.get('cmin', matrix.min())
                cmax = kwargs.get('cmax', matrix.max())
                prody.SETTINGS['auto_show'] = False
                width = kwargs.get('figwidth', 8)
                height = kwargs.get('figheight', 6)
                xlabel = kwargs.get('xlabel')
                title = kwargs.get('title')
                figure = plt.figure(figsize=(width, height))
                showMutinfoMatrix(matrix, msa=msa, clim=(cmin, cmax),
                                  xlabel=xlabel, title=title)

                figformat = kwargs.get('figformat', 'pdf')
                figure.savefig(prefix + suffix + '.' + figformat,
                               format=figformat,
                               dpi=kwargs.get('figdpi', 300))
                # One figure is created per matrix; release it once saved so
                # that figures do not pile up across loop iterations.
                plt.close(figure)