Example #1
0
def evol_occupancy(msa, **kwargs):
    """Calculate MSA occupancy, write it to text files and optionally plot it.

    :arg msa: filename of the alignment, handed to :func:`parseMSA`

    Keyword arguments read from *kwargs*:

    * ``prefix`` -- output filename prefix; when not given it is *msa*
      without its extension (and without a trailing ``.gz``) followed by
      ``'_occupancy'``
    * ``numformat`` -- number format passed to :func:`writeArray`,
      default ``'%12g'``
    * ``occaxis`` -- ``'row'`` (default), ``'col'`` or ``'both'``; any value
      other than ``'both'`` is passed to :func:`calcMSAOccupancy` as is
    * ``figocc`` -- when true, a figure is saved for every occupancy array
    * ``figwidth``, ``figheight`` -- figure size, defaults 8 and 6
    * ``xlabel``, ``title``, ``label`` -- passed to :func:`showMSAOccupancy`
    * ``figformat`` -- figure file format, default ``'pdf'``
    * ``figdpi`` -- figure resolution, default 300

    Each array is written to ``<prefix>_<axis>.txt`` and each figure to
    ``<prefix>_<axis>.<figformat>``.
    """

    import prody
    from prody import LOGGER
    from prody import parseMSA, calcMSAOccupancy, showMSAOccupancy, writeArray
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            prefix, ext = splitext(prefix)
        prefix += '_occupancy'

    msa = parseMSA(msa)

    numformat = kwargs.get('numformat', '%12g')
    occaxis = kwargs.get('occaxis', 'row')
    axes = ['row', 'col'] if occaxis == 'both' else [occaxis]
    # One suffix per axis.  Keeping the suffixes in a list (rather than a
    # bare string) matters: indexing a string would yield only '_'.
    suffixes = ['_' + axis for axis in axes]
    occupancy = [calcMSAOccupancy(msa, occ=axis) for axis in axes]

    for suffix, occ in zip(suffixes, occupancy):
        writeArray(prefix + suffix + '.txt', occ, format=numformat)

    if not kwargs.get('figocc'):
        return

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warn('Matplotlib could not be imported, '
                    'figures are not saved.')
        return

    prody.SETTINGS['auto_show'] = False
    width = kwargs.get('figwidth', 8)
    height = kwargs.get('figheight', 6)
    xlabel = kwargs.get('xlabel')
    title = kwargs.get('title')
    label = kwargs.get('label')
    figformat = kwargs.get('figformat', 'pdf')
    figdpi = kwargs.get('figdpi', 300)

    for suffix, occ in zip(suffixes, occupancy):
        figure = plt.figure(figsize=(width, height))
        showMSAOccupancy(msa=msa, occ=occ, label=label,
                         xlabel=xlabel, title=title)
        figure.savefig(prefix + suffix + '.' + figformat,
                       format=figformat, dpi=figdpi)
        # Release the figure once it is on disk so repeated calls do not
        # accumulate open figures.
        plt.close(figure)
Example #2
0
def evol_occupancy(msa, **kwargs):
    """Calculate MSA occupancy, write it to text files and optionally plot it.

    :arg msa: filename of the alignment, handed to :func:`parseMSA`

    Keyword arguments read from *kwargs*:

    * ``prefix`` -- output filename prefix; when not given it is *msa*
      without its extension (and without a trailing ``.gz``) followed by
      ``'_occupancy'``
    * ``numformat`` -- number format passed to :func:`writeArray`,
      default ``'%12g'``
    * ``occaxis`` -- ``'row'`` (default), ``'col'`` or ``'both'``; any value
      other than ``'both'`` is passed to :func:`calcMSAOccupancy` as is
    * ``figocc`` -- when true, a figure is saved for every occupancy array
    * ``figwidth``, ``figheight`` -- figure size, defaults 8 and 6
    * ``xlabel``, ``title``, ``label`` -- passed to :func:`showMSAOccupancy`
    * ``figformat`` -- figure file format, default ``'pdf'``
    * ``figdpi`` -- figure resolution, default 300

    Each array is written to ``<prefix>_<axis>.txt`` and each figure to
    ``<prefix>_<axis>.<figformat>``.
    """

    import prody
    from prody import LOGGER
    from prody import parseMSA, calcMSAOccupancy, showMSAOccupancy, writeArray
    from os.path import splitext

    prefix = kwargs.get('prefix')
    if prefix is None:
        prefix, ext = splitext(msa)
        if ext.lower() == '.gz':
            prefix, ext = splitext(prefix)
        prefix += '_occupancy'

    msa = parseMSA(msa)

    numformat = kwargs.get('numformat', '%12g')
    occaxis = kwargs.get('occaxis', 'row')
    if occaxis == 'both':
        axes = ['row', 'col']
    else:
        axes = [occaxis]

    # Pair every occupancy array with its full filename suffix.  The suffix
    # must be kept whole per axis; indexing into a single suffix string
    # would reduce it to '_'.
    results = [('_' + axis, calcMSAOccupancy(msa, occ=axis)) for axis in axes]

    for suffix, occ in results:
        writeArray(prefix + suffix + '.txt', occ, format=numformat)

    if not kwargs.get('figocc'):
        return

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warn('Matplotlib could not be imported, '
                    'figures are not saved.')
        return

    prody.SETTINGS['auto_show'] = False
    width = kwargs.get('figwidth', 8)
    height = kwargs.get('figheight', 6)
    xlabel = kwargs.get('xlabel')
    title = kwargs.get('title')
    label = kwargs.get('label')
    figformat = kwargs.get('figformat', 'pdf')
    figdpi = kwargs.get('figdpi', 300)

    for suffix, occ in results:
        figure = plt.figure(figsize=(width, height))
        showMSAOccupancy(msa=msa, occ=occ, label=label,
                         xlabel=xlabel, title=title)
        figure.savefig(prefix + suffix + '.' + figformat,
                       format=figformat, dpi=figdpi)
        # Close the figure after saving so figures do not pile up in memory.
        plt.close(figure)
Example #3
0
    def testAll(self):
        """Check ``refineMSA`` with label, seqid, rowocc and colocc combined.

        The expected array is rebuilt step by step: take the columns where
        the labelled row of ``FASTA_ALPHA`` is nonzero, index the rows with
        the result of ``uniqueSequences``, keep the rows whose ``"row"``
        occupancy reaches the row cutoff, and finally keep the columns whose
        occupancy (default axis of ``calcMSAOccupancy``) reaches the column
        cutoff.
        """

        min_rowocc = 0.9
        min_colocc = 0.9
        min_seqid = 0.98
        target = "FSHB_BOVIN"
        refined = refineMSA(
            FASTA,
            label=target,
            seqid=min_seqid,
            rowocc=min_rowocc,
            colocc=min_colocc,
        )

        # Columns occupied in the target sequence.
        columns = FASTA_ALPHA[FASTA.getIndex(target)].nonzero()[0]
        manual = FASTA._getArray().take(columns, 1)

        manual = manual[uniqueSequences(manual, min_seqid)]

        row_mask = calcMSAOccupancy(manual, "row") >= min_rowocc
        manual = manual[row_mask]

        col_mask = calcMSAOccupancy(manual) >= min_colocc
        manual = manual.take(col_mask.nonzero()[0], 1)

        assert_array_equal(refined._getArray(), manual)
Example #4
0
    def testAll(self):
        """Check ``refineMSA`` when all refinement options are given at once.

        The reference array is built manually in four steps: columns where
        the labelled row of ``FASTA_ALPHA`` is nonzero, rows indexed by the
        output of ``uniqueSequences``, rows with ``'row'`` occupancy at or
        above the row cutoff, and columns with occupancy (default axis of
        ``calcMSAOccupancy``) at or above the column cutoff.
        """

        cutoffs = {'seqid': 0.98, 'rowocc': 0.9, 'colocc': 0.9}
        name = 'FSHB_BOVIN'
        refined = refineMSA(FASTA, label=name, seqid=cutoffs['seqid'],
                            rowocc=cutoffs['rowocc'],
                            colocc=cutoffs['colocc'])

        row_index = FASTA.getIndex(name)
        keep_cols = FASTA_ALPHA[row_index].nonzero()[0]
        reference = FASTA._getArray().take(keep_cols, 1)

        unique = uniqueSequences(reference, cutoffs['seqid'])
        reference = reference[unique]

        occupied_rows = calcMSAOccupancy(reference, 'row') >= cutoffs['rowocc']
        reference = reference[occupied_rows]

        occupied_cols = calcMSAOccupancy(reference) >= cutoffs['colocc']
        keep_cols = occupied_cols.nonzero()[0]
        reference = reference.take(keep_cols, 1)

        assert_array_equal(refined._getArray(), reference)
Example #5
0
def evol_rankorder(mutinfo, **kwargs):
    """Rank residue pairs of a square matrix by score and write them to a file.

    The matrix is loaded from *mutinfo* with :func:`numpy.loadtxt`.  Pairs
    from the strict lower triangle are sorted by decreasing value and written
    one per line, until *numpairs* pairs that pass the separation filter have
    been reported or the pairs run out.

    Residue numbers are taken, in order of preference, from:

    1. the first chain of *pdb* whose ``'protein and name CA'`` selection has
       as many atoms as the matrix has rows,
    2. the sequence of *msa* found by *label* (or, when the label is not
       found, the first row with maximal row occupancy), provided its
       start-end range has as many residues as the matrix has rows,
    3. serial numbers starting at 1.

    :arg mutinfo: filename of the matrix

    Keyword arguments read from *kwargs*:

    * ``delimiter`` -- passed to :func:`numpy.loadtxt`
    * ``msa`` -- alignment filename, passed to :func:`parseMSA`
    * ``label`` -- sequence label looked up in the alignment
    * ``pdb`` -- passed to :func:`parsePDB`
    * ``outname`` -- output filename; the result is written to
      ``<stem>_rankorder<ext>``, with ``'.txt'`` used when *outname* has no
      extension.  When not given, the name is derived from *mutinfo*; for a
      ``.gz`` input the inner extension is dropped and ``.gz`` is kept.
    * ``zscore`` -- when true, every column is shifted and scaled to zero
      mean and unit standard deviation before ranking
    * ``numpairs`` -- maximum number of pairs to write; it is compared with
      a counter without any fallback, so callers are expected to pass a
      number
    * ``seqsep`` -- a pair is reported only when its row index is greater
      than its column index plus this value; used in arithmetic without any
      fallback, so callers are expected to pass a number
    * ``usedist`` -- when true and a matching PDB chain was found, pairs are
      filtered by distance instead of sequence separation
    * ``dist`` -- with distance filtering, a pair is reported only when its
      entry in the matrix returned by ``calcAllDist`` is greater than this
      value

    :raises ValueError: when the loaded matrix is not two-dimensional and
        square
    """
    from prody import parseMSA, LOGGER, parsePDB, calcMSAOccupancy
    from prody.utilities import openFile
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label = kwargs.get('msa'), kwargs.get('label')
    pdb, pdbflag = kwargs.get('pdb'), False
    resnum = None
    distance = None

    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            pdb = None
        if pdb is None:
            # A None result is treated the same way as a parsing error.
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            for chain in pdb.iterChains():
                sel = chain.select('protein and name CA')
                # Guard against a selection that matched nothing.
                if sel is not None and sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
            else:
                # Reported once, and only when no chain at all matched.
                LOGGER.info('Number of residues in PDB does not match '
                            'mutinfo matrix, ignoring PDB input')

    if not pdbflag:
        if msa is not None:
            msa = parseMSA(msa)
            if msa.numResidues() != shape[0]:
                LOGGER.info('Input MSA and mutinfo do not have similar no '
                            'of residues, ignoring MSA')
            else:
                index = msa.getIndex(label)
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                label, _, start, end = msa[index]
                # Both bounds must be present; testing ``start`` for
                # truthiness would wrongly reject a range starting at 0.
                if start is not None and end is not None and start < end:
                    resnum = np.arange(start, end + 1)
                    if len(resnum) != shape[0]:
                        LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                    'not have similar no of residues, using '
                                    'serial indexing'.format(label, start,
                                                             end))
                        label = 'Serial Index'
                        resnum = None
                    else:
                        LOGGER.info('Residue numbers will be based on label: '
                                    '{0}'.format(label))
                else:
                    LOGGER.info('Could not identify residue indexes from MSA'
                                ' using serial indexing')
                    label = 'Serial Index'
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')

    if resnum is None:
        # Common fallback, also reached when the MSA width did not match
        # the matrix (previously resnum stayed None and indexing it failed).
        resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.
                format(str(resnum[0]), str(resnum[-1])))

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # Drop the inner extension from the stem; '.gz' stays as ext.
            outname, _ = splitext(outname)
    else:
        outname, ext = splitext(str(outname))
        if not ext:
            # splitext returns '' (never None) when there is no extension.
            ext = '.txt'

    outname += '_rankorder' + ext
    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Strict lower triangle, ordered by decreasing score.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]

    if label is None:
        label = 'Serial Index'

    numpairs = kwargs.get('numpairs')
    seqsep = kwargs.get('seqsep')
    structsep = kwargs.get('dist')
    usedist = kwargs.get('usedist')
    by_distance = bool(usedist) and pdbflag

    if by_distance:
        cutoff = 'Distance Cutoff:' + str(structsep)
    else:
        if usedist:
            LOGGER.info('use-struct-sep set to true, but PDB not given or '
                        'incorrect residue number. Using sequence separation')
        elif pdbflag:
            LOGGER.info('use-dist not set, using sequence separation'
                        ' to report coevolving pairs')
        cutoff = 'Sequence Separation:' + str(seqsep)

    # NOTE(review): the file is opened in binary mode while str objects are
    # written to it -- confirm this against the targeted Python version.
    f = openFile(outname, 'wb')
    try:
        f.write('Label: ' + label + '\t' + 'Residue Numbers: ' +
                str(resnum[0]) + '-' + str(resnum[-1]) + '\t' +
                cutoff + '\n')
        # The distance column is present whenever a PDB chain matched.
        f.write(header + ('\tDistance\n' if pdbflag else '\n'))

        count = 1
        for r, c in zip(row, column):
            if count > numpairs:
                break
            if by_distance:
                keep = distance[r, c] > structsep
            else:
                keep = r > (c + seqsep)
            if not keep:
                continue
            line = '{0}\t{1}\t{2}\t{3:.3f}'.format(count, resnum[r],
                                                   resnum[c], mi[r, c])
            if pdbflag:
                line += '\t{0:.2f}'.format(distance[r, c])
            f.write(line + '\n')
            count += 1
    finally:
        # Close the output even when writing fails part-way.
        f.close()
Example #6
0
    def testSequenceOccupancy(self):
        """'sequence' occupancy equals the axis-1 sums of ``FASTA_ALPHA``
        divided by the number of residues in ``FASTA``."""

        observed = calcMSAOccupancy(FASTA, 'sequence')
        expected = FASTA_ALPHA.sum(1) / (FASTA.numResidues() * 1.0)
        assert_array_equal(observed, expected)
Example #7
0
    def testResidueOccupancy(self):
        """'residue' occupancy equals the axis-0 sums of ``FASTA_ALPHA``
        divided by the number of sequences in ``FASTA``."""

        observed = calcMSAOccupancy(FASTA, 'residue')
        expected = FASTA_ALPHA.sum(0) / (FASTA.numSequences() * 1.0)
        assert_array_equal(observed, expected)
Example #8
0
    def testSequenceCount(self):
        """With ``count=1``, 'sequence' occupancy equals the axis-1 sums of
        ``FASTA_ALPHA`` without any normalization."""

        observed = calcMSAOccupancy(FASTA, 'sequence', count=1)
        expected = FASTA_ALPHA.sum(1)
        assert_array_equal(observed, expected)
Example #9
0
    def testResidueCount(self):
        """With ``count=1``, 'residue' occupancy equals the axis-0 sums of
        ``FASTA_ALPHA`` without any normalization."""

        observed = calcMSAOccupancy(FASTA, 'residue', count=1)
        expected = FASTA_ALPHA.sum(0)
        assert_array_equal(observed, expected)
Example #10
0
    def testSequenceOccupancy(self):
        """Compare "sequence" occupancy with the axis-1 sums of
        ``FASTA_ALPHA`` divided by ``FASTA.numResidues()``."""

        n_residues = FASTA.numResidues() * 1.0
        assert_array_equal(
            calcMSAOccupancy(FASTA, "sequence"),
            FASTA_ALPHA.sum(1) / n_residues,
        )
Example #11
0
    def testResidueOccupancy(self):
        """Compare "residue" occupancy with the axis-0 sums of
        ``FASTA_ALPHA`` divided by ``FASTA.numSequences()``."""

        n_sequences = FASTA.numSequences() * 1.0
        assert_array_equal(
            calcMSAOccupancy(FASTA, "residue"),
            FASTA_ALPHA.sum(0) / n_sequences,
        )
Example #12
0
    def testSequenceCount(self):
        """Compare "sequence" occupancy computed with ``count=1`` against the
        raw axis-1 sums of ``FASTA_ALPHA``."""

        counts = calcMSAOccupancy(FASTA, "sequence", count=1)
        assert_array_equal(counts, FASTA_ALPHA.sum(1))
Example #13
0
    def testResidueCount(self):
        """Compare "residue" occupancy computed with ``count=1`` against the
        raw axis-0 sums of ``FASTA_ALPHA``."""

        counts = calcMSAOccupancy(FASTA, "residue", count=1)
        assert_array_equal(counts, FASTA_ALPHA.sum(0))
Example #14
0
def evol_rankorder(mutinfo, **kwargs):
    """Rank residue pairs of a square matrix by score and write them to a file.

    The matrix is loaded from *mutinfo* with :func:`numpy.loadtxt`.  Pairs
    from the strict lower triangle are sorted by decreasing value and written
    one per line, until *numpairs* pairs that pass the separation filter have
    been reported or the pairs run out.

    Residue numbers are taken, in order of preference, from:

    1. the first chain of *pdb* whose ``'protein and name CA'`` selection has
       as many atoms as the matrix has rows,
    2. the sequence of *msa* found by *label* (or, when the label is not
       found, the first row with maximal row occupancy), provided its
       start-end range has as many residues as the matrix has rows,
    3. serial numbers starting at 1.

    :arg mutinfo: filename of the matrix

    Keyword arguments read from *kwargs*:

    * ``delimiter`` -- passed to :func:`numpy.loadtxt`
    * ``msa`` -- alignment filename, passed to :func:`parseMSA`
    * ``label`` -- sequence label looked up in the alignment
    * ``pdb`` -- passed to :func:`parsePDB`
    * ``outname`` -- output filename; the result is written to
      ``<stem>_rankorder<ext>``, with ``'.txt'`` used when *outname* has no
      extension.  When not given, the name is derived from *mutinfo*; for a
      ``.gz`` input the inner extension is dropped and ``.gz`` is kept.
    * ``zscore`` -- when true, every column is shifted and scaled to zero
      mean and unit standard deviation before ranking
    * ``numpairs`` -- maximum number of pairs to write; it is compared with
      a counter without any fallback, so callers are expected to pass a
      number
    * ``seqsep`` -- a pair is reported only when its row index is greater
      than its column index plus this value; used in arithmetic without any
      fallback, so callers are expected to pass a number
    * ``usedist`` -- when true and a matching PDB chain was found, pairs are
      filtered by distance instead of sequence separation
    * ``dist`` -- with distance filtering, a pair is reported only when its
      entry in the matrix returned by ``calcAllDist`` is greater than this
      value

    :raises ValueError: when the loaded matrix is not two-dimensional and
        square
    """
    from prody import parseMSA, LOGGER, parsePDB, calcMSAOccupancy
    from prody.utilities import openFile
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label = kwargs.get('msa'), kwargs.get('label')
    pdb, pdbflag = kwargs.get('pdb'), False
    resnum = None
    distance = None

    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            pdb = None
        if pdb is None:
            # A None result is treated the same way as a parsing error.
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            for chain in pdb.iterChains():
                sel = chain.select('protein and name CA')
                # Guard against a selection that matched nothing.
                if sel is not None and sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
            else:
                # Reported once, and only when no chain at all matched.
                LOGGER.info('Number of residues in PDB does not match '
                            'mutinfo matrix, ignoring PDB input')

    if not pdbflag:
        if msa is not None:
            msa = parseMSA(msa)
            if msa.numResidues() != shape[0]:
                LOGGER.info('Input MSA and mutinfo do not have similar no '
                            'of residues, ignoring MSA')
            else:
                index = msa.getIndex(label)
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                label, _, start, end = msa[index]
                # Both bounds must be present; testing ``start`` for
                # truthiness would wrongly reject a range starting at 0.
                if start is not None and end is not None and start < end:
                    resnum = np.arange(start, end + 1)
                    if len(resnum) != shape[0]:
                        LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                    'not have similar no of residues, using '
                                    'serial indexing'.format(
                                        label, start, end))
                        label = 'Serial Index'
                        resnum = None
                    else:
                        LOGGER.info('Residue numbers will be based on label: '
                                    '{0}'.format(label))
                else:
                    LOGGER.info('Could not identify residue indexes from MSA'
                                ' using serial indexing')
                    label = 'Serial Index'
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')

    if resnum is None:
        # Common fallback, also reached when the MSA width did not match
        # the matrix (previously resnum stayed None and indexing it failed).
        resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.format(
        str(resnum[0]), str(resnum[-1])))

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # Drop the inner extension from the stem; '.gz' stays as ext.
            outname, _ = splitext(outname)
    else:
        outname, ext = splitext(str(outname))
        if not ext:
            # splitext returns '' (never None) when there is no extension.
            ext = '.txt'

    outname += '_rankorder' + ext
    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Strict lower triangle, ordered by decreasing score.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]

    if label is None:
        label = 'Serial Index'

    numpairs = kwargs.get('numpairs')
    seqsep = kwargs.get('seqsep')
    structsep = kwargs.get('dist')
    usedist = kwargs.get('usedist')
    by_distance = bool(usedist) and pdbflag

    if by_distance:
        cutoff = 'Distance Cutoff:' + str(structsep)
    else:
        if usedist:
            LOGGER.info('use-struct-sep set to true, but PDB not given or '
                        'incorrect residue number. Using sequence separation')
        elif pdbflag:
            LOGGER.info('use-dist not set, using sequence separation'
                        ' to report coevolving pairs')
        cutoff = 'Sequence Separation:' + str(seqsep)

    # NOTE(review): the file is opened in binary mode while str objects are
    # written to it -- confirm this against the targeted Python version.
    f = openFile(outname, 'wb')
    try:
        f.write('Label: ' + label + '\t' + 'Residue Numbers: ' +
                str(resnum[0]) + '-' + str(resnum[-1]) + '\t' +
                cutoff + '\n')
        # The distance column is present whenever a PDB chain matched.
        f.write(header + ('\tDistance\n' if pdbflag else '\n'))

        count = 1
        for r, c in zip(row, column):
            if count > numpairs:
                break
            if by_distance:
                keep = distance[r, c] > structsep
            else:
                keep = r > (c + seqsep)
            if not keep:
                continue
            line = '{0}\t{1}\t{2}\t{3:.3f}'.format(
                count, resnum[r], resnum[c], mi[r, c])
            if pdbflag:
                line += '\t{0:.2f}'.format(distance[r, c])
            f.write(line + '\n')
            count += 1
    finally:
        # Close the output even when writing fails part-way.
        f.close()