Esempio n. 1
0
    def getLabel(self, full=False):
        """Return the label of this sequence with surrounding whitespace
        stripped.

        The label stored on the sequence itself is used when it is set;
        otherwise the label is read from the parent alignment's label list at
        this sequence's index.  Unless *full* is true, only the first element
        returned by :func:`splitSeqLabel` is kept."""

        text = self._label
        if text is None:
            # No label of its own: fall back to the parent alignment.
            text = self._msa._labels[self._index]
        if not full:
            text = splitSeqLabel(text)[0]
        return text.strip()
Esempio n. 2
0
File: msa.py Progetto: nffaruk/ProDy
    def _map(self, mapping=None):
        """Rebuild and return the label-to-index lookup stored in
        ``self._mapping``.

        When *mapping* is given, it is applied to the label list first: for
        every ``key -> index`` (or ``key -> indices``) pair, the label at each
        index is replaced by *key* unless *key* already occurs in that label.
        The lookup is then rebuilt from scratch; a label that occurs once maps
        to its index, a label that occurs several times maps to a list of
        indices.

        Raises :exc:`TypeError` when *mapping* does not behave like a
        dictionary."""

        labels = self._labels
        if mapping is not None:
            # Probe with a string key: a missing key (KeyError) is fine, any
            # other failure means the object cannot be used as a dictionary.
            try:
                mapping['isdict']
            except KeyError:
                pass
            except Exception:
                raise TypeError('mapping must be a dictionary')

            for key, value in mapping.items():
                targets = value if not isscalar(value) else [value]
                for position in targets:
                    if key not in labels[position]:
                        labels[position] = key

        lookup = {}
        self._mapping = lookup
        for position, full_label in enumerate(labels):
            short = splitSeqLabel(full_label)[0]
            if short not in lookup:
                lookup[short] = position
                continue
            known = lookup[short]
            try:
                # Already a list of indices: extend it.
                known.append(position)
            except AttributeError:
                # Second occurrence: promote the single index to a list.
                lookup[short] = [known, position]
        return lookup
Esempio n. 3
0
File: msa.py Progetto: nffaruk/ProDy
    def getLabel(self, index, full=False):
        """Return the label of the sequence at *index*.

        *index* is first looked up in the label mapping, so a mapped label may
        be passed in place of a position; values that are not in the mapping
        are used as given.  Residue numbers are removed from the returned
        label unless *full* is **True**."""

        position = self._mapping.get(index, index)
        label = self._labels[position]
        return label if full else splitSeqLabel(label)[0]
Esempio n. 4
0
File: msa.py Progetto: nffaruk/ProDy
    def iterLabels(self, full=False):
        """Yield the label of every sequence, in order.

        With *full* set, labels are yielded exactly as stored.  Otherwise only
        the part of each label used for indexing sequences (the first element
        returned by :func:`splitSeqLabel`) is yielded."""

        if not full:
            for entry in self._labels:
                yield splitSeqLabel(entry)[0]
            return

        for entry in self._labels:
            yield entry
Esempio n. 5
0
    def getResnums(self, gaps=False, report_match=False):
        """Return the list of residue numbers for this sequence.

        Residue numbers are inferred from the start and end values in the full
        label when they can be converted to integers and their span equals
        the value of :meth:`numResidues`.  Otherwise a range starting from 1
        is used.

        :arg gaps: when **True**, the list has one entry per position of the
            underlying character array, with **None** at positions that are
            not alphabetic
        :type gaps: bool

        :arg report_match: when **True**, return a ``(match, resnums)`` tuple
            where *match* tells whether the label's start-end entry was used
        :type report_match: bool"""

        title, start, end = splitSeqLabel(self.getLabel(True))
        nres = self.numResidues()
        match = False
        try:
            start, end = int(start), int(end)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; a bare except would
            # also swallow KeyboardInterrupt and SystemExit.
            LOGGER.info(
                'Cannot parse start and end values from sequence label {0}. Setting '
                'resnums 1 to {1:d}'.format(title, nres))
            start, end = 1, nres
        else:
            if (end - start + 1) != nres:
                LOGGER.info('Label {0} start-end entry does not match '
                            'length of ungapped sequence. Setting '
                            'resnums 1 to {1:d}'.format(title, nres))
                start, end = 1, nres
            else:
                LOGGER.info('Label {0} start-end entry matches '
                            'length of ungapped sequence. Setting '
                            'resnums {1:d} to {2:d}'.format(title, start, end))
                match = True

        resnums = iter(range(start, end + 1))
        if gaps:
            # Consume one residue number per alphabetic position; every other
            # position is reported as None.
            result = [
                next(resnums) if torf else None
                for torf in char.isalpha(self._array)
            ]
        else:
            result = list(resnums)

        if report_match:
            return match, result
        return result
Esempio n. 6
0
def evol_rankorder(mutinfo, **kwargs):
    """Rank residue pairs by the values of a square matrix and write the
    top pairs to a tab-separated text file.

    *mutinfo* is the path of a text file that :func:`numpy.loadtxt` can read
    into a square 2D matrix; :exc:`ValueError` is raised otherwise.  Pairs are
    taken from the strictly lower triangle and reported in descending order
    of their value.

    Residue numbers written to the output come from, in order of preference:
    the first chain of *pdb* whose ``protein and name CA`` selection (as is,
    or after :func:`trimAtomsUsingMSA`) has as many atoms as the matrix has
    rows; the start-end entry of a sequence label in *msa*; serial numbers
    starting from 1.

    Keyword arguments read by this function:

    :arg delimiter: column delimiter passed to :func:`numpy.loadtxt`
    :arg msa: MSA passed to :func:`parseMSA`
    :arg label: label of the sequence in *msa* to take residue numbers from;
        when it is missing or not found, the sequence with highest row
        occupancy is used
    :arg pdb: structure passed to :func:`parsePDB`
    :arg outname: output name; ``_rankorder`` is inserted before its
        extension, and ``.txt`` is used when it has none.  When omitted the
        name is derived from *mutinfo*
    :arg zscore: when true, each column is normalized to 0 mean and standard
        deviation 1 before ranking
    :arg numpairs: maximum number of pairs to write
    :arg seqsep: a pair is reported only when its row index exceeds its
        column index by more than this value (sequence separation mode)
    :arg usedist: when true and a structure was matched, filter pairs by
        distance instead of sequence separation
    :arg dist: a pair is reported only when its distance is greater than
        this value (distance mode)"""

    from prody import parseMSA, LOGGER, PY3K
    from prody import parsePDB, calcMSAOccupancy, trimAtomsUsingMSA
    from prody.utilities import openFile, splitSeqLabel
    from os.path import splitext

    delimiter = kwargs.get('delimiter')
    mi = np.loadtxt(str(mutinfo), delimiter=delimiter)

    ndim, shape = mi.ndim, mi.shape
    if ndim != 2 or shape[0] != shape[1]:
        raise ValueError('mutinfo must contain a square matrix')

    msa, label, msaflag = kwargs.get('msa'), kwargs.get('label'), False

    pdb, pdbflag = kwargs.get('pdb'), False

    resnum = None
    start = end = None

    if msa is not None:
        msa = parseMSA(msa)
        if msa.numResidues() != shape[0]:
            LOGGER.info('Input MSA and mutinfo do not have similar no '
                        'of residues, ignoring MSA')
        else:
            index = msa.getIndex(label)
            try:
                if index is None:
                    if label is not None:
                        LOGGER.info('Could not find given label in MSA, '
                                    'using complete sequence from MSA')
                    # Fall back to the first sequence with highest occupancy.
                    occ = calcMSAOccupancy(msa._msa, 'row')
                    index = np.where(occ == occ.max())[0][0]
                label, start, end = splitSeqLabel(msa[index].getLabel(True))
            except Exception:
                LOGGER.info('Could not extract resnums from MSA')
            else:
                msaflag = True

    if pdb is not None:
        try:
            pdb = parsePDB(pdb)
        except Exception:
            LOGGER.info('Could not parse PDB, ignoring PDB input')
        else:
            chains = list(pdb.iterChains())
            for chain in chains:
                sel = chain.select('protein and name CA')
                if sel.numAtoms() == shape[0]:
                    resnum = sel.getResnums()
                    coordset = sel.getCoordsets()
                    distance = calcAllDist(coordset)
                    pdbflag = True
                    label = pdb.getTitle()
                    LOGGER.info('Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                    break
                else:
                    # Sizes differ: try to trim the chain against the MSA.
                    try:
                        sel = trimAtomsUsingMSA(sel,
                                                msa,
                                                chain=chain.getChid())
                        if sel.numAtoms() == shape[0]:
                            resnum = sel.getResnums()
                            coordset = sel.getCoordsets()
                            distance = calcAllDist(coordset)
                            pdbflag = True
                            label = pdb.getTitle()
                            LOGGER.info(
                                'Residue numbers will be based on pdb: '
                                '{0}'.format(pdb.getTitle()))
                            break
                    except Exception:
                        LOGGER.info(
                            'Number of residues in PDB does not match '
                            'mutinfo matrix and no MSA was provided to '
                            'align the PDB against, so ignoring PDB input')

    if not pdbflag:
        if msaflag:
            # Compare against None explicitly so that a start of 0 is not
            # mistaken for a missing value.
            if start is not None and end is not None and start < end:
                resnum = np.arange(start, end + 1)
                if len(resnum) != shape[0]:
                    LOGGER.info('Label: {0}/{1}-{2} and mutinfo do '
                                'not have similar no of residues, using '
                                'serial indexing'.format(label, start, end))
                    label = 'Serial Index'
                    resnum = np.arange(1, shape[0] + 1)
                else:
                    LOGGER.info(
                        'Residue numbers will be based on MSA and label: '
                        '{0}'.format(label))
            else:
                LOGGER.info('Could not identify residue indexes from MSA'
                            ' using serial indexing')
                label = 'Serial Index'
                resnum = np.arange(1, shape[0] + 1)
        else:
            LOGGER.info('MSA or PDB not given or does not match mutinfo, '
                        'using serial indexing')
            resnum = np.arange(1, shape[0] + 1)

    LOGGER.info('Residue numbers start and end with {0}-{1}'.format(
        str(resnum[0]), str(resnum[-1])))

    outname = kwargs.get('outname')
    if outname is None:
        outname, ext = splitext(str(mutinfo))
        if ext.lower() == '.gz':
            # Drop the inner extension as well ('x.txt.gz' -> 'x'); the
            # '.gz' suffix is kept for the output name.
            outname, _ = splitext(outname)
    else:
        outname, ext = splitext(str(outname))
        # splitext returns '' (never None) when there is no extension.
        if not ext:
            ext = '.txt'

    outname += '_rankorder' + ext
    zscore = kwargs.get('zscore')
    if zscore:
        LOGGER.info('zscore normalization applied such that each column '
                    'has 0 mean and standard deviation 1')
        header = 'Serial\tRow\tColumn\tZscore'
        mi = (mi - mi.mean(0)) / mi.std(0)
    else:
        header = 'Serial\tRow\tColumn\tMI'

    # Rank the strictly-lower-triangle entries from highest to lowest.
    mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1)
    mi_matrix = mi[mi_ind_start, mi_ind_end]
    sorted_index = mi_matrix.argsort(axis=None)[::-1]
    row = mi_ind_start[sorted_index]
    column = mi_ind_end[sorted_index]
    count = 1
    i = 0

    if PY3K:
        mode = 'w'
    else:
        mode = 'wb'

    if label is None:
        label = 'Serial Index'

    numpairs = kwargs.get('numpairs')
    size = len(row)
    seqsep = kwargs.get('seqsep')
    usedist = kwargs.get('usedist')

    f = openFile(outname, mode)
    try:
        if not usedist or not pdbflag:
            if usedist:
                LOGGER.info('use-struct-sep set to true, but PDB not given or '
                            'incorrect residue number. Using sequence separation')
            elif pdbflag:
                LOGGER.info('use-dist not set, using sequence separation'
                            ' to report coevolving pairs')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tSequence Separation:' + str(seqsep) + '\n'))
            if pdbflag:
                f.write((header + '\tDistance\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.format(
                            count, resnum[row[i]], resnum[column[i]],
                            mi[row[i], column[i]],
                            distance[row[i], column[i]]))
                        count += 1
                    i += 1
            else:
                f.write((header + '\n'))
                while count <= numpairs and i < size:
                    if row[i] > (column[i] + seqsep):
                        f.write('{0}\t{1}\t{2}\t{3:.3f}\n'.format(
                            count, resnum[row[i]], resnum[column[i]],
                            mi[row[i], column[i]]))
                        count += 1
                    i += 1
        else:
            structsep = kwargs.get('dist')
            f.write(('Label: ' + label + '\t' + 'Residue Numbers: ' +
                     str(resnum[0]) + '-' + str(resnum[-1]) +
                     '\tDistance Cutoff:' + str(structsep) + '\n'))
            f.write((header + '\tDistance\n'))
            while count <= numpairs and i < size:
                if distance[row[i], column[i]] > structsep:
                    f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'.format(
                        count, resnum[row[i]], resnum[column[i]],
                        mi[row[i], column[i]], distance[row[i], column[i]]))
                    count += 1
                i += 1
    finally:
        # Close the output even when a write or a comparison raises.
        f.close()
Esempio n. 7
0
File: msa.py Progetto: nffaruk/ProDy
def refineMSA(msa,
              index=None,
              label=None,
              rowocc=None,
              seqid=None,
              colocc=None,
              **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg index: remove columns that are gaps in the sequence with that index
    :type index: int

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable.  Ignored when *index* is given
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    Refinements are applied in the order of the arguments.  If *label* (or
    *index*) and *seqid* are specified, the selected sequence will be kept in
    the refined :class:`.MSA` although it may be similar to some other
    sequence.

    Returns a character array when *msa* is a character array, otherwise a
    new :class:`.MSA`.  Raises :exc:`TypeError` or :exc:`ValueError` for
    invalid arguments and :exc:`IOError` when the PDB header for *label*
    cannot be parsed."""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None

    if index is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('index=' + str(index))
        LOGGER.report(
            'Index refinement reduced number of columns from {0} to '
            '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

    if label is not None:
        if index is not None:
            LOGGER.info('An index was provided so the label will be ignored.')

        else:
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            try:
                upper, lower = label.upper(), label.lower()
            except AttributeError:
                raise TypeError('label must be a string')

            if msa is None:
                raise TypeError('msa must be an MSA instance, '
                                'label cannot be used')

            # Try the label as given, then upper case, then lower case.
            index = msa.getIndex(label)
            if index is None:
                index = msa.getIndex(upper)
            if index is None:
                index = msa.getIndex(lower)

            chain = None
            if index is None and len(label) in (4, 5):
                # Treat the label as a PDB identifier with an optional chain
                # identifier and look its database references up in the MSA.
                from prody import parsePDB
                try:
                    structure, header = parsePDB(label[:4], header=True)
                except Exception as err:
                    raise IOError(
                        'failed to parse header for {0} ({1})'.format(
                            label[:4], str(err)))

                chid = label[4:].upper()
                matched_poly = None
                for poly in header['polymers']:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in {4}.'.format(
                                            dbref.database, dbref.idcode,
                                            label[:4], poly.chid,
                                            str(msa)))
                            break
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in {4}.'.format(
                                            dbref.database,
                                            dbref.accession, label[:4],
                                            poly.chid, str(msa)))
                            break
                    if index is not None:
                        # Stop at the polymer that matched; letting the loop
                        # run on would leave ``poly`` bound to the last
                        # polymer and select the wrong chain below.
                        matched_poly = poly
                        break
                if index is not None:
                    chain = structure[matched_poly.chid]
                    resnums = chain.ca.getResnums()

            if index is None:
                raise ValueError('label is not in msa, or msa is not indexed')
            try:
                len(index)
            except TypeError:
                pass
            else:
                raise ValueError(
                    'label {0} maps onto multiple sequences, '
                    'so cannot be used for refinement'.format(label))

            title.append('label=' + label)
            cols = char.isalpha(arr[index]).nonzero()[0]
            arr = arr.take(cols, 1)
            LOGGER.report(
                'Label refinement reduced number of columns from {0} to '
                '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

            if chain is not None and not kwargs.get('keep', False):
                before = arr.shape[1]
                LOGGER.timeit('_refine')

                from Bio import pairwise2
                from prody.utilities import MATCH_SCORE, MISMATCH_SCORE
                from prody.utilities import GAP_PENALTY, GAP_EXT_PENALTY

                chseq = chain.getSequence()
                # ndarray.tostring() was removed in NumPy 2.0; tobytes()
                # returns the same bytes.
                algn = pairwise2.align.localms(pystr(
                    arr[index].tobytes().upper()),
                                               pystr(chseq),
                                               MATCH_SCORE,
                                               MISMATCH_SCORE,
                                               GAP_PENALTY,
                                               GAP_EXT_PENALTY,
                                               one_alignment_only=1)
                # One flag per residue of the MSA sequence: True when the
                # alignment pairs it with a residue of the chain sequence.
                torf = []
                for s, c in zip(*algn[0][:2]):
                    if s == '-':
                        continue
                    elif c != '-':
                        torf.append(True)
                    else:
                        torf.append(False)
                torf = array(torf)
                tsum = torf.sum()
                assert tsum <= before, 'problem in mapping sequence to structure'
                if tsum < before:
                    arr = arr.take(torf.nonzero()[0], 1)
                    resnums = resnums.take(torf.nonzero()[0] -
                                           torf.nonzero()[0][0] + 1)
                    LOGGER.report(
                        'Structure refinement reduced number of '
                        'columns from {0} to {1} in %.2fs.'.format(
                            before, arr.shape[1]), '_refine')
                else:
                    LOGGER.debug(
                        'All residues in the sequence are contained in '
                        'PDB structure {0}.'.format(label))

                # NOTE(review): this rewrites the label inside the input
                # MSA's own label list, not only in the returned MSA.
                labels = msa._labels
                labels[index] = splitSeqLabel(labels[index])[0] + '/' + str(
                    resnums[0]) + '-' + str(resnums[-1])

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            # Shift the reference index by the number of rows dropped
            # before it.
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report(
            'Row occupancy refinement reduced number of rows from '
            '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            # Always keep the reference sequence.
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report(
            'Sequence identity refinement reduced number of rows '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
            '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report(
            'Column occupancy refinement reduced number of columns '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
            '_refine')

    if not title:
        raise ValueError(
            'label, index, seqid, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            from copy import copy
            labels = copy(msa._labels)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
        return MSA(arr,
                   title=msa.getTitle() +
                   ' refined ({0})'.format(', '.join(title)),
                   labels=labels)
Esempio n. 8
0
File: msa.py Progetto: nffaruk/ProDy
    def getResnums(self, index):
        """Return the starting and ending residue numbers (:term:`resnum`) of
        the sequence at *index*.

        *index* is first looked up in the label mapping; values that are not
        in the mapping are used as given.  The result is everything
        :func:`splitSeqLabel` returns after its first element."""

        position = self._mapping.get(index, index)
        parts = splitSeqLabel(self._labels[position])
        return parts[1:]