Example #1
0
    def getResnums(self, gaps=False):
        """Return list of residue numbers associated with non-gapped *seq*.
        When *gaps* is **True**, return a list containing the residue numbers
        with gaps appearing as **None**.  Residue numbers are inferred from the
        full label.  When the label does not contain usable residue number
        information (start and end cannot be converted to integers, or the
        span they define differs from the number of residues), a range of
        numbers starting from 1 is returned."""

        # the title part of the label is not needed here
        _, start, end = splitSeqLabel(self.getLabel(True))
        nres = self.numResidues()
        try:
            start, end = int(start), int(end)
        except (TypeError, ValueError):
            # only conversion failures (e.g. a missing or non-numeric value)
            # trigger the fallback; a bare except would also swallow
            # KeyboardInterrupt and SystemExit
            LOGGER.info('Cannot parse label start, end values, Setting '
                        'resnums 1 to {0:d}'.format(nres))
            start, end = 1, nres
        else:
            if (end - start + 1) != nres:
                LOGGER.info('Label start-end position does not match '
                            'length of ungapped sequence. Setting '
                            'resnums 1 to {0:d}'.format(nres))
                start, end = 1, nres

        # one number is consumed per alphabetic character, gaps get None
        resnums = iter(range(start, end + 1))
        if gaps:
            return [next(resnums) if torf else None
                    for torf in char.isalpha(self._array)]
        return list(resnums)
Example #2
0
    def getResnums(self, gaps=False):
        """Returns list of residue numbers associated with non-gapped *seq*.
        When *gaps* is **True**, return a list containing the residue numbers
        with gaps appearing as **None**.  Residue numbers are inferred from the
        full label.  When the label does not contain usable residue number
        information (start and end cannot be converted to integers, or the
        span they define differs from the number of residues), a range of
        numbers starting from 1 is returned."""

        # the title part of the label is not needed here
        _, start, end = splitSeqLabel(self.getLabel(True))
        nres = self.numResidues()
        try:
            start, end = int(start), int(end)
        except (TypeError, ValueError):
            # only conversion failures (e.g. a missing or non-numeric value)
            # trigger the fallback; a bare except would also swallow
            # KeyboardInterrupt and SystemExit
            LOGGER.info('Cannot parse label start, end values, Setting '
                        'resnums 1 to {0:d}'.format(nres))
            start, end = 1, nres
        else:
            if (end - start + 1) != nres:
                LOGGER.info('Label start-end position does not match '
                            'length of ungapped sequence. Setting '
                            'resnums 1 to {0:d}'.format(nres))
                start, end = 1, nres

        # one number is consumed per alphabetic character, gaps get None
        resnums = iter(range(start, end + 1))
        if gaps:
            return [
                next(resnums) if torf else None
                for torf in char.isalpha(self._array)
            ]
        return list(resnums)
Example #3
0
    def testRowCol(self):
        # combined row + column occupancy refinement must equal filtering the
        # rows first and then the columns of the surviving rows

        min_rowocc, min_colocc = 0.9, 1.0
        result = refineMSA(FASTA, rowocc=min_rowocc, colocc=min_colocc)
        refined = result._getArray()

        # 112.0 is presumably the number of alignment columns -- confirm
        row_occupancy = FASTA_ALPHA.sum(1) / 112.0
        kept_rows = FASTA._getArray()[row_occupancy >= min_rowocc]

        letters_per_col = char.isalpha(kept_rows).sum(0, dtype=float)
        col_occupancy = letters_per_col / kept_rows.shape[0]
        kept_cols = (col_occupancy >= min_colocc).nonzero()[0]

        assert_array_equal(refined, kept_rows.take(kept_cols, 1))
Example #4
0
    def testRowCol(self):
        # combined row + column occupancy refinement must equal filtering the
        # rows first and then the columns of the surviving rows

        min_rowocc, min_colocc = 0.9, 1.0
        result = refineMSA(FASTA, rowocc=min_rowocc, colocc=min_colocc)
        refined = result._getArray()

        # 112. is presumably the number of alignment columns -- confirm
        row_occupancy = FASTA_ALPHA.sum(1) / 112.
        kept_rows = FASTA._getArray()[row_occupancy >= min_rowocc]

        letters_per_col = char.isalpha(kept_rows).sum(0, dtype=float)
        col_occupancy = letters_per_col / kept_rows.shape[0]
        kept_cols = (col_occupancy >= min_colocc).nonzero()[0]

        assert_array_equal(refined, kept_rows.take(kept_cols, 1))
Example #5
0
    def getResnums(self, gaps=False, report_match=False):
        """Returns list of residue numbers associated with non-gapped *seq*.
        When *gaps* is **True**, return a list containing the residue numbers
        with gaps appearing as **None**.

        Residue numbers are inferred from the full label if possible.
        When the label does not contain usable residue number information
        (start and end cannot be converted to integers, or the span they
        define differs from the number of residues), a range of numbers
        starting from 1 is returned.

        When *report_match* is **True**, a tuple ``(match, resnums)`` is
        returned instead, where *match* is **True** only if the label's
        start-end span was used for the numbering."""

        title, start, end = splitSeqLabel(self.getLabel(True))
        nres = self.numResidues()
        match = False
        try:
            start, end = int(start), int(end)
        except (TypeError, ValueError):
            # only conversion failures (e.g. a missing or non-numeric value)
            # trigger the fallback; a bare except would also swallow
            # KeyboardInterrupt and SystemExit
            LOGGER.info(
                'Cannot parse start and end values from sequence label {0}. Setting '
                'resnums 1 to {1:d}'.format(title, nres))
            start, end = 1, nres
        else:
            if (end - start + 1) != nres:
                LOGGER.info('Label {0} start-end entry does not match '
                            'length of ungapped sequence. Setting '
                            'resnums 1 to {1:d}'.format(title, nres))
                start, end = 1, nres
            else:
                LOGGER.info('Label {0} start-end entry matches '
                            'length of ungapped sequence. Setting '
                            'resnums {1:d} to {2:d}'.format(title, start, end))
                match = True

        # one number is consumed per alphabetic character, gaps get None
        resnums = iter(range(start, end + 1))
        if gaps:
            result = [
                next(resnums) if torf else None
                for torf in char.isalpha(self._array)
            ]
        else:
            result = list(resnums)

        if report_match:
            return match, result
        return result
Example #6
0
File: msa.py Project: njekin/ProDy
def refineMSA(msa, label=None, rowocc=None, seqid=None, colocc=None, **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    Refinements are applied in the order of arguments.  If *label* and
    *seqid* are specified, sequence matching *label* will be kept in the
    refined :class:`.MSA` although it may be similar to some other sequence.

    When *msa* is a character array the refined array is returned, otherwise
    a new :class:`.MSA` instance is returned.

    :raises TypeError: when *msa* is neither a character array nor an MSA
        instance, when *label* is not a string or is used with a plain array,
        or when *rowocc*/*colocc* cannot be converted to float
    :raises ValueError: when the data is not a 2D single-character array,
        when *label* does not resolve to exactly one sequence, or when no
        refinement argument is given
    :raises IOError: when the PDB header for *label* cannot be parsed"""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        # plain array input: msa is set to None to mark "no MSA instance"
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    # title collects a description of every refinement that was applied
    title = []
    cols = None
    index = None
    if label is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            upper, lower = label.upper(), label.lower()
        except AttributeError:
            raise TypeError('label must be a string')

        if msa is None:
            raise TypeError('msa must be an MSA instance, '
                            'label cannot be used')

        # try the label as given, then upper and lower case variants
        index = msa.getIndex(label)
        if index is None:
            index = msa.getIndex(upper)
        if index is None:
            index = msa.getIndex(lower)

        chain = None
        if index is None and (len(label) == 4 or len(label) == 5):
            # label looks like a PDB identifier with optional chain id
            from prody import parsePDB
            try:
                structure, header = parsePDB(label[:4], header=True)
            except Exception as err:
                raise IOError('failed to parse header for {0} ({1})'
                              .format(label[:4], str(err)))

            chid = label[4:].upper()
            # remember which polymer produced the match; the loop variable
            # alone cannot be trusted once the loop has moved on
            matched_chid = None
            for poly in header['polymers']:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    index = msa.getIndex(dbref.idcode)
                    if index is not None:
                        # NOTE(review): str(msa) is passed but the template
                        # never references {4} -- confirm intended wording
                        LOGGER.info('{0} idcode {1} for {2}{3} '
                                    'is found in chain {3}.'.format(
                                    dbref.database, dbref.idcode,
                                    label[:4], poly.chid, str(msa)))
                        break
                    index = msa.getIndex(dbref.accession)
                    if index is not None:
                        LOGGER.info('{0} accession {1} for {2}{3} '
                                    'is found in chain {3}.'.format(
                                    dbref.database, dbref.accession,
                                    label[:4], poly.chid, str(msa)))
                        break
                if index is not None:
                    # stop at the first matching polymer so that the chain
                    # selected below is the one the match came from
                    matched_chid = poly.chid
                    break
            if index is not None:
                chain = structure[matched_chid]

        if index is None:
            raise ValueError('label is not in msa, or msa is not indexed')
        # an index that has a length refers to more than one sequence
        try:
            len(index)
        except TypeError:
            pass
        else:
            raise ValueError('label {0} maps onto multiple sequences, '
                             'so cannot be used for refinement'.format(label))

        title.append('label=' + label)
        # keep only the columns where the labeled sequence has a letter
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        LOGGER.report('Label refinement reduced number of columns from {0} to '
                      '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

        if chain is not None and not kwargs.get('keep', False):
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            from prody.proteins.compare import importBioPairwise2
            from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE
            from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY
            pw2 = importBioPairwise2()
            chseq = chain.getSequence()
            # tobytes() replaces the deprecated ndarray.tostring() alias and
            # returns the same bytes
            algn = pw2.align.localms(arr[index].tobytes().upper(), chseq,
                                     MATCH_SCORE, MISMATCH_SCORE,
                                     GAP_PENALTY, GAP_EXT_PENALTY,
                                     one_alignment_only=1)
            # one flag per position of the MSA sequence: True when it is
            # aligned to a character of the chain sequence, False when it
            # is aligned to a gap
            torf = [c != '-' for s, c in zip(*algn[0][:2]) if s != '-']
            torf = array(torf)
            tsum = torf.sum()
            assert tsum <= before, 'problem in mapping sequence to structure'
            if tsum < before:
                arr = arr.take(torf.nonzero()[0], 1)
                LOGGER.report('Structure refinement reduced number of '
                              'columns from {0} to {1} in %.2fs.'
                              .format(before, arr.shape[1]), '_refine')
            else:
                LOGGER.debug('All residues in the sequence are contained in '
                             'PDB structure {0}.'.format(label))

    from .analysis import calcMSAOccupancy, uniqueSequences

    # rows holds the indices of the original sequences that are kept
    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            # new position of the labeled sequence = number of kept rows
            # that precede it
            # NOTE(review): if the labeled row itself fails the occupancy
            # cut, index then refers to a different sequence -- confirm
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report('Row occupancy refinement reduced number of rows from '
                      '{0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            # always keep the labeled sequence
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report('Sequence identity refinement reduced number of rows '
                      'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report('Column occupancy refinement reduced number of columns '
                      'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
                      '_refine')

    if not title:
        # seqid alone also counts as a refinement, so it is listed as well
        raise ValueError('label, rowocc, seqid, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            # no row was removed, labels and mapping can be reused as copies
            from copy import copy
            labels = copy(msa._labels)
            mapping = copy(msa._mapping)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
            mapping = None
        return MSA(arr, title=msa.getTitle() + ' refined ({0})'
                   .format(', '.join(title)), labels=labels, mapping=mapping)
Example #7
0
def refineMSA(msa,
              index=None,
              label=None,
              rowocc=None,
              seqid=None,
              colocc=None,
              **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg index: remove columns that are gaps in the sequence with that index
    :type index: int

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable, ignored when *index* is given
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    Refinements are applied in the order of arguments.  If *label* and
    *seqid* are specified, sequence matching *label* will be kept in the
    refined :class:`.MSA` although it may be similar to some other sequence.

    When *msa* is a character array the refined array is returned, otherwise
    a new :class:`.MSA` instance is returned.

    :raises TypeError: when *msa* is neither a character array nor an MSA
        instance, when *label* is not a string or is used with a plain array,
        or when *rowocc*/*colocc* cannot be converted to float
    :raises ValueError: when the data is not a 2D single-character array,
        when *label* does not resolve to exactly one sequence, or when no
        refinement argument is given
    :raises IOError: when the PDB header for *label* cannot be parsed"""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        # plain array input: msa is set to None to mark "no MSA instance"
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    # title collects a description of every refinement that was applied
    title = []
    cols = None

    if index is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        # keep only the columns where the indexed sequence has a letter
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('index=' + str(index))
        LOGGER.report(
            'Index refinement reduced number of columns from {0} to '
            '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

    if label is not None:
        if index is not None:
            LOGGER.info('An index was provided so the label will be ignored.')

        else:
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            try:
                upper, lower = label.upper(), label.lower()
            except AttributeError:
                raise TypeError('label must be a string')

            if msa is None:
                raise TypeError('msa must be an MSA instance, '
                                'label cannot be used')

            # try the label as given, then upper and lower case variants
            index = msa.getIndex(label)
            if index is None:
                index = msa.getIndex(upper)
            if index is None:
                index = msa.getIndex(lower)

            chain = None
            if index is None and (len(label) == 4 or len(label) == 5):
                # label looks like a PDB identifier with optional chain id
                from prody import parsePDB
                try:
                    structure, header = parsePDB(label[:4], header=True)
                except Exception as err:
                    raise IOError(
                        'failed to parse header for {0} ({1})'.format(
                            label[:4], str(err)))

                chid = label[4:].upper()
                # remember which polymer produced the match; the loop
                # variable alone cannot be trusted once the loop has moved on
                matched_chid = None
                for poly in header['polymers']:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            # NOTE(review): {4} is str(msa), so the text
                            # after "chain" is the MSA description --
                            # confirm intended wording
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                            dbref.database, dbref.idcode,
                                            label[:4], poly.chid,
                                            str(msa)))
                            break
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                            dbref.database,
                                            dbref.accession, label[:4],
                                            poly.chid, str(msa)))
                            break
                    if index is not None:
                        # stop at the first matching polymer so that the
                        # chain selected below is the one the match came from
                        matched_chid = poly.chid
                        break
                if index is not None:
                    chain = structure[matched_chid]

            if index is None:
                raise ValueError('label is not in msa, or msa is not indexed')
            # an index that has a length refers to more than one sequence
            try:
                len(index)
            except TypeError:
                pass
            else:
                raise ValueError(
                    'label {0} maps onto multiple sequences, '
                    'so cannot be used for refinement'.format(label))

            title.append('label=' + label)
            # keep only the columns where the labeled sequence has a letter
            cols = char.isalpha(arr[index]).nonzero()[0]
            arr = arr.take(cols, 1)
            LOGGER.report(
                'Label refinement reduced number of columns from {0} to '
                '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

            if chain is not None and not kwargs.get('keep', False):
                before = arr.shape[1]
                LOGGER.timeit('_refine')
                from prody.proteins.compare import importBioPairwise2
                from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE
                from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY
                pw2 = importBioPairwise2()
                chseq = chain.getSequence()
                # tobytes() replaces the deprecated ndarray.tostring() alias
                # and returns the same bytes
                algn = pw2.align.localms(arr[index].tobytes().upper(),
                                         chseq,
                                         MATCH_SCORE,
                                         MISMATCH_SCORE,
                                         GAP_PENALTY,
                                         GAP_EXT_PENALTY,
                                         one_alignment_only=1)
                # one flag per position of the MSA sequence: True when it is
                # aligned to a character of the chain sequence, False when
                # it is aligned to a gap
                torf = [c != '-' for s, c in zip(*algn[0][:2]) if s != '-']
                torf = array(torf)
                tsum = torf.sum()
                assert tsum <= before, 'problem in mapping sequence to structure'
                if tsum < before:
                    arr = arr.take(torf.nonzero()[0], 1)
                    LOGGER.report(
                        'Structure refinement reduced number of '
                        'columns from {0} to {1} in %.2fs.'.format(
                            before, arr.shape[1]), '_refine')
                else:
                    LOGGER.debug(
                        'All residues in the sequence are contained in '
                        'PDB structure {0}.'.format(label))

    from .analysis import calcMSAOccupancy, uniqueSequences

    # rows holds the indices of the original sequences that are kept
    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            # new position of the reference sequence = number of kept rows
            # that precede it
            # NOTE(review): if the reference row itself fails the occupancy
            # cut, index then refers to a different sequence -- confirm
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report(
            'Row occupancy refinement reduced number of rows from '
            '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            # always keep the reference sequence
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report(
            'Sequence identity refinement reduced number of rows '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
            '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report(
            'Column occupancy refinement reduced number of columns '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
            '_refine')

    if not title:
        raise ValueError(
            'label, index, seqid, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            # no row was removed, the labels can be reused as a copy
            from copy import copy
            labels = copy(msa._labels)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
        return MSA(arr,
                   title=msa.getTitle() +
                   ' refined ({0})'.format(', '.join(title)),
                   labels=labels)
Example #8
0
    def numResidues(self):
        """Returns the count of alphabetic (non-gap) characters in the
        sequence array."""

        letter_mask = char.isalpha(self._array)
        return sum(letter_mask)
Example #9
0
    def numGaps(self):
        """Returns the count of non-alphabetic (gap) characters in the
        sequence array."""

        seq = self._array
        n_letters = sum(char.isalpha(seq))
        return len(seq) - n_letters
Example #10
0
__copyright__ = 'Copyright (C) 2010-2012 Ahmet Bakan'

from prody.tests import TestCase

from numpy import array, log, zeros, char
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, refineMSA, parseMSA, calcMSAOccupancy, mergeMSA
from prody import uniqueSequences

# presumably silences ProDy log output while the tests run -- confirm
LOGGER.verbosity = None

# alignment fixture shared by the tests in this module
FASTA = parseMSA(pathDatafile('msa_Cys_knot.fasta'))
# boolean mask, True where the alignment holds an alphabetic character
FASTA_ALPHA = char.isalpha(FASTA._msa)
# sequence count as a float (the * 1. forces float arithmetic)
NUMSEQ = FASTA.numSequences() * 1.


class TestRefinement(TestCase):
    def testLabel(self):
        # refining by label must keep exactly the columns in which the
        # labeled sequence holds a letter
        target = 'FSHB_BOVIN'
        row = FASTA.getIndex(target)
        refined = refineMSA(FASTA, label=target)._getArray()

        full = FASTA._getArray()
        letter_cols = FASTA_ALPHA[row].nonzero()[0]

        assert_array_equal(refined, full.take(letter_cols, 1))

    def testRowocc(self):
Example #11
0
    def numResidues(self):
        """Return the count of alphabetic (non-gap) characters in the
        sequence array."""

        letter_mask = char.isalpha(self._array)
        return sum(letter_mask)
Example #12
0
    def numGaps(self):
        """Return the count of non-alphabetic (gap) characters in the
        sequence array."""

        seq = self._array
        n_letters = sum(char.isalpha(seq))
        return len(seq) - n_letters
Example #13
0
from prody.tests import TestCase

from numpy import array, log, zeros, char, ones, fromfile
from numpy.testing import assert_array_equal, assert_array_almost_equal

from prody.tests.test_datafiles import *

from prody import LOGGER, calcShannonEntropy, buildMutinfoMatrix, parseMSA
from prody import calcMSAOccupancy, buildSeqidMatrix, uniqueSequences
from prody import buildOMESMatrix, buildSCAMatrix

# presumably silences ProDy log output while the tests run -- confirm
LOGGER.verbosity = None

# alignment fixture shared by the tests in this module
FASTA = parseMSA(pathDatafile("msa_Cys_knot.fasta"))
# boolean mask, True where the alignment holds an alphabetic character
FASTA_ALPHA = char.isalpha(FASTA._msa)
# upper-cased copy so that the comparison below ignores letter case
FASTA_UPPER = char.upper(FASTA._msa)

# Reference pairwise matrix built with plain loops.  For every pair of rows,
# only columns where at least one of the two rows holds a letter are counted
# (ncols); the entry is the fraction of those columns whose upper-cased
# characters are equal.  The matrix is symmetric with 1 on the diagonal.
# NOTE(review): score / ncols raises ZeroDivisionError if a pair of rows has
# no column with a letter -- assumed not to occur in this data file.
FASTA_NUMBER, FASTA_LENGTH = FASTA_ALPHA.shape
FASTA_EYE = zeros((FASTA_NUMBER, FASTA_NUMBER))
for i in range(FASTA_NUMBER):
    FASTA_EYE[i, i] = 1
    # only the upper triangle is computed; values are mirrored below
    for j in range(i + 1, FASTA_NUMBER):
        score = 0.0
        ncols = 0
        for k in range(FASTA_LENGTH):
            # skip columns where both rows have a non-letter (gap)
            if FASTA_ALPHA[i, k] or FASTA_ALPHA[j, k]:
                if FASTA_UPPER[i, k] == FASTA_UPPER[j, k]:
                    score += 1
                ncols += 1
        FASTA_EYE[i, j] = FASTA_EYE[j, i] = score / ncols