Exemple #1
0
    def addCoordset(self, coords, weights=None, label=None, **kwargs):
        """Add coordinate set(s) to the ensemble.  *coords* must be a Numpy
        array with suitable shape and dimensionality, or an object with
        :meth:`getCoordsets`. *weights* is an optional argument.
        If provided, its length must match number of atoms.  Weights of
        missing (not resolved) atoms must be ``0`` and weights of those
        that are resolved can be anything greater than ``0``.  If not
        provided, weights of all atoms for this coordinate set will be
        set equal to ``1``. *label*, which may be a PDB identifier or a
        list of identifiers, is used to label conformations."""

        degeneracy = kwargs.pop('degeneracy', False)
        adddata = kwargs.pop('data', None)

        atoms = coords
        n_atoms = self._n_atoms
        n_select = self.numSelected()
        n_confs = self.numCoordsets()

        try:
            if degeneracy:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoords(selected=False)
                    elif hasattr(coords, '_getCoords'):
                        coords = coords._getCoords()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoords(selected=False)
                    elif hasattr(coords, 'getCoords'):
                        coords = coords.getCoords()
            else:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoordsets(selected=False)
                    elif hasattr(coords, '_getCoordsets'):
                        coords = coords._getCoordsets()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoordsets(selected=False)
                    elif hasattr(coords, 'getCoordsets'):
                        coords = coords.getCoordsets()

        except AttributeError:
            label = label or 'Unknown'

        else:
            if coords is None:
                raise ValueError('coordinates are not set')
            elif label is None and isinstance(atoms, Atomic):
                if not isinstance(atoms, AtomGroup):
                    ag = atoms.getAtomGroup()
                else:
                    ag = atoms
                label = ag.getTitle()
                if coords.shape[0] < ag.numCoordsets():
                    label += '_m' + str(atoms.getACSIndex())
            else:
                label = label or 'Unknown'

        # check coordinates
        try:
            checkCoords(coords, csets=True, natoms=n_atoms)
        except:
            try:
                checkCoords(coords, csets=True, natoms=n_select)
            except TypeError:
                raise TypeError('coords must be a numpy array or an object '
                                'with `getCoords` method')

        if coords.ndim == 2:
            n_nodes, _ = coords.shape
            coords = coords.reshape((1, n_nodes, 3))
            n_csets = 1
        else:
            n_csets, n_nodes, _ = coords.shape
            if degeneracy:
                coords = coords[:1]

        n_repeats = 1 if degeneracy else n_csets

        if not n_atoms:
            self._n_atoms = n_nodes
            n_atoms = self._n_atoms

        if n_nodes == n_select and self.isSelected():
            full_coords = np.repeat(self._coords[np.newaxis, :, :],
                                    n_csets,
                                    axis=0)
            full_coords[:, self._indices, :] = coords
            coords = full_coords

        # check weights
        if weights is None:
            weights = np.ones((n_csets, n_atoms, 1), dtype=float)
        else:
            weights = checkWeights(weights, n_atoms, n_csets)

        if degeneracy:
            weights = weights[:1]

        # check sequences
        seqs = None
        sequence = kwargs.pop('sequence', None)
        if hasattr(atoms, 'getSequence'):
            if sequence is not None:
                LOGGER.warn(
                    'sequence is supplied though coords has getSequence')
            sequence = atoms.getSequence()
            seqs = [sequence for _ in range(n_repeats)]
        else:
            if sequence is None:
                try:
                    sequence = self._atoms.getSequence()
                except AttributeError:
                    if self._msa:
                        sequence = ''.join('X' for _ in range(n_atoms))
                    # sequence and seqs remains to be None if MSA has not been created
            if isinstance(sequence, Sequence):
                seqs = [str(sequence)]
            elif isinstance(sequence, MSA):
                seqs = [str(seq) for seq in sequence]
            elif np.isscalar(sequence):
                seqs = [sequence for _ in range(n_repeats)]

        if seqs:
            if len(seqs) != n_repeats:
                raise ValueError(
                    'the number of sequences should be either one or '
                    'that of coordsets')

        # assign new values
        # update labels
        if n_csets > 1 and not degeneracy:
            if isinstance(label, str):
                labels = [
                    '{0}_m{1}'.format(label, i + 1) for i in range(n_csets)
                ]
            else:
                if len(label) != n_csets:
                    raise ValueError('length of label and number of '
                                     'coordinate sets must be the same')
                labels = label
        else:
            labels = [label] if np.isscalar(label) else label

        self._labels.extend(labels)

        # update sequences
        if seqs:
            msa = MSA(seqs, title=self.getTitle(), labels=labels)
            if self._msa is None:
                if n_confs > 0:
                    def_seqs = np.chararray((n_confs, n_atoms))
                    def_seqs[:] = 'X'

                    old_labels = [self._labels[i] for i in range(n_confs)]
                    self._msa = MSA(def_seqs,
                                    title=self.getTitle(),
                                    labels=old_labels)
                    self._msa.extend(msa)
                else:
                    self._msa = msa
            else:
                self._msa.extend(msa)

        # update coordinates
        if self._confs is None and self._weights is None:
            self._confs = coords
            self._weights = weights

        elif self._confs is not None and self._weights is not None:
            self._confs = np.concatenate((self._confs, coords), axis=0)
            self._weights = np.concatenate((self._weights, weights), axis=0)
        else:
            raise RuntimeError('_confs and _weights must be set or None at '
                               'the same time')

        # appending new data
        if self._data is not None and adddata is not None:
            if self._data is None:
                self._data = {}
            if adddata is None:
                adddata = {}
            all_keys = set(list(self._data.keys()) + list(adddata.keys()))

            for key in all_keys:
                if key in self._data:
                    data = self._data[key]
                    if key not in adddata:
                        shape = [n_repeats]
                        for s in data.shape[1:]:
                            shape.append(s)
                        newdata = np.zeros(shape, dtype=data.dtype)
                    else:
                        newdata = np.asarray(adddata[key])
                        if newdata.shape[0] != n_repeats:
                            raise ValueError(
                                'the length of data["%s"] does not match that of coords'
                                % key)
                else:
                    newdata = np.asarray(adddata[key])
                    shape = [self._n_csets]
                    for s in newdata.shape[1:]:
                        shape.append(s)
                    data = np.zeros(shape, dtype=newdata.dtype)
                self._data[key] = np.concatenate((data, newdata), axis=0)

        # update the number of coordinate sets
        self._n_csets += n_repeats
Exemple #2
0
    def addCoordset(self, coords, weights=None, label=None, **kwargs):
        """Add coordinate set(s) to the ensemble.  *coords* must be a Numpy
        array with suitable shape and dimensionality, or an object with
        :meth:`getCoordsets`. *weights* is an optional argument.
        If provided, its length must match number of atoms.  Weights of
        missing (not resolved) atoms must be ``0`` and weights of those
        that are resolved can be anything greater than ``0``.  If not
        provided, weights of all atoms for this coordinate set will be
        set equal to ``1``. *label*, which may be a PDB identifier or a
        list of identifiers, is used to label conformations."""

        degeneracy = kwargs.pop('degeneracy', False)

        atoms = coords
        n_atoms = self._n_atoms
        n_select = self.numSelected()
        n_confs = self.numCoordsets()

        try:
            if degeneracy:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoords(selected=False)
                    elif hasattr(coords, '_getCoords'):
                        coords = coords._getCoords()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoords(selected=False)
                    elif hasattr(coords, 'getCoords'):
                        coords = coords.getCoords()
            else:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoordsets(selected=False)
                    elif hasattr(coords, '_getCoordsets'):
                        coords = coords._getCoordsets()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoordsets(selected=False)
                    elif hasattr(coords, 'getCoordsets'):
                        coords = coords.getCoordsets()

        except AttributeError:
            label = label or 'Unknown'

        else:
            if coords is None:
                raise ValueError('coordinates are not set')
            elif label is None and isinstance(atoms, Atomic):
                if not isinstance(atoms, AtomGroup):
                    ag = atoms.getAtomGroup()
                else:
                    ag = atoms
                label = ag.getTitle()
                if coords.shape[0] < ag.numCoordsets():
                    label += '_m' + str(atoms.getACSIndex())
            else:
                label = label or 'Unknown'

        # check coordinates
        try:
            checkCoords(coords, csets=True, natoms=n_atoms)
        except:
            try:
                checkCoords(coords, csets=True, natoms=n_select)
            except TypeError:
                raise TypeError('coords must be a numpy array or an object '
                                'with `getCoords` method')

        if coords.ndim == 2:
            n_nodes, _ = coords.shape
            coords = coords.reshape((1, n_nodes, 3))
            n_csets = 1
        else:
            n_csets, n_nodes, _ = coords.shape
            if degeneracy:
                coords = coords[:1]

        n_repeats = 1 if degeneracy else n_csets
       
        if not n_atoms:
            self._n_atoms = n_nodes

        if n_nodes == n_select and self.isSelected():
            full_coords = np.repeat(self._coords[np.newaxis, :, :], n_csets, axis=0)
            full_coords[:, self._indices, :] = coords
            coords = full_coords
        
        # check weights
        if weights is None:
            weights = np.ones((n_csets, n_atoms, 1), dtype=float)
        else:
            weights = checkWeights(weights, n_atoms, n_csets)

        if degeneracy:
            weights = weights[:1]

        # check sequences
        seqs = None
        sequence = kwargs.pop('sequence', None)
        if hasattr(atoms, 'getSequence'):
            if sequence is not None:
                LOGGER.warn('sequence is supplied though coords has getSequence')
            sequence = atoms.getSequence()
            seqs = [sequence for _ in range(n_repeats)]
        else:
            if sequence is None:
                try:
                    sequence = self.getAtoms().getSequence()
                except AttributeError:
                    if self._msa:
                        sequence = ''.join('X' for _ in range(n_atoms))
                    # sequence and seqs remains to be None if MSA has not been created
            if isinstance(sequence, Sequence):
                seqs = [str(sequence)]
            elif isinstance(sequence, MSA):
                seqs = [str(seq) for seq in sequence]
            elif np.isscalar(sequence):
                seqs = [sequence for _ in range(n_repeats)]
        
        if seqs:
            if len(seqs) != n_repeats:
                raise ValueError('the number of sequences should be either one or '
                                'that of coordsets')

        # assign new values
        # update labels
        if n_csets > 1 and not degeneracy:
            if isinstance(label, str):
                labels = ['{0}_m{1}'.format(label, i+1) for i in range(n_csets)]
            else:
                if len(label) != n_csets:
                    raise ValueError('length of label and number of '
                                        'coordinate sets must be the same')
                labels = label
        else:
            labels = [label] if np.isscalar(label) else label

        self._labels.extend(labels)

        # update sequences
        if seqs:
            msa = MSA(seqs, title=self.getTitle(), labels=labels)
            if self._msa is None:
                if n_confs > 0:
                    def_seqs = np.chararray((n_confs, n_atoms))
                    def_seqs[:] = 'X'

                    old_labels = [self._labels[i] for i in range(n_confs)]
                    self._msa = MSA(def_seqs, title=self.getTitle(), labels=old_labels)
                    self._msa.extend(msa)
                else:
                    self._msa = msa
            else:
                self._msa.extend(msa)

        # update coordinates
        if self._confs is None and self._weights is None:
            self._confs = coords
            self._weights = weights
            self._n_csets = n_repeats
            
        elif self._confs is not None and self._weights is not None:
            self._confs = np.concatenate((self._confs, coords), axis=0)
            self._weights = np.concatenate((self._weights, weights), axis=0)
            self._n_csets += n_repeats
        else:
            raise RuntimeError('_confs and _weights must be set or None at '
                               'the same time')
Exemple #3
0
class PDBEnsemble(Ensemble):
    """This class enables handling coordinates for heterogeneous structural
    datasets and stores identifiers for individual conformations.

    See usage usage in :ref:`pca-xray`, :ref:`pca-dimer`, and :ref:`pca-blast`.

    .. note:: This class is designed to handle conformations with missing
       coordinates, e.g. atoms that are note resolved in an X-ray structure.
       For unresolved atoms, the coordinates of the reference structure is
       assumed in RMSD calculations and superpositions."""
    def __init__(self, title='Unknown'):

        self._labels = []
        self._trans = None
        self._msa = None
        Ensemble.__init__(self, title)

    def __add__(self, other):
        """Concatenate two ensembles. The reference coordinates of *self* is
        used in the result."""

        if not isinstance(other, Ensemble):
            raise TypeError('an Ensemble instance cannot be added to an {0} '
                            'instance'.format(type(other)))
        elif self.numAtoms() != other.numAtoms():
            raise ValueError('Ensembles must have same number of atoms.')

        ensemble = PDBEnsemble('{0} + {1}'.format(self.getTitle(),
                                                  other.getTitle()))
        if self._coords is not None:
            ensemble.setCoords(self._coords.copy())
        weights = copy(self._weights)
        if self._confs is not None:
            ensemble.addCoordset(copy(self._confs),
                                 weights=weights,
                                 label=self.getLabels(),
                                 sequence=self._msa)

        other_weights = copy(other._weights)
        ensemble.addCoordset(copy(other._confs),
                             weights=other_weights,
                             label=other.getLabels(),
                             sequence=other._msa)

        if self._atoms is not None:
            ensemble.setAtoms(self._atoms)
            ensemble._indices = self._indices
        else:
            ensemble.setAtoms(other._atoms)
            ensemble._indices = other._indices

        selfdata = self._data if self._data is not None else {}
        otherdata = other._data if other._data is not None else {}
        all_keys = set(list(selfdata.keys()) + list(otherdata.keys()))
        for key in all_keys:
            if key in selfdata and key in otherdata:
                self_data = selfdata[key]
                other_data = otherdata[key]
            elif key in selfdata:
                self_data = selfdata[key]
                other_data = np.zeros(other.numConfs(), dtype=self_data.dtype)
            elif key in otherdata:
                other_data = otherdata[key]
                self_data = np.zeros(other.numConfs(), dtype=other_data.dtype)
            ensemble._data[key] = np.concatenate((self_data, other_data),
                                                 axis=0)

        return ensemble

    def __iter__(self):

        n_confs = self._n_csets
        for i in range(n_confs):
            if n_confs != self._n_csets:
                raise RuntimeError('number of conformations in the ensemble '
                                   'changed during iteration')
            yield PDBConformation(self, i)

    def __getitem__(self, index):
        """Returns a conformation at given index."""

        msa = self._msa
        labels = self._labels
        if msa:
            msa = self._msa[index]
        if isinstance(index, Integral):
            return self.getConformation(index)

        elif isinstance(index, slice):
            ens = PDBEnsemble('{0} ({1[0]}:{1[1]}:{1[2]})'.format(
                self._title, index.indices(len(self))))
            if self._coords is not None:
                ens.setCoords(self._coords.copy())

            ens.addCoordset(self._confs[index].copy(),
                            self._weights[index].copy(),
                            label=self._labels[index],
                            sequence=msa)
            if self._trans is not None:
                ens._trans = self._trans[index]
            ens.setAtoms(self._atoms)
            ens._indices = self._indices

            for key in self._data.keys():
                ens._data[key] = self._data[key][index].copy()
            return ens

        elif isinstance(index, (list, np.ndarray)):
            index2 = list(index)
            for i in range(len(index)):
                if isinstance(index[i], str):
                    try:
                        index2[i] = labels.index(index[i])
                    except ValueError:
                        raise IndexError('invalid label: %s' % index[i])
            ens = PDBEnsemble('{0}'.format(self._title))
            if self._coords is not None:
                ens.setCoords(self._coords.copy())
            labels = list(np.array(self._labels)[index2])
            ens.addCoordset(self._confs[index2].copy(),
                            self._weights[index2].copy(),
                            label=labels,
                            sequence=msa)
            if self._trans is not None:
                ens._trans = self._trans[index2]
            ens.setAtoms(self._atoms)
            ens._indices = self._indices

            for key in self._data.keys():
                ens._data[key] = self._data[key][index].copy()
            return ens
        elif isinstance(index, str):
            try:
                i = labels.index(index)
                return self.getConformation(i)
            except ValueError:
                raise IndexError('invalid label: %s' % index)
        else:
            raise IndexError('invalid index')

    def superpose(self, **kwargs):
        """Superpose the ensemble onto the reference coordinates obtained by 
        :meth:`getCoords`.
        """

        trans = kwargs.pop('trans', True)
        if self._coords is None:
            raise ValueError('coordinates are not set, use `setCoords`')
        if self._confs is None or len(self._confs) == 0:
            raise ValueError('conformations are not set, use `addCoordset`')
        LOGGER.timeit('_prody_ensemble')
        self._superpose(trans=trans)  # trans kwarg is used by PDBEnsemble
        LOGGER.report('Superposition completed in %.2f seconds.',
                      '_prody_ensemble')

    def _superpose(self, **kwargs):
        """Superpose conformations and update coordinates."""

        calcT = getTransformation
        if kwargs.get('trans', False):
            if self._trans is not None:
                LOGGER.info('Existing transformations will be overwritten.')
            trans = np.zeros((self._n_csets, 4, 4))
        else:
            trans = None
        indices = self._indices
        if indices is None:
            weights = self._weights
            coords = self._coords
            confs = self._confs
            confs_selected = self._confs
        else:
            weights = self._weights[:, indices]
            coords = self._coords[indices]
            confs = self._confs
            confs_selected = self._confs[:, indices]

        for i, conf in enumerate(confs_selected):
            rmat, tvec = calcT(conf, coords, weights[i])
            if trans is not None:
                trans[i][:3, :3] = rmat
                trans[i][:3, 3] = tvec
            confs[i] = tvec + np.dot(confs[i], rmat.T)
        self._trans = trans

    def iterpose(self, rmsd=0.0001):
        confs = copy(self._confs)
        Ensemble.iterpose(self, rmsd)
        self._confs = confs
        LOGGER.info('Final superposition to calculate transformations.')
        self.superpose()

    iterpose.__doc__ = Ensemble.iterpose.__doc__

    def addCoordset(self, coords, weights=None, label=None, **kwargs):
        """Add coordinate set(s) to the ensemble.  *coords* must be a Numpy
        array with suitable shape and dimensionality, or an object with
        :meth:`getCoordsets`. *weights* is an optional argument.
        If provided, its length must match number of atoms.  Weights of
        missing (not resolved) atoms must be ``0`` and weights of those
        that are resolved can be anything greater than ``0``.  If not
        provided, weights of all atoms for this coordinate set will be
        set equal to ``1``. *label*, which may be a PDB identifier or a
        list of identifiers, is used to label conformations."""

        degeneracy = kwargs.pop('degeneracy', False)
        adddata = kwargs.pop('data', None)

        atoms = coords
        n_atoms = self._n_atoms
        n_select = self.numSelected()
        n_confs = self.numCoordsets()

        try:
            if degeneracy:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoords(selected=False)
                    elif hasattr(coords, '_getCoords'):
                        coords = coords._getCoords()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoords(selected=False)
                    elif hasattr(coords, 'getCoords'):
                        coords = coords.getCoords()
            else:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoordsets(selected=False)
                    elif hasattr(coords, '_getCoordsets'):
                        coords = coords._getCoordsets()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoordsets(selected=False)
                    elif hasattr(coords, 'getCoordsets'):
                        coords = coords.getCoordsets()

        except AttributeError:
            label = label or 'Unknown'

        else:
            if coords is None:
                raise ValueError('coordinates are not set')
            elif label is None and isinstance(atoms, Atomic):
                if not isinstance(atoms, AtomGroup):
                    ag = atoms.getAtomGroup()
                else:
                    ag = atoms
                label = ag.getTitle()
                if coords.shape[0] < ag.numCoordsets():
                    label += '_m' + str(atoms.getACSIndex())
            else:
                label = label or 'Unknown'

        # check coordinates
        try:
            checkCoords(coords, csets=True, natoms=n_atoms)
        except:
            try:
                checkCoords(coords, csets=True, natoms=n_select)
            except TypeError:
                raise TypeError('coords must be a numpy array or an object '
                                'with `getCoords` method')

        if coords.ndim == 2:
            n_nodes, _ = coords.shape
            coords = coords.reshape((1, n_nodes, 3))
            n_csets = 1
        else:
            n_csets, n_nodes, _ = coords.shape
            if degeneracy:
                coords = coords[:1]

        n_repeats = 1 if degeneracy else n_csets

        if not n_atoms:
            self._n_atoms = n_nodes
            n_atoms = self._n_atoms

        if n_nodes == n_select and self.isSelected():
            full_coords = np.repeat(self._coords[np.newaxis, :, :],
                                    n_csets,
                                    axis=0)
            full_coords[:, self._indices, :] = coords
            coords = full_coords

        # check weights
        if weights is None:
            weights = np.ones((n_csets, n_atoms, 1), dtype=float)
        else:
            weights = checkWeights(weights, n_atoms, n_csets)

        if degeneracy:
            weights = weights[:1]

        # check sequences
        seqs = None
        sequence = kwargs.pop('sequence', None)
        if hasattr(atoms, 'getSequence'):
            if sequence is not None:
                LOGGER.warn(
                    'sequence is supplied though coords has getSequence')
            sequence = atoms.getSequence()
            seqs = [sequence for _ in range(n_repeats)]
        else:
            if sequence is None:
                try:
                    sequence = self._atoms.getSequence()
                except AttributeError:
                    if self._msa:
                        sequence = ''.join('X' for _ in range(n_atoms))
                    # sequence and seqs remains to be None if MSA has not been created
            if isinstance(sequence, Sequence):
                seqs = [str(sequence)]
            elif isinstance(sequence, MSA):
                seqs = [str(seq) for seq in sequence]
            elif np.isscalar(sequence):
                seqs = [sequence for _ in range(n_repeats)]

        if seqs:
            if len(seqs) != n_repeats:
                raise ValueError(
                    'the number of sequences should be either one or '
                    'that of coordsets')

        # assign new values
        # update labels
        if n_csets > 1 and not degeneracy:
            if isinstance(label, str):
                labels = [
                    '{0}_m{1}'.format(label, i + 1) for i in range(n_csets)
                ]
            else:
                if len(label) != n_csets:
                    raise ValueError('length of label and number of '
                                     'coordinate sets must be the same')
                labels = label
        else:
            labels = [label] if np.isscalar(label) else label

        self._labels.extend(labels)

        # update sequences
        if seqs:
            msa = MSA(seqs, title=self.getTitle(), labels=labels)
            if self._msa is None:
                if n_confs > 0:
                    def_seqs = np.chararray((n_confs, n_atoms))
                    def_seqs[:] = 'X'

                    old_labels = [self._labels[i] for i in range(n_confs)]
                    self._msa = MSA(def_seqs,
                                    title=self.getTitle(),
                                    labels=old_labels)
                    self._msa.extend(msa)
                else:
                    self._msa = msa
            else:
                self._msa.extend(msa)

        # update coordinates
        if self._confs is None and self._weights is None:
            self._confs = coords
            self._weights = weights

        elif self._confs is not None and self._weights is not None:
            self._confs = np.concatenate((self._confs, coords), axis=0)
            self._weights = np.concatenate((self._weights, weights), axis=0)
        else:
            raise RuntimeError('_confs and _weights must be set or None at '
                               'the same time')

        # appending new data
        if self._data is not None and adddata is not None:
            if self._data is None:
                self._data = {}
            if adddata is None:
                adddata = {}
            all_keys = set(list(self._data.keys()) + list(adddata.keys()))

            for key in all_keys:
                if key in self._data:
                    data = self._data[key]
                    if key not in adddata:
                        shape = [n_repeats]
                        for s in data.shape[1:]:
                            shape.append(s)
                        newdata = np.zeros(shape, dtype=data.dtype)
                    else:
                        newdata = np.asarray(adddata[key])
                        if newdata.shape[0] != n_repeats:
                            raise ValueError(
                                'the length of data["%s"] does not match that of coords'
                                % key)
                else:
                    newdata = np.asarray(adddata[key])
                    shape = [self._n_csets]
                    for s in newdata.shape[1:]:
                        shape.append(s)
                    data = np.zeros(shape, dtype=newdata.dtype)
                self._data[key] = np.concatenate((data, newdata), axis=0)

        # update the number of coordinate sets
        self._n_csets += n_repeats

    def getMSA(self, indices=None, selected=True):
        """Returns an MSA of selected atoms."""

        selected = selected and self._indices is not None
        if self._msa is None:
            return None

        atom_indices = self._indices if selected else slice(None, None, None)
        indices = indices if indices is not None else slice(None, None, None)

        return self._msa[indices, atom_indices]

    def getLabels(self):
        """Returns identifiers of the conformations in the ensemble."""

        return list(self._labels)

    def getCoordsets(self, indices=None, selected=True):
        """Returns a copy of coordinate set(s) at given *indices* for selected
        atoms. *indices* may be an integer, a list of integers or **None**.
        **None** returns all coordinate sets.

        .. warning:: When there are atoms with weights equal to zero (0),
           their coordinates will be replaced with the coordinates of the
           ensemble reference coordinate set."""

        if self._confs is None:
            return None

        if indices is None:
            indices = slice(None)
        else:
            indices = np.array([indices]).flatten()
        coords = self._coords
        if self._indices is None or not selected:
            confs = self._confs[indices].copy()
            for i, w in enumerate(self._weights[indices]):
                which = w.flatten() == 0
                confs[i, which] = coords[which]
        else:
            selids = self._indices
            coords = coords[selids]
            confs = self._confs[indices, selids].copy()
            for i, w in enumerate(self._weights[indices]):
                which = w[selids].flatten() == 0
                confs[i, which] = coords[which]
        return confs

    _getCoordsets = getCoordsets

    def iterCoordsets(self):
        """Iterate over coordinate sets. A copy of each coordinate set for
        selected atoms is returned. Reference coordinates are not included."""

        conf = PDBConformation(self, 0)
        for i in range(self._n_csets):
            conf._index = i
            yield conf.getCoords()

    def delCoordset(self, index):
        """Delete a coordinate set from the ensemble."""

        Ensemble.delCoordset(self, index)
        if isinstance(index, Integral):
            index = [index]
        else:
            index = list(index)
        index.sort(reverse=True)
        for i in index:
            self._labels.pop(i)

        if self._msa is not None:
            rest = []
            for i in range(self._msa.numSequences()):
                if i not in index:
                    rest.append(i)
            self._msa = self._msa[rest]

    def getConformation(self, index):
        """Returns conformation at given index."""

        if self._confs is None:
            raise AttributeError('conformations are not set')
        if not isinstance(index, Integral):
            raise TypeError('index must be an integer')
        n_confs = self._n_csets
        if -n_confs <= index < n_confs:
            if index < 0:
                index = n_confs + index
            return PDBConformation(self, index)
        else:
            raise IndexError('conformation index out of range')

    def getMSFs(self):
        """Calculate and return mean square fluctuations (MSFs).
        Note that you might need to align the conformations using
        :meth:`superpose` or :meth:`iterpose` before calculating MSFs."""

        if self._confs is None:
            return
        indices = self._indices
        if indices is None:
            coords = self._coords
            confs = self._confs
            weights = self._weights > 0
        else:
            coords = self._coords[indices]
            confs = self._confs[:, indices]
            weights = self._weights[:, indices] > 0
        weightsum = weights.sum(0)
        mean = np.zeros(coords.shape)
        for i, conf in enumerate(confs):
            mean += conf * weights[i]
        mean /= weightsum
        ssqf = np.zeros(mean.shape)
        for i, conf in enumerate(confs):
            ssqf += ((conf - mean) * weights[i])**2
        return ssqf.sum(1) / weightsum.flatten()

    def getRMSDs(self, pairwise=False):
        """Calculate and return root mean square deviations (RMSDs). Note that
        you might need to align the conformations using :meth:`superpose` or
        :meth:`iterpose` before calculating RMSDs.

        :arg pairwise: if **True** then it will return pairwise RMSDs 
            as an n-by-n matrix. n is the number of conformations.
        :type pairwise: bool
        """

        if self._confs is None or self._coords is None:
            return None

        indices = self._indices
        if indices is None:
            indices = np.arange(self._confs.shape[1])

        weights = self._weights[:,
                                indices] if self._weights is not None else None
        if pairwise:
            n_confs = self.numConfs()
            RMSDs = np.zeros((n_confs, n_confs))
            for i in range(n_confs):
                for j in range(i + 1, n_confs):
                    if weights is None:
                        w = None
                    else:
                        wi = weights[i]
                        wj = weights[j]
                        w = wi * wj
                    RMSDs[i, j] = RMSDs[j,
                                        i] = getRMSD(self._confs[i, indices],
                                                     self._confs[j,
                                                                 indices], w)
        else:
            RMSDs = getRMSD(self._coords[indices], self._confs[:, indices],
                            weights)

        return RMSDs

    def setWeights(self, weights):
        """Set atomic weights."""

        if self._n_atoms == 0:
            raise AttributeError('coordinates are not set')

        try:
            self._weights = checkWeights(weights, self._n_atoms, self._n_csets)
        except ValueError:
            weights = checkWeights(weights, self.numSelected(), self._n_csets)
            if not self._weights:
                self._weights = np.ones((self._n_csets, self._n_atoms, 1),
                                        dtype=float)
            self._weights[self._indices, :] = weights

    def getTransformations(self):
        """Returns the :class:`~.Transformation` used to superpose this
        conformation onto reference coordinates.  The transformation can
        be used to superpose original PDB file onto the reference PDB file."""

        if self._trans is not None:
            return [Transformation(trans) for trans in self._trans]
        return
Exemple #4
0
class PDBEnsemble(Ensemble):

    """This class enables handling coordinates for heterogeneous structural
    datasets and stores identifiers for individual conformations.

    See usage usage in :ref:`pca-xray`, :ref:`pca-dimer`, and :ref:`pca-blast`.

    .. note:: This class is designed to handle conformations with missing
       coordinates, e.g. atoms that are note resolved in an X-ray structure.
       For unresolved atoms, the coordinates of the reference structure is
       assumed in RMSD calculations and superpositions."""

    def __init__(self, title='Unknown'):

        self._labels = []
        self._trans = None
        self._msa = None
        Ensemble.__init__(self, title)

    def __repr__(self):

        return '<PDB' + Ensemble.__repr__(self)[1:]

    def __str__(self):

        return 'PDB' + Ensemble.__str__(self)

    def __add__(self, other):
        """Concatenate two ensembles. The reference coordinates of *self* is
        used in the result."""

        if not isinstance(other, Ensemble):
            raise TypeError('an Ensemble instance cannot be added to an {0} '
                            'instance'.format(type(other)))
        elif self.numAtoms() != other.numAtoms():
            raise ValueError('Ensembles must have same number of atoms.')

        ensemble = PDBEnsemble('{0} + {1}'.format(self.getTitle(),
                                                  other.getTitle()))
        ensemble.setCoords(copy(self._coords))
        weights = copy(self._weights)
        if self._confs is not None:
            ensemble.addCoordset(copy(self._confs), weights=weights, 
                                 label=self.getLabels(), sequence=self._msa)
        
        other_weights = copy(other._weights)
        ensemble.addCoordset(copy(other._confs), weights=other_weights, 
                             label=other.getLabels(), sequence=other._msa)

        if self._atoms is not None:
            ensemble.setAtoms(self._atoms)
            ensemble._indices = self._indices
        else:
            ensemble.setAtoms(other._atoms)
            ensemble._indices = other._indices

        all_keys = list(self._data.keys()) + list(other._data.keys())
        for key in all_keys:
            if key in self._data and key in other._data:
                self_data = self._data[key]
                other_data = other._data[key]
            elif key in self._data:
                self_data = self._data[key]
                other_data = np.zeros(other.numConfs(), dtype=self_data.dtype)
            elif key in other._data:
                other_data = other._data[key]
                self_data = np.zeros(other.numConfs(), dtype=other_data.dtype)
            ensemble._data[key] = np.concatenate((self_data, other_data), axis=0)

        return ensemble

    def __iter__(self):

        n_confs = self._n_csets
        for i in range(n_confs):
            if n_confs != self._n_csets:
                raise RuntimeError('number of conformations in the ensemble '
                                   'changed during iteration')
            yield PDBConformation(self, i)

    def __getitem__(self, index):
        """Returns a conformation at given index."""

        msa = self._msa
        labels = self._labels
        if msa:
            msa = self._msa[index]
        if isinstance(index, Integral):
            return self.getConformation(index)

        elif isinstance(index, slice):
            ens = PDBEnsemble('{0} ({1[0]}:{1[1]}:{1[2]})'.format(
                              self._title, index.indices(len(self))))
            ens.setCoords(copy(self._coords))
            
            ens.addCoordset(self._confs[index].copy(),
                            self._weights[index].copy(),
                            label=self._labels[index],
                            sequence=msa)
            if self._trans is not None:
                ens._trans = self._trans[index]
            ens.setAtoms(self._atoms)
            ens._indices = self._indices

            for key in self._data.keys():
                ens._data[key] = self._data[key][index].copy()
            return ens

        elif isinstance(index, (list, np.ndarray)):
            index2 = list(index)
            for i in range(len(index)):
                if isinstance(index[i], str):
                    try:
                        index2[i] = labels.index(index[i])
                    except ValueError:
                        raise IndexError('invalid label: %s'%index[i])
            ens = PDBEnsemble('{0}'.format(self._title))
            ens.setCoords(copy(self._coords))
            labels = list(np.array(self._labels)[index2])
            ens.addCoordset(self._confs[index2].copy(),
                            self._weights[index2].copy(),
                            label=labels,
                            sequence=msa)
            if self._trans is not None:
                ens._trans = self._trans[index2]
            ens.setAtoms(self._atoms)
            ens._indices = self._indices

            for key in self._data.keys():
                ens._data[key] = self._data[key][index].copy()
            return ens
        elif isinstance(index, str):
            try:
                i = labels.index(index)
                return self.getConformation(i)
            except ValueError:
                raise IndexError('invalid label: %s'%index)
        else:
            raise IndexError('invalid index')

    def _superpose(self, **kwargs):
        """Superpose conformations and update coordinates."""

        calcT = getTransformation
        if kwargs.get('trans', False):
            if self._trans is not None:
                LOGGER.info('Existing transformations will be overwritten.')
            trans = np.zeros((self._n_csets, 4, 4))
        else:
            trans = None
        indices = self._indices
        if indices is None:
            weights = self._weights
            coords = self._coords
            confs = self._confs
            confs_selected = self._confs
        else:
            weights = self._weights[:, indices]
            coords = self._coords[indices]
            confs = self._confs
            confs_selected = self._confs[:, indices]

        for i, conf in enumerate(confs_selected):
            rmat, tvec = calcT(conf, coords, weights[i])
            if trans is not None:
                trans[i][:3, :3] = rmat
                trans[i][:3, 3] = tvec
            confs[i] = tvec + np.dot(confs[i], rmat.T)
        self._trans = trans

    def iterpose(self, rmsd=0.0001):

        confs = self._confs.copy()
        Ensemble.iterpose(self, rmsd)
        self._confs = confs
        LOGGER.info('Final superposition to calculate transformations.')
        self.superpose()

    iterpose.__doc__ = Ensemble.iterpose.__doc__

    def addCoordset(self, coords, weights=None, label=None, **kwargs):
        """Add coordinate set(s) to the ensemble.  *coords* must be a Numpy
        array with suitable shape and dimensionality, or an object with
        :meth:`getCoordsets`. *weights* is an optional argument.
        If provided, its length must match number of atoms.  Weights of
        missing (not resolved) atoms must be ``0`` and weights of those
        that are resolved can be anything greater than ``0``.  If not
        provided, weights of all atoms for this coordinate set will be
        set equal to ``1``. *label*, which may be a PDB identifier or a
        list of identifiers, is used to label conformations."""

        degeneracy = kwargs.pop('degeneracy', False)

        atoms = coords
        n_atoms = self._n_atoms
        n_select = self.numSelected()
        n_confs = self.numCoordsets()

        try:
            if degeneracy:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoords(selected=False)
                    elif hasattr(coords, '_getCoords'):
                        coords = coords._getCoords()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoords(selected=False)
                    elif hasattr(coords, 'getCoords'):
                        coords = coords.getCoords()
            else:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoordsets(selected=False)
                    elif hasattr(coords, '_getCoordsets'):
                        coords = coords._getCoordsets()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoordsets(selected=False)
                    elif hasattr(coords, 'getCoordsets'):
                        coords = coords.getCoordsets()

        except AttributeError:
            label = label or 'Unknown'

        else:
            if coords is None:
                raise ValueError('coordinates are not set')
            elif label is None and isinstance(atoms, Atomic):
                if not isinstance(atoms, AtomGroup):
                    ag = atoms.getAtomGroup()
                else:
                    ag = atoms
                label = ag.getTitle()
                if coords.shape[0] < ag.numCoordsets():
                    label += '_m' + str(atoms.getACSIndex())
            else:
                label = label or 'Unknown'

        # check coordinates
        try:
            checkCoords(coords, csets=True, natoms=n_atoms)
        except:
            try:
                checkCoords(coords, csets=True, natoms=n_select)
            except TypeError:
                raise TypeError('coords must be a numpy array or an object '
                                'with `getCoords` method')

        if coords.ndim == 2:
            n_nodes, _ = coords.shape
            coords = coords.reshape((1, n_nodes, 3))
            n_csets = 1
        else:
            n_csets, n_nodes, _ = coords.shape
            if degeneracy:
                coords = coords[:1]

        n_repeats = 1 if degeneracy else n_csets
       
        if not n_atoms:
            self._n_atoms = n_nodes

        if n_nodes == n_select and self.isSelected():
            full_coords = np.repeat(self._coords[np.newaxis, :, :], n_csets, axis=0)
            full_coords[:, self._indices, :] = coords
            coords = full_coords
        
        # check weights
        if weights is None:
            weights = np.ones((n_csets, n_atoms, 1), dtype=float)
        else:
            weights = checkWeights(weights, n_atoms, n_csets)

        if degeneracy:
            weights = weights[:1]

        # check sequences
        seqs = None
        sequence = kwargs.pop('sequence', None)
        if hasattr(atoms, 'getSequence'):
            if sequence is not None:
                LOGGER.warn('sequence is supplied though coords has getSequence')
            sequence = atoms.getSequence()
            seqs = [sequence for _ in range(n_repeats)]
        else:
            if sequence is None:
                try:
                    sequence = self.getAtoms().getSequence()
                except AttributeError:
                    if self._msa:
                        sequence = ''.join('X' for _ in range(n_atoms))
                    # sequence and seqs remains to be None if MSA has not been created
            if isinstance(sequence, Sequence):
                seqs = [str(sequence)]
            elif isinstance(sequence, MSA):
                seqs = [str(seq) for seq in sequence]
            elif np.isscalar(sequence):
                seqs = [sequence for _ in range(n_repeats)]
        
        if seqs:
            if len(seqs) != n_repeats:
                raise ValueError('the number of sequences should be either one or '
                                'that of coordsets')

        # assign new values
        # update labels
        if n_csets > 1 and not degeneracy:
            if isinstance(label, str):
                labels = ['{0}_m{1}'.format(label, i+1) for i in range(n_csets)]
            else:
                if len(label) != n_csets:
                    raise ValueError('length of label and number of '
                                        'coordinate sets must be the same')
                labels = label
        else:
            labels = [label] if np.isscalar(label) else label

        self._labels.extend(labels)

        # update sequences
        if seqs:
            msa = MSA(seqs, title=self.getTitle(), labels=labels)
            if self._msa is None:
                if n_confs > 0:
                    def_seqs = np.chararray((n_confs, n_atoms))
                    def_seqs[:] = 'X'

                    old_labels = [self._labels[i] for i in range(n_confs)]
                    self._msa = MSA(def_seqs, title=self.getTitle(), labels=old_labels)
                    self._msa.extend(msa)
                else:
                    self._msa = msa
            else:
                self._msa.extend(msa)

        # update coordinates
        if self._confs is None and self._weights is None:
            self._confs = coords
            self._weights = weights
            self._n_csets = n_repeats
            
        elif self._confs is not None and self._weights is not None:
            self._confs = np.concatenate((self._confs, coords), axis=0)
            self._weights = np.concatenate((self._weights, weights), axis=0)
            self._n_csets += n_repeats
        else:
            raise RuntimeError('_confs and _weights must be set or None at '
                               'the same time')

    def getMSA(self, indices=None, selected=True):
        """Returns an MSA of selected atoms."""

        selected = selected and self._indices is not None
        if self._msa is None:
            return None
        
        atom_indices = self._indices if selected else slice(None, None, None)
        indices = indices if indices is not None else slice(None, None, None)
        
        return self._msa[indices, atom_indices]

    def getLabels(self):
        """Returns identifiers of the conformations in the ensemble."""

        return list(self._labels)

    def getCoordsets(self, indices=None, selected=True):
        """Returns a copy of coordinate set(s) at given *indices* for selected
        atoms. *indices* may be an integer, a list of integers or **None**.
        **None** returns all coordinate sets.

        .. warning:: When there are atoms with weights equal to zero (0),
           their coordinates will be replaced with the coordinates of the
           ensemble reference coordinate set."""

        if self._confs is None:
            return None

        if indices is None:
            indices = slice(None)
        else:
            indices = np.array([indices]).flatten()
        coords = self._coords
        if self._indices is None or not selected:
            confs = self._confs[indices].copy()
            for i, w in enumerate(self._weights[indices]):
                which = w.flatten() == 0
                confs[i, which] = coords[which]
        else:
            selids = self._indices
            coords = coords[selids]
            confs = self._confs[indices, selids].copy()
            for i, w in enumerate(self._weights[indices]):
                which = w[selids].flatten() == 0
                confs[i, which] = coords[which]
        return confs

    _getCoordsets = getCoordsets

    def iterCoordsets(self):
        """Iterate over coordinate sets. A copy of each coordinate set for
        selected atoms is returned. Reference coordinates are not included."""

        conf = PDBConformation(self, 0)
        for i in range(self._n_csets):
            conf._index = i
            yield conf.getCoords()

    def delCoordset(self, index):
        """Delete a coordinate set from the ensemble."""

        Ensemble.delCoordset(self, index)
        if isinstance(index, Integral):
            index = [index]
        else:
            index = list(index)
        index.sort(reverse=True)
        for i in index:
            self._labels.pop(i)

        if self._msa is not None:
            rest = []
            for i in range(self._msa.numSequences()):
                if i not in index:
                    rest.append(i)
            self._msa = self._msa[rest]

    def getConformation(self, index):
        """Returns conformation at given index."""

        if self._confs is None:
            raise AttributeError('conformations are not set')
        if not isinstance(index, Integral):
            raise TypeError('index must be an integer')
        n_confs = self._n_csets
        if -n_confs <= index < n_confs:
            if index < 0:
                index = n_confs + index
            return PDBConformation(self, index)
        else:
            raise IndexError('conformation index out of range')

    def getMSFs(self):
        """Calculate and return mean square fluctuations (MSFs).
        Note that you might need to align the conformations using
        :meth:`superpose` or :meth:`iterpose` before calculating MSFs."""

        if self._confs is None:
            return
        indices = self._indices
        if indices is None:
            coords = self._coords
            confs = self._confs
            weights = self._weights > 0
        else:
            coords = self._coords[indices]
            confs = self._confs[:, indices]
            weights = self._weights[:, indices] > 0
        weightsum = weights.sum(0)
        mean = np.zeros(coords.shape)
        for i, conf in enumerate(confs):
            mean += conf * weights[i]
        mean /= weightsum
        ssqf = np.zeros(mean.shape)
        for i, conf in enumerate(confs):
            ssqf += ((conf - mean) * weights[i]) ** 2
        return ssqf.sum(1) / weightsum.flatten()

    def getRMSDs(self, pairwise=False):
        """Calculate and return root mean square deviations (RMSDs). Note that
        you might need to align the conformations using :meth:`superpose` or
        :meth:`iterpose` before calculating RMSDs.

        :arg pairwise: if **True** then it will return pairwise RMSDs 
            as an n-by-n matrix. n is the number of conformations.
        :type pairwise: bool
        """

        if self._confs is None or self._coords is None:
            return None

        indices = self._indices
        if indices is None:
            indices = np.arange(self._confs.shape[1])

        weights = self._weights[:, indices] if self._weights is not None else None
        if pairwise:
            n_confs = self.numConfs()
            RMSDs = np.zeros((n_confs, n_confs))
            for i in range(n_confs):
                for j in range(i+1, n_confs):
                    if weights is None:
                        w = None
                    else:
                        wi = weights[i]; wj = weights[j]
                        w = wi * wj
                    RMSDs[i, j] = RMSDs[j, i] = getRMSD(self._confs[i, indices], self._confs[j, indices], w)
        else:
            RMSDs = getRMSD(self._coords[indices], self._confs[:, indices], weights)

        return RMSDs

    def setWeights(self, weights):
        """Set atomic weights."""

        if self._n_atoms == 0:
            raise AttributeError('coordinates are not set')

        try:
            self._weights = checkWeights(weights, self._n_atoms, self._n_csets)
        except ValueError:
            weights = checkWeights(weights, self.numSelected(), self._n_csets)
            if not self._weights:
                self._weights = np.ones((self._n_csets, self._n_atoms, 1), dtype=float)
            self._weights[self._indices, :] = weights