Example #1
    def calcModes(self, n_modes=20, turbo=True):
        """Calculate principal (or essential) modes.  This method uses
        the :func:`scipy.linalg.eigh` or :func:`numpy.linalg.eigh` function
        to diagonalize the covariance matrix.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate,
            default is 20,
            if **None** or ``'all'`` is given, all modes will be calculated
        :type n_modes: int

        :arg turbo: when available, use a memory intensive but faster way to
            calculate modes, default is **True**
        :type turbo: bool"""

        if self._cov is None:
            raise ValueError('covariance matrix is not built or set')
        start = time.time()
        self._clear()
        if str(n_modes).lower() == 'all':
            n_modes = None

        values, vectors, _ = solveEig(self._cov,
                                      n_modes=n_modes,
                                      zeros=True,
                                      turbo=turbo,
                                      reverse=True)
        which = values > ZERO
        self._eigvals = values[which]
        self._array = vectors[:, which]
        self._vars = values
        self._n_modes = len(self._eigvals)
        LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format(
            self._n_modes,
            time.time() - start))
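
A minimal usage sketch (not part of the original listing; it assumes the method above is PCA.calcModes from prody.dynamics.pca and that network access is available for parsePDB):

from prody import parsePDB, PCA, Ensemble

ubi = parsePDB('2k39', subset='ca')   # NMR structure with multiple models
ens = Ensemble('2k39 CA ensemble')
ens.setCoords(ubi.getCoords())
ens.addCoordset(ubi.getCoordsets())
ens.iterpose()                        # superpose conformations before PCA

pca = PCA('ubiquitin')
pca.buildCovariance(ens)              # builds the covariance matrix calcModes needs
pca.calcModes(n_modes=10)             # keep the 10 largest non-zero modes
print(pca.getEigvals()[:3])
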
Example #2
    def _sample(self, conf):

        tmp = self._atoms.copy()
        tmp.setCoords(conf)
        cg = tmp[self._idx_cg]

        anm_cg = self._buildANM(cg)

        if not self._checkANM(anm_cg):
            return None

        anm_cg.calcModes(self._n_modes)

        anm_ex = self._extendModel(anm_cg, cg, tmp)
        ens_ex = sampleModes(anm_ex, atoms=tmp,
                             n_confs=self._n_confs,
                             rmsd=self._rmsd[self._cycle])
        coordsets = ens_ex.getCoordsets()

        if self._targeted:
            if self._parallel:
                with Pool(cpu_count()) as p:
                    pot_conf = p.map(self._multi_targeted_sim,
                                     [(conf, coords) for coords in coordsets])
            else:
                pot_conf = [self._multi_targeted_sim((conf, coords)) for coords in coordsets]

            pots, poses = list(zip(*pot_conf))

            idx = np.logical_not(np.isnan(pots))
            coordsets = np.array(poses)[idx]

            LOGGER.debug('%d/%d sets of coordinates were moved to the target' % (len(coordsets), len(poses)))

        return coordsets
Example #3
def calcHitTime(model, method='standard'):
    """Returns the hit and commute times between pairs of nodes calculated 
    based on a :class:`.NMA` object. 

    .. [CB95] Chennubhotla C., Bahar I. Signal Propagation in Proteins and Relation
       to Equilibrium Fluctuations. *PLoS Comput Biol* **2007** 3(9).

    :arg model: model to be used to calculate hit times
    :type model: :class:`.NMA`  

    :arg method: method to be used to calculate hit times. Available options are 
        ``"standard"`` or ``"kirchhoff"``. Default is ``"standard"``
    :type method: str

    :returns: (:class:`~numpy.ndarray`, :class:`~numpy.ndarray`)
    """

    try:
        K = model.getKirchhoff()
    except AttributeError:
        raise TypeError('model must be an NMA instance')

    if K is None:
        raise ValueError('model not built')
    
    method = method.lower()

    D = np.diag(K)
    A = np.diag(D) - K

    start = time.time()
    linalg = importLA()
    if method == 'standard':
        st = D / sum(D)

        P = np.dot(np.diag(D**(-1)), A)
        W = np.ones((len(st), 1)) * st.T
        Z = linalg.pinv(np.eye(P.shape[0], P.shape[1]) - P + W)

        H = np.ones((len(st), 1)) * np.diag(Z).T - Z
        H = H / W
        H = H.T

    elif method == 'kirchhoff':
        K_inv = linalg.pinv(K)
        sum_D = sum(D)

        T1 = (sum_D * np.ones((len(D),1)) * np.diag(K_inv)).T

        T2 = sum_D * K_inv
        T3_i = np.dot((np.ones((len(D),1)) * D), K_inv)

        H = T1 - T2 + T3_i - T3_i.T

    else:
        raise ValueError('method must be either "standard" or "kirchhoff"')

    C = H + H.T

    LOGGER.debug('Hit and commute times are calculated in  {0:.2f}s.'
                 .format(time.time()-start)) 
    return H, C
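
A minimal usage sketch (an illustration, not from the listing); it assumes calcHitTime as defined above is in scope and that any model exposing getKirchhoff(), such as a GNM, is acceptable:

from prody import parsePDB, GNM

ca = parsePDB('1ubi', subset='ca')
gnm = GNM('1ubi')
gnm.buildKirchhoff(ca)               # calcHitTime only needs the Kirchhoff matrix

H, C = calcHitTime(gnm, method='standard')   # hit times H, commute times C
print(H.shape, C.shape)
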
Example #4
File: goa.py Project: nffaruk/ProDy
def parseOBO(**kwargs):
    """Parse a GO OBO file containing the GO itself.
    See `OBO`_ for more information on the file format.

    .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html
    """
    try:
        from goatools import obo_parser
    except ImportError:
        raise ImportError('GOATools needs to be installed to use parseOBO')

    go_obo_url = kwargs.get('go_obo_url', None)
    if go_obo_url is None:
        go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'

    data_folder = kwargs.get('data_folder', None)
    if data_folder is None:
        data_folder = os.getcwd() + '/Data'

    # Make sure the data folder exists (error only if the path is a file)
    if not os.path.isfile(data_folder):
        # Emulate mkdir -p (no error if the folder already exists)
        try:
            os.mkdir(data_folder)
        except OSError as e:
            if (e.errno != 17):
                raise e
    else:
        raise Exception(
            'Data path (' + data_folder + ') exists as a file. '
            'Please rename, remove or change the desired location of the data path.'
        )

    # Check if the file exists already
    if (not os.path.isfile(data_folder + '/go-basic.obo')):
        try:
            handle = openURL(go_obo_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(
                go_obo_url, str(err)))
        else:
            data = handle.read()
            if len(data):
                filename = data_folder + '/go-basic.obo'

                with open(filename, 'w+b') as obofile:
                    obofile.write(data)

                LOGGER.debug('{0} downloaded ({1})'.format(
                    go_obo_url, sympath(filename)))
            else:
                LOGGER.warn(
                    '{0} download failed, reason unknown.'.format(go_obo_url))

    go_obo = data_folder + '/go-basic.obo'

    return obo_parser.GODag(go_obo)
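
A minimal usage sketch (assumptions: GOATools is installed and parseOBO above is in scope); the first call downloads go-basic.obo into the data folder:

go_dag = parseOBO(data_folder='./Data')

# GODag behaves like a dict mapping GO identifiers to term objects
term = go_dag.get('GO:0008150')      # biological_process
if term is not None:
    print(term.name, term.namespace)
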
Example #5
def buildReducedHessian(coords, system, cutoff=15., gamma=1.0, **kwargs):
    
    r0 = kwargs.pop('r0', 20.)
    dr = kwargs.pop('dr', 20.)
    labels = peelr(coords, system, r0, dr)
    LOGGER.debug('layers: ' + str(np.unique(labels)))

    H = calcHessianRecursion(coords, labels, 0, cutoff=cutoff, gamma=gamma, **kwargs)
    return H
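
A hypothetical call with placeholder data (not from the listing); it assumes peelr and calcHessianRecursion from this module are in scope and that the True entries of the boolean system mask come first in coords:

import numpy as np

coords = np.random.rand(200, 3) * 50.0   # placeholder coordinates
system = np.zeros(200, dtype=bool)
system[:50] = True                       # first 50 beads form the system

H_eff = buildReducedHessian(coords, system, cutoff=15., gamma=1.0)
print(H_eff.shape)                       # effective Hessian for the system beads
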
Example #6
    def performSVD(self, coordsets):
        """Calculate principal modes using singular value decomposition (SVD).
        *coordsets* argument may be a :class:`.Atomic`, :class:`.Ensemble`,
        or :class:`numpy.ndarray` instance.  If *coordsets* is a numpy array,
        its shape must be ``(n_csets, n_atoms, 3)``.  Note that coordinate
        sets must be aligned prior to SVD calculations.

        This is a considerably faster way of performing PCA calculations
        compared to eigenvalue decomposition of covariance matrix, but is
        an approximate method when heterogeneous datasets are analyzed.
        Covariance method should be preferred over this one for analysis of
        ensembles with missing atomic data.  See :ref:`pca-xray-calculations`
        example for comparison of results from SVD and covariance methods."""

        linalg = importLA()

        start = time.time()
        if not isinstance(coordsets, (Ensemble, Atomic, np.ndarray)):
            raise TypeError('coordsets must be an Ensemble, Atomic, or '
                            'Numpy array instance')
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3
                    or coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')
            deviations = coordsets - coordsets.mean(0)
        else:
            if isinstance(coordsets, Ensemble):
                deviations = coordsets.getDeviations()
            elif isinstance(coordsets, Atomic):
                deviations = (coordsets._getCoordsets() -
                              coordsets._getCoords())

        n_confs = deviations.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have at least 3 coordinate sets')
        n_atoms = deviations.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have at least 3 atoms')

        dof = n_atoms * 3
        deviations = deviations.reshape((n_confs, dof)).T

        vectors, values, self._temp = linalg.svd(deviations,
                                                 full_matrices=False)
        values = (values**2) / n_confs
        self._dof = dof
        self._n_atoms = n_atoms
        which = values > 1e-18
        self._eigvals = values[which]
        self._array = vectors[:, which]
        self._vars = self._eigvals
        self._trace = self._vars.sum()
        self._n_modes = len(self._eigvals)
        LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format(
            self._n_modes,
            time.time() - start))
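
A minimal usage sketch (assumption: the method above is PCA.performSVD); a plain (n_csets, n_atoms, 3) array of already aligned conformations is also accepted:

import numpy as np
from prody import PCA

coordsets = np.random.rand(10, 50, 3)    # 10 aligned conformations of 50 atoms (placeholder)
pca = PCA('toy ensemble')
pca.performSVD(coordsets)                # the mean structure is removed internally
print(pca.numModes(), pca.getVariances()[:3])
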
Example #7
File: goa.py Project: prody/ProDy
def parseOBO(**kwargs):
    """Parse a GO OBO file containing the GO itself.
    See `OBO`_ for more information on the file format.

    .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html
    """
    try:
        from goatools import obo_parser
    except ImportError:
        raise ImportError('GOATools needs to be installed to use parseOBO')

    go_obo_url = kwargs.get('go_obo_url', None)
    if go_obo_url is None:
        go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'

    data_folder = kwargs.get('data_folder', None)
    if data_folder is None:
        data_folder = os.getcwd() + '/Data'

    # Make sure the data folder exists (error only if the path is a file)
    if not os.path.isfile(data_folder):
        # Emulate mkdir -p (no error if the folder already exists)
        try:
            os.mkdir(data_folder)
        except OSError as e:
            if e.errno != 17:
                raise e
    else:
        raise Exception('Data path (' + data_folder + ') exists as a file. '
                        'Please rename, remove or change the desired location of the data path.')

    # Check if the file exists already
    if not os.path.isfile(data_folder + '/go-basic.obo'):
        try:
            handle = openURL(go_obo_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(
                go_obo_url, str(err)))
        else:
            data = handle.read()
            if len(data):
                filename = data_folder+'/go-basic.obo'

                with open(filename, 'w+b') as obofile:
                    obofile.write(data)

                LOGGER.debug('{0} downloaded ({1})'
                             .format(go_obo_url, sympath(filename)))
            else:
                LOGGER.warn('{0} download failed, reason unknown.'
                            .format(go_obo_url))

    go_obo = data_folder + '/go-basic.obo'

    return obo_parser.GODag(go_obo)
Example #8
def searchDali(pdbId, chainId, daliURL=None, subset='fullPDB', **kwargs):
    """Search Dali server with input of PDB ID and chain ID.
    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/
    
    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    
    """

    LOGGER.timeit('_dali')
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"
    pdbId = pdbId.lower()
    pdb_chain = pdbId + chainId
    parameters = {
        'cd1': pdb_chain,
        'method': 'search',
        'title': 'Title_' + pdb_chain,
        'address': ''
    }
    enc_params = urllib.urlencode(parameters).encode('utf-8')
    request = urllib2.Request(daliURL, enc_params)
    try_error = 3
    while try_error >= 0:
        try:
            url = urllib2.urlopen(request).url
            break
        except Exception:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(
                    2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                url = urllib2.urlopen(request).url
                break
    if url.split('.')[-1].lower() in ['html', 'php']:
        url = url.replace(url.split('/')[-1], '')
    LOGGER.debug(
        'Submitted Dali search for PDB and chain "{0} and {1}".'.format(
            pdbId, chainId))
    LOGGER.info(url)
    LOGGER.clear()
    obj = DaliRecord(url,
                     pdbId,
                     chainId,
                     subset=subset,
                     timeout=timeout,
                     **kwargs)

    return obj
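
A minimal usage sketch (not from the listing; it requires network access and assumes searchDali above is in scope):

dali_rec = searchDali('1p38', 'A', subset='fullPDB', timeout=120)
# a DaliRecord is returned right away; results become available once the
# server has finished the search
print(type(dali_rec).__name__)
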
Example #9
    def performSVD(self, coordsets):
        """Calculate principal modes using singular value decomposition (SVD).
        *coordsets* argument may be a :class:`.Atomic`, :class:`.Ensemble`,
        or :class:`numpy.ndarray` instance.  If *coordsets* is a numpy array,
        its shape must be ``(n_csets, n_atoms, 3)``.  Note that coordinate
        sets must be aligned prior to SVD calculations.

        This is a considerably faster way of performing PCA calculations
        compared to eigenvalue decomposition of covariance matrix, but is
        an approximate method when heterogeneous datasets are analyzed.
        Covariance method should be preferred over this one for analysis of
        ensembles with missing atomic data.  See :ref:`pca-xray-calculations`
        example for comparison of results from SVD and covariance methods."""

        linalg = importLA()

        start = time.time()
        if not isinstance(coordsets, (Ensemble, Atomic, np.ndarray)):
            raise TypeError('coordsets must be an Ensemble, Atomic, or '
                            'Numpy array instance')
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                    coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')
            deviations = coordsets - coordsets.mean(0)
        else:
            if isinstance(coordsets, Ensemble):
                deviations = coordsets.getDeviations()
            elif isinstance(coordsets, Atomic):
                deviations = (coordsets._getCoordsets() -
                              coordsets._getCoords())

        n_confs = deviations.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have at least 3 coordinate sets')
        n_atoms = deviations.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have at least 3 atoms')

        dof = n_atoms * 3
        deviations = deviations.reshape((n_confs, dof)).T

        vectors, values, self._temp = linalg.svd(deviations,
                                                 full_matrices=False)
        values = (values ** 2) / n_confs
        self._dof = dof
        self._n_atoms = n_atoms
        which = values > 1e-18
        self._eigvals = values[which]
        self._array = vectors[:, which]
        self._vars = self._eigvals
        self._trace = self._vars.sum()
        self._n_modes = len(self._eigvals)
        LOGGER.debug('{0} modes were calculated in {1:.2f}s.'
                     .format(self._n_modes, time.time()-start))
Example #10
File: norm.py Project: uibcdf/ProDy
def SCN(M, **kwargs):
    """ Performs Sequential Component Normalization on matrix *M*.
    
    .. [AC12] Cournac A, Marie-Nelly H, Marbouty M, Koszul R, Mozziconacci J. 
       Normalization of a chromosomal contact map. *BMC Genomics* **2012**. 
    """

    total_count = kwargs.pop('total_count', None)
    max_loops = kwargs.pop('max_loops', 100)
    tol = kwargs.pop('tol', 1e-5)

    N = M.copy()
    n = 0
    d0 = None
    p = 1
    last_p = None

    while True:
        C = np.diag(div0(1., np.sum(N, axis=0)))
        N = np.dot(N, C)

        R = np.diag(div0(1., np.sum(N, axis=1)))
        N = np.dot(R, N)

        n += 1

        # check convergence of symmetry
        d = np.mean(np.abs(N - N.T))

        if d0 is not None:
            p = div0(d, d0)
            dp = np.abs(p - last_p)
            if dp < tol:
                break
        else:
            d0 = d
        LOGGER.debug('Iteration {0}: d = {1}, p = {2}'.format(
            str(n), str(d), str(p)))
        last_p = p

        if max_loops is not None:
            if n >= max_loops:
                LOGGER.warn('The SCN algorithm did not converge after {0} '
                            'iterations.'.format(max_loops))
                break
    # guarantee symmetry
    N = (N + N.T) / 2.
    if total_count == 'original':
        total_count = np.sum(M)

    if total_count is not None:
        sum_N = np.sum(N)
        k = total_count / sum_N
        N = N * k
    return N
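
A small self-contained sketch (assuming div0 and LOGGER used by SCN are importable from this module); SCN alternately rescales columns and rows until the contact map is balanced:

import numpy as np

M = np.random.rand(20, 20)
M = M + M.T                              # toy symmetric contact map
N = SCN(M, total_count='original', max_loops=200, tol=1e-6)

print(np.allclose(N, N.T))               # True: symmetry is enforced at the end
print(np.isclose(N.sum(), M.sum()))      # True: total count rescaled to the original
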
Example #11
def _getEigvecs(modes, row_norm=False, dummy_mode=False):
    la = importLA()

    if isinstance(modes, (Mode, ModeSet, NMA)):
        model = modes._model
        if isinstance(model, MaskedGNM):
            masked = model.masked
            model.masked = True
            V = modes.getArray()
            model.masked = masked
        else:
            V = modes.getArray()
    elif isinstance(modes, np.ndarray):
        V = modes
    else:
        try:
            mode0 = modes[0]
            if isinstance(mode0, Mode):
                V = np.empty((len(mode0), 0))
                for mode in modes:
                    assert isinstance(mode, Mode), 'Modes should be a list of modes.'
                    v = mode.getEigvec()
                    v = np.expand_dims(v, axis=1)
                    V = np.hstack((V, v))
            else:
                V = np.array(modes)
        except TypeError:
            raise TypeError('Modes should be a list of modes.')
    if V.ndim == 1:
        V = np.expand_dims(V, axis=1)

    # add a dummy zero mode to the modeset
    if dummy_mode:
        v0 = V[:, 0]
        if np.allclose(v0, np.mean(v0)):
            dummy_mode = False
            LOGGER.warn('at least one zero mode is detected, therefore a '
                        'dummy mode will NOT be added')

    if dummy_mode:
        n, _ = V.shape
        v0 = np.ones((n, 1), dtype=V.dtype)
        v0 /= la.norm(v0)
        V = np.hstack((v0, V))
        LOGGER.debug('a dummy zero mode is added')

    # normalize the rows so that feature vectors are unit vectors
    if row_norm:
        norms = la.norm(V, axis=1)
        N = np.diag(div0(1., norms))
        V = np.dot(N, V)

    return V
Example #12
    def calcModes(self, n_modes=20, turbo=True):
        """Calculate principal (or essential) modes.  This method uses
        the :func:`scipy.linalg.eigh` or :func:`numpy.linalg.eigh` function
        to diagonalize the covariance matrix.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate,
            default is 20,
            if **None** or ``'all'`` is given, all modes will be calculated
        :type n_modes: int

        :arg turbo: when available, use a memory intensive but faster way to
            calculate modes, default is **True**
        :type turbo: bool"""

        linalg = importLA()
        if self._cov is None:
            raise ValueError('covariance matrix is not built or set')
        start = time.time()
        dof = self._dof
        self._clear()
        if str(n_modes).lower() == 'all':
            n_modes = None
        if linalg.__package__.startswith('scipy'):
            if n_modes is None:
                eigvals = None
                n_modes = dof
            else:
                n_modes = int(n_modes)
                if n_modes >= self._dof:
                    eigvals = None
                    n_modes = dof
                else:
                    eigvals = (dof - n_modes, dof - 1)
            values, vectors = linalg.eigh(self._cov,
                                          turbo=turbo,
                                          eigvals=eigvals)
        else:
            if n_modes is not None:
                LOGGER.info('Scipy is not found, all modes are calculated.')
            values, vectors = linalg.eigh(self._cov)
        # Order eigenvalues and eigenvectors in descending order
        revert = list(range(len(values) - 1, -1, -1))
        values = values[revert]
        vectors = vectors[:, revert]
        which = values > 1e-8
        self._eigvals = values[which]
        self._array = vectors[:, which]
        self._vars = self._eigvals
        self._n_modes = len(self._eigvals)
        LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format(
            self._n_modes,
            time.time() - start))
Example #13
def SCN(M, **kwargs):
    la = importLA()
    total_count = kwargs.pop('total_count', None)
    max_loops = kwargs.pop('max_loops', 100)
    tol = kwargs.pop('tol', 1e-5)

    N = M.copy()
    n = 0
    d0 = None
    p = 1
    last_p = None

    while True:
        C = np.diag(div0(1., np.sum(N, axis=0)))
        N = np.dot(N, C)

        R = np.diag(div0(1., np.sum(N, axis=1)))
        N = np.dot(R, N)

        n += 1

        # check convergence of symmetry
        d = np.mean(np.abs(N - N.T))

        if d0 is not None:
            p = div0(d, d0)
            dp = np.abs(p - last_p)
            if dp < tol:
                break
        else:
            d0 = d
        LOGGER.debug('Iteration {0}: d = {1}, p = {2}'.format(
            str(n), str(d), str(p)))
        last_p = p

        if max_loops is not None:
            if n >= max_loops:
                LOGGER.warn('The SCN algorithm did not converge after {0} '
                            'iterations.'.format(max_loops))
                break
    # guarantee symmetry
    N = (N + N.T) / 2.
    if total_count == 'original':
        total_count = np.sum(M)

    if total_count is not None:
        sum_N = np.sum(N)
        k = total_count / sum_N
        N = N * k
    return N
Example #14
    def calcModes(self, n_modes=20, turbo=True):
        """Calculate principal (or essential) modes.  This method uses
        the :func:`scipy.linalg.eigh` or :func:`numpy.linalg.eigh` function
        to diagonalize the covariance matrix.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate,
            default is 20,
            if **None** or ``'all'`` is given, all modes will be calculated
        :type n_modes: int

        :arg turbo: when available, use a memory intensive but faster way to
            calculate modes, default is **True**
        :type turbo: bool"""
        
        linalg = importLA()
        if self._cov is None:
            raise ValueError('covariance matrix is not built or set')
        start = time.time()
        dof = self._dof
        self._clear()
        if str(n_modes).lower() == 'all':
            n_modes = None
        if linalg.__package__.startswith('scipy'):
            if n_modes is None:
                eigvals = None
                n_modes = dof
            else:
                n_modes = int(n_modes)
                if n_modes >= self._dof:
                    eigvals = None
                    n_modes = dof
                else:
                    eigvals = (dof - n_modes, dof - 1)
            values, vectors = linalg.eigh(self._cov, turbo=turbo,
                                          eigvals=eigvals)
        else:
            if n_modes is not None:
                LOGGER.info('Scipy is not found, all modes are calculated.')
            values, vectors = linalg.eigh(self._cov)
        # Order eigenvalues and eigenvectors in descending order
        revert = list(range(len(values)-1, -1, -1))
        values = values[revert]
        vectors = vectors[:, revert]
        which = values > 1e-8
        self._eigvals = values[which]
        self._array = vectors[:, which]
        self._vars = self._eigvals
        self._n_modes = len(self._eigvals)
        LOGGER.debug('{0} modes were calculated in {1:.2f}s.'
                     .format(self._n_modes, time.time()-start))
Example #15
def SCN(M, **kwargs):
    la = importLA()
    total_count = kwargs.pop('total_count', None)
    max_loops = kwargs.pop('max_loops', 100)
    tol = kwargs.pop('tol', 1e-5)

    N = M.copy()
    n = 0
    d0 = None
    p = 1
    last_p = None

    while True:
        C = np.diag(div0(1., np.sum(N, axis=0)))
        N = np.dot(N, C)

        R = np.diag(div0(1., np.sum(N, axis=1)))
        N = np.dot(R, N)

        n += 1

        # check convergence of symmetry
        d = np.mean(np.abs(N - N.T))
        
        if d0 is not None:
            p = div0(d, d0)
            dp = np.abs(p - last_p)
            if dp < tol:
                break
        else:
            d0 = d
        LOGGER.debug('Iteration {0}: d = {1}, p = {2}'.format(str(n), str(d), str(p)))
        last_p = p
        
        if max_loops is not None:
            if n >= max_loops:
                LOGGER.warn('The SCN algorithm did not converge after {0} '
                            'iterations.'.format(max_loops))
                break
    # guarantee symmetry
    N = (N + N.T) / 2.
    if total_count == 'original':
        total_count = np.sum(M)

    if total_count is not None:
        sum_N = np.sum(N)
        k = total_count / sum_N
        N = N * k
    return N
Example #16
    def match(self):
        """Matches the modes across mode sets according the mode overlaps."""

        if self._modesets:
            start = time.time()
            self._modesets = matchModes(*self._modesets)
            LOGGER.debug(
                '{0} modes across {1} modesets were matched in {2:.2f}s.'.
                format(self.numModes(), self.numModeSets(),
                       time.time() - start))
        else:
            LOGGER.warn('Mode ensemble has no modesets')
        self._matched = True
        return
Example #17
File: gnm.py Project: brezal/ProDy
    def calcHitTime(self, method='Z'):

        if self._affinity is None:
            self._buildAffinity()

        start = time.time()
        linalg = importLA()
        if method == 'Z':

            D = self._diagonal
            A = self._affinity

            st = D / sum(D)

            P = np.dot(np.diag(D**(-1)), A)

            W = np.ones((len(st),1)) * st.T

            Z = linalg.pinv(np.eye(P.shape[0], P.shape[1]) - P + W)

            H = np.ones((len(st),1)) * np.diag(Z).T - Z
            H = H / W
            H = H.T

        elif method == 'K':

            K = self._kirchhoff
            D = self._diagonal

            K_inv = linalg.pinv(K)
            sum_D = sum(D)

            T1 = (sum_D * np.ones((len(D),1)) * np.diag(K_inv)).T

            T2 = sum_D * K_inv
            T3_i = np.dot((np.ones((len(D),1)) * D), K_inv)

            H = T1 - T2 + T3_i - T3_i.T

        self._hitTime = H
        self._commuteTime = H + H.T


        LOGGER.debug('Hit and commute times were calculated in {0:.2f}s.'
                     .format(time.time()-start))
Example #18
File: gnm.py Project: prody/ProDy
    def calcHitTime(self, method='Z'):

        if self._affinity is None:
            self._buildAffinity()

        start = time.time()
        linalg = importLA()
        if method == 'Z':

            D = self._diagonal
            A = self._affinity

            st = D / sum(D)

            P = np.dot(np.diag(D**(-1)), A)

            W = np.ones((len(st),1)) * st.T

            Z = linalg.pinv(np.eye(P.shape[0], P.shape[1]) - P + W)

            H = np.ones((len(st),1)) * np.diag(Z).T - Z
            H = H / W
            H = H.T

        elif method == 'K':

            K = self._kirchhoff
            D = self._diagonal

            K_inv = linalg.pinv(K)
            sum_D = sum(D)

            T1 = (sum_D * np.ones((len(D),1)) * np.diag(K_inv)).T

            T2 = sum_D * K_inv
            T3_i = np.dot((np.ones((len(D),1)) * D), K_inv)

            H = T1 - T2 + T3_i - T3_i.T

        self._hitTime = H
        self._commuteTime = H + H.T


        LOGGER.debug('Hit and commute times were calculated in {0:.2f}s.'
                     .format(time.time()-start))
Example #19
    def runManySteps(self, n_steps, **kwargs):
        LOGGER.timeit('_prody_runManySteps')
        n_start = self.numSteps
        while self.numSteps < n_start + n_steps:
            self.runStep(structA=self.structA, structB=self.structB, **kwargs)
            LOGGER.debug(
                'Total time so far is %.2f minutes' %
                ((time.time() - LOGGER._times['_prody_runManySteps']) / 60))
            converged = self.checkConvergence()
            if converged:
                # restore the original objects to their initial coordinates
                self.structA.setCoords(self.coordsA)
                self.structB.setCoords(self.coordsB)
                LOGGER.debug(
                    'Process completed in %.2f hours' %
                    ((time.time() - LOGGER._times['_prody_runManySteps']) / 3600))
                break
Example #20
def calcHessianRecursion(coords,
                         layers,
                         layer,
                         cutoff=15.,
                         gamma=1.0,
                         **kwargs):
    if layer == 0:
        LOGGER.debug('max layer: %d' % max(layers))
    LOGGER.debug('layer: %d' % layer)
    Hss, Hse = buildLayerHessian(coords,
                                 layers,
                                 layer,
                                 cutoff=cutoff,
                                 gamma=gamma,
                                 **kwargs)

    if Hse is None:  # last layer, Hee=Hss
        H = Hss
    else:
        Hee = calcHessianRecursion(coords,
                                   layers,
                                   layer + 1,
                                   cutoff=cutoff,
                                   gamma=gamma,
                                   **kwargs)
        Cee = inv(Hee)
        #H = Hss - Hse.dot(Cee.dot(Hse.T))
        #H = Hss - Hse @ Cee @ Hse.T
        if PY3K:
            H = Hss - Hse.__matmul__(Cee).__matmul__(Hse.T)
        else:
            H = Hss - Hse.dot(Cee.dot(Hse.T))
    LOGGER.debug('layer: %d finished' % layer)
    return H
Example #21
def calcKirchhoffRecursion(coords,
                           layers,
                           layer,
                           cutoff=15.,
                           gamma=1.0,
                           **kwargs):
    if layer == 0:
        LOGGER.debug('max layer: %d' % max(layers))
    LOGGER.debug('layer: %d' % layer)
    Gss, Gse = buildLayerKirchhoff(coords,
                                   layers,
                                   layer,
                                   cutoff=cutoff,
                                   gamma=gamma,
                                   **kwargs)

    if Gse is None:  # last layer, Gee=Gss
        G = Gss
    else:
        Gee = calcKirchhoffRecursion(coords,
                                     layers,
                                     layer + 1,
                                     cutoff=cutoff,
                                     gamma=gamma,
                                     **kwargs)
        Cee = inv(Gee)
        #G = Gss - Gse.dot(Cee.dot(Gse.T))
        #G = Gss - Gse @ Cee @ Gse.T
        if PY3K:
            G = Gss - Gse.__matmul__(Cee).__matmul__(Gse.T)
        else:
            G = Gss - Gse.dot(Cee.dot(Gse.T))
    LOGGER.debug('layer: %d finished' % layer)
    return G
Example #22
    def runManyStepsAlternating(self, n_steps, **kwargs):
        LOGGER.timeit('_prody_runManySteps')
        n_start = self.numSteps
        while self.numSteps < n_start + n_steps:
            n_modes = self.n_modes

            self.runStep(structA=self.structA,
                         structB=self.structB,
                         reduceSelA=self.reduceSelA,
                         reduceSelB=self.reduceSelB,
                         alignSelA=self.alignSelA,
                         alignSelB=self.alignSelB,
                         n_modes=n_modes,
                         **kwargs)
            LOGGER.debug(
                'Total time so far is %.2f minutes' %
                ((time.time() - LOGGER._times['_prody_runManySteps']) / 60))

            self.runStep(structA=self.structB,
                         structB=self.structA,
                         reduceSelA=self.reduceSelB,
                         reduceSelB=self.reduceSelA,
                         alignSelA=self.alignSelB,
                         alignSelB=self.alignSelA,
                         n_modes=n_modes,
                         **kwargs)
            LOGGER.debug(
                'Total time so far is %.2f minutes' %
                ((time.time() - LOGGER._times['_prody_runManySteps']) / 60))

            converged = self.checkConvergence()
            if converged:
                # restore the original objects to their initial coordinates
                self.structA.setCoords(self.coordsA)
                self.structB.setCoords(self.coordsB)
                LOGGER.debug(
                    'Process completed in %.2f hours' %
                    ((time.time() - LOGGER._times['_prody_runManySteps']) /
                     3600))
                break

        ensemble = Ensemble('combined trajectory')
        ensemble.setAtoms(self.structA)
        for coordset in self.ensembleA.getCoordsets():
            ensemble.addCoordset(coordset)
        for coordset in reversed(self.ensembleB.getCoordsets()):
            ensemble.addCoordset(coordset)

        if self.outputPDB:
            writePDB(self.filename, ensemble)

        if self.outputDCD:
            writeDCD(self.filename, ensemble)

        return
Example #23
def peelr(coords, system, r0=20., dr=20.):
    n_sys_atoms = int(system.sum())
    n_atoms = len(system)
    labels = np.zeros(n_atoms, dtype=int)

    # identify system beads
    sys_coords = coords[system, :2]
    sys_norms = norm(sys_coords, axis=1)
    sys_r = max(sys_norms)
    r0 += sys_r

    # label environment beads
    env_coords = coords[~system, :2]
    env_norms = norm(env_coords, axis=1)
    L = (env_norms - r0) // dr + 1
    L = np.clip(L, 0, None) + 1
    labels[n_sys_atoms:] = L

    uniq_labels = np.unique(labels)
    if len(uniq_labels) >= 3:
        uniq_labels.sort()
        lbl_last = uniq_labels[-1]
        lbl_2nd_last = uniq_labels[-2]

        n_last = np.sum(labels == lbl_last)
        n_2nd_last = np.sum(labels == lbl_2nd_last)

        if n_last < 0.2 * n_2nd_last:
            LOGGER.debug('edge nodes detected (%d/%d)' % (n_2nd_last, n_last))
            labels[labels == lbl_last] = lbl_2nd_last

    if len(uniq_labels) >= 3:
        uniq_labels.sort()
        lbl_first = uniq_labels[1]
        lbl_2nd = uniq_labels[2]

        n_first = np.sum(labels == lbl_first)
        n_2nd = np.sum(labels == lbl_2nd)

        if n_first < 0.2 * n_2nd:
            LOGGER.debug('inner nodes detected (%d/%d)' % (n_2nd, n_first))
            labels[labels == lbl_first] = lbl_2nd
    if not any(uniq_labels == 1):
        LOGGER.debug('no layer inside the system')
        for i in range(len(labels)):
            if labels[i] > 1:
                labels[i] -= 1

    uniq_labels = np.unique(labels)
    for i, label in enumerate(uniq_labels):
        labels[labels == label] = i

    return labels
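
A hypothetical call with placeholder coordinates (not from the listing); it assumes norm used by peelr comes from numpy.linalg and that the True entries of system are the leading rows of coords:

import numpy as np

coords = np.random.rand(500, 3) * 100.0
system = np.zeros(500, dtype=bool)
system[:100] = True

labels = peelr(coords, system, r0=20., dr=20.)
print(np.unique(labels))                 # 0 is the system layer, 1..k the shells
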
Example #24
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib.request.Request('http://hmmer.janelia.org/search/hmmscan', enc_params)

        url = urllib.request.urlopen(request).geturl() + '?output=xml'
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()
        
        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)
        matches = {}
        for child in root[0]:
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id] = {}
                matches[pfam_id]['accession'] = accession
                matches[pfam_id]['class'] = 'Domain'
                matches[pfam_id]['id'] = child.get('name')
                matches[pfam_id]['locations'] = {}
                matches[pfam_id]['locations']['ali_end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['ali_start'] = child[0].get('alisqfrom')
                matches[pfam_id]['locations']['bitscore'] = child[0].get('bitscore')
                matches[pfam_id]['locations']['end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue'] = child.get('evalue')
                matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end'] = child[0].get('alihmmto')
                matches[pfam_id]['locations']['hmm_start'] = child[0].get('alihmmfrom')
                matches[pfam_id]['locations']['significant'] = child[0].get('significant')
                matches[pfam_id]['locations']['start'] = child[0].get('alisqfrom')
                matches[pfam_id]['type'] = 'Pfam-A'
                return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
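
A minimal usage sketch (not from the listing; network access required). A PDB identifier plus chain, as described in the docstring, is mapped to a UniProt ID before the Pfam query:

matches = searchPfam('1mkpA', timeout=60)
if matches:
    for acc, info in matches.items():    # keys are Pfam accessions
        print(acc, sorted(info.keys()))
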
Example #25
    def _targeted_sim(self, coords0, coords1, tmdk=15., d_steps=100, n_max_steps=10000, ddtol=1e-3, n_conv=5):

        try:
            from openmm import CustomExternalForce
            from openmm.app import StateDataReporter
            from openmm.unit import nanometer, kelvin, angstrom, kilojoule_per_mole, MOLAR_GAS_CONSTANT_R
        except ImportError:
            raise ImportError('Please install PDBFixer and OpenMM 7.6 in order to use ClustENM.')

        tmdk *= kilojoule_per_mole/angstrom**2
        tmdk = tmdk.value_in_unit(kilojoule_per_mole/nanometer**2)

        # coords1_ca = coords1[self._idx_cg, :]
        pos1 = coords1 * angstrom
        # pos1_ca = pos1[self._idx_cg, :]

        force = CustomExternalForce('tmdk*((x-x0)^2+(y-y0)^2+(z-z0)^2)')
        force.addGlobalParameter('tmdk', 0.) 
        force.addPerParticleParameter('x0')
        force.addPerParticleParameter('y0')
        force.addPerParticleParameter('z0')
        force.setForceGroup(1)
        # for i, atm_idx in enumerate(self._idx_cg):
        #     pars = pos1_ca[i, :].value_in_unit(nanometer)
        #     force.addParticle(int(atm_idx), pars)

        n_atoms = coords0.shape[0]
        atom_indices = np.arange(n_atoms)
        for i, atm_idx in enumerate(atom_indices):
            pars = pos1[i, :].value_in_unit(nanometer)
            force.addParticle(int(atm_idx), pars)

        simulation = self._prep_sim([force])

        # automatic conversion into nanometer will be carried out.
        simulation.context.setPositions(coords0 * angstrom)

        dist = dist0 = calcRMSD(coords0, coords1)
        m_conv = 0
        n_steps = 0
        try:
            simulation.minimizeEnergy(tolerance=self._tolerance*kilojoule_per_mole,
                                      maxIterations=self._maxIterations)

            # update parameters
            while n_steps < n_max_steps:
                simulation.context.setParameter('tmdk', tmdk)
                force.updateParametersInContext(simulation.context)

                simulation.step(d_steps)
                n_steps += d_steps

                # evaluate distance to destination
                pos = simulation.context.getState(getPositions=True).getPositions(asNumpy=True).value_in_unit(angstrom)
                d = calcRMSD(pos, coords1)
                dd = np.abs(dist - d)

                if dd < ddtol:
                    m_conv += 1

                if m_conv >= n_conv:
                    break

                dist = d

            LOGGER.debug('RMSD: %4.2f -> %4.2f' % (dist0, dist))

            simulation.context.setParameter('tmdk', 0.0)
            simulation.minimizeEnergy(tolerance=self._tolerance*kilojoule_per_mole,
                                      maxIterations=self._maxIterations)

            pos = simulation.context.getState(getPositions=True).getPositions(asNumpy=True).value_in_unit(angstrom)
            pot = simulation.context.getState(getEnergy=True).getPotentialEnergy().value_in_unit(kilojoule_per_mole)

            return pot, pos

        except BaseException as be:
            LOGGER.warning('OpenMM exception: ' + be.__str__() + ' so the corresponding conformer will be discarded!')

            return np.nan, np.full_like(coords0, np.nan)
Example #26
File: gnm.py Project: sixpi/ProDy
    def buildKirchhoff(self, coords, cutoff=10., gamma=1., **kwargs):
        """Build Kirchhoff matrix for given coordinate set.

        :arg coords: a coordinate set or an object with ``getCoords`` method
        :type coords: :class:`numpy.ndarray` or :class:`.Atomic`

        :arg cutoff: cutoff distance (Å) for pairwise interactions,
            default is 10.0 Å, minimum is 4.0 Å
        :type cutoff: float

        :arg gamma: spring constant, default is 1.0
        :type gamma: float

        :arg sparse: elect to use sparse matrices, default is **False**. If
            Scipy is not found, :class:`ImportError` is raised.
        :type sparse: bool

        :arg kdtree: elect to use KDTree for building Kirchhoff matrix faster,
            default is **True**
        :type kdtree: bool


        Instances of :class:`Gamma` classes and custom functions are
        accepted as *gamma* argument.

        When Scipy is available, the user can choose to use sparse matrices
        for efficient memory usage at the cost of computation speed."""

        try:
            coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                      coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')

        cutoff, g, gamma = checkENMParameters(cutoff, gamma)
        self._reset()
        self._cutoff = cutoff
        self._gamma = g

        n_atoms = coords.shape[0]
        start = time.time()
        if kwargs.get('sparse', False):
            try:
                from scipy import sparse as scipy_sparse
            except ImportError:
                raise ImportError('failed to import scipy.sparse, which is '
                                  'required for sparse matrix calculations')
            kirchhoff = scipy_sparse.lil_matrix((n_atoms, n_atoms))
        else:
            kirchhoff = np.zeros((n_atoms, n_atoms), 'd')

        if kwargs.get('kdtree', True):
            kdtree = KDTree(coords)
            kdtree.search(cutoff)
            dist2 = kdtree.getDistances() ** 2
            r = 0
            for i, j in kdtree.getIndices():
                g = gamma(dist2[r], i, j)
                kirchhoff[i, j] = -g
                kirchhoff[j, i] = -g
                kirchhoff[i, i] = kirchhoff[i, i] + g
                kirchhoff[j, j] = kirchhoff[j, j] + g
                r += 1
        else:
            LOGGER.info('Using slower method for building the Kirchhoff.')
            cutoff2 = cutoff * cutoff
            mul = np.multiply
            for i in range(n_atoms):
                xyz_i = coords[i, :]
                i_p1 = i+1
                i2j = coords[i_p1:, :] - xyz_i
                mul(i2j, i2j, i2j)
                for j, dist2 in enumerate(i2j.sum(1)):
                    if dist2 > cutoff2:
                        continue
                    j += i_p1
                    g = gamma(dist2, i, j)
                    kirchhoff[i, j] = -g
                    kirchhoff[j, i] = -g
                    kirchhoff[i, i] = kirchhoff[i, i] + g
                    kirchhoff[j, j] = kirchhoff[j, j] + g

        LOGGER.debug('Kirchhoff was built in {0:.2f}s.'
                     .format(time.time()-start))
        self._kirchhoff = kirchhoff
        self._n_atoms = n_atoms
        self._dof = n_atoms
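
A minimal usage sketch (assumption: the method above is GNM.buildKirchhoff from prody.dynamics.gnm):

from prody import parsePDB, GNM

ca = parsePDB('1ubi', subset='ca')
gnm = GNM('1ubi')
gnm.buildKirchhoff(ca, cutoff=10., gamma=1.)   # KDTree-based build by default
gnm.calcModes(n_modes=20)
print(gnm.getKirchhoff().shape, gnm.numModes())
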
Example #27
def fetchPDBfromMirror(*pdb, **kwargs):
    """Returns path(s) to PDB (default), PDBML, or mmCIF file(s) for specified
    *pdb* identifier(s).  If a *folder* is specified, files will be copied
    into this folder.  If *compressed* is **False**, files will be decompressed.
    *format* argument can be used to get `PDBML <http://pdbml.pdb.org/>`_ and
    `mmCIF <http://mmcif.pdb.org/>`_ files: ``format='cif'`` will fetch an
    mmCIF file, and ``format='xml'`` will fetch a PDBML file.  If PDBML header
    file is desired, ``noatom=True`` argument will do the job."""

    mirror = pathPDBMirror()
    if mirror is None:
        raise IOError('no mirror path is set')

    try:
        mirror, mirror_format = mirror
    except ValueError:
        mirror_format = None

    format = str(kwargs.pop('format', 'pdb')).lower()

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    if format == 'pdb':
        ftp_divided = 'data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if bool(kwargs.pop('noatom', False)):
            ftp_divided = 'data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    else:
        if format:
            raise ValueError('{0} is not a recognized format'.format(
                repr(format)))
        else:
            raise ValueError('please specify a valid format')

    if mirror_format:
        if mirror_format.lower() != format:
            raise IOError('mirror contains only ' + mirror_format + ' files')
        ftp_divided = ''
    else:
        ftp_divided = join(*ftp_divided.split('/'))
    folder = kwargs.get('folder')
    compressed = kwargs.get('compressed', True)
    filenames = []
    append = filenames.append
    success = 0
    failure = 0
    for pdb in identifiers:
        if pdb is None:
            append(None)
            continue
        fn = join(mirror, ftp_divided, pdb[1:3], ftp_prefix + pdb + ftp_pdbext)
        if isfile(fn):
            if folder or not compressed:
                if compressed:
                    fn = copyFile(fn,
                                  join(folder or '.', pdb + extension + '.gz'))
                else:
                    fn = gunzip(fn, join(folder or '.', pdb + extension))
            append(normpath(fn))
            success += 1
        else:
            append(None)
            failure += 1

    if len(identifiers) == 1:
        fn = filenames[0]
        if success:
            LOGGER.debug('PDB file is found in the local mirror ({0}).'.format(
                sympath(fn)))
        return fn
    else:
        LOGGER.debug('PDB files found in the local mirror ({0} found, '
                     '{1} missed).'.format(success, failure))
        return filenames
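
A minimal usage sketch (the mirror path below is hypothetical, and both functions are assumed to be exported at the prody package level); a local mirror must be registered with pathPDBMirror before fetching:

from prody import pathPDBMirror, fetchPDBfromMirror

pathPDBMirror('/data/pdb_mirror')        # hypothetical path to an existing local mirror
fn = fetchPDBfromMirror('1ubi', format='pdb', compressed=False, folder='.')
print(fn)
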
Example #28
File: msa.py Project: njekin/ProDy
def refineMSA(msa, label=None, rowocc=None, seqid=None, colocc=None, **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    Refinements are applied in the order of the arguments.  If both *label*
    and *seqid* are specified, the sequence matching *label* will be kept in
    the refined :class:`.MSA` even if it is similar to some other sequence."""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None
    index = None
    if label is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            upper, lower = label.upper(), label.lower()
        except AttributeError:
            raise TypeError('label must be a string')

        if msa is None:
            raise TypeError('msa must be an MSA instance, '
                            'label cannot be used')

        index = msa.getIndex(label)
        if index is None:
            index = msa.getIndex(upper)
        if index is None:
            index = msa.getIndex(lower)

        chain = None
        if index is None and (len(label) == 4 or len(label) == 5):
            from prody import parsePDB
            try:
                structure, header = parsePDB(label[:4], header=True)
            except Exception as err:
                raise IOError('failed to parse header for {0} ({1})'
                              .format(label[:4], str(err)))

            chid = label[4:].upper()
            for poly in header['polymers']:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if index is None:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                        dbref.database, dbref.idcode,
                                        label[:4], poly.chid, str(msa)))
                            break
                    if index is None:
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                        dbref.database, dbref.accession,
                                        label[:4], poly.chid, str(msa)))
                            break
            if index is not None:
                chain = structure[poly.chid]

        if index is None:
            raise ValueError('label is not in msa, or msa is not indexed')
        try:
            len(index)
        except TypeError:
            pass
        else:
            raise ValueError('label {0} maps onto multiple sequences, '
                             'so cannot be used for refinement'.format(label))

        title.append('label=' + label)
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        LOGGER.report('Label refinement reduced number of columns from {0} to '
                      '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

        if chain is not None and not kwargs.get('keep', False):
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            from prody.proteins.compare import importBioPairwise2
            from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE
            from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY
            pw2 = importBioPairwise2()
            chseq = chain.getSequence()
            algn = pw2.align.localms(arr[index].tostring().upper(), chseq,
                                     MATCH_SCORE, MISMATCH_SCORE,
                                     GAP_PENALTY, GAP_EXT_PENALTY,
                                     one_alignment_only=1)
            torf = []
            for s, c in zip(*algn[0][:2]):
                if s == '-':
                    continue
                elif c != '-':
                    torf.append(True)
                else:
                    torf.append(False)
            torf = array(torf)
            tsum = torf.sum()
            assert tsum <= before, 'problem in mapping sequence to structure'
            if tsum < before:
                arr = arr.take(torf.nonzero()[0], 1)
                LOGGER.report('Structure refinement reduced number of '
                              'columns from {0} to {1} in %.2fs.'
                              .format(before, arr.shape[1]), '_refine')
            else:
                LOGGER.debug('All residues in the sequence are contained in '
                             'PDB structure {0}.'.format(label))

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report('Row occupancy refinement reduced number of rows from '
                      '{0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report('Sequence identity refinement reduced number of rows '
                      'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
                      '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report('Column occupancy refinement reduced number of columns '
                      'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
                      '_refine')

    if not title:
        raise ValueError('label, rowocc, seqid, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            from copy import copy
            labels = copy(msa._labels)
            mapping = copy(msa._mapping)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
            mapping = None
        return MSA(arr, title=msa.getTitle() + ' refined ({0})'
                   .format(', '.join(title)), labels=labels, mapping=mapping)
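
A minimal usage sketch for the function above; the alignment file name and the
sequence label are illustrative assumptions, not taken from the source:

from prody import parseMSA, refineMSA

msa = parseMSA('PF00074_full.sth')   # hypothetical Pfam alignment file
refined = refineMSA(msa, label='RNAS1_BOVIN', rowocc=0.8, seqid=0.98)
print(refined)                       # refined MSA with fewer rows and columns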
Ejemplo n.º 29
0
def matchChains(atoms1, atoms2, **kwargs):
    """Return pairs of chains matched based on sequence similarity.  Makes an
    all-to-all comparison of chains in *atoms1* and *atoms2*.  Chains are
    obtained from hierarchical views (:class:`.HierView`) of atom groups.
    This function returns a list of matches as tuples, each of which contains
    4 items:

      * matching chain from *atoms1* as a :class:`.AtomMap`
        instance,
      * matching chain from *atoms2* as a :class:`.AtomMap`
        instance,
      * percent sequence identity of the match,
      * percent sequence overlap of the match.

    The list of matches is sorted in order of decreasing percent sequence
    identity.
    :class:`.AtomMap` instances can be used to calculate RMSD values and
    superpose atom groups.

    :arg atoms1: atoms that contain a chain
    :type atoms1: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg atoms2: atoms that contain a chain
    :type atoms2: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :keyword subset: ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    If *subset* is set to *calpha* or *backbone*, only alpha carbon
    atoms or backbone atoms will be paired. If set to *all*, all atoms
    common to matched residues will be returned.

    This function tries to match chains based on residue numbers and names.
    All chains in *atoms1* are compared to all chains in *atoms2*.  This works
    well for different structures of the same protein.  When it fails,
    :mod:`Bio.pairwise2` is used for pairwise sequence alignment, and matching
    is performed based on the sequence alignment.  The *pwalign* keyword
    controls whether sequence alignment is performed.  If ``pwalign=True`` is
    passed, pairwise alignment is enforced."""

    if not isinstance(atoms1, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms1 must be an AtomGroup, Chain, or Selection')
    if not isinstance(atoms2, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms2 must be an AtomGroup, Chain, or Selection')

    subset = kwargs.get('subset', 'calpha')
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'
                         .format(str(subset)))
    seqid = kwargs.get('seqid', 90.)
    assert isinstance(seqid, (float, int)), 'seqid must be float'
    assert 0 < seqid <= 100, 'seqid must be in the range from 0 to 100'
    coverage = kwargs.get('overlap')
    if coverage is None:
        coverage = kwargs.get('coverage', 90.)
    assert isinstance(coverage, (float, int)), 'overlap must be float'
    assert 0 < coverage <= 100, 'overlap must be in the range from 0 to 100'
    pwalign = kwargs.get('pwalign', None)

    if isinstance(atoms1, Chain):
        chains1 = [atoms1]
        atoms1 = atoms1.getAtomGroup()
    else:
        chains1 = list(atoms1.getHierView().iterChains())
        if not isinstance(atoms1, AtomGroup):
            atoms1 = atoms1.getAtomGroup()
    chains = list()
    for ch in chains1:
        simpch = SimpleChain(ch)
        if len(simpch) > 0:
            chains.append(simpch)
    chains1 = chains
    if not isinstance(atoms1, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'
                     .format(str(atoms1), len(chains1)))

    if isinstance(atoms2, Chain):
        chains2 = [atoms2]
        atoms2 = atoms2.getAtomGroup()
    else:
        chains2 = list(atoms2.getHierView().iterChains())
        if not isinstance(atoms2, AtomGroup):
            atoms2 = atoms2.getAtomGroup()
    chains = list()
    for ch in chains2:
        simpch = SimpleChain(ch)
        if len(simpch) > 0:
            chains.append(simpch)
    chains2 = chains
    if not isinstance(atoms2, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'
                     .format(str(atoms2), len(chains2)))

    matches = []
    unmatched = []
    LOGGER.debug('Trying to match chains based on residue numbers and names:')
    for simpch1 in chains1:
        for simpch2 in chains2:
            LOGGER.debug('  Comparing {0} (len={1}) and {2} (len={3}):'
                         .format(simpch1.getTitle(), len(simpch1),
                                 simpch2.getTitle(), len(simpch2)))

            match1, match2, nmatches = getTrivialMatch(simpch1, simpch2)
            _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
            _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))

            if _seqid >= seqid and _cover >= coverage:
                LOGGER.debug('\tMatch: {0} residues match with {1:.0f}% '
                             'sequence identity and {2:.0f}% overlap.'
                             .format(len(match1), _seqid, _cover))
                matches.append((match1, match2, _seqid, _cover, simpch1, simpch2))
            else:
                LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                             'overlap={1:.0f}%).'.format(_seqid, _cover))
                unmatched.append((simpch1, simpch2))

    if pwalign or (not matches and (pwalign is None or pwalign)):
        pairwise2 = importBioPairwise2()
        if pairwise2:
            LOGGER.debug('Trying to match chains based on {0} sequence '
                         'alignment:'.format(ALIGNMENT_METHOD))
            for simpch1, simpch2 in unmatched:
                LOGGER.debug('  Comparing {0} (len={1}) and {2} '
                             '(len={3}):'
                             .format(simpch1.getTitle(), len(simpch1),
                                     simpch2.getTitle(), len(simpch2)))
                match1, match2, nmatches = getAlignedMatch(simpch1, simpch2)
                _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
                _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug('\tMatch: {0} residues match with {1:.0f}% '
                                 'sequence identity and {2:.0f}% overlap.'
                                 .format(len(match1), _seqid, _cover))
                    matches.append((match1, match2, _seqid, _cover,
                                    simpch1, simpch2))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'
                                 .format(_seqid, _cover))
        else:
            LOGGER.warning('Pairwise alignment could not be performed.')
    if not matches:
        return None
    if subset == 'calpha':
        subset = 'ca'
    elif subset == 'backbone':
        subset = 'bb'
    elif subset == 'heavy':
        subset = 'noh'
    for mi, result in enumerate(matches):
        match1, match2, _seqid, _cover, simpch1, simpch2 = result

        indices1 = []
        indices2 = []

        for i in range(len(match1)):
            ares = match1[i]
            bres = match2[i]

            if subset == 'ca':
                try:
                    aid = ares.getNames().tolist().index('CA')
                except ValueError:
                    aid = None
                try:
                    bid = bres.getNames().tolist().index('CA')
                    if aid is not None:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
                except ValueError:
                    pass
            elif subset == 'bb':
                for bban in ('N', 'CA', 'C', 'O'):
                    try:
                        aid = ares.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    else:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
            elif subset == 'noh':
                for han, aid, noh in zip(ares.getNames(), ares._indices,
                                         ares.getFlags('noh')):
                    if not noh:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(han)
                    except ValueError:
                        continue
                    else:
                        indices1.append(aid)
                        indices2.append(bres._indices[bid])
            elif subset is None or subset == 'all':
                aans = ares.getNames()
                bans = bres.getNames().tolist()

                aids = ares.getIndices()
                #bids = bres.getIndices()

                for j in range(len(aans)):
                    try:
                        bid = bres._indices[bans.index(aans[j])]
                        indices1.append(aids[j])
                        indices2.append(bid)
                    except ValueError:
                        pass

        indices1 = np.array(indices1, int)
        indices2 = np.array(indices2, int)

        match1 = AM(atoms1, indices1, atoms1.getACSIndex(),
                    title=simpch1.getTitle() + ' -> ' + simpch2.getTitle(),
                    intarrays=True)
        match2 = AM(atoms2, indices2, atoms2.getACSIndex(),
                    title=simpch2.getTitle() + ' -> ' + simpch1.getTitle(),
                    intarrays=True)

        matches[mi] = (match1, match2, _seqid, _cover)
    if len(matches) > 1:
        matches.sort(key=lambda match: match[2], reverse=True)
    return matches
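
A brief usage sketch for matchChains; the PDB identifiers are assumptions used
only for illustration:

from prody import parsePDB, matchChains, superpose, calcRMSD

ref = parsePDB('1p38')               # hypothetical reference structure
mob = parsePDB('1r39')               # hypothetical mobile structure
matches = matchChains(ref, mob, seqid=75, overlap=80)
if matches:
    ref_map, mob_map, seq_id, cover = matches[0]
    mob_map, transformation = superpose(mob_map, ref_map)
    print(calcRMSD(mob_map, ref_map))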
Ejemplo n.º 30
0
def refineMSA(msa,
              index=None,
              label=None,
              rowocc=None,
              seqid=None,
              colocc=None,
              **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg index: remove columns that are gaps in the sequence with that index
    :type index: int

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    Refinements are applied in the order of the arguments.  If both *label*
    and *seqid* are specified, the sequence matching *label* will be kept in
    the refined :class:`.MSA` even though it may be similar to some other
    sequence."""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None

    if index is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('index=' + str(index))
        LOGGER.report(
            'Index refinement reduced number of columns from {0} to '
            '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

    if label is not None:
        if index is not None:
            LOGGER.info('An index was provided so the label will be ignored.')

        else:
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            try:
                upper, lower = label.upper(), label.lower()
            except AttributeError:
                raise TypeError('label must be a string')

            if msa is None:
                raise TypeError('msa must be an MSA instance, '
                                'label cannot be used')

            index = msa.getIndex(label)
            if index is None:
                index = msa.getIndex(upper)
            if index is None:
                index = msa.getIndex(lower)

            chain = None
            if index is None and (len(label) == 4 or len(label) == 5):
                from prody import parsePDB
                try:
                    structure, header = parsePDB(label[:4], header=True)
                except Exception as err:
                    raise IOError(
                        'failed to parse header for {0} ({1})'.format(
                            label[:4], str(err)))

                chid = label[4:].upper()
                for poly in header['polymers']:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if index is None:
                            index = msa.getIndex(dbref.idcode)
                            if index is not None:
                                LOGGER.info('{0} idcode {1} for {2}{3} '
                                            'is found in chain {4}.'.format(
                                                dbref.database, dbref.idcode,
                                                label[:4], poly.chid,
                                                str(msa)))
                                break
                        if index is None:
                            index = msa.getIndex(dbref.accession)
                            if index is not None:
                                LOGGER.info('{0} accession {1} for {2}{3} '
                                            'is found in chain {4}.'.format(
                                                dbref.database,
                                                dbref.accession, label[:4],
                                                poly.chid, str(msa)))
                                break
                if index is not None:
                    chain = structure[poly.chid]

            if index is None:
                raise ValueError('label is not in msa, or msa is not indexed')
            try:
                len(index)
            except TypeError:
                pass
            else:
                raise ValueError(
                    'label {0} maps onto multiple sequences, '
                    'so cannot be used for refinement'.format(label))

            title.append('label=' + label)
            cols = char.isalpha(arr[index]).nonzero()[0]
            arr = arr.take(cols, 1)
            LOGGER.report(
                'Label refinement reduced number of columns from {0} to '
                '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

            if chain is not None and not kwargs.get('keep', False):
                before = arr.shape[1]
                LOGGER.timeit('_refine')
                from prody.proteins.compare import importBioPairwise2
                from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE
                from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY
                pw2 = importBioPairwise2()
                chseq = chain.getSequence()
                algn = pw2.align.localms(arr[index].tostring().upper(),
                                         chseq,
                                         MATCH_SCORE,
                                         MISMATCH_SCORE,
                                         GAP_PENALTY,
                                         GAP_EXT_PENALTY,
                                         one_alignment_only=1)
                torf = []
                for s, c in zip(*algn[0][:2]):
                    if s == '-':
                        continue
                    elif c != '-':
                        torf.append(True)
                    else:
                        torf.append(False)
                torf = array(torf)
                tsum = torf.sum()
                assert tsum <= before, 'problem in mapping sequence to structure'
                if tsum < before:
                    arr = arr.take(torf.nonzero()[0], 1)
                    LOGGER.report(
                        'Structure refinement reduced number of '
                        'columns from {0} to {1} in %.2fs.'.format(
                            before, arr.shape[1]), '_refine')
                else:
                    LOGGER.debug(
                        'All residues in the sequence are contained in '
                        'PDB structure {0}.'.format(label))

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report(
            'Row occupancy refinement reduced number of rows from '
            '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report(
            'Sequence identity refinement reduced number of rows '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
            '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report(
            'Column occupancy refinement reduced number of columns '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
            '_refine')

    if not title:
        raise ValueError(
            'label, index, seqid, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            from copy import copy
            labels = copy(msa._labels)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
        return MSA(arr,
                   title=msa.getTitle() +
                   ' refined ({0})'.format(', '.join(title)),
                   labels=labels)
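
This variant also accepts an *index* argument; a small sketch, again using an
assumed alignment file name:

from prody import parseMSA, refineMSA

msa = parseMSA('PF00069_full.sth')   # hypothetical Pfam alignment file
refined = refineMSA(msa, index=0, rowocc=0.8, colocc=0.9)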
Ejemplo n.º 31
0
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain without gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = { 'hmmdb' : 'pfam', 'seq': fseq }
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request('https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        #res_params = { 'output' : 'xml' }
        res_params = { 'format' : 'tsv' }
        enc_res_params = urllib.urlencode(res_params)
        #modified_res_url = results_url + '?' + enc_res_params
        modified_res_url = results_url.replace('results','download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url) 
        # url = ( urllib2.urlopen(request).geturl() + '?output=xml') 
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        #xml = urllib2.urlopen(result_request).read()
        tsv = urllib2.urlopen(result_request).read()
        # openURL(url, timeout=timeout).read()
        
        # try:
        #     root = ET.XML(xml)
        # except Exception as err:
        #     raise ValueError('failed to parse results XML, check URL: ' + modified_res_url)

        matches = {}
        #for child in root[0]:
            #if child.tag == 'hits':
                # accession = child.get('acc')
                # pfam_id = accession.split('.')[0]
                # matches[pfam_id]={}
                # matches[pfam_id]['accession']=accession
                # matches[pfam_id]['class']='Domain'
                # matches[pfam_id]['id']=child.get('name')
                # matches[pfam_id]['locations']={}
                # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto')
                # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom')
                # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore')
                # matches[pfam_id]['locations']['end']=child[0].get('alisqto')
                # matches[pfam_id]['locations']['evalue']=child.get('evalue')
                # matches[pfam_id]['locations']['evidence']='hmmer v3.0'
                # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto')
                # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom')
                # matches[pfam_id]['locations']['significant']=child[0].get('significant')    
                # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom')
                # matches[pfam_id]['type']='Pfam-A'
        # return matches

        if PY3K:  # decode bytes under Python 3; PY3K flag assumed available, as in the later copy
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id]={}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']   
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
                polymers = []  # fall back to querying with *seq* directly
            else:
                chid = seq[4:].upper()

            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND','RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' + seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
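
A small sketch of querying with a UniProt ID and inspecting the results; the
accession used here is an assumed example:

from prody import searchPfam

matches = searchPfam('P19491')       # hypothetical UniProt accession
if matches:
    for accession, data in matches.items():
        print(accession, data.get('id'), data.get('locations'))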
Ejemplo n.º 32
0
File: pfam.py Project: uibcdf/ProDy
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain without gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        #res_params = { 'output' : 'xml' }
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        #modified_res_url = results_url + '?' + enc_res_params
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        # url = ( urllib2.urlopen(request).geturl() + '?output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            #xml = urllib2.urlopen(result_request).read()
            tsv = urllib2.urlopen(result_request).read()
            # openURL(url, timeout=timeout).read()
        except:
            raise ValueError('No matching Pfam domains were found.')

        # try:
        #     root = ET.XML(xml)
        # except Exception as err:
        #     raise ValueError('failed to parse results XML, check URL: ' + modified_res_url)

        matches = {}
        #for child in root[0]:
        #if child.tag == 'hits':
        # accession = child.get('acc')
        # pfam_id = accession.split('.')[0]
        # matches[pfam_id]={}
        # matches[pfam_id]['accession']=accession
        # matches[pfam_id]['class']='Domain'
        # matches[pfam_id]['id']=child.get('name')
        # matches[pfam_id]['locations']={}
        # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto')
        # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom')
        # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore')
        # matches[pfam_id]['locations']['end']=child[0].get('alisqto')
        # matches[pfam_id]['locations']['evalue']=child.get('evalue')
        # matches[pfam_id]['locations']['evidence']='hmmer v3.0'
        # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto')
        # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom')
        # matches[pfam_id]['locations']['significant']=child[0].get('significant')
        # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom')
        # matches[pfam_id]['type']='Pfam-A'
        # return matches

        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child[
                'Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'.format(
                    seq[:4], str(err)))
                polymers = []  # fall back to querying with *seq* directly
            else:
                chid = seq[4:].upper()

            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'.format(
                                    idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND', 'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
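
As the docstring notes, a PDB identifier with a chain ID may also be used as
the query; a minimal sketch:

from prody import searchPfam

matches = searchPfam('1mkpA')        # PDB ID plus chain, as in the docstring
if matches:
    print(sorted(matches))           # Pfam accessions found for this chain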
Ejemplo n.º 33
0
def mapOntoChain(atoms, chain, **kwargs):
    """Map *atoms* onto *chain*.  This function returns a list of mappings.
    Each mapping is a tuple that contains 4 items:

      * Mapped chain as an :class:`.AtomMap` instance,
      * *chain* as an :class:`.AtomMap` instance,
      * Percent sequence identity,
      * Percent sequence overlap

    Mappings are returned in decreasing percent sequence identity order.
    :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms
    in place of unmapped atoms.

    :arg atoms: atoms that will be mapped to the target *chain*
    :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg chain: chain to which atoms will be mapped
    :type chain: :class:`.Chain`

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is **90** if sequence alignment is 
        performed, otherwise **0**
    :type seqid: float

    :keyword overlap: percent overlap, default is **70**
    :type overlap: float

    :keyword mapping: if ``"ce"`` or ``"cealign"``, then the CE algorithm [IS98]_ will be 
        performed. It can also be a list of prealigned sequences, a :class:`.MSA` instance,
        or a dict of indices such as that derived from a :class:`.DaliRecord`.
        If set to anything other than the options listed above, including the default value
        (**None**), a simple mapping will be attempted first and, if that fails,
        sequence alignment with a function from :mod:`~Bio.pairwise2` will be used
        unless *pwalign* is set to **False**, in which case the mapping will fail.
    :type mapping: list, str

    :keyword pwalign: if **True**, then pairwise sequence alignment will 
        be performed. If **False** then a simple mapping will be performed 
        based on residue numbers (as well as insertion codes). This will be 
        overridden by the *mapping* keyword's value. 
    :type pwalign: bool

    This function tries to map *atoms* to *chain* based on residue
    numbers and types. Each individual chain in *atoms* is compared to
    target *chain*.
    
    .. [IS98] Shindyalov IN, Bourne PE. Protein structure alignment by 
       incremental combinatorial extension (CE) of the optimal path. 
       *Protein engineering* **1998** 11(9):739-47.
    """

    if not isinstance(atoms, (AtomGroup, AtomSubset)):
        raise TypeError('atoms must be an AtomGroup or an AtomSubset instance')
    if not isinstance(chain, Chain):
        raise TypeError('chain must be a Chain instance')

    subset = str(kwargs.get('subset', 'calpha')).lower()
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'.format(
            str(subset)))
    seqid = kwargs.get('seqid', 90.)
    coverage = kwargs.get('overlap', 70.)
    coverage = kwargs.get('coverage', coverage)
    pwalign = kwargs.get('pwalign', None)
    pwalign = kwargs.get('mapping', pwalign)
    alignment = None
    if pwalign is not None:
        if isinstance(pwalign, basestring):
            pwalign = str(pwalign).strip().lower()
        elif not isinstance(pwalign, bool):
            alignment = pwalign
            pwalign = True

    if subset != 'all':
        chid = chain.getChid()
        segname = chain.getSegname()
        chain_subset = chain.select(subset)
        target_chain = chain_subset.getHierView()[segname, chid]

        mobile = atoms.select(subset)
    else:
        target_chain = chain
        mobile = atoms

    if isinstance(mobile, Chain):
        chains = [mobile]
        map_ag = mobile.getAtomGroup()
    else:
        if isinstance(mobile, AtomGroup):
            map_ag = mobile
        else:
            map_ag = mobile.getAtomGroup()
        chains = list(mobile.getHierView().iterChains())
        LOGGER.debug('Evaluating {0}: {1} chains are identified'.format(
            str(atoms), len(chains)))

    mappings = []
    unmapped = []
    unmapped_chids = []
    target_ag = target_chain.getAtomGroup()
    simple_target = SimpleChain(target_chain, False)
    LOGGER.debug('Trying to map atoms based on residue numbers and '
                 'identities:')
    for chain in chains:
        simple_chain = SimpleChain(chain, False)
        if len(simple_chain) == 0:
            LOGGER.debug('  Skipping {0}, which does not contain any amino '
                         'acid residues.'.format(simple_chain))
            continue
        LOGGER.debug('  Comparing {0} (len={1}) with {2}:'.format(
            simple_chain.getTitle(), len(simple_chain),
            simple_target.getTitle()))

        # trivial mapping serves as a first, simple attempt to align the two
        # sequences based on residue numbers, so the sequence identity
        # (TRIVIAL_SEQID) criterion is strict.
        _seqid = _cover = -1
        target_list, chain_list, n_match, n_mapped = getTrivialMapping(
            simple_target, simple_chain)
        if n_mapped > 0:
            _seqid = n_match * 100 / n_mapped
            _cover = n_mapped * 100 / max(len(simple_target),
                                          len(simple_chain))

        trivial_seqid = TRIVIAL_SEQID if pwalign else seqid
        trivial_cover = TRIVIAL_COVERAGE if pwalign else coverage
        if _seqid >= trivial_seqid and _cover >= trivial_cover:
            LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% '
                         'sequence identity and {2:.0f}% overlap.'.format(
                             n_mapped, _seqid, _cover))
            mappings.append((target_list, chain_list, _seqid, _cover))
        else:
            if not pwalign:
                LOGGER.debug(
                    '\tFailed to match chains based on residue numbers '
                    '(seqid={0:.0f}%, overlap={1:.0f}%).'.format(
                        _seqid, _cover))
            unmapped.append(simple_chain)
            unmapped_chids.append(chain.getChid())

    if not mappings and pwalign is None:
        pwalign = True

    if pwalign and unmapped:
        if alignment is None:
            if pwalign in ['ce', 'cealign']:
                aln_type = 'structure alignment'
                method = 'CE'
                if 'seqid' not in kwargs:
                    seqid = 0.
            else:
                aln_type = 'sequence alignment'
                method = ALIGNMENT_METHOD
        else:
            aln_type = 'alignment'
            method = 'predefined'
            if 'seqid' not in kwargs:
                seqid = 0.

        LOGGER.debug('Trying to map atoms based on {0} {1}:'.format(
            method, aln_type))

        for chid, simple_chain in zip(unmapped_chids, unmapped):
            LOGGER.debug('  Comparing {0} (len={1}) with {2}:'.format(
                simple_chain.getTitle(), len(simple_chain),
                simple_target.getTitle()))
            if method == 'CE':
                result = getCEAlignMapping(simple_target, simple_chain)
            else:
                if isinstance(alignment, dict):
                    result = getDictMapping(simple_target,
                                            simple_chain,
                                            map_dict=alignment)
                else:
                    result = getAlignedMapping(simple_target,
                                               simple_chain,
                                               alignment=alignment)

            if result is not None:
                target_list, chain_list, n_match, n_mapped = result
                if n_mapped > 0:
                    _seqid = n_match * 100 / n_mapped
                    _cover = n_mapped * 100 / max(len(simple_target),
                                                  len(simple_chain))
                else:
                    _seqid = 0
                    _cover = 0
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug(
                        '\tMapped: {0} residues match with {1:.0f}%'
                        ' sequence identity and {2:.0f}% overlap.'.format(
                            n_mapped, _seqid, _cover))
                    mappings.append((target_list, chain_list, _seqid, _cover))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'.format(_seqid, _cover))

    for mi, result in enumerate(mappings):
        residues_target, residues_chain, _seqid, _cover = result
        indices_target = []
        indices_chain = []
        indices_mapping = []
        indices_dummies = []
        counter = 0
        for i in range(len(residues_target)):
            res_tar = residues_target[i]
            res_chn = residues_chain[i]

            for atom_tar in res_tar:
                indices_target.append(atom_tar.getIndex())
                if res_chn is not None:
                    atom_chn = res_chn.getAtom(atom_tar.getName())
                    if atom_chn is not None:
                        indices_chain.append(atom_chn.getIndex())
                        indices_mapping.append(counter)
                    else:
                        indices_dummies.append(counter)
                else:
                    indices_dummies.append(counter)
                counter += 1
        #n_atoms = len(indices_target)

        ch_tar = next((r for r in residues_target if r is not None)).getChain()
        ch_chn = next((r for r in residues_chain if r is not None)).getChain()
        title_tar = 'Chain {0} from {1}'.format(
            ch_tar.getChid(),
            ch_tar.getAtomGroup().getTitle())
        title_chn = 'Chain {0} from {1}'.format(
            ch_chn.getChid(),
            ch_chn.getAtomGroup().getTitle())

        # note that chain here is from atoms
        atommap = AM(map_ag,
                     indices_chain,
                     chain.getACSIndex(),
                     mapping=indices_mapping,
                     dummies=indices_dummies,
                     title=title_chn + ' -> ' + title_tar)
        selection = AM(target_ag,
                       indices_target,
                       target_chain.getACSIndex(),
                       title=title_tar + ' -> ' + title_chn,
                       intarrays=True)

        mappings[mi] = (atommap, selection, _seqid, _cover)
    if len(mappings) > 1:
        mappings.sort(key=lambda m: m[-2:], reverse=True)
    return mappings
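
A short usage sketch; the PDB identifiers and chain ID are illustrative
assumptions:

from prody import parsePDB, mapOntoChain

target = parsePDB('1p38')            # hypothetical target structure
mobile = parsePDB('1r39')            # hypothetical structure to be mapped
target_chain = target.getHierView()['A']
mappings = mapOntoChain(mobile, target_chain, seqid=75, overlap=70)
if mappings:
    atommap, selection, seq_id, cover = mappings[0]
    print(atommap)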
Ejemplo n.º 34
0
def fetchPDBviaFTP(*pdb, **kwargs):
    """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified *pdb*
    identifier(s) and return path(s).  Downloaded files will be stored in
    local PDB folder, if one is set using :meth:`.pathPDBFolder`, and copied
    into *folder*, if specified by the user.  If no destination folder is
    specified, files will be saved in the current working directory.  If
    *compressed* is **False**, decompressed files will be copied into
    *folder*.  *format* keyword argument can be used to retrieve
    `PDBML <http://pdbml.pdb.org/>`_, `mmCIF <http://mmcif.pdb.org/>`_
    and `EMD <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_
    files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` will fetch an EMD file,
    and ``format='xml'`` will fetch a PDBML file. 
    If PDBML header file is desired, ``noatom=True`` argument will do the job."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    format = str(kwargs.pop('format', 'pdb')).lower()
    noatom = bool(kwargs.pop('noatom', False))

    if format == 'pdb':
        ftp_divided = 'pdb/data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if noatom:
            ftp_divided = 'pdb/data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'pdb/data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'pdb/data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    elif format == 'emd' or format == 'map':
        ftp_divided = 'emdb/structures'
        ftp_pdbext = '.map.gz'
        ftp_prefix = 'emd_'
        extension = '.map'
    else:
        raise ValueError(repr(format) + ' is not a valid format')

    local_folder = pathPDBFolder()

    if format == 'pdb' and local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                            join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                            join(output_folder, pdb + extension))

    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))


    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for pdb in identifiers:
            if pdb is None:
                filenames.append(None)
                continue
            data = []
            ftp_fn = ftp_prefix + pdb + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                if format == 'emd':
                    ftp.cwd('EMD-{0}/map'.format(pdb))
                else:
                    ftp.cwd(pdb[1:3])
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(pdb, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, pdb, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = getPath(pdb)

                    with open(filename, 'w+b') as pdbfile:
                        for block in data:
                            pdbfile.write(block)

                    filename = normpath(relpath(second(filename, pdb)))
                    LOGGER.debug('{0} downloaded ({1})'
                                 .format(pdb, sympath(filename)))
                    success += 1
                    filenames.append(filename)
                else:
                    LOGGER.warn('{0} download failed, reason unknown.'
                                .format(pdb))
                    failure += 1
                    filenames.append(None)

        ftp.quit()

    LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
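A usage sketch for fetchPDBviaFTP, assuming ProDy is installed, the function is importable from the top-level prody package, and a wwPDB FTP server is reachable; the identifiers are arbitrary examples:

from prody import fetchPDBviaFTP

filename = fetchPDBviaFTP('1ubi')                               # gzipped PDB file
cif_file = fetchPDBviaFTP('1ubi', format='cif')                 # mmCIF instead of PDB
xml_header = fetchPDBviaFTP('1ubi', format='xml', noatom=True)  # PDBML header only
file_list = fetchPDBviaFTP('1ubi', '2k39', compressed=False)    # several IDs -> list of paths
print(filename, cif_file, xml_header, file_list)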
Example #35
0
File: pfam.py Project: njekin/ProDy
def searchPfam(query, search_b=False, skip_a=False, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.sanger.ac.uk/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        urlextension = ''
        if kwargs:
            ga = int(kwargs.get('ga', 1))
            if not (ga == 1 or ga == 0):
                raise ValueError('ga must be either 0 or 1')

            evalue = kwargs.get('evalue', None)
            if evalue:
                if not float(evalue) <= 10.0:
                    raise ValueError('evalue must be a valid float < 10.0')
                urlextension = urlextension + '&evalue=' + str(evalue)
            else:
                urlextension = urlextension + '&ga=' + str(ga)

        search_b = int(bool(search_b))
        skip_a = int(bool(skip_a))
        if skip_a == 1:
            search_b = 1

        urlextension = urlextension + '&searchBs=' + str(search_b)
        urlextension = urlextension + '&skipAs=' + str(skip_a)

        url = ('http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) +
               urlextension + '&output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        try:
            url = dictElement(root[0], prefix)['result_url']
        except (IndexError, KeyError):
            raise ValueError('failed to parse results XML, check URL: ' + url)

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.sanger.ac.uk/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
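A usage sketch for searchPfam, assuming it is importable from the top-level prody package and that the Pfam service hard-coded above (pfam.sanger.ac.uk, long since retired) is still reachable; the query values are illustrative:

from prody import searchPfam

matches = searchPfam('PIWI_ARCFU')          # UniProt ID (illustrative)
if matches:
    for accession, info in matches.items():
        print(accession, info.get('id'), info.get('locations'))

matches_pdb = searchPfam('1mkpA')           # PDB ID plus chain identifier also works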
Example #36
0
def mapOntoChain(atoms, chain, **kwargs):
    """Map *atoms* onto *chain*.  This function returns a list of mappings.
    Each mapping is a tuple that contains 4 items:

      * Mapped chain as an :class:`.AtomMap` instance,
      * *chain* as an :class:`.AtomMap` instance,
      * Percent sequence identity,
      * Percent sequence overlap

    Mappings are returned in decreasing percent sequence identity order.
    :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms
    in place of unmapped atoms.

    :arg atoms: atoms that will be mapped to the target *chain*
    :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg chain: chain to which atoms will be mapped
    :type chain: :class:`.Chain`

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is **90** if sequence alignment is 
        performed, otherwise **0**
    :type seqid: float

    :keyword overlap: percent overlap, default is **70**
    :type overlap: float

    :keyword mapping: if ``"ce"`` or ``"cealign"``, then the CE algorithm [IS98]_ will be 
        performed. It can also be a list of prealigned sequences, a :class:`.MSA` instance,
        or a dict of indices such as that derived from a :class:`.DaliRecord`.
        If set to anything other than the options listed above, including the default value 
        (**None**), a simple mapping will be attempted first and, if that fails, 
        sequence alignment with a function from :mod:`~Bio.pairwise2` will be used 
        unless *pwalign* is set to **False**, in which case the mapping will fail.
    :type mapping: list, str

    :keyword pwalign: if **True**, then pairwise sequence alignment will 
        be performed. If **False** then a simple mapping will be performed 
        based on residue numbers (as well as insertion codes). This will be 
        overridden by the *mapping* keyword's value. 
    :type pwalign: bool

    This function tries to map *atoms* to *chain* based on residue
    numbers and types. Each individual chain in *atoms* is compared to
    target *chain*.
    
    .. [IS98] Shindyalov IN, Bourne PE. Protein structure alignment by 
       incremental combinatorial extension (CE) of the optimal path. 
       *Protein engineering* **1998** 11(9):739-47.
    """

    if not isinstance(atoms, (AtomGroup, AtomSubset)):
        raise TypeError('atoms must be an AtomGroup or an AtomSubset instance')
    if not isinstance(chain, Chain):
        raise TypeError('chain must be a Chain instance')

    subset = str(kwargs.get('subset', 'calpha')).lower()
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'
                         .format(str(subset)))
    seqid = kwargs.get('seqid', 90.) 
    coverage = kwargs.get('overlap', 70.)
    coverage = kwargs.get('coverage', coverage) 
    pwalign = kwargs.get('pwalign', None)
    pwalign = kwargs.get('mapping', pwalign)
    alignment = None
    if pwalign is not None:
        if isinstance(pwalign, basestring):
            pwalign = str(pwalign).strip().lower()
        elif not isinstance(pwalign, bool):
            alignment = pwalign
            pwalign = True

    if subset != 'all':
        chid = chain.getChid()
        segname = chain.getSegname()
        chain_subset = chain.select(subset)
        target_chain = chain_subset.getHierView()[segname, chid]
        
        mobile = atoms.select(subset)
    else:
        target_chain = chain
        mobile = atoms

    if isinstance(mobile, Chain):
        chains = [mobile]
        map_ag = mobile.getAtomGroup()
    else:
        if isinstance(mobile, AtomGroup):
            map_ag = mobile
        else:
            map_ag = mobile.getAtomGroup()
        chains = list(mobile.getHierView().iterChains())
        LOGGER.debug('Evaluating {0}: {1} chains are identified'
                     .format(str(atoms), len(chains))) 

    mappings = []
    unmapped = []
    unmapped_chids = []
    target_ag = target_chain.getAtomGroup()
    simple_target = SimpleChain(target_chain, False)
    LOGGER.debug('Trying to map atoms based on residue numbers and '
                'identities:')
    for chain in chains:
        simple_chain = SimpleChain(chain, False)
        if len(simple_chain) == 0:
            LOGGER.debug('  Skipping {0}, which does not contain any amino '
                        'acid residues.'.format(simple_chain))
            continue
        LOGGER.debug('  Comparing {0} (len={1}) with {2}:'
                    .format(simple_chain.getTitle(), len(simple_chain),
                            simple_target.getTitle()))

        # Trivial mapping serves as a first, simple attempt at aligning the two
        # sequences based on residue numbers; therefore the sequence identity
        # (TRIVIAL_SEQID) criterion is strict.
        _seqid = _cover = -1
        target_list, chain_list, n_match, n_mapped = getTrivialMapping(
            simple_target, simple_chain)
        if n_mapped > 0:
            _seqid = n_match * 100 / n_mapped
            _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain))

        trivial_seqid = TRIVIAL_SEQID if pwalign else seqid
        trivial_cover = TRIVIAL_COVERAGE if pwalign else coverage
        if _seqid >= trivial_seqid and _cover >= trivial_cover:
            LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% '
                    'sequence identity and {2:.0f}% overlap.'
                    .format(n_mapped, _seqid, _cover))
            mappings.append((target_list, chain_list, _seqid, _cover))
        else:
            if not pwalign:
                LOGGER.debug('\tFailed to match chains based on residue numbers '
                        '(seqid={0:.0f}%, overlap={1:.0f}%).'
                        .format(_seqid, _cover))
            unmapped.append(simple_chain)
            unmapped_chids.append(chain.getChid())

    if not mappings and pwalign is None:
        pwalign = True

    if pwalign and unmapped:
        if alignment is None:
            if pwalign in ['ce', 'cealign']:
                aln_type = 'structure alignment'
                method = 'CE'
                if not 'seqid' in kwargs:
                    seqid = 0.
            else:
                aln_type = 'sequence alignment'
                method = ALIGNMENT_METHOD
        else:
            aln_type = 'alignment'
            method = 'predefined'
            if not 'seqid' in kwargs:
                seqid = 0.

        LOGGER.debug('Trying to map atoms based on {0} {1}:'
                     .format(method, aln_type))

        for chid, simple_chain in zip(unmapped_chids, unmapped):
            LOGGER.debug('  Comparing {0} (len={1}) with {2}:'
                        .format(simple_chain.getTitle(), len(simple_chain),
                                simple_target.getTitle()))
            if method == 'CE':
                result = getCEAlignMapping(simple_target, simple_chain)
            else:
                if isinstance(alignment, dict):
                    result = getDictMapping(simple_target, simple_chain, map_dict=alignment)
                else:
                    result = getAlignedMapping(simple_target, simple_chain, alignment=alignment)

            if result is not None:
                target_list, chain_list, n_match, n_mapped = result
                if n_mapped > 0:
                    _seqid = n_match * 100 / n_mapped
                    _cover = n_mapped * 100 / max(len(simple_target),
                                                  len(simple_chain))
                else:
                    _seqid = 0
                    _cover = 0
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug('\tMapped: {0} residues match with {1:.0f}%'
                                 ' sequence identity and {2:.0f}% overlap.'
                                 .format(n_mapped, _seqid, _cover))
                    mappings.append((target_list, chain_list, _seqid, _cover))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'
                                 .format(_seqid, _cover))

    for mi, result in enumerate(mappings):
        residues_target, residues_chain, _seqid, _cover = result
        indices_target = []
        indices_chain = []
        indices_mapping = []
        indices_dummies = []
        counter = 0
        for i in range(len(residues_target)):
            res_tar = residues_target[i]
            res_chn = residues_chain[i]

            for atom_tar in res_tar:
                indices_target.append(atom_tar.getIndex())
                if res_chn is not None:
                    atom_chn = res_chn.getAtom(atom_tar.getName())
                    if atom_chn is not None:
                        indices_chain.append(atom_chn.getIndex())
                        indices_mapping.append(counter)
                    else:
                        indices_dummies.append(counter)
                else:
                    indices_dummies.append(counter)
                counter += 1
        #n_atoms = len(indices_target)

        ch_tar = next((r for r in residues_target if r is not None)).getChain()
        ch_chn = next((r for r in residues_chain if r is not None)).getChain()
        title_tar = 'Chain {0} from {1}'.format(ch_tar.getChid(), ch_tar.getAtomGroup().getTitle())
        title_chn = 'Chain {0} from {1}'.format(ch_chn.getChid(), ch_chn.getAtomGroup().getTitle())

        # note that chain here is from atoms
        atommap = AM(map_ag, indices_chain, chain.getACSIndex(),
                     mapping=indices_mapping, dummies=indices_dummies,
                     title=title_chn + ' -> ' + title_tar )
        selection = AM(target_ag, indices_target, target_chain.getACSIndex(),
                       title=title_tar + ' -> ' + title_chn, intarrays=True)

        mappings[mi] = (atommap, selection, _seqid, _cover)
    if len(mappings) > 1:
        mappings.sort(key=lambda m: m[-2:], reverse=True)
    return mappings
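A usage sketch for mapOntoChain, assuming parsePDB and mapOntoChain are importable from the top-level prody package; the PDB identifiers are arbitrary examples of two structures with related chains:

from prody import parsePDB, mapOntoChain

mobile = parsePDB('1p38')                   # structure to be mapped (illustrative)
target = parsePDB('1zz2')                   # structure providing the target chain
target_chain = target.getHierView()['A']

mappings = mapOntoChain(mobile, target_chain, seqid=50, overlap=50)
for atommap, selection, seqid, overlap in mappings:
    print(atommap.getTitle(), '%.0f%% identity, %.0f%% overlap' % (seqid, overlap))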
Example #37
0
def fetchPDB(pdb, folder='.', compressed=True, copy=False, **kwargs):
    """Retrieve PDB, PDBML, or mmCIF file(s) for specified *pdb* identifier(s).  
    *pdb* may be a string or a list.  The function will return a filename or a 
    list of filenames depending on input (see :ref:`fetchpdb` for examples).  

    If *compressed* is **False**, all files will be decompressed.  If *copy* is 
    **True**, all files from the local PDB mirror will be copied to the user specified 
    *folder*.  *format* keyword argument can be used to retrieve `PDBML 
    <http://pdbml.pdb.org/>`_ and `mmCIF <http://mmcif.pdb.org/>`_ files:  
    ``format="cif"`` will fetch an mmCIF file (e.g. :file:`1XXX.cif.gz`), 
    similarly ``format="xml"`` will fetch a PDBML file.  If PDBML header file 
    is desired, ``format="xml", noatom=True`` will do the job (e.g. 
    :file:`1XXX-noatom.xml.gz`)
    
    The order of file search operations is as follows:  First, files are 
    sought in *folder*.  Second, the local PDB mirror will be sought, if one is 
    set by the user (see :func:`setPDBMirrorPath`).  Then, the local PDB folder
    will be sought, if one is set by the user (see :func:`setPDBLocalFolder`).
    Finally, if files are not found locally, they will be downloaded from one of 
    the wwPDB FTP servers (use :func:`setWWPDBFTPServer` to specify one close to 
    you)."""
    
    if isinstance(pdb, str):
        identifiers = [pdb]
    elif isinstance(pdb, list):
        identifiers = pdb
    else:
        raise TypeError('pdb may be a string or a list of strings')
        
    assert isinstance(folder, str), 'folder must be a string'
    assert isinstance(compressed, bool), 'compressed must be a boolean'
    assert isinstance(copy, bool), 'copy must be a boolean'
    format = kwargs.pop('format', 'pdb')
    assert isinstance(format, str), 'format must be a string'
    format = format.lower()
    assert format in _PDB_FORMATS, '{0:s} is not a valid format'.format(
                                                                repr(format))
    noatom = kwargs.pop('noatom', False) 
    assert isinstance(noatom, bool), 'noatom must be a boolean'
    if kwargs:
        raise TypeError('{0:s} is not a valid keyword argument for this '
                        'function'.format(repr(kwargs.iterkeys().next())))
    if folder != '.':
        folder = makePath(folder)
    if not os.access(folder, os.W_OK):
        raise IOError('permission to write in {0:s} is denied, please '
                      'specify another folder'.format(folder))
    
    filenames = []
    exists = 0
    success = 0
    failure = 0
    download = False
    if format == 'pdb':
        divided = 'data/structures/divided/pdb'
        pdbext = '.ent.gz'
        extensions = ['.ent', '.pdb'] # '.pdb' should be the last item
        prefix = 'pdb'
    elif format == 'xml':
        if noatom:
            divided = 'data/structures/divided/XML-noatom'
            pdbext = '-noatom.xml.gz'
            extensions = ['-noatom.xml']
        else:
            divided = 'data/structures/divided/XML'
            pdbext = '.xml.gz'
            extensions = ['.xml']
        prefix = ''
    else:
        divided = 'data/structures/divided/mmCIF'
        pdbext = '.cif.gz'
        extensions = ['.cif']
        prefix = ''
    
    pdbfnmap = {}
    for extension in extensions:
        for pdbfn in glob(os.path.join(folder, '*' + extension + '*')): 
            if os.path.splitext(pdbfn)[1] in _PDB_EXTENSIONS:
                pdbfnmap[os.path.split(pdbfn)[1].split('.')[0].lower()] = pdbfn
        for pdbfn in glob(os.path.join(folder, '*' + extension.upper() + '*')):
            if os.path.splitext(pdbfn)[1] in _PDB_EXTENSIONS:
                pdbfnmap[os.path.split(pdbfn)[1].split('.')[0].lower()] = pdbfn
                
    for i, pdbid in enumerate(identifiers):
        # Check validity of identifiers
        if not isinstance(pdbid, str):
            LOGGER.debug('{0:s} is not a valid identifier.'.format(pdbid))
            filenames.append(None)
            failure += 1 
            continue
        pdbid = pdbid.strip().lower()
        if not (len(pdbid) == 4 and pdbid.isalnum()):
            LOGGER.debug('{0:s} is not a valid identifier.'.format(pdbid))
            filenames.append(None)
            failure += 1 
            continue
        # Check if file exists in working directory
        identifiers[i] = pdbid
        if noatom:
            fn = pdbfnmap.get(pdbid + '-noatom', None)
        else:
            fn = pdbfnmap.get(pdbid, None) or pdbfnmap.get('pdb'+pdbid, None)
        if fn:
            fn = relpath(fn)
            if not compressed:
                temp, ext = os.path.splitext(fn) 
                if ext == '.gz':
                    fn = gunzip(fn, temp)
            filenames.append(fn)
            LOGGER.debug('{0:s} ({1:s}) is found in the working directory.'
                         .format(pdbid, fn))
            exists += 1
            continue
        # Check the PDB mirror
        mirror_path = getPDBMirrorPath()
        if mirror_path is not None and os.path.isdir(mirror_path):
            fn = os.path.join(mirror_path, divided, pdbid[1:3], 
                              prefix + pdbid + pdbext)
            if os.path.isfile(fn):
                if copy or not compressed:
                    if compressed:
                        filename = os.path.join(folder, pdbid + extension + 
                                                        '.gz')
                        shutil.copy(fn, filename)
                    else:
                        filename = os.path.join(folder, pdbid + extension)
                        gunzip(fn, filename)
                    filenames.append(filename)
                    LOGGER.debug('{0:s} copied from local mirror ({1:s})'
                                 .format(pdbid, filename))
                    success += 1
                else:
                    filenames.append(fn)
                    
                    LOGGER.debug('{0:s} ({1:s}...{2:s}) is found in the local '
                                'mirror.'.format(pdbid, 
                                fn[:fn[1:].index(os.path.sep)+2], fn[-15:]))
                    exists += 1
                continue
        # Check the PDB mirror
        local_folder = getPDBLocalFolder()
        if format and local_folder:
            local_folder, is_divided = local_folder
            if is_divided:
                fn = os.path.join(local_folder, pdbid[1:3], 
                                  'pdb' + pdbid + '.pdb.gz')
            else:
                fn = os.path.join(local_folder, pdbid + '.pdb.gz')
                
            if os.path.isfile(fn):
                if copy or not compressed:
                    if compressed:
                        filename = os.path.join(folder, pdbid + extension + 
                                                        '.gz')
                        shutil.copy(fn, filename)
                    else:
                        filename = os.path.join(folder, pdbid + extension)
                        gunzip(fn, filename)
                    filenames.append(filename)
                    LOGGER.debug('{0:s} copied from local PDB folder ({1:s})'
                                 .format(pdbid, filename))
                    success += 1
                else:
                    filenames.append(fn)
                    
                    LOGGER.debug('{0:s} ({1:s}...{2:s}) is found in the PDB '
                                'local folder.'.format(pdbid, 
                                fn[:fn[1:].index(os.path.sep)+2], fn[-15:]))
                    exists += 1
                continue

        filenames.append(pdbid)
        download = True
    if download:
        from ftplib import FTP
        ftp_name, ftp_host, ftp_path = getWWPDBFTPServer()
        LOGGER.debug('Connecting wwPDB FTP server {0:s}.'.format(ftp_name))
        if format == 'pdb' and not copy and local_folder:
            folder = local_folder
            compressed = True
            if is_divided:
                getfn = lambda folder, pdbid, ext: \
                    os.path.join(makePath(os.path.join(local_folder, 
                                            pdbid[1:3])), 'pdb' + pdbid + ext)
            else:
                getfn = lambda folder, pdbid, ext: os.path.join(folder,
                                                                pdbid + ext)
                
        else: 
            getfn = lambda folder, pdbid, ext: os.path.join(folder, 
                                                            pdbid + ext)
        try:
            ftp = FTP(ftp_host)
        except Exception as error:
            raise type(error)('FTP connection problem, potential reason: '
                              'no internet connectivity')
        else:
            #ftp_path = os.path.join(ftp_path, divided)
            ftp.login('')
            for i, pdbid in enumerate(identifiers):
                if pdbid != filenames[i]:
                    continue
                filename = getfn(folder, pdbid, extension)
                if compressed:
                    filename += '.gz'

                pdbfile = open(filename, 'w+b')
                fn = prefix + pdbid + pdbext
                try:
                    ftp.cwd(ftp_path)
                    ftp.cwd(divided)
                    ftp.cwd(pdbid[1:3])
                    ftp.retrbinary('RETR ' + fn, pdbfile.write)
                except Exception as error:
                    pdbfile.close()
                    os.remove(filename)
                    if fn in ftp.nlst():
                        LOGGER.debug('{0:s} download failed ({1:s}). It '
                                     'is possible that you don\'t have '
                                     'rights to download .gz files in the '
                                     'current network.'.format(pdbid, 
                                     str(error)))
                    else:
                        LOGGER.debug('{0:s} download failed. {1:s} does not '
                                     'exist on {2:s}.'
                                     .format(fn, pdbid, ftp_host))
                    failure += 1
                    filenames[i] = None 
                else:
                    pdbfile.close()
                    if not compressed:
                        gunzip(filename)
                    filename = relpath(filename)
                    LOGGER.debug('{0:s} downloaded ({1:s})'
                                 .format(pdbid, filename))
                    success += 1
                    filenames[i] = filename
            ftp.quit()
    if len(identifiers) == 1:
        return filenames[0]    
    else:
        LOGGER.info('PDB download completed ({2:d} found, '
                    '{0:d} downloaded, {1:d} failed).'
                    .format(success, failure, exists))
        return filenames
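A brief usage sketch for this older fetchPDB signature, assuming it is importable from the top-level prody package; identifiers are arbitrary, and where files end up depends on the mirror/local-folder settings described in the docstring:

from prody import fetchPDB

fn = fetchPDB('1ubi')                                  # single identifier -> one filename
fns = fetchPDB(['1ubi', '2k39'], compressed=False)     # list -> list of decompressed files
cif = fetchPDB('1ubi', format='cif')                   # mmCIF instead of PDB
print(fn, fns, cif)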
Example #38
0
File: pfam.py Project: sixpi/ProDy
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file; sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    result = root[0].get('id')
    return result
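A usage sketch for searchUniprotID; the import path below is an assumption based on the file header above (pfam.py), and the query is an illustrative PDB identifier:

from prody.database.pfam import searchUniprotID  # hypothetical import path

uniprot_id = searchUniprotID('1mkp')   # illustrative PDB identifier
print(uniprot_id)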
Example #39
0
    def buildKirchhoff(self, coords, cutoff=10., gamma=1., **kwargs):
        """Build Kirchhoff matrix for given coordinate set.

        :arg coords: a coordinate set or an object with ``getCoords`` method
        :type coords: :class:`numpy.ndarray` or :class:`.Atomic`

        :arg cutoff: cutoff distance (Å) for pairwise interactions
            default is 10.0 Å, minimum is 4.0 Å
        :type cutoff: float

        :arg gamma: spring constant, default is 1.0
        :type gamma: float

        :arg sparse: elect to use sparse matrices, default is **False**. If
            Scipy is not found, :class:`ImportError` is raised.
        :type sparse: bool

        :arg kdtree: elect to use KDTree for building Kirchhoff matrix faster,
            default is **True**
        :type kdtree: bool


        Instances of :class:`Gamma` classes and custom functions are
        accepted as *gamma* argument.

        When Scipy is available, user can select to use sparse matrices for
        efficient usage of memory at the cost of computation speed."""

        try:
            coords = (coords._getCoords()
                      if hasattr(coords, '_getCoords') else coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')

        cutoff, g, gamma = checkENMParameters(cutoff, gamma)
        self._reset()
        self._cutoff = cutoff
        self._gamma = g

        n_atoms = coords.shape[0]
        start = time.time()
        if kwargs.get('sparse', False):
            try:
                from scipy import sparse as scipy_sparse
            except ImportError:
                raise ImportError('failed to import scipy.sparse, which is '
                                  'required for sparse matrix calculations')
            kirchhoff = scipy_sparse.lil_matrix((n_atoms, n_atoms))
        else:
            kirchhoff = np.zeros((n_atoms, n_atoms), 'd')

        if kwargs.get('kdtree', True):
            kdtree = KDTree(coords)
            kdtree.search(cutoff)
            dist2 = kdtree.getDistances()**2
            r = 0
            for i, j in kdtree.getIndices():
                g = gamma(dist2[r], i, j)
                kirchhoff[i, j] = -g
                kirchhoff[j, i] = -g
                kirchhoff[i, i] = kirchhoff[i, i] + g
                kirchhoff[j, j] = kirchhoff[j, j] + g
                r += 1
        else:
            LOGGER.info('Using slower method for building the Kirchhoff.')
            cutoff2 = cutoff * cutoff
            mul = np.multiply
            for i in range(n_atoms):
                xyz_i = coords[i, :]
                i_p1 = i + 1
                i2j = coords[i_p1:, :] - xyz_i
                mul(i2j, i2j, i2j)
                for j, dist2 in enumerate(i2j.sum(1)):
                    if dist2 > cutoff2:
                        continue
                    j += i_p1
                    g = gamma(dist2, i, j)
                    kirchhoff[i, j] = -g
                    kirchhoff[j, i] = -g
                    kirchhoff[i, i] = kirchhoff[i, i] + g
                    kirchhoff[j, j] = kirchhoff[j, j] + g

        LOGGER.debug('Kirchhoff was built in {0:.2f}s.'.format(time.time() -
                                                               start))
        self._kirchhoff = kirchhoff
        self._n_atoms = n_atoms
        self._dof = n_atoms
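For illustration, a self-contained NumPy sketch of the same cutoff-based Kirchhoff (graph Laplacian) construction on toy coordinates with a constant spring constant; it mirrors the slower double-loop branch above rather than the KDTree path:

import numpy as np

def toy_kirchhoff(coords, cutoff=10.0, gamma=1.0):
    """Dense Kirchhoff matrix: -gamma for pairs within cutoff,
    diagonal chosen so that every row sums to zero."""
    n = len(coords)
    kirchhoff = np.zeros((n, n))
    cutoff2 = cutoff * cutoff
    for i in range(n):
        dist2 = ((coords[i + 1:] - coords[i]) ** 2).sum(axis=1)
        for k, d2 in enumerate(dist2):
            if d2 <= cutoff2:
                j = i + 1 + k
                kirchhoff[i, j] = kirchhoff[j, i] = -gamma
                kirchhoff[i, i] += gamma
                kirchhoff[j, j] += gamma
    return kirchhoff

coords = np.random.rand(20, 3) * 15.0   # toy coordinate set
print(toy_kirchhoff(coords).shape)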
Example #40
0
File: anm.py Project: njekin/ProDy
    def calcModes(self, n_modes=20, zeros=False, turbo=True):
        """Calculate normal modes.  This method uses :func:`scipy.linalg.eigh`
        function to diagonalize the Hessian matrix. When Scipy is not found,
        :func:`numpy.linalg.eigh` is used.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
            If ``None`` is given, all modes will be calculated.
        :type n_modes: int or None, default is 20

        :arg zeros: If ``True``, modes with zero eigenvalues will be kept.
        :type zeros: bool, default is ``False``

        :arg turbo: Use a memory intensive, but faster way to calculate modes.
        :type turbo: bool, default is ``True``
        """

        if self._hessian is None:
            raise ValueError("Hessian matrix is not built or set")
        assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, "n_modes must be a positive integer"
        assert isinstance(zeros, bool), "zeros must be a boolean"
        assert isinstance(turbo, bool), "turbo must be a boolean"
        linalg = importLA()
        start = time.time()
        shift = 5
        if linalg.__package__.startswith("scipy"):
            if n_modes is None:
                eigvals = None
                n_modes = self._dof
            else:
                if n_modes >= self._dof:
                    eigvals = None
                    n_modes = self._dof
                else:
                    eigvals = (0, n_modes + shift)
            if eigvals:
                turbo = False
            if isinstance(self._hessian, np.ndarray):
                values, vectors = linalg.eigh(self._hessian, turbo=turbo, eigvals=eigvals)
            else:
                try:
                    from scipy.sparse import linalg as scipy_sparse_la
                except ImportError:
                    raise ImportError(
                        "failed to import scipy.sparse.linalg, " "which is required for sparse matrix " "decomposition"
                    )
                try:
                    values, vectors = scipy_sparse_la.eigsh(self._hessian, k=n_modes + 6, which="SA")
                except:
                    values, vectors = scipy_sparse_la.eigen_symmetric(self._hessian, k=n_modes + 6, which="SA")

        else:
            if n_modes is not None:
                LOGGER.info("Scipy is not found, all modes are calculated.")
            values, vectors = linalg.eigh(self._hessian)
        n_zeros = sum(values < ZERO)
        if n_zeros < 6:
            LOGGER.warning("Less than 6 zero eigenvalues are calculated.")
            shift = n_zeros - 1
        elif n_zeros > 6:
            LOGGER.warning("More than 6 zero eigenvalues are calculated.")
            shift = n_zeros - 1
        if zeros:
            shift = -1
        self._eigvals = values[1 + shift :]
        self._vars = 1 / self._eigvals
        self._trace = self._vars.sum()
        self._array = vectors[:, 1 + shift :]
        self._n_modes = len(self._eigvals)
        LOGGER.debug("{0} modes were calculated in {1:.2f}s.".format(self._n_modes, time.time() - start))
Example #41
0
    def calcModes(self, n_modes=20, zeros=False, turbo=True):
        """Calculate normal modes.  This method uses :func:`scipy.linalg.eigh`
        function to diagonalize the Kirchhoff matrix. When Scipy is not found,
        :func:`numpy.linalg.eigh` is used.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
              If ``None`` is given, all modes will be calculated.
        :type n_modes: int or None, default is 20

        :arg zeros: If ``True``, modes with zero eigenvalues will be kept.
        :type zeros: bool, default is ``False``

        :arg turbo: Use a memory intensive, but faster way to calculate modes.
        :type turbo: bool, default is ``True``
        """

        if self._kirchhoff is None:
            raise ValueError('Kirchhoff matrix is not built or set')
        assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, \
            'n_modes must be a positive integer'
        assert isinstance(zeros, bool), 'zeros must be a boolean'
        assert isinstance(turbo, bool), 'turbo must be a boolean'
        linalg = importLA()
        start = time.time()
        shift = 0
        if linalg.__package__.startswith('scipy'):
            if n_modes is None:
                eigvals = None
                n_modes = self._dof
            else:
                if n_modes >= self._dof:
                    eigvals = None
                    n_modes = self._dof
                else:
                    eigvals = (0, n_modes + shift)
            if eigvals:
                turbo = False
            if isinstance(self._kirchhoff, np.ndarray):
                values, vectors = linalg.eigh(self._kirchhoff,
                                              turbo=turbo,
                                              eigvals=eigvals)
            else:
                try:
                    from scipy.sparse import linalg as scipy_sparse_la
                except ImportError:
                    raise ImportError('failed to import scipy.sparse.linalg, '
                                      'which is required for sparse matrix '
                                      'decomposition')
                try:
                    values, vectors = (scipy_sparse_la.eigsh(self._kirchhoff,
                                                             k=n_modes + 1,
                                                             which='SA'))
                except:
                    values, vectors = (scipy_sparse_la.eigen_symmetric(
                        self._kirchhoff, k=n_modes + 1, which='SA'))
        else:
            if n_modes is not None:
                LOGGER.info('Scipy is not found, all modes are calculated.')
            values, vectors = linalg.eigh(self._kirchhoff)
        n_zeros = sum(values < ZERO)
        if n_zeros < 1:
            LOGGER.warning('Fewer than 1 zero eigenvalue was calculated.')
            shift = n_zeros - 1
        elif n_zeros > 1:
            LOGGER.warning('More than 1 zero eigenvalue was calculated.')
            shift = n_zeros - 1
        if zeros:
            shift = -1
        self._eigvals = values[1 + shift:]
        self._vars = 1 / self._eigvals
        self._trace = self._vars.sum()
        self._array = vectors[:, 1 + shift:]
        self._n_modes = len(self._eigvals)
        LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format(
            self._n_modes,
            time.time() - start))
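The shift logic above simply discards the zero eigenvalue expected for a connected Kirchhoff matrix. A small NumPy sketch of the same idea on a toy path-graph Laplacian, with ZERO standing in for ProDy's small numerical threshold:

import numpy as np

ZERO = 1e-6   # stands in for ProDy's ZERO constant

# 4-node path-graph Laplacian; exactly one zero eigenvalue for a connected graph.
kirchhoff = np.array([[ 1., -1.,  0.,  0.],
                      [-1.,  2., -1.,  0.],
                      [ 0., -1.,  2., -1.],
                      [ 0.,  0., -1.,  1.]])

values, vectors = np.linalg.eigh(kirchhoff)   # eigenvalues in ascending order
n_zeros = int((values < ZERO).sum())          # should be 1 here
eigvals = values[n_zeros:]                    # keep the non-zero modes
modes = vectors[:, n_zeros:]
variances = 1.0 / eigvals                     # GNM mode variances
print(n_zeros, eigvals, variances.sum())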
Example #42
0
def mapOntoChain(atoms, chain, **kwargs):
    """Map *atoms* onto *chain*.  This function returns a list of mappings.
    Each mapping is a tuple that contains 4 items:

      * Mapped chain as an :class:`.AtomMap` instance,
      * *chain* as an :class:`.AtomMap` instance,
      * Percent sequence identity,
      * Percent sequence overlap

    Mappings are returned in decreasing percent sequence identity order.
    :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms
    in place of unmapped atoms.

    :arg atoms: atoms that will be mapped to the target *chain*
    :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg chain: chain to which atoms will be mapped
    :type chain: :class:`.Chain`

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    This function tries to map *atoms* to *chain* based on residue
    numbers and types. Each individual chain in *atoms* is compared to
    target *chain*. This works well for different structures of the same
    protein. When it fails, :mod:`Bio.pairwise2` is used for sequence
    alignment, and mapping is performed based on the sequence alignment.
    The user can control whether sequence alignment is performed or not with the
    *pwalign* keyword. If ``pwalign=True`` is passed, pairwise alignment is
    enforced."""

    """
    :keyword subset: "calpha" (or "ca"), "backbone" (or "bb"), or "all",
        default is "calpha"
    :type subset: string
    """

    target_chain = chain
    if not isinstance(atoms, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms must be an AtomGroup, a Chain, or a '
                        'Selection instance')
    if not isinstance(target_chain, Chain):
        raise TypeError('chain must be a Chain instance')

    subset = str(kwargs.get('subset', 'calpha')).lower()
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'
                         .format(str(subset)))
    seqid = kwargs.get('seqid', 90.)
    coverage = kwargs.get('overlap')
    if coverage is None:
        coverage = kwargs.get('coverage', 90.)
    pwalign = kwargs.get('pwalign', None)

    if isinstance(atoms, Chain):
        chains = [atoms]
        map_ag = atoms.getAtomGroup()
    else:
        if isinstance(atoms, AtomGroup):
            map_ag = atoms
        else:
            map_ag = atoms.getAtomGroup()
        chains = list(atoms.getHierView().iterChains())
        LOGGER.debug('Evaluating {0}: {1} chains are identified'
                     .format(str(atoms), len(chains)))

    if subset != 'all':
        target_chain = target_chain.select(subset
                                        ).getHierView()[target_chain.getChid()]

    mappings = []
    unmapped = []
    target_ag = target_chain.getAtomGroup()
    simple_target = SimpleChain(target_chain, True)
    LOGGER.debug('Trying to map atoms based on residue numbers and '
                 'identities:')
    for chain in chains:
        simple_chain = SimpleChain(True)
        simple_chain.buildFromChain(chain)
        if len(simple_chain) == 0:
            LOGGER.debug('  Skipping {0}, which does not contain any amino '
                         'acid residues.'.format(simple_chain))
            continue
        LOGGER.debug('  Comparing {0} (len={1}) with {2}:'
                     .format(simple_chain.getTitle(), len(simple_chain),
                             simple_target.getTitle()))

        target_list, chain_list, n_match, n_mapped = getTrivialMapping(
            simple_target, simple_chain)
        if n_mapped > 0:
            _seqid = n_match * 100 / n_mapped
            _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain))
        else:
            _seqid = 0
            _cover = 0

        if _seqid >= seqid and _cover >= coverage:
            LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% '
                         'sequence identity and {2:.0f}% overlap.'
                         .format(n_mapped, _seqid, _cover))
            mappings.append((target_list, chain_list, _seqid, _cover))
        else:
            LOGGER.debug('\tFailed to match chains based on residue numbers '
                         '(seqid={0:.0f}%, overlap={1:.0f}%).'
                         .format(_seqid, _cover))
            unmapped.append(simple_chain)

    if pwalign or (not mappings and (pwalign is None or pwalign)):
        LOGGER.debug('Trying to map atoms based on {0} sequence alignment:'
                     .format(ALIGNMENT_METHOD))
        for simple_chain in unmapped:
            LOGGER.debug('  Comparing {0} (len={1}) with {2}:'
                         .format(simple_chain.getTitle(), len(simple_chain),
                                 simple_target.getTitle()))
            result = getAlignedMapping(simple_target, simple_chain)
            if result is not None:
                target_list, chain_list, n_match, n_mapped = result
                if n_mapped > 0:
                    _seqid = n_match * 100 / n_mapped
                    _cover = n_mapped * 100 / max(len(simple_target),
                                                  len(simple_chain))
                else:
                    _seqid = 0
                    _cover = 0
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug('\tMapped: {0} residues match with {1:.0f}%'
                                 ' sequence identity and {2:.0f}% overlap.'
                                 .format(n_mapped, _seqid, _cover))
                    mappings.append((target_list, chain_list, _seqid, _cover))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'
                                 .format(_seqid, _cover))

    for mi, result in enumerate(mappings):
        residues_target, residues_chain, _seqid, _cover = result
        indices_target = []
        indices_chain = []
        indices_mapping = []
        indices_dummies = []
        counter = 0
        for i in range(len(residues_target)):
            res_tar = residues_target[i]
            res_chn = residues_chain[i]

            for atom_tar in res_tar:
                indices_target.append(atom_tar.getIndex())
                if res_chn is not None:
                    atom_chn = res_chn.getAtom(atom_tar.getName())
                    if atom_chn is not None:
                        indices_chain.append(atom_chn.getIndex())
                        indices_mapping.append(counter)
                    else:
                        indices_dummies.append(counter)
                else:
                    indices_dummies.append(counter)
                counter += 1
        #n_atoms = len(indices_target)
        atommap = AM(map_ag, indices_chain, chain.getACSIndex(),
                     mapping=indices_mapping, dummies=indices_dummies,
                     title=simple_chain.getTitle() + ' -> ' +
                     simple_target.getTitle())
        selection = AM(target_ag, indices_target, target_chain.getACSIndex(),
                       title=simple_target.getTitle() + ' -> ' +
                       simple_chain.getTitle(), intarrays=True)

        mappings[mi] = (atommap, selection, _seqid, _cover)
    if len(mappings) > 1:
        mappings.sort(key=lambda m: m[2], reverse=True)
    return mappings
Example #43
0
def fetchPDBfromMirror(*pdb, **kwargs):
    """Returns path(s) to PDB (default), PDBML, or mmCIF file(s) for specified
    *pdb* identifier(s).  If a *folder* is specified, files will be copied
    into this folder.  If *compressed* is **False**, files will decompressed.
    *format* argument can be used to get `PDBML <http://pdbml.pdb.org/>`_ and
    `mmCIF <http://mmcif.pdb.org/>`_ files: ``format='cif'`` will fetch an
    mmCIF file, and ``format='xml'`` will fetch a PDBML file.  If PDBML header
    file is desired, ``noatom=True`` argument will do the job."""

    mirror = pathPDBMirror()
    if mirror is None:
        raise IOError('no mirror path is set')


    try:
        mirror, mirror_format = mirror
    except ValueError:
        mirror_format = None

    format = str(kwargs.pop('format', 'pdb')).lower()

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    if format == 'pdb':
        ftp_divided = 'data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if bool(kwargs.pop('noatom', False)):
            ftp_divided = 'data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    else:
        if format:
            raise ValueError('{0} is not a recognized format'
                             .format(repr(format)))
        else:
            raise ValueError('please specify a valid format')

    if mirror_format:
        if mirror_format.lower() != format:
            raise IOError('mirror contains only ' + mirror_format + ' files')
        ftp_divided = ''
    else:
        ftp_divided = join(*ftp_divided.split('/'))
    folder = kwargs.get('folder')
    compressed = kwargs.get('compressed', True)
    filenames = []
    append = filenames.append
    success = 0
    failure = 0
    for pdb in identifiers:
        if pdb is None:
            append(None)
            continue
        fn = join(mirror, ftp_divided, pdb[1:3],
                  ftp_prefix + pdb + ftp_pdbext)
        if isfile(fn):
            if folder or not compressed:
                if compressed:
                    fn = copyFile(fn, join(folder or '.',
                                             pdb + extension + '.gz'))
                else:
                    fn = gunzip(fn, join(folder or '.', pdb + extension))
            append(normpath(fn))
            success += 1
        else:
            append(None)
            failure += 1

    if len(identifiers) == 1:
        fn = filenames[0]
        if kwargs.get('report', True):
            if success:
                LOGGER.debug('PDB file is found in the local mirror ({0}).'
                             .format(sympath(fn)))
        return fn
    else:
        if kwargs.get('report', True):
            LOGGER.debug('PDB files found in the local mirror ({0} found, '
                         '{1} missed).'.format(success, failure))
        return filenames
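A minimal usage sketch for fetchPDBfromMirror, assuming a local wwPDB mirror was registered earlier with pathPDBMirror(); the identifier and destination are illustrative only:

from prody import pathPDBMirror

# assumed to have been configured beforehand, e.g. pathPDBMirror('/data/pdb')
if pathPDBMirror() is not None:
    # copy the decompressed PDB file for an illustrative entry into the
    # current working directory
    path = fetchPDBfromMirror('1ubi', compressed=False, folder='.')
    print(path)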
Ejemplo n.º 44
0
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = "{http://pfam.xfam.org/}"
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile

        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = "".join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError("could not parse a sequence without gaps from " + query)
    else:
        seq = "".join(query.split())

    import xml.etree.cElementTree as ET

    LOGGER.timeit("_pfam")
    timeout = int(kwargs.get("timeout", 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + " is not a valid sequence")

            fseq = ">Seq\n" + seq
            parameters = {"hmmdb": "pfam", "seq": fseq}
            enc_params = urllib.urlencode(parameters)
            request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params)

            url = urllib2.urlopen(request).geturl() + "?output=xml"
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception as err:
            raise ValueError("failed to parse results XML, check URL: " + url)
            matches = {}
            for child in root[0]:
                if child.tag == "hits":
                    accession = child.get("acc")
                    pfam_id = accession.split(".")[0]
                    matches[pfam_id] = {}
                    matches[pfam_id]["accession"] = accession
                    matches[pfam_id]["class"] = "Domain"
                    matches[pfam_id]["id"] = child.get("name")
                    matches[pfam_id]["locations"] = {}
                    matches[pfam_id]["locations"]["ali_end"] = child[0].get("alisqto")
                    matches[pfam_id]["locations"]["ali_start"] = child[0].get("alisqfrom")
                    matches[pfam_id]["locations"]["bitscore"] = child[0].get("bitscore")
                    matches[pfam_id]["locations"]["end"] = child[0].get("alisqto")
                    matches[pfam_id]["locations"]["evalue"] = child.get("evalue")
                    matches[pfam_id]["locations"]["evidence"] = "hmmer v3.0"
                    matches[pfam_id]["locations"]["hmm_end"] = child[0].get("alihmmto")
                    matches[pfam_id]["locations"]["hmm_start"] = child[0].get("alihmmfrom")
                    matches[pfam_id]["locations"]["significant"] = child[0].get("significant")
                    matches[pfam_id]["locations"]["start"] = child[0].get("alisqfrom")
                    matches[pfam_id]["type"] = "Pfam-A"
                return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader

            try:
                polymers = parsePDBHeader(seq[:4], "polymers")
            except Exception as err:
                LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != "UniProt":
                            continue
                        idcode = dbref.idcode
                        LOGGER.info(
                            "UniProt ID code {0} for {1} chain " "{2} will be used.".format(idcode, seq[:4], poly.chid)
                        )
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq)))
                url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"
            else:
                url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml"

        else:
            url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"

    LOGGER.debug("Retrieving Pfam search results: " + url)
    xml = None
    while LOGGER.timing("_pfam") < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError("Pfam search timed out or failed to parse results " "XML, check URL: " + url)
    else:
        LOGGER.report("Pfam search completed in %.2fs.", "_pfam")

    if xml.find(b"There was a system error on your last request.") > 0:
        LOGGER.warn("No Pfam matches found for: " + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError("failed to parse results XML, check URL: " + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError("failed to parse results XML, check URL: " + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results["matches"]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib["accession"][:7]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

        if not re.search("^P(F|B)[0-9]{5}$", accession):
            raise ValueError("{0} does not match pfam accession" " format".format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault("locations", [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = "Query " + repr(query)
    else:
        query = "Query sequence"

    if matches:
        LOGGER.info(query + " matched {0} Pfam families.".format(len(matches)))
    else:
        LOGGER.info(query + " did not match any Pfam families.")
    return matches
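A hedged usage sketch for searchPfam; the UniProt accession is only an example, and network access to the Pfam server is assumed:

# short (non-sequence) query path: a UniProt accession
matches = searchPfam('P04637')
if matches:
    for accession, info in matches.items():
        # each entry keeps the XML attributes plus a 'locations' list of
        # alignment start/end positions
        print(accession, info.get('id'), info.get('locations'))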
Ejemplo n.º 45
0
def matchChains(atoms1, atoms2, **kwargs):
    """Returns pairs of chains matched based on sequence similarity.  Makes an
    all-to-all comparison of chains in *atoms1* and *atoms2*.  Chains are
    obtained from hierarchical views (:class:`.HierView`) of atom groups.
    This function returns a list of matching chains as tuples that contain
    4 items:

      * matching chain from *atoms1* as a :class:`.AtomMap`
        instance,
      * matching chain from *atoms2* as a :class:`.AtomMap`
        instance,
      * percent sequence identity of the match,
      * percent sequence overlap of the match.

    The list of matches is sorted in decreasing percent sequence identity order.
    :class:`.AtomMap` instances can be used to calculate RMSD values and
    superpose atom groups.

    :arg atoms1: atoms that contain a chain
    :type atoms1: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg atoms2: atoms that contain a chain
    :type atoms2: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    If *subset* is set to *calpha* or *backbone*, only alpha carbon
    atoms or backbone atoms will be paired. If set to *all*, all atoms
    common to matched residues will be returned.

    This function tries to match chains based on residue numbers and names.
    All chains in *atoms1* are compared to all chains in *atoms2*.  This works
    well for different structures of the same protein.  When it fails,
    :mod:`Bio.pairwise2` is used for pairwise sequence alignment, and matching
    is performed based on the sequence alignment.  Users can control whether
    sequence alignment is performed with the *pwalign* keyword.  If
    ``pwalign=True`` is passed, pairwise alignment is enforced."""

    if not isinstance(atoms1, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms1 must be an AtomGroup, Chain, or Selection')
    if not isinstance(atoms2, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms2 must be an AtomGroup, Chain, or Selection')

    subset = kwargs.get('subset', 'calpha')
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'.format(
            str(subset)))
    seqid = kwargs.get('seqid', 90.)
    assert isinstance(seqid, (float, int)), 'seqid must be float'
    assert 0 < seqid <= 100, 'seqid must be in the range from 0 to 100'
    coverage = kwargs.get('overlap')
    if coverage is None:
        coverage = kwargs.get('coverage', 90.)
    assert isinstance(coverage, (float, int)), 'overlap must be float'
    assert 0 < coverage <= 100, 'overlap must be in the range from 0 to 100'
    pwalign = kwargs.get('pwalign', None)

    if isinstance(atoms1, Chain):
        chains1 = [atoms1]
        atoms1 = atoms1.getAtomGroup()
    else:
        chains1 = list(atoms1.getHierView().iterChains())
        if not isinstance(atoms1, AtomGroup):
            atoms1 = atoms1.getAtomGroup()
    chains = list()
    for ch in chains1:
        simpch = SimpleChain(ch)
        if len(simpch) > 0:
            chains.append(simpch)
    chains1 = chains
    if not isinstance(atoms1, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'.format(
            str(atoms1), len(chains1)))

    if isinstance(atoms2, Chain):
        chains2 = [atoms2]
        atoms2 = atoms2.getAtomGroup()
    else:
        chains2 = list(atoms2.getHierView().iterChains())
        if not isinstance(atoms2, AtomGroup):
            atoms2 = atoms2.getAtomGroup()
    chains = list()
    for ch in chains2:
        simpch = SimpleChain(ch)
        if len(simpch) > 0:
            chains.append(simpch)
    chains2 = chains
    if not isinstance(atoms2, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'.format(
            str(atoms2), len(chains2)))

    matches = []
    unmatched = []
    LOGGER.debug('Trying to match chains based on residue numbers and names:')
    for simpch1 in chains1:
        for simpch2 in chains2:
            LOGGER.debug('  Comparing {0} (len={1}) and {2} (len={3}):'.format(
                simpch1.getTitle(), len(simpch1), simpch2.getTitle(),
                len(simpch2)))

            match1, match2, nmatches = getTrivialMatch(simpch1, simpch2)
            _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
            _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))

            if _seqid >= seqid and _cover >= coverage:
                LOGGER.debug('\tMatch: {0} residues match with {1:.0f}% '
                             'sequence identity and {2:.0f}% overlap.'.format(
                                 len(match1), _seqid, _cover))
                matches.append(
                    (match1, match2, _seqid, _cover, simpch1, simpch2))
            else:
                LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                             'overlap={1:.0f}%).'.format(_seqid, _cover))
                unmatched.append((simpch1, simpch2))

    if pwalign or (not matches and (pwalign is None or pwalign)):
        pairwise2 = importBioPairwise2()
        if pairwise2:
            LOGGER.debug('Trying to match chains based on {0} sequence '
                         'alignment:'.format(ALIGNMENT_METHOD))
            for simpch1, simpch2 in unmatched:
                LOGGER.debug('  Comparing {0} (len={1}) and {2} '
                             '(len={3}):'.format(simpch1.getTitle(),
                                                 len(simpch1),
                                                 simpch2.getTitle(),
                                                 len(simpch2)))
                match1, match2, nmatches = getAlignedMatch(simpch1, simpch2)
                _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
                _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug(
                        '\tMatch: {0} residues match with {1:.0f}% '
                        'sequence identity and {2:.0f}% overlap.'.format(
                            len(match1), _seqid, _cover))
                    matches.append(
                        (match1, match2, _seqid, _cover, simpch1, simpch2))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'.format(_seqid, _cover))
        else:
            LOGGER.warning('Pairwise alignment could not be performed.')
    if not matches:
        return None
    subset = _SUBSETS[subset]
    for mi, result in enumerate(matches):
        match1, match2, _seqid, _cover, simpch1, simpch2 = result

        indices1 = []
        indices2 = []

        for i in range(len(match1)):
            ares = match1[i]
            bres = match2[i]

            if subset == 'ca':
                try:
                    aid = ares.getNames().tolist().index('CA')
                except ValueError:
                    aid = None
                try:
                    bid = bres.getNames().tolist().index('CA')
                    if aid is not None:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
                except ValueError:
                    pass
            elif subset == 'bb':
                for bban in ('N', 'CA', 'C', 'O'):
                    try:
                        aid = ares.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    else:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
            elif subset == 'noh':
                for han, aid, noh in zip(ares.getNames(), ares._indices,
                                         ares.getFlags('noh')):
                    if not noh:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(han)
                    except ValueError:
                        continue
                    else:
                        indices1.append(aid)
                        indices2.append(bres._indices[bid])
            elif subset is None or subset == 'all':
                aans = ares.getNames()
                bans = bres.getNames().tolist()

                aids = ares.getIndices()
                #bids = bres.getIndices()

                for j in range(len(aans)):
                    try:
                        bid = bres._indices[bans.index(aans[j])]
                        indices1.append(aids[j])
                        indices2.append(bid)
                    except ValueError:
                        pass

        indices1 = np.array(indices1, int)
        indices2 = np.array(indices2, int)

        match1 = AM(atoms1,
                    indices1,
                    atoms1.getACSIndex(),
                    title=simpch1.getTitle() + ' -> ' + simpch2.getTitle(),
                    intarrays=True)
        match2 = AM(atoms2,
                    indices2,
                    atoms2.getACSIndex(),
                    title=simpch2.getTitle() + ' -> ' + simpch1.getTitle(),
                    intarrays=True)

        matches[mi] = (match1, match2, _seqid, _cover)
    if len(matches) > 1:
        # sort by percent sequence identity; a key-based sort replaces the
        # Python 2 only cmp-based sort
        matches.sort(key=lambda m: m[2], reverse=True)
    return matches
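A brief usage sketch for matchChains, assuming two structures of the same protein parsed with parsePDB; the identifiers are illustrative:

from prody import parsePDB, calcTransformation, calcRMSD

ref = parsePDB('1p38')    # reference structure
mob = parsePDB('1zz2')    # another structure of the same protein (illustrative)

result = matchChains(ref, mob, seqid=75, overlap=80)
if result:
    ref_map, mob_map, seqid_, overlap_ = result[0]
    # superpose the mobile structure onto the reference using the matched atoms
    calcTransformation(mob_map, ref_map).apply(mob)
    print('RMSD: %.2f A' % calcRMSD(ref_map, mob_map))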
Ejemplo n.º 46
0
def fetchPDB(*pdb, **kwargs):
    """Returns path(s) to PDB file(s) for specified *pdb* identifier(s).  Files
    will be sought in user specified *folder* or current working director, and
    then in local PDB folder and mirror, if they are available.  If *copy*
    is set **True**, files will be copied into *folder*.  If *compressed* is
    **False**, all files will be decompressed.  See :func:`pathPDBFolder` and
    :func:`pathPDBMirror` for managing local resources, :func:`.fetchPDBviaFTP`
    and :func:`.fetchPDBviaFTP` for downloading files from PDB servers."""

    if len(pdb) == 1 and isinstance(pdb[0], list):
        pdb = pdb[0]

    if 'format' in kwargs and kwargs.get('format') != 'pdb':
        return fetchPDBviaFTP(*pdb, **kwargs)

    identifiers = checkIdentifiers(*pdb)

    folder = kwargs.get('folder', '.')
    compressed = kwargs.get('compressed')

    # check *folder* specified by the user, usually pwd ('.')
    filedict = findPDBFiles(folder, compressed=compressed)

    filenames = []
    not_found = []
    exists = 0
    for i, pdb in enumerate(identifiers):
        if pdb is None:
            filenames.append(None)
        elif pdb in filedict:
            filenames.append(filedict[pdb])
            exists += 1
        else:
            filenames.append(None)
            not_found.append((i, pdb))

    if not not_found:
        if len(filenames) == 1:
            filenames = filenames[0]
            if exists:
                LOGGER.debug(
                    'PDB file is found in working directory ({0}).'.format(
                        sympath(filenames)))
        return filenames

    if not isWritable(folder):
        raise IOError('permission to write in {0} is denied, please '
                      'specify another folder'.format(folder))

    if compressed is not None and not compressed:
        filedict = findPDBFiles(folder, compressed=True)
        not_found, decompress = [], not_found
        for i, pdb in decompress:
            if pdb in filedict:
                fn = filedict[pdb]
                filenames[i] = gunzip(fn, splitext(fn)[0])
            else:
                not_found.append((i, pdb))

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    local_folder = pathPDBFolder()
    copy = kwargs.setdefault('copy', False)
    if local_folder:
        local_folder, is_divided = local_folder
        temp, not_found = not_found, []
        for i, pdb in temp:
            if is_divided:
                fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz')
            else:
                fn = join(local_folder, pdb + '.pdb.gz')
            if isfile(fn):
                if copy or not compressed and compressed is not None:
                    if compressed:
                        fn = copyFile(fn, join(folder, pdb + '.pdb.gz'))
                    else:
                        fn = gunzip(fn, join(folder, pdb + '.pdb'))
                filenames[i] = normpath(fn)
            else:
                not_found.append((i, pdb))

    if not not_found:
        if len(identifiers) == 1:
            fn = filenames[0]
            items = fn.split(pathsep)
            if len(items) > 5:
                fndisp = pathsep.join(items[:3] + ['...'] + items[-1:])
            else:
                fndisp = relpath(fn)
            LOGGER.debug(
                'PDB file is found in the local folder ({0}).'.format(fndisp))
            return fn
        else:
            return filenames

    if kwargs['copy'] or (compressed is not None and not compressed):
        kwargs['folder'] = folder

    downloads = [pdb for i, pdb in not_found]
    fns = None

    try:
        fns = fetchPDBfromMirror(*downloads, **kwargs)
    except IOError:
        pass
    else:
        if len(downloads) == 1: fns = [fns]
        temp, not_found = not_found, []
        for i, fn in enumerate(fns):
            if fn is None:
                not_found.append(temp[i])
            else:
                i, _ = temp[i]
                filenames[i] = fn

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    if fns:
        downloads = [pdb for i, pdb in not_found]

    fns = None

    tp = kwargs.pop('tp', None)
    if tp is not None:
        tp = tp.lower()

    if tp == 'http':
        try:
            fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via HTTP failed '
                        '({0}).'.format(str(err)))
    elif tp == 'ftp':
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via FTP failed '
                        '({0}).'.format(str(err)))
    else:
        tryHTTP = False
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            tryHTTP = True

        if fns is None or isinstance(fns, list) and None in fns:
            tryHTTP = True
        elif isinstance(fns, list):
            downloads = [
                not_found[i][1] for i in range(len(fns)) if fns[i] is None
            ]
            if len(downloads) > 0:
                tryHTTP = True
        if tryHTTP:
            LOGGER.info('Downloading PDB files via FTP failed, '
                        'trying HTTP.')
            try:
                fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
            except Exception as err:
                LOGGER.warn('Downloading PDB files via HTTP also failed '
                            '({0}).'.format(str(err)))

    if len(downloads) == 1: fns = [fns]
    if fns:
        for i, fn in zip([i for i, pdb in not_found], fns):
            filenames[i] = fn

    return filenames[0] if len(identifiers) == 1 else filenames
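A short usage sketch for fetchPDB; identifiers are illustrative, and either network access or a configured local folder/mirror is assumed:

# single identifier: returns one path (or None)
fn = fetchPDB('1ubi')

# several identifiers: returns a list of paths, decompressed into the
# current directory
fns = fetchPDB('1p38', '1r39', folder='.', compressed=False)
print(fn, fns)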
Ejemplo n.º 47
0
Archivo: gnm.py Proyecto: prody/ProDy
def solveEig(M, n_modes=None, zeros=False, turbo=True, is3d=False):
    linalg = importLA()
    dof = M.shape[0]

    expct_n_zeros = 6 if is3d else 1

    if n_modes is None:
        eigvals = None
        n_modes = dof
    else:
        if n_modes >= dof:
            eigvals = None
            n_modes = dof
        else:
            eigvals = (0, n_modes+expct_n_zeros-1)

    def _eigh(M, eigvals=None, turbo=True):
        if linalg.__package__.startswith('scipy'):
            from scipy.sparse import issparse

            if eigvals:
                turbo = False
            if not issparse(M):
                values, vectors = linalg.eigh(M, turbo=turbo, eigvals=eigvals)
            else:
                try:
                    from scipy.sparse import linalg as scipy_sparse_la
                except ImportError:
                    raise ImportError('failed to import scipy.sparse.linalg, '
                                      'which is required for sparse matrix '
                                      'decomposition')
                if eigvals:
                    j = eigvals[0]
                    k = eigvals[-1] + 1
                else:
                    j = 0
                    k = dof

                if k >= dof:
                    k -= 1
                    LOGGER.warning('Cannot calculate all eigenvalues for sparse matrices, thus '
                                   'the last eigenvalue is omitted. See scipy.sparse.linalg.eigsh '
                                   'for more information')
                values, vectors = scipy_sparse_la.eigsh(M, k=k, which='SA')
                values = values[j:k]
                vectors = vectors[:, j:k]
        else:
            # NumPy fallback: numpy.linalg.eigh cannot compute a subset of
            # eigenpairs, so all modes are calculated regardless of n_modes
            if n_modes < dof:
                LOGGER.info('Scipy is not found, all modes were calculated.')
            values, vectors = linalg.eigh(M)
        return values, vectors

    def _calc_n_zero_modes(M):
        from scipy.sparse import issparse

        if not issparse(M):
            w = linalg.eigvalsh(M)
        else:
            try:
                from scipy.sparse import linalg as scipy_sparse_la
            except ImportError:
                raise ImportError('failed to import scipy.sparse.linalg, '
                                    'which is required for sparse matrix '
                                    'decomposition')
            w, _ = scipy_sparse_la.eigsh(M, k=dof-1, which='SA')
        n_zeros = sum(w < ZERO)
        return n_zeros

    values, vectors = _eigh(M, eigvals, turbo)
    n_zeros = sum(values < ZERO)

    if n_zeros < n_modes + expct_n_zeros:
        if n_zeros < expct_n_zeros:
            LOGGER.warning('Fewer than %d (%d) zero eigenvalues were calculated.'%(expct_n_zeros, n_zeros))
        elif n_zeros > expct_n_zeros:
            LOGGER.warning('More than %d (%d) zero eigenvalues were calculated.'%(expct_n_zeros, n_zeros))
    else:
        LOGGER.warning('More than %d zero eigenvalues were detected.'%expct_n_zeros)

    if not zeros:
        if n_zeros > expct_n_zeros:
            if n_zeros == n_modes + expct_n_zeros and n_modes != dof:
                LOGGER.debug('Determining the number of zero eigenvalues...')
                # find the actual number of zero modes
                n_zeros = _calc_n_zero_modes(M)
                LOGGER.debug('%d zero eigenvalues detected.'%n_zeros)
            LOGGER.debug('Solving for additional eigenvalues...')
            start = min(n_modes+expct_n_zeros, dof-1); end = min(n_modes+n_zeros-1, dof-1)
            values_, vectors_ = _eigh(M, eigvals=(start, end))
            values = np.concatenate((values, values_))
            vectors = np.hstack((vectors, vectors_))

        # final_n_modes may exceed len(eigvals) - no need to fix for the sake of the simplicity of the code
        final_n_modes = n_zeros + n_modes
        eigvals = values[n_zeros:final_n_modes]
        eigvecs = vectors[:, n_zeros:final_n_modes]
        vars = 1 / eigvals
    else:
        eigvals = values[:n_modes]
        eigvecs = vectors[:, :n_modes]
        vars = div0(1, values)
        vars[:n_zeros] = 0.
        vars = vars[:n_modes]

    return eigvals, eigvecs, vars
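A minimal sketch exercising solveEig on a toy Kirchhoff-like matrix (the Laplacian of a 4-node path graph), which has exactly the single zero eigenvalue expected for is3d=False; the matrix is illustrative only, and the eigvals/turbo keywords used above belong to older SciPy releases:

import numpy as np

# path-graph Laplacian: symmetric, positive semidefinite, one zero eigenvalue
K = np.array([[ 1., -1.,  0.,  0.],
              [-1.,  2., -1.,  0.],
              [ 0., -1.,  2., -1.],
              [ 0.,  0., -1.,  1.]])

eigvals, eigvecs, vars = solveEig(K, n_modes=2, zeros=False, is3d=False)
print(eigvals)        # the two smallest non-zero eigenvalues
print(eigvecs.shape)  # (4, 2)
print(vars)           # 1/eigenvalue for each retained mode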
Ejemplo n.º 48
0
Archivo: gnm.py Proyecto: sixpi/ProDy
    def calcModes(self, n_modes=20, zeros=False, turbo=True, hinges=True):
        """Calculate normal modes.  This method uses :func:`scipy.linalg.eigh`
        function to diagonalize the Kirchhoff matrix. When Scipy is not found,
        :func:`numpy.linalg.eigh` is used.

        :arg n_modes: number of non-zero eigenvalues/vectors to calculate.
              If ``None`` is given, all modes will be calculated.
        :type n_modes: int or None, default is 20

        :arg zeros: If ``True``, modes with zero eigenvalues will be kept.
        :type zeros: bool, default is ``False``

        :arg turbo: Use a memory intensive, but faster way to calculate modes.
        :type turbo: bool, default is ``True``

        :arg hinges: Identify hinge sites after modes are computed.
        :type hinges: bool, default is ``True``
        """

        if self._kirchhoff is None:
            raise ValueError('Kirchhoff matrix is not built or set')
        assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, \
            'n_modes must be a positive integer'
        assert isinstance(zeros, bool), 'zeros must be a boolean'
        assert isinstance(turbo, bool), 'turbo must be a boolean'
        linalg = importLA()
        start = time.time()
        shift = 0
        if linalg.__package__.startswith('scipy'):
            if n_modes is None:
                eigvals = None
                n_modes = self._dof
            else:
                if n_modes >= self._dof:
                    eigvals = None
                    n_modes = self._dof
                else:
                    eigvals = (0, n_modes + shift)
            if eigvals:
                turbo = False
            if isinstance(self._kirchhoff, np.ndarray):
                values, vectors = linalg.eigh(self._kirchhoff, turbo=turbo,
                                              eigvals=eigvals)
            else:
                try:
                    from scipy.sparse import linalg as scipy_sparse_la
                except ImportError:
                    raise ImportError('failed to import scipy.sparse.linalg, '
                                      'which is required for sparse matrix '
                                      'decomposition')
                try:
                    values, vectors = (
                        scipy_sparse_la.eigsh(self._kirchhoff,
                                              k=n_modes + 1, which='SA'))
                except:
                    values, vectors = (
                        scipy_sparse_la.eigen_symmetric(self._kirchhoff,
                                                        k=n_modes + 1,
                                                        which='SA'))
        else:
            if n_modes is not None:
                LOGGER.info('Scipy is not found, all modes are calculated.')
            values, vectors = linalg.eigh(self._kirchhoff)
        n_zeros = sum(values < ZERO)
        if n_zeros < 1:
            LOGGER.warning('Fewer than 1 zero eigenvalue was calculated.')
            shift = n_zeros - 1
        elif n_zeros > 1:
            LOGGER.warning('More than 1 zero eigenvalue was calculated.')
            shift = n_zeros - 1
        if zeros:
            shift = -1
        self._eigvals = values[1+shift:]
        self._vars = 1 / self._eigvals
        self._trace = self._vars.sum()
        self._array = vectors[:, 1+shift:]
        self._n_modes = len(self._eigvals)
        if hinges:
            self.calcHinges()
        LOGGER.debug('{0} modes were calculated in {1:.2f}s.'
                     .format(self._n_modes, time.time()-start))
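A hedged usage sketch showing the method above in a typical GNM workflow; the PDB identifier is illustrative and the hinges keyword matches the variant shown here:

from prody import parsePDB, GNM

calphas = parsePDB('1ubi', subset='ca')   # C-alpha-only model
gnm = GNM('1ubi')
gnm.buildKirchhoff(calphas)               # default 10 A cutoff
gnm.calcModes(n_modes=10, zeros=False, hinges=True)
print(gnm.getEigvals()[:5])               # smallest non-zero eigenvalues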
Ejemplo n.º 49
0
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request('http://hmmer.janelia.org/search/hmmscan',
                                  enc_params)

        url = urllib2.urlopen(request).geturl() + '?output=xml'
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        matches = {}
        for child in root[0]:
            if child.tag == 'hits':
                accession = child.get('acc')
                pfam_id = accession.split('.')[0]
                matches[pfam_id] = {}
                matches[pfam_id]['accession'] = accession
                matches[pfam_id]['class'] = 'Domain'
                matches[pfam_id]['id'] = child.get('name')
                matches[pfam_id]['locations'] = {}
                matches[pfam_id]['locations']['ali_end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['ali_start'] = child[0].get('alisqfrom')
                matches[pfam_id]['locations']['bitscore'] = child[0].get('bitscore')
                matches[pfam_id]['locations']['end'] = child[0].get('alisqto')
                matches[pfam_id]['locations']['evalue'] = child.get('evalue')
                matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
                matches[pfam_id]['locations']['hmm_end'] = child[0].get('alihmmto')
                matches[pfam_id]['locations']['hmm_start'] = child[0].get('alihmmfrom')
                matches[pfam_id]['locations']['significant'] = child[0].get('significant')
                matches[pfam_id]['locations']['start'] = child[0].get('alisqfrom')
                matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.xfam.org/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
Ejemplo n.º 50
0
def fetchPDB(*pdb, **kwargs):
    """Returns path(s) to PDB file(s) for specified *pdb* identifier(s).  Files
    will be sought in user specified *folder* or current working director, and
    then in local PDB folder and mirror, if they are available.  If *copy*
    is set **True**, files will be copied into *folder*.  If *compressed* is
    **False**, all files will be decompressed.  See :func:`pathPDBFolder` and
    :func:`pathPDBMirror` for managing local resources, :func:`.fetchPDBviaFTP`
    and :func:`.fetchPDBviaFTP` for downloading files from PDB servers."""

    if len(pdb) == 1 and isinstance(pdb[0], list):
        pdb = pdb[0]

    if 'format' in kwargs and kwargs.get('format') != 'pdb':
        return fetchPDBviaFTP(*pdb, **kwargs)

    identifiers = checkIdentifiers(*pdb)

    folder = kwargs.get('folder', '.')
    compressed = kwargs.get('compressed')

    # check *folder* specified by the user, usually pwd ('.')
    filedict = findPDBFiles(folder, compressed=compressed)

    filenames = []
    not_found = []
    exists = 0
    for i, pdb in enumerate(identifiers):
        if pdb is None:
            filenames.append(None)
        elif pdb in filedict:
            filenames.append(filedict[pdb])
            exists += 1
        else:
            filenames.append(None)
            not_found.append((i, pdb))

    if not not_found:
        if len(filenames) == 1:
            filenames = filenames[0]
            if exists:
                LOGGER.debug('PDB file is found in working directory ({0}).'
                             .format(sympath(filenames)))
        return filenames

    if not isWritable(folder):
        raise IOError('permission to write in {0} is denied, please '
                      'specify another folder'.format(folder))

    if compressed is not None and not compressed:
        filedict = findPDBFiles(folder, compressed=True)
        not_found, decompress = [], not_found
        for i, pdb in decompress:
            if pdb in filedict:
                fn = filedict[pdb]
                filenames[i] = gunzip(fn, splitext(fn)[0])
            else:
                not_found.append((i, pdb))

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    local_folder = pathPDBFolder()
    copy = kwargs.setdefault('copy', False)
    if local_folder:
        local_folder, is_divided = local_folder
        temp, not_found = not_found, []
        for i, pdb in temp:
            if is_divided:
                fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz')
            else:
                fn = join(local_folder, pdb + '.pdb.gz')
            if isfile(fn):
                if copy or not compressed and compressed is not None:
                    if compressed:
                        fn = copyFile(fn, join(folder, pdb + '.pdb.gz'))
                    else:
                        fn = gunzip(fn, join(folder, pdb + '.pdb'))
                filenames[i] = normpath(fn)
            else:
                not_found.append((i, pdb))

    if not not_found:
        if len(identifiers) == 1:
            fn = filenames[0]
            if kwargs.get('report', True):
                items = fn.split(pathsep)
                if len(items) > 5:
                    fndisp = pathsep.join(items[:3] + ['...'] + items[-1:])
                else:
                    fndisp = relpath(fn)
                LOGGER.debug('PDB file is found in the local folder ({0}).'
                             .format(fndisp))
            return fn
        else:
            return filenames

    if kwargs['copy'] or (compressed is not None and not compressed):
        kwargs['folder'] = folder

    downloads = [pdb for i, pdb in not_found]
    fns = None

    try:
        fns = fetchPDBfromMirror(*downloads, **kwargs)
    except IOError:
        pass
    else:
        if len(downloads) == 1: fns = [fns]
        temp, not_found = not_found, []
        for i, fn in enumerate(fns):
            if fn is None:
                not_found.append(temp[i])
            else:
                i, _ = temp[i]
                filenames[i] = fn

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    if fns:
        downloads = [pdb for i, pdb in not_found]
    fns = None
    try:
        fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
    except Exception as err:
        LOGGER.warn('Downloading PDB files via FTP failed ({0}), '
                    'trying HTTP.'.format(str(err)))
        try:
            fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via HTTP also failed '
                        '({0}).'.format(str(err)))
    if len(downloads) == 1: fns = [fns]
    if fns:
        for i, fn in zip([i for i, pdb in not_found], fns):
            filenames[i] = fn

    return filenames[0] if len(identifiers) == 1 else filenames
Ejemplo n.º 51
0
def fetchPDBviaHTTP(*pdb, **kwargs):
    """Retrieve PDB file(s) for specified *pdb* identifier(s) and return
    path(s).  Downloaded files will be stored in local PDB folder, if one
    is set using :meth:`.pathPDBFolder`, and copied into *folder*, if
    specified by the user.  If no destination folder is specified, files
    will be saved in the current working directory.  If *compressed* is
    **False**, decompressed files will be copied into *folder*."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))

    extension = '.pdb'
    local_folder = pathPDBFolder()
    if local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(filename,
                            join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(filename,
                            join(output_folder, pdb + extension))

    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))


    getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us']

    success = 0
    failure = 0
    filenames = []
    for pdb in identifiers:
        if pdb is None:
            filenames.append(None)
            continue
        try:
            handle = openURL(getURL(pdb))
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(pdb, str(err)))
            failure += 1
            filenames.append(None)
        else:
            data = handle.read()
            if len(data):
                filename = getPath(pdb)

                with open(filename, 'w+b') as pdbfile:
                    pdbfile.write(data)

                filename = normpath(relpath(second(filename, pdb)))
                LOGGER.debug('{0} downloaded ({1})'
                             .format(pdb, sympath(filename)))
                success += 1
                filenames.append(filename)
            else:
                LOGGER.warn('{0} download failed, reason unknown.'
                            .format(pdb))
                failure += 1
                filenames.append(None)
    LOGGER.debug('PDB download via HTTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
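A brief usage sketch; identifiers are illustrative and HTTP access to the configured wwPDB server is assumed:

# download two entries and decompress them into the current directory
paths = fetchPDBviaHTTP('1ubi', '2k39', compressed=False, folder='.')
print(paths)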
Ejemplo n.º 52
0
def mapOntoChain(atoms, chain, **kwargs):
    """Map *atoms* onto *chain*.  This function returns a list of mappings.
    Each mapping is a tuple that contains 4 items:

      * Mapped chain as an :class:`.AtomMap` instance,
      * *chain* as an :class:`.AtomMap` instance,
      * Percent sequence identity,
      * Percent sequence overlap

    Mappings are returned in decreasing percent sequence identity order.
    :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms
    in place of unmapped atoms.

    :arg atoms: atoms that will be mapped to the target *chain*
    :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg chain: chain to which atoms will be mapped
    :type chain: :class:`.Chain`

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    :keyword fast: skip verbose logging and just return the sequence identity.
    :type fast: bool

    This function tries to map *atoms* to *chain* based on residue
    numbers and types. Each individual chain in *atoms* is compared to
    target *chain*. This works well for different structures of the same
    protein. When it fails, :mod:`Bio.pairwise2` is used for sequence
    alignment, and mapping is performed based on the sequence alignment.
    Users can control whether sequence alignment is performed with the
    *pwalign* keyword.  If ``pwalign=True`` is passed, pairwise alignment is
    enforced."""

    target_chain = chain
    if not isinstance(atoms, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms must be an AtomGroup, a Chain, or a '
                        'Selection instance')
    if not isinstance(target_chain, Chain):
        raise TypeError('chain must be Chain instance')

    subset = str(kwargs.get('subset', 'calpha')).lower()
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'.format(
            str(subset)))
    seqid = kwargs.get('seqid', 90.)
    coverage = kwargs.get('overlap')
    if coverage is None:
        coverage = kwargs.get('coverage', 70.)
    pwalign = kwargs.get('pwalign', None)
    fast = kwargs.get('fast', False)

    if isinstance(atoms, Chain):
        chains = [atoms]
        map_ag = atoms.getAtomGroup()
    else:
        if isinstance(atoms, AtomGroup):
            map_ag = atoms
        else:
            map_ag = atoms.getAtomGroup()
        chains = list(atoms.getHierView().iterChains())
        LOGGER.debug('Evaluating {0}: {1} chains are identified'.format(
            str(atoms), len(chains)))

    if subset != 'all':
        target_chain = target_chain.select(subset).getHierView()[
            target_chain.getChid()]

    mappings = []
    unmapped = []
    target_ag = target_chain.getAtomGroup()
    simple_target = SimpleChain(target_chain, True)
    if fast is False:
        LOGGER.debug('Trying to map atoms based on residue numbers and '
                     'identities:')
    for chain in chains:
        simple_chain = SimpleChain(True)
        simple_chain.buildFromChain(chain)
        if len(simple_chain) == 0:
            if fast is False:
                LOGGER.debug(
                    '  Skipping {0}, which does not contain any amino '
                    'acid residues.'.format(simple_chain))
            continue
        if fast is False:
            LOGGER.debug('  Comparing {0} (len={1}) with {2}:'.format(
                simple_chain.getTitle(), len(simple_chain),
                simple_target.getTitle()))

        target_list, chain_list, n_match, n_mapped = getTrivialMapping(
            simple_target, simple_chain)
        if n_mapped > 0:
            _seqid = n_match * 100 / n_mapped
            _cover = n_mapped * 100 / max(len(simple_target),
                                          len(simple_chain))
        else:
            _seqid = 0
            _cover = 0

        if _seqid >= seqid and _cover >= coverage:
            if fast is False:
                LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% '
                             'sequence identity and {2:.0f}% overlap.'.format(
                                 n_mapped, _seqid, _cover))
            mappings.append((target_list, chain_list, _seqid, _cover))
        else:
            if fast is False:
                LOGGER.debug(
                    '\tFailed to match chains based on residue numbers '
                    '(seqid={0:.0f}%, overlap={1:.0f}%).'.format(
                        _seqid, _cover))
            unmapped.append(simple_chain)

    if pwalign or (not mappings and (pwalign is None or pwalign)):
        LOGGER.debug(
            'Trying to map atoms based on {0} sequence alignment:'.format(
                ALIGNMENT_METHOD))
        for simple_chain in unmapped:
            LOGGER.debug('  Comparing {0} (len={1}) with {2}:'.format(
                simple_chain.getTitle(), len(simple_chain),
                simple_target.getTitle()))
            result = getAlignedMapping(simple_target, simple_chain)
            if result is not None:
                target_list, chain_list, n_match, n_mapped = result
                if n_mapped > 0:
                    _seqid = n_match * 100 / n_mapped
                    _cover = n_mapped * 100 / max(len(simple_target),
                                                  len(simple_chain))
                else:
                    _seqid = 0
                    _cover = 0
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug(
                        '\tMapped: {0} residues match with {1:.0f}%'
                        ' sequence identity and {2:.0f}% overlap.'.format(
                            n_mapped, _seqid, _cover))
                    mappings.append((target_list, chain_list, _seqid, _cover))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'.format(_seqid, _cover))

    for mi, result in enumerate(mappings):
        residues_target, residues_chain, _seqid, _cover = result
        indices_target = []
        indices_chain = []
        indices_mapping = []
        indices_dummies = []
        counter = 0
        for i in range(len(residues_target)):
            res_tar = residues_target[i]
            res_chn = residues_chain[i]

            for atom_tar in res_tar:
                indices_target.append(atom_tar.getIndex())
                if res_chn is not None:
                    atom_chn = res_chn.getAtom(atom_tar.getName())
                    if atom_chn is not None:
                        indices_chain.append(atom_chn.getIndex())
                        indices_mapping.append(counter)
                    else:
                        indices_dummies.append(counter)
                else:
                    indices_dummies.append(counter)
                counter += 1
        #n_atoms = len(indices_target)

        ch_tar = next((r for r in residues_target if r is not None)).getChain()
        ch_chn = next((r for r in residues_chain if r is not None)).getChain()
        title_tar = 'Chain {0} from {1}'.format(
            ch_tar.getChid(),
            ch_tar.getAtomGroup().getTitle())
        title_chn = 'Chain {0} from {1}'.format(
            ch_chn.getChid(),
            ch_chn.getAtomGroup().getTitle())

        atommap = AM(map_ag,
                     indices_chain,
                     chain.getACSIndex(),
                     mapping=indices_mapping,
                     dummies=indices_dummies,
                     title=title_chn + ' -> ' + title_tar)
        selection = AM(target_ag,
                       indices_target,
                       target_chain.getACSIndex(),
                       title=title_tar + ' -> ' + title_chn,
                       intarrays=True)

        mappings[mi] = (atommap, selection, _seqid, _cover)
    if len(mappings) > 1:
        # sort by percent sequence identity; a key-based sort replaces the
        # Python 2 only cmp-based sort
        mappings.sort(key=lambda m: m[2], reverse=True)
    return mappings
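A hedged usage sketch for mapOntoChain, assuming two related structures have been parsed; identifiers and the chain ID are illustrative:

from prody import parsePDB

target = parsePDB('1p38')                  # reference structure
mobile = parsePDB('1zz2')                  # homologous structure (illustrative)

# map mobile atoms onto chain A of the target using the C-alpha subset
maps = mapOntoChain(mobile, target.getHierView()['A'], seqid=70, overlap=70)
if maps:
    atommap, selection, seqid_, overlap_ = maps[0]
    print(atommap.numAtoms(), seqid_, overlap_)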
Ejemplo n.º 53
0
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    result = root[0].get('id')
    return result
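A minimal usage sketch; the accession is illustrative and network access to the Pfam server is assumed:

# map a UniProt accession to its entry identifier via the Pfam protein page
entry_id = searchUniprotID('P04637')
print(entry_id)   # e.g. an entry name such as 'P53_HUMAN' (illustrative)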