Beispiel #1
0
def loadPDBClusters(sqid=None):
    """Load previously fetched PDB sequence clusters from disk to memory.

    :arg sqid: sequence identity level whose clusters should be loaded, must
        be one of the keys of ``PDB_CLUSTERS``.  If **None**, clusters for
        every key of ``PDB_CLUSTERS`` are loaded
    :type sqid: int, None

    :returns: the module-level ``PDB_CLUSTERS`` mapping when *sqid* is
        **None**, otherwise the list of clusters loaded for *sqid*.  Each
        cluster is a list of tuples obtained by splitting every
        whitespace-separated item of a line on ``'_'``

    :raises ValueError: if *sqid* is not a key of ``PDB_CLUSTERS``

    Cluster files that are missing on disk are fetched with
    :func:`fetchPDBClusters` first.  A warning is logged (once) when a
    cluster file is more than a week old."""

    global PDB_CLUSTERS_UPDATE_WARNING

    seconds_per_week = 604800.
    clusters_path = os.path.join(getPackagePath(), 'pdbclusters')
    if sqid is None:
        sqid_list = list(PDB_CLUSTERS)
        LOGGER.info('Loading all PDB sequence clusters.')
    else:
        assert isinstance(sqid, Integral), 'sqid must be an integer'
        if sqid not in PDB_CLUSTERS:
            raise ValueError('PDB cluster data is not available for sequence '
                             'identity {0}%, try one of {1}'.format(
                                 sqid, PDB_CLUSTERS_SQID_STR))
        LOGGER.info('Loading PDB sequence clusters for sequence identity '
                    '{0}.'.format(sqid))
        sqid_list = [sqid]

    clusters = None
    # A separate loop variable is used on purpose: rebinding *sqid* here
    # would make the ``sqid is None`` test below fail for "load all" calls.
    for level in sqid_list:
        filename = os.path.join(clusters_path,
                                'bc-{0}.out.gz'.format(level))
        if not os.path.isfile(filename):
            fetchPDBClusters(level)

        if PDB_CLUSTERS_UPDATE_WARNING:
            import time
            diff = (time.time() - os.path.getmtime(filename)) / seconds_per_week
            if diff > 1.:
                LOGGER.warning(
                    'PDB sequence clusters are {0:.1f} week(s) old,'
                    ' call `fetchPDBClusters` to receive updates.'.format(
                        diff))
                # warn only once per session
                PDB_CLUSTERS_UPDATE_WARNING = False

        inp = openFile(filename)
        try:
            clusters_str = pystr(inp.read())
        finally:
            # release the handle even if reading/decoding fails
            inp.close()

        # one cluster per non-blank line; items look like '<id>_<chain>'
        clusters = [
            [tuple(item.split('_')) for item in line.split()]
            for line in clusters_str.split('\n')
            if line.strip()
        ]
        PDB_CLUSTERS[level] = clusters

    if sqid is None:
        return PDB_CLUSTERS
    return clusters
Beispiel #2
0
def parseSTAR(filename, **kwargs):
    """Returns a dictionary-like :class:`StarDict` containing data
    parsed from a Relion STAR file.

    :arg filename: a filename
        The .star extension can be omitted.
    :type filename: str

    :arg start: line number for starting
        Default is **None**, meaning start at the beginning
    :type start: int, None

    :arg stop: line number for stopping
        Default is **None**, meaning don't stop.
    :type stop: int, None

    :arg shlex: whether to use shlex for splitting lines so as to preserve quoted substrings
        Default is **False**
    :type shlex: bool

    :raises IOError: if neither *filename* nor *filename* + ``'.star'``
        is an existing file
    :raises TypeError: if *start* or *stop* is neither an integer nor
        **None**, or if *shlex* is not a boolean

    All keyword arguments are forwarded unchanged to ``parseSTARLines``.
    """
    # Resolve the actual path once, so that the file that was checked for
    # existence is also the file that gets opened.
    if not os.path.isfile(filename):
        if os.path.isfile(filename + '.star'):
            filename = filename + '.star'
        else:
            raise IOError('There is no file called {0}.'.format(filename))

    start = kwargs.get('start', None)
    if start is not None and not isinstance(start, Integral):
        raise TypeError('start should be an integer or None')

    stop = kwargs.get('stop', None)
    if stop is not None and not isinstance(stop, Integral):
        raise TypeError('stop should be an integer or None')

    shlex = kwargs.get('shlex', False)
    if not isinstance(shlex, bool):
        raise TypeError('shlex should be a boolean')

    starfile = openFile(filename, 'r')
    try:
        lines = [pystr(line) for line in starfile.readlines()]
    finally:
        # close the handle even if reading/decoding fails
        starfile.close()

    parsingDict, prog = parseSTARLines(lines, **kwargs)

    return StarDict(parsingDict, prog, filename)
Beispiel #3
0
def buildPDBEnsemble(atomics,
                     ref=None,
                     title='Unknown',
                     labels=None,
                     atommaps=None,
                     unmapped=None,
                     **kwargs):
    """Assemble a :class:`.PDBEnsemble` by mapping every structure in
    *atomics* onto a reference and adding the resulting atom maps as
    coordinate sets.  The reference itself should be part of *atomics*.

    :arg atomics: structures (:class:`.Atomic` instances) to be mapped;
        **None** entries are recorded as unmapped
    :type atomics: list

    :arg ref: the reference structure, or its index in *atomics*.  When
        **None** the first item of *atomics* is the reference.  When a
        :class:`.PDBEnsemble` is given, the mapped structures are appended
        to that ensemble.  Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or :class:`.AtomGroup`

    :arg title: title of a newly created ensemble
    :type title: str

    :arg labels: one label per item of *atomics*; titles of the structures
        are used when omitted
    :type labels: list

    :arg degeneracy: passed on to :meth:`.PDBEnsemble.addCoordset`; add only
        the active coordinate set (**True**) or all coordinate sets
        (**False**) of each structure.  Default is **True**
    :type degeneracy: bool

    :arg occupancy: if given, the ensemble is trimmed with
        :func:`.trimPDBEnsemble` using this minimal column occupancy
        (range from 0 to 1)
    :type occupancy: float

    :arg atommaps: output argument, extended with the atom maps that were
        added to the ensemble
    :type atommaps: list

    :arg unmapped: output argument, extended with the labels of structures
        that could not be mapped
    :type unmapped: list

    :arg subset: selection string applied to the reference and to every
        structure before mapping.  Default is ``"all"``
    :type subset: str

    :arg superpose: ``'iter'`` uses :func:`.PDBEnsemble.iterpose`, ``False``
        skips superposition, anything else uses
        :func:`.PDBEnsemble.superpose`.  The keyword *iterpose* is accepted
        as an alias and takes precedence.  Default is ``'iter'``
    :type superpose: str, bool

    :arg debug: optional dictionary that receives one sub-dictionary per
        label, which is handed to :func:`.alignChains`
    :type debug: dict

    Remaining keyword arguments are forwarded to :func:`.alignChains`.
    """

    # options consumed here; 'subset' is only read, so it stays in kwargs
    # and is also forwarded to alignChains
    subset = str(kwargs.get('subset', 'all')).lower()
    min_occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    superpose = kwargs.pop('iterpose', kwargs.pop('superpose', 'iter'))
    debug = kwargs.pop('debug', {})

    if 'mapping_func' in kwargs:
        raise DeprecationWarning(
            'mapping_func is deprecated. Please see release notes for '
            'more details: http://prody.csb.pitt.edu/manual/release/v1.11_series.html'
        )
    t_begin = time.time()

    if not isListLike(atomics):
        raise TypeError('atomics should be list-like')

    if degeneracy is True and len(atomics) == 1:
        raise ValueError('atomics should have at least two items')

    if labels is None:
        # fall back to structure titles, keeping None placeholders
        labels = [None if item is None else item.getTitle()
                  for item in atomics]
    elif len(labels) != len(atomics):
        raise TypeError('Labels and atomics must have the same lengths.')

    # resolve the reference structure
    extending = isinstance(ref, PDBEnsemble)
    if ref is None:
        target = atomics[0]
    elif isinstance(ref, Integral):
        target = atomics[ref]
    elif extending:
        target = ref._atoms
    else:
        target = ref

    # set up the ensemble, with reference atoms and coordinates if possible
    if extending:
        ensemble = ref
        # NOTE(review): the reference coordinates of an existing ensemble
        # are treated as unset, so the first atom map below replaces them.
        isrefset = False
    else:
        # restrict the reference up front so each mapping works on fewer atoms
        if subset != 'all':
            target = target.select(subset)
        ensemble = PDBEnsemble(title)
        if isinstance(target, Atomic):
            ensemble.setAtoms(target)
            ensemble.setCoords(target.getCoords())
            isrefset = True
        else:
            ensemble._n_atoms = len(target)
            isrefset = False

    # output containers
    if unmapped is None:
        unmapped = []
    if atommaps is None:
        atommaps = []

    LOGGER.progress('Building the ensemble...', len(atomics),
                    '_prody_buildPDBEnsemble')
    for i, atoms in enumerate(atomics):
        label = labels[i]
        if atoms is None:
            unmapped.append(label)
            continue

        LOGGER.update(i,
                      'Mapping %s to the reference...' % atoms.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            atoms.getHierView()
        except AttributeError:
            raise TypeError(
                'atomics must be a list of instances having the access to getHierView'
            )

        if subset != 'all':
            atoms = atoms.select(subset)

        # map the chains of this structure onto those of the reference
        debug[label] = {}
        found = alignChains(atoms, target, debug=debug[label], **kwargs)

        if len(found) == 0:
            unmapped.append(label)
            continue
        atommaps.extend(found)

        # several maps for one structure are told apart by their chain IDs
        several = len(found) > 1
        for atommap in found:
            lbl = pystr(label)
            if several:
                lbl += '_%s' % ''.join(np.unique(atommap.getChids()))
            ensemble.addCoordset(atommap,
                                 weights=atommap.getFlags('mapped'),
                                 label=lbl,
                                 degeneracy=degeneracy)

            if not isrefset:
                ensemble.setCoords(atommap.getCoords())
                isrefset = True

    LOGGER.finish()

    if min_occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=min_occupancy)

    if superpose == 'iter':
        ensemble.iterpose()
    elif superpose is not False:
        ensemble.superpose()

    LOGGER.info('Ensemble ({0} conformations) were built in {1:.2f}s.'.format(
        ensemble.numConfs(),
        time.time() - t_begin))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))
    return ensemble
Beispiel #4
0
def refineMSA(msa,
              index=None,
              label=None,
              rowocc=None,
              seqid=None,
              colocc=None,
              **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg index: remove columns that are gaps in the sequence with that index
    :type index: int

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable.  Ignored when *index* is given
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    :returns: a refined character array when *msa* is a character array,
        otherwise a new :class:`.MSA` instance

    :raises TypeError: if *msa* is neither a character array nor an MSA
        instance, if *label* is not a string or is used with a plain array,
        or if *rowocc*/*colocc* cannot be converted to float
    :raises ValueError: if *msa* is not a 2D ``'|S1'`` array, if *label*
        cannot be resolved to a single sequence, or if no refinement
        criterion is given
    :raises IOError: if the PDB entry named by *label* cannot be parsed

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    The order of refinements are applied in the order of arguments.  If *label*
    and *seqid* are specified, sequence matching *label* will
    be kept in the refined :class:`.MSA` although it may be similar to some
    other sequence."""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    # descriptions of the applied refinements, used in the new MSA title
    title = []
    cols = None

    if index is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('index=' + str(index))
        # the remaining '%.2fs' placeholder is left for LOGGER.report
        LOGGER.report(
            'Index refinement reduced number of columns from {0} to '
            '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

    if label is not None:
        if index is not None:
            LOGGER.info('An index was provided so the label will be ignored.')

        else:
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            try:
                upper, lower = label.upper(), label.lower()
            except AttributeError:
                raise TypeError('label must be a string')

            if msa is None:
                raise TypeError('msa must be an MSA instance, '
                                'label cannot be used')

            # try the label as given, then upper and lower case variants
            index = msa.getIndex(label)
            if index is None:
                index = msa.getIndex(upper)
            if index is None:
                index = msa.getIndex(lower)

            chain = None
            if index is None and (len(label) == 4 or len(label) == 5):
                # treat the label as a PDB identifier (+ optional chain ID)
                from prody import parsePDB
                try:
                    structure, header = parsePDB(label[:4], header=True)
                except Exception as err:
                    raise IOError(
                        'failed to parse header for {0} ({1})'.format(
                            label[:4], str(err)))

                chid = label[4:].upper()
                matched = None
                for poly in header['polymers']:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                            dbref.database, dbref.idcode,
                                            label[:4], poly.chid,
                                            str(msa)))
                            break
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                            dbref.database,
                                            dbref.accession, label[:4],
                                            poly.chid, str(msa)))
                            break
                    if index is not None:
                        # remember the polymer that matched and stop; letting
                        # the loop run on would leave ``poly`` bound to the
                        # last polymer and select the wrong chain below
                        matched = poly
                        break
                if matched is not None:
                    chain = structure[matched.chid]
                    resnums = chain.ca.getResnums()

            if index is None:
                raise ValueError('label is not in msa, or msa is not indexed')
            # a sized index means the label matched more than one sequence
            try:
                len(index)
            except TypeError:
                pass
            else:
                raise ValueError(
                    'label {0} maps onto multiple sequences, '
                    'so cannot be used for refinement'.format(label))

            title.append('label=' + label)
            cols = char.isalpha(arr[index]).nonzero()[0]
            arr = arr.take(cols, 1)
            LOGGER.report(
                'Label refinement reduced number of columns from {0} to '
                '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

            if chain is not None and not kwargs.get('keep', False):
                # drop columns of residues that are absent from the structure
                before = arr.shape[1]
                LOGGER.timeit('_refine')

                from Bio import pairwise2
                from prody.utilities import MATCH_SCORE, MISMATCH_SCORE
                from prody.utilities import GAP_PENALTY, GAP_EXT_PENALTY

                chseq = chain.getSequence()
                # ndarray.tostring() was removed in NumPy 2.0; tobytes() is
                # the equivalent replacement
                algn = pairwise2.align.localms(
                    pystr(arr[index].tobytes().upper()),
                    pystr(chseq),
                    MATCH_SCORE,
                    MISMATCH_SCORE,
                    GAP_PENALTY,
                    GAP_EXT_PENALTY,
                    one_alignment_only=1)
                # one flag per MSA residue: True if aligned to a chain residue
                torf = []
                for s, c in zip(*algn[0][:2]):
                    if s == '-':
                        continue
                    elif c != '-':
                        torf.append(True)
                    else:
                        torf.append(False)
                torf = array(torf)
                tsum = torf.sum()
                assert tsum <= before, 'problem in mapping sequence to structure'
                if tsum < before:
                    arr = arr.take(torf.nonzero()[0], 1)
                    # NOTE(review): the "+ 1" offset looks suspicious and may
                    # index past the intended residue -- confirm
                    resnums = resnums.take(torf.nonzero()[0] -
                                           torf.nonzero()[0][0] + 1)
                    LOGGER.report(
                        'Structure refinement reduced number of '
                        'columns from {0} to {1} in %.2fs.'.format(
                            before, arr.shape[1]), '_refine')
                else:
                    LOGGER.debug(
                        'All residues in the sequence are contained in '
                        'PDB structure {0}.'.format(label))

                # NOTE: this rewrites the label in the input MSA's own label
                # list, i.e. the argument *msa* is modified in place
                labels = msa._labels
                labels[index] = splitSeqLabel(labels[index])[0] + '/' + str(
                    resnums[0]) + '-' + str(resnums[-1])

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            # shift the reference index by the number of rows kept before it
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report(
            'Row occupancy refinement reduced number of rows from '
            '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            # always keep the reference sequence
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report(
            'Sequence identity refinement reduced number of rows '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
            '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report(
            'Column occupancy refinement reduced number of columns '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
            '_refine')

    if not title:
        raise ValueError(
            'label, index, seqid, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            from copy import copy
            labels = copy(msa._labels)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
        return MSA(arr,
                   title=msa.getTitle() +
                   ' refined ({0})'.format(', '.join(title)),
                   labels=labels)