Example #1
def resetTicks(x, y=None):
    """Reset X (and Y) axis ticks using values in given *array*.  Ticks in the
    current figure should not be fractional values for this function to work as
    expected."""

    import matplotlib.pyplot as plt
    if x is not None:
        try:
            xticks = plt.xticks()[0]
            xlist = list(xticks.astype(int))
            if xlist[-1] >= len(x):
                xlist.pop()
            if xlist:
                xlist = list(x[xlist])
                plt.xticks(xticks, xlist + [''] * (len(xticks) - len(xlist)))
        except:
            LOGGER.warning('xticks could not be reset.')
    if y is not None:
        try:
            yticks = plt.yticks()[0]
            ylist = list(yticks.astype(int))
            if ylist[-1] >= len(y):
                ylist.pop()
            if ylist:
                ylist = list(y[ylist])
                plt.yticks(yticks, ylist + [''] * (len(yticks) - len(ylist)))
        except:
            LOGGER.warning('yticks could not be reset.')
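A minimal usage sketch (not from the source), assuming resetTicks is importable from the module that defines it along with LOGGER:

import numpy as np
import matplotlib.pyplot as plt

resnums = np.arange(100, 200)      # residue numbers to show on the x-axis
plt.plot(np.random.rand(100))
plt.xticks(np.arange(0, 100, 20))  # integer tick positions within range
resetTicks(resnums)                # relabels the ticks 100, 120, ..., 180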
Example #2
def loadAtoms(filename):
    """Return :class:`AtomGroup` instance from *filename*.  This function makes
    use of :func:`numpy.load` function.  See also :func:`saveAtoms`."""
    
    LOGGER.timeit()
    attr_dict = load(filename)
    files = set(attr_dict.files)
    if 'n_atoms' not in files:
        raise ValueError("'{0:s}' is not a valid atomic data file"
                         .format(filename))
    title = str(attr_dict['title'])
    ag = AtomGroup(title)
    if 'coordinates' in files:
        ag._n_csets = int(attr_dict['n_csets'])
        ag._coords = attr_dict['coordinates']
    ag._n_atoms = int(attr_dict['n_atoms'])
    ag._setTimeStamp()
    if 'bonds' in files and 'bmap' in files and 'numbonds' in files:
        ag._bonds = attr_dict['bonds']
        ag._bmap = attr_dict['bmap']
        ag._data['numbonds'] = attr_dict['numbonds']
    for key, data in attr_dict.items():
        if key in SKIPLOAD:
            continue
        if key in ATOMIC_ATTRIBUTES:
            ag._data[key] = data
        else:
            ag.setData(key, data)
    if ag.numCoordsets() > 0:
        ag._acsi = 0
    if 'cslabels' in files:
        ag.setCSLabels(list(attr_dict['cslabels']))
    LOGGER.timing('Atom group was loaded in %.2fs.')
    return ag
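A usage sketch assuming ProDy's top-level API; saveAtoms is the counterpart referenced in the docstring:

from prody import parsePDB, saveAtoms, loadAtoms

ag = parsePDB('1ubi')        # fetches and parses ubiquitin (network access)
filename = saveAtoms(ag)     # writes e.g. '1ubi.ag.npz' and returns the name
ag2 = loadAtoms(filename)
assert ag2.numAtoms() == ag.numAtoms()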
Example #3
    def _superpose(self, **kwargs):
        """Superpose conformations and update coordinates."""

        calcT = getTransformation
        if kwargs.get('trans', False):
            if self._trans is not None:
                LOGGER.info('Existing transformations will be overwritten.')
            trans = np.zeros((self._n_csets, 4, 4))
        else:
            trans = None
        indices = self._indices
        if indices is None:
            weights = self._weights
            coords = self._coords
            confs = self._confs
            confs_selected = self._confs
        else:
            weights = self._weights[:, indices]
            coords = self._coords[indices]
            confs = self._confs
            confs_selected = self._confs[:, indices]

        for i, conf in enumerate(confs_selected):
            rmat, tvec = calcT(conf, coords, weights[i])
            if trans is not None:
                trans[i][:3, :3] = rmat
                trans[i][:3, 3] = tvec
            confs[i] = tvec + np.dot(confs[i], rmat.T)
        self._trans = trans
Example #4
def psiBlastRun(sequence, cycles=2, filename=None, **kwargs):
    """Returns the results from a full PSI-BLAST run (multiple cycles).
    All arguments are the same as for psiBlastCycle and are passed to it,
    except for *cycles*.

    :arg cycles: the number of cycles to run
        default is 2
    :type cycles: int
    """
    psithr = kwargs.get('psithr', 1.0e-3)
    job_id = kwargs.get('previousjobid', '')
    selectedHits = kwargs.get('selectedHits', '')

    cycles_done = 0
    results_list = []
    job_ids = []
    while cycles_done < cycles:
        if cycles_done > 0:
            selectedHits = ('http://www.ebi.ac.uk/Tools/services/rest/psiblast/result/'
                            + job_id + '/preselected_seq')
            sequence = None
        job_id, results, sequence = psiBlastCycle(sequence, filename,
                                                  previousjobid=job_id,
                                                  selectedHits=selectedHits,
                                                  cycle=cycles_done, **kwargs)
        results_list.append(results)
        job_ids.append(job_id)
        cycles_done += 1
        LOGGER.info('Finished cycle {0} with job ID {1}.'.format(cycles_done, job_id))

    return job_ids, results_list, sequence
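A hypothetical run (network I/O against the EBI REST service); assumes psiBlastCycle is importable from the same module, with remaining keyword arguments forwarded to it:

sequence = 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQ'
job_ids, results_list, final_seq = psiBlastRun(sequence, cycles=3)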
Example #5
def alignCoordsets(atoms, weights=None):
    """Return *atoms* after superposing coordinate sets onto its active 
    coordinate set.  Transformations will be calculated for *atoms* and 
    applied to its :class:`.AtomGroup`, when applicable.  Optionally, 
    atomic *weights* can be passed for weighted superposition."""
    
    try:
        acsi, n_csets = atoms.getACSIndex(), atoms.numCoordsets()
    except AttributeError:
        raise TypeError('atoms must have type Atomic, not {0:s}'
                        .format(type(atoms)))

    if n_csets < 2:
        LOGGER.warning('{0:s} contains fewer than two coordinate sets, '
                       'alignment was not performed.'.format(str(atoms)))
        return
    
    try:
        ag = atoms.getAtomGroup()
    except AttributeError:
        ag = atoms
    agacsi = ag.getACSIndex()

    tar = atoms._getCoords()
    for i in range(n_csets):
        if i == acsi:
            continue
        atoms.setACSIndex(i)
        ag.setACSIndex(i)
        calcTransformation(atoms, tar, weights).apply(ag)
    atoms.setACSIndex(acsi)
    ag.setACSIndex(agacsi)
    return atoms
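A usage sketch with ProDy's public API (network access to fetch the PDB entry):

from prody import parsePDB, alignCoordsets

ubi = parsePDB('2k39')        # an NMR ensemble with multiple models
alignCoordsets(ubi.calpha)    # superpose all models onto the active set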
Example #6
def deformAtoms(atoms, mode, rmsd=None):
    """Generate a new coordinate set for *atoms* along the *mode*.  *atoms*
    must be a :class:`.AtomGroup` instance.  New coordinate set will be
    appended to *atoms*. If *rmsd* is provided, *mode* will be scaled to
    generate a coordinate set with given RMSD distance to the active coordinate
    set."""

    if not isinstance(atoms, AtomGroup):
        raise TypeError('atoms must be an AtomGroup, not {0}'
                        .format(type(atoms)))
    if not isinstance(mode, VectorBase):
        raise TypeError('mode must be a Mode or Vector instance, '
                        'not {0}'.format(type(mode)))
    if not mode.is3d():
        raise ValueError('mode must be from a 3-dimensional model.')
    if atoms.numAtoms() != mode.numAtoms():
        raise ValueError('number of atoms does not match')

    array = mode.getArrayNx3()

    if rmsd is not None:
        rmsd = float(rmsd)
        # rmsd = ( ((scalar * array)**2).sum() / n_atoms )**0.5
        scalar = (atoms.numAtoms() * rmsd**2 / (array**2).sum())**0.5
        LOGGER.info('Mode is scaled by {0}.'.format(scalar))
        atoms.addCoordset(atoms.getCoords() + array * scalar)
    else:
        atoms.addCoordset(atoms.getCoords() + array)
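A usage sketch deforming along the slowest ANM mode, assuming ProDy's standard ANM workflow:

from prody import parsePDB, ANM, deformAtoms

calphas = parsePDB('1p38', subset='calpha')  # subset parsing yields an AtomGroup
anm = ANM('p38')
anm.buildHessian(calphas)
anm.calcModes()
deformAtoms(calphas, anm[0], rmsd=1.0)       # append a set at an RMSD of 1.0
print(calphas.numCoordsets())                # 2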
Example #7
    def __and__(self, other):
        """Returns the intersection of *self* and *other* as a
        :class:`.Selection`."""

        if self is other:
            return self
    
        if not isinstance(other, AtomPointer):
            raise TypeError('other must be an AtomPointer')
            
        if self._ag != other.getAtomGroup():
            raise ValueError('both selections must be from the same AtomGroup')
    
        acsi = self.getACSIndex()
        if acsi != other.getACSIndex():
            LOGGER.warning('active coordinate set indices do not match, '
                           'so it will be set to zero in the intersection.')
            acsi = 0

        indices = set(self._getIndices())
        indices = indices.intersection(other.getIndices())
        if indices:
            indices = np.unique(list(indices))
            return Selection(self._ag, indices, '({0:s}) and ({1:s})'.format(
                                    self.getSelstr(), other.getSelstr()), acsi)
Example #8
def calcMeff(msa, seqid=0.8, refine=False, weight=False, **kwargs):
    """Returns the Meff for *msa*, which may be an :class:`.MSA`
    instance or a 2D Numpy character array.

    Since similar sequences in an *msa* decrease the diversity of the *msa*,
    *Meff* assigns a weight to each sequence in the *msa*.

    For example, if one sequence in the MSA has 5 similar sequences in the
    MSA (itself included), its weight is defined as 1/5 = 0.2.  Meff is the
    sum of all sequence weights.  In other words, Meff can be understood as
    the effective number of independent sequences.

    Sequences sharing sequence identity of *seqid* or more with another
    sequence are regarded as similar sequences when calculating Meff.

    Sequences are not refined by default.  When *refine* is set **True**, the
    MSA will be refined by the first sequence.

    The weights for the sequences are returned when *weight* is **True**."""

    msa = getMSA(msa)
    from .msatools import msameff

    LOGGER.timeit("_meff")
    refine = 1 if refine else 0
    meff_only = 0 if weight else 1  # meff_only=0 asks msameff to also fill the weight array
    if not meff_only:
        w = zeros((msa.shape[0]), float)
        meff = msameff(msa, theta=1.0 - seqid, meff_only=meff_only, refine=refine, w=w)
    else:
        meff = msameff(msa, theta=1.0 - seqid, meff_only=meff_only, refine=refine)
    LOGGER.report("Meff was calculated in %.2fs.", "_meff")
    return meff
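A usage sketch assuming ProDy's MSA utilities and a locally available alignment (the file name is hypothetical):

from prody import parseMSA, calcMeff

msa = parseMSA('PF00074_full.sth')
meff = calcMeff(msa, seqid=0.8)    # effective number of sequences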
Example #9
def range2selstr(rangestr):
    """Convert a residue range string such as ``'2-10,15-25'`` into a ProDy
    selection string such as ``'resnum 2 to 10 or resnum 15 to 25'``."""
    if rangestr.strip() == '':
        return None
    frags = rangestr.split(',')
    sels = []
    for frag in frags:
        try:
            fromtos = frag.split('-')
            if len(fromtos) == 2:
                fro, to = fromtos
            else:
                LOGGER.warn('range "%s" is irregular'%rangestr)
                fro = '1'
                to = fromtos[-1]
            fro_num = intResnum(fro)
            to_num = intResnum(to)

            if fro_num > to_num:
                LOGGER.warn('range "%s" is irregular'%rangestr)
                to_num = fro_num
                fro_num = 1
            fro = str(fro_num)
            to = str(to_num)
        except ValueError:
            LOGGER.warn('error occurred when parsing "%s"'%rangestr)
            continue
        sels.append('resnum %s to %s'%(fro, to))
    selstr = ' or '.join(sels)
    return selstr
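A usage sketch; intResnum and LOGGER are assumed to come from the defining module:

selstr = range2selstr('2-10,15-25')
# -> 'resnum 2 to 10 or resnum 15 to 25'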
Example #10
def calcMinBranchLength(go_id1, go_id2, go):
    '''Find the minimum branch length between two terms in the GO DAG.

    :arg go_id1: the first GO ID
    :type go_id1: str

    :arg go_id2: the second GO ID
    :type go_id2: str

    :arg go: object containing a gene ontology (GO) directed acyclic graph (DAG)
    :type go: :class:`~goatools.obo_parser.GODag`
    '''
    # First get the deepest common ancestor
    dca = findDeepestCommonAncestor([go_id1, go_id2], go)
    if dca is None:
        LOGGER.warn('There are no common ancestors between {0} and {1} so no meaningful distance can be calculated.'.format(
            go_id1, go_id2))
        return None

    # Then get the distance from the DCA to each term
    dca_depth = go[dca].depth
    d1 = go[go_id1].depth - dca_depth
    d2 = go[go_id2].depth - dca_depth

    # Return the total distance - i.e., to the deepest common ancestor and back.
    return d1 + d2
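A usage sketch assuming ProDy's GO helpers; parseOBO is assumed to load the ontology into a goatools GODag (network access on first use):

from prody.database.goa import parseOBO, calcMinBranchLength

go = parseOBO()
d = calcMinBranchLength('GO:0005524', 'GO:0003677', go)  # ATP binding vs. DNA binding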
Example #11
    def iterpose(self, rmsd=0.0001):
        """Iteratively superpose the ensemble until convergence (RMSD change
        below *rmsd*), then restore the original conformations and perform a
        final superposition to calculate the transformations."""

        confs = self._confs.copy()
        Ensemble.iterpose(self, rmsd)
        self._confs = confs
        LOGGER.info('Final superposition to calculate transformations.')
        self.superpose()
Example #12
def calcEnsembleFunctionOverlaps(ens, **kwargs):
    """Calculate function overlaps for an ensemble as the 
    mean of the value from :func:`calcDeepFunctionOverlaps`.

    :arg ens: an ensemble with labels
    :type ens: :class:`Ensemble`
    """
    if not isinstance(ens, Ensemble) and not isListLike(ens):
        raise TypeError('ens should be an ensemble or list-like')

    if isinstance(ens, Ensemble):
        ids = [label[:5] for label in ens.getLabels()]
    else:
        ids = ens

    if not isinstance(ids[0], str):
        raise TypeError('ens should have labels')

    goa_ens = queryGOA(ids, **kwargs)
    for entry in goa_ens:
        if len(entry._molecular) == 0:
            LOGGER.warn(
                'ensemble member {0} has no molecular functions and was omitted'.format(entry._title))

    goa_ens = [entry for entry in goa_ens if len(entry._molecular) > 0]

    overlaps = calcDeepFunctionOverlaps(*goa_ens, **kwargs)

    return overlaps
Example #13
def calcDeepFunctionOverlaps(*goa_data, **kwargs):
    """Calculate function overlaps between the deep 
    (most detailed) molecular functions in particular 
    from two sets of GO terms.

    :arg goa1: the first set of GO terms
    :type goa1: tuple, list, :class:`~numpy.ndarray`

    :arg goa2: the second set of GO terms
    :type goa2: tuple, list, :class:`~numpy.ndarray`
    """
    return_funcs = kwargs.pop('return_funcs', False)

    deepFuncs = [findDeepestFunctions(entry, **kwargs) for entry in goa_data]
    for i, entry in enumerate(deepFuncs):
        if len(entry) == 0:
            LOGGER.warn(
                'ensemble member {0} has no deep molecular functions and was omitted'
                .format(goa_data[i]._title))

    deepFuncs = [entry for entry in deepFuncs if len(entry) > 0]
    overlaps = calcGoOverlap(*deepFuncs, **kwargs)

    if return_funcs:
        return overlaps, deepFuncs

    return overlaps
Example #14
def showContactMap(enm, *args, **kwargs):
    """Show Kirchhoff matrix using :func:`~matplotlib.pyplot.spy`.
    
    .. plot::
       :context:
       :include-source:
        
       p38_gnm = GNM('p38')
       p38_gnm.buildKirchhoff( p38_structure )
       plt.figure(figsize=(4,4))
       showContactMap( p38_gnm )

    .. plot::
       :context:
       :nofigs:
        
       plt.close('all')"""
    
    import matplotlib.pyplot as plt
    if not isinstance(enm, GNMBase):
        raise TypeError('model argument must be an ENM instance')
    kirchhoff = enm.getKirchhoff()
    if kirchhoff is None:
        LOGGER.warning('kirchhoff matrix is not set')
        return None
    show = plt.spy(kirchhoff, *args, **kwargs)
    plt.title('{0:s} contact map'.format(enm.getTitle())) 
    plt.xlabel('Residue index')
    plt.ylabel('Residue index')
    return show
Example #15
def mapOntoChainByAlignment(atoms, chain, **kwargs):
    """This function is similar to :func:`.mapOntoChain` but correspondence 
    of chains is found by alignment provided. 
    
    :arg alignments: A list of predefined alignments. It can be also a 
        dictionary or :class:`MSA` instance where the keys or 
        labels are the title of *atoms* or *chains*. 
    :type alignments: list, dict, :class:`MSA`
    """

    alignments = kwargs.pop('alignments', None)
    if alignments is None:
        return mapOntoChain(atoms, chain, **kwargs)
    else:
        if isinstance(alignments, (MSA, dict)):
            refseq = str(alignments[chain.getTitle()])
            tarseq = str(alignments[atoms.getTitle()])
            alignment = [refseq, tarseq]
        else:
            index = kwargs.pop('index', 0)
            alignment = alignments[index]

        tar_aligned_seq = alignment[-1]
        for char in GAPCHARS:
            tar_aligned_seq = tar_aligned_seq.replace(char, '').upper()
        hv = atoms.getHierView()
        for target_chain in hv.iterChains():
            tar_seq = target_chain.getSequence().upper()
            if tar_seq == tar_aligned_seq:
                mappings = mapOntoChain(target_chain, chain, pwalign=alignment, **kwargs)
                return mappings
        LOGGER.warn('The sequence of chain does not match that in alignment (%s).'%atoms.getTitle())
    return []
Example #16
    def __add__(self, other):
        """Returns an :class:`.AtomMap` instance. Order of pointed atoms are
        preserved."""

        try:
            ag = other.getAtomGroup()
        except AttributeError:
            raise TypeError('unsupported operand type(s) for +: {0} and '
                            '{1}'.format(repr(type(self).__name__),
                                         repr(type(other).__name__)))

        if ag != self._ag:
            raise ValueError('AtomPointer instances must point to the same '
                             'AtomGroup instance')
        acsi = self.getACSIndex()
        if acsi != other.getACSIndex():
            LOGGER.warning('Active coordset indices of atoms are not the same.'
                           ' Result will have ACSI {0}.'.format(acsi))

        title = '({0}) + ({1})'.format(str(self), str(other))
        indices = concatenate([self._getIndices(), other._getIndices()])

        dummies = 0
        try:
            dummies += self.numDummies()
        except AttributeError:
            pass
        try:
            dummies += other.numDummies()
        except AttributeError:
            pass

        return AtomMap(ag, indices, acsi, title=title, intarrays=True,
                       dummies=dummies)
Example #17
    def __or__(self, other):
        """Returns the union of *self* and *other* as a :class:`.Selection`."""

        if self is other:
            return self

        try:
            ag = other.getAtomGroup()
        except AttributeError:
            raise TypeError('other must be an AtomPointer')

        if self._ag != ag:
            raise ValueError('both selections must be from the same AtomGroup')

        acsi = self.getACSIndex()
        if acsi != other.getACSIndex():
            LOGGER.warn('Active coordinate set indices do not match, it will '
                        'be set to zero.')
            acsi = 0

        indices = unique(concatenate((self._getIndices(),
                                      other._getIndices())))
        if indices[-1] == atommap.DUMMY:
            indices = indices[:-1]
        return Selection(self._ag, indices, '({0:s}) or ({1:s})'.format(
                                    self.getSelstr(), other.getSelstr()),
                                    acsi, unique=True)
Example #18
    def getCoordsets(self, indices=None):
        """Returns coordinate sets at given *indices*. *indices* may be an
        integer, a list of integers or ``None``. ``None`` returns all
        coordinate sets."""

        if self._closed:
            raise ValueError('I/O operation on closed file')
        if (self._indices is None and
            (indices is None or indices == slice(None))):
            nfi = self._nfi
            self.reset()
            n_floats = self._n_floats + self._unitcell * 14
            n_atoms = self._n_atoms
            n_csets = self._n_csets
            data = np.fromfile(self._file, self._dtype,
                               n_floats * n_csets)
            if len(data) < n_floats * n_csets:
                # fewer items than expected were read, keep whole frames only
                n_csets = len(data) // n_floats
                data = data[:n_csets * n_floats]
                LOGGER.warning('DCD is corrupt, {0:d} out of {1:d} frames '
                               'were parsed.'.format(n_csets, self._n_csets))
            data = data.reshape((n_csets, n_floats))
            if self._unitcell:
                data = data[:, 14:]
            data = data.reshape((n_csets, 3, n_atoms+2))
            data = data[:, :, 1:-1]
            data = data.transpose(0, 2, 1)
            self.goto(nfi)
            if self._astype is not None and self._astype != data.dtype:
                data = data.astype(self._astype)
            return data
        else:
            return TrajFile.getCoordsets(self, indices)
Example #19
    def __and__(self, other):
        """Returns the intersection of *self* and *other* as a
        :class:`.Selection`."""

        if self is other:
            return self

        try:
            ag = other.getAtomGroup()
        except AttributeError:
            raise TypeError('other must be an AtomPointer')

        if self._ag != ag:
            raise ValueError('both selections must be from the same AtomGroup')

        acsi = self.getACSIndex()
        if acsi != other.getACSIndex():
            LOGGER.warn('Active coordinate set indices do not match, it will '
                        'be set to zero.')
            acsi = 0

        indices = set(self._getIndices())
        indices = indices.intersection(other.getIndices())
        if indices:
            indices = unique(list(indices))
            if indices[-1] == atommap.DUMMY:
                indices = indices[:-1]
            return Selection(self._ag, indices, '({0}) and ({1})'
                             .format(self.getSelstr(), other.getSelstr()),
                             acsi)
Example #20
def calcShannonEntropy(msa, ambiguity=True, omitgaps=True, **kwargs):
    """Returns Shannon entropy array calculated for *msa*, which may be
    an :class:`.MSA` instance or a 2D Numpy character array.  Implementation
    is case insensitive and handles ambiguous amino acids as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered as distinct types.

    All non-alphabet characters are considered as gaps, and they are handled
    in two ways:

      * non-existent, the probability of observing amino acids in a given
        column is adjusted, by default
      * as a distinct character with its own probability, when *omitgaps* is
        **False**"""

    msa = getMSA(msa)
    length = msa.shape[1]
    if msa.shape[0] < 100:
        LOGGER.warning(
            "Entropy estimates are more reliable with larger numbers of "
            "sequences; at least 100 sequences are recommended."
        )
    entropy = empty(length, float)
    from .msatools import msaentropy

    return msaentropy(msa, entropy, ambiguity=bool(ambiguity), omitgaps=bool(omitgaps))
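A usage sketch with the same MSA utilities as in the calcMeff example (the file name is hypothetical):

from prody import parseMSA, calcShannonEntropy

msa = parseMSA('PF00074_full.sth')
entropy = calcShannonEntropy(msa)   # one value per alignment column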
Example #21
def wwPDBServer(*key):
    """Set/get `wwPDB`_ FTP/HTTP server location used for downloading PDB
    structures.  Use one of the following keywords for setting a server:

    +---------------------------+-----------------------------+
    | wwPDB FTP server          | *Key* (case insensitive)    |
    +===========================+=============================+
    | RCSB PDB (USA) (default)  | RCSB, USA, US               |
    +---------------------------+-----------------------------+
    | PDBe (Europe)             | PDBe, Europe, Euro, EU      |
    +---------------------------+-----------------------------+
    | PDBj (Japan)              | PDBj, Japan, Jp             |
    +---------------------------+-----------------------------+

    .. _wwPDB: http://www.wwpdb.org/"""

    if not key:
        return SETTINGS.get('wwpdb', None)
    elif len(key) == 1:
        try:
            key = key[0].lower()
        except AttributeError:
            raise TypeError('key must be a string')
        if key in WWPDB_FTP_SERVERS:
            SETTINGS['wwpdb'] = key
            SETTINGS.save()
            LOGGER.info('wwPDB server is set to {}.'
                        .format(WWPDB_FTP_SERVERS[key][0]))
        else:
            raise ValueError('{0} is not a valid wwPDB server identifier'
                             .format(repr(key)))
    else:
        raise TypeError('one wwPDB server identifier is expected, {0} given'
                        .format(len(key)))
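A usage sketch:

from prody import wwPDBServer

wwPDBServer('PDBe')    # switch downloads to the European mirror
print(wwPDBServer())   # query the current setting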
Example #22
def buildSCAMatrix(msa, turbo=True, **kwargs):
    """Return SCA matrix calculated for *msa*, which may be an :class:`.MSA`
    instance or a 2D Numpy character array.

    Implementation is case insensitive and handles ambiguous amino acids
    as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters are considered as distinct types.  All non-alphabet characters
    are considered as gaps."""

    msa = getMSA(msa)
    from .msatools import msasca
    LOGGER.timeit('_sca')
    length = msa.shape[1]
    sca = zeros((length, length), float)
    sca = msasca(msa, sca, turbo=bool(turbo))
    LOGGER.report('SCA matrix was calculated in %.2fs.', '_sca')
    return sca
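A usage sketch following the ProDy evol tutorials; the file name and refinement label are illustrative:

from prody import parseMSA, refineMSA, buildSCAMatrix

msa = parseMSA('PF00074_full.sth')
msa_refined = refineMSA(msa, label='RNAS1_BOVIN', rowocc=0.8, seqid=0.98)
sca = buildSCAMatrix(msa_refined)   # a (length, length) coupling matrix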
Example #23
    def getResnums(self, gaps=False):
        """Return list of residue numbers associated with non-gapped *seq*.
        When *gaps* is **True**, return a list containing the residue numbers
        with gaps appearing as **None**.  Residue numbers are inferred from the
        full label.  When label does not contain residue number information,
        indices a range of numbers starting from 1 is returned."""

        title, start, end = splitSeqLabel(self.getLabel(True))
        try:
            start, end = int(start), int(end)
        except:
            LOGGER.info('Cannot parse start and end values from the label, '
                        'setting resnums 1 to {0:d}'.format(self.numResidues()))
            start, end = 1, self.numResidues()
        else:
            if (end - start + 1) != self.numResidues():
                LOGGER.info('Label start-end position does not match '
                            'length of ungapped sequence. Setting '
                            'resnums 1 to {0:d}'.format(self.numResidues()))
                start, end = 1, self.numResidues()

        resnums = iter(range(start, end + 1))
        if gaps:
            return [next(resnums) if torf else None
                    for torf in char.isalpha(self._array)]
        else:
            return list(resnums)
Example #24
def parsePQR(filename, **kwargs):
    """Returns an :class:`.AtomGroup` containing data parsed from PDB lines.

    :arg filename: a PQR filename
    :type filename: str"""

    title = kwargs.get('title', kwargs.get('name'))
    model = 1
    header = False
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    altloc = kwargs.get('altloc', 'A')
    max_n_atoms = kwargs.get('max_n_atoms', 1e5)
    if not os.path.isfile(filename):
        raise IOError('No such file: {0}'.format(repr(filename)))
    if title is None:
        fn, ext = os.path.splitext(os.path.split(filename)[1])
        if ext == '.gz':
            fn, ext = os.path.splitext(fn)
        title = fn.lower()
    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix
    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    else:
        ag = AtomGroup(title + title_suffix)
        n_csets = 0

    pqr = openFile(filename, 'rt')
    lines = pqr.readlines()
    pqr.close()
    LOGGER.timeit()
    ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain,
                        subset=subset, altloc_torf=False, format='pqr', 
                        max_n_atoms=max_n_atoms)
    if ag.numAtoms() > 0:
        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(),
                      ag.numCoordsets() - n_csets))
        return ag
    else:
        return None
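A usage sketch; PQR files are typically produced by pdb2pqr, and the parsed AtomGroup carries charge and radius data (the file name is hypothetical):

from prody import parsePQR

ag = parsePQR('protein.pqr')
if ag is not None:
    print(ag.numAtoms(), ag.getCharges()[:5], ag.getRadii()[:5])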
Example #25
def calcCrossProjection(ensemble, mode1, mode2, scale=None, **kwargs):
    """Return projection of conformational deviations onto modes from
    different models.

    :arg ensemble: ensemble for which deviations will be projected
    :type ensemble: :class:`.Ensemble`
    :arg mode1: normal mode to project conformations onto
    :type mode1: :class:`.Mode`, :class:`.Vector`
    :arg mode2: normal mode to project conformations onto
    :type mode2: :class:`.Mode`, :class:`.Vector`
    :arg scale: scale the width of the projection onto mode ``x`` or ``y``;
        the best scaling factor will be calculated and printed to the console;
        the absolute value of the scalar makes the widths of the two
        projections the same, and its sign makes the projections yield a
        positive correlation"""

    if not isinstance(ensemble, (Ensemble, Conformation, Vector, TrajBase)):
        raise TypeError('ensemble must be Ensemble, Conformation, Vector, '
                        'or a Trajectory, not {0}'.format(type(ensemble)))
    if not isinstance(mode1, VectorBase):
        raise TypeError('mode1 must be a Mode instance, not {0}'
                        .format(type(mode1)))
    if not mode1.is3d():
        raise ValueError('mode1 must be 3-dimensional')
    if not isinstance(mode2, VectorBase):
        raise TypeError('mode2 must be a Mode instance, not {0}'
                        .format(type(mode2)))
    if not mode2.is3d():
        raise ValueError('mode2 must be 3-dimensional')

    if scale is not None:
        assert isinstance(scale, str), 'scale must be a string'
        scale = scale.lower()
        assert scale in ('x', 'y'), 'scale must be x or y'

    xcoords = calcProjection(ensemble, mode1, kwargs.get('rmsd', True))
    ycoords = calcProjection(ensemble, mode2, kwargs.pop('rmsd', True))
    if scale:
        scalar = kwargs.get('scalar', None)
        if scalar:
            assert isinstance(scalar, (float, int)), 'scalar must be a number'
        else:
            scalar = ((ycoords.max() - ycoords.min()) /
                      (xcoords.max() - xcoords.min())
                      ) * np.sign(np.dot(xcoords, ycoords))
            if scale == 'x':
                LOGGER.info('Projection onto {0} is scaled by {1:.2f}'
                            .format(mode1, scalar))
            else:
                scalar = 1 / scalar
                LOGGER.info('Projection onto {0} is scaled by {1:.2f}'
                            .format(mode2, scalar))

        if scale == 'x':
            xcoords = xcoords * scalar
        else:
            ycoords = ycoords * scalar

    return xcoords, ycoords
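A usage sketch following the ProDy ensemble-analysis tutorials, building PCA and ANM modes for the same set of atoms:

from prody import parsePDB, Ensemble, PCA, ANM, calcCrossProjection

ubi = parsePDB('2k39', subset='calpha')
ens = Ensemble('2k39 ensemble')
ens.setCoords(ubi.getCoords())
ens.addCoordset(ubi.getCoordsets())
ens.superpose()
pca = PCA('2k39')
pca.buildCovariance(ens)
pca.calcModes()
anm = ANM('2k39')
anm.buildHessian(ubi)
anm.calcModes()
xc, yc = calcCrossProjection(ens, pca[0], anm[0], scale='y')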
Example #26
def pathVMD(*path):
    """Return VMD path, or set it to be a user specified *path*."""

    if not path:
        path = SETTINGS.get('vmd', None)
        if isExecutable(path):
            return path
        else:
            LOGGER.warning('VMD path is not set by user, looking for it.')

            vmdbin = None
            vmddir = None
            if PLATFORM == 'Windows':
                if PY3K:
                    import winreg
                else:
                    import _winreg as winreg  # PY3K: OK
                for vmdversion in ('1.8.7', '1.9', '1.9.1'):
                    try:
                        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
                                'Software\\University of Illinois\\VMD\\' +
                                vmdversion)
                        vmddir = winreg.QueryValueEx(key, 'VMDDIR')[0]
                        vmdbin = join(vmddir, 'vmd.exe')
                    except:
                        pass
                    try:
                        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
                    'Software\\WOW6432node\\University of Illinois\\VMD\\' +
                    vmdversion)
                        vmddir = winreg.QueryValueEx(key, 'VMDDIR')[0]
                        vmdbin = join(vmddir, 'vmd.exe')
                    except:
                        pass
            else:
                vmdbin = which('vmd')
            if isExecutable(vmdbin):
                setVMDpath(vmdbin)
                return vmdbin
    elif len(path) == 1:
        path = path[0]
        if isExecutable(path):
            SETTINGS['vmd'] = path
            SETTINGS.save()
            LOGGER.info("VMD path is set to '{0}'.".format(path))
        else:
            raise OSError('{0} is not executable.'.format(str(path)))
    else:
        raise ValueError('specify a single path string')
Example #27
def setVMDpath(path):
    """Set path to a VMD executable."""

    if isExecutable(path):
        SETTINGS["vmd"] = path
        SETTINGS.save()
        LOGGER.info("VMD path is set to '{0:s}'.".format(path))
    else:
        raise OSError("{0:s} is not executable.".format(str(path)))
Example #28
    def superpose(self):
        """Superpose the ensemble onto the reference coordinates."""

        if self._coords is None:
            raise ValueError('coordinates are not set, use `setCoords`')
        if self._confs is None or len(self._confs) == 0:
            raise ValueError('conformations are not set, use `addCoordset`')
        LOGGER.timeit()
        self._superpose(trans=True)  # trans kwarg is used by PDBEnsemble
        LOGGER.timing('Superposition completed in %.2f seconds.')
Example #29
    def _setCoords(self, coords, label=None, overwrite=False):
        """Set coordinates without data type checking.  *coords* must
        be a :class:`~numpy.ndarray`, but may have data type other than
        :class:`numpy.float64`, e.g. :class:`numpy.float32`.  *label*
        argument may be used to label coordinate sets.  *label* may be
        a string or a list of strings with length equal to the number of
        coordinate sets."""

        n_atoms = self._n_atoms
        if n_atoms:
            if coords.shape[-2] != n_atoms:
                raise ValueError('coords array has incorrect number of atoms')
        else:
            self._n_atoms = n_atoms = coords.shape[-2]

        ndim = coords.ndim
        shape = coords.shape
        if self._coords is None or overwrite or (ndim == 3 and shape[0] > 1):
            if ndim == 2:
                self._coords = coords.reshape((1, n_atoms, 3))
                if label is None:
                    self._cslabels = [None]
                else:
                    self._cslabels = [str(label)]
                self._n_csets = n_csets = 1

            else:
                self._coords = coords
                self._n_csets = n_csets = shape[0]

                if label is None or isinstance(label, str):
                    self._cslabels = [label] * n_csets

                elif isinstance(label, (list, tuple)):
                    if len(label) == n_csets:
                        self._cslabels = list(label)

                    else:
                        self._cslabels = [None] * n_csets
                        LOGGER.warn('Number of labels does not match number '
                                    'of coordinate sets.')
                else:
                    LOGGER.warn('Wrong type for `label` argument.')
            self._acsi = 0
            self._setTimeStamp()

        else:
            acsi = self._acsi
            if ndim == 2:
                self._coords[acsi] = coords
            else:
                self._coords[acsi] = coords[0]
            self._setTimeStamp(acsi)
            if label is not None:
                self._cslabels[acsi] = str(label)
Example #30
    def performSVD(self, coordsets):
        """Calculate principal modes using singular value decomposition (SVD).
        *coordsets* argument may be a :class:`.Atomic`, :class:`.Ensemble`,
        or :class:`numpy.ndarray` instance.  If *coordsets* is a numpy array,
        its shape must be ``(n_csets, n_atoms, 3)``.  Note that coordinate
        sets must be aligned prior to SVD calculations.

        This is a considerably faster way of performing PCA calculations
        compared to eigenvalue decomposition of covariance matrix, but is
        an approximate method when heterogeneous datasets are analyzed.
        Covariance method should be preferred over this one for analysis of
        ensembles with missing atomic data.  See :ref:`pca-xray-calculations`
        example for comparison of results from SVD and covariance methods."""

        linalg = importLA()

        start = time.time()
        if not isinstance(coordsets, (Ensemble, Atomic, np.ndarray)):
            raise TypeError('coordsets must be an Ensemble, Atomic, or Numpy '
                            'array instance')
        if isinstance(coordsets, np.ndarray):
            if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                    coordsets.dtype not in (np.float32, float)):
                raise ValueError('coordsets is not a valid coordinate array')
            deviations = coordsets - coordsets.mean(0)
        else:
            if isinstance(coordsets, Ensemble):
                deviations = coordsets.getDeviations()
            elif isinstance(coordsets, Atomic):
                deviations = (coordsets._getCoordsets() -
                              coordsets._getCoords())

        n_confs = deviations.shape[0]
        if n_confs < 3:
            raise ValueError('coordsets must have at least 3 coordinate sets')
        n_atoms = deviations.shape[1]
        if n_atoms < 3:
            raise ValueError('coordsets must have at least 3 atoms')

        dof = n_atoms * 3
        deviations = deviations.reshape((n_confs, dof)).T

        vectors, values, self._temp = linalg.svd(deviations,
                                                 full_matrices=False)
        values = (values ** 2) / n_confs
        self._dof = dof
        self._n_atoms = n_atoms
        which = values > 1e-18
        self._eigvals = values[which]
        self._array = vectors[:, which]
        self._vars = self._eigvals
        self._trace = self._vars.sum()
        self._n_modes = len(self._eigvals)
        LOGGER.debug('{0} modes were calculated in {1:.2f}s.'
                     .format(self._n_modes, time.time()-start))
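Given a superposed ensemble such as *ens* from the calcCrossProjection sketch above, performSVD computes the principal modes directly:

from prody import PCA

pca = PCA('2k39')
pca.performSVD(ens)
print(pca.numModes())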
Example #31
def fetchPDBviaHTTP(*pdb, **kwargs):
    """Retrieve PDB file(s) for specified *pdb* identifier(s) and return
    path(s).  Downloaded files will be stored in local PDB folder, if one
    is set using :meth:`.pathPDBFolder`, and copied into *folder*, if
    specified by the user.  If no destination folder is specified, files
    will be saved in the current working directory.  If *compressed* is
    **False**, decompressed files will be copied into *folder*."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    report = kwargs.get('report', True)

    extension = '.pdb'
    local_folder = pathPDBFolder()
    if local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(
                    filename, join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(
                    filename, join(output_folder, pdb + extension))

    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us']

    success = 0
    failure = 0
    filenames = []
    for pdb in identifiers:
        if pdb is None:
            filenames.append(None)
            continue
        try:
            handle = openURL(getURL(pdb))
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(pdb, str(err)))
            failure += 1
            filenames.append(None)
        else:
            data = handle.read()
            if len(data):
                filename = getPath(pdb)

                with open(filename, 'w+b') as pdbfile:
                    pdbfile.write(data)

                filename = normpath(relpath(second(filename, pdb)))
                LOGGER.debug('{0} downloaded ({1})'.format(
                    pdb, sympath(filename)))
                success += 1
                filenames.append(filename)
            else:
                LOGGER.warn('{0} download failed, reason unknown.'.format(pdb))
                failure += 1
                filenames.append(None)
    if report:
        LOGGER.debug('PDB download via HTTP completed ({0} downloaded, '
                     '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
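A usage sketch (network I/O); fetchPDBviaHTTP is assumed to be exported at the top level like the other fetch functions:

from prody import fetchPDBviaHTTP

filenames = fetchPDBviaHTTP('1ubi', '1p38', folder='.', compressed=False)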
Example #32
    def addCoordset(self, coords, weights=None, label=None, **kwargs):
        """Add coordinate set(s) to the ensemble.  *coords* must be a Numpy
        array with suitable shape and dimensionality, or an object with
        :meth:`getCoordsets`. *weights* is an optional argument.
        If provided, its length must match number of atoms.  Weights of
        missing (not resolved) atoms must be ``0`` and weights of those
        that are resolved can be anything greater than ``0``.  If not
        provided, weights of all atoms for this coordinate set will be
        set equal to ``1``. *label*, which may be a PDB identifier or a
        list of identifiers, is used to label conformations."""

        degeneracy = kwargs.pop('degeneracy', False)
        adddata = kwargs.pop('data', None)

        atoms = coords
        n_atoms = self._n_atoms
        n_select = self.numSelected()
        n_confs = self.numCoordsets()

        try:
            if degeneracy:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoords(selected=False)
                    elif hasattr(coords, '_getCoords'):
                        coords = coords._getCoords()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoords(selected=False)
                    elif hasattr(coords, 'getCoords'):
                        coords = coords.getCoords()
            else:
                if self._coords is not None:
                    if isinstance(coords, Ensemble):
                        coords = coords._getCoordsets(selected=False)
                    elif hasattr(coords, '_getCoordsets'):
                        coords = coords._getCoordsets()
                else:
                    if isinstance(coords, Ensemble):
                        coords = coords.getCoordsets(selected=False)
                    elif hasattr(coords, 'getCoordsets'):
                        coords = coords.getCoordsets()

        except AttributeError:
            label = label or 'Unknown'

        else:
            if coords is None:
                raise ValueError('coordinates are not set')
            elif label is None and isinstance(atoms, Atomic):
                if not isinstance(atoms, AtomGroup):
                    ag = atoms.getAtomGroup()
                else:
                    ag = atoms
                label = ag.getTitle()
                if coords.shape[0] < ag.numCoordsets():
                    label += '_m' + str(atoms.getACSIndex())
            else:
                label = label or 'Unknown'

        # check coordinates
        try:
            checkCoords(coords, csets=True, natoms=n_atoms)
        except:
            try:
                checkCoords(coords, csets=True, natoms=n_select)
            except TypeError:
                raise TypeError('coords must be a numpy array or an object '
                                'with `getCoords` method')

        if coords.ndim == 2:
            n_nodes, _ = coords.shape
            coords = coords.reshape((1, n_nodes, 3))
            n_csets = 1
        else:
            n_csets, n_nodes, _ = coords.shape
            if degeneracy:
                coords = coords[:1]

        n_repeats = 1 if degeneracy else n_csets

        if not n_atoms:
            self._n_atoms = n_nodes

        if n_nodes == n_select and self.isSelected():
            full_coords = np.repeat(self._coords[np.newaxis, :, :],
                                    n_csets,
                                    axis=0)
            full_coords[:, self._indices, :] = coords
            coords = full_coords

        # check weights
        if weights is None:
            weights = np.ones((n_csets, n_atoms, 1), dtype=float)
        else:
            weights = checkWeights(weights, n_atoms, n_csets)

        if degeneracy:
            weights = weights[:1]

        # check sequences
        seqs = None
        sequence = kwargs.pop('sequence', None)
        if hasattr(atoms, 'getSequence'):
            if sequence is not None:
                LOGGER.warn(
                    'sequence is supplied although coords has getSequence')
            sequence = atoms.getSequence()
            seqs = [sequence for _ in range(n_repeats)]
        else:
            if sequence is None:
                try:
                    sequence = self._atoms.getSequence()
                except AttributeError:
                    if self._msa:
                        sequence = ''.join('X' for _ in range(n_atoms))
                    # sequence and seqs remains to be None if MSA has not been created
            if isinstance(sequence, Sequence):
                seqs = [str(sequence)]
            elif isinstance(sequence, MSA):
                seqs = [str(seq) for seq in sequence]
            elif np.isscalar(sequence):
                seqs = [sequence for _ in range(n_repeats)]

        if seqs:
            if len(seqs) != n_repeats:
                raise ValueError(
                    'the number of sequences should be either one or '
                    'that of coordsets')

        # assign new values
        # update labels
        if n_csets > 1 and not degeneracy:
            if isinstance(label, str):
                labels = [
                    '{0}_m{1}'.format(label, i + 1) for i in range(n_csets)
                ]
            else:
                if len(label) != n_csets:
                    raise ValueError('length of label and number of '
                                     'coordinate sets must be the same')
                labels = label
        else:
            labels = [label] if np.isscalar(label) else label

        self._labels.extend(labels)

        # update sequences
        if seqs:
            msa = MSA(seqs, title=self.getTitle(), labels=labels)
            if self._msa is None:
                if n_confs > 0:
                    def_seqs = np.chararray((n_confs, n_atoms))
                    def_seqs[:] = 'X'

                    old_labels = [self._labels[i] for i in range(n_confs)]
                    self._msa = MSA(def_seqs,
                                    title=self.getTitle(),
                                    labels=old_labels)
                    self._msa.extend(msa)
                else:
                    self._msa = msa
            else:
                self._msa.extend(msa)

        # update coordinates
        if self._confs is None and self._weights is None:
            self._confs = coords
            self._weights = weights

        elif self._confs is not None and self._weights is not None:
            self._confs = np.concatenate((self._confs, coords), axis=0)
            self._weights = np.concatenate((self._weights, weights), axis=0)
        else:
            raise RuntimeError('_confs and _weights must be set or None at '
                               'the same time')

        # appending new data
        if self._data is not None or adddata is not None:
            if self._data is None:
                self._data = {}
            if adddata is None:
                adddata = {}
            all_keys = set(list(self._data.keys()) + list(adddata.keys()))

            for key in all_keys:
                if key in self._data:
                    data = self._data[key]
                    if key not in adddata:
                        shape = [n_repeats]
                        for s in data.shape[1:]:
                            shape.append(s)
                        newdata = np.zeros(shape, dtype=data.dtype)
                    else:
                        newdata = np.asarray(adddata[key])
                        if newdata.shape[0] != n_repeats:
                            raise ValueError(
                                'the length of data["%s"] does not match that of coords'
                                % key)
                else:
                    newdata = np.asarray(adddata[key])
                    shape = [self._n_csets]
                    for s in newdata.shape[1:]:
                        shape.append(s)
                    data = np.zeros(shape, dtype=newdata.dtype)
                self._data[key] = np.concatenate((data, newdata), axis=0)

        # update the number of coordinate sets
        self._n_csets += n_repeats
Example #33
def calcPerturbResponse(model, **kwargs):
    """Returns a matrix of profiles from scanning the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of the
    responses obtained by perturbing the atom/node position at that row index,
    i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to
    perturbations in residue/node *i*.  PRS is performed using the covariance
    matrix from *model*, e.g. :class:`.ANM` instance.

    When an *atoms* instance is given, the PRS matrix will be added as data, 
    which can be retrieved with ``atoms.getData('prs_matrix')``.  

    *model* and *atoms* must have the same number of atoms. *atoms* must be an
    :class:`.AtomGroup` instance. 

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    The PRS matrix can be calculated and saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm, saveMatrix=True)
      
    The PRS matrix can also be saved later as follows::
    
      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\t')

    :arg saveMatrix: whether to save the last matrix generated to a text file.
        Default is False
    :type saveMatrix: bool

    :arg suppressDiag: whether to suppress the diagonal (self response).
        Default is False
    :type suppressDiag: bool

    :arg saveName: The file name for saved matrices
        Default is 'response_matrix.txt'.
    :type saveName: str
    """

    if not isinstance(model, (NMA, ModeSet, Mode)):
        raise TypeError('model must be an NMA, ModeSet, or Mode instance')

    if isinstance(model, NMA) and len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    atoms = kwargs.get('atoms', None)
    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number of atoms')

    n_atoms = model.numAtoms()
    LOGGER.timeit('_prody_prs_all')
    LOGGER.info('Calculating covariance matrix')
    LOGGER.timeit('_prody_cov')

    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    LOGGER.clear()
    LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov')

    LOGGER.progress('Calculating perturbation response', n_atoms,
                    '_prody_prs_mat')

    if not model.is3d():
        prs_matrix = cov**2

    else:
        cov_squared = cov**2
        n_by_3n_cov_squared = np.zeros((n_atoms, 3 * n_atoms))
        prs_matrix = np.zeros((n_atoms, n_atoms))
        i3 = -3
        i3p3 = 0
        for i in range(n_atoms):
            i3 += 3
            i3p3 += 3
            n_by_3n_cov_squared[i, :] = (cov_squared[i3:i3p3, :]).sum(0)

        j3 = -3
        j3p3 = 0
        for j in range(n_atoms):
            j3 += 3
            j3p3 += 3
            prs_matrix[:, j] = (n_by_3n_cov_squared[:, j3:j3p3]).sum(1)

    LOGGER.clear()
    LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                  '_prody_prs_mat')

    saveMatrix = kwargs.get('saveMatrix', False)
    suppressDiag = kwargs.get('suppressDiag', False)
    saveName = kwargs.get('saveName', 'response_matrix.txt')

    norm_prs_matrix = np.zeros((n_atoms, n_atoms))
    self_dp = np.diag(prs_matrix)
    self_dp = self_dp.reshape(n_atoms, 1)
    norm_prs_matrix = prs_matrix / np.repeat(self_dp, n_atoms, axis=1)

    if suppressDiag:
        # suppress the diagonal (self displacement) to facilitate
        # visualizing the response profile
        norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix))

    if saveMatrix:
        np.savetxt(saveName, norm_prs_matrix, delimiter='\t', fmt='%8.6f')

    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs_all')

    if atoms is not None:
        atoms.setData('prs_matrix', norm_prs_matrix)
        return atoms, norm_prs_matrix
    else:
        return norm_prs_matrix
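A usage sketch reusing the ANM workflow from the earlier examples:

from prody import parsePDB, ANM, calcPerturbResponse

p38 = parsePDB('1p38', subset='calpha')
anm = ANM('p38')
anm.buildHessian(p38)
anm.calcModes()
p38, prs_matrix = calcPerturbResponse(anm, atoms=p38)
print(p38.getData('prs_matrix').shape)   # (n_atoms, n_atoms)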
Example #34
def assignSecstr(header, atoms, coil=False):
    """Assign secondary structure from *header* dictionary to *atoms*.
    *header* must be a dictionary parsed using the :func:`.parsePDB`.
    *atoms* may be an instance of :class:`.AtomGroup`, :class:`.Selection`,
    :class:`.Chain` or :class:`.Residue`.  ProDy can be configured to
    automatically parse and assign secondary structure information using
    ``confProDy(auto_secondary=True)`` command.  See also :func:`.confProDy`
    function.

    The Dictionary of Protein Secondary Structure, in short DSSP, type
    single letter code assignments are used:

      * **G** = 3-turn helix (3_10 helix). Min length 3 residues.
      * **H** = 4-turn helix (alpha helix). Min length 4 residues.
      * **I** = 5-turn helix (pi helix). Min length 5 residues.
      * **T** = hydrogen bonded turn (3, 4 or 5 turn)
      * **E** = extended strand in parallel and/or anti-parallel
        beta-sheet conformation. Min length 2 residues.
      * **B** = residue in isolated beta-bridge (single pair beta-sheet
        hydrogen bond formation)
      * **S** = bend (the only non-hydrogen-bond based assignment).
      * **C** = residues not in one of above conformations.


    See http://en.wikipedia.org/wiki/Protein_secondary_structure#The_DSSP_code
    for more details.

    The following PDB helix classes are omitted:

      * Right-handed omega (class number 2)
      * Right-handed gamma (4)
      * Left-handed alpha (6)
      * Left-handed omega (7)
      * Left-handed gamma (8)
      * 2 - 7 ribbon/helix (9)
      * Polyproline (10)

    Secondary structures are assigned to all atoms in a residue.  When *coil*
    is **True**, amino acid residues without any secondary structure
    assignment in the header section are assigned coil (C) conformation."""

    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')
    helix = header.get('helix', {})
    sheet = header.get('sheet', {})
    if len(helix) == 0 and len(sheet) == 0:
        LOGGER.warn('header does not contain secondary structure data')
        return atoms

    ssa = atoms.getSecstrs()
    if ssa is None:
        if isinstance(atoms, AtomGroup):
            ag = atoms
        else:
            ag = atoms.getAtomGroup()
        ag.setSecstrs(np.zeros(ag.numAtoms(),
                               ATOMIC_FIELDS['secondary'].dtype))
        ag.setSecids(np.zeros(ag.numAtoms(), ATOMIC_FIELDS['secid'].dtype))
        ag.setSecclasses(
            np.zeros(ag.numAtoms(), ATOMIC_FIELDS['secclass'].dtype))
        ag.setSecindices(
            np.zeros(ag.numAtoms(), ATOMIC_FIELDS['secindex'].dtype))

    if coil:
        atoms.select('protein').setSecstrs('C')
    hierview = atoms.getHierView()
    count = 0
    getResidue = hierview.getResidue
    for key, value in helix.items():  # PY3K: OK
        res = getResidue(*key)
        if res is None:
            continue
        res.setSecids(value[2])
        res.setSecclasses(value[0])
        res.setSecindices(value[1])
        res.setSecstrs(mapHelix[value[0]])

        count += 1
    for key, value in sheet.items():  # PY3K: OK
        res = getResidue(*key)
        if res is None:
            continue
        res.setSecids(value[2])
        res.setSecclasses(value[0])
        res.setSecindices(value[1])
        res.setSecstrs('E')
        count += 1

    LOGGER.info(
        'Secondary structures were assigned to {0} residues.'.format(count))

    return atoms
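A minimal usage sketch, parsing header data alongside coordinates with
ProDy's parsePDB and mapping the HELIX/SHEET records onto the atoms ('1ubi'
is an arbitrary example):

from prody import parsePDB

atoms, header = parsePDB('1ubi', header=True)
atoms = assignSecstr(header, atoms, coil=True)
print(set(atoms.getSecstrs()))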
Beispiel #35
0
def writePerturbResponsePDB(prs_matrix, pdbIn=None, **kwargs):
    """ Write the average response to perturbation of
    a particular residue (a row of a perturbation response matrix)
    or the average effect of perturbation of a particular residue
    (a column of a normalized perturbation response matrix)
    into the b-factor field of a PDB file for visualisation in a
    molecular graphics program.
    If no chain is given this will be done for that residue in all chains.

    If no residue number is given then the effectiveness and sensitivity
    profiles will be written out instead. These two profiles are also returned
    as arrays for further analysis if they aren't already provided.

    :arg prs_matrix: a perturbation response matrix
        or an :class:`.AtomGroup` object with a PRS matrix associated as data
    :type prs_matrix: array or :class:`.AtomGroup`

    :arg pdbIn: file name for the input PDB file where you would like the PRS
        data mapped
    :type pdbIn: str

    :arg pdbOut: a list of file names for the output PDB files; the default
        is to append the chain and residue info (name and number) onto the
        pdbIn stem. A single string can also be given, in which case it is
        used as the stem.
        If no residue number is supplied, chain is ignored and the default
        is to append '_effectiveness' and '_sensitivity' onto the stem.
    :type pdbOut: list

    :arg chain: chain identifier for the residue of interest; default is all
        chains. To analyse residues in a subset of chains, concatenate the
        chain identifiers, e.g. 'AC'
    :type chain: str

    :arg resnum: residue number for the residue of interest
    :type resnum: int

    :arg direction: the direction you want to use to read data out
        of the PRS matrix for plotting: the options are 'effect' or 'response'.
        Default is 'effect'.
        A row gives the effect on each residue of perturbing the specified
        residue.
        A column gives the response of the specified residue to perturbing 
        each residue.
        If no residue number is provided then this option will be ignored
    :type direction: str

    :arg returnData: whether to return effectiveness and sensitivity for
        further analysis; default is False
    :type returnData: bool

    :arg effectiveness: effectiveness profile
    :type effectiveness: array

    :arg sensitivity: sensitivity profile
    :type sensitivity: array
    """

    if not isinstance(prs_matrix, np.ndarray):
        try:
            prs_matrix = prs_matrix.getData('prs_matrix')
        except:
            raise TypeError(
                'Please provide a valid PRS matrix in numpy ndarray format.')

    try:
        with open(pdbIn, 'r') as fi:
            lines = fi.readlines()
    except:
        raise PRSMatrixParseError(
            'Please provide a valid file name for the input PDB.')

    chain = kwargs.get('chain', None)

    structure = parsePDB(pdbIn, subset='ca')
    structure.setData('prs_matrix', prs_matrix)

    hv = structure.getHierView()
    chains = np.array([ch.getChids()[0] for ch in hv])
    if chain is None:
        chain = ''.join(chains)

    resnum = kwargs.get('resnum', None)
    pdbOut = kwargs.get('pdbOut', None)
    if pdbOut is None:
        out_stem = pdbIn.split('.')[0]
    elif isinstance(pdbOut, str):
        out_stem = pdbOut.split('.')[0]
        pdbOut = None
    else:
        out_stem = pdbIn.split('.')[0]

    if resnum is None:
        effectiveness = kwargs.get('effectiveness', None)
        sensitivity = kwargs.get('sensitivity', None)
        if effectiveness is None or sensitivity is None:
            effectiveness, sensitivity = calcPerturbResponseProfiles(
                prs_matrix)

        structure.setData('effectiveness', effectiveness)
        structure.setData('sensitivity', sensitivity)

        file_effs_name = '{0}_effectiveness.pdb'.format(out_stem)
        file_sens_name = '{0}_sensitivity.pdb'.format(out_stem)
        fileEffs = open(file_effs_name, 'w')
        fileSens = open(file_sens_name, 'w')

        for line in lines:
            if line.find('ATOM') != 0 and line.find(
                    'HETATM') != 0 and line.find('ANISOU') != 0:
                fileEffs.write(line)
                fileSens.write(line)
            elif line.find('ATOM') == 0:
                fileEffs.write(line[:60] + '{:6.2f}'.format(float(structure.select( \
                               'chain {0} and resnum {1}'.format(line[21],line[22:26])) \
                               .getData('effectiveness')) * 100/np.max( \
                               structure.getData('effectiveness'))) + line[66:])
                fileSens.write(line[:60] + '{:6.2f}'.format(float(structure.select( \
                               'chain {0} and resnum {1}'.format(line[21],line[22:26])) \
                               .getData('sensitivity')) * 100/np.max( \
                               structure.getData('sensitivity'))) + line[66:])
            elif line.find('HETATM') == 0:
                fileEffs.write(line[:60] + '  0.00' + line[66:])
                fileSens.write(line[:60] + '  0.00' + line[66:])

        fileEffs.close()
        fileSens.close()
        LOGGER.info('The effectiveness and sensitivity profiles were written' \
                    ' to {0} and {1}.'.format(file_effs_name,file_sens_name))

        returnData = kwargs.get('returnData', False)
        if returnData:
            return structure, effectiveness, sensitivity
        else:
            return

    direction = kwargs.get('direction', 'effect')
    for c in chain:
        if c not in chains:
            raise PRSMatrixParseError('Chain {0} was not found in {1}'.format(
                c, pdbIn))

    if pdbOut is None:
        pdbOut = []
        for c in chain:
            pdbOut.append('{0}_{1}_{2}{3}_{4}.pdb' \
                          .format(out_stem, c, \
                                  str(structure.select('chain {0} and resnum {1}' \
                                      .format(c, resnum)).getResnames()), \
                                  resnum, direction))

    for n, c in enumerate(chain):
        fo = open(pdbOut[n], 'w')
        for line in lines:
            if line.find('ATOM') != 0 and line.find(
                    'HETATM') != 0 and line.find('ANISOU') != 0:
                fo.write(line)
            elif line.find('ATOM') == 0:
                if direction == 'effect':
                    fo.write(line[:60] + '{:6.2f}'.format(float(structure.getData('prs_matrix') \
                                         [structure.select('chain {0} and resnum {1}' \
                                          .format(c, resnum)).getResindices(), \
                                          structure.select('chain {0} and resnum {1}' \
                                          .format(line[21], line[22:26])).getResindices()])*100) \
                             + line[66:])
                else:
                    fo.write(line[:60] + '{:6.2f}'.format(float(structure.getData('prs_matrix') \
                                         [structure.select('chain {0} and resnum {1}' \
                                          .format(line[21], line[22:26])).getResindices(), \
                                          structure.select('chain {0} and resnum {1}' \
                                          .format(c, resnum)).getResindices()])*100) \
                             + line[66:])
            elif line.find('HETATM') == 0:
                fo.write(line[:60] + '  0.00' + line[66:])

        fo.close()

    LOGGER.info('Perturbation responses for specific residues were written'
                ' to {0}.'.format(', '.join(pdbOut)))
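A minimal usage sketch, assuming the PRS calculation shown in an earlier
listing is ProDy's calcPerturbResponse (that listing is truncated, so the
name is an assumption) and a hypothetical input file protein.pdb:

from prody import parsePDB, ANM, calcPerturbResponse

ca = parsePDB('protein.pdb', subset='ca')
anm = ANM('prs')
anm.buildHessian(ca)
anm.calcModes()
# assumed to return the atoms and the normalized PRS matrix, as above
atoms, prs_matrix = calcPerturbResponse(anm, atoms=ca)
# writes protein_effectiveness.pdb and protein_sensitivity.pdb
writePerturbResponsePDB(prs_matrix, pdbIn='protein.pdb')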
Beispiel #36
0
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at least 16
        characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            tsv = urllib2.urlopen(result_request).read()
        except Exception:
            raise ValueError('No matching Pfam domains were found.')

        # parse the TSV results into a dictionary keyed by Pfam accession
        matches = {}

        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = dict(zip(keys, line.split('\t')))

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child[
                'Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                polymers = []
                LOGGER.warn('failed to parse header for {0} ({1})'.format(
                    seq[:4], str(err)))
            chid = seq[4:].upper()

            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'.format(
                                    idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND', 'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
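A minimal usage sketch, querying by UniProt accession (P19491 is an
arbitrary example); as the code above shows, the layout of each match
dictionary differs between sequence queries and accession queries:

matches = searchPfam('P19491')
for acc, info in matches.items():
    print(acc, info.get('id'), info.get('locations'))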
Beispiel #37
0
def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
    """Returns a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
         ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
         ``'rp75'`` or ``'uniprot'`` where rp stands for representative 
         proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``"""

    url = prefix + 'family/acc?id=' + acc
    handle = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    orig_acc = acc
    acc = handle.readline().strip()
    if PY3K:
        acc = acc.decode()
    url_flag = False

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'.format(
            repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, ncbi, '
                         'metagenomics, rp15, rp35, rp55, rp75 or uniprot')
    if alignment in ('ncbi', 'metagenomics', 'uniprot'):
        url = (prefix + 'family/' + acc + '/alignment/' + alignment +
               '/gzipped')
        url_flag = True
        extension = '.sth'
    else:
        if not kwargs:
            url = (prefix + 'family/' + acc + '/alignment/' + alignment +
                   '/gzipped')
            url_flag = True
            extension = '.sth'
        else:
            align_format = kwargs.get('format', 'selex').lower()

            if align_format not in FORMAT_OPTIONS['format']:
                raise ValueError('alignment format must be of type selex'
                                 ' stockholm or fasta. MSF not supported')

            if align_format == SELEX:
                align_format, extension = 'pfam', '.slx'
            elif align_format == FASTA:
                extension = '.fasta'
            else:
                extension = '.sth'

            gaps = str(kwargs.get('gaps', 'dashes')).lower()
            if gaps not in FORMAT_OPTIONS['gaps']:
                raise ValueError('gaps must be of type mixed, dots, dashes, '
                                 'or None')

            inserts = kwargs.get('inserts', 'upper').lower()
            if (inserts not in FORMAT_OPTIONS['inserts']):
                raise ValueError('inserts must be of type lower or upper')

            order = kwargs.get('order', 'tree').lower()
            if order not in FORMAT_OPTIONS['order']:
                raise ValueError('order must be of type tree or alphabetical')

            url = (prefix + 'family/' + acc + '/alignment/' + alignment +
                   '/format?format=' + align_format + '&alnType=' + alignment +
                   '&order=' + order[0] + '&case=' + inserts[0] + '&gaps=' +
                   gaps + '&download=1')

    response = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    outname = kwargs.get('outname', None)
    if not outname:
        outname = orig_acc
    folder = str(kwargs.get('folder', '.'))
    filepath = join(makePath(folder), outname + '_' + alignment + extension)
    if compressed:
        filepath = filepath + '.gz'
        if url_flag:
            f_out = open(filepath, 'wb')
        else:
            f_out = openFile(filepath, 'wb')
        f_out.write(response.read())
        f_out.close()
    else:
        if url_flag:
            gunzip(response.read(), filepath)
        else:
            with open(filepath, 'wb') as f_out:
                f_out.write(response.read())

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'.format(
        orig_acc, filepath))

    return filepath
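A minimal usage sketch, fetching the seed alignment of an arbitrary family
(PF00069) in FASTA format; the keyword arguments route the request through
the format-specific URL branch above:

filepath = fetchPfamMSA('PF00069', alignment='seed', format='fasta',
                        outname='pkinase')
print(filepath)  # e.g. 'pkinase_seed.fasta'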
Beispiel #38
0
    def iterpose(self, rmsd=0.0001):
        confs = copy(self._confs)
        Ensemble.iterpose(self, rmsd)
        self._confs = confs
        LOGGER.info('Final superposition to calculate transformations.')
        self.superpose()
Beispiel #39
0
def writeHeatmap(filename, heatmap, **kwargs):
    """Return *filename* that contains *heatmap* in Heat Mapper :file:`.hm`
    file (extension is automatically added when not found).  *filename* may
    also be an output stream.

    :arg title: title of the heatmap
    :type title: str

    :arg xlabel: x-axis label, default is ``'unknown'``
    :type xlabel: str

    :arg ylabel: y-axis label, default is ``'unknown'``
    :type ylabel: str

    :arg xorigin: x-axis origin, default is 0
    :type xorigin: float

    :arg xstep: x-axis step, default is 1
    :type xstep: float

    :arg min: minimum value, default is minimum in *heatmap*
    :type min: float

    :arg max: maximum value, default is maximum in *heatmap*
    :type max: float

    :arg format: number format, default is ``'%f'``
    :type format: str

    Other keyword arguments that are arrays with length equal to the first
    dimension of *heatmap* (the y-axis) will be considered as *numbering*."""

    try:
        ndim, shape = heatmap.ndim, heatmap.shape
    except AttributeError:
        raise TypeError('heatmap must be an array object')
    if ndim != 2:
        raise TypeError('heatmap must be a 2D array')

    try:
        write, close, stream = filename.write, lambda: None, filename
    except AttributeError:
        out = openFile(addext(filename, '.hm'), 'wb')
        write, close, stream = out.write, out.close, out

    format = kwargs.pop('format', '%f')
    write('-min "{0}"\n'.format(kwargs.pop('min', heatmap.min())))
    write('-max "{0}"\n'.format(kwargs.pop('max', heatmap.max())))
    for label, default in [
        ('title', 'unknown'),
        ('xlabel', 'unknown'),
        ('xorigin', 0),
        ('xstep', 1),
        ('ylabel', 'unknown'),
    ]:
        write('-{0} "{1}"\n'.format(label, kwargs.pop(label, default)))

    numbering = []
    numlabels = []
    for key, val in kwargs.items():
        try:
            length = len(val)
        except TypeError:
            LOGGER.warn('Keyword argument {0} is not used.'.format(key))
        else:
            if length == shape[0]:
                numlabels.append(key)
                numbering.append(val)
    if not numbering:
        numlabels.append('unknown')
        numbering.append(arange(1, shape[0] + 1))

    write('-numbering "{0}"\n'.format(':'.join(numlabels)))

    for i, row in enumerate(heatmap):
        write(':'.join(str(nums[i]) for nums in numbering) + ':')
        row.tofile(stream, sep=';', format=format)
        write(';\n')

    close()
    return filename
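A minimal usage sketch for the function above: a 10 x 20 map with labels,
where the extra resnum array matches the first dimension and is picked up
as numbering:

import numpy as np

heatmap = np.random.rand(10, 20)
writeHeatmap('example.hm', heatmap, title='demo', xlabel='column',
             ylabel='residue', resnum=np.arange(1, 11))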
Beispiel #40
0
def print_sat_mutagen_figure(filename,
                             rhapsody_obj,
                             res_interval=None,
                             PolyPhen2=True,
                             EVmutation=True,
                             extra_plot=None,
                             fig_height=8,
                             fig_width=None,
                             dpi=300,
                             min_interval_size=15,
                             html=False,
                             main_clsf='main',
                             aux_clsf='aux.'):

    # check inputs
    assert isinstance(filename, str), 'filename must be a string'
    assert isinstance(rhapsody_obj, Rhapsody), 'not a Rhapsody object'
    assert rhapsody_obj._isColSet('main score'), 'predictions not found'
    assert rhapsody_obj._isSaturationMutagenesis(), 'unable to create figure'
    if res_interval is not None:
        assert isinstance(res_interval, tuple) and len(res_interval) == 2, \
               'res_interval must be a tuple of 2 values'
        assert res_interval[1] >= res_interval[0], 'invalid res_interval'
    if extra_plot is not None:
        assert len(extra_plot) == rhapsody_obj.numSAVs, \
               'length of additional predictions array is incorrect'
    assert isinstance(fig_height, (int, float))
    assert isinstance(dpi, int)

    matplotlib = _try_import_matplotlib()
    if matplotlib is None:
        return

    # delete extension from filename
    filename = os.path.splitext(filename)[0]

    # make sure that all variants belong to the same Uniprot sequence
    accs = [s.split()[0] for s in rhapsody_obj.data['SAV coords']]
    if len(set(accs)) != 1:
        m = 'Only variants from a single Uniprot sequence can be accepted'
        raise ValueError(m)

    # select an appropriate interval, based on available predictions
    seq_pos = [int(s.split()[1]) for s in rhapsody_obj.data['SAV coords']]
    res_min = np.min(seq_pos)
    res_max = np.max(seq_pos)
    upper_lim = res_max + min_interval_size

    # create empty (20 x num_res) mutagenesis tables
    table_best = np.full((20, upper_lim), np.nan)
    table_main = table_best.copy()
    if extra_plot is not None:
        table_other = table_best.copy()
    if PolyPhen2:
        table_PP2 = table_best.copy()
    if EVmutation:
        table_EVmut = table_best.copy()

    # import pathogenicity probabilities from Rhapsody object
    p_best = rhapsody_obj.getPredictions(classifier='best')['path. prob.']
    p_main = rhapsody_obj.data['main path. prob.']
    if PolyPhen2:
        rhapsody_obj._calcPolyPhen2Predictions()
        p_PP2 = rhapsody_obj.data['PolyPhen-2 score']
    if EVmutation:
        rhapsody_obj._calcEVmutationPredictions()
        EVmut_score = np.array(rhapsody_obj.data['EVmutation score'])
        EVmut_cutoff = SETTINGS.get('EVmutation_metrics')['optimal cutoff']
        p_EVmut = -EVmut_score / EVmut_cutoff * 0.5

    # fill tables with predicted probability
    #  1:    deleterious
    #  0:    neutral
    # 'nan': no prediction/wt
    aa_list = 'ACDEFGHIKLMNPQRSTVWY'
    aa_map = {aa: i for i, aa in enumerate(aa_list)}
    for i, SAV in enumerate(rhapsody_obj.data['SAV coords']):
        aa_mut = SAV.split()[3]
        index = int(SAV.split()[1]) - 1
        table_best[aa_map[aa_mut], index] = p_best[i]
        table_main[aa_map[aa_mut], index] = p_main[i]
        if extra_plot is not None:
            table_other[aa_map[aa_mut], index] = extra_plot[i]
        if PolyPhen2:
            table_PP2[aa_map[aa_mut], index] = p_PP2[i]
        if EVmutation:
            table_EVmut[aa_map[aa_mut], index] = p_EVmut[i]

    # compute average pathogenicity profiles
    # NB: I expect to see RuntimeWarnings in this block
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        avg_p_best = np.nanmean(table_best, axis=0)
        avg_p_main = np.nanmean(table_main, axis=0)
        min_p = np.nanmin(table_best, axis=0)
        max_p = np.nanmax(table_best, axis=0)
        if extra_plot is not None:
            avg_p_other = np.nanmean(table_other, axis=0)
        if PolyPhen2:
            avg_p_PP2 = np.nanmean(table_PP2, axis=0)
        if EVmutation:
            avg_p_EVmut = np.nanmean(table_EVmut, axis=0)

    # use upper strip for showing additional info, such as PDB lengths
    upper_strip = np.full((1, upper_lim), np.nan)
    PDB_sizes = np.zeros(upper_lim, dtype=int)
    PDB_coords = [''] * upper_lim
    for s in rhapsody_obj.data:
        index = int(s['SAV coords'].split()[1]) - 1
        if s['PDB size'] != 0:
            PDB_length = int(s['PDB size'])
            PDBID_chain = ':'.join(s['PDB SAV coords'][0].split()[:2])
            upper_strip[0, index] = PDB_length
            PDB_sizes[index] = PDB_length
            PDB_coords[index] = PDBID_chain
    max_PDB_size = max(PDB_sizes)
    if max_PDB_size != 0:
        upper_strip[0, :] /= max_PDB_size

    # PLOT FIGURE

    from matplotlib import pyplot as plt
    from matplotlib import gridspec as gridspec

    # portion of the sequence to display
    if res_interval is None:
        res_interval = (res_min, res_max)
    # adjust interval
    res_i, res_f = _adjust_res_interval(res_interval, upper_lim,
                                        min_interval_size)
    nres_shown = res_f - res_i + 1

    # figure proportions
    if fig_width is None:
        fig_width = fig_height / 2  # inches
        fig_width *= nres_shown / 20
    fig, ax = plt.subplots(3, 2, figsize=(fig_width, fig_height))
    wspace = 0.5  # inches
    plt.subplots_adjust(wspace=wspace / fig_width, hspace=0.15)

    # figure structure
    gs = gridspec.GridSpec(3,
                           2,
                           width_ratios=[nres_shown, 1],
                           height_ratios=[1, 20, 10])
    ax0 = plt.subplot(gs[0, 0])  # secondary structure strip
    ax1 = plt.subplot(gs[1, 0])  # mutagenesis table
    axcb = plt.subplot(gs[1, 1])  # colorbar
    ax2 = plt.subplot(gs[2, 0])  # average profile

    # padding for tick labels
    pad = 0.2 / fig_width

    # top strip
    matplotlib.cm.YlGn.set_bad(color='antiquewhite')
    ax0.imshow(upper_strip[0:1, res_i - 1:res_f],
               aspect='auto',
               cmap='YlGn',
               vmin=0,
               vmax=1)
    ax0.set_ylim((-0.45, .45))
    ax0.set_yticks([])
    ax0.set_ylabel(f'PDB size \n[0-{max_PDB_size} res] ',
                   fontsize=14,
                   ha='right',
                   va='center',
                   rotation=0)
    ax0.set_xticks(np.arange(5 - res_i % 5, res_f - res_i + 1, 5))
    ax0.set_xticklabels([])
    # add white grid
    ax0.set_xticks(np.arange(-.5, res_f - res_i + 1, 1), minor=True)
    ax0.tick_params(axis='both', which='minor', length=0)
    ax0.grid(which='minor', color='w', linestyle='-', linewidth=.5)

    # mutagenesis table (heatmap)
    matplotlib.cm.coolwarm.set_bad(color='antiquewhite')
    im = ax1.imshow(table_best[:, res_i - 1:res_f],
                    aspect='auto',
                    cmap='coolwarm',
                    vmin=0,
                    vmax=1)
    axcb.figure.colorbar(im, cax=axcb)
    ax1.set_yticks(np.arange(len(aa_list)))
    ax1.set_yticklabels(aa_list, ha='center', position=(-pad, 0), fontsize=14)
    ax1.set_xticks(np.arange(5 - res_i % 5, res_f - res_i + 1, 5))
    ax1.set_xticklabels([])
    ax1.set_ylabel('pathog. probability', labelpad=10)
    # add white grid
    ax1.set_xticks(np.arange(-.5, res_f - res_i + 1, 1), minor=True)
    ax1.set_yticks(np.arange(-.5, 20, 1), minor=True)
    ax1.tick_params(axis='both', which='minor', length=0)
    ax1.grid(which='minor', color='w', linestyle='-', linewidth=.5)

    # average pathogenicity profile
    x_resids = np.arange(1, upper_lim + 1)
    # shading showing range of values
    ax2.fill_between(x_resids,
                     min_p,
                     max_p,
                     alpha=0.5,
                     edgecolor='salmon',
                     facecolor='salmon')
    # plot average profile for other predictions, if available
    if extra_plot is not None:
        ax2.plot(x_resids, avg_p_other, color='gray', lw=1)
    if PolyPhen2:
        ax2.plot(x_resids, avg_p_PP2, color='blue', lw=1)
    if EVmutation:
        ax2.plot(x_resids, avg_p_EVmut, color='green', lw=1)
    # solid line for predictions obtained with full classifier
    ax2.plot(x_resids, avg_p_main, 'ro-')
    # dotted line for predictions obtained with auxiliary classifier
    ax2.plot(x_resids, avg_p_best, 'ro-', markerfacecolor='none', ls='dotted')
    # cutoff line
    ax2.axhline(y=0.5, color='grey', lw=.8, linestyle='dashed')

    ax2.set_xlim((res_i - .5, res_f + .5))
    ax2.set_xlabel('residue number')
    ax2.set_ylim((-0.05, 1.05))
    ax2.set_ylabel('average', rotation=90, labelpad=10)
    ax2.set_yticklabels([])
    ax2r = ax2.twinx()
    ax2r.set_ylim((-0.05, 1.05))
    ax2r.set_yticks([0, .5, 1])
    ax2r.set_yticklabels(['0', '0.5', '1'])
    ax2r.tick_params(axis='both', which='major', pad=15)

    tight_padding = 0.1
    fig.savefig(filename + '.png',
                format='png',
                bbox_inches='tight',
                pad_inches=tight_padding,
                dpi=dpi)
    plt.close()
    plt.rcParams.update(plt.rcParamsDefault)
    LOGGER.info(f'Saturation mutagenesis figure saved to {filename}.png')

    # write a map in html format, to make figure clickable
    if html:
        all_axis = {'strip': ax0, 'table': ax1, 'bplot': ax2}

        # precompute some useful quantities for html code
        html_data = {}
        # dpi of printed figure
        html_data["dpi"] = dpi
        # figure size *before* tight
        html_data["fig_size"] = fig.get_size_inches()
        # tight bbox as used by fig.savefig()
        html_data["tight_bbox"] = fig.get_tightbbox(fig.canvas.get_renderer())
        # compute new origin and height, based on tight box and padding
        html_data["new_orig"] = html_data["tight_bbox"].min - tight_padding
        html_data["new_height"] = (html_data["tight_bbox"].height +
                                   2 * tight_padding)

        def get_area_coords(ax, d):
            # get bbox coordinates (x0, y0, x1, y1)
            bbox = ax.get_position().get_points()
            # get bbox coordinates in inches
            b_inch = bbox * d["fig_size"]
            # adjust bbox coordinates based on tight bbox
            b_adj = b_inch - d["new_orig"]
            # use html reference system (y = 1 - y)
            b_html = b_adj * np.array([1, -1]) + np.array([0, d["new_height"]])
            # convert to pixels
            b_px = (d["dpi"] * b_html).astype(int)
            b_px = np.sort(b_px, axis=0)
            # put in html format
            coords = '{},{},{},{}'.format(*b_px.flatten())
            # output
            return coords

        # html templates
        area_html = Template('<area shape="rect" coords="$coords" '
                             'id="{{map_id}}_$areaid" {{area_attrs}}> \n')

        # write html
        with open(filename + '.html', 'w') as f:
            f.write('<div>\n')
            f.write('<map name="{{map_id}}" id="{{map_id}}" {{map_attrs}}>\n')
            for ax_type, ax in all_axis.items():
                fields = {'areaid': ax_type}
                fields['coords'] = get_area_coords(ax, html_data)
                f.write(area_html.substitute(fields))
            f.write('</map>\n')
            f.write('</div>\n')

        # populate info table that will be passed as a javascript variable
        best_preds = rhapsody_obj.getPredictions()
        best_avg_preds = rhapsody_obj.getResAvgPredictions()
        PDB_coords = rhapsody_obj.getPDBcoords()
        abbrev = {
            '?': '?',
            'deleterious': 'del',
            'neutral': 'neu',
            'prob.delet.': 'p.del',
            'prob.neutral': 'p.neu'
        }
        info = {}
        for k in ['strip', 'table', 'bplot']:
            n_cols = 20 if k == 'table' else 1
            info[k] = [[''] * nres_shown for i in range(n_cols)]
        for i, row in enumerate(rhapsody_obj.data):
            SAV = row['SAV coords']
            acc, resid, aa_wt, aa_mut = SAV.split()
            resid = int(resid)
            # consider only residues shown in figure
            if not (res_i <= resid <= res_f):
                continue
            # SAV coordinates
            SAV_code = f'{aa_wt}{resid}{aa_mut}'
            # coordinates on table
            t_i = aa_map[aa_mut]
            t_j = resid - 1
            # coordinates on *shown* table
            ts_i = t_i
            ts_j = resid - res_i
            # compose message for table
            bp = best_preds[i]
            pprob = bp['path. prob.']
            pclass = bp['path. class']
            clsf = main_clsf if row['best classifier'] == 'main' else aux_clsf
            m = f'{SAV_code}: Rhapsody-{clsf} = {pprob:<3.2f} ({pclass})'
            if PolyPhen2:
                score = bp['PolyPhen-2 score']
                pclass = abbrev[bp['PolyPhen-2 path. class']]
                m += f', PolyPhen-2 = {score:<3.2f} ({pclass})'
            if EVmutation:
                score = bp['EVmutation score']
                pclass = abbrev[bp['EVmutation path. class']]
                m += f', EVmutation = {score:<3.2f} ({pclass})'
            if extra_plot is not None:
                score = table_other[t_i, t_j]
                m += f', other = {score:<3.2f}'
            info['table'][ts_i][ts_j] = m
            info['table'][aa_map[aa_wt]][ts_j] = f'{SAV_code[:-1]}: wild-type'
            if i % 19 == 0:
                # compose message for upper strip
                PDBID, ch, resid, aa, size = PDB_coords[i][[
                    'PDBID', 'chain', 'resid', 'resname', 'PDB size'
                ]]
                if size > 0:
                    m = f'{PDBID}:{ch}, resid {resid}, aa {aa}, size {size}'
                else:
                    m = 'no PDB found'
                info['strip'][0][ts_j] = m
                # compose message for bottom plot (residue-averages)
                bap = best_avg_preds[int(i / 19)]
                pprob = bap['path. prob.']
                pcl = bap['path. class']
                m = f'{SAV_code[:-1]}: Rhapsody-{clsf} = {pprob:<3.2f} ({pcl})'
                if PolyPhen2:
                    score = bap['PolyPhen-2 score']
                    pcl = abbrev[bap['PolyPhen-2 path. class']]
                    m += f', PolyPhen-2 = {score:<3.2f} ({pcl})'
                if EVmutation:
                    score = bap['EVmutation score']
                    pcl = abbrev[bap['EVmutation path. class']]
                    m += f', EVmutation = {score:<3.2f} ({pcl})'
                if extra_plot is not None:
                    score = avg_p_other[t_j]
                    m += f', other = {score:<3.2f}'
                info['bplot'][0][ts_j] = m

        def create_info_msg(ax_type, d):
            text = '[ \n'
            for row in d:
                text += '  ['
                for m in row:
                    text += f'"{m}",'
                text += '], \n'
            text += ']'
            return text

        area_js = Template('{{map_data}}["{{map_id}}_$areaid"] = { \n'
                           '  "img_id": "{{img_id}}", \n'
                           '  "map_id": "{{map_id}}", \n'
                           '  "coords": [$coords], \n'
                           '  "num_rows": $num_rows, \n'
                           '  "num_cols": $num_cols, \n'
                           '  "info_msg": $info_msg, \n'
                           '}; \n')

        # dump info in javascript format
        with open(filename + '.js', 'w') as f:
            f.write('var {{map_data}} = {}; \n')
            for ax_type, d in info.items():
                vars = {'areaid': ax_type}
                vars['coords'] = get_area_coords(all_axis[ax_type], html_data)
                vars['num_rows'] = 20 if ax_type == 'table' else 1
                vars['num_cols'] = nres_shown
                vars['info_msg'] = create_info_msg(ax_type, d)
                f.write(area_js.substitute(vars))

        return info
    return
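A minimal usage sketch, assuming rhaps is a hypothetical Rhapsody object on
which saturation mutagenesis predictions have already been computed:

# writes sat_mut.png, plus sat_mut.html and sat_mut.js when html=True
print_sat_mutagen_figure('sat_mut.png', rhaps, res_interval=(1, 120),
                         PolyPhen2=True, EVmutation=True, html=True)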
Beispiel #41
0
    def calcEvolProperties(self,
                           resid='all',
                           refresh=False,
                           folder=None,
                           max_cols=None,
                           max_seqs=25000,
                           **kwargs):
        '''Computes Evol properties, i.e. Shannon entropy, Mutual
        Information and Direct Information, from Pfam Multiple
        Sequence Alignments, for a given residue.
        '''
        assert type(refresh) is bool
        # recover Pfam mapping (if not found already)
        self._searchPfam(refresh=refresh)
        if resid == 'all':
            PF_list = self.Pfam.keys()
        else:
            # get list of Pfam domains containing resid
            PF_list = [
                k for k in self.Pfam if any([
                    resid >= int(segment['start'])
                    and resid <= int(segment['end'])
                    for segment in self.Pfam[k]['locations']
                ])
            ]
            if len(PF_list) == 0:
                raise RuntimeError(f'No Pfam domain for resid {resid}.')
            if len(PF_list) > 1:
                LOGGER.warn('Residue {} is found in multiple ({}) Pfam '
                            'domains.'.format(resid, len(PF_list)))
        if folder is None:
            folder = SETTINGS.get('rhapsody_local_folder')
            if folder is None:
                folder = '.'
            else:
                folder = os.path.join(folder, 'pickles')
        # iterate over Pfam families
        for PF in PF_list:
            d = self.Pfam[PF]
            # skip if properties are pre-computed
            if not refresh and d.get('mapping') is not None:
                continue
            d['mapping'] = None
            d['ref_MSA'] = None
            d['entropy'] = np.nan
            d['MutInfo'] = np.nan
            d['DirInfo'] = np.nan
            try:
                LOGGER.info('Processing {}...'.format(PF))
                # fetch & parse MSA without saving the downloaded file
                f = pd.fetchPfamMSA(PF)
                msa = pd.parseMSA(f, **kwargs)
                os.remove(f)
                # slice MSA to match all segments of the Uniprot sequence
                sliced_msa, indexes = self._sliceMSA(msa)
                # get mapping between Uniprot sequence and Pfam domain
                d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
            except Exception as e:
                LOGGER.warn('{}: {}'.format(PF, e))
                d['mapping'] = str(e)
                continue
            try:
                # refine MSA ('seqid' param. is set as in PolyPhen-2)
                rowocc = 0.6
                while True:
                    sliced_msa = pd.refineMSA(sliced_msa, rowocc=rowocc)
                    rowocc += 0.02
                    if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                        break
                ref_msa = pd.refineMSA(sliced_msa, seqid=0.94, **kwargs)
                d['ref_MSA'] = ref_msa
                # compute evolutionary properties
                d['entropy'] = pd.calcShannonEntropy(ref_msa)
                d['MutInfo'] = pd.buildMutinfoMatrix(ref_msa)
                # d['DirInfo'] = buildDirectInfoMatrix(ref_msa)
            except Exception as e:
                LOGGER.warn('{}: {}'.format(PF, e))
        return {k: self.Pfam[k] for k in PF_list}
Beispiel #42
0
def calcPairDeformationDist(model, coords, ind1, ind2, kbt=1.):
    """Returns distribution of the deformations in the distance contributed by each mode 
    for selected pair of residues *ind1* *ind2* using *model* from a :class:`.ANM`.
    Method described in [EB08]_ equation (10) and figure (2).     
    
    .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of 
        the Anisotropic Response of Proteins to External Forces:
        Insights from Elastic Network Models. *Biophys J* **2008** 94:3424-34355. 
    
    :arg model: this is an 3-dimensional :class:`NMA` instance from a :class:`.ANM`
        calculations.
    :type model: :class:`.ANM`  

    :arg coords: a coordinate set or an object with :meth:`getCoords` method.
        Recommended: ``coords = parsePDB('pdbfile').select('protein and name CA')``.
    :type coords: :class:`~numpy.ndarray`.

    :arg ind1: first residue number.
    :type ind1: int 
    
    :arg ind2: secound residue number.
    :type ind2: int 
    """

    try:
        resnum_list = coords.getResnums()
        resnam_list = coords.getResnames()
        coords = (coords._getCoords()
                  if hasattr(coords, '_getCoords') else coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    if not isinstance(model, NMA):
        raise TypeError('model must be a NMA instance')
    elif not model.is3d():
        raise TypeError('model must be a 3-dimensional NMA instance')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    linalg = importLA()
    n_atoms = model.numAtoms()
    n_modes = model.numModes()
    LOGGER.timeit('_pairdef')

    r_ij = np.zeros((n_atoms, n_atoms, 3))
    r_ij_norm = np.zeros((n_atoms, n_atoms, 3))

    for i in range(n_atoms):
        for j in range(i + 1, n_atoms):
            r_ij[i][j] = coords[j, :] - coords[i, :]
            r_ij[j][i] = r_ij[i][j]
            r_ij_norm[i][j] = r_ij[i][j] / linalg.norm(r_ij[i][j])
            r_ij_norm[j][i] = r_ij_norm[i][j]

    eigvecs = model.getEigvecs()
    eigvals = model.getEigvals()

    D_pair_k = []
    mode_nr = []
    ind1 = ind1 - resnum_list[0]
    ind2 = ind2 - resnum_list[0]

    for m in range(6, n_modes):
        U_ij_k = [(eigvecs[ind1*3][m] - eigvecs[ind2*3][m]), (eigvecs[ind1*3+1][m] \
            - eigvecs[ind2*3+1][m]), (eigvecs[ind1*3+2][m] - eigvecs[ind2*3+2][m])]
        D_ij_k = abs(
            sqrt(kbt / eigvals[m]) * (np.vdot(r_ij_norm[ind1][ind2], U_ij_k)))
        D_pair_k.append(D_ij_k)
        mode_nr.append(m)

    LOGGER.report('Deformation was calculated in %.2lfs.', label='_pairdef')

    return mode_nr, D_pair_k
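A minimal usage sketch, building an ANM for CA atoms ('1ubi' and the
residue pair are arbitrary choices):

from prody import parsePDB, ANM

ca = parsePDB('1ubi').select('protein and name CA')
anm = ANM('1ubi')
anm.buildHessian(ca)
anm.calcModes()
modes, deformations = calcPairDeformationDist(anm, ca, 10, 40)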
Beispiel #43
0
def _getPolymers(lines):
    """Returns list of polymers (macromolecules)."""

    pdbid = lines['pdbid']
    polymers = dict()
    for i, line in lines['SEQRES']:
        ch = line[11]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.sequence += ''.join(getSequence(line[19:].split()))

    for i, line in lines['DBREF ']:
        i += 1

        ch = line[12]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('DBREF chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        dbabbr = line[26:32].strip()
        dbref = DBRef()
        dbref.dbabbr = dbabbr
        dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown')
        dbref.accession = line[33:41].strip()
        dbref.idcode = line[42:54].strip()

        try:
            first = int(line[14:18])
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'initial sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            last = int(line[20:24])
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'ending sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.first = (first, line[18], int(line[56:60]))
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'initial sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.last = (last, line[24].strip(), int(line[62:67]))
        except:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'ending sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))

        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.dbrefs.append(dbref)

    dbref1 = lines['DBREF1']
    dbref2 = lines['DBREF2']
    if len(dbref1) != len(dbref2):
        LOGGER.warn('DBREF1 and DBREF2 records are not complete')
        dbref12 = []
    else:
        dbref12 = zip(dbref1, dbref2)  # PY3K: OK

    for dbref1, dbref2 in dbref12:
        i, line = dbref1
        i += 1
        ch = line[12]

        dbabbr = line[26:32].strip()
        dbref = DBRef()
        dbref.dbabbr = dbabbr
        dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown')
        dbref.idcode = line[47:67].strip()

        try:
            first = int(line[14:18])
        except:
            LOGGER.warn('DBREF1 for chain {2}: failed to parse '
                        'initial sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            last = int(line[20:24])
        except:
            LOGGER.warn('DBREF1 for chain {2}: failed to parse '
                        'ending sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        i, line = dbref2
        i += 1
        if line[12] == ' ':
            LOGGER.warn('DBREF2 chain identifier is not specified '
                        '({0}:{1})'.format(pdbid, ch))
        elif line[12] != ch:
            LOGGER.warn('DBREF1 and DBREF2 chain id mismatch'
                        '({0}:{1})'.format(pdbid, ch))

        dbref.accession = line[18:40].strip()
        try:
            dbref.first = (first, line[18].strip(), int(line[45:55]))
        except:
            LOGGER.warn('DBREF2 for chain {2}: failed to parse '
                        'initial sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            dbref.last = (last, line[24].strip(), int(line[57:67]))
        except:
            LOGGER.warn('DBREF2 for chain {2}: failed to parse '
                        'ending sequence number of the database sequence '
                        '({0}:{1})'.format(pdbid, i, ch))

        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.dbrefs.append(dbref)

    for poly in polymers.values():  # PY3K: OK
        resnum = []
        for dbref in poly.dbrefs:
            dbabbr = dbref.dbabbr
            if dbabbr == 'PDB':
                if not (pdbid == dbref.accession == dbref.idcode):
                    LOGGER.warn('DBREF for chain {2} refers to PDB '
                                'entry {3} ({0}:{1})'.format(
                                    pdbid, i, poly.chid, dbref.accession))
            else:
                if pdbid == dbref.accession or pdbid == dbref.idcode:
                    LOGGER.warn('DBREF for chain {2} is {3}, '
                                'expected PDB ({0}:{1})'.format(
                                    pdbid, i, poly.chid, dbabbr))
                    dbref.database = 'PDB'
            resnum.append((dbref.first[0], dbref.last[0]))
        resnum.sort()
        last = -10000
        for first, temp in resnum:
            if first <= last:
                LOGGER.warn('DBREF records overlap for chain {0} ({1})'.format(
                    poly.chid, pdbid))
            last = temp

    for i, line in lines['MODRES']:
        ch = line[16]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('MODRES chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        if poly.modified is None:
            poly.modified = []
        poly.modified.append(
            (line[12:15].strip(), line[18:22].strip() + line[22].strip(),
             line[24:27].strip(), line[29:70].strip()))

    for i, line in lines['SEQADV']:
        i += 1
        ch = line[16]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('SEQADV chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        dbabbr = line[24:28].strip()
        resname = line[12:15].strip()
        try:
            resnum = int(line[18:22].strip())
        except:
            #LOGGER.warn('SEQADV for chain {2}: failed to parse PDB sequence '
            #            'number ({0}:{1})'.format(pdbid, i, ch))
            continue
        icode = line[22].strip()
        try:
            dbnum = int(line[43:48].strip())
        except:
            #LOGGER.warn('SEQADV for chain {2}: failed to parse database '
            #            'sequence number ({0}:{1})'.format(pdbid, i, ch))
            continue

        comment = line[49:70].strip()
        match = False
        for dbref in poly.dbrefs:
            if not dbref.first[0] <= resnum <= dbref.last[0]:
                continue
            match = True
            if dbref.dbabbr != dbabbr:
                LOGGER.warn('SEQADV for chain {2}: reference database '
                            'mismatch, expected {3} parsed {4} '
                            '({0}:{1})'.format(pdbid, i,
                                               ch, repr(dbref.dbabbr),
                                               repr(dbabbr)))
                continue
            dbacc = line[29:38].strip()
            if dbref.accession[:9] != dbacc[:9]:
                LOGGER.warn('SEQADV for chain {2}: accession code '
                            'mismatch, expected {3} parsed {4} '
                            '({0}:{1})'.format(pdbid, i, ch,
                                               repr(dbref.accession),
                                               repr(dbacc)))
                continue
            dbref.diff.append((resname, resnum, icode, dbnum, dbnum, comment))
        if not match:
            LOGGER.warn('SEQADV for chain {2}: database sequence reference '
                        'not found ({0}:{1})'.format(pdbid, i, ch))
            continue

    string = ' '.join([line[10:].strip() for i, line in lines['COMPND']])
    if string.startswith('MOL_ID'):
        dict_ = {}
        for molecule in string[6:].split('MOL_ID'):
            dict_.clear()
            for token in molecule.split(';'):
                token = token.strip()
                if not token:
                    continue
                items = token.split(':', 1)
                if len(items) == 2:
                    key, value = items
                    dict_[key.strip()] = value.strip()

            chains = dict_.pop('CHAIN', '').strip()

            if not chains:
                continue
            for ch in chains.split(','):
                ch = ch.strip()
                poly = polymers.get(ch, Polymer(ch))
                polymers[ch] = poly
                poly.name = dict_.get('MOLECULE', '')

                poly.fragment = dict_.get('FRAGMENT', '')

                poly.comments = dict_.get('OTHER_DETAILS', '')

                val = dict_.get('SYNONYM', '')
                poly.synonyms = [s.strip()
                                 for s in val.split(',')] if val else []

                val = dict_.get('EC', '')
                poly.ec = [s.strip() for s in val.split(',')] if val else []

                poly.engineered = dict_.get('ENGINEERED', '') == 'YES'
                poly.mutation = dict_.get('MUTATION', '') == 'YES'

    return list(polymers.values())
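
# --- usage sketch (added for illustration; not part of the original) ---
# The routine above assembles Polymer records from DBREF, MODRES, SEQADV,
# and COMPND lines.  In ProDy these records are normally reached through
# parsePDBHeader; a minimal inspection, assuming network access and a
# valid PDB identifier, might look like this:
from prody import parsePDBHeader

polymers = parsePDBHeader('1mkp', 'polymers')
for poly in polymers:
    print(poly.chid, poly.name, [dbref.accession for dbref in poly.dbrefs])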
Example #44
0
def writeDCD(filename,
             trajectory,
             start=None,
             stop=None,
             step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be an :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of file."""
    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'.format(
            type(trajectory)))

    irange = list(
        range(*slice(start, stop, step).indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')
    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex()
        trajectory.reset()
        pack_i_48 = pack('i', 48)
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
            n_fixed = trajectory.numFixed()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
            n_fixed = trajectory.numFixed()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1
        n_fixed = 0

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    time_ = time()
    for j, i in enumerate(irange):
        diff = i - prev
        if diff > 1:
            trajectory.skip(diff - 1)
        prev = i
        if isTrajectory:
            frame = next(trajectory)
            if frame is None:
                break
            if unitcell:
                uc = frame._getUnitcell()
                uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                uc = uc[[0, 3, 1, 4, 5, 2]]
        elif isEnsemble:
            frame._index = i
        else:
            frame.setACSIndex(i)
        if align:
            frame.superpose()
        if j == 0:
            dcd.write(frame._getCoords(),
                      uc,
                      timestep=timestep,
                      firsttimestep=first_ts,
                      framefreq=framefreq)
        else:
            dcd.write(frame._getCoords(), uc)
        LOGGER.update(i, label='_prody_writeDCD')
    if isAtomic:
        trajectory.setACSIndex(acsi)
    j += 1
    LOGGER.finish()
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_csets / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'.format(
        dcd_size, dcd_size / time_))
    LOGGER.info(
        '{0} coordinate sets written at output rate {1} frame/s.'.format(
            n_csets, int(n_csets / time_)))
    if j != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'.format(
            n_csets, j))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
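
# --- usage sketch (added for illustration; not part of the original) ---
# writeDCD accepts a Trajectory, DCDFile, Ensemble, or Atomic instance and
# returns the output filename.  A minimal thinning pass, assuming a file
# named 'mdm2.dcd' exists in the working directory:
from prody import Trajectory

traj = Trajectory('mdm2.dcd')
writeDCD('every_tenth_frame.dcd', traj, step=10)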
Example #45
0
def parseHeatmap(heatmap, **kwargs):
    """Return a two dimensional array and a dictionary with information parsed
    from *heatmap*, which may be an input stream or an :file:`.hm` file in VMD
    plugin Heat Mapper format."""

    try:
        readline, close = heatmap.readline, lambda: None
    except AttributeError:
        heatmap = openFile(heatmap)
        readline, close = heatmap.readline, heatmap.close

    meta = {}
    arrs = []

    line = readline()
    while line:
        if line.startswith('-'):
            label, data = line[1:].split(None, 1)
            data = data.strip()
            if data[0] == data[-1] == '"':
                data = data[1:-1]
            label = label.strip()
            try:
                meta[label] = HMTYPES[label](data)
            except KeyError:
                LOGGER.warn('Unrecognized label encountered: {0}'
                            .format(repr(label)))
                meta[label] = data  # fall back to the raw string
            except TypeError:
                LOGGER.warn('Could not parse data with label {0}.'
                            .format(repr(label)))
        else:
            arrs.append(line.rstrip())
        line = readline()
    close()
    nnums = len(meta.get('numbering', ''))
    heatmap = []
    numbers = []

    for arr in arrs:
        if nnums:
            items = arr.split(':', nnums + 1)
            numbers.append(items[:nnums])
        else:
            items = [arr]
        heatmap.append(fromstring(items[-1], float, sep=';'))

    heatmap = array(heatmap)
    if nnums:
        numbering = meta['numbering']
        try:
            numbers = array(numbers, int)
        except ValueError:
            try:
                numbers = array(numbers, float)
            except ValueError:
                LOGGER.warn('Numbering for y-axis could not be parsed.')
                numbering = []
        for i, label in enumerate(numbering):
            meta[label] = numbers[:, i].copy()

    return heatmap, meta
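
# --- usage sketch (added for illustration; not part of the original) ---
# parseHeatmap returns the matrix and its metadata separately.  Assuming a
# hypothetical file 'crosscorr.hm' written by VMD's Heat Mapper plugin:
heatmap, meta = parseHeatmap('crosscorr.hm')
print(heatmap.shape)
print(meta.get('title'), meta.get('xlabel'), meta.get('ylabel'))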
Example #46
0
def fetchPDBviaFTP(*pdb, **kwargs):
    """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified *pdb*
    identifier(s) and return path(s).  Downloaded files will be stored in
    local PDB folder, if one is set using :meth:`.pathPDBFolder`, and copied
    into *folder*, if specified by the user.  If no destination folder is
    specified, files will be saved in the current working directory.  If
    *compressed* is **False**, decompressed files will be copied into
    *folder*.  The *format* keyword argument can be used to retrieve
    `PDBML <http://pdbml.pdb.org/>`_, `mmCIF <http://mmcif.pdb.org/>`_,
    and `EMD <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_
    files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` will fetch an EMD file,
    and ``format='xml'`` will fetch a PDBML file.
    If a PDBML header file is desired, ``noatom=True`` will do the job."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    format = str(kwargs.pop('format', 'pdb')).lower()
    noatom = bool(kwargs.pop('noatom', False))
    report = kwargs.get('report', True)

    if format == 'pdb':
        ftp_divided = 'pdb/data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if noatom:
            ftp_divided = 'pdb/data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'pdb/data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'pdb/data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    elif format == 'emd' or format == 'map':
        ftp_divided = 'emdb/structures'
        ftp_pdbext = '.map.gz'
        ftp_prefix = 'emd_'
        extension = '.map'
    else:
        raise ValueError(repr(format) + ' is not a valid format')

    local_folder = pathPDBFolder()

    if format == 'pdb' and local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(
                    filename, join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(
                    filename, join(output_folder, pdb + extension))

    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    if report:
        LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for pdb in identifiers:
            if pdb is None:
                filenames.append(None)
                continue
            data = []
            ftp_fn = ftp_prefix + pdb + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                if format == 'emd':
                    ftp.cwd('EMD-{0}/map'.format(pdb))
                else:
                    ftp.cwd(pdb[1:3])
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn(
                        '{0} download failed ({1}). It is '
                        'possible that you do not have rights to '
                        'download .gz files in the current network.'.format(
                            pdb, str(error)))
                else:
                    LOGGER.warn('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, pdb, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = getPath(pdb)

                    with open(filename, 'w+b') as pdbfile:
                        for block in data:
                            pdbfile.write(block)

                    filename = normpath(relpath(second(filename, pdb)))
                    if report:
                        LOGGER.debug('{0} downloaded ({1})'.format(
                            pdb, sympath(filename)))
                    success += 1
                    filenames.append(filename)
                else:
                    LOGGER.warn(
                        '{0} download failed, reason unknown.'.format(pdb))
                    failure += 1
                    filenames.append(None)

        ftp.quit()

    if report:
        LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                     '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
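
# --- usage sketch (added for illustration; not part of the original) ---
# A single identifier returns one path, several identifiers return a list;
# format and compression are controlled through keyword arguments:
path = fetchPDBviaFTP('1p38')
paths = fetchPDBviaFTP('1p38', '1r39', format='cif', compressed=False)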
Example #47
0
def parseMSA(filename, **kwargs):
    """Return an :class:`.MSA` instance that stores multiple sequence alignment
    and sequence labels parsed from Stockholm, SELEX, or FASTA format
    *filename* file, which may be a compressed file. Uncompressed MSA files
    are parsed using C code at a fraction of the time it would take to parse
    compressed files in Python."""

    from .msa import MSA

    try:
        fileok = isfile(filename)
    except TypeError:
        raise TypeError('filename must be a string')
    else:
        if not fileok:
            raise IOError('[Errno 2] No such file or directory: ' +
                          repr(filename))

    # if MSA is a compressed file or filter/slice is passed, use
    #   Python parsers

    LOGGER.timeit('_parsemsa')

    title, ext = splitext(filename)
    title = split(title)[1]
    aligned = kwargs.get('aligned', True)
    if (ext.lower() == '.gz' or 'filter' in kwargs or 'slice' in kwargs or
            not aligned):
        if ext.lower() == '.gz':
            title = splitext(title)[0]
        msa = MSAFile(filename, split=False, **kwargs)
        seqlist = []
        sappend = seqlist.append
        labels = []
        lappend = labels.append
        mapping = {}
        maxlen = 0
        for i, seq in enumerate(msa):
            label = seq.getLabel(True)
            lappend(label)
            if aligned:
                sappend(seq._array)
            else:
                if len(seq) > maxlen:
                    maxlen = len(seq)
                sappend(seq)
            key = splitSeqLabel(label)[0]
            if key in mapping:
                try:
                    mapping[key].append(i)
                except AttributeError:
                    mapping[key] = [mapping[key], i]
            else:
                mapping[key] = i
        if not seqlist:
            LOGGER.warn('No sequences were parsed from {0}.'.format(filename))
            return
        if aligned:
            msaarr = array(seqlist, '|S1')
        else:
            msaarr = array(seqlist, '|S' + str(maxlen))
    else:
        filesize = getsize(filename)
        format = MSAEXTMAP.get(splitext(filename)[1])

        if format == FASTA:
            from .msaio import parseFasta as parser
        elif format == SELEX or format == STOCKHOLM:
            from .msaio import parseSelex as parser
        else:
            raise IOError('MSA file format is not recognized from the '
                          'extension')
        msaarr = empty(filesize, '|S1')
        msaarr, labels, mapping, lcount = parser(filename, msaarr)
        if lcount != len(msaarr):
            LOGGER.warn('Failed to parse {0} sequence labels.'
                        .format(len(msaarr) - lcount))

    msa = MSA(msa=msaarr, title=title, labels=labels, mapping=mapping,
              aligned=aligned)

    if aligned:
        LOGGER.report('{0} sequence(s) with {1} residues were parsed in '
                      '%.2fs.'.format(*msaarr.shape), '_parsemsa')
    else:
        LOGGER.report('{0} sequence(s) were parsed in %.2fs.'
                      .format(*msaarr.shape), '_parsemsa')
    return msa
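
# --- usage sketch (added for illustration; not part of the original) ---
# Plain files go through the fast C parser; compressed files and the
# 'filter'/'slice' keywords fall back to the Python parser.  Assuming a
# Pfam alignment saved as 'PF00069_seed.sth':
msa = parseMSA('PF00069_seed.sth')
print(msa.numSequences(), msa.numResidues())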
Example #48
0
    def buildHessian(self, coords, masses, cutoff=15., gamma=1., **kwargs):
        """Build Hessian matrix for given coordinate set.

        NOTE: MASS WEIGHTING OF HESSIAN IS PERFORMED

        :arg coords: a coordinate set or an object with ``getCoords`` method
        :type coords: :class:`numpy.ndarray`

        :arg masses: atomic masses used for mass-weighting the Hessian
        :type masses: :class:`numpy.ndarray`

        :arg cutoff: cutoff distance (Å) for pairwise interactions,
            default is 15.0 Å, minimum is 4.0 Å
        :type cutoff: float

        :arg gamma: spring constant, default is 1.0
        :type gamma: float, :class:`Gamma`

        :arg sparse: elect to use sparse matrices, default is **False**. If
            Scipy is not found, :class:`ImportError` is raised.
        :type sparse: bool

        :arg kdtree: elect to use KDTree for building Hessian matrix,
            default is **False** since KDTree method is slower
        :type kdtree: bool

        Instances of :class:`Gamma` classes and custom functions are
        accepted as *gamma* argument.

        When Scipy is available, user can select to use sparse matrices for
        efficient usage of memory at the cost of computation speed."""

        try:
            coords = (coords._getCoords()
                      if hasattr(coords, '_getCoords') else coords.getCoords())
        except AttributeError:
            try:
                checkCoords(coords)
            except TypeError:
                raise TypeError('coords must be a Numpy array or an object '
                                'with `getCoords` method')

        cutoff, g, gamma = checkENMParameters(cutoff, gamma)
        self._reset()
        self._cutoff = cutoff
        self._gamma = g
        n_atoms = coords.shape[0]

        dof = n_atoms * 3
        LOGGER.timeit('_anm_hessian')

        sparse = kwargs.get('sparse', False)
        if sparse:
            try:
                from scipy import sparse as scipy_sparse
            except ImportError:
                raise ImportError('failed to import scipy.sparse, which is '
                                  'required for sparse matrix calculations')
            kirchhoff = scipy_sparse.lil_matrix((n_atoms, n_atoms))
            hessian = scipy_sparse.lil_matrix((dof, dof))
        else:
            kirchhoff = np.zeros((n_atoms, n_atoms), 'd')
            hessian = np.zeros((dof, dof), float)

        if kwargs.get('kdtree', False):
            LOGGER.info('Using KDTree for building the Hessian.')
            kdtree = KDTree(coords)
            kdtree.search(cutoff)
            for i, j in kdtree.getIndices():
                i2j = coords[j] - coords[i]
                dist2 = np.dot(i2j, i2j)
                g = gamma(dist2, i, j)

                mass_i = masses[i]
                mass_j = masses[j]
                #super_element = np.outer(i2j, i2j) * (- g / dist2)
                super_element = np.outer(
                    i2j, i2j) * (-g / dist2) * (1.0 / np.sqrt(mass_i * mass_j))
                res_i3 = i * 3
                res_i33 = res_i3 + 3
                res_j3 = j * 3
                res_j33 = res_j3 + 3
                hessian[res_i3:res_i33, res_j3:res_j33] = super_element
                hessian[res_j3:res_j33, res_i3:res_i33] = super_element
                hessian[res_i3:res_i33, res_i3:res_i33] = \
                    hessian[res_i3:res_i33, res_i3:res_i33] - super_element
                hessian[res_j3:res_j33, res_j3:res_j33] = \
                    hessian[res_j3:res_j33, res_j3:res_j33] - super_element
                kirchhoff[i, j] = -g
                kirchhoff[j, i] = -g
                kirchhoff[i, i] = kirchhoff[i, i] + g
                kirchhoff[j, j] = kirchhoff[j, j] + g
        else:
            print "anm-mod: not using kdtree - using mass weighting"
            #
            # mass weighting of the hessian
            #
            cutoff2 = cutoff * cutoff
            for i in range(n_atoms):
                res_i3 = i * 3
                res_i33 = res_i3 + 3
                i_p1 = i + 1
                i2j_all = coords[i_p1:, :] - coords[i]
                for j, dist2 in enumerate((i2j_all**2).sum(1)):
                    if dist2 > cutoff2:
                        continue
                    i2j = i2j_all[j]
                    j += i_p1
                    g = gamma(dist2, i, j)

                    mass_i = masses[i]
                    mass_j = masses[j]
                    # super element in my own code
                    # hessian_ij = np.outer(i2j,i2j)*( - self.k/dist_ij)*(1/np.sqrt(mi*mj))

                    res_j3 = j * 3
                    res_j33 = res_j3 + 3
                    super_element = np.outer(i2j, i2j) * (-g / dist2) * (
                        1.0 / np.sqrt(mass_i * mass_j))
                    # super_element = np.outer(i2j, i2j) * (- g / dist2)
                    hessian[res_i3:res_i33, res_j3:res_j33] = super_element
                    hessian[res_j3:res_j33, res_i3:res_i33] = super_element
                    hessian[res_i3:res_i33, res_i3:res_i33] = \
                        hessian[res_i3:res_i33, res_i3:res_i33] - super_element
                    hessian[res_j3:res_j33, res_j3:res_j33] = \
                        hessian[res_j3:res_j33, res_j3:res_j33] - super_element
                    kirchhoff[i, j] = -g
                    kirchhoff[j, i] = -g
                    kirchhoff[i, i] = kirchhoff[i, i] + g
                    kirchhoff[j, j] = kirchhoff[j, j] + g

        if sparse:
            kirchhoff = kirchhoff.tocsr()
            hessian = hessian.tocsr()

        LOGGER.report('Hessian was built in %.2fs.', label='_anm_hessian')
        self._kirchhoff = kirchhoff
        self._hessian = hessian
        self._n_atoms = n_atoms
        self._dof = dof
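
# --- usage sketch (added for illustration; not part of the original) ---
# This variant scales each 3x3 super-element by 1/sqrt(m_i * m_j), i.e. it
# effectively builds M^(-1/2) H M^(-1/2).  A hypothetical call, where
# MassANM is a placeholder name for the class owning this method and the
# atoms are assumed to have masses set:
from prody import parsePDB

calphas = parsePDB('1p38', subset='ca')
masses = calphas.getMasses()
model = MassANM('1p38')  # placeholder class providing buildHessian above
model.buildHessian(calphas, masses, cutoff=15.0, gamma=1.0)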
Example #49
0
def parsePfamPDBs(query, data=[], **kwargs):
    """Returns a list of :class:`.AtomGroup` objects containing sections of chains 
    that correspond to a particular PFAM domain family. These are defined by 
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be provided.
        This query is also used for label refinement of the Pfam domain MSA.
    :type query: str

    :arg data: If given, the data list from the Pfam mapping table will
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected. 
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected. 
    :type end: int
    """

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            start_diff = []
            for i, key in enumerate(pfam_matches):
                start_diff.append(
                    int(pfam_matches[key]['locations'][0]['start']) - start)
            start_diff = np.array(start_diff)
            pfam_acc = keys[np.where(
                abs(start_diff) == min(abs(start_diff)))[0][0]]

        elif isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(
                    int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(
                abs(end_diff) == min(abs(end_diff)))[0][0]]

        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = [
        'PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC', 'UniprotAcc',
        'UniprotResnumRange'
    ]

    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except:
            LOGGER.warn('No Uniprot record found for {0}'.format(
                data_dict['PDB_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            header = headers[i]
            chain_accessions = [
                dbref.accession for dbref in header[data_dict['chain']].dbrefs
            ]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(
                        np.array(chain_accessions) ==
                        data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError(
                        'There is no accession for a chain in the Header')
            except:
                LOGGER.warn(
                    'Could not map domains in {0}'.format(data_dict['PDB_ID'] +
                                                          data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(
                data_dict['chain'])).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(
                ag.getResnums() == right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')

    return results
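
# --- usage sketch (added for illustration; not part of the original) ---
# Passing a list through *data* exposes the raw Pfam mapping rows next to
# the selected AtomGroup sections (assumes the query maps to at least one
# PDB entry):
rows = []
domains = parsePfamPDBs('P17516', data=rows, start=200)
print(len(domains), rows[0]['PFAM_ACC'])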
Example #50
0
    def _superpose(self, **kwargs):
        """Superpose conformations and update coordinates."""

        ref = kwargs.pop('ref', None)

        indices = self._indices
        weights = self._weights
        mobs = self._confs
        if indices is None:
            idx = False
            tar = self._coords
            movs = None
        else:
            idx = True
            if self._weights is not None:
                weights = weights[indices]
            tar = self._coords[indices]
            movs = self._confs

        linalg = importLA()
        svd = linalg.svd
        det = linalg.det

        if weights is None:
            if ref is None:
                tar_com = tar.mean(0)
            else:
                tar_com = tar[ref]

            tar_org = (tar - tar_com)
            mob_org = zeros(tar_org.shape, dtype=mobs.dtype)
            tar_org = tar_org.T
        else:
            weights_sum = weights.sum()
            weights_dot = dot(weights.T, weights)
            if ref is None:
                tar_com = (tar * weights).sum(axis=0) / weights_sum
            else:
                tar_com = (tar[ref] * weights[ref]).sum(axis=0) / sum(
                    weights[ref])
            tar_org = (tar - tar_com)
            mob_org = zeros(tar_org.shape, dtype=mobs.dtype)

        LOGGER.progress('Superposing ', len(mobs), '_prody_ensemble')
        for i, mob in enumerate(mobs):
            if idx:
                mob = mob[indices]
            if weights is None:
                mob_com = mob.mean(0)
                matrix = dot(tar_org, subtract(mob, mob_com, mob_org))
            else:
                mob_com = (mob * weights).sum(axis=0) / weights_sum
                subtract(mob, mob_com, mob_org)
                matrix = dot((tar_org * weights).T,
                             (mob_org * weights)) / weights_dot

            U, s, Vh = svd(matrix)
            Id = array([[1, 0, 0], [0, 1, 0], [0, 0, sign(det(matrix))]])
            rotation = dot(Vh.T, dot(Id, U.T))

            if movs is None:
                mobs[i] = dot(mob_org, rotation)
                add(mobs[i], tar_com, mobs[i])
            else:
                add(dot(movs[i], rotation), (tar_com - dot(mob_com, rotation)),
                    movs[i])
            LOGGER.update(i + 1, label='_prody_ensemble')
        LOGGER.finish()
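
# --- illustrative sketch (added; not part of the original) ---
# The loop above performs a (weighted) Kabsch superposition: build the 3x3
# covariance matrix, take its SVD, and flip the sign of the last axis when
# needed so the result is a proper rotation.  A minimal unweighted version
# of the same idea:
import numpy as np

def kabsch(mobile, target):
    """Rotation and translation superposing *mobile* onto *target*."""
    mob_com, tar_com = mobile.mean(0), target.mean(0)
    matrix = np.dot((target - tar_com).T, mobile - mob_com)
    U, s, Vh = np.linalg.svd(matrix)
    Id = np.diag([1.0, 1.0, np.sign(np.linalg.det(matrix))])
    rotation = np.dot(Vh.T, np.dot(Id, U.T))
    return rotation, tar_com - np.dot(mob_com, rotation)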
Example #51
0
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a 
        sequence file. Sequence queries must not contain gaps and 
        must be at least 16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    query = str(query)
    seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    result = root[0].get('id')
    return result
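
# --- usage sketch (added for illustration; not part of the original) ---
# 'prefix' is assumed to be a module-level URL for the Pfam web service;
# the function returns the 'id' attribute of the first element of the
# results XML, e.g. a UniProt entry name for a PDB chain:
uniprot_id = searchUniprotID('1mkpA')
print(uniprot_id)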
Example #52
0
def refineMSA(msa,
              index=None,
              label=None,
              rowocc=None,
              seqid=None,
              colocc=None,
              **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg index: remove columns that are gaps in the sequence with that index
    :type index: int

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB identifier
        is also acceptable
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the PDB
        structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You may
    also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    Refinements are applied in the order of the arguments.  If both *label*
    and *seqid* are specified, the sequence matching *label* will be kept in
    the refined :class:`.MSA` even if it is similar to another sequence."""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None

    if index is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('index=' + str(index))
        LOGGER.report(
            'Index refinement reduced number of columns from {0} to '
            '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

    if label is not None:
        if index is not None:
            LOGGER.info('An index was provided so the label will be ignored.')

        else:
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            try:
                upper, lower = label.upper(), label.lower()
            except AttributeError:
                raise TypeError('label must be a string')

            if msa is None:
                raise TypeError('msa must be an MSA instance, '
                                'label cannot be used')

            index = msa.getIndex(label)
            if index is None:
                index = msa.getIndex(upper)
            if index is None:
                index = msa.getIndex(lower)

            chain = None
            if index is None and (len(label) == 4 or len(label) == 5):
                from prody import parsePDB
                try:
                    structure, header = parsePDB(label[:4], header=True)
                except Exception as err:
                    raise IOError(
                        'failed to parse header for {0} ({1})'.format(
                            label[:4], str(err)))

                chid = label[4:].upper()
                for poly in header['polymers']:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if index is None:
                            index = msa.getIndex(dbref.idcode)
                            if index is not None:
                                LOGGER.info('{0} idcode {1} for {2}{3} '
                                            'is found in chain {4}.'.format(
                                                dbref.database, dbref.idcode,
                                                label[:4], poly.chid,
                                                str(msa)))
                                break
                        if index is None:
                            index = msa.getIndex(dbref.accession)
                            if index is not None:
                                LOGGER.info('{0} accession {1} for {2}{3} '
                                            'is found in chain {4}.'.format(
                                                dbref.database,
                                                dbref.accession, label[:4],
                                                poly.chid, str(msa)))
                                break
                if index is not None:
                    chain = structure[poly.chid]
                    resnums = chain.ca.getResnums()

            if index is None:
                raise ValueError('label is not in msa, or msa is not indexed')
            try:
                len(index)
            except TypeError:
                pass
            else:
                raise ValueError(
                    'label {0} maps onto multiple sequences, '
                    'so cannot be used for refinement'.format(label))

            title.append('label=' + label)
            cols = char.isalpha(arr[index]).nonzero()[0]
            arr = arr.take(cols, 1)
            LOGGER.report(
                'Label refinement reduced number of columns from {0} to '
                '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

            if chain is not None and not kwargs.get('keep', False):
                before = arr.shape[1]
                LOGGER.timeit('_refine')

                from Bio import pairwise2
                from prody.utilities import MATCH_SCORE, MISMATCH_SCORE
                from prody.utilities import GAP_PENALTY, GAP_EXT_PENALTY, ALIGNMENT_METHOD

                chseq = chain.getSequence()
                algn = pairwise2.align.localms(pystr(
                    arr[index].tostring().upper()),
                                               pystr(chseq),
                                               MATCH_SCORE,
                                               MISMATCH_SCORE,
                                               GAP_PENALTY,
                                               GAP_EXT_PENALTY,
                                               one_alignment_only=1)
                torf = []
                for s, c in zip(*algn[0][:2]):
                    if s == '-':
                        continue
                    elif c != '-':
                        torf.append(True)
                    else:
                        torf.append(False)
                torf = array(torf)
                tsum = torf.sum()
                assert tsum <= before, 'problem in mapping sequence to structure'
                if tsum < before:
                    arr = arr.take(torf.nonzero()[0], 1)
                    resnums = resnums.take(torf.nonzero()[0] -
                                           torf.nonzero()[0][0] + 1)
                    LOGGER.report(
                        'Structure refinement reduced number of '
                        'columns from {0} to {1} in %.2fs.'.format(
                            before, arr.shape[1]), '_refine')
                else:
                    LOGGER.debug(
                        'All residues in the sequence are contained in '
                        'PDB structure {0}.'.format(label))

                labels = msa._labels
                labels[index] = splitSeqLabel(labels[index])[0] + '/' + str(
                    resnums[0]) + '-' + str(resnums[-1])

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report(
            'Row occupancy refinement reduced number of rows from '
            '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report(
            'Sequence identity refinement reduced number of rows '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
            '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report(
            'Column occupancy refinement reduced number of columns '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
            '_refine')

    if not title:
        raise ValueError(
            'label, index, seqid, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            from copy import copy
            labels = copy(msa._labels)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
        return MSA(arr,
                   title=msa.getTitle() +
                   ' refined ({0})'.format(', '.join(title)),
                   labels=labels)
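
# --- usage sketch (added for illustration; not part of the original) ---
# Refinements run in argument order: index/label, then rowocc, then seqid,
# then colocc.  A typical Pfam workflow keyed on a PDB chain:
msa = parseMSA('PF00069_full.sth')
refined = refineMSA(msa, label='1p38A', rowocc=0.8, seqid=0.98)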
Example #53
0
def rhapsody(query, query_type='SAVs',
             main_classifier=None, aux_classifier=None,
             custom_PDB=None, force_env=None,
             refresh=False, log=True, **kwargs):
    """Obtain Rhapsody pathogenicity predictions on a list of human missense
    variants ([ref]_)

    :arg query: Single Amino Acid Variants (SAVs) in Uniprot coordinates

      - if *query_type* = ``'SAVs'`` (default), it should be a filename, a
        string or a list/tuple of strings, containing Uniprot SAV coordinates,
        with the format ``'P17516 135 G E'``. The string could also be just
        a single Uniprot sequence identifier (e.g. ``'P17516'``), or the
        coordinate of a specific site in a sequence (e.g. ``'P17516 135'``), in
        which case all possible 19 amino acid substitutions at the specified
        positions will be analyzed.
      - if *query_type* = ``'PolyPhen2'``, it should be a filename containing
        the output from PolyPhen-2, usually named :file:`pph2-full.txt`
    :type query: str, list

    :arg query_type: ``'SAVs'`` or ``'PolyPhen2'``
    :type query_type: str

    :arg main_classifier: main classifier's filename. If **None**, the default
      *full* Rhapsody classifier will be used
    :type main_classifier: str

    :arg aux_classifier: auxiliary classifier's filename. If both
      *main_classifier* and *aux_classifier* are **None**, the default
      *reduced* Rhapsody classifier will be used
    :type aux_classifier: str

    :arg custom_PDB: a PDBID, a filename or an :class:`Atomic` to be used
      for computing structural and dynamical features, instead of the PDB
      structure automatically selected by the program
    :type custom_PDB: str, :class:`AtomGroup`

    :arg force_env: force a specific environment model for GNM/ANM
      calculations, among ``'chain'``, ``'reduced'`` and ``'sliced'``.
      If **None** (default), the model of individual dynamical features will
      match that found in the classifier's feature set
    :type force_env: str

    :arg refresh: if **True**, precomputed features and PDB mappings found in
      the working directory will be ignored and computed again
    :type refresh: bool

    :arg log: if **True**, log messages will be saved in
      :file:`rhapsody-log.txt`
    :type log: bool

    .. [ref] Ponzoni L, Bahar I. Structural dynamics is a determinant of
      the functional significance of missense variants. *PNAS* **2018**
      115 (16) 4164-4169.
    """

    assert query_type in ['SAVs', 'PolyPhen2'], 'Invalid query type.'

    if log:
        LOGGER.start('rhapsody-log.txt')

    # select classifiers
    if main_classifier is None:
        main_classifier = getDefaultClassifiers()['full']
        if aux_classifier is None:
            aux_classifier = getDefaultClassifiers()['reduced']

    # initialize object that will contain all results and predictions
    r = Rhapsody(**kwargs)

    # import classifiers and feature set from pickle
    r.importClassifiers(main_classifier, aux_classifier, force_env=force_env)

    # import custom PDB structure
    if custom_PDB is not None:
        r.setCustomPDB(custom_PDB)

    # obtain or import PolyPhen-2 results
    if query_type == 'SAVs':
        r.queryPolyPhen2(query)
    elif query_type == 'PolyPhen2':
        r.importPolyPhen2output(query)

    # compute predictions
    r.getPredictions(refresh=refresh)

    # print predictions to file
    r.printPredictions()
    if aux_classifier is not None:
        # print both 'full' and 'reduced' predictions in a more detailed format
        r.printPredictions(
            classifier="both", PolyPhen2=False, EVmutation=False,
            filename='rhapsody-predictions-full_vs_reduced.txt')

    # save pickle
    r.savePickle()

    if log:
        LOGGER.close('rhapsody-log.txt')

    return r
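
# --- usage sketch (added for illustration; not part of the original) ---
# Queries can be explicit SAV coordinates; a bare Uniprot accession would
# instead scan all 19 substitutions at every position:
rh = rhapsody(['P17516 135 G E', 'P17516 136 A V'])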
Example #54
0
    def _parseHeader(self):
        """Read the header information from a dcd file.
        Input: fd - a file struct opened for binary reading.
        Output: 0 on success, negative error code on failure.
        Side effects: *natoms set to number of atoms per frame
                      *nsets set to number of frames in dcd file
                      *istart set to starting timestep of dcd file
                      *nsavc set to timesteps between dcd saves
                      *delta set to value of trajectory timestep
                      *nfixed set to number of fixed atoms
                      *freeind may be set to heap-allocated space
                      *reverse set to one if reverse-endian, zero if not.
                      *charmm set to internal code for handling charmm data.
        """

        dcd = self._file
        endian = b''  #'=' # native endian
        rec_scale = RECSCALE32BIT
        charmm = None
        dcdcordmagic = unpack(endian + b'i', b'CORD')[0]
        # Check magic number in file header and determine byte order
        bits = dcd.read(calcsize('ii'))

        temp = unpack(endian + b'ii', bits)

        if temp[0] + temp[1] == 84:
            LOGGER.info('Detected CHARMM -i8 64-bit DCD file of native '
                        'endianness.')
            rec_scale = RECSCALE64BIT
        elif temp[0] == 84 and temp[1] == dcdcordmagic:
            pass
            #LOGGER.info('Detected standard 32-bit DCD file of native '
            #            'endianness.')
        else:
            if unpack(b'>ii', bits) == temp:
                endian = b'>'
            else:
                endian = b'<'
            temp = unpack(endian + b'ii', bits)
            if temp[0] + temp[1] == 84:
                rec_scale = RECSCALE64BIT
                LOGGER.info('Detected CHARMM -i8 64-bit DCD file of opposite '
                            'endianness.')
            else:
                endian = b''
                temp = unpack(endian + b'ii', bits)
                if temp[0] == 84 and temp[1] == dcdcordmagic:
                    LOGGER.info('Detected standard 32-bit DCD file of '
                                'opposite endianness.')
                else:
                    raise IOError('Unrecognized DCD header or unsupported '
                                  'DCD format.')

        # check for magic string, in case of long record markers
        if rec_scale == RECSCALE64BIT:
            raise IOError('CHARMM 64-bit DCD files are not yet supported.')
            temp = unpack(b'I', dcd.read(calcsize('I')))
            if temp[0] != dcdcordmagic:
                raise IOError('Failed to find CORD magic in CHARMM -i8 64-bit '
                              'DCD file.')

        # Buffer the entire header for random access
        bits = dcd.read(80)

        # CHARMm-generated DCD files set the last integer in the
        # header, which is unused by X-PLOR, to its version number.
        # Checking if this is nonzero tells us this is a CHARMm file
        # and to look for other CHARMm flags.
        temp = unpack(endian + b'i' * 20, bits)

        if temp[-1] != 0:
            charmm = True

        if charmm:
            #LOGGER.info('CHARMM format DCD file (also NAMD 2.1 and later).')
            temp = unpack(endian + b'i' * 9 + b'f' + b'i' * 10, bits)
        else:
            LOGGER.info('X-PLOR format DCD file (also NAMD 2.0 and earlier) '
                        'is not supported.')
            return None

        # Store the number of sets of coordinates (NSET)
        self._n_csets = temp[0]
        # Store ISTART, the starting timestep
        self._first_ts = temp[1]
        # Store NSAVC, the number of timesteps between dcd saves
        self._framefreq = temp[2]
        # Store NAMNF, the number of fixed atoms
        self._n_fixed = temp[8]

        if self._n_fixed > 0:
            raise IOError('DCD files with fixed atoms are not yet supported.')

        # Read in the timestep, DELTA
        # Note: DELTA is stored as double with X-PLOR but as float with CHARMm
        self._timestep = temp[9]
        self._unitcell = temp[10] == 1

        # Get the end size of the first block
        if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 84:
            raise IOError('Unrecognized DCD format.')

        # Read in the size of the next block
        temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))

        if (temp[0] - 4) % 80 != 0:
            raise IOError('Unrecognized DCD format.')
        noremarks = temp[0] == 84

        # Read NTITLE, the number of 80 character title strings there are
        temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))

        self._dcdtitle = dcd.read(80)

        if not noremarks:
            self._remarks = dcd.read(80)

        # Get the ending size for this block
        temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))

        if (temp[0] - 4) % 80 != 0:
            raise IOError('Unrecognized DCD format.')

        # Read in an integer '4'
        if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 4:
            raise IOError('Unrecognized DCD format.')

        # Read in the number of atoms
        self._n_atoms = unpack(endian + b'i',
                               dcd.read(rec_scale * calcsize('i')))[0]
        # Read in an integer '4'
        if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 4:
            raise IOError('Bad DCD format.')

        self._is64bit = rec_scale == RECSCALE64BIT
        self._endian = endian
        self._n_floats = (self._n_atoms + 2) * 3

        if self._is64bit:
            if self._unitcell:
                self._bytes_per_frame = 56 + self._n_floats * 8
            else:
                self._bytes_per_frame = self._n_floats * 8
            LOGGER.warning('Reading of 64 bit DCD files has not been tested. '
                           'Please report any problems that you may find.')
            self._dtype = np.float64
            self._itemsize = 8
        else:
            if self._unitcell:
                self._bytes_per_frame = 56 + self._n_floats * 4
            else:
                self._bytes_per_frame = self._n_floats * 4
            self._dtype = np.float32
            self._itemsize = 4

        self._first_byte = self._file.tell()
        n_csets = (getsize(self._filename) -
                   self._first_byte) // self._bytes_per_frame
        if n_csets != self._n_csets:
            LOGGER.warning('DCD header claims {0} frames, file size '
                           'indicates there are actually {1} frames.'.format(
                               self._n_csets, n_csets))
            self._n_csets = n_csets

        self._coords = self.nextCoordset()
        self._file.seek(self._first_byte)
        self._nfi = 0
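
# --- illustrative sketch (added; not part of the original) ---
# The header opens with a Fortran record marker (84) followed by the magic
# string 'CORD'; trying both byte orders on those first 8 bytes is how the
# endianness test above works:
from struct import unpack

def detect_dcd_endianness(first8):
    for endian in (b'', b'>', b'<'):
        marker, magic = unpack(endian + b'ii', first8)
        if marker == 84 and magic == unpack(endian + b'i', b'CORD')[0]:
            return endian
    raise IOError('not a 32-bit DCD header')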
Example #55
0
def mapSAVs2PDB(SAV_coords,
                custom_PDB=None,
                refresh=False,
                status_file=None,
                status_prefix=None):
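    """Map SAVs (single amino-acid variants), given as strings of the form
    '<Uniprot acc.> <position> <wt aa> <mutant aa>', onto PDB structures.
    Returns a structured NumPy array with one row per SAV holding the
    original SAV coordinates, the unique Uniprot coordinates, the mapped
    PDB coordinates, and the size of the matched PDB structure."""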
    LOGGER.info('Mapping SAVs to PDB structures...')
    LOGGER.timeit('_map2PDB')
    # sort SAVs so as to group together those
    # with the same accession number
    accs = [s.split()[0] for s in SAV_coords]
    sorting_map = np.argsort(accs)
    # define a structured array
    PDBmap_dtype = np.dtype([('orig. SAV coords', 'U25'),
                             ('unique SAV coords', 'U25'),
                             ('PDB SAV coords', 'U100'), ('PDB size', 'i')])
    nSAVs = len(SAV_coords)
    mapped_SAVs = np.zeros(nSAVs, dtype=PDBmap_dtype)
    # define how to report progress
    if status_prefix is None:
        status_prefix = ''
    bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
    if status_file is not None:
        status_file = open(status_file, 'w')
        progress_bar = tqdm([(i, SAV_coords[i]) for i in sorting_map],
                            file=status_file,
                            bar_format=bar_format + '\n')
    else:
        progress_bar = tqdm([(i, SAV_coords[i]) for i in sorting_map],
                            bar_format=bar_format)
    # map to PDB using Uniprot class
    cache = {'acc': None, 'obj': None}
    count = 0
    for indx, SAV in progress_bar:
        count += 1
        acc, pos, aa1, aa2 = SAV.split()
        pos = int(pos)
        # report progress
        progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to PDB"
        # LOGGER.info(f"[{count}/{nSAVs}] {progress_msg}...")
        progress_bar.set_description(progress_msg)
        # map Uniprot to PDB chains
        if acc == cache['acc']:
            # use mapping from previous iteration
            U2P_map = cache['obj']
        else:
            # save previous mapping
            if isinstance(cache['obj'], UniprotMapping):
                cache['obj'].savePickle()
            cache['acc'] = acc
            # compute the new mapping
            try:
                U2P_map = UniprotMapping(acc, recover_pickle=not refresh)
                if custom_PDB is not None:
                    LOGGER.info('Aligning Uniprot sequence to custom PDB...')
                    U2P_map.alignCustomPDB(custom_PDB, 'all')
            except Exception as e:
                U2P_map = str(e)
            cache['obj'] = U2P_map
        # map specific SAV
        try:
            if isinstance(U2P_map, str):
                raise RuntimeError(U2P_map)
            # check wt aa
            if not 0 < pos <= len(U2P_map.sequence):
                raise ValueError('Index out of range')
            wt_aa = U2P_map.sequence[pos - 1]
            if aa1 != wt_aa:
                raise ValueError(f'Incorrect wt aa: {aa1} instead of {wt_aa}')
            # map to PDB. Format: [('2DZF', 'A', 150, 'N', 335)]
            if custom_PDB is None:
                r = U2P_map.mapSingleResidue(pos, check_aa=True)
            else:
                r = U2P_map.mapSingleRes2CustomPDBs(pos, check_aa=True)
            if len(r) == 0:
                raise RuntimeError('Unable to map SAV to PDB')
            else:
                PDBID, chID, resid, aa, PDB_size = r[0]
                # NB: check for blank "chain" field
                if chID.strip() == '':
                    chID = '?'
                res_map = f'{PDBID} {chID} {resid} {aa}'
        except Exception as e:
            res_map = str(e)
            PDB_size = 0
        # store SAVs mapped on PDB chains and unique Uniprot coordinates
        if isinstance(U2P_map, str):
            uniq_coords = U2P_map
        else:
            uniq_coords = f'{U2P_map.uniq_acc} {pos} {aa1} {aa2}'
        mapped_SAVs[indx] = (SAV, uniq_coords, res_map, PDB_size)
    # save last pickle
    if isinstance(cache['obj'], UniprotMapping):
        cache['obj'].savePickle()
    n = sum(mapped_SAVs['PDB size'] != 0)
    LOGGER.report(f'{n} out of {nSAVs} SAVs have been mapped to PDB in %.1fs.',
                  '_map2PDB')
    if status_file:
        # close the handle before removal; deleting an open file fails on
        # some platforms
        status_file.close()
        os.remove(status_file.name)
    return mapped_SAVs
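
A hypothetical invocation, assuming the surrounding environment that defines mapSAVs2PDB and UniprotMapping is importable; the accession and variants below are illustrative only.

sav_list = ['P01112 12 G V', 'P01112 61 Q L']   # illustrative variants
mapped = mapSAVs2PDB(sav_list, status_prefix='demo: ')
for row in mapped:
    print(row['orig. SAV coords'], '->', row['PDB SAV coords'])
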
Beispiel #56
0
def buildBiomolecules(header, atoms, biomol=None):
    """Returns *atoms* after applying biomolecular transformations from *header*
    dictionary.  Biomolecular transformations are applied to all coordinate
    sets in the molecule.

    Some PDB files contain transformations for more than one biomolecule.  A
    specific set of transformations can be chosen using the *biomol* argument.
    Transformation sets are identified by numbers, e.g. ``"1"``, ``"2"``, ...

    If multiple biomolecular transformations are provided in the *header*
    dictionary, biomolecules will be returned as
    :class:`.AtomGroup` instances in a :func:`list`.

    Each transformed copy is assigned its own segment name ('A', 'B', ...,
    cycling through the alphabet), and all copies belonging to a biomolecule
    are merged into a single :class:`.AtomGroup` instance.

    Note that atoms in biomolecules are ordered according to chain identifiers.
    """

    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')

    if not isinstance(atoms, Atomic):
        raise TypeError('atoms must be an Atomic instance')

    biomt = header.get('biomoltrans')
    if not isinstance(biomt, dict) or len(biomt) == 0:
        raise ValueError("header doesn't contain biomolecular transformations")

    if not isinstance(atoms, AtomGroup):
        atoms = atoms.copy()

    biomols = []
    if biomol is None:
        keys = list(biomt)
    else:
        biomol = str(biomol)
        if biomol in biomt:
            keys = [biomol]
        else:
            LOGGER.warn('Transformations for biomolecule {0} were not '
                        'found in the header dictionary.'.format(biomol))
            return None

    keys.sort()
    for i in keys:
        segnm = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ' * 20)
        ags = []
        mt = biomt[i]
        # mt is a list of groups of four items: a list of chain identifiers
        # followed by the three rows of the corresponding 3x4 transformation
        # matrix, so len(mt) must be a multiple of 4
        if len(mt) % 4 != 0:
            LOGGER.warn('Biomolecular transformations {0} were not '
                        'applied'.format(i))
            continue

        for times in range(len(mt) // 4):
            rotation = np.zeros((3, 3))
            translation = np.zeros(3)
            line0 = np.fromstring(mt[times * 4 + 1], sep=' ')
            rotation[0, :] = line0[:3]
            translation[0] = line0[3]
            line1 = np.fromstring(mt[times * 4 + 2], sep=' ')
            rotation[1, :] = line1[:3]
            translation[1] = line1[3]
            line2 = np.fromstring(mt[times * 4 + 3], sep=' ')
            rotation[2, :] = line2[:3]
            translation[2] = line2[3]
            t = Transformation(rotation, translation)

            # select the chains this transformation applies to; select()
            # returns None when nothing matches, so check before copying
            sel = atoms.select('chain ' + ' '.join(mt[times * 4]))
            if sel is None:
                continue
            newag = sel.copy()
            newag.all.setSegnames(segnm.pop(0))
            for acsi in range(newag.numCoordsets()):
                newag.setACSIndex(acsi)
                newag = t.apply(newag)
            newag.setACSIndex(0)
            ags.append(newag)

        if ags:
            newag = ags.pop(0)
            while ags:
                newag += ags.pop(0)
            newag.setTitle('{0} biomolecule {1}'.format(atoms.getTitle(), i))
            biomols.append(newag)

    if biomols:
        if len(biomols) == 1:
            return biomols[0]
        else:
            return biomols
    else:
        return None
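
A hypothetical usage sketch: prody.parsePDB called with header=True returns the (atoms, header) pair this function expects. The PDB identifier is illustrative.

from prody import parsePDB

atoms, header = parsePDB('2bfu', header=True)  # illustrative identifier
biomol = buildBiomolecules(header, atoms)
# a single AtomGroup is returned for one biomolecule, a list for several
for bm in (biomol if isinstance(biomol, list) else [biomol]):
    print(bm.getTitle(), bm.numAtoms())
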
Beispiel #57
0
def matchAlign(mobile, target, **kwargs):
    """Superpose *mobile* onto *target* based on best matching pair of chains.
    This function uses :func:`matchChains` for matching chains and returns a
    tuple that contains the following items:

      * *mobile* after it is superposed,
      * matching chain from *mobile* as a :class:`.AtomMap` instance,
      * matching chain from *target* as a :class:`.AtomMap` instance,
      * percent sequence identity of the match,
      * percent sequence overlap of the match.

    :arg mobile: atoms that contain a protein chain
    :type mobile: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg target: atoms that contain a protein chain
    :type target: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg tarsel: *target* atoms that will be used for alignment,
        default is ``'calpha'``
    :type tarsel: str

    :arg allcsets: align all coordinate sets of *mobile*, default is **True**
    :type allcsets: bool

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool"""

    selstr = kwargs.pop('tarsel', 'calpha')
    if selstr == 'calpha':
        selstr = None
    subset = 'calpha'
    if selstr:
        if selstr in _SUBSETS:
            subset = selstr
        else:
            subset = 'all'
        sel = target.select(selstr)
        if sel is None:
            raise ValueError('selection {0} did not match any atoms'.format(
                repr(selstr)))
        chid = set(sel.getChids())
        if len(chid) == 1:
            chid = chid.pop()
            target = target.select('chain ' + chid)

    match = matchChains(mobile, target, subset=subset, **kwargs)
    if not match:
        return
    match = match[0]
    mob = match[0]
    tar = match[1]
    if selstr:
        which = SELECT.getIndices(tar, selstr)
        n_atoms = len(which)
    else:
        which = slice(None)
        n_atoms = len(tar)
        selstr = 'calpha'

    if kwargs.get('allcsets', True):
        csets = range(mobile.numCoordsets())  # PY3K: OK
    else:
        csets = [mobile.getACSIndex()]

    LOGGER.info('Alignment is based on {0} atoms matching {1}.'.format(
        n_atoms, repr(selstr)))
    printRMSD(tar._getCoords()[which],
              mob._getCoordsets()[:, which],
              msg='Before alignment ')
    for acsi in csets:
        mob.setACSIndex(acsi)
        mobile.setACSIndex(acsi)
        calcTransformation(mob._getCoords()[which],
                           tar._getCoords()[which]).apply(mobile)
    printRMSD(tar._getCoords()[which],
              mob._getCoordsets()[:, which],
              msg='After alignment  ')
    return (mobile, ) + match
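
A hypothetical usage sketch, assuming two structures of related proteins; the identifiers and thresholds are illustrative.

from prody import parsePDB

mobile = parsePDB('1p38')   # illustrative identifiers
target = parsePDB('1zz2')
result = matchAlign(mobile, target, seqid=75, overlap=80)
if result is not None:
    aligned, mob_chain, tar_chain, seq_id, seq_overlap = result
    print('matched with {0:.0f}% identity'.format(seq_id))
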
Beispiel #58
0
def writeDeformProfile(model, pdb, filename='dp_out', selstr='protein and name CA',\
                                            pdb_selstr='protein', loadToVMD=True):
    """Calculate deformability (plasticity) profile of molecule based on mechanical
    stiffness matrix (see [EB08]_).

    :arg model: this is an 3-dimensional NMA instance from a :class:`.ANM
        calculations
    :type model: :class:`.ANM`
    :arg pdb: a coordinate set or an object with ``getCoords`` method
    :type pdb: :class:`numpy.ndarray`    
    
    Note: selection can be done usig ``selstr`` and ``pdb_selstr``. ``selstr`` define
    ``model`` selection (used for building :class:`.ANM` model) and ``pdb_selstr`` 
    will be used in VMD program for visualization. 
    
    By default files are saved as *filename* and loaded to VMD program. To change 
    it use ``loadToVMD=False``.
     
    Mean value of mechanical stiffness for molecule can be found in occupancy column
    in PDB file.
    """

    pdb = pdb.select(pdb_selstr)
    coords = pdb.select(selstr)
    meanStiff = np.mean(model.getStiffness(), axis=0)

    out_mean = open(filename + '_mean.txt',
                    'w')  # mean value of Kij for each residue
    for nr_i, value in enumerate(meanStiff):
        out_mean.write("{} {}\n".format(nr_i, value))
    out_mean.close()

    from collections import Counter
    aa_counter = Counter(pdb.getResindices())

    # dict views are not indexable in Python 3, hence the list() conversion
    residue_counts = list(aa_counter.values())
    meanStiff_all = []
    for i in range(coords.numAtoms()):
        meanStiff_all.extend(residue_counts[i] * [round(meanStiff[i], 2)])

    kw = {'occupancy': meanStiff_all}
    writePDB(filename, pdb, **kw)
    LOGGER.info('PDB file with deformability profile has been saved.')
    LOGGER.info('Creating TCL file.')
    out_tcl = open(filename + '.tcl', 'w')
    out_tcl.write('display resetview \nmol addrep 0 \ndisplay resetview \n')
    out_tcl.write('mol new {./' + filename +
                  '.pdb} type {pdb} first 0 last -1 step 1 waitfor 1 \n')
    out_tcl.write('animate style Loop \ndisplay projection Orthographic \n')
    out_tcl.write(
        'display depthcue off \ndisplay rendermode GLSL \naxes location Off \n'
    )
    out_tcl.write('color Display Background white \n')
    out_tcl.write(
        'mol modstyle 0 0 NewCartoon 0.300000 10.000000 4.100000 0 \n')
    out_tcl.write(
        'mol modmaterial 0 0 Diffuse \nmol modcolor 0 0 Occupancy \n')
    out_tcl.write('menu colorscalebar on \n')
    out_tcl.close()

    if loadToVMD:
        from prody import pathVMD
        LOGGER.info('File will be loaded to VMD program.')
        os.system(pathVMD() + " -e " + str(filename) + ".tcl")
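
A hypothetical usage sketch. The function calls model.getStiffness(), so the stiffness matrix must be computed beforehand; calcMechStiff is assumed here as the stiffness builder, and the exact stiffness API has varied between ProDy releases.

from prody import ANM, parsePDB, calcMechStiff  # calcMechStiff: assumed API

pdb = parsePDB('1ubi')                       # illustrative identifier
calphas = pdb.select('protein and name CA')
anm = ANM('1ubi')
anm.buildHessian(calphas)
anm.calcModes(n_modes=None)                  # None computes all modes
# assumed to attach the stiffness matrix to the model; check your version
calcMechStiff(anm, calphas)
writeDeformProfile(anm, pdb, filename='1ubi_dp', loadToVMD=False)
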
Beispiel #59
0
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`."""

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)
    LOGGER.progress('Downloading sequence clusters', len(PDB_CLUSTERS),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            out.write(inp.read())
            inp.close()
            out.close()
            count += 1
        LOGGER.update(i, '_prody_fetchPDBClusters')
    LOGGER.clear()
    if len(PDB_CLUSTERS) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
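
A hypothetical usage sketch chaining the companion functions named in the docstring; argument names are assumed from their cross-references, and the PDB identifier and chain are illustrative.

fetchPDBClusters(sqid=90)      # download only the 90% identity clusters
loadPDBClusters(sqid=90)
# PDB chains clustered together with chain A of 1ubi at 90% identity
print(listPDBCluster('1ubi', 'A', sqid=90))
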
Beispiel #60
0
def matchTNMAChains(atoms1, atoms2, prepareForHungarian=True, **kwargs):
    """Modified matchChains that only uses the Biopython chain matching.
    Return pairs of chains matched based on Biopython similarity.  Makes an
    all-to-all comparison of chains in *atoms1* and *atoms2*.  Chains are
    obtained from hierarchical views (:class:`.HierView`) of atom groups.
    This function returns a list of matching chains in a tuples that contain
    4 items:

      * matching chain from *atoms1* as a :class:`.AtomMap`
        instance,
      * matching chain from *atoms2* as a :class:`.AtomMap`
        instance,
      * percent sequence identity of the match,
      * percent sequence overlap of the match.

    The list of matches is sorted in decreasing order of percent sequence
    identity.
    :class:`.AtomMap` instances can be used to calculate RMSD values and
    superpose atom groups.

    :arg atoms1: atoms that contain a chain
    :type atoms1: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg atoms2: atoms that contain a chain
    :type atoms2: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    If *subset* is set to *calpha* or *backbone*, only alpha carbon
    atoms or backbone atoms will be paired. If set to *all*, all atoms
    common to matched residues will be returned.

    This function tries to match chains based on residue numbers and names.
    All chains in *atoms1* are compared to all chains in *atoms2*.  This works
    well for different structures of the same protein.  When it fails,
    :mod:`Bio.pairwise2` is used for pairwise sequence alignment, and matching
    is performed based on the sequence alignment.  The user can control
    whether sequence alignment is performed with the *pwalign* keyword.  If
    ``pwalign=True`` is passed, pairwise alignment is enforced."""

    if not isinstance(atoms1, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms1 must be an AtomGroup, Chain, or Selection')
    if not isinstance(atoms2, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms2 must be an AtomGroup, Chain, or Selection')

    subset = kwargs.get('subset', 'calpha')
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'.format(
            str(subset)))
    seqid = kwargs.get('seqid', 90.)
    assert isinstance(seqid, (float, int)), 'seqid must be float'
    assert 0 < seqid <= 100, 'seqid must be in the range from 0 to 100'
    coverage = kwargs.get('overlap')
    if coverage is None:
        coverage = kwargs.get('coverage', 90.)
    assert isinstance(coverage, (float, int)), 'overlap must be float'
    assert 0 < coverage <= 100, 'overlap must be in the range from 0 to 100'
    pwalign = kwargs.get('pwalign', None)

    if isinstance(atoms1, Chain):
        chains1 = [atoms1]
        atoms1 = atoms1.getAtomGroup()
    else:
        chains1 = list(atoms1.getHierView().iterChains())
        if not isinstance(atoms1, AtomGroup):
            atoms1 = atoms1.getAtomGroup()
    chains = list()
    for ch in chains1:
        simpch = SimpleChain(ch)
        if len(simpch) > 0:
            chains.append(simpch)
    chains1 = chains
    if not isinstance(atoms1, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'.format(
            str(atoms1), len(chains1)))

    if isinstance(atoms2, Chain):
        chains2 = [atoms2]
        atoms2 = atoms2.getAtomGroup()
    else:
        chains2 = list(atoms2.getHierView().iterChains())
        if not isinstance(atoms2, AtomGroup):
            atoms2 = atoms2.getAtomGroup()
    chains = list()
    for ch in chains2:
        simpch = SimpleChain(ch)
        if len(simpch) > 0:
            chains.append(simpch)
    chains2 = chains
    if not isinstance(atoms2, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'.format(
            str(atoms2), len(chains2)))

    matches = []
    unmatched = []
    LOGGER.debug('Trying to match chains based on residue numbers and names:')
    for simpch1 in chains1:
        for simpch2 in chains2:
            LOGGER.debug('  Comparing {0} (len={1}) and {2} (len={3}):'.format(
                simpch1.getTitle(), len(simpch1), simpch2.getTitle(),
                len(simpch2)))

            match1, match2, nmatches = getTrivialMatch(simpch1, simpch2)
            _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
            _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))

            if _seqid >= seqid and _cover >= coverage:
                LOGGER.debug(
                    '\tIgnoring Match: {0} residues match with {1:.0f}% '
                    'sequence identity and {2:.0f}% overlap.'.format(
                        len(match1), _seqid, _cover))
                unmatched.append((simpch1, simpch2))
            else:
                LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                             'overlap={1:.0f}%).'.format(_seqid, _cover))
                unmatched.append((simpch1, simpch2))

    if pwalign or (not matches and (pwalign is None or pwalign)):
        pairwise2 = importBioPairwise2()
        if pairwise2:
            LOGGER.debug('Trying to match chains based on {0} sequence '
                         'alignment:'.format(ALIGNMENT_METHOD))
            for simpch1, simpch2 in unmatched:
                LOGGER.debug('  Comparing {0} (len={1}) and {2} '
                             '(len={3}):'.format(simpch1.getTitle(),
                                                 len(simpch1),
                                                 simpch2.getTitle(),
                                                 len(simpch2)))
                match1, match2, nmatches = getAlignedMatch(simpch1, simpch2)
                _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
                _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug(
                        '\tMatch: {0} residues match with {1:.0f}% '
                        'sequence identity and {2:.0f}% overlap.'.format(
                            len(match1), _seqid, _cover))
                    matches.append(
                        (match1, match2, _seqid, _cover, simpch1, simpch2))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'.format(_seqid, _cover))
        else:
            LOGGER.warning('Pairwise alignment could not be performed.')
    if not matches:
        return None
    subset = _SUBSETS[subset]
    for mi, result in enumerate(matches):
        match1, match2, _seqid, _cover, simpch1, simpch2 = result

        indices1 = []
        indices2 = []

        for i in range(len(match1)):
            ares = match1[i]
            bres = match2[i]

            if subset == 'ca':
                try:
                    aid = ares.getNames().tolist().index('CA')
                except ValueError:
                    aid = None
                try:
                    bid = bres.getNames().tolist().index('CA')
                    if aid is not None:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
                except ValueError:
                    pass
            elif subset == 'bb':
                for bban in ('N', 'CA', 'C', 'O'):
                    try:
                        aid = ares.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    else:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
            elif subset == 'noh':
                for han, aid, noh in zip(ares.getNames(), ares._indices,
                                         ares.getFlags('noh')):
                    if not noh:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(han)
                    except ValueError:
                        continue
                    else:
                        indices1.append(aid)
                        indices2.append(bres._indices[bid])
            elif subset is None or subset == 'all':
                aans = ares.getNames()
                bans = bres.getNames().tolist()

                aids = ares.getIndices()
                #bids = bres.getIndices()

                for j in range(len(aans)):
                    try:
                        bid = bres._indices[bans.index(aans[j])]
                        indices1.append(aids[j])
                        indices2.append(bid)
                    except ValueError:
                        pass

        indices1 = np.array(indices1, int)
        indices2 = np.array(indices2, int)

        match1 = AM(atoms1,
                    indices1,
                    atoms1.getACSIndex(),
                    title=simpch1.getTitle() + ' -> ' + simpch2.getTitle(),
                    intarrays=True)
        match2 = AM(atoms2,
                    indices2,
                    atoms2.getACSIndex(),
                    title=simpch2.getTitle() + ' -> ' + simpch1.getTitle(),
                    intarrays=True)

        matches[mi] = (match1, match2, _seqid, _cover)
    if len(matches) > 1 and not prepareForHungarian:
        # sort by percent sequence identity; key-based sort replaces the
        # Python 2 cmp-based idiom, which does not exist in Python 3
        matches.sort(key=lambda m: m[2], reverse=True)
    return matches
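
A hypothetical usage sketch: with prepareForHungarian left at its default of True, the returned list keeps its original order so a downstream assignment step can consume it. Identifiers and thresholds are illustrative.

from prody import parsePDB

atoms1 = parsePDB('1p38')   # illustrative identifiers
atoms2 = parsePDB('1zz2')
pairs = matchTNMAChains(atoms1, atoms2, seqid=50, overlap=50)
if pairs:
    for mob, tar, seq_id, cover in pairs:
        print(mob.getTitle(), tar.getTitle(), seq_id, cover)
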