Example #1
def calcANM(pdb,
            selstr='calpha',
            cutoff=15.,
            gamma=1.,
            n_modes=20,
            zeros=False):
    """Returns an :class:`ANM` instance and atoms used for the calculations.
    By default only alpha carbons are considered, but selection string helps
    selecting a subset of it.  *pdb* can be :class:`.Atomic` instance."""

    if isinstance(pdb, str):
        ag = parsePDB(pdb)
        title = ag.getTitle()
    elif isinstance(pdb, Atomic):
        ag = pdb
        if isinstance(pdb, AtomGroup):
            title = ag.getTitle()
        else:
            title = ag.getAtomGroup().getTitle()
    else:
        raise TypeError('pdb must be an atomic class, not {0}'.format(
            type(pdb)))
    anm = ANM(title)
    sel = ag.select(selstr)
    anm.buildHessian(sel, cutoff, gamma)
    anm.calcModes(n_modes, zeros)
    return anm, sel
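A minimal usage sketch of the function above (the PDB identifier '1ubi' and the mode count are illustrative additions, not from the original source):

anm, sel = calcANM('1ubi', selstr='calpha', n_modes=10)
print(anm.getEigvals()[:5])   # eigenvalues of the slowest nonzero modes
print(sel.numAtoms())         # number of alpha carbons used in the model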
Example #2
    def parsePDBs(self, **kwargs):
        """Load PDB into memory as :class:`.AtomGroup` instances using :func:`.parsePDB` and 
        perform selection based on residue ranges given by CATH."""
        
        pdbs = self.getPDBs(True)
        selstrs = self.getSelStrs()
        header = kwargs.get('header', False)
        model = kwargs.get('model', None)

        LOGGER.timeit('_cath_parsePDB')
        LOGGER.info('Parsing {0} PDB files...'.format(len(pdbs)))
        ret = parsePDB(*pdbs, **kwargs)

        if model != 0:
            if header:
                prots, _ = ret
            else:
                prots = ret

            LOGGER.info('Extracting domains...')
            for i in range(len(prots)):
                sel = prots[i].select(selstrs[i])
                prots[i] = sel
        LOGGER.report('CATH domains are parsed and extracted in %.2fs', '_cath_parsePDB')

        return ret
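The header/model handling above depends on :func:`.parsePDB` changing its return shape with those keywords; a short sketch of that convention as these examples use it (identifier illustrative):

from prody import parsePDB

ag = parsePDB('1ubi')                          # AtomGroup only
ag, hdr = parsePDB('1ubi', header=True)        # (AtomGroup, header dict) tuple
hdr = parsePDB('1ubi', header=True, model=0)   # model=0: header only, no coordinates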
Example #4
File: exanm.py Project: sixpi/ProDy
def test2(pdb='2nwl-mem.pdb'):
    from prody import parsePDB
    structure = parsePDB(pdb, subset='ca')
    exanm = exANM('2nwl')
    exanm.buildHessian(structure)
    exanm.calcModes()
    return exanm
Example #5
def test(pdb='2ci2'):

    from prody import parsePDB

    pdb = parsePDB(pdb, subset='ca')
    bbenm = bbENM()
    bbenm.buildHessian(pdb, cutoff=12.)
    return bbenm
Example #7
def test(pdb='2ci2'):

    from prody import parsePDB

    pdb = parsePDB(pdb, subset='ca')
    bbenm = bbENM('2ci2')
    bbenm.buildHessian(pdb, cutoff=7.)
    bbenm.calcModes(n_modes=None)
    return bbenm
Example #8
def test():
    from prody import parsePDB, GNM
    from prody.dynamics.analysis import calcOverallNetEntropyTransfer
    import matplotlib.pyplot as plt

    pdb = parsePDB('1z83', subset='ca', chain='A')
    gnm = GNM()
    gnm.buildKirchhoff(pdb, cutoff=7.0)
    gnm.calcModes(n_modes=None)
    entTransfer = calcOverallNetEntropyTransfer(gnm, turbo=True)

    # f = open('/data/Manuscript_data/Data/1Z83A/monomer_overallnet_A_cihan2.txt','w')
    # for i in range(gnm.numAtoms()):
    #     for j in range(gnm.numAtoms()):
    #         if i != j:
    #             f.write('%d\t%d\t%f\n' % (i+1,j+1,entTransfer[i,j]))
    # f.close()
    return entTransfer
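matplotlib is imported in the test above but never used; a minimal sketch of one way to visualize the returned matrix (an addition for illustration, not part of the original test):

import matplotlib.pyplot as plt

ent = test()                      # square net entropy transfer matrix
plt.imshow(ent, cmap='coolwarm')  # heatmap of residue-to-residue transfer
plt.colorbar(label='net entropy transfer')
plt.show()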
Example #9
def test(pdb='2nwl-mem.pdb', blk='2nwl.blk'):

    from prody import parsePDB
    from numpy import zeros

    pdb = parsePDB(pdb, subset='ca')
    pdb.setData('block', zeros(len(pdb), int))
    with open(blk) as inp:
        for line in inp:
            if line.startswith('BLOCK'):
                _, b, n1, c1, r1, n2, c2, r2 = line.split()
                sel = pdb.select('chain {} and resnum {} to {}'.format(
                    c1, r1, r2))
                if sel:
                    sel.setData('block', int(b))
    pdb.setBetas(pdb.getData('block'))
    from prody import writePDB
    writePDB('pdb2gb1_truncated.pdb', pdb)
    rtb = RTB('2nwl')
    rtb.buildHessian(pdb, pdb.getData('block'))
    return rtb
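The block file is read as whitespace-separated fields, so each BLOCK record carries a block number plus the first and last residue (name, chain, number) of that block. A self-contained sketch with a hypothetical record (not taken from 2nwl.blk):

sample = 'BLOCK 1 MET A 1 GLY A 25'   # hypothetical record matching the parser
_, b, n1, c1, r1, n2, c2, r2 = sample.split()
assert (b, c1, r1, r2) == ('1', 'A', '1', '25')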
Example #11
File: gnm.py Project: njekin/ProDy
def calcGNM(pdb, selstr="calpha", cutoff=15.0, gamma=1.0, n_modes=20, zeros=False):
    """Return a :class:`GNM` instance and atoms used for the calculations.
    By default only alpha carbons are considered, but selection string helps
    selecting a subset of it.  *pdb* can be :class:`.Atomic` instance."""

    if isinstance(pdb, str):
        ag = parsePDB(pdb)
        title = ag.getTitle()
    elif isinstance(pdb, Atomic):
        ag = pdb
        if isinstance(pdb, AtomGroup):
            title = ag.getTitle()
        else:
            title = ag.getAtomGroup().getTitle()
    else:
        raise TypeError("pdb must be an atom container, not {0}".format(type(pdb)))
    gnm = GNM(title)
    sel = ag.select(selstr)
    gnm.buildKirchhoff(sel, cutoff, gamma)
    gnm.calcModes(n_modes, zeros)
    return gnm, sel
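A minimal usage sketch, parallel to the ANM example above (identifier and parameters illustrative):

gnm, sel = calcGNM('1ubi', cutoff=10., n_modes=10)
print(gnm.getEigvals()[:5])   # eigenvalues of the slowest modes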
Example #13
    def parsePDBs(self, **kwargs):
        """Load PDB into memory as :class:`.AtomGroup` instances using :func:`.parsePDB` and 
        perform selection based on residue ranges given by CATH."""

        pdbs = self.getPDBs()
        # selstrs = self.getSelstrs()
        header = kwargs.get('header', False)
        model = kwargs.get('model', None)

        LOGGER.timeit('_uniprot_parsePDB')
        LOGGER.info('Parsing {0} PDB files...'.format(len(pdbs)))
        ret = parsePDB(*pdbs, **kwargs)

        if model != 0:
            headers = None
            if header:
                prots, headers = ret
            else:
                prots = ret

            if not isinstance(prots, list):
                prots = [prots]

                if header:
                    headers = [headers]
                    ret = (prots, headers)
                else:
                    ret = prots

            LOGGER.info('Extracting domains...')
            # for i in range(len(prots)):
            #     sel = prots[i].select(selstrs[i])
            #     prots[i] = sel
        LOGGER.report('Uniprot domains are parsed and extracted in %.2fs',
                      '_uniprot_parsePDB')

        return ret
Example #14
def test(pdb="2nwl-mem.pdb", blk="2nwl.blk"):

    from prody import parsePDB
    from numpy import zeros, linalg

    pdb = parsePDB(pdb, subset="ca")
    pdb.setData("block", zeros(len(pdb), int))
    with open(blk) as inp:
        for line in inp:
            if line.startswith("BLOCK"):
                _, b, n1, c1, r1, n2, c2, r2 = line.split()
                sel = pdb.select("chain {} and resnum {} to {}".format(c1, r1, r2))
                if sel:
                    sel.setData("block", int(b))
    pdb.setBetas(pdb.getData("block"))
    coords = pdb.getCoords()
    blocks = pdb.getBetas()
    from prody import writePDB

    writePDB("pdb2gb1_truncated.pdb", pdb)
    rtb = RTB("2nwl")
    rtb.buildHessian(coords, blocks, scale=64)
    # rtb.calcModes()
    return rtb
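`linalg` is imported in the test above but never used and `calcModes` is left commented out; a sketch of inspecting the block Hessian directly (assuming RTB exposes `getHessian` like other ProDy models):

import numpy as np

rtb = test()
evals = np.linalg.eigvalsh(rtb.getHessian())  # eigenvalues of the symmetric Hessian
print(evals[:10])                             # near-zero leading values correspond to rigid-body modes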
Example #15
def parseScipionModes(run_path, title=None, pdb=None):
    """Returns :class:`.NMA` containing eigenvectors and eigenvalues 
    parsed from a ContinuousFlex FlexProtNMA Run directory.

    :arg run_path: path to the Run directory
    :type run_path: str
    
    :arg title: title for :class:`.NMA` object
    :type title: str

    :arg pdb: path to a PDB file, used to determine the number of atoms
    :type pdb: str
    """
    if run_path.endswith("/"):
        run_path = run_path[:-1]
    run_name = os.path.split(run_path)[-1]
    top_dirs = os.path.split(run_path)[0][:-4]  # exclude "Runs"

    star_data = parseSTAR(run_path + '/modes.xmd')
    star_loop = star_data[0][0]

    n_modes = star_loop.numRows()

    row1 = star_loop[0]
    mode1 = parseArray(top_dirs + row1['_nmaModefile']).reshape(-1)
    dof = mode1.shape[0]

    if pdb is not None:
        atoms = parsePDB(pdb)
        n_atoms = atoms.numAtoms()
    else:
        # assume standard NMA
        n_atoms = dof // 3

    vectors = np.zeros((dof, n_modes))
    vectors[:, 0] = mode1

    eigvals = np.zeros(n_modes)

    try:
        eigvals[0] = float(row1['_nmaEigenval'])
        found_eigvals = True
    except:
        found_eigvals = False

    for i, row in enumerate(star_loop[1:]):
        vectors[:,
                i + 1] = parseArray(top_dirs + row['_nmaModefile']).reshape(-1)
        if found_eigvals:
            eigvals[i + 1] = float(row['_nmaEigenval'])

    if not found_eigvals:
        log_fname = run_path + '/logs/run.stdout'
        with open(log_fname, 'r') as fi:
            lines = fi.readlines()

        for line in lines:
            if line.find('Eigenvector number') != -1:
                j = int(line.strip().split()[-1]) - 1
            if line.find('Corresponding eigenvalue') != -1:
                eigvals[j] = float(line.strip().split()[-1])
                found_eigvals = True

    if title is None:
        title = run_name

    if not found_eigvals:
        LOGGER.warn('No eigenvalues found')
        eigvals = None

    if dof == n_atoms * 3:
        nma = NMA(title)
    else:
        nma = GNM(title)

    nma.setEigens(vectors, eigvals)
    return nma
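A minimal usage sketch (the Run directory path is hypothetical):

nma = parseScipionModes('Runs/000123_FlexProtNMA', title='scipion_modes')
print(nma.numModes())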
Example #16
def alignPDBEnsemble(ensemble, suffix='_aligned', outdir='.', gzip=False):
    """Align PDB files using transformations from *ensemble*, which may be
    a :class:`.PDBEnsemble` or a :class:`.PDBConformation` instance.  The label
    of the conformation (see :meth:`~.PDBConformation.getLabel`) is used to
    determine the PDB structure and model number.  The first four characters of
    the label are expected to be the PDB identifier and the trailing number the
    model number.  For example, the :class:`.Transformation` from the
    conformation labeled *2k39_ca_selection_'resnum_<_71'_m116* will be applied
    to the 116th model of structure **2k39**.  After applicable transformations
    are made, the structure is written into *outdir* as :file:`2k39_aligned.pdb`.
    If ``gzip=True``, output files will be compressed.  Return value is
    the output filename or list of filenames, in the order files are processed.
    Note that if multiple models from a file are aligned, that filename will
    appear in the list multiple times."""

    if not isinstance(ensemble, (PDBEnsemble, PDBConformation)):
        raise TypeError('ensemble must be a PDBEnsemble or PDBConformation')
    if isinstance(ensemble, PDBConformation):
        ensemble = [ensemble]
    if gzip:
        gzip = '.gz'
    else:
        gzip = ''
    output = []
    pdbdict = {}
    for conf in ensemble:
        trans = conf.getTransformation()
        if trans is None:
            raise ValueError('transformations are not calculated, call '
                             '`superpose` or `iterpose`')
        label = conf.getLabel()

        pdb = label[:4]
        filename = pdbdict.get(pdb, fetchPDB(pdb))
        if filename is None:
            LOGGER.warning(
                'PDB file for conformation {0} is not found.'.format(label))
            output.append(None)
            continue
        LOGGER.info('Parsing PDB file {0} for conformation {1}.'.format(
            pdb, label))

        acsi = None
        model = label.rfind('m')
        if model > 3:
            model = label[model + 1:]
            if model.isdigit():
                acsi = int(model) - 1
            LOGGER.info('Applying transformation to model {0}.'.format(model))

        if isinstance(filename, str):
            ag = parsePDB(filename)
        else:
            ag = filename

        if acsi is not None:
            if acsi >= ag.numCoordsets():
                LOGGER.warn('Model number {0} for {1} is out of range.'.format(
                    model, pdb))
                output.append(None)
                continue
            ag.setACSIndex(acsi)
        trans.apply(ag)
        outfn = os.path.join(outdir, pdb + suffix + '.pdb' + gzip)
        if ag.numCoordsets() > 1:
            pdbdict[pdb] = ag
        else:
            writePDB(outfn, ag)
        output.append(os.path.normpath(outfn))

    for pdb, ag in pdbdict.items():  # PY3K: OK
        writePDB(os.path.join(outdir, pdb + suffix + '.pdb' + gzip), ag)
    if len(output) == 1:
        return output[0]
    else:
        return output
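A minimal usage sketch (assuming an NMR entry so the ensemble has multiple models; the identifier is illustrative, and conformation labels are expected to begin with the PDB identifier as the docstring requires):

from prody import parsePDB, PDBEnsemble

atoms = parsePDB('2k39', subset='ca')
ens = PDBEnsemble('2k39')
ens.setAtoms(atoms)
ens.setCoords(atoms.getCoords())
ens.addCoordset(atoms)        # one conformation per model, labeled from the title
ens.iterpose()                # computes the transformations alignPDBEnsemble needs
aligned = alignPDBEnsemble(ens)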
Example #17
def parsePfamPDBs(query, data=[], **kwargs):
    """Returns a list of AtomGroups containing sections of chains that 
    correspond to a particular PFAM domain family. These are defined by 
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be provided.
        This query is also used for label refinement of the Pfam domain MSA.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will 
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected. 
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected. 
    :type end: int
    """
    
    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            start_diff = []
            for i, key in enumerate(pfam_matches):
                start_diff.append(int(pfam_matches[key]['locations'][0]['start']) - start)
            start_diff = np.array(start_diff)
            pfam_acc = keys[np.where(abs(start_diff) == min(abs(start_diff)))[0][0]]

        elif isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(abs(end_diff) == min(abs(end_diff)))[0][0]]

        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC', 
              'UniprotAcc', 'UniprotResnumRange']
    
    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
#        ags = results
#        ags = list(ags)
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except:
            LOGGER.warn('No Uniprot record found for {0}'.format(data_dict['PDB_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            header = headers[i]
            chain_accessions = [dbref.accession 
                                for dbref in header[data_dict['chain']].dbrefs]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(np.array(chain_accessions) == 
                                        data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError('There is no accession for a chain in the Header')
            except:
                LOGGER.warn('Could not map domains in {0}'
                            .format(data_dict['PDB_ID'] 
                            + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(data_dict['chain'])
                                  ).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(ag.getResnums() == 
                                           right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                            resiStart, resiEnd)) 
        else:
            no_info.append(i)
    LOGGER.finish()

    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')
    
    return results
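A minimal usage sketch (the Pfam accession is illustrative; note that downloading pdbmap.gz from the EBI FTP server can take a while):

data = []
ags = parsePfamPDBs('PF00074', data=data)
print(len(ags), 'domain structures extracted')
print(data[0]['PDB_ID'], data[0]['UniprotResnumRange'])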
Example #18
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file.  Sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = { 'hmmdb' : 'pfam', 'seq': fseq }
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request('https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        #res_params = { 'output' : 'xml' }
        res_params = { 'format' : 'tsv' }
        enc_res_params = urllib.urlencode(res_params)
        #modified_res_url = results_url + '?' + enc_res_params
        modified_res_url = results_url.replace('results','download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url) 
        # url = ( urllib2.urlopen(request).geturl() + '?output=xml') 
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        #xml = urllib2.urlopen(result_request).read()
        tsv = urllib2.urlopen(result_request).read()
        # openURL(url, timeout=timeout).read()
        
        # try:
        #     root = ET.XML(xml)
        # except Exception as err:
        #     raise ValueError('failed to parse results XML, check URL: ' + modified_res_url)

        matches = {}
        #for child in root[0]:
            #if child.tag == 'hits':
                # accession = child.get('acc')
                # pfam_id = accession.split('.')[0]
                # matches[pfam_id]={}
                # matches[pfam_id]['accession']=accession
                # matches[pfam_id]['class']='Domain'
                # matches[pfam_id]['id']=child.get('name')
                # matches[pfam_id]['locations']={}
                # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto')
                # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom')
                # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore')
                # matches[pfam_id]['locations']['end']=child[0].get('alisqto')
                # matches[pfam_id]['locations']['evalue']=child.get('evalue')
                # matches[pfam_id]['locations']['evidence']='hmmer v3.0'
                # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto')
                # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom')
                # matches[pfam_id]['locations']['significant']=child[0].get('significant')    
                # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom')
                # matches[pfam_id]['type']='Pfam-A'
        # return matches

        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id]={}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']   
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
 
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND','RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' + seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
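A minimal usage sketch (the UniProt ID is illustrative):

matches = searchPfam('P19491')
for acc in matches:
    print(acc, matches[acc].get('id'))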
Example #19
def renumber_InputAlign(alnfile, pdbid, refid, selection="protein",
	outfile="renumbered.pdb", pdbfile="", newAA=None, first=1):
	'''
	Renumber input pdb using an existing multiple alignment.
	- alnfile: alignment in .fasta format. Beware of weird characters in the
	sequence ids, e.g. "|"
	- pdbid: sequence id in the alignment file that corresponds to the input
	structure. Must be the same number of residues
	- refid: sequence id corresponding to the reference sequence by which to
	renumber the pdbid sequence. pdbid mustn't align to any gaps in refid.
	- selection: atom selection(s) in the structure file to renumber.
	Will iterate over comma separated selections to renumber each.
	- pdbfile: original structure file
	- outfile: output structure file
	- newAA: comma separated list of unrepresented amino acids
		XXXYCA:
		XXX = three letter abbreviation as in pdbfile
		Y = one letter code in the alignment
		CA = atom to use as CA if different from "CA", e.g.
		C1 in PVL of 1JEN
	'''


	selections = selection.split(",")
	tmp=tempfile.gettempdir()

	modified_selections = []

	if os.path.exists(alnfile):
		aln = AlignIO.read(alnfile, "fasta",alphabet=IUPAC.protein)
	else:
		print("ERROR, no such alignment: %s"%alnfile)
		exit(1)

	aln_ids = [x.id for x in aln]
	if pdbid in aln_ids and refid in aln_ids:		
		pdbSeqRec = seqbyname(aln, pdbid)
		if not pdbSeqRec:
			print("ERROR, bad pdbid name")
			exit(1)

		refSeqRec = seqbyname(aln, refid)
		if not refSeqRec:
			print("ERROR, bad refid name")
			exit(1)

		if pdbfile != '':
			if os.path.exists(pdbfile):
				structure = parsePDB(pdbfile)
				updateAA(structure,newAA)
			else:
				print("ERROR, no such pdb file: %s"%pdbfile)
				exit(1)

		renumber_aln(aln, refid, pdbid,first)
		for polymer in selections:
			currentSel = structure.select("not hetero and protein and name CA and %s"%polymer)
			if currentSel:
				renumber_struct(structure, pdbSeqRec, polymer)
				modified_selections.append(polymer)
			else:
				print('ERROR: Selection \"%s\" has zero CA atoms'%polymer)
	else:
		if pdbid not in [x.id for x in aln]:
			print("ERROR, no such sequence to renumber: %s"%pdbid)
		if refid not in [x.id for x in aln]:
			print("ERROR, no such sequence to renumber by: %s"%refid)
		exit(1)

	if writePDB(outfile, structure):
		print("Wrote renumbered %s selections from %s to %s"\
				%(str(modified_selections),pdbfile,outfile))
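A minimal usage sketch (file names and sequence ids are hypothetical; requires Biopython and the helper functions referenced above):

renumber_InputAlign('family_aln.fasta', pdbid='1abc', refid='refseq',
                    selection='chain A', pdbfile='1abc.pdb',
                    outfile='1abc_renumbered.pdb')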
Example #20
def renumber_noInputAlign(pdbfile, refseqfile, selection="protein",
	outfile="renumbered.pdb", newAA=None, first=1):
	'''
	Renumber pdb file (pdbfile) according to reference sequence in refseqfile. 
	Pdb sequence is extracted and aligned with reference sequence using needle 
	from EMBOSS.
	- refseqfile: .fasta file containing the reference sequence by which to 
	renumber
	- selection: atom selection(s) in the structure file to renumber.
	Will iterate over comma separated selections to renumber each.
	- pdbfile: original structure file
	- outfile: output structure file
	- newAA: comma separated list of unrepresented amino acids
		XXXYCA: 
		XXX = three letter abbreviation as in pdbfile
		Y = one letter code in the alignment
		CA = atom to use as CA if different from "CA", eg 
		C1 in PVL of 1JEN	

	'''
	# selections = selection.split(",")
	selections = selection
	tmp=tempfile.gettempdir()
	tmp_refseqfile="%s/refseq.fasta"%tmp
	pdbID = re.search(r"\w+\.\w+", pdbfile).group(0)
	tmp_pdbseqfile="%s/%s.fasta"%(tmp,pdbID)
	tmp_needle="%s/needle.out"%tmp
	if os.path.exists(refseqfile):
		refseqRec = SeqIO.read(refseqfile,"fasta",alphabet=IUPAC.protein )
		refseqRec.id = "refseq"
		SeqIO.write(refseqRec,tmp_refseqfile,"fasta")
	else: 
		print ("ERROR, no such file: %s"%refseqfile)
		exit(1)

	if os.path.exists(pdbfile):
		structure=parsePDB("%s"%pdbfile)
		updateAA(structure,newAA)
	else:
		print ("ERROR, no such file: %s"%pdbfile)
		exit(1)

	modified_selections = []
	for polymer in selections:
		currentSel = structure.select("protein and name CA and %s"%polymer)
		if currentSel:
			pdbseq_str=''.join([oneletter[i] for i in currentSel.getResnames()])
			pdbseqRec=SeqRecord(Seq(pdbseq_str,IUPAC.protein),id=pdbID)
			SeqIO.write(pdbseqRec,tmp_pdbseqfile,"fasta")

			needle_cli = NeedleCommandline(asequence=tmp_pdbseqfile,bsequence=tmp_refseqfile,\
				gapopen=10,gapextend=0.5,outfile=tmp_needle)
			needle_cli()
			aln = AlignIO.read(tmp_needle, "emboss",alphabet=IUPAC.protein )
			# os.remove(tmp_needle)
			# os.remove(tmp_pdbseqfile)		

			gpdb.renumber_aln(aln,"refseq",pdbID,first)
			pdbRenSeq = gpdb.seqbyname(aln, pdbID)
			gpdb.renumber_struct(structure, pdbRenSeq,polymer)
			pdbRenSeq.annotations["resnum"]=str(pdbRenSeq.letter_annotations["resnum"])
			modified_selections.append(polymer)
			# seems to be the only way to store per-residue annotations
			# AlignIO.write(aln,"pdb.outseq","seqxml")		
		else:
			print ('ERROR: Selection \"%s\" has zero CA atoms'%polymer)

	if writePDB(outfile, structure):
		print ("Wrote renumbered %s selections from %s to %s"%\
				(str(modified_selections),pdbfile,outfile))
	os.remove(tmp_refseqfile)
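A minimal usage sketch (file names hypothetical; requires Biopython and EMBOSS needle on the PATH):

renumber_noInputAlign('1abc.pdb', 'refseq.fasta',
                      selection='chain A', outfile='1abc_renumbered.pdb')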
Example #22
File: pfam.py Project: uibcdf/ProDy
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file.  Sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    import xml.etree.cElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        #res_params = { 'output' : 'xml' }
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        #modified_res_url = results_url + '?' + enc_res_params
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        # url = ( urllib2.urlopen(request).geturl() + '?output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            #xml = urllib2.urlopen(result_request).read()
            tsv = urllib2.urlopen(result_request).read()
            # openURL(url, timeout=timeout).read()
        except:
            raise ValueError('No matching Pfam domains were found.')

        # try:
        #     root = ET.XML(xml)
        # except Exception as err:
        #     raise ValueError('failed to parse results XML, check URL: ' + modified_res_url)

        matches = {}
        #for child in root[0]:
        #if child.tag == 'hits':
        # accession = child.get('acc')
        # pfam_id = accession.split('.')[0]
        # matches[pfam_id]={}
        # matches[pfam_id]['accession']=accession
        # matches[pfam_id]['class']='Domain'
        # matches[pfam_id]['id']=child.get('name')
        # matches[pfam_id]['locations']={}
        # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto')
        # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom')
        # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore')
        # matches[pfam_id]['locations']['end']=child[0].get('alisqto')
        # matches[pfam_id]['locations']['evalue']=child.get('evalue')
        # matches[pfam_id]['locations']['evidence']='hmmer v3.0'
        # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto')
        # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom')
        # matches[pfam_id]['locations']['significant']=child[0].get('significant')
        # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom')
        # matches[pfam_id]['type']='Pfam-A'
        # return matches

        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child[
                'Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            #matches[pfam_id]['locations']['significant'] = child['significant']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'.format(
                    seq[:4], str(err)))
            else:
                chid = seq[4:].upper()

            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'.format(
                                    idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'

        else:
            url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml not in ['PEND', 'RUN']:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq)
            except:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception as err:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
Example #23
File: pfam.py Project: uibcdf/ProDy
def parsePfamPDBs(query, data=[], **kwargs):
    """Returns a list of AtomGroups containing sections of chains that 
    correspond to a particular PFAM domain family. These are defined by 
    alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be provided.
        This query is also used for label refinement of the Pfam domain MSA.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will 
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected. 
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected. 
    :type end: int
    """

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            start_diff = []
            for i, key in enumerate(pfam_matches):
                start_diff.append(
                    int(pfam_matches[key]['locations'][0]['start']) - start)
            start_diff = np.array(start_diff)
            pfam_acc = keys[np.where(
                abs(start_diff) == min(abs(start_diff)))[0][0]]

        elif isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(
                    int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(
                abs(end_diff) == min(abs(end_diff)))[0][0]]

        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = [
        'PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC', 'UniprotAcc',
        'UniprotResnumRange'
    ]

    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
        #        ags = results
        #        ags = list(ags)
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except:
            LOGGER.warn('No Uniprot record found for {0}'.format(
                data_dict['PDB_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            header = headers[i]
            chain_accessions = [
                dbref.accession for dbref in header[data_dict['chain']].dbrefs
            ]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(
                        np.array(chain_accessions) ==
                        data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError(
                        'There is no accession for a chain in the Header')
            except:
                LOGGER.warn(
                    'Could not map domains in {0}'.format(data_dict['PDB_ID'] +
                                                          data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(
                data_dict['chain'])).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(
                ag.getResnums() == right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')

    return results
Example #24
    def scanPockets(self):

        """Generates ESSA z-scores for pockets and parses pocket features.
        Requires Fpocket 3.0+ and Pandas to be installed."""

        from re import findall

        fpocket = which('fpocket')

        if fpocket is None:
            LOGGER.warning(
                'Fpocket (version >= 3.0) was not found, please install it.')
            return None

        try:
            from pandas import Index, DataFrame
        except ImportError as ie:
            LOGGER.warning(ie.__str__() + ' was found, please install it.')
            return None

        rcr = {(i, j): k if self._rib else self._ri[k]
               for i, j, k in zip(self._ca.getChids(), self._ca.getResnums(),
                                  self._ca.getResindices())}

        writePDB('{}_pro'.format(self._title), self._heavy)

        direc = '{}_pro_out'.format(self._title)
        if not isdir(direc):
            system('fpocket -f {}_pro.pdb'.format(self._title))

        chdir(direc + '/pockets')
        l = [x for x in listdir('.') if x.endswith('.pdb')]
        l.sort(key=lambda x: int(x.partition('_')[0][6:]))

        ps = []
        for x in l:
            with open(x, 'r') as f:
                tmp0 = f.read()
                tmp1 = [(x[1].strip(), float(x[2])) for x in findall(
                    r'(\w+\s\w+\s*-\s*)(.+):\s*([\d.-]+)(\n)', tmp0)]
            fea, sco = list(zip(*tmp1))
            ps.append(sco)
        pdbs = parsePDB(l)
        chdir('../..')

        # ----- # ----- #

        ps = array(ps)

        pcn = {
            int(pdb.getTitle().partition('_')[0][6:]):
            set(zip(pdb.getChids().tolist(),
                    pdb.getResnums().tolist()))
            for pdb in pdbs
        }
        pi = {p: [rcr[x] for x in crn] for p, crn in pcn.items()}

        pzs_max = {k: max(self._zscore[v]) for k, v in pi.items()}
        pzs_med = {k: median(self._zscore[v]) for k, v in pi.items()}

        # ----- # ----- #

        indices = Index(range(1, ps.shape[0] + 1), name='Pocket #')

        columns = Index(fea, name='Feature')

        self._df = DataFrame(index=indices, columns=columns, data=ps)

        # ----- # ----- #

        columns_zs = Index(['ESSA_max', 'ESSA_med', 'LHD'], name='Z-score')

        zps = c_[list(pzs_max.values())]
        zps = hstack((zps, c_[list(pzs_med.values())]))
        zps = hstack(
            (zps, zscore(self._df[['Local hydrophobic density Score']])))

        self._df_zs = DataFrame(index=indices, columns=columns_zs, data=zps)
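
To see what the findall pattern in scanPockets extracts, here is a small
self-contained sketch run on a made-up fpocket header fragment (the feature
names and values are illustrative, not real fpocket output):

from re import findall

tmp0 = ('HEADER 0  - Pocket Score      :   0.3275\n'
        'HEADER 1  - Drug Score        :   0.8734\n')
pairs = [(m[1].strip(), float(m[2]))
         for m in findall(r'(\w+\s\w+\s*-\s*)(.+):\s*([\d.-]+)(\n)', tmp0)]
print(pairs)  # [('Pocket Score', 0.3275), ('Drug Score', 0.8734)]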
Example #25
0
def writePerturbResponsePDB(prs_matrix, pdbIn=None, **kwargs):
    """ Write the average response to perturbation of
    a particular residue (a row of a perturbation response matrix)
    or the average effect of perturbation of a particular residue
    (a column of a normalized perturbation response matrix)
    into the b-factor field of a PDB file for visualisation in a
    molecular graphics program.
    If no chain is given this will be done for that residue in all chains.

    If no residue number is given then the effectiveness and sensitivity
    profiles will be written out instead. These two profiles are also returned
    as arrays for further analysis if they aren't already provided.

    :arg prs_matrix: a perturbation response matrix 
        or a :class:`.AtomGroup` object with a PRS matrix associated as data
    :type prs_matrix: array or :class:`.AtomGroup`

    :arg pdbIn: file name for the input PDB file where you would like the PRS
        data mapped
    :type pdbIn: str

    :arg pdbOut: a list of file names (enclosed in square
        brackets) for the output PDB file, default is to append
        the chain and residue info (name and number) onto the pdbIn stem.
        The input for pdbOut can also be used as a stem if you enter a 
        single string enclosed in quotes.
        If no residue number is supplied, chain is ignored and the default 
        is to append '_effectiveness' and '_sensitivity' onto the stem.
    :type pdbOut: list

    :arg chain: chain identifier for the residue of interest, default is all chains
        If you want to analyse residues in a subset of chains, concatenate them
        together e.g. 'AC'
    :type chain: str

    :arg resnum: residue number for the residue of interest
    :type resnum: int

    :arg direction: the direction you want to use to read data out
        of the PRS matrix for plotting: the options are 'effect' or 'response'.
        Default is 'effect'.
        A row gives the effect on each residue of perturbing the specified 
        residue.
        A column gives the response of the specified residue to perturbing 
        each residue.
        If no residue number is provided then this option will be ignored
    :type direction: str

    :arg returnData: whether to return effectiveness and sensitivity for analysis,
        default is False
    :type returnData: bool

    :arg effectiveness: effectiveness profile
    :type effectiveness: array

    :arg sensitivity: sensitivity profile
    :type sensitivity: array
    """

    if not isinstance(prs_matrix, np.ndarray):
        try:
            prs_matrix = prs_matrix.getData('prs_matrix')
        except AttributeError:
            raise TypeError(
                'Please provide a valid PRS matrix in numpy ndarray format.')

    try:
        fi = open(pdbIn, 'r')
        lines = fi.readlines()
        fi.close()
    except IOError:
        raise PRSMatrixParseError(
            'Please provide a valid file name for the input PDB.')

    chain = kwargs.get('chain', None)

    structure = parsePDB(pdbIn, subset='ca')
    structure.setData('prs_matrix', prs_matrix)

    hv = structure.getHierView()
    chains = [chainAg.getChids()[0] for chainAg in hv]

    chains = np.array(chains)
    if chain is None:
        chain = ''.join(chains)

    resnum = kwargs.get('resnum', None)
    pdbOut = kwargs.get('pdbOut', None)
    if pdbOut is None:
        out_stem = pdbIn.split('.')[0]
    elif type(pdbOut) is str:
        out_stem = pdbOut.split('.')[0]
        pdbOut = None

    if resnum is None:
        effectiveness = kwargs.get('effectiveness', None)
        sensitivity = kwargs.get('sensitivity', None)
        if effectiveness is None or sensitivity is None:
            effectiveness, sensitivity = calcPerturbResponseProfiles(
                prs_matrix)

        structure.setData('effectiveness', effectiveness)
        structure.setData('sensitivity', sensitivity)

        file_effs_name = '{0}_effectiveness.pdb'.format(out_stem)
        file_sens_name = '{0}_sensitivity.pdb'.format(out_stem)
        fileEffs = open(file_effs_name, 'w')
        fileSens = open(file_sens_name, 'w')

        for line in lines:
            if line.find('ATOM') != 0 and line.find(
                    'HETATM') != 0 and line.find('ANISOU') != 0:
                fileEffs.write(line)
                fileSens.write(line)
            elif line.find('ATOM') == 0:
                fileEffs.write(line[:60] + '{:6.2f}'.format(float(structure.select( \
                               'chain {0} and resnum {1}'.format(line[21],line[22:26])) \
                               .getData('effectiveness')) * 100/np.max( \
                               structure.getData('effectiveness'))) + line[66:])
                fileSens.write(line[:60] + '{:6.2f}'.format(float(structure.select( \
                               'chain {0} and resnum {1}'.format(line[21],line[22:26])) \
                               .getData('sensitivity')) * 100/np.max( \
                               structure.getData('sensitivity'))) + line[66:])
            elif line.find('HETATM') == 0:
                fileEffs.write(line[:60] + '  0.00' + line[66:])
                fileSens.write(line[:60] + '  0.00' + line[66:])

        fileEffs.close()
        fileSens.close()
        LOGGER.info('The effectiveness and sensitivity profiles were written' \
                    ' to {0} and {1}.'.format(file_effs_name,file_sens_name))

        returnData = kwargs.get('returnData', False)
        if returnData:
            return structure, effectiveness, sensitivity
        else:
            return

    direction = kwargs.get('direction', 'effect')
    for c in chain:
        if c not in chains:
            raise PRSMatrixParseError('Chain {0} was not found in {1}'.format(
                c, pdbIn))

    if pdbOut is None:
        pdbOut = []
        for c in chain:
            pdbOut.append('{0}_{1}_{2}{3}_{4}.pdb' \
                          .format(out_stem, c, \
                                  structure.select('chain {0} and resnum {1}' \
                                      .format(c, resnum)).getResnames()[0], \
                                  resnum, direction))

    for n, c in enumerate(chain):
        fo = open(pdbOut[n], 'w')
        for line in lines:
            if line.find('ATOM') != 0 and line.find(
                    'HETATM') != 0 and line.find('ANISOU') != 0:
                fo.write(line)
            elif line.find('ATOM') == 0:
                if direction == 'effect':
                    fo.write(line[:60] + '{:6.2f}'.format(float(structure.getData('prs_matrix') \
                                         [structure.select('chain {0} and resnum {1}' \
                                          .format(c, resnum)).getResindices(), \
                                          structure.select('chain {0} and resnum {1}' \
                                          .format(line[21], line[22:26])).getResindices()])*100) \
                             + line[66:])
                else:
                    fo.write(line[:60] + '{:6.2f}'.format(float(structure.getData('prs_matrix') \
                                         [structure.select('chain {0} and resnum {1}' \
                                          .format(line[21], line[22:26])).getResindices(), \
                                          structure.select('chain {0} and resnum {1}' \
                                          .format(c, resnum)).getResindices()])*100) \
                             + line[66:])
            elif line.find('HETATM') == 0:
                fo.write(line[:60] + '  0.00' + line[66:])

        fo.close()

    LOGGER.info('Perturbation responses for specific residues were written' \
                ' to {0}.'.format(', '.join(pdbOut)))
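
A hedged usage sketch for the function above ('protein.pdb' is a placeholder
file name, and the random matrix stands in for a real PRS matrix; its size
must equal the number of calpha atoms in the input PDB):

import numpy as np

n_res = 100                                # must match the calpha count
prs_matrix = np.random.rand(n_res, n_res)  # stand-in for a real PRS matrix

# With no resnum, effectiveness and sensitivity profiles are written out.
writePerturbResponsePDB(prs_matrix, pdbIn='protein.pdb')

# With chain and resnum, the effect row for residue 10 of chain A is mapped.
writePerturbResponsePDB(prs_matrix, pdbIn='protein.pdb',
                        chain='A', resnum=10, direction='effect')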
Example #26
0
def writePerturbResponsePDB(prs_matrix, pdbIn, **kwargs):
    """ Write the average response to perturbation of
    a particular residue (a row of a perturbation response matrix)
    or the average effect of perturbation of a particular residue
    (a column of a normalized perturbation response matrix)
    into the b-factor field of a PDB file for visualisation in a
    molecular graphics program.
    If no chain is given this will be done for that residue in all chains.

    If no residue number is given then the effectiveness and sensitivity
    profiles will be written out instead. These two profiles are also returned
    as arrays for further analysis if they aren't already provided.

    :arg prs_matrix: a perturbation response matrix
    :type prs_matrix: ndarray

    :arg pdbIn: file name for the input PDB file where you would like the PRS
        data mapped
    :type pdbIn: str

    :arg pdbOut: a list of file names (enclosed in square
        brackets) for the output PDB file, default is to append
        the chain and residue info (name and number) onto the pdbIn stem.
        The input for pdbOut can also be used as a stem if you enter a 
        single string enclosed in quotes.
        If no residue number is supplied, chain is ignored and the default 
        is to append '_effectiveness' and '_sensitivity' onto the stem.
    :type pdbOut: list

    :arg chain: chain identifier for the residue of interest, default is all chains
        If you want to analyse residues in a subset of chains, concatenate them
        together e.g. 'AC'
    :type chain: str

    :arg resnum: residue number for the residue of interest
    :type resnum: int

    :arg direction: the direction you want to use to read data out
        of the PRS matrix for plotting: the options are 'row' or 'column'.
        Default is 'row'.
        A row gives the effect on each residue of perturbing the specified 
        residue.
        A column gives the response of the specified residue to perturbing 
        each residue.
        If no residue number is provided then this option will be ignored
    :type direction: str

    :arg returnData: whether to return effectiveness and sensitivity for analysis,
        default is False
    :type returnData: bool
    """

    if not isinstance(prs_matrix, np.ndarray):
        raise TypeError(
            'Please provide a valid PRS matrix in numpy ndarray format.')

    try:
        fi = open(pdbIn, 'r')
        lines = fi.readlines()
        fi.close()
    except IOError:
        raise PRSMatrixParseError(
            'Please provide a valid file name for the input PDB.')

    chain = kwargs.get('chain', None)
    structure = parsePDB(pdbIn).calpha
    hv = structure.getHierView()
    chains = [chainAg.getChids()[0] for chainAg in hv]

    chains = np.array(chains)
    if chain is None:
        chain = ''.join(chains)

    resnum = kwargs.get('resnum', None)
    pdbOut = kwargs.get('pdbOut', None)
    if pdbOut is None:
        out_stem = pdbIn.split('.')[0]
    elif type(pdbOut) is str:
        out_stem = pdbOut.split('.')[0]
        pdbOut = None

    if resnum is None:
        effectiveness = kwargs.get('effectiveness')
        sensitivity = kwargs.get('sensitivity')
        if effectiveness is None or sensitivity is None:
            effectiveness, sensitivity = calcPerturbResponseProfiles(
                prs_matrix)

        file_effs_name = '{0}_effectiveness.pdb'.format(out_stem)
        file_sens_name = '{0}_sensitivity.pdb'.format(out_stem)
        fileEffs = open(file_effs_name, 'w')
        fileSens = open(file_sens_name, 'w')

        for line in lines:
            if line.find('ATOM') != 0 and line.find(
                    'HETATM') != 0 and line.find('ANISOU') != 0:
                fileEffs.write(line)
                fileSens.write(line)
            elif line.find('ATOM') == 0:
                sel_line_res = structure.select('resid {0}'.format(
                    line[22:26]))
                j = np.where(structure.getResnums() == int(line[22:26]))[0] \
                    [np.where(sel_line_res.getChids() == line[21])[0][0]]
                fileEffs.write(line[:60] + ' '*(6-len('{:3.2f}'.format(( \
                               effectiveness[j]*100/np.max(effectiveness))))) \
                               + '{:3.2f}'.format((effectiveness[j]) \
                               *100/np.max(effectiveness)) + line[66:])
                fileSens.write(line[:60] + ' '*(6-len('{:3.2f}'.format((\
                               sensitivity[j]*100/np.max(sensitivity))))) \
                               + '{:3.2f}'.format((sensitivity[j]) \
                               *100/np.max(sensitivity)) + line[66:])
            elif line.find('HETATM') == 0:
                fileEffs.write(line[:60] + ' ' * 2 + '0.00' + line[66:])
                fileSens.write(line[:60] + ' ' * 2 + '0.00' + line[66:])

        fileEffs.close()
        fileSens.close()
        LOGGER.info('The effectiveness and sensitivity profiles were written' \
                    ' to {0} and {1}.'.format(file_effs_name,file_sens_name))

        returnData = kwargs.get('returnData', False)
        if returnData:
            return effectiveness, sensitivity
        else:
            return

    timesNF = 0
    direction = kwargs.get('direction', 'row')
    for n in range(len(chain)):
        if not chain[n] in chains:
            raise PRSMatrixParseError('Chain {0} was not found in {1}'.format(
                chain[n], pdbIn))

        chainNum = int(np.where(chains == chain[n])[0])
        chainAg = list(hv)[chainNum]
        if resnum not in chainAg.getResnums():
            LOGGER.info('A residue with number {0} was not found'
                        ' in chain {1}. Continuing to next chain.'
                        .format(resnum, chain[n]))
            timesNF += 1
            continue

    if pdbOut is None:
        pdbOut = []
        for n in range(len(chain)):
            chainNum = int(np.where(chains == chain[n])[0])
            i = np.where(structure.getResnums() == resnum)[0][chainNum -
                                                              timesNF]
            pdbOut.append('{0}_{1}_{2}{3}_{4}.pdb'.format(out_stem, chain[n], \
                           structure.getResnames()[i], resnum, direction))

    for n in range(len(chain)):
        chainNum = int(np.where(chains == chain[n])[0])
        i = np.where(structure.getResnums() == resnum)[0][chainNum - timesNF]
        fo = open(pdbOut[n], 'w')
        for line in lines:
            if line.find('ATOM') != 0 and line.find(
                    'HETATM') != 0 and line.find('ANISOU') != 0:
                fo.write(line)
            elif line.find('ATOM') == 0:
                sel_line_res = structure.select('resid {0}'.format(
                    line[22:26]))
                j = np.where(structure.getResnums() == int(line[22:26]))[0] \
                    [np.where(sel_line_res.getChids() == line[21])[0][0]]

                if direction == 'row':
                    fo.write(line[:60] + ' '*(6-len('{:3.2f}'.format(( \
                             prs_matrix[i][j])*100/np.max(prs_matrix)))) \
                             + '{:3.2f}'.format((prs_matrix[i][j]) \
                             *100/np.max(prs_matrix)) + line[66:])
                else:
                    fo.write(line[:60] + ' '*(6-len('{:3.2f}'.format(( \
                             prs_matrix[j][i])*100/np.max(prs_matrix)))) \
                             + '{:3.2f}'.format((prs_matrix[j][i]) \
                             *100/np.max(prs_matrix)) + line[66:])
            elif line.find('HETATM') == 0:
                fo.write(line[:60] + ' ' * 2 + '0.00' + line[66:])

        fo.close()

    LOGGER.info('Perturbation responses for specific residues were written' \
                ' to {0}.'.format(', '.join(pdbOut)))

    return
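
The B-factor values written by both variants are PRS entries rescaled to a
0-100 range; a minimal sketch of that normalization with a made-up matrix:

import numpy as np

prs_matrix = np.array([[0.0, 0.2], [0.5, 1.0]])  # made-up values
i, j = 0, 1
row_val = prs_matrix[i][j] * 100 / np.max(prs_matrix)  # effect of i on j
col_val = prs_matrix[j][i] * 100 / np.max(prs_matrix)  # response of i to j
print('{:6.2f} {:6.2f}'.format(row_val, col_val))      # ' 20.00  50.00'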
Example #27
0
def fetchPfamPDBs(**kwargs):
    """Returns a list of AtomGroups containing sections of chains that 
    correspond to a particular PFAM domain family. These are defined by 
    alignment start and end residue numbers.

    :arg pfam_acc: The accession number for a pfam domain family, if known.
        Alternatively you can select a family based on a query (see below).
    :type pfam_acc: str

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If no query is provided but a pfam_acc is then the first entry
        will be used as a query. 
        This query is also used for label refinement of the pfam domain MSA.
    :type query: str

    You must provide one of these two arguments.
    Use of query requires start or end to also be provided.

    :arg start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected. 
    :type start: int

    :arg end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected. 
    :type end: int

    :arg return_data: Whether to return the data dictionary from
        the Pfam mapping table, default is False
    :type return_data: bool
    """
    pfam_acc = kwargs.pop('pfam_acc', None)
    query = kwargs.pop('query', None)
    start = kwargs.pop('start', None)
    end = kwargs.pop('end', None)
    return_data = kwargs.pop('return_data', False)

    if pfam_acc is None:
        if query is None:
            raise ValueError('Please provide a value for pfam_acc or query.')
        else:
            pfam_matches = searchPfam(query)

            if start is not None and type(start) is int:
                start_diff = []
                for i, key in enumerate(pfam_matches):
                    start_diff.append(
                        int(pfam_matches[key]['locations'][0]['start']) -
                        start)
                start_diff = np.array(start_diff)
                pfam_acc = list(pfam_matches.keys())[np.where(
                    abs(start_diff) == min(abs(start_diff)))[0][0]]

            elif end is not None and type(end) is int:
                end_diff = []
                for i, key in enumerate(pfam_matches):
                    end_diff.append(
                        int(pfam_matches[key]['locations'][0]['end']) - end)
                end_diff = np.array(end_diff)
                pfam_acc = list(pfam_matches.keys())[np.where(
                    abs(end_diff) == min(abs(end_diff)))[0][0]]

            else:
                raise ValueError(
                    'Please provide an integer for start or end when using query.'
                )

    from ftplib import FTP
    data = []
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/mappings')
    ftp.retrlines('RETR pdb_pfam_mapping.txt', data.append)
    ftp.quit()

    fields = data[0].strip().split('\t')

    data_dict = []
    for line in data[1:]:
        if line.find(pfam_acc) != -1:
            data_dict.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dict[-1][fields[j]] = entry

    pdb_ids = []
    for i in range(len(data_dict)):
        pdb_id = data_dict[i]['PDB_ID']
        if pdb_id not in pdb_ids:
            pdb_ids.append(pdb_id)

    result = parsePDB(*pdb_ids, **kwargs)

    if return_data:
        return data_dict, result
    else:
        return result
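
A hedged usage sketch for fetchPfamPDBs (the UniProt accession is only an
illustrative query, and the call needs network access for searchPfam and the
Pfam FTP mapping table):

data, ags = fetchPfamPDBs(query='P19491', start=1, return_data=True)
print(len(data), 'mapping-table records')  # one dict per matching mapping line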