Ejemplo n.º 1
0
def prody_align(opt):
    """Align models in a PDB file or a PDB file onto others.

    When ``opt.pdb`` holds a single entry, model ``opt.model`` is made the
    active coordinate set of the atoms matched by ``opt.select``, the
    coordinate sets of that selection are aligned with
    :func:`prody.alignCoordsets`, and the whole structure is written to
    ``<prefix>.pdb``.

    When it holds several entries, the first one is the reference: every
    other structure is passed to :func:`prody.matchAlign` together with the
    reference and, on success, written to ``<title>_aligned.pdb``.

    :arg opt: options object providing the ``pdb``, ``select``, ``prefix``,
        ``model``, ``seqid``, ``overlap`` and ``subparser`` attributes
    """

    import prody
    LOGGER = prody.LOGGER

    # Copy the list: the reference is split off below and the caller's
    # ``opt.pdb`` must not be modified as a side effect.
    args = list(opt.pdb)
    if len(args) == 1:
        pdb = args[0]
        LOGGER.info('Aligning multiple models in: ' + pdb)
        selstr, prefix, model = opt.select, opt.prefix, opt.model
        pdb = prody.parsePDB(pdb)
        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            opt.subparser.error('Selection {0:s} do not match any atoms.'
                                .format(repr(selstr)))
        LOGGER.info('{0:d} atoms will be used for alignment.'
                    .format(len(pdbselect)))
        # Model numbers are 1-based, coordinate set indices are 0-based.
        pdbselect.setACSIndex(model - 1)
        prody.printRMSD(pdbselect, msg='Before alignment ')
        prody.alignCoordsets(pdbselect)
        prody.printRMSD(pdbselect, msg='After alignment  ')
        # An unset prefix (empty string or None) falls back to the title.
        if not prefix:
            prefix = pdb.getTitle() + '_aligned'
        outfn = prefix + '.pdb'
        LOGGER.info('Writing file: ' + outfn)
        prody.writePDB(outfn, pdb)
    else:
        reffn, targets = args[0], args[1:]
        seqid = opt.seqid
        overlap = opt.overlap
        LOGGER.info('Aligning structures onto: ' + reffn)
        ref = prody.parsePDB(reffn)
        for arg in targets:
            # Skip the reference itself and anything that looks like the
            # output of a previous run.
            if arg == reffn or '_aligned.pdb' in arg:
                continue
            pdb = prody.parsePDB(arg)
            result = prody.matchAlign(pdb, ref, seqid=seqid, overlap=overlap,
                                      tarsel=opt.select, allcsets=True,
                                      cslabel='Model', csincr=1)
            if result:
                outfn = pdb.getTitle() + '_aligned.pdb'
                LOGGER.info('Writing file: ' + outfn)
                prody.writePDB(outfn, pdb)
            else:
                LOGGER.warning('Failed to align ' + arg)
Ejemplo n.º 2
0
def prody_align(opt):
    """Align models in a PDB file or a PDB file onto others.

    When ``opt.pdb`` holds a single entry, model ``opt.model`` is made the
    active coordinate set, the coordinate sets of the structure are aligned
    with :func:`prody.alignCoordsets` using ``opt.select`` as the selection
    string, the maximum and mean RMSD are logged, and the structure is
    written to ``<prefix>.pdb``.  The process exits with status ``-1`` when
    the selection matches no atoms.

    When it holds several entries, the first one is the reference: every
    other structure is passed to :func:`prody.matchAlign` together with the
    reference and, on success, written to ``<title>_aligned.pdb``.

    :arg opt: options object providing the ``pdb``, ``select``, ``prefix``
        and ``model`` attributes
    """

    # ``sys`` is imported here so that ``sys.exit`` below does not depend on
    # a module-level import being present.
    import sys

    import prody
    LOGGER = prody.LOGGER

    # Copy the list: the reference is split off below and the caller's
    # ``opt.pdb`` must not be modified as a side effect.
    args = list(opt.pdb)
    if len(args) == 1:
        pdb = args[0]
        LOGGER.info('Aligning multiple models in: ' + pdb)
        selstr, prefix, model = opt.select, opt.prefix, opt.model
        pdb = prody.parsePDB(pdb)
        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            LOGGER.warning('Selection "{0:s}" do not match any atoms.'
                           .format(selstr))
            sys.exit(-1)
        LOGGER.info('{0:d} atoms will be used for alignment.'
                    .format(len(pdbselect)))
        # Model numbers are 1-based, coordinate set indices are 0-based.
        pdb.setACSIndex(model - 1)
        prody.alignCoordsets(pdb, selstr=selstr)
        rmsd = prody.calcRMSD(pdb)
        LOGGER.info('Max RMSD: {0:0.2f} Mean RMSD: {1:0.2f}'
                    .format(rmsd.max(), rmsd.mean()))
        # An unset prefix (empty string or None) falls back to the title.
        if not prefix:
            prefix = pdb.getTitle() + '_aligned'
        outfn = prefix + '.pdb'
        LOGGER.info('Writing file: ' + outfn)
        prody.writePDB(outfn, pdb)
    else:
        reffn, targets = args[0], args[1:]
        LOGGER.info('Aligning structures onto: ' + reffn)
        ref = prody.parsePDB(reffn)
        for arg in targets:
            # Skip the reference itself and anything that looks like the
            # output of a previous run.
            if arg == reffn or '_aligned.pdb' in arg:
                continue
            pdb = prody.parsePDB(arg)
            if prody.matchAlign(pdb, ref):
                outfn = pdb.getTitle() + '_aligned.pdb'
                LOGGER.info('Writing file: ' + outfn)
                prody.writePDB(outfn, pdb)
            else:
                LOGGER.warning('Failed to align ' + arg)
Ejemplo n.º 3
0
def prody_align(*pdbs, **kwargs):
    """Align models in a PDB file or multiple structures in separate PDB files.
    By default, protein chains will be matched based on selected atoms and
    alignment will be performed based on matching residues.  If non-protein
    atoms are selected and selected atoms match in multiple structures,
    they will be used for alignment.

    :arg pdbs: PDB identifier(s) or filename(s)

    :arg select: atom selection string, default is :term:`calpha`,
        see :ref:`selections`

    :arg model: for NMR files, reference model index, default is ``1``

    :arg seqid: percent sequence identity, default is ``90``

    :arg overlap: percent sequence overlap, default is ``90``

    :arg prefix: prefix for output file, default is PDB filename

    :arg suffix: output filename suffix, default is :file:`_aligned`

    :arg subparser: optional object whose ``error`` method is used to report
        a selection that matches no atoms in single-file mode

    :raises ValueError: when *select* matches no atoms in the single
        structure (and no *subparser* is given) or in the reference"""

    # Imported under another name so the builtin ``all`` is not shadowed.
    from numpy import all as np_all
    from prody import LOGGER, writePDB, parsePDB
    from prody import alignCoordsets, printRMSD, matchAlign, superpose

    def _setting(name, default):
        # A missing keyword and an explicit ``None`` both fall back to the
        # documented default instead of propagating ``None``.
        value = kwargs.get(name)
        return default if value is None else value

    selstr = kwargs.get('select', 'calpha')
    suffix = kwargs.get('suffix', '_aligned')
    if len(pdbs) == 1:
        pdb = pdbs[0]
        LOGGER.info('Aligning multiple models in: ' + pdb)
        prefix = kwargs.get('prefix')
        model = _setting('model', 1)
        pdb = parsePDB(pdb)
        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            subparser = kwargs.get('subparser')
            if subparser:
                subparser.error('Selection {0} do not match any atoms.'.format(
                    repr(selstr)))
            else:
                raise ValueError('select does not match any atoms')
        LOGGER.info('{0} atoms will be used for alignment.'.format(
            len(pdbselect)))
        # Model numbers are 1-based, coordinate set indices are 0-based.
        pdbselect.setACSIndex(model - 1)
        printRMSD(pdbselect, msg='Before alignment ')
        alignCoordsets(pdbselect)
        printRMSD(pdbselect, msg='After alignment  ')
        outfn = (prefix or pdb.getTitle()) + suffix + '.pdb'
        LOGGER.info('Writing file: ' + outfn)
        writePDB(outfn, pdb)
    else:
        pdbs = list(pdbs)
        reffn = pdbs.pop(0)
        seqid = _setting('seqid', 90)
        overlap = _setting('overlap', 90)
        LOGGER.info('Aligning structures onto: ' + reffn)
        ref = parsePDB(reffn)

        ref_sel = ref.select(selstr)
        if ref_sel:
            LOGGER.info('Selection {0} matched {1} atoms.'.format(
                repr(selstr), len(ref_sel)))
        else:
            raise ValueError('selection {0} did not match any atoms'.format(
                repr(selstr)))
        # Chain matching is only attempted when the reference selection
        # holds at least two atoms flagged 'ca'.
        match = ref_sel.numAtoms('ca') >= 2

        for arg in pdbs:
            if arg == reffn:
                continue
            LOGGER.info('Evaluating structure: ' + arg)
            pdb = parsePDB(arg)
            if match:
                result = matchAlign(pdb,
                                    ref,
                                    seqid=seqid,
                                    overlap=overlap,
                                    tarsel=selstr,
                                    allcsets=True,
                                    cslabel='Model',
                                    csincr=1)
                if result:
                    outfn = pdb.getTitle() + suffix + '.pdb'
                    LOGGER.info('Writing file: ' + outfn)
                    writePDB(outfn, pdb)
                    continue

            # Fallback: superpose directly when the selections of both
            # structures have the same length and the same atom names.
            pdb_sel = pdb.select(selstr)
            if pdb_sel is None:
                # Nothing selected in this structure, so there is nothing to
                # superpose; report it like any other alignment failure.
                LOGGER.warn('Failed to align structure ' + arg + '.')
                continue
            LOGGER.info('Selection {0} matched {1} atoms.'.format(
                repr(selstr), len(pdb_sel)))
            if (len(pdb_sel) == len(ref_sel)
                    and np_all(pdb_sel.getNames() == ref_sel.getNames())):
                printRMSD(ref_sel, pdb_sel, msg='Before alignment ')
                superpose(pdb_sel, ref_sel)
                printRMSD(ref_sel, pdb_sel, msg='After alignment  ')
                outfn = pdb.getTitle() + suffix + '.pdb'
                LOGGER.info('Writing file: ' + outfn)
                writePDB(outfn, pdb)
            else:
                LOGGER.warn('Failed to align structure ' + arg + '.')
def prune_pdb_models(pdb_models):
    '''
    Reduce the isolated structural models of a single PDB ID to a set of
    representative models and copy those into their own directory.

    Model files that do not exist are reported and handed to
    ``delete_model``.  Each remaining model is compared against the current
    representatives with ``prody.matchAlign``; a model whose RMSD to a
    representative is within the cutoff is redundant, and it replaces that
    representative when it has more residues.  The parent directory of every
    representative is copied into ``representative_pdb_models/`` located two
    levels above the first representative (an existing directory of that
    name is removed first).

    Arguments:
    pdb_models -- full list of pdb model file paths (iso)

    Returns:
    pruned_models -- list of paths of the representative models inside the
                     ``representative_pdb_models`` directory
    '''
    pruned_models = []

    # determine which files actually exist, delete parent dirs of those
    # that don't
    iso_pdb_models = []
    for model in pdb_models:
        if os.path.exists(model):
            iso_pdb_models.append(model)
        else:
            # single-string print() gives the same output on Python 2 and 3
            print('%s does not exist! Deleting parent directory.'
                  % os.path.basename(model))
            delete_model(model)

    # percent sequence overlap passed to matchAlign; sequence identity is
    # left at matchAlign's own default
    rep_overlap_cutoff = 50
    # models whose RMSD is at most this value share one representative
    rep_rmsd_cutoff = 5

    # find representative iso models
    print('Finding representative PDB ISO models...')
    rep_iso_models = []

    for iso_model in iso_pdb_models:
        # the first existing model is always a representative
        if not rep_iso_models:
            rep_iso_models.append(iso_model)
            continue

        model = prody.parsePDB(iso_model)
        redundant = False

        for rep_iso_model in rep_iso_models:
            rep = prody.parsePDB(rep_iso_model)

            alignment = prody.matchAlign(model,
                                         rep,
                                         overlap=rep_overlap_cutoff)
            if alignment is None:
                continue

            rmsd = prody.calcRMSD(alignment[1], alignment[2])
            if rmsd <= rep_rmsd_cutoff:
                # we already have a representative for this segment
                redundant = True
                # take the larger structure as the representative; the list
                # is only modified right before leaving the loop over it
                if model.numResidues() > rep.numResidues():
                    rep_iso_models.remove(rep_iso_model)
                    rep_iso_models.append(iso_model)
                break

        # a model matching none of the representatives becomes one itself
        if not redundant:
            rep_iso_models.append(iso_model)

    print('Found %d representative ISO models: %s' % (
        len(rep_iso_models),
        [os.path.basename(m) for m in rep_iso_models]))

    # copy representative models to their own directory
    if rep_iso_models:
        pdb_dir = os.path.abspath(
            os.path.join(rep_iso_models[0], os.pardir + '/' + os.pardir))
        rep_model_dir = pdb_dir + '/representative_pdb_models/'
        if os.path.exists(rep_model_dir):
            shutil.rmtree(rep_model_dir)
        os.mkdir(rep_model_dir)

        for rep_iso_model in rep_iso_models:
            rep_iso_model_pardir = os.path.abspath(
                os.path.join(rep_iso_model, os.pardir))
            model_dirname = os.path.basename(rep_iso_model_pardir)
            # rep_model_dir already ends with a separator
            shutil.copytree(rep_iso_model_pardir,
                            rep_model_dir + model_dirname)
            # path of the model file inside the copied directory
            pruned_models.append(rep_model_dir + model_dirname + '/' +
                                 os.path.basename(rep_iso_model))

    # return all representative pdb models
    return pruned_models
Ejemplo n.º 5
0
#debug
#pairs = pairs[:2]

def select_chains(atoms, chains):
    """Select the protein atoms of *atoms* that belong to the given chains.

    *chains* is joined into one string; every alphabetic character in it is
    taken as a chain identifier (upper-cased) and all other characters are
    ignored.  The result of ``atoms.select`` is returned unchanged.
    """
    chain_ids = [ch.upper() for ch in ''.join(chains) if ch.isalpha()]
    chain_terms = ' or '.join('chain ' + chain_id for chain_id in chain_ids)
    return atoms.select('protein and (' + chain_terms + ')')


# For every entry of ``pairs``: align the bound complex with its unbound
# counterpart and write the receptor and peptide chains to PDB files.
# Entries for which matchAlign gives no result are skipped silently.
for p in pairs:
    bound_pdb, bound_chn, peptide_chn, unbound_pdb, unbound_chn = p

    # Restrict both structures to the protein chains of interest.
    bound = select_chains(parsePDB(bound_pdb), bound_chn + peptide_chn)
    unbound = select_chains(parsePDB(unbound_pdb), unbound_chn)

    align_results = matchAlign(bound, unbound)
    if align_results:
        # Continue with the first item of the alignment result.
        bound = align_results[0]

        # Split the complex into receptor and peptide chains.
        bound_r = select_chains(bound, bound_chn)
        unbound_r = select_chains(unbound, unbound_chn)
        peptide = select_chains(bound, peptide_chn)

        # Receptor and peptide files are named after the unbound entry.
        writePDB('bound/%s.%s.pdb' % (bound_pdb, bound_chn), bound_r)
        writePDB('%s.receptor.pdb' % unbound_pdb, unbound_r)
        writePDB('%s.peptide.pdb' % unbound_pdb, peptide)

Ejemplo n.º 6
0
def generateTrainingSet(inputdict, distance, output=None, combineOutput=True):
    """Build per-residue training data from targets and their models.

    For every target in *inputdict*, NetSurfP and PSIPRED are run once on
    the target.  Every model of that target is aligned onto the target with
    ``prody.matchAlign``, receives the target's NetSurfP/PSIPRED data through
    ``copyDataFromTarget``, and is processed with STRIDE and DSSP.  One
    DataFrame per model is produced, indexed by the residue indices of the
    matched target atoms, with a ``ClassLabel`` column set to 1 where the
    matched target/model distance is below *distance* and 0 elsewhere.

    Models that cannot be parsed or cannot be aligned onto the target are
    reported and skipped.

    :arg inputdict: mapping of target structure to an iterable of model
        filenames
    :arg distance: distance threshold for the class label; must be truthy
    :arg output: optional CSV filename; only written when *combineOutput*
        is true
    :arg combineOutput: return one concatenated DataFrame when true,
        otherwise a list with one DataFrame per model
    :returns: a DataFrame (empty when no model was processed) or a list of
        DataFrames, depending on *combineOutput*
    :raises subprocess.CalledProcessError: when one of the external tool
        checks exits with a non-zero status
    :raises AssertionError: when *distance* is falsy and *inputdict* is not
        empty
    """

    # Fail early if any required external program cannot be executed.
    with open(os.devnull, 'w') as devnull:
        for command in ('dssp --version', 'stride -h', 'netsurfp -h',
                        'runpsipred'):
            subprocess.check_call(command, shell=True, stdout=devnull,
                                  stderr=devnull)

    # (output column, per-atom data label) pairs for the values copied from
    # the target; the order fixes the column order of the result.
    copiedColumns = (
        ('NetSurfP_exp', 'netsurfp_exposure'),
        ('NetSurfP_asa', 'netsurfp_asa'),
        ('NetSurfP_rsa', 'netsurfp_rsa'),
        ('NetSurfP_alpha', 'netsurfp_alphascore'),
        ('NetSurfP_beta', 'netsurfp_betascore'),
        ('NetSurfP_coil', 'netsurfp_coilscore'),
        ('PSIPRED_ss', 'psipred_ss'),
        ('PSIPRED_coilscore', 'psipred_coilscore'),
        ('PSIPRED_helixscore', 'psipred_helixscore'),
        ('PSIPRED_strandscore', 'psipred_strandscore'),
    )

    # One DataFrame per processed model; combined once at the end instead of
    # re-concatenating the growing result for every model.
    frames = []

    for target, models in inputdict.items():

        targetPDB = prody.parsePDB(target)
        assert distance, "Distance is not valid"

        tempdir = tempfile.mkdtemp()
        try:
            # we don't want to run NetSurfP and PSIPRED over and over again
            # for all model structures. we compute them for target structure
            # and just reuse on model structures
            netsurfp.parseNetSurfP(
                netsurfp.execNetSurfP(target, outputdir=tempdir), targetPDB)

            psipred.parsePSIPRED(
                psipred.execPSIPRED(target, outputdir=tempdir), targetPDB)

            for modelFilename in models:
                modelPDB = prody.parsePDB(modelFilename)

                if not modelPDB:
                    print('Model file %s cannot be parsed, skipping...' %
                          modelFilename)
                    continue

                # if model has no chainID, let's assign one. That makes
                # STRIDE parser happy. The size check keeps models with
                # several chain IDs from hitting an ambiguous array
                # truth-value error.
                chids = np.unique(modelPDB.getChids())
                if chids.size == 1 and chids[0] == ' ':
                    modelPDB.all.setChids('A')
                    modelFilename = os.path.join(
                        tempdir, os.path.basename(modelFilename))

                    modelFilename = prody.writePDB(modelFilename,
                                                   modelPDB,
                                                   autoext=False)

                # superimpose model onto target structure
                match = prody.matchAlign(modelPDB, targetPDB, tarsel='calpha',
                                         seqid=50, overlap=20)
                if match is None:
                    print('Model file %s cannot be aligned onto target, '
                          'skipping...' % modelFilename)
                    continue
                mapmodel = match[1]
                maptarget = match[2]
                modelIndices = mapmodel.getResindices()
                targetIndices = maptarget.getResindices()

                # and copy NetSurfP and PSIPRED data from target to model
                copyDataFromTarget(targetPDB, modelPDB)

                datadict = {}

                # run STRIDE; its secondary structure must be read before
                # DSSP is parsed onto the same model below
                prody.parseSTRIDE(
                    prody.execSTRIDE(modelFilename, outputdir=tempdir),
                    modelPDB)
                datadict['STRIDEarea'] = _mappedData(
                    modelPDB, 'stride_area', modelIndices, targetIndices)
                datadict['STRIDEss'] = _mappedSecstrs(mapmodel, targetIndices)

                # run DSSP
                prody.parseDSSP(
                    prody.execDSSP(modelFilename, outputdir=tempdir),
                    modelPDB)
                datadict['DSSPacc'] = _mappedData(
                    modelPDB, 'dssp_acc', modelIndices, targetIndices)
                datadict['DSSPss'] = _mappedSecstrs(mapmodel, targetIndices)

                # save NetSurfP and PSIPRED data
                for column, label in copiedColumns:
                    datadict[column] = _mappedData(
                        modelPDB, label, modelIndices, targetIndices)

                # Compute class labels based on the distance argument
                datadict['ClassLabel'] = pd.Series(
                    (np.abs(prody.calcDistance(maptarget.copy(),
                                               mapmodel.copy()))
                     < distance).astype(int),
                    index=targetIndices)

                frames.append(pd.DataFrame(datadict))
        finally:
            # remove temporary directory even when a tool or parser fails
            shutil.rmtree(tempdir, ignore_errors=True)

    if combineOutput:
        # pd.concat rejects an empty list, so keep the empty-frame result
        finalData = pd.concat(frames) if frames else pd.DataFrame()
    else:
        finalData = frames

    if output:
        if combineOutput:
            finalData.to_csv(output, index=False, quoting=csv.QUOTE_NONNUMERIC)
        else:
            print('Warning! Output must be combined to be saved into a CSV '
                  'file')

    return finalData


def _mappedData(modelPDB, label, modelIndices, targetIndices):
    """Return the *label* data of the model's ``ca`` atoms at *modelIndices*
    as a Series indexed by *targetIndices*."""
    return pd.Series(modelPDB.ca.getData(label)[modelIndices],
                     index=targetIndices)


def _mappedSecstrs(mapmodel, targetIndices):
    """Return the secondary structure of *mapmodel* as a Series indexed by
    *targetIndices*, with empty assignments replaced by ``'-'``."""
    ss = pd.Series(mapmodel.getSecstrs(), index=targetIndices)
    ss[ss == ''] = '-'  # empty strings cause trouble in csv load/save
    return ss
Ejemplo n.º 7
0
def find_rep_gene_iso_models(hi_res_iso_models, lo_res_iso_models,
                             rep_rmsd_cutoff):
    '''
    Pick representative iso gene models from a pool of representative pdb
    iso models. Uses a greedy algorithm to cover as much of the gene sequence
    as possible using first high resolution models and then filling any gaps
    with low resolution models.

    A model is only considered when its sequence coverage (as returned by
    ``get_seq_range``) has at least 20 entries. It becomes a representative
    when it is the first one, when it adds at least 10 residues not covered
    so far, or when no existing representative aligns to it with an RMSD
    within rep_rmsd_cutoff.

    Arguments:
    hi_res_iso_models -- high resolution model files
    lo_res_iso_models -- low resolution model files
    rep_rmsd_cutoff -- RMSD at or below which two aligned models are
                       treated as the same conformation

    Returns:
    rep_gene_iso_models -- list of representative model files, in the order
                           they were accepted
    '''
    rep_gene_iso_models = []
    min_length = 20
    num_new_residue_cutoff = 10
    rep_overlap_cutoff = 10

    # tag each model with its sequence coverage
    hi_res_iso_models = [[m, get_seq_range(m)] for m in hi_res_iso_models]
    lo_res_iso_models = [[m, get_seq_range(m)] for m in lo_res_iso_models]

    # longest coverage first within each resolution class; all high
    # resolution models come before any low resolution model
    def by_coverage(tagged):
        return -len(tagged[1])

    sorted_iso_models = (sorted(hi_res_iso_models, key=by_coverage) +
                         sorted(lo_res_iso_models, key=by_coverage))

    # residues covered by the representatives accepted so far
    gene_coverage = set()

    for model_file, model_coverage in sorted_iso_models:
        # discard structures that have too few residues
        if len(model_coverage) < min_length:
            continue

        num_new_residues = len(model_coverage) - len(
            set(model_coverage) & gene_coverage)

        # the first model, or one contributing enough new residues, is
        # accepted without any structural comparison
        if (rep_gene_iso_models
                and num_new_residues < num_new_residue_cutoff):
            # otherwise it must have a conformation not represented yet
            model_struct = prody.parsePDB(model_file)
            redundant = False
            for rep_gene_iso_model in rep_gene_iso_models:
                rep_struct = prody.parsePDB(rep_gene_iso_model)
                # calc RMSD between model and rep
                alignment = prody.matchAlign(model_struct,
                                             rep_struct,
                                             overlap=rep_overlap_cutoff)
                if alignment is not None:
                    rmsd = prody.calcRMSD(alignment[1], alignment[2])
                    if rmsd <= rep_rmsd_cutoff:
                        # we already have a representative for this segment
                        redundant = True
                        break
            if redundant:
                continue

        rep_gene_iso_models.append(model_file)
        gene_coverage.update(model_coverage)

    return rep_gene_iso_models
Ejemplo n.º 8
0
def prody_align(*pdbs, **kwargs):
    """Align models in a PDB file or multiple structures in separate PDB files.
    By default, protein chains will be matched based on selected atoms and
    alignment will be performed based on matching residues.  If non-protein
    atoms are selected and selected atoms match in multiple structures,
    they will be used for alignment.

    :arg pdbs: PDB identifier(s) or filename(s)

    :arg select: atom selection string, default is :term:`calpha`,
        see :ref:`selections`

    :arg model: for NMR files, reference model index, default is ``1``

    :arg seqid: percent sequence identity, default is ``90``

    :arg overlap: percent sequence overlap, default is ``90``

    :arg prefix: prefix for output file, default is PDB filename

    :arg suffix: output filename suffix, default is :file:`_aligned`

    :arg subparser: optional object whose ``error`` method is used to report
        a selection that matches no atoms in single-file mode

    :raises ValueError: when *select* matches no atoms in the single
        structure (and no *subparser* is given) or in the reference"""

    # Imported under another name so the builtin ``all`` is not shadowed.
    from numpy import all as np_all
    from prody import LOGGER, writePDB, parsePDB
    from prody import alignCoordsets, printRMSD, matchAlign, superpose

    def _setting(name, default):
        # A missing keyword and an explicit ``None`` both fall back to the
        # documented default instead of propagating ``None``.
        value = kwargs.get(name)
        return default if value is None else value

    selstr = kwargs.get('select', 'calpha')
    suffix = kwargs.get('suffix', '_aligned')
    if len(pdbs) == 1:
        pdb = pdbs[0]
        LOGGER.info('Aligning multiple models in: ' + pdb)
        prefix = kwargs.get('prefix')
        model = _setting('model', 1)
        pdb = parsePDB(pdb)
        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            subparser = kwargs.get('subparser')
            if subparser:
                subparser.error('Selection {0} do not match any atoms.'
                                .format(repr(selstr)))
            else:
                raise ValueError('select does not match any atoms')
        LOGGER.info('{0} atoms will be used for alignment.'
                    .format(len(pdbselect)))
        # Model numbers are 1-based, coordinate set indices are 0-based.
        pdbselect.setACSIndex(model - 1)
        printRMSD(pdbselect, msg='Before alignment ')
        alignCoordsets(pdbselect)
        printRMSD(pdbselect, msg='After alignment  ')
        outfn = (prefix or pdb.getTitle()) + suffix + '.pdb'
        LOGGER.info('Writing file: ' + outfn)
        writePDB(outfn, pdb)
    else:
        pdbs = list(pdbs)
        reffn = pdbs.pop(0)
        seqid = _setting('seqid', 90)
        overlap = _setting('overlap', 90)
        LOGGER.info('Aligning structures onto: ' + reffn)
        ref = parsePDB(reffn)

        ref_sel = ref.select(selstr)
        if ref_sel:
            LOGGER.info('Selection {0} matched {1} atoms.'
                        .format(repr(selstr), len(ref_sel)))
        else:
            raise ValueError('selection {0} did not match any atoms'
                             .format(repr(selstr)))
        # Chain matching is only attempted when the reference selection
        # holds at least two atoms flagged 'ca'.
        match = ref_sel.numAtoms('ca') >= 2

        for arg in pdbs:
            if arg == reffn:
                continue
            LOGGER.info('Evaluating structure: ' + arg)
            pdb = parsePDB(arg)
            if match:
                result = matchAlign(pdb, ref, seqid=seqid, overlap=overlap,
                                    tarsel=selstr, allcsets=True,
                                    cslabel='Model', csincr=1)
                if result:
                    outfn = pdb.getTitle() + suffix + '.pdb'
                    LOGGER.info('Writing file: ' + outfn)
                    writePDB(outfn, pdb)
                    continue

            # Fallback: superpose directly when the selections of both
            # structures have the same length and the same atom names.
            pdb_sel = pdb.select(selstr)
            if pdb_sel is None:
                # Nothing selected in this structure, so there is nothing to
                # superpose; report it like any other alignment failure.
                LOGGER.warn('Failed to align structure ' + arg + '.')
                continue
            LOGGER.info('Selection {0} matched {1} atoms.'
                        .format(repr(selstr), len(pdb_sel)))
            if (len(pdb_sel) == len(ref_sel) and
                    np_all(pdb_sel.getNames() == ref_sel.getNames())):
                printRMSD(ref_sel, pdb_sel, msg='Before alignment ')
                superpose(pdb_sel, ref_sel)
                printRMSD(ref_sel, pdb_sel, msg='After alignment  ')
                outfn = pdb.getTitle() + suffix + '.pdb'
                LOGGER.info('Writing file: ' + outfn)
                writePDB(outfn, pdb)
            else:
                LOGGER.warn('Failed to align structure ' + arg + '.')