Example #2
def get_pdb_from_remote_or_db(pdb_id, selection, source_folder=""):
    """
    If called without "source_folder" the behaviour of this function is actually that
    of 'get_pdb'. If "source_folder" is defined (a user-defined folder) Prody will look
    the availability of the pdb there before trying to download it from rcbs pdb (slower).  
    """
    prody.pathPDBFolder(source_folder)
    return get_pdb(pdb_id, selection)
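
A hypothetical call, assuming the get_pdb() helper from the surrounding module; with source_folder set, the cached copy is used before any download:

atoms = get_pdb_from_remote_or_db('1ubi', 'protein',  # illustrative ID/selection
                                  source_folder='./pdb_cache')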
Example #3

def get_pdb_chain(pdbid, chain, as_object=False, folder="results", local_path=None):
    """Parse a PDB ID and chain name, then write out that specific chain.

    Also checks for the non-standard amino acid MSE and renames it to MET.
    To parse PDBs from a local folder, set the path in *local_path*.

    pdbid : a PDB identifier or a filename
    chain : the PDB chain identifier
    as_object : if True, return a ProDy object instead of writing a file
    folder : **path** folder in which results are stored
    local_path : **path** to a local folder
    """

    folder_result = make_folder(folder_result=folder)

    if local_path is not None:
        if os.path.isdir(local_path):
            pdbs = prody.findPDBFiles(path=local_path)
            parse = prody.parsePDB(pdbs[pdbid])
        else:
            raise IOError("{0} is not a valid path".format(local_path))
    else:
        prody.pathPDBFolder(folder=folder_result)
        parse = prody.parsePDB(pdbid)
    protein = parse.select("protein")
    p_chain = protein.select("chain %s" % chain)
    if p_chain is None:
        return

    hv = p_chain.getHierView()
    hvc = hv[chain]

    # Rename the non-standard selenomethionine (MSE) residues to MET
    for r in hvc:
        if r.getResname() == "MSE":
            r.setResname("MET")

    if as_object:
        return hvc
    return prody.writePDB(folder_result + "/%s_%s" % (pdbid, chain), hvc)
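
A hypothetical usage sketch (the PDB ID is illustrative; make_folder() comes from the surrounding module):

# Write chain A of 1ubi to ./results and get back the written file path
out_path = get_pdb_chain('1ubi', 'A')
# Or keep the ProDy chain object instead of writing a file
ubq_chain = get_pdb_chain('1ubi', 'A', as_object=True)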
Example #4
	def compare():
		# Get PDB files from the data bank associated with each protein for later use.

		# Point ProDy at a folder that holds all downloaded PDB files
		# (created on first use if it does not exist).
		prody.pathPDBFolder(wd + '/challengedata/PDBfiles')

		# Collect the names of the protein directories that need to be downloaded.
		weeks = []
		for _, dirnames, _ in os.walk(wd + '/challengedata'):
			for name in dirnames:
				if name in ('latest.txt', 'answers', 'rdkit-scripts'):
					continue
				if name not in weeks:
					weeks.append(name)
		proteins = [x for x in weeks if 'celpp' not in x]

		# Download each PDB with ProDy.
		for x in proteins:
			if x in ('rdkit-scripts', 'PDBfiles', 'answers'):
				continue
			protein = prody.fetchPDB(x)
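
fetchPDB() honors the folder registered with pathPDBFolder(), so repeated calls reuse the cached copy; a minimal sketch (the folder and PDB ID are illustrative):

prody.pathPDBFolder('/tmp/PDBfiles')  # hypothetical cache folder
path = prody.fetchPDB('1ubi')         # downloads once, then serves the cached file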
Example #5

import os
import time
import datetime

import prody as pd
import rhapsody as rd
from prody import LOGGER

import protocols

DEBUG_MODE = False

LOGGER._setprefix('')
LOGGER.info(f'Started on   {datetime.datetime.now()}')
LOGGER.info('')

# set PDB folder
old_verbosity = LOGGER.verbosity
LOGGER._setverbosity('none')
home_dir = os.environ['HOME']
pdb_dir = os.path.join(home_dir, 'PDBs')
if not os.path.isdir(pdb_dir):
    os.mkdir(pdb_dir)
pd.pathPDBFolder(pdb_dir)
LOGGER._setverbosity(old_verbosity)

# check Rhapsody installation
rd.initialSetup()

if DEBUG_MODE:
    time.sleep(5)
else:
    # run appropriate protocol
    if os.path.isfile('input-sm_query.txt'):
        # perform saturation mutagenesis
        rh = protocols.sat_mutagen()
    elif os.path.isfile('input-batch_query.txt'):
        # analyse batch query
        rh = protocols.batch_query()
    """
    pattern = domain + r"\s+(?P<pdbid>\S{4})\s+(?P<desc>\S+)"
    m = re.search(pattern, ASTRAL_FILE_DATA)
    print(domain, m.group('pdbid'), m.group('desc'))
    return m.group('pdbid'), m.group('desc')


def main():
    d = torch.load(args.input_pn_dict)

    for pnid, data in d.items():
        try:
            pdb_id, model_id, chain_id = pnid.split("_")
        except ValueError:
            print(pnid)
            continue
        print(pdb_id, model_id, chain_id)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Parses the ProteinNet dictionary for PDB IDs so they may be "
                                                 "downloaded and processed for the all-atom ProteinTransformer.")
    parser.add_argument('input_pn_dict', type=str, help='Path to PN-parsed dictionary file')
    parser.add_argument("--pdb_dir", default="/home/jok120/pdb/", type=str, help="Path for ProDy-downloaded PDB files.")
    args = parser.parse_args()
    with open(ASTRAL_FILE, "r") as f:
        ASTRAL_FILE_DATA = f.read()
    pr.pathPDBFolder(args.pdb_dir)
    main()

Example #7
def get_relative_solvent_accessibility(pdb_id,
                                       residue_mapper,
                                       chain,
                                       full_pdb_solvent_accessibility=True,
                                       aa_surface_area=AA_SA_VOL):
    """
    Run DSSP on a PDB file and return the resulting AtomGroup
    
    Parameters
    ----------
    pdb_id
        String containing PDB ID
    residue_mapper
        Dictionary of residue - unitprot mappings
    chain
        String containing the selected chain ID(s) from the residue mapper
    full_pdb_solvent_accessibility
        Boolean to use the full PDB for solvent accessibility calculations -- otherwise
        only the chain residues will be selected. Default is True.
    aa_surface_area
        Dictionary with amino acid abbreviations as keys and surface area 
        calculations as values

    Returns
    -------
    a numpy array containing relative solvent accessibility measurement for residues
    """

    if full_pdb_solvent_accessibility:
        dssp_chain = None
    else:
        dssp_chain = chain

    with tempfile.TemporaryDirectory() as tdir:
        pdb_file = os.path.join(tdir, '.'.join([pdb_id, 'pdb']))
        dssp_file = os.path.join(tdir, '.'.join([pdb_id, 'dssp']))

        # DSSP doesn't work with CIF-based atom groups, so must re-run here
        pd.pathPDBFolder(tdir)
        structure = pd.parsePDB(pdb_id, chain=dssp_chain)

        # Must write PDB file for DSSP with only chain selections
        # TODO how to silence output from the DSSP functions
        pd.writePDB(pdb_file, structure)
        pd.execDSSP(pdb_file, outputdir=tdir)
        pd.parseDSSP(dssp_file, structure)

    # Gather results
    # There should not be missing residues
    mapped_residue_list = list(residue_mapper.keys())
    mapped_residue_list = ' '.join([str(x) for x in mapped_residue_list])

    selection_string = f"resnum {mapped_residue_list}"
    if dssp_chain is not None:
        selection_string += f" and chain {chain}"

    iter_resi_list = sorted(
        set(structure.select(selection_string).getResnums()))
    rel_acc_list = list()

    for resi in iter_resi_list:
        dssp_resi = structure[(chain, resi)]
        surface_accessibility = dssp_resi.getData('dssp_acc')[0]
        resn = dssp_resi.getResname()
        rel_surface_accessibility = surface_accessibility / aa_surface_area[resn]
        rel_acc_list.append(rel_surface_accessibility)

    return np.array(rel_acc_list)
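
A hypothetical call, assuming residues 10 and 11 of chain A were mapped to a UniProt entry (the PDB ID, residue numbers, and accession are illustrative):

residue_mapper = {10: 'P0CG48', 11: 'P0CG48'}  # hypothetical residue-UniProt map
rsa = get_relative_solvent_accessibility('1ubi', residue_mapper, 'A')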
Example #8
import sys
import os
import numpy as np
import prody as pd

# check if rhapsody can be imported correctly
sys.path.append('../../')
import rhapsody as rd

# set folders
if not os.path.isdir('workspace'):
    os.mkdir('workspace')
old_rhaps_dir = rd.pathRhapsodyFolder()
old_EVmut_dir = rd.pathEVmutationFolder()
old_prody_dir = pd.pathPDBFolder()
rd.pathRhapsodyFolder('./workspace')
rd.pathEVmutationFolder('./data')
pd.pathPDBFolder('./data')

# test cases
test_SAVs = [
    'O00294 496 A T',  # "good" SAV where all features are well-defined
    'O00238 31 R H'  # "bad" SAV with no PDB structure (but has Pfam domain)
]

# initialize a rhapsody object
rh = rd.Rhapsody()

# import precomputed PolyPhen-2 output file
rh.importPolyPhen2output('data/pph2-full.txt')
Example #9
                        help="Path for ProDy-downloaded PDB files.")
    parser.add_argument('--training_set',
                        type=int,
                        default=100,
                        help='Which thinning of the training set to parse. '
                        '{30,50,70,90,95,100}. Default 100.')
    args = parser.parse_args()

    VALID_SPLITS = [10, 20, 30, 40, 50, 70, 90]
    TRAIN_FILE = f"training_{args.training_set}.pt"
    PN_TRAIN_DICT, PN_VALID_DICT, PN_TEST_DICT = None, None, None
    # Combined previous versions of dir.des.scope.2.xx-stable.txt into one big dict
    ASTRAL_FILE = "../data/proteinnet/astral_pdb_map.txt"
    ASTRAL_ID_MAPPING = parse_astral_summary_file(ASTRAL_FILE)
    SUFFIX = str(
        datetime.datetime.today().strftime("%y%m%d")) + f"_{args.training_set}"
    match = re.search(r"casp\d+", args.input_dir, re.IGNORECASE)
    assert match, "The input_dir is not titled with 'caspX'."
    CASP_VERSION = match.group(0)

    pr.pathPDBFolder(args.pdb_dir)  # Set PDB download location
    np.set_printoptions(
        suppress=True)  # suppresses scientific notation when printing
    np.set_printoptions(
        threshold=sys.maxsize)  # suppresses '...' when printing

    try:
        main()
    except Exception as e:
        ERRORS.summarize()
        raise e
Example #10
    def alignment_monstrosity(self,
                              rmsd_cutoff=0.5,
                              use_local_pdb_database=False,
                              verify_substructure=True):
        """
        Consequences of not thinking ahead...
        For each fragment, align all fragment-containing ligands to fragment
        Generate PDBs with aligned coordinate systems
        :param args:
        :param rmsd_cutoff: fragment alignment RMSD cutoff, anything higher gets rejected
        :return:
        """

        # Load the set of previously rejected PDBs
        rejected_dict = self.load_previously_rejected_pdbs()

        # Create directories...
        if not use_local_pdb_database:
            os.makedirs(self.pdb_bank_dir, exist_ok=True)
        os.makedirs(self.processed_PDBs_path, exist_ok=True)

        # If use_local_pdb_database=False, use PDB FTP to download all structures
        # Otherwise, all relevant structures should be found in the local PDB database
        if not use_local_pdb_database:
            prody.pathPDBFolder(folder=self.pdb_bank_dir)

            for current_fragment in self.pdb_ligand_json:

                # Only download PDBs that aren't already in PDB bank directory
                existing_PDBs = [
                    pdb[:4].lower() for pdb in os.listdir(self.pdb_bank_dir)
                ]
                PDBs_to_download = list(
                    set(self.pdb_ligand_json[current_fragment]['PDBs']) -
                    set(existing_PDBs))

                if len(PDBs_to_download) > 0:
                    print(f'Downloading PDBs for {current_fragment}...\n')
                    prody.fetchPDBviaFTP(*PDBs_to_download)
                else:
                    print(
                        f'All relevant PDBs for {current_fragment} found in {self.pdb_bank_dir}!\n'
                    )

        # Fragment_1, Fragment_2, ...
        for current_fragment in self.pdb_ligand_json:

            # Create directory for processed PDBs
            processed_dir = os.path.join(self.processed_PDBs_path,
                                         current_fragment)
            processed_dir_exists = os.path.exists(processed_dir)
            os.makedirs(processed_dir, exist_ok=True)

            # Get list of already processed PDBs for current_fragment
            already_processed_pdbs = [
                file[:4].lower() for file in os.listdir(processed_dir)
            ]

            # Save ideal_ligand_containers for each fragment so things are only downloaded once
            ideal_ligand_dict = dict()
            ideal_ligand_dict['Ligands'] = dict()
            ideal_ligand_dict['Failed'] = list()

            # Align_PDB class holds all information for the current fragment
            align = Align_PDB(self.user_defined_dir,
                              current_fragment,
                              self.sanitized_smiles_dict[current_fragment],
                              verify_substructure=verify_substructure)

            # Get PDB IDs that are viable for extracting protein-fragment contacts
            reject_pdbs = rejected_dict[current_fragment] if current_fragment in rejected_dict else list()
            if not processed_dir_exists:
                reject_pdbs = list()
            reject_pdbs.append('3k87')  # DEBUGGING

            viable_pdbs = list(
                set(self.pdb_ligand_json[current_fragment]['PDBs']) -
                set(reject_pdbs) - set(already_processed_pdbs))

            # For each PDB containing a fragment-containing compound
            for pdbid in viable_pdbs:

                # Return path of PDB file to use for processing
                found_pdb, pdb_path = self.return_PDB_to_use_for_alignments(
                    pdbid, use_local_pdb_database=use_local_pdb_database)

                if not found_pdb:
                    print(f'Cannot find {pdbid}!')
                    continue

                # Proceed with processing if the current PDB passes all filters
                print("\n\nProcessing {}...".format(pdbid))

                # --- Check which ligands contain relevant fragments --- #

                relevant_ligands = self.return_substructure_containing_ligands(
                    pdb_path, self.pdb_ligand_json, current_fragment)

                # Set things up! Get ligands from Ligand Expo if haven't already tried and failed
                for ligand in relevant_ligands:

                    if not ideal_ligand_dict['Ligands'].get(
                            ligand
                    ) and ligand not in ideal_ligand_dict['Failed']:
                        ideal_ligand_container = Ideal_Ligand_PDB_Container(
                            ligand)

                        if ideal_ligand_container.success:
                            ideal_ligand_dict['Ligands'][
                                ligand] = ideal_ligand_container
                        else:
                            ideal_ligand_dict['Failed'].append(ligand)

                # Create a temp list for ligands that will be pulled from the current PDB
                ligand_container_dict_for_current_pdb = {
                    lig: ideal_ligand_dict['Ligands'][lig]
                    for lig in ideal_ligand_dict['Ligands']
                    if lig in relevant_ligands
                }
                relevant_ligands_prody_dict = align.extract_ligand_records(
                    pdb_path, ligand_container_dict_for_current_pdb)

                # Reject if no ligands with all atoms represented can be found for the given PDB
                if len(relevant_ligands_prody_dict) < 1:
                    if current_fragment in rejected_dict.keys():
                        rejected_dict[current_fragment].append(pdbid)
                    else:
                        rejected_dict[current_fragment] = [pdbid]
                    print(
                        'REJECTED - no target ligands were fully represented in the PDB'
                    )
                    continue

                # --- Perform alignment of PDB fragment substructure (mobile) onto defined fragment (target) --- #

                # ...if PDB has not been processed, rejected, or excluded by the user

                else:

                    # Iterate over ligands found to contain fragments as substructures
                    for ligand_resname, ligand_chain, ligand_resnum in relevant_ligands_prody_dict:

                        # Mapping of fragment atoms to target ligand atoms
                        target_ligand_ideal_smiles = ligand_container_dict_for_current_pdb[
                            ligand_resname].smiles

                        # todo: catch ligands with missing SMILES strings earlier...
                        if target_ligand_ideal_smiles is None:
                            continue

                        target_ligand_pdb_string = io.StringIO()
                        target_ligand_prody = relevant_ligands_prody_dict[(
                            ligand_resname, ligand_chain,
                            ligand_resnum)].select('not hydrogen')
                        prody.writePDBStream(target_ligand_pdb_string,
                                             target_ligand_prody)

                        mapping_successful, fragment_target_map = align.fragment_target_mapping(
                            target_ligand_ideal_smiles,
                            target_ligand_pdb_string)

                        if not mapping_successful:
                            if current_fragment in rejected_dict.keys():
                                rejected_dict[current_fragment].append(pdbid)
                            else:
                                rejected_dict[current_fragment] = [pdbid]
                            print(
                                'REJECTED - failed atom mapping between target and reference fragment'
                            )
                            continue

                        print(
                            f'\n{len(fragment_target_map)} possible mapping(s) of fragment onto {pdbid}:{ligand_resname} found...\n'
                        )

                        # Iterate over possible mappings of fragment onto current ligand
                        rmsd_success = False
                        for count, mapping in enumerate(fragment_target_map):

                            # todo: refactor to use RDKit's atom.GetMonomerInfo() for atom selections...
                            # Determine translation vector and rotation matrix
                            target_coords_and_serials, frag_atom_coords, transformation_matrix = align.determine_rotation_and_translation(
                                mapping, target_ligand_prody)
                            trgt_atom_coords, target_fragment_atom_serials = target_coords_and_serials

                            # Apply the transformation to the protein-ligand complex if the RMSD is below the cutoff.
                            # Use information from the PubChem fragment SMILES to determine correct mappings:
                            # map the fragment onto the source ligand and use valence information.
                            rmsd = prody.calcRMSD(
                                frag_atom_coords,
                                prody.applyTransformation(
                                    transformation_matrix, trgt_atom_coords))
                            print(
                                'RMSD of target onto reference fragment:\t{}'.
                                format(rmsd))

                            if rmsd < rmsd_cutoff:
                                transformed_pdb = align.apply_transformation(
                                    pdb_path, ligand_resnum,
                                    target_fragment_atom_serials,
                                    transformation_matrix)

                                # Skip if nothing remains after removing the ligand from the transformed PDB
                                if transformed_pdb.select(
                                        f'not (resname {ligand_resname})'
                                ) is None:
                                    continue

                                transformed_pdb_name = f'{pdbid}_{ligand_resname}_{ligand_chain}_{ligand_resnum}-{count}.pdb'
                                prody.writePDB(
                                    os.path.join(processed_dir,
                                                 transformed_pdb_name),
                                    transformed_pdb)
                                rmsd_success = True

                            else:
                                print(
                                    'REJECTED - high RMSD upon alignment to reference fragment'
                                )

                        if rmsd_success is False:
                            if current_fragment in rejected_dict.keys():
                                rejected_dict[current_fragment].append(pdbid)
                            else:
                                rejected_dict[current_fragment] = [pdbid]

        # Remember rejected PDBs
        with open(self.rejected_dict_pickle, 'wb') as reject_pickle:
            pickle.dump(rejected_dict, reject_pickle)
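
The accept/reject core above reduces to superposing matched atom sets and thresholding the RMSD. A minimal sketch with plain ProDy calls, assuming mobile and target are selections with matched atoms (the function name is an assumption; the 0.5 Å default mirrors the method above):

import prody

def aligns_within_cutoff(mobile, target, rmsd_cutoff=0.5):
    # Least-squares superposition of mobile onto target
    t = prody.calcTransformation(mobile, target)
    # applyTransformation moves mobile in place and returns it
    rmsd = prody.calcRMSD(target, prody.applyTransformation(t, mobile))
    return rmsd < rmsd_cutoff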
Example #11
__date__ = "December 2019"
__maintainer__ = "Luca Ponzoni"
__email__ = "*****@*****.**"
__status__ = "Production"

# temporarily switch to new set of folders
if not os.path.isdir('workspace'):
    os.mkdir('workspace')
if not os.path.isdir('workspace/pickles'):
    os.mkdir('workspace/pickles')
old_rhaps_dir = pd.SETTINGS.get('rhapsody_local_folder')
old_EVmut_dir = pd.SETTINGS.get('EVmutation_local_folder')
old_prody_dir = pd.SETTINGS.get('pdb_local_folder')
pd.SETTINGS['rhapsody_local_folder'] = os.path.abspath('./workspace')
pd.SETTINGS['EVmutation_local_folder'] = os.path.abspath('./data')
pd.pathPDBFolder('./data')

# test cases
test_SAVs = [
    'O00294 496 A T',  # "good" SAV where all features are well-defined
    'O00238 31 R H'  # "bad" SAV with no PDB structure (but has Pfam domain)
]

# initialize a rhapsody object
rh = rd.Rhapsody()

# import precomputed PolyPhen-2 output file
rh.importPolyPhen2output('data/pph2-full.txt')

# we would like to compute all features
rh.setFeatSet('all')
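
The folders saved at the top of this example can be restored afterwards through the same calls; a minimal sketch:

# Restore the original folders saved above
pd.SETTINGS['rhapsody_local_folder'] = old_rhaps_dir
pd.SETTINGS['EVmutation_local_folder'] = old_EVmut_dir
if old_prody_dir:
    pd.pathPDBFolder(old_prody_dir)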