def download_pdbs(base_dir, protein_codes):
    """
    Downloads the PDB database (or a part of it). Every protein is stored in
    its own directory (named after the upper-cased PDB code) under base_dir.
    The files are fetched with ``PDBList.retrieve_pdb_file`` using that
    method's default file format.

    :param base_dir: where to download all the proteins.
    :param protein_codes: the PDB codes of the proteins that should be
        downloaded. Either a collection of codes, or a dict whose values are
        lists of codes (the lists of all keys are merged).
    """
    from Bio.PDB import PDBList

    if isinstance(protein_codes, dict):
        prot_codes = []
        for codes in protein_codes.values():
            prot_codes += codes
    else:
        prot_codes = protein_codes
    # Drop duplicates so every structure is requested only once.
    prot_codes = list(set(prot_codes))

    failed = 0
    attempted = len(prot_codes)
    for code in prot_codes:
        try:
            pl = PDBList(pdb=os.path.join(base_dir, code.upper()))
            pl.flat_tree = 1
            path = pl.retrieve_pdb_file(pdb_code=code)
        except IOError:
            path = None
        # NOTE(review): Biopython appears to report a failed download by
        # printing a message and returning normally instead of raising
        # (confirm for the installed version). Checking that the returned
        # path really exists keeps the failure count honest in either case.
        if not path or not os.path.isfile(path):
            log.warning("Failed to download protein {}".format(code))
            failed += 1
    log.info("Downloaded {0}/{1} molecules".format(attempted - failed, attempted))
# Synchronise the local mmCIF directory with the list of entries reported by
# PDBList.get_all_entries(): download the entries that are missing locally and
# remove the local files whose code is no longer in that list.
from glob import glob
from os import remove
from os.path import basename, dirname, join, splitext

from Bio.PDB import PDBList

from configure import configure

config = configure()

# The mirror is the directory that contains the configured "cif" path.
pdb_dir = dirname(config["cif"])
pl = PDBList(pdb=pdb_dir)
pl.flat_tree = True

existingCifs = glob(join(pdb_dir, "*.cif"))
# Strip only the trailing extension; str.replace would remove every ".cif"
# occurrence in the name and later point remove() at the wrong path.
existingPdbs = set(splitext(basename(cif))[0] for cif in existingCifs)

allPdbs = set(pdb_code.lower() for pdb_code in pl.get_all_entries())
if not allPdbs:
    # An empty entry list would mark every local file for deletion, so stop
    # before touching the mirror.
    raise SystemExit("PDB entry list is empty; refusing to modify " + pdb_dir)

pdb2delete = existingPdbs - allPdbs
pdb2download = allPdbs - existingPdbs

print("Found ", len(pdb2delete), " files to delete")
print("and ", len(pdb2download), " file to download")

# Download first, then delete, as in the original ordering.
for pdb_code in pdb2download:
    pl.retrieve_pdb_file(pdb_code, file_format="mmCif")

for pdb_code in pdb2delete:
    cifPath = join(pdb_dir, pdb_code + ".cif")
    remove(cifPath)