"""Print a random sample of PDB entry IDs.

Usage: <script> N

N is the number of entries to draw at random from the full list of PDB
entries. The selected IDs are written to stdout, one per line; progress
and error messages are written to stderr.
"""
import os
import random
import sys

from Bio.PDB import PDBList

# Parse the sample size. A missing, non-integer or negative argument is
# treated as a usage error rather than surfacing as a traceback.
try:
    DATASIZE = int(sys.argv[1])
    if DATASIZE < 0:
        raise ValueError("N must be non-negative")
except (IndexError, ValueError):
    print('Usage:', os.path.basename(sys.argv[0]), '''N

    N - number of entries to be randomly selected from all PDB entries.

Output written to stdout.''', file=sys.stderr)
    sys.exit(1)

# Get index file
pdbl = PDBList()
#print(f"Downloading PDB index file, this takes a while.", file=sys.stderr)
all_entries = pdbl.get_all_entries()

# random.sample() raises ValueError when asked for more items than the
# population holds, so report that case explicitly instead.
if DATASIZE > len(all_entries):
    print(f"Cannot select {DATASIZE} entries: only {len(all_entries)} "
          "entries are available.", file=sys.stderr)
    sys.exit(1)

# Select random subset
selected = random.sample(all_entries, DATASIZE)
print(f"Randomly selected {DATASIZE} entries from {len(all_entries)} entries.",
      file=sys.stderr)

# Write results to stdout
for entry in selected:
    print(entry, file=sys.stdout)

# Get data files
#for entry in selected:
#    pdbl.retrieve_pdb_file(entry, file_format="pdb", pdir="pdb")
# Test which PDB entries error on PDB/mmCIF parsers # Writes output to a file labelled with the week import os from datetime import datetime from math import ceil from Bio.PDB import PDBList from Bio.PDB.PDBParser import PDBParser from Bio.PDB.MMCIFParser import MMCIFParser start = datetime.now() basedir = "." pdbl = PDBList() pdblist = pdbl.get_all_entries() outstrs = [ "Checking all PDB entries at {}".format(start.isoformat()), "Checking {} entries".format(len(pdblist)) ] pdb_parser = PDBParser() mmcif_parser = MMCIFParser() for pu in sorted(pdblist): p = pu.lower() try: pdbl.retrieve_pdb_file(p, pdir=basedir, file_format="pdb") except: # Not having a PDB file is acceptable, though a failure to download an # available file may hide an error in parsing try:
"""Bring the local mmCIF directory in line with the PDB entry list.

Entries returned by ``PDBList.get_all_entries()`` that have no local
``.cif`` file are downloaded; local ``.cif`` files whose code is not in
that list are removed.
"""
from configure import configure

config = configure()

from Bio.PDB import PDBList
from os.path import dirname, join, basename
from glob import glob
from os import remove

# The mirror is the directory containing the configured "cif" path.
pdb_dir = dirname(config["cif"])

pl = PDBList(pdb=pdb_dir)
pl.flat_tree = True

# Codes already on disk, taken from the "<code>.cif" file names.
existingCifs = glob(join(pdb_dir, "*.cif"))
existingPdbs = {basename(cif_path).replace(".cif", "") for cif_path in existingCifs}

# Codes currently listed by the PDB, lower-cased for comparison with the
# local file names.
allPdbs = {entry.lower() for entry in pl.get_all_entries()}

pdb2download = allPdbs.difference(existingPdbs)
pdb2delete = existingPdbs.difference(allPdbs)

print("Found ", len(pdb2delete), " files to delete")
print("and ", len(pdb2download), " file to download")

# Fetch what is missing first, then drop what is no longer listed.
for pdb_code in pdb2download:
    pl.retrieve_pdb_file(pdb_code, file_format="mmCif")

for pdb_code in pdb2delete:
    remove(join(pdb_dir, pdb_code + ".cif"))