def download_pdbs(base_dir, protein_codes):
    """
    Downloads the PDB database (or a part of it) through Biopython's ``PDBList``. Every protein
    is stored in its own directory (named after the upper-cased PDB code) under base_dir.

    No file format is passed to ``retrieve_pdb_file``, so the files are written in whatever
    format the installed Biopython version uses by default.

    A summary ("Downloaded x/y molecules") is logged at the end; failures are logged as
    warnings and do not abort the remaining downloads.

    :param base_dir: where to download all the proteins.
    :param protein_codes: the PDB codes of the proteins that should be downloaded. Either an
        iterable of codes, a single code given as a string, or a dict whose values are codes
        or iterables of codes (the keys are ignored). Duplicates are downloaded only once.
    """
    # Imported lazily so that Biopython is only required when this function is actually used.
    from Bio.PDB import PDBList

    if isinstance(protein_codes, dict):
        groups = protein_codes.values()
    else:
        groups = [protein_codes]

    unique_codes = set()
    for group in groups:
        if isinstance(group, str):
            # A bare string is one code; iterating it would yield single characters.
            unique_codes.add(group)
        else:
            unique_codes.update(group)
    # Sorted so that the download order is reproducible between runs.
    prot_codes = sorted(unique_codes)

    failed = 0
    attempted = len(prot_codes)
    for code in prot_codes:
        try:
            # One PDBList per protein because each protein gets its own target directory.
            pl = PDBList(pdb=os.path.join(base_dir, code.upper()))
            # Flat layout: write directly into the target directory, no sub-folders.
            pl.flat_tree = True
            path = pl.retrieve_pdb_file(pdb_code=code)
        except IOError:
            log.warning("Failed to download protein {}".format(code))
            failed += 1
            continue
        # Some Biopython releases only print a message when the download fails and still
        # return the path the file would have had, so verify that the file really exists.
        # A missing return value is treated as success, as before.
        if path is not None and not os.path.isfile(path):
            log.warning("Failed to download protein {}".format(code))
            failed += 1
    log.info("Downloaded {0}/{1} molecules".format(attempted - failed,
                                                   attempted))
from configure import configure
config = configure()

from Bio.PDB import PDBList
from os.path import dirname, join, basename
from glob import glob
from os import remove

# Synchronise the local mmCIF directory with the remote PDB entry index:
# entries missing locally are downloaded, local files whose code is no longer
# listed in the index are removed.
pdb_dir = dirname(config["cif"])
pl = PDBList(pdb=pdb_dir)
pl.flat_tree = True  # store every file directly inside pdb_dir

# PDB codes already on disk, derived from the "*.cif" file names.
existingCifs = glob(join(pdb_dir, "*.cif"))
existingPdbs = set()
for cif in existingCifs:
    # Strip only the trailing extension so that code + ".cif" below rebuilds
    # exactly the file name that was found here.
    existingPdbs.add(basename(cif)[:-len(".cif")])

allPdbs = set(pdb_code.lower() for pdb_code in pl.get_all_entries())

if not allPdbs:
    # With an empty index every local file would be scheduled for deletion;
    # refuse to continue rather than wipe the whole local mirror.
    raise SystemExit("Remote PDB index is empty; aborting without changes")

pdb2delete = existingPdbs - allPdbs
pdb2download = allPdbs - existingPdbs

print("Found ", len(pdb2delete), " files to delete")
print("and ", len(pdb2download), " file to download")

for pdb_code in pdb2download:
    pl.retrieve_pdb_file(pdb_code, file_format="mmCif")

for pdb_code in pdb2delete:
    cifPath = join(pdb_dir, pdb_code + ".cif")
    remove(cifPath)