pdbids.remove('2O4V')

# Make group objects. I'm going to be associating a lot of stuff with
# each protein, it's easiest to just group them together.
groupdict = CIDict([(pdbid, Group(pdbid)) for pdbid in pdbids])

# The slow part: load structures.
structure_dir = '../pymol/structures'
def filename(pdbid):
    return structure_dir + '/aligned_{}.pdb'.format(pdbid)
# Daniel's aligned structures give "invalid/missing occupancy" and
# "invalid/missing B factor" warnings - thousands of them! Have to filter
# warnings or the structures won't get loaded
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for group in groupdict.values():
        group.structure = PDBParser().get_structure(group.name,
                                                    filename(group.name))

print('structures loaded after ' + str(time.time() - start))
afterstructures = time.time()

# Open the asymmetric ezbeta spreadsheets used for retrieving DSSP results
# and the residue numbers of the residues included in the dataset
phrasebooks = biodata.phrasebooks('weights phrasebook.csv')
weights_phrasebook = phrasebooks['weights']
for group in groupdict.values():
    csv_name = group.name.upper() + '.csv'
    if csv_name in os.listdir('non ppi residues'):
        group.non_ppi_data=biodata.Spreadsheet('non ppi residues/'\
                                                 + csv_name,