# Load the non-redundant PDB chains from the NRPDB file; only these chains are
# considered by the filtering steps that follow.
logger.debug('Getting non-redundant set of PDB chains...')
nrpdbs = nrpdbs_from_file(NRPDB_FILENAME, representative_field)
# Pass the count as a logging argument so the message is only formatted when
# debug output is actually enabled.
logger.debug('There are %d non-redundant chains.', len(nrpdbs))

# Build the set of chains to test:
#   1. take every PDB ID listed in the SCOP classification file,
#   2. keep only those present in nrpdbs, replacing each by nrpdbs[id]
#      (presumably its representative chain -- confirm against
#      nrpdbs_from_file),
#   3. do the same for the IDs under the target level/sunid (the chains that
#      were already trained on), and
#   4. subtract the trained chains from the query chains.
logger.debug('Getting records to test from SCOP Classification file...')
original_query_pdbids = all_pdbids_from_file(SCOP_CLASSIFICATION_FILE)
# A generator expression feeds set() directly, avoiding a throwaway list.
query_pdbids = set(nrpdbs[x] for x in original_query_pdbids if x in nrpdbs)
logger.debug('  total number of chains: %d', len(query_pdbids))

original_trained_pdbids = all_pdbids_from_file_in(SCOP_CLASSIFICATION_FILE,
                                                  target_level, target_sunid)
trained_pdbids = set(nrpdbs[x] for x in original_trained_pdbids
                     if x in nrpdbs)

# The part of each trained chain ID before the first ':'.
used_base_pdbids = set(x.split(':')[0] for x in trained_pdbids)

logger.debug('  number of trained chains: %d', len(trained_pdbids))

# Never test on a chain that was already used for training.
query_pdbids -= trained_pdbids

logger.debug('  number of query chains: %d', len(query_pdbids))



# create the whitelist of PDB chains on which to train explicitly
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)
# Summary of the program configuration, collected in one dict so that it can
# be written to the debug log in a single line.
config = {'output_dir': output_dir,
          'pdb_dir': pdb_dir,
          'representative_field': representative_field,
          'target_level': target_level,
          'target_sunid': target_sunid}
# The dict is handed to the logger as an argument so that it is only rendered
# to text when debug output is enabled.
logger.debug('Program configuration: %s', config)

# `nrpdbs` is deliberately not reloaded here: it was already read from
# NRPDB_FILENAME with the same representative_field earlier in this script and
# has only been read (never modified) since, so parsing the file a second time
# would just repeat the same work.

# Get the PDB IDs listed under the target level/sunid in the SCOP
# classification file.
# NOTE(review): this is the same call, with the same arguments, that produced
# original_trained_pdbids earlier in this script -- confirm whether the two
# results can be shared instead of reading the file again.
logger.debug('Getting PDB IDs from SCOP Classification file...')
all_pdbs = all_pdbids_from_file_in(SCOP_CLASSIFICATION_FILE, target_level,
                                   target_sunid)
logger.debug('all_pdbs: %s', all_pdbs)
if not all_pdbs:
    # Nothing matched the requested target, so there is nothing to work on:
    # report it and terminate with a non-zero exit status.
    logger.critical('Nothing in all_pdbs for target level: %s'
                    ' and target_sunid: %s', target_level, target_sunid)
    sys.exit(1)

# create the whitelist of PDB chains on which to train explicitly
#logger.debug('Getting the whitelist of chains to test...')
#whitelist = whitelist_from_file(WHITELIST_FILENAME)

# Create the output directory if it doesn't exist yet.
logger.debug('Checking whether output directory exists...')
if not os.path.isdir(output_dir):
    logger.debug('...it doesn\'t so we create it')
    try:
        # makedirs also creates any missing parent directories, which
        # os.mkdir would refuse to do.
        os.makedirs(output_dir)
    except OSError:
        # Tolerate the directory having appeared between the check above and
        # the creation attempt; any other failure is re-raised unchanged.
        if not os.path.isdir(output_dir):
            raise