def parse_all(pdb_dataset, output_dir, num_cpus):
    """Parse pdb dataset (pdb files) to pandas dataframes.

    Skips structures that already have a produced ``.pkl`` in ``output_dir``
    and submits the remaining work to ``num_cpus`` parallel workers.

    Args:
        pdb_dataset: location of the input PDB structures (passed to
            ``db.get_structures_filenames``).
        output_dir: root directory for the produced ``.pkl`` files.
        num_cpus: number of parallel worker processes.
    """
    requested_filenames = db.get_structures_filenames(pdb_dataset)
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    # Membership tests against a list are O(n) each (O(n^2) overall for large
    # datasets); a set makes the work-key filter linear.
    produced_set = set(produced_keys)
    work_keys = [key for key in requested_keys if key not in produced_set]
    work_filenames = [
        x[0]
        for x in db.get_all_filenames(work_keys, pdb_dataset, enforcement=2)
    ]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys"
                 .format(len(requested_keys), len(produced_keys),
                         len(work_keys)))
    output_filenames = []
    for pdb_filename in work_filenames:
        # Shard outputs by the middle two characters of the PDB code
        # (conventional PDB-style directory layout).
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        # exist_ok avoids the exists()/makedirs() race when multiple
        # processes create the same shard directory concurrently.
        os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")
    inputs = list(zip(work_filenames, output_filenames))
    par.submit_jobs(parse, inputs, num_cpus)
def map_all_pssms(pdb_dataset, blastdb, output_dir, num_cpus):
    """Compute PSSM mappings for all structures missing a produced ``.pkl``.

    Args:
        pdb_dataset: location of the input structures (searched with the
            ``.pkl`` extension).
        blastdb: BLAST database handed through to each ``map_pssms`` job.
        output_dir: root directory for the produced ``.pkl`` files.
        num_cpus: number of parallel worker processes.
    """
    ext = '.pkl'
    requested_filenames = \
        db.get_structures_filenames(pdb_dataset, extension=ext)
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    # Set lookup keeps the filter O(n) instead of O(n^2) on large datasets.
    produced_set = set(produced_keys)
    work_keys = [key for key in requested_keys if key not in produced_set]
    work_filenames = [
        x[0] for x in db.get_all_filenames(
            work_keys, pdb_dataset, extension=ext,
            keyer=lambda x: db.get_pdb_name(x))
    ]
    output_filenames = []
    for pdb_filename in work_filenames:
        # Shard outputs by the middle two characters of the PDB code.
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        # exist_ok avoids the exists()/makedirs() race between workers.
        os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    inputs = [(filename, blastdb, output)
              for filename, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
def main(pair_dir, tfrecord_dir, num_cpus):
    """Run write_pairs on all provided complexes.

    Converts every ``.dill`` pair file in ``pair_dir`` that does not yet
    have a matching ``.tfrecord`` in ``tfrecord_dir``, using ``num_cpus``
    parallel workers.

    Args:
        pair_dir: directory containing the input ``.dill`` pair files.
        tfrecord_dir: root directory for the produced ``.tfrecord`` files.
        num_cpus: number of parallel worker processes.
    """
    requested_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = \
        db.get_structures_filenames(tfrecord_dir, extension='.tfrecord')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    # Set lookup keeps the filter O(n) instead of O(n^2) on large datasets.
    produced_set = set(produced_keys)
    work_keys = [key for key in requested_keys if key not in produced_set]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    work_filenames = [
        x[0]
        for x in db.get_all_filenames(work_keys, pair_dir, extension='.dill')
    ]
    output_filenames = []
    for pdb_filename in work_filenames:
        # Shard outputs by the middle two characters of the PDB code.
        sub_dir = tfrecord_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        # exist_ok avoids the exists()/makedirs() race between workers.
        os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".tfrecord")
    inputs = list(zip(work_filenames, output_filenames))
    par.submit_jobs(pairs_to_tfrecord, inputs, num_cpus)