def _lookup(self, file_path):
    target = util.get_target_name(file_path)
    decoy = util.get_decoy_name(file_path)
    key = (target, decoy)
    if key in self._scores.index:
        return key, self._scores.loc[key]
    return None, None
def main(data_dir, target_list, labels_dir, struct_format, num_cpus, overwrite,
         tmscore_exe):
    """ Compute rmsd, tm-score, gdt-ts, gdt-ha of decoy structures """
    logger = logging.getLogger(__name__)
    logger.info("Compute rmsd, tm-score, gdt-ts, gdt-ha of decoys in {:}".format(
        data_dir))

    os.makedirs(labels_dir, exist_ok=True)

    with open(target_list, 'r') as f:
        requested_filenames = \
            [os.path.join(labels_dir, '{:}.dat'.format(x.strip())) for x in f]
    logger.info("{:} requested keys".format(len(requested_filenames)))

    produced_filenames = []
    if not overwrite:
        produced_filenames = [f for f in fi.find_files(labels_dir, 'dat')
                              if 'targets' not in f]
    logger.info("{:} produced keys".format(len(produced_filenames)))

    inputs = []
    for filename in requested_filenames:
        if filename in produced_filenames:
            continue
        target_name = util.get_target_name(filename)
        target_dir = os.path.join(data_dir, target_name)
        inputs.append((tmscore_exe, filename, target_name, target_dir,
                       struct_format))
    logger.info("{:} work keys".format(len(inputs)))

    par.submit_jobs(run_tmscore_per_target, inputs, num_cpus)
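# Illustrative only: `par.submit_jobs` is provided elsewhere in the repo and is
# not shown in this excerpt. A minimal sketch of an equivalent fan-out using
# the standard library, assuming each entry of `inputs` is a tuple of
# positional arguments for the worker function:
import multiprocessing


def _submit_jobs_sketch(func, inputs, num_cpus):
    """Hypothetical stand-in for par.submit_jobs: run func(*args) for every
    args tuple in `inputs` across `num_cpus` worker processes."""
    with multiprocessing.Pool(processes=num_cpus) as pool:
        pool.starmap(func, inputs)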
def run_tmscore_per_target(tmscore_exe, output_filename, target_name,
                           target_dir, struct_format):
    ''' Run TM-score to compare all decoy structures of a target with its
    native structure. Write the result into a tab-delimited file with the
    following headers:
        <target> <decoy> <rmsd> <gdt_ts> <gdt_ha> <tm>
    '''
    native = os.path.join(target_dir, '{:}.{:}'.format(
        target_name, struct_format))
    decoys = fi.find_files(target_dir, struct_format)
    logging.info("Running tm-scores for {:} with {:} decoys".format(
        target_name, len(decoys)))

    rows = []
    for decoy in decoys:
        result = run_tmscore_per_structure(tmscore_exe, decoy, native)
        if result is None:
            logging.warning("Skip target {:} decoy {:} due to failure".format(
                target_name, decoy))
            continue
        rmsd, tm, gdt_ts, gdt_ha = result
        rows.append([util.get_target_name(decoy), util.get_decoy_name(decoy),
                     rmsd, gdt_ts, gdt_ha, tm])

    df = pd.DataFrame(
        rows, columns=['target', 'decoy', 'rmsd', 'gdt_ts', 'gdt_ha', 'tm'])
    df = df.sort_values(
        ['rmsd', 'gdt_ts', 'gdt_ha', 'tm', 'decoy'],
        ascending=[True, False, False, False, False]).reset_index(drop=True)
    # Write to file
    df.to_csv(output_filename, sep='\t', index=False)
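# `run_tmscore_per_structure` is called above but not defined in this excerpt.
# A minimal sketch of what it might look like, assuming the Zhang-lab TM-score
# binary and its usual plain-text output; the regexes are illustrative and may
# need adjusting for the exact binary version.
import re
import subprocess


def run_tmscore_per_structure_sketch(tmscore_exe, decoy, native):
    """Hypothetical sketch: run TM-score on (decoy, native) and parse
    (rmsd, tm, gdt_ts, gdt_ha) from its stdout. Returns None on failure."""
    try:
        out = subprocess.check_output([tmscore_exe, decoy, native],
                                      universal_newlines=True)
    except (subprocess.CalledProcessError, OSError):
        return None
    patterns = {
        'rmsd': r'RMSD of\s+the common residues\s*=\s*([0-9.]+)',
        'tm': r'TM-score\s*=\s*([0-9.]+)',
        'gdt_ts': r'GDT-TS-score\s*=\s*([0-9.]+)',
        'gdt_ha': r'GDT-HA-score\s*=\s*([0-9.]+)',
    }
    scores = {}
    for name, pattern in patterns.items():
        match = re.search(pattern, out)
        if match is None:
            return None
        scores[name] = float(match.group(1))
    return scores['rmsd'], scores['tm'], scores['gdt_ts'], scores['gdt_ha']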
def _lookup(self, file_path):
    target = util.get_target_name(file_path)
    decoy = util.get_decoy_name(file_path)
    key = (target, decoy)
    if key in self._scores.index:
        score = self._scores.loc[key].head(1).astype(
            np.float64).squeeze().to_dict()
        return key, score
    return key, None
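# Example usage (hypothetical paths and keys): given a scores table indexed by
# (target, decoy), _lookup maps a decoy file path back to its score row. The
# keys of the returned dict mirror the columns of the label files written by
# run_tmscore_per_target, e.g.:
#
#     key, score = scorer._lookup('/data/T0860/T0860_server_model_1.pdb')
#     # key   -> ('T0860', 'T0860_server_model_1')
#     # score -> {'rmsd': ..., 'gdt_ts': ..., 'gdt_ha': ..., 'tm': ...} or None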
def casp_ensembler(pdb_files):
    targets = col.defaultdict(list)
    for f in pdb_files:
        target_name = util.get_target_name(f)
        targets[target_name].append(f)

    # target_name -> (decoy_name -> filename)
    ensembles = {}
    for target_name, files in targets.items():
        subunits = {util.get_decoy_name(f): f for f in files}
        ensembles[target_name] = subunits
    return ensembles
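# Example (hypothetical filenames): casp_ensembler groups decoy files by CASP
# target, keyed target_name -> {decoy_name: path}. The exact decoy names
# depend on util.get_decoy_name, e.g.:
#
#     ensembles = casp_ensembler(['data/T0860/T0860_server1.pdb',
#                                 'data/T0860/T0860_server2.pdb',
#                                 'data/T0861/T0861_server1.pdb'])
#     # ensembles['T0860'] -> {'T0860_server1': 'data/T0860/T0860_server1.pdb',
#     #                        'T0860_server2': 'data/T0860/T0860_server2.pdb'}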
def gen_splits(target_list, input_dir, output_sharded_train, output_sharded_val,
               output_sharded_test, splitby, test_years, train_years, val_years,
               train_size, val_size, test_size, train_decoy_size, val_decoy_size,
               test_decoy_size, exclude_natives, shuffle, random_seed):
    """ Generate train/val/test sets from the input dataset. """
    targets_df = pd.read_csv(
        target_list, delimiter=r'\s*', engine='python').dropna()

    files = fi.find_files(input_dir, dt.patterns['pdb'])
    structures_df = pd.DataFrame(
        [[util.get_target_name(f), util.get_decoy_name(f), f] for f in files],
        columns=['target', 'decoy', 'path'])
    # Remove duplicates
    structures_df = structures_df.drop_duplicates(
        subset=['target', 'decoy'], keep='first').reset_index(drop=True)
    structures_df = pd.merge(structures_df, targets_df, on='target')

    # Keep only (target, year) pairs that also appear in structures_df
    targets_df = structures_df[['target', 'year']].drop_duplicates(
        keep='first').reset_index(drop=True)

    if splitby == 'random':
        targets_train, targets_val, targets_test = split_targets_random(
            targets_df, train_size, val_size, test_size, shuffle, random_seed)
    elif splitby == 'year':
        targets_train, targets_val, targets_test = split_targets_by_year(
            targets_df, test_years, train_years, val_years, val_size, shuffle,
            random_seed)
    else:
        raise ValueError('Unrecognized splitby option %s' % splitby)

    print('Generating dataset: train ({:} targets), val ({:} targets), '
          'test ({:} targets)'.format(len(targets_train), len(targets_val),
                                      len(targets_test)))

    train_set, val_set, test_set = generate_train_val_targets_tests(
        structures_df, targets_train, targets_val, targets_test,
        train_decoy_size, val_decoy_size, test_decoy_size, exclude_natives,
        random_seed)

    print('Finished generating dataset: train ({:} decoys), val ({:} decoys), '
          'test ({:} decoys)'.format(len(train_set), len(val_set),
                                     len(test_set)))

    for (output_sharded, dataset) in [(output_sharded_train, train_set),
                                      (output_sharded_val, val_set),
                                      (output_sharded_test, test_set)]:
        print('\nWriting out dataset to {:}'.format(output_sharded))
        files = dataset.path.unique()
        create_sharded_dataset(files, output_sharded)
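# `split_targets_random` is referenced above but not shown in this excerpt.
# A minimal sketch, assuming it returns three disjoint row-subsets of
# targets_df drawn after an optional shuffle (the sizes count targets, not
# decoys); the actual repo implementation may differ:
def split_targets_random_sketch(targets_df, train_size, val_size, test_size,
                                shuffle, random_seed):
    targets = targets_df
    if shuffle:
        # Shuffle target rows reproducibly before slicing.
        targets = targets.sample(frac=1, random_state=random_seed)
    targets = targets.reset_index(drop=True)
    train = targets.iloc[:train_size]
    val = targets.iloc[train_size:train_size + val_size]
    test = targets.iloc[train_size + val_size:train_size + val_size + test_size]
    return train, val, test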
def train_model(sess, args):
    # tf Graph input
    # Subgrid maps for each residue in a protein
    logging.debug('Create input placeholder...')
    grid_size = subgrid_gen.grid_size(args.grid_config)
    channel_size = subgrid_gen.num_channels(args.grid_config)

    feature_placeholder = tf.placeholder(
        tf.float32,
        [None, grid_size, grid_size, grid_size, channel_size],
        name='main_input')
    label_placeholder = tf.placeholder(tf.float32, [None, 1], 'label')

    # Placeholders for model parameters
    training_placeholder = tf.placeholder(tf.bool, shape=[], name='is_training')
    conv_drop_rate_placeholder = tf.placeholder(tf.float32, name='conv_drop_rate')
    fc_drop_rate_placeholder = tf.placeholder(tf.float32, name='fc_drop_rate')
    top_nn_drop_rate_placeholder = tf.placeholder(tf.float32, name='top_nn_drop_rate')

    # Define loss and optimizer
    logging.debug('Define loss and optimizer...')
    predict_op, loss_op = conv_model(
        feature_placeholder, label_placeholder, training_placeholder,
        conv_drop_rate_placeholder, fc_drop_rate_placeholder,
        top_nn_drop_rate_placeholder, args)
    logging.debug('Generate training ops...')
    train_op = model.training(loss_op, args.learning_rate)

    # Initialize the variables (i.e. assign their default value)
    logging.debug('Initializing global variables...')
    init = tf.global_variables_initializer()

    # Create saver and summaries.
    logging.debug('Initializing saver...')
    saver = tf.train.Saver(max_to_keep=100000)
    logging.debug('Finished initializing saver...')

    def __loop(generator, mode, num_iters):
        tf_dataset, next_element = batch_dataset_generator(
            generator, args, is_testing=(mode == 'test'))

        structs, losses, preds, labels = [], [], [], []
        epoch_loss = 0
        progress_format = mode + ' loss: {:6.6f}'

        # Loop over all batches (one batch is all features for 1 protein)
        num_batches = int(math.ceil(float(num_iters) / args.batch_size))
        #print('Running {:} -> {:} iters in {:} batches (batch size: {:})'.format(
        #    mode, num_iters, num_batches, args.batch_size))
        with tqdm.tqdm(total=num_batches, desc=progress_format.format(0)) as t:
            for i in range(num_batches):
                try:
                    struct_, feature_, label_ = sess.run(next_element)
                    _, pred, loss = sess.run(
                        [train_op, predict_op, loss_op],
                        feed_dict={
                            feature_placeholder: feature_,
                            label_placeholder: label_,
                            training_placeholder: (mode == 'train'),
                            conv_drop_rate_placeholder:
                                args.conv_drop_rate if mode == 'train' else 0.0,
                            fc_drop_rate_placeholder:
                                args.fc_drop_rate if mode == 'train' else 0.0,
                            top_nn_drop_rate_placeholder:
                                args.top_nn_drop_rate if mode == 'train' else 0.0,
                        })
                    epoch_loss += (np.mean(loss) - epoch_loss) / (i + 1)
                    structs.extend(struct_.astype(str))
                    losses.append(loss)
                    preds.extend(pred)
                    labels.extend(label_)
                    t.set_description(progress_format.format(epoch_loss))
                    t.update(1)
                except (tf.errors.OutOfRangeError, StopIteration):
                    logging.info("\nEnd of dataset at iteration {:}".format(i))
                    break

        def __concatenate(array):
            # Concatenate per-batch arrays; fall back to the raw list if the
            # shapes cannot be concatenated (e.g. empty input).
            try:
                return np.concatenate(array)
            except ValueError:
                return array

        structs = __concatenate(structs)
        preds = __concatenate(preds)
        labels = __concatenate(labels)
        losses = __concatenate(losses)
        return structs, preds, labels, losses, epoch_loss

    # Run the initializer
    logging.debug('Running initializer...')
    sess.run(init)
    logging.debug('Finished running initializer...')

    ##### Training + validation
    if not args.test_only:
        prev_val_loss, best_val_loss = float("inf"), float("inf")

        if (args.max_targets_train is None) and (args.max_decoys_train is None):
            train_num_structs = args.train_sharded.get_num_structures(
                ['ensemble', 'subunit'])
        elif args.max_targets_train is None:
            train_num_structs = args.train_sharded.get_num_keyed(
                ) * args.max_decoys_train
        elif args.max_decoys_train is None:
            assert False
        else:
            train_num_structs = args.max_targets_train * args.max_decoys_train

        if (args.max_targets_val is None) and (args.max_decoys_val is None):
            val_num_structs = args.val_sharded.get_num_structures(
                ['ensemble', 'subunit'])
        elif args.max_targets_val is None:
            val_num_structs = args.val_sharded.get_num_keyed(
                ) * args.max_decoys_val
        elif args.max_decoys_val is None:
            assert False
        else:
            val_num_structs = args.max_targets_val * args.max_decoys_val

        train_num_structs *= args.repeat_gen
        #val_num_structs *= args.repeat_gen

        logging.info(
            "Start training with {:} structs for train and {:} structs for val per epoch"
            .format(train_num_structs, val_num_structs))

        def _save():
            ckpt = saver.save(sess,
                              os.path.join(args.output_dir, 'model-ckpt'),
                              global_step=epoch)
            return ckpt

        run_info_filename = os.path.join(args.output_dir, 'run_info.json')
        run_info = {}

        def __update_and_write_run_info(key, val):
            run_info[key] = val
            with open(run_info_filename, 'w') as f:
                json.dump(run_info, f, indent=4)

        per_epoch_val_losses = []
        for epoch in range(1, args.num_epochs + 1):
            random_seed = args.random_seed  #random.randint(1, 10e6)
            logging.info('Epoch {:} - random_seed: {:}'.format(
                epoch, args.random_seed))

            logging.debug('Creating train generator...')
            train_generator_callable = functools.partial(
                feature_psp.dataset_generator,
                args.train_sharded,
                args.grid_config,
                score_type=args.score_type,
                shuffle=args.shuffle,
                repeat=args.repeat_gen,
                max_targets=args.max_targets_train,
                max_decoys=args.max_decoys_train,
                max_dist_threshold=300.0,
                random_seed=random_seed)

            logging.debug('Creating val generator...')
            val_generator_callable = functools.partial(
                feature_psp.dataset_generator,
                args.val_sharded,
                args.grid_config,
                score_type=args.score_type,
                shuffle=args.shuffle,
                repeat=1,  #*args.repeat_gen,
                max_targets=args.max_targets_val,
                max_decoys=args.max_decoys_val,
                max_dist_threshold=300.0,
                random_seed=random_seed)

            # Training
            train_structs, train_preds, train_labels, _, curr_train_loss = __loop(
                train_generator_callable, 'train', num_iters=train_num_structs)
            # Validation
            val_structs, val_preds, val_labels, _, curr_val_loss = __loop(
                val_generator_callable, 'val', num_iters=val_num_structs)

            per_epoch_val_losses.append(curr_val_loss)
            __update_and_write_run_info('val_losses', per_epoch_val_losses)

            if args.use_best or args.early_stopping:
                if curr_val_loss < best_val_loss:
                    # Found new best epoch.
                    best_val_loss = curr_val_loss
                    ckpt = _save()
                    __update_and_write_run_info('val_best_loss', best_val_loss)
                    __update_and_write_run_info('best_ckpt', ckpt)
                    logging.info("New best {:}".format(ckpt))

            if (epoch == args.num_epochs and not args.use_best):
                # At end and just using final checkpoint (the loop is
                # 1-indexed, so the final epoch is args.num_epochs).
                ckpt = _save()
                __update_and_write_run_info('best_ckpt', ckpt)
                logging.info("Last checkpoint {:}".format(ckpt))

            if args.save_all_ckpts:
                # Save at every epoch
                ckpt = _save()
                logging.info("Saving checkpoint {:}".format(ckpt))

            ## Save last train and val results
            train_df = pd.DataFrame(
                np.array([train_structs, train_labels, train_preds]).T,
                columns=['structure', 'true', 'pred'],
            )
            train_df['target'] = train_df.structure.apply(
                lambda x: psp_util.get_target_name(x))
            train_df.to_pickle(
                os.path.join(args.output_dir, 'train_result-{:}.pkl'.format(epoch)))
            __stats('Train Epoch {:}'.format(epoch), train_df)

            val_df = pd.DataFrame(
                np.array([val_structs, val_labels, val_preds]).T,
                columns=['structure', 'true', 'pred'],
            )
            val_df['target'] = val_df.structure.apply(
                lambda x: psp_util.get_target_name(x))
            val_df.to_pickle(
                os.path.join(args.output_dir, 'val_result-{:}.pkl'.format(epoch)))
            __stats('Val Epoch {:}'.format(epoch), val_df)

            if args.early_stopping and curr_val_loss >= prev_val_loss:
                logging.info("Validation loss stopped decreasing, stopping...")
                break
            else:
                prev_val_loss = curr_val_loss

        logging.info("Finished training")

    ##### Testing
    logging.debug("Run testing")
    if not args.test_only:
        to_use = run_info['best_ckpt'] if args.use_best else ckpt
    else:
        with open(os.path.join(args.model_dir, 'run_info.json')) as f:
            run_info = json.load(f)
        to_use = run_info['best_ckpt']
        saver = tf.train.import_meta_graph(to_use + '.meta')

    logging.info("Using {:} for testing".format(to_use))
    saver.restore(sess, to_use)

    test_generator_callable = functools.partial(
        feature_psp.dataset_generator,
        args.test_sharded,
        args.grid_config,
        score_type=args.score_type,
        shuffle=args.shuffle,
        repeat=1,
        max_targets=args.max_targets_test,
        max_decoys=args.max_decoys_test,
        max_dist_threshold=None,
        random_seed=args.random_seed)

    if (args.max_targets_test is None) and (args.max_decoys_test is None):
        test_num_structs = args.test_sharded.get_num_structures(
            ['ensemble', 'subunit'])
    elif args.max_targets_test is None:
        test_num_structs = args.test_sharded.get_num_keyed(
            ) * args.max_decoys_test
    elif args.max_decoys_test is None:
        assert False
    else:
        test_num_structs = args.max_targets_test * args.max_decoys_test

    logging.info("Start testing with {:} structs".format(test_num_structs))
    test_structs, test_preds, test_labels, _, test_loss = __loop(
        test_generator_callable, 'test', num_iters=test_num_structs)
    logging.info("Finished testing")

    test_df = pd.DataFrame(
        np.array([test_structs, test_labels, test_preds]).T,
        columns=['structure', 'true', 'pred'],
    )
    test_df['target'] = test_df.structure.apply(
        lambda x: psp_util.get_target_name(x))
    test_df.to_pickle(os.path.join(args.output_dir, 'test_result.pkl'))
    __stats('Test', test_df)
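# Illustrative driver (hypothetical; the actual argument parsing lives
# elsewhere in the repo). train_model expects an open tf.Session plus an args
# namespace exposing the fields referenced above (train_sharded, val_sharded,
# test_sharded, grid_config, learning_rate, batch_size, num_epochs,
# output_dir, ...):
#
#     with tf.Session() as sess:
#         train_model(sess, args)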