def build_sad(self, job, data_ops,
              embed_penultimate=False, target_subset=None):
  """Build SAD predict ops."""
  if not self.hparams_set:
    self.hp = params.make_hparams(job)
    self.hparams_set = True

  # training conditional
  self.is_training = tf.placeholder(tf.bool, name='is_training')

  # eval data ops w/ deterministic augmentation
  data_ops_eval = augmentation.augment_deterministic_set(
      data_ops, job['ensemble_rc'], job['ensemble_shifts'])
  data_seq_eval = tf.stack([do['sequence'] for do in data_ops_eval])
  data_rev_eval = tf.stack([do['reverse_preds'] for do in data_ops_eval])

  # compute eval representation
  map_elems_eval = (data_seq_eval, data_rev_eval)
  build_rep = lambda do: self.build_predict(do[0], do[1],
                                            embed_penultimate, target_subset)
  self.preds_ensemble = tf.map_fn(
      build_rep, map_elems_eval, dtype=tf.float32, back_prop=False)
  self.preds_eval = tf.reduce_mean(self.preds_ensemble, axis=0)

  # update # targets
  if target_subset is not None:
    self.hp.num_targets = len(target_subset)

  # helper variables
  self.preds_length = self.preds_eval.shape[1]
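# Usage sketch for build_sad (not from the source): the driver below is an
# assumption, as is the job dict shown, which must also carry whatever model
# hyper-parameters params.make_hparams consumes (elided here). Assumes the
# module-level `import tensorflow as tf`.
def run_sad_example(model, data_ops, job):
  job.update({'ensemble_rc': True, 'ensemble_shifts': [-1, 0, 1]})
  model.build_sad(job, data_ops)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # preds_eval averages the ensemble passes (2 strands x 3 shifts = 6)
    return sess.run(model.preds_eval, feed_dict={model.is_training: False})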
def build_from_data_ops(self, job, data_ops,
                        embed_penultimate=False, target_subset=None):
  """Build training ops from input data ops."""
  if not self.hparams_set:
    self.hp = params.make_hparams(job)
    self.hparams_set = True

  # training conditional
  self.is_training = tf.placeholder(tf.bool, name='is_training')

  ##################################################
  # training

  # training data_ops w/ stochastic augmentation
  data_ops_train = augmentation.augment_stochastic(
      data_ops, job['augment_rc'], job['augment_shifts'])

  # compute train representation
  self.preds_train = self.build_predict(data_ops_train['sequence'], None,
                                        embed_penultimate, target_subset,
                                        save_reprs=True)
  self.target_length = self.preds_train.shape[1].value

  # training losses
  if not embed_penultimate:
    loss_returns = self.build_loss(self.preds_train, data_ops_train['label'],
                                   target_subset)
    self.loss_train, self.loss_train_targets, self.targets_train = loss_returns

  # optimizer
  self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  self.build_optimizer(self.loss_train)

  ##################################################
  # eval

  # eval data ops w/ deterministic augmentation
  data_ops_eval = augmentation.augment_deterministic_set(
      data_ops, job['ensemble_rc'], job['ensemble_shifts'])
  data_seq_eval = tf.stack([do['sequence'] for do in data_ops_eval])
  data_rev_eval = tf.stack([do['reverse_preds'] for do in data_ops_eval])

  # compute eval representation
  map_elems_eval = (data_seq_eval, data_rev_eval)
  build_rep = lambda do: self.build_predict(do[0], do[1],
                                            embed_penultimate, target_subset)
  self.preds_ensemble = tf.map_fn(
      build_rep, map_elems_eval, dtype=tf.float32, back_prop=False)
  self.preds_eval = tf.reduce_mean(self.preds_ensemble, axis=0)

  # eval loss
  if not embed_penultimate:
    loss_returns = self.build_loss(self.preds_eval, data_ops['label'],
                                   target_subset)
    self.loss_eval, self.loss_eval_targets, self.targets_eval = loss_returns

  # update # targets
  if target_subset is not None:
    self.hp.num_targets = len(target_subset)

  # helper variables
  self.preds_length = self.preds_train.shape[1]
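# Usage sketch (not from the source; model.step_op is a hypothetical name for
# the op created by build_optimizer). The same graph serves both modes: feed
# is_training=True to optimize loss_train under stochastic augmentation, and
# is_training=False to read the ensemble-averaged loss_eval built from
# deterministic augmentation.
def train_and_eval_step(sess, model):
  _, loss_t = sess.run([model.step_op, model.loss_train],
                       feed_dict={model.is_training: True})
  loss_e = sess.run(model.loss_eval,
                    feed_dict={model.is_training: False})
  return loss_t, loss_e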
def test_deterministic(self):
  # get HDF5 data
  hdf5_open = h5py.File(self.data_h5, 'r')
  hdf5_seqs = hdf5_open['valid_in']
  hdf5_targets = hdf5_open['valid_out']

  # get TFR data
  tfr_pattern = '%s/tfrecords/valid-0.tfr' % self.tfr_data_dir
  next_op = make_data_op(tfr_pattern, self.seq_length, self.target_length)

  # define augmentation
  augment_shifts = [-2, -1, 0, 1, 2]
  next_op_list = augmentation.augment_deterministic_set(
      next_op, True, augment_shifts)

  # initialize counters
  augment_counts = {}
  for fwdrc in [True, False]:
    for shift in augment_shifts:
      augment_counts[(fwdrc, shift)] = 0

  # choose # sequences
  max_seqs = min(32, hdf5_seqs.shape[0])
  si = 0

  # iterate over data
  with tf.Session() as sess:
    next_datums = sess.run(next_op_list)
    while next_datums and si < max_seqs:
      for next_datum in next_datums:
        # parse TFRecord
        seqs_tfr = next_datum['sequence'][0]
        targets_tfr = next_datum['label'][0]

        # parse HDF5
        seqs_h5 = hdf5_seqs[si].astype('float32')
        targets_h5 = hdf5_targets[si].astype('float32')

        # expand dim
        seqs1_h5 = np.reshape(seqs_h5,
                              (1, seqs_h5.shape[0], seqs_h5.shape[1]))

        # check augmentation
        matched = False
        for fwdrc in [True, False]:
          for shift in augment_shifts:
            # modify sequence
            seqs_h5_aug = dna_io.hot1_augment(seqs1_h5, fwdrc, shift)[0]

            # modify targets
            if fwdrc:
              targets_h5_aug = targets_h5
            else:
              targets_h5_aug = targets_h5[::-1, :]

            # check match
            if np.array_equal(seqs_tfr, seqs_h5_aug) and np.allclose(
                targets_tfr, targets_h5_aug):
              matched = True
              augment_counts[(fwdrc, shift)] += 1

        # assert augmentation found
        self.assertTrue(matched)

      try:
        next_datums = sess.run(next_op_list)
        si += 1
      except tf.errors.OutOfRangeError:
        next_datums = False

  hdf5_open.close()

  # verify all augmentations appear
  for fwdrc in [True, False]:
    for shift in augment_shifts:
      self.assertEqual(max_seqs, augment_counts[(fwdrc, shift)])
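# Minimal sketch of the make_data_op helper the test relies on, which is not
# shown in this excerpt. A one-shot tf.data iterator yields one example per
# run() call as the {'sequence', 'label'} dict that augment_deterministic_set
# expects; the serialized field names and dtypes here are guesses at the
# on-disk TFRecord schema, not the project's actual parser.
def make_data_op(tfr_pattern, seq_length, target_length):
  def parse(example):
    features = {
        'sequence': tf.FixedLenFeature([], tf.string),
        'target': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(example, features=features)
    # leading dim of 1 so downstream code can index [0]
    seq = tf.reshape(tf.decode_raw(parsed['sequence'], tf.uint8),
                     [1, seq_length, 4])
    targets = tf.reshape(tf.decode_raw(parsed['target'], tf.float16),
                         [1, target_length, -1])
    return {'sequence': tf.cast(seq, tf.float32),
            'label': tf.cast(targets, tf.float32)}

  dataset = tf.data.TFRecordDataset(tf.gfile.Glob(tfr_pattern)).map(parse)
  return dataset.make_one_shot_iterator().get_next()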
def build_from_data_ops(self,
                        job,
                        data_ops,
                        augment_rc=False,
                        augment_shifts=[0],
                        ensemble_rc=False,
                        ensemble_shifts=[0],
                        embed_penultimate=False,
                        target_subset=None):
  """Build training ops from input data ops."""
  if not self.hparams_set:
    self.hp = params.make_hparams(job)
    self.hparams_set = True

  # training conditional
  self.is_training = tf.placeholder(tf.bool, name="is_training")

  ##################################################
  # training

  # training data_ops w/ stochastic augmentation
  data_ops_train = augmentation.augment_stochastic(
      data_ops, augment_rc, augment_shifts)

  # compute train representation
  self.preds_train = self.build_predict(
      data_ops_train["sequence"],
      None,
      embed_penultimate,
      target_subset,
      save_reprs=True)
  self.target_length = self.preds_train.shape[1].value

  # training losses
  if not embed_penultimate:
    loss_returns = self.build_loss(
        self.preds_train,
        data_ops_train["label"],
        data_ops.get("genome", None),
        target_subset)
    self.loss_train, self.loss_train_targets, self.targets_train = loss_returns[:3]

  # optimizer
  self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  self.build_optimizer(self.loss_train)

  # allegedly correct, but outperformed by skipping
  # with tf.control_dependencies(self.update_ops):
  #   self.build_optimizer(self.loss_train)

  ##################################################
  # eval

  # eval data ops w/ deterministic augmentation
  data_ops_eval = augmentation.augment_deterministic_set(
      data_ops, ensemble_rc, ensemble_shifts)
  data_seq_eval = tf.stack([do["sequence"] for do in data_ops_eval])
  data_rev_eval = tf.stack([do["reverse_preds"] for do in data_ops_eval])

  # compute eval representation
  map_elems_eval = (data_seq_eval, data_rev_eval)
  build_rep = lambda do: self.build_predict(do[0], do[1],
                                            embed_penultimate, target_subset)
  self.preds_ensemble = tf.map_fn(
      build_rep, map_elems_eval, dtype=tf.float32, back_prop=False)
  self.preds_eval = tf.reduce_mean(self.preds_ensemble, axis=0)

  # eval loss and metrics
  if not embed_penultimate:
    loss_returns = self.build_loss(
        self.preds_eval,
        data_ops["label"],
        data_ops.get("genome", None),
        target_subset)
    (self.loss_eval, self.loss_eval_targets, self.targets_eval,
     self.preds_eval_loss) = loss_returns

  # update # targets
  if target_subset is not None:
    self.hp.num_targets = len(target_subset)

  # helper variables
  self.preds_length = self.preds_train.shape[1].value
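# Migration sketch (not from the source): the revised signature takes
# augmentation settings as explicit keyword arguments instead of reading them
# from the job dict, so a caller of the older build_from_data_ops above would
# move the job entries into the call. The wrapper name is hypothetical.
def build_with_job_augmentation(model, job, data_ops):
  model.build_from_data_ops(
      job,
      data_ops,
      augment_rc=job.get('augment_rc', False),
      augment_shifts=job.get('augment_shifts', [0]),
      ensemble_rc=job.get('ensemble_rc', False),
      ensemble_shifts=job.get('ensemble_shifts', [0]))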