Example 1
    def build_sad(self,
                  job,
                  data_ops,
                  embed_penultimate=False,
                  target_subset=None):
        """Build SAD predict ops."""
        if not self.hparams_set:
            self.hp = params.make_hparams(job)
            self.hparams_set = True

        # training conditional
        self.is_training = tf.placeholder(tf.bool, name='is_training')

        # eval data ops w/ deterministic augmentation
        data_ops_eval = augmentation.augment_deterministic_set(
            data_ops, job["ensemble_rc"], job["ensemble_shifts"])
        data_seq_eval = tf.stack([do['sequence'] for do in data_ops_eval])
        data_rev_eval = tf.stack([do['reverse_preds'] for do in data_ops_eval])

        # compute eval representation
        map_elems_eval = (data_seq_eval, data_rev_eval)
        build_rep = lambda do: self.build_predict(
            do[0], do[1], embed_penultimate, target_subset)
        self.preds_ensemble = tf.map_fn(build_rep,
                                        map_elems_eval,
                                        dtype=tf.float32,
                                        back_prop=False)
        self.preds_eval = tf.reduce_mean(self.preds_ensemble, axis=0)

        # update number of targets
        if target_subset is not None:
            self.hp.num_targets = len(target_subset)

        # helper variables
        self.preds_length = self.preds_eval.shape[1]
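
The eval path above stacks every deterministic augmentation of the input, maps the model across the stack with tf.map_fn (gradients disabled), and averages the resulting predictions. Below is a minimal self-contained sketch of that ensembling pattern, assuming TensorFlow 1.x, with a toy predict function standing in for build_predict (the real code maps over a (sequence, reverse_preds) pair rather than a single tensor):

import numpy as np
import tensorflow as tf  # assumes TensorFlow 1.x

def predict(seq):
    # toy stand-in for build_predict: per-position channel mean
    return tf.reduce_mean(seq, axis=-1)

# 3 augmented views of a (batch=2, length=8, depth=4) input
seqs_aug = tf.placeholder(tf.float32, shape=(3, 2, 8, 4))

# map the model over the augmentation axis without backprop, then average
preds_ensemble = tf.map_fn(predict, seqs_aug, dtype=tf.float32, back_prop=False)
preds_eval = tf.reduce_mean(preds_ensemble, axis=0)

with tf.Session() as sess:
    views = np.random.rand(3, 2, 8, 4).astype('float32')
    print(sess.run(preds_eval, feed_dict={seqs_aug: views}).shape)  # (2, 8)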
Example 2
    def build_from_data_ops(self,
                            job,
                            data_ops,
                            embed_penultimate=False,
                            target_subset=None):
        """Build training ops from input data ops."""
        if not self.hparams_set:
            self.hp = params.make_hparams(job)
            self.hparams_set = True

        # training conditional
        self.is_training = tf.placeholder(tf.bool, name='is_training')

        ##################################################
        # training

        # training data_ops w/ stochastic augmentation
        data_ops_train = augmentation.augment_stochastic(
            data_ops, job["augment_rc"], job["augment_shifts"])

        # compute train representation
        self.preds_train = self.build_predict(data_ops_train['sequence'],
                                              None,
                                              embed_penultimate,
                                              target_subset,
                                              save_reprs=True)
        self.target_length = self.preds_train.shape[1].value

        # training losses
        if not embed_penultimate:
            loss_returns = self.build_loss(self.preds_train,
                                           data_ops_train['label'],
                                           target_subset)
            self.loss_train, self.loss_train_targets, self.targets_train = loss_returns

            # optimizer
            self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            self.build_optimizer(self.loss_train)

        ##################################################
        # eval

        # eval data ops w/ deterministic augmentation
        data_ops_eval = augmentation.augment_deterministic_set(
            data_ops, job["ensemble_rc"], job["ensemble_shifts"])
        data_seq_eval = tf.stack([do['sequence'] for do in data_ops_eval])
        data_rev_eval = tf.stack([do['reverse_preds'] for do in data_ops_eval])

        # compute eval representation
        map_elems_eval = (data_seq_eval, data_rev_eval)
        build_rep = lambda do: self.build_predict(
            do[0], do[1], embed_penultimate, target_subset)
        self.preds_ensemble = tf.map_fn(build_rep,
                                        map_elems_eval,
                                        dtype=tf.float32,
                                        back_prop=False)
        self.preds_eval = tf.reduce_mean(self.preds_ensemble, axis=0)

        # eval loss
        if not embed_penultimate:
            loss_returns = self.build_loss(self.preds_eval, data_ops['label'],
                                           target_subset)
            self.loss_eval, self.loss_eval_targets, self.targets_eval = loss_returns

        # update number of targets
        if target_subset is not None:
            self.hp.num_targets = len(target_subset)

        # helper variables
        self.preds_length = self.preds_train.shape[1]
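
Both builders share the same skeleton: a boolean is_training placeholder lets a single graph serve the stochastic training path and the deterministic eval path. A minimal sketch of that switch, assuming TensorFlow 1.x, with dropout standing in for the model's stochastic pieces:

import numpy as np
import tensorflow as tf  # assumes TensorFlow 1.x

is_training = tf.placeholder(tf.bool, name='is_training')
x = tf.placeholder(tf.float32, shape=(None, 4))

# dropout fires only when is_training is fed as True
h = tf.layers.dropout(x, rate=0.5, training=is_training)

with tf.Session() as sess:
    batch = np.ones((2, 4), dtype='float32')
    print(sess.run(h, feed_dict={x: batch, is_training: True}))   # zeros appear
    print(sess.run(h, feed_dict={x: batch, is_training: False}))  # passthrough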
Example 3
    def test_deterministic(self):
        # get HDF5 data
        hdf5_open = h5py.File(self.data_h5, 'r')
        hdf5_seqs = hdf5_open['valid_in']
        hdf5_targets = hdf5_open['valid_out']

        # get TFR data
        tfr_pattern = '%s/tfrecords/valid-0.tfr' % self.tfr_data_dir
        next_op = make_data_op(tfr_pattern, self.seq_length,
                               self.target_length)

        # define augmentation
        augment_shifts = [-2, -1, 0, 1, 2]
        next_op_list = augmentation.augment_deterministic_set(
            next_op, True, augment_shifts)

        # initialize counters
        augment_counts = {}
        for fwdrc in [True, False]:
            for shift in augment_shifts:
                augment_counts[(fwdrc, shift)] = 0

        # choose number of sequences
        max_seqs = min(32, hdf5_seqs.shape[0])
        si = 0

        # iterate over data
        with tf.Session() as sess:
            next_datums = sess.run(next_op_list)
            while next_datums and si < max_seqs:
                for next_datum in next_datums:
                    # parse TFRecord
                    seqs_tfr = next_datum['sequence'][0]
                    targets_tfr = next_datum['label'][0]

                    # parse HDF5
                    seqs_h5 = hdf5_seqs[si].astype('float32')
                    targets_h5 = hdf5_targets[si].astype('float32')

                    # expand dim
                    seqs1_h5 = np.reshape(
                        seqs_h5, (1, seqs_h5.shape[0], seqs_h5.shape[1]))

                    # check augmentation
                    matched = False
                    for fwdrc in [True, False]:
                        for shift in augment_shifts:
                            # modify sequence
                            seqs_h5_aug = dna_io.hot1_augment(
                                seqs1_h5, fwdrc, shift)[0]

                            # modify targets
                            if fwdrc:
                                targets_h5_aug = targets_h5
                            else:
                                targets_h5_aug = targets_h5[::-1, :]

                            # check match
                            if (np.array_equal(seqs_tfr, seqs_h5_aug) and
                                    np.allclose(targets_tfr, targets_h5_aug)):
                                # print(si, fwdrc, shift)
                                matched = True
                                augment_counts[(fwdrc, shift)] += 1

                    # assert augmentation found
                    self.assertTrue(matched)

                try:
                    next_datums = sess.run(next_op_list)
                    si += 1
                except tf.errors.OutOfRangeError:
                    next_datums = False

        hdf5_open.close()

        # verify all augmentations appear
        for fwdrc in [True, False]:
            for shift in augment_shifts:
                #print(fwdrc, shift, augment_counts[(fwdrc,shift)])
                self.assertEqual(max_seqs, augment_counts[(fwdrc, shift)])
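
Every sequence should appear under all 2 strands x 5 shifts = 10 deterministic views, which the final counts assert. The test recreates each view from the HDF5 original via dna_io.hot1_augment, flipping targets along the length axis for the reverse strand. Below is a rough NumPy reimplementation of what that augmentation is expected to do; this is hypothetical, since the real function lives in Basenji's dna_io, and zero-padding the vacated positions is an assumption:

import numpy as np

def hot1_augment(seqs, fwdrc, shift):
    """Shift one-hot (N, L, 4) DNA by `shift` positions and
    reverse-complement when fwdrc is False (hypothetical sketch)."""
    aug = np.zeros_like(seqs)
    if shift >= 0:
        aug[:, shift:] = seqs[:, :seqs.shape[1] - shift]
    else:
        aug[:, :shift] = seqs[:, -shift:]
    if not fwdrc:
        # with ACGT channel order, reverse complement = flip length and channels
        aug = aug[:, ::-1, ::-1]
    return aug

onehot = np.eye(4)[np.random.randint(4, size=(1, 8))]  # (1, 8, 4) random DNA
print(hot1_augment(onehot, fwdrc=False, shift=1).shape)  # (1, 8, 4)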
Example 4
    def build_from_data_ops(
        self,
        job,
        data_ops,
        augment_rc=False,
        augment_shifts=[0],
        ensemble_rc=False,
        ensemble_shifts=[0],
        embed_penultimate=False,
        target_subset=None,
    ):
        """Build training ops from input data ops."""

        if not self.hparams_set:
            self.hp = params.make_hparams(job)
            self.hparams_set = True

        # training conditional
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        ##################################################
        # training

        # training data_ops w/ stochastic augmentation
        data_ops_train = augmentation.augment_stochastic(
            data_ops, augment_rc, augment_shifts)

        # compute train representation
        self.preds_train = self.build_predict(
            data_ops_train["sequence"],
            None,
            embed_penultimate,
            target_subset,
            save_reprs=True,
        )
        self.target_length = self.preds_train.shape[1].value

        # training losses
        if not embed_penultimate:
            loss_returns = self.build_loss(
                self.preds_train,
                data_ops_train["label"],
                data_ops.get("genome", None),
                target_subset,
            )
            (self.loss_train, self.loss_train_targets,
             self.targets_train) = loss_returns[:3]

            # optimizer
            self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            self.build_optimizer(self.loss_train)

            # allegedly correct, but outperformed by skipping
            # with tf.control_dependencies(self.update_ops):
            #   self.build_optimizer(self.loss_train)

        ##################################################
        # eval

        # eval data ops w/ deterministic augmentation
        data_ops_eval = augmentation.augment_deterministic_set(
            data_ops, ensemble_rc, ensemble_shifts)
        data_seq_eval = tf.stack([do["sequence"] for do in data_ops_eval])
        data_rev_eval = tf.stack([do["reverse_preds"] for do in data_ops_eval])

        # compute eval representation
        map_elems_eval = (data_seq_eval, data_rev_eval)
        build_rep = lambda do: self.build_predict(
            do[0], do[1], embed_penultimate, target_subset)
        self.preds_ensemble = tf.map_fn(build_rep,
                                        map_elems_eval,
                                        dtype=tf.float32,
                                        back_prop=False)
        self.preds_eval = tf.reduce_mean(self.preds_ensemble, axis=0)

        # eval loss and metrics
        if not embed_penultimate:
            loss_returns = self.build_loss(
                self.preds_eval,
                data_ops["label"],
                data_ops.get("genome", None),
                target_subset,
            )
            (self.loss_eval, self.loss_eval_targets, self.targets_eval,
             self.preds_eval_loss) = loss_returns

        # update number of targets
        if target_subset is not None:
            self.hp.num_targets = len(target_subset)

        # helper variables
        self.preds_length = self.preds_train.shape[1].value
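
The commented-out control_dependencies block in the training section refers to the canonical TF1 batch-norm recipe: batch normalization registers its moving-average updates in the UPDATE_OPS collection, and the train op is gated on them, a recipe the comment notes was "allegedly correct, but outperformed by skipping". A minimal sketch of that recipe, assuming TensorFlow 1.x:

import tensorflow as tf  # assumes TensorFlow 1.x

is_training = tf.placeholder(tf.bool)
x = tf.placeholder(tf.float32, shape=(None, 8))
y = tf.placeholder(tf.float32, shape=(None, 1))

# batch norm adds its moving-average update ops to UPDATE_OPS
h = tf.layers.batch_normalization(x, training=is_training)
preds = tf.layers.dense(h, 1)
loss = tf.losses.mean_squared_error(y, preds)

# the textbook recipe: run the collected updates with each optimizer step
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)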