Example #1
    def test_stochastic(self):
        # get HDF5 data
        hdf5_open = h5py.File(self.data_h5, 'r')
        hdf5_seqs = hdf5_open['valid_in']
        hdf5_targets = hdf5_open['valid_out']

        # get TFR data
        tfr_pattern = '%s/tfrecords/valid-0.tfr' % self.tfr_data_dir
        next_op = make_data_op(tfr_pattern, self.seq_length,
                               self.target_length)

        # apply stochastic augmentation (augment_rc=True plus the shift set)
        augment_shifts = [-2, -1, 0, 1, 2]
        next_op = augmentation.augment_stochastic(next_op, True,
                                                  augment_shifts)

        # initialize counters
        augment_counts = {}
        for fwdrc in [True, False]:
            for shift in augment_shifts:
                augment_counts[(fwdrc, shift)] = 0

        # cap the number of sequences to compare
        max_seqs = min(64, hdf5_seqs.shape[0])

        # iterate over data
        si = 0
        with tf.Session() as sess:
            next_datum = sess.run(next_op)
            while next_datum and si < max_seqs:
                # parse TFRecord
                seqs_tfr = next_datum['sequence'][0]
                targets_tfr = next_datum['label'][0]

                # parse HDF5
                seqs_h5 = hdf5_seqs[si].astype('float32')
                targets_h5 = hdf5_targets[si].astype('float32')

                # add a batch dimension
                seqs1_h5 = np.expand_dims(seqs_h5, axis=0)

                # check augmentations for matches
                matched = False
                for fwdrc in [True, False]:
                    for shift in augment_shifts:
                        # apply this (fwdrc, shift) augmentation to the sequence
                        seqs_h5_aug = dna_io.hot1_augment(
                            seqs1_h5, fwdrc, shift)[0]

                        # reverse target order on the reverse-complement strand
                        if fwdrc:
                            targets_h5_aug = targets_h5
                        else:
                            targets_h5_aug = targets_h5[::-1, :]

                        # check for a match on this (fwdrc, shift) combination
                        if (np.array_equal(seqs_tfr, seqs_h5_aug) and
                                np.allclose(targets_tfr, targets_h5_aug)):
                            matched = True
                            augment_counts[(fwdrc, shift)] += 1

                # assert augmentation found
                self.assertTrue(matched)

                try:
                    next_datum = sess.run(next_op)
                    si += 1
                except tf.errors.OutOfRangeError:
                    next_datum = False

        hdf5_open.close()

        # verify all augmentations appear
        for fwdrc in [True, False]:
            for shift in augment_shifts:
                self.assertGreater(augment_counts[(fwdrc, shift)], 0)
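
The test above leans on a make_data_op helper that yields batched examples from TFRecords. The sketch below is one minimal way such a helper could look under the TF1 API; the raw uint8/float16 encodings and the num_targets default are assumptions for illustration, not the repository's actual parsing schema.

import tensorflow as tf

def make_data_op(tfr_pattern, seq_length, target_length, num_targets=3):
    """Minimal sketch of a TFRecord input op (assumed schema)."""
    def parse(example_proto):
        # assumed feature layout: raw one-hot sequence bytes, float16 targets
        features = {
            'sequence': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.string),
        }
        parsed = tf.parse_single_example(example_proto, features)
        sequence = tf.decode_raw(parsed['sequence'], tf.uint8)
        sequence = tf.reshape(tf.cast(sequence, tf.float32), [seq_length, 4])
        label = tf.decode_raw(parsed['label'], tf.float16)
        label = tf.reshape(tf.cast(label, tf.float32),
                           [target_length, num_targets])
        return {'sequence': sequence, 'label': label}

    dataset = tf.data.TFRecordDataset(tf.gfile.Glob(tfr_pattern))
    dataset = dataset.map(parse).batch(1)
    return dataset.make_one_shot_iterator().get_next()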
Example #2
    def build_from_data_ops(self,
                            job,
                            data_ops,
                            embed_penultimate=False,
                            target_subset=None):
        """Build training ops from input data ops."""
        if not self.hparams_set:
            self.hp = params.make_hparams(job)
            self.hparams_set = True

        # training conditional
        self.is_training = tf.placeholder(tf.bool, name='is_training')

        ##################################################
        # training

        # training data_ops w/ stochastic augmentation
        data_ops_train = augmentation.augment_stochastic(
            data_ops, job["augment_rc"], job["augment_shifts"])

        # compute train representation
        self.preds_train = self.build_predict(data_ops_train['sequence'],
                                              None,
                                              embed_penultimate,
                                              target_subset,
                                              save_reprs=True)
        self.target_length = self.preds_train.shape[1].value

        # training losses
        if not embed_penultimate:
            loss_returns = self.build_loss(self.preds_train,
                                           data_ops_train['label'],
                                           target_subset)
            self.loss_train, self.loss_train_targets, self.targets_train = loss_returns

            # optimizer
            self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            self.build_optimizer(self.loss_train)

        ##################################################
        # eval

        # eval data ops w/ deterministic augmentation
        data_ops_eval = augmentation.augment_deterministic_set(
            data_ops, job["ensemble_rc"], job["ensemble_shifts"])
        data_seq_eval = tf.stack([do['sequence'] for do in data_ops_eval])
        data_rev_eval = tf.stack([do['reverse_preds'] for do in data_ops_eval])

        # compute eval representation
        map_elems_eval = (data_seq_eval, data_rev_eval)
        build_rep = lambda do: self.build_predict(do[0], do[1],
                                                  embed_penultimate,
                                                  target_subset)
        self.preds_ensemble = tf.map_fn(build_rep,
                                        map_elems_eval,
                                        dtype=tf.float32,
                                        back_prop=False)
        self.preds_eval = tf.reduce_mean(self.preds_ensemble, axis=0)

        # eval loss
        if not embed_penultimate:
            loss_returns = self.build_loss(self.preds_eval, data_ops['label'],
                                           target_subset)
            self.loss_eval, self.loss_eval_targets, self.targets_eval = loss_returns

        # update # targets
        if target_subset is not None:
            self.hp.num_targets = len(target_subset)

        # helper variables
        self.preds_length = self.preds_train.shape[1].value
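
For context, a hypothetical call site for this variant, which reads its augmentation options from the job dict. SeqNN is assumed to be the model class these methods belong to, make_data_op refers to the sketch after Example #1, and the lengths and shift values are illustrative.

job = {
    'augment_rc': True,
    'augment_shifts': [-1, 0, 1],
    'ensemble_rc': True,
    'ensemble_shifts': [0],
    # ...plus whatever hyperparameters params.make_hparams expects
}
data_ops = make_data_op('data/tfrecords/train-0.tfr',
                        seq_length=131072, target_length=1024)
model = SeqNN()  # assumed model class
model.build_from_data_ops(job, data_ops)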
Example #3
    def build_from_data_ops(
        self,
        job,
        data_ops,
        augment_rc=False,
        augment_shifts=[0],
        ensemble_rc=False,
        ensemble_shifts=[0],
        embed_penultimate=False,
        target_subset=None,
    ):
        """Build training ops from input data ops."""

        if not self.hparams_set:
            self.hp = params.make_hparams(job)
            self.hparams_set = True

        # training conditional
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        ##################################################
        # training

        # training data_ops w/ stochastic augmentation
        data_ops_train = augmentation.augment_stochastic(
            data_ops, augment_rc, augment_shifts)

        # compute train representation
        self.preds_train = self.build_predict(
            data_ops_train["sequence"],
            None,
            embed_penultimate,
            target_subset,
            save_reprs=True,
        )
        self.target_length = self.preds_train.shape[1].value

        # training losses
        if not embed_penultimate:
            loss_returns = self.build_loss(
                self.preds_train,
                data_ops_train["label"],
                data_ops.get("genome", None),
                target_subset,
            )
            (self.loss_train, self.loss_train_targets,
             self.targets_train) = loss_returns[:3]

            # optimizer
            self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            self.build_optimizer(self.loss_train)

            # allegedly correct, but outperformed by skipping
            # with tf.control_dependencies(self.update_ops):
            #   self.build_optimizer(self.loss_train)

        ##################################################
        # eval

        # eval data ops w/ deterministic augmentation
        data_ops_eval = augmentation.augment_deterministic_set(
            data_ops, ensemble_rc, ensemble_shifts)
        data_seq_eval = tf.stack([do["sequence"] for do in data_ops_eval])
        data_rev_eval = tf.stack([do["reverse_preds"] for do in data_ops_eval])

        # compute eval representation
        map_elems_eval = (data_seq_eval, data_rev_eval)
        build_rep = lambda do: self.build_predict(do[0], do[1],
                                                  embed_penultimate,
                                                  target_subset)
        self.preds_ensemble = tf.map_fn(build_rep,
                                        map_elems_eval,
                                        dtype=tf.float32,
                                        back_prop=False)
        self.preds_eval = tf.reduce_mean(self.preds_ensemble, axis=0)

        # eval loss and metrics
        if not embed_penultimate:
            loss_returns = self.build_loss(
                self.preds_eval,
                data_ops["label"],
                data_ops.get("genome", None),
                target_subset,
            )
            (self.loss_eval, self.loss_eval_targets, self.targets_eval,
             self.preds_eval_loss) = loss_returns

        # update # targets
        if target_subset is not None:
            self.hp.num_targets = len(target_subset)

        # helper variables
        self.preds_length = self.preds_train.shape[1].value
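
Relative to Example #2, this variant moves the augmentation options out of the job dict into keyword arguments and passes an optional per-example 'genome' entry from data_ops into build_loss. A hypothetical call under the same assumptions as the sketches above:

model.build_from_data_ops(
    job,
    data_ops,
    augment_rc=True,
    augment_shifts=[-2, -1, 0, 1, 2],
    ensemble_rc=True,
    ensemble_shifts=[0],
)

Note that the mutable list defaults (augment_shifts=[0], ensemble_shifts=[0]) are safe here only because the lists are never mutated; tuple defaults would be the more defensive choice.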