Example #1
    def __init__(self, model_params, **kwargs):
        """Create a Trainer, and give it the parameters needed to instantiate the model

        :param model_params: The model parameters
        :param kwargs: See below

        :Keyword Arguments:

          * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
          * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
          * *clip* (`int`) -- If we are doing gradient clipping, what value to use
          * *optim* (`str`) -- The name of the optimizer we are using
          * *lr* (`float`) -- The learning rate we are using
          * *mom* (`float`) -- If we are using SGD, what value to use for momentum
          * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
          * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
          * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

        """
        super().__init__()
        if type(model_params) is dict:
            self.model = create_model_for('tagger', **model_params)
        else:
            self.model = model_params
        span_type = kwargs.get('span_type', 'iob')
        verbose = kwargs.get('verbose', False)
        self.evaluator = TaggerEvaluatorEagerTf(self.model, span_type, verbose)
        self.optimizer = EagerOptimizer(loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', six.MAXSIZE)
        checkpoint_dir = kwargs.get('checkpoint')
        if checkpoint_dir is None:
            checkpoint_dir = f'./tf-tagger-{os.getpid()}'
        self._checkpoint, self.checkpoint_manager = setup_tf2_checkpoints(
            self.optimizer, self.model, checkpoint_dir)
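To make the keyword list in the docstring above concrete, here is a hypothetical set of kwargs for this constructor; the values are illustrative placeholders, not taken from any real configuration.

# Hypothetical keyword arguments for the tagger trainer above (all values made up)
trainer_kwargs = {
    'optim': 'sgd',        # optimizer name
    'lr': 0.015,           # learning rate
    'mom': 0.9,            # SGD momentum
    'clip': 5.0,           # gradient clipping value
    'nsteps': 100,         # report metrics every 100 steps
    'ema_decay': 0.999,    # exponential moving average decay
    'span_type': 'iobes',  # span encoding used by the evaluator
    'verbose': False,
}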
Example #2
    def __init__(self, model_params, **kwargs):
        super().__init__()

        if type(model_params) is dict:
            self.model = create_model_for('lm', **model_params)
        else:
            self.model = model_params

        loss_fn = loss_with_state if self.model.requires_state else loss_without_state
        self.optimizer = EagerOptimizer(loss_fn, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-lm", os.getpid())

        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=checkpoint_dir, max_to_keep=5)
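The checkpoint wiring above is the standard `tf.train.Checkpoint` / `tf.train.CheckpointManager` pattern. A minimal stand-alone sketch, independent of this trainer, looks like this:

import os
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])
optimizer = tf.keras.optimizers.Adam()

checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
manager = tf.train.CheckpointManager(checkpoint, directory=f'./ckpt-{os.getpid()}', max_to_keep=5)

manager.save()                                 # write a numbered checkpoint
checkpoint.restore(manager.latest_checkpoint)  # restore the most recent one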
Example #3
    def __init__(self, model_params, **kwargs):
        super().__init__()

        if type(model_params) is dict:
            self.model = create_model_for('seq2seq', **model_params)
        else:
            self.model = model_params

        self.tgt_rlut = kwargs['tgt_rlut']
        self.loss = Seq2SeqLoss(**kwargs)
        self.optimizer = EagerOptimizer(self.loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-seq2seq", os.getpid())
        self.bleu_n_grams = int(kwargs.get("bleu_n_grams", 4))

        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=checkpoint_dir, max_to_keep=5)
Example #4
    def __init__(self, model_params, **kwargs):
        super().__init__()

        if type(model_params) is dict:
            self.model = create_model_for('seq2seq', **model_params)
        else:
            self.model = model_params

        self.tgt_rlut = kwargs['tgt_rlut']
        self.loss = Seq2SeqLoss(**kwargs)
        self.optimizer = EagerOptimizer(self.loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)

        checkpoint_dir = kwargs.get('checkpoint')
        if checkpoint_dir is None:
            checkpoint_dir = f'./tf-seq2seq-{os.getpid()}'
        self._checkpoint, self.checkpoint_manager = setup_tf2_checkpoints(
            self.optimizer, self.model, checkpoint_dir)

        self.bleu_n_grams = int(kwargs.get("bleu_n_grams", 4))
Example #5
    def __init__(self, model_params, **kwargs):
        super().__init__()
        if type(model_params) is dict:
            self.model = create_model_for('seq2seq', **model_params)
        else:
            self.model = model_params

        self.tgt_rlut = kwargs['tgt_rlut']
        self.optimizer = EagerOptimizer(loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-seq2seq", os.getpid())

        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=checkpoint_dir, max_to_keep=5)
        strategy_type = kwargs.get('strategy_type', 'mirror')
        gpus = int(kwargs.get('gpus', 1))
        endpoint = kwargs.get('endpoint')
        self.strategy = create_distribute_strategy(strategy_type, gpus,
                                                   endpoint)
        self.bleu_n_grams = int(kwargs.get("bleu_n_grams", 4))
Example #6
model = to_device(
    L.EmbedPoolStackModel(
        2, L.EmbeddingsStack(embeddings),
        L.WithoutLength(L.ParallelConv(None, args.poolsz, args.filts)),
        L.Highway(stacksz)))


def loss(model, x, y):
    y_ = model(x)
    return tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y,
                                                            logits=y_)


# This works with TF 2.0 and PyTorch:
optimizer = EagerOptimizer(loss, optim="adam", lr=0.001)


@tf.function
def train_step(optimizer, model, x, y):
    loss_value = optimizer.update(model, x, y)
    return loss_value


for epoch in range(num_epochs):
    loss_acc = 0.
    step = 0
    start = time.time()
    for x, y in train_set.get_input(training=True):
        loss_value = train_step(optimizer, model, x, y)
        loss_acc += loss_value
Example #7
class LanguageModelTrainerDistributedTf(Trainer):
    """A Trainer for LM distributed eager training
    """
    def __init__(self, model_params, **kwargs):
        super().__init__()
        if type(model_params) is dict:
            self.model = create_model_for('lm', **model_params)
        else:
            self.model = model_params

        loss_fn = loss_with_state if self.model.requires_state else loss_without_state
        self.optimizer = EagerOptimizer(loss_fn, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-lm", os.getpid())

        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=checkpoint_dir, max_to_keep=5)

        strategy_type = kwargs.get('strategy_type', 'mirror')
        gpus = int(kwargs.get('gpus', 1))
        endpoint = kwargs.get('endpoint')
        self.strategy = create_distribute_strategy(strategy_type, gpus,
                                                   endpoint)

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(
            self._checkpoint.restore(
                self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _num_toks(y):
        return tf.reduce_prod(get_shape_as_list(y))

    def train(self, ts, reporting_fns, steps=0, dataset=True):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        strategy = self.strategy

        def _replicated_train_step_no_state(inputs):

            features, y = inputs
            per_replica_loss = self.optimizer.update(self.model, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks

        def _replicated_train_step_with_state(inputs, hidden):
            features, y = inputs
            per_replica_loss, new_hidden = self.optimizer.update_with_hidden(
                self.model, hidden, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return new_hidden, per_replica_report_loss, per_replica_toks

        with strategy.scope():
            train_iter = iter(ts)
            SET_TRAIN_FLAG(True)
            epoch_loss = tf.Variable(0.0)
            epoch_div = tf.Variable(0, dtype=tf.int32)
            nstep_loss = tf.Variable(0.0)
            nstep_div = tf.Variable(0, dtype=tf.int32)
            self.nstep_start = time.time()
            start = time.time()

            @tf.function
            def _distributed_train_no_state(inputs):
                per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_train_step_no_state, args=(inputs, ))
                return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_loss,
                                       axis=None), strategy.reduce(
                                           tf.distribute.ReduceOp.SUM,
                                           per_replica_toks,
                                           axis=None)

            @tf.function
            def _distributed_train_with_state(inputs, hidden):

                h, per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_train_step_with_state, args=(
                        inputs,
                        hidden,
                    ))
                step_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_loss,
                                            axis=None)
                step_toks = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_toks,
                                            axis=None)
                return h, step_loss, step_toks

            h = None
            for i in range(steps):

                inputs = next(train_iter)
                if self.model.requires_state:
                    h, step_loss, step_toks = _distributed_train_with_state(
                        inputs, h)
                else:
                    step_loss, step_toks = _distributed_train_no_state(inputs)
                epoch_loss.assign_add(step_loss)
                nstep_loss.assign_add(step_loss)
                epoch_div.assign_add(step_toks)
                nstep_div.assign_add(step_toks)
                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(),
                                                nstep_div.numpy())
                    self.report(step, metrics, self.nstep_start, 'Train',
                                'STEP', reporting_fns, self.nsteps)
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.time()

            epoch_loss = epoch_loss.numpy()
            epoch_div = epoch_div.numpy()
            metrics = self.calc_metrics(epoch_loss, epoch_div)
            self.train_epochs += 1
            self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH',
                        reporting_fns)
            return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss'])
        return metrics

    def test(self, vs, reporting_fns, phase, steps=0):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        strategy = self.strategy

        def _replicated_test_step_no_state(inputs):
            features, y = inputs
            per_replica_loss = loss_without_state(self.model, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks

        def _replicated_test_step_with_state(inputs, hidden):
            features, y = inputs
            per_replica_loss, new_hidden = loss_with_state(
                self.model, hidden, features, y)
            per_replica_toks = self._num_toks(y)
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return new_hidden, per_replica_report_loss, per_replica_toks

        with strategy.scope():
            SET_TRAIN_FLAG(False)
            test_iter = iter(vs)
            epoch_loss = tf.Variable(0.0)
            epoch_div = tf.Variable(0, dtype=tf.int32)
            self.nstep_start = time.time()
            start = time.time()

            @tf.function
            def _distributed_test_no_state(inputs):
                per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_test_step_no_state, args=(inputs, ))
                return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_loss,
                                       axis=None), strategy.reduce(
                                           tf.distribute.ReduceOp.SUM,
                                           per_replica_toks,
                                           axis=None)

            @tf.function
            def _distributed_test_with_state(inputs, hidden):

                h, per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_test_step_with_state, args=(
                        inputs,
                        hidden,
                    ))
                step_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_loss,
                                            axis=None)
                step_toks = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_toks,
                                            axis=None)
                return h, step_loss, step_toks

            epochs = 0
            if phase == 'Valid':
                self.valid_epochs += 1
                epochs = self.valid_epochs

            h = None
            for i in range(steps):
                inputs = next(test_iter)
                if self.model.requires_state:
                    h, per_replica_loss, per_replica_toks = _distributed_test_with_state(
                        inputs, h)
                else:
                    per_replica_loss, per_replica_toks = _distributed_test_no_state(
                        inputs)
                epoch_loss.assign_add(per_replica_loss)
                epoch_div.assign_add(per_replica_toks)
            metrics = self.calc_metrics(epoch_loss.numpy(), epoch_div.numpy())
            self.report(epochs, metrics, start, phase, 'EPOCH', reporting_fns)
            return metrics

    def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset)
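The distributed pattern used throughout this trainer (run a per-replica step, then sum the per-replica results with `strategy.reduce`) can be isolated in a small toy sketch, independent of the trainer. It uses `strategy.run`, which newer TensorFlow 2.x releases expose in place of the `experimental_run_v2` call shown above.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    w = tf.Variable(2.0)

def per_replica_step(x):
    # Toy per-replica "loss": each replica sees only its shard of the batch
    return tf.reduce_sum(w * x)

@tf.function
def distributed_step(dist_inputs):
    per_replica_losses = strategy.run(per_replica_step, args=(dist_inputs,))
    # Sum across replicas, mirroring the ReduceOp.SUM calls in the trainer above
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

dataset = tf.data.Dataset.from_tensor_slices(tf.range(8, dtype=tf.float32)).batch(4)
for batch in strategy.experimental_distribute_dataset(dataset):
    print(distributed_step(batch).numpy())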
Example #8
class Seq2SeqTrainerEagerTf(Trainer):
    """Eager mode trainer for seq2sew

    The trainer can run in 2 modes: `dataset` and `feed_dict`.  In the former, the graph is assumed to
    be connected by features attached to the input, so the `feed_dict` is only used to pass dropout information.

    In the latter, we use the baseline DataFeed to read each batch into the `feed_dict`
    """
    def __init__(self, model_params, **kwargs):
        super().__init__()

        if type(model_params) is dict:
            self.model = create_model_for('seq2seq', **model_params)
        else:
            self.model = model_params

        self.tgt_rlut = kwargs['tgt_rlut']
        self.loss = Seq2SeqLoss(**kwargs)
        self.optimizer = EagerOptimizer(self.loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)

        checkpoint_dir = kwargs.get('checkpoint')
        if checkpoint_dir is None:
            checkpoint_dir = f'./tf-seq2seq-{os.getpid()}'
        self._checkpoint, self.checkpoint_manager = setup_tf2_checkpoints(
            self.optimizer, self.model, checkpoint_dir)

        self.bleu_n_grams = int(kwargs.get("bleu_n_grams", 4))

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(
            self._checkpoint.restore(
                self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _num_toks(y):
        return tf.reduce_prod(get_shape_as_list(y))

    def _num_toks(self, lens):
        return tf.reduce_sum(lens)

    def train(self, ts, reporting_fns):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.perf_counter()
        start = time.perf_counter()

        @tf.function
        def _train_step(features, y):
            """Replicated training step."""

            loss = self.optimizer.update(self.model, features, y)
            toks = self._num_toks(features['tgt_len'])
            report_loss = loss * tf.cast(toks, tf.float32)
            return report_loss, toks

        with autograph_options({
                "function_optimization": False,
                "layout_optimizer": False
        }):
            for features, y in ts:
                features['dst'] = y[:, :-1]
                step_report_loss, step_toks = _train_step(features, y)
                epoch_loss.assign_add(step_report_loss)
                nstep_loss.assign_add(step_report_loss)
                epoch_div.assign_add(step_toks)
                nstep_div.assign_add(step_toks)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(),
                                                nstep_div.numpy())
                    self.report(step, metrics, self.nstep_start, 'Train',
                                'STEP', reporting_fns, self.nsteps)
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.perf_counter()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        self.train_epochs += 1
        self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH',
                    reporting_fns)
        return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss'])
        return metrics

    def _evaluate(self, es, reporting_fns, **kwargs):
        """Run the model with beam search and report Bleu.

        :param es: `tf.dataset` of input
        :param reporting_fns: Input hooks
        """
        preds = []
        golds = []
        start = time.perf_counter()

        for features, tgt in es:
            features['dst'] = tgt[:, :-1]
            tgt_lens = features.pop('tgt_len')
            top_preds = self.model.predict(features,
                                           make_input=False,
                                           **kwargs)[0]
            preds.extend(
                convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, tgt_lens, self.tgt_rlut))
        metrics = {'bleu': bleu(preds, golds, self.bleu_n_grams)[0]}
        self.report(0, metrics, start, 'Test', 'EPOCH', reporting_fns)
        return metrics

    def test(self, vs, reporting_fns, phase='Valid', **kwargs):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        SET_TRAIN_FLAG(False)
        if phase == 'Test':
            return self._evaluate(vs, reporting_fns, **kwargs)

        self.valid_epochs += 1

        total_loss = 0
        total_toks = 0
        preds = []
        golds = []

        start = time.perf_counter()
        for features, tgt in vs:
            features['dst'] = tgt[:, :-1]
            top_preds = self.model.predict(features, beam=1,
                                           make_input=False)[0]
            loss_value = self.loss(self.model, features, tgt).numpy()
            toks = tf.cast(self._num_toks(features['tgt_len']),
                           tf.float32).numpy()
            total_loss += loss_value * toks
            total_toks += toks
            preds.extend(
                convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(
                convert_seq2seq_golds(tgt, features['tgt_len'], self.tgt_rlut))

        metrics = self.calc_metrics(total_loss, total_toks)
        metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
        self.report(self.valid_epochs, metrics, start, phase, 'EPOCH',
                    reporting_fns)
        return metrics
Example #9
    return text_generated


def loss(model, h, x, y):
    x["h"] = h
    logits, h = model(x)
    vsz = embeddings["word"].get_vsz()
    targets = tf.reshape(y, [-1])
    bt_x_v = tf.nn.log_softmax(tf.reshape(logits, [-1, vsz]), axis=-1)
    one_hots = tf.one_hot(targets, vsz)
    example_loss = -tf.reduce_sum(one_hots * bt_x_v, axis=-1)
    loss = tf.reduce_mean(example_loss)
    return loss, h


optimizer = EagerOptimizer(loss, optim="adam", lr=args.lr)
for epoch in range(args.epochs):

    loss_accum = 0.
    step = 0
    start = time.time()
    h = None

    SET_TRAIN_FLAG(True)

    for x, y in train_input_fn():
        # Optimize the model
        loss_value, h = optimizer.update_with_hidden(model, h, x, y)
        loss_accum += loss_value
        step += 1
    print('training time {}'.format(time.time() - start))
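The hand-rolled loss above (log-softmax masked with a one-hot of the targets) computes the same quantity as TensorFlow's built-in sparse softmax cross-entropy. A small self-contained check with a toy vocabulary, unrelated to the model above:

import tensorflow as tf

vsz = 5
logits = tf.random.normal([4, vsz])
targets = tf.constant([1, 3, 0, 2])

# Manual form, as in the loss function above
log_probs = tf.nn.log_softmax(logits, axis=-1)
manual = -tf.reduce_sum(tf.one_hot(targets, vsz) * log_probs, axis=-1)

# Built-in form
builtin = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=logits)

print(tf.reduce_max(tf.abs(manual - builtin)).numpy())  # ~0.0 up to float error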
Example #10
class LanguageModelTrainerEagerTf(Trainer):
    """A Trainer to use for eager mode

    """
    def __init__(self, model_params, **kwargs):
        super().__init__()

        if type(model_params) is dict:
            self.model = create_model_for('lm', **model_params)
        else:
            self.model = model_params

        loss_fn = loss_with_state if self.model.requires_state else loss_without_state
        self.optimizer = EagerOptimizer(loss_fn, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-lm", os.getpid())

        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=checkpoint_dir, max_to_keep=5)

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(
            self._checkpoint.restore(
                self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _num_toks(y):
        return tf.reduce_prod(get_shape_as_list(y))

    def train(self, ts, reporting_fns):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """

        SET_TRAIN_FLAG(True)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.perf_counter()
        start = time.perf_counter()

        def _train_step_no_state(inputs):
            """Replicated training step."""

            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            toks = self._num_toks(y)
            report_loss = loss * tf.cast(toks, tf.float32)
            return report_loss, toks

        def _train_step_with_state(inputs, hidden):
            """Replicated training step."""

            features, y = inputs
            loss, hidden = self.optimizer.update_with_hidden(
                self.model, hidden, features, y)
            toks = self._num_toks(y)
            report_loss = loss * tf.cast(toks, tf.float32)
            return hidden, report_loss, toks

        if get_version(tf) >= 2:
            _train_step_with_state = tf.function(_train_step_with_state)
            _train_step_no_state = tf.function(_train_step_no_state)

        h = None
        for inputs in ts:
            if self.model.requires_state:
                h, step_report_loss, step_toks = _train_step_with_state(
                    inputs, h)
            else:
                step_report_loss, step_toks = _train_step_no_state(inputs)

            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_toks)
            nstep_div.assign_add(step_toks)

            step = self.optimizer.global_step.numpy() + 1
            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(),
                                            nstep_div.numpy())
                self.report(step, metrics, self.nstep_start, 'Train', 'STEP',
                            reporting_fns, self.nsteps)
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.perf_counter()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        self.train_epochs += 1
        self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH',
                    reporting_fns)
        return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss'])
        return metrics

    def test(self, vs, reporting_fns, phase):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        total_loss = 0.0
        total_toks = 0
        epochs = 0
        if phase == 'Valid':
            self.valid_epochs += 1
            epochs = self.valid_epochs
        SET_TRAIN_FLAG(False)

        start = time.perf_counter()
        h = None
        for features, y in vs:
            if self.model.requires_state:
                loss_value, h = loss_with_state(self.model, h, features, y)
            else:
                loss_value = loss_without_state(self.model, features, y)
            loss_value = loss_value.numpy()
            toks = self._num_toks(y)
            total_loss += loss_value * tf.cast(toks, tf.float32).numpy()
            total_toks += toks.numpy()

        metrics = self.calc_metrics(total_loss, total_toks)
        self.report(epochs, metrics, start, phase, 'EPOCH', reporting_fns)
        return metrics
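The `calc_metrics` override above turns the token-averaged cross-entropy into perplexity with `exp`. A toy numeric check (the numbers are made up):

import numpy as np

total_loss = 1403.2      # epoch sum of (per-batch loss * token count)
total_toks = 312         # total number of tokens
avg_loss = total_loss / total_toks
print(np.exp(avg_loss))  # perplexity, roughly 89.8 here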
Example #11
class TaggerTrainerEagerTf(EpochReportingTrainer):
    """A Trainer to use for eager mode training
    """
    def __init__(self, model_params, **kwargs):
        """Create a Trainer, and give it the parameters needed to instantiate the model

        :param model_params: The model parameters
        :param kwargs: See below

        :Keyword Arguments:

          * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
          * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
          * *clip* (`int`) -- If we are doing gradient clipping, what value to use
          * *optim* (`str`) -- The name of the optimizer we are using
          * *lr* (`float`) -- The learning rate we are using
          * *mom* (`float`) -- If we are using SGD, what value to use for momentum
          * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
          * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
          * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

        """
        super().__init__()
        if type(model_params) is dict:
            self.model = create_model_for('tagger', **model_params)
        else:
            self.model = model_params
        span_type = kwargs.get('span_type', 'iob')
        verbose = kwargs.get('verbose', False)
        self.evaluator = TaggerEvaluatorEagerTf(self.model, span_type, verbose)
        self.optimizer = EagerOptimizer(loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', six.MAXSIZE)
        checkpoint_dir = kwargs.get('checkpoint')
        if checkpoint_dir is None:
            checkpoint_dir = f'./tf-tagger-{os.getpid()}'
        self._checkpoint, self.checkpoint_manager = setup_tf2_checkpoints(
            self.optimizer, self.model, checkpoint_dir)

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(
            self._checkpoint.restore(
                self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _get_batchsz(batch_dict):
        return batch_dict['y'].shape[0]

    def _train(self, loader, steps=0, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        reporting_fns = kwargs.get('reporting_fns', [])
        pg = create_progress_bar(steps)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.perf_counter()

        @tf.function
        def _train_step(inputs):
            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            batchsz = get_shape_as_list(y)[0]
            report_loss = loss * batchsz
            return report_loss, batchsz

        with autograph_options({
                "function_optimization": False,
                "layout_optimizer": False
        }):
            for inputs in pg(loader):
                step_report_loss, step_batchsz = _train_step(inputs)
                epoch_loss.assign_add(step_report_loss)
                nstep_loss.assign_add(step_report_loss)
                epoch_div.assign_add(step_batchsz)
                nstep_div.assign_add(step_batchsz)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(),
                                                nstep_div.numpy())
                    self.report(step, metrics, self.nstep_start, 'Train',
                                'STEP', reporting_fns, self.nsteps)
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.perf_counter()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics

    def _test(self, ts, steps=0, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """
        return self.evaluator.test(ts, steps, **kwargs)
Example #12
class Seq2SeqTrainerDistributedTf(Trainer):
    """A Trainer to use for eager distributed mode
    """
    def __init__(self, model_params, **kwargs):
        super().__init__()
        if type(model_params) is dict:
            self.model = create_model_for('seq2seq', **model_params)
        else:
            self.model = model_params

        self.tgt_rlut = kwargs['tgt_rlut']
        self.optimizer = EagerOptimizer(loss, **kwargs)
        self.nsteps = kwargs.get('nsteps', 500)
        self._checkpoint = tf.train.Checkpoint(
            optimizer=self.optimizer.optimizer, model=self.model)
        checkpoint_dir = '{}-{}'.format("./tf-seq2seq", os.getpid())

        self.checkpoint_manager = tf.train.CheckpointManager(
            self._checkpoint, directory=checkpoint_dir, max_to_keep=5)
        strategy_type = kwargs.get('strategy_type', 'mirror')
        gpus = int(kwargs.get('gpus', 1))
        endpoint = kwargs.get('endpoint')
        self.strategy = create_distribute_strategy(strategy_type, gpus,
                                                   endpoint)
        self.bleu_n_grams = int(kwargs.get("bleu_n_grams", 4))

    def checkpoint(self):
        """This method saves a checkpoint

        :return: None
        """
        self.checkpoint_manager.save()

    def recover_last_checkpoint(self):
        """Recover the last saved checkpoint

        :return: None
        """
        print(
            self._checkpoint.restore(
                self.checkpoint_manager.latest_checkpoint))

    @staticmethod
    def _num_toks(y):
        return tf.reduce_prod(get_shape_as_list(y))

    def _num_toks(self, lens):
        return tf.reduce_sum(lens)

    def train(self, ts, reporting_fns, steps=0):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        strategy = self.strategy

        #num_replicas = strategy.num_replicas_in_sync
        def _replicated_train_step(inputs):
            features, y = inputs
            per_replica_loss = self.optimizer.update(self.model, features, y)
            per_replica_toks = self._num_toks(features['tgt_len'])
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks

        with strategy.scope():
            SET_TRAIN_FLAG(True)
            epoch_loss = tf.Variable(0.0)
            epoch_div = tf.Variable(0, dtype=tf.int32)
            nstep_loss = tf.Variable(0.0)
            nstep_div = tf.Variable(0, dtype=tf.int32)
            self.nstep_start = time.time()
            start = time.time()

            @tf.function
            def _distributed_train_step(inputs):
                per_replica_loss, per_replica_toks = strategy.experimental_run_v2(
                    _replicated_train_step, args=(inputs, ))
                total_step_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                                  per_replica_loss,
                                                  axis=None)
                total_toks = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                             per_replica_toks,
                                             axis=None)
                return total_step_loss, total_toks

            with autograph_options({
                    "function_optimization": False,
                    "layout_optimizer": False
            }):
                train_iter = iter(ts)
                for i in range(steps):
                    features, y = next(train_iter)
                    step_report_loss, step_toks = _distributed_train_step(
                        (features, y))
                    epoch_loss.assign_add(step_report_loss)
                    nstep_loss.assign_add(step_report_loss)
                    epoch_div.assign_add(step_toks)
                    nstep_div.assign_add(step_toks)

                    step = self.optimizer.global_step.numpy().item() + 1
                    if step % self.nsteps == 0:
                        metrics = self.calc_metrics(nstep_loss.numpy().item(),
                                                    nstep_div.numpy().item())
                        self.report(step, metrics, self.nstep_start, 'Train',
                                    'STEP', reporting_fns, self.nsteps)
                        nstep_loss.assign(0.0)
                        nstep_div.assign(0)
                        self.nstep_start = time.time()

                epoch_loss = epoch_loss.numpy()
                epoch_div = epoch_div.numpy()
                metrics = self.calc_metrics(epoch_loss, epoch_div)
                self.train_epochs += 1
                self.report(self.train_epochs, metrics, start, 'Train',
                            'EPOCH', reporting_fns)
                return metrics

    def calc_metrics(self, agg, norm):
        metrics = super().calc_metrics(agg, norm)
        metrics['perplexity'] = np.exp(metrics['avg_loss']).item()
        return metrics

    def _evaluate(self, es, reporting_fns, **kwargs):
        """Run the model with beam search and report Bleu.

        :param es: `tf.dataset` of input
        :param reporting_fns: Input hooks
        """
        preds = []
        golds = []
        start = time.time()
        kwargs['make_input'] = False

        for features, tgt in es:
            tgt_lens = features.pop('tgt_len')
            top_preds = self.model.predict(features, **kwargs)
            preds.extend(
                convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, tgt_lens, self.tgt_rlut))
        metrics = {'bleu': bleu(preds, golds, self.bleu_n_grams)[0]}
        self.report(0, metrics, start, 'Test', 'EPOCH', reporting_fns)
        return metrics

    def test(self, vs, reporting_fns, steps=0, phase='Valid', **kwargs):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        def _replicated_valid_step(inputs):
            features, tgt = inputs
            top_preds = self.model.predict(features, beam=1, make_input=False)
            per_replica_loss = loss(self.model, features, tgt)
            per_replica_toks = self._num_toks(features['tgt_len'])
            per_replica_report_loss = per_replica_loss * tf.cast(
                per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks, top_preds

        if phase == 'Test':
            SET_TRAIN_FLAG(False)
            return self._evaluate(vs, reporting_fns, **kwargs)

        strategy = self.strategy
        num_replicas = strategy.num_replicas_in_sync

        with strategy.scope():

            SET_TRAIN_FLAG(False)
            self.valid_epochs += 1

            total_loss = tf.Variable(0.0)
            total_toks = tf.Variable(0, dtype=tf.int32)
            preds = []
            golds = []

            start = time.time()

            test_iter = iter(vs)

            for i in range(steps):
                features, tgt = next(test_iter)
                inputs = (features, tgt)
                per_replica_loss, per_replica_toks, _ = strategy.experimental_run_v2(
                    _replicated_valid_step, args=(inputs, ))
                total_loss.assign_add(
                    strategy.reduce(tf.distribute.ReduceOp.SUM,
                                    per_replica_loss,
                                    axis=None))
                total_toks.assign_add(
                    strategy.reduce(tf.distribute.ReduceOp.SUM,
                                    per_replica_toks,
                                    axis=None))
                # Not sure a good way to get top preds merged yet

            metrics = self.calc_metrics(total_loss.numpy(), total_toks.numpy())
            self.report(self.valid_epochs, metrics, start, phase, 'EPOCH',
                        reporting_fns)
            return metrics

    def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset)
Example #13
    return dataset


transducer = L.BiLSTMEncoderSequence(None, args.hsz, args.layers, 0.5)
model = L.TagSequenceModel(len(labels), embeddings, transducer)

train_loss_results = []
train_accuracy_results = []


def loss(model, x, y):
    unary = model.transduce(x)
    return model.decoder_model.neg_log_loss(unary, y, x['lengths'])


optim = EagerOptimizer(loss, optim="adam", lr=args.lr)

import time
num_epochs = args.epochs
for epoch in range(num_epochs):

    # Training loop - using batches of 32
    loss_acc = 0.
    step = 0
    start = time.time()
    for x, y in train_input_fn():
        # Optimize the model
        loss_value = optim.update(model, x, y)

        loss_acc += loss_value
        step += 1