Example #1
import tensorflow as tf

# CrossReplicaOptimizer ships with Graphcore's IPU port of TensorFlow 1;
# the exact import path may vary between Poplar SDK versions.
from tensorflow.python.ipu.optimizers import CrossReplicaOptimizer


def get_model_fn(features, labels, mode, params, cosmoflow_config):
    """Estimator model_fn: builds the CosmoFlow model, its loss and the
    train/eval ops. `get_model` is defined elsewhere in the same example."""
    model = get_model(**cosmoflow_config['model'])
    outputs = model(features, training=mode == tf.estimator.ModeKeys.TRAIN)

    train_config = cosmoflow_config['train']
    loss_name = train_config['loss']
    if loss_name == "mse":
        loss = tf.losses.mean_squared_error(labels=labels, predictions=outputs)
    else:
        raise NotImplementedError("loss: %s" % loss_name)

    if mode == tf.estimator.ModeKeys.EVAL:
        predictions = outputs
        eval_metric_ops = {
            "mae": tf.metrics.mean_absolute_error(labels=labels,
                                                  predictions=predictions),
        }
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(params["learning_rate"])
        # Average gradients across replicas when the model is replicated
        # over more than one IPU.
        if cosmoflow_config['ipu_config']['num_ipus'] > 1:
            optimizer = CrossReplicaOptimizer(optimizer)
        # Pass the global step so the Estimator's step counter advances.
        train_op = optimizer.minimize(loss=loss,
                                      global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
    raise NotImplementedError(mode)
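
For context, a model_fn that carries an extra configuration argument like this is normally bound with functools.partial before being handed to an Estimator. A minimal usage sketch, assuming the cosmoflow_config dict shape used above and a hypothetical train_input_fn; the real example presumably constructs Graphcore's IPUEstimator, which accepts the same model_fn signature:

import functools
import tensorflow as tf

model_fn = functools.partial(get_model_fn, cosmoflow_config=cosmoflow_config)
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    params={"learning_rate": 1e-3})  # hypothetical hyperparameter value
estimator.train(input_fn=train_input_fn, max_steps=100)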
Example #2
import tensorflow as tf
from tensorflow.python.ipu.optimizers import CrossReplicaOptimizer


def graph_builder(opts,
                  observed=None,
                  ground_truth=None,
                  learning_rate=0.001,
                  mode=util.Modes.TRAIN):
    """Build the forward pass, loss and (in training) the update op.
    `MLPModel` and `util` come from the surrounding example code."""
    # Build the neural network
    predictions = MLPModel(opts, mode=mode)(observed)

    # Scaled loss: multiplying up before the backward pass keeps small
    # gradients representable in reduced-precision arithmetic.
    loss = opts.loss_scaling * tf.cast(
        tf.losses.absolute_difference(ground_truth, predictions,
                                      reduction=tf.losses.Reduction.MEAN),
        dtype=getattr(tf, opts.dtypes[0]))

    # Error metric
    rmse_metric = util.exp_rmspe(ground_truth, predictions)

    if mode == util.Modes.TRAIN:
        # Training
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
        # Wrap in a CrossReplicaOptimizer if we're replicating across
        # multiple IPUs, so gradients are averaged across the replicas
        if opts.replication_factor > 1:
            optimizer = CrossReplicaOptimizer(optimizer)
        # Make the backward pass depend on batch-norm statistics updates
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # Op to calculate every variable gradient
            grads = tf.gradients(loss, tf.trainable_variables())
        grads = list(zip(grads, tf.trainable_variables()))

        # Undo the loss scaling on the gradients
        grads = [(grad / opts.loss_scaling, var) for grad, var in grads]

        # Apply weight decay directly to the gradients of tagged kernels
        if opts.weight_decay != 0:
            grads = [(grad + (opts.weight_decay * var), var)
                     if 'l2tag' in var.name and 'kernel' in var.name
                     else (grad, var) for grad, var in grads]

        # Clip gradients elementwise to [-1, 1]
        if opts.gradient_clipping:
            grads = [(tf.clip_by_value(grad, -1., 1.), var)
                     for grad, var in grads]

        # Op to update all variables according to their gradient
        apply_grads = optimizer.apply_gradients(grads_and_vars=grads)
        return loss / opts.loss_scaling, rmse_metric, apply_grads
    elif mode == util.Modes.VALID:
        return loss / opts.loss_scaling, rmse_metric, None
    else:
        raise NotImplementedError(mode)
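
A hedged sketch of how graph_builder might be driven in a plain TF1 session; the placeholder shapes, opts and the feed data are invented, and on real IPU hardware the graph would normally be built inside an IPU device scope and compiled with an infeed rather than fed through feed_dict:

import tensorflow as tf

observed = tf.placeholder(tf.float32, [None, 16])   # hypothetical shape
ground_truth = tf.placeholder(tf.float32, [None])   # hypothetical shape
loss, rmse, train_op = graph_builder(opts,
                                     observed=observed,
                                     ground_truth=ground_truth,
                                     mode=util.Modes.TRAIN)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_loss, batch_rmse, _ = sess.run(
        [loss, rmse, train_op],
        feed_dict={observed: x_batch, ground_truth: y_batch})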
Example #3
def get_optimiser(self):
    """Return the optimiser, wrapped for cross-replica gradient
    reduction when running with more than one replica."""
    _learning_rate = self.get_current_learning_rate()
    opt_kwargs = self.optimiser_kwargs.copy()
    if 'dtype' in opt_kwargs:
        opt_kwargs['dtype'] = self.experiment.dtype
    if self.n_replicas == 1:
        return self.optimiser_type(_learning_rate, **opt_kwargs)
    else:
        return CrossReplicaOptimizer(
            self.optimiser_type(_learning_rate, **opt_kwargs))
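To make the branch concrete, here is a hedged stand-in for the kind of class this method might belong to; every attribute value is invented, and the CrossReplicaOptimizer import path is the one current Poplar SDK documentation uses, which may differ in older releases:

import tensorflow as tf
from tensorflow.python.ipu.optimizers import CrossReplicaOptimizer

class TrainerStub:
    """Minimal stand-in for the owning class of get_optimiser."""
    optimiser_type = tf.train.GradientDescentOptimizer
    optimiser_kwargs = {}   # plain SGD takes no extra keyword arguments
    n_replicas = 2          # >1, so get_optimiser wraps the optimiser

    def get_current_learning_rate(self):
        return 0.01         # invented constant learning-rate schedule

With n_replicas = 2 this returns CrossReplicaOptimizer(tf.train.GradientDescentOptimizer(0.01)); with a single replica the bare optimiser comes back, since a cross-replica all-reduce would be pure overhead there.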
Example #4
# Loop body for an on-device IPU training loop; `outfeed_queue`,
# `cosmoflow_config` and `get_model` are captured from the enclosing scope.
def body(loss, features, labels):
    with tf.variable_scope("MainGraph"):
        model = get_model(**cosmoflow_config['model'])
        outputs = model(features, training=True)
    train_config = cosmoflow_config['train']
    loss_name = train_config['loss']
    if loss_name == "mse":
        loss = tf.losses.mean_squared_error(labels=labels,
                                            predictions=outputs)
    else:
        raise NotImplementedError("loss: %s" % loss_name)

    optimizer = tf.train.GradientDescentOptimizer(
        cosmoflow_config['optimizer']['lr'])
    if cosmoflow_config['ipu_config']['num_ipus'] > 1:
        optimizer = CrossReplicaOptimizer(optimizer)
    train_op = optimizer.minimize(loss=loss)
    # Enqueue the loss on the outfeed only after the weight update has run.
    with tf.control_dependencies([train_op]):
        return loss, outfeed_queue.enqueue(loss)
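
A body with this shape (loss in, loss out, plus an outfeed enqueue) is what Graphcore's loops.repeat expects. A minimal sketch of the surrounding plumbing, assuming the TF1 Poplar SDK APIs; the feed_name constructor arguments vary between SDK versions, and dataset and n_iterations are invented:

import tensorflow as tf
from tensorflow.python import ipu

infeed_queue = ipu.ipu_infeed_queue.IPUInfeedQueue(dataset, feed_name="infeed")
outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(feed_name="outfeed")

def train_loop():
    # Run `body` n_iterations times; (features, labels) come from the
    # infeed, and the initial loss value is 0.0.
    return ipu.loops.repeat(n_iterations, body, [0.0], infeed_queue)

with ipu.scopes.ipu_scope("/device:IPU:0"):
    compiled_loop = ipu.ipu_compiler.compile(train_loop, inputs=[])

dequeued_losses = outfeed_queue.dequeue()
with tf.Session() as sess:
    sess.run(infeed_queue.initializer)
    sess.run(tf.global_variables_initializer())
    sess.run(compiled_loop)
    print(sess.run(dequeued_losses))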
Example #5
import tensorflow as tf
# IPU-specific imports; paths may vary between Poplar SDK versions.
from tensorflow.python.ipu.scopes import ipu_scope
from tensorflow.python.ipu.optimizers import (CrossReplicaOptimizer,
                                              GradientAccumulationOptimizer)


def build_train_op(previous_loss, *infeed_data):
    """Construct loss and optimizer."""
    with ipu_scope("/device:IPU:0"):
        action_prob = create_policy(*infeed_data)
        loss = tf.reduce_sum(action_prob * infeed_data[-2])
        opt = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        # Optionally accumulate gradients over several mini-batches
        # before applying them.
        if args.accumulate_grad:
            opt = GradientAccumulationOptimizer(
                opt, num_mini_batches=args.num_mini_batches)
        # Average gradients across all replicas before the update.
        opt = CrossReplicaOptimizer(opt)
        train_op = opt.minimize(loss)
        # Return the loss only after the weight update has executed.
        with tf.control_dependencies([train_op]):
            loss = tf.identity(loss)
        return previous_loss + loss
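
Because build_train_op takes the previous loss and returns the accumulated total, it too has the shape of a loops.repeat body. A hedged sketch of the wiring, assuming Graphcore's TF1 loops API, an infeed queue that supplies create_policy's arguments, and an invented NUM_ITERATIONS:

from tensorflow.python.ipu import ipu_compiler, loops

def training_loop():
    # Sum the per-iteration loss over NUM_ITERATIONS weight updates on-device.
    return loops.repeat(NUM_ITERATIONS, build_train_op,
                        inputs=[0.0], infeed_queue=infeed_queue)

# session.run(compiled) then executes the whole loop on the IPU and
# returns the accumulated loss.
compiled = ipu_compiler.compile(training_loop, inputs=[])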