def graph_builder(opts,
                  observed=None,
                  ground_truth=None,
                  learning_rate=0.001,
                  mode=util.Modes.TRAIN):
    # Build the neural network
    predictions = MLPModel(opts, mode=mode)(observed)

    # Loss
    loss = opts.loss_scaling * tf.cast(
        tf.losses.absolute_difference(ground_truth, predictions,
                                      reduction=tf.losses.Reduction.MEAN),
        dtype=getattr(tf, opts.dtypes[0]))

    # Error metric
    rmse_metric = util.exp_rmspe(ground_truth, predictions)

    if mode == util.Modes.TRAIN:
        # Training
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

        # Wrap in a CrossReplicaOptimizer if we're replicating across multiple IPUs
        if opts.replication_factor > 1:
            optimizer = cross_replica_optimizer.CrossReplicaOptimizer(optimizer)

        # Batch norm variable update dependency
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            # Op to calculate every variable gradient
            grads = tf.gradients(loss, tf.trainable_variables())
            grads = list(zip(grads, tf.trainable_variables()))

            # Loss scaling
            grads = [(grad / opts.loss_scaling, var) for grad, var in grads]

            # Apply weight_decay directly to gradients
            if opts.weight_decay != 0:
                grads = [(grad + (opts.weight_decay * var), var)
                         if 'l2tag' in var.name and 'kernel' in var.name
                         else (grad, var)
                         for grad, var in grads]

            # Clip gradients
            if opts.gradient_clipping:
                grads = [(tf.clip_by_value(grad, -1., 1.), var)
                         for grad, var in grads]

            # Op to update all variables according to their gradient
            apply_grads = optimizer.apply_gradients(grads_and_vars=grads)

        return loss / opts.loss_scaling, rmse_metric, apply_grads
    elif mode == util.Modes.VALID:
        return loss / opts.loss_scaling, rmse_metric, None
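# A minimal sketch (not part of the original example) of how graph_builder
# might be compiled for the IPU; the placeholder tensors `observed_ph` and
# `targets_ph` and the lambda wrapper are illustrative assumptions.
from tensorflow.python import ipu

with ipu.scopes.ipu_scope("/device:IPU:0"):
    compiled_train = ipu.ipu_compiler.compile(
        lambda observed, targets: graph_builder(
            opts, observed=observed, ground_truth=targets, mode=util.Modes.TRAIN),
        inputs=[observed_ph, targets_ph])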
def build_train_op(previous_loss, *infeed_data):
    """Construct loss and optimizer."""
    with ipu_scope("/device:IPU:0"):
        action_prob = create_policy(*infeed_data)
        loss = tf.reduce_sum(action_prob * infeed_data[-2])
        opt = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        if args.accumulate_grad:
            opt = gradient_accumulation_optimizer.GradientAccumulationOptimizer(
                opt, num_mini_batches=args.num_mini_batches)
        opt = cross_replica_optimizer.CrossReplicaOptimizer(opt)
        train_op = opt.minimize(loss)
        with tf.control_dependencies([train_op]):
            loss = tf.identity(loss)
        return previous_loss + loss
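# A usage sketch (assumed, not from the original source): drive build_train_op
# with ipu.loops.repeat so the loss accumulates on-device over several batches
# pulled from an infeed queue. `BATCHES_PER_STEP` and `infeed_queue` are
# illustrative placeholders defined elsewhere.
from tensorflow.python import ipu

def training_loop():
    return ipu.loops.repeat(BATCHES_PER_STEP,
                            build_train_op,
                            inputs=[tf.constant(0.0, dtype=tf.float32)],
                            infeed_queue=infeed_queue)

with ipu.scopes.ipu_scope("/device:IPU:0"):
    compiled_loop = ipu.ipu_compiler.compile(training_loop)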
# Create training examples / targets
ds = tf.data.Dataset.from_tensor_slices(text_as_int)
ds = ds.batch(sequence_length, drop_remainder=True)
ds = ds.shuffle(batch_size * batch_size)
ds = ds.batch(batch_size, drop_remainder=True)
ds = ds.repeat()

# The host-side queues
infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
    ds, feed_name="infeed", replication_factor=replication_factor)

# Set the learning rate
lr = 0.0001

# Create a momentum optimiser for replication
optimizer = cross_replica_optimizer.CrossReplicaOptimizer(
    tf.train.MomentumOptimizer(lr, 0.99))

# Create a host embedding object
embedding = embedding_ops.create_host_embedding(
    "char_embedding",
    shape=[256, 256],
    dtype=tf.float32,
    partition_strategy="TOKEN",
    optimizer_spec=embedding_ops.HostEmbeddingOptimizerSpec(lr))

# PopnnGRU is time-major
def gru(partials):
    gru_ = rnn_ops.PopnnGRU(256)
    partial_t = tf.transpose(partials, [1, 0, 2])
    gru_outputs_t, _ = gru_(partial_t)
    # The source snippet is truncated here; transposing the time-major output
    # back to batch-major is an assumed continuation.
    return tf.transpose(gru_outputs_t, [1, 0, 2])
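# Host-side setup sketch (assumed, using the TF1-style IPU configuration API
# from the same SDK generation as the code above): reserve one IPU per replica
# and initialise the infeed before running the compiled training loop.
from tensorflow.python import ipu

cfg = ipu.utils.create_ipu_config()
cfg = ipu.utils.auto_select_ipus(cfg, replication_factor)
ipu.utils.configure_ipu_system(cfg)

with tf.Session() as sess:
    sess.run(infeed_queue.initializer)
    # ... run the compiled loop that consumes the infeed and applies `optimizer` ...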