def mgpu_predict(*xs):
    """Build the language-model inference graph replicated over ``n_gpu`` GPUs.

    Each input tensor is split along axis 0 into ``n_gpu`` shards; each GPU
    runs a ``model`` replica (``train=False``, variables reused) on its shard,
    and the per-GPU outputs are concatenated back along axis 0.

    Args:
        *xs: input tensors, each with a leading batch axis divisible by
            ``n_gpu`` (assumed — tf.split requires an even split; confirm
            against the caller).

    Returns:
        ``[lm_logits, lm_losses]``, each concatenated across GPUs on axis 0.
    """
    shards = (tf.split(x, n_gpu, 0) for x in xs)
    per_gpu = []
    for gpu_idx, shard in enumerate(zip(*shards)):
        device_ctx = tf.device(assign_to_gpu(gpu_idx, "/gpu:0"))
        # reuse=True on every replica: variables must already exist
        # (created by a prior training-graph build).
        scope_ctx = tf.variable_scope(tf.get_variable_scope(), reuse=True)
        with device_ctx, scope_ctx:
            lm_logits, lm_losses = model(
                *shard,
                units=units,
                n_vocab=n_vocab,
                n_special=n_special,
                n_embd=n_embd,
                embd_pdrop=embd_pdrop,
                train=False,
                reuse=True)
            per_gpu.append([lm_logits, lm_losses])
    return [tf.concat(outputs, 0) for outputs in zip(*per_gpu)]
def mgpu_train(*xs):
    """Build the classifier training graph replicated over ``n_gpu`` GPUs.

    Each input tensor is split along axis 0 into ``n_gpu`` shards. One
    ``model`` replica is built per GPU (variables shared after the first
    replica), per-replica gradients are collected and averaged, and a single
    Adam update op is created over the shared parameters.

    Args:
        *xs: input tensors, each with a leading batch axis divisible by
            ``n_gpu`` (assumed — tf.split requires an even split; confirm
            against the caller).

    Returns:
        ``[train_op, clf_logits, clf_losses, lm_losses]`` where the three
        output tensors are concatenated across GPUs on axis 0.
    """
    shards = (tf.split(x, n_gpu, 0) for x in xs)
    replica_outputs = []
    replica_grads = []
    for gpu_idx, shard in enumerate(zip(*shards)):
        # None (not False) on replica 0 so the scope inherits its parent's
        # reuse setting; later replicas reuse replica 0's variables.
        do_reuse = True if gpu_idx > 0 else None
        device_ctx = tf.device(assign_to_gpu(gpu_idx, "/gpu:0"))
        scope_ctx = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)
        with device_ctx, scope_ctx:
            clf_logits, clf_losses, lm_losses = model(
                *shard,
                clf_token=clf_token,
                n_vocab=n_vocab,
                n_class=n_class,
                n_special=n_special,
                n_ctx=n_ctx,
                n_embd=n_embd,
                embd_pdrop=embd_pdrop,
                n_layer=n_layer,
                n_head=n_head,
                attn_pdrop=attn_pdrop,
                resid_pdrop=resid_pdrop,
                clf_pdrop=clf_pdrop,
                train=True,
                reuse=do_reuse)
            # Optionally mix the language-modeling loss in as an
            # auxiliary objective, weighted by lm_coef.
            if lm_coef > 0:
                train_loss = (tf.reduce_mean(clf_losses)
                              + lm_coef * tf.reduce_mean(lm_losses))
            else:
                train_loss = tf.reduce_mean(clf_losses)
            # Fetched inside the loop so it runs after the replica's
            # variables exist; the final value is used by adam() below.
            params = find_trainable_variables("model")
            grad_tensors = tf.gradients(train_loss, params)
            replica_grads.append(list(zip(grad_tensors, params)))
            replica_outputs.append([clf_logits, clf_losses, lm_losses])
    ops = [tf.concat(outputs, 0) for outputs in zip(*replica_outputs)]
    averaged = average_grads(replica_grads)
    grads_only = [g for g, _ in averaged]
    train = adam(params, grads_only, lr, lr_schedule, n_updates_total,
                 warmup=lr_warmup, l2=l2, max_grad_norm=max_grad_norm,
                 vector_l2=vector_l2, b1=b1, b2=b2, e=e)
    return [train] + ops
def mgpu_train(*xs):
    """Build the QA training graph replicated over ``n_gpu`` GPUs.

    NOTE(review): this redefines the ``mgpu_train`` declared earlier in this
    file — if both are meant to coexist they belong in separate modules or
    under distinct names; as written the later definition shadows the earlier.

    Each input tensor is split along axis 0 into ``n_gpu`` shards. One
    ``model`` replica is built per GPU (variables shared after the first
    replica), per-replica gradients are collected and averaged, and a single
    Adam update op is created over the shared parameters.

    Args:
        *xs: input tensors, each with a leading batch axis divisible by
            ``n_gpu`` (assumed — tf.split requires an even split; confirm
            against the caller).

    Returns:
        ``[train_op, s_preds, e_preds, qa_losses]`` where the three output
        tensors are concatenated across GPUs on axis 0.
    """
    shards = (tf.split(x, n_gpu, 0) for x in xs)
    replica_outputs = []
    replica_grads = []
    for gpu_idx, shard in enumerate(zip(*shards)):
        # None (not False) on replica 0 so the scope inherits its parent's
        # reuse setting; later replicas reuse replica 0's variables.
        do_reuse = True if gpu_idx > 0 else None
        device_ctx = tf.device(assign_to_gpu(gpu_idx, "/gpu:0"))
        scope_ctx = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)
        with device_ctx, scope_ctx:
            s_preds, e_preds, qa_losses = model(
                *shard,
                n_word=n_word,
                n_char=n_char,
                n_pred=n_pred,
                n_wembd=n_wembd,
                n_cembd=n_cembd,
                units=units,
                embd_pdrop=embd_pdrop,
                n_head=n_head,
                attn_pdrop=attn_pdrop,
                resid_pdrop=resid_pdrop,
                train=True,
                reuse=do_reuse)
            train_loss = tf.reduce_mean(qa_losses)
            # Fetched inside the loop so it runs after the replica's
            # variables exist; the final value is used by adam() below.
            params = find_trainable_variables("model")
            grad_tensors = tf.gradients(train_loss, params)
            replica_grads.append(list(zip(grad_tensors, params)))
            replica_outputs.append([s_preds, e_preds, qa_losses])
    ops = [tf.concat(outputs, 0) for outputs in zip(*replica_outputs)]
    averaged = average_grads(replica_grads)
    grads_only = [g for g, _ in averaged]
    train = adam(params, grads_only, lr, lr_schedule, n_updates_total,
                 warmup=lr_warmup, l2=l2, max_grad_norm=max_grad_norm,
                 vector_l2=vector_l2, b1=b1, b2=b2, e=e)
    return [train] + ops
def mgpu_predict(*xs):
    """Build the QA inference graph replicated over ``n_gpu`` GPUs.

    NOTE(review): this redefines the ``mgpu_predict`` declared earlier in
    this file — the later definition shadows the earlier if both live in
    one module.

    Each input tensor is split along axis 0 into ``n_gpu`` shards; each GPU
    runs a ``model`` replica (``train=False``, variables reused) on its shard,
    and the per-GPU outputs are concatenated back along axis 0.

    Args:
        *xs: input tensors, each with a leading batch axis divisible by
            ``n_gpu`` (assumed — tf.split requires an even split; confirm
            against the caller).

    Returns:
        ``[s_preds, e_preds, qa_losses]``, each concatenated across GPUs
        on axis 0.
    """
    shards = (tf.split(x, n_gpu, 0) for x in xs)
    per_gpu = []
    for gpu_idx, shard in enumerate(zip(*shards)):
        device_ctx = tf.device(assign_to_gpu(gpu_idx, "/gpu:0"))
        # reuse=True on every replica: variables must already exist
        # (created by a prior training-graph build).
        scope_ctx = tf.variable_scope(tf.get_variable_scope(), reuse=True)
        with device_ctx, scope_ctx:
            s_preds, e_preds, qa_losses = model(
                *shard,
                n_word=n_word,
                n_char=n_char,
                n_pred=n_pred,
                n_wembd=n_wembd,
                n_cembd=n_cembd,
                units=units,
                embd_pdrop=embd_pdrop,
                n_head=n_head,
                attn_pdrop=attn_pdrop,
                resid_pdrop=resid_pdrop,
                train=False,
                reuse=True)
            per_gpu.append([s_preds, e_preds, qa_losses])
    return [tf.concat(outputs, 0) for outputs in zip(*per_gpu)]
def mgpu_predict(*xs):
    """Build the classifier inference graph replicated over ``n_gpu`` GPUs.

    NOTE(review): this redefines ``mgpu_predict`` declared earlier in this
    file — the later definition shadows the earlier if both live in one
    module.

    Each input tensor is split along axis 0 into ``n_gpu`` shards; each GPU
    runs a ``model`` replica (``train=False``, variables reused) on its shard,
    and the per-GPU outputs are concatenated back along axis 0.

    Args:
        *xs: input tensors, each with a leading batch axis divisible by
            ``n_gpu`` (assumed — tf.split requires an even split; confirm
            against the caller).

    Returns:
        ``[clf_logits, clf_losses, lm_losses]``, each concatenated across
        GPUs on axis 0.
    """
    shards = (tf.split(x, n_gpu, 0) for x in xs)
    per_gpu = []
    for gpu_idx, shard in enumerate(zip(*shards)):
        device_ctx = tf.device(assign_to_gpu(gpu_idx, "/gpu:0"))
        # reuse=True on every replica: variables must already exist
        # (created by a prior training-graph build).
        scope_ctx = tf.variable_scope(tf.get_variable_scope(), reuse=True)
        with device_ctx, scope_ctx:
            clf_logits, clf_losses, lm_losses = model(
                *shard,
                clf_token=clf_token,
                n_vocab=n_vocab,
                n_class=n_class,
                n_special=n_special,
                n_ctx=n_ctx,
                n_embd=n_embd,
                embd_pdrop=embd_pdrop,
                n_layer=n_layer,
                n_head=n_head,
                attn_pdrop=attn_pdrop,
                resid_pdrop=resid_pdrop,
                clf_pdrop=clf_pdrop,
                train=False,
                reuse=True)
            per_gpu.append([clf_logits, clf_losses, lm_losses])
    return [tf.concat(outputs, 0) for outputs in zip(*per_gpu)]