def mgpu_train(*xs):
    gpu_ops = []
    gpu_grads = []
    # split every input tensor across the GPUs along the batch dimension
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    for i, xs in enumerate(zip(*xs)):
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            clf_logits, clf_losses, lm_losses = model(
                *xs, clf_token=clf_token, n_vocab=n_vocab, n_class=n_class,
                n_special=n_special, n_ctx=n_ctx, n_embd=n_embd,
                embd_pdrop=embd_pdrop, n_layer=n_layer, n_head=n_head,
                attn_pdrop=attn_pdrop, resid_pdrop=resid_pdrop,
                clf_pdrop=clf_pdrop, train=True, reuse=do_reuse)
            # classification loss, optionally combined with the auxiliary LM loss
            if lm_coef > 0:
                train_loss = tf.reduce_mean(clf_losses) + lm_coef * tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            gpu_ops.append([clf_logits, clf_losses, lm_losses])
    # concatenate the per-GPU outputs back into full-batch tensors
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    # average gradients over the GPU towers and build a single Adam update op
    grads = average_grads(gpu_grads)
    grads = [g for g, p in grads]
    train = adam(params, grads, lr, lr_schedule, n_updates_total, warmup=lr_warmup,
                 l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2,
                 b1=b1, b2=b2, e=e)
    return [train] + ops
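# The multi-GPU helpers used above (assign_to_gpu, find_trainable_variables,
# average_grads) are defined elsewhere in the codebase. What follows is only a
# minimal sketch of what they are assumed to do (TF 1.x imported as tf), not the
# actual implementation.
def assign_to_gpu(gpu=0, ps_dev="/gpu:0"):
    # device function: keep variable ops on ps_dev, place every other op on the given GPU
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op in ("Variable", "VariableV2", "VarHandleOp"):
            return ps_dev
        return "/gpu:%d" % gpu
    return _assign

def find_trainable_variables(key):
    # trainable variables whose name starts with the given scope prefix, e.g. "model"
    return [v for v in tf.trainable_variables() if v.name.startswith(key)]

def average_grads(tower_grads):
    # average each parameter's gradient over the per-GPU (gradient, variable) lists
    averaged = []
    for grad_and_vars in zip(*tower_grads):
        grads = [g for g, _ in grad_and_vars]
        grad = tf.add_n(grads) / float(len(grads))
        averaged.append((grad, grad_and_vars[0][1]))
    return averaged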
def mgpu_train(*xs):
    gpu_ops = []
    gpu_grads = []
    # split every input tensor across the GPUs along the batch dimension
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    for i, xs in enumerate(zip(*xs)):
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            s_preds, e_preds, qa_losses = model(
                *xs, n_word=n_word, n_char=n_char, n_pred=n_pred,
                n_wembd=n_wembd, n_cembd=n_cembd, units=units,
                embd_pdrop=embd_pdrop, n_head=n_head, attn_pdrop=attn_pdrop,
                resid_pdrop=resid_pdrop, train=True, reuse=do_reuse)
            train_loss = tf.reduce_mean(qa_losses)
            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            gpu_ops.append([s_preds, e_preds, qa_losses])
    # concatenate the per-GPU outputs back into full-batch tensors
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    # average gradients over the GPU towers and build a single Adam update op
    grads = average_grads(gpu_grads)
    grads = [g for g, p in grads]
    train = adam(params, grads, lr, lr_schedule, n_updates_total, warmup=lr_warmup,
                 l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2,
                 b1=b1, b2=b2, e=e)
    return [train] + ops
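# mgpu_predict, called in the graph-construction block below, is not shown in
# this section. A minimal sketch, assuming it mirrors mgpu_train for the language
# model but runs forward-only with reuse=True and no gradient or optimizer ops:
def mgpu_predict(*xs):
    gpu_ops = []
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    for i, xs in enumerate(zip(*xs)):
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=True):
            logits, lm_losses = model(*xs, units=units, n_vocab=n_vocab,
                                      n_special=n_special, n_embd=n_embd,
                                      embd_pdrop=embd_pdrop, train=False, reuse=True)
            gpu_ops.append([logits, lm_losses])
    # concatenate the per-GPU outputs back into full-batch tensors
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    return ops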
# placeholders
X_train = tf.placeholder(tf.int32, [n_batch_train, n_ctx])
M_train = tf.placeholder(tf.float32, [n_batch_train, n_ctx])
X = tf.placeholder(tf.int32, [None, n_ctx])
M = tf.placeholder(tf.float32, [None, n_ctx])

# multi-GPU train and predict ops
train, logits, lm_losses = mgpu_train(X_train, M_train)
lm_loss = tf.reduce_mean(lm_losses)
eval_mgpu_logits, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train)
eval_logits, eval_lm_losses = model(X, M, units=units, n_vocab=n_vocab,
                                    n_special=n_special, n_embd=n_embd,
                                    embd_pdrop=embd_pdrop, train=False, reuse=True)
eval_lm_loss = tf.reduce_mean(eval_lm_losses)
eval_mgpu_lm_loss = tf.reduce_mean(eval_mgpu_lm_losses)

# params
params = find_trainable_variables('model')
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
sess.run(tf.global_variables_initializer())

# restore the best saved params if a previous checkpoint exists
if use_prev_best and os.path.isfile(os.path.join(save_dir, desc, 'best_params.jl')):
    sess.run([p.assign(ip) for p, ip in zip(
        params, joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))])
else:
    # otherwise initialize the embedding matrix from the pretrained model
    #emb = np.concatenate([np.load('{}params_{}.npy'.format(pretrained_lm_dir, n)) for n in range(3)], 0)[393216:31480320].reshape((40478, 768))
    emb = np.load('{}elmo_768_40478_matrix.npy'.format(elmo_dir))
    # append randomly initialized rows for the special tokens
    emb = np.concatenate([emb, (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32)], 0)
    sess.run(params[0].assign(emb))
    del emb

# train, eval, test
n_updates = 0
n_epochs = 0
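# A minimal sketch of how the ops built above would be driven; the actual
# train/eval/test loop is not part of this section. trX and trM (numpy arrays of
# token ids and masks) and n_iter (number of epochs) are hypothetical names used
# only for illustration.
for _ in range(n_iter):
    for start in range(0, len(trX), n_batch_train):
        xmb = trX[start:start + n_batch_train]
        mmb = trM[start:start + n_batch_train]
        # mgpu_train splits the batch across n_gpu devices, so only full batches are fed
        if len(xmb) < n_batch_train:
            continue
        cost, _ = sess.run([lm_loss, train], {X_train: xmb, M_train: mmb})
        n_updates += 1
    n_epochs += 1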