Example #1
def mgpu_train(*xs):
    gpu_ops = []
    gpu_grads = []
    # Split each input tensor along the batch dimension, one shard per GPU.
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    # Build one model replica ("tower") per GPU; note that `xs` is rebound
    # here to the tuple of input shards for the current GPU.
    for i, xs in enumerate(zip(*xs)):
        # Share weights across towers: reuse variables on every GPU after the first.
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            clf_logits, clf_losses, lm_losses = model(*xs,
                                                      clf_token=clf_token,
                                                      n_vocab=n_vocab,
                                                      n_class=n_class,
                                                      n_special=n_special,
                                                      n_ctx=n_ctx,
                                                      n_embd=n_embd,
                                                      embd_pdrop=embd_pdrop,
                                                      n_layer=n_layer,
                                                      n_head=n_head,
                                                      attn_pdrop=attn_pdrop,
                                                      resid_pdrop=resid_pdrop,
                                                      clf_pdrop=clf_pdrop,
                                                      train=True,
                                                      reuse=do_reuse)
            # Classification loss, optionally plus an auxiliary
            # language-modeling loss weighted by lm_coef.
            if lm_coef > 0:
                train_loss = tf.reduce_mean(
                    clf_losses) + lm_coef * tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            params = find_trainable_variables("model")
            # Collect this tower's gradients as (grad, param) pairs.
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            gpu_ops.append([clf_logits, clf_losses, lm_losses])
    # Re-concatenate per-tower outputs along the batch dimension and average
    # the gradients across GPUs before applying a single Adam update.
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    grads = average_grads(gpu_grads)
    grads = [g for g, p in grads]
    train = adam(params,
                 grads,
                 lr,
                 lr_schedule,
                 n_updates_total,
                 warmup=lr_warmup,
                 l2=l2,
                 max_grad_norm=max_grad_norm,
                 vector_l2=vector_l2,
                 b1=b1,
                 b2=b2,
                 e=e)
    return [train] + ops
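
The `average_grads` helper is not shown on this page. A minimal sketch of what such a helper conventionally does in this multi-tower pattern (the repository's actual helper may differ, e.g. in how it handles `None` gradients):

import tensorflow as tf

def average_grads(tower_grads):
    # tower_grads: one list of (grad, param) pairs per GPU.
    # Returns a single list of (grad, param) with each gradient
    # averaged over all towers.
    averaged = []
    for grads_and_param in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grads_and_param]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        param = grads_and_param[0][1]
        averaged.append((grad, param))
    return averaged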
Example #2
def mgpu_train(*xs):
    gpu_ops = []
    gpu_grads = []
    # Same multi-tower pattern as Example #1, applied to a QA model that
    # predicts answer-span start/end positions.
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    for i, xs in enumerate(zip(*xs)):
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(
                tf.get_variable_scope(), reuse=do_reuse):
            s_preds, e_preds, qa_losses = model(*xs,
                                                n_word=n_word,
                                                n_char=n_char,
                                                n_pred=n_pred,
                                                n_wembd=n_wembd,
                                                n_cembd=n_cembd,
                                                units=units,
                                                embd_pdrop=embd_pdrop,
                                                n_head=n_head,
                                                attn_pdrop=attn_pdrop,
                                                resid_pdrop=resid_pdrop,
                                                train=True,
                                                reuse=do_reuse)
            # Single QA loss; unlike Example #1 there is no auxiliary LM term.
            train_loss = tf.reduce_mean(qa_losses)
            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            gpu_ops.append([s_preds, e_preds, qa_losses])
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    grads = average_grads(gpu_grads)
    grads = [g for g, p in grads]
    train = adam(params,
                 grads,
                 lr,
                 lr_schedule,
                 n_updates_total,
                 warmup=lr_warmup,
                 l2=l2,
                 max_grad_norm=max_grad_norm,
                 vector_l2=vector_l2,
                 b1=b1,
                 b2=b2,
                 e=e)
    return [train] + ops
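
Both examples pass a device function from `assign_to_gpu(i, "/gpu:0")` to `tf.device`. The helper is not shown here either; a minimal sketch of the usual TF1 device-function pattern, assuming the second argument names the device that holds the shared variables:

import tensorflow as tf

def assign_to_gpu(gpu=0, ps_dev="/device:CPU:0"):
    # Returns a device function for tf.device(): variables live on ps_dev
    # so all towers share one copy; every other op runs on the chosen GPU.
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op in ("Variable", "VariableV2"):
            return ps_dev
        return "/gpu:%d" % gpu
    return _assign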
Example #3
# placeholders: fixed-size batch for the multi-GPU training graph,
# variable-size batch for single-device evaluation
X_train = tf.placeholder(tf.int32, [n_batch_train, n_ctx])
M_train = tf.placeholder(tf.float32, [n_batch_train, n_ctx])
X = tf.placeholder(tf.int32, [None, n_ctx])
M = tf.placeholder(tf.float32, [None, n_ctx])
# mgpu train and predict
train, logits, lm_losses = mgpu_train(X_train, M_train)
lm_loss = tf.reduce_mean(lm_losses)
eval_mgpu_logits, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train)
eval_logits, eval_lm_losses = model(X, M, units=units, n_vocab=n_vocab,
                                    n_special=n_special, n_embd=n_embd,
                                    embd_pdrop=embd_pdrop, train=False,
                                    reuse=True)
eval_lm_loss = tf.reduce_mean(eval_lm_losses)
eval_mgpu_lm_loss = tf.reduce_mean(eval_mgpu_lm_losses)
# params
params = find_trainable_variables('model')
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
sess.run(tf.global_variables_initializer())
# restore saved params if available; otherwise initialize the embedding matrix
if use_prev_best and os.path.isfile(os.path.join(save_dir, desc, 'best_params.jl')):
    sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))])
else:
    # get the embedding matrix of the pretrained model
    #emb = np.concatenate([np.load('{}params_{}.npy'.format(pretrained_lm_dir, n)) for n in range(3)], 0)[393216:31480320].reshape((40478,768))
    emb = np.load('{}elmo_768_40478_matrix.npy'.format(elmo_dir))
    # append randomly initialized rows for the special tokens
    emb = np.concatenate([emb, (np.random.randn(n_special, n_embd)*0.02).astype(np.float32)], 0)
    sess.run(params[0].assign(emb))
    del emb
# train, eval, test
n_updates = 0
n_epochs = 0
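
The snippet ends just before the epoch loop. A hypothetical driver, purely illustrative: `trX`, `trM`, `n_iter`, and the `iter_batches` helper are assumptions, not part of the original script:

def iter_batches(X, M, n_batch):
    # Hypothetical helper: yield contiguous minibatches, dropping the
    # remainder because X_train/M_train have a fixed batch dimension.
    for i in range(0, len(X) - n_batch + 1, n_batch):
        yield X[i:i + n_batch], M[i:i + n_batch]

for epoch in range(n_iter):
    for xmb, mmb in iter_batches(trX, trM, n_batch_train):
        cost, _ = sess.run([lm_loss, train], {X_train: xmb, M_train: mmb})
        n_updates += 1
    n_epochs += 1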