def test_windowed_ra():
    num_heads = 4
    d_model = 64
    rpr_k = 1
    batchsize = 2
    nctx = 256
    d_k = d_model // num_heads
    with tf.device("/cpu:0"):
        old = SeqScaledDotProductRelativeAttention(pdrop=0.)
        new = SeqScaledWindowedRelativeAttention(pdrop=0.)
        rpr_key_emb = tf.keras.layers.Embedding(2 * rpr_k + 1, d_k)
        rpr_value_emb = tf.keras.layers.Embedding(2 * rpr_k + 1, d_k)
        Q = tf.random.normal([batchsize, num_heads, nctx, d_k])
        K = tf.random.normal([batchsize, num_heads, nctx, d_k])
        V = tf.random.normal([batchsize, num_heads, nctx, d_k])
        lengths = tf.random.uniform([batchsize], 0, nctx, dtype=tf.int32)
        seq_mask = tf.sequence_mask(lengths, maxlen=nctx, dtype=tf.float32)
        in_mask = tf.expand_dims(tf.expand_dims(seq_mask, 1), 1)
        out_mask = tf.expand_dims(tf.expand_dims(seq_mask, 1), -1)
        # manually create a ra_mask to prevent attention beyond rpr_k
        ones = tf.ones([nctx, nctx])
        ra_mask = tf.linalg.band_part(ones, rpr_k, rpr_k)
        mask = in_mask * tf.expand_dims(tf.expand_dims(ra_mask, 0), 0)
        rpr_key_old, rpr_value_old = make_rpr(rpr_key_emb, rpr_value_emb, rpr_k, nctx)
        SET_TRAIN_FLAG(False)
        out_old = old((Q, K, V, rpr_key_old, rpr_value_old, mask))
        out_old = masked_fill(out_old, tf.equal(out_mask, 0), 1)
        print(out_old.shape)
        # using the windowed relative attention with the original sequence mask
        rpr_key_new, rpr_value_new = unfold_rpr(rpr_key_emb, rpr_value_emb, rpr_k)
        out_new = new((Q, K, V, rpr_key_new, rpr_value_new, in_mask))
        out_new = masked_fill(out_new, tf.equal(out_mask, 0), 1)
        print(out_new.shape)
        if get_version(tf) < 2:
            with tf.compat.v1.Session() as sess:
                out_old, out_new = sess.run([out_old, out_new])
        else:
            out_old, out_new = out_old.numpy(), out_new.numpy()
        assert np.allclose(out_old, out_new, atol=1e-6)
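# A small illustrative sketch (not part of the test suite) of the ra_mask built
# above with tf.linalg.band_part: keeping rpr_k sub- and super-diagonals leaves
# each position free to attend only within a window of width 2 * rpr_k + 1.
def _band_mask_sketch():
    import tensorflow as tf
    rpr_k, nctx = 1, 5
    ra_mask = tf.linalg.band_part(tf.ones([nctx, nctx]), rpr_k, rpr_k)
    # [[1 1 0 0 0]
    #  [1 1 1 0 0]
    #  [0 1 1 1 0]
    #  [0 0 1 1 1]
    #  [0 0 0 1 1]]
    print(ra_mask.numpy().astype(int))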
def test_scaled_attn_value(qkv):
    q, k, v = qkv
    with tf.device("/cpu:0"):
        q = tf.zeros_like(q)
        scaled_dot_product_attention = SeqScaledDotProductAttention(0.0)
        res = scaled_dot_product_attention((q, k, v, None))
    if get_version(tf) < 2:
        with tf.compat.v1.Session() as sess:
            res, gold = sess.run([res, v])
    else:
        res, gold = res.numpy(), v.numpy()
    B, H, T, _ = q.get_shape().as_list()
    for b in range(B):
        for h in range(H):
            for t in range(T):
                np.testing.assert_allclose(res[b, h, t, :], np.mean(gold, axis=2)[b, h, :], atol=1e-5)
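# A minimal NumPy sketch (not part of the test suite) of why the assertion above
# holds: with q == 0 every attention logit is zero, the softmax over keys is
# uniform (1/T), so each output position is just the mean of V over time.
def _uniform_attention_sketch():
    import numpy as np
    B, H, T, D = 2, 4, 8, 16
    v = np.random.rand(B, H, T, D).astype(np.float32)
    scores = np.zeros((B, H, T, T), dtype=np.float32)  # q @ k^T with q == 0
    weights = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)  # uniform 1/T
    out = weights @ v  # every timestep attends equally, yielding mean(v, axis=2)
    np.testing.assert_allclose(out[0, 0, 0], v.mean(axis=2)[0, 0], atol=1e-6)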
import string
from collections import namedtuple

import pytest
import numpy as np
from eight_mile.utils import get_version
from eight_mile.embeddings import RandomInitVecModel

tf = pytest.importorskip('tensorflow')
pytestmark = pytest.mark.skipif(get_version(tf) < 2, reason='TF1.X')
from eight_mile.utils import Offsets


def test_rnn_decode_shapes():
    from baseline.tf.embeddings import LookupTableEmbeddingsModel
    from baseline.tf.seq2seq.decoders import RNNDecoder

    # use the namedtuple class as a lightweight namespace for the encoder output
    encoder = namedtuple("EncoderOutput", "output src_mask")
    batchsz = 2
    temporal = 7
    temporal_output = 4
    hsz = 20
    dsz = 10
    layers = 1
    # Always pick the right path
    wv = RandomInitVecModel(dsz, {k: 1 for k in list(string.ascii_letters)})
    assert len(string.ascii_letters) + len(Offsets.VALUES) == wv.get_vsz()
    encoder.output = tf.cast(np.random.randn(batchsz, temporal, hsz), dtype=tf.float32)
    encoder.hidden = (
        tf.cast(np.random.randn(layers, batchsz, hsz), dtype=tf.float32),
        tf.cast(np.random.randn(layers, batchsz, hsz), dtype=tf.float32),
    )
    encoder.src_mask = np.zeros((batchsz, temporal), dtype=np.uint8)
import os

import pytest
import numpy as np

tf = pytest.importorskip('tensorflow')
from eight_mile.utils import get_version

pytestmark = pytest.mark.skipif(get_version(tf) >= 2, reason='tf2.0')

from baseline.tf.tfy import tie_weight


@pytest.fixture(scope="module")
def set_cpu():
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    yield
    del os.environ['CUDA_VISIBLE_DEVICES']


def test_sharing():
    # For some reason I can't get this to stop trying to use the gpu which causes it to fail
    with tf.device('/cpu:0'):
        input_ = tf.compat.v1.placeholder(tf.int32, shape=[None])
        weight = tf.get_variable("weight", shape=[100, 200], initializer=tf.random_normal_initializer())
        embed = tf.nn.embedding_lookup(weight, input_)
        tie_shape = [weight.get_shape()[-1], weight.get_shape()[0]]
        with tf.variable_scope("Share", custom_getter=tie_weight(weight, tie_shape)):
            layer = tf.layers.Dense(
                100,
import os
import json

import pytest
import numpy as np

from eight_mile.utils import get_version

tf = pytest.importorskip("tensorflow")
pytestmark = pytest.mark.skipif(get_version(tf) >= 2, reason="TF2.0")

from eight_mile.optz import (
    create_lr_scheduler,
    ConstantScheduler,
    WarmupLinearScheduler,
    CyclicLRScheduler,
    PiecewiseDecayScheduler,
    ZarembaDecayScheduler,
    CosineDecayScheduler,
    InverseTimeDecayScheduler,
    ExponentialDecayScheduler,
)


@pytest.fixture(scope="module")
def set_cpu():
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    yield
    del os.environ["CUDA_VISIBLE_DEVICES"]


INIT_LR = 1.2
NUM_STEPS = 1000
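# A hedged reference for one scheduler family this module exercises: with
# staircase=False, exponential decay follows
# lr(t) = INIT_LR * decay_rate ** (t / decay_steps).
# The helper below is an illustrative expected-value function (the parameter
# defaults are assumptions), not part of the original test module.
def _expected_exponential_decay(step, decay_steps=100, decay_rate=0.9):
    return INIT_LR * decay_rate ** (step / decay_steps)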
def train(self, ts, reporting_fns):
    """Train by looping over the steps in the training set.

    The training set is backed by `tf.data.Dataset`s previously wired into the
    model, so we simply iterate the batches it yields, accumulating a
    token-weighted loss.

    :param ts: The training set
    :param reporting_fns: A list of reporting hooks
    :return: Metrics
    """
    SET_TRAIN_FLAG(True)
    epoch_loss = tf.Variable(0.0)
    epoch_div = tf.Variable(0, dtype=tf.int32)
    nstep_loss = tf.Variable(0.0)
    nstep_div = tf.Variable(0, dtype=tf.int32)
    self.nstep_start = time.perf_counter()
    start = time.perf_counter()

    def _train_step_no_state(inputs):
        """Replicated training step."""
        features, y = inputs
        loss = self.optimizer.update(self.model, features, y)
        toks = self._num_toks(y)
        report_loss = loss * tf.cast(toks, tf.float32)
        return report_loss, toks

    def _train_step_with_state(inputs, hidden):
        """Replicated training step."""
        features, y = inputs
        loss, hidden = self.optimizer.update_with_hidden(self.model, hidden, features, y)
        toks = self._num_toks(y)
        report_loss = loss * tf.cast(toks, tf.float32)
        return hidden, report_loss, toks

    if get_version(tf) >= 2:
        _train_step_with_state = tf.function(_train_step_with_state)
        _train_step_no_state = tf.function(_train_step_no_state)

    h = None
    for inputs in ts:
        if self.model.requires_state:
            h, step_report_loss, step_toks = _train_step_with_state(inputs, h)
        else:
            step_report_loss, step_toks = _train_step_no_state(inputs)

        epoch_loss.assign_add(step_report_loss)
        nstep_loss.assign_add(step_report_loss)
        epoch_div.assign_add(step_toks)
        nstep_div.assign_add(step_toks)

        step = self.optimizer.global_step.numpy() + 1
        if step % self.nsteps == 0:
            metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
            self.report(step, metrics, self.nstep_start, 'Train', 'STEP', reporting_fns, self.nsteps)
            nstep_loss.assign(0.0)
            nstep_div.assign(0)
            self.nstep_start = time.perf_counter()

    epoch_loss = epoch_loss.numpy()
    epoch_div = epoch_div.numpy()
    metrics = self.calc_metrics(epoch_loss, epoch_div)
    self.train_epochs += 1
    self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH', reporting_fns)
    return metrics
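# A hedged sketch (illustrative names, not the trainer's API) of the token-weighted
# averaging the loop above performs: each batch loss is scaled by its token count
# before accumulation, so calc_metrics can recover a true per-token average even
# when batches contribute different numbers of tokens.
def _per_token_loss_sketch(batch_losses, batch_toks):
    weighted = sum(loss * toks for loss, toks in zip(batch_losses, batch_toks))
    return weighted / sum(batch_toks)

# e.g. batches of 10 and 30 tokens with mean losses 2.0 and 1.0:
# (2.0 * 10 + 1.0 * 30) / 40 == 1.25, not the naive batch mean of 1.5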
    res = dot_product_attention((q, k, v, None))
    if get_version(tf) < 2:
        with tf.compat.v1.Session() as sess:
            res, gold = sess.run([res, v])
    else:
        res, gold = res.numpy(), v.numpy()
    B, H, T, _ = q.get_shape().as_list()
    for b in range(B):
        for h in range(H):
            for t in range(T):
                np.testing.assert_allclose(res[b, h, t, :], np.mean(gold, axis=2)[b, h, :], atol=1e-5)


@pytest.mark.skipif(get_version(tf) < 2, reason="needs tf2")
def test_attn_value_seq_mask(qkv):
    q, k, v = qkv
    with tf.device("/cpu:0"):
        B, H, T, _ = q.get_shape().as_list()
        q = tf.zeros_like(q)
        lens = np.random.randint(1, T, size=B).astype(np.int32)
        tf_lens = tf.constant(lens)
        mask = tf.expand_dims(tf.expand_dims(tf.sequence_mask(tf_lens, T, dtype=tf.float32), 1), 1)
        dot_product_attention = SeqDotProductAttention(0.0)
        res = dot_product_attention((q, k, v, mask))
        res, gold = res.numpy(), v.numpy()
        for b in range(B):
            for h in range(H):
from eight_mile.tf.layers import SET_TRAIN_FLAG, get_shape_as_list, autograph_options, masked_fill
from eight_mile.tf.optz import *
from baseline.tf.tfy import setup_tf2_checkpoints
from baseline.utils import get_model_file, get_metric_cmp
from baseline.train import EpochReportingTrainer, register_trainer, register_training_func
from baseline.model import create_model_for
import numpy as np

# Number of batches to prefetch if using tf.datasets
NUM_PREFETCH = 2
# The shuffle buffer size
SHUF_BUF_SZ = 5000

log = logging.getLogger('baseline.timing')

TF_VERSION = get_version(tf)
if TF_VERSION < 2:
    tf.enable_eager_execution()


def to_tensors(ts, lengths_key):
    """Convert a data feed into a tuple of `features` (`dict`) and `y` values

    This method is required to produce `tf.dataset`s from the input data feed

    :param ts: The data feed to convert
    :param lengths_key: The key identifying the lengths feature
    :return: A `tuple` of `features` and `y` (labels)
    """
    keys = ts[0].keys()
    features = dict((k, []) for k in keys)
    for sample in ts:
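# A hedged sketch (hypothetical feature names and shapes, not the repo's code) of
# how the (features, y) pair produced by to_tensors typically feeds tf.data:
# from_tensor_slices accepts a (dict, array) tuple and yields one sample per row.
def _dataset_from_tensors_sketch():
    import numpy as np
    features = {
        "word": np.zeros((100, 40), dtype=np.int32),
        "word_lengths": np.full(100, 40, dtype=np.int32),
    }
    y = np.zeros(100, dtype=np.int32)
    ds = tf.data.Dataset.from_tensor_slices((features, y))
    return ds.shuffle(SHUF_BUF_SZ).batch(20).prefetch(NUM_PREFETCH)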
    assert out.size(i) == shape[i]


def test_vec_log_sum_exp_batch_stable():
    h = np.random.randint(22, 41)
    i1 = torch.rand(1, h, h)
    i2 = torch.rand(1, h, h)
    i = torch.cat([i1, i2], dim=0)
    lse1 = vec_log_sum_exp(i1, 2)
    lse2 = vec_log_sum_exp(i2, 2)
    one_x_one = torch.cat([lse1, lse2], dim=0)
    lse = vec_log_sum_exp(i, 2)
    np.testing.assert_allclose(one_x_one.numpy(), lse.numpy())


@pytest.mark.skipif(get_version(torch) <= 1.4, reason="Old ONNX")
def test_ONNX_export():
    ort = pytest.importorskip("onnxruntime")
    v = ViterbiBatchSize1(Offsets.GO, Offsets.EOS)
    B = 1
    T = np.random.randint(10, 100)
    H = np.random.randint(24, 76)
    unary = torch.rand(T, B, H)
    trans = torch.rand(1, H, H)
    length = torch.randint(1, T, size=(B,))
    p1, s1 = v(unary, trans, length)
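# An illustrative NumPy reference (not the library's implementation) for the
# vec_log_sum_exp semantics test_vec_log_sum_exp_batch_stable relies on: a
# max-shifted log-sum-exp over one axis, assumed here to keep the reduced dim
# so per-item results can be concatenated along the batch axis.
def _np_vec_log_sum_exp(x, axis):
    import numpy as np
    m = x.max(axis=axis, keepdims=True)
    return m + np.log(np.sum(np.exp(x - m), axis=axis, keepdims=True))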
    WarmupLearningRateScheduler,
    WarmupLinearScheduler,
    CyclicLRScheduler,
    PiecewiseDecayScheduler,
    ZarembaDecayScheduler,
    CosineDecayScheduler,
    InverseTimeDecayScheduler,
    ExponentialDecayScheduler,
    CompositeLRScheduler,
)

logger = logging.getLogger("mead.layers")

__all__ = []
export = exporter(__all__)


if get_version(tf) < 2:

    @register_lr_scheduler("default")
    class ConstantSchedulerTensorFlow1:
        def __init__(self, **kwargs):
            pass

        def __call__(self, lr, global_step):
            return tf.identity(lr, name="lr")

        def __str__(self):
            return type(self).__name__ + "()"

    @register_lr_scheduler("warmup_linear")
    class WarmupLinearSchedulerTensorFlow1(WarmupLearningRateScheduler):
        def __init__(self, **kwargs):