def run_lstm_variant(variant='SlowLSTM', cuda=False, size=128, jit=False):
    """Time one cell from ``lstm_variants`` unrolled over SEQ_LEN steps.

    The requested variant is instantiated with a fixed seed, fed random
    (1, BATCH, size) inputs, and timed for a fixed number of iterations.
    Returns the populated ``Bench`` timer.
    """
    assert variant in lstms
    cfg = AttrDict({'cuda': cuda, 'lstm_kind': variant, 'size': size})
    name = '{}_size{}{}{}'.format(variant, size, tag(cuda=cuda), tag(jit=jit))

    def to_device(tensor):
        # Move to GPU only when the benchmark was requested with cuda=True.
        if cfg.cuda:
            tensor = tensor.cuda()
        return tensor

    cell_cls = getattr(lstm_variants, cfg.lstm_kind)
    inp = V(to_device(th.rand(1, BATCH, cfg.size)))
    state0 = (
        V(to_device(th.rand(1, BATCH, cfg.size))),
        V(to_device(th.rand(1, BATCH, cfg.size))),
    )

    # Seed right before constructing the cell so its random init is reproducible.
    th.manual_seed(1234)
    cell = to_device(cell_cls(cfg.size, cfg.size, dropout=DROPOUT, jit=jit))
    if hasattr(cell, 'mask'):
        cell.mask = to_device(cell.mask)

    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=3)

    # Super slow on CPU
    iters = 20 if cuda else 6

    for _ in range(iters):
        gc.collect()
        with iter_timer:
            out, state = inp, state0
            for _step in range(SEQ_LEN):
                out, state = cell(out, state)

    return iter_timer
def run_bnlstm(hidden_size=100, max_length=784, pmnist=False, num_batches=5,
               cuda=False, jit=False, warmup=10, benchmark=20):
    """Benchmark a batch-normalized LSTM classifier on zero-filled MNIST-shaped data.

    Builds a small model (BNLSTM encoder + linear head), runs ``num_batches``
    forward+backward passes over all-zero inputs/targets, and returns the
    populated ``Bench`` timer.

    NOTE(review): ``warmup`` and ``benchmark`` appear unused here — iteration
    count comes from ``num_batches`` and warmup from Bench's warmup_iters=2;
    confirm whether they should be wired in.
    """
    name = 'bnlstm{}{}'.format(tag(cuda=cuda), tag(jit=jit))
    iter_timer = Bench(name, cuda=cuda, warmup_iters=2)

    # The CPU version is slow...
    batch_size = 20 if cuda else 5

    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.rnn = bnlstm.LSTM(cell_class=bnlstm.BNLSTMCell,
                                   input_size=1,
                                   hidden_size=hidden_size,
                                   batch_first=True,
                                   max_length=max_length,
                                   jit=jit)
            self.fc = nn.Linear(in_features=hidden_size,
                                out_features=10)  # 10 digits in mnist

        def forward(self, data):
            # For non-permuted MNIST, start from a small random hidden state;
            # otherwise let the RNN use its default initial state.
            hx = None
            if not pmnist:
                h0 = Variable(data.data.new(data.size(0), hidden_size)
                              .normal_(0, 0.1))
                c0 = Variable(data.data.new(data.size(0), hidden_size)
                              .normal_(0, 0.1))
                hx = (h0, c0)
            # Classify from the final hidden state of the sequence.
            _, (h_n, _) = self.rnn(input_=data, hx=hx)
            logits = self.fc(h_n[0])
            return logits

    def cast(tensor):
        return tensor.cuda() if cuda else tensor

    model = Model()
    criterion = nn.CrossEntropyLoss()

    # All-zero batches: benchmark measures compute, not data realism.
    data_batches = [Variable(cast(torch.zeros(batch_size, 28 * 28, 1)))
                    for _ in range(num_batches)]
    target_batches = [Variable(cast(torch.zeros(batch_size)).long())
                      for _ in range(num_batches)]

    if cuda:
        model.cuda()
        criterion.cuda()

    total_loss = 0
    for data, targets in zip(data_batches, target_batches):
        gc.collect()
        with iter_timer:
            logits = model(data)
            loss = criterion(input=logits, target=targets)
            loss.backward()
            total_loss += float(loss.data.item())  # CUDA sync point

    return iter_timer
def run_qrnn(batch_size=20, input_size=128, seq_len=20, warmup=10,
             benchmark=10, hidden_size=256, num_layers=10,
             use_kernel=False, jit=False, cuda=False):
    """Time forward+backward of a multi-layer QRNN over random batches.

    ``use_kernel`` (CUDA-only) and ``jit`` are mutually exclusive.
    Returns the populated ``Bench`` timer.
    """
    assert not (use_kernel and jit)
    if use_kernel:
        # The custom kernel path only exists on GPU.
        assert cuda

    benchmark_init(0, 0, True)

    name = 'qrnn{}{}{}'.format(tag(cuda=cuda), tag(jit=jit),
                               tag(kernel=use_kernel))
    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=warmup)

    niters = warmup + benchmark
    shape = (seq_len, batch_size, input_size)
    device = torch.device('cuda:0') if cuda else torch.device('cpu')

    # Pre-generate one input per iteration so data creation stays untimed.
    batches = [torch.rand(shape, requires_grad=True, device=device)
               for _ in range(niters)]

    qrnn = QRNN(input_size, hidden_size, num_layers=num_layers,
                dropout=0.4, use_kernel=use_kernel, jit=jit).to(device)

    for batch in batches:
        gc.collect()
        with iter_timer:
            output, hidden = qrnn(batch)
            output.sum().backward()

    return iter_timer
def run_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
             seq_len=None, warmup=10, benchmark=20,
             autograd=False, variable=False, fused=False, jit=False,
             backward=False, skip_cpu_governor_check=False):
    """Benchmark a single-cell CUDA LSTM unrolled over ``seq_len`` steps.

    Selects one of three implementations (fused, jit-traced, or plain
    unfused) and times ``warmup + benchmark`` iterations, optionally with
    a backward pass. Returns the populated ``Bench`` timer.
    """
    # jit tracing and backward both require autograd-tracked tensors.
    if jit:
        autograd = True
    if backward:
        autograd = True
    if seq_len is None:
        # Backward runs are much more expensive, so use a shorter unroll.
        seq_len = 32 if backward else 512
    assert not (jit and fused)
    assert not (variable and autograd)

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    if variable:
        # "variable" mode wraps in Variable but deliberately never tracks
        # gradients, regardless of the requires_grad argument.
        V = lambda x, requires_grad=False: Variable(x, requires_grad=False)
    elif autograd:
        V = lambda x, requires_grad=False: Variable(x, requires_grad=requires_grad)
    else:
        V = lambda x, requires_grad=False: x

    input = V(torch.randn(batch_size, input_size).cuda(device=gpu))
    hx0 = V(torch.randn(batch_size, hidden_size).cuda(device=gpu),
            requires_grad=True)
    cx0 = V(torch.randn(batch_size, hidden_size).cuda(device=gpu),
            requires_grad=True)
    w_ih = V(t_def(torch.randn(4 * hidden_size, input_size)).cuda(device=gpu),
             requires_grad=True)
    w_hh = V(t_def(torch.randn(4 * hidden_size, hidden_size)).cuda(device=gpu),
             requires_grad=True)

    if fused:
        if backward:
            print("using fused_autograd_lstm")
            lstm = fused_autograd_lstm
        else:
            print("using fused_forward_lstm")
            # Fix: the original also assigned fused_autograd_lstm here and
            # immediately overwrote it — dead code removed.
            lstm = fused_lstm
    elif jit:
        print("tracing an unfused lstm")
        lstm = wrap_hidden(
            torch.jit.trace(input, hx0, cx0, w_ih, w_hh)(_unfused_lstm))
    else:
        print("using unfused lstm")
        lstm = wrap_hidden(_unfused_lstm)

    name = 'lstm_cuda{}{}{}'.format(tag(autograd=autograd), tag(fused=fused),
                                    tag(jit=jit))
    iter_timer = Bench(name=name, cuda=True, warmup_iters=warmup)

    for i in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx, cx = hx0, cx0
            for j in range(seq_len):
                hx, cx = lstm(input, (hx, cx), w_ih, w_hh)
            if backward:
                hx.sum().backward()

    return iter_timer