def run_cudnn_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
                   layers=1, seq_len=512, warmup=10, benchmark=30,
                   backward=False, skip_cpu_governor_check=False):
    """Time ``torch.nn.LSTM`` applied to a whole random sequence on a GPU.

    Args:
        cpu, gpu, skip_cpu_governor_check: forwarded to ``benchmark_init``;
            ``gpu`` is also the device index passed to every ``.cuda()`` call.
        batch_size, input_size, hidden_size, layers, seq_len: sizes of the
            random input ``(seq_len, batch_size, input_size)``, of the initial
            states ``(layers, batch_size, hidden_size)`` and of the module.
        warmup: passed to ``Bench`` as ``warmup_iters``.
        benchmark: number of iterations run in addition to ``warmup``.
        backward: when true, each timed iteration also sums the first value
            returned by the LSTM call and back-propagates from it.

    Returns:
        The ``Bench`` object that wrapped every iteration.
    """
    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    def random_on_gpu(*shape):
        # Wrapping in Variable is mandatory here (kept from the original).
        return Variable(torch.randn(*shape).cuda(gpu))

    state_shape = (layers, batch_size, hidden_size)
    sequence = random_on_gpu(seq_len, batch_size, input_size)
    h0 = random_on_gpu(*state_shape)
    c0 = random_on_gpu(*state_shape)

    model = torch.nn.LSTM(input_size, hidden_size, layers).cuda(gpu)
    model.flatten_parameters()

    iter_timer = Bench(name='lstm_cudnn', cuda=True, warmup_iters=warmup)

    total_iters = warmup + benchmark
    for _ in range(total_iters):
        gc.collect()
        with iter_timer:
            # The call returns a pair; only its first element is used.
            first, _second = model(sequence, (h0, c0))
            if backward:
                first.sum().backward()

    return iter_timer
def run_lstm_variant(variant='SlowLSTM', cuda=False, size=128, jit=False):
    """Time one of the classes exposed by ``lstm_variants``.

    Args:
        variant: attribute name looked up on ``lstm_variants``; must be a
            member of ``lstms`` (checked with ``assert``).
        cuda: move the inputs, the module and its ``mask`` attribute (when
            present) to CUDA, and tell ``Bench`` to time in CUDA mode.
        size: used as both constructor size arguments and as the last
            dimension of the random inputs.
        jit: forwarded to the variant constructor and reflected in the
            benchmark name.

    Returns:
        The ``Bench`` object that wrapped every iteration.
    """
    assert variant in lstms
    p = AttrDict({'cuda': cuda, 'lstm_kind': variant, 'size': size})

    name = '{}_size{}{}{}'.format(variant, size, tag(cuda=cuda), tag(jit=jit))

    def C(obj):
        # Move to the GPU only when requested.
        return obj.cuda() if p.cuda else obj

    def random_input():
        return V(C(th.rand(1, BATCH, p.size)))

    variant_cls = getattr(lstm_variants, p.lstm_kind)
    x = random_input()
    hiddens = (random_input(), random_input())

    # The seed is set after the inputs are drawn and before the module is
    # constructed.
    th.manual_seed(1234)
    cell = C(variant_cls(p.size, p.size, dropout=DROPOUT, jit=jit))
    if hasattr(cell, 'mask'):
        cell.mask = C(cell.mask)

    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=3)

    # Super slow on CPU, so run fewer iterations there.
    if cuda:
        iters = 20
    else:
        iters = 6

    for _ in range(iters):
        gc.collect()
        with iter_timer:
            out, state = x, hiddens
            for _step in range(SEQ_LEN):
                out, state = cell(out, state)

    return iter_timer
def run_bnlstm(hidden_size=100, max_length=784, pmnist=False, num_batches=5,
               cuda=False, jit=False, warmup=10, benchmark=20):
    """Time forward + backward of a ``bnlstm.LSTM`` followed by a linear layer.

    Args:
        hidden_size: hidden size of the recurrent layer and input width of the
            linear layer.
        max_length: forwarded to ``bnlstm.LSTM``.
        pmnist: when false, a random initial ``(h0, c0)`` drawn from
            ``normal_(0, 0.1)`` is passed to the recurrent layer; when true,
            ``hx=None`` is passed.
        num_batches: number of all-zero batches created; one timed iteration
            is run per batch.
        cuda: run model, criterion and data on CUDA.
        jit: forwarded to ``bnlstm.LSTM`` and reflected in the benchmark name.
        warmup, benchmark: accepted but not read by this function; the timer
            always uses ``warmup_iters=2``.

    Returns:
        The ``Bench`` object that wrapped every iteration.
    """
    name = 'bnlstm{}{}'.format(tag(cuda=cuda), tag(jit=jit))
    iter_timer = Bench(name, cuda=cuda, warmup_iters=2)

    # The CPU version is slow, so it gets a smaller batch.
    if cuda:
        batch_size = 20
    else:
        batch_size = 5

    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.rnn = bnlstm.LSTM(
                cell_class=bnlstm.BNLSTMCell,
                input_size=1,
                hidden_size=hidden_size,
                batch_first=True,
                max_length=max_length,
                jit=jit,
            )
            # 10 output classes: the digits in MNIST.
            self.fc = nn.Linear(in_features=hidden_size, out_features=10)

        def forward(self, data):
            if pmnist:
                hx = None
            else:
                rows = data.size(0)
                h0 = Variable(data.data.new(rows, hidden_size).normal_(0, 0.1))
                c0 = Variable(data.data.new(rows, hidden_size).normal_(0, 0.1))
                hx = (h0, c0)
            _, (h_n, _) = self.rnn(input_=data, hx=hx)
            return self.fc(h_n[0])

    def to_device(tensor):
        if cuda:
            return tensor.cuda()
        return tensor

    model = Model()
    criterion = nn.CrossEntropyLoss()

    data_batches = []
    for _ in range(num_batches):
        data_batches.append(
            Variable(to_device(torch.zeros(batch_size, 28 * 28, 1))))
    target_batches = []
    for _ in range(num_batches):
        target_batches.append(
            Variable(to_device(torch.zeros(batch_size)).long()))

    if cuda:
        model.cuda()
        criterion.cuda()

    total_loss = 0
    for batch_idx in range(len(data_batches)):
        data = data_batches[batch_idx]
        targets = target_batches[batch_idx]
        gc.collect()
        with iter_timer:
            logits = model(data)
            loss = criterion(input=logits, target=targets)
            loss.backward()
            # Reading the scalar back is the CUDA sync point.
            total_loss += float(loss.data.item())

    return iter_timer
def run_sru(cpu=0, gpu=0, jit=False, use_kernel=False, backward=False,
            warmup=10, benchmark=20):
    """Time a two-layer ``SRU`` on a fixed-size random CUDA input.

    Args:
        cpu, gpu: accepted but not read by this function;
            ``benchmark_init`` is always called with ``(0, 0, True)`` and the
            tensors use the default ``.cuda()`` device.
        jit, use_kernel: forwarded to ``SRU``; they may not both be true
            (checked with ``assert``).
        backward: when true, each timed iteration also back-propagates from
            the sum of the output.
        warmup: passed to ``Bench`` as ``warmup_iters``.
        benchmark: number of iterations run in addition to ``warmup``.

    Returns:
        The ``Bench`` object that wrapped every iteration.
    """
    assert not jit or not use_kernel
    benchmark_init(0, 0, True)

    # Input has length 20, batch size 32 and dimension 128.
    x = Variable(torch.rand(20, 32, 128).cuda())
    input_size = 128
    hidden_size = 128

    sru_options = dict(
        num_layers=2,          # number of stacking RNN layers
        dropout=0.00001,       # dropout applied between RNN layers
        rnn_dropout=0.0001,    # variational dropout applied on linear transformation
        use_tanh=1,            # use tanh?
        use_relu=0,            # use ReLU?
        bidirectional=False,   # bidirectional RNN ?
        use_kernel=use_kernel,
        jit=jit,
    )
    rnn = SRU(input_size, hidden_size, **sru_options)
    rnn.cuda()

    name_parts = ['sru']
    name_parts.append('_training' if backward else '_forward')
    name_parts.append('_kernel' if use_kernel else '')
    name_parts.append('_jit' if jit else '')
    name = ''.join(name_parts)

    iter_timer = Bench(cuda=True, name=name, warmup_iters=warmup)

    total_iters = warmup + benchmark
    for _ in range(total_iters):
        gc.collect()
        with iter_timer:
            # Forward pass. Per the original comments:
            # output is (length, batch size, hidden size * number of directions)
            # hidden is (layers, batch size, hidden size * number of directions)
            output, hidden = rnn(x)
            if backward:
                output.sum().backward()

    return iter_timer
def run_qrnn(batch_size=20, input_size=128, seq_len=20, warmup=10,
             benchmark=10, hidden_size=256, num_layers=10, use_kernel=False,
             jit=False, cuda=False):
    """Time forward + backward of a ``QRNN`` on pre-generated random batches.

    Args:
        batch_size, input_size, seq_len: each batch is a random tensor of
            shape ``(seq_len, batch_size, input_size)`` with
            ``requires_grad=True``.
        warmup: passed to ``Bench`` as ``warmup_iters``.
        benchmark: number of batches generated in addition to ``warmup``.
        hidden_size, num_layers: forwarded to ``QRNN``.
        use_kernel: forwarded to ``QRNN``; requires ``cuda`` and excludes
            ``jit`` (both checked with ``assert``).
        jit: forwarded to ``QRNN``.
        cuda: place data and model on ``cuda:0`` instead of the CPU.

    Returns:
        The ``Bench`` object that wrapped every iteration.
    """
    assert not use_kernel or not jit
    if use_kernel:
        assert cuda

    benchmark_init(0, 0, True)
    name = 'qrnn{}{}{}'.format(tag(cuda=cuda), tag(jit=jit),
                               tag(kernel=use_kernel))
    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=warmup)

    niters = warmup + benchmark
    size = (seq_len, batch_size, input_size)
    device = torch.device('cuda:0') if cuda else torch.device('cpu')

    # All inputs are drawn up front, before the model is constructed.
    batches = []
    for _ in range(niters):
        batches.append(torch.rand(size, requires_grad=True, device=device))

    qrnn = QRNN(input_size, hidden_size, num_layers=num_layers, dropout=0.4,
                use_kernel=use_kernel, jit=jit)
    qrnn = qrnn.to(device)

    for batch in batches:
        gc.collect()
        with iter_timer:
            output, hidden = qrnn(batch)
            output.sum().backward()

    return iter_timer
def run_tensor(broadcast=True):
    """Time an element-wise multiply of CPU tensors, with or without broadcast.

    Args:
        broadcast: when true, a ``1000 x 1000`` zero tensor is multiplied by a
            one-element zero tensor; otherwise it is multiplied by itself.

    Returns:
        The ``Bench`` object that wrapped the 20 iterations.
    """
    benchmark_init(0, 0, False)

    matrix = torch.zeros(1000, 1000)
    single = torch.zeros(1)

    def mul_broadcast():
        matrix * single

    def mul_same_shape():
        matrix * matrix

    if broadcast:
        name, fn = "mul_bcast", mul_broadcast
    else:
        name, fn = "mul_no_bcast", mul_same_shape

    iter_timer = Bench(name=name, cuda=False, warmup_iters=2)
    remaining = 20
    while remaining > 0:
        with iter_timer:
            fn()
        remaining -= 1

    return iter_timer
def run_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
             seq_len=None, warmup=10, benchmark=20, autograd=False,
             variable=False, fused=False, jit=False, backward=False,
             skip_cpu_governor_check=False):
    """Time a hand-written LSTM cell stepped ``seq_len`` times on a GPU.

    Args:
        cpu, gpu, skip_cpu_governor_check: forwarded to ``benchmark_init``;
            ``gpu`` is also the device passed to every ``.cuda()`` call.
        batch_size, input_size, hidden_size: sizes of the random input,
            states and weight matrices.
        seq_len: number of cell steps per iteration. ``None`` selects 32 when
            ``backward`` is set and 512 otherwise.
        warmup: passed to ``Bench`` as ``warmup_iters``.
        benchmark: number of iterations run in addition to ``warmup``.
        autograd: wrap tensors in ``Variable`` honouring ``requires_grad``.
            Forced on by ``jit`` or ``backward``.
        variable: wrap tensors in ``Variable`` with ``requires_grad=False``;
            incompatible with ``autograd`` (checked with ``assert``).
        fused: use the fused implementation; incompatible with ``jit``
            (checked with ``assert``).
        jit: trace ``_unfused_lstm`` with ``torch.jit.trace``.
        backward: also back-propagate from the sum of the final hidden state.

    Returns:
        The ``Bench`` object that wrapped every iteration.
    """
    if jit or backward:
        autograd = True

    if seq_len is None:
        seq_len = 32 if backward else 512

    assert not (jit and fused)
    assert not (variable and autograd)

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    if variable:
        def V(x, requires_grad=False):
            # "variable" mode deliberately never tracks gradients.
            return Variable(x, requires_grad=False)
    elif autograd:
        def V(x, requires_grad=False):
            return Variable(x, requires_grad=requires_grad)
    else:
        def V(x, requires_grad=False):
            return x

    x = V(torch.randn(batch_size, input_size).cuda(device=gpu))
    hx0 = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    cx0 = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    w_ih = V(t_def(torch.randn(4 * hidden_size, input_size)).cuda(device=gpu), requires_grad=True)
    w_hh = V(t_def(torch.randn(4 * hidden_size, hidden_size)).cuda(device=gpu), requires_grad=True)

    if fused:
        # Each branch binds exactly the implementation its message names.
        # The original assigned fused_autograd_lstm in the forward branch and
        # immediately overwrote it with fused_lstm.
        if backward:
            print("using fused_autograd_lstm")
            lstm = fused_autograd_lstm
        else:
            print("using fused_forward_lstm")
            lstm = fused_lstm
    elif jit:
        print("tracing an unfused lstm")
        lstm = wrap_hidden(torch.jit.trace(x, hx0, cx0, w_ih, w_hh)(_unfused_lstm))
    else:
        print("using unfused lstm")
        lstm = wrap_hidden(_unfused_lstm)

    name = 'lstm_cuda{}{}{}'.format(tag(autograd=autograd), tag(fused=fused),
                                    tag(jit=jit))
    iter_timer = Bench(name=name, cuda=True, warmup_iters=warmup)

    for _ in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx, cx = hx0, cx0
            for _step in range(seq_len):
                hx, cx = lstm(x, (hx, cx), w_ih, w_hh)
            if backward:
                hx.sum().backward()

    return iter_timer
def run_mlstm(cpu=0, gpu=0, batch_size=1, input_size=205, hidden_size=1900,
              embed_size=None, seq_len=20, warmup=10, benchmark=20,
              autograd=False, jit=False, backward=False,
              skip_cpu_governor_check=False):
    """Time ``mlstm_raw`` (optionally traced) stepped over a random sequence.

    Args:
        cpu, gpu, skip_cpu_governor_check: forwarded to ``benchmark_init``;
            ``gpu`` is also used to build the ``torch.device`` for all tensors.
        batch_size, input_size, hidden_size, seq_len: sizes of the random
            input, states and weights.
        embed_size: defaults to ``hidden_size`` when ``None``.
        warmup: passed to ``Bench`` as ``warmup_iters``.
        benchmark: number of iterations run in addition to ``warmup``.
        autograd: create all tensors with ``requires_grad=True``. Forced on
            by ``jit`` or ``backward``.
        jit: trace ``mlstm_raw`` with ``torch.jit.trace``.
        backward: also back-propagate from the sum of the final hidden state,
            then zero every tensor's ``.grad`` (inside the timed region).

    Returns:
        The ``Bench`` object that wrapped every iteration.
    """
    iter_timer = Bench(name="mlstm_jit" if jit else "mlstm", cuda=True,
                       warmup_iters=warmup)

    if embed_size is None:
        embed_size = hidden_size

    if jit or backward:
        autograd = True

    benchmark_init(cpu, gpu, skip_cpu_governor_check)
    requires_grad = autograd
    device = torch.device(gpu)

    def randn(*shape):
        return torch.randn(*shape, requires_grad=requires_grad, device=device)

    # Creation order is kept identical to the original.
    sequence = randn(seq_len, batch_size, input_size)
    hx = randn(batch_size, hidden_size)
    cx = randn(batch_size, hidden_size)
    w_xm = randn(embed_size, input_size)
    w_hm = randn(embed_size, hidden_size)
    w_ih = randn(4 * hidden_size, input_size)
    w_mh = randn(4 * hidden_size, embed_size)
    weights = (w_xm, w_hm, w_ih, w_mh)
    params = [sequence, hx, cx] + list(weights)

    mlstm = mlstm_raw
    if jit:
        mlstm = torch.jit.trace(sequence[0], hx, cx, *weights)(mlstm_raw)

    total_iters = warmup + benchmark
    for _ in range(total_iters):
        gc.collect()
        with iter_timer:
            h_t, c_t = hx, cx
            for step in range(seq_len):
                h_t, c_t = mlstm(sequence[step], h_t, c_t, *weights)
            if backward:
                h_t.sum().backward()
                for tensor in params:
                    tensor.grad.zero_()

    return iter_timer
def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False):
    """Time training epochs of a sequence labeller on a pickled WSJ subset.

    Each epoch iterates over ``minibatch(data, 5, True)``. For every example
    the labeller predicts one label per token, feeding its own previous
    prediction back in, and accumulates a summed squared-error loss against a
    cost vector that is 0 at the true label and 1 elsewhere. One optimizer
    step is taken per minibatch. The epoch's total loss is printed.

    Args:
        jit: forwarded to ``Model`` and reflected in the benchmark name.
        epochs: number of timed epochs.
        wsj_path: path of a pickle holding ``(data, n_types, n_labels)``.
        cuda: run the ``Model`` on CUDA (its output is copied back to the CPU;
            every other module stays on the CPU).

    Returns:
        The ``Bench`` object that wrapped every epoch.
    """
    jit_tag = '_jit' if jit else ''
    cuda_tag = '_cuda' if cuda else ''
    name = 'seqlab{}{}'.format(cuda_tag, jit_tag)
    # NOTE(review): the timer is created with cuda=False even when cuda=True;
    # the recurrent output is copied back with .cpu() inside the timed region.
    # Confirm this is deliberate.
    iter_timer = Bench(name=name, cuda=False, warmup_iters=2)

    # The original had a bare `print`, which is a no-op expression in
    # Python 3; emit the intended blank separator line explicitly.
    print('')
    print('# test on wsj subset')

    # NOTE: pickle.load can execute arbitrary code; only point wsj_path at a
    # trusted file. The context manager closes the handle the original leaked.
    with open(wsj_path, 'rb') as wsj_file:
        data, n_types, n_labels = pickle.load(wsj_file)

    d_emb = 50
    d_rnn = 51
    d_hid = 52
    d_actemb = 5

    minibatch_size = 5
    n_epochs = epochs
    preprocess_minibatch = True

    embed_word = nn.Embedding(n_types, d_emb)
    gru = Model(d_emb, d_rnn, jit=jit)
    if cuda:
        gru.cuda()
    embed_action = nn.Embedding(n_labels, d_actemb)
    combine_arh = nn.Linear(d_actemb + d_rnn * 2 + d_hid, d_hid)

    # Learnable initial hidden state and initial "previous action" embedding,
    # both starting at zero.
    initial_h = Parameter(torch.zeros(1, d_hid))
    initial_actemb = Parameter(torch.zeros(1, d_actemb))

    policy = nn.Linear(d_hid, n_labels)

    loss_fn = torch.nn.MSELoss(size_average=False)

    optimizer = torch.optim.Adam(
        list(embed_word.parameters()) +
        list(gru.parameters()) +
        list(embed_action.parameters()) +
        list(combine_arh.parameters()) +
        list(policy.parameters()) +
        [initial_h, initial_actemb],
        lr=0.01)

    for _ in range(n_epochs):
        gc.collect()
        total_loss = 0
        # Stays None unless the profiler context below is re-enabled.
        prof = None
        with iter_timer:
            # with torch.autograd.profiler.profile() as prof:
            for batch in minibatch(data, minibatch_size, True):
                optimizer.zero_grad()
                loss = 0

                if preprocess_minibatch:
                    # For efficiency, run the RNN on the entire minibatch in
                    # one go (requires padding with zeros, should be masked
                    # but isn't right now).
                    all_tokens = [ex.tokens for ex in batch]
                    max_length = max(map(len, all_tokens))
                    all_tokens = [
                        tok + [0] * (max_length - len(tok))
                        for tok in all_tokens
                    ]
                    all_e = embed_word(
                        Variable(torch.LongTensor(all_tokens),
                                 requires_grad=False))
                    if cuda:
                        all_rnn_out, _ = gru(all_e.cuda())
                        all_rnn_out = all_rnn_out.cpu()
                    else:
                        all_rnn_out, _ = gru(all_e)

                for ex in batch:
                    N = len(ex.tokens)
                    if preprocess_minibatch:
                        # NOTE(review): every example reads index 0 of
                        # all_rnn_out rather than a slice of its own.
                        # Confirm whether this is intended.
                        rnn_out = all_rnn_out[0, :, :].view(-1, 1, 2 * d_rnn)
                    else:
                        e = embed_word(
                            Variable(torch.LongTensor(ex.tokens),
                                     requires_grad=False)).view(N, 1, -1)
                        rnn_out, _ = gru(e)

                    prev_h = initial_h  # previous hidden state
                    actemb = initial_actemb  # embedding of previous action
                    output = []  # predictions, kept for the debug print below
                    for t in range(N):
                        # Update hidden state based on most recent
                        # *predicted* action (not ground truth).
                        inputs = [actemb, prev_h, rnn_out[t]]
                        h = F.relu(combine_arh(torch.cat(inputs, 1)))

                        # Make prediction: the label with the lowest
                        # predicted cost.
                        pred_vec = policy(h)
                        pred_vec = pred_vec.view(-1)
                        pred = pred_vec.argmin()
                        output.append(pred)

                        # Accumulate loss (squared error against costs).
                        truth = torch.ones(n_labels)
                        truth[ex.labels[t]] = 0
                        loss += loss_fn(pred_vec,
                                        Variable(truth, requires_grad=False))

                        # Cache hidden state, previous action embedding.
                        prev_h = h
                        actemb = embed_action(
                            Variable(torch.LongTensor([pred.item()]),
                                     requires_grad=False))

                    # print('output=%s, truth=%s' % (output, ex.labels))

                loss.backward()
                total_loss += float(loss)
                optimizer.step()
        if prof is not None:
            print(prof.key_averages())
        print(total_loss)

    return iter_timer
def run_memnn(warmup=2, benchmark=18, jit=False, cuda=False):
    """Time forward + backward of ``memnn.MemNN`` on synthetic batches.

    Args:
        warmup: passed to ``Bench`` as ``warmup_iters``.
        benchmark: number of batches generated in addition to ``warmup``.
        jit: trace the model with ``torch.jit.trace`` on the first batch
            (outside the timed region) and use the traced model afterwards.
        cuda: place the data and the criterion on ``cuda:0`` and time in CUDA
            mode; the flag is also handed to the model through ``params``.

    Returns:
        The ``Bench`` object that wrapped every iteration.
    """
    nbatches = warmup + benchmark

    params = AttrDict(dict(
        lr=0.01,
        embedding_size=128,
        hops=3,
        mem_size=100,
        time_features=False,
        position_encoding=True,
        output='rank',
        dropout=0.1,
        optimizer='adam',
        num_features=500,
        num_batches=nbatches,
        cuda=cuda,
    ))

    # --- Set up model. ---
    # The CPU version is slow... (the CUDA and CPU paths both use 4 today.)
    params['batch_size'] = 4

    device = torch.device('cuda:0') if params.cuda else torch.device('cpu')

    model = memnn.MemNN(params, params.num_features)
    criterion = nn.CrossEntropyLoss()

    def make_inputs():
        # memories, queries, memory_lengths, query_lengths
        memories = torch.zeros(params.batch_size * params.mem_size,
                               dtype=torch.long, device=device)
        queries = torch.zeros(params.batch_size * 28,
                              dtype=torch.long, device=device)
        memory_lengths = torch.ones(params.batch_size, params.mem_size,
                                    dtype=torch.long, device=device)
        query_lengths = torch.full((params.batch_size, ), 28,
                                   dtype=torch.long, device=device)
        return [memories, queries, memory_lengths, query_lengths]

    data_batches = [make_inputs() for _ in range(params.num_batches)]
    cand_batches = [
        torch.zeros(params.batch_size * 14, params.embedding_size,
                    device=device)
        for _ in range(params.num_batches)
    ]
    target_batches = [
        torch.ones(params.batch_size, dtype=torch.long, device=device)
        for _ in range(params.num_batches)
    ]

    # The model itself is not moved with .to(device): embeddings are
    # performed on CPU, and the memnn model takes care of things when it is
    # passed the cuda flag.
    criterion.to(device)

    # --- Time model. ---
    name = 'memnn{}{}'.format('_cuda' if cuda else '', '_jit' if jit else '')
    bench = Bench(name=name, cuda=cuda, warmup_iters=warmup)

    total_loss = 0
    batches = zip(data_batches, cand_batches, target_batches)
    for step, (data, cands, targets) in enumerate(batches):
        gc.collect()
        if jit and step == 0:
            # Trace exactly once, using the first batch as example input.
            model = torch.jit.trace(*data)(model)
        with bench:
            output_embeddings = model(*data)
            scores = one_to_many(output_embeddings, cands)
            loss = criterion(scores, targets)
            loss.backward()
            total_loss += float(loss.item())

    return bench