Example #1
def run_cudnn_lstm(cpu=0,
                   gpu=0,
                   batch_size=1,
                   input_size=256,
                   hidden_size=512,
                   layers=1,
                   seq_len=512,
                   warmup=10,
                   benchmark=30,
                   backward=False,
                   skip_cpu_governor_check=False):

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    def V(x):
        return Variable(x)  # wrap in the legacy autograd Variable API (a no-op on modern PyTorch)

    input = V(torch.randn(seq_len, batch_size, input_size).cuda(gpu))
    hx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
    cx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))

    lstm = torch.nn.LSTM(input_size, hidden_size, layers).cuda(gpu)
    lstm.flatten_parameters()

    iter_timer = Bench(name='lstm_cudnn', cuda=True, warmup_iters=warmup)

    for i in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            output, (hx_t, cx_t) = lstm(input, (hx, cx))
            if backward:
                output.sum().backward()

    return iter_timer
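A minimal usage sketch, assuming the surrounding harness (`benchmark_init`, `Bench`) is importable and a CUDA device is available; the keyword names come straight from the signature above:

timer_fwd = run_cudnn_lstm(batch_size=32, seq_len=128)
timer_train = run_cudnn_lstm(batch_size=32, seq_len=128, backward=True)
# Each call returns its Bench object; the harness presumably reads the
# recorded per-iteration timings from it (warmup iterations excluded).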
Example #2
def run_lstm_variant(variant='SlowLSTM', cuda=False, size=128, jit=False):
    assert variant in lstms
    p = AttrDict({'cuda': cuda, 'lstm_kind': variant, 'size': size})

    name = '{}_size{}{}{}'.format(variant, size, tag(cuda=cuda), tag(jit=jit))

    def C(x):
        if p.cuda:
            x = x.cuda()
        return x

    lstm = getattr(lstm_variants, p.lstm_kind)
    x = V(C(th.rand(1, BATCH, p.size)))
    hiddens = (V(C(th.rand(1, BATCH, p.size))),
               V(C(th.rand(1, BATCH, p.size))))
    th.manual_seed(1234)
    cus = C(lstm(p.size, p.size, dropout=DROPOUT, jit=jit))
    if hasattr(cus, 'mask'):
        cus.mask = C(cus.mask)

    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=3)

    # Super slow on CPU
    iters = 20 if cuda else 6
    for _ in range(iters):
        gc.collect()
        with iter_timer:
            out, h = x, hiddens
            for i in range(SEQ_LEN):
                out, h = cus(out, h)

    return iter_timer
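This function leans on module-level context: the `V` wrapper, the `lstms` registry, the `lstm_variants` module, and the constants `BATCH`, `SEQ_LEN`, and `DROPOUT`. The sketch below shows plausible placeholder definitions so the excerpt reads standalone; only `SlowLSTM` is confirmed (by the default argument above), everything else is illustrative:

import torch as th
from torch.autograd import Variable as V

BATCH = 16        # placeholder minibatch size
SEQ_LEN = 100     # placeholder number of unrolled timesteps
DROPOUT = 0.5     # placeholder dropout rate passed to each variant
lstms = ('SlowLSTM',)  # plus whatever other cell classes lstm_variants exposes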
Example #3
def run_bnlstm(hidden_size=100, max_length=784, pmnist=False, num_batches=5,
               cuda=False, jit=False, warmup=10, benchmark=20):
    name = 'bnlstm{}{}'.format(tag(cuda=cuda), tag(jit=jit))
    iter_timer = Bench(name, cuda=cuda, warmup_iters=2)

    # The CPU version is slow...
    batch_size = 20 if cuda else 5

    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.rnn = bnlstm.LSTM(cell_class=bnlstm.BNLSTMCell, input_size=1,
                                   hidden_size=hidden_size, batch_first=True,
                                   max_length=max_length, jit=jit)
            self.fc = nn.Linear(in_features=hidden_size, out_features=10)  # 10 digit classes in MNIST

        def forward(self, data):
            hx = None
            if not pmnist:
                h0 = Variable(data.data.new(data.size(0), hidden_size)
                              .normal_(0, 0.1))
                c0 = Variable(data.data.new(data.size(0), hidden_size)
                              .normal_(0, 0.1))
                hx = (h0, c0)
            _, (h_n, _) = self.rnn(input_=data, hx=hx)
            logits = self.fc(h_n[0])
            return logits

    def cast(tensor):
        return tensor.cuda() if cuda else tensor

    model = Model()
    criterion = nn.CrossEntropyLoss()
    # Each batch is 28*28 = 784 timesteps of one pixel each (flattened MNIST-sized input).
    data_batches = [Variable(cast(torch.zeros(batch_size, 28 * 28, 1))) for _ in range(num_batches)]
    target_batches = [Variable(cast(torch.zeros(batch_size)).long()) for _ in range(num_batches)]
    if cuda:
        model.cuda()
        criterion.cuda()

    total_loss = 0
    for data, targets in zip(data_batches, target_batches):
        gc.collect()
        with iter_timer:
            logits = model(data)
            loss = criterion(input=logits, target=targets)
            loss.backward()
            total_loss += loss.item()  # implicit CUDA sync point

    return iter_timer
Example #4
def run_sru(cpu=0,
            gpu=0,
            jit=False,
            use_kernel=False,
            backward=False,
            warmup=10,
            benchmark=20):
    assert not (jit and use_kernel)
    benchmark_init(cpu, gpu, True)

    # input has length 20, batch size 32 and dimension 128
    x = Variable(torch.rand(20, 32, 128).cuda())
    input_size, hidden_size = 128, 128

    rnn = SRU(
        input_size,
        hidden_size,
        num_layers=2,  # number of stacking RNN layers
        dropout=0.00001,  # dropout applied between RNN layers
        rnn_dropout=0.0001,  # variational dropout applied on the linear transformation
        use_tanh=1,  # use tanh activation
        use_relu=0,  # do not use ReLU activation
        bidirectional=False,  # unidirectional RNN
        use_kernel=use_kernel,
        jit=jit,
    )
    rnn.cuda()

    kernel_tag = '_kernel' if use_kernel else ''
    backward_tag = '_training' if backward else '_forward'
    jit_tag = '_jit' if jit else ''
    name = 'sru{}{}{}'.format(backward_tag, kernel_tag, jit_tag)
    iter_timer = Bench(cuda=True, name=name, warmup_iters=warmup)

    for _ in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            output, hidden = rnn(x)  # forward pass
            if backward:
                output.sum().backward()
        # output is (length, batch size, hidden size * number of directions)
        # hidden is (layers, batch size, hidden size * number of directions)
    return iter_timer
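The shape comments above can be checked directly. With the configuration used here (`seq_len=20`, batch 32, `hidden_size=128`, two unidirectional layers), and reusing `rnn` and `x` from the body above:

output, hidden = rnn(x)
assert output.shape == (20, 32, 128)  # (length, batch, hidden_size * num_directions)
assert hidden.shape == (2, 32, 128)   # (layers, batch, hidden_size * num_directions)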
Example #5
def run_qrnn(batch_size=20,
             input_size=128,
             seq_len=20,
             warmup=10,
             benchmark=10,
             hidden_size=256,
             num_layers=10,
             use_kernel=False,
             jit=False,
             cuda=False):
    assert not (use_kernel and jit)
    if use_kernel:
        assert cuda

    benchmark_init(0, 0, True)
    name = 'qrnn{}{}{}'.format(tag(cuda=cuda), tag(jit=jit),
                               tag(kernel=use_kernel))
    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=warmup)
    niters = warmup + benchmark

    size = (seq_len, batch_size, input_size)
    if cuda:
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    batches = [
        torch.rand(size, requires_grad=True, device=device)
        for _ in range(niters)
    ]
    qrnn = QRNN(input_size,
                hidden_size,
                num_layers=num_layers,
                dropout=0.4,
                use_kernel=use_kernel,
                jit=jit).to(device)

    for X in batches:
        gc.collect()
        with iter_timer:
            output, hidden = qrnn(X)
            output.sum().backward()

    return iter_timer
Example #6
def run_tensor(broadcast=True):
    benchmark_init(0, 0, False)

    d = torch.zeros(1000, 1000)
    e = torch.zeros(1)

    def time_broadcast():
        d * e  # (1000, 1000) * (1,): e is broadcast across every element of d

    def time_no_broadcast():
        d * d  # same-shape elementwise multiply, no broadcasting

    if broadcast:
        fn = time_broadcast
    else:
        fn = time_no_broadcast

    name = "mul_bcast" if broadcast else "mul_no_bcast"
    iter_timer = Bench(name=name, cuda=False, warmup_iters=2)
    for _ in range(20):
        with iter_timer:
            fn()

    return iter_timer
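For context, the two timed paths differ only in whether broadcasting kicks in: `d * e` multiplies a `(1000, 1000)` matrix by a one-element tensor, which PyTorch expands NumPy-style (without materializing a copy) to the larger shape. A self-contained sketch:

import torch

d = torch.zeros(1000, 1000)
e = torch.zeros(1)

# e's shape (1,) is broadcast against (1000, 1000); the result takes the larger shape.
assert (d * e).shape == (1000, 1000)
assert (d * d).shape == (1000, 1000)  # same-shape multiply, no broadcasting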
Example #7
def run_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
             seq_len=None, warmup=10, benchmark=20, autograd=False,
             variable=False, fused=False, jit=False, backward=False,
             skip_cpu_governor_check=False):
    if jit:
        autograd = True

    if backward:
        autograd = True

    if seq_len is None:
        if backward:
            seq_len = 32
        else:
            seq_len = 512

    assert not (jit and fused)
    assert not (variable and autograd)

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    if variable:
        # plain Variables that never require grad (no autograd bookkeeping)
        V = lambda x, requires_grad=False: Variable(x, requires_grad=False)
    elif autograd:
        V = lambda x, requires_grad=False: Variable(x, requires_grad=requires_grad)
    else:
        V = lambda x, requires_grad=False: x

    input = V(torch.randn(batch_size, input_size).cuda(device=gpu))
    hx0   = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    cx0   = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    w_ih  = V(t_def(torch.randn(4 * hidden_size, input_size)).cuda(device=gpu), requires_grad=True)
    w_hh  = V(t_def(torch.randn(4 * hidden_size, hidden_size)).cuda(device=gpu), requires_grad=True)

    if fused:
        if backward:
            print("using fused_autograd_lstm")
            lstm = fused_autograd_lstm
        else:
            print("using fused_forward_lstm")
            lstm = fused_lstm
    elif jit:
        print("tracing an unfused lstm")
        lstm = wrap_hidden(torch.jit.trace(input, hx0, cx0, w_ih, w_hh)(_unfused_lstm))
    else:
        print("using unfused lstm")
        lstm = wrap_hidden(_unfused_lstm)

    name = 'lstm_cuda{}{}{}'.format(tag(autograd=autograd), tag(fused=fused),
                                    tag(jit=jit))
    iter_timer = Bench(name=name, cuda=True, warmup_iters=warmup)

    for i in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx, cx = hx0, cx0
            for j in range(seq_len):
                hx, cx = lstm(input, (hx, cx), w_ih, w_hh)
            if backward:
                hx.sum().backward()

    return iter_timer
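`_unfused_lstm` is defined elsewhere in the module. Given the weight shapes above (`4 * hidden_size` rows in both `w_ih` and `w_hh`, and no bias terms), it is presumably a standard bias-free LSTM cell written out in separate ops. A sketch of that computation, offered as an assumption rather than the module's actual code:

def _unfused_lstm_sketch(input, hidden, w_ih, w_hh):
    hx, cx = hidden
    # One matmul per weight matrix, then split the result into the four gates.
    gates = input.mm(w_ih.t()) + hx.mm(w_hh.t())
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
    cy = torch.sigmoid(forgetgate) * cx + torch.sigmoid(ingate) * torch.tanh(cellgate)
    hy = torch.sigmoid(outgate) * torch.tanh(cy)
    return hy, cy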
Example #8
def run_mlstm(cpu=0,
              gpu=0,
              batch_size=1,
              input_size=205,
              hidden_size=1900,
              embed_size=None,
              seq_len=20,
              warmup=10,
              benchmark=20,
              autograd=False,
              jit=False,
              backward=False,
              skip_cpu_governor_check=False):
    name = "mlstm_jit" if jit else "mlstm"
    iter_timer = Bench(name=name, cuda=True, warmup_iters=warmup)

    if embed_size is None:
        embed_size = hidden_size

    if jit or backward:
        autograd = True

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    requires_grad = autograd
    device = torch.device(gpu)

    input = torch.randn(seq_len,
                        batch_size,
                        input_size,
                        requires_grad=requires_grad,
                        device=device)
    hx = torch.randn(batch_size,
                     hidden_size,
                     requires_grad=requires_grad,
                     device=device)
    cx = torch.randn(batch_size,
                     hidden_size,
                     requires_grad=requires_grad,
                     device=device)
    w_xm = torch.randn(embed_size,
                       input_size,
                       requires_grad=requires_grad,
                       device=device)
    w_hm = torch.randn(embed_size,
                       hidden_size,
                       requires_grad=requires_grad,
                       device=device)
    w_ih = torch.randn(4 * hidden_size,
                       input_size,
                       requires_grad=requires_grad,
                       device=device)
    w_mh = torch.randn(4 * hidden_size,
                       embed_size,
                       requires_grad=requires_grad,
                       device=device)
    params = [input, hx, cx, w_xm, w_hm, w_ih, w_mh]

    if jit:
        mlstm = torch.jit.trace(input[0], hx, cx, w_xm, w_hm, w_ih,
                                w_mh)(mlstm_raw)
    else:
        mlstm = mlstm_raw

    for _ in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx_t = hx
            cx_t = cx
            for j in range(seq_len):
                hx_t, cx_t = mlstm(input[j], hx_t, cx_t, w_xm, w_hm, w_ih,
                                   w_mh)
            if backward:
                hx_t.sum().backward()
                for param in params:
                    param.grad.zero_()

    return iter_timer
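The weight shapes pin down the multiplicative LSTM formulation (Krause et al., 2016): `w_xm` and `w_hm` project the input and hidden state into an `embed_size`-dimensional intermediate that is combined multiplicatively, and `w_ih` plus `w_mh` then produce the four gates. A sketch of what `mlstm_raw` plausibly computes, hedged as an assumption:

def mlstm_sketch(x, hx, cx, w_xm, w_hm, w_ih, w_mh):
    m = x.mm(w_xm.t()) * hx.mm(w_hm.t())      # multiplicative state: (batch, embed_size)
    gates = x.mm(w_ih.t()) + m.mm(w_mh.t())   # (batch, 4 * hidden_size)
    i, f, g, o = gates.chunk(4, 1)
    cy = torch.sigmoid(f) * cx + torch.sigmoid(i) * torch.tanh(g)
    hy = torch.sigmoid(o) * torch.tanh(cy)
    return hy, cy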
Example #9
def test_wsj(jit=False, epochs=6, wsj_path=wsj_default_path, cuda=False):
    jit_tag = '_jit' if jit else ''
    cuda_tag = '_cuda' if cuda else ''
    name = 'seqlab{}{}'.format(cuda_tag, jit_tag)
    iter_timer = Bench(name=name, cuda=False, warmup_iters=2)
    print()
    print('# test on wsj subset')

    with open(wsj_path, 'rb') as f:
        data, n_types, n_labels = pickle.load(f)

    d_emb = 50
    d_rnn = 51
    d_hid = 52
    d_actemb = 5

    minibatch_size = 5
    n_epochs = epochs
    preprocess_minibatch = True

    embed_word = nn.Embedding(n_types, d_emb)
    gru = Model(d_emb, d_rnn, jit=jit)
    if cuda:
        gru.cuda()

    embed_action = nn.Embedding(n_labels, d_actemb)
    combine_arh = nn.Linear(d_actemb + d_rnn * 2 + d_hid, d_hid)

    initial_h_tensor = torch.Tensor(1, d_hid)
    initial_h_tensor.zero_()
    initial_h = Parameter(initial_h_tensor)

    initial_actemb_tensor = torch.Tensor(1, d_actemb)
    initial_actemb_tensor.zero_()
    initial_actemb = Parameter(initial_actemb_tensor)

    policy = nn.Linear(d_hid, n_labels)

    loss_fn = torch.nn.MSELoss(size_average=False)  # equivalent to reduction='sum' in newer PyTorch

    optimizer = torch.optim.Adam(
        list(embed_word.parameters()) + list(gru.parameters()) +
        list(embed_action.parameters()) + list(combine_arh.parameters()) +
        list(policy.parameters()) + [initial_h, initial_actemb],
        lr=0.01)

    for _ in range(n_epochs):
        gc.collect()
        total_loss = 0
        prof = None
        with iter_timer:
            #with torch.autograd.profiler.profile() as prof:
            for batch in minibatch(data, minibatch_size, True):
                optimizer.zero_grad()
                loss = 0

                if preprocess_minibatch:
                    # for efficiency, combine RNN outputs on entire
                    # minibatch in one go (requires padding with zeros,
                    # should be masked but isn't right now)
                    all_tokens = [ex.tokens for ex in batch]
                    max_length = max(map(len, all_tokens))
                    all_tokens = [
                        tok + [0] * (max_length - len(tok))
                        for tok in all_tokens
                    ]
                    all_e = embed_word(
                        Variable(torch.LongTensor(all_tokens),
                                 requires_grad=False))

                    if cuda:
                        [all_rnn_out, _] = gru(all_e.cuda())
                        all_rnn_out = all_rnn_out.cpu()
                    else:
                        all_rnn_out, _ = gru(all_e)

                for ex in batch:
                    N = len(ex.tokens)
                    if preprocess_minibatch:
                        rnn_out = all_rnn_out[0, :, :].view(-1, 1, 2 * d_rnn)
                    else:
                        e = embed_word(
                            Variable(torch.LongTensor(ex.tokens),
                                     requires_grad=False)).view(N, 1, -1)
                        [rnn_out, _] = gru(e)
                    prev_h = initial_h  # previous hidden state
                    actemb = initial_actemb  # embedding of previous action
                    output = []
                    for t in range(N):
                        # update hidden state based on most recent
                        # *predicted* action (not ground truth)
                        inputs = [actemb, prev_h, rnn_out[t]]
                        h = F.relu(combine_arh(torch.cat(inputs, 1)))

                        # make prediction
                        pred_vec = policy(h)
                        pred_vec = pred_vec.view(-1)
                        pred = pred_vec.argmin()
                        output.append(pred)

                        # accumulate loss (squared error against costs)
                        truth = torch.ones(n_labels)
                        truth[ex.labels[t]] = 0
                        loss += loss_fn(pred_vec,
                                        Variable(truth, requires_grad=False))

                        # cache hidden state, previous action embedding
                        prev_h = h
                        actemb = embed_action(
                            Variable(torch.LongTensor([pred.item()]),
                                     requires_grad=False))

                    # print('output=%s, truth=%s' % (output, ex.labels))

                loss.backward()
                total_loss += float(loss)
                optimizer.step()
        if prof is not None:
            print(prof.key_averages())
        print(total_loss)
    return iter_timer
Example #10
def run_memnn(warmup=2, benchmark=18, jit=False, cuda=False):
    nbatches = warmup + benchmark

    default_params = dict(lr=0.01,
                          embedding_size=128,
                          hops=3,
                          mem_size=100,
                          time_features=False,
                          position_encoding=True,
                          output='rank',
                          dropout=0.1,
                          optimizer='adam',
                          num_features=500,
                          num_batches=nbatches,
                          cuda=cuda)
    params = AttrDict(default_params)
    """Set up model."""
    # Keep the batch size small; the CPU version is especially slow.
    params['batch_size'] = 4

    if params.cuda:
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    model = memnn.MemNN(params, params.num_features)
    criterion = nn.CrossEntropyLoss()
    data_batches = [
        [  # memories, queries, memory_lengths, query_lengths
            torch.zeros(params.batch_size * params.mem_size,
                        dtype=torch.long,
                        device=device),
            torch.zeros(params.batch_size * 28,
                        dtype=torch.long,
                        device=device),
            torch.ones(params.batch_size,
                       params.mem_size,
                       dtype=torch.long,
                       device=device),
            torch.full((params.batch_size, ),
                       28,
                       dtype=torch.long,
                       device=device),
        ] for _ in range(params.num_batches)
    ]
    cand_batches = [
        torch.zeros(params.batch_size * 14,
                    params.embedding_size,
                    device=device) for _ in range(params.num_batches)
    ]
    target_batches = [
        torch.ones(params.batch_size, dtype=torch.long, device=device)
        for _ in range(params.num_batches)
    ]

    # model.to(device) # embeddings are performed on CPU
    # the memnn model takes care of things when it is passed the cuda flag
    criterion.to(device)
    """Time model."""
    cuda_tag = '_cuda' if cuda else ''
    jit_tag = '_jit' if jit else ''
    name = 'memnn{}{}'.format(cuda_tag, jit_tag)
    bench = Bench(name=name, cuda=cuda, warmup_iters=warmup)
    trace_once = jit

    total_loss = 0
    for data, cands, targets in zip(data_batches, cand_batches,
                                    target_batches):
        gc.collect()
        if trace_once:
            model = torch.jit.trace(*data)(model)
            trace_once = False
        with bench:
            output_embeddings = model(*data)
            scores = one_to_many(output_embeddings, cands)
            loss = criterion(scores, targets)
            loss.backward()
            total_loss += float(loss.item())

    return bench
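A note on the tracing calls used in these examples (`torch.jit.trace(*example_inputs)(fn)`, as in Examples #7, #8, and #10): that is the pre-1.0 decorator-style API. On modern PyTorch the same trace is written with the callable first:

# Modern equivalent of `model = torch.jit.trace(*data)(model)` above:
traced_model = torch.jit.trace(model, tuple(data))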