Example #1
def run_lstm_variant(variant='SlowLSTM', cuda=False, size=128, jit=False):
    assert variant in lstms
    p = AttrDict({'cuda': cuda, 'lstm_kind': variant, 'size': size})

    name = '{}_size{}{}{}'.format(variant, size, tag(cuda=cuda), tag(jit=jit))

    def C(x):
        if p.cuda:
            x = x.cuda()
        return x

    lstm = getattr(lstm_variants, p.lstm_kind)
    x = V(C(th.rand(1, BATCH, p.size)))
    hiddens = (V(C(th.rand(1, BATCH, p.size))),
               V(C(th.rand(1, BATCH, p.size))))
    th.manual_seed(1234)
    cus = C(lstm(p.size, p.size, dropout=DROPOUT, jit=jit))
    if hasattr(cus, 'mask'):
        cus.mask = C(cus.mask)

    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=3)

    # Super slow on CPU
    iters = 20 if cuda else 6
    for _ in range(iters):
        gc.collect()
        with iter_timer:
            out, h = x, hiddens
            for i in range(SEQ_LEN):
                out, h = cus(out, h)

    return iter_timer
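
Bench, tag, AttrDict, and the lstm_variants cells above come from the surrounding benchmark package and are not importable on a stock PyTorch install. As a rough, self-contained sketch of the same measurement pattern (warmup iterations, gc.collect() before each timed run, an unrolled per-step loop), here is a hypothetical equivalent that substitutes torch.nn.LSTMCell and time.perf_counter for those helpers; all sizes and iteration counts below are illustrative assumptions.

import gc
import time

import torch

BATCH, SIZE, SEQ_LEN = 16, 128, 100    # illustrative stand-ins for the module constants
WARMUP, ITERS = 3, 20

cell = torch.nn.LSTMCell(SIZE, SIZE)   # stand-in for the custom lstm_variants cell
x = torch.rand(BATCH, SIZE)
hidden = (torch.rand(BATCH, SIZE), torch.rand(BATCH, SIZE))

times = []
for it in range(WARMUP + ITERS):
    gc.collect()
    start = time.perf_counter()
    out, hx = x, hidden
    for _ in range(SEQ_LEN):
        hx = cell(out, hx)
        out = hx[0]
    if it >= WARMUP:                    # discard warmup runs, as Bench's warmup_iters does
        times.append(time.perf_counter() - start)

print('mean iteration time: {:.4f}s'.format(sum(times) / len(times)))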
Example #2
def run_bnlstm(hidden_size=100, max_length=784, pmnist=False, num_batches=5,
               cuda=False, jit=False, warmup=10, benchmark=20):
    name = 'bnlstm{}{}'.format(tag(cuda=cuda), tag(jit=jit))
    iter_timer = Bench(name, cuda=cuda, warmup_iters=2)

    # The CPU version is slow...
    batch_size = 20 if cuda else 5

    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.rnn = bnlstm.LSTM(cell_class=bnlstm.BNLSTMCell, input_size=1,
                                   hidden_size=hidden_size, batch_first=True,
                                   max_length=max_length, jit=jit)
            self.fc = nn.Linear(in_features=hidden_size, out_features=10)  # 10 digits in MNIST

        def forward(self, data):
            hx = None
            if not pmnist:
                h0 = Variable(data.data.new(data.size(0), hidden_size)
                              .normal_(0, 0.1))
                c0 = Variable(data.data.new(data.size(0), hidden_size)
                              .normal_(0, 0.1))
                hx = (h0, c0)
            _, (h_n, _) = self.rnn(input_=data, hx=hx)
            logits = self.fc(h_n[0])
            return logits

    def cast(tensor):
        return tensor.cuda() if cuda else tensor

    model = Model()
    criterion = nn.CrossEntropyLoss()
    data_batches = [Variable(cast(torch.zeros(batch_size, 28 * 28, 1))) for _ in range(num_batches)]
    target_batches = [Variable(cast(torch.zeros(batch_size)).long()) for _ in range(num_batches)]
    if cuda:
        model.cuda()
        criterion.cuda()

    total_loss = 0
    for data, targets in zip(data_batches, target_batches):
        gc.collect()
        with iter_timer:
            logits = model(data)
            loss = criterion(input=logits, target=targets)
            loss.backward()
            total_loss += float(loss.data.item())  # CUDA sync point

    return iter_timer
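
bnlstm.LSTM and BNLSTMCell are the project's batch-normalized LSTM implementation. A minimal stand-alone mirror of the same training-step timing, with torch.nn.LSTM swapped in so the sketch runs on plain PyTorch (batch size, hidden size, and batch count are illustrative assumptions):

import gc
import time

import torch
from torch import nn

hidden_size, batch_size, num_batches = 100, 5, 5

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        # torch.nn.LSTM stands in for bnlstm.LSTM purely to keep this runnable
        self.rnn = nn.LSTM(input_size=1, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 10)  # 10 digits in MNIST

    def forward(self, data):
        _, (h_n, _) = self.rnn(data)
        return self.fc(h_n[0])

model = TinyModel()
criterion = nn.CrossEntropyLoss()
data_batches = [torch.zeros(batch_size, 28 * 28, 1) for _ in range(num_batches)]
target_batches = [torch.zeros(batch_size, dtype=torch.long) for _ in range(num_batches)]

total_loss = 0.0
for data, targets in zip(data_batches, target_batches):
    gc.collect()
    start = time.perf_counter()
    loss = criterion(model(data), targets)
    loss.backward()
    total_loss += loss.item()           # forces a sync point on CUDA builds
    print('iteration time: {:.4f}s'.format(time.perf_counter() - start))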
Example #3
def run_qrnn(batch_size=20,
             input_size=128,
             seq_len=20,
             warmup=10,
             benchmark=10,
             hidden_size=256,
             num_layers=10,
             use_kernel=False,
             jit=False,
             cuda=False):
    assert not (use_kernel and jit)
    if use_kernel:
        assert cuda

    benchmark_init(0, 0, True)
    name = 'qrnn{}{}{}'.format(tag(cuda=cuda), tag(jit=jit),
                               tag(kernel=use_kernel))
    iter_timer = Bench(name=name, cuda=cuda, warmup_iters=warmup)
    niters = warmup + benchmark

    size = (seq_len, batch_size, input_size)
    if cuda:
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    batches = [
        torch.rand(size, requires_grad=True, device=device)
        for _ in range(niters)
    ]
    qrnn = QRNN(input_size,
                hidden_size,
                num_layers=num_layers,
                dropout=0.4,
                use_kernel=use_kernel,
                jit=jit).to(device)

    for X in batches:
        gc.collect()
        with iter_timer:
            output, hidden = qrnn(X)
            output.sum().backward()

    return iter_timer
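
QRNN here is the project's quasi-recurrent layer (with an optional fused CUDA kernel). The sketch below keeps the same shape of the loop (pre-built input batches on the chosen device, gc.collect(), forward plus output.sum().backward() inside the timer) but substitutes torch.nn.GRU so it runs anywhere; layer sizes and iteration counts are illustrative assumptions.

import gc
import time

import torch

seq_len, batch_size, input_size = 20, 20, 128
hidden_size, num_layers, niters = 256, 4, 10

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

batches = [
    torch.rand(seq_len, batch_size, input_size, requires_grad=True, device=device)
    for _ in range(niters)
]
# torch.nn.GRU stands in for the project's QRNN so the sketch is self-contained
rnn = torch.nn.GRU(input_size, hidden_size, num_layers=num_layers, dropout=0.4).to(device)

for X in batches:
    gc.collect()
    start = time.perf_counter()
    output, hidden = rnn(X)
    output.sum().backward()
    if device.type == 'cuda':
        torch.cuda.synchronize()        # include queued GPU work in the timing
    print('iteration time: {:.4f}s'.format(time.perf_counter() - start))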
Example #4
File: lstm.py  Project: zou3519/benchmark
def run_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
             seq_len=None, warmup=10, benchmark=20, autograd=False,
             variable=False, fused=False, jit=False, backward=False,
             skip_cpu_governor_check=False):
    if jit:
        autograd = True

    if backward:
        autograd = True

    if seq_len is None:
        if backward:
            seq_len = 32
        else:
            seq_len = 512

    assert not (jit and fused)
    assert not (variable and autograd)

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    if variable:
        # Plain Variables that never require grad (no autograd graph is built).
        V = lambda x, requires_grad=False: Variable(x, requires_grad=False)
    elif autograd:
        # Variables that honor the caller's requires_grad flag.
        V = lambda x, requires_grad=False: Variable(x, requires_grad=requires_grad)
    else:
        # Raw tensors, no Variable wrapping at all.
        V = lambda x, requires_grad=False: x

    input = V(torch.randn(batch_size, input_size).cuda(device=gpu))
    hx0   = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    cx0   = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    w_ih  = V(t_def(torch.randn(4 * hidden_size, input_size)).cuda(device=gpu), requires_grad=True)
    w_hh  = V(t_def(torch.randn(4 * hidden_size, hidden_size)).cuda(device=gpu), requires_grad=True)

    if fused:
        if backward:
            print("using fused_autograd_lstm")
            lstm = fused_autograd_lstm
        else:
            print("using fused_forward_lstm")
            lstm = fused_lstm
    elif jit:
        print("tracing an unfused lstm")
        # pre-1.0 decorator-style torch.jit.trace: example inputs first, then the traced function
        lstm = wrap_hidden(torch.jit.trace(input, hx0, cx0, w_ih, w_hh)(_unfused_lstm))
    else:
        print("using unfused lstm")
        lstm = wrap_hidden(_unfused_lstm)

    name = 'lstm_cuda{}{}{}'.format(tag(autograd=autograd), tag(fused=fused),
                                    tag(jit=jit))
    iter_timer = Bench(name=name, cuda=True, warmup_iters=warmup)

    for i in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx, cx = hx0, cx0
            for j in range(seq_len):
                hx, cx = lstm(input, (hx, cx), w_ih, w_hh)
            if backward:
                hx.sum().backward()

    return iter_timer
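
_unfused_lstm, fused_lstm, fused_autograd_lstm, wrap_hidden, and t_def all live in the benchmark module. For context, an "unfused" step of the kind this benchmark times is just the LSTM cell equations written out with separate matmuls and pointwise ops; a generic, self-contained version (names and sizes are assumptions, not the project's actual helper) might look like:

import torch

def unfused_lstm_step(input, hidden, w_ih, w_hh):
    hx, cx = hidden
    # one matmul per weight matrix, then split the result into the four gates
    gates = input.mm(w_ih.t()) + hx.mm(w_hh.t())
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

    ingate = torch.sigmoid(ingate)
    forgetgate = torch.sigmoid(forgetgate)
    cellgate = torch.tanh(cellgate)
    outgate = torch.sigmoid(outgate)

    cy = forgetgate * cx + ingate * cellgate
    hy = outgate * torch.tanh(cy)
    return hy, cy

batch_size, input_size, hidden_size = 1, 256, 512
x = torch.randn(batch_size, input_size)
hx = torch.randn(batch_size, hidden_size)
cx = torch.randn(batch_size, hidden_size)
w_ih = torch.randn(4 * hidden_size, input_size, requires_grad=True)
w_hh = torch.randn(4 * hidden_size, hidden_size, requires_grad=True)

hx, cx = unfused_lstm_step(x, (hx, cx), w_ih, w_hh)
hx.sum().backward()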