Code example #1
0
File: cudnn_lstm.py  Project: isabella232/benchmark-3
def run_cudnn_lstm(cpu=0,
                   gpu=0,
                   batch_size=1,
                   input_size=256,
                   hidden_size=512,
                   layers=1,
                   seq_len=512,
                   warmup=10,
                   benchmark=30,
                   backward=False,
                   skip_cpu_governor_check=False):
    """Benchmark a cuDNN-backed ``torch.nn.LSTM`` forward (and optional backward).

    Runs ``warmup + benchmark`` timed iterations of a single forward pass over
    a ``(seq_len, batch_size, input_size)`` input on GPU ``gpu``; when
    ``backward`` is set, also differentiates ``output.sum()``.

    Returns the populated ``Bench`` timer object.
    """
    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    def V(x):
        return Variable(x)  # mandatory

    input = V(torch.randn(seq_len, batch_size, input_size).cuda(gpu))
    hx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))
    cx = V(torch.randn(layers, batch_size, hidden_size).cuda(gpu))

    lstm = torch.nn.LSTM(input_size, hidden_size, layers).cuda(gpu)
    # Coalesce weights into one contiguous chunk so cuDNN runs at full speed.
    lstm.flatten_parameters()

    iter_timer = Bench(name='lstm_cudnn', cuda=True, warmup_iters=warmup)

    for i in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            # nn.LSTM returns (output, (h_n, c_n)); the original bound these
            # to `hx_t, cx_t`, mislabeling the output sequence as a hidden
            # state. The value differentiated below is the same either way.
            output, hidden = lstm(input, (hx, cx))
            if backward:
                output.sum().backward()

    return iter_timer
Code example #2
0
def run_sru(cpu=0,
            gpu=0,
            jit=False,
            use_kernel=False,
            backward=False,
            warmup=10,
            benchmark=20):
    """Benchmark a 2-layer SRU on a fixed (20, 32, 128) CUDA input.

    ``jit`` and ``use_kernel`` select mutually exclusive implementations.
    Returns the populated ``Bench`` timer object.
    """
    assert not (jit and use_kernel)  # the two fast paths are exclusive
    benchmark_init(0, 0, True)

    # Fixed workload: sequence length 20, batch 32, feature dimension 128.
    input_size = hidden_size = 128
    x = Variable(torch.rand(20, 32, 128).cuda())

    rnn = SRU(
        input_size,
        hidden_size,
        num_layers=2,  # number of stacking RNN layers
        dropout=0.00001,  # dropout applied between RNN layers
        rnn_dropout=0.0001,  # variational dropout on the linear transformation
        use_tanh=1,  # use tanh?
        use_relu=0,  # use ReLU?
        bidirectional=False,  # bidirectional RNN ?
        use_kernel=use_kernel,
        jit=jit,
    )
    rnn.cuda()

    # Assemble the benchmark name from the active options.
    mode_tag = '_training' if backward else '_forward'
    kernel_tag = '_kernel' if use_kernel else ''
    jit_tag = '_jit' if jit else ''
    iter_timer = Bench(cuda=True,
                       name='sru' + mode_tag + kernel_tag + jit_tag,
                       warmup_iters=warmup)

    for _ in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            # out is (length, batch, hidden * directions);
            # state is (layers, batch, hidden * directions).
            out, state = rnn(x)
            if backward:
                out.sum().backward()
    return iter_timer
Code example #3
0
File: qrnn.py  Project: isabella232/benchmark-3
def run_qrnn(batch_size=20,
             input_size=128,
             seq_len=20,
             warmup=10,
             benchmark=10,
             hidden_size=256,
             num_layers=10,
             use_kernel=False,
             jit=False,
             cuda=False):
    """Benchmark a QRNN forward + backward pass over pre-generated batches.

    One fresh ``(seq_len, batch_size, input_size)`` batch is built per
    iteration up front so the timed region contains only the model work.
    Returns the populated ``Bench`` timer object.
    """
    assert not (use_kernel and jit)
    if use_kernel:
        assert cuda  # the custom kernel implementation is CUDA-only

    benchmark_init(0, 0, True)
    bench_name = 'qrnn{}{}{}'.format(tag(cuda=cuda), tag(jit=jit),
                                     tag(kernel=use_kernel))
    iter_timer = Bench(name=bench_name, cuda=cuda, warmup_iters=warmup)
    total_iters = warmup + benchmark

    device = torch.device('cuda:0' if cuda else 'cpu')
    shape = (seq_len, batch_size, input_size)
    # One independent input per iteration, all gradient-enabled.
    batches = [
        torch.rand(shape, requires_grad=True, device=device)
        for _ in range(total_iters)
    ]
    qrnn = QRNN(input_size,
                hidden_size,
                num_layers=num_layers,
                dropout=0.4,
                use_kernel=use_kernel,
                jit=jit).to(device)

    for batch in batches:
        gc.collect()
        with iter_timer:
            out, _hidden = qrnn(batch)
            out.sum().backward()

    return iter_timer
Code example #4
0
def run_tensor(broadcast=True):
    """Benchmark elementwise multiply with vs. without broadcasting.

    ``broadcast=True`` times ``(1000, 1000) * (1,)``; otherwise
    ``(1000, 1000) * (1000, 1000)``. Returns the ``Bench`` timer object.
    """
    benchmark_init(0, 0, False)

    big = torch.zeros(1000, 1000)
    scalar = torch.zeros(1)

    # Pick the multiply under test; the product itself is discarded.
    if broadcast:
        label = "mul_bcast"
        op = lambda: big * scalar
    else:
        label = "mul_no_bcast"
        op = lambda: big * big

    iter_timer = Bench(name=label, cuda=False, warmup_iters=2)
    for _ in range(20):
        with iter_timer:
            op()

    return iter_timer
Code example #5
0
File: lstm.py  Project: zou3519/benchmark
def run_lstm(cpu=0, gpu=0, batch_size=1, input_size=256, hidden_size=512,
             seq_len=None, warmup=10, benchmark=20, autograd=False,
             variable=False, fused=False, jit=False, backward=False,
             skip_cpu_governor_check=False):
    """Benchmark a hand-rolled LSTM cell unrolled ``seq_len`` steps on GPU.

    The cell implementation is selected by the flags: ``fused`` uses the
    fused kernels, ``jit`` traces the unfused cell, otherwise the plain
    unfused Python cell runs. Returns the populated ``Bench`` timer object.
    """
    # Both JIT tracing and backward require autograd-enabled tensors.
    if jit or backward:
        autograd = True

    if seq_len is None:
        # Backward roughly doubles per-step work, so default to a shorter
        # sequence when differentiating.
        seq_len = 32 if backward else 512

    assert not (jit and fused)
    assert not (variable and autograd)

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    if variable:
        # Wrap in Variable but never track gradients, regardless of caller.
        V = lambda x, requires_grad=False: Variable(x, requires_grad=False)
    elif autograd:
        V = lambda x, requires_grad=False: Variable(x, requires_grad=requires_grad)
    else:
        V = lambda x, requires_grad=False: x

    input = V(torch.randn(batch_size, input_size).cuda(device=gpu))
    hx0   = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    cx0   = V(torch.randn(batch_size, hidden_size).cuda(device=gpu), requires_grad=True)
    w_ih  = V(t_def(torch.randn(4 * hidden_size, input_size)).cuda(device=gpu), requires_grad=True)
    w_hh  = V(t_def(torch.randn(4 * hidden_size, hidden_size)).cuda(device=gpu), requires_grad=True)

    if fused:
        if backward:
            print("using fused_autograd_lstm")
            lstm = fused_autograd_lstm
        else:
            print("using fused_forward_lstm")
            # Fix: the original assigned fused_autograd_lstm here and then
            # immediately overwrote it with fused_lstm; the dead assignment
            # is removed.
            lstm = fused_lstm
    elif jit:
        print("tracing an unfused lstm")
        lstm = wrap_hidden(torch.jit.trace(input, hx0, cx0, w_ih, w_hh)(_unfused_lstm))
    else:
        print("using unfused lstm")
        lstm = wrap_hidden(_unfused_lstm)

    name = 'lstm_cuda{}{}{}'.format(tag(autograd=autograd), tag(fused=fused),
                                    tag(jit=jit))
    iter_timer = Bench(name=name, cuda=True, warmup_iters=warmup)

    for i in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            hx, cx = hx0, cx0
            for j in range(seq_len):
                hx, cx = lstm(input, (hx, cx), w_ih, w_hh)
            if backward:
                hx.sum().backward()

    return iter_timer
Code example #6
0
File: mlstm.py  Project: isabella232/benchmark-3
def run_mlstm(cpu=0,
              gpu=0,
              batch_size=1,
              input_size=205,
              hidden_size=1900,
              embed_size=None,
              seq_len=20,
              warmup=10,
              benchmark=20,
              autograd=False,
              jit=False,
              backward=False,
              skip_cpu_governor_check=False):
    """Benchmark a multiplicative LSTM (mLSTM) unrolled ``seq_len`` steps.

    ``jit`` traces ``mlstm_raw`` for one step; ``backward`` additionally
    differentiates the final hidden state and zeroes all gradients between
    iterations. Returns the populated ``Bench`` timer object.
    """
    iter_timer = Bench(name="mlstm_jit" if jit else "mlstm",
                       cuda=True,
                       warmup_iters=warmup)

    embed_size = hidden_size if embed_size is None else embed_size

    # Tracing and backward both need gradient-enabled tensors.
    autograd = autograd or jit or backward

    benchmark_init(cpu, gpu, skip_cpu_governor_check)

    device = torch.device(gpu)

    def fresh(*shape):
        # All benchmark tensors share the same grad/device settings.
        return torch.randn(*shape, requires_grad=autograd, device=device)

    input = fresh(seq_len, batch_size, input_size)
    hx = fresh(batch_size, hidden_size)
    cx = fresh(batch_size, hidden_size)
    w_xm = fresh(embed_size, input_size)
    w_hm = fresh(embed_size, hidden_size)
    w_ih = fresh(4 * hidden_size, input_size)
    w_mh = fresh(4 * hidden_size, embed_size)
    params = [input, hx, cx, w_xm, w_hm, w_ih, w_mh]

    if jit:
        mlstm = torch.jit.trace(input[0], hx, cx, w_xm, w_hm, w_ih,
                                w_mh)(mlstm_raw)
    else:
        mlstm = mlstm_raw

    for _ in range(warmup + benchmark):
        gc.collect()
        with iter_timer:
            h, c = hx, cx
            for step in range(seq_len):
                h, c = mlstm(input[step], h, c, w_xm, w_hm, w_ih, w_mh)
            if backward:
                h.sum().backward()
                # Reset accumulated grads so iterations stay independent.
                for p in params:
                    p.grad.zero_()

    return iter_timer