Example #1
    def test_gradient(self):
        """
        Test the correctness of the gradient against tensorflow
        """
        if ovl.cuda_enabled:
            devices = ['/cpu:0', '/gpu:0']
        else:
            devices = ['/cpu:0']
        # ensure TF runs on GPU when asked
        test_config = tf.ConfigProto(allow_soft_placement=False)
        test_config.graph_options.optimizer_options.opt_level = -1
        with tf.Session(config=test_config) as sess:
            for dev_string in devices:
                with tf.device(dev_string):
                    a = np.random.random(100)
                    grad_input = tf.constant(np.random.random(100))
                    arg = tf.constant(a)
                    ovl_op = log1p(arg)
                    ones = tf.constant(np.ones_like(a))
                    ovl_out = ovl.as_tensorflow(ovl_op)
                    tf_out = tf.log(arg + ones)

                    ovl_grad = tf.gradients(ovl_out, arg, grad_input)[0]
                    tf_grad = tf.gradients(tf_out, arg, grad_input)[0]
                    ovl_out, tf_out, ovl_grad, tf_grad = sess.run([ovl_out, tf_out, ovl_grad, tf_grad])

                    assert np.allclose(ovl_out, tf_out)
                    assert np.allclose(ovl_grad, tf_grad)
        sess.close()
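
What the gradient assertion checks, in plain NumPy (an illustrative sketch, not part of the test): the derivative of log1p is 1/(1 + x), and tf.gradients weights it by grad_input.

import numpy as np

a = np.random.random(100)
grad_input = np.random.random(100)
# analytic: d/dx log(1 + x) = 1 / (1 + x), scaled by the incoming gradient
expected_grad = grad_input / (1.0 + a)
# central finite difference of the same weighted quantity
eps = 1e-6
fd_grad = grad_input * (np.log1p(a + eps) - np.log1p(a - eps)) / (2 * eps)
assert np.allclose(expected_grad, fd_grad)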
Example #2
    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM)."""
        with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
            # Parameters of gates are concatenated into one multiply for efficiency.
            if self._state_is_tuple:
                c, h = state
            else:
                c, h = array_ops.split(1, 2, state)
            concat = _linear([inputs, h], 4 * self._num_units, True)

            # Changed from the stock TensorFlow code: set use_ovl to True to test the OVL LSTM cell.
            use_ovl = True
            if use_ovl:
                # use this code to test manually merged OVL lstm cell
                # from lstm import lstm
                # import opveclib as ovl
                # new_c, new_h = ovl.as_tensorflow(lstm(concat, c, forget_bias=self._forget_bias))

                # use this code to test OVL standard ops. set opt_level=3 to turn on merger
                import opveclib as ovl
                import opveclib.stdops as ovlops
                i, j, f, o = ovlops.split(concat, split_dim=1, num_split=4)
                new_c = ovlops.mul(
                    c, ovlops.sigmoid(f)) + ovlops.sigmoid(i) * ovlops.tanh(j)
                new_h = ovlops.tanh(new_c) * ovlops.sigmoid(o)
                new_c, new_h = ovl.as_tensorflow([new_c, new_h], opt_level=3)

            else:

                # i = input_gate, j = new_input, f = forget_gate, o = output_gate
                i, j, f, o = array_ops.split(1, 4, concat)

                new_c = (c * sigmoid(f + self._forget_bias) +
                         sigmoid(i) * self._activation(j))
                new_h = self._activation(new_c) * sigmoid(o)
            # end changed code

            if self._state_is_tuple:
                new_state = LSTMStateTuple(new_c, new_h)
            else:
                new_state = array_ops.concat(1, [new_c, new_h])
            return new_h, new_state
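
Both branches implement the standard LSTM recurrence; note the stdops branch applies sigmoid(f) without adding self._forget_bias, unlike the stock TensorFlow branch. A minimal NumPy sketch of the update (illustrative, with the four gates already split out):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(c, i, j, f, o, forget_bias=1.0):
    # i = input gate, j = new input, f = forget gate, o = output gate
    new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * np.tanh(j)
    new_h = np.tanh(new_c) * sigmoid(o)
    return new_c, new_h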
Example #3
    def test(self):
        """
        Test the correctness of ovl operator vs numpy implementation
        """
        a = np.array([1e-10, -1e-10, 0.0, np.Infinity], dtype=np.float64)
        expm1_op = expm1(a)
        ref = np.expm1(a)
        ovl_res = ovl.evaluate(expm1_op)
        ovl.logger.info(u'numpy: ' + str(ref) + u' ovl: ' + str(ovl_res))
        assert np.allclose(ref, ovl_res, rtol=0, atol=1e-20)
        if ovl.cuda_enabled:
            assert np.allclose(np.expm1(a),
                               ovl.evaluate(expm1_op, target_language='cuda'),
                               rtol=0,
                               atol=1e-20)

        # test vs. tensorflow
        # ensure TF runs on GPU when asked
        test_config = tf.ConfigProto(allow_soft_placement=False)
        test_config.graph_options.optimizer_options.opt_level = -1
        ones = np.ones_like(a)
        if ovl.cuda_enabled:
            devices = ['/cpu:0', '/gpu:0']
        else:
            devices = ['/cpu:0']
        with tf.Session(config=test_config) as sess:
            for dev_string in devices:
                with tf.device(dev_string):
                    expm1_tf = ovl.as_tensorflow(expm1_op)
                    sess.run(tf.initialize_all_variables())
                    expm1_tf_result = sess.run(expm1_tf)
                    assert np.allclose(ref,
                                       expm1_tf_result,
                                       rtol=0,
                                       atol=1e-20)

                    # TF exp - 1
                    tf_out = tf.exp(a) - ones
                    tf_result = tf_out.eval()
                    # the naive computation loses precision, so this comparison must fail
                    assert not np.allclose(ref, tf_result, rtol=0,
                                           atol=1e-20)
        sess.close()
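
The cancellation this test exploits, shown with plain NumPy (illustrative): exp(x) rounds to 1 + x for tiny x, so subtracting 1 afterwards destroys the low-order digits that expm1 preserves.

import numpy as np

x = 1e-10
naive = np.exp(x) - 1.0   # ~1.000000082740371e-10: low-order digits are rounding noise
stable = np.expm1(x)      # ~1.00000000005e-10: correct to full double precision
print(naive, stable)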
Example #4
  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
      # Parameters of gates are concatenated into one multiply for efficiency.
      if self._state_is_tuple:
        c, h = state
      else:
        c, h = array_ops.split(1, 2, state)
      concat = _linear([inputs, h], 4 * self._num_units, True)

      # Changed from the stock TensorFlow code: set use_ovl to True to test the OVL LSTM cell.
      use_ovl = True
      if use_ovl:
        # use this code to test manually merged OVL lstm cell
        # from lstm import lstm
        # import opveclib as ovl
        # new_c, new_h = ovl.as_tensorflow(lstm(concat, c, forget_bias=self._forget_bias))

        # use this code to test OVL standard ops. set opt_level=3 to turn on merger
        import opveclib as ovl
        import opveclib.stdops as ovlops
        i, j, f, o = ovlops.split(concat, split_dim=1, num_split=4)
        new_c = ovlops.mul(c,  ovlops.sigmoid(f)) + ovlops.sigmoid(i) * ovlops.tanh(j)
        new_h = ovlops.tanh(new_c) * ovlops.sigmoid(o)
        new_c, new_h = ovl.as_tensorflow([new_c, new_h], opt_level=3)

      else:

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = array_ops.split(1, 4, concat)

        new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
                 self._activation(j))
        new_h = self._activation(new_c) * sigmoid(o)
      # end changed code

      if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
      else:
        new_state = array_ops.concat(1, [new_c, new_h])
      return new_h, new_state
Example #5
    def test(self):
        """
        Test the correctness of ovl operator vs numpy implementation
        """
        a = np.array([1e-99, -1e-99, 0.0, np.Infinity], dtype=np.float64)
        log1pOp = log1p(a)
        ref = np.log1p(a)
        ovl_res = ovl.evaluate(log1pOp)
        ovl.logger.info(u'numpy: ' + str(ref) + u' ovl: ' + str(ovl_res))
        assert np.allclose(ref, ovl_res, rtol=0, atol=1e-20)
        if ovl.cuda_enabled:
            assert np.allclose(np.log1p(a),
                               ovl.evaluate(log1pOp, target_language='cuda'),
                               rtol=0, atol=1e-20)

        # test vs. tensorflow
        # ensure TF runs on GPU when asked
        test_config = tf.ConfigProto(allow_soft_placement=False)
        test_config.graph_options.optimizer_options.opt_level = -1
        ones = np.ones_like(a)
        if ovl.cuda_enabled:
            devices = ['/cpu:0', '/gpu:0']
        else:
            devices = ['/cpu:0']
        with tf.Session(config=test_config) as sess:
            for dev_string in devices:
                with tf.device(dev_string):
                    log1p_tf = ovl.as_tensorflow(log1pOp)
                    sess.run(tf.initialize_all_variables())
                    log1p_tf_result = sess.run(log1p_tf)
                    assert np.allclose(ref, log1p_tf_result,
                                       rtol=0, atol=1e-20)

                    # TF log(a - 1): negative argument produces NaN
                    tf_out = tf.log(a - ones)
                    tf_result = tf_out.eval()
                    # NaN in tf_result makes allclose False, so this must fail
                    assert not np.allclose(ref, tf_result,
                                           rtol=0, atol=1e-20)
        sess.close()
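
Note that the last block evaluates tf.log(a - ones), i.e. log(a - 1), which is NaN for every finite entry of a here; np.allclose returns False whenever a NaN is present, so the final assertion holds trivially. A short NumPy check of that behavior (illustrative):

import numpy as np

with np.errstate(invalid='ignore'):
    assert np.isnan(np.log(1e-99 - 1.0))  # log of a negative number is NaN
assert not np.allclose([0.0], [np.nan])   # any NaN makes allclose False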
Example #6
    def test_performance(self):
        """
        test the performance vs. numpy running standalone and from tensorflow
        based on tensorflow issue 813
        https://github.com/tensorflow/tensorflow/issues/813
        """
        import tensorflow as tf
        import timeit
        import time
        logger = ovl.logger
        iters = 10
        X = np.random.uniform(0, 1, size=(10000, 1000))
        # note: np.cumsum fails with a memory error at input size 10^6
        ref = np.cumsum(X, axis=0)
        # timeit returns total seconds for 'number' iterations; for 10 iterations, multiplying by 100 gives the per-iteration time in ms
        np_time = 100 * timeit.timeit(
            'np.cumsum(X, axis=0)',
            setup=
            'import numpy as np; X = np.random.uniform(0, 1, size=(10000, 1000))',
            number=iters)
        logger.debug(u'Best numpy time (ms): ' + str(np_time))
        cumsumOp = cumsum(X, axis=0)
        ovl_cpp, prof_cpp = ovl.profile(cumsumOp,
                                        target_language='cpp',
                                        profiling_iterations=iters,
                                        opt_level=0)
        assert np.allclose(ref, ovl_cpp)
        ovl_cpp_time = np.min(list(prof_cpp.values())[0])
        logger.debug(u'Best ovl cpp time (ms): ' + str(ovl_cpp_time))
        if ovl.cuda_enabled:
            ovl_cuda, prof_cuda = ovl.profile(cumsumOp,
                                              target_language='cuda',
                                              profiling_iterations=iters,
                                              opt_level=0)
            assert np.allclose(ref, ovl_cuda)
            ovl_cuda_time = np.min(list(prof_cuda.values())[0])
            logger.debug(u'Best ovl cuda time  (ms): ' + str(ovl_cuda_time))

        # OVL-TF integration
        # ensure TF runs on GPU
        test_config = tf.ConfigProto(allow_soft_placement=False)
        test_config.graph_options.optimizer_options.opt_level = -1
        if ovl.cuda_enabled:
            devices = ['/cpu:0', '/gpu:0']
        else:
            devices = ['/cpu:0']
        with tf.Session(config=test_config) as sess:
            for dev_string in devices:
                with tf.device(dev_string):
                    cumsum_tf = ovl.as_tensorflow(cumsumOp)
                    sess.run(tf.initialize_all_variables())
                    cumsum_tf_result = sess.run(cumsum_tf)
                    prof_ovl = np.zeros(iters)
                    for i in range(iters):
                        t0 = time.time()
                        sess.run(cumsum_tf.op)
                        t1 = time.time()
                        prof_ovl[i] = t1 - t0
                    tf_ovl_time = np.min(prof_ovl) * 1000.00
                    logger.debug(u'Best tf + ovl time  (ms) on ' + dev_string +
                                 ' :' + str(tf_ovl_time))
                    assert np.allclose(ref, cumsum_tf_result)

                    # TF cumsum
                    tf_out = tf.cumsum(X,
                                       axis=0,
                                       exclusive=False,
                                       reverse=False)
                    tf_result = tf_out.eval()
                    assert np.allclose(ref, tf_result)
                    prof_tf = np.zeros(iters)
                    for i in range(iters):
                        t0 = time.time()
                        sess.run(tf_out.op)
                        t1 = time.time()
                        prof_tf[i] = t1 - t0
                    tf_time = np.min(prof_tf) * 1000.00
                    logger.debug(u'Best tf cumsum time  (ms) on ' +
                                 dev_string + ' :' + str(tf_time))
        sess.close()
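
The millisecond conversion used above, written out (an illustrative sketch): timeit returns the total wall time in seconds for 'number' iterations, so per-iteration milliseconds is total / iters * 1000, which for iters = 10 reduces to multiplying by 100.

import timeit

iters = 10
total_seconds = timeit.timeit('sum(range(1000))', number=iters)
per_iter_ms = total_seconds / iters * 1000.0
assert abs(per_iter_ms - 100.0 * total_seconds) < 1e-9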
Example #7
            # use_manual = False
            # opt_level = 0
            if use_tensorflow:
                i, j, f, o = tf.split(1, 4, concat_arg)
                new_c = tf.mul(
                    c, tf.sigmoid(f + forget)) + tf.sigmoid(i) * tf.tanh(j)
                new_h = tf.tanh(new_c) * tf.sigmoid(o)
                # dnc_dcat, dnc_dc = tf.gradients([new_c], [concat_arg, c], [d_new_c])
                # dnh_dcat, dnh_dc = tf.gradients([new_h], [concat_arg, c], [d_new_h])
                # grad = [dnc_dcat+dnh_dcat, dnc_dc + dnh_dc]
                grad = tf.gradients([new_c, new_h], [concat_arg, c],
                                    [d_new_c, d_new_h])
                trace_name = 'timeline_tf.ctf.json'
            else:
                if use_manual:
                    new_c, new_h = ovl.as_tensorflow(
                        lstm(concat_arg, c, forget_bias=0))
                    grad = tf.gradients([new_c, new_h], [concat_arg, c],
                                        [d_new_c, d_new_h])
                    trace_name = 'timeline_ovl_manual.ctf.json'
                else:
                    i, j, f, o = ops.split(concat_arg,
                                           split_dim=1,
                                           num_split=4)
                    new_c = ops.mul(
                        c, ops.sigmoid(f)) + ops.sigmoid(i) * ops.tanh(j)
                    new_h = ops.tanh(new_c) * ops.sigmoid(o)
                    new_c, new_h = ovl.as_tensorflow([new_c, new_h],
                                                     opt_level=opt_level)

                    grad = tf.gradients([new_c, new_h], [concat_arg, c],
                                        [d_new_c, d_new_h])
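
The commented-out lines and the single tf.gradients call agree because tf.gradients sums the weighted gradients of every output with respect to each input. A scalar NumPy sanity check of that identity (illustrative; y1 and y2 are arbitrary):

import numpy as np

x, g1, g2 = 0.7, 0.3, -1.2
# with y1 = x**2 and y2 = 3*x, the summed weighted gradient is g1*2x + g2*3
analytic = g1 * 2 * x + g2 * 3
# central finite difference of g1*y1 + g2*y2
eps = 1e-6
def weighted(v):
    return g1 * v**2 + g2 * 3 * v
numeric = (weighted(x + eps) - weighted(x - eps)) / (2 * eps)
assert np.allclose(analytic, numeric)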
Example #8
def run_tf(tensor_in_sizes, filter_in_sizes):
    # test TF 2D convolution operator in 1D vs. OVL
    total_size_1 = 1
    total_size_2 = 1
    for s in tensor_in_sizes:
        total_size_1 *= s
    for s in filter_in_sizes:
        total_size_2 *= s
    # Initializes the input tensor with array containing incrementing
    # numbers from 1.
    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
    tin1 = tf.constant(x1, shape=tensor_in_sizes, dtype=tf.float32)
    tin2 = tf.constant(x2, shape=filter_in_sizes, dtype=tf.float32)
    conv = tf.nn.conv2d(tin1,
                        tin2,
                        strides=[1, 1, 1, 1],
                        padding="SAME",
                        data_format='NHWC')

    # compare to OVL - need to convert input to 1-D, i.e. input_rows = filter_rows = 1
    # also transpose initial data since filter index is last in TF and first in OVL
    # TF input = batch, input_row, input_col, channels
    # TF filter = filter_row, filter_col, channels, num_filters
    # OVL NEC input = batches, num_elements, channels
    # OVL NEC filter = num_filters, kernel_size, channels
    assert tensor_in_sizes[1] == 1
    assert filter_in_sizes[0] == 1
    ovl_tensor_in_sizes = [
        tensor_in_sizes[0], tensor_in_sizes[2], tensor_in_sizes[3]
    ]
    num_filter = filter_in_sizes[3]
    num_elem = filter_in_sizes[1]
    num_chan = filter_in_sizes[2]
    ovl_filter_in_sizes = [num_filter, num_elem, num_chan]
    ovl.logger.debug(u'input and filter sizes: ' + str(ovl_tensor_in_sizes) +
                     ', ' + str(ovl_filter_in_sizes))
    ar1 = np.array(x1, dtype=np.float).reshape(ovl_tensor_in_sizes)
    # does not produce the correct results
    # ar2 = np.array(x2, dtype=np.float).reshape(ovl_filter_in_sizes, order='F')
    ar2 = np.zeros(ovl_filter_in_sizes, dtype=np.float)
    for col in range(0, num_elem):
        for chan in range(0, num_chan):
            for num in range(0, num_filter):
                index = col * num_chan * num_filter + chan * num_filter + num
                # print('ar2 ' + str(num) + ',' + str(col) + ',' + str(chan) + ' is index ' + str(index) + ' val: ' + str(x2[index]))
                ar2[num, col, chan] = x2[index]

    t0 = time.time()
    ref = reference(ar1,
                    ar2,
                    mode='same',
                    orientation='as-is',
                    data_format='NEC')
    t1 = time.time()
    np_time = (t1 - t0) * 1000

    iters = 100
    ovlOp = conv_1d(ar1,
                    ar2,
                    mode='same',
                    kernel_orientation='as-is',
                    data_format='NEC')
    ovl_cuda_time = 0
    if ovl.cuda_enabled:
        ovlResult, prof = ovl.profile(ovlOp,
                                      target_language='cuda',
                                      profiling_iterations=iters,
                                      opt_level=3)
        ovl_cuda_time = np.min(list(prof.values())[0])
        assert np.allclose(ovlResult, ref)
    #TODO - cpp is really slow...
    ovlcppResult, profcpp = ovl.profile(ovlOp,
                                        target_language='cpp',
                                        profiling_iterations=iters,
                                        opt_level=3)
    ovl_cpp_time = np.min(list(profcpp.values())[0])
    assert np.allclose(ovlcppResult, ref)

    # ensure TF runs on GPU
    test_config = tf.ConfigProto(allow_soft_placement=False)
    test_config.graph_options.optimizer_options.opt_level = -1

    # OVL-TF integration
    ovl_tf_time = 0
    # use the GPU only when CUDA is available (allow_soft_placement is False)
    dev_string = '/gpu:0' if ovl.cuda_enabled else '/cpu:0'
    with tf.Session(config=test_config) as sess:
        with tf.device(dev_string):
            ovlOp_tf = ovl.as_tensorflow(ovlOp)
            init = tf.initialize_all_variables()
            sess.run(init)
            ovlOp_tf_result = sess.run(ovlOp_tf)
            t0 = time.time()
            for dummy in itertools.repeat(None, iters):
                sess.run(ovlOp_tf.op)
            t1 = time.time()
            ovl_tf_time = (t1 - t0) / float(iters) * 1000.00
            assert np.allclose(ovlOp_tf_result, ref)
    sess.close()

    # run TF 2D conv alone
    tf_time = 0
    with tf.Session(config=test_config) as sess:
        with tf.device(dev_string):
            result = sess.run([conv])
            t0 = time.time()
            for dummy in itertools.repeat(None, iters):
                sess.run([conv.op])
            t1 = time.time()
            tf_time = (t1 - t0) / float(iters) * 1000.00
            # TF result is 4D - have to convert to 3D to match OVL
            tf_shape = result[0].shape
            assert (tf_shape[1] == 1)
            ovl_shape = [tf_shape[0], tf_shape[2], tf_shape[3]]
            tf_result = np.array(result[0], dtype=np.float).reshape(ovl_shape)
            #TODO - if number of filter elements is even, TF result does not match reference - first element "wraps" to end
            assert np.allclose(tf_result, ref)
    sess.close()
    times = [np_time, ovl_cuda_time, ovl_cpp_time, ovl_tf_time, tf_time]
    ovl.logger.debug(u'    time [np, OVL_cuda, OVL_cpp, OVL_TF, TF]: ' +
                     str(times))
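
The triple loop that fills ar2 is equivalent to a C-order reshape followed by a transpose, since index = (col * num_chan + chan) * num_filter + num walks the filter index fastest. A NumPy sketch of that equivalence (illustrative sizes):

import numpy as np

num_filter, num_elem, num_chan = 4, 3, 2
x2 = np.arange(1.0, num_filter * num_elem * num_chan + 1.0)
# loop version, as in run_tf above
ar2_loop = np.zeros([num_filter, num_elem, num_chan])
for col in range(num_elem):
    for chan in range(num_chan):
        for num in range(num_filter):
            ar2_loop[num, col, chan] = x2[col * num_chan * num_filter + chan * num_filter + num]
# vectorized: reshape to (col, chan, num) in C order, then move num to the front
ar2_vec = x2.reshape([num_elem, num_chan, num_filter]).transpose(2, 0, 1)
assert np.array_equal(ar2_loop, ar2_vec)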
Example #9
    def test_performance(self):
        """
        test the performance vs. numpy running standalone and from tensorflow
        based on tensorflow issue 813
        https://github.com/tensorflow/tensorflow/issues/813
        """
        import tensorflow as tf
        import timeit
        import time
        logger = ovl.logger
        iters = 10
        X = np.random.uniform(0, 1, size=(10000, 1000))
        # note: np.cumsum fails with a memory error at input size 10^6
        ref = np.cumsum(X, axis=0)
        # timeit returns total seconds for 'number' iterations; for 10 iterations, multiplying by 100 gives the per-iteration time in ms
        np_time = 100 * timeit.timeit('np.cumsum(X, axis=0)',
                                      setup='import numpy as np; X = np.random.uniform(0, 1, size=(10000, 1000))',
                                      number=iters)
        logger.info(u'Best numpy time (ms): ' + str(np_time))
        cumsumOp = cumsum(X, axis=0)
        ovl_cpp, prof_cpp = ovl.profile(cumsumOp, target_language='cpp', profiling_iterations=iters, opt_level=0)
        assert np.allclose(ref, ovl_cpp)
        ovl_cpp_time = np.min(list(prof_cpp.values())[0])
        logger.info(u'Best ovl cpp time (ms): ' + str(ovl_cpp_time))
        if ovl.cuda_enabled:
            ovl_cuda, prof_cuda = ovl.profile(cumsumOp, target_language='cuda', profiling_iterations=iters, opt_level=0)
            assert np.allclose(ref, ovl_cuda)
            ovl_cuda_time = np.min(list(prof_cuda.values())[0])
            logger.info(u'Best ovl cuda time  (ms): ' + str(ovl_cuda_time))

        # OVL-TF integration
        # ensure TF runs on GPU
        test_config = tf.ConfigProto(allow_soft_placement=False)
        test_config.graph_options.optimizer_options.opt_level = -1
        if ovl.cuda_enabled:
            devices = ['/cpu:0', '/gpu:0']
        else:
            devices = ['/cpu:0']
        with tf.Session(config=test_config) as sess:
            for dev_string in devices:
                with tf.device(dev_string):
                    cumsum_tf = ovl.as_tensorflow(cumsumOp)
                    sess.run(tf.initialize_all_variables())
                    cumsum_tf_result = sess.run(cumsum_tf)
                    prof_ovl = np.zeros(iters)
                    for i in range(iters):
                        t0 = time.time()
                        sess.run(cumsum_tf.op)
                        t1 = time.time()
                        prof_ovl[i] = t1 - t0
                    tf_ovl_time = np.min(prof_ovl) * 1000.00
                    logger.info(u'Best tf + ovl time  (ms) on ' + dev_string + ' :' + str(tf_ovl_time))
                    assert np.allclose(ref, cumsum_tf_result)

                    # TF cumsum
                    tf_out = tf.cumsum(X, axis=0, exclusive=False, reverse=False)
                    tf_result = tf_out.eval()
                    assert np.allclose(ref, tf_result)
                    prof_tf = np.zeros(iters)
                    for i in range(iters):
                        t0 = time.time()
                        sess.run(tf_out.op)
                        t1 = time.time()
                        prof_tf[i] = t1 - t0
                    tf_time = np.min(prof_tf) * 1000.00
                    logger.info(u'Best tf cumsum time  (ms) on ' + dev_string + ' :' + str(tf_time))
        sess.close()
Example #10
def run_tf(tensor_in_sizes, filter_in_sizes):
    # test TF 2D convolution operator in 1D vs. OVL
    total_size_1 = 1
    total_size_2 = 1
    for s in tensor_in_sizes:
        total_size_1 *= s
    for s in filter_in_sizes:
        total_size_2 *= s
    # Initializes the input tensor with array containing incrementing
    # numbers from 1.
    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
    tin1 = tf.constant(x1, shape=tensor_in_sizes, dtype=tf.float32)
    tin2 = tf.constant(x2, shape=filter_in_sizes, dtype=tf.float32)
    conv = tf.nn.conv2d(tin1, tin2,
                      strides=[1, 1, 1, 1],
                      padding="SAME",
                      data_format='NHWC')

    # compare to OVL - need to convert input to 1-D, i.e. input_rows = filter_rows = 1
    # also transpose initial data since filter index is last in TF and first in OVL
    # TF input = batch, input_row, input_col, channels
    # TF filter = filter_row, filter_col, channels, num_filters
    # OVL NEC input = batches, num_elements, channels
    # OVL NEC filter = num_filters, kernel_size, channels
    assert tensor_in_sizes[1] == 1
    assert filter_in_sizes[0] == 1
    ovl_tensor_in_sizes = [tensor_in_sizes[0], tensor_in_sizes[2], tensor_in_sizes[3]]
    num_filter = filter_in_sizes[3]
    num_elem = filter_in_sizes[1]
    num_chan = filter_in_sizes[2]
    ovl_filter_in_sizes = [num_filter, num_elem, num_chan]
    ovl.logger.debug(u'input and filter sizes: ' + str(ovl_tensor_in_sizes) + ', ' + str(ovl_filter_in_sizes))
    ar1 = np.array(x1, dtype=np.float).reshape(ovl_tensor_in_sizes)
    # does not produce the correct results
    # ar2 = np.array(x2, dtype=np.float).reshape(ovl_filter_in_sizes, order='F')
    ar2 = np.zeros(ovl_filter_in_sizes, dtype=np.float)
    for col in range(0, num_elem):
        for chan in range(0, num_chan):
            for num in range(0, num_filter):
                index = col * num_chan * num_filter + chan * num_filter + num
                # print('ar2 ' + str(num) + ',' + str(col) + ',' + str(chan) + ' is index ' + str(index) + ' val: ' + str(x2[index]))
                ar2[num, col, chan] = x2[index]

    t0 = time.time()
    ref = reference(ar1, ar2, mode='same', orientation='as-is', data_format='NEC')
    t1 = time.time()
    np_time = (t1 - t0) * 1000

    iters = 100
    ovlOp = conv_1d(ar1, ar2, mode='same', kernel_orientation='as-is', data_format='NEC')
    ovl_cuda_time = 0
    if ovl.cuda_enabled:
        ovlResult, prof = ovl.profile(ovlOp, target_language='cuda', profiling_iterations=iters, opt_level=3)
        ovl_cuda_time = np.min(list(prof.values())[0])
        assert np.allclose(ovlResult, ref)
    #TODO - cpp is really slow...
    ovlcppResult, profcpp = ovl.profile(ovlOp, target_language='cpp', profiling_iterations=iters, opt_level=3)
    ovl_cpp_time = np.min(list(profcpp.values())[0])
    assert np.allclose(ovlcppResult, ref)

    # ensure TF runs on GPU
    test_config = tf.ConfigProto(allow_soft_placement=False)
    test_config.graph_options.optimizer_options.opt_level = -1

    # OVL-TF integration
    ovl_tf_time = 0

    dev_string = '/cpu:0'
    if ovl.cuda_enabled:
        dev_string = '/gpu:0'
    with tf.Session(config=test_config) as sess:
        with tf.device(dev_string):
            ovlOp_tf = ovl.as_tensorflow(ovlOp)
            init = tf.initialize_all_variables()
            sess.run(init)
            ovlOp_tf_result = sess.run(ovlOp_tf)
            t0 = time.time()
            for dummy in itertools.repeat(None, iters):
                sess.run(ovlOp_tf.op)
            t1 = time.time()
            ovl_tf_time = (t1 - t0) / float(iters) * 1000.00
            assert np.allclose(ovlOp_tf_result, ref)
    sess.close()

    # run TF 2D conv alone
    tf_time = 0
    with tf.Session(config=test_config) as sess:
        with tf.device(dev_string):
            result = sess.run([conv])
            t0 = time.time()
            for dummy in itertools.repeat(None, iters):
                sess.run([conv.op])
            t1 = time.time()
            tf_time = (t1 - t0) / float(iters) * 1000.00
            # TF result is 4D - have to convert to 3D to match OVL
            tf_shape = result[0].shape
            assert tf_shape[1] == 1
            ovl_shape = [tf_shape[0], tf_shape[2], tf_shape[3]]
            tf_result = np.array(result[0], dtype=np.float).reshape(ovl_shape)
            #TODO - if number of filter elements is even, TF result does not match reference - first element "wraps" to end
            assert np.allclose(tf_result, ref)
    sess.close()
    times = [np_time, ovl_cuda_time, ovl_cpp_time, ovl_tf_time, tf_time]
    ovl.logger.debug(u'    time [np, OVL_cuda, OVL_cpp, OVL_TF, TF]: ' + str(times))
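
The final 4-D to 3-D conversion just drops the unit row axis, so the reshape is the same as a squeeze. A minimal NumPy check (illustrative shapes):

import numpy as np

tf_shape = (2, 1, 5, 3)  # batch, rows=1, cols, channels
out4d = np.arange(np.prod(tf_shape), dtype=np.float64).reshape(tf_shape)
ovl_shape = [tf_shape[0], tf_shape[2], tf_shape[3]]
assert np.array_equal(out4d.reshape(ovl_shape), np.squeeze(out4d, axis=1))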