Example #1
def xla_runner(fe, model_name, batch_size, device, xla):
    if fe != 'tf':
        return None

    print('TF inter thread: {}, intra thread: {}'.format(
        tf.config.threading.get_inter_op_parallelism_threads(),
        tf.config.threading.get_intra_op_parallelism_threads()))
    # reset the session, then enable or disable XLA JIT according to the flag
    tf.keras.backend.clear_session()
    tf.config.optimizer.set_jit(xla)
    if device == 'cpu':
        # FIXME: add a better way to use cpu device in tf
        # os.environ['CUDA_VISIBLE_DEVICES'] = ''
        pass
    else:
        if 'CUDA_VISIBLE_DEVICES' in os.environ:
            del os.environ['CUDA_VISIBLE_DEVICES']

    model, shape = util.tf_keras_model(model_name)

    class runner_wrapper:
        def __init__(self, graph_model, need_eval, batch_size=1):
            self.batch_size = batch_size
            self.data = np.random.rand(batch_size, *shape).astype(np.float32)
            self.need_eval = need_eval
            self.graph_model = graph_model

        def __call__(self, data_size):
            if self.need_eval:
                self.session_runner(data_size)
            else:
                for _ in range(data_size // self.batch_size):
                    # import pdb; pdb.set_trace()
                    print("graph start",
                          datetime.now().strftime("%m/%d/%Y, %H:%M:%S.%f"))
                    start = time.time()
                    ret = self.graph_model(self.data)
                    end = time.time()
                    print("graph end",
                          datetime.now().strftime("%m/%d/%Y, %H:%M:%S.%f"))
                    print("graph_model time: %s us" % ((end - start) * 10**6))
                    # print(ret)
                    ret.numpy()

        # an explicit .eval() is only needed when eager execution is off
        def session_runner(self, data_size):
            with tf.compat.v1.Session() as sess:
                sess.run(tf.compat.v1.global_variables_initializer())
                for _ in range(data_size // self.batch_size):
                    ret = self.graph_model(self.data)
                    ret_np = ret.eval()

    graph_mode = tf.function(lambda x: model(x))

    runner = runner_wrapper(graph_mode, False, batch_size=batch_size)
    return runner
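A minimal usage sketch (not part of the original listing): how the returned runner might be driven and timed. The model name 'resnet50' and the sample count are assumptions for illustration; util.tf_keras_model must support that name.

# Hedged usage sketch: drive the XLA inference runner and time it end to end.
# Assumes the imports used by the snippet above (tf, np, util); 'resnet50' is an assumed model name.
import time

runner = xla_runner('tf', 'resnet50', batch_size=8, device='gpu', xla=True)
if runner is not None:
    start = time.time()
    runner(1024)  # process 1024 samples in batches of 8
    print('xla inference: %.3f s total' % (time.time() - start))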
Example #2
def train_runner(model_name, batch_size, device='gpu', xla=True):
    # reset the session, then enable or disable XLA JIT according to the flag
    tf.keras.backend.clear_session()
    tf.config.optimizer.set_jit(xla)
    if device == "gpu":
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    # Set up standard model.
    model, shape = util.tf_keras_model(model_name)
    opt = tf.optimizers.SGD(0.01)

    data = tf.random.uniform([batch_size, 224, 224, 3])
    target = tf.random.uniform([batch_size, 1],
                               minval=0,
                               maxval=999,
                               dtype=tf.int64)

    @tf.function
    def benchmark_step():
        with tf.GradientTape() as tape:
            probs = model(data, training=True)
            loss = tf.losses.sparse_categorical_crossentropy(target, probs)

        gradients = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))

    def log(s, nl=True):
        print(s, end='\n' if nl else '')

    class runner_wrapper:
        def __init__(self, benchmark_step, batch_size):
            self.step = benchmark_step
            self.batch_size = batch_size

        def __call__(self, data_size):
            for _ in range(data_size // self.batch_size):
                self.step()

    return runner_wrapper(benchmark_step, batch_size)
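A rough way to exercise the training runner (an illustration, not part of the original listing). The first call is a warm-up so tf.function tracing and XLA compilation are not counted; the model name and sample counts are assumptions.

# Hedged usage sketch: warm up, then measure steady-state training throughput.
# Assumes tf and util are imported as in the snippet above; 'resnet50' is an assumed model name.
import time

step = train_runner('resnet50', batch_size=16, device='gpu', xla=True)
step(16)  # warm-up: triggers tf.function tracing / XLA compilation
start = time.time()
step(1024)
print('training: %.2f steps/s' % ((1024 / 16) / (time.time() - start)))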
Example #3
def tf2tvm_runner(model_name, batch_size=1, backend='cuda'):
    # the TVM cuda backend has issues with mobilenet
    if model_name == 'mobilenet' and backend == 'cuda':
        return None
    model, shape = util.tf_keras_model(model_name)
    # TODO: why does tvm need the shape reversed?
    shape = shape[::-1]
    data = np.random.rand(batch_size, *shape)
    # input_name has to match the model's input name;
    # use model.input_names[0] instead of the hard-coded 'input_1' so different models can be compiled in the same round
    # TODO: why can the same model be compiled with both the cuda and llvm backends in the same process? (models built for different backends don't affect each other?)
    input_name = model.input_names[0]
    shape_dict = {input_name: data.shape}
    mod, params = relay.frontend.from_keras(model, shape_dict)

    if backend == 'llvm':
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod,
                              target='llvm',
                              target_host='llvm',
                              params=params)

        ctx = tvm.cpu()
        module = graph_runtime.GraphModule(lib["default"](ctx))
    else:
        with tvm.transform.PassContext(opt_level=3):
            # the target has to be tvm.target.cuda(); the plain string 'cuda' doesn't work
            lib = relay.build(mod, target=tvm.target.cuda(), params=params)

        ctx = tvm.gpu()
        module = graph_runtime.GraphModule(lib["default"](ctx))

    # FIXME: why is float32 necessary here? it fails with float64
    dtype = "float32"
    data = tvm.nd.array(data.astype(dtype))

    def runner(data_size):
        for _ in range(data_size // batch_size):
            module.set_input(input_name, data)
            module.run()  # execute the graph; without this, get_output returns stale data
            tvm_output = module.get_output(0)

    return runner
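A minimal usage sketch (an illustration, not part of the original listing) that times the TVM runner on the llvm backend, since the snippet notes the cuda backend has trouble with mobilenet. The model name and sample count are assumptions.

# Hedged usage sketch: time the TVM runner on the CPU (llvm) backend.
# Assumes tvm, relay, graph_runtime, np and util are imported as in the snippet above.
import time

run = tf2tvm_runner('resnet50', batch_size=1, backend='llvm')
if run is not None:
    start = time.time()
    run(64)  # 64 samples, batch size 1
    print('tvm llvm: %.3f s for 64 samples' % (time.time() - start))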
Example #4
def grappler_runner(model_name, batch_size):
    model, shape = util.tf_keras_model(model_name)
    # data = np.random.rand(batch_size, *shape).astype(np.float32)
    grap_model = tf.function(lambda x: model(x))

    class runner_wrapper:
        def __init__(self, batch_size=1):
            self.batch_size = batch_size
            self.data = np.random.rand(batch_size, *shape).astype(np.float32)

        def __call__(self, data_size):
            with options({
                    'layout_optimizer': False,
                    'function_optimization': False
            }):
                for _ in range(data_size // self.batch_size):
                    ret = grap_model(self.data)

    runner = runner_wrapper(batch_size=batch_size)
    return runner
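The `options` context manager used above is not defined in this snippet. One plausible definition, following the tf.config.optimizer experimental-options pattern, is sketched below; the original helper may differ.

# One plausible definition of the `options` context manager used above
# (an assumption; the original helper is defined elsewhere in the script).
import contextlib

@contextlib.contextmanager
def options(opts):
    # temporarily apply the given grappler optimizer options, then restore the old ones
    old_opts = tf.config.optimizer.get_experimental_options()
    tf.config.optimizer.set_experimental_options(opts)
    try:
        yield
    finally:
        tf.config.optimizer.set_experimental_options(old_opts)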
Example #5
def tf2trt_runner(model_name, batch_size=1):
    model, shape = util.tf_keras_model(model_name)
    model_path = os.path.join(save_dir, model_name + '_saved_model')
    if not os.path.isdir(model_path):
        model.save(model_path)

    trt_path = os.path.join(save_dir, model_name + '_TFTRT_FP32_saved_model')

    # if not os.path.isdir(trt_path):
    # always regenerate the model to avoid incompatibilities between different onnx/trt versions
    print('Converting to TF-TRT FP32...')
    conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
        precision_mode=trt.TrtPrecisionMode.FP32,
        max_workspace_size_bytes=8000000000)

    converter = trt.TrtGraphConverterV2(input_saved_model_dir=model_path,
                                        conversion_params=conversion_params)
    converter.convert()
    converter.save(output_saved_model_dir=trt_path)
    print('Done Converting to TF-TRT FP32')

    saved_model_loaded = tf.saved_model.load(trt_path,
                                             tags=[tag_constants.SERVING])
    signature_keys = list(saved_model_loaded.signatures.keys())
    print(signature_keys)

    infer = saved_model_loaded.signatures['serving_default']
    data = np.random.rand(batch_size, *shape).astype(np.float32)
    x = tf.constant(data)

    def runner(data_size):
        for _ in range(data_size // batch_size):
            infer(x)

    return runner
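A minimal invocation sketch (an illustration, not part of the original listing), assuming save_dir points at a writable directory and the local TensorRT install matches the TensorFlow build. The model name and sample count are assumptions; the conversion step itself can take a while.

# Hedged usage sketch: convert once, then run repeated TF-TRT FP32 inference.
# Assumes tf, np, trt, tag_constants, util and save_dir are defined as in the snippet above.
import time

run = tf2trt_runner('resnet50', batch_size=1)
start = time.time()
run(256)  # 256 inferences through the converted SavedModel signature
print('tf-trt fp32: %.3f s for 256 samples' % (time.time() - start))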
Example #6
    parser.add_argument("--device",
                        choices=['gpu', 'cpu'],
                        default='gpu',
                        help='device to run on')
    parser.add_argument("--batch", type=int, default=1, help='batch size')
    parser.add_argument("--size", type=int, default=1024, help='data size')
    parser.add_argument("--visual",
                        action='store_true',
                        help='Output tensorboard log for visualization')

    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    if args.visual:
        model, shape = util.tf_keras_model(args.model)
        log_dir = "logs/fit/{}/{}".format(
            args.model,
            datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                              histogram_freq=1)
        x_train = tf.random.uniform([args.batch, 224, 224, 3])
        y_train = tf.random.uniform([args.batch, 1],
                                    minval=0,
                                    maxval=999,
                                    dtype=tf.int64)
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        model.fit(x=x_train,