def xla_runner(fe, model_name, batch_size, device, xla):
    if fe != 'tf':
        return None
    print('TF inter thread: {}, intra thread: {}'.format(
        tf.config.threading.get_inter_op_parallelism_threads(),
        tf.config.threading.get_intra_op_parallelism_threads()))
    tf.keras.backend.clear_session()
    tf.config.optimizer.set_jit(xla)
    if device == 'cpu':
        # FIXME: add a better way to use the cpu device in tf
        # os.environ['CUDA_VISIBLE_DEVICES'] = ''
        pass
    else:
        if 'CUDA_VISIBLE_DEVICES' in os.environ:
            del os.environ['CUDA_VISIBLE_DEVICES']
    model, shape = util.tf_keras_model(model_name)

    class runner_wrapper:
        def __init__(self, graph_model, need_eval, batch_size=1):
            self.batch_size = batch_size
            self.data = np.random.rand(batch_size, *shape).astype(np.float32)
            self.need_eval = need_eval
            self.graph_model = graph_model

        def __call__(self, data_size):
            if self.need_eval:
                self.session_runner(data_size)
            else:
                for _ in range(data_size // self.batch_size):
                    print("graph start",
                          datetime.now().strftime("%m/%d/%Y, %H:%M:%S.%f"))
                    start = time.time()
                    ret = self.graph_model(self.data)
                    end = time.time()
                    print("graph end",
                          datetime.now().strftime("%m/%d/%Y, %H:%M:%S.%f"))
                    print("graph_model time: %s us" % ((end - start) * 10**6))
                    # an explicit eval is only needed when eager execution is off
                    ret.numpy()

        def session_runner(self, data_size):
            with tf.compat.v1.Session() as sess:
                sess.run(tf.compat.v1.global_variables_initializer())
                for _ in range(data_size // self.batch_size):
                    ret = self.graph_model(self.data)
                    ret_np = ret.eval()

    graph_mode = tf.function(lambda x: model(x))
    runner = runner_wrapper(graph_mode, False, batch_size=batch_size)
    return runner
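
# Hedged usage sketch (not part of the original script): build an XLA-jitted
# inference runner and push one round of data through it. The model name
# 'resnet50' is illustrative; util.tf_keras_model is assumed to accept it.
def _xla_smoke_test():
    runner = xla_runner('tf', 'resnet50', batch_size=8, device='gpu', xla=True)
    if runner is not None:  # xla_runner returns None for non-TF frontends
        runner(1024)  # 1024 samples => 128 batches of 8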
def train_runner(model_name, batch_size, device='gpu', xla=True):
    tf.keras.backend.clear_session()
    tf.config.optimizer.set_jit(xla)
    if device == "gpu":
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            # NOTE: pins training to the second GPU; requires >= 2 GPUs
            tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # Set up the standard model.
    model, shape = util.tf_keras_model(model_name)
    opt = tf.optimizers.SGD(0.01)

    data = tf.random.uniform([batch_size, 224, 224, 3])
    target = tf.random.uniform([batch_size, 1], minval=0, maxval=999,
                               dtype=tf.int64)

    @tf.function
    def benchmark_step():
        with tf.GradientTape() as tape:
            probs = model(data, training=True)
            loss = tf.losses.sparse_categorical_crossentropy(target, probs)
        gradients = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))

    # NOTE: currently unused helper
    def log(s, nl=True):
        print(s, end='\n' if nl else '')

    class runner_wrapper:
        def __init__(self, benchmark_step, batch_size):
            self.step = benchmark_step
            self.batch_size = batch_size

        def __call__(self, data_size):
            for _ in range(data_size // self.batch_size):
                self.step()

    return runner_wrapper(benchmark_step, batch_size)
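
# Hedged helper sketch (not part of the original script): wall-clock timing
# for any runner returned by the *_runner factories in this file.
def time_runner(runner, data_size):
    start = time.time()
    runner(data_size)
    elapsed = time.time() - start
    print('processed %d samples in %.3f s (%.1f samples/s)'
          % (data_size, elapsed, data_size / elapsed))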
def tf2tvm_runner(model_name, batch_size=1, backend='cuda'):
    # tvm's cuda backend has a known issue with mobilenet, so skip it
    if model_name == 'mobilenet' and backend == 'cuda':
        return None
    model, shape = util.tf_keras_model(model_name)
    # TODO: why does tvm need the reversed shape?
    shape = shape[::-1]
    data = np.random.rand(batch_size, *shape)
    # input_name has to match the model's input name; use model.input_names[0]
    # instead of the hard-coded 'input_1' so different models can be compiled
    # inside the same round
    # TODO: why can the same model compile for both cuda and llvm in one
    # process? (models for different backends don't seem to affect each other)
    input_name = model.input_names[0]
    shape_dict = {input_name: data.shape}
    mod, params = relay.frontend.from_keras(model, shape_dict)
    if backend == 'llvm':
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build(mod, target='llvm', target_host='llvm',
                              params=params)
        ctx = tvm.cpu()
    else:
        with tvm.transform.PassContext(opt_level=3):
            # the target has to be tvm.target.cuda(); the plain string 'cuda'
            # does not work here
            lib = relay.build(mod, target=tvm.target.cuda(), params=params)
        ctx = tvm.gpu()
    module = graph_runtime.GraphModule(lib["default"](ctx))
    # FIXME: why is float32 necessary here? float64 fails
    dtype = "float32"
    data = tvm.nd.array(data.astype(dtype))

    def runner(data_size):
        for _ in range(data_size // batch_size):
            module.set_input(input_name, data)
            module.run()  # without run(), get_output returns stale results
            tvm_output = module.get_output(0)

    return runner
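
# Hedged usage sketch (not part of the original script): compile a Keras model
# through TVM on the CPU backend and run a few batches. 'resnet50' is an
# illustrative model name.
def _tvm_smoke_test():
    runner = tf2tvm_runner('resnet50', batch_size=1, backend='llvm')
    if runner is not None:  # None is only returned for the mobilenet/cuda case
        runner(16)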
def grappler_runner(model_name, batch_size):
    model, shape = util.tf_keras_model(model_name)
    grap_model = tf.function(lambda x: model(x))

    class runner_wrapper:
        def __init__(self, batch_size=1):
            self.batch_size = batch_size
            self.data = np.random.rand(batch_size, *shape).astype(np.float32)

        def __call__(self, data_size):
            # `options` is a context manager that toggles grappler's
            # experimental optimizer options for the duration of the block
            with options({
                    'layout_optimizer': False,
                    'function_optimization': False
            }):
                for _ in range(data_size // self.batch_size):
                    ret = grap_model(self.data)

    runner = runner_wrapper(batch_size=batch_size)
    return runner
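
# Sketch of the `options` context manager used by grappler_runner above.
# Assumption: the original script defines or imports an equivalent helper
# (this pattern follows the TF grappler documentation); it is shown here so
# the `with options({...})` call above is self-contained.
from contextlib import contextmanager

@contextmanager
def options(new_opts):
    old_opts = tf.config.optimizer.get_experimental_options()
    tf.config.optimizer.set_experimental_options(new_opts)
    try:
        yield
    finally:
        tf.config.optimizer.set_experimental_options(old_opts)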
def tf2trt_runner(model_name, batch_size=1):
    model, shape = util.tf_keras_model(model_name)
    model_path = os.path.join(save_dir, model_name + '_saved_model')
    if not os.path.isdir(model_path):
        model.save(model_path)
    trt_path = os.path.join(save_dir, model_name + '_TFTRT_FP32_saved_model')
    # always regenerate the TRT model (instead of checking os.path.isdir) to
    # avoid incompatibilities between different onnx/trt versions
    print('Converting to TF-TRT FP32...')
    conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(
        precision_mode=trt.TrtPrecisionMode.FP32,
        max_workspace_size_bytes=8000000000)
    converter = trt.TrtGraphConverterV2(input_saved_model_dir=model_path,
                                        conversion_params=conversion_params)
    converter.convert()
    converter.save(output_saved_model_dir=trt_path)
    print('Done Converting to TF-TRT FP32')

    saved_model_loaded = tf.saved_model.load(trt_path,
                                             tags=[tag_constants.SERVING])
    signature_keys = list(saved_model_loaded.signatures.keys())
    print(signature_keys)
    infer = saved_model_loaded.signatures['serving_default']
    data = np.random.rand(batch_size, *shape).astype(np.float32)
    x = tf.constant(data)

    def runner(data_size):
        for _ in range(data_size // batch_size):
            infer(x)

    return runner
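
# Hedged usage sketch (not part of the original script): convert a model with
# TF-TRT and push a single batch through it as a smoke test. 'resnet50' is an
# illustrative model name; save_dir must point at a writable directory.
def _trt_smoke_test():
    runner = tf2trt_runner('resnet50', batch_size=8)
    runner(8)  # one batch is enough to confirm the TRT engine executes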
parser.add_argument("--device", choices=['gpu', 'cpu'], default='gpu', help='device to run on') parser.add_argument("--batch", type=int, default=1, help='batch size') parser.add_argument("--size", type=int, default=1024, help='data size') parser.add_argument("--visual", action='store_true', help='Output tensorboard log for visualization') args = parser.parse_args() os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' if args.visual: model, shape = util.tf_keras_model(args.model) log_dir = "logs/fit/{}/{}".format( args.model, datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1) x_train = tf.random.uniform([args.batch, 224, 224, 3]) y_train = tf.random.uniform([args.batch, 1], minval=0, maxval=999, dtype=tf.int64) model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) model.fit(x=x_train,