"Parent ids", tf_parent_ids_result, op_parent_ids_result, shape=[1 + max_seq_len, batch_size * beam_width]) int_result_cross_check("Sequence lengths", tf_sequence_lengths_result, op_sequence_lengths_result, shape=[1, batch_size * beam_width]) int_result_cross_check( "Finalized output ids", finalized_tf_output_ids_result.T, finalized_op_output_ids_result.T, shape=[1 + max_seq_len, batch_size * beam_width]) if args.test_time == 1 or args.test_tf_time == 1 or args.test_op_time == 1: if args.test_time == 1 or args.test_tf_time == 1: tf_time_result = time_test(sess, finalized_tf_output_ids, iterations=50, warmup=True) if args.test_time == 1 or args.test_op_time == 1: op_time_result = time_test(sess, finalized_op_output_ids, iterations=50, warmup=True) if args.test_time == 1 or args.test_tf_time == 1: print("[INFO] TF execution time: {} ms".format(tf_time_result)) if args.test_time == 1 or args.test_op_time == 1: print("[INFO] OP execution time: {} ms".format(op_time_result))
# encoder_variables_dict_2[var.name] = val # op_encoder_result_2 = op_encoder(inputs=from_tensor, # encoder_args=encoder_args, # attention_mask=attention_mask, # encoder_vars_dict=encoder_variables_dict_2, # sequence_length=sequence_length) # op_encoder_result_val_2 = sess.run(op_encoder_result_2) # cross_check("Encoder TF v.s. FT with numpy input", tf_encoder_result_val, # op_encoder_result_val_2, atol_threshold) if args.test_time == 1: ite = 50 #time.sleep(25) tf_time = time_test(sess, tf_encoder_result, ite) #time.sleep(25) op_encoder_int8_v1_time = time_test(sess, op_encoder_int8_v1_result, ite) #time.sleep(25) op_encoder_int8_v2_time = time_test(sess, op_encoder_int8_v2_result, ite) #time.sleep(25) op_encoder_notInt8_time = time_test(sess, op_encoder_notInt8_result, ite) # op_time_2 = time_test(sess, op_encoder_result_2, ite) print( "[INFO] batch_size {} max_seq_len {} {} layer TF-time {:6.2f} ms" .format(batch_size, max_seq_len, num_layer, tf_time)) print(
# Hand every variable currently in the GLOBAL_VARIABLES collection to the
# FasterTransformer encoder op, so it runs on the weights already in the graph.
encoder_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
op_encoder_result = op_encoder(
    inputs=from_tensor,
    encoder_args=encoder_args,
    encoder_vars=encoder_variables,
    attention_mask=attention_mask,
)

config = tf.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = (
    tf.OptimizerOptions.ON_1
)

with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()

    # Dump "<index> <name> <shape> <dtype>" for each variable given to the op.
    for position, variable in enumerate(encoder_variables):
        fields = (position, variable.name, variable.shape, variable.dtype)
        print(" ".join(str(field) for field in fields))
    print("#################################")

    # Evaluate both implementations and compare them within atol_threshold.
    tf_encoder_result_val = sess.run(tf_encoder_result)
    op_encoder_result_val = sess.run(op_encoder_result)
    cross_check(
        "Encoder",
        tf_encoder_result_val,
        op_encoder_result_val,
        atol_threshold,
    )

    if args.test_time == 1:
        ite = 100
        tf_time = time_test(sess, tf_encoder_result, ite)
        op_time = time_test(sess, op_encoder_result, ite)
        print("[INFO] TF encoder time costs: {} ms".format(tf_time))
        print("[INFO] OP encoder time costs: {} ms".format(op_time))
# args.test_time is scanned character by character: each digit found in it
# adds one graph output to the list of things to benchmark.
time_args = args.test_time
test_lists = []
test_names = []
if "0" in time_args:
    test_lists.append(finalized_tf_output_ids)
    test_names.append("TF-decoding-beamsearch")
if "1" in time_args:
    test_lists.append(finalized_op_output_ids)
    test_names.append("FT-OP-decoding-beamsearch")
if "2" in time_args:
    test_lists.append(tf_sampling_target_ids)
    test_names.append("TF-decoding-sampling")
if "3" in time_args:
    test_lists.append(op_sampling_target_ids)
    test_names.append("FT-OP-decoding-sampling")

# Time every selected output, in the order it was selected.
test_time_result = [
    time_test(sess, target, iterations=10, warmup=True)
    for target in test_lists
]

beamsearch_report = (
    "[INFO] batch_size {} beam_width {} head_num {} size_per_head {} seq_len {} "
    "decoder_layers {} vocab_size {} {}-time {:6.2f} ms."
)
sampling_report = (
    "[INFO] batch_size {} topk {} topp {} head_num {} size_per_head {} seq_len {} "
    "decoder_layers {} vocab_size {} {}-time {:6.2f} ms."
)

# The report layout depends on whether the entry is beam search or sampling.
for name, t_result in zip(test_names, test_time_result):
    if "beamsearch" in name:
        print(beamsearch_report.format(
            batch_size, beam_width, head_num, size_per_head, max_seq_len,
            num_layer, vocab_size, name, t_result))
    elif "sampling" in name:
        print(sampling_report.format(
            batch_size, sampling_topk, sampling_topp, head_num, size_per_head,
            max_seq_len, num_layer, vocab_size, name, t_result))
def encoder_sample(args_dict):
    """Cross-check the TF encoder against the FasterTransformer encoder op.

    Three graphs are built on the same random input and the same variables:
    the TF encoder, the FT op with ``remove_padding=False`` and the FT op
    with ``remove_padding=True`` ("EFF"). Their outputs are compared with
    ``cross_check`` and, optionally, timed.

    Args:
        args_dict: mapping providing the keys read below: 'batch_size',
            'num_layer', 'max_seq_len', 'avg_seq_len' (-1 selects
            ``max_seq_len / 2``), 'head_number', 'size_per_head',
            'int8_mode', 'allow_gemm_test' (string, compared to "true"
            case-insensitively), 'data_type' ("fp16" selects half precision,
            anything else keeps fp32), 'test_time' (1 enables timing) and
            'thread_num' (> 1 enables the multi-threaded run).

    Returns:
        The largest absolute element-wise difference between the TF result
        and either of the two FT results.
    """
    print("\n=============== Argument ===============")
    for key in args_dict:
        print("{}: {}".format(key, args_dict[key]))
    print("========================================")

    np.random.seed(1)
    tf.set_random_seed(1)

    batch_size = args_dict['batch_size']
    num_layer = args_dict['num_layer']
    max_seq_len = args_dict['max_seq_len']
    avg_seq_len = args_dict['avg_seq_len']
    head_num = args_dict['head_number']
    size_per_head = args_dict['size_per_head']
    int8_mode = args_dict['int8_mode']
    allow_gemm_test = args_dict['allow_gemm_test'].lower() == "true"

    if args_dict['data_type'] == "fp16":
        tf_datatype = tf.float16
        atol_threshold = 3e-2
    else:
        tf_datatype = tf.float32
        atol_threshold = 3e-5

    hidden_dim = head_num * size_per_head

    # This draw was never used as the sequence lengths (both branches below
    # overwrite it). It is kept, with its result discarded, only so that the
    # random stream feeding `from_data` stays the same as before.
    np.random.randint(1, max_seq_len + 1, size=batch_size)
    if avg_seq_len != -1:
        # An explicit average length was requested: use it for every sample.
        sequence_length = np.ones(batch_size) * avg_seq_len
    else:
        sequence_length = np.ones(batch_size) * (max_seq_len / 2)
    sequence_length = sequence_length.astype(np.int32)

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)

    attention_mask = build_sequence_mask(sequence_length,
                                         num_heads=head_num,
                                         maximum_length=max_seq_len,
                                         dtype=tf_datatype)

    encoder_args = TransformerArgument(beam_width=1,
                                       head_num=head_num,
                                       size_per_head=size_per_head,
                                       num_layer=num_layer,
                                       dtype=tf_datatype,
                                       remove_padding=False,
                                       int8_mode=int8_mode,
                                       allow_gemm_test=allow_gemm_test)
    # Same configuration, but with padding removal switched on.
    eff_encoder_args = copy.deepcopy(encoder_args)
    eff_encoder_args.remove_padding = True

    tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask)

    encoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    encoder_variables_dict = {v.name: v for v in encoder_vars}

    op_encoder_result = op_encoder(inputs=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask,
                                   encoder_vars_dict=encoder_variables_dict,
                                   sequence_length=sequence_length)
    eff_encoder_result = op_encoder(inputs=from_tensor,
                                    encoder_args=eff_encoder_args,
                                    attention_mask=attention_mask,
                                    encoder_vars_dict=encoder_variables_dict,
                                    sequence_length=sequence_length)

    # Because FasterTransformer skips some computation for the padding parts,
    # the cross check result would be wrong if these parts were not masked.
    # The mask is identical for all three results, so it is built once.
    padding_mask = tf.expand_dims(
        tf.sequence_mask(sequence_length, maxlen=max_seq_len,
                         dtype=tf_datatype),
        axis=-1)
    tf_encoder_result = tf_encoder_result * padding_mask
    op_encoder_result = op_encoder_result * padding_mask
    eff_encoder_result = eff_encoder_result * padding_mask

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        for idx, name in enumerate(encoder_variables_dict):
            variable = encoder_variables_dict[name]
            print(" ".join(
                str(field)
                for field in (idx, name, variable.shape, variable.dtype)))
        print("#################################")

        tf_encoder_result_val = sess.run(tf_encoder_result)
        op_encoder_result_val = sess.run(op_encoder_result)
        eff_encoder_result_val = sess.run(eff_encoder_result)

        cross_check("Encoder TF v.s. FT with tensor input",
                    tf_encoder_result_val, op_encoder_result_val,
                    atol_threshold)
        cross_check("Encoder TF v.s. EFF-FT with tensor input",
                    tf_encoder_result_val, eff_encoder_result_val,
                    atol_threshold)

        tf_flat = tf_encoder_result_val.reshape([-1])
        op_diff = np.abs(tf_flat - op_encoder_result_val.reshape([-1]))
        eff_diff = np.abs(tf_flat - eff_encoder_result_val.reshape([-1]))
        max_diff = max(op_diff.max(), eff_diff.max())

        ite = 50

        def _cond(from_tensor):
            # Always true: the loop is bounded by maximum_iterations instead.
            return tf.constant(True)

        def _ft_body(from_tensor):
            return op_encoder(inputs=from_tensor,
                              encoder_args=encoder_args,
                              attention_mask=attention_mask,
                              encoder_vars_dict=encoder_variables_dict,
                              sequence_length=sequence_length)

        def _eff_body(from_tensor):
            return op_encoder(inputs=from_tensor,
                              encoder_args=eff_encoder_args,
                              attention_mask=attention_mask,
                              encoder_vars_dict=encoder_variables_dict,
                              sequence_length=sequence_length)

        def _tf_body(from_tensor):
            return tf_encoder(input_tensor=from_tensor,
                              encoder_args=encoder_args,
                              attention_mask=attention_mask)

        def _repeat(body):
            # Chain `ite` encoder passes inside one graph execution.
            return tf.while_loop(_cond,
                                 body,
                                 loop_vars=[from_tensor],
                                 back_prop=False,
                                 maximum_iterations=ite)

        tf_while_tensor = _repeat(_tf_body)
        ft_while_tensor = _repeat(_ft_body)
        eff_while_tensor = _repeat(_eff_body)

        if args_dict['test_time'] == 1:
            # Using while loop to run 'ite' times to ignore the overheads of
            # memory copy and model preprocess. We use these times as the
            # profiling results. Each measurement is divided by `ite` because
            # the while_loop has already run `ite` times.
            tf_while_time = time_test(sess, tf_while_tensor, 1) / ite
            time.sleep(60)
            ft_while_time = time_test(sess, ft_while_tensor, 1) / ite
            time.sleep(60)
            eff_while_time = time_test(sess, eff_while_tensor, 1) / ite
            time.sleep(60)

            tf_type = args_dict['data_type'].upper()
            ft_type = tf_type
            if int8_mode != 0:
                ft_type = "INT8-v{}".format(int8_mode)

            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer TF-while-time {:6.2f} ms ( {} iterations)".format(
                batch_size, max_seq_len, tf_type, num_layer, tf_while_time, ite))
            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer FT-OP-while-time {:6.2f} ms ( {} iterations)".format(
                batch_size, max_seq_len, ft_type, num_layer, ft_while_time, ite))
            print("[INFO] batch_size {} max_seq_len {} precision {} {} layer EFF-OP-while-time {:6.2f} ms ( {} iterations)".format(
                batch_size, max_seq_len, ft_type, num_layer, eff_while_time, ite))

        # NOTE(review): the original indentation was lost; this branch is
        # reconstructed as a sibling of the test_time branch - confirm.
        if args_dict['thread_num'] > 1:
            # Multi-threading demonstration
            thread_num = args_dict['thread_num']

            def run():
                ft_while_time = time_test(sess, ft_while_tensor, 1) / ite  # while_loop has run ite times
                print("[INFO] batch_size {} max_seq_len {} {} layer FT-OP-while-time {:6.2f} ms with {} threads".format(
                    batch_size, max_seq_len, num_layer, ft_while_time, thread_num))

            thread_list = [
                threading.Thread(target=run, name="RunFT")
                for _ in range(thread_num)
            ]
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()

    return max_diff
decoder_args, 0.0) embedding_table = np.random.randn(vocab_size, hidden_dim).astype( np_datatype) * 0.01 # a [vocab_size, hidden_dim] table embedding_table = tf.convert_to_tensor(embedding_table) memory, memory_sequence_length = generate_encoder_result( batch_size, max_seq_len, memory_hidden_dim, tf_datatype) finalized_tf_output_ids, finalized_tf_sequence_lengths, _, _, _ = tf_beamsearch_decoding( memory, memory_sequence_length, embedding_table, decoding_args, decoder_type=args.decoder_type) config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) sess.run(tf.tables_initializer()) sess.run(finalized_tf_output_ids) if args.test_time == 1: time_cost = time_test(sess, finalized_tf_output_ids, iterations=10) types = ["TF-decoding-beamsearch", "FT-OP-decoder", "TF+FT-OP"] print("[INFO] batch_size {} beam_width {} head_num {} size_per_head {} seq_len {} " \ "decoder_layers {} vocab_size {} {}-time {:6.2f} ms.".format(batch_size, beam_width, head_num, size_per_head, max_seq_len, num_layer, vocab_size, types[args.decoder_type], time_cost))
def run():
    """Time one execution of the FT while-loop graph and print the result.

    Reads ``sess``, ``ft_while_tensor``, ``ite``, ``batch_size``,
    ``max_seq_len``, ``num_layer`` and ``thread_num`` from the enclosing
    scope.
    """
    elapsed = time_test(sess, ft_while_tensor, 1)
    # The while_loop has already run `ite` times, so report the per-pass cost.
    per_iteration = elapsed / ite
    report = "[INFO] batch_size {} max_seq_len {} {} layer FT-OP-while-time {:6.2f} ms with {} threads"
    print(report.format(batch_size, max_seq_len, num_layer, per_iteration,
                        thread_num))
def encoder_sample(args_dict):
    """Cross-check the TF encoder against the FasterTransformer encoder op.

    Builds the TF encoder and the FT op on the same random input and the same
    variables, compares their masked outputs with ``cross_check`` and,
    optionally, times both.

    Args:
        args_dict: mapping providing the keys read below: 'batch_size',
            'num_layer', 'max_seq_len', 'avg_seq_len', 'head_number',
            'size_per_head', 'remove_padding' and 'allow_gemm_test'
            (strings, compared to "true" case-insensitively), 'int8_mode',
            'data_type' ("fp16" selects half precision, anything else keeps
            fp32) and 'test_time' (1 enables timing).

    Returns:
        The largest absolute element-wise difference between the TF result
        and the FT op result.
    """
    print("\n=============== Argument ===============")
    for key in args_dict:
        print("{}: {}".format(key, args_dict[key]))
    print("========================================")

    np.random.seed(1)
    tf.set_random_seed(1)

    batch_size = args_dict['batch_size']
    num_layer = args_dict['num_layer']
    max_seq_len = args_dict['max_seq_len']
    avg_seq_len = args_dict['avg_seq_len']
    head_num = args_dict['head_number']
    size_per_head = args_dict['size_per_head']
    remove_padding = args_dict['remove_padding'].lower() == "true"
    int8_mode = args_dict['int8_mode']
    allow_gemm_test = args_dict['allow_gemm_test'].lower() == "true"

    if args_dict['data_type'] == "fp16":
        tf_datatype = tf.float16
        atol_threshold = 3e-2
    else:
        tf_datatype = tf.float32
        atol_threshold = 3e-5

    hidden_dim = head_num * size_per_head

    # This draw was never used as the sequence lengths (both branches below
    # overwrite it). It is kept, with its result discarded, only so that the
    # random stream feeding `from_data` stays the same as before.
    np.random.randint(1, max_seq_len + 1, size=batch_size)
    if avg_seq_len != -1 and remove_padding:
        # This means we use "remove_padding" and set a smaller average
        # sequence length.
        sequence_length = np.ones(batch_size) * avg_seq_len
    else:
        sequence_length = np.ones(batch_size) * (max_seq_len / 2)
    sequence_length = sequence_length.astype(np.int32)

    from_data = np.random.randn(batch_size, max_seq_len, hidden_dim)
    from_tensor = tf.convert_to_tensor(from_data, dtype=tf_datatype)

    attention_mask = build_sequence_mask(sequence_length,
                                         num_heads=head_num,
                                         maximum_length=max_seq_len,
                                         dtype=tf_datatype)

    encoder_args = TransformerArgument(beam_width=1,
                                       head_num=head_num,
                                       size_per_head=size_per_head,
                                       num_layer=num_layer,
                                       dtype=tf_datatype,
                                       remove_padding=remove_padding,
                                       int8_mode=int8_mode,
                                       allow_gemm_test=allow_gemm_test)

    tf_encoder_result = tf_encoder(input_tensor=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask)

    encoder_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    encoder_variables_dict = {v.name: v for v in encoder_vars}

    op_encoder_result = op_encoder(inputs=from_tensor,
                                   encoder_args=encoder_args,
                                   attention_mask=attention_mask,
                                   encoder_vars_dict=encoder_variables_dict,
                                   sequence_length=sequence_length)

    # Because FasterTransformer skips some computation for the padding parts,
    # the cross check result would be wrong if these parts were not masked.
    padding_mask = tf.expand_dims(
        tf.sequence_mask(sequence_length, maxlen=max_seq_len,
                         dtype=tf_datatype),
        axis=-1)
    tf_encoder_result = tf_encoder_result * padding_mask
    op_encoder_result = op_encoder_result * padding_mask

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        for idx, name in enumerate(encoder_variables_dict):
            variable = encoder_variables_dict[name]
            print(" ".join(
                str(field)
                for field in (idx, name, variable.shape, variable.dtype)))
        print("#################################")

        tf_encoder_result_val = sess.run(tf_encoder_result)
        op_encoder_result_val = sess.run(op_encoder_result)

        cross_check("Encoder TF v.s. FT with tensor input",
                    tf_encoder_result_val, op_encoder_result_val,
                    atol_threshold)

        # Disabled alternative: use numpy arrays as inputs of the
        # FasterTransformer OP. This method requires more time for the op
        # initialization (especially for FP16), but the inference time would
        # be a little faster than using tensors as input. The weight fetch is
        # kept inside the disabled block so it is not paid when unused.
        #
        # encoder_variables_dict_2 = {}
        # for var, val in zip(encoder_vars, sess.run(encoder_vars)):
        #     encoder_variables_dict_2[var.name] = val
        # op_encoder_result_2 = op_encoder(inputs=from_tensor,
        #                                  encoder_args=encoder_args,
        #                                  attention_mask=attention_mask,
        #                                  encoder_vars_dict=encoder_variables_dict_2,
        #                                  sequence_length=sequence_length)
        # op_encoder_result_val_2 = sess.run(op_encoder_result_2)
        # cross_check("Encoder TF v.s. FT with numpy input", tf_encoder_result_val,
        #             op_encoder_result_val_2, atol_threshold)

        if args_dict['test_time'] == 1:
            ite = 50
            tf_time = time_test(sess, tf_encoder_result, ite)
            op_time = time_test(sess, op_encoder_result, ite)

            print("[INFO] batch_size {} max_seq_len {} {} layer TF-time {:6.2f} ms"
                  .format(batch_size, max_seq_len, num_layer, tf_time))
            print("[INFO] batch_size {} max_seq_len {} {} layer FT-OP-tensor-time {:6.2f} ms"
                  .format(batch_size, max_seq_len, num_layer, op_time))

    # Take the absolute value before the maximum: a signed difference would
    # report a large negative deviation as a small (or negative) number.
    difference = (tf_encoder_result_val.reshape([-1])
                  - op_encoder_result_val.reshape([-1]))
    return np.abs(difference).max()
def sample_model(model_name='124M',
                 nsamples=1,
                 batch_size=1,
                 length=12,
                 temperature=1,
                 top_k=4,
                 top_p=0,
                 models_dir='models',
                 data_type='fp32'):
    """Generate unconditional GPT-2 samples with the FasterTransformer op.

    Args:
        model_name: String, which model to use (sub-folder of ``models_dir``).
        nsamples: Number of samples to print. If 0, samples are generated
            indefinitely.
        batch_size: Number of sequences generated per run (only affects
            speed/memory).
        length: Number of tokens in generated text. If None, it is taken from
            the model hyperparameters (``n_ctx``).
        temperature: Float value controlling randomness in boltzmann
            distribution. Lower temperature results in less random
            completions. As the temperature approaches zero, the model will
            become deterministic and repetitive. Higher temperature results
            in more random completions.
        top_k: Integer value controlling diversity. 1 means only 1 word is
            considered for each step (token), resulting in deterministic
            completions, while 40 means 40 words are considered at each step.
            0 is a special setting meaning no restrictions. 40 generally is a
            good value.
        top_p: Passed unchanged to ``DecodingGpt2Argument`` together with
            ``top_k`` and ``temperature``.
        models_dir: Path to parent folder containing model subfolders (i.e.
            contains the <model_name> folder).
        data_type: 'fp32' or 'fp16'; selects the dtype handed to the decoder.

    Raises:
        ValueError: if ``data_type`` is not 'fp32' or 'fp16', or if ``length``
            exceeds the model's window size.
    """
    # Validate before any model loading. This used to be `assert (False)`,
    # which is stripped under `python -O` and left the dtype unbound.
    if data_type == 'fp32':
        tf_data_type = tf.float32
    elif data_type == 'fp16':
        tf_data_type = tf.float16
    else:
        raise ValueError(
            "Unsupported data_type %r: expected 'fp32' or 'fp16'" % (data_type,))

    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    enc = encoder.get_encoder(model_name, models_dir)

    # Built-in defaults, overridden by the model's own hparams.json.
    hparams = HParams(n_vocab=0, n_ctx=1024, n_embd=768, n_head=12, n_layer=12)
    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         hparams.n_ctx)

    # start_ids is the flattened [batch_size, start_len] prompt; here every
    # sequence starts from the single <|endoftext|> token.
    end_of_text_id = enc.encoder['<|endoftext|>']
    start_ids = [end_of_text_id] * batch_size

    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph("{}/{}/model.ckpt.meta".format(
            models_dir, model_name))
        print("[INFO] restore the model {}/{}".format(models_dir, model_name))
        saver.restore(sess, ("{}/{}/model.ckpt".format(models_dir, model_name)))

        decoder_args = TransformerArgument(beam_width=1,
                                           head_num=hparams.n_head,
                                           size_per_head=hparams.n_embd // hparams.n_head,
                                           num_layer=hparams.n_layer,
                                           dtype=tf_data_type,
                                           kernel_init_range=0.00,
                                           bias_init_range=0.00)

        decoding_args = DecodingGpt2Argument(hparams.n_vocab,
                                             end_of_text_id,
                                             end_of_text_id,
                                             length + 2,
                                             decoder_args,
                                             top_k,
                                             top_p,
                                             temperature)

        decoding_vars = tf.trainable_variables()
        op_output = ft_gpt2_op(decoding_vars, decoding_args, batch_size,
                               start_ids)

        generated = 0
        while nsamples == 0 or generated < nsamples:
            print("[INFO] FT op time: {}".format(
                time_test(sess, op_output, iterations=5, warmup=True)))
            op_out = sess.run(op_output)

            for i in range(batch_size):
                generated += 1
                # The first id of each row is dropped before decoding
                # (presumably the start token - TODO confirm).
                text = enc.decode(op_out[i][1:])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
batch_size=batch_size, q_seq_len=res_cnt, hidden_size=hidden_dim, k_seq_len=poly_m,attention_type=1 ) op_ctx_emb = tf.reshape(op_ctx_emb,(batch_size, res_cnt, hidden_dim)) # op_ctx_emb, _ = dot_product_attention(op_cand_emb, op_embs, op_embs, tf.estimator.ModeKeys.PREDICT) op_dot_product = tf.reduce_sum(op_ctx_emb * op_cand_emb, axis=-1) config = tf.ConfigProto() config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() tf_encoder_result_val = sess.run(tf_dot_product) print(tf_encoder_result_val) op_encoder_result_val = sess.run(op_dot_product) print(op_encoder_result_val) cross_check("Encoder", tf_encoder_result_val, op_encoder_result_val, atol_threshold) tf_time = time_test(sess, tf_dot_product, 1000) op_time = time_test(sess, op_dot_product, 1000) print("tf poly-encoder time:",tf_time) print("op poly-encoder time:", op_time)
# args.test_time is treated as a string of digits; each digit present in it
# enables the timing of one decoding output.
time_args = args.test_time
test_lists = []
test_names = []

if "0" in time_args:
    test_names.append("TF-decoding-beamsearch")
    test_lists.append(finalized_tf_output_ids)
if "1" in time_args:
    test_names.append("FT-OP-decoding-beamsearch")
    test_lists.append(finalized_op_output_ids)
if "2" in time_args:
    test_names.append("TF-decoding-sampling")
    test_lists.append(tf_sampling_target_ids)
if "3" in time_args:
    test_names.append("FT-OP-decoding-sampling")
    test_lists.append(op_sampling_target_ids)

# Measure all selected outputs first, then print the reports.
test_time_result = []
for fetch in test_lists:
    elapsed = time_test(sess, fetch, iterations=10, warmup=True)
    test_time_result.append(elapsed)

for name, t_result in zip(test_names, test_time_result):
    is_beamsearch = "beamsearch" in name
    if is_beamsearch:
        # Beam search entries report the beam width.
        line = ("[INFO] batch_size {} beam_width {} head_num {} size_per_head {} seq_len {} "
                "decoder_layers {} vocab_size {} {}-time {:6.2f} ms.").format(
                    batch_size, beam_width, head_num, size_per_head,
                    max_seq_len, num_layer, vocab_size, name, t_result)
        print(line)
    elif "sampling" in name:
        # Sampling entries report top-k / top-p instead.
        line = ("[INFO] batch_size {} topk {} topp {} head_num {} size_per_head {} seq_len {} "
                "decoder_layers {} vocab_size {} {}-time {:6.2f} ms.").format(
                    batch_size, sampling_topk, sampling_topp, head_num,
                    size_per_head, max_seq_len, num_layer, vocab_size, name,
                    t_result)
        print(line)