def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
  """Time forward passes of GRUCell and GRUBlockCell under dynamic_rnn.

  Both cells are unrolled over the same time-major input variable and the same
  initial-state variable, inside a single session. The stock GRUCell is
  measured first, then the GRUBlockCell.

  Args:
    batch_size: Batch dimension of the input and state variables.
    cell_size: Unit count handed to each cell constructor; also the width of
      the initial-state variable.
    input_size: Innermost dimension of the input variable.
    time_steps: Leading (time) dimension of the input variable.
    use_gpu: Forwarded to `benchmarking.device` to pick the device.
    iters: Forwarded to `benchmarking.seconds_per_run` for each measurement.

  Returns:
    A `(basic, block)` tuple with the `benchmarking.seconds_per_run` result
    for GRUCell and GRUBlockCell respectively.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    with benchmarking.device(use_gpu):
      # Seed both the TF initializer and numpy's global RNG.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Variables shared by both cell implementations.
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])

      def _time_forward(scope_name, cell_cls):
        """Builds `cell_cls` under `scope_name` and times its forward pass."""
        with vs.variable_scope(scope_name, initializer=initializer):
          rnn_outputs, _ = rnn.dynamic_rnn(
              cell_cls(cell_size),
              inputs=concat_x,
              initial_state=h,
              time_major=True,
              dtype=dtypes.float32)
          # (Re)initialize every variable created so far before measuring.
          sess.run([variables.global_variables_initializer()])
          return benchmarking.seconds_per_run(rnn_outputs, sess, iters)

      # Stock implementation first, then the block implementation.
      basic_secs = _time_forward("basic", rnn_cell.GRUCell)
      block_secs = _time_forward("block", gru_ops.GRUBlockCell)

  # Difference between the two timings as a percentage of the basic timing.
  improvement_pct = (basic_secs - block_secs) * 100 / basic_secs
  csv_fields = [
      batch_size, cell_size, input_size, time_steps, use_gpu, basic_secs,
      block_secs, improvement_pct
  ]
  print(",".join(map(str, csv_fields)))

  return basic_secs, block_secs
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
  """Time one backprop step through GRUCell and through GRUBlockCell.

  Each cell is applied once to identity copies of the input and state
  variables. The timed fetch is the gradient of that cell output with respect
  to the state variable `h`. The stock GRUCell is measured first.

  Args:
    batch_size: Batch dimension of the input and state variables.
    cell_size: Unit count handed to each cell constructor; also the width of
      the state variable.
    input_size: Width of the input variable.
    use_gpu: Forwarded to `benchmarking.device` to pick the device.
    iters: Forwarded to `benchmarking.seconds_per_run` for each measurement.

  Returns:
    A `(basic, block)` tuple with the `benchmarking.seconds_per_run` result
    for GRUCell and GRUBlockCell respectively.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    with benchmarking.device(use_gpu):
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)

      # Variables shared by both cell implementations.
      x = vs.get_variable("x", [batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])

      def _time_backward(scope_name, cell_cls):
        """Builds one `cell_cls` step under `scope_name`; times d(out)/d(h)."""
        with vs.variable_scope(scope_name, initializer=initializer):
          step = cell_cls(cell_size)
          output = step(array_ops.identity(x), array_ops.identity(h))
          # Variables are initialized before the gradient ops are added.
          sess.run([variables.global_variables_initializer()])
          grads_wrt_h = gradients_impl.gradients([output], h)
          return benchmarking.seconds_per_run(grads_wrt_h, sess, iters)

      # Stock implementation first, then the block implementation.
      basic_secs = _time_backward("basic", rnn_cell.GRUCell)
      block_secs = _time_backward("block", gru_ops.GRUBlockCell)

  # Difference between the two timings as a percentage of the basic timing.
  improvement_pct = (basic_secs - block_secs) * 100 / basic_secs
  csv_fields = [
      batch_size, cell_size, input_size, use_gpu, basic_secs, block_secs,
      improvement_pct
  ]
  print(",".join(map(str, csv_fields)))

  return basic_secs, block_secs
def benchmarkLSTMBlockCellFpropWithDynamicRNN(self):
  """Benchmarks LSTMBlockCell forward propagation through dynamic_rnn.

  Sweeps every combination of batch size, cell size, device and dtype produced
  by `benchmarking.dict_product`. For each one it builds a fresh graph, times
  the `dynamic_rnn` fetch, prints one CSV row, and reports the result through
  `self.report_benchmark`. The input width always equals the cell size, which
  is why `cell_size` appears twice in each row and in the benchmark name.
  """
  print("BlockLSTMCell forward propagation via dynamic_rnn().")
  print("--------------------------------------------------------------")
  print("LSTMBlockCell Seconds per inference.")
  # The header must list `dtype` first: every data row below starts with it.
  print("dtype,batch_size,cell_size,input_size,time_steps,use_gpu,wall_time")
  iters = 10
  # Loop-invariant, so it is built once rather than per configuration.
  benchmark_name_template = "_".join([
      "LSTMBlockCell_fprop", "DT_%(dtype)s", "BS%(batch_size)i",
      "CS%(cell_size)i", "IS%(cell_size)i", "TS%(time_steps)i",
      "gpu_%(use_gpu)s"
  ])
  for config in benchmarking.dict_product({
      "batch_size": [1, 8, 13, 32, 67, 128],
      "cell_size": [128, 250, 512, 650, 1024, 1350],
      "time_steps": [40],
      "use_gpu": [True, False],
      "dtype": ["float32", "float16"],
  }):
    dtype = (
        dtypes.float32 if config["dtype"] == "float32" else dtypes.float16)
    with ops.Graph().as_default():
      with benchmarking.device(use_gpu=config["use_gpu"]):
        # Time-major input whose feature width equals the cell size.
        inputs = variable_scope.get_variable(
            "x",
            dtype=dtype,
            shape=[
                config["time_steps"], config["batch_size"],
                config["cell_size"]
            ])
        cell = lstm_ops.LSTMBlockCell(config["cell_size"], dtype=dtype)
        outputs = rnn.dynamic_rnn(cell, inputs, time_major=True, dtype=dtype)
        init_op = variables.global_variables_initializer()

      # The session is opened while the fresh graph is still the default, so
      # it runs the ops built above.
      with session.Session() as sess:
        sess.run(init_op)
        wall_time = benchmarking.seconds_per_run(outputs, sess, iters)

      # Print to stdout. If the TEST_REPORT_FILE_PREFIX environment variable
      # is set, this will produce a copy-paste-able CSV file.
      print(",".join(
          map(str, [
              config["dtype"], config["batch_size"], config["cell_size"],
              config["cell_size"], config["time_steps"], config["use_gpu"],
              wall_time
          ])))
      self.report_benchmark(
          name=benchmark_name_template % config,
          iters=iters,
          wall_time=wall_time,
          extras=config)
def training_gru_block_vs_gru_cell(batch_size,
                                   cell_size,
                                   input_size,
                                   time_steps,
                                   use_gpu=False,
                                   iters=30):
  """Time one training step of GRUCell and of GRUBlockCell under dynamic_rnn.

  Each cell is unrolled over the same time-major input variable. The timed
  fetch is a gradient-descent update (learning rate 0.01) that minimizes the
  mean squared difference between the RNN outputs and a target variable `y`.
  The stock GRUCell is measured first.

  Args:
    batch_size: Batch dimension of the input, state and target variables.
    cell_size: Unit count handed to each cell constructor; also the width of
      the state and target variables.
    input_size: Innermost dimension of the input variable.
    time_steps: Leading (time) dimension of the input and target variables.
    use_gpu: Forwarded to `benchmarking.device` to pick the device.
    iters: Forwarded to `benchmarking.seconds_per_run` for each measurement.

  Returns:
    A `(basic, block)` tuple with the `benchmarking.seconds_per_run` result
    for GRUCell and GRUBlockCell respectively.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    # Everything below is built on the requested device.
    with benchmarking.device(use_gpu):
      # Seed both the TF initializer and numpy's global RNG.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Variables shared by both cell implementations.
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])
      y = vs.get_variable("y", [time_steps, batch_size, cell_size])

      def _time_train_step(scope_name, cell_cls):
        """Builds `cell_cls` under `scope_name`; times one optimizer step."""
        with vs.variable_scope(scope_name, initializer=initializer):
          rnn_outputs, _ = rnn.dynamic_rnn(
              cell_cls(cell_size),
              inputs=concat_x,
              initial_state=h,
              time_major=True,
              dtype=dtypes.float32)
          # Variables are initialized before the loss/optimizer ops are added.
          sess.run([variables.global_variables_initializer()])
          loss = math_ops.reduce_mean(math_ops.square(rnn_outputs - y))
          train_op = gradient_descent.GradientDescentOptimizer(0.01).minimize(
              loss)
          # Time for a training step.
          return benchmarking.seconds_per_run(train_op, sess, iters)

      # Stock implementation first, then the block implementation.
      basic_secs = _time_train_step("basic", rnn_cell.GRUCell)
      block_secs = _time_train_step("block", gru_ops.GRUBlockCell)

  # Difference between the two timings as a percentage of the basic timing.
  improvement_pct = (basic_secs - block_secs) * 100 / basic_secs
  csv_fields = [
      batch_size, cell_size, input_size, time_steps, use_gpu, basic_secs,
      block_secs, improvement_pct
  ]
  print(",".join(map(str, csv_fields)))

  return basic_secs, block_secs