def __init__(self, train_loop):
    self.train_loop = train_loop
    self.graph = train_loop.graph
    self.sess = train_loop.sess

    journalist = train_loop.logger

    num_actions = self.train_loop.num_actions
    observation_size = self.train_loop.observation_size
    observations_in_seq = 1
    input_size = observation_size * observations_in_seq
    learning_rate = 1e-4

    r = tf.nn.relu
    t = tf.nn.tanh

    critic = MLP([input_size, num_actions], [512, 512, 512, 512, 512, 1],
                 [r, r, r, r, t, tf.identity],
                 scope='critic')

    self.actor = MLP([input_size, ], [512, 512, 512, 512, 512, num_actions],
                     [r, r, r, r, t, tf.nn.sigmoid],
                     scope='actor')

    # step 1
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # step 2
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate=5e-5)

    self.controller = ContinuousDeepQ(
        input_size,
        num_actions,
        self.actor,
        critic,
        optimizer,
        self.sess,
        discount_rate=0.99,
        target_actor_update_rate=0.01,
        target_critic_update_rate=0.01,
        exploration_period=5000,
        max_experience=10000,
        store_every_nth=4,
        train_every_nth=4,
        summary_writer=journalist,
        rewards=self.train_loop.dequeued_rewards,
        given_action=self.train_loop.dequeued_actions,
        observation=self.train_loop.dequeued_prev_states,
        next_observation=self.train_loop.dequeued_next_states,
        next_observation_mask=tf.ones(
            self.train_loop.dequeued_rewards.get_shape(), tf.float32))
def main(desired_iterations, save_path):
    # Define a log directory to use with tensorboard
    # (not that we currently make use of tensorboard at all)
    LOG_DIR = tempfile.mkdtemp()
    print("Tensorboard Log: " + LOG_DIR + '\n')
    # The directory to save the animations to
    SAVE_DIR = save_path

    # Define the simulation
    sim = Planning(get_noodle_environment())

    # Tensorflow!
    tf.reset_default_graph()
    session = tf.InteractiveSession()
    journalist = tf.train.SummaryWriter(LOG_DIR)
    brain = MLP([sim.observation_size, ], [200, 200, sim.num_actions],
                [tf.tanh, tf.tanh, tf.identity])
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(sim.observation_size,
                                       sim.num_actions,
                                       brain,
                                       optimizer,
                                       session,
                                       random_action_probability=0.2,
                                       discount_rate=0.9,
                                       exploration_period=1000,
                                       max_experience=10000,
                                       store_every_nth=1,
                                       train_every_nth=1,
                                       summary_writer=journalist)

    # Initialize the session
    session.run(tf.initialize_all_variables())
    session.run(current_controller.target_network_update)
    # journalist.add_graph(session.graph)

    # Run the simulation and let the robot learn
    num_simulations = 0
    iterations_needed = []
    total_rewards = []

    try:
        for game_idx in range(desired_iterations + 1):
            # Anneal the random action probability every 200 games
            current_random_prob = current_controller.random_action_probability
            update_random_prob = game_idx != 0 and game_idx % 200 == 0
            if update_random_prob and 0.01 < current_random_prob <= 0.1:
                current_controller.random_action_probability = current_random_prob - 0.01
            elif update_random_prob and 0.1 < current_random_prob:
                current_controller.random_action_probability = current_random_prob - 0.1

            game = Planning(get_noodle_environment())
            game_iterations = 0

            observation = game.observe()
            while not game.is_over():
                action = current_controller.action(observation)
                reward = game.collect_reward(action)
                new_observation = game.observe()
                current_controller.store(observation, action, reward,
                                         new_observation)
                current_controller.training_step()
                observation = new_observation
                game_iterations += 1

            total_rewards.append(sum(game.collected_rewards))
            iterations_needed.append(game_iterations)

            if game_idx % 50 == 0:
                print("\rGame %d:\nIterations before end: %d." %
                      (game_idx, game_iterations))
                if game.collected_rewards[-1] == 10:
                    print("Hit target!")
                print("Total Rewards: %s\n" % (sum(game.collected_rewards)))
                if SAVE_DIR is not None:
                    game.save_path(SAVE_DIR, game_idx)

    except KeyboardInterrupt:
        print("Interrupted")

    # Plot the iterations and reward
    plt.figure(figsize=(12, 8))
    plt.plot(total_rewards, label='Reward')
    # plt.plot(iterations_needed, label='Iterations')
    plt.legend()
    plt.show()
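# A minimal command-line entry point for main() might look like the sketch
# below. The flag names and defaults are assumptions for illustration only;
# they are not part of the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train the Planning agent.')
    parser.add_argument('--iterations', type=int, default=1000,
                        help='number of games to play (assumed default)')
    parser.add_argument('--save-dir', type=str, default=None,
                        help='directory for saved animations; omit to skip saving')
    args = parser.parse_args()

    main(args.iterations, args.save_dir)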
b"a": 2, }) else: # Tensorflow business - it is always good to reset a graph before creating a new controller. tf.reset_default_graph() session = tf.InteractiveSession() # This little guy will let us run tensorboard # tensorboard --logdir [LOG_DIR] journalist = tf.train.SummaryWriter(LOG_DIR) # Brain maps from observation to Q values for different actions. # Here it is a done using a multi layer perceptron with 2 hidden # layers brain = MLP([ g.observation_size, ], [200, 200, g.num_actions], [tf.tanh, tf.tanh, tf.identity]) # The optimizer to use. Here we use RMSProp as recommended # by the publication optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9) # DiscreteDeepQ object current_controller = DiscreteDeepQ(g.observation_size, g.num_actions, brain, optimizer, session, discount_rate=0.99, exploration_period=5000, max_experience=10000,
observation_size = 2
observations_in_seq = 1
input_size = observation_size * observations_in_seq

# actions
num_actions = 3

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])
brain = SeparatedMLP([
    MLP([input_size, ], [64, 64, 1],
        [tf.sigmoid, tf.sigmoid, tf.identity],
        scope="mlp_action1"),
    MLP([input_size, ], [64, 64, 1],
        [tf.sigmoid, tf.sigmoid, tf.identity],
        scope="mlp_action2"),
    MLP([input_size, ], [64, 64, 1],
        [tf.sigmoid, tf.sigmoid, tf.identity],
        scope="mlp_action3")
])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
observation_size = 7
observations_in_seq = 1
input_size = observation_size * observations_in_seq

# actions
num_actions = 5

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])
brain = SeparatedMLP([
    MLP([input_size, ], [8, 8, 1],
        [tf.nn.relu, tf.nn.relu, tf.identity],
        scope="mlp_action1"),
    MLP([input_size, ], [8, 8, 1],
        [tf.nn.relu, tf.nn.relu, tf.identity],
        scope="mlp_action2"),
    MLP([input_size, ], [8, 8, 1],
        [tf.nn.relu, tf.nn.relu, tf.identity],
        scope="mlp_action3"),
    MLP([input_size, ], [8, 8, 1],
        [tf.nn.relu, tf.nn.relu, tf.identity],
        scope="mlp_action4"),
    MLP([input_size,
import scipy.io as sio
import copy

N = Quadrotor.num_of_actions

tf.reset_default_graph()
session = tf.InteractiveSession()

LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)
journalist = tf.train.SummaryWriter(LOG_DIR)

brain = MLP([4, ], [32, 64, N], [tf.tanh, tf.tanh, tf.identity])
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)

current_controller = DiscreteDeepQ(4,
                                   N,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.9,
                                   exploration_period=100000,
                                   max_experience=10000,
                                   minibatch_size=64,
                                   random_action_probability=0.05,
                                   store_every_nth=1,
observations_in_seq = 4
input_size = observation_size * observations_in_seq

# actions
num_actions = 2

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [32, 32, 32, 32, 32, num_actions],
#            [tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.identity])

critic = MLP([input_size, num_actions * 2], [1024, 512, 1],
             [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity],
             scope='critic')

actor = MLP([input_size, ], [1024, 512, num_actions],
            [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity],
            scope='actor')

# The optimizer to use. Here we use RMSProp as recommended
# by the publication.
#optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
#optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0005, decay=0.9)
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
#optimizer = tf.train.GradientSescentOptimizer(learning_rate=0.001)

# DiscreteDeepQ object
# In[9]:

# Tensorflow business - it is always good to reset a graph before creating a new controller.
tf.reset_default_graph()
session = tf.InteractiveSession()

# This little guy will let us run tensorboard
# tensorboard --logdir [LOG_DIR]
journalist = tf.train.SummaryWriter(LOG_DIR)

# Brain maps from observation to Q values for different actions.
# Here it is done using a multi-layer perceptron with a single hidden layer.
brain = MLP([4, ], [10, 4], [tf.tanh, tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# DiscreteDeepQ object
current_controller = DiscreteDeepQ(4,
                                   4,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.9,
                                   exploration_period=100,
                                   max_experience=10000,
    'kt': 1.0,
    'd': 1.0,
    'Fmax': 10,
    'Mmax': 5,
    'us': us
}

tf.reset_default_graph()
session = tf.InteractiveSession()

LOG_DIR = tempfile.mkdtemp()
print(LOG_DIR)
journalist = tf.train.SummaryWriter(LOG_DIR)

brain = MLP([6, ], [32, 64, N * N], [tf.tanh, tf.tanh, tf.identity])
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)

current_controller = DiscreteDeepQ(6,
                                   N * N,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.9,
                                   exploration_period=500000,
                                   max_experience=10000,
                                   minibatch_size=128,
                                   random_action_probability=0.05,
                                   store_every_nth=1,
input_size = observation_size * observations_in_seq

# actions
num_actions = 2

#critic = MLP([input_size, num_actions*2], [2048, 1024, 1],
#             [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity], scope='critic')
#
#actor = MLP([input_size,], [2048, 1024, num_actions],
#            [tf.nn.sigmoid, tf.nn.sigmoid, tf.identity], scope='actor')

r = tf.nn.relu
t = tf.nn.tanh

critic = MLP([input_size, num_actions], [2048, 512, 256, 256, 1],
             [t, t, t, t, tf.identity],
             scope='critic')

actor = MLP([input_size, ], [2048, 512, 256, 256, num_actions],
            [t, t, t, t, tf.identity],
            scope='actor')

# The optimizer to use. Here we use RMSProp as recommended
# by the publication.
#optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
#optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0005, decay=0.9)
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)

# DiscreteDeepQ object
def start(self):
    """Start MLDaemon.

    This function creates the TensorFlow controller and runs the tuning
    loop by iteratively training and choosing actions.
    """
    if self.debugging_level >= 1:
        import cProfile
        import io
        import pstats
        pr = cProfile.Profile()
        pr.enable()

    logger.info(f"Connected to database {self.conf['replaydb']['dbfile']}")
    # set stopped to False, so daemon can run
    self.stopped = False
    logger.info('Starting MLDaemon...')
    try:
        # TensorFlow business - it is always good to reset a graph before creating a new controller.
        ops.reset_default_graph()
        # ? shall we use InteractiveSession()?
        self.session = tf.Session()  # tf.InteractiveSession()

        # This little guy will let us run tensorboard
        # tensorboard --logdir [LOG_DIR]
        journalist = tf.summary.FileWriter(self.LOG_DIR)

        # Brain maps from observation to Q values for different actions.
        # Here it is done using a multi-layer perceptron with 2 hidden layers.
        hidden_layer_size = max(int(self.observation_size * 1.2), 200)
        logger.info('Observation size {0}, hidden layer size {1}'.format(
            self.observation_size, hidden_layer_size))
        brain = MLP([self.observation_size, ],
                    [hidden_layer_size, hidden_layer_size, self.opt['num_actions']],
                    [tf.tanh, tf.tanh, tf.identity])

        # The optimizer to use. Here we use RMSProp as recommended
        # by the publication.
        optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

        # DiscreteDeepQ object
        self.controller = DiscreteDeepQ(
            (self.observation_size, ),
            self.opt['num_actions'],
            brain,
            optimizer,
            self.session,
            discount_rate=0.99,
            start_random_rate=self.start_random_rate,
            exploration_period=self.exploration_period,
            random_action_probability=self.opt['random_action_probability'],
            train_every_nth=1,
            summary_writer=journalist,
            k_action=int(self.opt['k_val']))

        self.session.run(tf.initialize_all_variables())
        self.session.run(self.controller.target_network_update)

        # Check if there is a model to be loaded before updating the graph.
        if os.path.isfile(os.path.join(self.save_path, 'model')):
            self.controller.restore(self.save_path)
            logger.info('Loaded saved model from ' + self.save_path)
        else:
            logger.info('No saved model found')
        self.test_number_of_steps_after_restore = self.controller.actions_executed_so_far

        # graph was not available when journalist was created
        journalist.add_graph(self.session.graph)

        last_action_second = 0              # last action timestep
        last_training_step_duration = 0     # last training duration
        last_checkpoint_time = time.time()  # last checkpoint

        while not self.stop_requested:
            begin_time = time.time()  # set begin time to current time

            # Run training step
            logger.info('Start training step...')
            minibatch_size, prediction_error = self._do_training_step()

            if minibatch_size > 0:
                # Save a checkpoint every self.checkpoint_time seconds
                logger.info(
                    f'Time before checkpoint: {self.checkpoint_time - (time.time() - last_checkpoint_time)}'
                )
                if time.time() - last_checkpoint_time > self.checkpoint_time:
                    # save controller checkpoint
                    cp_path = os.path.join(
                        self.save_path,
                        'checkpoint_' + time.strftime('%Y-%m-%d_%H-%M-%S'))
                    os.mkdir(cp_path)
                    self.controller.save(cp_path)
                    # update checkpoint time
                    last_checkpoint_time = time.time()
                    logger.info('Checkpoint saved in ' + cp_path)

                # update last training duration
                last_training_step_duration = time.time() - begin_time
                logger.info(
                    'Finished {step}th training step in {time} seconds '
                    'using {mb} samples with prediction error {error}.'.format(
                        step=self.controller.iteration,
                        time=last_training_step_duration,
                        mb=minibatch_size,
                        error=prediction_error))
            else:
                logger.info('Not enough data for training yet.')

            # Check if it is time for tuning
            # (has the delay between actions elapsed, accounting for the
            # time the last training step took?)
            if (time.time() - (last_action_second + 0.5) >=
                    self.delay_between_actions - last_training_step_duration):
                if self.enable_tuning:
                    logger.debug('Start tuning step...')
                    try:
                        # Update memcache for the next training interval
                        self.db.refresh_memcache()
                    except:
                        pass

                    # Sleep for whatever time is left until the next action is due (or not at all)
                    sleep_time = max(
                        0, self.delay_between_actions -
                        (time.time() - (last_action_second + 0.5)))
                    if sleep_time > 0.05:
                        # Do garbage collection before a long sleep
                        gc.collect()
                        sleep_time = max(
                            0, self.delay_between_actions -
                            (time.time() - (last_action_second + 0.5)))
                    if sleep_time > 0.0001:
                        logger.debug(f'Sleeping {sleep_time} seconds')
                        # Welp, basically sleep
                        time.sleep(sleep_time)

                    # Do action step
                    ts = int(time.time())
                    self._do_action_step(ts)
                    # Update last action time to current time
                    last_action_second = ts
                else:
                    logger.debug('Tuning disabled.')

            # Check for new data every 200 steps to reduce checking overhead
            if self.controller.number_of_times_train_called % 200 == 0:
                try:
                    self.db.refresh_memcache()
                except:
                    pass

        # We always print out the reward to the log for analysis
        logger.info(f'Cumulative reward: {self.cumulative_reward}')
        # Clean log at the end for next run
        flush_log()
    finally:
        # set stopped to True, so daemon can properly stop
        self.stopped = True
        # controller.save should not work here as the controller is still NoneType
        # self.controller.save(self.save_path)
        logger.info('MLDaemon stopped.')

    if self.debugging_level >= 1:
        pr.disable()
        s = io.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())
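# For context, a driver for this daemon might look roughly like the sketch
# below. It relies only on the start(), stop_requested, and stopped members
# seen above; MLDaemon's constructor is not shown in this code, so the
# construction here is purely illustrative.
import threading
import time

daemon = MLDaemon()  # placeholder construction; real arguments not shown here
worker = threading.Thread(target=daemon.start)
worker.start()

time.sleep(600)  # let it train and tune for a while

daemon.stop_requested = True  # ask the loop in start() to exit
worker.join()
assert daemon.stopped  # set in the finally block of start()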
observation_size = 4
observations_in_seq = 1
input_size = observation_size * observations_in_seq

# actions
num_actions = 3

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [32, 32, 32, 32, 32, num_actions],
#            [tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.identity])
brain = MLP([input_size, ], [64, 64, num_actions],
            [tf.sigmoid, tf.sigmoid, tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication.
#optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)

# DiscreteDeepQ object
current_controller = DiscreteDeepQ(input_size,
                                   num_actions,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.95,
                                   target_network_update_rate=0.01,
n_prev_frames = 3

# Tensorflow business - it is always good to reset a graph before creating a new controller.
tf.reset_default_graph()
session = tf.InteractiveSession()

# This little guy will let us run tensorboard
# tensorboard --logdir [LOG_DIR]
journalist = tf.train.SummaryWriter(LOG_DIR)

# Brain maps from observation to Q values for different actions.
# Here it is done using a simple linear model (no hidden layers).
brain = MLP([n_prev_frames * 4 + n_prev_frames - 1, ], [4], [tf.identity])

# The optimizer to use. Here we use RMSProp as recommended
# by the publication.
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# DiscreteDeepQ object
current_controller = DiscreteDeepQ(n_prev_frames * 4 + n_prev_frames - 1,
                                   4,
                                   brain,
                                   optimizer,
                                   session,
                                   discount_rate=0.9,
                                   exploration_period=100,
                                   max_experience=10000,
observation_size = 4
observations_in_seq = 1
input_size = observation_size * observations_in_seq

# actions
num_actions = 1

#brain = MLP([input_size,], [5, 5, 5, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [20, 20, 20, 20, num_actions],
#            [tf.tanh, tf.tanh, tf.tanh, tf.tanh, tf.identity])
#brain = MLP([input_size,], [32, 32, 32, 32, 32, num_actions],
#            [tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.nn.relu, tf.identity])

critic = MLP([input_size, num_actions], [64, 64, 1],
             [tf.sigmoid, tf.sigmoid, tf.identity],
             scope='critic')

actor = MLP([input_size, ], [64, 64, num_actions],
            [tf.sigmoid, tf.sigmoid, tf.identity],
            scope='actor')

# The optimizer to use. Here we use RMSProp as recommended
# by the publication.
#optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.9)
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

# ContinuousDeepQ object
current_controller = ContinuousDeepQ(input_size,
                                     num_actions,
                                     actor,
                                     critic,
                                     optimizer,
                                     session,
                                     discount_rate=0.99,
                                     target_actor_update_rate=0.001,
                                     target_critic_update_rate=0.001,
                                     exploration_period=5000,
                                     max_experience=10000,
                                     store_every_nth=4,
                                     train_every_nth=4,
                                     summary_writer=journalist)

#class ContinuousDeepQ
#   observation_size,
#   action_size,
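# A minimal sketch of driving this continuous controller, assuming
# ContinuousDeepQ exposes the same action()/store()/training_step() interface
# that DiscreteDeepQ uses in the game loop earlier in this file. `make_env()`
# and its observe/is_over/collect_reward methods are placeholders.
session.run(tf.initialize_all_variables())

env = make_env()  # placeholder environment factory
observation = env.observe()
while not env.is_over():
    action = current_controller.action(observation)  # continuous action vector
    reward = env.collect_reward(action)
    new_observation = env.observe()
    current_controller.store(observation, action, reward, new_observation)
    current_controller.training_step()
    observation = new_observation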
def __init__(self, train_loop):
    self.train_loop = train_loop
    self.graph = train_loop.graph
    self.sess = train_loop.sess

    journalist = train_loop.logger

    num_actions = self.train_loop.num_actions
    observation_size = self.train_loop.observation_size
    observations_in_seq = 1
    input_size = observation_size * observations_in_seq
    learning_rate = 1e-4

    r = tf.nn.relu
    t = tf.nn.tanh

    self.lstm_input_size = 41
    self.lstm_layer_size = 64
    self.lstm_layers_count = 2
    self.lstm_steps_count = 5

    class MULTI_LSTM_MLP(object):
        def __init__(self,
                     input_size,
                     step_count=5,
                     layer_size=32,
                     layers_count=2,
                     batch_size=32,
                     mlp=None,
                     scope='lstm_mlp'):
            self.input_size = input_size
            self.step_count = step_count
            self.layer_size = layer_size
            self.layers_count = layers_count
            self.batch_size = batch_size
            self.mlp = mlp
            self.scope = scope

            with tf.variable_scope(scope) as sc:

                def lstm_cell():
                    # return tf.contrib.rnn.BasicLSTMCell(self.layer_size, reuse=sc.reuse)
                    return tf.contrib.rnn.LSTMCell(
                        self.layer_size,
                        reuse=sc.reuse
                        # initializer=tf.random_uniform_initializer(-0.05, 0.05)
                        # activation=tf.nn.relu
                    )

                self.stacked_lstm = tf.contrib.rnn.MultiRNNCell(
                    [lstm_cell() for _ in range(self.layers_count)])

                fake_input = tf.placeholder(
                    tf.float32,
                    [self.batch_size, self.step_count, self.input_size])
                self.initial_state_batch = self.stacked_lstm.zero_state(
                    train_loop.batch_size, tf.float32)
                self.initial_state_one = self.stacked_lstm.zero_state(
                    1, tf.float32)
                # Run one step so the LSTM variables are created inside this scope.
                self.lstm_output, state = self.stacked_lstm(
                    fake_input[:, 0], self.initial_state_batch)

                self.model_variables = [
                    v for v in tf.trainable_variables()
                    if v.name.startswith(sc.name)
                ]
                for v in self.model_variables:
                    print("--- MULTI_LSTM_MLP v: " + v.name)

        def __call__(self, xs):
            # if this is the critic we need to ignore the input action
            # since it is already present
            print('call: ' + (self.scope if isinstance(self.scope, str)
                              else self.scope.name))
            if isinstance(xs, list):
                lstm_input = xs[0]
            else:
                lstm_input = xs
            print(lstm_input)

            # convert xs into steps
            lstm_input = tf.reshape(lstm_input,
                                    [-1, self.step_count, self.input_size])
            print(lstm_input.get_shape())

            initial_state = self.initial_state_batch
            if str(lstm_input.get_shape()[0]) == '?':
                print('--- dynamic shape')
                initial_state = self.initial_state_one
            print('--- initial state')
            print(lstm_input.get_shape()[0])

            with tf.variable_scope(self.scope, reuse=True):
                state = initial_state
                for i in range(self.step_count):
                    print('--- lstm step: {}'.format(i))
                    print(lstm_input[:, i].get_shape())
                    lstm_output, state = self.stacked_lstm(
                        lstm_input[:, i], state)
                final_state = state

            if isinstance(xs, list):
                return self.mlp([lstm_output, xs[1]])
            else:
                return self.mlp(lstm_output)

        def copy(self, scope=None):
            scope = scope or self.scope + "_copy"
            print("--- copy " + scope)
            with tf.variable_scope(scope) as sc:
                for v in self.model_variables:
                    print("--- bn: " + base_name2(v) + " " + v.name)
                    tf.get_variable(
                        base_name2(v),
                        v.get_shape(),
                        initializer=lambda x, dtype=tf.float32,
                        partition_info=None: v.initialized_value())
                sc.reuse_variables()
                mlp_copy = self.mlp.copy('mlp_' + scope)
                return MULTI_LSTM_MLP(self.input_size,
                                      self.step_count,
                                      self.layer_size,
                                      self.layers_count,
                                      self.batch_size,
                                      mlp_copy,
                                      scope=sc)

        def variables(self):
            return self.model_variables + self.mlp.variables()

    mlp_critic = MLP([self.lstm_layer_size, num_actions],
                     [256, 256, 256, 256, 1],
                     [r, r, r, t, tf.identity],
                     scope='mlp_critic')

    mlp_actor = MLP([self.lstm_layer_size, ],
                    [256, 256, 256, 256, num_actions],
                    [r, r, r, t, tf.nn.sigmoid],
                    scope='mlp_actor')

    self.actor = MULTI_LSTM_MLP(self.lstm_input_size, self.lstm_steps_count,
                                self.lstm_layer_size, self.lstm_layers_count,
                                train_loop.batch_size, mlp_actor, 'actor')

    critic = MULTI_LSTM_MLP(self.lstm_input_size, self.lstm_steps_count,
                            self.lstm_layer_size, self.lstm_layers_count,
                            train_loop.batch_size, mlp_critic, 'critic')

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    self.controller = ContinuousDeepQ(
        input_size,
        num_actions,
        self.actor,
        critic,
        optimizer,
        self.sess,
        discount_rate=0.99,
        target_actor_update_rate=0.01,
        target_critic_update_rate=0.01,
        exploration_period=5000,
        max_experience=10000,
        store_every_nth=4,
        train_every_nth=4,
        summary_writer=journalist,
        rewards=self.train_loop.dequeued_rewards,
        given_action=self.train_loop.dequeued_actions,
        observation=self.train_loop.dequeued_prev_states,
        next_observation=self.train_loop.dequeued_next_states,
        next_observation_mask=tf.ones(
            self.train_loop.dequeued_rewards.get_shape(), tf.float32))
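# The reshape in MULTI_LSTM_MLP.__call__ implies that the flat observation fed
# to the actor/critic must have length lstm_steps_count * lstm_input_size.
# A small shape sanity-check sketch; the local names below are illustrative only.
import numpy as np

lstm_input_size = 41
lstm_steps_count = 5
flat_observation_size = lstm_steps_count * lstm_input_size  # 205

flat_obs = np.zeros((1, flat_observation_size), dtype=np.float32)
# Equivalent of tf.reshape(lstm_input, [-1, step_count, input_size]):
stepped = flat_obs.reshape(-1, lstm_steps_count, lstm_input_size)
assert stepped.shape == (1, 5, 41)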