def agent_action(step, sess, actor, online_state_inputs, is_training, state,
                 replay_buffer, noise_process, env):
    policy_output = sess.run(fetches=[actor.online_action_outputs_tensor],
                             feed_dict={online_state_inputs: state,
                                        is_training: False})
    # must reshape to (1, 11)
    policy_output = policy_output[0]

    # add noise and bound to the action space
    stochastic_action = policy_output_to_stochastic_action(
        policy_output, env.action_space, noise_process)

    # execute a_t and store the transition.
    (state, reward, terminated) = env_step(env, stochastic_action)

    if step % 2000 == 0:
        tf.logging.info(' +++++++++++++++++++ global_step:{} action:{}'
                        ' reward:{} term:{}'.format(step, stochastic_action,
                                                    reward, terminated))

    # replace the transition with the new one.
    transition = preprocess_low_dim(action=stochastic_action,
                                    reward=reward,
                                    terminated=terminated,
                                    state=state)
    # even if terminated, we still save next_state because the feed-forward
    # Q network will use it; its Q value is discarded in the end.
    replay_buffer.store(transition)
    return transition
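# The helper below is a minimal sketch, not the project's actual implementation,
# of what policy_output_to_stochastic_action could look like given how it is
# called above: flatten the actor output, add exploration noise drawn from
# noise_process (assumed to expose a sample() method), and clip to the env's
# action bounds.
import numpy as np

def policy_output_to_stochastic_action(output, action_space, noise_process):
    # output: raw actor output, e.g. shape (1, action_dim)
    action = np.reshape(np.asarray(output), (-1,))
    action += noise_process.sample()
    return np.clip(action, action_space.low, action_space.high)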
def evaluate(env, num_eval_steps, preprocess_fn, estimate_fn, summary_writer,
             saver, sess, global_step, log_summary_op, summary_text_tensor):
    total_reward = 0
    episode_reward = 0
    max_episode_reward = 0
    n_episodes = 0
    n_rewards = 0
    terminated = False
    transition = preprocess_fn(state=env.reset())
    tf.logging.info(
        ' ####### start evaluate @ global step:{}## '.format(global_step))

    for estep in range(1, num_eval_steps):
        policy_out = estimate_fn(transition.next_state[np.newaxis, :])
        action = policy_output_to_deterministic_action(policy_out,
                                                       env.action_space)
        (state, reward, terminated) = env_step(env, action)
        # we only need the state to generate the policy.
        transition = preprocess_fn(state)

        # record every reward
        total_reward += reward
        episode_reward += reward
        if reward != 0:
            n_rewards += 1  # counts effective (non-idle) steps

        if terminated:
            n_episodes += 1
            # track the best single-episode reward
            if episode_reward > max_episode_reward:
                max_episode_reward = episode_reward
            episode_reward = 0
            # relaunch; only the state is needed.
            transition = preprocess_fn(env.reset())
    # -- end for estep ---

    avg_episode_reward = total_reward / max(1, n_episodes)
    avg_episode_steps = n_rewards / max(1, n_episodes)

    # we save the model only during training.
    saved_name = 'eval_only_not_save_model'
    if saver is not None:
        saved_name = save_model(saver, sess, global_step)

    write_summary(summary_writer, global_step, avg_episode_reward,
                  max_episode_reward, avg_episode_steps, saved_name, sess,
                  log_summary_op, summary_text_tensor)
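# A minimal sketch, assumed rather than taken from the repo, of the missing
# policy_output_to_deterministic_action helper used during evaluation: no
# exploration noise is added, so the actor output is only flattened and
# clipped to the action bounds.
import numpy as np

def policy_output_to_deterministic_action(output, action_space):
    action = np.reshape(np.asarray(output), (-1,))
    return np.clip(action, action_space.low, action_space.high)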
def agent_action(policy_out, replay_buffer, env):
    # add noise and bound to the action space
    stochastic_action = policy_output_to_stochastic_action(policy_out,
                                                           env.action_space)
    # execute a_t and store the transition.
    (state, reward, terminated) = env_step(env, stochastic_action)

    # replace the transition with the new one.
    transition = preprocess_low_dim(action=stochastic_action,
                                    reward=reward,
                                    terminated=terminated,
                                    state=state)
    # even if terminated, we still save next_state.
    replay_buffer.store(transition)
    return transition
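# A hypothetical sketch of the env_step wrapper used throughout these
# functions, assuming the classic gym API where env.step returns a 4-tuple;
# it forwards the action and keeps only the fields the agent loop needs.
def env_step(env, action):
    state, reward, terminated, _info = env.step(action)
    return (state, reward, terminated)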
def agent_action(step, sess, actor, online_state_inputs, is_training, state,
                 replay_buffer, noise_process, env):
    # take random actions at the beginning to fill the replay buffer.
    if step < DDPG_CFG.learn_start:
        stochastic_action = [
            np.random.uniform(low, high)
            for (low, high) in zip(env.action_space.low, env.action_space.high)
        ]
    else:
        policy_output = sess.run(fetches=[actor.online_action_outputs_tensor],
                                 feed_dict={online_state_inputs: state,
                                            is_training: False})
        # must reshape to (1, 11)
        policy_output = policy_output[0]  # list of tensors
        # add noise and bound to the action space
        stochastic_action = policy_output_to_stochastic_action(
            policy_output, noise_process, env.action_space)

    # execute a_t and store the transition.
    (state, reward, terminated) = env_step(env, stochastic_action)

    if step % 2000 == 0:
        tf.logging.info('@@@@@@@@@@ global_step:{} action:{}'
                        ' reward:{} term:{} @@@@@@@@@@'.format(
                            step, stochastic_action, reward, terminated))

    # replace the transition with the new one.
    transition = preprocess_low_dim(action=stochastic_action,
                                    reward=reward,
                                    terminated=terminated,
                                    state=state)
    # even if terminated, we still save next_state because the feed-forward
    # Q network will use it; its Q value is discarded in the end.
    replay_buffer.store(transition)
    return transition
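# DDPG commonly uses an Ornstein-Uhlenbeck process for temporally correlated
# exploration noise. The class below is a hypothetical sketch of the
# noise_process object passed into agent_action above, assuming it only needs
# to provide a sample() method; it is not the repo's actual implementation.
import numpy as np

class OUNoise(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * np.random.randn(len(self.state)))
        self.state += dx
        return self.state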
def evaluate_helper(env, num_eval_steps, preprocess_fn, estimate_fn):
    total_reward = 0
    episode_reward = 0
    episode_steps = 0
    max_episode_reward = 0
    n_episodes = 0
    n_rewards = 0
    terminated = False
    global prev_eval_time
    global max_avg_episode_reward

    transition = preprocess_fn(state=env.reset())
    estep = 0
    while not terminated:
        estep += 1
        if estep > num_eval_steps:
            break
        policy_out = estimate_fn(
            transition.next_state[np.newaxis, :])  # must reshape to (1, 11)
        # TODO: just give some initial speed
        action = policy_output_to_deterministic_action(policy_out,
                                                       env.action_space)
        (state, reward, terminated) = env_step(env, action)
        if estep % 2 == 0:
            tf.logging.info('@@@@@ eval step:{} action:{}'
                            ' reward:{} term:{} @@@@@@@@@@'.format(
                                estep, action, reward, terminated))
        # we only need the state to generate the policy.
        transition = preprocess_fn(state)

        # record every reward
        total_reward += reward
        episode_reward += reward
        episode_steps += 1
        if reward != 0:
            n_rewards += 1  # counts effective (non-idle) steps in the episode

        if terminated:
            tf.logging.info('@@@@@@ eval episode terminated - episode_reward:{} -'
                            ' episode_steps:{} n_episodes:{} - @@@@@@ '.format(
                                episode_reward, episode_steps, n_episodes))
            episode_steps = 0
            if episode_reward > max_episode_reward:
                max_episode_reward = episode_reward
            episode_reward = 0
            n_episodes += 1
            # relaunch; only the state is needed.
            transition = preprocess_fn(env.reset())
            if estep < num_eval_steps:
                terminated = False  # keep evaluating
    # -- end while estep ---

    avg_episode_reward = total_reward / max(1, n_episodes)
    avg_episode_steps = n_rewards / max(1, n_episodes)

    now = time.time()
    if prev_eval_time == 0:  # first-time eval.
        prev_eval_time = now
    # write_summary(summary_writer, global_step, avg_episode_reward,
    #               max_episode_reward, avg_episode_steps, now - prev_eval_time)
    prev_eval_time = now

    tf.logging.info('@@@@@@ ==== end of evaluation, result: -steps:{} -'
                    ' avg_episode_reward:{} - max_episode_reward:{} -'
                    ' avg_episode_steps:{} - @@@@@@ '.format(
                        estep, avg_episode_reward, max_episode_reward,
                        avg_episode_steps))
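# A hypothetical usage example of evaluate_helper: the actor's forward pass is
# wrapped into estimate_fn so evaluation can reuse the same session. The names
# eval_env, actor, online_state_inputs, and is_training mirror the training
# code above but are assumptions about the surrounding script.
estimate_fn = lambda state: sess.run(
    fetches=[actor.online_action_outputs_tensor],
    feed_dict={online_state_inputs: state, is_training: False})[0]
evaluate_helper(eval_env, num_eval_steps=1000,
                preprocess_fn=preprocess_low_dim, estimate_fn=estimate_fn)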
def evaluate(env, num_eval_steps, preprocess_fn, estimate_fn, summary_writer,
             saver, sess, global_step, log_summary_op, summary_text_tensor):
    total_reward = 0
    episode_reward = 0
    max_episode_reward = 0
    n_episodes = 1
    n_rewards = 1
    terminated = False
    global prev_eval_time

    transition = preprocess_fn(state=env.reset())
    estep = 0
    tf.logging.info(
        ' ####### start evaluate @ global step:{}## '.format(global_step))
    while not terminated:
        estep += 1
        if estep > num_eval_steps:
            # avoid too many low-progress steps in one episode
            break
        policy_out = estimate_fn(
            transition.next_state[np.newaxis, :])  # must reshape to (1, 11)
        action = policy_output_to_deterministic_action(policy_out,
                                                       env.action_space)
        (state, reward, terminated) = env_step(env, action)
        # we only need the state to generate the policy.
        transition = preprocess_fn(state)

        # record every reward
        total_reward += reward
        episode_reward += reward
        if reward != 0:
            n_rewards += 1  # counts effective (non-idle) steps in the episode

        if terminated:
            n_episodes += 1
            if episode_reward > max_episode_reward:
                max_episode_reward = episode_reward
            episode_reward = 0
            # relaunch; only the state is needed.
            transition = preprocess_fn(env.reset())
            if estep < num_eval_steps:
                terminated = False  # keep evaluating
    # -- end while estep ---

    avg_episode_reward = total_reward / max(1, n_episodes)
    avg_episode_steps = n_rewards / max(1, n_episodes)

    now = time.time()
    if prev_eval_time == 0:  # first-time eval.
        prev_eval_time = now

    # we always save the model at each evaluation.
    saved_name = save_model(saver, sess, global_step)
    write_summary(summary_writer, global_step, avg_episode_reward,
                  max_episode_reward, avg_episode_steps,
                  now - prev_eval_time, saved_name, sess, log_summary_op,
                  summary_text_tensor)
    prev_eval_time = now

    tf.logging.info('@@@@@@ eval save model : global_step:{} -'
                    ' avg_episode_reward:{} - max_episode_reward:{} -'
                    ' avg_episode_steps:{} - saved_file: {} @@@@@@ '.format(
                        global_step, avg_episode_reward, max_episode_reward,
                        avg_episode_steps, saved_name))