def agent_action(step, sess, actor, online_state_inputs, is_training, state,
                 replay_buffer, noise_process, env):
    policy_output = sess.run(fetches=[actor.online_action_outputs_tensor],
                             feed_dict={
                                 online_state_inputs: state,
                                 is_training: False
                             })  # must reshape to (1,11)
    policy_output = policy_output[0]

    ## add noise and bound the action.
    stochastic_action = policy_output_to_stochastic_action(
        policy_output, env.action_space, noise_process)

    ## execute a_t and store the transition.
    (state, reward, terminated) = env_step(env, stochastic_action)
    # episode_reward += reward

    # if step % 20 == 0:
    if step % 2000 == 0:
        tf.logging.info(' +++++++++++++++++++ global_step:{} action:{}'
                        '  reward:{} term:{}'.format(step, stochastic_action,
                                                     reward, terminated))
    # replace transition with new one.
    transition = preprocess_low_dim(action=stochastic_action,
                                    reward=reward,
                                    terminated=terminated,
                                    state=state)

    ## even if terminated, we still save next_state because the feed-forward
    ## Q network will use it; its Q value is discarded in the end.
    replay_buffer.store(transition)
    return transition
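
# NOTE: `policy_output_to_stochastic_action` is not defined in this listing. A minimal
# sketch of what it is assumed to do (names and the `noise_process.sample()` interface
# are illustrative, not the project's actual implementation): add exploration noise to
# the policy output and clip the result to the action-space bounds.
import numpy as np

def policy_output_to_stochastic_action_sketch(policy_output, action_space, noise_process):
    action = np.reshape(np.asarray(policy_output), action_space.shape)
    action = action + noise_process.sample()  # exploration noise, e.g. Ornstein-Uhlenbeck
    return np.clip(action, action_space.low, action_space.high)
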
def evaluate(env, num_eval_steps, preprocess_fn, estimate_fn, summary_writer,
             saver, sess, global_step, log_summary_op, summary_text_tensor):
    total_reward = 0
    episode_reward = 0
    max_episode_reward = 0
    n_episodes = 0
    n_rewards = 0
    terminated = False
    transition = preprocess_fn(state=env.reset())

    tf.logging.info(
        ' ####### start evaluate @ global step:{}##  '.format(global_step))

    for estep in range(1, num_eval_steps):
        policy_out = estimate_fn(transition.next_state[np.newaxis, :])
        action = policy_output_to_deterministic_action(policy_out,
                                                       env.action_space)
        (state, reward, terminated) = env_step(env, action)

        # we only need state to generate policy.
        transition = preprocess_fn(state)

        # record every reward
        total_reward += reward
        episode_reward += reward

        if reward != 0:
            n_rewards += 1  # counts effective (non-idle) steps

        if terminated:
            n_episodes += 1
            # track the maximum single-episode reward
            if episode_reward > max_episode_reward:
                max_episode_reward = episode_reward
            # reset for the next episode (regardless of whether a new max was set)
            episode_reward = 0
            # relaunch; we only need the state.
            transition = preprocess_fn(env.reset())

    # -- end for estep ---
    avg_episode_reward = total_reward / max(1, n_episodes)
    avg_episode_steps = n_rewards / max(1, n_episodes)

    # we save model only during training.
    saved_name = 'eval_only_not_save_model'
    if saver is not None:
        saved_name = save_model(saver, sess, global_step)
    write_summary(summary_writer, global_step, avg_episode_reward,
                  max_episode_reward, avg_episode_steps, saved_name, sess,
                  log_summary_op, summary_text_tensor)
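
# NOTE: a hedged sketch of the assumed `policy_output_to_deterministic_action` helper
# used during evaluation: no exploration noise, just reshape the (1, action_dim)
# network output and clip it to the action-space bounds. Illustrative only.
import numpy as np

def policy_output_to_deterministic_action_sketch(policy_out, action_space):
    action = np.reshape(np.asarray(policy_out), action_space.shape)
    return np.clip(action, action_space.low, action_space.high)
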
def agent_action(policy_out, replay_buffer, env):
    ## add noise and bound the action.
    stochastic_action = policy_output_to_stochastic_action(policy_out,
                                                           env.action_space)

    ## execute a_t and store the transition.
    (state, reward, terminated) = env_step(env, stochastic_action)

    # replace transition with the new one.
    transition = preprocess_low_dim(action=stochastic_action,
                                    reward=reward,
                                    terminated=terminated,
                                    state=state)
    ## even if terminated, we still save next_state.
    replay_buffer.store(transition)
    return transition
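
# NOTE: a hedged sketch of the transition record that `preprocess_low_dim` is assumed
# to build before it is pushed into the replay buffer. Field names are guesses based
# on how `transition.next_state` is used elsewhere in this listing.
import collections
import numpy as np

TransitionSketch = collections.namedtuple(
    'TransitionSketch', ['action', 'reward', 'terminated', 'next_state'])

def preprocess_low_dim_sketch(state, action=None, reward=0.0, terminated=False):
    # low-dimensional observations only need to be flattened into a float vector.
    next_state = np.asarray(state, dtype=np.float32).ravel()
    return TransitionSketch(action=action, reward=reward,
                            terminated=terminated, next_state=next_state)
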
Example 4
def agent_action(step, sess, actor, online_state_inputs, is_training, state,
                 replay_buffer, noise_process, env):
    # make random plays at the beginning to fill the replay buffer with some states.
    if step < DDPG_CFG.learn_start:
        stochastic_action = [
            np.random.uniform(low, high)
            for (low, high) in zip(env.action_space.low, env.action_space.high)
        ]
    else:
        policy_output = sess.run(fetches=[actor.online_action_outputs_tensor],
                                 feed_dict={
                                     online_state_inputs: state,
                                     is_training: False
                                 })  # must reshape to (1,11)
        policy_output = policy_output[0]  # sess.run returned a list of fetches; unwrap the single output

        ## add noise and bound the action.
        stochastic_action = policy_output_to_stochastic_action(
            policy_output, noise_process, env.action_space)

    ## execute a_t and store the transition.
    (state, reward, terminated) = env_step(env, stochastic_action)

    if step % 2000 == 0:
        tf.logging.info('@@@@@@@@@@ global_step:{} action:{}'
                        '  reward:{} term:{} @@@@@@@@@@'.format(
                            step, stochastic_action, reward, terminated))

    # replace transition with new one.
    transition = preprocess_low_dim(action=stochastic_action,
                                    reward=reward,
                                    terminated=terminated,
                                    state=state)

    ## even if terminated, we still save next_state because the feed-forward
    ## Q network will use it; its Q value is discarded in the end.
    replay_buffer.store(transition)
    return transition
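
# NOTE: the `noise_process` argument is assumed to be an Ornstein-Uhlenbeck process,
# the exploration noise conventionally paired with DDPG. This is a standard textbook
# sketch, not necessarily the implementation behind DDPG_CFG.
import numpy as np

class OrnsteinUhlenbeckNoiseSketch(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.ones(action_dim) * mu

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.x) * self.dt +
              self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape))
        self.x = self.x + dx
        return self.x
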
def evaluate_helper(env, num_eval_steps, preprocess_fn, estimate_fn):
    total_reward = 0
    episode_reward = 0
    episode_steps = 0
    max_episode_reward = 0
    n_episodes = 0
    n_rewards = 0
    terminated = False
    global prev_eval_time
    global max_avg_episode_reward

    transition = preprocess_fn(state=env.reset())
    estep = 0

    while not terminated:
        estep += 1
        if estep > num_eval_steps:
            break
        policy_out = estimate_fn(
            transition.next_state[np.newaxis, :])  # must reshape to (1,11)
        ##TODO just give some initial speed

        action = policy_output_to_deterministic_action(policy_out,
                                                       env.action_space)
        # #TODO just test
        # action[0] = 0.
        # action[1] = 1.
        # action[2] *= 1./3

        (state, reward, terminated) = env_step(env, action)
        if estep % 2 == 0:
            tf.logging.info('@@@@@ eval step:{} action:{}'
                            '  reward:{} term:{}  @@@@@@@@@@'.format(
                                estep, action, reward, terminated))

        # we only need state to generate policy.
        transition = preprocess_fn(state)

        # record every reward
        total_reward += reward
        episode_reward += reward
        episode_steps += 1

        if reward != 0:
            n_rewards += 1  # counts effective (non-idle) steps in the episode

        if terminated:
            tf.logging.info('@@@@@@ eval episode termed - episode_reward:{} - '
                            'episode_steps:{} - n_episode:{} - @@@@@@'.format(
                                episode_reward, episode_steps, n_episodes))
            episode_steps = 0
            if episode_reward > max_episode_reward:
                max_episode_reward = episode_reward

            episode_reward = 0
            n_episodes += 1
            # relaunch
            # only save state.
            transition = preprocess_fn(env.reset())
            if estep < num_eval_steps:
                terminated = False  # continue

    # -- end of eval loop --
    avg_episode_reward = total_reward / max(1, n_episodes)
    avg_episode_steps = n_rewards / max(1, n_episodes)
    now = time.time()
    if prev_eval_time == 0:  # first time eval.
        prev_eval_time = now

    # write_summary(summary_writer, global_step, avg_episode_reward, max_episode_reward, avg_episode_steps, now - prev_eval_time)
    prev_eval_time = now

    tf.logging.info(
        '@@@@@@ ==== end of evaluation, result: - steps:{} - avg_episode_reward:{} - '
        'max_episode_reward:{} - avg_episode_steps:{} - @@@@@@'.format(
            estep, avg_episode_reward, max_episode_reward, avg_episode_steps))
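
# NOTE: a minimal sketch of the assumed `env_step` helper: a thin wrapper around the
# classic gym `env.step` API that drops the info dict, keeping only the fields the
# training/eval loops above actually use.
def env_step_sketch(env, action):
    state, reward, terminated, _info = env.step(action)
    return (state, reward, terminated)
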
Example 6
def evaluate(env, num_eval_steps, preprocess_fn, estimate_fn, summary_writer,
             saver, sess, global_step, log_summary_op, summary_text_tensor):
    total_reward = 0
    episode_reward = 0
    max_episode_reward = 0
    n_episodes = 1
    n_rewards = 1
    terminated = False
    global prev_eval_time
    # global max_avg_episode_reward

    transition = preprocess_fn(state=env.reset())
    estep = 0

    tf.logging.info(
        ' ####### start evaluate @ global step:{}##  '.format(global_step))

    while not terminated:
        estep += 1
        if estep > num_eval_steps:  # avoid too many low-progress steps in one episode
            break

        policy_out = estimate_fn(
            transition.next_state[np.newaxis, :])  # must reshape to (1,11)
        action = policy_output_to_deterministic_action(policy_out,
                                                       env.action_space)
        (state, reward, terminated) = env_step(env, action)

        # we only need state to generate policy.
        transition = preprocess_fn(state)

        # record every reward
        total_reward += reward
        episode_reward += reward

        if reward != 0:
            n_rewards += 1  # counts effective (non-idle) steps in the episode

        if terminated:
            n_episodes += 1
            if episode_reward > max_episode_reward:
                max_episode_reward = episode_reward
            # reset for the next episode (regardless of whether a new max was set)
            episode_reward = 0

            # relaunch
            # only save state.
            transition = preprocess_fn(env.reset())
            if estep < num_eval_steps:
                terminated = False  # continue

    # -- end of eval loop --
    avg_episode_reward = total_reward / max(1, n_episodes)
    avg_episode_steps = n_rewards / max(1, n_episodes)
    now = time.time()
    if prev_eval_time == 0:  # first time eval.
        prev_eval_time = now

    # we always save the model at each evaluation.
    saved_name = save_model(saver, sess, global_step)
    write_summary(summary_writer, global_step, avg_episode_reward,
                  max_episode_reward, avg_episode_steps, now - prev_eval_time,
                  saved_name, sess, log_summary_op, summary_text_tensor)
    prev_eval_time = now
    # if avg_episode_reward > max_avg_episode_reward:
    #  max_avg_episode_reward = avg_episode_reward
    tf.logging.info(
        '@@@@@@ eval save model : global_step:{} - avg_episode_reward:{} - '
        'max_episode_reward:{} - avg_episode_steps:{} - saved_file: {} @@@@@@'.format(
            global_step, avg_episode_reward, max_episode_reward,
            avg_episode_steps, saved_name))
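
# NOTE: hedged sketches of the checkpoint/summary helpers referenced above, written
# against the plain TF1 Saver/Summary APIs. The real `write_summary` takes more
# arguments (e.g. `log_summary_op`, `summary_text_tensor`) and likely logs a text
# summary as well; `checkpoint_dir` below is illustrative.
import os
import tensorflow as tf

def save_model_sketch(saver, sess, global_step, checkpoint_dir='./checkpoints'):
    return saver.save(sess, os.path.join(checkpoint_dir, 'ddpg'),
                      global_step=global_step)

def write_summary_sketch(summary_writer, global_step, avg_episode_reward,
                         max_episode_reward, avg_episode_steps):
    summary = tf.Summary(value=[
        tf.Summary.Value(tag='eval/avg_episode_reward', simple_value=avg_episode_reward),
        tf.Summary.Value(tag='eval/max_episode_reward', simple_value=max_episode_reward),
        tf.Summary.Value(tag='eval/avg_episode_steps', simple_value=avg_episode_steps),
    ])
    summary_writer.add_summary(summary, global_step)
    summary_writer.flush()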