Example #1
critic = Critic(sess,
                n_features=env.observation_space.shape[0],
                learning_rate=0.01)

sess.run(tf.global_variables_initializer())

tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    observation = env.reset()
    t = 0
    track_reward = []
    while True:
        if RENDER: env.render()

        action = actor.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        if done: reward = -20

        track_reward.append(reward)

        # the critic evaluates the transition and returns the TD error,
        # which the actor then uses as the advantage for its policy update
        td_error = critic.learn(observation, reward, observation_)
        actor.learn(observation, action, td_error)

        observation = observation_
        t += 1

        if done or t > MAX_EP_STEP:
            ep_rs_sum = sum(track_reward)
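
The loop in Example #1 is cut off inside its episode-end branch. A minimal sketch of how that branch is typically completed in scripts of this style, assuming a running_reward moving average and a DISPLAY_REWARD_THRESHOLD constant that do not appear in the snippet above:

        if done or t > MAX_EP_STEP:
            ep_rs_sum = sum(track_reward)
            # assumed bookkeeping: exponential moving average of the episode return
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
            if running_reward > DISPLAY_REWARD_THRESHOLD:  # assumed constant
                RENDER = True  # start rendering once returns are high enough
            print("episode:", i_episode, "  reward:", int(running_reward))
            break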
Example #2
Action_dim = 4

sess = tf.Session()

actor = Actor(sess, State_dim=State_dim, Action_dim=Action_dim, lr=Actor_lr)
critic = Critic(sess, State_dim=State_dim, lr=Critic_lr)

sess.run(tf.global_variables_initializer())

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = 0
    total_action = []
    done = False
    while not done and t < 200:

        a = actor.choose_action(s)

        s_, r, done = env.step(env.t_action[a])
        total_action.append(env.t_action[a])
        if done: r = -200
        td_error = critic.learn(s, -r, s_)
        actor.learn(s, a, td_error)

        s = s_
        track_r += r
        t += 1
    print("episode:", i_episode, "  tracked actions to attempt goal:",
          total_action)
Example #3

class ReinforcementLearner:
    """
    Reinforcement Learner agent using the Actor-Critic architecture

    ...

    Attributes
    ----------

    Methods
    -------
    run():
        Runs all episodes with pivotal parameters
    """
    def __init__(self):
        self.__actor = Actor(
            parameters.ACTOR_LEARNING_RATE,
            parameters.ACTOR_DISCOUNT_FACTOR,
            parameters.ACTOR_TRACE_DECAY,
            parameters.ACTOR_EPSILON,
            parameters.ACTOR_EPSILON_DECAY,
        )
        self.__critic = CriticFactory.get_critic(
            parameters.USE_TABLE_CRITIC,
            parameters.CRITIC_LEARNING_RATE,
            parameters.CRITIC_DISCOUNT_FACTOR,
            parameters.CRITIC_TRACE_DECAY,
            parameters.CRITIC_NN_DIMENSIONS,
        )

        self.__simulated_world = SimulatedWorld()
        self.__episodes = parameters.EPISODES

    def __run_one_episode(self, visualize: bool = False) -> None:
        self.__actor.reset_eligibilities()
        self.__critic.reset_eligibilities()

        state, possible_actions = self.__simulated_world.reset()
        action = self.__actor.choose_action(state, possible_actions)

        done = False

        while not done:
            next_state, reward, done, possible_actions = self.__simulated_world.step(
                action, visualize)
            next_action = self.__actor.choose_action(next_state,
                                                     possible_actions)

            self.__actor.replace_eligibilities(state, action)
            self.__critic.replace_eligibilities(state)

            td_error = self.__critic.td_error(reward, next_state, state)

            self.__critic.update(reward, next_state, state)
            self.__actor.update(td_error)

            state, action = next_state, next_action

    def run(self) -> None:
        """
        Runs all episodes with pivotal parameters.
        Visualizes one round at the end.
        """
        for episode in range(self.__episodes):
            print('Episode:', episode + 1)
            self.__run_one_episode()

        print('Training completed.')
        self.__actor.plot_training_data()
        self.__critic.plot_training_data()
        self.__simulated_world.plot_training_data()

        if parameters.VISUALIZE_GAMES:
            print('Showing one episode with the greedy strategy.')
            self.__actor.set_epsilon(0)
            self.__run_one_episode(True)
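
A minimal usage sketch for this class, assuming the parameters module, Actor, CriticFactory, and SimulatedWorld referenced in __init__ are importable from the surrounding project:

if __name__ == '__main__':
    # train for parameters.EPISODES episodes, plot the training data,
    # and optionally replay one greedy episode (see run() above)
    learner = ReinforcementLearner()
    learner.run()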
Example #4
File: train.py  Project: xyh97/DemoLight
def _train(dic_exp_conf, dic_agent_conf, dic_traffic_env_conf, dic_path):
    random.seed(dic_agent_conf['SEED'])
    np.random.seed(dic_agent_conf['SEED'])
    tf.set_random_seed(dic_agent_conf['SEED'])

    dic_path["PATH_TO_LOG"] = os.path.join(dic_path['PATH_TO_WORK_DIRECTORY'],
                                           "train_round")
    #
    # dic_path['PATH_TO_SUMO_CONF'] = os.path.join(dic_path['PATH_TO_WORK_DIRECTORY'], "sumo_conf", task)

    if not os.path.exists(dic_path['PATH_TO_LOG']):
        os.makedirs(dic_path['PATH_TO_LOG'])

    # dic_exp_conf = copy.deepcopy(self.dic_exp_conf)

    if dic_traffic_env_conf['SIMULATOR_TYPE'] == 'sumo':
        path_to_work_directory = dic_path["PATH_TO_SUMO_CONF"]
        env = DIC_ENVS[dic_traffic_env_conf["SIMULATOR_TYPE"]](
            path_to_log=dic_path["PATH_TO_LOG"],
            path_to_work_directory=path_to_work_directory,
            dic_traffic_env_conf=dic_traffic_env_conf)

    elif dic_traffic_env_conf['SIMULATOR_TYPE'] == 'anon':
        env = DIC_ENVS[dic_traffic_env_conf["SIMULATOR_TYPE"]](
            path_to_log=dic_path["PATH_TO_LOG"],
            path_to_work_directory=dic_path["PATH_TO_DATA"],
            dic_traffic_env_conf=dic_traffic_env_conf)
    dic_agent_conf["PHI"] = 5
    dic_agent_conf["MIN_GREEN_VEC"] = 3
    dic_agent_conf["MAX_RED_VEC"] = 6
    demo_path = "../frap/demo_{}.p".format(dic_exp_conf["TRAFFIC_FILE"])

    with open(demo_path, 'rb') as f:
        demo_transitions = pickle.load(f)
        demo_transitions = deque(
            itertools.islice(demo_transitions, 0, Config.demo_buffer_size))

    sess = tf.InteractiveSession()
    actor = Actor(sess=sess,
                  n_features=16,
                  n_actions=8,
                  dic_traffic_env_conf=dic_traffic_env_conf,
                  lr=1e-3)  # initialize the Actor
    critic = Critic(sess=sess,
                    n_features=16,
                    config=DQfDConfig(),
                    dic_traffic_env_conf=dic_traffic_env_conf,
                    demo=demo_transitions,
                    lr=1e-3)  # initialize the Critic
    # agent = Agent(sess=sess, n_features=16, config=DQfDConfig(), dic_traffic_env_conf=dic_traffic_env_conf,
    #               demo=demo_transitions, lr=1e-3)
    # actor = Actor(sess=sess, n_features=16, n_actions=8, dic_traffic_env_conf=dic_traffic_env_conf, lr=1e-3)
    # critic = Critic(sess=sess, n_features=16, lr=1e-3)
    sess.run(tf.global_variables_initializer())  # initialize variables

    for i in range(10):
        state, action = critic.train_Q_network(pre_train=True)
        actor.pretrain(state, action)

    for i in range(501):

        done = False
        state = env.reset()
        step_num = 0
        while not done and step_num < int(
                dic_exp_conf["EPISODE_LEN"] /
                dic_traffic_env_conf["MIN_ACTION_TIME"]):
            action_list = []
            for one_state in state:
                s = convert_to_input(one_state, dic_traffic_env_conf)
                # one for multi-state, the other for multi-intersection
                action, probs = actor.choose_action(s)
                # action, probs = agent.choose_action(s)
                action_list.append(action)  # for multi-state

            next_state, reward, done, ave_reward = env.step(action_list)

            s = convert_to_input(state[0], dic_traffic_env_conf)
            s_ = convert_to_input(next_state[0], dic_traffic_env_conf)

            next_action, _ = actor.choose_action(s_)
            # next_action, _ = agent.choose_action(s_)
            # #
            # q_a = critic.learn(s, np.array(ave_reward), s_, np.array([action]), np.array([next_action]), probs)
            # q_a = critic.learn(s, np.array(ave_reward), s_, np.array([action]), np.array([next_action]))
            if i != 0:
                q_a = critic.learn(s, np.array(ave_reward), s_,
                                   np.array([action]), np.array([next_action]),
                                   probs)
                # q_a = critic.learn(s, np.array(reward), s_)
                actor.learn(s, np.array([action]), q_a)
            # agent.learn_actor(s, np.array([action]), q_a)

            state = next_state
            step_num += 1
        # if i % 3 == 0 and i != 0:
        #     critic.sess.run(critic.update_target_net)
        env.bulk_log(i)
        write_summary(dic_path, dic_exp_conf, i)
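
The examples above all follow the same actor-critic pattern: the critic evaluates each transition and the actor scales its policy-gradient step by that evaluation, which in Examples #1 through #3 is the one-step TD error delta = r + gamma * V(s') - V(s). A minimal NumPy sketch of that update with linear function approximators; the names, shapes, and learning rates below are illustrative and not taken from any of the examples:

import numpy as np

GAMMA = 0.9        # discount factor
LR_CRITIC = 0.01   # critic step size
LR_ACTOR = 0.001   # actor step size

n_features, n_actions = 4, 2
w = np.zeros(n_features)                    # critic weights: V(s) = w . s
theta = np.zeros((n_actions, n_features))   # actor weights: softmax policy

def policy(s):
    # softmax over linear action preferences
    logits = theta @ s
    p = np.exp(logits - logits.max())
    return p / p.sum()

def td_update(s, a, r, s_next, done):
    # one-step actor-critic update on a single transition
    v_s = w @ s
    v_next = 0.0 if done else w @ s_next
    td_error = r + GAMMA * v_next - v_s

    # critic: move V(s) toward the TD target r + gamma * V(s')
    w[:] = w + LR_CRITIC * td_error * s

    # actor: policy-gradient step grad(log pi(a|s)) scaled by the TD error
    p = policy(s)
    grad_log = -np.outer(p, s)
    grad_log[a] += s
    theta[:] = theta + LR_ACTOR * td_error * grad_log
    return td_error

# example transition: take action 0, receive reward 1.0
s = np.array([0.1, -0.2, 0.3, 0.0])
s_next = np.array([0.05, -0.1, 0.25, 0.1])
delta = td_update(s, 0, 1.0, s_next, done=False)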