Example #1
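A SARSA-style actor-critic training loop: the agent stores (state, action, reward, next_state, next_action) transitions in a bounded replay buffer and, after each episode, trains on a random minibatch while tracking the loss, the policy entropy at the initial state, and the episode reward.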
import collections
import random
import time

import numpy as np

# ActorCritic, env, flags and empty_goal_action are assumed to be defined
# elsewhere in the module this function was taken from.


def train():
    # Replay buffer of (state, action, reward, next_state, next_action)
    # transitions, used for SARSA-style actor-critic updates.
    memory = []
    Transition = collections.namedtuple(
        "Transition",
        ["state", "action", "reward", "next_state", "next_action"])

    # Actor-critic model with separate learning rates for the critic (lr_C)
    # and the actor (lr_A).
    model = ActorCritic(flags.n_actions, flags.n_features, flags.lr_C,
                        flags.lr_A, flags.gamma, empty_goal_action)

    # Per-epoch training statistics.
    loss_his = []
    entropy_his = []
    reward_his = []

    for ii in range(flags.max_epoch):
        # Run one episode, collecting transitions into the replay buffer.
        state = env.reset()
        init_state = state.copy()
        reward_all = 0
        done = False
        steps = 0
        loss = 0
        t_start = time.time()
        action = model.choose_action(state)

        while not done:
            next_state, reward, done, _ = env.step(action)
            next_action = model.choose_action(next_state)
            reward_all += reward
            steps += 1

            # Keep the replay buffer bounded at flags.memory_size entries
            # (>= so the cap is not exceeded by one).
            if len(memory) >= flags.memory_size:
                memory.pop(0)
            memory.append(
                Transition(state, action, reward, next_state, next_action))

            state = next_state
            action = next_action

        # Once enough transitions are stored, train on a random minibatch and
        # track the policy entropy at the episode's initial state.
        if len(memory) > flags.batch_size:
            batch_transition = random.sample(memory, flags.batch_size)
            (batch_state, batch_action, batch_reward, batch_next_state,
             batch_next_action) = map(np.array, zip(*batch_transition))
            loss, _ = model.train(state=batch_state,
                                  action=batch_action,
                                  reward=batch_reward,
                                  state_=batch_next_state,
                                  action_=batch_next_action)
            entropy = model.compute_entropy(init_state)

        # Only log epochs in which a training step actually ran.
        if loss != 0:
            loss_his.append(loss)
            entropy_his.append(entropy)
            reward_his.append(reward_all)
            print("epoch=", ii, "/time=",
                  time.time() - t_start, "/loss=", loss, "/entropy=", entropy,
                  "/reward=", reward_all)

    return loss_his, entropy_his, reward_his
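The function above depends on module-level objects that are not shown (env, flags, ActorCritic, empty_goal_action). The following is a minimal, purely illustrative sketch of how a flags namespace and a call to train() might look; every name and value in it is an assumption, not taken from the original project.

# Hypothetical setup for the train() example above; the real project defines
# env, flags, ActorCritic and empty_goal_action elsewhere.
from types import SimpleNamespace

flags = SimpleNamespace(
    n_actions=4,        # assumed size of the discrete action space
    n_features=16,      # assumed length of the state vector
    lr_C=1e-3,          # critic learning rate
    lr_A=1e-4,          # actor learning rate
    gamma=0.99,         # discount factor
    max_epoch=500,      # number of training episodes
    memory_size=10000,  # replay-buffer capacity
    batch_size=64,      # minibatch size per update
)

loss_his, entropy_his, reward_his = train()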
Example #2
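An A3C-style worker implemented as a process: each Agent runs its own Gym environment with a local copy of the actor-critic network, accumulates experience for up to T_MAX steps, then copies its gradients onto the shared global network, steps the shared optimizer, and re-synchronizes its local weights.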
import gym
import torch.multiprocessing as mp

# ActorCritic and the constants EPISODES and T_MAX are assumed to be defined
# elsewhere in the module this class was taken from; torch.multiprocessing is
# used so model parameters and gradients can be shared across processes.


class Agent(mp.Process):
    """A3C worker: interacts with its own environment through a local network
    and periodically pushes gradients to the shared global network."""

    def __init__(self, global_actor_critic, optimizer, input_dims, nb_actions,
                 gamma, lr, name, global_ep_index, env_id):
        super(Agent, self).__init__()
        self.local_actor_critic = ActorCritic(input_dims, nb_actions, gamma)
        self.global_actor_critic = global_actor_critic
        self.name = "w%02i" % name
        self.episode_index = global_ep_index
        self.env = gym.make(env_id)
        self.optimizer = optimizer

    def run(self):
        # t_step counts environment steps across episodes; an update is
        # pushed to the global network every T_MAX steps.
        t_step = 1
        while self.episode_index.value < EPISODES:
            done = False
            observation = self.env.reset()
            score = 0
            self.local_actor_critic.clear_memory()
            while not done:
                action = self.local_actor_critic.choose_action(observation)
                observation_, reward, done, info = self.env.step(action)
                score += reward
                self.local_actor_critic.remember(observation, action, reward)
                # Every T_MAX steps, or at episode end, compute the loss,
                # copy the local gradients onto the global network, step the
                # shared optimizer, and re-sync the local copy.
                if (t_step % T_MAX) == 0 or done:
                    loss = self.local_actor_critic.calc_loss(done)
                    self.optimizer.zero_grad()
                    loss.backward()
                    for local_param, global_param in zip(
                            self.local_actor_critic.parameters(),
                            self.global_actor_critic.parameters()):
                        global_param._grad = local_param.grad
                    self.optimizer.step()
                    self.local_actor_critic.load_state_dict(
                        self.global_actor_critic.state_dict())
                    self.local_actor_critic.clear_memory()
                t_step += 1
                observation = observation_
            # Atomically increment the shared episode counter.
            with self.episode_index.get_lock():
                self.episode_index.value += 1
            print(self.name, 'episode ', self.episode_index.value,
                  'reward %.1f' % score)
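The worker relies on module-level names that are not shown (ActorCritic, EPISODES, T_MAX). Below is a minimal sketch of how such workers are typically launched; the environment id, the hyperparameter values, and the plain Adam optimizer are assumptions (many A3C implementations instead use a shared-memory optimizer so its state is visible to every worker).

# Hypothetical launcher for the Agent workers above; hyperparameters,
# the environment id and the optimizer choice are assumptions.
import torch
import torch.multiprocessing as mp

EPISODES = 3000  # assumed value for the module-level constant used in Agent.run()
T_MAX = 5        # assumed update interval, likewise module level

if __name__ == "__main__":
    env_id = "CartPole-v1"
    input_dims, nb_actions = 4, 2   # assumed observation size / action count
    lr, gamma = 1e-4, 0.99          # assumed hyperparameters

    # Global network whose parameters live in shared memory.
    global_actor_critic = ActorCritic(input_dims, nb_actions, gamma)
    global_actor_critic.share_memory()
    optimizer = torch.optim.Adam(global_actor_critic.parameters(), lr=lr)

    global_ep = mp.Value("i", 0)    # shared episode counter
    workers = [
        Agent(global_actor_critic, optimizer, input_dims, nb_actions,
              gamma, lr, name=i, global_ep_index=global_ep, env_id=env_id)
        for i in range(mp.cpu_count())
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()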