Example No. 1
# Imports used by this snippet; Agent, Logger, and Graph come from the surrounding project.
import gym
import numpy as np
from collections import deque


def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    if env_name == 'DobroHalfCheetah-v0':
        env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)
    episodes = int(5e5)
    save_freq = 1

    save_period = 1000
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period)

    for episode in range(episodes):
        state = env.reset()
        agent.actor_noise.reset()
        done = False
        score = 0
        step = 0

        while not done:
            step += 1
            action = agent.get_action(state, True)
            next_state, reward, done, info = env.step(action)
            agent.replay_memory.append([
                np.array(state, np.float32), action, reward, done,
                np.array(next_state, np.float32)
            ])
            ########################

            if len(agent.replay_memory) > agent.train_start:
                v_loss, p_loss = agent.train()
                v_loss_logger.write([1, v_loss])
                p_loss_logger.write([1, p_loss])
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                # Note: the critic's value estimate is what gets stored in the
                # "entropies" deque (and plotted in the entropy slot of the graph).
                value = agent.get_value(state, action)
                entropies.append(value)
                scores.append(reward)
                graph.update(np.mean(scores), np.mean(p_losses),
                             np.mean(v_losses), np.mean(entropies))
            state = next_state
            score += reward

        print(episode, score, agent.epsilon)
        score_logger.write([step, score])
        if (episode + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()

    graph.update(0, 0, 0, 0, finished=True)
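train() reads env_name, save_name, and agent_args from module-level globals. A minimal sketch of a driver that sets them before calling the function; the values for save_name and agent_args are placeholders, not taken from the original project:

# Hypothetical driver for the snippet above; values are illustrative.
env_name = 'DobroHalfCheetah-v0'   # environment id referenced inside train()
save_name = 'halfcheetah'          # prefix used for loggers/checkpoints (assumed)
agent_args = {}                    # whatever the project's Agent expects (assumed)

if __name__ == '__main__':
    train()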
Example No. 2
    def thread_func(t_idx):
        """A3C-style worker: runs a local copy of the agent, collects rollout
        segments of up to step_period steps, computes n-step return targets,
        and pushes the resulting gradients to the shared global_agent."""
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
                loss_logger, score_logger, graph
        env = gym.make(env_name)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        step = 0
        episode = 0

        while total_step < total_max_step:
            episode += 1
            #gradient reset & parameter synchronize
            agent.update_parameter(global_agent)
            ###
            start_step = step
            states = []
            actions = []
            rewards = []
            score = 0
            cnt = 0
            state = env.reset()
            while True:
                cnt += 1
                step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                ####### modify reward function #######
                #reward = 200-cnt if done else 0
                reward += 10
                ####### modify reward function #######
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                score += reward
                if done or step - start_step == step_period:
                    # End of a rollout segment: bootstrap the return from the critic's
                    # value of next_state unless the episode terminated.
                    ret = 0 if done else agent.get_value(next_state)
                    targets = []
                    # Accumulate discounted returns walking the segment backward.
                    for i in range(len(states)):
                        ret = rewards[-i - 1] + gamma * ret
                        targets.append(ret)
                    # Targets were built last-step-first; reverse to align with states.
                    targets = targets[::-1]
                    p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                        states, actions, targets)
                    global_agent.update_with_gradients(p_grad, v_grad)
                    #loss_logger.write([step-start_step,p_loss,v_loss])
                    if done:
                        break
                    agent.update_parameter(global_agent)
                    start_step = step
                    states = []
                    actions = []
                    rewards = []
                state = next_state
            #score_logger.write([cnt, score])
            if t_idx == 0:
                print(score)
                graph.update(score, p_loss, v_loss, entropy)
                if episode % 100 == 0: global_agent.save()
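The block that fills targets in the loop above is an n-step discounted return with a bootstrapped tail value. A minimal standalone sketch of the same computation (the function name and example values are illustrative, not from the original code):

# Sketch of the n-step return used above: seed with the critic's estimate of the
# state after the segment (0 if the episode ended), walk the rewards backward,
# then reverse so targets[k] lines up with states[k].
def n_step_returns(rewards, gamma, bootstrap_value):
    ret = bootstrap_value
    targets = []
    for r in reversed(rewards):
        ret = r + gamma * ret
        targets.append(ret)
    return targets[::-1]

# Example: n_step_returns([1.0, 1.0, 1.0], 0.99, 5.0) -> [~7.8216, ~6.8905, 5.95]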
Example No. 3
    def thread_func(t_idx):
        """Worker variant that also records per-step done flags, so a rollout
        segment can span episode boundaries; the discounted return is reset at
        every terminal step when targets are computed."""
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
                loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
        env = gym.make(env_name)
        env.unwrapped.initialize(is_render=False)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        episode = 0
        step = 0

        p_loss = None
        v_loss = None
        entropy = None

        #gradient reset & parameter synchronize
        agent.update_parameter(global_agent)
        start_step = step
        states = []
        actions = []
        rewards = []
        dones = []

        score = 0
        state = env.reset()
        while total_step < total_max_step:
            step += 1
            total_step += 1

            action = agent.get_action(state, True)
            #if action[0] > 0:
            #    a_t = 1
            #else :
            #    a_t = 0
            next_state, reward, done, info = env.step(action)
            #next_state, reward, done, info = env.step(a_t)
            ####### modify reward function #######
            #reward = 200-cnt if done else 0
            #reward /= 10
            ####### modify reward function #######
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
            score += reward

            if step - start_step == step_period:
                # End of an n-step segment: bootstrap from the critic unless the last
                # transition was terminal.
                ret = 0 if done else agent.get_value(next_state)
                targets = []
                # Walk the segment backward, resetting the return whenever a terminal
                # step is crossed so returns do not leak across episode boundaries.
                for i in range(len(states)):
                    if dones[-i - 1]:
                        ret = 0
                    #elif i > 0:
                    #    ret = agent.get_value(states[-i])
                    ret = rewards[-i - 1] + gamma * ret
                    targets.append(ret)
                # Reverse so targets[k] matches states[k].
                targets = targets[::-1]
                p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                    states, actions, targets)
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                entropies.append(entropy)
                global_agent.update_with_gradients(p_grad, v_grad)
                #loss_logger.write([step-start_step,p_loss,v_loss])
                agent.update_parameter(global_agent)
                if t_idx == 0:
                    graph.update(np.mean(scores), np.mean(p_losses),
                                 np.mean(v_losses), np.mean(entropies))

                start_step = step
                states = []
                actions = []
                rewards = []
                dones = []

            state = next_state
            #score_logger.write([cnt, score])
            if done:
                episode += 1
                if t_idx == 0 and episode % 10 == 0:
                    global_agent.save()
                scores.append(score)
                print(t_idx, score)
                score = 0
                state = env.reset()
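Both worker variants read and write module-level state shared across threads (total_step, global_agent, the logging deques). A minimal sketch of how such workers might be launched with Python threads; the worker count and the rest of the setup are assumptions, since the original launcher is not part of the snippet:

# Hypothetical launcher; assumes thread_func and the other globals it reads
# (env_name, global_agent, step_period, gamma, graph, ...) are already defined.
import threading

total_step = 0
total_max_step = int(1e6)
n_workers = 4  # assumed

threads = [threading.Thread(target=thread_func, args=(i,)) for i in range(n_workers)]
for t in threads:
    t.start()
for t in threads:
    t.join()

Note that the shared total_step += 1 is not synchronized across threads; the snippets use it only as a rough global stopping condition, not as an exact step count.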