Example #1
    def thread_func(t_idx):
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
                loss_logger, score_logger, graph
        env = gym.make(env_name)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        step = 0
        episode = 0

        while total_step < total_max_step:
            episode += 1
            # sync this worker's parameters with the global agent before the episode
            agent.update_parameter(global_agent)
            start_step = step
            states = []
            actions = []
            rewards = []
            score = 0
            cnt = 0
            state = env.reset()
            while True:
                cnt += 1
                step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                ####### modify reward function #######
                #reward = 200-cnt if done else 0
                reward += 10  # reward shaping: flat +10 bonus per step
                ####### modify reward function #######
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                score += reward
                if done or step - start_step == step_period:
                    # bootstrap the return from the critic unless the episode ended
                    ret = 0 if done else agent.get_value(next_state)
                    # accumulate discounted n-step returns backwards over the rollout
                    targets = []
                    for i in range(len(states)):
                        ret = rewards[-i - 1] + gamma * ret
                        targets.append(ret)
                    targets = targets[::-1]
                    # compute gradients locally and apply them to the shared global agent
                    p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                        states, actions, targets)
                    global_agent.update_with_gradients(p_grad, v_grad)
                    #loss_logger.write([step-start_step,p_loss,v_loss])
                    if done:
                        break
                    agent.update_parameter(global_agent)
                    start_step = step
                    states = []
                    actions = []
                    rewards = []
                state = next_state
            #score_logger.write([cnt, score])
            # only worker 0 reports progress and periodically saves the global model
            if t_idx == 0:
                print(score)
                graph.update(score, p_loss, v_loss, entropy)
                if episode % 100 == 0: global_agent.save()
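
The example above defines only the per-worker function; it assumes several threads share one global agent, A3C-style. The launcher below is an illustrative sketch, not part of the original example: the function name run_workers and the n_workers parameter are hypothetical, and the module-level names thread_func relies on (global_agent, env_name, save_name, gamma, total_step, total_max_step, step_period, graph) are assumed to already exist.

    import threading

    def run_workers(n_workers=4):
        # start one worker thread per index; each runs thread_func(t_idx) and
        # pushes its gradients into the shared global_agent
        threads = [threading.Thread(target=thread_func, args=(i,), daemon=True)
                   for i in range(n_workers)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
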
Example #2
    def thread_func(t_idx):
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
                loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
        env = gym.make(env_name)
        env.unwrapped.initialize(is_render=False)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        episode = 0
        step = 0

        p_loss = None
        v_loss = None
        entropy = None

        # sync this worker's parameters with the global agent before collecting a rollout
        agent.update_parameter(global_agent)
        start_step = step
        states = []
        actions = []
        rewards = []
        dones = []

        score = 0
        state = env.reset()
        while total_step < total_max_step:
            step += 1
            total_step += 1

            action = agent.get_action(state, True)
            #if action[0] > 0:
            #    a_t = 1
            #else :
            #    a_t = 0
            next_state, reward, done, info = env.step(action)
            #next_state, reward, done, info = env.step(a_t)
            ####### modify reward function #######
            #reward = 200-cnt if done else 0
            #reward /= 10
            ####### modify reward function #######
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
            score += reward

            if step - start_step == step_period:
                # bootstrap the return from the critic unless the rollout ends on a terminal step
                ret = 0 if done else agent.get_value(next_state)
                # accumulate discounted n-step returns backwards, resetting at episode boundaries
                targets = []
                for i in range(len(states)):
                    if dones[-i - 1]:
                        ret = 0
                    #elif i > 0:
                    #    ret = agent.get_value(states[-i])
                    ret = rewards[-i - 1] + gamma * ret
                    targets.append(ret)
                targets = targets[::-1]
                # compute gradients locally, push them to the shared global agent,
                # then pull the updated parameters back into this worker
                p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                    states, actions, targets)
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                entropies.append(entropy)
                global_agent.update_with_gradients(p_grad, v_grad)
                #loss_logger.write([step-start_step,p_loss,v_loss])
                agent.update_parameter(global_agent)
                # worker 0 plots running means; skip until at least one episode has
                # finished, otherwise np.mean(scores) would be NaN on an empty list
                if t_idx == 0 and scores:
                    graph.update(np.mean(scores), np.mean(p_losses),
                                 np.mean(v_losses), np.mean(entropies))

                start_step = step
                states = []
                actions = []
                rewards = []
                dones = []

            state = next_state
            #score_logger.write([cnt, score])
            # episode finished: record the score, occasionally save, and restart
            if done:
                episode += 1
                if t_idx == 0 and episode % 10 == 0:
                    global_agent.save()
                scores.append(score)
                print(t_idx, score)
                score = 0
                state = env.reset()
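
Both examples compute their training targets with the same backward recursion; Example #2 additionally resets the return at episode boundaries inside the rollout. The helper below restates that calculation as a standalone function so it can be checked in isolation. It is an illustrative sketch: the name n_step_targets is hypothetical and not part of the original code.

    def n_step_targets(rewards, dones, bootstrap_value, gamma):
        # walk the rollout backwards: reset the return at episode boundaries,
        # otherwise discount the running return and add the step reward
        ret = bootstrap_value
        targets = []
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                ret = 0.0
            ret = reward + gamma * ret
            targets.append(ret)
        return targets[::-1]

    # a 4-step rollout whose second step ends an episode; the bootstrap value
    # only influences the steps after that boundary
    print(n_step_targets([1.0, 1.0, 1.0, 1.0], [False, True, False, False], 0.5, 0.9))
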