def thread_func(t_idx):
    # A3C worker: owns a local environment and a local copy of the network,
    # and periodically pushes gradients to the shared global agent.
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
        loss_logger, score_logger, graph
    env = gym.make(env_name)
    agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
    step = 0
    episode = 0
    while total_step < total_max_step:
        episode += 1
        # gradient reset & parameter synchronize with the global agent
        agent.update_parameter(global_agent)
        start_step = step
        states = []
        actions = []
        rewards = []
        score = 0
        cnt = 0
        state = env.reset()
        while True:
            cnt += 1
            step += 1
            total_step += 1
            action = agent.get_action(state, True)
            next_state, reward, done, info = env.step(action)
            ####### modify reward function #######
            #reward = 200-cnt if done else 0
            reward += 10
            ####### modify reward function #######
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            score += reward
            # push an update at episode end or every step_period steps
            if done or step - start_step == step_period:
                # bootstrap from the critic unless the episode terminated
                ret = 0 if done else agent.get_value(next_state)
                # accumulate n-step returns backwards, then reverse into time order
                targets = []
                for i in range(len(states)):
                    ret = rewards[-i - 1] + gamma * ret
                    targets.append(ret)
                targets = targets[::-1]
                p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                    states, actions, targets)
                global_agent.update_with_gradients(p_grad, v_grad)
                #loss_logger.write([step-start_step,p_loss,v_loss])
                if done:
                    break
                # re-sync with the global agent and start a fresh rollout
                agent.update_parameter(global_agent)
                start_step = step
                states = []
                actions = []
                rewards = []
            state = next_state
        #score_logger.write([cnt, score])
        if t_idx == 0:
            print(score)
            graph.update(score, p_loss, v_loss, entropy)
        if episode % 100 == 0:
            global_agent.save()
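# A minimal, self-contained sketch of the bootstrapped n-step return pass used
# above, with made-up rewards and bootstrap value (these numbers are assumptions,
# purely for illustration): targets[t] = r_t + gamma * targets[t+1], seeded with
# the critic's value of the state after the cut-off (or 0 at episode end).
def _nstep_return_demo():
    gamma = 0.99
    rewards = [1.0, 1.0, 1.0]   # hypothetical 3-step rollout
    bootstrap_value = 5.0       # stands in for agent.get_value(next_state)
    ret = bootstrap_value       # would be 0 if the episode had terminated
    targets = []
    for r in reversed(rewards):
        ret = r + gamma * ret
        targets.append(ret)
    return targets[::-1]        # time order: [7.8216, 6.8905, 5.95]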
def thread_func(t_idx):
    # Revised A3C worker: episode state persists across the outer loop, per-step
    # done flags are recorded so returns reset at episode boundaries, and losses
    # and scores are aggregated in shared lists for plotting.
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
        loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
    episode = 0
    step = 0
    p_loss = None
    v_loss = None
    entropy = None
    # gradient reset & parameter synchronize with the global agent
    agent.update_parameter(global_agent)
    start_step = step
    states = []
    actions = []
    rewards = []
    dones = []
    score = 0
    state = env.reset()
    while total_step < total_max_step:
        step += 1
        total_step += 1
        action = agent.get_action(state, True)
        #if action[0] > 0:
        #    a_t = 1
        #else:
        #    a_t = 0
        next_state, reward, done, info = env.step(action)
        #next_state, reward, done, info = env.step(a_t)
        ####### modify reward function #######
        #reward = 200-cnt if done else 0
        #reward /= 10
        ####### modify reward function #######
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        score += reward
        # push an update every step_period steps
        if step - start_step == step_period:
            # bootstrap from the critic unless the last transition ended the episode
            ret = 0 if done else agent.get_value(next_state)
            # accumulate n-step returns backwards, resetting at episode boundaries
            targets = []
            for i in range(len(states)):
                if dones[-i - 1]:
                    ret = 0
                #elif i > 0:
                #    ret = agent.get_value(states[-i])
                ret = rewards[-i - 1] + gamma * ret
                targets.append(ret)
            targets = targets[::-1]
            p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                states, actions, targets)
            p_losses.append(p_loss)
            v_losses.append(v_loss)
            entropies.append(entropy)
            global_agent.update_with_gradients(p_grad, v_grad)
            #loss_logger.write([step-start_step,p_loss,v_loss])
            agent.update_parameter(global_agent)
            if t_idx == 0:
                # note: scores may still be empty early in training
                graph.update(np.mean(scores), np.mean(p_losses),
                             np.mean(v_losses), np.mean(entropies))
            start_step = step
            states = []
            actions = []
            rewards = []
            dones = []
        state = next_state
        #score_logger.write([cnt, score])
        if done:
            episode += 1
            if t_idx == 0 and episode % 10 == 0:
                global_agent.save()
            scores.append(score)
            print(t_idx, score)
            score = 0
            state = env.reset()
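# A minimal sketch of how the worker above could be launched; the thread count
# and the assumption that global_agent, the step counters, and the logging lists
# are already initialized at module level are mine, not taken from the original code.
if __name__ == "__main__":
    import threading

    num_threads = 8  # hypothetical number of asynchronous workers
    threads = [threading.Thread(target=thread_func, args=(i,), daemon=True)
               for i in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()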