def train():
    """Off-policy training loop (replay-memory agent, e.g. DDPG-style).

    Runs episodes, stores transitions in the agent's replay memory, and
    trains once per environment step after warm-up. Relies on module-level
    ``env_name``, ``save_name`` and ``agent_args``.
    """
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    if env_name == 'DobroHalfCheetah-v0':
        # This custom env needs explicit initialization before use.
        env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)

    episodes = int(5e5)
    save_freq = 1
    save_period = 1000

    # Rolling windows for the live graph.
    # NOTE(review): `entropies` actually accumulates Q-value estimates from
    # agent.get_value, and `scores` accumulates per-step rewards — the names
    # are misleading but kept for compatibility with the plotting labels.
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period)

    for episode in range(episodes):
        state = env.reset()
        agent.actor_noise.reset()  # fresh exploration noise each episode
        done = False
        score = 0
        step = 0
        while not done:
            step += 1
            action = agent.get_action(state, True)
            next_state, reward, done, info = env.step(action)
            transition = [
                np.array(state, np.float32), action, reward, done,
                np.array(next_state, np.float32)
            ]
            agent.replay_memory.append(transition)
            ########################
            if len(agent.replay_memory) > agent.train_start:
                # Train once per step once the replay buffer is warmed up.
                v_loss, p_loss = agent.train()
                v_loss_logger.write([1, v_loss])
                p_loss_logger.write([1, p_loss])
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                value = agent.get_value(state, action)
                entropies.append(value)
                scores.append(reward)
                graph.update(np.mean(scores), np.mean(p_losses),
                             np.mean(v_losses), np.mean(entropies))
            state = next_state
            score += reward
        print(episode, score, agent.epsilon)
        score_logger.write([step, score])
        if (episode + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()
    graph.update(0, 0, 0, 0, finished=True)
def train():
    """On-policy training loop for a constrained (cost-aware) agent.

    Collects up to ``max_steps`` steps of trajectories per epoch, computes
    GAE advantages/targets for both reward and cost signals, then performs
    one agent update. Relies on module-level ``env_name``, ``save_name``,
    ``agent_args``, ``hazard_dist`` and ``get_cost``.
    """
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    graph = Graph(
        1000, save_name,
        ['score', 'cost', 'value loss', 'cost value loss', 'kl divergence'])

    max_steps = 4000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 500
    save_freq = 10
    log_length = 10

    # Rolling windows used for console/graph smoothing.
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    for epoch in range(epochs):
        # Batch containers accumulated across episodes within one epoch.
        states, actions = [], []
        targets, cost_targets = [], []
        gaes, cost_gaes = [], []
        avg_costs = []
        ep_step = 0

        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards, temp_costs = [], []
            values, cost_values = [], []

            while True:
                step += 1
                ep_step += 1
                assert env.observation_space.contains(state)
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                assert env.action_space.contains(clipped_action)
                next_state, reward, done, info = env.step(clipped_action)

                # Predicted cost from the distance to the nearest hazard.
                h_dist = hazard_dist(env.hazards_pos, env.world.robot_pos())
                predict_cost = get_cost(h_dist)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
                # Log the *actual* env cost so it can be compared against
                # the discretized/predicted cost above.
                cost += info.get('cost', 0)
                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                # Episode was truncated by the step limit: bootstrap with
                # the value estimates of the final state.
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                # Episode terminated early: terminal value is zero.
                value = 0
                cost_value = 0
                print("done before max_ep_len...")

            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)

            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)

        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])
        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        graph.update([
            np.mean(scores),
            np.mean(costs),
            np.mean(v_losses),
            np.mean(cost_v_losses),
            np.mean(kl_divergence)
        ])

        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()
    graph.update(None, finished=True)
def train():
    """On-policy training loop (PPO/GAE-style agent).

    Collects up to ``max_steps`` environment steps per epoch across
    variable-length episodes, computes GAE advantages and value targets,
    and performs one agent update per epoch. Relies on module-level
    ``env_name``, ``save_name`` and ``agent_args``.
    """
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    p_loss_logger = Logger(save_name, 'p_loss')
    v_loss_logger = Logger(save_name, 'v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    graph = Graph(
        1000, save_name,
        ['score', 'policy loss', 'value loss', 'kl divergence', 'entropy'])

    episodes = 10
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    epochs = int(1e5)
    save_freq = 10
    save_period = 10

    # Rolling windows for console/graph smoothing.
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    kl_divergence = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        # Batch containers accumulated across episodes within one epoch.
        states, actions = [], []
        targets, gaes = [], []
        next_states, rewards = [], []
        ep_step = 0

        # for episode in range(episodes):
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            values = []

            while True:
                step += 1
                ep_step += 1
                action, clipped_action, value = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                next_states.append(next_state)
                rewards.append(reward)
                values.append(value)

                state = next_state
                score += reward
                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                # Truncated by the step limit: bootstrap with the value of
                # the final state.
                action, clipped_action, value = agent.get_action(state, True)
            else:
                # Episode ended before the step limit (agent "died"), so
                # the terminal value must be zero — no bootstrapping.
                value = 0
                print("done before max_ep_len...")

            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            targets += list(temp_targets)
            gaes += list(temp_gaes)

            score_logger.write([step, score])
            scores.append(score)

        trajs = [states, actions, targets, next_states, rewards, gaes]
        p_loss, v_loss, kl, entropy = agent.train(trajs)

        p_loss_logger.write([ep_step, p_loss])
        v_loss_logger.write([ep_step, v_loss])
        kl_logger.write([ep_step, kl])
        p_losses.append(p_loss)
        v_losses.append(v_loss)
        kl_divergence.append(kl)
        entropies.append(entropy)

        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(kl_divergence), np.mean(entropies))
        graph.update([
            np.mean(scores),
            np.mean(p_losses),
            np.mean(v_losses),
            np.mean(kl_divergence),
            np.mean(entropies)
        ])

        if (epoch + 1) % save_freq == 0:
            agent.save()
            p_loss_logger.save()
            v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
    graph.update(None, finished=True)
def train():
    """On-policy training loop with Monte-Carlo return targets.

    Runs a fixed number of episodes per epoch, computes discounted returns
    for each episode, and performs one agent update per epoch. Relies on
    module-level ``env_name``, ``save_name`` and ``agent_args``.

    Fix: discounted-return targets are now allocated as a float array.
    The previous ``np.zeros_like(temp_rewards)`` inherited the dtype of the
    reward list — if rewards were Python ints the array was integer-typed
    and ``temp_targets[t] = ret`` silently truncated the discounted returns.
    """
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)

    episodes = 10
    epochs = int(1e5)
    save_freq = 10
    save_period = 100

    # Rolling windows for console/graph smoothing.
    # NOTE(review): `entropies` stores the KL value returned by agent.train.
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        ep_step = 0
        for episode in range(episodes):
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            while not done:
                step += 1
                ep_step += 1
                action, clipped_action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)
                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                state = next_state
                score += reward
            score_logger.write([step, score])
            scores.append(score)
            # Discounted Monte-Carlo returns, computed backwards.
            # Float array regardless of the reward dtype (see docstring).
            temp_targets = np.zeros(len(temp_rewards))
            ret = 0
            for t in reversed(range(len(temp_rewards))):
                ret = temp_rewards[t] + agent.discount_factor * ret
                temp_targets[t] = ret
            targets += list(temp_targets)
        trajs = [states, actions, targets]
        v_loss, p_objective, kl = agent.train(trajs)
        v_loss_logger.write([ep_step, v_loss])
        p_loss_logger.write([ep_step, p_objective])
        p_losses.append(p_objective)
        v_losses.append(v_loss)
        entropies.append(kl)
        # print(v_loss, p_objective, kl)
        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(entropies))
        graph.update(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
                     np.mean(entropies))
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()
    graph.update(0, 0, 0, 0, finished=True)
def train():
    """On-policy GAE training loop for a binary-action environment.

    The agent emits a continuous action which is thresholded at zero into a
    discrete action {0, 1} before stepping the env. Relies on module-level
    ``env_name``, ``save_name`` and ``agent_args``.
    """
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name,
                  ['score', 'policy objective', 'value loss', 'kl divergence'])

    episodes = 10
    epochs = int(1e5)
    save_freq = 10
    save_period = 10

    # Rolling windows for console/graph smoothing.
    p_objectives = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    kl_divergence = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        # Batch containers accumulated across episodes within one epoch.
        states, actions = [], []
        targets, gaes = [], []
        next_states, rewards = [], []
        ep_step = 0

        for episode in range(episodes):
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            values = []
            while not done:
                step += 1
                ep_step += 1
                action, clipped_action, value = agent.get_action(state, True)
                # Threshold the continuous action into a discrete one.
                a_t = 1 if clipped_action > 0 else 0
                next_state, reward, done, info = env.step(a_t)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                next_states.append(next_state)
                rewards.append(reward)
                values.append(value)

                state = next_state
                score += reward
            score_logger.write([step, score])
            scores.append(score)
            # NOTE(review): this always bootstraps the final state's value,
            # even when the episode terminated (done=True) — other variants
            # in this file use 0 for true terminals; confirm this is
            # intended (e.g. for time-limit-terminated envs).
            action, clipped_action, value = agent.get_action(state, True)
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            targets += list(temp_targets)
            gaes += list(temp_gaes)

        trajs = [states, actions, targets, next_states, rewards, gaes]
        v_loss, p_objective, kl = agent.train(trajs)

        v_loss_logger.write([ep_step, v_loss])
        kl_logger.write([ep_step, kl])
        p_objectives.append(p_objective)
        v_losses.append(v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(p_objectives), np.mean(v_losses),
              np.mean(kl_divergence))
        graph.update([
            np.mean(scores),
            np.mean(p_objectives),
            np.mean(v_losses),
            np.mean(kl_divergence)
        ])

        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
    graph.update(None, finished=True)
def train():
    """A3C training entry point: builds the global agent and worker threads.

    Each worker runs ``thread_func``, collecting up to ``step_period`` steps
    before computing gradients locally and applying them to the shared
    global agent. Relies on the module-level ``env_name``.
    """
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
        loss_logger, score_logger, graph
    gamma = 0.99
    num_thread = 10
    total_step = 0
    total_max_step = 1e6
    step_period = 1e3
    # Per-thread rollout length so the combined period stays ~step_period.
    step_period = int(step_period / num_thread)
    save_name = env_name.split('-')[0]

    env = gym.make(env_name)
    global_agent = Agent("global", env, save_name, gamma)
    loss_logger = Logger(save_name, 'loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), 'A3C')
    env.close()  # only needed to construct the global agent

    def thread_func(t_idx):
        # Worker loop: local rollouts -> gradients -> global update -> sync.
        # NOTE(review): total_step is incremented from many threads without
        # a lock; under CPython this is a benign-looking race used only as a
        # stopping heuristic — confirm that's acceptable.
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
            loss_logger, score_logger, graph
        env = gym.make(env_name)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        step = 0
        episode = 0
        while total_step < total_max_step:
            episode += 1
            # Gradient reset & parameter synchronization with the global net.
            agent.update_parameter(global_agent)
            ###
            start_step = step
            states, actions, rewards = [], [], []
            score = 0
            cnt = 0
            state = env.reset()
            while True:
                cnt += 1
                step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                ####### modify reward function #######
                # reward = 200-cnt if done else 0
                reward += 10
                ####### modify reward function #######
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                score += reward
                if done or step - start_step == step_period:
                    # Bootstrap only when the rollout was cut, not terminal.
                    ret = 0 if done else agent.get_value(next_state)
                    targets = []
                    for i in range(len(states)):
                        ret = rewards[-i - 1] + gamma * ret
                        targets.append(ret)
                    targets = targets[::-1]
                    p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                        states, actions, targets)
                    global_agent.update_with_gradients(p_grad, v_grad)
                    # loss_logger.write([step-start_step,p_loss,v_loss])
                    if done:
                        break
                    agent.update_parameter(global_agent)
                    start_step = step
                    states, actions, rewards = [], [], []
                state = next_state
            # score_logger.write([cnt, score])
            if t_idx == 0:
                # Only worker 0 reports to console/graph.
                print(score)
                graph.update(score, p_loss, v_loss, entropy)
            if episode % 100 == 0:
                global_agent.save()

    threads = []
    for i in range(num_thread):
        threads.append(threading.Thread(target=thread_func, args=(i, )))
        threads[-1].start()
    for thread in threads:
        thread.join()
    graph.update(0, 0, 0, 0, True)
def train():
    """Off-policy training loop (replay-memory SAC-style agent).

    Steps the environment continuously, storing transitions, and performs
    a burst of ``step_per_training`` gradient updates every
    ``step_per_training`` environment steps once the buffer is warmed up.
    Relies on module-level ``env_name``, ``save_name`` and ``agent_args``.

    Fix: ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24;
    ``np.float(done)`` raises ``AttributeError`` on modern NumPy. The
    builtin ``float(done)`` is the exact drop-in replacement.
    """
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name,
                  ['score', 'policy loss', 'Q value loss', 'entropy'])

    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    start_training_after_steps = 1000
    step_per_training = 50
    epochs = 1000
    save_freq = 1
    record_length = 10

    # Rolling windows sized to hold ~record_length episodes of updates.
    p_losses = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    q_losses = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    entropies = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    scores = deque(maxlen=record_length)
    total_step = 0

    for epoch in range(epochs):
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            score = 0
            step = 0
            while True:
                step += 1
                ep_step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                # A time-limit truncation is not a true terminal state:
                # store done=False so the critic still bootstraps.
                done = False if step >= max_ep_len else done
                agent.replay_memory.append(
                    [state, action, reward, float(done), next_state])
                if len(agent.replay_memory) > start_training_after_steps and (
                        total_step + 1) % step_per_training == 0:
                    # One burst of updates per step_per_training env steps.
                    for _ in range(step_per_training):
                        p_loss, q_loss, entropy = agent.train()
                        p_losses.append(p_loss)
                        q_losses.append(q_loss)
                        entropies.append(entropy)
                    print(np.mean(scores), np.mean(p_losses),
                          np.mean(q_losses), np.mean(entropies))
                state = next_state
                score += reward
                if done or step >= max_ep_len:
                    break
            score_logger.write([step, score])
            scores.append(score)
        graph.update([
            np.mean(scores),
            np.mean(p_losses),
            np.mean(q_losses),
            np.mean(entropies)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            score_logger.save()
    graph.update(None, finished=True)
next_states.append(next_state) score += reward if done: break state = next_state scores.append(score) targets = [] for i in range(len(states)): if dones[-i - 1]: #ret = 0 ret = agent.get_value(next_states[-i - 1]) value_list.append(ret) ret = rewards[-i - 1] + gamma * ret targets.append(ret) targets = targets[::-1] for i in range(value_epoch): v_loss = agent.value_train(states, actions, targets) for i in range(policy_epoch): p_loss, entropy = agent.policy_train(states, actions, targets) p_losses.append(p_loss) v_losses.append(v_loss) entropies.append(entropy) graph.update( np.mean(scores), np.mean(p_losses), [np.mean(v_losses), np.mean(value_list)], np.mean(entropies)) if (total_iter + 1) % 10 == 0: print(np.mean(scores)) agent.save() graph.update(0, 0, 0, 0, True)
def train():
    """A3C training entry point (continuous-action variant).

    Builds the shared global agent and spawns worker threads that run
    ``thread_func``: each worker collects ``step_period`` steps, computes
    gradients locally, pushes them to the global agent, and re-syncs.
    Relies on the module-level ``env_name``.
    """
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
        loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
    gamma = 0.99
    num_thread = 10
    total_step = 0
    total_max_step = 1e7
    step_period = 1e4  # 1e4
    # Per-thread rollout length so the combined period stays ~step_period.
    step_period = int(step_period / num_thread)
    save_name = env_name.split('-')[0]

    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    global_agent = Agent("global", env, save_name, gamma)
    loss_logger = Logger(save_name, 'loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), 'A3C')
    env.close()  # only needed to construct the global agent

    # Shared rolling windows written by all workers, plotted by worker 0.
    p_losses = deque(maxlen=step_period)
    v_losses = deque(maxlen=step_period)
    entropies = deque(maxlen=step_period)
    scores = deque(maxlen=step_period)

    def thread_func(t_idx):
        # Worker loop: fixed-length rollouts spanning episode boundaries.
        # NOTE(review): total_step and the shared deques are touched from
        # many threads without a lock — confirm this is acceptable.
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
            loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
        env = gym.make(env_name)
        env.unwrapped.initialize(is_render=False)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        episode = 0
        step = 0
        p_loss = None
        v_loss = None
        entropy = None

        # Gradient reset & parameter synchronization with the global net.
        agent.update_parameter(global_agent)
        start_step = step
        states, actions, rewards, dones = [], [], [], []
        score = 0
        state = env.reset()

        while total_step < total_max_step:
            step += 1
            total_step += 1
            action = agent.get_action(state, True)
            # if action[0] > 0:
            #     a_t = 1
            # else:
            #     a_t = 0
            next_state, reward, done, info = env.step(action)
            # next_state, reward, done, info = env.step(a_t)
            ####### modify reward function #######
            # reward = 200-cnt if done else 0
            # reward /= 10
            ####### modify reward function #######
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
            score += reward

            if step - start_step == step_period:
                # Rollout complete: build discounted targets backwards,
                # resetting the return at episode boundaries.
                ret = 0 if done else agent.get_value(next_state)
                targets = []
                for i in range(len(states)):
                    if dones[-i - 1]:
                        ret = 0
                    # elif i > 0:
                    #     ret = agent.get_value(states[-i])
                    ret = rewards[-i - 1] + gamma * ret
                    targets.append(ret)
                targets = targets[::-1]
                p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                    states, actions, targets)
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                entropies.append(entropy)
                global_agent.update_with_gradients(p_grad, v_grad)
                # loss_logger.write([step-start_step,p_loss,v_loss])
                agent.update_parameter(global_agent)
                if t_idx == 0:
                    # Only worker 0 drives the live graph.
                    graph.update(np.mean(scores), np.mean(p_losses),
                                 np.mean(v_losses), np.mean(entropies))
                start_step = step
                states, actions, rewards, dones = [], [], [], []

            state = next_state
            # score_logger.write([cnt, score])
            if done:
                episode += 1
                if t_idx == 0 and episode % 10 == 0:
                    global_agent.save()
                scores.append(score)
                print(t_idx, score)
                score = 0
                state = env.reset()

    threads = []
    for i in range(num_thread):
        threads.append(threading.Thread(target=thread_func, args=(i, )))
        threads[-1].start()
    for thread in threads:
        thread.join()
    graph.update(0, 0, 0, 0, True)