def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    if env_name == 'DobroHalfCheetah-v0':
        env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)

    episodes = int(5e5)
    save_freq = 1
    save_period = 1000
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period)

    for episode in range(episodes):
        state = env.reset()
        agent.actor_noise.reset()
        done = False
        score = 0
        step = 0
        while not done:
            step += 1
            action = agent.get_action(state, True)
            next_state, reward, done, info = env.step(action)
            agent.replay_memory.append([
                np.array(state, np.float32), action, reward, done,
                np.array(next_state, np.float32)
            ])

            # start training once the replay memory holds enough samples
            if len(agent.replay_memory) > agent.train_start:
                v_loss, p_loss = agent.train()
                v_loss_logger.write([1, v_loss])
                p_loss_logger.write([1, p_loss])
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                value = agent.get_value(state, action)
                entropies.append(value)
                scores.append(reward)
                graph.update(np.mean(scores), np.mean(p_losses),
                             np.mean(v_losses), np.mean(entropies))

            state = next_state
            score += reward

        print(episode, score, agent.epsilon)
        score_logger.write([step, score])
        if (episode + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()

    graph.update(0, 0, 0, 0, finished=True)
def train():
    global env, env_name
    env_name = env_name.split('-')[0]
    agent = Agent(env, env_name)

    loss_logger = Logger(env_name, 'loss')
    score_logger = Logger(env_name, 'score')

    action_low = env.action_space.low[0]
    action_high = env.action_space.high[0]
    episodes = int(5e2)
    avg_Q = deque(maxlen=200)

    for episode in range(episodes):
        state = env.reset()
        done = False
        score = 0
        step = 0
        while not done:
            step += 1
            action = agent.get_action(state)
            # map the discrete action index onto the continuous action range
            a_t = action / (agent.n_action - 1)
            a_t = a_t * (action_high - action_low) + action_low
            next_state, reward, done, info = env.step([a_t])
            agent.replay_memory.append([
                np.array(state, np.float32), action, reward, done,
                np.array(next_state, np.float32)
            ])

            # start training once the replay memory holds enough samples
            if len(agent.replay_memory) > agent.train_start:
                Q, loss = agent.train()
                loss_logger.write([1, loss])
                avg_Q.append(Q)

            state = next_state
            score += reward

        print(episode, score, agent.epsilon, np.mean(avg_Q))
        agent.update_target_model()
        score_logger.write([step, score])
        if (episode + 1) % agent.save_freq == 0:
            agent.save()
            loss_logger.save()
            score_logger.save()
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    graph = Graph(1000, save_name,
                  ['score', 'cost', 'value loss', 'cost value loss', 'kl divergence'])

    max_steps = 4000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 500
    save_freq = 10
    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                step += 1
                ep_step += 1
                assert env.observation_space.contains(state)
                action, clipped_action, value, cost_value = agent.get_action(state, True)
                assert env.action_space.contains(clipped_action)
                next_state, reward, done, info = env.step(clipped_action)

                # predicted (continuous) cost from the distance to the nearest hazard
                h_dist = hazard_dist(env.hazards_pos, env.world.robot_pos())
                predict_cost = get_cost(h_dist)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
                # log the true cost so it can be compared with the discrete predicted cost
                cost += info.get('cost', 0)

                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                # bootstrap with the value of the final state when the episode was truncated
                action, clipped_action, value, cost_value = agent.get_action(state, True)
            else:
                # the episode terminated before max_ep_len, so the terminal value is zero
                value = 0
                cost_value = 0
                print("done before max_ep_len...")

            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)

            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)

        trajs = [states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])
        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence), np.mean(c_objectives))
        graph.update([
            np.mean(scores), np.mean(costs), np.mean(v_losses),
            np.mean(cost_v_losses), np.mean(kl_divergence)
        ])

        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()

    graph.update(None, finished=True)
def train():
    global env_name, save_name, agent_args, env_real, env_sim, nets
    env_real = env_real.Env_real(False)
    env_sim = env_sim.Env_sim(True)
    GAT_model = nets.GAT_net(env_real, env_sim, GAT_args)
    agent = Agent(env_sim, agent_args)
    # wandb.init(project=save_name)

    accum_step = 0
    avg_temp_cost = 0

    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')

    max_steps = 2000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 2  # 50
    save_freq = 1
    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)

    # resume from a backup file if one exists
    is_backup = False
    backup_name = '{}/backup.pkl'.format(save_name)
    if os.path.isfile(backup_name):
        # input_value = raw_input('backup file exists. wanna continue the last work?( y/n )')
        # if input_value != 'n':
        #     is_backup = True
        is_backup = True
    if is_backup:
        with open(backup_name, 'rb') as f:
            backup_list = pickle.load(f)
        start_iter = backup_list[0]
    else:
        start_iter = 0
        backup_list = [start_iter]

    for epoch in range(start_iter, epochs):
        print("=" * 20)
        print("Epoch : {}".format(epoch + 1))
        # input_value = raw_input("wanna continue episodes?( y/n )")
        # if input_value == 'n':
        #     break

        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            # input_value = raw_input("ready?")
            state = env_sim.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                if rospy.is_shutdown():
                    sys.exit()
                step += 1
                ep_step += 1
                action, clipped_action, value, cost_value = agent.get_action(state, True)

                # action transformer by GAT
                transformed_next_state = GAT_model.forward_transform(state, clipped_action)
                transformed_action = GAT_model.backward_transform(state, transformed_next_state)

                next_state, reward, done, info = env_sim.step(transformed_action)
                predict_cost = info['continuous_cost']

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)

                state = next_state
                score += reward
                cost += info.get('cost', 0)

                if done or step >= max_ep_len:
                    break

            print("step : {}, score : {}".format(step, score))
            if step >= max_ep_len:
                # bootstrap with the value of the final state when the episode was truncated
                action, clipped_action, value, cost_value = agent.get_action(state, True)
            else:
                value = 0
                cost_value = 0
                print("done before max_ep_len...")

            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)

            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)

            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)
            accum_step += step
            avg_temp_cost = np.mean(temp_costs)
            # wandb.log({'step': accum_step, 'score': score, 'cost': cost,
            #            'avg_temp_cost': avg_temp_cost})

        trajs = [states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(trajs)

        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])
        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)

        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence), np.mean(c_objectives))

        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()

        # save the backup so that training can resume from the next epoch
        backup_list[0] = epoch + 1
        with open(backup_name, 'wb') as f:
            pickle.dump(backup_list, f)
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    p_loss_logger = Logger(save_name, 'p_loss')
    v_loss_logger = Logger(save_name, 'v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name,
                  ['score', 'policy loss', 'value loss', 'kl divergence', 'entropy'])

    episodes = 10
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    epochs = int(1e5)
    save_freq = 10
    save_period = 10
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    kl_divergence = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        next_states = []
        rewards = []
        gaes = []
        ep_step = 0
        # for episode in range(episodes):
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            values = []
            while True:
                step += 1
                ep_step += 1
                action, clipped_action, value = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                next_states.append(next_state)
                rewards.append(reward)
                values.append(value)

                state = next_state
                score += reward

                if done or step >= max_ep_len:
                    break

            if step >= max_ep_len:
                action, clipped_action, value = agent.get_action(state, True)
            else:
                # the episode terminated before max_ep_len, so bootstrap with value = 0
                value = 0
                print("done before max_ep_len...")

            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(temp_rewards, values, next_values)
            targets += list(temp_targets)
            gaes += list(temp_gaes)

            score_logger.write([step, score])
            scores.append(score)

        trajs = [states, actions, targets, next_states, rewards, gaes]
        p_loss, v_loss, kl, entropy = agent.train(trajs)

        p_loss_logger.write([ep_step, p_loss])
        v_loss_logger.write([ep_step, v_loss])
        kl_logger.write([ep_step, kl])
        p_losses.append(p_loss)
        v_losses.append(v_loss)
        kl_divergence.append(kl)
        entropies.append(entropy)

        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(kl_divergence), np.mean(entropies))
        graph.update([
            np.mean(scores), np.mean(p_losses), np.mean(v_losses),
            np.mean(kl_divergence), np.mean(entropies)
        ])

        if (epoch + 1) % save_freq == 0:
            agent.save()
            p_loss_logger.save()
            v_loss_logger.save()
            kl_logger.save()
            score_logger.save()

    graph.update(None, finished=True)
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)

    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)

    episodes = 10
    epochs = int(1e5)
    save_freq = 10
    save_period = 100
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)

    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        ep_step = 0
        for episode in range(episodes):
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            while not done:
                step += 1
                ep_step += 1
                action, clipped_action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)

                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)

                state = next_state
                score += reward

            score_logger.write([step, score])
            scores.append(score)

            # discounted Monte Carlo returns as value targets
            temp_targets = np.zeros_like(temp_rewards)
            ret = 0
            for t in reversed(range(len(temp_rewards))):
                ret = temp_rewards[t] + agent.discount_factor * ret
                temp_targets[t] = ret
            targets += list(temp_targets)

        trajs = [states, actions, targets]
        v_loss, p_objective, kl = agent.train(trajs)

        v_loss_logger.write([ep_step, v_loss])
        p_loss_logger.write([ep_step, p_objective])
        p_losses.append(p_objective)
        v_losses.append(v_loss)
        entropies.append(kl)

        # print(v_loss, p_objective, kl)
        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses), np.mean(entropies))
        graph.update(np.mean(scores), np.mean(p_losses), np.mean(v_losses), np.mean(entropies))

        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()

    graph.update(0, 0, 0, 0, finished=True)
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)

    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name, ['score', 'policy loss', 'Q value loss', 'entropy'])

    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    start_training_after_steps = 1000
    step_per_training = 50
    epochs = 1000
    save_freq = 1
    record_length = 10
    p_losses = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    q_losses = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    entropies = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    scores = deque(maxlen=record_length)

    total_step = 0
    for epoch in range(epochs):
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            score = 0
            step = 0
            while True:
                step += 1
                ep_step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                # do not treat a timeout as a true terminal state
                done = False if step >= max_ep_len else done
                agent.replay_memory.append(
                    [state, action, reward, float(done), next_state])

                if len(agent.replay_memory) > start_training_after_steps and \
                        (total_step + 1) % step_per_training == 0:
                    for _ in range(step_per_training):
                        p_loss, q_loss, entropy = agent.train()
                        p_losses.append(p_loss)
                        q_losses.append(q_loss)
                        entropies.append(entropy)
                    print(np.mean(scores), np.mean(p_losses),
                          np.mean(q_losses), np.mean(entropies))

                state = next_state
                score += reward
                if done or step >= max_ep_len:
                    break

            score_logger.write([step, score])
            scores.append(score)
            graph.update([
                np.mean(scores), np.mean(p_losses),
                np.mean(q_losses), np.mean(entropies)
            ])

        if (epoch + 1) % save_freq == 0:
            agent.save()
            score_logger.save()

    graph.update(None, finished=True)