frame_idx = 0
episode_rewards = []
for i_episode in range(max_episodes):
    if ENV == 'Reacher':
        state = env.reset(SCREEN_SHOT)
    elif ENV == 'Pendulum':
        state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        # action = qt_opt.policy.act(state)
        action = qt_opt.cem_optimal_action(state)  # CEM search over the learned Q-function
        if ENV == 'Reacher':
            next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT)
        elif ENV == 'Pendulum':
            next_state, reward, done, _ = env.step(action)
            env.render()
        episode_reward += reward
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state

        if len(replay_buffer) > batch_size:
            qt_opt.update(batch_size)
            qt_opt.save_model(model_path)

    episode_rewards.append(episode_reward)
    if i_episode % 10 == 0:
        plot(episode_rewards)
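The key difference from a standard DQN-style loop is the call to qt_opt.cem_optimal_action(state): QT-Opt has no explicit policy network and instead selects actions by maximizing the learned Q-function with the cross-entropy method (CEM). The sketch below illustrates that search under assumed names (a q_net callable, action_dim, action_bound); it is not the repository's actual implementation.

import numpy as np

def cem_optimal_action_sketch(q_net, state, action_dim, n_iter=3,
                              n_samples=64, n_elite=6, action_bound=1.0):
    """Illustrative CEM search for argmax_a Q(state, a); names are assumptions."""
    mean = np.zeros(action_dim)
    std = np.ones(action_dim)
    for _ in range(n_iter):
        # sample candidate actions from the current Gaussian and clip to the valid range
        candidates = np.clip(
            np.random.normal(mean, std, size=(n_samples, action_dim)),
            -action_bound, action_bound)
        # score each candidate with the learned Q-function
        scores = np.array([float(q_net(state, a)) for a in candidates])
        # keep the best candidates and refit the sampling distribution to them
        elite = candidates[np.argsort(scores)[-n_elite:]]
        mean, std = elite.mean(axis=0), elite.std(axis=0) + 1e-6
    return mean  # the final mean approximates the Q-maximizing action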
def worker(id, sac_trainer, ENV, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size,
           explore_steps, update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC, hidden_dim, model_path):
    '''the function for sampling with multi-processing'''
    with torch.cuda.device(id % torch.cuda.device_count()):
        sac_trainer.to_cuda()
        # the sac_trainer instances differ across workers, but all networks and optimizers
        # inside them are shared; the replay buffer is the same one for every worker.
        print(sac_trainer, replay_buffer)

        if ENV == 'Reacher':
            NUM_JOINTS = 2
            LINK_LENGTH = [200, 140]
            INI_JOING_ANGLES = [0.1, 0.1]
            SCREEN_SIZE = 1000
            SPARSE_REWARD = False
            SCREEN_SHOT = False
            action_range = 10.0
            env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                          ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430],
                          render=True, change_goal=False)
            action_dim = env.num_actions
            state_dim = env.num_observations
        elif ENV == 'Pendulum':
            env = NormalizedActions(gym.make("Pendulum-v0"))
            action_dim = env.action_space.shape[0]
            state_dim = env.observation_space.shape[0]
            action_range = 1.

        frame_idx = 0
        rewards = []
        # training loop
        for eps in range(max_episodes):
            episode_reward = 0
            if ENV == 'Reacher':
                state = env.reset(SCREEN_SHOT)
            elif ENV == 'Pendulum':
                state = env.reset()

            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
                else:
                    action = sac_trainer.policy_net.sample_action()

                try:
                    if ENV == 'Reacher':
                        next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT)
                    elif ENV == 'Pendulum':
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)

                replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                frame_idx += 1

                # if len(replay_buffer) > batch_size:
                if replay_buffer.get_length() > batch_size:
                    for i in range(update_itr):
                        _ = sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY,
                                               target_entropy=-1. * action_dim)

                if eps % 10 == 0 and eps > 0:
                    # plot(rewards, id)
                    sac_trainer.save_model(model_path)

                if done:
                    break

            print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ', episode_reward)
            # if len(rewards) == 0:
            #     rewards.append(episode_reward)
            # else:
            #     rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)
            rewards_queue.put(episode_reward)

        sac_trainer.save_model(model_path)
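The worker above is meant to be launched in several processes that all write into one replay buffer and report episode rewards through rewards_queue. A minimal launch sketch follows, assuming torch.multiprocessing with the 'spawn' start method; sac_trainer, replay_buffer, and the hyperparameters are taken from the enclosing script, and the worker count is illustrative.

import torch.multiprocessing as mp

if __name__ == '__main__':
    mp.set_start_method('spawn', force=True)  # needed for CUDA tensors in subprocesses
    rewards_queue = mp.Queue()                # workers push per-episode rewards here

    num_workers = 4  # illustrative
    processes = []
    for i in range(num_workers):
        p = mp.Process(target=worker,
                       args=(i, sac_trainer, ENV, rewards_queue, replay_buffer,
                             max_episodes, max_steps, batch_size, explore_steps,
                             update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC,
                             hidden_dim, model_path))
        p.daemon = True
        p.start()
        processes.append(p)

    # gather episode rewards from all workers for monitoring
    rewards = [rewards_queue.get() for _ in range(num_workers * max_episodes)]

    for p in processes:
        p.join()

Note that an ordinary Python list is not shared across processes; the replay buffer has to be a process-safe object (for example one exposed through a multiprocessing manager) for every worker to push into the same storage.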
max_steps = 100
frame_idx = 0
rewards = []
for i_episode in range(max_episodes):
    q_loss_list = []
    policy_loss_list = []
    state = env.reset()
    episode_reward = 0

    for step in range(max_steps):
        if frame_idx > explore_steps:
            action = alg.policy_net.get_action(state)
        else:
            action = alg.policy_net.sample_action()
        next_state, reward, done, _ = env.step(action)
        if ENV != 'Reacher':
            env.render()
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        frame_idx += 1

        if len(replay_buffer) > batch_size:
            q_loss, policy_loss = alg.update(batch_size)
            q_loss_list.append(q_loss)
            policy_loss_list.append(policy_loss)

        if done:
            break
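All of these training loops touch the replay buffer only through push(...), a length check, and sampling inside the update methods. A minimal uniform-sampling buffer matching that interface could look like the sketch below; it is an illustration, not the implementation shipped with the code.

import random
import numpy as np

class ReplayBuffer:
    """Minimal uniform-sampling replay buffer with the interface used above (a sketch)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index for circular overwrite once the buffer is full

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):        # enables len(replay_buffer) > batch_size
        return len(self.buffer)

    def get_length(self):     # alias used by the multiprocessing worker
        return len(self.buffer)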
def worker(id, ):  # threads can read global variables directly
    '''the function for sampling with multi-threading'''
    print(sac_trainer, replay_buffer)

    if ENV == 'Reacher':
        env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                      ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430],
                      render=True, change_goal=False)
    elif ENV == 'Pendulum':
        env = NormalizedActions(gym.make("Pendulum-v0"))
    print(env)

    frame_idx = 0
    rewards = []
    # training loop
    for eps in range(max_episodes):
        episode_reward = 0
        if ENV == 'Reacher':
            state = env.reset(SCREEN_SHOT)
        elif ENV == 'Pendulum':
            state = env.reset()

        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
            else:
                action = sac_trainer.policy_net.sample_action()

            try:
                if ENV == 'Reacher':
                    next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT)
                elif ENV == 'Pendulum':
                    next_state, reward, done, _ = env.step(action)
                    env.render()
            except KeyboardInterrupt:
                print('Finished')
                sac_trainer.save_model(model_path)

            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward
            frame_idx += 1

            if len(replay_buffer) > batch_size:
                for i in range(update_itr):
                    _ = sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY,
                                           target_entropy=-1. * action_dim)

            if eps % 10 == 0 and eps > 0:
                plot(rewards, id)
                sac_trainer.save_model(model_path)

            if done:
                break

        print('Episode: ', eps, '| Episode Reward: ', episode_reward)
        # if len(rewards) == 0: rewards.append(episode_reward)
        # else: rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)
        sac_trainer.save_model(model_path)
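Because this variant reads sac_trainer, replay_buffer, and the hyperparameters as module-level globals, launching it only requires an id per thread. A minimal sketch with Python's threading module follows; the worker count is illustrative. Due to the GIL, the threads mainly overlap on environment stepping and I/O rather than on the network updates themselves.

import threading

num_workers = 4  # illustrative
threads = []
for i in range(num_workers):
    t = threading.Thread(target=worker, args=(i,))  # workers share the module's globals
    t.daemon = True
    t.start()
    threads.append(t)

for t in threads:
    t.join()  # wait for all sampling threads to finish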
env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
              ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430], render=True)
ppo = PPO()

if args.train:
    all_ep_r = []
    for ep in range(EP_MAX):
        s = env.reset(SCREEN_SHOT)
        s = s / 100.
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(EP_LEN):  # in one episode
            # env.render()
            a = ppo.choose_action(s)
            s_, r, done, distance2goal = env.step(a, SPARSE_REWARD, SCREEN_SHOT)
            s_ = s_ / 100.
            buffer_s.append(s)
            buffer_a.append(a)
            # print('r, norm_r: ', r, (r + 8) / 8)
            # the (r + 8) / 8 normalization makes the Reacher rewards nearly identical, so it does not help here
            # buffer_r.append((r + 8) / 8)  # normalize reward, found to be useful
            buffer_r.append(r)
            s = s_
            ep_r += r

            # update PPO
            if (t + 1) % BATCH == 0 or t == EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
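The listing is truncated just as it begins turning buffer_r into discounted returns bootstrapped from ppo.get_v(s_). A common form of that computation is sketched below for reference; the function name and the gamma value are illustrative, not the exact continuation of the original code.

def discounted_returns_sketch(rewards, v_last, gamma=0.9):
    """Bootstrap discounted returns from the critic's value of the last state (illustrative)."""
    returns = []
    running = v_last
    for r in reversed(rewards):
        running = r + gamma * running   # accumulate from the end of the rollout
        returns.append(running)
    returns.reverse()                   # restore chronological order
    return returns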