def main(env_name):
    """
    Drive a Unity environment through the gym wrapper with random actions.

    Prints the environment description, shows the first observation when the
    observation space is one-dimensional, then plays ten episodes and prints
    the summed reward of each. The environment is always closed on exit.

    :param env_name: Name of the Unity environment binary to launch
    """
    unity_env = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)

    try:
        # Describe the environment.
        print(str(unity_env))

        # First reset; keep the observation so it can be displayed.
        first_obs = unity_env.reset()
        has_vector_obs = len(unity_env.observation_space.shape) == 1
        if has_vector_obs:
            print("Agent observations look like: \n{}".format(first_obs))

        for _ in range(10):
            unity_env.reset()
            total_reward = 0
            finished = False
            while not finished:
                random_action = unity_env.action_space.sample()
                _, step_reward, finished, _ = unity_env.step(random_action)
                total_reward += step_reward
            print("Total reward this episode: {}".format(total_reward))
    finally:
        unity_env.close()
class Chaser_v1(Environment):
    """Environment adapter around the visual, multi-agent Chaser v1 Unity build."""

    # Worker id passed to the next UnityEnv that gets created; bumped after
    # every construction.
    unity_env_worker_id = 0

    def __init__(self, platform):
        """Launch the Unity build matching ``platform`` and cache its shapes.

        :param platform: compared against ``OSName.MAC`` / ``OSName.WINDOWS``;
            any other value leaves the environment filename as ``None``.
        """
        env_filename = None
        if platform == OSName.MAC:
            env_filename = EnvironmentName.CHASER_V1_MAC.value
        elif platform == OSName.WINDOWS:
            env_filename = EnvironmentName.CHASER_V1_WINDOWS.value

        wrapped = UnityEnv(
            environment_filename=env_filename,
            worker_id=Chaser_v1.unity_env_worker_id,
            use_visual=True,
            multiagent=True,
        )
        self.env = wrapped.unwrapped
        self.increase_env_worker_id()
        super().__init__()

        self.action_shape = self.get_action_shape()
        self.state_shape = self.get_state_shape()
        # The first three entries of the observation shape feed the CNN
        # height / width / channels attributes, in that order.
        shape = self.state_shape
        self.cnn_input_height = shape[0]
        self.cnn_input_width = shape[1]
        self.cnn_input_channels = shape[2]
        self.observation_space = self.env.observation_space
        self.continuous = True

    @staticmethod
    def increase_env_worker_id():
        """Advance the class-wide worker id by one."""
        Chaser_v1.unity_env_worker_id += 1

    def get_n_states(self):
        """Return the fixed state count (3)."""
        return 3

    def get_n_actions(self):
        """Return the fixed action count (3)."""
        return 3

    def get_state_shape(self):
        """Return the shape of the wrapped environment's observation space."""
        return self.env.observation_space.shape

    def get_action_shape(self):
        """Return the shape of the wrapped environment's action space."""
        return self.env.action_space.shape

    def reset(self):
        """Reset the wrapped environment and return what it returns."""
        return self.env.reset()

    def step(self, action):
        """Step the wrapped environment.

        :return: ``(next_state, reward, adjusted_reward, done, info)``; the
            adjusted reward is currently the unmodified reward.
        """
        next_state, reward, done, info = self.env.step(action)
        return next_state, reward, reward, done, info

    def close(self):
        """Close the wrapped environment."""
        self.env.close()
def worker(id, td3_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size, explore_steps,
           update_itr, explore_noise_scale, eval_noise_scale, reward_scale, DETERMINISTIC, hidden_dim, model_path):
    """
    Sampling/training loop run by one process in the multi-processing TD3 setup.

    Each worker launches its own Unity environment (``worker_id = id + 15``),
    pushes transitions into ``replay_buffer``, runs ``update_itr`` trainer
    updates per step once the buffer holds more than ``batch_size`` items, and
    puts every episode's total reward on ``rewards_queue``.

    :param id: index of this worker (name kept for caller compatibility)
    :param td3_trainer: trainer exposing ``policy_net``, ``update`` and ``save_model``
    :param rewards_queue: queue receiving one total reward per episode
    :param replay_buffer: buffer exposing ``push`` and ``get_length``
    :param max_episodes: number of episodes to run
    :param max_steps: maximum number of steps per episode
    :param batch_size: batch size passed to ``td3_trainer.update``
    :param explore_steps: per-episode step count during which actions are
        sampled rather than taken from the policy (the counter restarts at 0
        each episode)
    :param update_itr: number of trainer updates per environment step
    :param explore_noise_scale: forwarded to ``policy_net.get_action``
    :param eval_noise_scale: forwarded to ``td3_trainer.update``
    :param reward_scale: forwarded to ``td3_trainer.update``
    :param DETERMINISTIC: forwarded to ``get_action`` and ``update``
    :param hidden_dim: unused in this function
    :param model_path: where ``td3_trainer.save_model`` writes the model
    :raises KeyboardInterrupt: re-raised after the model has been saved
    """
    print(td3_trainer, replay_buffer)
    env_name = "./tac_follow_new"
    env = UnityEnv(env_name, worker_id=id + 15, use_visual=False, use_both=True)
    try:
        # training loop
        for eps in range(max_episodes):
            frame_idx = 0
            episode_reward = 0
            state, info = env.reset()
            # state=state[:6]

            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = td3_trainer.policy_net.get_action(
                        state, deterministic=DETERMINISTIC, explore_noise_scale=explore_noise_scale)
                else:
                    action = td3_trainer.policy_net.sample_action()

                try:
                    next_state, reward, done, info = env.step(action)
                    # next_state = next_state[:6]
                except KeyboardInterrupt:
                    print('Finished')
                    td3_trainer.save_model(model_path)
                    # Stop here: continuing would push a transition built from
                    # unbound or stale next_state/reward/done values.
                    raise

                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

                # if len(replay_buffer) > batch_size:
                if replay_buffer.get_length() > batch_size:
                    for _ in range(update_itr):
                        td3_trainer.update(batch_size, deterministic=DETERMINISTIC,
                                           eval_noise_scale=eval_noise_scale, reward_scale=reward_scale)

                if done:
                    break

            # Periodic checkpoint, written once per qualifying episode.
            if eps % 10 == 0 and eps > 0:
                td3_trainer.save_model(model_path)

            print('Episode: ', eps, '| Episode Reward: ', episode_reward)
            rewards_queue.put(episode_reward)

        td3_trainer.save_model(model_path)
    finally:
        # Always release the Unity process, including on interrupt or error.
        env.close()
def test_multi_agent(mock_env):
    """Two mocked agents: single-agent mode must raise, multi-agent mode returns lists."""
    group_spec = create_mock_group_spec()
    step_result = create_mock_vector_step_result(num_agents=2)
    setup_mock_unityenvironment(mock_env, group_spec, step_result)

    # Two agents with multiagent=False is rejected.
    with pytest.raises(UnityGymException):
        UnityEnv(" ", multiagent=False)

    env = UnityEnv(" ", use_visual=False, multiagent=True)
    first_obs = env.reset()
    assert isinstance(first_obs, list)

    actions = [env.action_space.sample() for _ in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
    for per_agent in (obs, rew, done):
        assert isinstance(per_agent, list)
def test_gym_wrapper(mock_env):
    """Single-agent vector wrapper: reset/step return plain gym-style types."""
    brain_params = create_mock_brainparams()
    brain_info = create_mock_vector_braininfo()
    setup_mock_unityenvironment(mock_env, brain_params, brain_info)

    env = UnityEnv(" ", use_visual=False, multiagent=False)
    assert isinstance(env, UnityEnv)

    first_obs = env.reset()
    assert isinstance(first_obs, np.ndarray)

    sampled = env.action_space.sample()
    assert sampled.shape[0] == 2

    obs, rew, done, info = env.step(sampled)
    expectations = (
        (obs, np.ndarray),
        (rew, float),
        (done, bool),
        (info, dict),
    )
    for value, expected_type in expectations:
        assert isinstance(value, expected_type)
def test_gym_wrapper(mock_env):
    """Single-agent vector wrapper: the step observation lies in the observation space."""
    group_spec = create_mock_group_spec()
    step_result = create_mock_vector_step_result()
    setup_mock_unityenvironment(mock_env, group_spec, step_result)

    env = UnityEnv(" ", use_visual=False, multiagent=False)
    assert isinstance(env, UnityEnv)

    first_obs = env.reset()
    assert isinstance(first_obs, np.ndarray)

    sampled = env.action_space.sample()
    assert sampled.shape[0] == 2

    obs, rew, done, info = env.step(sampled)
    assert env.observation_space.contains(obs)
    expectations = (
        (obs, np.ndarray),
        (rew, float),
        (done, (bool, np.bool_)),
    )
    for value, expected_type in expectations:
        assert isinstance(value, expected_type)
class UnityEnvWrapper(gym.Env):
    """Gym environment that launches a Unity build and forwards calls to it.

    ``env_config`` is read both by attribute (``vector_index``,
    ``worker_index``) and by key (``"unity_worker_id"``).
    """

    def __init__(self, env_config):
        """Launch the Unity binary and expose its action/observation spaces.

        :param env_config: configuration object providing ``vector_index``,
            ``worker_index`` and the ``"unity_worker_id"`` entry
        """
        self.vector_index = env_config.vector_index
        self.worker_index = env_config.worker_index
        # Base id from the config offset by this worker's index.
        self.worker_id = env_config["unity_worker_id"] + env_config.worker_index
        # Name of the Unity environment binary to launch
        env_name = '/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux'
        self.env = UnityEnv(env_name, worker_id=self.worker_id, use_visual=False,
                            multiagent=False, no_graphics=True)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    def reset(self):
        """Reset the wrapped environment and return its result."""
        return self.env.reset()

    def step(self, action):
        """Step the wrapped environment with ``action`` and return its result."""
        return self.env.step(action)

    def close(self):
        """Close the wrapped environment so the launched Unity process is released."""
        self.env.close()
def test_multi_agent(mock_communicator, mock_launcher):
    """Two mocked agents: single-agent mode must raise, multi-agent mode returns lists."""
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False,
        visual_inputs=0,
        stack=False,
        num_agents=2,
    )

    # Test for incorrect number of agents.
    with pytest.raises(UnityGymException):
        UnityEnv(' ', multiagent=False)

    env = UnityEnv(' ', use_visual=False, multiagent=True)
    first_obs = env.reset()
    assert isinstance(first_obs, list)

    actions = [env.action_space.sample() for _ in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
    for per_agent in (obs, rew, done):
        assert isinstance(per_agent, list)
    assert isinstance(info, dict)
def test_gym_wrapper_visual(mock_env, use_uint8):
    """Visual-observation wrapper: reset/step types and observation-space membership."""
    spec = create_mock_group_spec(number_visual_observations=1)
    decision_step, terminal_step = create_mock_vector_steps(
        spec, number_visual_observations=1)
    setup_mock_unityenvironment(mock_env, spec, decision_step, terminal_step)

    env = UnityEnv(" ", use_visual=True, uint8_visual=use_uint8)
    assert isinstance(env, UnityEnv)

    first_obs = env.reset()
    assert isinstance(first_obs, np.ndarray)

    sampled = env.action_space.sample()
    assert sampled.shape[0] == 2

    obs, rew, done, info = env.step(sampled)
    assert env.observation_space.contains(obs)
    expectations = (
        (obs, np.ndarray),
        (rew, float),
        (done, (bool, np.bool_)),
        (info, dict),
    )
    for value, expected_type in expectations:
        assert isinstance(value, expected_type)
def test_gym_wrapper(mock_communicator, mock_launcher):
    """One mocked agent: multi-agent mode must raise, single-agent mode returns gym types."""
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False,
        visual_inputs=0,
        stack=False,
        num_agents=1,
    )

    # Test for incorrect number of agents.
    with pytest.raises(UnityGymException):
        UnityEnv(' ', use_visual=False, multiagent=True)

    env = UnityEnv(' ', use_visual=False)
    assert isinstance(env, UnityEnv)

    first_obs = env.reset()
    assert isinstance(first_obs, np.ndarray)

    sampled = env.action_space.sample()
    assert sampled.shape[0] == 2

    obs, rew, done, info = env.step(sampled)
    expectations = (
        (obs, np.ndarray),
        (rew, float),
        (done, bool),
        (info, dict),
    )
    for value, expected_type in expectations:
        assert isinstance(value, expected_type)
# env = gym.make('Pendulum-v0') # env=Reacher(render=True) # env = UnityEnv(env_name, worker_id=10, use_visual=True, use_both=True) # s, info = env.reset() # for t in range(100): # # env.render() # s, r, done, info = env.step(GLOBAL_PPO.choose_action(s)) GLOBAL_PPO.save(model_path) if args.test: env = UnityEnv(env_name, worker_id=np.random.randint(0, 10), use_visual=False, use_both=True) env.reset() GLOBAL_PPO = PPO() GLOBAL_PPO.load(model_path) test_steps = 100 test_episode = 10 for _ in range(test_episode): s, info = env.reset() for t in range(test_steps): # env.render() a = GLOBAL_PPO.choose_action(s[:6]) print(a) s, r, done, info = env.step(a)
class Worker(object):
    """PPO rollout worker whose policy input is pin displacement extracted from images.

    Each worker owns a Unity environment, shares the global PPO object and
    feeds discounted-return batches to the module-level ``QUEUE``.
    """

    def __init__(self, wid):
        """Create the worker's environment.

        :param wid: worker index, also used as the Unity ``worker_id``
        """
        self.wid = wid
        self.env = UnityEnv(env_name, worker_id=wid, use_visual=True, use_both=True)
        # self.env=Reacher(render=True)
        self.ppo = GLOBAL_PPO
        # Per-episode history of detected pin centre coordinates.
        self.pins_x = []
        self.pins_y = []

    def ImgProcess(self, img, Done=False):
        """Detect pins in ``img``, record them and redraw the pin plot.

        :param img: image handed to ``image_processing``
        :param Done: when true, the figure is cleared after saving
        :return: ``(pins_x, pins_y, displacement_x, displacement_y)`` where
            the first two are the latest recorded pin coordinates and the last
            two are their difference from the first recorded frame
        """
        cimg, edge_detected_image, contour_centers = image_processing(img)
        # cimg = large_circle_detect(cimg, edge_detected_image)  # this consumes most time
        cimg, VALID_DETECT = contour_center_check(contour_centers, cimg, NUM_PINS=NUM_PINS)
        # cv2.imwrite(save_path+str(filename),cimg)
        contour_centers = CenterRegister(contour_centers)
        if VALID_DETECT:  # pins detection correct
            reshape_contour_centers = np.array(contour_centers).transpose()
            self.pins_x.append(reshape_contour_centers[0])
            self.pins_y.append(reshape_contour_centers[1])

        # NOTE(review): if no valid frame has been recorded yet the indexing
        # below raises; the caller catches that and reports a processing error.
        reshape_pins_x = np.array(self.pins_x).transpose()
        reshape_pins_y = np.array(self.pins_y).transpose()
        displacement_pins_x = self.pins_x[-1] - self.pins_x[0]
        displacement_pins_y = self.pins_y[-1] - self.pins_y[0]

        plt.figure(1)
        for i in range(NUM_PINS):
            plt.subplot(211)
            plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i])
            plt.title('Position')
            plt.subplot(212)
            plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i] - reshape_pins_x[i][0])
            plt.title('Displacement')
        plt.tight_layout()
        plt.savefig('./ppo_pins.png')
        if Done:
            plt.clf()

        # return pins position x, y for current frame, displacement of pins position x,y
        return self.pins_x[-1], self.pins_y[-1], displacement_pins_x, displacement_pins_y

    def work(self):
        """Collect rollouts until the coordinator requests a stop.

        Per step the worker chooses an action with the shared PPO, converts
        the returned image to a pin-displacement state, and buffers
        ``(state, action, normalized reward)``. At episode end, or once the
        global update counter reaches ``MIN_BATCH_SIZE``, it pushes a batch of
        states, actions and discounted returns to ``QUEUE``.
        """
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s, info = self.env.reset()

            # image processing
            img = (s[:, :, 0] * 255).astype(np.uint8)
            try:
                pins_x, pins_y, pins_dis_x, pins_dis_y = self.ImgProcess(img, Done=False)
            except Exception:
                # Best effort: keep going with the previous displacement.
                # NOTE(review): if the very first frame fails,
                # pins_dis_x/pins_dis_y are unbound on the next line.
                print('Image Processing Error!')
            s = np.concatenate((pins_dis_x, pins_dis_y))

            # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
            # s=vector_s
            # print(s.shape, info["brain_info"].vector_observations[0, :])
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            self.pins_x = []
            self.pins_y = []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    # clear history buffer, use new policy to collect data
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.ppo.choose_action(s)
                s_, r, done, info = self.env.step(a)

                # implementation of plot version one, deprecated
                # plt.imshow(s_[:,:,0])
                # # plt.show()
                # plt.savefig('./img256_test/tac_test'+str(step)+str(t)+'.png')

                # image size of plt is not exactly the original array size, but with axis etc;
                # therefore use Image -- implementation of plot version two
                # im = Image.fromarray((s_[:,:,0] * 255).astype(np.uint8))
                # im.save('./img256f_r30/tac'+str(step)+str(t)+'.png')

                # image processing
                img = (s_[:, :, 0] * 255).astype(np.uint8)
                # Flag the last step of the episode so the figure gets cleared.
                # (`t > EP_LEN - 1` can never hold inside range(EP_LEN).)
                Done = t == EP_LEN - 1
                try:
                    pins_x, pins_y, pins_dis_x, pins_dis_y = self.ImgProcess(img, Done)
                except Exception:
                    print('Image Processing Error!')
                s_ = np.concatenate((pins_dis_x, pins_dis_y))

                # get the vector observation
                # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
                # s_=vector_s

                # print('a: ',a)  # shape: []
                # print('s: ',s_)  # shape: []
                # plt.imshow(s[:,:,0])
                # plt.show()
                # print('r: ',r)  # shape: scalar
                # print('done: ', done)  # shape: True/False
                # s=s.reshape(-1)  # convert from 3D to 1D
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8) / 8)  # normalize reward, find to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for buffered_r in buffer_r[::-1]:
                        v_s_ = buffered_r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
            step_set.append(step)
            # print(step)
            epr_set.append(ep_r)
# Data-collection script: steps the environment with a constant action and
# keeps the observations that differ enough from the episode's first frame.
episode_length = 150
env_name = "./tac_touch_random2"  # Name of the Unity environment binary to launch
# NOTE(review): the worker id is drawn at random from [0, 10) — presumably to
# avoid clashing with other running instances; confirm.
env = UnityEnv(env_name, worker_id=np.random.randint(0, 10), use_visual=False, use_both=True)
batch_s = []  # observations selected so far
cnt = 0  # number of selected observations
# `training_episodes` and `data_file` are defined outside this section.
for eps in range(training_episodes):
    print(eps)
    s, info = env.reset()
    s0 = np.array(s[7:])  # reference for this episode: entries 7.. of the first observation
    for step in range(episode_length):
        # plot(s)
        # print(np.mean(np.abs(np.array(s[7:])-s0)))  # choose 0.03
        # set a threshold to extract deformation frames: mean absolute change
        # of s[7:] above 0.03 and a non-zero sum of s[4], s[5], s[6]
        if step > 0 and np.mean(np.abs(np.array(s[7:]) - s0)) > 0.03 and s[4] + s[5] + s[6] != 0:
            # dim of s total 280 (select 182 as obs): 0 object index, 1-3 rotation value,
            # 4-6 average contact point position, 7-279 pins positions
            batch_s.append(s)
            cnt += 1
        s_, r, done, info = env.step([0])  # always the same action; `done` is not checked
        s = s_
print('total number of samples: ', cnt)
# NOTE(review): the environment is not closed in the code visible here.
pickle.dump(batch_s, data_file)
if __name__ == "__main__": env = UnityEnv('test.app', 0,use_visual=True) ppo = PPO(env) all_ep_r = [] t = 0 for ep in range(EP_MAX): s = env.reset() ep_r = 0 done = False while not done: t+=1 env.render() a,v = ppo.choose_action(s) s_, r, done, _ = env.step(a) ppo.buffer_s.append(s) ppo.buffer_a.append(a) ppo.buffer_r.append(r) ppo.buffer_v.append(v) ppo.buffer_done.append(done) s = s_ ep_r += r # update ppo if (t+1) % BATCH == 0: print("updating...") t = 0 v_s_ = v discounted_r = [] rewards = np.array(ppo.buffer_r)
class PPO():
    """Multi-agent PPO trainer: one shared ActorCritic model, one Buffer per agent."""

    def __init__(self):
        """Set hyperparameters, launch the Unity environment and build model, optimizer and buffers."""
        # Hyperparameters
        self.learning_rate = 0.0003
        self.betas = (0.9, 0.999)
        self.gamma = 0.99
        self.eps_clip = 0.2
        self.buffer_size = 2048
        self.batch_size = 256
        self.K_epochs = 3
        self.max_steps = 100000
        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5
        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name, worker_id=0, use_visual=False, side_channels=[channel], no_graphics=False, multiagent=True)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents
        print("Nº of Agents: ", self.n_agents)

        # Model
        self.model = ActorCritic(self.state_size, self.action_size, seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate, betas=self.betas)
        self.MseLoss = nn.MSELoss()

        # Buffer memory (one Buffer per agent)
        self.memory = []
        for _ in range(self.n_agents):
            self.memory.append(Buffer())

        # Initialize time step (for updating when buffer_size is full)
        self.t_step = 1

    def train(self):
        """Run ``max_steps`` environment steps, storing experience per agent, then save."""
        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):
            action = []
            logprobs = []
            value = []

            # Action of agent
            for i in range(self.n_agents):
                a, b, c = self.act(state[i])
                action.append(a)
                logprobs.append(b)
                value.append(c)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Done: stored inverted (1 - done), i.e. as a continuation mask
            done_ = []
            for i in range(self.n_agents):
                done_.append(1 - done[i])

            # Agent step
            for i in range(self.n_agents):
                self.step(state[i], action[i], reward[i], next_state[i], done_[i], logprobs[i], value[i], self.memory[i])

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score
            self.data.update_score(reward, value, done, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        """Write the model weights to 'Saved Models/model.pth' and emit the collected results."""
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        """Load model weights from the path (or file object) ``model``."""
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        """Sample an action for one state without tracking gradients.

        :param state: NumPy array holding one observation
        :return: ``(action, log_prob_of_action, value)`` tensors
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get actions probabilities and value from ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        # The first model output is passed through softmax, i.e. used as logits.
        prob = F.softmax(action_probs, -1)
        log_probs = F.log_softmax(action_probs, -1)

        # Get action and log of probabilities
        action = prob.multinomial(num_samples=1)
        log_probs = log_probs.gather(1, action)

        return action, log_probs, value

    def step(self, state, action, reward, next_state, done, logprobs, value, memory):
        """Store one transition in ``memory``, learning first if the buffer is full.

        ``done`` is the inverted flag built in ``train`` (1 - done).
        """
        # Update model when buffer_size is full
        # NOTE(review): the comparison uses true division; if buffer_size is
        # not a multiple of n_agents the equality can never hold — confirm.
        if memory.len_() == (self.buffer_size / self.n_agents):
            self.learn()
            for i in range(self.n_agents):
                self.memory[i].reset()

        # Save experience in buffer memory
        memory.add(state, action, reward, next_state, done, logprobs, value)

    def evaluate(self, states, next_states, actions, rewards, masks, compute_gae):
        """Evaluate the model on a batch and optionally compute GAE-based returns.

        :return: ``(log_probs_of_actions, values, entropies, returns)``;
            ``returns`` is an empty list unless ``compute_gae`` is true
        """
        logits, values = self.model(states)
        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)
        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        values_ = values
        # Values of next_states are appended so values[i + 1] exists in the loop below.
        # NOTE(review): the whole next_states batch is appended — confirm that
        # values[i + 1] is the intended bootstrap value for every i.
        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        returns = []
        if (compute_gae):
            # NOTE(review): created on the default device, not `device` — confirm.
            gae = torch.zeros(1, 1)
            for i in reversed(range(len(rewards))):
                # Generalized Advantage Estimation
                delta_t = rewards[i] + self.gamma * masks[i] * values[i + 1].data - values[i].data
                gae = gae * self.gamma * self.tau * masks[i] + delta_t
                returns.insert(0, gae + values[i])

        return log_probs, values_, entropies, returns

    def compute_returns(self):
        """Compute returns for every agent's buffer and flatten them agent by agent."""
        returns_ = []
        for i in range(self.n_agents):
            # Get Experiences (of each agent)
            experiences = self.memory[i].get()
            states, actions, rewards, next_states, dones, logprobs_, values_ = experiences

            # Evaluate
            _, _, _, r = self.evaluate(states, next_states, actions, rewards, dones, compute_gae=True)
            returns_.append(r)

        # Flatten; the inner length is taken from the first agent's list.
        l = []
        for i in range(len(returns_)):
            for j in range(len(returns_[0])):
                l.append(returns_[i][j])

        return l

    def learn(self):
        """Run ``K_epochs`` of clipped-surrogate updates over the gathered experience."""
        # Get Experiences
        states, actions, rewards, next_states, dones, logprobs_, values_ = self.getExp()
        returns_eval = self.compute_returns()
        returns_eval = torch.tensor(returns_eval).to(device)
        returns_eval = returns_eval.unsqueeze(1)

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # List with all indices
            l = np.arange(self.buffer_size)
            l = list(l)
            x = self.buffer_size // self.batch_size

            for _ in range(x):
                # Take a random batch
                indices = random.sample(l, self.batch_size)
                # NOTE(review): these batch tensors are created on the default
                # device, not `device` — confirm.
                old_logprobs = torch.empty(self.batch_size, 1)
                old_values = torch.empty(self.batch_size, 1)
                old_actions = torch.empty(self.batch_size)
                old_states = torch.empty(self.batch_size, self.state_size)
                old_next_states = torch.empty(self.batch_size, self.state_size)
                old_rewards = np.zeros(self.batch_size)
                returns = torch.empty(self.batch_size, 1)

                for i in range(len(indices)):
                    old_logprobs[i] = logprobs_[indices[i]]
                    old_values[i] = values_[indices[i]]
                    old_actions[i] = actions[indices[i]]
                    old_states[i] = states[indices[i]]
                    old_next_states[i] = next_states[indices[i]]
                    old_rewards[i] = rewards[indices[i]]
                    returns[i] = returns_eval[indices[i]]
                old_actions = old_actions.long()

                # Remove indices to not repeat
                for i in indices:
                    l.remove(i)

                # Evaluate (rewards/dones are passed whole but unused because compute_gae=False)
                logprobs, state_values, dist_entropy, _ = self.evaluate(old_states, old_next_states, old_actions, rewards, dones, compute_gae=False)

                # Finding the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(logprobs - old_logprobs)

                # Finding Surrogate Loss:
                advantages = returns - old_values
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

                # LOSS = ACTOR LOSS + CRITIC_DISCOUNT * CRITIC_LOSS - ENTROPY_BETA * ENTROPY
                loss = -torch.min(surr1, surr2) + self.value_loss_coef * self.MseLoss(state_values, returns) - self.entropy_coef * dist_entropy

                # Optimizer step
                self.optimizerStep(self.optimizer, loss.mean())

    def optimizerStep(self, optimizer, loss):
        """Zero gradients, backpropagate ``loss`` and take one optimizer step."""
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def getExp(self):
        """Concatenate all agents' buffers (agent by agent) into ``buffer_size``-long tensors.

        :return: ``(states, actions, rewards, next_states, dones, logprobs, values)``;
            ``rewards`` is returned as a plain Python list, the rest as tensors
        """
        states, actions, rewards, next_states, dones, logprobs, values = [], [], [], [], [], [], []
        for i in range(self.n_agents):
            experiences = self.memory[i].get()
            states.append(experiences[0])
            actions.append(experiences[1])
            rewards.append(experiences[2])
            next_states.append(experiences[3])
            dones.append(experiences[4])
            logprobs.append(experiences[5])
            values.append(experiences[6])

        # Flatten the per-agent lists; the inner length is taken from the first agent.
        states_, actions_, rewards_, next_states_, dones_, logprobs_, values_ = [], [], [], [], [], [], []
        for i in range(len(states)):
            for j in range(len(states[0])):
                states_.append(states[i][j])
                actions_.append(actions[i][j])
                rewards_.append(rewards[i][j])
                next_states_.append(next_states[i][j])
                dones_.append(dones[i][j])
                logprobs_.append(logprobs[i][j])
                values_.append(values[i][j])

        states__ = torch.empty(self.buffer_size, self.state_size)
        actions__ = torch.empty(self.buffer_size)
        next_states__ = torch.empty(self.buffer_size, self.state_size)
        dones__ = torch.empty(self.buffer_size)
        logprobs__ = torch.empty(self.buffer_size, 1, 1)
        values__ = torch.empty(self.buffer_size)

        # Copy element by element; assumes the flattened lists hold at least
        # buffer_size items — TODO confirm against Buffer and step().
        for i in range(self.buffer_size):
            states__[i] = states_[i]
            actions__[i] = actions_[i]
            next_states__[i] = next_states_[i]
            dones__[i] = dones_[i]
            logprobs__[i] = logprobs_[i]
            values__[i] = values_[i]

        return states__, actions__, rewards_, next_states__, dones__, logprobs__, values__
class Worker(object):
    """PPO rollout worker that builds its state from the vector observation.

    The policy input is the first six observation entries plus the relative
    x/z position of the pin whose displacement from the first frame is largest.
    """

    def __init__(self, wid):
        """Create the worker's environment.

        :param wid: worker index, also used as the Unity ``worker_id``
        """
        self.wid = wid
        self.env = UnityEnv(env_name, worker_id=wid, use_visual=False, use_both=True)
        # self.env=Reacher(render=True)
        self.ppo = GLOBAL_PPO

    def work(self):
        """Collect rollouts until the coordinator requests a stop.

        Pushes batches of states, actions and discounted returns to the
        module-level ``QUEUE`` and saves pin/reward plots to PNG files.
        """
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s, info = self.env.reset()
            s = s[:8]  # initial state: the first eight observation entries
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            # Per-episode histories used for the displacement computation and plots.
            self.pins_x = []
            self.pins_y = []
            self.pins_z = []
            self.object_x = []
            self.object_y = []
            self.object_z = []

            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    # clear history buffer, use new policy to collect data
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.ppo.choose_action(s)
                s_, r, done, info = self.env.step(a)
                # print(np.array(s_).shape)

                # plot pins
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)  # the raw reward is stored; no normalization is applied here
                # Every third entry from index 6 / 8 is read as pin x / z.
                pins_x = s_[6::3]
                pins_z = s_[8::3]
                self.object_x.append(s_[0])
                self.object_z.append(s_[2])
                self.pins_x.append(pins_x)
                self.pins_z.append(pins_z)
                # Pin positions relative to entries 0 / 2 (recorded as object x / z).
                relative_x = pins_x - s_[0]
                relative_z = pins_z - s_[2]
                # Squared displacement of each pin's relative position from the first frame.
                dis = (relative_x - (self.pins_x[0] - self.object_x[0]))**2 + (relative_z - (self.pins_z[0] - self.object_z[0]))**2
                min_idx = np.argmin(dis)
                max_idx = np.argmax(dis)
                # add relative position of the pin with smallest deformation
                # s_ = np.append(s_[:6], relative_x[min_idx])
                # s_ = np.append(s_, relative_z[min_idx])
                # active variant: the pin with the largest displacement
                s_ = np.append(s_[:6], relative_x[max_idx])
                s_ = np.append(s_, relative_z[max_idx])

                s = s_
                ep_r += r
                # print('minimal displacement idx: ', min_idx)

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            # Checkpoint every 50 global episodes.
            if GLOBAL_EP % 50 == 0 and GLOBAL_EP > 0:
                self.ppo.save(model_path)

            # Per-episode pin plot: positions and displacement relative to the object.
            reshape_pins_x = np.array(self.pins_x).transpose()
            reshape_pins_z = np.array(self.pins_z).transpose()
            plt.clf()
            for i in range(NUM_PINS):
                plt.subplot(411)
                plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i])
                plt.title('X-Position')
                plt.subplot(412)
                plt.plot(np.arange(len(self.pins_z)), reshape_pins_z[i])
                plt.title('Y-Position')  # although it's z, to match reality, use y

                plt.subplot(413)
                # plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i]-self.object_x)
                # plt.title('X-Relative')
                # plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i]-reshape_pins_x[i][0])
                # plt.title('X-Displacement')
                plt.plot(np.arange(len(self.pins_x)), (reshape_pins_x[i] - self.object_x) - (reshape_pins_x[i][0] - self.object_x[0]))
                plt.title('X-Displacement')
                plt.subplot(414)
                plt.plot(np.arange(len(self.pins_x)), (reshape_pins_z[i] - self.object_z) - (reshape_pins_z[i][0] - self.object_z[0]))
                plt.title('Y-Displacement')
            plt.xlabel('Time Step')
            plt.tight_layout()
            plt.savefig('./ppo_pins.png')

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
            step_set.append(step)
            # print(step)
            epr_set.append(ep_r)
            if step % 10 == 0:  # plot every N episode; some error about main thread for plotting
                plt.clf()
                plt.plot(step_set, epr_set)
                plt.xlabel('Episode')
                plt.ylabel('Reward')
                # NOTE(review): the bare except also swallows KeyboardInterrupt/SystemExit.
                try:
                    plt.savefig('./tac_pins8.png')
                except:
                    print('writing conflict!')
#print("______________episode Number______________") #print(e)\\\\\ actionlst = [] action1 = agent1.act(state1) action2 = agent2.act(state2) print("__Randomly Selected Action__________") print(action1) print(action2) actionlst.append(action1) actionlst.append(action2) # Advance the game to the next frame based on the action. # Reward is 1 for every frame the pole survived print("______TIME") print(time_t) next_state, reward, done, _ = env.step(actionlst) print("_____________reward_____________") print(reward) print("____________nextstaet agen 1_____________") print(next_state[0]) print("_____________next state agent 2_______") print(next_state[1]) #next_state = np.reshape(next_state, [1, 42336]) # Remember the previous state, action, reward, and done agent1.remember(state1, action1, reward[0], next_state[0], done[0]) agent2.remember(state2, action2, reward[1], next_state[1], done[1]) # make next_state the new current state for the next frame. state1 = next_state[0] state2 = next_state[1]
def cartpole():
    """Train one DQN brain per agent slot in a multi-agent Unity environment.

    Runs forever: each outer iteration resets the environment; each inner
    iteration lets every present agent act with the brain selected by
    observation entry 12, steps the environment, and stores/replays the
    transitions. The inner loop ends only when the environment returns an
    empty observation list.
    """
    env = UnityEnv(environment_filename=ENV_NAME, worker_id=5, use_visual=False, multiagent = True)
    score_logger = ScoreLogger(ENV_NAME)  # NOTE(review): not used in this function
    agents_brain = []
    agents_action = []
    num_agents = env.number_agents
    observation_space = env.observation_space.shape[0]
    print("____________Observation_space")
    print(observation_space)
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)  # NOTE(review): not used in this function
    # One independent solver per agent reported by the environment.
    for x in range ((env.number_agents)):
        agents_brain.append(DQNSolver(observation_space, action_space))
    print ("Length of BrainList: ",len(agents_brain))
    run = 0
    state = env.reset()
    print("______INITIAL______")
    print(state)
    #initialstate = copy.deepcopy(state)
    print("*****************************initial state for unity envirmonet**************")
    #print(initialstate)
    jk = 1
    while True:
        run += 1
        state = env.reset()
        #state = copy.deepcopy(initialstate)
        # Entry -5 of the first agent's observation is read as the agent count.
        num_agents = int(state[0][-5])
        print("_____________State _______________")
        print(int(state[0][12]))
        step = 0
        print("################################This is loop################################# :" , jk)
        while True:
            step += 1
            env.render()
            agents_action = [1] * len(state)
            print(state[0])
            print("*******************Length of state******************")
            print(len(state))
            for x in range(len(state)):
                state[x] = np.reshape(state[x], [1, observation_space])
                # Observation entry 12 (1-based) selects which brain acts for this agent.
                agents_action[x] = agents_brain[int(state[x][0,12]) - 1].act(state[x])
            print("Agents Actions List: ",agents_action)
            state_next, reward, terminal, info = env.step(agents_action)
            #print ("_____________STATE_NEXT___________")
            #print (state_next)
            # An empty observation list ends this run.
            if (len(state_next) == 0):
                break
            agents_alive = state_next[0][-13:-5]
            print ("Agents_alive: ", agents_alive)
            print ("Rewards: ",reward)
            num_agents = int(state_next[0][-5])
            print ("Number of agents: ",num_agents)
            print("_________Terminal list_______" , terminal)
            # When the first agent is terminal, every brain is saved; the run
            # continues because the break below is commented out.
            if (terminal[0] == True):
                print("**************************Brain saved******************************")
                for x in range(len(agents_brain)):
                    agents_brain[x].save(str(run) + "brain" + str(x) + ".h5")
                jk+=1
                print("#####################################Loop is######################## :" , jk)
                #break
            # NOTE(review): assumes state and state_next have the same length
            # and agent order — confirm for steps where agents disappear.
            for x in range(len(state_next)):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x], [1, observation_space])
                agents_brain[int(state_next[x][0,12]) - 1].remember(state[x], agents_action[x], reward[x], state_next[x], terminal[x])
                agents_brain[int(state_next[x][0,12]) - 1].experience_replay()
            state = state_next
class AC():
    """Single-agent actor-critic trainer that learns every ``batch_size`` steps."""

    def __init__(self):
        """Set hyperparameters, launch the Unity environment and build model, optimizer and buffer."""
        # Hyperparameters
        self.learning_rate = 0.0003
        self.gamma = 0.99
        self.batch_size = 256
        self.max_steps = 100000
        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5
        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name, worker_id=0, use_visual=False, side_channels=[channel], no_graphics=False, multiagent=False)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents

        # Model
        self.model = ActorCritic(self.state_size, self.action_size, seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Buffer memory
        self.memory = Buffer()

        # Initialize time step (for updating every "batch_size" time steps)
        self.t_step = 1

    def train(self):
        """Run ``max_steps`` environment steps, learning periodically, then save."""
        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):
            # Action of agent
            action, value = self.act(state)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Agent step
            self.step(state, action, reward, next_state, done)

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score (a leading axis is added to each value before it is passed on)
            reward_ = np.expand_dims(reward, axis=0)
            value_ = value.unsqueeze(0)
            done_ = np.expand_dims(done, axis=0)
            self.data.update_score(reward_, value_, done_, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        """Write the model weights to 'Saved Models/model.pth' and emit the collected results."""
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        """Load model weights from the path (or file object) ``model``."""
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        """Sample an action for one state without tracking gradients.

        :param state: NumPy array holding one observation
        :return: ``(action, value)`` tensors
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get actions probabilities and value from ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        # The first model output is passed through softmax, i.e. used as logits.
        prob = F.softmax(action_probs, -1)

        # Sample the action from the resulting distribution
        action = prob.multinomial(num_samples=1)

        return action, value

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn once every ``batch_size`` time steps."""
        # Save experience in buffer memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every "batch_size" time steps
        if self.t_step % self.batch_size == 0:
            experiences = self.memory.get()
            self.learn(experiences)
            self.memory.reset()

    def learn(self, experiences):
        """Run one actor-critic update over the buffered experience.

        :param experiences: unpacked as ``(states, actions, rewards, next_states)``
        """
        # Get Experiences
        # NOTE(review): add() receives five fields (including done) but four
        # are unpacked here; `done` is not used in the return computation —
        # confirm against Buffer.get().
        states, actions, rewards, next_states = experiences

        logits, values = self.model(states)
        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)
        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        # Values of next_states are appended so values[i + 1] exists in the loop below.
        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        policy_loss = 0
        value_loss = 0
        R = values[-1]
        # NOTE(review): created on the default device, not `device` — confirm.
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = self.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + self.gamma * values[i + 1].data - values[i].data
            gae = gae * self.gamma * self.tau + delta_t

            policy_loss = policy_loss - (log_probs[i] * gae) - (self.entropy_coef * entropies[i])

        # Loss
        loss = (policy_loss + self.value_loss_coef * value_loss)

        # Optimizer step
        self.optimizerStep(self.optimizer, loss)

    def optimizerStep(self, optimizer, loss):
        """Zero gradients, backpropagate ``loss`` and take one optimizer step."""
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def cartpole():
    """Train one DQN brain per agent in the Unity environment, indefinitely.

    A ``DQNSolver`` is created for every agent the environment reports. Each
    step, every observation row picks its brain from the value stored at
    column 12 of that row (used as a 1-based index) and asks it for an action.
    Next-state rows of agents that chose action 5 are appended to
    ``0sahre.csv`` and those that chose action 6 to ``0eat.csv``; when the
    environment returns no states the running totals of both actions are
    appended as well. When ``terminal[0]`` is set, all brains are saved as
    ``.h5`` files and a new run starts.

    The loop only ends through an exception (e.g. ``KeyboardInterrupt``); the
    CSV files and the Unity environment are closed on the way out.
    """
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=1,
                   use_visual=False,
                   multiagent=True)
    pathname = "C:/HinaProgramm/testingFolder/Unity Environment"
    try:
        num_agents = env.number_agents
        print("Number of agents in enviroment : ", num_agents)
        observation_space = env.observation_space.shape[0]
        print("____________Observation_space______________")
        print(observation_space)
        print("__________Action Space________________")
        action_space = env.action_space.n
        print(action_space)

        agents_brain = [
            DQNSolver(observation_space, action_space)
            for _ in range(num_agents)
        ]
        print("Length of BrainList: ", len(agents_brain))

        run = 0
        state = env.reset()
        # Every run restarts from a copy of the very first observation.
        initialstate = copy.deepcopy(state)
        jk = 1
        sharecount = 0
        eatcount = 0
        # NOTE(review): the original bumped this counter after every run but
        # never reopened the files, so all output lands in the "0..." files.
        filecount = 0

        with open(str(filecount) + "sahre.csv", 'ab') as f, \
                open(str(filecount) + "eat.csv", 'ab') as J:
            while True:
                run += 1
                env.reset()
                state = copy.deepcopy(initialstate)
                num_agents = int(state[0][-8])
                print("_numagents__________", num_agents)
                print("_____________State _______________")
                print(int(state[0][12]))
                print(
                    "################################This is loop################################# :",
                    jk)
                print("_____Run _______ :", run)

                while True:
                    env.render()
                    agents_action = [1] * len(state)
                    for x in range(len(state)):
                        state[x] = np.reshape(state[x],
                                              [1, observation_space])
                        agents_action[x] = agents_brain[
                            int(state[x][0, 12]) - 1].act(state[x])
                    sharecount += agents_action.count(5)
                    eatcount += agents_action.count(6)

                    state_next, reward, terminal, info = env.step(
                        agents_action)

                    for x, chosen in enumerate(agents_action):
                        if chosen == 5:
                            np.savetxt(f,
                                       np.asarray([state_next[x]]),
                                       delimiter=",")
                        if chosen == 6:
                            np.savetxt(J,
                                       np.asarray([state_next[x]]),
                                       delimiter=",")

                    print("_____________STATE_NEXT___________")
                    print(state_next)
                    if len(state_next) == 0:
                        # savetxt needs 1-D/2-D data: wrap the scalar totals.
                        np.savetxt(f, [sharecount], delimiter=",")
                        np.savetxt(J, [eatcount], delimiter=",")
                        break

                    agents_alive = state_next[0][-16:-8]
                    print("Agents_alive: ", agents_alive)
                    print("Rewards: ", reward)
                    num_agents = int(state_next[0][-8])
                    print("Number of agents: ", num_agents)

                    if terminal[0]:
                        print(
                            "**************************Brain saved******************************"
                        )
                        for x, brain in enumerate(agents_brain):
                            brain.model.save(pathname + str(run) + "brain" +
                                             str(x) + ".h5")
                        jk += 1
                        print(
                            "#####################################Loop is######################## :",
                            jk)
                        break

                    for x in range(len(state_next)):
                        state[x] = np.reshape(state[x],
                                              [1, observation_space])
                        state_next[x] = np.reshape(state_next[x],
                                                   [1, observation_space])
                        brain = agents_brain[int(state_next[x][0, 12]) - 1]
                        brain.remember(state[x], agents_action[x], reward[x],
                                       state_next[x], terminal[x])
                        brain.experience_replay()
                    state = state_next
    finally:
        # Make sure the Unity process is shut down however the loop ends.
        env.close()
# Start a fresh rollout: stack the per-agent observations into one array and
# hand them to torch as a float tensor.
obs = np.stack(env.reset())
if isinstance(obs, np.ndarray):
    obs = torch.from_numpy(obs).float()

total_reward = 0.0
rr = np.zeros((n_agents, ))

for i_step in range(max_steps):
    obs = obs.type(FloatTensor)

    # Query the policy, then pass plain Python lists to the environment.
    actions = maddpg.select_action(obs).data.cpu()
    actions_list = actions.tolist()
    obs_, reward, done, _ = env.step(actions_list)

    reward = torch.FloatTensor(reward).type(FloatTensor)
    obs_ = torch.from_numpy(np.stack(obs_)).float()

    # The final step of the rollout has no successor observation.
    next_obs = obs_ if i_step != max_steps - 1 else None

    # Accumulate the summed reward and the per-agent reward vector.
    total_reward += reward.sum()
    rr += reward.cpu().numpy()

    maddpg.memory.push(obs.data, actions, next_obs, reward)
    obs = next_obs
import numpy as np
import sys

from gym_unity.envs import UnityEnv

# Runs ten episodes of uniformly random actions against the GridWorld build
# and prints the total reward collected in each one.

env_name = "../env/GridWorld.x86_64"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name)

try:
    # Examine environment parameters
    print(str(env))

    # Reset the environment
    initial_observation = env.reset()

    for episode in range(10):
        initial_observation = env.reset()
        done = False
        episode_rewards = 0
        while not done:
            observation, reward, done, info = env.step(
                env.action_space.sample())
            episode_rewards += reward
        print("Total reward this episode: {}".format(episode_rewards))
finally:
    # Shut the Unity process down even if a reset/step raises.
    env.close()
env_name = "./tac_real2" # Name of the Unity environment binary to launch replay_buffer = ReplayBuffer(1e6) td3_trainer=TD3_Trainer(replay_buffer,state_dim=state_dim, action_dim=action_dim, hidden_dim=hidden_dim, policy_target_update_interval=3, action_range=20. ) model_path = './model/td3_all' env = UnityEnv(env_name, worker_id=np.random.randint(0,10), use_visual=False, use_both=True) td3_trainer.load_model(model_path) batch_s = [] cnt=0 for eps in range(training_episodes): print(eps) s,info = env.reset() for step in range(episode_length): batch_s.append(s) s= state_process(s) a = td3_trainer.policy_net.get_action(s, deterministic = DETERMINISTIC, explore_noise_scale=0.0) a+=np.random.normal(0, 5, a.shape[0]) s_, r, d, _ = env.step(a) cnt+=1 s=s_ # # print(np.mean(np.abs(np.array(s[7:])-s0))) # choose 0.03 # if step >0 and np.mean(np.abs(np.array(s[7:])-s0))>0.03 and s[4]+s[5]+s[6]!=0: # set a threshold to extract deformation frames # batch_s.append(s) # dim of s total 280 (select 182 as obs): 0 object index, 1-3 rotation value, 4-6 average contact point position, 7-279 pins positions # cnt+=1 print('total number of samples: ', cnt) pickle.dump(batch_s, data_file)
env.render() # Reset the environment obv = env.reset() # the initial state state_0 = state_to_bucket(obv) total_reward = 0 for t in range(MAX_T): # Select an action action = select_action(state_0, 0) # execute the action obv, reward, done, _ = env.step(action) # Observe the result state = state_to_bucket(obv) total_reward += reward print(state, t, total_reward) # # Update the Q based on the result # best_q = np.amax(q_table[state]) # q_table[state_0 + (action,)] += learning_rate * (reward + discount_factor * (best_q) - q_table[state_0 + (action,)]) env.render() time.sleep(1) # Setting up for the next iteration state_0 = state if done:
# Main training loop: one agent action and one environment step per timestep.
# NOTE(review): s_t, m_t, goal, inference_goal, food, poison, battery,
# num_batteries, max_reward, GAME and the *_buffer lists are maintained
# outside this fragment.
for t in range(total_training_timesteps):
    loss = 0
    r_t = 0
    a_t = np.zeros([action_size])
    # Epsilon Greedy
    action_idx = agent.get_action(
        s_t, m_t, goal,
        inference_goal)  #KOE: This is the forward pass through the NN.
    #KOEComment: My unity agent also skips 5 frames between actions, controlled in the Unity interface.
    #The vector space in Unity has 4 branches, with multiple actions in each! Those can also be combined!
    #I need the ANN output to be able to select all combinations.
    #TODO Believe step just wants the index of the action.
    observation, reward, done, info = env.step(action_idx)
    # An empty battery ends the episode regardless of what the env reported.
    if battery_limited and battery < 0:
        done = True
        print("Battery empty. Stopping.")
    if (done):
        print("Game done at timestep ", t)
        # The episode score is food minus poison; track the best one seen.
        if ((food - poison) > max_reward):
            max_reward = (food - poison)
        GAME += 1
        # Record the per-episode statistics.
        reward_buffer.append(food - poison)
        food_buffer.append(food)
        poison_buffer.append(poison)
        battery_buffer.append(battery)
        num_batteries_buffer.append(num_batteries)
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines import PPO2
from gym_unity.envs import UnityEnv

# Loads a trained PPO2 model and lets it act in the Unity Walker environment
# for 1000 steps, rendering each one.

env = UnityEnv('./envs/Walker')
try:
    # The algorithms require a vectorized environment to run. DummyVecEnv
    # calls the factory inside its constructor, i.e. before ``env`` is
    # rebound, so the lambda still returns the UnityEnv.
    env = DummyVecEnv([lambda: env])
    # Automatically normalize the input features
    # NOTE(review): this wrapper starts with fresh running statistics; if the
    # model was trained with VecNormalize, the saved statistics should be
    # restored for a faithful evaluation — confirm.
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

    # Load the trained agent
    model = PPO2.load('./models/my-model')

    # Enjoy trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
finally:
    # Closing the outermost wrapper also closes the wrapped Unity process.
    env.close()
# s, info = env.reset() # for t in range(100): # # env.render() # s, r, done, info = env.step(GLOBAL_PPO.choose_action(s)) GLOBAL_PPO.save(model_path) if args.test: env = UnityEnv(env_name, worker_id=np.random.randint(0, 10), use_visual=True, use_both=True) env.reset() GLOBAL_PPO = PPO() GLOBAL_PPO.load(model_path) test_steps = 200 test_episode = 10 for _ in range(test_episode): s, info = env.reset() '''''' # vector_s = info["brain_info"].vector_observations[0, :] # get the vector observation # s=vector_s for t in range(test_steps): # env.render() s, r, done, info = env.step(GLOBAL_PPO.choose_action(s)) '''''' # vector_s = info["brain_info"].vector_observations[0, :] # get the vector observation # s=vector_s
# Times a batch of random multi-agent steps against the Unity environment.
# NOTE(review): the banner says "no render" but the env is created with
# no_graphics=False — confirm which is intended.
print("\nwith no render")
env = UnityEnv(env_name, no_graphics=False, multiagent=True, worker_id=1)
# The string below is a disabled outer loop, kept as a bare string literal.
""" res = [] for j in range(10): """
print(str(env))
ini_obs = env.reset()
curr_t = time.time()
for i in range(10000):
    # One random action per agent currently in the environment.
    actions = [env.action_space.sample() for agent in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
# Elapsed wall-clock seconds for the whole loop.
res = time.time() - curr_t
# NOTE(review): the label says 1000 steps while the loop above runs 10000.
print("\nTime for 1000 step")
print(res)
print("\n\n")
# NOTE(review): the string opened below is not closed within this fragment.
""" res = [] for j in range(10): ini_obs = env.reset() curr_t = time.time()
# choose env # env_name="./tac_follow_new4" # env_name="tac_follow_new4_random02" env_name = "tac_follow_new4_random" env = UnityEnv(env_name, worker_id=22, use_visual=False, use_both=True) td3_trainer.load_model(model_path) eps_r = [] for eps in range(20): state, info = env.reset() state0 = state state = state_process(state, state0) episode_reward = 0 for step in range(max_steps): action = td3_trainer.policy_net.get_action( state, deterministic=DETERMINISTIC, explore_noise_scale=0.0) next_state, reward, done, info = env.step(action) reward += 100 next_state = state_process(next_state, state0) episode_reward += reward state = next_state if done: break print('Episode: ', eps, '| Episode Reward: ', episode_reward) eps_r.append(episode_reward) print(eps_r) print(np.average(eps_r))
def cartpole():
    """Run multi-agent DQN training in the Unity environment, indefinitely.

    Each run restarts from a copy of the first observation, gives every agent
    slot a brain copied from ``learning_brain``, and steps the environment
    until it returns no states or ``terminal[0]`` is set. Brains of agents
    flagged in the "alive" slice of the observation are written back into
    ``learning_brain`` and dropped from the active list; on termination all
    entries of ``learning_brain`` are saved to ``.h5`` files.

    NOTE(review): nesting was reconstructed from a collapsed source line;
    verify loop boundaries against the original file.
    """
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=2,
                   use_visual=False,
                   multiagent=True)
    score_logger = ScoreLogger(ENV_NAME)
    agents_brain = []
    agents_action = []
    index_list = []
    agents_alive = []
    count = 0
    count1 = 0
    num_agents = env.number_agents
    print("___________Number of agents in cartpole __")
    print(num_agents)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    print("__dqn solver______")
    print(dqn_solver)
    #model = tf.keras.models.load_model("")
    # NOTE(review): the same DQNSolver instance is appended for every agent,
    # so all list entries alias one object.
    for x in range((env.number_agents)):
        agents_brain.append(dqn_solver)
    print("______agentbrain____")
    print(agents_brain)
    print("_Agent action___")
    print(agents_action)
    # NOTE(review): deepcopy preserves aliasing inside the copied list, so the
    # entries of learning_brain also share a single (copied) solver.
    learning_brain = copy.deepcopy(agents_brain)
    run = 0
    state = env.reset()
    # Every run restarts from a copy of this first observation.
    initialstate = copy.deepcopy(state)
    while True:
        run += 1
        env.reset()
        print("____________STATE____________-")
        print(state[0])
        state = copy.deepcopy(initialstate)
        # Per-run bookkeeping is reset here.
        agents_brain = []
        agents_action = []
        index_list = []
        agents_alive = []
        count = 0
        count1 = 0
        # The agent count is read from position -5 of the first observation.
        num_agents = int(state[0][-5])
        agents_brain = copy.deepcopy(learning_brain)
        print(learning_brain)
        print(agents_brain)
        print(state)
        #for x in range ( (env.number_agents - 1) ):
        step = 0
        while True:
            step += 1
            env.render()
            print("___________STatte Lenth_______")
            print(len(state))
            print("______selffish___")
            print(state[0])
            # Default action 1 for every state row; rows not visited by the
            # loop below keep this default.
            agents_action = [1] * len(state)
            # Snapshot of the alive flags from the previous step.
            copied_agents_alive = copy.deepcopy(agents_alive)
            print("__________numagents_____")
            # NOTE(review): iterates indices 0..num_agents-2, i.e. the last
            # agent is never asked for an action — confirm intended.
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[x].act(state[x])
            print(agents_action)
            # NOTE(review): step() is called with an extra num_agents
            # argument; presumably a customised UnityEnv — verify.
            state_next, reward, terminal, info = env.step(
                agents_action, num_agents)
            print("_______Reward________")
            print(reward)
            print("_____________NEXT STATE LENGTH____________")
            print(len(state_next))
            # No states returned: abandon this run and start the next one.
            if (len(state_next) == 0):
                break
            # Slice [-13:-5] of the first row is treated as per-agent flags
            # and position -5 as the current agent count.
            agents_alive = state_next[0][-13:-5]
            num_agents = int(state_next[0][-5])
            print("_______num agnets in cartpole________")
            print(num_agents)
            print("_____index list")
            print(index_list)
            print(agents_alive)
            # Flags with the already-recorded indices removed.
            agents_alive1 = np.delete(agents_alive, index_list)
            print("_______Alive agent list_______")
            print(agents_alive1)
            flag = False
            # del agents_alive[index_list[x]]
            # Record each index whose flag equals 1.0 exactly once.
            for x in range(len(agents_alive)):
                if (agents_alive[x] == float(1)):
                    for y in range(len(index_list)):
                        if (index_list[y] == x):
                            flag = True
                    if (flag == False):
                        index_list.append(x)
                    flag = False
            index_to_remove = []
            # Write flagged brains back to learning_brain at the recorded
            # index, and mark them for removal from the active list.
            for x in range(len(agents_alive1)):
                if (agents_alive1[x] == float(1)):
                    learning_brain[index_list[count]] = agents_brain[x]
                    index_to_remove.append(x)
                    count = count + 1
            agents_brain = [
                i for j, i in enumerate(agents_brain)
                if j not in index_to_remove
            ]
            print("____________AGENTS_BRAIN_________")
            print(len(agents_brain))
            print("_______________Terminal_____________")
            print(terminal)
            if (terminal[0] == True):
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(step))
                score_logger.add_score(step, run)
                # NOTE(review): this overwrites learning_brain[0..] with the
                # remaining active brains in order; agents_brain[count1] can
                # run out of range if fewer brains remain — verify.
                for x in range(len(copied_agents_alive)):
                    learning_brain[x] = agents_brain[count1]
                    count1 = count1 + 1
                for x in range(len(learning_brain)):
                    learning_brain[x].save(
                        str(run) + "brain" + str(x) + ".h5")
                break
            # Store the transition and train each brain still in range.
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x],
                                           [1, observation_space])
                agents_brain[x].remember(state[x], agents_action[x],
                                         reward[x], state_next[x],
                                         terminal[x])
                agents_brain[x].experience_replay()
            state = state_next