def main(env_name):
    """
    Run the gym test using the specified environment
    :param env_name: Name of the Unity environment binary to launch
    """
    env = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)
    try:
        # Examine environment parameters
        print(str(env))

        # Reset the environment
        initial_observations = env.reset()

        if len(env.observation_space.shape) == 1:
            # Examine the initial vector observation
            print("Agent observations look like: \n{}".format(initial_observations))

        for _episode in range(10):
            env.reset()
            done = False
            episode_rewards = 0
            while not done:
                actions = env.action_space.sample()
                obs, reward, done, _ = env.step(actions)
                episode_rewards += reward
            print("Total reward this episode: {}".format(episode_rewards))
    finally:
        env.close()
class Chaser_v1(Environment):
    unity_env_worker_id = 0

    def __init__(self, platform):
        if platform == OSName.MAC:
            env_filename = EnvironmentName.CHASER_V1_MAC.value
        elif platform == OSName.WINDOWS:
            env_filename = EnvironmentName.CHASER_V1_WINDOWS.value
        else:
            env_filename = None

        self.env = UnityEnv(environment_filename=env_filename,
                            worker_id=Chaser_v1.unity_env_worker_id,
                            use_visual=True,
                            multiagent=True).unwrapped
        self.increase_env_worker_id()
        super(Chaser_v1, self).__init__()

        self.action_shape = self.get_action_shape()
        self.state_shape = self.get_state_shape()

        self.cnn_input_height = self.state_shape[0]
        self.cnn_input_width = self.state_shape[1]
        self.cnn_input_channels = self.state_shape[2]

        self.observation_space = self.env.observation_space
        self.continuous = True

    @staticmethod
    def increase_env_worker_id():
        Chaser_v1.unity_env_worker_id += 1

    def get_n_states(self):
        n_states = 3
        return n_states

    def get_n_actions(self):
        n_actions = 3
        return n_actions

    def get_state_shape(self):
        return self.env.observation_space.shape

    def get_action_shape(self):
        return self.env.action_space.shape

    def reset(self):
        state = self.env.reset()
        return state

    def step(self, action):
        next_state, reward, done, info = self.env.step(action)
        adjusted_reward = reward
        return next_state, reward, adjusted_reward, done, info

    def close(self):
        self.env.close()
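# Hypothetical usage sketch (not part of the original source): constructing the
# Chaser_v1 wrapper above and running one random multi-agent episode. OSName is
# assumed to be the platform enum used elsewhere in that project; the done
# handling is an assumption, since multiagent UnityEnv returns per-agent lists.
if __name__ == "__main__":
    env = Chaser_v1(OSName.MAC)
    states = env.reset()
    done = False
    while not done:
        # sample one random action per agent from the underlying gym action space
        actions = [env.env.action_space.sample() for _ in range(len(states))]
        next_states, rewards, adjusted_rewards, dones, info = env.step(actions)
        states = next_states
        done = all(dones) if isinstance(dones, (list, tuple)) else dones
    env.close()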
def worker(id, td3_trainer, rewards_queue, replay_buffer, max_episodes, max_steps,
           batch_size, explore_steps, update_itr, explore_noise_scale,
           eval_noise_scale, reward_scale, DETERMINISTIC, hidden_dim, model_path):
    '''the function for sampling with multi-processing'''
    print(td3_trainer, replay_buffer)
    env_name = "./tac_follow_new"
    env = UnityEnv(env_name, worker_id=id + 15, use_visual=False, use_both=True)

    # training loop
    for eps in range(max_episodes):
        frame_idx = 0
        rewards = []
        episode_reward = 0
        state, info = env.reset()
        # state = state[:6]

        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = td3_trainer.policy_net.get_action(
                    state,
                    deterministic=DETERMINISTIC,
                    explore_noise_scale=explore_noise_scale)
            else:
                action = td3_trainer.policy_net.sample_action()

            try:
                next_state, reward, done, info = env.step(action)
                # next_state = next_state[:6]
            except KeyboardInterrupt:
                print('Finished')
                td3_trainer.save_model(model_path)

            replay_buffer.push(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward
            frame_idx += 1

            # if len(replay_buffer) > batch_size:
            if replay_buffer.get_length() > batch_size:
                for i in range(update_itr):
                    _ = td3_trainer.update(batch_size,
                                           deterministic=DETERMINISTIC,
                                           eval_noise_scale=eval_noise_scale,
                                           reward_scale=reward_scale)

            if eps % 10 == 0 and eps > 0:
                # plot(rewards, id)
                td3_trainer.save_model(model_path)

            if done:
                break

        print('Episode: ', eps, '| Episode Reward: ', episode_reward)
        if len(rewards) == 0:
            rewards.append(episode_reward)
        else:
            rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)
        rewards_queue.put(episode_reward)

    td3_trainer.save_model(model_path)
def test_multi_agent(mock_env):
    mock_brain = create_mock_group_spec()
    mock_braininfo = create_mock_vector_step_result(num_agents=2)
    setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)

    with pytest.raises(UnityGymException):
        UnityEnv(" ", multiagent=False)

    env = UnityEnv(" ", use_visual=False, multiagent=True)
    assert isinstance(env.reset(), list)
    actions = [env.action_space.sample() for i in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
    assert isinstance(obs, list)
    assert isinstance(rew, list)
    assert isinstance(done, list)
def test_closing(env_name):
    """
    Run the gym test and close the environment multiple times
    :param env_name: Name of the Unity environment binary to launch
    """
    try:
        env1 = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)
        env1.close()
        env1 = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)
        env2 = UnityEnv(env_name, worker_id=2, use_visual=False, no_graphics=True)
        env2.reset()
    finally:
        env1.close()
        env2.close()
def test_gym_wrapper(mock_env):
    mock_brain = create_mock_group_spec()
    mock_braininfo = create_mock_vector_step_result()
    setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)

    env = UnityEnv(" ", use_visual=False, multiagent=False)
    assert isinstance(env, UnityEnv)
    assert isinstance(env.reset(), np.ndarray)
    actions = env.action_space.sample()
    assert actions.shape[0] == 2
    obs, rew, done, info = env.step(actions)
    assert env.observation_space.contains(obs)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    assert isinstance(done, (bool, np.bool_))
def test_gym_wrapper(mock_env):
    mock_brain = create_mock_brainparams()
    mock_braininfo = create_mock_vector_braininfo()
    setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)

    env = UnityEnv(" ", use_visual=False, multiagent=False)
    assert isinstance(env, UnityEnv)
    assert isinstance(env.reset(), np.ndarray)
    actions = env.action_space.sample()
    assert actions.shape[0] == 2
    obs, rew, done, info = env.step(actions)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    assert isinstance(done, bool)
    assert isinstance(info, dict)
class UnityEnvWrapper(gym.Env):
    def __init__(self, env_config):
        self.vector_index = env_config.vector_index
        self.worker_index = env_config.worker_index
        self.worker_id = env_config["unity_worker_id"] + env_config.worker_index
        # Name of the Unity environment binary to launch
        env_name = '/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux'
        self.env = UnityEnv(env_name,
                            worker_id=self.worker_id,
                            use_visual=False,
                            multiagent=False,
                            no_graphics=True)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    def reset(self):
        return self.env.reset()

    def step(self, action):
        return self.env.step(action)
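# Hypothetical usage sketch (not part of the original snippet): registering the
# wrapper above with Ray/RLlib so a trainer can create it by name. Assumes Ray
# and RLlib are installed; the config values ("unity_basic", unity_worker_id=10)
# are illustrative, not taken from the source.
import ray
from ray import tune
from ray.tune.registry import register_env

# env_config arrives as an RLlib EnvContext, which supports both attribute
# access (worker_index, vector_index) and dict-style access, as used above.
register_env("unity_basic", lambda env_config: UnityEnvWrapper(env_config))

ray.init()
tune.run(
    "PPO",
    config={
        "env": "unity_basic",
        "env_config": {"unity_worker_id": 10},
        "num_workers": 1,
    },
)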
def test_multi_agent(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0, stack=False, num_agents=2)

    # Test for incorrect number of agents.
    with pytest.raises(UnityGymException):
        UnityEnv(' ', multiagent=False)

    env = UnityEnv(' ', use_visual=False, multiagent=True)
    assert isinstance(env.reset(), list)
    actions = [env.action_space.sample() for i in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
    assert isinstance(obs, list)
    assert isinstance(rew, list)
    assert isinstance(done, list)
    assert isinstance(info, dict)
def test_gym_wrapper_visual(mock_env, use_uint8):
    mock_spec = create_mock_group_spec(number_visual_observations=1)
    mock_decision_step, mock_terminal_step = create_mock_vector_steps(
        mock_spec, number_visual_observations=1)
    setup_mock_unityenvironment(mock_env, mock_spec, mock_decision_step,
                                mock_terminal_step)

    env = UnityEnv(" ", use_visual=True, uint8_visual=use_uint8)
    assert isinstance(env, UnityEnv)
    assert isinstance(env.reset(), np.ndarray)
    actions = env.action_space.sample()
    assert actions.shape[0] == 2
    obs, rew, done, info = env.step(actions)
    assert env.observation_space.contains(obs)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    assert isinstance(done, (bool, np.bool_))
    assert isinstance(info, dict)
def test_gym_wrapper(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(
        discrete_action=False, visual_inputs=0, stack=False, num_agents=1)

    # Test for incorrect number of agents.
    with pytest.raises(UnityGymException):
        UnityEnv(' ', use_visual=False, multiagent=True)

    env = UnityEnv(' ', use_visual=False)
    assert isinstance(env, UnityEnv)
    assert isinstance(env.reset(), np.ndarray)
    actions = env.action_space.sample()
    assert actions.shape[0] == 2
    obs, rew, done, info = env.step(actions)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    assert isinstance(done, bool)
    assert isinstance(info, dict)
class TEST():
    def __init__(self, n_episodes, env_name, model):
        # Nº of episodes
        self.n_episodes = n_episodes

        # Environment
        self.env_name = env_name
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=False)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)

        # Model
        self.model = ActorCritic(self.state_size, self.action_size, seed=0).to(device)

        # Initialize time step (for updating every "update_every" time steps)
        self.t_step = 1

        # Start test
        self.load_model(model)
        self.test()

    def test(self):
        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(1, 100)

        # Episodes done
        n_done = 0

        # Test loop
        while n_done <= self.n_episodes:
            # Action of agent
            action, value = self.act(state)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Update t_step
            self.t_step += 1

            # Update n_done
            if done:
                n_done += 1

            # Next state
            state = next_state

            # Update the score
            reward_ = np.expand_dims(reward, axis=0)
            value_ = value.unsqueeze(0)
            done_ = np.expand_dims(done, axis=0)
            self.data.update_score(reward_, value_, done_, self.t_step)

            # Summary
            if done:
                self.data.summary(self.t_step)

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get action probabilities and value from the ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)

        # Get action and log of probabilities
        action = prob.multinomial(num_samples=1)

        return action, value
env = UnityEnv(environment_filename=env_name,
               worker_id=0,
               use_visual=False,
               multiagent=True)
print(str(env))

state_size = env.observation_space.shape[0]
'''if (env.observation_space.shape[2] == 3):
    plt.imshow(env.observation_space[0, :, :, :])'''
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
# agent.load("./save/cartpole-ddqn.h5")
done = False
batch_size = 332

for e in range(EPISODES):
    state = env.reset()
    # print(state.shape)
    state = np.reshape(state, [3, state_size, state_size])
    print("_____________state size______")
    print(state_size)
    for time in range(500):
        # env.render()
        # action = agent.act(state)
        actionlst = []
        i = 0
        while (i < 32):
            action = agent.act(state)
            actionlst.append(action)
            i = i + 1
        next_state, reward, done, _ = env.step(action)
def cartpole():
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=2,
                   use_visual=False,
                   multiagent=True)
    score_logger = ScoreLogger(ENV_NAME)
    agents_brain = []
    agents_action = []
    index_list = []
    agents_alive = []
    count = 0
    count1 = 0
    num_agents = env.number_agents
    print("___________Number of agents in cartpole __")
    print(num_agents)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    print("__dqn solver______")
    print(dqn_solver)
    # model = tf.keras.models.load_model("")
    for x in range((env.number_agents)):
        agents_brain.append(dqn_solver)
    print("______agentbrain____")
    print(agents_brain)
    print("_Agent action___")
    print(agents_action)
    learning_brain = copy.deepcopy(agents_brain)
    run = 0
    state = env.reset()
    initialstate = copy.deepcopy(state)
    while True:
        run += 1
        env.reset()
        print("____________STATE____________")
        print(state[0])
        state = copy.deepcopy(initialstate)
        agents_brain = []
        agents_action = []
        index_list = []
        agents_alive = []
        count = 0
        count1 = 0
        num_agents = int(state[0][-5])
        agents_brain = copy.deepcopy(learning_brain)
        print(learning_brain)
        print(agents_brain)
        print(state)
        # for x in range((env.number_agents - 1)):
        step = 0
        while True:
            step += 1
            env.render()
            print("___________State Length_______")
            print(len(state))
            print("______selfish___")
            print(state[0])
            agents_action = [1] * len(state)
            copied_agents_alive = copy.deepcopy(agents_alive)
            print("__________numagents_____")
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[x].act(state[x])
            print(agents_action)
            state_next, reward, terminal, info = env.step(agents_action, num_agents)
            print("_______Reward________")
            print(reward)
            print("_____________NEXT STATE LENGTH____________")
            print(len(state_next))
            if (len(state_next) == 0):
                break
            agents_alive = state_next[0][-13:-5]
            num_agents = int(state_next[0][-5])
            print("_______num agents in cartpole________")
            print(num_agents)
            print("_____index list")
            print(index_list)
            print(agents_alive)
            agents_alive1 = np.delete(agents_alive, index_list)
            print("_______Alive agent list_______")
            print(agents_alive1)
            flag = False
            # del agents_alive[index_list[x]]
            for x in range(len(agents_alive)):
                if (agents_alive[x] == float(1)):
                    for y in range(len(index_list)):
                        if (index_list[y] == x):
                            flag = True
                    if (flag == False):
                        index_list.append(x)
                    flag = False
            index_to_remove = []
            for x in range(len(agents_alive1)):
                if (agents_alive1[x] == float(1)):
                    learning_brain[index_list[count]] = agents_brain[x]
                    index_to_remove.append(x)
                    count = count + 1
            agents_brain = [
                i for j, i in enumerate(agents_brain)
                if j not in index_to_remove
            ]
            print("____________AGENTS_BRAIN_________")
            print(len(agents_brain))
            print("_______________Terminal_____________")
            print(terminal)
            if (terminal[0] == True):
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " + str(step))
                score_logger.add_score(step, run)
                for x in range(len(copied_agents_alive)):
                    learning_brain[x] = agents_brain[count1]
                    count1 = count1 + 1
                for x in range(len(learning_brain)):
                    learning_brain[x].save(str(run) + "brain" + str(x) + ".h5")
                break
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x], [1, observation_space])
                agents_brain[x].remember(state[x], agents_action[x], reward[x],
                                         state_next[x], terminal[x])
                agents_brain[x].experience_replay()
            state = state_next
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = UnityEnv(env_name, worker_id=wid, use_visual=True, use_both=True)
        # self.env = Reacher(render=True)
        self.ppo = GLOBAL_PPO
        self.pins_x = []
        self.pins_y = []

    def ImgProcess(self, img, Done=False):
        cimg, edge_detected_image, contour_centers = image_processing(img)
        # cimg = large_circle_detect(cimg, edge_detected_image)  # this consumes most time
        cimg, VALID_DETECT = contour_center_check(contour_centers, cimg, NUM_PINS=NUM_PINS)
        # cv2.imwrite(save_path + str(filename), cimg)
        contour_centers = CenterRegister(contour_centers)
        if VALID_DETECT:  # pins detection correct
            reshape_contour_centers = np.array(contour_centers).transpose()
            self.pins_x.append(reshape_contour_centers[0])
            self.pins_y.append(reshape_contour_centers[1])

        reshape_pins_x = np.array(self.pins_x).transpose()
        reshape_pins_y = np.array(self.pins_y).transpose()
        displacement_pins_x = self.pins_x[-1] - self.pins_x[0]
        displacement_pins_y = self.pins_y[-1] - self.pins_y[0]

        plt.figure(1)
        for i in range(NUM_PINS):
            plt.subplot(211)
            plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i])
            plt.title('Position')
            plt.subplot(212)
            plt.plot(np.arange(len(self.pins_x)),
                     reshape_pins_x[i] - reshape_pins_x[i][0])
            plt.title('Displacement')
        plt.tight_layout()
        plt.savefig('./ppo_pins.png')
        if Done:
            plt.clf()

        # return pins position x, y for current frame, displacement of pins position x, y
        return self.pins_x[-1], self.pins_y[-1], displacement_pins_x, displacement_pins_y

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s, info = self.env.reset()

            # image processing
            img = (s[:, :, 0] * 255).astype(np.uint8)
            try:
                pins_x, pins_y, pins_dis_x, pins_dis_y = self.ImgProcess(img, Done=False)
            except:
                print('Image Processing Error!')
            s = np.concatenate((pins_dis_x, pins_dis_y))

            # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
            # s = vector_s
            # print(s.shape, info["brain_info"].vector_observations[0, :])

            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            self.pins_x = []
            self.pins_y = []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    # clear history buffer, use new policy to collect data
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.ppo.choose_action(s)
                s_, r, done, info = self.env.step(a)

                # implementation of plot version one, deprecated
                # plt.imshow(s_[:, :, 0])
                # plt.show()
                # plt.savefig('./img256_test/tac_test' + str(step) + str(t) + '.png')

                # image size of plt is not exactly the original array size, but with
                # axis etc; therefore use Image -- implementation of plot version two
                # im = Image.fromarray((s_[:, :, 0] * 255).astype(np.uint8))
                # im.save('./img256f_r30/tac' + str(step) + str(t) + '.png')

                # image processing
                img = (s_[:, :, 0] * 255).astype(np.uint8)
                if t > EP_LEN - 1:
                    Done = True
                else:
                    Done = False
                try:
                    pins_x, pins_y, pins_dis_x, pins_dis_y = self.ImgProcess(img, Done)
                except:
                    print('Image Processing Error!')
                s_ = np.concatenate((pins_dis_x, pins_dis_y))

                # get the vector observation
                # vector_s = info["brain_info"].vector_observations[0, :]
                # s_ = vector_s
                # print('a: ', a)
                # print('s: ', s_)
                # print('r: ', r)        # scalar
                # print('done: ', done)  # True/False
                # s = s.reshape(-1)      # convert from 3D to 1D

                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8) / 8)  # normalize reward, find to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), \
                        np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # global PPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
            step_set.append(step)
            # print(step)
            epr_set.append(ep_r)
K.set_session(sess)

# Setting up the env
# TODO: Worker_id can be changed to run in parallel
# Flatten_branched gives us a onehot encoding of all 54 action combinations.
print("Opening unity env")
env = UnityEnv(
    "../unity_envs/kais_banana_with_explicit_charge_decision_red_battery_300_timesteps",
    worker_id=22,
    use_visual=True,
    flatten_branched=True,
    seed=seed
)  # KOE: Note: If I accept images as uint8_visual=True, I have to convert to float later.

print("Resetting env")
initial_observation = env.reset()

# KOETODO: This would have to be manually configured for each environment.
battery = 100  # [Health]
prev_battery = battery

# game.get_available_buttons_size()  # [Turn Left, Turn Right, Move Forward]
print("Action space is: ", env.action_space)
action_size = env.action_space.n
print("Env has ", action_size, " actions.")
measurement_size = 3  # [Battery, poison, food]
timesteps = [1, 2, 4, 8, 16, 32]  # For long horizon: [4, 8, 16, 32, 64, 128]
goal_size = measurement_size * len(timesteps)

img_rows, img_cols = 84, 84  # KOE: Think this is still correct.
# Convert image into black and white
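# Hypothetical sketch (not from the original source) of the black-and-white
# conversion hinted at by the comment above: turns an RGB visual observation
# into an (img_rows, img_cols) single-channel float image using only numpy.
# The luminance weights and nearest-neighbour resize are assumptions.
import numpy as np

def to_grayscale(obs, img_rows=84, img_cols=84):
    # obs is expected to be HxWx3, either floats in [0, 1] or uint8 in [0, 255]
    obs = obs.astype(np.float32)
    if obs.max() > 1.0:
        obs = obs / 255.0
    # weighted luminance combination of the RGB channels
    gray = obs[..., 0] * 0.299 + obs[..., 1] * 0.587 + obs[..., 2] * 0.114
    # naive nearest-neighbour resize to (img_rows, img_cols)
    row_idx = np.linspace(0, gray.shape[0] - 1, img_rows).astype(int)
    col_idx = np.linspace(0, gray.shape[1] - 1, img_cols).astype(int)
    return gray[np.ix_(row_idx, col_idx)]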
class PPO():
    def __init__(self):
        # Hyperparameters
        self.learning_rate = 0.0003
        self.betas = (0.9, 0.999)
        self.gamma = 0.99
        self.eps_clip = 0.2
        self.buffer_size = 2048
        self.batch_size = 256
        self.K_epochs = 3
        self.max_steps = 100000
        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5
        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=True)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents
        print("Nº of Agents: ", self.n_agents)

        # Model
        self.model = ActorCritic(self.state_size, self.action_size, seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate,
                                    betas=self.betas)
        self.MseLoss = nn.MSELoss()

        # Buffer memory
        self.memory = []
        for _ in range(self.n_agents):
            self.memory.append(Buffer())

        # Initialize time step (for updating when buffer_size is full)
        self.t_step = 1

    def train(self):
        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):
            action = []
            logprobs = []
            value = []

            # Action of each agent
            for i in range(self.n_agents):
                a, b, c = self.act(state[i])
                action.append(a)
                logprobs.append(b)
                value.append(c)

            # Send the actions to the environment
            next_state, reward, done, info = self.env.step(action)

            # Done
            done_ = []
            for i in range(self.n_agents):
                done_.append(1 - done[i])

            # Agent step
            for i in range(self.n_agents):
                self.step(state[i], action[i], reward[i], next_state[i],
                          done_[i], logprobs[i], value[i], self.memory[i])

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score
            self.data.update_score(reward, value, done, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get action probabilities and value from the ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)
        log_probs = F.log_softmax(action_probs, -1)

        # Get action and log of probabilities
        action = prob.multinomial(num_samples=1)
        log_probs = log_probs.gather(1, action)

        return action, log_probs, value

    def step(self, state, action, reward, next_state, done, logprobs, value, memory):
        # Update the model when buffer_size is full
        if memory.len_() == (self.buffer_size / self.n_agents):
            self.learn()
            for i in range(self.n_agents):
                self.memory[i].reset()

        # Save experience in buffer memory
        memory.add(state, action, reward, next_state, done, logprobs, value)

    def evaluate(self, states, next_states, actions, rewards, masks, compute_gae):
        logits, values = self.model(states)
        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)
        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        values_ = values
        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        returns = []
        if (compute_gae):
            gae = torch.zeros(1, 1)
            for i in reversed(range(len(rewards))):
                # Generalized Advantage Estimation
                delta_t = rewards[i] + self.gamma * masks[i] * values[i + 1].data - values[i].data
                gae = gae * self.gamma * self.tau * masks[i] + delta_t
                returns.insert(0, gae + values[i])

        return log_probs, values_, entropies, returns

    def compute_returns(self):
        returns_ = []
        for i in range(self.n_agents):
            # Get experiences (of each agent)
            experiences = self.memory[i].get()
            states, actions, rewards, next_states, dones, logprobs_, values_ = experiences

            # Evaluate
            _, _, _, r = self.evaluate(states, next_states, actions, rewards,
                                       dones, compute_gae=True)
            returns_.append(r)

        l = []
        for i in range(len(returns_)):
            for j in range(len(returns_[0])):
                l.append(returns_[i][j])

        return l

    def learn(self):
        # Get experiences
        states, actions, rewards, next_states, dones, logprobs_, values_ = self.getExp()
        returns_eval = self.compute_returns()
        returns_eval = torch.tensor(returns_eval).to(device)
        returns_eval = returns_eval.unsqueeze(1)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # List with all indices
            l = np.arange(self.buffer_size)
            l = list(l)
            x = self.buffer_size // self.batch_size
            for _ in range(x):
                # Take a random batch
                indices = random.sample(l, self.batch_size)

                old_logprobs = torch.empty(self.batch_size, 1)
                old_values = torch.empty(self.batch_size, 1)
                old_actions = torch.empty(self.batch_size)
                old_states = torch.empty(self.batch_size, self.state_size)
                old_next_states = torch.empty(self.batch_size, self.state_size)
                old_rewards = np.zeros(self.batch_size)
                returns = torch.empty(self.batch_size, 1)

                for i in range(len(indices)):
                    old_logprobs[i] = logprobs_[indices[i]]
                    old_values[i] = values_[indices[i]]
                    old_actions[i] = actions[indices[i]]
                    old_states[i] = states[indices[i]]
                    old_next_states[i] = next_states[indices[i]]
                    old_rewards[i] = rewards[indices[i]]
                    returns[i] = returns_eval[indices[i]]

                old_actions = old_actions.long()

                # Remove indices so they are not repeated
                for i in indices:
                    l.remove(i)

                # Evaluate
                logprobs, state_values, dist_entropy, _ = self.evaluate(
                    old_states, old_next_states, old_actions, rewards, dones,
                    compute_gae=False)

                # Finding the ratio (pi_theta / pi_theta__old)
                ratios = torch.exp(logprobs - old_logprobs)

                # Finding the surrogate loss
                advantages = returns - old_values
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

                # LOSS = ACTOR LOSS + CRITIC_DISCOUNT * CRITIC_LOSS - ENTROPY_BETA * ENTROPY
                loss = -torch.min(surr1, surr2) \
                    + self.value_loss_coef * self.MseLoss(state_values, returns) \
                    - self.entropy_coef * dist_entropy

                # Optimizer step
                self.optimizerStep(self.optimizer, loss.mean())

    def optimizerStep(self, optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def getExp(self):
        states, actions, rewards, next_states, dones, logprobs, values = [], [], [], [], [], [], []
        for i in range(self.n_agents):
            experiences = self.memory[i].get()
            states.append(experiences[0])
            actions.append(experiences[1])
            rewards.append(experiences[2])
            next_states.append(experiences[3])
            dones.append(experiences[4])
            logprobs.append(experiences[5])
            values.append(experiences[6])

        states_, actions_, rewards_, next_states_, dones_, logprobs_, values_ = [], [], [], [], [], [], []
        for i in range(len(states)):
            for j in range(len(states[0])):
                states_.append(states[i][j])
                actions_.append(actions[i][j])
                rewards_.append(rewards[i][j])
                next_states_.append(next_states[i][j])
                dones_.append(dones[i][j])
                logprobs_.append(logprobs[i][j])
                values_.append(values[i][j])

        states__ = torch.empty(self.buffer_size, self.state_size)
        actions__ = torch.empty(self.buffer_size)
        next_states__ = torch.empty(self.buffer_size, self.state_size)
        dones__ = torch.empty(self.buffer_size)
        logprobs__ = torch.empty(self.buffer_size, 1, 1)
        values__ = torch.empty(self.buffer_size)
        for i in range(self.buffer_size):
            states__[i] = states_[i]
            actions__[i] = actions_[i]
            next_states__[i] = next_states_[i]
            dones__[i] = dones_[i]
            logprobs__[i] = logprobs_[i]
            values__[i] = values_[i]

        return states__, actions__, rewards_, next_states__, dones__, logprobs__, values__
class AC():
    def __init__(self):
        # Hyperparameters
        self.learning_rate = 0.0003
        self.gamma = 0.99
        self.batch_size = 256
        self.max_steps = 100000
        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5
        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=False)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents

        # Model
        self.model = ActorCritic(self.state_size, self.action_size, seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Buffer memory
        self.memory = Buffer()

        # Initialize time step (for updating every "batch_size" time steps)
        self.t_step = 1

    def train(self):
        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):
            # Action of agent
            action, value = self.act(state)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Agent step
            self.step(state, action, reward, next_state, done)

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score
            reward_ = np.expand_dims(reward, axis=0)
            value_ = value.unsqueeze(0)
            done_ = np.expand_dims(done, axis=0)
            self.data.update_score(reward_, value_, done_, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get action probabilities and value from the ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)

        # Get action and log of probabilities
        action = prob.multinomial(num_samples=1)

        return action, value

    def step(self, state, action, reward, next_state, done):
        # Save experience in buffer memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every "batch_size" time steps
        if self.t_step % self.batch_size == 0:
            experiences = self.memory.get()
            self.learn(experiences)
            self.memory.reset()

    def learn(self, experiences):
        # Get experiences
        states, actions, rewards, next_states = experiences

        logits, values = self.model(states)
        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)
        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        policy_loss = 0
        value_loss = 0
        R = values[-1]
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = self.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + self.gamma * values[i + 1].data - values[i].data
            gae = gae * self.gamma * self.tau + delta_t

            policy_loss = policy_loss - (log_probs[i] * gae) - (self.entropy_coef * entropies[i])

        # Loss
        loss = (policy_loss + self.value_loss_coef * value_loss)

        # Optimizer step
        self.optimizerStep(self.optimizer, loss)

    def optimizerStep(self, optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
from tensorflow.python import keras
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.optimizers import Adam
# from tensorflow.python.keras._impl.keras.optimizers import Adam
from tensorflow.python.layers.core import Dense
import matplotlib.pyplot as plt
import sys

from gym_unity.envs import UnityEnv

multi_env_name = "D:/ml-agents-0.8.0/UnitySDK/A.exe"
multi_env = UnityEnv(multi_env_name, worker_id=1,
                     use_visual=False, multiagent=True)

# Examine environment parameters
print(str(multi_env))

# Reset the environment
initial_observations = multi_env.reset()

if len(multi_env.observation_space.shape) == 1:
    # Examine the initial vector observation
    print("Agent observations look like: \n{}".format(initial_observations[0]))
else:
    # Examine the initial visual observation
    print("Agent observations look like:")
    if multi_env.observation_space.shape[2] == 3:
        plt.imshow(initial_observations[0][:, :, :])
    else:
        plt.imshow(initial_observations[0][:, :, 0])

for episode in range(10):
    initial_observation = multi_env.reset()
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self, fn):
        self.model.save(fn)


env_name = "../env/GridWorld.x86_64"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name, worker_id=0, use_visual=True)

episodes = 2000
# updateTargetNetwork = 1000
dqn_agent = DQN(env=env)
steps = []
env.reset()
for ep in range(episodes):
    cur_state = np.array([env.reset()])
    done = False
    ep_reward = 0
    n_steps = 0
    while not done:
        action = dqn_agent.act(cur_state)
        new_state, reward, done, info = env.step(action)
        new_state = np.array([new_state])
        ep_reward += reward
        # reward = reward if not done else -20
        dqn_agent.remember(cur_state, action, reward, new_state, done)
# Environment name
# Remember to put the battle royale environment configuration within the config folder
env_name = "environment/battle-royale-static"
env = UnityEnv(env_name, worker_id=4, use_visual=False, multiagent=True)
print(str(env))

# ## Examine Observation Space

# In[3]:

# Examine observation space
observation = env.observation_space
env.reset()
print("Agent observation space type: {}".format(observation))

# ## Examine Action Space

# In[4]:

# Examine action space
action = env.action_space
print("Agent action space type: {}".format(action))

# ## Agents Training
# This part shows agent training using the MADDPG algorithm

# ### Setup Algorithm Dependencies
def cartpole():
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=5,
                   use_visual=False,
                   multiagent=True)
    score_logger = ScoreLogger(ENV_NAME)
    agents_brain = []
    agents_action = []
    num_agents = env.number_agents
    observation_space = env.observation_space.shape[0]
    print("____________Observation_space")
    print(observation_space)
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    for x in range((env.number_agents)):
        agents_brain.append(DQNSolver(observation_space, action_space))
    print("Length of BrainList: ", len(agents_brain))
    run = 0
    state = env.reset()
    print("______INITIAL______")
    print(state)
    # initialstate = copy.deepcopy(state)
    print("*****************************initial state for unity environment**************")
    # print(initialstate)
    jk = 1
    while True:
        run += 1
        state = env.reset()
        # state = copy.deepcopy(initialstate)
        num_agents = int(state[0][-5])
        print("_____________State _______________")
        print(int(state[0][12]))
        step = 0
        print("################################This is loop################################# :", jk)
        while True:
            step += 1
            env.render()
            agents_action = [1] * len(state)
            print(state[0])
            print("*******************Length of state******************")
            print(len(state))
            for x in range(len(state)):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[int(state[x][0, 12]) - 1].act(state[x])
            print("Agents Actions List: ", agents_action)
            state_next, reward, terminal, info = env.step(agents_action)
            # print("_____________STATE_NEXT___________")
            # print(state_next)
            if (len(state_next) == 0):
                break
            agents_alive = state_next[0][-13:-5]
            print("Agents_alive: ", agents_alive)
            print("Rewards: ", reward)
            num_agents = int(state_next[0][-5])
            print("Number of agents: ", num_agents)
            print("_________Terminal list_______", terminal)
            if (terminal[0] == True):
                print("**************************Brain saved******************************")
                for x in range(len(agents_brain)):
                    agents_brain[x].save(str(run) + "brain" + str(x) + ".h5")
                jk += 1
                print("#####################################Loop is######################## :", jk)
                # break
            for x in range(len(state_next)):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x], [1, observation_space])
                agents_brain[int(state_next[x][0, 12]) - 1].remember(
                    state[x], agents_action[x], reward[x], state_next[x], terminal[x])
                agents_brain[int(state_next[x][0, 12]) - 1].experience_replay()
            state = state_next
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = UnityEnv(env_name, worker_id=wid, use_visual=False, use_both=True)
        # self.env = Reacher(render=True)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s, info = self.env.reset()
            s = s[:8]
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            self.pins_x = []
            self.pins_y = []
            self.pins_z = []
            self.object_x = []
            self.object_y = []
            self.object_z = []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    # clear history buffer, use new policy to collect data
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.ppo.choose_action(s)
                s_, r, done, info = self.env.step(a)
                # print(np.array(s_).shape)

                # plot pins
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)  # normalize reward, find to be useful
                pins_x = s_[6::3]
                pins_z = s_[8::3]
                self.object_x.append(s_[0])
                self.object_z.append(s_[2])
                self.pins_x.append(pins_x)
                self.pins_z.append(pins_z)
                relative_x = pins_x - s_[0]
                relative_z = pins_z - s_[2]
                dis = (relative_x - (self.pins_x[0] - self.object_x[0]))**2 + \
                      (relative_z - (self.pins_z[0] - self.object_z[0]))**2
                min_idx = np.argmin(dis)
                max_idx = np.argmax(dis)
                # add relative position of the pin with smallest deformation
                # s_ = np.append(s_[:6], relative_x[min_idx])
                # s_ = np.append(s_, relative_z[min_idx])
                s_ = np.append(s_[:6], relative_x[max_idx])
                s_ = np.append(s_, relative_z[max_idx])

                s = s_
                ep_r += r
                # print('minimal displacement idx: ', min_idx)

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), \
                        np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # global PPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

                    if GLOBAL_EP % 50 == 0 and GLOBAL_EP > 0:
                        self.ppo.save(model_path)

            reshape_pins_x = np.array(self.pins_x).transpose()
            reshape_pins_z = np.array(self.pins_z).transpose()
            plt.clf()
            for i in range(NUM_PINS):
                plt.subplot(411)
                plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i])
                plt.title('X-Position')
                plt.subplot(412)
                plt.plot(np.arange(len(self.pins_z)), reshape_pins_z[i])
                plt.title('Y-Position')  # although it's z, to match reality, use y
                plt.subplot(413)
                # plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i] - self.object_x)
                # plt.title('X-Relative')
                # plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i] - reshape_pins_x[i][0])
                # plt.title('X-Displacement')
                plt.plot(np.arange(len(self.pins_x)),
                         (reshape_pins_x[i] - self.object_x) - (reshape_pins_x[i][0] - self.object_x[0]))
                plt.title('X-Displacement')
                plt.subplot(414)
                plt.plot(np.arange(len(self.pins_x)),
                         (reshape_pins_z[i] - self.object_z) - (reshape_pins_z[i][0] - self.object_z[0]))
                plt.title('Y-Displacement')
                plt.xlabel('Time Step')
            plt.tight_layout()
            plt.savefig('./ppo_pins.png')

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
            step_set.append(step)
            # print(step)
            epr_set.append(ep_r)

            if step % 10 == 0:  # plot every N episode; some error about main thread for plotting
                plt.clf()
                plt.plot(step_set, epr_set)
                plt.xlabel('Episode')
                plt.ylabel('Reward')
                try:
                    plt.savefig('./tac_pins8.png')
                except:
                    print('writing conflict!')
def cartpole():
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=1,
                   use_visual=False,
                   multiagent=True)
    # score_logger = ScoreLogger(ENV_NAME)
    agents_brain = []
    agents_action = []
    pathname = "C:/HinaProgramm/testingFolder/Unity Environment"
    num_agents = env.number_agents
    print("Number of agents in environment: ", num_agents)
    observation_space = env.observation_space.shape[0]
    print("____________Observation_space______________")
    print(observation_space)
    print("__________Action Space________________")
    action_space = env.action_space.n
    print(action_space)
    dqn_solver = DQNSolver(observation_space, action_space)
    for x in range((num_agents)):
        agents_brain.append(DQNSolver(observation_space, action_space))
    print("Length of BrainList: ", len(agents_brain))
    run = 0
    state = env.reset()
    # print("______INITIAL______")
    # print(state)
    initialstate = copy.deepcopy(state)
    # print("*****************************initial state for unity environment**************")
    # print(initialstate)
    jk = 1
    sharecount = 0
    eatcount = 0
    filecount = 0
    # f = str(filecount) + "sahre.csv"
    f = open(str(filecount) + "sahre.csv", 'ab')
    # J = str(filecount) + "eat.csv"
    J = open(str(filecount) + "eat.csv", 'ab')
    while True:
        run += 1
        env.reset()
        state = copy.deepcopy(initialstate)
        num_agents = int(state[0][-8])
        print("_numagents__________", num_agents)
        print("_____________State _______________")
        print(int(state[0][12]))
        step = 0
        print("################################This is loop################################# :", jk)
        print("_____Run _______ :", run)
        while True:
            # print("************Number of agents *********")
            # print(env.number_agents)
            step += 1
            env.render()
            agents_action = [1] * len(state)
            # print(state[0])
            # print("*******************Length of state******************")
            # print(len(state))
            for x in range(len(state)):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[int(state[x][0, 12]) - 1].act(state[x])
            sharecount += agents_action.count(5)
            eatcount += agents_action.count(6)
            # print("Agents Actions List: ", agents_action)
            state_next, reward, terminal, info = env.step(agents_action)
            for x in range(len(agents_action)):
                if (agents_action[x] == 5):
                    new = np.asarray([state_next[x]])
                    np.savetxt(f, new, delimiter=",")
                    # f.write(str(state_next[x]) + "\r\n")
                if (agents_action[x] == 6):
                    # J.write(str(state_next[x]) + "\r\n")
                    new = np.asarray([state_next[x]])
                    np.savetxt(J, new, delimiter=",")
            print("_____________STATE_NEXT___________")
            print(state_next)
            if (len(state_next) == 0):
                # f.write(str(sharecount))
                # J.write(str(eatcount))
                # f.close()
                # J.close()
                filecount += 1
                np.savetxt(f, sharecount, delimiter=",")
                np.savetxt(J, eatcount, delimiter=",")
                break
            agents_alive = state_next[0][-16:-8]
            print("Agents_alive: ", agents_alive)
            print("Rewards: ", reward)
            num_agents = int(state_next[0][-8])
            print("Number of agents: ", num_agents)
            # print("_________Terminal list_______", terminal)
            if (terminal[0] == True):
                print("**************************Brain saved******************************")
                for x in range(len(agents_brain)):
                    agents_brain[x].model.save(pathname + str(run) + "brain" + str(x) + ".h5")
                jk += 1
                print("#####################################Loop is######################## :", jk)
                # f.write(str(sharecount))
                # J.write(str(eatcount))
                # f.close()
                # J.close()
                filecount += 1
                break
            for x in range(len(state_next)):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x], [1, observation_space])
                agents_brain[int(state_next[x][0, 12]) - 1].remember(
                    state[x], agents_action[x], reward[x], state_next[x], terminal[x])
                agents_brain[int(state_next[x][0, 12]) - 1].experience_replay()
            state = state_next
        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
    else:
        y, adv = 0, []
        terminals_reversed = terminal_array[1:][::-1]
        for step, dt in enumerate(reversed(x)):
            y = dt + gamma * y * (1 - terminals_reversed[step])
            adv.append(y)
        return np.array(adv)[::-1]


if __name__ == "__main__":
    env = UnityEnv('test.app', 0, use_visual=True)
    ppo = PPO(env)
    all_ep_r = []
    t = 0
    for ep in range(EP_MAX):
        s = env.reset()
        ep_r = 0
        done = False
        while not done:
            t += 1
            env.render()
            a, v = ppo.choose_action(s)
            s_, r, done, _ = env.step(a)
            ppo.buffer_s.append(s)
            ppo.buffer_a.append(a)
            ppo.buffer_r.append(r)
            ppo.buffer_v.append(v)
            ppo.buffer_done.append(done)
            s = s_
import numpy as np
import sys

from gym_unity.envs import UnityEnv

env_name = "../env/GridWorld.x86_64"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name)

# Examine environment parameters
print(str(env))

# Reset the environment
initial_observation = env.reset()

for episode in range(10):
    initial_observation = env.reset()
    done = False
    episode_rewards = 0
    while not done:
        observation, reward, done, info = env.step(env.action_space.sample())
        episode_rewards += reward
    print("Total reward this episode: {}".format(episode_rewards))

env.close()
# env = Reacher(render=True)
# env = UnityEnv(env_name, worker_id=10, use_visual=True, use_both=True)
# s, info = env.reset()
# for t in range(100):
#     # env.render()
#     s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))

GLOBAL_PPO.save(model_path)

if args.test:
    env = UnityEnv(env_name, worker_id=np.random.randint(0, 10),
                   use_visual=True, use_both=True)
    env.reset()
    GLOBAL_PPO = PPO()
    GLOBAL_PPO.load(model_path)
    test_steps = 200
    test_episode = 10
    for _ in range(test_episode):
        s, info = env.reset()
        # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
        # s = vector_s
        for t in range(test_steps):
            # env.render()
            s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))
            # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
'''
Defining the environment related constants
'''
# Number of discrete states (buckets) per state dimension
MAZE_SIZE = (5, 5)
NUM_BUCKETS = MAZE_SIZE  # one bucket per grid cell

# Number of discrete actions
NUM_ACTIONS = env.action_space.n  # ["N", "S", "E", "W"]

STATE_BOUNDS = [(0.0, 4.0), (0.0, 4.0)]
MAX_T = np.prod(MAZE_SIZE, dtype=int) * 100

q_table = np.load('q_table.npy')

env.render()

# Reset the environment
obv = env.reset()

# The initial state
state_0 = state_to_bucket(obv)
total_reward = 0

for t in range(MAX_T):
    # Select an action
    action = select_action(state_0, 0)

    # Execute the action
    obv, reward, done, _ = env.step(action)

    # Observe the result
    state = state_to_bucket(obv)
    [p.join() for p in processes]  # finished at the same time

    td3_trainer.save_model(model_path)
    print(rewards)

if args.test:
    # choose env
    # env_name = "./tac_follow_new4"
    # env_name = "tac_follow_new4_random02"
    env_name = "tac_follow_new4_random"
    env = UnityEnv(env_name, worker_id=22, use_visual=False, use_both=True)
    td3_trainer.load_model(model_path)
    eps_r = []
    for eps in range(20):
        state, info = env.reset()
        state0 = state
        state = state_process(state, state0)
        episode_reward = 0
        for step in range(max_steps):
            action = td3_trainer.policy_net.get_action(
                state, deterministic=DETERMINISTIC, explore_noise_scale=0.0)
            next_state, reward, done, info = env.step(action)
            reward += 100
            next_state = state_process(next_state, state0)
            episode_reward += reward
            state = next_state
            if done:
import time

from gym_unity.envs import UnityEnv

env_name = "../envs/Cat_2/Cats.exe"

print("\nwith no render")
env = UnityEnv(env_name, no_graphics=False, multiagent=True, worker_id=1)
"""
res = []
for j in range(10):
"""
print(str(env))
ini_obs = env.reset()

curr_t = time.time()
for i in range(10000):
    actions = [env.action_space.sample() for agent in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
res = time.time() - curr_t

print("\nTime for 10000 steps")
print(res)
print("\n\n")
"""
res = []