Example No. 1
def main(env_name):
    """
    Run the gym test using the specified environment
    :param env_name: Name of the Unity environment binary to launch
    """
    env = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)

    try:
        # Examine environment parameters
        print(str(env))

        # Reset the environment
        initial_observations = env.reset()

        if len(env.observation_space.shape) == 1:
            # Examine the initial vector observation
            print("Agent observations look like: \n{}".format(
                initial_observations))

        for _episode in range(10):
            env.reset()
            done = False
            episode_rewards = 0
            while not done:
                actions = env.action_space.sample()
                obs, reward, done, _ = env.step(actions)
                episode_rewards += reward
            print("Total reward this episode: {}".format(episode_rewards))
    finally:
        env.close()
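A minimal command-line entry point for the test above might look like the following sketch; the argparse wiring and argument name are assumptions, not part of the original example.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Path to the Unity environment binary (assumed argument name)
    parser.add_argument("env_name",
                        help="Name of the Unity environment binary to launch")
    args = parser.parse_args()
    main(args.env_name)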
Example No. 2
class Chaser_v1(Environment):
    unity_env_worker_id = 0

    def __init__(self, platform):
        if platform == OSName.MAC:
            env_filename = EnvironmentName.CHASER_V1_MAC.value
        elif platform == OSName.WINDOWS:
            env_filename = EnvironmentName.CHASER_V1_WINDOWS.value
        else:
            env_filename = None

        self.env = UnityEnv(environment_filename=env_filename,
                            worker_id=Chaser_v1.unity_env_worker_id,
                            use_visual=True,
                            multiagent=True).unwrapped
        self.increase_env_worker_id()
        super(Chaser_v1, self).__init__()
        self.action_shape = self.get_action_shape()
        self.state_shape = self.get_state_shape()

        self.cnn_input_height = self.state_shape[0]
        self.cnn_input_width = self.state_shape[1]
        self.cnn_input_channels = self.state_shape[2]

        self.observation_space = self.env.observation_space
        self.continuous = True

    @staticmethod
    def increase_env_worker_id():
        Chaser_v1.unity_env_worker_id += 1

    def get_n_states(self):
        n_states = 3
        return n_states

    def get_n_actions(self):
        n_actions = 3
        return n_actions

    def get_state_shape(self):
        return self.env.observation_space.shape

    def get_action_shape(self):
        return self.env.action_space.shape

    def reset(self):
        state = self.env.reset()
        return state

    def step(self, action):
        next_state, reward, done, info = self.env.step(action)

        adjusted_reward = reward

        return next_state, reward, adjusted_reward, done, info

    def close(self):
        self.env.close()
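A hedged usage sketch for the wrapper above, assuming the OSName enum from its imports; the episode length and the random actions are illustrative only.

env = Chaser_v1(OSName.MAC)
states = env.reset()                      # one observation per agent (multiagent=True)
for _ in range(100):                      # illustrative episode length
    # one random action per agent, sampled from the wrapped UnityEnv's action space
    actions = [env.env.action_space.sample() for _ in states]
    next_states, rewards, adjusted_rewards, dones, info = env.step(actions)
    states = next_states
    if all(dones):
        break
env.close()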
Example No. 3
def worker(id, td3_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size, explore_steps, \
            update_itr, explore_noise_scale, eval_noise_scale, reward_scale, DETERMINISTIC, hidden_dim, model_path):
    '''
    The function for sampling with multi-processing.
    '''
    print(td3_trainer, replay_buffer)

    env_name = "./tac_follow_new"
    env = UnityEnv(env_name, worker_id=id+15, use_visual=False, use_both=True)



    # training loop
    for eps in range(max_episodes):
        frame_idx = 0
        rewards = []
        episode_reward = 0
        state, info = env.reset()
        # state=state[:6]
        
        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = td3_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC, explore_noise_scale=explore_noise_scale)
            else:
                action = td3_trainer.policy_net.sample_action()
    
            try:
                next_state, reward, done, info = env.step(action)
                # next_state = next_state[:6]
            except KeyboardInterrupt:
                print('Finished')
                td3_trainer.save_model(model_path)
                break  # next_state is undefined after an interrupt, so stop this episode
    
            replay_buffer.push(state, action, reward, next_state, done)
            
            state = next_state
            episode_reward += reward
            frame_idx += 1
            
            
            # if len(replay_buffer) > batch_size:
            if replay_buffer.get_length() > batch_size:
                for i in range(update_itr):
                    _=td3_trainer.update(batch_size, deterministic=DETERMINISTIC, eval_noise_scale=eval_noise_scale, reward_scale=reward_scale)
            
            if eps % 10 == 0 and eps > 0:
                # plot(rewards, id)
                td3_trainer.save_model(model_path)
            
            if done:
                break
        print('Episode: ', eps, '| Episode Reward: ', episode_reward)
        if len(rewards) == 0: rewards.append(episode_reward)
        else: rewards.append(rewards[-1]*0.9+episode_reward*0.1)
        rewards_queue.put(episode_reward)

    td3_trainer.save_model(model_path)
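The function above is written for multi-process sampling; a hedged launch sketch using the standard multiprocessing module is shown below. The worker count is assumed, and the shared td3_trainer/replay_buffer objects plus the hyperparameters are assumed to be created elsewhere (e.g. as multiprocessing-aware objects).

from multiprocessing import Process, Queue

rewards_queue = Queue()
processes = []
for i in range(2):  # assumed number of sampling workers
    p = Process(target=worker,
                args=(i, td3_trainer, rewards_queue, replay_buffer,
                      max_episodes, max_steps, batch_size, explore_steps,
                      update_itr, explore_noise_scale, eval_noise_scale,
                      reward_scale, DETERMINISTIC, hidden_dim, model_path))
    p.start()
    processes.append(p)
for p in processes:
    p.join()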
Example No. 4
def test_multi_agent(mock_env):
    mock_brain = create_mock_group_spec()
    mock_braininfo = create_mock_vector_step_result(num_agents=2)
    setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)

    with pytest.raises(UnityGymException):
        UnityEnv(" ", multiagent=False)

    env = UnityEnv(" ", use_visual=False, multiagent=True)
    assert isinstance(env.reset(), list)
    actions = [env.action_space.sample() for i in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
    assert isinstance(obs, list)
    assert isinstance(rew, list)
    assert isinstance(done, list)
Example No. 5
def test_closing(env_name):
    """
    Run the gym test and close the environment multiple times
    :param env_name: Name of the Unity environment binary to launch
    """

    try:
        env1 = UnityEnv(env_name,
                        worker_id=1,
                        use_visual=False,
                        no_graphics=True)
        env1.close()
        env1 = UnityEnv(env_name,
                        worker_id=1,
                        use_visual=False,
                        no_graphics=True)
        env2 = UnityEnv(env_name,
                        worker_id=2,
                        use_visual=False,
                        no_graphics=True)
        env2.reset()
    finally:
        env1.close()
        env2.close()
Example No. 6
def test_gym_wrapper(mock_env):
    mock_brain = create_mock_group_spec()
    mock_braininfo = create_mock_vector_step_result()
    setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)

    env = UnityEnv(" ", use_visual=False, multiagent=False)
    assert isinstance(env, UnityEnv)
    assert isinstance(env.reset(), np.ndarray)
    actions = env.action_space.sample()
    assert actions.shape[0] == 2
    obs, rew, done, info = env.step(actions)
    assert env.observation_space.contains(obs)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    assert isinstance(done, (bool, np.bool_))
Example No. 7
def test_gym_wrapper(mock_env):
    mock_brain = create_mock_brainparams()
    mock_braininfo = create_mock_vector_braininfo()
    setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)

    env = UnityEnv(" ", use_visual=False, multiagent=False)
    assert isinstance(env, UnityEnv)
    assert isinstance(env.reset(), np.ndarray)
    actions = env.action_space.sample()
    assert actions.shape[0] == 2
    obs, rew, done, info = env.step(actions)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    assert isinstance(done, bool)
    assert isinstance(info, dict)
Example No. 8
    class UnityEnvWrapper(gym.Env):
        def __init__(self, env_config):
            self.vector_index = env_config.vector_index
            self.worker_index = env_config.worker_index
            self.worker_id = env_config["unity_worker_id"] + env_config.worker_index
            # Name of the Unity environment binary to launch
            env_name = '/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux'
            self.env = UnityEnv(env_name, worker_id=self.worker_id, use_visual=False, multiagent=False, no_graphics=True) #
            self.action_space = self.env.action_space
            self.observation_space = self.env.observation_space

        def reset(self):
            return self.env.reset()

        def step(self, action):
            return self.env.step(action)
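The wrapper above follows RLlib's env_config convention (worker_index/vector_index plus a user-supplied unity_worker_id); a hedged registration sketch, with an assumed base worker id and trainer, might look like this.

import ray
from ray import tune

ray.init()
tune.register_env("unity_env", lambda config: UnityEnvWrapper(config))
tune.run("PPO",
         config={
             "env": "unity_env",
             "env_config": {"unity_worker_id": 10},  # assumed base worker id
             "num_workers": 1,
         })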
Example No. 9
def test_multi_agent(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0,
                                                      stack=False,
                                                      num_agents=2)

    # Test for incorrect number of agents.
    with pytest.raises(UnityGymException):
        UnityEnv(' ', multiagent=False)

    env = UnityEnv(' ', use_visual=False, multiagent=True)
    assert isinstance(env.reset(), list)
    actions = [env.action_space.sample() for i in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)
    assert isinstance(obs, list)
    assert isinstance(rew, list)
    assert isinstance(done, list)
    assert isinstance(info, dict)
Example No. 10
def test_gym_wrapper_visual(mock_env, use_uint8):
    mock_spec = create_mock_group_spec(number_visual_observations=1)
    mock_decision_step, mock_terminal_step = create_mock_vector_steps(
        mock_spec, number_visual_observations=1)
    setup_mock_unityenvironment(mock_env, mock_spec, mock_decision_step,
                                mock_terminal_step)

    env = UnityEnv(" ", use_visual=True, uint8_visual=use_uint8)
    assert isinstance(env, UnityEnv)
    assert isinstance(env.reset(), np.ndarray)
    actions = env.action_space.sample()
    assert actions.shape[0] == 2
    obs, rew, done, info = env.step(actions)
    assert env.observation_space.contains(obs)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    assert isinstance(done, (bool, np.bool_))
    assert isinstance(info, dict)
Example No. 11
def test_gym_wrapper(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0,
                                                      stack=False,
                                                      num_agents=1)

    # Test for incorrect number of agents.
    with pytest.raises(UnityGymException):
        UnityEnv(' ', use_visual=False, multiagent=True)

    env = UnityEnv(' ', use_visual=False)
    assert isinstance(env, UnityEnv)
    assert isinstance(env.reset(), np.ndarray)
    actions = env.action_space.sample()
    assert actions.shape[0] == 2
    obs, rew, done, info = env.step(actions)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    assert isinstance(done, bool)
    assert isinstance(info, dict)
Example No. 12
class TEST():
    def __init__(self, n_episodes, env_name, model):

        # Number of episodes
        self.n_episodes = n_episodes

        # Environment
        self.env_name = env_name
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=False)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)

        # Model
        self.model = ActorCritic(self.state_size, self.action_size,
                                 seed=0).to(device)

        # Initialize time step (for updating every "update_every" time steps)
        self.t_step = 1

        # Start test
        self.load_model(model)
        self.test()

    def test(self):

        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(1, 100)

        # Episodes done
        n_done = 0

        # Test loop
        while n_done <= self.n_episodes:

            # Action of agent
            action, value = self.act(state)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Update t_step
            self.t_step += 1

            # Update n_done
            if done:
                n_done += 1

            # Next state
            state = next_state

            # Update the score
            reward_ = np.expand_dims(reward, axis=0)
            value_ = value.unsqueeze(0)
            done_ = np.expand_dims(done, axis=0)
            self.data.update_score(reward_, value_, done_, self.t_step)

            # Summary
            if done:
                self.data.summary(self.t_step)

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get actions probabilities and value from ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)

        # Get action and log of probabilities
        action = prob.multinomial(num_samples=1)

        return action, value
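A hedged usage sketch for the tester above; the environment path and model file below are placeholders for illustration only.

tester = TEST(n_episodes=10,
              env_name="Environments/env1/Unity Environment",  # placeholder path
              model="Saved Models/model.pth")                  # placeholder model file
# the constructor loads the model and runs the test loop itself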
Example No. 13
    env = UnityEnv(environment_filename=env_name,
                   worker_id=0,
                   use_visual=False,
                   multiagent=True)
    print(str(env))
    state_size = env.observation_space.shape[0]
    '''if (env.observation_space.shape[2]==3):
        plt.imshow(env.observation_space[0, :, :, :])'''
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-ddqn.h5")
    done = False
    batch_size = 332

    for e in range(EPISODES):
        state = env.reset()
        #print (state.shape)
        state = np.reshape(state, [3, state_size, state_size])
        print("_____________state size______")
        print(state_size)
        for time in range(500):
            # env.render()

            #action = agent.act(state)
            # Sample one action per agent: the env was created with multiagent=True,
            # so step() expects a list with one action per agent.
            actionlst = []
            i = 0
            while i < 32:
                action = agent.act(state)
                actionlst.append(action)
                i = i + 1
            next_state, reward, done, _ = env.step(actionlst)
Example No. 14
def cartpole():
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=2,
                   use_visual=False,
                   multiagent=True)
    score_logger = ScoreLogger(ENV_NAME)
    agents_brain = []
    agents_action = []
    index_list = []
    agents_alive = []
    count = 0
    count1 = 0
    num_agents = env.number_agents
    print("___________Number of agents in cartpole __")
    print(num_agents)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    print("__dqn solver______")
    print(dqn_solver)
    #model = tf.keras.models.load_model("")
    for x in range((env.number_agents)):
        agents_brain.append(dqn_solver)
        print("______agentbrain____")
        print(agents_brain)
        print("_Agent action___")
        print(agents_action)

    learning_brain = copy.deepcopy(agents_brain)
    run = 0
    state = env.reset()
    initialstate = copy.deepcopy(state)
    while True:
        run += 1
        env.reset()
        print("____________STATE____________-")
        print(state[0])
        state = copy.deepcopy(initialstate)
        agents_brain = []
        agents_action = []
        index_list = []
        agents_alive = []
        count = 0
        count1 = 0
        num_agents = int(state[0][-5])
        agents_brain = copy.deepcopy(learning_brain)
        print(learning_brain)
        print(agents_brain)
        print(state)
        #for x in range ( (env.number_agents - 1) ):

        step = 0
        while True:
            step += 1
            env.render()
            print("___________STatte Lenth_______")
            print(len(state))
            print("______selffish___")
            print(state[0])
            agents_action = [1] * len(state)
            copied_agents_alive = copy.deepcopy(agents_alive)
            print("__________numagents_____")
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[x].act(state[x])
            print(agents_action)
            state_next, reward, terminal, info = env.step(
                agents_action, num_agents)
            print("_______Reward________")
            print(reward)
            print("_____________NEXT STATE LENGTH____________")
            print(len(state_next))
            if (len(state_next) == 0):
                break
            agents_alive = state_next[0][-13:-5]
            num_agents = int(state_next[0][-5])
            print("_______num agnets in cartpole________")
            print(num_agents)
            print("_____index list")
            print(index_list)
            print(agents_alive)
            agents_alive1 = np.delete(agents_alive, index_list)
            print("_______Alive agent list_______")
            print(agents_alive1)
            flag = False
            # del agents_alive[index_list[x]]
            for x in range(len(agents_alive)):
                if (agents_alive[x] == float(1)):
                    for y in range(len(index_list)):
                        if (index_list[y] == x):
                            flag = True
                    if (flag == False):
                        index_list.append(x)

                flag = False

            index_to_remove = []
            for x in range(len(agents_alive1)):
                if (agents_alive1[x] == float(1)):
                    learning_brain[index_list[count]] = agents_brain[x]
                    index_to_remove.append(x)
                    count = count + 1

            agents_brain = [
                i for j, i in enumerate(agents_brain)
                if j not in index_to_remove
            ]
            print("____________AGENTS_BRAIN_________")
            print(len(agents_brain))
            print("_______________Terminal_____________")
            print(terminal)
            if (terminal[0] == True):
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(step))
                score_logger.add_score(step, run)
                for x in range(len(copied_agents_alive)):
                    learning_brain[x] = agents_brain[count1]
                    count1 = count1 + 1
                for x in range(len(learning_brain)):
                    learning_brain[x].save(str(run) + "brain" + str(x) + ".h5")

                break

            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x],
                                           [1, observation_space])
                agents_brain[x].remember(state[x], agents_action[x], reward[x],
                                         state_next[x], terminal[x])
                agents_brain[x].experience_replay()
            state = state_next
Example No. 15
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = UnityEnv(env_name,
                            worker_id=wid,
                            use_visual=True,
                            use_both=True)

        # self.env=Reacher(render=True)
        self.ppo = GLOBAL_PPO

        self.pins_x = []
        self.pins_y = []

    def ImgProcess(self, img, Done=False):
        cimg, edge_detected_image, contour_centers = image_processing(img)
        # cimg = large_circle_detect(cimg, edge_detected_image)  # this consumes most time
        cimg, VALID_DETECT = contour_center_check(contour_centers,
                                                  cimg,
                                                  NUM_PINS=NUM_PINS)
        # cv2.imwrite(save_path+str(filename),cimg)
        contour_centers = CenterRegister(contour_centers)

        if VALID_DETECT:  # pins detection correct
            reshape_contour_centers = np.array(contour_centers).transpose()
            self.pins_x.append(reshape_contour_centers[0])
            self.pins_y.append(reshape_contour_centers[1])

        reshape_pins_x = np.array(self.pins_x).transpose()
        reshape_pins_y = np.array(self.pins_y).transpose()
        displacement_pins_x = self.pins_x[-1] - self.pins_x[0]
        displacement_pins_y = self.pins_y[-1] - self.pins_y[0]

        plt.figure(1)
        for i in range(NUM_PINS):
            plt.subplot(211)
            plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i])
            plt.title('Position')
            plt.subplot(212)
            plt.plot(np.arange(len(self.pins_x)),
                     reshape_pins_x[i] - reshape_pins_x[i][0])
            plt.title('Displacement')
            plt.tight_layout()
        plt.savefig('./ppo_pins.png')
        if Done:
            plt.clf()
        # return pins position x, y for current frame, displacement of pins position x,y
        return self.pins_x[-1], self.pins_y[
            -1], displacement_pins_x, displacement_pins_y

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s, info = self.env.reset()
            ''' image processing '''
            img = (s[:, :, 0] * 255).astype(np.uint8)
            try:
                pins_x, pins_y, pins_dis_x, pins_dis_y = self.ImgProcess(
                    img, Done=False)
            except:
                print('Image Processing Error!')
            s = np.concatenate((pins_dis_x, pins_dis_y))
            ''''''
            # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
            # s=vector_s
            # print(s.shape, info["brain_info"].vector_observations[0, :])
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            self.pins_x = []
            self.pins_y = []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], [
                    ]  # clear history buffer, use new policy to collect data
                a = self.ppo.choose_action(s)
                s_, r, done, info = self.env.step(a)
                ''' implementation of plot version one, deprecated '''
                # plt.imshow(s_[:,:,0])
                # # plt.show()
                # plt.savefig('./img256_test/tac_test'+str(step)+str(t)+'.png')
                '''
                the image saved by plt is not exactly the original array (it includes axes etc.);
                therefore use Image -- implementation of plot version two
                '''
                # im = Image.fromarray((s_[:,:,0] * 255).astype(np.uint8))
                # im.save('./img256f_r30/tac'+str(step)+str(t)+'.png')
                ''' image processing '''
                img = (s_[:, :, 0] * 255).astype(np.uint8)
                if t >= EP_LEN - 1:
                    Done = True
                else:
                    Done = False
                try:
                    pins_x, pins_y, pins_dis_x, pins_dis_y = self.ImgProcess(
                        img, Done)
                except:
                    print('Image Processing Error!')

                s_ = np.concatenate((pins_dis_x, pins_dis_y))
                ''' get the vector observation '''
                # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
                # s_=vector_s

                # print('a: ',a)  # shape: []
                # print('s: ',s_) # shape: []
                # plt.imshow(s[:,:,0])
                # plt.show()
                # print('r: ',r) # shape: scalar
                # print('done: ', done)  # shape: True/False
                # s=s.reshape(-1)  # convert from 3D to 1D
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(
                    (r + 8) / 8)  # normalize reward, found to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                        ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
            step_set.append(step)
            # print(step)
            epr_set.append(ep_r)
Example No. 16
    K.set_session(sess)

    #Setting up the env
    #TODO Worker_id can be changed to run in parallel
    #Flatten_branched gives us a one-hot encoding of all 54 action combinations.
    print("Opening unity env")
    env = UnityEnv(
        "../unity_envs/kais_banana_with_explicit_charge_decision_red_battery_300_timesteps",
        worker_id=22,
        use_visual=True,
        flatten_branched=True,
        seed=seed
    )  #KOE: Note: If I accept images as uint8_visual=True, I have to convert to float later.

    print("Resetting env")
    initial_observation = env.reset()
    #KOETODO This would have to be manually configured for each environment.

    battery = 100  # [Health]
    prev_battery = battery

    # game.get_available_buttons_size() # [Turn Left, Turn Right, Move Forward]
    print("Action space is: ", env.action_space)
    action_size = env.action_space.n
    print("Env has ", action_size, " actions.")
    measurement_size = 3  # [Battery, poison, food]
    timesteps = [1, 2, 4, 8, 16, 32]  # For long horizon: [4,8,16,32,64,128]
    goal_size = measurement_size * len(timesteps)

    img_rows, img_cols = 84, 84  #KOE: Think this is still correct.
    # Convert image into Black and white
Example No. 17
class PPO():
    def __init__(self):

        # Hyperparameters
        self.learning_rate = 0.0003
        self.betas = (0.9, 0.999)
        self.gamma = 0.99
        self.eps_clip = 0.2
        self.buffer_size = 2048
        self.batch_size = 256
        self.K_epochs = 3
        self.max_steps = 100000

        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5

        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=True)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents
        print("Nº of Agents: ", self.n_agents)

        # Model
        self.model = ActorCritic(self.state_size, self.action_size,
                                 seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate,
                                    betas=self.betas)
        self.MseLoss = nn.MSELoss()

        # Buffer memory
        self.memory = []
        for _ in range(self.n_agents):
            self.memory.append(Buffer())

        # Initialize time step (for updating when buffer_size is full)
        self.t_step = 1

    def train(self):

        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):

            action = []
            logprobs = []
            value = []

            # Action of agent
            for i in range(self.n_agents):
                a, b, c = self.act(state[i])
                action.append(a)
                logprobs.append(b)
                value.append(c)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Done
            done_ = []
            for i in range(self.n_agents):
                done_.append(1 - done[i])

            # Agent step
            for i in range(self.n_agents):
                self.step(state[i], action[i], reward[i], next_state[i],
                          done_[i], logprobs[i], value[i], self.memory[i])

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score
            self.data.update_score(reward, value, done, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get actions probabilities and value from ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)
        log_probs = F.log_softmax(action_probs, -1)

        # Get action and log of probabilities
        action = prob.multinomial(num_samples=1)
        log_probs = log_probs.gather(1, action)

        return action, log_probs, value

    def step(self, state, action, reward, next_state, done, logprobs, value,
             memory):

        # Update model when buffer_size is full
        if memory.len_() == (self.buffer_size / self.n_agents):
            self.learn()
            for i in range(self.n_agents):
                self.memory[i].reset()

        # Save experience in buffer memory
        memory.add(state, action, reward, next_state, done, logprobs, value)

    def evaluate(self, states, next_states, actions, rewards, masks,
                 compute_gae):

        logits, values = self.model(states)
        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)
        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        values_ = values

        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        returns = []

        if (compute_gae):

            gae = torch.zeros(1, 1)

            for i in reversed(range(len(rewards))):

                # Generalized Advantage Estimation
                delta_t = rewards[i] + self.gamma * masks[i] * values[
                    i + 1].data - values[i].data
                gae = gae * self.gamma * self.tau * masks[i] + delta_t

                returns.insert(0, gae + values[i])

        return log_probs, values_, entropies, returns

    def compute_returns(self):

        returns_ = []

        for i in range(self.n_agents):

            # Get Experiences (of each agent)
            experiences = self.memory[i].get()
            states, actions, rewards, next_states, dones, logprobs_, values_ = experiences

            # Evaluate
            _, _, _, r = self.evaluate(states,
                                       next_states,
                                       actions,
                                       rewards,
                                       dones,
                                       compute_gae=True)
            returns_.append(r)

        l = []

        for i in range(len(returns_)):
            for j in range(len(returns_[0])):
                l.append(returns_[i][j])

        return l

    def learn(self):

        # Get Experiences
        states, actions, rewards, next_states, dones, logprobs_, values_ = self.getExp(
        )

        returns_eval = self.compute_returns()
        returns_eval = torch.tensor(returns_eval).to(device)
        returns_eval = returns_eval.unsqueeze(1)

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):

            # List with all indices
            l = np.arange(self.buffer_size)
            l = list(l)

            x = self.buffer_size // self.batch_size

            for _ in range(x):

                # Take a random batch
                indices = random.sample(l, self.batch_size)

                old_logprobs = torch.empty(self.batch_size, 1)
                old_values = torch.empty(self.batch_size, 1)
                old_actions = torch.empty(self.batch_size)
                old_states = torch.empty(self.batch_size, self.state_size)
                old_next_states = torch.empty(self.batch_size, self.state_size)
                old_rewards = np.zeros(self.batch_size)
                returns = torch.empty(self.batch_size, 1)

                for i in range(len(indices)):

                    old_logprobs[i] = logprobs_[indices[i]]
                    old_values[i] = values_[indices[i]]
                    old_actions[i] = actions[indices[i]]
                    old_states[i] = states[indices[i]]
                    old_next_states[i] = next_states[indices[i]]
                    old_rewards[i] = rewards[indices[i]]
                    returns[i] = returns_eval[indices[i]]

                old_actions = old_actions.long()

                # Remove indices to not repeat
                for i in indices:
                    l.remove(i)

                # Evaluate
                logprobs, state_values, dist_entropy, _ = self.evaluate(
                    old_states,
                    old_next_states,
                    old_actions,
                    rewards,
                    dones,
                    compute_gae=False)

                # Finding the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(logprobs - old_logprobs)

                # Finding Surrogate Loss:
                advantages = returns - old_values
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                    1 + self.eps_clip) * advantages

                # LOSS = ACTOR LOSS + CRITIC_DISCOUNT * CRITIC_LOSS - ENTROPY_BETA * ENTROPY
                loss = -torch.min(
                    surr1, surr2) + self.value_loss_coef * self.MseLoss(
                        state_values,
                        returns) - self.entropy_coef * dist_entropy

                # Optimizer step
                self.optimizerStep(self.optimizer, loss.mean())

    def optimizerStep(self, optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def getExp(self):

        states, actions, rewards, next_states, dones, logprobs, values = [], [], [], [], [], [], []

        for i in range(self.n_agents):
            experiences = self.memory[i].get()
            states.append(experiences[0])
            actions.append(experiences[1])
            rewards.append(experiences[2])
            next_states.append(experiences[3])
            dones.append(experiences[4])
            logprobs.append(experiences[5])
            values.append(experiences[6])

        states_, actions_, rewards_, next_states_, dones_, logprobs_, values_ = [], [], [], [], [], [], []

        for i in range(len(states)):
            for j in range(len(states[0])):
                states_.append(states[i][j])
                actions_.append(actions[i][j])
                rewards_.append(rewards[i][j])
                next_states_.append(next_states[i][j])
                dones_.append(dones[i][j])
                logprobs_.append(logprobs[i][j])
                values_.append(values[i][j])

        states__ = torch.empty(self.buffer_size, self.state_size)
        actions__ = torch.empty(self.buffer_size)
        next_states__ = torch.empty(self.buffer_size, self.state_size)
        dones__ = torch.empty(self.buffer_size)
        logprobs__ = torch.empty(self.buffer_size, 1, 1)
        values__ = torch.empty(self.buffer_size)

        for i in range(self.buffer_size):
            states__[i] = states_[i]
            actions__[i] = actions_[i]
            next_states__[i] = next_states_[i]
            dones__[i] = dones_[i]
            logprobs__[i] = logprobs_[i]
            values__[i] = values_[i]

        return states__, actions__, rewards_, next_states__, dones__, logprobs__, values__
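The class above drives itself once constructed; a minimal usage sketch (train() runs max_steps environment steps and then calls save()).

ppo = PPO()   # builds the UnityEnv and the ActorCritic model
ppo.train()   # collects experience, updates with PPO, then saves the model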
Example No. 18
class AC():
    def __init__(self):

        # Hyperparameters
        self.learning_rate = 0.0003
        self.gamma = 0.99
        self.batch_size = 256
        self.max_steps = 100000

        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5

        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=False)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents

        # Model
        self.model = ActorCritic(self.state_size, self.action_size,
                                 seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        # Buffer memory
        self.memory = Buffer()

        # Initialize time step (for updating every "batch_size" time steps)
        self.t_step = 1

    def train(self):

        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):

            # Action of agent
            action, value = self.act(state)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Agent step
            self.step(state, action, reward, next_state, done)

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score
            reward_ = np.expand_dims(reward, axis=0)
            value_ = value.unsqueeze(0)
            done_ = np.expand_dims(done, axis=0)
            self.data.update_score(reward_, value_, done_, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get actions probabilities and value from ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)

        # Get action and log of probabilities
        action = prob.multinomial(num_samples=1)

        return action, value

    def step(self, state, action, reward, next_state, done):

        # Save experience in buffer memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every "batch_size" time steps
        if self.t_step % self.batch_size == 0:
            experiences = self.memory.get()
            self.learn(experiences)
            self.memory.reset()

    def learn(self, experiences):

        # Get Experiences
        states, actions, rewards, next_states = experiences

        logits, values = self.model(states)
        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)

        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        policy_loss = 0
        value_loss = 0
        R = values[-1]
        gae = torch.zeros(1, 1)

        for i in reversed(range(len(rewards))):

            R = self.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + self.gamma * values[i +
                                                       1].data - values[i].data
            gae = gae * self.gamma * self.tau + delta_t
            policy_loss = policy_loss - (log_probs[i] * gae) - (
                self.entropy_coef * entropies[i])

        # Loss
        loss = (policy_loss + self.value_loss_coef * value_loss)

        # Optimizer step
        self.optimizerStep(self.optimizer, loss)

    def optimizerStep(self, optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example No. 19
from tensorflow.python import keras
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.optimizers import Adam
#from tensorflow.python.keras._impl.keras.optimizers import Adam
from tensorflow.python.layers.core import Dense
import matplotlib.pyplot as plt
import sys
from gym_unity.envs import UnityEnv
multi_env_name = "D:/ml-agents-0.8.0/UnitySDK/A.exe"
multi_env = UnityEnv(multi_env_name, worker_id=1,
                     use_visual=False, multiagent=True)

# Examine environment parameters
print(str(multi_env))
# Reset the environment
initial_observations = multi_env.reset()

if len(multi_env.observation_space.shape) == 1:
    # Examine the initial vector observation
    print("Agent observations look like: \n{}".format(initial_observations[0]))
else:
    # Examine the initial visual observation
    print("Agent observations look like:")
    if multi_env.observation_space.shape[2] == 3:
        plt.imshow(initial_observations[0][:,:,:])
    else:
        plt.imshow(initial_observations[0][:,:,0])


for episode in range(10):
    initial_observation = multi_env.reset()
Example No. 20
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self, fn):
        self.model.save(fn)


env_name = "../env/GridWorld.x86_64"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name, worker_id=0, use_visual=True)

episodes = 2000

# updateTargetNetwork = 1000
dqn_agent = DQN(env=env)
steps = []
env.reset()

for ep in range(episodes):
    cur_state = np.array([env.reset()])
    done = False
    ep_reward = 0
    n_steps = 0
    while not done:
        action = dqn_agent.act(cur_state)
        new_state, reward, done, info = env.step(action)
        new_state = np.array([new_state])

        ep_reward += reward

        # reward = reward if not done else -20
        dqn_agent.remember(cur_state, action, reward, new_state, done)
Example No. 21
# Environment name
# Remember to put battle royale environment configuration within the config folder
env_name = "environment/battle-royale-static"

env = UnityEnv(env_name, worker_id=4, use_visual=False, multiagent=True)

print(str(env))

# ## Examine Observation Space

# Examine observation space
observation = env.observation_space
env.reset()
print("Agent observation space type: {}".format(observation))

# ## Examine Action Space

# Examine action space
action = env.action_space
print("Agent action space type: {}".format(action))

# ## Agents Training
# This part shows agent training using the MADDPG algorithm

# ### Setup Algorithm Dependencies
Example No. 22
def cartpole():
    env = UnityEnv(environment_filename=ENV_NAME, worker_id=5, use_visual=False, multiagent=True)
    score_logger = ScoreLogger(ENV_NAME)
    agents_brain = []
    agents_action = []

    num_agents = env.number_agents
    observation_space = env.observation_space.shape[0]
    print("____________Observation_space")
    print(observation_space)
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    for x in range ((env.number_agents)):
        agents_brain.append(DQNSolver(observation_space, action_space))
    print ("Length of BrainList:    ",len(agents_brain))
    run = 0
    state = env.reset()
    print("______INITIAL______")
    print(state)
    #initialstate = copy.deepcopy(state)
    print("*****************************initial state for unity  envirmonet**************")
    #print(initialstate)
    jk = 1
    while True:
        run += 1
        state = env.reset()
        #state = copy.deepcopy(initialstate)
        num_agents = int(state[0][-5])
        print("_____________State _______________")
        print(int(state[0][12]))
        step = 0

        print("################################This is loop################################# :" , jk)
        while True:
            step += 1
            env.render()
            agents_action = [1] * len(state)
            print(state[0])
            print("*******************Length of state******************")
            print(len(state))
            for x in range(len(state)):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[int(state[x][0,12]) - 1].act(state[x])
            print("Agents Actions List: ",agents_action)
            state_next, reward, terminal, info = env.step(agents_action)
            #print ("_____________STATE_NEXT___________")
            #print (state_next)
            if (len(state_next) == 0):
                break
            agents_alive = state_next[0][-13:-5]
            print ("Agents_alive:    ", agents_alive)
            print ("Rewards:    ",reward)
            num_agents = int(state_next[0][-5])
            print ("Number of agents:   ",num_agents)
            print("_________Terminal list_______" , terminal)
            if (terminal[0] == True):
                print("**************************Brain saved******************************")
                for x in range(len(agents_brain)):
                    agents_brain[x].save(str(run) + "brain" + str(x) + ".h5")

                jk+=1
                print("#####################################Loop is######################## :" , jk)
                #break

            for x in range(len(state_next)):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x], [1, observation_space])
                agents_brain[int(state_next[x][0,12]) - 1].remember(state[x], agents_action[x], reward[x], state_next[x], terminal[x])
                agents_brain[int(state_next[x][0,12]) - 1].experience_replay()
            state = state_next
Example No. 23
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = UnityEnv(env_name,
                            worker_id=wid,
                            use_visual=False,
                            use_both=True)

        # self.env=Reacher(render=True)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s, info = self.env.reset()
            s = s[:8]
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            self.pins_x = []
            self.pins_y = []
            self.pins_z = []
            self.object_x = []
            self.object_y = []
            self.object_z = []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], [
                    ]  # clear history buffer, use new policy to collect data

                a = self.ppo.choose_action(s)
                s_, r, done, info = self.env.step(a)
                # print(np.array(s_).shape)

                # plot pins
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)  # reward stored without normalization here

                pins_x = s_[6::3]
                pins_z = s_[8::3]
                self.object_x.append(s_[0])
                self.object_z.append(s_[2])
                self.pins_x.append(pins_x)
                self.pins_z.append(pins_z)

                relative_x = pins_x - s_[0]
                relative_z = pins_z - s_[2]
                dis = (relative_x - (self.pins_x[0] - self.object_x[0]))**2 + (
                    relative_z - (self.pins_z[0] - self.object_z[0]))**2
                min_idx = np.argmin(dis)
                max_idx = np.argmax(dis)
                # add relative position of the pin with smallest deformation
                # s_ = np.append(s_[:6], relative_x[min_idx])
                # s_ = np.append(s_, relative_z[min_idx])

                s_ = np.append(s_[:6], relative_x[max_idx])
                s_ = np.append(s_, relative_z[max_idx])
                s = s_
                ep_r += r

                # print('minimal displacement idx: ', min_idx)

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            if GLOBAL_EP % 50 == 0 and GLOBAL_EP > 0:
                self.ppo.save(model_path)

            reshape_pins_x = np.array(self.pins_x).transpose()
            reshape_pins_z = np.array(self.pins_z).transpose()
            plt.clf()
            for i in range(NUM_PINS):
                plt.subplot(411)
                plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i])
                plt.title('X-Position')
                plt.subplot(412)
                plt.plot(np.arange(len(self.pins_z)), reshape_pins_z[i])
                plt.title(
                    'Y-Position')  # although it's z, to match reality, use y
                plt.subplot(413)
                # plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i]-self.object_x)
                # plt.title('X-Relative')
                # plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i]-reshape_pins_x[i][0])
                # plt.title('X-Displacement')
                plt.plot(np.arange(len(self.pins_x)),
                         (reshape_pins_x[i] - self.object_x) -
                         (reshape_pins_x[i][0] - self.object_x[0]))
                plt.title('X-Displacement')

                plt.subplot(414)
                plt.plot(np.arange(len(self.pins_x)),
                         (reshape_pins_z[i] - self.object_z) -
                         (reshape_pins_z[i][0] - self.object_z[0]))
                plt.title('Y-Displacement')
                plt.xlabel('Time Step')
                plt.tight_layout()
            plt.savefig('./ppo_pins.png')

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                        ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
            step_set.append(step)
            # print(step)
            epr_set.append(ep_r)
            if step % 10 == 0:  # plot every N episode; some error about main thread for plotting
                plt.clf()
                plt.plot(step_set, epr_set)
                plt.xlabel('Episode')
                plt.ylabel('Reward')
                try:
                    plt.savefig('./tac_pins8.png')
                except:
                    print('writing conflict!')
Example No. 24
def cartpole():
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=1,
                   use_visual=False,
                   multiagent=True)
    #score_logger = ScoreLogger(ENV_NAME)
    agents_brain = []
    agents_action = []
    pathname = "C:/HinaProgramm/testingFolder/Unity Environment"
    num_agents = env.number_agents
    print("Number of agents in enviroment : ", num_agents)
    observation_space = env.observation_space.shape[0]
    print("____________Observation_space______________")
    print(observation_space)
    print("__________Action Space________________")
    action_space = env.action_space.n
    print(action_space)
    dqn_solver = DQNSolver(observation_space, action_space)
    for x in range((num_agents)):
        agents_brain.append(DQNSolver(observation_space, action_space))
    print("Length of BrainList:    ", len(agents_brain))
    run = 0
    state = env.reset()
    #print("______INITIAL______")
    #print(state)
    initialstate = copy.deepcopy(state)
    #print("*****************************initial state for unity  envirmonet**************")
    #print(initialstate)
    jk = 1
    sharecount = 0
    eatcount = 0
    filecount = 0
    #f = str(filecount) + "share.csv"
    f = open(str(filecount) + "share.csv", 'ab')
    #J = str(filecount) + "eat.csv"
    J = open(str(filecount) + "eat.csv", 'ab')
    while True:
        run += 1
        env.reset()

        state = copy.deepcopy(initialstate)
        num_agents = int(state[0][-8])
        print("_numagents__________", num_agents)
        print("_____________State _______________")
        print(int(state[0][12]))
        step = 0
        print("================ Outer loop:", jk, "================")
        print("Run:", run)
        while True:

            #print("************Number of agents *********")
            #print(env.number_agents)
            step += 1
            env.render()
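            # default every agent's action to 1, then let each agent's own brain
            # overwrite its entry below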
            agents_action = [1] * len(state)
            #print(state[0])
            #print("*******************Length of state******************")
            #print(len(state))
            for x in range(len(state)):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[int(state[x][0, 12]) - 1].act(
                    state[x])

            sharecount += agents_action.count(5)
            eatcount += agents_action.count(6)
            #print("Agents Actions List: ",agents_action)

            state_next, reward, terminal, info = env.step(agents_action)
            for x in range(len(agents_action)):
                if (agents_action[x] == 5):
                    new = np.asarray([state_next[x]])
                    np.savetxt(f, new, delimiter=",")
                    #f.write(str(state_next[x])+"\r\n")
                if (agents_action[x] == 6):
                    #J.write(str(state_next[x])+"\r\n")
                    new = np.asarray([state_next[x]])
                    np.savetxt(J, new, delimiter=",")
            print("_____________STATE_NEXT___________")
            print(state_next)
            if (len(state_next) == 0):
                #f.write(str(sharecount))
                #J.write(str(eatcount))
                #f.close()
                #J.close()
                filecount += 1
                np.savetxt(f, [sharecount], delimiter=",")  # wrap the scalar count so savetxt gets a 1-D array
                np.savetxt(J, [eatcount], delimiter=",")
                break
            agents_alive = state_next[0][-16:-8]
            print("Agents_alive:    ", agents_alive)
            print("Rewards:    ", reward)
            num_agents = int(state_next[0][-8])
            print("Number of agents:   ", num_agents)
            #print("_________Terminal list_______" , terminal)
            if terminal[0]:
                print(
                    "**************************Brain saved******************************"
                )
                for x in range(len(agents_brain)):
                    agents_brain[x].model.save(pathname + str(run) + "brain" +
                                               str(x) + ".h5")

                jk += 1
                print("================ Outer loop now:", jk, "================")
                #f.write(str(sharecount))
                #J.write(str(eatcount))
                #f.close()
                #J.close()
                filecount += 1
                break

            for x in range(len(state_next)):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x],
                                           [1, observation_space])
                agents_brain[int(state_next[x][0, 12]) - 1].remember(
                    state[x], agents_action[x], reward[x], state_next[x],
                    terminal[x])
                agents_brain[int(state_next[x][0, 12]) - 1].experience_replay()
            state = state_next
Example #25
0
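# (fragment) discounted-return helper: the lfilter branch computes
# y_t = x_t + gamma * y_{t+1} over a whole trajectory, while the loop branch
# resets the running return wherever a terminal flag is set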
        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
    else:
        y, adv = 0, []
        terminals_reversed = terminal_array[1:][::-1]
        for step, dt in enumerate(reversed(x)):
            y = dt + gamma * y * (1 - terminals_reversed[step])
            adv.append(y)
        return np.array(adv)[::-1]

if __name__ == "__main__":
    env = UnityEnv('test.app', 0,use_visual=True)
    ppo = PPO(env)
    all_ep_r = []
    t = 0
    for ep in range(EP_MAX):
        s = env.reset()
        
        ep_r = 0
        
        done = False
        while not done: 
            t += 1
            env.render()
            a,v = ppo.choose_action(s)
            s_, r, done, _ = env.step(a)
            ppo.buffer_s.append(s)
            ppo.buffer_a.append(a)
            ppo.buffer_r.append(r)
            ppo.buffer_v.append(v)
            ppo.buffer_done.append(done)
            s = s_
Example #26
0
import numpy as np
import sys

from gym_unity.envs import UnityEnv

env_name = "../env/GridWorld.x86_64"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name)

# Examine environment parameters
print(str(env))

# Reset the environment
initial_observation = env.reset()

for episode in range(10):
    initial_observation = env.reset()
    done = False
    episode_rewards = 0
    while not done:
        observation, reward, done, info = env.step(env.action_space.sample())
        episode_rewards += reward
    print("Total reward this episode: {}".format(episode_rewards))

env.close()
Example #27
0
        # env=Reacher(render=True)
        # env = UnityEnv(env_name, worker_id=10, use_visual=True, use_both=True)

        # s, info = env.reset()
        # for t in range(100):
        #     # env.render()
        #     s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))

        GLOBAL_PPO.save(model_path)

    if args.test:
        env = UnityEnv(env_name,
                       worker_id=np.random.randint(0, 10),
                       use_visual=True,
                       use_both=True)
        env.reset()
        GLOBAL_PPO = PPO()
        GLOBAL_PPO.load(model_path)
        test_steps = 200
        test_episode = 10
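        # evaluation: roll out 10 episodes of up to 200 steps with the loaded PPO policy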

        for _ in range(test_episode):
            s, info = env.reset()
            ''''''
            # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
            # s=vector_s
            for t in range(test_steps):
                # env.render()
                s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))
                ''''''
                # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
Example #28
0
Defining the environment related constants
'''
# Number of discrete states (bucket) per state dimension
MAZE_SIZE = (5, 5)
NUM_BUCKETS = MAZE_SIZE  # one bucket per grid

# Number of discrete actions
NUM_ACTIONS = env.action_space.n  # ["N", "S", "E", "W"]
STATE_BOUNDS = [(0.0, 4.0), (0.0, 4.0)]
MAX_T = np.prod(MAZE_SIZE, dtype=int) * 100
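
# The original script defines state_to_bucket and select_action elsewhere; a
# minimal sketch of the discretization state_to_bucket presumably implements,
# assuming the STATE_BOUNDS and NUM_BUCKETS constants above (illustrative only):
def _state_to_bucket_sketch(obv):
    bucket = []
    for i in range(len(obv)):
        low, high = STATE_BOUNDS[i]
        # clamp the observation into its bounds, then map it onto the bucket grid
        ratio = (min(max(obv[i], low), high) - low) / (high - low)
        bucket.append(min(int(ratio * NUM_BUCKETS[i]), NUM_BUCKETS[i] - 1))
    return tuple(bucket)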

q_table = np.load('q_table.npy')
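# evaluation rollout: reuse the learned Q-table; select_action is called below with
# a second argument of 0, presumably disabling exploration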

env.render()
# Reset the environment
obv = env.reset()

# the initial state
state_0 = state_to_bucket(obv)
total_reward = 0

for t in range(MAX_T):

    # Select an action
    action = select_action(state_0, 0)

    # execute the action
    obv, reward, done, _ = env.step(action)

    # Observe the result
    state = state_to_bucket(obv)
Example #29
0
        for p in processes:  # wait for every worker process to finish
            p.join()

        td3_trainer.save_model(model_path)
        print(rewards)

    if args.test:
        # choose env
        # env_name="./tac_follow_new4"
        # env_name="tac_follow_new4_random02"
        env_name = "tac_follow_new4_random"
        env = UnityEnv(env_name, worker_id=22, use_visual=False, use_both=True)
        td3_trainer.load_model(model_path)
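        # evaluation: 20 episodes with zero exploration noise (explore_noise_scale=0.0)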
        eps_r = []
        for eps in range(20):
            state, info = env.reset()
            state0 = state
            state = state_process(state, state0)
            episode_reward = 0

            for step in range(max_steps):
                action = td3_trainer.policy_net.get_action(
                    state,
                    deterministic=DETERMINISTIC,
                    explore_noise_scale=0.0)
                next_state, reward, done, info = env.step(action)
                reward += 100
                next_state = state_process(next_state, state0)
                episode_reward += reward
                state = next_state
                if done:
Example #30
0
import time

from gym_unity.envs import UnityEnv

env_name = "../envs/Cat_2/Cats.exe"

print("\nwith no render")

env = UnityEnv(env_name, no_graphics=False, multiagent=True, worker_id=1)
"""
res = []

for j in range(10):
"""
print(str(env))
ini_obs = env.reset()
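# crude throughput benchmark: time 10000 environment steps with random actions for every agent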

curr_t = time.time()

for i in range(10000):
    actions = [env.action_space.sample() for agent in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)

res = time.time() - curr_t

print("\nTime for 1000 step")
print(res)
print("\n\n")
"""
res = []