Example No. 1
def main(env_name):
    """
    Run the gym test using the specified environment
    :param env_name: Name of the Unity environment binary to launch
    """
    env = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)

    try:
        # Examine environment parameters
        print(str(env))

        # Reset the environment
        initial_observations = env.reset()

        if len(env.observation_space.shape) == 1:
            # Examine the initial vector observation
            print("Agent observations look like: \n{}".format(
                initial_observations))

        for _episode in range(10):
            env.reset()
            done = False
            episode_rewards = 0
            while not done:
                actions = env.action_space.sample()
                obs, reward, done, _ = env.step(actions)
                episode_rewards += reward
            print("Total reward this episode: {}".format(episode_rewards))
    finally:
        env.close()
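
A minimal entry point for the test above, assuming the script is run directly; the argparse wiring and the command-line binary path are illustrative additions, not part of the original example.

# Hypothetical entry point for main(); the binary path comes from the command line.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("env_name", help="Path to the Unity environment binary")
    args = parser.parse_args()
    main(args.env_name)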
Example No. 2
class Chaser_v1(Environment):
    unity_env_worker_id = 0

    def __init__(self, platform):
        if platform == OSName.MAC:
            env_filename = EnvironmentName.CHASER_V1_MAC.value
        elif platform == OSName.WINDOWS:
            env_filename = EnvironmentName.CHASER_V1_WINDOWS.value
        else:
            env_filename = None

        self.env = UnityEnv(environment_filename=env_filename,
                            worker_id=Chaser_v1.unity_env_worker_id,
                            use_visual=True,
                            multiagent=True).unwrapped
        self.increase_env_worker_id()
        super(Chaser_v1, self).__init__()
        self.action_shape = self.get_action_shape()
        self.state_shape = self.get_state_shape()

        self.cnn_input_height = self.state_shape[0]
        self.cnn_input_width = self.state_shape[1]
        self.cnn_input_channels = self.state_shape[2]

        self.observation_space = self.env.observation_space
        self.continuous = True

    @staticmethod
    def increase_env_worker_id():
        Chaser_v1.unity_env_worker_id += 1

    def get_n_states(self):
        n_states = 3
        return n_states

    def get_n_actions(self):
        n_actions = 3
        return n_actions

    def get_state_shape(self):
        return self.env.observation_space.shape

    def get_action_shape(self):
        return self.env.action_space.shape

    def reset(self):
        state = self.env.reset()
        return state

    def step(self, action):
        next_state, reward, done, info = self.env.step(action)

        adjusted_reward = reward

        return next_state, reward, adjusted_reward, done, info

    def close(self):
        self.env.close()
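
A short usage sketch for Chaser_v1, assuming OSName is importable from the surrounding project. Because the wrapped UnityEnv is created with multiagent=True, reset() and step() are assumed to work with per-agent lists; the random actions are purely illustrative.

# Illustrative only: one interaction with the multi-agent Chaser_v1 wrapper.
env = Chaser_v1(OSName.MAC)
states = env.reset()                                       # one observation per agent
actions = [env.env.action_space.sample() for _ in states]  # one random action per agent
next_states, rewards, adjusted_rewards, dones, info = env.step(actions)
env.close()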
Example No. 3
    def run():
        # LINUX: pass no_graphics=True to disable the Unity window
        env = UnityEnv(env_name,
                       worker_id=1000,
                       use_visual=False,
                       uint8_visual=False,
                       allow_multiple_visual_obs=False,
                       no_graphics=False)

        # Create the agent
        model = PPO2(MlpPolicy, env, verbose=0, learning_rate=1.0e-4)
        model.learn(total_timesteps=num_episodes)

        env.close()

        print("Successfully trained")
Example No. 4
def test_closing(env_name):
    """
    Run the gym test and closes the environment multiple times
    :param env_name: Name of the Unity environment binary to launch
    """

    try:
        env1 = UnityEnv(env_name,
                        worker_id=1,
                        use_visual=False,
                        no_graphics=True)
        env1.close()
        # Reusing worker_id=1 is fine here because the previous environment was closed
        env1 = UnityEnv(env_name,
                        worker_id=1,
                        use_visual=False,
                        no_graphics=True)
        # A second, concurrently running environment needs a different worker_id
        env2 = UnityEnv(env_name,
                        worker_id=2,
                        use_visual=False,
                        no_graphics=True)
        env2.reset()
    finally:
        env1.close()
        env2.close()
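
The try/finally above assumes both constructors succeed; if the first UnityEnv call raises, env1 and env2 are never bound and the finally block itself fails with a NameError. A more defensive variant, sketched here with the same constructor arguments (not the original test), closes only the environments that were actually created:

# Sketch of a defensive variant: close only the environments that were created.
def close_safely(env_name):
    envs = []
    try:
        for worker_id in (1, 2):
            envs.append(UnityEnv(env_name,
                                 worker_id=worker_id,
                                 use_visual=False,
                                 no_graphics=True))
        envs[-1].reset()
    finally:
        for env in envs:
            env.close()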
Example No. 5
                mavg_poison = np.mean(np.array(poison_buffer))
                mavg_loss = np.mean(loss_buffer)
                mavg_num_batteries = np.mean(np.array(num_batteries_buffer))
                food_buffer = []
                battery_buffer = []
                poison_buffer = []
                reward_buffer = []
                loss_buffer = []
                num_batteries_buffer = []

                # Write Rolling Statistics to file
                with open(SAVE_TO_FOLDER + "/dfp_stats.txt",
                          "a+") as stats_file:
                    stats_file.write(str(GAME) + " ")
                    stats_file.write(str(max_reward) + " ")
                    stats_file.write(str(mavg_score) + ' ')
                    stats_file.write(str(mavg_loss) + ' ')
                    stats_file.write(str(var_score) + ' ')
                    stats_file.write(str(mavg_battery) + ' ')
                    stats_file.write(str(mavg_num_batteries) + ' ')
                    stats_file.write(str(mavg_food) + ' ')
                    stats_file.write(str(mavg_poison) + '\n')

    env.close()
    end = time.time()
    time_elapsed = end - start
    with open(SAVE_TO_FOLDER + "/timing_info.txt", "w") as text_file:
        print("Time Elapsed: {}".format(time_elapsed), file=text_file)

#KOE: Made it to the end. Now test running, print out, debug, etc.
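
Each line appended to dfp_stats.txt above is a space-separated record (game id followed by the rolling statistics), so it can be read back later for plotting; a minimal sketch, assuming the same SAVE_TO_FOLDER:

# Illustrative only: parse the space-separated statistics written above.
rows = []
with open(SAVE_TO_FOLDER + "/dfp_stats.txt") as stats_file:
    for line in stats_file:
        fields = line.split()
        # game id, then max_reward, mavg_score, mavg_loss, var_score, ...
        rows.append((fields[0], [float(x) for x in fields[1:]]))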
Example No. 6
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = UnityEnv(env_id, rank)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 records the start time for the total-runtime log below.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
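
The noise_type argument parsed above is a comma-separated list of "<name>_<stddev>" specs; a small illustration of values the parsing loop accepts (the particular stddevs are arbitrary):

# Illustrative only: valid noise_type strings for the parsing loop above.
noise_type = 'adaptive-param_0.2'              # parameter-space noise
noise_type = 'ou_0.2'                          # Ornstein-Uhlenbeck action noise
noise_type = 'normal_0.1,adaptive-param_0.2'   # several specs, comma-separated
noise_type = 'none'                            # disable noise entirely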
Example No. 7
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.lock = threading.Lock()
        self.env = UnityEnv(GameDir, wid, use_visual=True)

        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_COUNTER
        t = 0
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r, buffer_v, buffer_done = [], [], [], [], []
            done = False
            
            while not done:
                if not COLLECT_EVENT.is_set():
                    COLLECT_EVENT.wait()
                    buffer_s, buffer_a, buffer_r, buffer_v, buffer_done = [], [], [], [], []
                a, v = self.ppo.choose_action(s)
                s_, r, done, _ = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                buffer_v.append(v)
                buffer_done.append(done)
                s = s_
                ep_r += r
                t += 1
                GLOBAL_COUNTER += 1
                # update ppo
                if (done or GLOBAL_COUNTER >= BATCH):
                    
                    t = 0
                    rewards = np.array(buffer_r)
                    v_final = [v * (1 - done)] 
                    terminals = np.array(buffer_done + [done])
                    values = np.array(buffer_v + v_final)
                    delta = rewards + GAMMA * values[1:] * (1 - terminals[1:]) - values[:-1]
                    advantage = discount(delta, GAMMA * LAMBDA, terminals)
                    returns = advantage + np.array(buffer_v)
                    advantage = (advantage - advantage.mean()) / np.maximum(advantage.std(), 1e-6)


                    bs, ba, br, badv = np.reshape(buffer_s, (-1,) + self.ppo.s_dim), np.vstack(buffer_a), \
                                       np.vstack(returns), np.vstack(advantage)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    buffer_v, buffer_done = [], []
                    COLLECT_EVENT.wait()
                    self.lock.acquire()
                    for i in range(len(bs)):
                        GLOBAL_DATA["state"].append(bs[i])
                        GLOBAL_DATA["reward"].append(br[i])
                        GLOBAL_DATA["action"].append(ba[i])
                        GLOBAL_DATA["advantage"].append(badv[i])
                    self.lock.release()
                    if GLOBAL_COUNTER >= BATCH and len(GLOBAL_DATA["state"])>= BATCH:
                        COLLECT_EVENT.clear()
                        UPDATE_EVENT.set() 
                    # self.ppo.update(bs, ba, br,badv)

                if GLOBAL_EP >= EP_MAX:
                    self.env.close()
                    COORD.request_stop()
                    break
            print("episode = {}, ep_r = {}, wid = {}".format(GLOBAL_EP,ep_r,self.wid))
            GLOBAL_EP += 1
            if GLOBAL_EP != 0 and GLOBAL_EP % 500 == 0:
                self.ppo.save_model(steps=GLOBAL_EP)
Example No. 8
if __name__ == '__main__':
    tmpenv = UnityEnv(GameDir, 0, use_visual=True).unwrapped
    GLOBAL_PPO = PPO(tmpenv, ModelPath=modelPath)
    tmpenv.close()
    GLOBAL_DATA = {"state": [], "action": [], "reward": [], "advantage": []}
    UPDATE_EVENT, COLLECT_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()
    COLLECT_EVENT.set()
    workers = [Worker(wid=i) for i in range(1, N_WORKER + 1)]
    
    GLOBAL_COUNTER, GLOBAL_EP = 0, 0
    COORD = tf.train.Coordinator()
    
    threads = []
    for worker in workers:
        t = threading.Thread(target=worker.work, args=())
        t.start()
        threads.append(t)
    threads.append(threading.Thread(target=GLOBAL_PPO.update))
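
The snippet ends right after appending the updater thread; one plausible continuation, following the usual tf.train.Coordinator pattern (a sketch, not the original code), starts that thread and waits for all of them:

    # Sketch of a possible continuation: start the updater thread and wait for all threads.
    threads[-1].start()
    COORD.join(threads)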
Example No. 9
class Drone_Racing(Environment):
    worker_id = 0

    def __init__(self, platform):
        if platform == OSName.MAC:
            env_filename = EnvironmentName.DRONE_RACING_MAC.value
        elif platform == OSName.WINDOWS:
            env_filename = EnvironmentName.DRONE_RACING_WINDOWS.value
        else:
            env_filename = None

        self.env = UnityEnv(
            environment_filename=env_filename,
            worker_id=randrange(65536),
            use_visual=False,
            multiagent=False
        ).unwrapped

        super(Drone_Racing, self).__init__()
        Drone_Racing.worker_id += 1
        self.action_shape = self.get_action_shape()
        self.action_space = self.env.action_space

        self.continuous = False

        self.skipping_state_fq = 3
        self.skipping_state_index = 0

        self.WIN_AND_LEARN_FINISH_SCORE = 200

    def get_n_states(self):
        return self.env.observation_space.shape[0]

    def get_n_actions(self):
        return self.env.action_space.shape[0]

    def get_state_shape(self):
        return self.env.observation_space

    def get_action_shape(self):
        return self.env.action_space

    def get_action_space(self):
        return self.env.action_space

    @property
    def action_meanings(self):
        action_meanings = ["FORWARD", "BACKWARD", "RIGHT", "LEFT", "UP", "DOWN", "R_ROTATE", "L_ROTATE", "HOVER"]
        return action_meanings

    def reset(self):
        state = self.env.reset()
        return state

    def step(self, action):
        action_list = [0] * 9

        if self.is_skip_phase():
            action_list[8] = 1  # hover action
        else:
            action_list[action] = 1

        next_state, reward, done, info = self.env.step(action_list)
        adjusted_reward = reward

        info["skipping"] = True
        if not self.is_skip_phase():
            self.skipping_state_index = 0
            info["skipping"] = False

        self.skipping_state_index += 1

        return next_state, reward, adjusted_reward, done, info

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()

    def is_skip_phase(self):
        return self.skipping_state_index != self.skipping_state_fq
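
A brief usage sketch for Drone_Racing; OSName comes from the surrounding project, and driving the environment with random discrete actions is purely illustrative.

# Illustrative only: a few steps of Drone_Racing with random discrete actions.
from random import randrange

env = Drone_Racing(OSName.MAC)
state = env.reset()
for _ in range(10):
    action = randrange(9)  # index into the 9 one-hot actions built in step()
    next_state, reward, adjusted_reward, done, info = env.step(action)
    if done:
        state = env.reset()
env.close()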