Example #1
def main():
    # Load command-line arguments (needed below: args.retro, args.test, args.features, ...)
    parser = otc_arg_parser()
    args = parser.parse_args()

    #Challenge environment
    # if args.env == 'ObtRetro-v6':
    #     env = ObstacleTowerEnv(
    #         '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
    #         timeout_wait=6000,
    #         retro=args.retro,
    #         realtime_mode=args.test)
    #     env = RetroWrapper(env, args.sample_normal)
    #     env = OTCPreprocessing(env, args.action_reduction)
    #     # if show_obs:
    #     #     env = RenderObservations(env)
    #     #     env = KeyboardControlWrapper(env)
    # else:
    env = ObstacleTowerEnv(
        '/home/home/Data/Carmen/py_workspace/ObstacleTower_v3/ObstacleTower-v3.1/obstacletower.x86_64',
        retro=args.retro,
        realtime_mode=args.test,
        timeout_wait=6000)

    #env = ObstacleTowerEnv('OBSTACLE_TOWER_PATH', retro=args.retro, realtime_mode=args.test, timeout_wait=6000)

    # Dict of actions created by the ObstacleTowerEnv class of the obstacle_tower_env library
    #print("ACTIONS:", env._flattener.action_lookup)

    print('FEATURES :', args.features)

    # Preprocess the environment (grayscale conversion and action-space reduction)
    env = OTCPreprocessing(env, args.action_reduction, args.features)
    env = DummyVecEnv([lambda: env])
    #env = VecEnv(1, env.observation_space, env.action_space)

    print("ACTION SPACE  ///////////:", env.action_space)
    print("OBSERVATION SPACE ///////////////:", env.observation_space)
    #env = make_vec_env(env, n_envs=4)

    ########Training########

    #Study of the impact of different values of the PPO params
    if args.study:
        params_test(MlpPolicy, env)

    #If no Study Mode
    else:
        #If no Test Mode
        if not args.test:

            seed = 0
            random.seed(seed)

            if args.pretrained_model:

                t = 300000  # resume the step counter where the pretrained checkpoint left off (hard-coded)

                model = PPO2.load(args.pretrained_model,
                                  env=env,
                                  tensorboard_log=args.tensorboard_logdir)

            else:

                t = 0

                #If Generalized Advantage Estimator is used
                if args.use_gae:

                    model = PPO2(MlpPolicy,
                                 env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 lam=args.gae_lambda,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)

                #If Generalized Advantage Estimator is not used
                else:

                    model = PPO2(MlpPolicy,
                                 env,
                                 n_steps=args.num_steps,
                                 verbose=1,
                                 tensorboard_log=args.tensorboard_logdir,
                                 cliprange=args.clip_param,
                                 learning_rate=args.lr,
                                 ent_coef=args.entropy_coef,
                                 vf_coef=args.value_loss_coef,
                                 max_grad_norm=args.max_grad_norm,
                                 gamma=args.gamma,
                                 noptepochs=args.ppo_epoch,
                                 seed=seed)
        else:

            model = PPO2.load(args.pretrained_model, env=env)

        #model.learn(total_timesteps=50000)
        #model.save("ObstacleTower_prueba")

        filename = 'argsparams.txt'
        os.makedirs(args.results_dir, exist_ok=True)
        with open(args.results_dir + filename, 'a') as myfile:
            myfile.write(
                'clip range: %f \n learning rate: %f \n entropy coefficient: %f \n value loss coefficient: %f \n '
                'max grad norm: %f \n gamma: %f \n ppo epochs: %f \n' %
                (args.clip_param, args.lr, args.entropy_coef, args.value_loss_coef,
                 args.max_grad_norm, args.gamma, args.ppo_epoch))

        if not args.test:
            while t < args.num_env_steps:
                #TRAIN MODEL
                if t == 0:
                    model.learn(total_timesteps=args.eval_interval)

                else:
                    model.learn(total_timesteps=args.eval_interval,
                                reset_num_timesteps=False)

                os.makedirs(GLOBAL_PATH, exist_ok=True)
                print("Saving in '" + GLOBAL_PATH + "'")
                model.save(GLOBAL_PATH + args.training_name + "_" +
                           str(int(t)).zfill(10))

                avg_reward, avg_floor = test(
                    t, model, env=env, global_path=args.results_dir)  # Test
                log('T = ' + str(t) + ' / ' + str(args.num_env_steps) +
                    ' | Avg. reward: ' + str(avg_reward) + ' | Avg. floor: ' +
                    str(avg_floor))

                t += args.eval_interval
        else:
            obs = env.reset()
            t = 0
            while t < args.num_env_steps:

                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                #print('action :', info)
                env.render('rgb_array')
                t += 1  # advance the step counter so the loop terminates
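Example #1 assumes an otc_arg_parser() helper that defines every args.* field read above; the parser itself is not part of the excerpt. The sketch below is a hypothetical reconstruction that lists only the flags the example actually uses; the default values are illustrative assumptions, not the original ones.

# Hypothetical sketch of the argument parser assumed by Example #1.
# Flag names mirror the args.* attributes used above; defaults are illustrative only.
import argparse

def otc_arg_parser():
    parser = argparse.ArgumentParser(description='PPO2 training on Obstacle Tower')
    parser.add_argument('--retro', action='store_true')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--study', action='store_true')
    parser.add_argument('--use-gae', action='store_true')
    parser.add_argument('--features', default='rgb')
    parser.add_argument('--action-reduction', action='store_true')
    parser.add_argument('--sample-normal', action='store_true')
    parser.add_argument('--pretrained-model', default=None)
    parser.add_argument('--tensorboard-logdir', default='./tb_logs/')
    parser.add_argument('--results-dir', default='./results/')
    parser.add_argument('--training-name', default='ppo2_otc')
    parser.add_argument('--num-steps', type=int, default=128)
    parser.add_argument('--num-env-steps', type=int, default=int(1e7))
    parser.add_argument('--eval-interval', type=int, default=100000)
    parser.add_argument('--clip-param', type=float, default=0.2)
    parser.add_argument('--lr', type=float, default=2.5e-4)
    parser.add_argument('--entropy-coef', type=float, default=0.01)
    parser.add_argument('--value-loss-coef', type=float, default=0.5)
    parser.add_argument('--max-grad-norm', type=float, default=0.5)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gae-lambda', type=float, default=0.95)
    parser.add_argument('--ppo-epoch', type=int, default=4)
    return parser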
Example #2
        # Step the environment a few times with a hard-coded action before handing
        # control to the actor, then convert the last observation to grayscale.
        action = 18
        obs, reward, done, info = env.step(action)
        obs, reward, done, info = env.step(action)
        obs, reward, done, info = env.step(action)
        obs, reward, done, info = env.step(action)
        obs, reward, done, info = env.step(action)
        s, reward, done, info = env.step(action)
        s = rgb2gray(s)                 # RGB (84, 84, 3) -> grayscale (84, 84)
        s = np.expand_dims(s, axis=2)   # add channel axis: (84, 84) -> (84, 84, 1)

        t = 0
        track_r = []
        track_a = []
        track_s = []
        while True:
            env.render()

            # Feed the current grayscale state of shape (84, 84, 1) to the actor
            # to choose the next action.
            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)
            s_ = rgb2gray(s_)
            s_ = np.expand_dims(s_, axis=2)

            if done:
                r = -20

            track_r.append(r)
            track_a.append(a)
            track_s.append(s)
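Examples #2 and #3 call an rgb2gray() helper on the raw observation before feeding it to the actor, but the helper is not shown in either excerpt. A minimal sketch, assuming the observation is an H x W x 3 RGB array and the usual ITU-R BT.601 luminance weights:

# Minimal sketch of the rgb2gray() helper used in Examples #2 and #3 (assumption:
# the observation is an H x W x 3 RGB array).
import numpy as np

def rgb2gray(rgb):
    # Weighted sum over the channel axis yields a single-channel H x W image.
    return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]).astype(np.float32)

The caller then applies np.expand_dims(s, axis=2) to restore the channel dimension expected by the convolutional actor network.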
Example #3
        handcrafted_step(18, steps[2])

        s, reward, done, info = env.step(18)
        s = rgb2gray(s)
        s = np.expand_dims(s, axis=2)

        t = 0
        track_score = []
        track_r = []
        track_a = []
        track_s = []
        bad_seq = 0
        score = 0
        steps_after_key = 0
        while True:
            if RENDER: env.render()

            # Feed the current grayscale state of shape (84, 84, 1) to the actor
            # to choose the next action.
            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)
            t += 1
            if i_episode % 100 == 0 and i_episode != 0:
                test_phase = True
                test_counter += 1
                if test_phase:
                    if test_counter > 5:
                        performance.append(keys_in_test / test_counter)
                        if keys_in_test > 2:
                            steps[-1] -= 1
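Example #3 calls handcrafted_step(18, steps[2]) before handing control to the actor, but the helper is not defined in the excerpt. A plausible sketch, assuming it simply repeats one fixed action for a given number of environment steps (and that env is in scope, as in the excerpt):

# Hypothetical sketch of handcrafted_step(action, n): repeat one fixed action n times,
# mirroring how Example #2 advances past the episode start with a hard-coded action.
def handcrafted_step(action, n):
    s, reward, done, info = None, 0.0, False, {}
    for _ in range(n):
        s, reward, done, info = env.step(action)
        if done:
            break
    return s, reward, done, info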
Example #4
class StableA2C:
    def __init__(self,
                 env_path,
                 train,
                 evaluate,
                 policy_name='CnnPolicy',
                 save_dir='./model_files/',
                 eval_seeds=[]):
        self.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        self.model_path = os.path.join(self.save_dir, 'model_stable_a2c')
        self.log_dir = './logs/stable_a2c'
        self.policy_name = policy_name
        self.evaluate_mode = evaluate  # renamed so it does not shadow the evaluate() method below

        if train:
            self.env = ObstacleTowerEnv(env_path,
                                        worker_id=0,
                                        retro=True,
                                        realtime_mode=False,
                                        config=train_env_reset_config)
        else:
            if evaluate:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=False,
                                            config=eval_env_reset_config)
                self.env = ObstacleTowerEvaluation(self.env, eval_seeds)
            else:
                self.env = ObstacleTowerEnv(env_path,
                                            worker_id=0,
                                            retro=True,
                                            realtime_mode=True,
                                            config=eval_env_reset_config)

    def load_model(self):
        print('Loading model from: {}'.format(self.model_path))
        model = A2C.load(self.model_path)
        model.set_env(self.env)
        model.tensorboard_log = self.log_dir
        return model

    def train(self, timesteps=10000, continue_training=False):
        start_time = time.time()
        if not continue_training:
            print("Initializing from scratch")
            model = A2C(self.policy_name,
                        self.env,
                        verbose=1,
                        tensorboard_log=self.log_dir)
        else:
            model = self.load_model()
            print("Restored from {}".format(self.model_path))

        model.learn(total_timesteps=timesteps)
        print('\nTraining complete. Time taken = {} secs'.format(time.time() -
                                                                 start_time))
        model.save(self.model_path)

    def play_single_episode(self):
        """ have the trained agent play a single game """
        action_space = ActionSpace()
        done = False
        reward_sum = 0
        step_counter = 0

        model = self.load_model()
        obs = self.env.reset()
        try:
            print("Playing single episode...")
            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                reward_sum += reward
                print("{}. Cumulative reward: {}, action: {}".format(
                    step_counter, reward_sum,
                    action_space.get_full_action_meaning(action)))
                self.env.render()
                step_counter += 1
        except KeyboardInterrupt:
            print("Received Keyboard Interrupt. Shutting down.")
        finally:
            if not self.evaluate_mode:
                self.env.close()
                print("Environment closed.")
            print("Game play completed.")
        return reward_sum

    def evaluate(self):
        """ run episodes until evaluation is complete """
        while not self.env.evaluation_complete:
            self.play_single_episode()

        pprint(self.env.results)
        self.env.close()
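A hedged usage sketch for the class above. The reset-config dictionaries (train_env_reset_config, eval_env_reset_config) are assumed to be defined elsewhere in the module, and 'OBSTACLE_TOWER_PATH' is a placeholder for the actual build path, as in Example #1.

# Hypothetical usage of StableA2C; the path is a placeholder and the reset-config
# dictionaries are assumed to be defined elsewhere in the module.
if __name__ == '__main__':
    agent = StableA2C(env_path='OBSTACLE_TOWER_PATH',
                      train=True,
                      evaluate=False,
                      policy_name='CnnPolicy')
    agent.train(timesteps=100000)              # train from scratch, then save to model_files/
    total_reward = agent.play_single_episode() # reload the saved model and play one episode
    print('Episode reward: {}'.format(total_reward))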