Example #1
    def build(self):
        """
        This function builds the ACER agent based on the selected mode and runs it as follows:
            1- Initialize the environment.
            2- If mode train is selected, train the model from scratch, learn, and save.
            3- If mode continue is selected, provide a path for the pretrained model, load the model, learn, and save.
            4- If mode test is selected, provide a path for the pretrained model, load the model, and test.
        """
        # Create the vectorized environment
        if self.inp.acer_dict['ncores'][0] > 1:
            self.env = SubprocVecEnv([
                self.make_env(self.inp.gen_dict['env'][0], i)
                for i in range(self.inp.acer_dict['ncores'][0])
            ],
                                     daemon=self.inp.gen_dict['daemon'][0])
        else:
            self.env = gym.make(self.inp.gen_dict['env'][0],
                                casename=self.inp.acer_dict['casename'][0],
                                log_dir=self.log_dir,
                                exepath=self.inp.gen_dict['exepath'][0],
                                env_data=self.inp.gen_dict['env_data'][0],
                                env_seed=1)

        # Tensorboard activation (if used).
        # To view TensorBoard, run:
        #   tensorboard --logdir=./log_dir/{self.casename}_tensorlog
        if self.inp.acer_dict['tensorboard'][0]:
            tensorboard_log = self.log_dir + '{}_tensorlog'.format(
                self.inp.acer_dict['casename'][0])
        else:
            tensorboard_log = None

        if self.mode == 'train':
            # Train from scratch: initialize the model, learn, and save the last model.
            # Callbacks are used if provided.
            model = ACER(
                MlpPolicy,
                self.env,
                n_steps=self.inp.acer_dict['n_steps'][0],
                gamma=self.inp.acer_dict['gamma'][0],
                learning_rate=self.inp.acer_dict['learning_rate'][0],
                q_coef=self.inp.acer_dict['q_coef'][0],
                max_grad_norm=self.inp.acer_dict['max_grad_norm'][0],
                ent_coef=self.inp.acer_dict['ent_coef'][0],
                alpha=self.inp.acer_dict['alpha'][0],
                lr_schedule=self.inp.acer_dict['lr_schedule'][0],
                rprop_alpha=self.inp.acer_dict['rprop_alpha'][0],
                rprop_epsilon=self.inp.acer_dict['rprop_epsilon'][0],
                buffer_size=self.inp.acer_dict['buffer_size'][0],
                replay_ratio=self.inp.acer_dict['replay_ratio'][0],
                replay_start=self.inp.acer_dict['replay_start'][0],
                correction_term=self.inp.acer_dict['correction_term'][0],
                trust_region=self.inp.acer_dict['trust_region'][0],
                delta=self.inp.acer_dict['delta'][0],
                verbose=1,
                tensorboard_log=tensorboard_log,
                seed=2,
                n_cpu_tf_sess=1)
            model.learn(total_timesteps=self.inp.acer_dict['time_steps'][0],
                        callback=self.callback)
            model.save(self.log_dir + self.inp.acer_dict['casename'][0] +
                       '_model_last.pkl')

        if self.mode == 'continue':
            # Load the pretrained model, continue learning, and save the last model
            model = ACER.load(
                self.inp.acer_dict['model_load_path'][0],
                env=self.env,
                n_steps=self.inp.acer_dict['n_steps'][0],
                gamma=self.inp.acer_dict['gamma'][0],
                learning_rate=self.inp.acer_dict['learning_rate'][0],
                q_coef=self.inp.acer_dict['q_coef'][0],
                max_grad_norm=self.inp.acer_dict['max_grad_norm'][0],
                ent_coef=self.inp.acer_dict['ent_coef'][0],
                alpha=self.inp.acer_dict['alpha'][0],
                lr_schedule=self.inp.acer_dict['lr_schedule'][0],
                rprop_alpha=self.inp.acer_dict['rprop_alpha'][0],
                rprop_epsilon=self.inp.acer_dict['rprop_epsilon'][0],
                buffer_size=self.inp.acer_dict['buffer_size'][0],
                replay_ratio=self.inp.acer_dict['replay_ratio'][0],
                replay_start=self.inp.acer_dict['replay_start'][0],
                correction_term=self.inp.acer_dict['correction_term'][0],
                trust_region=self.inp.acer_dict['trust_region'][0],
                delta=self.inp.acer_dict['delta'][0],
                verbose=1,
                tensorboard_log=tensorboard_log,
                seed=2,
                n_cpu_tf_sess=1)
            model.learn(total_timesteps=self.inp.acer_dict['time_steps'][0],
                        callback=self.callback)
            model.save(self.log_dir + self.inp.acer_dict['casename'][0] +
                       '_lastmodel.pkl')

        if self.mode == 'test':
            # Load and test the agent. The env is recreated since test mode only works on a single core
            print(
                'debug: acer is running in test mode, single core is used to test the policy'
            )
            env = gym.make(self.inp.gen_dict['env'][0],
                           casename=self.inp.acer_dict['casename'][0],
                           log_dir=self.log_dir,
                           exepath=self.inp.gen_dict['exepath'][0],
                           env_data=self.inp.gen_dict['env_data'][0],
                           env_seed=1)
            model = ACER.load(self.inp.acer_dict['model_load_path'][0])
            evaluate_policy(
                model,
                env,
                log_dir=self.log_dir + 'acer',
                n_eval_episodes=self.inp.acer_dict["n_eval_episodes"][0],
                render=self.inp.acer_dict["render"][0])
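The `self.make_env` factory passed to `SubprocVecEnv` above is defined elsewhere in the class. Below is a minimal sketch of what such a factory typically looks like in stable-baselines 2.x, assuming it forwards the same gym.make keyword arguments as the single-core branch; the use of `set_global_seeds` and the per-rank seeding are illustrative assumptions, not code taken from this example.

    def make_env(self, env_id, rank, seed=0):
        """Return a thunk that builds one environment copy for worker `rank` (illustrative sketch)."""
        from stable_baselines.common import set_global_seeds

        def _init():
            # each subprocess creates its own environment with a distinct seed
            env = gym.make(env_id,
                           casename=self.inp.acer_dict['casename'][0],
                           log_dir=self.log_dir,
                           exepath=self.inp.gen_dict['exepath'][0],
                           env_data=self.inp.gen_dict['env_data'][0],
                           env_seed=seed + rank)
            return env

        set_global_seeds(seed)
        return _init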
Example #2
    def build(self):
        """
        This function builds the PPO agent based on the selected mode and runs it as follows:
            1- Initialize the environment.
            2- If mode train is selected, train the model from scratch, learn, and save.
            3- If mode continue is selected, provide a path for the pretrained model, load the model, learn, and save.
            4- If mode test is selected, provide a path for the pretrained model, load the model, and test.
        """
        # Create the vectorized environment
        if self.inp.ppo_dict['ncores'][0] > 1:
            self.env = SubprocVecEnv([
                self.make_env(self.inp.gen_dict['env'][0], i)
                for i in range(self.inp.ppo_dict['ncores'][0])
            ],
                                     daemon=self.inp.gen_dict['daemon'][0])
        else:
            self.env = gym.make(self.inp.gen_dict['env'][0],
                                casename=self.inp.ppo_dict['casename'][0],
                                log_dir=self.log_dir,
                                exepath=self.inp.gen_dict['exepath'][0],
                                env_data=self.inp.gen_dict['env_data'][0],
                                env_seed=1)

        # Tensorboard activation (if used).
        # To view TensorBoard, run:
        #   tensorboard --logdir=./log_dir/{self.casename}_tensorlog
        if self.inp.ppo_dict['tensorboard'][0]:
            tensorboard_log = self.log_dir + '{}_tensorlog'.format(
                self.inp.ppo_dict['casename'][0])
        else:
            tensorboard_log = None

        if self.mode == 'train':
            # Train from scratch: initialize the model, learn, and save the last model.
            # Callbacks are used if provided.
            model = PPO2(MlpPolicy,
                         self.env,
                         n_steps=self.inp.ppo_dict['n_steps'][0],
                         gamma=self.inp.ppo_dict['gamma'][0],
                         learning_rate=self.inp.ppo_dict['learning_rate'][0],
                         ent_coef=self.inp.ppo_dict['ent_coef'][0],
                         vf_coef=self.inp.ppo_dict['vf_coef'][0],
                         max_grad_norm=self.inp.ppo_dict['max_grad_norm'][0],
                         lam=self.inp.ppo_dict['lam'][0],
                         nminibatches=self.inp.ppo_dict['nminibatches'][0],
                         noptepochs=self.inp.ppo_dict['noptepochs'][0],
                         cliprange=self.inp.ppo_dict['cliprange'][0],
                         verbose=1,
                         seed=3)
            model.learn(total_timesteps=self.inp.ppo_dict['time_steps'][0],
                        callback=self.callback)
            model.save(self.log_dir + self.inp.ppo_dict['casename'][0] +
                       '_lastmodel.pkl')

        if self.mode == 'continue':
            # Load the pretrained model, continue learning, and save the last model
            model = PPO2.load(
                self.inp.ppo_dict['model_load_path'][0],
                env=self.env,
                n_steps=self.inp.ppo_dict['n_steps'][0],
                gamma=self.inp.ppo_dict['gamma'][0],
                learning_rate=self.inp.ppo_dict['learning_rate'][0],
                ent_coef=self.inp.ppo_dict['ent_coef'][0],
                vf_coef=self.inp.ppo_dict['vf_coef'][0],
                max_grad_norm=self.inp.ppo_dict['max_grad_norm'][0],
                lam=self.inp.ppo_dict['lam'][0],
                nminibatches=self.inp.ppo_dict['nminibatches'][0],
                noptepochs=self.inp.ppo_dict['noptepochs'][0],
                cliprange=self.inp.ppo_dict['cliprange'][0],
                verbose=1,
                seed=3)
            model.learn(total_timesteps=self.inp.ppo_dict['time_steps'][0],
                        callback=self.callback)
            model.save(self.log_dir + self.inp.ppo_dict['casename'][0] +
                       '_lastmodel.pkl')

        if self.mode == 'test':
            # Load and test the agent. The env is recreated since test mode only works on a single core
            print(
                'debug: ppo is running in test mode, single core is used to test the policy'
            )
            env = gym.make(self.inp.gen_dict['env'][0],
                           log_dir=self.log_dir,
                           casename=self.inp.ppo_dict['casename'][0],
                           exepath=self.inp.gen_dict['exepath'][0],
                           env_data=self.inp.gen_dict['env_data'][0],
                           env_seed=1)
            model = PPO2.load(self.inp.ppo_dict['model_load_path'][0])
            evaluate_policy(
                model,
                env,
                log_dir=self.log_dir + 'ppo',
                n_eval_episodes=self.inp.ppo_dict["n_eval_episodes"][0],
                render=self.inp.ppo_dict["render"][0])
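The `self.callback` object passed to `model.learn()` is also defined outside these snippets. A minimal sketch of one way to supply such a callback with stable-baselines 2.x is shown below, using the `BaseCallback` interface; the checkpointing behaviour and the `SaveEveryN` name are hypothetical, not the callback actually used by this class.

    from stable_baselines.common.callbacks import BaseCallback

    class SaveEveryN(BaseCallback):
        """Save the model every `save_freq` environment steps (hypothetical helper)."""

        def __init__(self, save_freq, save_path, verbose=0):
            super(SaveEveryN, self).__init__(verbose)
            self.save_freq = save_freq
            self.save_path = save_path

        def _on_step(self):
            # self.n_calls and self.model are provided by BaseCallback
            if self.n_calls % self.save_freq == 0:
                self.model.save('{}_step{}.pkl'.format(self.save_path, self.n_calls))
            return True  # returning False would stop training early

    # e.g. self.callback = SaveEveryN(save_freq=10000, save_path=self.log_dir + 'ppo_checkpoint')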
Example #3
    def build(self):
        """
        This function builds the DQN agent (ONLY 1 Env/Core is supported) based on the selected mode and runs it as follows:
            1- Initialize the environment.
            2- If mode train is selected, train the model from scratch, learn, and save.
            3- If mode continue is selected, provide a path for the pretrained model, load the model, learn, and save.
            4- If mode test is selected, provide a path for the pretrained model, load the model, and test.
        """
        # Tensorboard activation (if used).
        # To view TensorBoard, run:
        #   tensorboard --logdir=./log_dir/{self.casename}_tensorlog
        if self.inp.dqn_dict['tensorboard'][0]:
            tensorboard_log = self.log_dir + '{}_tensorlog'.format(
                self.inp.dqn_dict['casename'][0])
        else:
            tensorboard_log = None

        if self.mode == 'train':
            # Train from scratch: initialize the model, learn, and save the last model.
            # Callbacks are used if provided.
            model = DQN(
                MlpPolicy,
                self.env,
                gamma=self.inp.dqn_dict['gamma'][0],
                learning_rate=self.inp.dqn_dict['learning_rate'][0],
                buffer_size=self.inp.dqn_dict['buffer_size'][0],
                exploration_fraction=self.inp.dqn_dict['exploration_fraction'][0],
                eps_final=self.inp.dqn_dict['eps_final'][0],
                learning_starts=self.inp.dqn_dict['learning_starts'][0],
                batch_size=self.inp.dqn_dict['batch_size'][0],
                target_network_update_freq=self.inp.dqn_dict['target_network_update_freq'][0],
                eps_init=self.inp.dqn_dict['eps_init'][0],
                train_freq=self.inp.dqn_dict['train_freq'][0],
                prioritized_replay=self.inp.dqn_dict['prioritized_replay'][0],
                verbose=2,
                seed=1)
            model.learn(total_timesteps=self.inp.dqn_dict['time_steps'][0],
                        callback=self.callback)
            model.save(self.log_dir + self.inp.dqn_dict['casename'][0] +
                       '_lastmodel.pkl')

        if self.mode == 'continue':
            # Load the pretrained model, continue learning, and save the last model
            model = DQN.load(
                self.inp.dqn_dict['model_load_path'][0],
                env=self.env,
                gamma=self.inp.dqn_dict['gamma'][0],
                learning_rate=self.inp.dqn_dict['learning_rate'][0],
                buffer_size=self.inp.dqn_dict['buffer_size'][0],
                exploration_fraction=self.inp.dqn_dict['exploration_fraction'][0],
                eps_final=self.inp.dqn_dict['eps_final'][0],
                learning_starts=self.inp.dqn_dict['learning_starts'][0],
                batch_size=self.inp.dqn_dict['batch_size'][0],
                target_network_update_freq=self.inp.dqn_dict['target_network_update_freq'][0],
                eps_init=self.inp.dqn_dict['eps_init'][0],
                train_freq=self.inp.dqn_dict['train_freq'][0],
                prioritized_replay=self.inp.dqn_dict['prioritized_replay'][0],
                verbose=2,
                seed=1)

            model.learn(total_timesteps=self.inp.dqn_dict['time_steps'][0],
                        callback=self.callback)
            model.save(self.log_dir + self.inp.dqn_dict['casename'][0] +
                       '_lastmodel.pkl')

        if self.mode == 'test':
            # Load and test the agent. The env is recreated since test mode only works on a single core
            print(
                'debug: dqn is running in test mode, single core is used to test the policy'
            )
            env = gym.make(self.inp.gen_dict['env'][0],
                           casename=self.inp.dqn_dict['casename'][0],
                           log_dir=self.log_dir,
                           exepath=self.inp.gen_dict['exepath'][0],
                           env_data=self.inp.gen_dict['env_data'][0],
                           env_seed=1)
            model = DQN.load(self.inp.dqn_dict['model_load_path'][0])
            evaluate_policy(
                model,
                env,
                log_dir=self.log_dir + 'dqn',
                n_eval_episodes=self.inp.dqn_dict["n_eval_episodes"][0],
                render=self.inp.dqn_dict["render"][0])
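The `evaluate_policy` helper called in the test branches accepts a `log_dir` argument, so it appears to be a project-specific utility rather than `stable_baselines.common.evaluation.evaluate_policy`. The sketch below shows what such an evaluation loop typically does; the reward logging and the output file name are assumptions about that helper, not its actual implementation.

    import numpy as np

    def evaluate_policy(model, env, log_dir, n_eval_episodes=5, render=False):
        """Run the trained policy for a few episodes and log the episode rewards (illustrative sketch)."""
        episode_rewards = []
        for _ in range(n_eval_episodes):
            obs = env.reset()
            done, total_reward = False, 0.0
            while not done:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, _info = env.step(action)
                total_reward += reward
                if render:
                    env.render()
            episode_rewards.append(total_reward)
        # persist a simple summary next to the other case logs
        with open(log_dir + '_eval.txt', 'w') as fid:
            fid.write('mean reward: {}, std: {}\n'.format(np.mean(episode_rewards),
                                                          np.std(episode_rewards)))
        return episode_rewards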