Example #1
def train_ddpg(name: str = "DDPG-WormDomain"):
    """ Train the DDPG on the unity ML environment.
    """
    ###########################################
    # Ensure that the path exists for logging #
    ###########################################
    folder = Path(f'models/{datetime.now().date()}/{name}/')
    folder.mkdir(parents=True, exist_ok=True)

    # Store logs right next to the results!
    fh = log.FileHandler(
        f'models/{datetime.now().date()}/{name}/{datetime.now().date()}.log')
    fh.setFormatter(logFormatter)
    log.getLogger().addHandler(fh)

    env = "envs/worm_dynamic_one_agent/win/UnityEnvironment"  # Windows
    # env = "./envs/worm_dynamic_one_agent/linux/worm_dynamic"  # Linux
    env = get_env(env, False)

    trainer = DDPGTrainer()

    log.info("Start DDPG training (WormDomain)...")

    trainer.train(env, name=name)

    log.info("Training done!")
Example #2
def train_ddpg_gym(env_name: str = "Pendulum-v0"):
    """ Train the DDPG on a gym environment
    """
    ###########################################
    # Ensure that the path exists for logging #
    ###########################################
    folder = Path(f'models/{datetime.now().date()}/DDPG-{env_name}/')
    folder.mkdir(parents=True, exist_ok=True)

    # Store logs right next to the results!
    fh = log.FileHandler(
        f'models/{datetime.now().date()}/DDPG-{env_name}/{datetime.now().date()}.log'
    )
    fh.setFormatter(logFormatter)
    log.getLogger().addHandler(fh)

    env = gym.make(env_name)

    trainer = DDPGTrainer()

    log.info(f"Start DDPG training ({env_name})...")

    # Optionally override the default parameters:
    # trainer.config["episodes"] = 1000
    # trainer.config["training_steps"] = 700

    trainer.train(env, name=f"DPPG-{env_name}")
    log.info("Training done!")
Example #3
    def forward(self, state, action):
        log.info(f"state: {state.shape}")
        log.info(f"action: {action.shape}")

        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)

        q2 = F.relu(self.l4(sa))
        q2 = F.relu(self.l5(q2))
        q2 = self.l6(q2)
        return q1, q2
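
The layers l1 through l6 are created in the constructor of the enclosing module, which is not part of this snippet. A minimal sketch of a matching twin-Q critic is shown below; the class name Critic and the hidden_dim default are assumptions (torch.nn.functional is imported as F for the forward() above):

import torch
import torch.nn as nn
import torch.nn.functional as F


class Critic(nn.Module):
    """Hypothetical twin-Q critic matching the forward() above."""

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int = 256):
        super().__init__()
        # Q1 head
        self.l1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        self.l3 = nn.Linear(hidden_dim, 1)
        # Q2 head
        self.l4 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.l5 = nn.Linear(hidden_dim, hidden_dim)
        self.l6 = nn.Linear(hidden_dim, 1)
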
Example #4
    def start_training(self,
                       env: object,
                       trials: int = 1,
                       render: bool = False,
                       name: str = None,
                       training_steps: int = None,
                       default: bool = False):
        """ Method to start HPO.

            Parameters:
            -----------
                env: UnityEnvironment or GymEnvironment
                    The environment the agent interacts in.
                trials: int
                    Number of HPO runs executed by the HPO library
                    (trials = 2 means the train method is executed twice with
                    different parameters).
                render: bool
                    Flag to decide if we want to render in case of a gym
                    environment.
                name: str
                    A name for the model/agent that is used to store the best
                    model.
                training_steps: int
                    Custom number of steps that should be used for training.
                default: bool
                    Flag to use the default (fixed) parameters.
        """
        self.default = default

        log.info(f"Optuna set up - trials: {trials}, name: {name}")

        ######################
        # Set Up HPO library #
        ######################
        study = optuna.create_study(direction="maximize")
        log.info("Start optimization!")

        ###########
        # Run HPO #
        ###########
        study.optimize(lambda trial: self.train_hpo(trial, env, render, name,
                                                    study, training_steps),
                       n_trials=trials,
                       timeout=600)

        log.info("Optimization done.")

        ##########################
        # Store study of HPO run #
        ##########################
        with open(f'{self.path}/{name}_study.pickle', 'wb+') as fout:
            pickle.dump(study, fout)

        log.info(f"Study stored. ({self.path}/{name}_study.pickle)")

        env.close()

        return study
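
A hypothetical usage sketch for start_training on a gym environment; the trainer class name, the trial count, and the agent name are assumptions:

import gym

env = gym.make("Pendulum-v0")
trainer = DDPGTrainer()

# Run 20 Optuna trials; the environment is closed inside start_training.
study = trainer.start_training(env, trials=20, name="DDPG-Pendulum-v0")
print(study.best_params)
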
Example #5
    def train(self,
              env: object,
              render: bool = False,
              name: str = None,
              render_training: bool = False):
        """ Standard train method for train an DDPG agent on an environment.

            Parameters:
            -----------
                env: GymEnvironment or UnityEnvironment
                    The environment that is used for training
                render: bool (optional)
                    Wraps the environment in the gym Monitor during evaluation
                    to record progress. Only usable for gym environments.
                name: str
                    Name of the agent, used for storing the results etc.
                render_training: bool (optional)
                    Renders every training step. Only usable for gym
                    environments.
        """
        #############
        # Set seeds #
        #############
        env.action_space.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)

        #########################
        # Init directory Set Up #
        #########################
        self.track_setup(name)

        ###########################
        # Gather env. information #
        ###########################
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        ################
        # Create agent #
        ################
        self.ddpg_agent = DDPGagent(
            env, [self.config["n_units_l0"], self.config["n_units_l1"]],
            self.config["actor_lr"], self.config["critic_lr"],
            self.config["gamma"], self.config["tau"])

        self.ddpg_agent.max_action = max_action

        replay_buffer = ReplayBuffer(state_dim, action_dim)

        rewards = []
        noise = OUNoise(env.action_space)

        episodes = self.config["episodes"]
        training_steps = self.config["training_steps"]
        explore_threshold = self.config["explore_threshold"]
        batch_size = self.config["batch_size"]

        log.info(f"Start episodes ({episodes}) with {training_steps} steps.")

        self.episode_num = 0
        self.eval_rewards = []
        self.train_rewards = []

        log.info("Parameter:")
        log.info(self.config)

        overall_steps = 0

        #############################
        # Start Trainings Procedure #
        #############################
        for episode in range(episodes):
            state = env.reset()
            episode_reward = 0

            if episode % 10 == 0:
                log.info(f"Episode-Step: {episode}/{episodes}")

            ############
            # Training #
            ############
            for step in range(training_steps):

                ###############
                # Exploration #
                ###############
                if overall_steps < explore_threshold * (training_steps *
                                                        episodes):
                    if step % 50 == 0:
                        log.info(
                            f"Trainings-Step: {step}/{training_steps} (Explore)"
                        )

                    # Just sample some action (random sampling)
                    action = env.action_space.sample()
                else:
                    if step % 50 == 0:
                        log.info(f"Trainings-Step: {step}/{training_steps}")

                    action = self.ddpg_agent.get_action(np.array(state))

                    # Gaussian noise, as used in TD3. The DDPG paper recommends OU noise:
                    # noise = np.random.normal(0, self.ddpg_agent.max_action * 0.1, size=self.ddpg_agent.num_actions)

                    # action = (action + noise).clip(-self.ddpg_agent.max_action,
                    #                                self.ddpg_agent.max_action)
                    action = noise.get_action(action, step)

                # Reshape multi-dimensional actions (e.g. the 9-dim Worm domain);
                # scalar actions (Pendulum) are passed through unchanged.
                if np.array(action).size > 1:
                    action = np.array(action).reshape((1, 9))

                # Tracking to files
                # self.track_action(action, step,
                #                   training_steps)

                next_state, reward, done, _ = env.step(action)

                if render_training:
                    env.render()

                # From TD3 implementation
                # done = (True
                #         if step < self.config["training_steps"]
                #         else False)
                done_bool = float(done)

                # Gather experiences
                replay_buffer.add(state, action, next_state, reward, done_bool)

                state = next_state
                episode_reward += reward

                # Tracking to files
                # self.track_training_reward(episode_reward,
                #                            step,
                #                            training_steps)

                if done:
                    self.track_successful_episodes(episode, episode_reward,
                                                   step)
                    state, done = env.reset(), False
                    episode_reward = 0
                    self.episode_num += 1

                overall_steps += 1

            state, done = env.reset(), False
            episode_reward = 0

            # Important: Set an appropriate replay buffer.
            self.ddpg_agent.memory_buffer = replay_buffer

            #################################
            # Update neural nets (Learning) #
            #################################
            if (len(self.ddpg_agent.memory_buffer) > batch_size
                    and overall_steps >= explore_threshold *
                (training_steps * episodes)):
                for step in range(training_steps):
                    # Only train the nets when we have enough experience and we
                    # do not randomly explore anymore.
                    self.ddpg_agent.update(batch_size)

            ########################
            # Evaluation per epoch #
            ########################
            # Evaluation after we learned something
            if overall_steps >= explore_threshold * (training_steps *
                                                     episodes):
                log.info(
                    f"Start Evaluation: {self.config['evaluation_steps']}")

                self.eval_episode_reward = 0

                # For monitoring the progress.
                if render:
                    path = f'models/{datetime.now().date()}/{name}/'
                    eval_env = wrappers.Monitor(env, path, force=True)
                else:
                    eval_env = env

                # env.action_space.seed(0)
                for step in range(self.config["evaluation_steps"]):
                    if step % 50 == 0:
                        log.info(
                            f"Evaluation-Episode: {step}/{self.config['evaluation_steps']}"
                        )

                    state = eval_env.reset()
                    done = False
                    k = 0
                    while not done:
                        action = self.ddpg_agent.get_action(np.array(state))

                        # Reshape multi-dimensional actions (Worm domain).
                        if np.array(action).size > 1:
                            action = np.array(action).reshape((1, 9))

                        action = action.clip(-self.ddpg_agent.max_action,
                                             self.ddpg_agent.max_action)

                        state, reward, done, _ = eval_env.step(action)

                        self.eval_episode_reward += reward

                        # Optionally limit the number of steps per evaluation
                        # episode. For the gym Monitor it is important that the
                        # episode finishes before the environment is closed!
                        if self.config[
                                "evaluation_lim"] is not None and self.config[
                                    "evaluation_lim"] < k:
                            break

                        k += 1

                if self.config['evaluation_steps'] > 0:
                    log.info(
                        f"Evaluation Reward: {self.eval_episode_reward/self.config['evaluation_steps']}"
                    )
                    self.eval_rewards.append(self.eval_episode_reward /
                                             self.config["evaluation_steps"])

            # Local tracking
            # self.track_reward(episode_reward, episode)

            rewards.append(episode_reward)

            ################################
            # Persist Tracking per Episode #
            ################################
            folder = Path(f'models/{datetime.now().date()}/{name}/')
            folder.mkdir(parents=True, exist_ok=True)

            pd.DataFrame(self.eval_rewards).to_csv(
                f'models/{datetime.now().date()}/{name}/eval_rewards.csv')

            # Training rewards
            # pd.DataFrame(self.train_rewards).to_csv(f'models/{datetime.now().date()}/{name}/train_rewards.csv')

            with open(f'models/{datetime.now().date()}/{name}/config.json',
                      "w+") as f:
                json.dump(self.config, f)

            ##############
            # Save Agent #
            ##############
            with open(
                    f'models/{datetime.now().date()}/{name}/ddpg_agent_training.pickle',
                    "wb+") as f:
                pickle.dump(self.ddpg_agent, f)

        log.info("End episode!")
        log.info("Close environment")
        env.close()
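
OUNoise is imported from elsewhere in the repository. Below is a common Ornstein-Uhlenbeck implementation that fits the reset() and get_action(action, step) calls used above; the class body and its hyperparameter values are assumptions, not the repository's actual code:

import numpy as np


class OUNoise:
    """Hypothetical Ornstein-Uhlenbeck exploration noise."""

    def __init__(self, action_space, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        # Mean-reverting random walk (temporally correlated noise).
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        ou_state = self.evolve_state()
        # Anneal sigma over time and keep the action within valid bounds.
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) \
            * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)
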
Example #6
    def train_hpo(self,
                  trial: object,
                  env: object,
                  render: bool = False,
                  name: str = None,
                  study: object = None,
                  training_steps: int = 1000):
        """ Trainingsprocedure for running HPO.

            The actual training procedure is equal to the one in train().
            However, here we have additional HPO initializations.

            Important: This training procedure doesn't have an evaluation loop.

            Parameters:
            -----------
                trial: optuna.Trial
                    The trial object handed over from the HPO library.
                    It contains functionality for selecting hyperparameters.
                env: UnityEnvironment or GymEnvironment
                    The environment in which the agent should train.
                render: bool
                    Flag to decide if we want to render the steps in case of a
                    gym environment.
                name: str
                    A name for the model/agent that is used to store the best
                    model.
                study: optuna.Study
                    Study object that contains information about the training
                    (parameters for each run, best parameter set, etc.)
                training_steps: int
                    Custom number of training steps that the agent should do.
            Return:
            -------
                reward: float
                    Since we want to optimize the reward, we return the mean
                    reward of the training run.
        """

        log.info(f"Start trial#{trial.number}")

        # Init directory set up.
        self.track_setup(name, trial)

        ############################
        # Hyperparameter for Agent #
        ############################
        num_hidden_layers = trial.suggest_int("n_layers", 1, 3)

        hidden_dim = []

        for i in range(num_hidden_layers):
            hidden_dim += [trial.suggest_int("n_units_l{}".format(i), 32, 256)]

        actor_lr = trial.suggest_uniform("actor_lr", 1e-4, 1e-2)
        critic_lr = trial.suggest_uniform("critic_lr", 1e-4, 1e-2)

        gamma = trial.suggest_uniform("gamma", 0.95, 0.999)
        tau = trial.suggest_uniform("tau", 1e-4, 1e-1)

        ######################################
        # Hyperparameter for Training set up #
        ######################################
        if not training_steps:
            # No fixed step count given; let the HPO library suggest one.
            training_steps = trial.suggest_int("training_steps", 1000, 1500)

        episodes = trial.suggest_int("episodes", training_steps,
                                     training_steps * 1.5)
        explore_threshold = trial.suggest_uniform("explore_threshold", 0.15,
                                                  0.5)
        batch_size = trial.suggest_int("batch_size", 32, 512)

        if self.default:
            num_hidden_layers = self.default_trail["n_layers"]
            hidden_dim = [
                self.default_trail["n_units_l0"],
                self.default_trail["n_units_l1"]
            ]

            actor_lr = self.default_trail["actor_lr"]
            critic_lr = self.default_trail["critic_lr"]

            gamma = self.default_trail["gamma"]
            tau = self.default_trail["tau"]

            episodes = self.default_trail['episodes']
            training_steps = self.default_trail['training_steps']

            explore_threshold = self.default_trail['explore_threshold']
            batch_size = self.default_trail['batch_size']

            log.info(f"Params {self.default_trail}")
        else:
            log.info(f"Params {trial.params}")

        ###########################
        # Gather env. information #
        ###########################
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        ################
        # Create agent #
        ################
        self.ddpg_agent = DDPGagent(env, hidden_dim, actor_lr, critic_lr,
                                    gamma, tau)
        self.ddpg_agent.max_action = max_action

        replay_buffer = ReplayBuffer(state_dim, action_dim)

        rewards = []
        noise = OUNoise(env.action_space)

        log.info(f"Start episodes ({episodes}) with {training_steps} steps.")

        self.episode_num = 0

        overall_steps = 0
        for episode in range(episodes):
            state = env.reset()
            noise.reset()
            episode_reward = 0

            if episode % 50 == 0:
                log.info(f"Trainings-Step: {episode}/{episodes}")

            for step in range(training_steps):
                if render:
                    env.render()

                # Explore during the first fraction of the total steps.
                if overall_steps < explore_threshold * (training_steps *
                                                        episodes):
                    if step % 50 == 0:
                        log.info(
                            f"Trainings-Step: {step}/{training_steps} (Explore)"
                        )

                    action = env.action_space.sample()
                else:
                    if step % 50 == 0:
                        log.info(f"Trainings-Step: {step}/{training_steps}")

                    action = self.ddpg_agent.get_action(state)
                    action = noise.get_action(action, step)

                new_state, reward, done, _ = env.step(action)
                # self.ddpg_agent.memory_buffer.push(state, action, reward,
                #                                    new_state, done)
                # Gather experiences
                replay_buffer.add(state, action, new_state, reward, done)

                # Important: Set an appropriate replay buffer.
                self.ddpg_agent.memory_buffer = replay_buffer

                #####################################
                # Train/Update Actor and Critic     #
                # Here we train within the train    #
                # loop. This is a major difference  #
                # to the other trainings procedures #
                #####################################
                if (len(self.ddpg_agent.memory_buffer) > batch_size
                        and overall_steps >= explore_threshold *
                    (training_steps * episodes)):
                    self.ddpg_agent.update(batch_size)

                state = new_state
                episode_reward += reward

                self.track_training_reward(episode_reward, step,
                                           training_steps)

                if done:
                    self.track_successful_episodes(episode, episode_reward,
                                                   step)
                    state, done = env.reset(), False
                    episode_reward = 0
                    self.episode_num += 1

                overall_steps += 1

            self.track_reward(episode_reward, episode)
            trial.report(episode_reward, episode)

            rewards.append(episode_reward)

        log.info("End episode!")

        ########################
        # HPO Pruning handling #
        ########################
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        ####################
        # Save best models #
        ####################
        if study and name:
            try:
                if study.best_value < np.array(rewards).mean():
                    with open(
                            f'{self.path}/best_agent/{name}_best_agent.pickle',
                            'wb+') as fout:
                        pickle.dump(self.ddpg_agent, fout)

                    if self.default:
                        with open(
                                f'{self.path}/best_agent/{name}_best_params.json',
                                'w+') as fout:
                            json.dump(self.default_trail, fout)
                    else:
                        with open(
                                f'{self.path}/best_agent/{name}_best_params.json',
                                'w+') as fout:
                            json.dump(trial.params, fout)

                    self.training_rewards_df.to_csv(
                        f'{self.path}/best_agent/trainings_rewards.csv')

                    copyfile(f'{self.path_trial}/results/rewards.csv',
                             f'{self.path}/best_agent/rewards.csv')

                    copyfile(
                        f'{self.path_trial}/results/successful_episodes.csv',
                        f'{self.path}/best_agent/successful_episodes.csv')

                    log.info(
                        f"Best agent stored at {self.path}/best_agent/{name}_best_agent.pickle"
                    )
            except Exception:
                # Executed when the study has no completed trial yet (first run).
                with open(f'{self.path}/best_agent/{name}_best_agent.pickle',
                          'wb+') as fout:
                    pickle.dump(self.ddpg_agent, fout)

                if self.default:
                    with open(
                            f'{self.path}/best_agent/{name}_best_params.json',
                            'w+') as fout:
                        json.dump(self.default_trail, fout)
                else:
                    with open(
                            f'{self.path}/best_agent/{name}_best_params.json',
                            'w+') as fout:
                        json.dump(trial.params, fout)

                self.training_rewards_df.to_csv(
                    f'{self.path}/best_agent/trainings_rewards.csv')

                copyfile(f'{self.path_trial}/results/rewards.csv',
                         f'{self.path}/best_agent/rewards.csv')

                copyfile(f'{self.path_trial}/results/successful_episodes.csv',
                         f'{self.path}/best_agent/successful_episodes.csv')

                log.info(
                    f"Initial agent stored at {self.path}/best_agent/{name}_best_agent.pickle"
                )

        log.info(f"End trial#{trial.number}")

        return np.array(rewards).mean()
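
The ReplayBuffer used throughout these examples follows the add(state, action, next_state, reward, done) interface of the TD3 reference code and supports len(). A minimal sketch along those lines; max_size and the sample() return layout are assumptions:

import numpy as np
import torch


class ReplayBuffer:
    """Hypothetical fixed-size buffer with uniform sampling."""

    def __init__(self, state_dim: int, action_dim: int, max_size: int = int(1e6)):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))

    def __len__(self):
        return self.size

    def add(self, state, action, next_state, reward, done):
        # Flatten inputs so (1, dim)-shaped arrays are accepted as well.
        self.state[self.ptr] = np.ravel(state)
        self.action[self.ptr] = np.ravel(action)
        self.next_state[self.ptr] = np.ravel(next_state)
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1.0 - float(done)
        # Overwrite the oldest entries once the buffer is full.
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size: int):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (torch.FloatTensor(self.state[idx]),
                torch.FloatTensor(self.action[idx]),
                torch.FloatTensor(self.next_state[idx]),
                torch.FloatTensor(self.reward[idx]),
                torch.FloatTensor(self.not_done[idx]))
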
Example #7
    def train_baseline(self,
                       env: object,
                       name: str,
                       render: bool = False,
                       nb_epochs: int = 50,
                       nb_epoch_cycles: int = 20,
                       nb_rollout_steps: int = 100,
                       nb_train_steps: int = 100,
                       nb_eval_steps: int = 100):
        """ Trainings procedure from baseline implementation

            This implementation has some additional loops and parameters.
            We do not recommend to use this training procedure, since we
            are not sure if it works properly. It was implemented for testing
            purpose.

            Link: https://github.com/openai/baselines/tree/master/baselines/ddpg

            Parameters:
            -----------
                env: GymEnvironment or UnityEnvironment
                    The environment in which the agent interacts.
                name: str
                    Name of the agent for tracking purposes.
                render: bool
                    Flag that decides whether to render the steps.
                nb_epochs: int
                    Number of epochs (corresponds to number of episodes)
                nb_epoch_cycles: int
                    Number of cycles within one epoch.
                nb_rollout_steps: int
                    Number of steps the agent should do for exploration and
                    exploitation. (Within an epoch cycle)
                nb_train_steps: int
                    Number of training steps the agent should do.
                    (Within an epoch cycle)
                nb_eval_steps: int
                    Number of evaluation steps.
        """

        ###########################
        # Gather env. information #
        ###########################
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        ################
        # Create agent #
        ################
        self.ddpg_agent = DDPGagent(
            env, [self.config["n_units_l0"], self.config["n_units_l1"]],
            self.config["actor_lr"], self.config["critic_lr"],
            self.config["gamma"], self.config["tau"])
        self.ddpg_agent.max_action = max_action

        # Use some white noise.
        gaussian_noise = GaussianNoise(
            mu=np.zeros(self.ddpg_agent.num_actions),
            sigma=np.ones(self.ddpg_agent.num_actions))

        replay_buffer = ReplayBuffer(state_dim, action_dim)

        self.episode_reward = np.zeros(1, dtype=np.float32)  # vector
        self.episode_step = np.zeros(1, dtype=int)  # vector
        self.episodes = 0  # scalar
        t = 0  # scalar

        epoch = 0
        batch_size = self.config["batch_size"]

        self.epoch_episode_rewards = []
        self.epoch_episode_steps = []
        self.epoch_actions = []
        self.epoch_reward = []
        self.epoch_episodes = 0
        self.episode_rewards_history = deque(maxlen=100)

        self.eval_rewards = []
        self.train_rewards = []
        self.eval_episode_rewards = []

        # Number of total iterations the agent takes
        self.number_it = (nb_epoch_cycles * nb_rollout_steps * nb_epochs)

        # Exploration fraction
        self.explore = 0.35

        for epoch in range(nb_epochs):
            log.info(f"Epoch: {epoch} / {nb_epochs}")
            state = env.reset()

            ############
            # Training #
            ############
            self.train_episode_reward = 0
            for cycle in range(nb_epoch_cycles):
                # log.info(f"Start cycle {cycle}/{nb_epoch_cycles}")
                for t_rollout in range(nb_rollout_steps):
                    # Explore 35% of all steps
                    if (cycle * t_rollout *
                            epoch) < self.number_it * self.explore:
                        # Explore
                        action = env.action_space.sample()
                    else:
                        # Predict next action.
                        action = self.ddpg_agent.get_action(state)
                        action += gaussian_noise()

                        action = np.clip(action, env.action_space.low,
                                         env.action_space.high)

                    new_state, reward, done, _ = env.step(action)

                    t += 1

                    self.episode_reward += reward
                    self.train_episode_reward += reward
                    self.episode_step += 1

                    # Book-keeping.
                    self.epoch_actions.append(action)

                    # Gather experiences
                    replay_buffer.add(state, action, new_state, reward, done)

                    state = new_state

                    if done:
                        # Episode done.
                        self.epoch_episode_rewards.append(self.episode_reward)
                        self.episode_rewards_history.append(
                            self.episode_reward)
                        self.epoch_episode_steps.append(self.episode_step)
                        self.epoch_episodes += 1
                        self.episodes += 1

                # Important: Set an appropriate replay buffer.
                self.ddpg_agent.memory_buffer = replay_buffer

                ################
                # Update Agent #
                ################
                # log.info(f"Start Training ({nb_train_steps})")
                for t_train in range(nb_train_steps):
                    self.ddpg_agent.update(batch_size)

            #####################
            # Trainings rewards #
            #####################
            self.train_episode_reward /= (nb_epoch_cycles * nb_rollout_steps)
            self.train_rewards.append(self.train_episode_reward)

            ########################
            # Evaluation per epoch #
            ########################
            self.eval_episode_reward = 0  # np.zeros(1, dtype=np.float32)
            for t_rollout in range(nb_eval_steps):
                state = env.reset()
                done = False
                while not done:
                    action = self.ddpg_agent.get_action(state)
                    action += gaussian_noise()
                    action = np.clip(action, env.action_space.low,
                                     env.action_space.high)

                    new_state, reward, done, _ = env.step(action)

                    if False and render:
                        env.render()
                    self.eval_episode_reward += reward

            ######################
            # Evaluation rewards #
            ######################
            self.eval_episode_reward /= nb_eval_steps
            self.eval_rewards.append(self.eval_episode_reward)

            #################
            # Track Results #
            #################
            folder = Path(f'models/{datetime.now().date()}/{name}/')
            folder.mkdir(parents=True, exist_ok=True)

            # Evaluation rewards
            pd.DataFrame(self.eval_rewards).to_csv(
                f'models/{datetime.now().date()}/{name}/eval_rewards.csv')

            # Training rewards
            pd.DataFrame(self.train_rewards).to_csv(
                f'models/{datetime.now().date()}/{name}/train_rewards.csv')

            # Config
            cfg = {
                "epochs": nb_epochs,
                "nb_epoch_cycles": nb_epoch_cycles,
                "nb_rollout_steps": nb_rollout_steps,
                "nb_train_steps": nb_train_steps,
                "nb_eval_steps": nb_eval_steps,
                "finished_episodes_train": self.epoch_episodes,
                "mean_reward_training": np.asarray(self.train_rewards).mean(),
                "mean_reward_eval": np.asarray(self.eval_rewards).mean(),
                "exploration_abs": self.number_it * self.explore,
                "exploration": self.explore
            }
            with open(f'models/{datetime.now().date()}/{name}/config.json',
                      "w+") as f:
                json.dump(cfg, f)

            # Agent
            with open(
                    f'models/{datetime.now().date()}/{name}/ddpg_agent_baseline_training.pickle',
                    "wb+") as f:
                pickle.dump(self.ddpg_agent, f)
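
GaussianNoise is constructed with per-action mu and sigma vectors and then called with no arguments. A minimal sketch consistent with that usage (the class body is an assumption):

import numpy as np


class GaussianNoise:
    """Hypothetical white-noise process; a fresh sample is drawn on every call."""

    def __init__(self, mu: np.ndarray, sigma: np.ndarray):
        self.mu = mu
        self.sigma = sigma

    def __call__(self) -> np.ndarray:
        # One Gaussian sample per action dimension.
        return np.random.normal(self.mu, self.sigma)
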
Example #8
    def train(self, env: object, render: bool = False, name: str = None):
        """ Train method from td3/training.py

            Parameters:
            -----------
                env: GymEnvironment or UnityEnvironment
                    Environment in which the agent interacts
                render: bool
                    Flag to decide if the intermediate steps should be rendered
                name: str
                    Name of the model/agent for tracking
        """
        #############
        # Set seeds #
        #############
        env.action_space.seed(self.config["seed"])
        torch.manual_seed(self.config["seed"])
        np.random.seed(self.config["seed"])

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        replay_buffer = ReplayBuffer(state_dim, action_dim)
        best_buffer = ReplayBuffer(state_dim, action_dim)
        der_buffer = DynamicExperienceReplay(state_dim, action_dim)

        # Evaluate untrained policy
        state, done = env.reset(), False
        episode_reward = 0

        # Init directory set up.
        self.track_setup(name)

        ################
        # Create agent #
        ################
        self.td3_agent = TD3agent(state_dim, action_dim,
                                  self.config["discount"], self.config["tau"],
                                  self.config["policy_noise"],
                                  self.config["noise_clip"],
                                  self.config["policy_freq"])

        rewards = []

        episodes = self.config["episodes"]
        training_steps = self.config["training_steps"]

        log.info(
            f"Start episodes ({self.config['episodes']}) with {self.config['training_steps']} steps."
        )

        self.episode_num = 0
        self.eval_rewards = []
        self.train_rewards = []

        log.info("Parameter:")
        log.info(self.config)

        #############################
        # Start Trainings Procedure #
        #############################
        for episode in range(episodes):
            state = env.reset()
            episode_reward = 0

            if episode % 50 == 0:
                log.info(f"Episode-Step: {episode}/{episodes}")

            ############
            # Training #
            ############
            for step in range(training_steps):

                ###############
                # Exploration #
                ###############
                if episode < self.config["training_episodes"]:
                    if step % 50 == 0:
                        log.info(
                            f"Trainings-Step: {step}/{training_steps} (Explore)"
                        )

                    action = env.action_space.sample()
                else:
                    if step % 50 == 0:
                        log.info(f"Trainings-Step: {step}/{training_steps}")
                    action = self.td3_agent.select_action(np.array(state))
                    noise = np.random.normal(0,
                                             max_action *
                                             self.config["expl_noise"],
                                             size=action_dim)

                    action = (action + noise).clip(-max_action, max_action)

                # Reshape multi-dimensional actions (e.g. the 9-dim Worm domain);
                # scalar actions (Pendulum) are passed through unchanged.
                if np.array(action).size > 1:
                    action = np.array(action).reshape((1, 9))

                next_state, reward, done, _ = env.step(action)
                done = (True
                        if step < self.config["training_steps"] else False)
                done_bool = float(done)

                # Store data in replay buffer
                replay_buffer.add(state, action, next_state, reward, done_bool)
                best_buffer.add(state, action, next_state, reward, done_bool)

                # Store buffer
                if done:
                    der_buffer.add(best_buffer)
                    best_buffer = ReplayBuffer(state_dim, action_dim)

                state = next_state
                episode_reward += reward

            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0

            #################################
            # Update neural nets (Learning) #
            #################################
            if episode >= self.config["training_episodes"]:
                # Only train after exploration
                self.td3_agent.train(replay_buffer, self.config["batch_size"])
                # Reset replay buffer after training
                # replay_buffer = ReplayBuffer(state_dim, action_dim)

            ########################
            # Evaluation per epoch #
            ########################
            log.info(f"Start Evaluation: {self.config['evaluation_steps']}")
            eval_env = env
            eval_env.action_space.seed(self.config['seed'] + 100)

            avg_reward = 0.
            eval_episode = 0
            k = 0
            for _ in range(self.config['evaluation_steps']):
                state, done = eval_env.reset(), False
                while not done:
                    action = self.td3_agent.select_action(np.array(state))
                    action = np.array(action).reshape((1, 9))
                    # print(eval_env.action_space)
                    state, reward, done, _ = eval_env.step(action)
                    avg_reward += reward

                    if self.config[
                            "evaluation_lim"] is not None and self.config[
                                "evaluation_lim"] < k:
                        break

                    k += 1

                log.info("Eval Episode:  " + str(episode))
                episode += 1

            avg_reward /= self.config['evaluation_steps']

            log.info(f"Evaluation Reward: {avg_reward}")

            self.eval_rewards.append(avg_reward)

            self.track_reward(episode_reward, episode)
            rewards.append(episode_reward)

            ################################
            # Persist Tracking per Episode #
            ################################
            folder = Path(f'models/{datetime.now().date()}/{name}/')
            folder.mkdir(parents=True, exist_ok=True)

            pd.DataFrame(self.eval_rewards).to_csv(
                f'models/{datetime.now().date()}/{name}/eval_rewards.csv')

            # Training rewards
            # pd.DataFrame(self.train_rewards).to_csv(f'models/{datetime.now().date()}/{name}/train_rewards.csv')

            with open(f'models/{datetime.now().date()}/{name}/config.json',
                      "w+") as f:
                json.dump(self.config, f)

            ##############
            # Save Agent #
            ##############
            with open(
                    f'models/{datetime.now().date()}/{name}/td3_agent_trained.pickle',
                    "wb+") as f:
                pickle.dump(self.td3_agent, f)

        log.info("End episode!")